package org.grobid.trainer;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.StringTokenizer;
import javax.xml.parsers.SAXParserFactory;
import org.grobid.core.GrobidModels;
import org.grobid.core.exceptions.GrobidException;
import org.grobid.core.utilities.GrobidProperties;
import org.grobid.core.utilities.UnicodeUtil;
import org.grobid.trainer.sax.TEIReferenceSegmenterSaxParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/grobid/trainer/ReferenceSegmenterTrainer.class */
/**
 * Trainer for the {@code GrobidModels.REFERENCE_SEGMENTER} model.
 *
 * <p>Builds training/evaluation data by pairing each TEI file found under
 * {@code <corpus>/tei/} with the raw file of the same base name under
 * {@code <corpus>/raw/}, and appending to each matched raw line the label
 * obtained from the TEI parser.
 */
public class ReferenceSegmenterTrainer extends AbstractTrainer {
    public static final Logger LOGGER = LoggerFactory.getLogger(ReferenceSegmenterTrainer.class);

    /** Text written after the aligned lines of each document. */
    private static final String DOCUMENT_SEPARATOR = "\n \n";

    /**
     * Look-ahead bound used while aligning a raw line with the labelled lines:
     * scanning stops once the candidate index is more than this many positions
     * past the last matched position (i.e. offsets 0..6 are examined).
     */
    private static final int MAX_LOOKAHEAD = 5;

    public ReferenceSegmenterTrainer() {
        super(GrobidModels.REFERENCE_SEGMENTER);
    }

    /**
     * Creates training data only; delegates to
     * {@link #createCRFPPData(File, File, File, double)} with no evaluation
     * output and a ratio of {@code 1.0}.
     *
     * @param file  corpus directory
     * @param file2 output file for the training data
     * @return the total number of references reported by the TEI parser
     */
    @Override // org.grobid.trainer.AbstractTrainer, org.grobid.trainer.Trainer
    public int createCRFPPData(File file, File file2) {
        return createCRFPPData(file, file2, null, 1.0d);
    }

    /**
     * Creates training and/or evaluation data from the corpus directory.
     *
     * <p>Every {@code .xml}/{@code .tei} file in {@code <file>/tei/} is parsed; its labelled
     * lines are aligned with the raw file {@code <file>/raw/<name without ".tei.xml">}.
     * A TEI file whose raw counterpart is missing is reported on standard output and
     * skipped, but its references are still counted in the returned total.
     *
     * @param file  corpus directory, expected to contain {@code tei} and {@code raw} sub-folders
     * @param file2 output file for training data, or {@code null} for none
     * @param file3 output file for evaluation data, or {@code null} for none
     * @param d     when both outputs are given, a document goes to the training output if
     *              {@code Math.random() <= d}, otherwise to the evaluation output
     * @return the sum of {@code getTotalReferences()} over all parsed TEI files
     * @throws GrobidException wrapping any exception raised while reading, parsing or writing
     */
    @Override // org.grobid.trainer.Trainer
    public int createCRFPPData(File file, File file2, File file3, double d) {
        try {
            LOGGER.info("Corpus directory: " + file);
            if (file2 != null) {
                LOGGER.info("output path for training data: " + file2);
            }
            if (file3 != null) {
                LOGGER.info("output path for evaluation data: " + file3);
            }

            File teiDir = new File(file.getAbsolutePath() + "/tei/");
            if (!teiDir.exists()) {
                // Report the folder that was actually checked, not the corpus root.
                throw new IllegalStateException("Folder " + teiDir.getAbsolutePath() + " does not exist. Please have a look!");
            }
            File[] teiFiles = teiDir.listFiles(new FilenameFilter() {
                @Override // java.io.FilenameFilter
                public boolean accept(File dir, String fileName) {
                    return fileName.endsWith(".xml") || fileName.endsWith(".tei");
                }
            });
            if (teiFiles == null) {
                throw new IllegalStateException("Folder " + teiDir.getAbsolutePath() + " does not seem to contain training data. Please check");
            }
            LOGGER.info("Processing " + teiFiles.length + " tei files");

            File rawDir = new File(file.getAbsolutePath() + "/raw/");
            int totalReferences = 0;

            // try-with-resources skips null resources, so either writer may be absent;
            // both are closed even when parsing or writing fails part-way through.
            try (OutputStreamWriter trainWriter = openWriter(file2);
                 OutputStreamWriter evalWriter = openWriter(file3)) {
                System.out.println("training data under: " + file2);
                System.out.println("evaluation data under: " + file3);

                SAXParserFactory parserFactory = SAXParserFactory.newInstance();
                for (File teiFile : teiFiles) {
                    TEIReferenceSegmenterSaxParser handler = new TEIReferenceSegmenterSaxParser();
                    parserFactory.newSAXParser().parse(teiFile, handler);
                    List<String> labeledResult = handler.getLabeledResult();
                    totalReferences += handler.getTotalReferences();

                    if (!rawDir.exists()) {
                        throw new IllegalStateException("Folder " + rawDir.getAbsolutePath() + " does not exist. Please have a look!");
                    }
                    File rawFile = new File(rawDir.getAbsolutePath() + File.separator + teiFile.getName().replace(".tei.xml", ""));
                    if (!rawFile.exists()) {
                        System.out.println("Raw file " + rawFile + " does not exist. Please have a look!");
                        continue;
                    }

                    writeSample(alignRawWithLabels(rawFile, labeledResult), trainWriter, evalWriter, d);
                }
            }
            return totalReferences;
        } catch (Exception e) {
            throw new GrobidException("An exception occurred while trainining/evaluating reference segmenter model.", e);
        }
    }

    /** Opens a UTF-8 writer on {@code target}, or returns {@code null} when no target is given. */
    private static OutputStreamWriter openWriter(File target) throws IOException {
        if (target == null) {
            return null;
        }
        return new OutputStreamWriter(new FileOutputStream(target), StandardCharsets.UTF_8);
    }

    /**
     * Reads {@code rawFile} line by line and, for each line whose text before the first
     * space matches (after normalisation) the first token of a nearby labelled line,
     * emits the raw line followed by that labelled line's second token.
     *
     * <p>Raw lines that contain no space, or for which no labelled line matches within
     * the look-ahead window, produce no output.
     */
    private static String alignRawWithLabels(File rawFile, List<String> labeledResult) throws IOException {
        StringBuilder aligned = new StringBuilder();
        // Index of the first labelled line not yet consumed by a match.
        int cursor = 0;
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(rawFile), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                int firstSpace = line.indexOf(' ');
                if (firstSpace == -1) {
                    // No leading token to compare against: such a line can never match.
                    continue;
                }
                String rawToken = UnicodeUtil.normaliseTextAndRemoveSpaces(line.substring(0, firstSpace));
                for (int candidate = cursor; candidate < labeledResult.size(); candidate++) {
                    StringTokenizer tokens = new StringTokenizer(labeledResult.get(candidate), " ");
                    if (tokens.hasMoreTokens()
                            && UnicodeUtil.normaliseTextAndRemoveSpaces(tokens.nextToken()).equals(rawToken)) {
                        // NOTE(review): assumes a labelled line always carries a second token
                        // (the label); a one-token line raises NoSuchElementException here.
                        aligned.append(line).append(" ").append(tokens.nextToken()).append("\n");
                        cursor = candidate + 1;
                        break;
                    }
                    if (candidate - cursor > MAX_LOOKAHEAD) {
                        break;
                    }
                }
            }
        }
        return aligned.toString();
    }

    /**
     * Writes one document's aligned lines, followed by the document separator, to exactly
     * one of the two writers: the only available one, or a random pick when both exist.
     */
    private static void writeSample(String sample, OutputStreamWriter trainWriter,
                                    OutputStreamWriter evalWriter, double trainRatio) throws IOException {
        OutputStreamWriter target;
        if (trainWriter == null && evalWriter == null) {
            return;
        } else if (evalWriter == null) {
            target = trainWriter;
        } else if (trainWriter == null) {
            target = evalWriter;
        } else {
            target = Math.random() <= trainRatio ? trainWriter : evalWriter;
        }
        target.write(sample + DOCUMENT_SEPARATOR);
    }

    /**
     * Command-line entry point: loads the GROBID properties, runs training, prints the
     * evaluation report and exits with status 0.
     */
    public static void main(String[] strArr) throws Exception {
        GrobidProperties.getInstance();
        AbstractTrainer.runTraining(new ReferenceSegmenterTrainer());
        System.out.println(AbstractTrainer.runEvaluation(new ReferenceSegmenterTrainer()));
        System.exit(0);
    }
}
