package org.grobid.trainer;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.List;
import java.util.StringTokenizer;
import javax.xml.parsers.SAXParserFactory;
import org.grobid.core.GrobidModels;
import org.grobid.core.utilities.GrobidProperties;
import org.grobid.core.utilities.UnicodeUtil;
import org.grobid.trainer.sax.TEIFulltextSaxParser;

/* loaded from: input_file:org/grobid/trainer/FulltextTrainer.class */
public class FulltextTrainer extends AbstractTrainer {
    public FulltextTrainer() {
        super(GrobidModels.FULLTEXT);
        this.epsilon = 1.0E-4d;
        this.window = 20;
        this.nbMaxIterations = 1500;
    }

    @Override // org.grobid.trainer.AbstractTrainer, org.grobid.trainer.Trainer
    public int createCRFPPData(File file, File file2) {
        return addFeaturesFulltext(file.getAbsolutePath() + "/tei", file + "/raw", file2, null, 1.0d);
    }

    @Override // org.grobid.trainer.Trainer
    public int createCRFPPData(File file, File file2, File file3, double d) {
        return addFeaturesFulltext(file.getAbsolutePath() + "/tei", file.getAbsolutePath() + "/raw", file2, file3, d);
    }

    public int addFeaturesFulltext(String str, String str2, File file, File file2, double d) {
        File[] listFiles;
        int i = 0;
        try {
            System.out.println("sourceTEIPathLabel: " + str);
            System.out.println("sourceRawPathLabel: " + str2);
            System.out.println("trainingOutputPath: " + file);
            System.out.println("evalOutputPath: " + file2);
            listFiles = new File(str).listFiles(new FilenameFilter() { // from class: org.grobid.trainer.FulltextTrainer.1
                @Override // java.io.FilenameFilter
                public boolean accept(File file3, String str3) {
                    return str3.endsWith(".tei.xml");
                }
            });
        } catch (Exception e) {
            LOGGER.error("An exception occured while running Grobid.", e);
        }
        if (listFiles == null) {
            return 0;
        }
        System.out.println(listFiles.length + " tei files");
        FileOutputStream fileOutputStream = null;
        OutputStreamWriter outputStreamWriter = null;
        if (file != null) {
            fileOutputStream = new FileOutputStream(file);
            outputStreamWriter = new OutputStreamWriter(fileOutputStream, "UTF8");
        }
        FileOutputStream fileOutputStream2 = null;
        OutputStreamWriter outputStreamWriter2 = null;
        if (file2 != null) {
            fileOutputStream2 = new FileOutputStream(file2);
            outputStreamWriter2 = new OutputStreamWriter(fileOutputStream2, "UTF8");
        }
        SAXParserFactory newInstance = SAXParserFactory.newInstance();
        for (File file3 : listFiles) {
            String name = file3.getName();
            LOGGER.info("Processing: " + name);
            TEIFulltextSaxParser tEIFulltextSaxParser = new TEIFulltextSaxParser();
            newInstance.newSAXParser().parse(file3, tEIFulltextSaxParser);
            List<String> labeledResult = tEIFulltextSaxParser.getLabeledResult();
            try {
                File file4 = new File(str2 + File.separator + name.replace(".tei.xml", ""));
                if (file4.exists()) {
                    BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream(file4), "UTF8"));
                    int i2 = 0;
                    StringBuilder sb = new StringBuilder();
                    int i3 = 0;
                    String str3 = null;
                    int i4 = 0;
                    while (true) {
                        String readLine = bufferedReader.readLine();
                        if (readLine == null) {
                            break;
                        }
                        if (readLine.trim().length() != 0) {
                            i3++;
                            int indexOf = readLine.indexOf(32);
                            String normaliseTextAndRemoveSpaces = indexOf != -1 ? UnicodeUtil.normaliseTextAndRemoveSpaces(readLine.substring(0, indexOf)) : null;
                            int i5 = i2;
                            while (true) {
                                if (i5 >= labeledResult.size()) {
                                    break;
                                }
                                String str4 = labeledResult.get(i5);
                                StringTokenizer stringTokenizer = new StringTokenizer(str4, " ");
                                if (stringTokenizer.hasMoreTokens() && UnicodeUtil.normaliseTextAndRemoveSpaces(stringTokenizer.nextToken()).equals(normaliseTextAndRemoveSpaces)) {
                                    String nextToken = stringTokenizer.nextToken();
                                    sb.append(readLine).append(" ").append(nextToken);
                                    str3 = nextToken;
                                    i2 = i5 + 1;
                                    i4 = 0;
                                    break;
                                }
                                if (i5 - i2 > 5) {
                                    LOGGER.warn(name + " / Fulltext trainer: TEI and raw file unsynchronized at raw line " + i3 + " : " + str4);
                                    i4++;
                                    if (str3 != null) {
                                        sb.append(readLine).append(" ").append(str3);
                                    }
                                } else {
                                    i5++;
                                }
                            }
                            if (i4 > 20) {
                                break;
                            }
                        }
                    }
                    bufferedReader.close();
                    if (i4 < 10) {
                        if (outputStreamWriter == null && outputStreamWriter2 != null) {
                            outputStreamWriter2.write(sb.toString() + "\n");
                        }
                        if (outputStreamWriter != null && outputStreamWriter2 == null) {
                            outputStreamWriter.write(sb.toString() + "\n");
                        } else if (Math.random() <= d) {
                            outputStreamWriter.write(sb.toString() + "\n");
                        } else {
                            outputStreamWriter2.write(sb.toString() + "\n");
                        }
                        i++;
                    } else {
                        LOGGER.error(name + " / too many synchronization issues, file not used in training data and to be fixed!");
                    }
                } else {
                    LOGGER.error("The raw file does not exist: " + file4.getPath());
                }
            } catch (Exception e2) {
                LOGGER.error("Fail to open or process raw file", e2);
            }
        }
        if (outputStreamWriter != null) {
            outputStreamWriter.close();
            fileOutputStream.close();
        }
        if (outputStreamWriter2 != null) {
            outputStreamWriter2.close();
            fileOutputStream2.close();
        }
        return i;
    }

    public static void main(String[] strArr) throws Exception {
        GrobidProperties.getInstance();
        AbstractTrainer.runTraining(new FulltextTrainer());
        System.out.println(AbstractTrainer.runEvaluation(new FulltextTrainer()));
        System.exit(0);
    }
}
