package org.grobid.trainer.sax;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.StringTokenizer;
import org.grobid.core.exceptions.GrobidException;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/* loaded from: input_file:org/grobid/trainer/sax/WikiTextExtractSaxParser.class */
/**
 * SAX handler that extracts cleaned text lines from the {@code text} elements of an XML document
 * and writes them to a sequence of UTF-8 files named {@code <path>text-<n>.txt}.
 *
 * <p>A new output file is started on the first {@code page} element carrying an {@code id}
 * attribute, and again each time the page counter has exceeded {@link #PAGE_ROTATION_THRESHOLD}.
 * Each line of a {@code text} element is stripped of bracketed spans ({@code [...]} and
 * {@code [[...]]}), of leading list/punctuation markers, and is discarded entirely when it
 * matches one of the exclusion rules in {@link #cleanLine(String)}.
 *
 * <p>I/O failures are reported as {@link GrobidException}, as in the rest of this class.
 * The handler keeps mutable parsing state and must not be shared between concurrent parses.
 */
public class WikiTextExtractSaxParser extends DefaultHandler {
    /** Message used for every wrapped I/O failure (kept verbatim for compatibility). */
    private static final String FAILURE_MESSAGE = "An exception occured while running Grobid.";

    /** Once the page counter is greater than this value, the next page starts a new file. */
    private static final int PAGE_ROTATION_THRESHOLD = 4000;

    /** Maximum number of leading marker characters removed from a line. */
    private static final int MAX_PREFIX_STRIPS = 5;

    /** Cleaned lines starting with any of these prefixes are not written. */
    private static final String[] REJECTED_PREFIXES = {
        "Help", "NONE", "beg", ": See also", ": \"See also", ":See also", "Wiktionary", "subgroup"
    };

    /** Character data of the {@code text} element currently being read. */
    private final StringBuilder accumulator = new StringBuilder();

    /** Value of the {@code id} attribute of the current page; reset after each text element. */
    private String PageID = null;

    /** Currently unused; retained from the original class layout. */
    private String lang = null;

    /** Writer on the current output file, or {@code null} when no file is open. */
    private Writer writer = null;

    /** True while the parser is inside a {@code text} element. */
    private boolean textBegin = false;

    /** Number of page ids counted since the current output file was opened. */
    private int page = 0;

    /** Prefix prepended to every output file name; may be {@code null}. */
    private final String path;

    /** Index used in the name of the next output file. */
    private int fileCount = 0;

    /** Creates a handler that writes its output files without any path prefix. */
    public WikiTextExtractSaxParser() {
        this(null);
    }

    /**
     * Creates a handler writing to files named {@code str + "text-" + n + ".txt"}.
     *
     * @param str prefix for output file names (typically a directory path ending with a
     *     separator); {@code null} is treated as an empty prefix
     */
    public WikiTextExtractSaxParser(String str) {
        this.path = str;
    }

    /** Accumulates character data, but only while inside a {@code text} element. */
    @Override
    public void characters(char[] cArr, int i, int i2) {
        if (this.textBegin) {
            this.accumulator.append(cArr, i, i2);
        }
    }

    /**
     * On the end of a {@code text} element, cleans every accumulated line and writes the
     * surviving ones to the current output file. Other elements are ignored.
     *
     * @throws GrobidException if writing to the output file fails
     */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        if (!"text".equals(qName)) {
            return;
        }
        this.textBegin = false;
        StringTokenizer lines = new StringTokenizer(this.accumulator.toString(), "\n");
        while (lines.hasMoreTokens()) {
            String cleaned = cleanLine(lines.nextToken());
            if (cleaned != null) {
                writeLine(cleaned);
            }
        }
        this.PageID = null;
    }

    /**
     * Starts accumulating on {@code text} elements; on {@code page} elements, records the
     * {@code id} attribute, advances the page counter and rotates the output file when needed.
     *
     * @throws GrobidException if the output file cannot be closed or opened
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes)
            throws SAXException {
        if ("text".equals(qName)) {
            this.textBegin = true;
            this.accumulator.setLength(0);
            return;
        }
        if (!"page".equals(qName)) {
            return;
        }
        int count = attributes.getLength();
        for (int index = 0; index < count; index++) {
            if (!"id".equals(attributes.getQName(index))) {
                continue;
            }
            this.PageID = attributes.getValue(index);
            if (this.page > PAGE_ROTATION_THRESHOLD) {
                this.page = 0;
            }
            if (this.page == 0) {
                try {
                    openNextOutputFile();
                } catch (IOException e) {
                    throw new GrobidException(FAILURE_MESSAGE, e);
                }
            }
            this.page++;
        }
    }

    /**
     * Closes the last output file so that its handle is released when parsing completes.
     *
     * @throws GrobidException if closing the output file fails
     */
    @Override
    public void endDocument() throws SAXException {
        try {
            closeWriter();
        } catch (IOException e) {
            throw new GrobidException(FAILURE_MESSAGE, e);
        }
    }

    /**
     * Applies the line-level filters and clean-up.
     *
     * @return the cleaned line, or {@code null} when the line must be skipped
     */
    private static String cleanLine(String line) {
        if (line.length() == 0
                || line.startsWith("__")
                || line.startsWith("PMID")
                || line.startsWith("#")) {
            return null;
        }
        String text = stripBracketedSpans(line).trim();
        if (text.indexOf('|') != -1 || text.startsWith("poly")) {
            return null;
        }
        // Remove at most MAX_PREFIX_STRIPS leading marker characters, re-trimming each time.
        for (int pass = 0; pass < MAX_PREFIX_STRIPS && startsWithMarker(text); pass++) {
            text = text.substring(1).trim();
        }
        if (text.length() == 0) {
            return null;
        }
        for (String prefix : REJECTED_PREFIXES) {
            if (text.startsWith(prefix)) {
                return null;
            }
        }
        return text;
    }

    /** Tells whether the text begins with one of the marker characters {@code . * : " ;}. */
    private static boolean startsWithMarker(String text) {
        if (text.length() == 0) {
            return false;
        }
        char first = text.charAt(0);
        return first == '.' || first == '*' || first == ':' || first == '"' || first == ';';
    }

    /**
     * Removes every span running from a {@code [} to the next {@code ]} (a directly following
     * second {@code ]} is removed as well, so {@code [[...]]} disappears completely).
     * An opening bracket without a closing one is dropped and the text after it is kept.
     */
    private static String stripBracketedSpans(String line) {
        int length = line.length();
        StringBuilder kept = new StringBuilder(length);
        int from = 0;
        while (from < length) {
            int open = line.indexOf('[', from);
            if (open == -1) {
                kept.append(line, from, length);
                break;
            }
            kept.append(line, from, open);
            // Search for the closing bracket after the opening one, never before it.
            int close = line.indexOf(']', open + 1);
            if (close == -1) {
                kept.append(line, open + 1, length);
                break;
            }
            from = close + 1;
            // Only swallow the next character when it really is the second bracket of "]]".
            if (from < length && line.charAt(from) == ']') {
                from++;
            }
        }
        return kept.toString();
    }

    /** Writes one line followed by a newline and flushes, opening the first file if needed. */
    private void writeLine(String line) {
        try {
            if (this.writer == null) {
                // Text encountered before any page id was seen: start the first output file.
                openNextOutputFile();
            }
            this.writer.write(line);
            this.writer.write("\n");
            this.writer.flush();
        } catch (IOException e) {
            throw new GrobidException(FAILURE_MESSAGE, e);
        }
    }

    /** Closes the current output file, if any, and opens the next numbered one. */
    private void openNextOutputFile() throws IOException {
        closeWriter();
        // Concatenating a null path would produce the literal prefix "null".
        String prefix = (this.path == null) ? "" : this.path;
        String fileName = prefix + "text-" + this.fileCount + ".txt";
        System.out.println(fileName);
        this.writer = new OutputStreamWriter(new FileOutputStream(new File(fileName), false), "UTF-8");
        this.fileCount++;
    }

    /** Closes the current writer, if any, and forgets it even when closing fails. */
    private void closeWriter() throws IOException {
        Writer current = this.writer;
        this.writer = null;
        if (current != null) {
            current.close();
        }
    }
}
