package org.grobid.trainer.evaluation;

import com.fasterxml.jackson.core.io.JsonStringEncoder;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FilenameFilter;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathFactory;
import org.apache.commons.io.FileUtils;
import org.grobid.core.data.BibDataSet;
import org.grobid.core.data.BiblioItem;
import org.grobid.core.engines.Engine;
import org.grobid.core.exceptions.GrobidResourceException;
import org.grobid.core.factory.GrobidFactory;
import org.grobid.core.utilities.Consolidation;
import org.grobid.core.utilities.GrobidProperties;
import org.grobid.core.utilities.TextUtilities;
import org.grobid.trainer.evaluation.utilities.NamespaceContextMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;

/* loaded from: input_file:org/grobid/trainer/evaluation/EvaluationDOIMatching.class */
public class EvaluationDOIMatching {
    // Class-wide SLF4J logger.
    private static final Logger LOGGER = LoggerFactory.getLogger(EvaluationDOIMatching.class);
    // Directory holding the gold evaluation data. NOTE: this field is static but is
    // overwritten by every constructor call (and reset to null when the path does not exist),
    // so all instances share the most recently supplied path.
    private static String evaluationFilePath = null;
    // GROBID engine created in the constructor; remains null if engine creation failed.
    private Engine engine;
    // Integer identifiers for the two consolidation back-ends. Not referenced elsewhere in
    // this file; presumably kept for external callers -- TODO confirm.
    public static final int BIBLIO_GLUTTON = 0;
    public static final int CROSSREF_API = 1;
    // Similarity threshold constant; not referenced elsewhere in this file.
    public static final double minRatcliffObershelpSimilarity = 0.5d;
    // XPath expressions for NLM gold files. path_nlm_ref is evaluated from the document
    // element; the other NLM expressions are evaluated relative to one reference node.
    private static final String path_nlm_ref = "/article/back/ref-list/ref/mixed-citation";
    private static final String path_nlm_doi = "pub-id[@pub-id-type=\"doi\"]/text()";
    private static final String path_nlm_pmid = "pub-id[@pub-id-type=\"pmid\"]/text()";
    private static final String path_nlm_title = "article-title/text()";
    private static final String path_nlm_author = "person-group[@person-group-type=\"author\"]/name/surname/text()";
    private static final String path_nlm_host = "source/text()";
    private static final String path_nlm_first_page = "fpage/text()";
    private static final String path_nlm_volume = "volume/text()";
    // XPath expressions for TEI gold files (reference list and DOI only; no PMID path exists).
    private static final String path_tei_ref = "//back/div/listBibl/biblStruct";
    private static final String path_tei_doi = "idno[@type=\"doi\"]/text()";
    // Matches every run of characters that are not ASCII letters or digits; used by
    // getSignature to reduce a string to a compact alphanumeric signature.
    private Pattern pattern = Pattern.compile("[^a-zA-Z0-9]+");

    /**
     * Mutable holder for one gold bibliographical reference together with the raw reference
     * string aligned to it and the metadata fields read from the gold XML.
     * Every property starts out as {@code null}.
     */
    public class BibRefAggregated {
        /** Raw reference string aligned with this gold entry. */
        private String rawRef;
        /** Gold identifiers. */
        private String doi;
        private String pmid;
        /** Gold XML node this entry was created from. */
        private Node xml;
        /** Article title and host (journal) title. */
        private String atitle;
        private String jtitle;
        /** Surname of the first author. */
        private String firstAuthor;
        private String volume;
        private String firstPage;

        /** Creates an entry with all properties unset. */
        public BibRefAggregated() {
            super();
        }

        /** @return the raw reference string, or {@code null} if none was aligned */
        public String getRawRef() {
            return rawRef;
        }

        /** @param str the raw reference string to store */
        public void setRawRef(String str) {
            rawRef = str;
        }

        /** @return the gold DOI, or {@code null} */
        public String getDOI() {
            return doi;
        }

        /** @param str the gold DOI to store */
        public void setDOI(String str) {
            doi = str;
        }

        /** @return the gold PMID, or {@code null} */
        public String getPMID() {
            return pmid;
        }

        /** @param str the gold PMID to store */
        public void setPMID(String str) {
            pmid = str;
        }

        /** @return the gold XML node, or {@code null} */
        public Node getXML() {
            return xml;
        }

        /** @param node the gold XML node to store */
        public void setXML(Node node) {
            xml = node;
        }

        /** @return the article title, or {@code null} */
        public String getAtitle() {
            return atitle;
        }

        /** @param str the article title to store */
        public void setAtitle(String str) {
            atitle = str;
        }

        /** @return the journal/host title, or {@code null} */
        public String getJtitle() {
            return jtitle;
        }

        /** @param str the journal/host title to store */
        public void setJtitle(String str) {
            jtitle = str;
        }

        /** @return the first author surname, or {@code null} */
        public String getFirstAuthor() {
            return firstAuthor;
        }

        /** @param str the first author surname to store */
        public void setFirstAuthor(String str) {
            firstAuthor = str;
        }

        /** @return the volume, or {@code null} */
        public String getVolume() {
            return volume;
        }

        /** @param str the volume to store */
        public void setVolume(String str) {
            volume = str;
        }

        /** @return the first page, or {@code null} */
        public String getFirstPage() {
            return firstPage;
        }

        /** @param str the first page to store */
        public void setFirstPage(String str) {
            firstPage = str;
        }
    }

    /**
     * Creates an evaluator working on the given gold dataset directory and starts a GROBID
     * engine.
     *
     * <p>The path is stored in the (static) {@code evaluationFilePath}; when it is
     * {@code null} or does not exist a message is printed and the stored path is reset to
     * {@code null}. A failure while creating the engine is logged and leaves the engine
     * unset, so the instance is still constructed (best-effort, as before).
     *
     * @param str path to the directory containing the gold evaluation data
     */
    public EvaluationDOIMatching(String str) {
        this.engine = null;
        evaluationFilePath = str;
        // new File(null) would throw a NullPointerException, so test for null first
        if (str == null || !new File(str).exists()) {
            System.out.println("Path to evaluation (gold) XML data is not valid !");
            evaluationFilePath = null;
        }
        try {
            GrobidProperties.getInstance();
            LOGGER.info(">>>>>>>> GROBID_HOME=" + GrobidProperties.get_GROBID_HOME_PATH());
            this.engine = GrobidFactory.getInstance().createEngine();
        } catch (Exception e) {
            // report through the class logger rather than a bare stack trace on stderr
            LOGGER.error("Unable to initialise the GROBID engine", e);
        }
    }

    /**
     * Evaluates DOI matching over every {@code .json} dataset file found directly under the
     * evaluation directory.
     *
     * <p>Each JSON file is read as a collection of entries. For each entry the
     * {@code reference}, {@code doi}, {@code atitle}, {@code jtitle}, {@code volume} and
     * first-author fields are read; the raw reference strings are sent to
     * {@code Engine.processRawReferences} and the DOI of each returned item is compared,
     * ignoring case, with the gold DOI. Timing information is printed on standard output.
     *
     * @return a textual report containing the consolidation service name, precision, recall
     *         and f-score, or {@code "No file in dataset"} when no JSON file is available
     * @throws GrobidResourceException if the evaluation path is not set
     * @throws Exception if a dataset file cannot be parsed
     */
    public String evaluation() throws Exception {
        if (evaluationFilePath == null) {
            // same contract as buildEvaluationDataset(), instead of an NPE from new File(null)
            throw new GrobidResourceException("Path to evaluation (gold) XML data is not correctly set");
        }
        StringBuilder report = new StringBuilder();
        File[] datasetFiles = new File(evaluationFilePath).listFiles(new FilenameFilter() {
            @Override
            public boolean accept(File dir, String name) {
                return name.endsWith(".json");
            }
        });
        if (datasetFiles == null || datasetFiles.length == 0) {
            report.append("No file in dataset");
            return report.toString();
        }

        int referenceCount = 0;
        int doiFoundCount = 0;
        int doiCorrectCount = 0;
        long startMillis = System.currentTimeMillis();
        ObjectMapper mapper = new ObjectMapper();
        for (File datasetFile : datasetFiles) {
            List<String> rawReferences = new ArrayList<>();
            List<String> goldDois = new ArrayList<>();
            List<String> articleTitles = new ArrayList<>();
            List<String> journalTitles = new ArrayList<>();
            List<String> volumes = new ArrayList<>();
            List<String> firstAuthors = new ArrayList<>();

            // read the file of the current iteration (previously always the first file)
            Iterator<JsonNode> entries = mapper.readTree(datasetFile).elements();
            while (entries.hasNext()) {
                JsonNode entry = entries.next();
                rawReferences.add(readTextField(entry, "reference"));
                goldDois.add(readTextField(entry, "doi"));
                articleTitles.add(readTextField(entry, "atitle"));
                journalTitles.add(readTextField(entry, "jtitle"));
                volumes.add(readTextField(entry, "volume"));
                // buildEvaluationDataset() writes the key "firstAuthor"; "author" is kept as
                // a fallback for datasets using the key this method used to read
                String firstAuthor = readTextField(entry, "firstAuthor");
                if (firstAuthor == null) {
                    firstAuthor = readTextField(entry, "author");
                }
                firstAuthors.add(firstAuthor);
                referenceCount++;
            }

            try {
                List<BiblioItem> parsedItems = this.engine.processRawReferences(rawReferences, 2);
                for (int k = 0; k < rawReferences.size(); k++) {
                    BiblioItem parsedItem = (parsedItems != null && k < parsedItems.size()) ? parsedItems.get(k) : null;
                    if (parsedItem == null) {
                        // nothing returned for this reference: it counts as "no DOI found"
                        continue;
                    }
                    // NOTE(review): the gold fields are injected after processing, so they do
                    // not influence the DOI read below; kept to preserve existing behaviour.
                    parsedItem.setTitle(articleTitles.get(k));
                    parsedItem.setJournal(journalTitles.get(k));
                    parsedItem.setVolume(volumes.get(k));
                    parsedItem.setFirstAuthorSurname(firstAuthors.get(k));

                    String foundDoi = parsedItem.getDOI();
                    if (foundDoi != null) {
                        doiFoundCount++;
                        // equalsIgnoreCase is null-safe for the gold value and locale-independent
                        if (foundDoi.equalsIgnoreCase(goldDois.get(k))) {
                            doiCorrectCount++;
                        }
                    }
                }
            } catch (Exception e) {
                LOGGER.error("Error when processing: " + datasetFile.getPath(), e);
            }
        }

        double elapsedSeconds = (System.currentTimeMillis() - startMillis) / 1000.0d;
        System.out.println("\n\n" + referenceCount + " bibliographical references processed in " + elapsedSeconds + " seconds, " + (elapsedSeconds / referenceCount) + " seconds per bibliographical reference.");
        System.out.println("Found " + doiFoundCount + " DOI");

        report.append("\n======= ");
        GrobidProperties.getInstance();
        if (GrobidProperties.getConsolidationService() == Consolidation.GrobidConsolidationService.GLUTTON) {
            report.append("GLUTTON");
        } else {
            report.append("CROSSREF");
        }
        report.append(" API ======= \n");

        // floating-point division guarded against empty denominators: integer division would
        // truncate to 0 or 1 and throw ArithmeticException when no DOI was found
        double precision = doiFoundCount == 0 ? 0.0d : (double) doiCorrectCount / doiFoundCount;
        double recall = referenceCount == 0 ? 0.0d : (double) doiCorrectCount / referenceCount;
        double fScore = precision + recall != 0.0d ? ((2.0d * precision) * recall) / (precision + recall) : 0.0d;
        report.append("\nprecision:\t" + precision);
        report.append("\nrecall:\t" + recall);
        report.append("\nf-score:\t" + fScore + "\n");
        return report.toString();
    }

    /**
     * Returns the text value found under {@code fieldName} in {@code entry}, or {@code null}
     * when the field is missing or does not hold a text value.
     */
    private static String readTextField(JsonNode entry, String fieldName) {
        JsonNode fieldNode = entry.findPath(fieldName);
        if (fieldNode == null || fieldNode.isMissingNode()) {
            return null;
        }
        return fieldNode.textValue();
    }

    /**
     * Builds the DOI matching evaluation dataset from a directory of gold articles.
     *
     * <p>Every sub-directory of the evaluation path is expected to hold one PDF plus a gold
     * NLM ({@code .nxml}/{@code .xml}) or TEI ({@code .tei}) file; the TEI file is preferred
     * when both exist. Gold references are read from the XML, the PDF is processed with
     * {@code Engine.processReferences}, and each extracted raw reference string (with DOI
     * occurrences blanked out) is aligned with the gold references via alphanumeric
     * signatures. Aligned references that carry a DOI or PMID are written as a JSON array to
     * {@code references-doi-matching.json} in the evaluation directory. Progress and timing
     * are printed on standard output.
     *
     * @throws GrobidResourceException if the evaluation path is not set
     * @throws Exception if the XPath expressions cannot be prepared
     */
    public void buildEvaluationDataset() throws Exception {
        if (evaluationFilePath == null) {
            throw new GrobidResourceException("Path to evaluation (gold) XML data is not correctly set");
        }
        XPath xpath = XPathFactory.newInstance().newXPath();
        HashMap<String, String> namespaces = new HashMap<>();
        namespaces.put("tei", "http://www.tei-c.org/ns/1.0");
        xpath.setNamespaceContext(new NamespaceContextMap(namespaces));

        File[] articleDirs = new File(evaluationFilePath).listFiles(new FilenameFilter() {
            @Override
            public boolean accept(File dir, String name) {
                // "dir" is the parent directory, so the child itself has to be tested
                return new File(dir, name).isDirectory();
            }
        });
        if (articleDirs == null) {
            LOGGER.warn("No file in dataset");
            return;
        }

        // the field expressions are constant: compile them once for the whole run
        GoldFieldReader fieldReader = new GoldFieldReader(xpath);
        List<BibRefAggregated> datasetRefs = new ArrayList<>();
        int processedPdfCount = 0;
        int failedPdfCount = 0;
        long startMillis = System.currentTimeMillis();
        for (File articleDir : articleDirs) {
            File[] pdfFiles = listFilesBySuffix(articleDir, ".pdf", ".PDF");
            if (pdfFiles == null || pdfFiles.length == 0) {
                LOGGER.info("warning: no PDF found under " + articleDir.getPath());
                continue;
            }
            if (pdfFiles.length != 1) {
                LOGGER.warn("warning: more than one PDF found under " + articleDir.getPath());
                LOGGER.warn("processing only the first one...");
            }
            File pdfFile = pdfFiles[0];

            File[] nlmFiles = listFilesBySuffix(articleDir, ".nxml", ".xml");
            File[] teiFiles = listFilesBySuffix(articleDir, ".tei");
            File nlmFile = (nlmFiles != null && nlmFiles.length != 0) ? nlmFiles[0] : null;
            File teiFile = (teiFiles != null && teiFiles.length != 0) ? teiFiles[0] : null;
            if (nlmFile == null && teiFile == null) {
                LOGGER.warn("warning: no reference NLM or TEI file found under " + articleDir.getPath());
                continue;
            }

            boolean useTei = teiFile != null;
            List<BibRefAggregated> goldRefs = collectGoldReferences(xpath, useTei ? teiFile : nlmFile, useTei);

            try {
                LOGGER.info(processedPdfCount + " - " + pdfFile.getPath());
                List<BibDataSet> extractedRefs = this.engine.processReferences(pdfFile, 0);
                int alignedCount = alignRawReferences(extractedRefs, goldRefs, fieldReader);
                datasetRefs.addAll(goldRefs);
                System.out.println("total of " + extractedRefs.size() + " ref. bib. found by GROBID");
                System.out.println(goldRefs.size() + " DOI identified in gold");
                System.out.println("and " + alignedCount + " original reference strings identified");
            } catch (Exception e) {
                LOGGER.error("Error when processing: " + pdfFile.getPath(), e);
                failedPdfCount++;
            }
            processedPdfCount++;
        }

        File datasetFile = new File(evaluationFilePath + File.separator + "references-doi-matching.json");
        try {
            FileUtils.writeStringToFile(datasetFile, serializeDataset(datasetRefs), "UTF-8");
        } catch (Exception e) {
            LOGGER.error("Unable to write the dataset file " + datasetFile.getPath(), e);
        }

        System.out.println("GROBID failed on " + failedPdfCount + " PDF");
        double elapsedSeconds = (System.currentTimeMillis() - startMillis) / 1000.0d;
        System.out.println(processedPdfCount + " PDF files processed in " + elapsedSeconds + " seconds, " + (elapsedSeconds / processedPdfCount) + " seconds per PDF file.");
    }

    /** Lists the entries of {@code dir} whose name ends with one of {@code suffixes}; may return {@code null}. */
    private static File[] listFilesBySuffix(File dir, final String... suffixes) {
        return dir.listFiles(new FilenameFilter() {
            @Override
            public boolean accept(File parent, String name) {
                for (String suffix : suffixes) {
                    if (name.endsWith(suffix)) {
                        return true;
                    }
                }
                return false;
            }
        });
    }

    /**
     * Parses a gold NLM or TEI file and returns one entry per reference node, carrying the
     * node itself plus its DOI and (NLM only) PMID when present. Errors are logged and the
     * entries collected so far are returned.
     */
    private List<BibRefAggregated> collectGoldReferences(XPath xpath, File goldFile, boolean isTei) {
        List<BibRefAggregated> goldRefs = new ArrayList<>();
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        factory.setValidating(false);
        try {
            DocumentBuilder builder = factory.newDocumentBuilder();
            // every external entity (e.g. the DTD) is replaced by an empty document, so
            // nothing is fetched while parsing
            builder.setEntityResolver(new EntityResolver() {
                @Override
                public InputSource resolveEntity(String publicId, String systemId) {
                    return new InputSource(new ByteArrayInputStream(
                            "<?xml version=\"1.0\" encoding=\"UTF-8\"?>".getBytes(StandardCharsets.UTF_8)));
                }
            });
            Document goldDocument = builder.parse(goldFile);

            XPathExpression refExpr = xpath.compile(isTei ? path_tei_ref : path_nlm_ref);
            XPathExpression doiExpr = xpath.compile(isTei ? path_tei_doi : path_nlm_doi);
            // there is no PMID path for TEI; compiling a null expression would throw and
            // drop every TEI reference, so the PMID lookup is skipped in that case
            XPathExpression pmidExpr = isTei ? null : xpath.compile(path_nlm_pmid);

            NodeList refNodes = (NodeList) refExpr.evaluate(goldDocument.getDocumentElement(), XPathConstants.NODESET);
            for (int k = 0; k < refNodes.getLength(); k++) {
                Node refNode = refNodes.item(k);
                BibRefAggregated goldRef = new BibRefAggregated();
                goldRef.setXML(refNode);
                String doi = firstNodeValue(doiExpr, refNode);
                if (doi != null) {
                    goldRef.setDOI(doi);
                }
                if (pmidExpr != null) {
                    String pmid = firstNodeValue(pmidExpr, refNode);
                    if (pmid != null) {
                        goldRef.setPMID(pmid);
                    }
                }
                goldRefs.add(goldRef);
            }
        } catch (Exception e) {
            LOGGER.error("Error when collecting reference citations", e);
        }
        return goldRefs;
    }

    /** Returns the value of the first node selected by {@code expression} under {@code context}, or {@code null}. */
    private static String firstNodeValue(XPathExpression expression, Node context) throws Exception {
        NodeList nodes = (NodeList) expression.evaluate(context, XPathConstants.NODESET);
        if (nodes == null || nodes.getLength() == 0) {
            return null;
        }
        return nodes.item(0).getNodeValue();
    }

    /**
     * Assigns extracted raw reference strings to the gold references they match and returns
     * the number of assignments. Only gold references that have a DOI and no raw string yet
     * are considered.
     */
    private int alignRawReferences(List<BibDataSet> extractedRefs, List<BibRefAggregated> goldRefs,
                                   GoldFieldReader fieldReader) throws Exception {
        // read the gold fields once per gold reference instead of once per (raw, gold) pair
        List<GoldReferenceFields> goldFields = new ArrayList<>(goldRefs.size());
        for (BibRefAggregated goldRef : goldRefs) {
            boolean candidate = goldRef.getDOI() != null && goldRef.getXML() != null;
            goldFields.add(candidate ? fieldReader.read(goldRef.getXML()) : null);
        }

        int alignedCount = 0;
        for (BibDataSet extractedRef : extractedRefs) {
            String rawBib = extractedRef.getRawBib();
            if (rawBib == null) {
                continue;
            }
            // DOI occurrences are blanked out of the raw string before it is stored
            String rawWithoutDoi = TextUtilities.DOIPattern.matcher(rawBib).replaceAll(" ");
            String rawSignature = getSignature(rawWithoutDoi);
            for (int k = 0; k < goldRefs.size(); k++) {
                BibRefAggregated goldRef = goldRefs.get(k);
                GoldReferenceFields fields = goldFields.get(k);
                if (fields == null || goldRef.getRawRef() != null) {
                    continue;
                }
                if (fields.matches(rawSignature)) {
                    goldRef.setRawRef(rawWithoutDoi);
                    goldRef.setFirstPage(fields.firstPage);
                    goldRef.setVolume(fields.volume);
                    goldRef.setAtitle(fields.title);
                    goldRef.setJtitle(fields.host);
                    goldRef.setFirstAuthor(fields.firstAuthor);
                    alignedCount++;
                }
            }
        }
        return alignedCount;
    }

    /** Serialises the aligned references that carry a DOI or PMID as a JSON array. */
    private static String serializeDataset(List<BibRefAggregated> refs) {
        JsonStringEncoder encoder = JsonStringEncoder.getInstance();
        StringBuilder json = new StringBuilder();
        json.append("[\n");
        boolean first = true;
        for (BibRefAggregated ref : refs) {
            if (ref.getRawRef() == null || (ref.getDOI() == null && ref.getPMID() == null)) {
                continue;
            }
            if (first) {
                first = false;
            } else {
                json.append(",\n");
            }
            json.append("{");
            json.append("\"reference\": \"").append(quoteJson(encoder, ref.getRawRef())).append("\"");
            appendJsonStringField(json, encoder, "doi", ref.getDOI());
            appendJsonStringField(json, encoder, "pmid", ref.getPMID());
            appendJsonStringField(json, encoder, "atitle", ref.getAtitle());
            appendJsonStringField(json, encoder, "firstAuthor", ref.getFirstAuthor());
            appendJsonStringField(json, encoder, "jtitle", ref.getJtitle());
            appendJsonStringField(json, encoder, "volume", ref.getVolume());
            appendJsonStringField(json, encoder, "firstPage", ref.getFirstPage());
            json.append("}");
        }
        json.append("]");
        return json.toString();
    }

    /** Appends {@code , "key": "value"} to {@code json} when {@code value} is not {@code null}. */
    private static void appendJsonStringField(StringBuilder json, JsonStringEncoder encoder, String key, String value) {
        if (value != null) {
            json.append(", \"").append(key).append("\": \"").append(quoteJson(encoder, value)).append("\"");
        }
    }

    /** JSON-escapes {@code value}; the UTF-8 bytes are decoded as UTF-8, not with the platform charset. */
    private static String quoteJson(JsonStringEncoder encoder, String value) {
        return new String(encoder.quoteAsUTF8(value), StandardCharsets.UTF_8);
    }

    /** Returns true when {@code needle} is a non-empty signature contained in {@code haystack}. */
    private static boolean containsSignature(String haystack, String needle) {
        // an empty needle is "found" at index 0 by indexOf and would match every reference
        return needle != null && needle.length() > 0 && haystack.indexOf(needle) != -1;
    }

    /** Field values of one gold reference together with their alphanumeric signatures. */
    private static final class GoldReferenceFields {
        private String title;
        private String firstAuthor;
        private String host;
        private String firstPage;
        private String volume;
        private String titleSignature;
        private String authorsSignature;
        private String hostSignature;
        private String firstPageSignature;
        private String volumeSignature;

        /**
         * A raw reference matches when it contains the title, or when it contains the
         * author surnames and the host title plus either the first page or the volume.
         */
        private boolean matches(String rawSignature) {
            if (containsSignature(rawSignature, titleSignature)) {
                return true;
            }
            return containsSignature(rawSignature, authorsSignature)
                    && containsSignature(rawSignature, hostSignature)
                    && (containsSignature(rawSignature, firstPageSignature)
                        || containsSignature(rawSignature, volumeSignature));
        }
    }

    /** Reads the title, author, host, first page and volume fields of a gold reference node. */
    private final class GoldFieldReader {
        private final XPathExpression titleExpr;
        private final XPathExpression authorExpr;
        private final XPathExpression hostExpr;
        private final XPathExpression firstPageExpr;
        private final XPathExpression volumeExpr;

        private GoldFieldReader(XPath xpath) throws Exception {
            this.titleExpr = xpath.compile(path_nlm_title);
            this.authorExpr = xpath.compile(path_nlm_author);
            this.hostExpr = xpath.compile(path_nlm_host);
            this.firstPageExpr = xpath.compile(path_nlm_first_page);
            this.volumeExpr = xpath.compile(path_nlm_volume);
        }

        private GoldReferenceFields read(Node refNode) throws Exception {
            GoldReferenceFields fields = new GoldReferenceFields();
            fields.title = firstNodeValue(titleExpr, refNode);
            fields.host = firstNodeValue(hostExpr, refNode);
            fields.firstPage = firstNodeValue(firstPageExpr, refNode);
            fields.volume = firstNodeValue(volumeExpr, refNode);

            // all surnames are concatenated for the signature; only the first one is stored
            String allSurnames = null;
            NodeList surnameNodes = (NodeList) authorExpr.evaluate(refNode, XPathConstants.NODESET);
            if (surnameNodes != null && surnameNodes.getLength() > 0) {
                fields.firstAuthor = surnameNodes.item(0).getNodeValue();
                StringBuilder surnames = new StringBuilder();
                for (int k = 0; k < surnameNodes.getLength(); k++) {
                    surnames.append(surnameNodes.item(k).getNodeValue());
                }
                allSurnames = surnames.toString();
            }

            fields.titleSignature = getSignature(fields.title);
            fields.authorsSignature = getSignature(allSurnames);
            fields.hostSignature = getSignature(fields.host);
            fields.firstPageSignature = getSignature(fields.firstPage);
            fields.volumeSignature = getSignature(fields.volume);
            return fields;
        }
    }

    /**
     * Reduces a string to a compact signature: lower-cased, decomposed with Unicode NFD, and
     * stripped of every character that is not an ASCII letter or digit.
     *
     * @param str the text to reduce, may be {@code null}
     * @return the signature (possibly empty), or {@code null} when {@code str} is {@code null}
     */
    private String getSignature(String str) {
        if (str == null) {
            return null;
        }
        // Locale.ROOT keeps the result independent of the JVM default locale (a Turkish
        // locale would lower-case "I" to a dotless i, which is then dropped)
        String decomposed = Normalizer.normalize(str.toLowerCase(Locale.ROOT), Normalizer.Form.NFD);
        // the pattern removes everything outside [a-zA-Z0-9], which already covers all
        // non-ASCII characters, so no separate non-ASCII removal pass is needed
        return this.pattern.matcher(decomposed).replaceAll("");
    }

    /**
     * Command line entry point.
     *
     * <p>Expects exactly two arguments: the action ({@code data} to build the evaluation
     * dataset, {@code eval} to run the evaluation) and the path to the gold evaluation
     * directory. Invalid arguments are reported on standard error and the method returns;
     * otherwise the action is run and the JVM exits with status 0.
     *
     * @param strArr command line arguments: action and dataset path
     */
    public static void main(String[] strArr) {
        // both arguments are read below, so anything other than exactly two is a usage error
        // (a single argument used to fail with ArrayIndexOutOfBoundsException)
        if (strArr.length != 2) {
            System.err.println("command parameters: action[data|eval] [path to the (gold) evaluation dataset]");
            return;
        }
        String action = strArr[0];
        boolean buildData = "data".equals(action);
        boolean runEval = "eval".equals(action);
        if (!buildData && !runEval) {
            System.err.println("Action to be performed not correctly set, should be [data|eval]");
            return;
        }
        String datasetPath = strArr[1];
        if (datasetPath == null || datasetPath.length() == 0) {
            System.err.println("Path to evaluation (gold) XML data is not correctly set");
            return;
        }
        File datasetDir = new File(datasetPath);
        if (!datasetDir.exists()) {
            System.err.println("Path to evaluation (gold) XML data does not exist");
            return;
        }
        if (!datasetDir.isDirectory()) {
            System.err.println("Path to evaluation (gold) XML data is not a directory");
            return;
        }
        try {
            EvaluationDOIMatching evaluator = new EvaluationDOIMatching(datasetPath);
            if (buildData) {
                evaluator.buildEvaluationDataset();
            } else {
                System.out.println(evaluator.evaluation());
            }
            System.out.println(Engine.getCntManager());
        } catch (Exception e) {
            LOGGER.error("Error while running action " + action, e);
        }
        // explicit exit; presumably needed because the engine leaves non-daemon threads
        // running -- TODO confirm
        System.exit(0);
    }
}
