package org.grobid.core.engines;

import com.google.common.base.Function;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.SortedSet;
import org.apache.commons.lang3.tuple.Pair;
import org.grobid.core.GrobidModel;
import org.grobid.core.GrobidModels;
import org.grobid.core.document.Document;
import org.grobid.core.document.DocumentPiece;
import org.grobid.core.document.DocumentPointer;
import org.grobid.core.engines.citations.LabeledReferenceResult;
import org.grobid.core.engines.citations.ReferenceSegmenter;
import org.grobid.core.engines.label.SegmentationLabels;
import org.grobid.core.engines.label.TaggingLabels;
import org.grobid.core.engines.tagging.GenericTaggerUtils;
import org.grobid.core.exceptions.GrobidException;
import org.grobid.core.features.FeatureFactory;
import org.grobid.core.features.FeaturesVectorReferenceSegmenter;
import org.grobid.core.layout.Block;
import org.grobid.core.layout.LayoutToken;
import org.grobid.core.tokenization.LabeledTokensContainer;
import org.grobid.core.tokenization.TaggingTokenSynchronizer;
import org.grobid.core.utilities.BoundingBoxCalculator;
import org.grobid.core.utilities.TextUtilities;
import org.grobid.core.utilities.Triple;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/grobid/core/engines/ReferenceSegmenterParser.class */
public class ReferenceSegmenterParser extends AbstractParser implements ReferenceSegmenter {
    private static final Logger LOGGER = LoggerFactory.getLogger(ReferenceSegmenterParser.class);
    private static final int LINESCALE = 10;

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Creates the parser, passing {@link GrobidModels#REFERENCE_SEGMENTER} to the
     * {@link AbstractParser} constructor.
     */
    public ReferenceSegmenterParser() {
        super(GrobidModels.REFERENCE_SEGMENTER);
    }

    /**
     * Segments raw text by wrapping it into a document created with
     * {@link Document#createFromText(String)} and treating the whole token sequence as one
     * document piece (from token 0 to the last token).
     *
     * @param str text to segment
     * @return whatever {@link #extract(Document, SortedSet, boolean)} returns for that single
     *         piece, called with {@code false} as its boolean argument
     */
    @Override // org.grobid.core.engines.citations.ReferenceSegmenter
    public List<LabeledReferenceResult> extract(String str) {
        Document textDocument = Document.createFromText(str);
        int lastTokenIndex = textDocument.getTokenizations().size() - 1;

        DocumentPointer firstToken = new DocumentPointer(0, 0, 0);
        DocumentPointer lastToken = new DocumentPointer(0, lastTokenIndex, lastTokenIndex);
        DocumentPiece wholeText = new DocumentPiece(firstToken, lastToken);

        SortedSet<DocumentPiece> pieces = Sets.newTreeSet(Collections.singletonList(wholeText));
        return extract(textDocument, pieces, false);
    }

    /**
     * Segments the references of a document; shorthand for
     * {@link #extract(Document, boolean)} with {@code false} as the boolean argument.
     *
     * @param document document to process
     * @return the result of the delegate call
     */
    @Override // org.grobid.core.engines.citations.ReferenceSegmenter
    public List<LabeledReferenceResult> extract(Document document) {
        return extract(document, false);
    }

    /**
     * Segments the part of the document registered under {@link SegmentationLabels#REFERENCES}.
     *
     * @param document document to process
     * @param z        flag forwarded unchanged to {@link #extract(Document, SortedSet, boolean)}
     * @return the result of the delegate call
     */
    public List<LabeledReferenceResult> extract(Document document, boolean z) {
        SortedSet<DocumentPiece> referencesPart = document.getDocumentPart(SegmentationLabels.REFERENCES);
        return extract(document, referencesPart, z);
    }

    /**
     * Computes the feature vectors for the given document pieces, labels them and converts the
     * labelled sequence into reference results.
     *
     * @param document  document owning the pieces
     * @param sortedSet pieces of the document to segment
     * @param z         flag forwarded to {@code GenericTaggerUtils.getTokensWithLabelsAndFeatures}
     * @return the labelled references, or {@code null} when no features could be built or the
     *         labelling step returned {@code null}
     * @throws GrobidException when labelling or result extraction throws any exception
     */
    public List<LabeledReferenceResult> extract(Document document, SortedSet<DocumentPiece> sortedSet, boolean z) {
        Pair<String, List<LayoutToken>> featured = getReferencesSectionFeatured(document, sortedSet);
        if (featured == null) {
            return null;
        }

        String featureVectors = featured.getLeft();
        List<LayoutToken> sectionTokens = featured.getRight();

        List<LabeledReferenceResult> results = null;
        try {
            String labelled = label(featureVectors);
            if (labelled != null) {
                List<Triple<String, String, String>> labelledTokens =
                        GenericTaggerUtils.getTokensWithLabelsAndFeatures(labelled, z);
                results = getExtractionResult(sectionTokens, labelledTokens);
            }
        } catch (Exception e) {
            throw new GrobidException("CRF labeling in ReferenceSegmenter fails.", e);
        }
        return results;
    }

    /**
     * Walks the labelled tokens in step with the layout tokens and groups them into reference
     * results, each made of an optional label text, the reference text, its layout tokens, the
     * accumulated feature strings and the bounding boxes of those tokens.
     *
     * @param list  layout tokens of the processed section
     * @param list2 labelled tokens handed to the {@link TaggingTokenSynchronizer}
     * @return the reference results in reading order; one result is always emitted when the last
     *         token is reached
     */
    private List<LabeledReferenceResult> getExtractionResult(List<LayoutToken> list, List<Triple<String, String, String>> list2) {
        List<LabeledReferenceResult> results = new ArrayList<LabeledReferenceResult>();
        StringBuilder referenceText = new StringBuilder();
        List<LayoutToken> referenceTokens = new ArrayList<LayoutToken>();
        StringBuilder features = new StringBuilder();
        StringBuilder referenceLabel = new StringBuilder();

        TaggingTokenSynchronizer synchronizer = new TaggingTokenSynchronizer((GrobidModel) null, list2, list);
        Iterator<LabeledTokensContainer> containers = synchronizer.iterator();
        while (containers.hasNext()) {
            LabeledTokensContainer container = containers.next();
            String token = container.getToken();
            String plainLabel = container.getPlainLabel();

            if (TaggingLabels.LABEL_LABEL.equals(plainLabel)) {
                flushReferenceOnBeginning(container, features, referenceLabel, referenceText, referenceTokens, results);
                referenceLabel.append(token);
                if (container.isTrailingSpace() || container.isTrailingNewLine()) {
                    referenceLabel.append(' ');
                }
            } else if (plainLabel.equals(TaggingLabels.REFERENCE_LABEL)) {
                flushReferenceOnBeginning(container, features, referenceLabel, referenceText, referenceTokens, results);
                referenceText.append(token);
                if (container.isTrailingSpace()) {
                    referenceText.append(' ');
                }
                if (container.isTrailingNewLine()) {
                    referenceText.append('\n');
                }
                referenceTokens.addAll(container.getLayoutTokens());
            }
            // Tokens carrying any other label contribute nothing to the output.

            if (!containers.hasNext()) {
                // Last token: emit whatever has been accumulated so far.
                String labelText = referenceLabel.length() == 0 ? null : referenceLabel.toString().trim();
                results.add(new LabeledReferenceResult(labelText, referenceText.toString().trim(), referenceTokens,
                        features.toString(), BoundingBoxCalculator.calculate(referenceTokens)));
                referenceText.setLength(0);
                referenceLabel.setLength(0);
            }
        }
        return results;
    }

    /**
     * Records the feature string of {@code container} and, when the container begins a new
     * labelled segment while reference text is pending, emits that pending reference and clears
     * all accumulators.
     */
    private static void flushReferenceOnBeginning(LabeledTokensContainer container,
                                                  StringBuilder features,
                                                  StringBuilder referenceLabel,
                                                  StringBuilder referenceText,
                                                  List<LayoutToken> referenceTokens,
                                                  List<LabeledReferenceResult> results) {
        features.append(container.getFeatureString());
        features.append('\n');
        if (container.isBeginning() && referenceText.length() != 0) {
            String labelText = referenceLabel.length() == 0 ? null : referenceLabel.toString().trim();
            results.add(new LabeledReferenceResult(labelText, referenceText.toString().trim(),
                    Lists.newArrayList(referenceTokens), features.toString(),
                    BoundingBoxCalculator.calculate(referenceTokens)));
            referenceText.setLength(0);
            referenceLabel.setLength(0);
            features.setLength(0);
            referenceTokens.clear();
        }
    }

    /**
     * Produces training material for the reference segmenter from the part of the document
     * registered under {@link SegmentationLabels#REFERENCES}: the section is featured, labelled
     * with the current model, and the labelled tokens are serialised as a TEI {@code listBibl}
     * in which each reference is wrapped in {@code <bibl>} and each reference label in
     * {@code <label>}.
     *
     * <p>Labelled tokens are re-aligned with the layout tokens so that spaces and line breaks
     * (emitted as {@code <lb/>}) skipped by the labeller are restored in the TEI output.
     *
     * @param document document to process
     * @param i        identifier written into the {@code xml:id} of the TEI {@code fileDesc}
     * @return pair of (TEI string, feature vectors fed to the model), or {@code null} when the
     *         section could not be featured or the labelling returned {@code null}
     * @throws GrobidException when labelling or serialisation throws any exception
     */
    public Pair<String, String> createTrainingData(Document document, int i) {
        Pair<String, List<LayoutToken>> featured =
                getReferencesSectionFeatured(document, document.getDocumentPart(SegmentationLabels.REFERENCES));
        if (featured == null) {
            return null;
        }
        String featureVectors = featured.getLeft();
        List<LayoutToken> tokenizations = featured.getRight();
        try {
            String labelled = label(featureVectors);
            if (labelled == null) {
                return null;
            }
            List<Pair<String, String>> tokensAndLabels = GenericTaggerUtils.getTokensAndLabels(labelled);

            StringBuilder tei = new StringBuilder();
            tei.append("<tei xml:space=\"preserve\">\n    <teiHeader>\n        <fileDesc xml:id=\"_")
                    .append(i)
                    .append("\"/>\n    </teiHeader>\n    <text xml:lang=\"en\">\n        <listBibl>\n");

            int nextTokenIndex = 0;      // first layout token not yet matched to a labelled token
            boolean addSpace = false;    // a space was skipped before the current token
            boolean addEol = false;      // a line break was skipped before the current token
            String previousLabel = null; // plain label of the previous labelled token
            boolean biblOpen = false;    // a <bibl> element is currently open

            for (Pair<String, String> tokenAndLabel : tokensAndLabels) {
                String token = tokenAndLabel.getLeft();
                String rawLabel = tokenAndLabel.getRight();

                // Skip the whitespace layout tokens preceding the labelled token. Each test reads
                // the token under the moving cursor: testing the fixed start index for "\r" would
                // swallow every remaining token whenever the start token is a carriage return.
                int cursor = nextTokenIndex;
                while (cursor < tokenizations.size()) {
                    String layoutText = tokenizations.get(cursor).t();
                    if (layoutText.equals(" ")) {
                        addSpace = true;
                    } else if (layoutText.equals("\n") || layoutText.equals("\r")) {
                        addEol = true;
                    } else {
                        break;
                    }
                    cursor++;
                }

                if (cursor >= tokenizations.size()) {
                    LOGGER.error("Implementation error: Reached the end of tokenizations, but current token is {}", token);
                    addSpace = true;
                } else {
                    String layoutText = tokenizations.get(cursor).getText();
                    if (!layoutText.equals(token)) {
                        addSpace = true;
                        if (!token.startsWith(layoutText)) {
                            // Try to resynchronise on one of the next three layout tokens, without
                            // reading past the end of the list.
                            int resynced = -1;
                            for (int ahead = 1; ahead <= 3 && cursor + ahead < tokenizations.size(); ahead++) {
                                if (token.equals(tokenizations.get(cursor + ahead).getText())) {
                                    resynced = cursor + ahead;
                                    break;
                                }
                            }
                            if (resynced >= 0) {
                                cursor = resynced;
                            } else {
                                LOGGER.error("Implementation error, tokens out of sync: {} != {}, at position {}",
                                        layoutText, token, cursor);
                            }
                        }
                    }
                }

                String plainLabel = GenericTaggerUtils.getPlainLabel(rawLabel);

                boolean closed = previousLabel != null && testClosingTag(tei, rawLabel, previousLabel, addSpace, addEol);
                if (closed) {
                    // The pending whitespace has been written together with the closing tag.
                    addSpace = false;
                    addEol = false;
                    if (previousLabel.equals(TaggingLabels.REFERENCE_LABEL)) {
                        biblOpen = false;
                    }
                }

                String field = writeField(rawLabel, previousLabel, token, TaggingLabels.LABEL_LABEL,
                        biblOpen ? TaggingLabels.LABEL_LABEL : "<bibl><label>", addSpace, addEol, 2);
                if (field != null) {
                    tei.append(field);
                    biblOpen = true;
                } else {
                    field = writeField(rawLabel, previousLabel, token, TaggingLabels.REFERENCE_LABEL,
                            biblOpen ? "" : "<bibl>", addSpace, addEol, 2);
                    if (field != null) {
                        tei.append(field);
                        biblOpen = true;
                    } else {
                        field = writeField(rawLabel, previousLabel, token, TaggingLabels.OTHER_LABEL,
                                "", addSpace, addEol, 2);
                        if (field != null) {
                            tei.append(field);
                            biblOpen = false;
                        }
                    }
                }

                previousLabel = plainLabel;
                addSpace = false;
                addEol = false;
                nextTokenIndex = cursor + 1;
            }

            if (biblOpen) {
                tei.append("</bibl>");
            }
            tei.append("\n        </listBibl>\n    </text>\n</tei>\n");
            return Pair.of(tei.toString(), featureVectors);
        } catch (Exception e) {
            throw new GrobidException("CRF labeling in ReferenceSegmenter fails.", e);
        }
    }

    /**
     * Closes the field opened for the previous label when the current label differs from it.
     *
     * <p>Pending whitespace is written with the closing markup: {@code <lb/>} when {@code z2} is
     * set, then a space when {@code z} is set. A label field is closed with {@code </label>}
     * before that whitespace, a reference with {@code </bibl>} and a newline after it, and an
     * "other" run is terminated by a newline after it.
     *
     * @param sb   output buffer
     * @param str  current label
     * @param str2 previous label
     * @param z    whether a space is pending
     * @param z2   whether a line break is pending
     * @return {@code true} when something was closed (and the pending whitespace written)
     */
    private boolean testClosingTag(StringBuilder sb, String str, String str2, boolean z, boolean z2) {
        if (str.equals(str2)) {
            return false;
        }

        boolean closesOther = str2.equals(TaggingLabels.OTHER_LABEL);
        boolean closesLabel = !closesOther && str2.equals(TaggingLabels.LABEL_LABEL);
        boolean closesReference = !closesOther && !closesLabel && str2.equals(TaggingLabels.REFERENCE_LABEL);
        if (!closesOther && !closesLabel && !closesReference) {
            return false;
        }

        if (closesLabel) {
            sb.append("</label>");
        }
        if (z2) {
            sb.append("<lb/>");
        }
        if (z) {
            sb.append(" ");
        }
        if (closesOther) {
            sb.append("\n");
        } else if (closesReference) {
            sb.append("</bibl>\n");
        }
        return true;
    }

    /**
     * Renders one labelled token as a TEI fragment when its label matches the requested field.
     *
     * @param str  current label, possibly carrying the {@code I-} prefix
     * @param str2 previous label, or {@code null} for the first token
     * @param str3 token text; written through {@code TextUtilities.HTMLEncode}
     * @param str4 field to match: the fragment is produced only when {@code str} ends with it
     * @param str5 opening markup written when the field starts
     * @param z    whether a space precedes the token
     * @param z2   whether a line break ({@code <lb/>}) precedes the token
     * @param i    number of four-space indentation units
     * @return the fragment, or {@code null} when {@code str} does not end with {@code str4}
     */
    private String writeField(String str, String str2, String str3, String str4, String str5, boolean z, boolean z2, int i) {
        if (!str.endsWith(str4)) {
            return null;
        }

        StringBuilder out = new StringBuilder();

        if (str.endsWith(TaggingLabels.OTHER_LABEL)) {
            // Text outside any field: a new run starts on its own indented line.
            if (str.equals("I-<other>")) {
                out.append("\n");
                for (int level = 0; level < i; level++) {
                    out.append("    ");
                }
            }
            if (z2) {
                out.append("<lb/>");
            }
            if (z) {
                out.append(" ");
            }
            out.append(TextUtilities.HTMLEncode(str3));
            return out.toString();
        }

        boolean continuesPrevious = str2 != null && str.endsWith(str2);
        if (!continuesPrevious) {
            // The field changes: indent (only when there is opening markup) and open it.
            if (str5.length() > 0) {
                for (int level = 0; level < i; level++) {
                    out.append("    ");
                }
            }
            if (z2) {
                out.append("<lb/>");
            }
            if (z) {
                out.append(" ");
            }
            out.append(str5);
            out.append(TextUtilities.HTMLEncode(str3));
            return out.toString();
        }

        // Same field as the previous token; reopen only on an explicit "I-" beginning.
        if (z2) {
            out.append("<lb/>");
        }
        if (z) {
            out.append(" ");
        }
        if (str.startsWith("I-")) {
            out.append(str5);
        }
        out.append(TextUtilities.HTMLEncode(str3));
        return out.toString();
    }

    /**
     * Builds the feature-vector input of the reference segmenter for the given document pieces.
     *
     * <p>A first pass over the pieces copies every token into the returned token list and
     * measures the longest line, in characters, between line-break tokens. A second pass walks
     * the same tokens again and, for every non-blank token that {@code TextUtilities.filterLine}
     * does not reject, fills a {@link FeaturesVectorReferenceSegmenter} (line and block status,
     * alignment, punctuation type, capitalisation, digits, lexicon and pattern flags, font flags,
     * scaled line length and relative position, punctuation profile of the line). A vector is
     * printed when the next one is created; the last vector is printed after the loops.
     *
     * @param document  document supplying the blocks and the tokenizations
     * @param sortedSet pieces of the document delimiting the token ranges to process
     * @return pair of (concatenated printed feature vectors, tokens covered by the pieces), or
     *         {@code null} when {@code sortedSet} or the document blocks are {@code null} or empty
     */
    public static Pair<String, List<LayoutToken>> getReferencesSectionFeatured(Document document, SortedSet<DocumentPiece> sortedSet) {
        // z: current token is the first token of its block; z2: it is (treated as) the last one.
        boolean z;
        boolean z2;
        if (sortedSet == null || sortedSet.size() == 0) {
            return null;
        }
        FeatureFactory featureFactory = FeatureFactory.getInstance();
        List<Block> blocks = document.getBlocks();
        if (blocks == null || blocks.size() == 0) {
            return null;
        }
        StringBuilder sb = new StringBuilder();
        // Most recently built vector; it is printed only once its successor exists.
        FeaturesVectorReferenceSegmenter featuresVectorReferenceSegmenter = null;
        // X coordinate of the token that started the previously measured line (NaN until one is seen).
        double d = Double.NaN;
        // Whether lines are currently considered indented; kept across lines and pieces.
        boolean z3 = false;
        ArrayList arrayList = new ArrayList();
        List<LayoutToken> tokenizations = document.getTokenizations();
        // i: longest line length found so far (starts at 1); i2: length of the line being measured.
        int i = 1;
        int i2 = 0;
        // First pass: collect the tokens of all pieces and measure the longest line.
        for (DocumentPiece documentPiece : sortedSet) {
            DocumentPointer left = documentPiece.getLeft();
            DocumentPointer right = documentPiece.getRight();
            int tokenDocPos = left.getTokenDocPos();
            int tokenDocPos2 = right.getTokenDocPos();
            for (int i3 = tokenDocPos; i3 <= tokenDocPos2; i3++) {
                arrayList.add(tokenizations.get(i3));
                // The line-break token's own text length is counted before the line is closed.
                i2 += tokenizations.get(i3).getText().length();
                if (tokenizations.get(i3).t().equals("\n") || tokenizations.get(i3).t().equals("\r")) {
                    if (i2 > i) {
                        i = i2;
                    }
                    i2 = 0;
                }
            }
        }
        // Second pass: build one feature vector per retained token.
        for (DocumentPiece documentPiece2 : sortedSet) {
            DocumentPointer left2 = documentPiece2.getLeft();
            DocumentPointer right2 = documentPiece2.getRight();
            // i4: character offset of the current token within its line.
            int i4 = 0;
            // i5: index in the current block's token list where the last text match was found.
            int i5 = 0;
            int blockPtr = left2.getBlockPtr();
            Block block = null;
            // z4: the next non-blank token starts a new line.
            boolean z4 = true;
            // i6: length of the current line, 0 until computed by the look-ahead below.
            int i6 = 0;
            // str: punctuation profile of the current line, null until computed by the look-ahead.
            String str = null;
            for (int tokenDocPos3 = left2.getTokenDocPos(); tokenDocPos3 <= right2.getTokenDocPos(); tokenDocPos3++) {
                String text = tokenizations.get(tokenDocPos3).getText();
                if (text != null) {
                    // Past the end of the current block: move to the next one and reset per-block/per-line state.
                    if (block != null && tokenDocPos3 > block.getEndToken()) {
                        blockPtr++;
                        i5 = 0;
                        i6 = 0;
                        str = null;
                    }
                    if (blockPtr < blocks.size()) {
                        block = blocks.get(blockPtr);
                        if (tokenDocPos3 == block.getStartToken()) {
                            z = true;
                            z2 = false;
                        } else if (tokenDocPos3 == block.getEndToken()) {
                            z = false;
                            z2 = true;
                        } else {
                            z = false;
                            z2 = false;
                        }
                    } else {
                        // No block left for this position.
                        block = null;
                        z = false;
                        z2 = false;
                    }
                    List<LayoutToken> tokens = block != null ? block.getTokens() : null;
                    if (text.equals("\n") || text.equals("\r")) {
                        // Line break: the next retained token starts a line; reset the per-line state.
                        z4 = true;
                        i4 = 0;
                        i6 = 0;
                        str = null;
                    } else {
                        // z5: this token is the first non-blank token of its line.
                        boolean z5 = false;
                        i4 += text.length();
                        if (text.equals(" ") || text.equals("\t")) {
                            // Blank tokens only advance the position (one extra unit on top of their length).
                            i4++;
                        } else if (text.trim().length() != 0) {
                            // Look for the block token carrying the same text, starting from the last match.
                            // NOTE(review): when nothing matches, layoutToken is left on the last block token
                            // examined rather than null - confirm this is intended.
                            LayoutToken layoutToken = null;
                            if (tokens != null) {
                                int i7 = i5;
                                while (true) {
                                    if (i7 >= tokens.size()) {
                                        break;
                                    }
                                    layoutToken = tokens.get(i7);
                                    if (text.equals(layoutToken.getText())) {
                                        i5 = i7;
                                        break;
                                    }
                                    i7++;
                                }
                            }
                            if (z4) {
                                z5 = true;
                                z4 = false;
                                // Indentation is only evaluated once a previous vector exists.
                                if (layoutToken != null && featuresVectorReferenceSegmenter != null) {
                                    double d2 = d;
                                    d = layoutToken.getX();
                                    // Threshold: average character width of this token.
                                    double length = layoutToken.width / layoutToken.getText().length();
                                    if (!Double.isNaN(d2)) {
                                        if (d - d2 > length) {
                                            // Line starts further right than the previous one.
                                            z3 = true;
                                        } else if (d2 - d > length) {
                                            // Line starts further left than the previous one.
                                            z3 = false;
                                        }
                                    }
                                }
                            }
                            // Tokens for which filterLine returns true produce no feature vector.
                            if (!TextUtilities.filterLine(text)) {
                                FeaturesVectorReferenceSegmenter featuresVectorReferenceSegmenter2 = new FeaturesVectorReferenceSegmenter();
                                featuresVectorReferenceSegmenter2.token = layoutToken;
                                featuresVectorReferenceSegmenter2.string = text;
                                if (z5) {
                                    featuresVectorReferenceSegmenter2.lineStatus = "LINESTART";
                                }
                                // Generic punctuation first; the specific types below overwrite it.
                                if (featureFactory.isPunct.matcher(text).find()) {
                                    featuresVectorReferenceSegmenter2.punctType = "PUNCT";
                                }
                                if (text.equals(TextUtilities.START_BRACKET) || text.equals("[")) {
                                    featuresVectorReferenceSegmenter2.punctType = "OPENBRACKET";
                                } else if (text.equals(TextUtilities.END_BRACKET) || text.equals("]")) {
                                    featuresVectorReferenceSegmenter2.punctType = "ENDBRACKET";
                                } else if (text.equals(".")) {
                                    featuresVectorReferenceSegmenter2.punctType = "DOT";
                                } else if (text.equals(TextUtilities.COMMA)) {
                                    featuresVectorReferenceSegmenter2.punctType = "COMMA";
                                } else if (text.equals("-")) {
                                    featuresVectorReferenceSegmenter2.punctType = "HYPHEN";
                                } else if (text.equals(TextUtilities.DOUBLE_QUOTE) || text.equals(TextUtilities.QUOTE) || text.equals("`")) {
                                    featuresVectorReferenceSegmenter2.punctType = "QUOTE";
                                }
                                // z4 was cleared above whenever it was set, so only the
                                // tokenDocPos3 == 0 case can make this condition true here.
                                if (tokenDocPos3 == 0 || z4) {
                                    featuresVectorReferenceSegmenter2.lineStatus = "LINESTART";
                                    if (tokenDocPos3 == 0) {
                                        featuresVectorReferenceSegmenter2.blockStatus = "BLOCKSTART";
                                    }
                                    i4 = 0;
                                }
                                if (z3) {
                                    featuresVectorReferenceSegmenter2.alignmentStatus = "LINEINDENT";
                                } else {
                                    featuresVectorReferenceSegmenter2.alignmentStatus = "ALIGNEDLEFT";
                                }
                                // Look ahead to the end of the line.
                                // z6: only blank tokens follow on this line; z7: end of line reached.
                                boolean z6 = true;
                                boolean z7 = false;
                                // str2: text from this token to the end of the line, accumulated only
                                // while the line's punctuation profile is still unknown.
                                String str2 = text;
                                for (int i8 = 1; tokenDocPos3 + i8 < tokenizations.size() && !z7; i8++) {
                                    String text2 = tokenizations.get(tokenDocPos3 + i8).getText();
                                    if (text2 != null) {
                                        if (str == null) {
                                            str2 = str2 + text2;
                                        }
                                        if (text2.equals("\n") || text2.equals("\r")) {
                                            z7 = true;
                                            if (i6 == 0) {
                                                i6 = str2.length();
                                            }
                                            if (str == null) {
                                                str = TextUtilities.punctuationProfile(str2);
                                            }
                                        } else if (!text2.equals(" ") && !text2.equals("\t")) {
                                            // A non-blank token follows: this token is not the last of its line.
                                            z6 = false;
                                        } else if (TextUtilities.filterLine(text2)) {
                                            // Only reached for a space or tab token.
                                            z7 = true;
                                            if (i6 == 0) {
                                                i6 = str2.length();
                                            }
                                            if (str == null) {
                                                str = TextUtilities.punctuationProfile(str2);
                                            }
                                        }
                                    }
                                    // Reaching the last token of the document ends both the block and the line.
                                    if (tokenDocPos3 + i8 >= tokenizations.size() - 1) {
                                        z2 = true;
                                        z6 = true;
                                    }
                                    if (z6 && block != null && tokenDocPos3 + i8 == block.getEndToken()) {
                                        z2 = true;
                                    }
                                }
                                // A line-start token keeps LINESTART; otherwise the token is inside or at the end of the line.
                                if (!z6 && !z5) {
                                    featuresVectorReferenceSegmenter2.lineStatus = "LINEIN";
                                } else if (!z5) {
                                    featuresVectorReferenceSegmenter2.lineStatus = "LINEEND";
                                    z4 = true;
                                }
                                if (z) {
                                    featuresVectorReferenceSegmenter2.blockStatus = "BLOCKSTART";
                                }
                                // An already assigned BLOCKSTART is never overwritten.
                                if (!z2 && featuresVectorReferenceSegmenter2.blockStatus == null) {
                                    featuresVectorReferenceSegmenter2.blockStatus = "BLOCKIN";
                                } else if (featuresVectorReferenceSegmenter2.blockStatus == null) {
                                    featuresVectorReferenceSegmenter2.blockStatus = "BLOCKEND";
                                }
                                if (text.length() == 1) {
                                    featuresVectorReferenceSegmenter2.singleChar = true;
                                }
                                // ALLCAP, when it applies, overwrites INITCAP.
                                if (Character.isUpperCase(text.charAt(0))) {
                                    featuresVectorReferenceSegmenter2.capitalisation = "INITCAP";
                                }
                                if (featureFactory.test_all_capital(text)) {
                                    featuresVectorReferenceSegmenter2.capitalisation = "ALLCAP";
                                }
                                if (FeatureFactory.test_digit(text)) {
                                    featuresVectorReferenceSegmenter2.digit = "CONTAINSDIGITS";
                                }
                                if (featureFactory.test_common(text)) {
                                    featuresVectorReferenceSegmenter2.commonName = true;
                                }
                                if (featureFactory.test_names(text)) {
                                    featuresVectorReferenceSegmenter2.properName = true;
                                }
                                if (featureFactory.test_month(text)) {
                                    featuresVectorReferenceSegmenter2.month = true;
                                }
                                // ALLDIGIT, when the isDigit pattern is found, overwrites CONTAINSDIGITS.
                                if (featureFactory.isDigit.matcher(text).find()) {
                                    featuresVectorReferenceSegmenter2.digit = "ALLDIGIT";
                                }
                                if (featureFactory.year.matcher(text).find()) {
                                    featuresVectorReferenceSegmenter2.year = true;
                                }
                                if (featureFactory.email.matcher(text).find()) {
                                    featuresVectorReferenceSegmenter2.email = true;
                                }
                                if (featureFactory.http.matcher(text).find()) {
                                    featuresVectorReferenceSegmenter2.http = true;
                                }
                                if (layoutToken != null && layoutToken.getBold()) {
                                    featuresVectorReferenceSegmenter2.bold = true;
                                }
                                if (layoutToken != null && layoutToken.getItalic()) {
                                    featuresVectorReferenceSegmenter2.italic = true;
                                }
                                // Defaults for the features left unset above.
                                if (featuresVectorReferenceSegmenter2.capitalisation == null) {
                                    featuresVectorReferenceSegmenter2.capitalisation = "NOCAPS";
                                }
                                if (featuresVectorReferenceSegmenter2.digit == null) {
                                    featuresVectorReferenceSegmenter2.digit = "NODIGIT";
                                }
                                if (featuresVectorReferenceSegmenter2.punctType == null) {
                                    featuresVectorReferenceSegmenter2.punctType = "NOPUNCT";
                                }
                                // Line length relative to the longest line and position within the line;
                                // the literal 10 has the same value as this class's LINESCALE constant.
                                featuresVectorReferenceSegmenter2.lineLength = featureFactory.linearScaling(i6, i, 10);
                                featuresVectorReferenceSegmenter2.relativePosition = featureFactory.linearScaling(i4, i6, 10);
                                featuresVectorReferenceSegmenter2.punctuationProfile = str;
                                // Print the previous vector now that its successor exists.
                                if (featuresVectorReferenceSegmenter != null) {
                                    sb.append(featuresVectorReferenceSegmenter.printVector());
                                }
                                featuresVectorReferenceSegmenter = featuresVectorReferenceSegmenter2;
                            }
                        }
                    }
                }
            }
        }
        // Print the last vector, which has no successor to trigger it.
        if (featuresVectorReferenceSegmenter != null) {
            sb.append(featuresVectorReferenceSegmenter.printVector());
        }
        return Pair.of(sb.toString(), arrayList);
    }
}
