package org.grobid.core.engines.citations;

import com.google.common.base.Function;
import com.google.common.base.Predicate;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.grobid.core.document.Document;
import org.grobid.core.engines.label.SegmentationLabels;
import org.grobid.core.utilities.TextUtilities;

/* loaded from: input_file:org/grobid/core/engines/citations/RegexReferenceSegmenter.class */
/**
 * Regex-based {@link ReferenceSegmenter}: splits the raw text of a bibliography
 * section into individual reference strings.
 *
 * <p>Two candidate segmentations are computed: one produced by
 * {@link AdditionalRegexTextSegmenter}, and one obtained by splitting the text on
 * whichever of the known citation-marker patterns occurs most often. When at least
 * one marker is found, both candidates are cleaned up and the one that yields more
 * references is returned.
 */
public class RegexReferenceSegmenter implements ReferenceSegmenter {
    /** Marker enclosed in square brackets at the start of the text or of a line, e.g. {@code [12]}. */
    private static final Pattern m1 = Pattern.compile("((^|\\n)( )*\\[.+?\\])");
    /** Marker enclosed in parentheses at the start of the text or of a line, e.g. {@code (12)}. */
    private static final Pattern m2 = Pattern.compile("((^|\\n)( )*\\(.+?\\))");
    /** Marker made of one to three digits followed by a period at the start of the text or of a line. */
    private static final Pattern m3 = Pattern.compile("((^|\\n)( )*\\d{1,3}\\.)");
    /** A letter, a hyphen, optional whitespace containing at least one line break, then a letter. */
    private static final Pattern SPACE_DASH_PATTERN = Pattern.compile("[a-zA-Z]-\\s*[\\n\\r]+\\s*[a-zA-Z]");
    private static final Pattern[] CITATION_MARKERS = {m1, m2, m3};
    private static final AdditionalRegexTextSegmenter citationTextSegmenter = new AdditionalRegexTextSegmenter();

    /** Wraps a reference string into a {@link LabeledReferenceResult}. */
    public static final Function<String, LabeledReferenceResult> LABELED_REFERENCE_RESULT_FUNCTION = new Function<String, LabeledReferenceResult>() {
        public LabeledReferenceResult apply(String str) {
            return new LabeledReferenceResult(str);
        }
    };

    /** Guava predicate accepting non-null strings whose length is at least a given minimum. */
    public static class StringLengthPredicate implements Predicate<String> {
        private final int len;

        private StringLengthPredicate(int i) {
            this.len = i;
        }

        public boolean apply(String str) {
            return str != null && str.length() >= this.len;
        }
    }

    /**
     * Segments the given reference-section text.
     *
     * @param str raw text of the reference section
     * @return a transformed view of the segmented references, one result per reference
     */
    @Override // org.grobid.core.engines.citations.ReferenceSegmenter
    public List<LabeledReferenceResult> extract(String str) {
        return Lists.transform(segmentReferences(str), LABELED_REFERENCE_RESULT_FUNCTION);
    }

    /**
     * Segments the text of the document part labelled {@code SegmentationLabels.REFERENCES}.
     *
     * @param document document whose reference part is read
     * @return a transformed view of the segmented references, one result per reference
     */
    @Override // org.grobid.core.engines.citations.ReferenceSegmenter
    public List<LabeledReferenceResult> extract(Document document) {
        String referencesText = document.getDocumentPartText(SegmentationLabels.REFERENCES);
        return Lists.transform(segmentReferences(referencesText), LABELED_REFERENCE_RESULT_FUNCTION);
    }

    /**
     * Computes both candidate segmentations and returns the richer one.
     *
     * <p>If no citation-marker pattern matches at all, the segments produced by
     * {@code citationTextSegmenter} are returned as-is (not sanitized).
     */
    private static List<String> segmentReferences(String str) {
        // Pick the marker pattern with the most occurrences; the comparison is strict,
        // so the earliest pattern wins ties and -1 remains when nothing matches.
        int bestCount = 0;
        int bestIndex = -1;
        for (int idx = 0; idx < CITATION_MARKERS.length; idx++) {
            int count = countMatches(CITATION_MARKERS[idx], str);
            if (count > bestCount) {
                bestIndex = idx;
                bestCount = count;
            }
        }

        List<String> segmenterCandidates = citationTextSegmenter.extractCitationSegments(str);
        if (bestIndex == -1) {
            return segmenterCandidates;
        }

        List<String> markerCandidates = splitOnMarker(CITATION_MARKERS[bestIndex], str);

        List<String> cleanedSegmenter = sanitizeCitationReferenceList(segmenterCandidates);
        List<String> cleanedMarker = sanitizeCitationReferenceList(markerCandidates);
        return cleanedMarker.size() > cleanedSegmenter.size() ? cleanedMarker : cleanedSegmenter;
    }

    /**
     * Counts the occurrences of {@code pattern} in {@code text}.
     *
     * <p>A single matcher is created and advanced. Creating a new matcher for every
     * {@code find()} call would restart the search at offset 0 each time and never
     * terminate once one match exists.
     */
    private static int countMatches(Pattern pattern, String text) {
        Matcher matcher = pattern.matcher(text);
        int count = 0;
        while (matcher.find()) {
            count++;
        }
        return count;
    }

    /**
     * Returns the chunks of text lying between consecutive marker matches, plus the
     * chunk following the last match; text before the first marker is dropped.
     * Chunks that fail {@link #testCitationProfile(String)} are skipped.
     */
    private static List<String> splitOnMarker(Pattern marker, String text) {
        List<String> chunks = new ArrayList<String>();
        Matcher matcher = marker.matcher(text);
        int chunkStart = 0;
        boolean sawMarker = false;
        while (matcher.find()) {
            if (sawMarker) {
                String chunk = text.substring(chunkStart, matcher.start());
                if (testCitationProfile(chunk)) {
                    chunks.add(chunk);
                }
            }
            // The next chunk begins right after the marker itself.
            chunkStart = matcher.end();
            sawMarker = true;
        }
        if (sawMarker) {
            // The text after the last marker has not been emitted yet.
            String tail = text.substring(chunkStart, text.length());
            if (testCitationProfile(tail)) {
                chunks.add(tail);
            }
        }
        return chunks;
    }

    /**
     * Cleans each candidate with {@link #stripCitation(String)} followed by
     * {@code TextUtilities.dehyphenizeHard}, then keeps only results of at least
     * 15 characters.
     */
    private static List<String> sanitizeCitationReferenceList(List<String> list) {
        List<String> cleaned = new ArrayList<String>();
        for (String candidate : list) {
            cleaned.add(TextUtilities.dehyphenizeHard(stripCitation(candidate)));
        }
        return Lists.newArrayList(Iterables.filter(cleaned, new StringLengthPredicate(15)));
    }

    /**
     * Plausibility check for a single reference: shorter than 400 characters and
     * fewer than 9 newline-delimited tokens.
     */
    private static boolean testCitationProfile(String str) {
        return str.length() < 400 && new StringTokenizer(str, "\n").countTokens() < 9;
    }

    /**
     * Normalizes a raw reference chunk: rejoins words hyphenated across line breaks,
     * replaces line breaks, backslashes and occurrences of
     * {@code TextUtilities.DOUBLE_QUOTE} with spaces, merges repeated commas and
     * whitespace runs, trims, and finally removes a leading number followed by one
     * whitespace character. The order of the replacements is significant.
     */
    private static String stripCitation(String str) {
        return processSpaceDash(str)
                .replaceAll("\\r\\d* ", " ")
                .replaceAll("\\n\\d\\. ", " ")
                .replaceAll("\\n", " ")
                .replaceAll("\\\\", " ")
                .replaceAll(TextUtilities.DOUBLE_QUOTE, " ")
                .replaceAll(",\\s*,", TextUtilities.COMMA)
                .replaceAll("\\r", " ")
                .replaceAll("\\s\\s+", " ")
                .trim()
                .replaceAll("^[\\d]+\\s", "");
    }

    /**
     * Rewrites every {@code letter - <whitespace with line break> letter} sequence as
     * {@code letter-letter}, i.e. removes the line break and surrounding whitespace
     * that follow a hyphen while keeping the hyphen itself.
     */
    private static String processSpaceDash(String str) {
        String result = str;
        Matcher matcher = SPACE_DASH_PATTERN.matcher(result);
        while (matcher.find()) {
            // Keep the letter before the hyphen and the letter ending the match.
            result = result.substring(0, matcher.start() + 1) + "-" + result.substring(matcher.end() - 1);
            // The string changed, so matching restarts on the new value.
            matcher = SPACE_DASH_PATTERN.matcher(result);
        }
        return result;
    }
}
