package org.grobid.core.utilities.matching;

import com.google.common.base.Function;
import com.google.common.base.Predicate;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.standard.ClassicAnalyzer;
import org.apache.lucene.util.Version;
import org.grobid.core.data.BibDataSet;
import org.grobid.core.engines.counters.ReferenceMarkerMatcherCounters;
import org.grobid.core.layout.LayoutToken;
import org.grobid.core.utilities.LayoutTokensUtil;
import org.grobid.core.utilities.Pair;
import org.grobid.core.utilities.TextUtilities;
import org.grobid.core.utilities.counters.CntManager;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/grobid/core/utilities/matching/ReferenceMarkerMatcher.class */
public class ReferenceMarkerMatcher {
    /**
     * Presumably the largest span allowed when a numeric range such as {@code [3-7]} is expanded
     * into individual labels; NOTE(review): confirm against {@code getNumberedLabels}.
     */
    public static final int MAX_RANGE = 20;
    /** Index built in the constructor from each entry's authors, the suffix " et al" and, when present, its publication date. */
    private final LuceneIndexMatcher<BibDataSet, String> authorMatcher;
    /** Index built in the constructor from each entry's reference symbol. */
    private final LuceneIndexMatcher<BibDataSet, String> labelMatcher;
    /** Counter sink that the matching methods increment. */
    private CntManager cntManager;
    /** Reference symbols of all entries passed to the constructor; queried by {@code isKnownLabel}. */
    private Set<String> allLabels;
    /** Non-empty first-author surnames of all entries passed to the constructor; queried by {@code isKnownFirstAuthor}. */
    private Set<String> allFirstAuthors;
    private static final Logger LOGGER = LoggerFactory.getLogger((Class<?>) ReferenceMarkerMatcher.class);
    /** Four digits starting with 1 or 2, optionally followed by one letter a-d (e.g. "2001b"). */
    public static final Pattern YEAR_PATTERN = Pattern.compile("[12][0-9]{3}[a-d]?");
    /** Same year shape as {@link #YEAR_PATTERN}, but the four digits must not be preceded or followed by another digit. */
    public static final Pattern YEAR_PATTERN_WITH_LOOK_AROUND = Pattern.compile("(?<!\\d)[12][0-9]{3}(?!\\d)[a-d]?");
    /** One ASCII uppercase letter followed by at least one Unicode letter. */
    public static final Pattern AUTHOR_NAME_PATTERN = Pattern.compile("[A-Z][\\p{L}]+");
    /**
     * Shape of a numbered citation marker: an optional opening "(" or "[", any number of
     * comma-terminated numbers or number ranges, a final number or number range, and an optional
     * closing ")" or "]".
     *
     * <p>Both range alternatives accept the same three dash characters as {@code DASH_PATTERN}
     * (hyphen-minus, minus sign U+2212 and en dash U+2013), so that a lone range written with a
     * minus sign is recognised just like one that is followed by further numbers.
     */
    public static final Pattern NUMBERED_CITATION_PATTERN = Pattern.compile("[\\(\\[]?\\s*(?:\\d+[-−–]\\d+,|\\d+,[ ]*)*[ ]*(?:\\d+[-−–]\\d+|\\d+)\\s*[\\)\\]]?");
    /** Separator passed to the token splitter when breaking an author-style marker into individual citations. */
    public static final Pattern AUTHOR_SEPARATOR_PATTERN = Pattern.compile(";");
    /** Lucene analyzer used to tokenize the two bounds of a numeric range before they are parsed as integers. */
    public static final ClassicAnalyzer ANALYZER = new ClassicAnalyzer(Version.LUCENE_45);
    /** Separators (comma or semicolon) passed to the token splitter for numbered markers. */
    public static final Pattern NUMBERED_CITATIONS_SPLIT_PATTERN = Pattern.compile("[,;]");
    /** The word "and" or an ampersand; used to split a group that holds exactly two years. */
    public static final Pattern AND_WORD_PATTERN = Pattern.compile("(and)|&");
    /** En dash (U+2013), minus sign (U+2212) or hyphen-minus; used to locate the dash token of a numeric range. */
    public static final Pattern DASH_PATTERN = Pattern.compile("[–−-]");
    /** Returns its argument unchanged; supplied as the second constructor argument of both index matchers. */
    public static final Function<String, Object> IDENTITY = new Function<String, Object>() { // from class: org.grobid.core.utilities.matching.ReferenceMarkerMatcher.1
        @Override // com.google.common.base.Function, java.util.function.Function
        public Object apply(String str) {
            return str;
        }
    };

    /* loaded from: input_file:org/grobid/core/utilities/matching/ReferenceMarkerMatcher$MatchResult.class */
    /**
     * Outcome of matching one citation marker: the marker text, the layout tokens it was built
     * from, and the bibliography entry it resolved to. The entry is {@code null} when the
     * enclosing matcher could not settle on a single entry.
     */
    public class MatchResult {
        private final String text;
        private final List<LayoutToken> tokens;
        private final BibDataSet bibDataSet;

        /**
         * @param str marker text
         * @param list layout tokens associated with the marker
         * @param bibDataSet resolved bibliography entry, or {@code null} if unresolved
         */
        public MatchResult(String str, List<LayoutToken> list, BibDataSet bibDataSet) {
            text = str;
            tokens = list;
            this.bibDataSet = bibDataSet;
        }

        /** @return the marker text given at construction */
        public String getText() {
            return text;
        }

        /** @return the layout tokens given at construction */
        public List<LayoutToken> getTokens() {
            return tokens;
        }

        /** @return the resolved bibliography entry, or {@code null} if there is none */
        public BibDataSet getBibDataSet() {
            return bibDataSet;
        }
    }

    /**
     * Creates a matcher over the given bibliography entries.
     *
     * <p>Records every entry's reference symbol and non-empty first-author surname, then builds
     * two Lucene-backed indexes: one keyed on "authors et al [publication date]" and one keyed on
     * the reference symbol. Both indexes have their must-match percentage set to 1.0.
     *
     * @param list bibliography entries to index; may be {@code null}, in which case nothing is loaded
     * @param cntManager counter sink used by the matching methods
     * @throws EntityMatcherException if loading either index fails
     */
    public ReferenceMarkerMatcher(List<BibDataSet> list, CntManager cntManager) throws EntityMatcherException {
        this.allLabels = new HashSet<>();
        this.allFirstAuthors = new HashSet<>();
        if (list != null) {
            for (BibDataSet entry : list) {
                allLabels.add(entry.getRefSymbol());
                String surname = entry.getResBib().getFirstAuthorSurname();
                if (surname != null && !surname.isEmpty()) {
                    allFirstAuthors.add(surname);
                }
            }
        }
        this.cntManager = cntManager;

        // Author index: searchable text is the author string, "et al" and the date when known.
        this.authorMatcher = new LuceneIndexMatcher<>(new Function<BibDataSet, Object>() {
            @Override
            public Object apply(BibDataSet entry) {
                String authors = entry.getResBib().getAuthors() + " et al";
                return entry.getResBib().getPublicationDate() == null
                        ? authors
                        : authors + " " + entry.getResBib().getPublicationDate();
            }
        }, IDENTITY);
        authorMatcher.setMustMatchPercentage(1.0);
        if (list != null) {
            authorMatcher.load(list);
        }

        // Label index: searchable text is the entry's reference symbol.
        this.labelMatcher = new LuceneIndexMatcher<>(new Function<BibDataSet, Object>() {
            @Override
            public Object apply(BibDataSet entry) {
                return entry.getRefSymbol();
            }
        }, IDENTITY);
        labelMatcher.setMustMatchPercentage(1.0);
        if (list != null) {
            labelMatcher.load(list);
        }
    }

    /**
     * Resolves a citation marker, given as layout tokens, to bibliography entries.
     *
     * <p>The marker text is tested for the author/year style first, then for the numbered style.
     * If neither applies, a single unresolved result carrying the whole marker is returned.
     * The input counter and one style counter are incremented on every call.
     *
     * @param list layout tokens of the citation marker
     * @return one result per citation found in the marker
     * @throws EntityMatcherException if an index lookup fails
     */
    public List<MatchResult> match(List<LayoutToken> list) throws EntityMatcherException {
        cntManager.i(ReferenceMarkerMatcherCounters.INPUT_REF_STRINGS_CNT);
        final String markerText = LayoutTokensUtil.toText(
                LayoutTokensUtil.dehyphenize(LayoutTokensUtil.enrichWithNewLineInfo(list)));

        final List<MatchResult> results;
        if (isAuthorCitationStyle(markerText)) {
            cntManager.i(ReferenceMarkerMatcherCounters.STYLE_AUTHORS);
            results = matchAuthorCitation(markerText, list);
        } else if (isNumberedCitationReference(markerText)) {
            cntManager.i(ReferenceMarkerMatcherCounters.STYLE_NUMBERED);
            results = matchNumberedCitation(markerText, list);
        } else {
            cntManager.i(ReferenceMarkerMatcherCounters.STYLE_OTHER);
            results = Collections.singletonList(new MatchResult(markerText, list, null));
        }
        return results;
    }

    /**
     * Tells whether the text looks like an author/year citation.
     *
     * @param str marker text; surrounding whitespace is ignored
     * @return {@code true} if the trimmed text contains both a {@link #YEAR_PATTERN} match and an
     *         {@link #AUTHOR_NAME_PATTERN} match
     */
    public boolean isAuthorCitationStyle(String str) {
        final String trimmed = str.trim();
        if (!YEAR_PATTERN.matcher(trimmed).find()) {
            return false;
        }
        return AUTHOR_NAME_PATTERN.matcher(trimmed).find();
    }

    /**
     * Tells whether the text looks like a numbered citation.
     *
     * @param str marker text; surrounding whitespace is ignored
     * @return {@code true} if the trimmed text matches {@link #NUMBERED_CITATION_PATTERN} in full,
     *         or if it contains both a match of that pattern and an
     *         {@link #AUTHOR_NAME_PATTERN} match
     */
    public static boolean isNumberedCitationReference(String str) {
        final String trimmed = str.trim();
        if (NUMBERED_CITATION_PATTERN.matcher(trimmed).matches()) {
            return true;
        }
        return NUMBERED_CITATION_PATTERN.matcher(trimmed).find()
                && AUTHOR_NAME_PATTERN.matcher(trimmed).find();
    }

    /**
     * Resolves each label of a numbered citation marker against the label index.
     *
     * <p>A label resolves only when the index returns exactly one entry; otherwise an unresolved
     * result (entry {@code null}) is recorded for it. Counters are updated for each label.
     *
     * @param str marker text; not used by this method
     * @param list layout tokens of the marker
     * @return one result per label, in label order
     * @throws EntityMatcherException if the index lookup fails
     */
    private List<MatchResult> matchNumberedCitation(String str, List<LayoutToken> list) throws EntityMatcherException {
        List<MatchResult> results = new ArrayList<>();
        for (Pair<String, List<LayoutToken>> label : getNumberedLabels(list)) {
            String labelText = label.a;
            List<LayoutToken> labelTokens = label.b;
            List<BibDataSet> candidates = labelMatcher.match(labelText);

            if (candidates.size() == 1) {
                cntManager.i(ReferenceMarkerMatcherCounters.MATCHED_REF_MARKERS);
                results.add(new MatchResult(labelText, labelTokens, candidates.get(0)));
                continue;
            }

            cntManager.i(ReferenceMarkerMatcherCounters.UNMATCHED_REF_MARKERS);
            if (candidates.isEmpty()) {
                cntManager.i(ReferenceMarkerMatcherCounters.NO_CANDIDATES);
            } else {
                cntManager.i(ReferenceMarkerMatcherCounters.MANY_CANDIDATES);
            }
            results.add(new MatchResult(labelText, labelTokens, null));
        }
        return results;
    }

    /**
     * Splits a numbered citation marker into individual labels, expanding numeric ranges.
     *
     * <p>The tokens are split on {@link #NUMBERED_CITATIONS_SPLIT_PATTERN}. A part without a dash
     * token is kept as is. A part with a dash is read as a range {@code a-b}: when
     * {@code a < b} and {@code b - a < MAX_RANGE}, one label per number is produced, wrapped in
     * the bracket style detected on the first part. The first number keeps the tokens before the
     * dash, the last number the tokens after it, and the numbers in between get the dash token.
     * Ranges that fail those bounds produce no label; ranges that cannot be parsed are logged and
     * skipped.
     *
     * @param list layout tokens of the marker
     * @return label text paired with its tokens; empty when the split yields no parts
     */
    private static List<Pair<String, List<LayoutToken>>> getNumberedLabels(List<LayoutToken> list) {
        List<Pair<String, List<LayoutToken>>> labels = new ArrayList<>();
        List<List<LayoutToken>> parts = LayoutTokensUtil.split(list, NUMBERED_CITATIONS_SPLIT_PATTERN, true);
        if (parts == null || parts.isEmpty()) {
            // Nothing to label; avoids an IndexOutOfBoundsException on parts.get(0) below.
            return labels;
        }

        Pair<Character, Character> wrappingSymbols = getWrappingSymbols(parts.get(0));
        for (List<LayoutToken> part : parts) {
            int dashPos = LayoutTokensUtil.tokenPos(part, DASH_PATTERN);
            if (dashPos < 0) {
                labels.add(new Pair<>(LayoutTokensUtil.toText(part), part));
                continue;
            }
            try {
                LayoutToken dashToken = part.get(dashPos);
                List<LayoutToken> lowerTokens = part.subList(0, dashPos);
                List<LayoutToken> upperTokens = part.subList(dashPos + 1, part.size());
                int lower = Integer.parseInt(
                        LuceneUtil.tokenizeString(ANALYZER, LayoutTokensUtil.toText(lowerTokens)).get(0), 10);
                int upper = Integer.parseInt(
                        LuceneUtil.tokenizeString(ANALYZER, LayoutTokensUtil.toText(upperTokens)).get(0), 10);

                if (lower < upper && upper - lower < MAX_RANGE) {
                    for (int number = lower; number <= upper; number++) {
                        List<LayoutToken> numberTokens;
                        if (number == lower) {
                            numberTokens = lowerTokens;
                        } else if (number == upper) {
                            numberTokens = upperTokens;
                        } else {
                            numberTokens = Collections.singletonList(dashToken);
                        }
                        // String.valueOf keeps this a string concatenation rather than char arithmetic.
                        labels.add(new Pair<>(
                                wrappingSymbols.a + String.valueOf(number) + wrappingSymbols.b, numberTokens));
                    }
                }
            } catch (Exception e) {
                // Best effort: an unparsable range is reported and skipped.
                LOGGER.warn("Cannot parse citation reference range: " + part);
            }
        }
        return labels;
    }

    /**
     * Chooses the bracket pair used to wrap expanded range labels.
     *
     * <p>Only the first token that is neither a space token nor a new-line token is examined: if
     * it equals {@code TextUtilities.START_BRACKET} the pair is round brackets, otherwise square
     * brackets. Square brackets are also returned when no such token exists.
     *
     * @param list tokens of the first part of the marker
     * @return opening and closing bracket characters
     */
    private static Pair<Character, Character> getWrappingSymbols(List<LayoutToken> list) {
        for (LayoutToken token : list) {
            boolean blank = LayoutTokensUtil.spaceyToken(token.t()) || LayoutTokensUtil.newLineToken(token.t());
            if (blank) {
                continue;
            }
            if (token.t().equals(TextUtilities.START_BRACKET)) {
                return new Pair<>('(', ')');
            }
            return new Pair<>('[', ']');
        }
        return new Pair<>('[', ']');
    }

    /**
     * Resolves each citation of an author/year marker against the author index.
     *
     * <p>A citation with exactly one candidate resolves to it. With several candidates,
     * {@code postFilterMatches} is applied and the citation resolves only if exactly one
     * candidate survives. In every other case an unresolved result (entry {@code null}) is
     * recorded, so each citation of the marker yields exactly one result. Counters are updated
     * for each citation.
     *
     * @param str marker text; not used by this method
     * @param list layout tokens of the marker
     * @return one result per citation, in citation order
     * @throws EntityMatcherException if the index lookup fails
     */
    private List<MatchResult> matchAuthorCitation(String str, List<LayoutToken> list) throws EntityMatcherException {
        List<MatchResult> results = new ArrayList<>();
        for (Pair<String, List<LayoutToken>> citation : splitAuthors(list)) {
            String citationText = citation.a;
            List<LayoutToken> citationTokens = citation.b;
            List<BibDataSet> candidates = authorMatcher.match(citationText);

            if (candidates.size() == 1) {
                cntManager.i(ReferenceMarkerMatcherCounters.MATCHED_REF_MARKERS);
                results.add(new MatchResult(citationText, citationTokens, candidates.get(0)));
            } else if (candidates.isEmpty()) {
                results.add(new MatchResult(citationText, citationTokens, null));
                cntManager.i(ReferenceMarkerMatcherCounters.NO_CANDIDATES);
            } else {
                cntManager.i(ReferenceMarkerMatcherCounters.MANY_CANDIDATES);
                List<BibDataSet> filtered = postFilterMatches(citationText, candidates);
                if (filtered.size() == 1) {
                    results.add(new MatchResult(citationText, citationTokens, filtered.get(0)));
                    cntManager.i(ReferenceMarkerMatcherCounters.MATCHED_REF_MARKERS);
                    cntManager.i(ReferenceMarkerMatcherCounters.MATCHED_REF_MARKERS_AFTER_POST_FILTERING);
                } else {
                    cntManager.i(ReferenceMarkerMatcherCounters.UNMATCHED_REF_MARKERS);
                    // Keep the citation in the output even though it stays ambiguous; dropping it
                    // would make its text and tokens vanish from the caller's view.
                    results.add(new MatchResult(citationText, citationTokens, null));
                    if (filtered.isEmpty()) {
                        cntManager.i(ReferenceMarkerMatcherCounters.NO_CANDIDATES_AFTER_POST_FILTERING);
                    } else {
                        cntManager.i(ReferenceMarkerMatcherCounters.MANY_CANDIDATES_AFTER_POST_FILTERING);
                    }
                }
            }
        }
        return results;
    }

    /**
     * Breaks an author/year marker into individual citations.
     *
     * <p>The tokens are first split on {@link #AUTHOR_SEPARATOR_PATTERN}. Each group is then
     * handled according to how many years it contains (counted with
     * {@link #YEAR_PATTERN_WITH_LOOK_AROUND}):
     * <ul>
     *   <li>exactly two years and the text " and ": split on {@link #AND_WORD_PATTERN};</li>
     *   <li>more than one year otherwise: split on {@link #YEAR_PATTERN}; if the group holds a
     *       single {@link #AUTHOR_NAME_PATTERN} match, the text of the first piece minus its last
     *       token is prepended to every later piece, and each later piece keeps only its last
     *       token;</li>
     *   <li>at most one year, or an empty year split: the group is kept whole.</li>
     * </ul>
     *
     * @param list layout tokens of the marker
     * @return dehyphenized citation text paired with its tokens
     */
    private static List<Pair<String, List<LayoutToken>>> splitAuthors(List<LayoutToken> list) {
        List<Pair<String, List<LayoutToken>>> citations = new ArrayList<>();
        for (List<LayoutToken> group : LayoutTokensUtil.split(list, AUTHOR_SEPARATOR_PATTERN, true)) {
            String groupText = LayoutTokensUtil.toText(group);
            int yearCount = matchCount(groupText, YEAR_PATTERN_WITH_LOOK_AROUND);

            if (yearCount == 2 && groupText.contains(" and ")) {
                for (List<LayoutToken> part : LayoutTokensUtil.split(group, AND_WORD_PATTERN, true)) {
                    citations.add(new Pair<>(dehyphenizedText(part), part));
                }
                continue;
            }
            if (yearCount <= 1) {
                citations.add(new Pair<>(dehyphenizedText(group), group));
                continue;
            }

            List<List<LayoutToken>> byYear = LayoutTokensUtil.split(group, YEAR_PATTERN, true, false);
            if (byYear.isEmpty()) {
                citations.add(new Pair<>(dehyphenizedText(group), group));
                continue;
            }
            if (matchCount(group, AUTHOR_NAME_PATTERN) != 1) {
                for (List<LayoutToken> piece : byYear) {
                    citations.add(new Pair<>(dehyphenizedText(piece), piece));
                }
                continue;
            }

            // Single author name followed by several years: reuse the leading text for each year.
            List<LayoutToken> head = byYear.get(0);
            citations.add(new Pair<>(dehyphenizedText(head), head));
            String sharedPrefix = dehyphenizedText(head.subList(0, head.size() - 1));
            for (List<LayoutToken> tail : byYear.subList(1, byYear.size())) {
                citations.add(new Pair<>(
                        sharedPrefix + " " + dehyphenizedText(tail),
                        tail.subList(tail.size() - 1, tail.size())));
            }
        }
        return citations;
    }

    /** Returns the text of the given tokens after dehyphenization. */
    private static String dehyphenizedText(List<LayoutToken> tokens) {
        return LayoutTokensUtil.toText(LayoutTokensUtil.dehyphenize(tokens));
    }

    /**
     * Counts the non-overlapping matches of a pattern in a string.
     *
     * <p>A single {@link Matcher} is reused across the loop so that every {@code find()} resumes
     * after the previous match; creating a new matcher per iteration would restart at the
     * beginning each time and never terminate once the pattern matches.
     *
     * @param str text to scan
     * @param pattern pattern to look for
     * @return number of successive matches found, {@code 0} if there are none
     */
    private static int matchCount(String str, Pattern pattern) {
        Matcher matcher = pattern.matcher(str);
        int count = 0;
        while (matcher.find()) {
            count++;
        }
        return count;
    }

    /**
     * Counts the matches of a pattern in the text of the given tokens.
     *
     * @param list tokens whose text is scanned
     * @param pattern pattern to look for
     * @return the count returned by {@code matchCount(String, Pattern)} for the tokens' text
     */
    private static int matchCount(List<LayoutToken> list, Pattern pattern) {
        final String tokenText = LayoutTokensUtil.toText(list);
        return matchCount(tokenText, pattern);
    }

    /**
     * Narrows down several candidate entries for one author/year citation.
     *
     * <p>If the lower-cased citation contains neither "et al" nor " and ", only candidates with
     * exactly one full author are kept. Otherwise candidates are kept when their raw reference,
     * trimmed and lower-cased, starts with the citation's first space-separated word; if more
     * than one remains, they are further restricted to candidates with more than one full author.
     *
     * @param str citation text
     * @param list candidate entries returned by the author index
     * @return the surviving candidates, in their original order; possibly empty
     */
    private List<BibDataSet> postFilterMatches(String str, List<BibDataSet> list) {
        final String lowered = str.toLowerCase();
        final boolean namesSeveralAuthors = lowered.contains("et al") || lowered.contains(" and ");

        if (!namesSeveralAuthors) {
            List<BibDataSet> singleAuthored = new ArrayList<>();
            for (BibDataSet candidate : list) {
                if (candidate.getResBib().getFullAuthors() != null
                        && candidate.getResBib().getFullAuthors().size() == 1) {
                    singleAuthored.add(candidate);
                }
            }
            return singleAuthored;
        }

        final String firstWord = str.trim().split(" ")[0].toLowerCase();
        List<BibDataSet> samePrefix = new ArrayList<>();
        for (BibDataSet candidate : list) {
            if (candidate.getRawBib().trim().toLowerCase().startsWith(firstWord)) {
                samePrefix.add(candidate);
            }
        }
        if (samePrefix.size() <= 1) {
            return samePrefix;
        }

        List<BibDataSet> multiAuthored = new ArrayList<>();
        for (BibDataSet candidate : samePrefix) {
            if (candidate.getResBib().getFullAuthors() != null
                    && candidate.getResBib().getFullAuthors().size() > 1) {
                multiAuthored.add(candidate);
            }
        }
        return multiAuthored;
    }

    /**
     * Tells whether the trimmed string equals the reference symbol of one of the entries given to
     * the constructor.
     *
     * @param str label to look up
     * @return {@code true} if the trimmed label is in the recorded label set
     */
    public boolean isKnownLabel(String str) {
        if (allLabels == null) {
            return false;
        }
        return allLabels.contains(str.trim());
    }

    /**
     * Tells whether the trimmed string equals the first-author surname of one of the entries given
     * to the constructor.
     *
     * @param str surname to look up
     * @return {@code true} if the trimmed surname is in the recorded first-author set
     */
    public boolean isKnownFirstAuthor(String str) {
        if (allFirstAuthors == null) {
            return false;
        }
        return allFirstAuthors.contains(str.trim());
    }
}
