package org.grobid.core.engines;

import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.TreeMap;
import org.apache.commons.lang3.StringUtils;
import org.grobid.core.GrobidModels;
import org.grobid.core.document.BasicStructureBuilder;
import org.grobid.core.document.Document;
import org.grobid.core.document.DocumentNode;
import org.grobid.core.document.DocumentSource;
import org.grobid.core.engines.config.GrobidAnalysisConfig;
import org.grobid.core.exceptions.GrobidException;
import org.grobid.core.exceptions.GrobidExceptionStatus;
import org.grobid.core.exceptions.GrobidResourceException;
import org.grobid.core.features.FeatureFactory;
import org.grobid.core.features.FeaturesVectorMonograph;
import org.grobid.core.lang.Language;
import org.grobid.core.layout.Block;
import org.grobid.core.layout.BoundingBox;
import org.grobid.core.layout.GraphicObject;
import org.grobid.core.layout.GraphicObjectType;
import org.grobid.core.layout.LayoutToken;
import org.grobid.core.layout.Page;
import org.grobid.core.utilities.GrobidProperties;
import org.grobid.core.utilities.LanguageUtilities;
import org.grobid.core.utilities.TextUtilities;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.wipo.analyzers.wipokr.utils.EomiUtil;

/* loaded from: input_file:org/grobid/core/engines/MonographParser.class */
/**
 * Parser bound to the {@link GrobidModels#MONOGRAPH} model.
 *
 * <p>It turns a tokenized PDF {@link Document} into one feature vector per text line
 * (see {@link #getAllBlocksFeatured(Document)} and {@link #getAllLinesFeatured(Document)}),
 * passes the resulting string to {@code label(...)} (not defined in this class, presumably
 * inherited from {@link AbstractParser}) and hands the labelled output to
 * {@link BasicStructureBuilder#generalResultSegmentation}. It can also dump the raw token
 * stream of a PDF into a TEI skeleton for training-data creation.
 */
public class MonographParser extends AbstractParser {
    private static final Logger LOGGER = LoggerFactory.getLogger(MonographParser.class);

    /** Number of bins for the relative document / page / vertical position features. */
    private static final int NBBINS_POSITION = 12;
    /** Number of bins for the "spacing with previous block" feature. */
    private static final int NBBINS_SPACE = 5;
    /** Number of bins for the character density feature. */
    private static final int NBBINS_DENSITY = 5;
    /** Number of bins for the line length (relative to the longest line of the block). */
    private static final int LINESCALE = 10;
    /** Not referenced anywhere in this class. */
    private static final int BLOCKSCALE = 10;

    /** Only a pattern seen on more than this many characters is tracked as a repetition candidate. */
    private static final int MIN_PATTERN_LENGTH = 8;

    private final LanguageUtilities languageUtilities;
    private final FeatureFactory featureFactory;
    private final File tmpPath;

    public MonographParser() {
        super(GrobidModels.MONOGRAPH);
        this.languageUtilities = LanguageUtilities.getInstance();
        this.featureFactory = FeatureFactory.getInstance();
        this.tmpPath = GrobidProperties.getTempPath();
    }

    /**
     * Tokenizes the given source, segments it with {@link #prepareDocument(Document)} and
     * always closes the source afterwards, whether processing succeeded or failed.
     *
     * @param documentSource       the source to read; closed before this method returns or throws
     * @param grobidAnalysisConfig analysis options; its analyzer (if any) is set on the document,
     *                             and whether a PDF asset path is set decides the first flag
     *                             passed to {@code DocumentSource.close}
     * @return the document returned by {@link #prepareDocument(Document)}
     */
    public Document processing(DocumentSource documentSource, GrobidAnalysisConfig grobidAnalysisConfig) {
        try {
            Document document = new Document(documentSource);
            if (grobidAnalysisConfig.getAnalyzer() != null) {
                document.setAnalyzer(grobidAnalysisConfig.getAnalyzer());
            }
            document.addTokenizedDocument(grobidAnalysisConfig);
            return prepareDocument(document);
        } finally {
            // Same arguments as before on both the success and the failure path:
            // first flag is false when no asset path is configured, true otherwise.
            boolean assetPathConfigured = grobidAnalysisConfig.getPdfAssetPath() != null;
            DocumentSource.close(documentSource, assetPathConfigured, true, true);
        }
    }

    /**
     * Computes the document statistics, builds the feature string and, when that string is
     * not blank, labels it and applies the resulting segmentation to the document.
     *
     * @param document an already tokenized document
     * @return the segmented document, or the input document when no features were produced
     * @throws GrobidException with status {@code TOO_MANY_TOKENS} when the token count exceeds
     *                         {@code GrobidProperties.getPdfTokensMax()}
     */
    public Document prepareDocument(Document document) {
        List<LayoutToken> tokenizations = document.getTokenizations();
        if (tokenizations.size() > GrobidProperties.getPdfTokensMax().intValue()) {
            throw new GrobidException("The document has " + tokenizations.size() + " tokens, but the limit is " + GrobidProperties.getPdfTokensMax(), GrobidExceptionStatus.TOO_MANY_TOKENS);
        }
        document.produceStatistics();
        String featured = getAllBlocksFeatured(document);
        if (StringUtils.isNotEmpty(StringUtils.trim(featured))) {
            document = BasicStructureBuilder.generalResultSegmentation(document, label(featured), tokenizations);
        }
        return document;
    }

    /**
     * Builds the feature string, using the pattern of the <em>first line</em> of each
     * page-boundary block to count repeated patterns.
     *
     * @param document the document to featurize
     * @return the concatenated feature vectors, or {@code null} when the document has no block
     * @throws GrobidException with status {@code TOO_MANY_BLOCKS} when the block count exceeds
     *                         {@code GrobidProperties.getPdfBlocksMax()}
     */
    public String getAllLinesFeatured(Document document) {
        return featurize(document, true);
    }

    /**
     * Builds the feature string, using the pattern of the <em>whole text</em> of each
     * page-boundary block to count repeated patterns.
     *
     * @param document the document to featurize
     * @return the concatenated feature vectors, or {@code null} when the document has no block
     * @throws GrobidException with status {@code TOO_MANY_BLOCKS} when the block count exceeds
     *                         {@code GrobidProperties.getPdfBlocksMax()}
     */
    public String getAllBlocksFeatured(Document document) {
        return featurize(document, false);
    }

    /**
     * Shared implementation of {@link #getAllLinesFeatured} and {@link #getAllBlocksFeatured}:
     * counts the patterns found on page-boundary blocks, then generates the feature vectors.
     *
     * @param firstLineOnly when true the pattern is computed on the first line of the block text,
     *                      otherwise on the complete block text
     */
    private String featurize(Document document, boolean firstLineOnly) {
        List<Block> blocks = document.getBlocks();
        if (blocks == null || blocks.isEmpty()) {
            return null;
        }
        if (blocks.size() > GrobidProperties.getPdfBlocksMax().intValue()) {
            throw new GrobidException("Postprocessed document is too big, contains: " + blocks.size(), GrobidExceptionStatus.TOO_MANY_BLOCKS);
        }

        // pattern -> number of occurrences on page-boundary blocks
        Map<String, Integer> patternCounts = new TreeMap<>();
        // pattern -> whether its first repetition has already been flagged in a feature vector
        Map<String, Boolean> patternFlagged = new TreeMap<>();

        for (Page page : document.getPages()) {
            List<Block> pageBlocks = page.getBlocks();
            if (pageBlocks == null || pageBlocks.isEmpty()) {
                continue;
            }
            for (int blockIndex = 0; blockIndex < pageBlocks.size(); blockIndex++) {
                if (!isPageBoundaryBlock(blockIndex, pageBlocks.size())) {
                    continue;
                }
                String text = pageBlocks.get(blockIndex).getText();
                if (text == null || text.length() == 0) {
                    continue;
                }
                String candidate = text;
                if (firstLineOnly) {
                    String[] lines = text.split("[\\n\\r]");
                    // split() yields an empty array when the text holds only line breaks
                    if (lines.length == 0) {
                        continue;
                    }
                    candidate = lines[0];
                }
                String pattern = this.featureFactory.getPattern(candidate);
                if (pattern.length() > MIN_PATTERN_LENGTH) {
                    Integer count = patternCounts.get(pattern);
                    if (count == null) {
                        // NOTE(review): this constant belongs to an unrelated analyzer utility and is
                        // presumably a decompilation artifact standing for the initial count - confirm
                        // its value before replacing it with a literal.
                        patternCounts.put(pattern, Integer.valueOf(EomiUtil.RESULT_SUCCESS));
                        patternFlagged.put(pattern, Boolean.FALSE);
                    } else {
                        patternCounts.put(pattern, Integer.valueOf(count.intValue() + 1));
                    }
                }
            }
        }
        return getFeatureVectorsAsString(document, patternCounts, patternFlagged);
    }

    /**
     * Tells whether a block index is one of those inspected for repeated patterns:
     * the first two indices of the page, or any index strictly greater than {@code blockCount - 2}.
     */
    private static boolean isPageBoundaryBlock(int blockIndex, int blockCount) {
        return blockIndex < 2 || blockIndex > blockCount - 2;
    }

    /** Returns the length of the longest string of the array, 0 for an empty array. */
    private static int longestLineLength(String[] lines) {
        int longest = 0;
        for (String line : lines) {
            if (line.length() > longest) {
                longest = line.length();
            }
        }
        return longest;
    }

    /**
     * Produces one feature vector per retained text line and concatenates their
     * {@code printVector()} output.
     *
     * <p>A line is retained when its first whitespace-delimited word is non-empty and
     * {@code TextUtilities.filterLine} does not reject the line. Each vector is printed only once
     * the next one exists, because a later line may still update its {@code pageStatus}.
     *
     * @param document the document whose pages and blocks are traversed
     * @param map      pattern to occurrence count; a count above 1 marks a line as repetitive
     * @param map2     pattern to "first repetition already flagged"; updated by this method
     * @return the concatenated feature vectors (empty string when no line was retained)
     */
    private String getFeatureVectorsAsString(Document document, Map<String, Integer> map, Map<String, Boolean> map2) {
        StringBuilder out = new StringBuilder();
        int documentLength = document.getDocumentLenghtChar();
        String currentFont = null;
        int currentFontSize = -1;
        // running token count over the document, scaled against documentLength
        int documentPosition = 0;
        FeaturesVectorMonograph previous = null;

        for (Page page : document.getPages()) {
            double pageHeight = page.getHeight();
            boolean newPage = true;
            // bottom coordinate of the last block that produced features on this page
            double previousBlockBottom = 0.0d;
            int pageLength = page.getPageLengthChar();
            BoundingBox mainArea = page.getMainArea();
            // running token count over the page, scaled against pageLength
            int pagePosition = 0;

            List<Block> pageBlocks = page.getBlocks();
            if (pageBlocks == null || pageBlocks.isEmpty()) {
                continue;
            }

            for (int blockIndex = 0; blockIndex < pageBlocks.size(); blockIndex++) {
                Block block = pageBlocks.get(blockIndex);
                boolean lastPageBlock = blockIndex == pageBlocks.size() - 1;
                boolean firstPageBlock = blockIndex == 0;

                boolean vectorAround = false;
                boolean bitmapAround = false;
                List<GraphicObject> graphics = Document.getConnectedGraphics(block, document);
                if (graphics != null) {
                    for (GraphicObject graphic : graphics) {
                        if (graphic.getType() == GraphicObjectType.BITMAP) {
                            bitmapAround = true;
                        }
                        if (graphic.getType() == GraphicObjectType.VECTOR) {
                            vectorAround = true;
                        }
                    }
                }

                // When the block starts above the previous block's bottom, fall back to a
                // fixed fraction of the document's maximum block spacing.
                double spacingPreviousBlock = previousBlockBottom > block.getY()
                        ? document.getMaxBlockSpacing() / 5.0d
                        : block.getY() - previousBlockBottom;

                String text = block.getText();
                if (text == null) {
                    continue;
                }

                // characters per unit of block area; stays 0 for degenerate or placeholder blocks
                double density = 0.0d;
                if (block.getHeight() != 0.0d && block.getWidth() != 0.0d
                        && !text.contains("@PAGE") && !text.contains("@IMAGE")) {
                    density = text.length() / (block.getHeight() * block.getWidth());
                }

                BoundingBox blockArea = BoundingBox.fromPointAndDimensions(page.getNumber(), block.getX(), block.getY(), block.getWidth(), block.getHeight());
                boolean inMainArea = mainArea != null
                        && (mainArea.contains(blockArea) || mainArea.intersect(blockArea));

                String[] lines = text.split("[\\n\\r]");
                int maxLineLength = longestLineLength(lines);

                List<LayoutToken> tokens = block.getTokens();
                if (tokens == null || tokens.isEmpty()) {
                    continue;
                }

                // Every line of the block takes its layout information from the block's first token.
                LayoutToken firstToken = tokens.get(0);
                boolean boundaryBlock = isPageBoundaryBlock(blockIndex, pageBlocks.size());

                for (int lineIndex = 0; lineIndex < lines.length; lineIndex++) {
                    String line = lines[lineIndex];
                    double lineY = firstToken.getY();

                    FeaturesVectorMonograph features = new FeaturesVectorMonograph();
                    features.token = firstToken;
                    features.line = line;

                    // Done before the line filter below, so map2 is updated even for skipped lines.
                    if (boundaryBlock) {
                        String pattern = this.featureFactory.getPattern(line);
                        Integer occurrences = map.get(pattern);
                        if (occurrences != null && occurrences.intValue() > 1) {
                            features.repetitivePattern = true;
                            Boolean alreadyFlagged = map2.get(pattern);
                            if (alreadyFlagged != null && !alreadyFlagged.booleanValue()) {
                                features.firstRepetitivePattern = true;
                                map2.put(pattern, Boolean.TRUE);
                            }
                        }
                    }

                    StringTokenizer tokenizer = new StringTokenizer(line, " \t");
                    String firstWord = tokenizer.hasMoreTokens() ? tokenizer.nextToken() : null;
                    String secondWord = tokenizer.hasMoreTokens() ? tokenizer.nextToken() : null;
                    if (firstWord == null) {
                        continue;
                    }
                    String word = firstWord.replaceAll("[ \n]", "").trim();
                    if (word.length() == 0 || TextUtilities.filterLine(line)) {
                        continue;
                    }

                    features.string = word;
                    features.secondString = secondWord;
                    features.firstPageBlock = firstPageBlock;
                    features.lastPageBlock = lastPageBlock;
                    features.lineLength = this.featureFactory.linearScaling(line.length(), maxLineLength, LINESCALE);
                    features.punctuationProfile = TextUtilities.punctuationProfile(line);
                    if (bitmapAround) {
                        features.bitmapAround = true;
                    }
                    if (vectorAround) {
                        features.vectorAround = true;
                    }
                    features.lineStatus = null;
                    features.punctType = null;

                    if (lineIndex == 0 || (previous != null && "BLOCKEND".equals(previous.blockStatus))) {
                        features.blockStatus = "BLOCKSTART";
                    } else if (lineIndex == lines.length - 1) {
                        features.blockStatus = "BLOCKEND";
                    } else if (features.blockStatus == null) {
                        features.blockStatus = "BLOCKIN";
                    }

                    if (newPage) {
                        features.pageStatus = "PAGESTART";
                        // the vector still pending from the previous page closes that page
                        if (previous != null) {
                            previous.pageStatus = "PAGEEND";
                        }
                    } else {
                        features.pageStatus = "PAGEIN";
                    }
                    newPage = false;

                    if (word.length() == 1) {
                        features.singleChar = true;
                    }
                    if (Character.isUpperCase(word.charAt(0))) {
                        features.capitalisation = "INITCAP";
                    }
                    if (this.featureFactory.test_all_capital(word)) {
                        features.capitalisation = "ALLCAP";
                    }
                    if (FeatureFactory.test_digit(word)) {
                        features.digit = "CONTAINSDIGITS";
                    }
                    if (this.featureFactory.test_common(word)) {
                        features.commonName = true;
                    }
                    if (this.featureFactory.test_names(word)) {
                        features.properName = true;
                    }
                    if (this.featureFactory.test_month(word)) {
                        features.month = true;
                    }
                    if (this.featureFactory.isDigit.matcher(word).find()) {
                        features.digit = "ALLDIGIT";
                    }
                    if (this.featureFactory.year.matcher(word).find()) {
                        features.year = true;
                    }
                    if (this.featureFactory.email.matcher(word).find()) {
                        features.email = true;
                    }
                    if (this.featureFactory.http.matcher(word).find()) {
                        features.http = true;
                    }

                    if (currentFont != null && currentFont.equals(firstToken.getFont())) {
                        features.fontStatus = "SAMEFONT";
                    } else {
                        currentFont = firstToken.getFont();
                        features.fontStatus = "NEWFONT";
                    }

                    int fontSize = (int) firstToken.getFontSize();
                    if (currentFontSize == -1) {
                        currentFontSize = fontSize;
                        features.fontSize = "HIGHERFONT";
                    } else if (currentFontSize == fontSize) {
                        features.fontSize = "SAMEFONTSIZE";
                    } else if (currentFontSize < fontSize) {
                        features.fontSize = "HIGHERFONT";
                        currentFontSize = fontSize;
                    } else {
                        features.fontSize = "LOWERFONT";
                        currentFontSize = fontSize;
                    }

                    if (firstToken.getBold()) {
                        features.bold = true;
                    }
                    if (firstToken.getItalic()) {
                        features.italic = true;
                    }
                    if (features.capitalisation == null) {
                        features.capitalisation = "NOCAPS";
                    }
                    if (features.digit == null) {
                        features.digit = "NODIGIT";
                    }

                    features.relativeDocumentPosition = this.featureFactory.linearScaling(documentPosition, documentLength, NBBINS_POSITION);
                    features.relativePagePositionChar = this.featureFactory.linearScaling(pagePosition, pageLength, NBBINS_POSITION);
                    int verticalBin = this.featureFactory.linearScaling(lineY, pageHeight, NBBINS_POSITION);
                    if (verticalBin > NBBINS_POSITION) {
                        verticalBin = NBBINS_POSITION;
                    }
                    features.relativePagePosition = verticalBin;

                    if (spacingPreviousBlock != 0.0d) {
                        features.spacingWithPreviousBlock = this.featureFactory.linearScaling(spacingPreviousBlock - document.getMinBlockSpacing(), document.getMaxBlockSpacing() - document.getMinBlockSpacing(), NBBINS_SPACE);
                    }
                    features.inMainArea = inMainArea;
                    if (density != -1.0d) {
                        features.characterDensity = this.featureFactory.linearScaling(density - document.getMinCharacterDensity(), document.getMaxCharacterDensity() - document.getMinCharacterDensity(), NBBINS_DENSITY);
                    }

                    if (previous != null) {
                        out.append(previous.printVector());
                    }
                    previous = features;
                }

                previousBlockBottom = block.getY() + block.getHeight();
                pagePosition += tokens.size();
                documentPosition += tokens.size();
            }
        }

        if (previous != null) {
            out.append(previous.printVector());
        }
        return out.toString();
    }

    /**
     * Tokenizes a PDF and writes its raw token texts, wrapped in a minimal TEI skeleton, to
     * {@code <str2>/<pdf name with ".pdf" replaced by "training.monograph.tei.xml">}.
     *
     * <p>The PDF document source is always closed before this method returns or throws, and
     * the output writer is always closed.
     *
     * @param file the PDF file to process
     * @param str  not used by the current implementation (the previous code only built an
     *             unused {@code File} from it)
     * @param str2 directory in which the TEI file is written
     * @param i    identifier written as the {@code xml:id} of the TEI {@code fileDesc}
     * @return the tokenized document
     * @throws GrobidResourceException when the temp path is null or does not exist
     * @throws GrobidException         wrapping any exception raised during generation,
     *                                 including a missing input file or empty PDF content
     */
    public Document createTrainingFromPDF(File file, String str, String str2, int i) {
        if (this.tmpPath == null) {
            throw new GrobidResourceException("Cannot process pdf file, because temp path is null.");
        }
        if (!this.tmpPath.exists()) {
            throw new GrobidResourceException("Cannot process pdf file, because temp path '" + this.tmpPath.getAbsolutePath() + "' does not exists.");
        }

        // Declared outside the try block so that the finally clause can release it on failure.
        DocumentSource documentSource = null;
        try {
            if (!file.exists()) {
                throw new GrobidResourceException("Cannot train for monograph, because file '" + file.getAbsolutePath() + "' does not exists.");
            }
            String pdfFileName = file.getName();
            File outputTeiFile = new File(str2 + TextUtilities.SLASH + pdfFileName.replace(".pdf", "training.monograph.tei.xml"));

            documentSource = DocumentSource.fromPdf(file, -1, -1, true, true, true);
            Document document = new Document(documentSource);
            document.addTokenizedDocument(GrobidAnalysisConfig.defaultInstance());
            if (document.getBlocks() == null) {
                throw new Exception("PDF parsing resulted in empty content");
            }
            document.produceStatistics();

            StringBuilder tei = new StringBuilder();
            tei.append("<?xml version=\"1.0\" ?>\n<tei xml:space=\"preserve\">\n\t<teiHeader>\n\t\t<fileDesc xml:id=\"" + i + "\"/>\n\t</teiHeader>\n\t<text xml:lang=\"" + Language.EN + "\">\n");

            // The outline is not used for the output. The call is kept in case it has side
            // effects on the returned document - NOTE(review): confirm, then drop if it has none.
            document.getOutlineRoot();

            // NOTE(review): token text is written without XML escaping, as before.
            for (LayoutToken token : document.getTokenizations()) {
                tei.append(token.getText());
            }
            tei.append("\t</text>\n</tei>");

            try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(outputTeiFile, false), "UTF-8")) {
                writer.write(tei.toString());
            }
            return document;
        } catch (Exception e) {
            LOGGER.error("An exception occured while running Grobid training data generation for monograph.", e);
            throw new GrobidException("An exception occured while running Grobid training data generation for monograph.", e);
        } finally {
            DocumentSource.close(documentSource, true, true, true);
        }
    }
}
