package org.codelibs.fess.crawler.extractor.impl;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSInputStream;
import org.apache.pdfbox.io.RandomAccessReadBuffer;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
import org.apache.pdfbox.pdmodel.common.filespecification.PDFileSpecification;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment;
import org.apache.pdfbox.text.PDFTextStripper;
import org.codelibs.core.lang.ThreadUtil;
import org.codelibs.fess.crawler.entity.ExtractData;
import org.codelibs.fess.crawler.exception.CrawlerSystemException;
import org.codelibs.fess.crawler.exception.ExtractException;
import org.codelibs.fess.crawler.extractor.Extractor;
import org.codelibs.fess.crawler.extractor.ExtractorFactory;
import org.codelibs.fess.crawler.helper.MimeTypeHelper;

/* loaded from: input_file:org/codelibs/fess/crawler/extractor/impl/PdfExtractor.class */
/**
 * Extracts text and document-information metadata from a PDF using PDFBox.
 *
 * <p>Text extraction runs on a separate worker thread so that it can be
 * abandoned when it does not finish within {@link #getTimeout()} milliseconds.
 * Besides the page text, the content of embedded files (from the document's
 * name dictionary and from file-attachment annotations) is appended when an
 * extractor is registered for the attachment's content type.</p>
 */
public class PdfExtractor extends PasswordBasedExtractor {
    private static final Logger logger = LogManager.getLogger(PdfExtractor.class);

    /** Maximum number of interrupt attempts made against a worker that overran the timeout. */
    private static final int INTERRUPT_ATTEMPTS = 100;

    /** Pause between two interrupt attempts, in milliseconds. */
    private static final long INTERRUPT_INTERVAL_MILLIS = 100L;

    /** Maximum time to wait for the extraction thread, in milliseconds (passed to {@link Thread#join(long)}). */
    protected long timeout = 30000;

    /** Whether the extraction thread is started as a daemon thread. */
    protected boolean isDaemonThread = false;

    /**
     * Loads the PDF from the given stream and returns its text and metadata.
     *
     * @param inputStream the PDF content; must not be {@code null}
     * @param map extraction parameters, handed to {@code getPassword} to resolve the document password
     * @return the extracted text together with the document-information metadata
     * @throws CrawlerSystemException if {@code inputStream} is {@code null}
     * @throws ExtractException if loading or extraction fails, the worker does not finish
     *         within the timeout, or the calling thread is interrupted while waiting
     */
    @Override
    public ExtractData getText(InputStream inputStream, Map<String, String> map) {
        if (inputStream == null) {
            throw new CrawlerSystemException("The inputstream is null.");
        }
        // try-with-resources guarantees the document is released on every exit path,
        // including the timeout and failure paths.
        try (PDDocument document = Loader.loadPDF(new RandomAccessReadBuffer(inputStream), getPassword(map))) {
            final StringWriter writer = new StringWriter();
            final PDFTextStripper stripper = new PDFTextStripper();
            final AtomicBoolean finished = new AtomicBoolean(false);
            final Set<Exception> failures = new HashSet<>();

            final Thread worker = new Thread(() -> {
                try {
                    stripper.writeText(document, writer);
                    extractEmbeddedDocuments(document, writer);
                    extractAnnotations(document, writer);
                } catch (final Exception e) {
                    // Recorded before the flag is set so the waiting thread sees it.
                    failures.add(e);
                } finally {
                    finished.set(true);
                }
            }, Thread.currentThread().getName() + "-pdf");
            worker.setDaemon(this.isDaemonThread);
            worker.start();

            try {
                worker.join(this.timeout);
            } catch (final InterruptedException e) {
                // Ask the worker to stop and keep the caller's interrupt status visible;
                // the exception itself is wrapped by the outer handler.
                worker.interrupt();
                Thread.currentThread().interrupt();
                throw e;
            }

            if (!finished.get()) {
                // The worker overran the timeout: keep interrupting it until it gives up
                // or the retry budget is exhausted.
                for (int attempt = 0; attempt < INTERRUPT_ATTEMPTS && !finished.get(); attempt++) {
                    worker.interrupt();
                    ThreadUtil.sleep(INTERRUPT_INTERVAL_MILLIS);
                }
                throw new ExtractException("PDFBox process cannot finish in " + this.timeout + " ms.");
            }
            if (!failures.isEmpty()) {
                throw failures.iterator().next();
            }

            writer.flush();
            final ExtractData extractData = new ExtractData(writer.toString());
            extractMetadata(document, extractData);
            return extractData;
        } catch (final Exception e) {
            throw new ExtractException(e);
        }
    }

    /**
     * Appends the text of files attached through file-attachment annotations.
     *
     * <p>A page whose annotations cannot be read is logged and skipped; the
     * remaining pages are still processed.</p>
     *
     * @param pDDocument the document whose pages are scanned
     * @param stringWriter the writer that receives the extracted attachment text
     */
    protected void extractAnnotations(PDDocument pDDocument, StringWriter stringWriter) {
        for (final PDPage page : pDDocument.getPages()) {
            try {
                for (final PDAnnotation annotation : page.getAnnotations()) {
                    if (!(annotation instanceof PDAnnotationFileAttachment)) {
                        continue;
                    }
                    final PDFileSpecification spec = ((PDAnnotationFileAttachment) annotation).getFile();
                    if (spec instanceof PDComplexFileSpecification) {
                        final PDComplexFileSpecification complexSpec = (PDComplexFileSpecification) spec;
                        extractFile(complexSpec.getFilename(), getEmbeddedFile(complexSpec), stringWriter);
                    }
                }
            } catch (final IOException e) {
                logger.warn("Failed to parse annotation.", e);
            }
        }
    }

    /**
     * Extracts the text of one embedded file and appends it, followed by a newline.
     *
     * <p>Nothing is written when no content type can be derived from the file name,
     * when no extractor is registered for that content type, or when there is no
     * embedded file. Failures of the nested extractor are logged at debug level
     * and otherwise ignored, so a broken attachment does not fail the whole PDF.</p>
     *
     * @param str the attachment's file name, used to detect the content type and
     *        passed to the nested extractor as {@code resourceName}
     * @param pDEmbeddedFile the embedded file to read; may be {@code null}
     * @param stringWriter the writer that receives the extracted text
     */
    protected void extractFile(String str, PDEmbeddedFile pDEmbeddedFile, StringWriter stringWriter) {
        final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper();
        final ExtractorFactory extractorFactory = getExtractorFactory();
        final String contentType = mimeTypeHelper.getContentType((InputStream) null, str);
        if (contentType == null) {
            return;
        }
        final Extractor extractor = extractorFactory.getExtractor(contentType);
        if (extractor == null || pDEmbeddedFile == null) {
            return;
        }
        try (COSInputStream in = pDEmbeddedFile.createInputStream()) {
            final Map<String, String> params = new HashMap<>();
            params.put("resourceName", str);
            stringWriter.write(extractor.getText(in, params).getContent());
            stringWriter.write('\n');
        } catch (final Exception e) {
            logger.debug("Exception in an internal extractor.", e);
        }
    }

    /**
     * Appends the text of files listed in the document's embedded-files name tree.
     *
     * <p>The names of the root node are used when present; otherwise the names of
     * each direct child node are processed.</p>
     *
     * @param pDDocument the document whose name dictionary is read
     * @param stringWriter the writer that receives the extracted attachment text
     */
    protected void extractEmbeddedDocuments(PDDocument pDDocument, StringWriter stringWriter) {
        final PDDocumentNameDictionary nameDictionary = new PDDocumentNameDictionary(pDDocument.getDocumentCatalog());
        final PDEmbeddedFilesNameTreeNode embeddedFiles = nameDictionary.getEmbeddedFiles();
        if (embeddedFiles == null) {
            return;
        }
        try {
            final Map<String, PDComplexFileSpecification> names = embeddedFiles.getNames();
            if (names != null) {
                processEmbeddedDocNames(names, stringWriter);
                return;
            }
            final List<PDNameTreeNode<PDComplexFileSpecification>> kids = embeddedFiles.getKids();
            if (kids == null) {
                return;
            }
            for (final PDNameTreeNode<PDComplexFileSpecification> kid : kids) {
                processEmbeddedDocNames(kid.getNames(), stringWriter);
            }
        } catch (final IOException e) {
            logger.warn("Failed to parse embedded documents.", e);
        }
    }

    /**
     * Extracts every embedded file referenced by the given name map.
     *
     * @param map attachment name to file specification; {@code null}, empty maps
     *        and {@code null} values are skipped
     * @param stringWriter the writer that receives the extracted attachment text
     */
    protected void processEmbeddedDocNames(Map<String, PDComplexFileSpecification> map, StringWriter stringWriter) {
        if (map == null || map.isEmpty()) {
            return;
        }
        for (final Map.Entry<String, PDComplexFileSpecification> entry : map.entrySet()) {
            final PDComplexFileSpecification spec = entry.getValue();
            if (spec != null) {
                extractFile(entry.getKey(), getEmbeddedFile(spec), stringWriter);
            }
        }
    }

    /**
     * Picks the embedded file of a file specification, trying the Unicode, DOS,
     * Mac, Unix and finally the generic entry, in that order.
     *
     * @param pDComplexFileSpecification the file specification; may be {@code null}
     * @return the first non-null embedded file, or {@code null} if there is none
     */
    protected PDEmbeddedFile getEmbeddedFile(PDComplexFileSpecification pDComplexFileSpecification) {
        if (pDComplexFileSpecification == null) {
            return null;
        }
        PDEmbeddedFile embeddedFile = pDComplexFileSpecification.getEmbeddedFileUnicode();
        if (embeddedFile == null) {
            embeddedFile = pDComplexFileSpecification.getEmbeddedFileDos();
        }
        if (embeddedFile == null) {
            embeddedFile = pDComplexFileSpecification.getEmbeddedFileMac();
        }
        if (embeddedFile == null) {
            embeddedFile = pDComplexFileSpecification.getEmbeddedFileUnix();
        }
        if (embeddedFile == null) {
            embeddedFile = pDComplexFileSpecification.getEmbeddedFile();
        }
        return embeddedFile;
    }

    /**
     * Copies the document-information entries into the extraction result.
     *
     * @param pDDocument the document whose information dictionary is read
     * @param extractData the result that receives one value per metadata key
     */
    protected void extractMetadata(PDDocument pDDocument, ExtractData extractData) {
        final PDDocumentInformation info = pDDocument.getDocumentInformation();
        if (info == null) {
            return;
        }
        for (final String key : info.getMetadataKeys()) {
            addMetadata(extractData, key, info.getCustomMetadataValue(key));
        }
    }

    /**
     * Stores a metadata value unless it is {@code null}.
     *
     * @param extractData the result to update
     * @param str the metadata key
     * @param str2 the metadata value; ignored when {@code null}
     */
    protected void addMetadata(ExtractData extractData, String str, String str2) {
        if (str2 != null) {
            extractData.putValue(str, str2);
        }
    }

    /**
     * Returns the extraction timeout.
     *
     * @return the timeout in milliseconds
     */
    public long getTimeout() {
        return this.timeout;
    }

    /**
     * Sets the extraction timeout.
     *
     * @param j the timeout in milliseconds, as accepted by {@link Thread#join(long)}
     */
    public void setTimeout(long j) {
        this.timeout = j;
    }

    /**
     * Sets whether the extraction thread runs as a daemon thread.
     *
     * @param z {@code true} to start the worker as a daemon thread
     */
    public void setDaemonThread(boolean z) {
        this.isDaemonThread = z;
    }
}
