package org.codelibs.fess.crawler.extractor.impl;

import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
import jakarta.annotation.Resource;
import java.io.BufferedInputStream;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;
import javax.xml.xpath.XPathNodes;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.fess.crawler.entity.ExtractData;
import org.codelibs.fess.crawler.exception.CrawlerSystemException;
import org.codelibs.fess.crawler.exception.ExtractException;
import org.codelibs.fess.crawler.util.XPathAPI;
import org.codelibs.nekohtml.parsers.DOMParser;
import org.w3c.dom.Document;
import org.xml.sax.InputSource;

/**
 * Extracts text from HTML content by parsing it into a DOM and collecting the
 * text content of every node matched by a configurable XPath expression
 * ({@link #targetNodePath}).
 *
 * <p>Parser features and properties can be customized through
 * {@link #addFeature(String, String)} and {@link #addProperty(String, String)}.
 * {@link #init()} must be invoked before {@link #getText(InputStream, Map)} so
 * that the {@link XPathAPI} cache exists.</p>
 */
public class HtmlXpathExtractor extends AbstractXmlExtractor {
    /** Cache of {@link XPathAPI} instances keyed by the name of the requesting thread; built in {@link #init()}. */
    protected LoadingCache<String, XPathAPI> xpathAPICache;

    /** Pattern matching a {@code <meta ... content="...; charset=XXX">} tag; group 1 captures the charset name. */
    protected Pattern metaCharsetPattern = Pattern.compile("<meta.*content\\s*=\\s*['\"].*;\\s*charset=([\\w\\d\\-_]*)['\"]\\s*/?>",
            Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);

    /** Parser features applied in {@link #getDomParser()}; a value enables the feature only when it equals "true" (ignoring case). */
    protected Map<String, String> featureMap = new HashMap<>();

    /** Parser properties applied in {@link #getDomParser()}. */
    protected Map<String, String> propertyMap = new HashMap<>();

    /** XPath expression selecting the nodes whose text content is extracted. */
    protected String targetNodePath = "//HTML/BODY | //@alt | //@title";

    /** Expire-after-access duration of the {@link XPathAPI} cache, in minutes. */
    protected long cacheDuration = 10;

    /**
     * Builds the {@link XPathAPI} cache using the current {@link #cacheDuration}.
     * Entries expire {@code cacheDuration} minutes after their last access.
     *
     * <p>NOTE(review): this initializer is annotated with {@code @Resource};
     * confirm that the DI container in use actually invokes it on startup.</p>
     */
    @Resource
    public void init() {
        // CacheLoader only offers a no-arg constructor; the anonymous subclass
        // captures the enclosing instance implicitly.
        this.xpathAPICache = CacheBuilder.newBuilder()
                .expireAfterAccess(this.cacheDuration, TimeUnit.MINUTES)
                .build(new CacheLoader<String, XPathAPI>() {
                    @Override
                    public XPathAPI load(String key) {
                        if (AbstractXmlExtractor.logger.isDebugEnabled()) {
                            AbstractXmlExtractor.logger.debug("created XPathAPI by {}", key);
                        }
                        return new XPathAPI();
                    }
                });
    }

    /**
     * Parses the given HTML stream and returns the text content of all nodes
     * matched by {@link #targetNodePath}, joined by single spaces with runs of
     * whitespace collapsed and the result trimmed.
     *
     * @param inputStream the HTML content; must not be {@code null}. The stream
     *        is wrapped in a {@link BufferedInputStream} and is not closed here.
     * @param map extraction parameters; not used by this implementation
     * @return the extracted text wrapped in an {@link ExtractData}
     * @throws CrawlerSystemException if {@code inputStream} is {@code null}
     * @throws ExtractException if encoding detection, parsing or XPath
     *         evaluation fails for any reason (the cause is preserved)
     */
    @Override
    public ExtractData getText(InputStream inputStream, Map<String, String> map) {
        if (inputStream == null) {
            throw new CrawlerSystemException("The inputstream is null.");
        }
        try {
            final BufferedInputStream bufferedStream = new BufferedInputStream(inputStream);
            // getEncoding is inherited; presumably it sniffs the charset using
            // getEncodingPattern() — verify against AbstractXmlExtractor.
            final String encoding = getEncoding(bufferedStream);

            final DOMParser parser = getDomParser();
            final InputSource source = new InputSource(bufferedStream);
            source.setEncoding(encoding);
            parser.parse(source);
            final Document document = parser.getDocument();

            final StringBuilder text = new StringBuilder(255);
            final XPathNodes nodes = getXPathAPI().selectNodeList(document, this.targetNodePath);
            for (int i = 0; i < nodes.size(); i++) {
                text.append(nodes.get(i).getTextContent()).append(' ');
            }
            // Collapse every whitespace run into one space before trimming.
            return new ExtractData(text.toString().replaceAll("\\s+", " ").trim());
        } catch (Exception e) {
            throw new ExtractException(e);
        }
    }

    /**
     * Returns the cached {@link XPathAPI} for the current thread's name,
     * creating it through the cache loader when absent. If the cache lookup
     * fails, the failure is logged at debug level and a fresh, uncached
     * instance is returned instead.
     *
     * @return an {@link XPathAPI} instance, never {@code null}
     */
    protected XPathAPI getXPathAPI() {
        try {
            return this.xpathAPICache.get(Thread.currentThread().getName());
        } catch (ExecutionException e) {
            if (logger.isDebugEnabled()) {
                logger.debug("Failed to retrieval a cache.", e);
            }
            return new XPathAPI();
        }
    }

    /**
     * Creates a new {@link DOMParser} and applies every entry of
     * {@link #featureMap} and {@link #propertyMap} to it.
     *
     * @return the configured parser
     * @throws CrawlerSystemException if applying a feature or property fails
     */
    protected DOMParser getDomParser() {
        final DOMParser parser = new DOMParser();
        try {
            for (Map.Entry<String, String> feature : this.featureMap.entrySet()) {
                // Any value other than "true" (case-insensitive) disables the feature.
                parser.setFeature(feature.getKey(), "true".equalsIgnoreCase(feature.getValue()));
            }
            for (Map.Entry<String, String> property : this.propertyMap.entrySet()) {
                parser.setProperty(property.getKey(), property.getValue());
            }
            return parser;
        } catch (Exception e) {
            throw new CrawlerSystemException("Invalid parser configuration.", e);
        }
    }

    /**
     * Returns the pattern used to locate the charset declaration.
     *
     * @return the current {@link #metaCharsetPattern}
     */
    @Override
    protected Pattern getEncodingPattern() {
        return this.metaCharsetPattern;
    }

    /**
     * This extractor supplies no tag pattern.
     *
     * @return always {@code null}
     */
    @Override
    protected Pattern getTagPattern() {
        return null;
    }

    /**
     * Registers a parser feature to be applied by {@link #getDomParser()}.
     *
     * @param str the feature name; must not be blank
     * @param str2 the feature value; must not be blank
     * @throws CrawlerSystemException if either argument is blank
     */
    public void addFeature(String str, String str2) {
        if (StringUtil.isBlank(str) || StringUtil.isBlank(str2)) {
            throw new CrawlerSystemException("key or value is null.");
        }
        this.featureMap.put(str, str2);
    }

    /**
     * Registers a parser property to be applied by {@link #getDomParser()}.
     *
     * @param str the property name; must not be blank
     * @param str2 the property value; must not be blank
     * @throws CrawlerSystemException if either argument is blank
     */
    public void addProperty(String str, String str2) {
        if (StringUtil.isBlank(str) || StringUtil.isBlank(str2)) {
            throw new CrawlerSystemException("key or value is null.");
        }
        this.propertyMap.put(str, str2);
    }

    /**
     * @return the live (not copied) map of parser features
     */
    public Map<String, String> getFeatureMap() {
        return this.featureMap;
    }

    /**
     * @param map the map of parser features to use; stored by reference
     */
    public void setFeatureMap(Map<String, String> map) {
        this.featureMap = map;
    }

    /**
     * @return the live (not copied) map of parser properties
     */
    public Map<String, String> getPropertyMap() {
        return this.propertyMap;
    }

    /**
     * @param map the map of parser properties to use; stored by reference
     */
    public void setPropertyMap(Map<String, String> map) {
        this.propertyMap = map;
    }

    /**
     * @return the pattern used to locate the charset declaration
     */
    public Pattern getMetaCharsetPattern() {
        return this.metaCharsetPattern;
    }

    /**
     * @param pattern the pattern used to locate the charset declaration
     */
    public void setMetaCharsetPattern(Pattern pattern) {
        this.metaCharsetPattern = pattern;
    }

    /**
     * @return the XPath expression selecting the nodes to extract text from
     */
    public String getTargetNodePath() {
        return this.targetNodePath;
    }

    /**
     * @param str the XPath expression selecting the nodes to extract text from
     */
    public void setTargetNodePath(String str) {
        this.targetNodePath = str;
    }

    /**
     * Sets the cache expiry in minutes. The value is read when {@link #init()}
     * builds the cache, so it only takes effect if set before {@code init()}.
     *
     * @param j the expire-after-access duration in minutes
     */
    public void setCacheDuration(long j) {
        this.cacheDuration = j;
    }
}
