package org.codelibs.fess.crawler.transformer.impl;

import jakarta.annotation.Resource;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;
import javax.xml.xpath.XPathException;
import javax.xml.xpath.XPathNodes;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.codelibs.core.io.InputStreamUtil;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.core.misc.Pair;
import org.codelibs.fess.crawler.Constants;
import org.codelibs.fess.crawler.builder.RequestDataBuilder;
import org.codelibs.fess.crawler.container.CrawlerContainer;
import org.codelibs.fess.crawler.entity.AccessResultData;
import org.codelibs.fess.crawler.entity.RequestData;
import org.codelibs.fess.crawler.entity.ResponseData;
import org.codelibs.fess.crawler.entity.ResultData;
import org.codelibs.fess.crawler.exception.CrawlerSystemException;
import org.codelibs.fess.crawler.exception.CrawlingAccessException;
import org.codelibs.fess.crawler.helper.EncodingHelper;
import org.codelibs.fess.crawler.helper.UrlConvertHelper;
import org.codelibs.fess.crawler.util.CharUtil;
import org.codelibs.fess.crawler.util.XPathAPI;
import org.codelibs.nekohtml.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.xml.sax.InputSource;

/* loaded from: input_file:org/codelibs/fess/crawler/transformer/impl/HtmlTransformer.class */
/**
 * Transformer for HTML responses.
 *
 * <p>{@link #transform(ResponseData)} detects the character set of the response body,
 * stores the raw body bytes in the {@link ResultData}, and, for HTML documents that may
 * be followed, extracts child URLs using the configured XPath/attribute rules
 * ({@link #addChildUrlRule(String, String)}).</p>
 *
 * <p>An {@link XPathAPI} instance is kept per thread while a transformation is running
 * and is removed when {@code transform} finishes.</p>
 */
public class HtmlTransformer extends AbstractTransformer {
    private static final Logger logger = LogManager.getLogger(HtmlTransformer.class);

    /** Key looked up in the response meta data map to find a redirect target. */
    protected static final String LOCATION_HEADER = "Location";

    /** Extracts the charset name from a {@code ...; charset=xxx} declaration. */
    private static final Pattern CHARSET_PATTERN = Pattern.compile("; *charset *= *([a-zA-Z0-9\\-_]+)", Pattern.CASE_INSENSITIVE);

    /** Matches a {@code ;jsessionid=...} path parameter. */
    private static final Pattern JSESSIONID_PATTERN = Pattern.compile(";jsessionid=[a-zA-Z0-9\\.]*");

    /**
     * Matches a path segment followed by {@code /../}. The lookbehind stops the match from
     * starting at the second slash of {@code "scheme://"}, so the authority (host) is never
     * treated as a removable path segment.
     */
    private static final Pattern PARENT_SEGMENT_PATTERN = Pattern.compile("(?<!:/)/[^/]+/\\.\\./");

    /** Matches a run of slashes that is not directly preceded by a colon. */
    private static final Pattern DUPLICATE_SLASH_PATTERN = Pattern.compile("([^:])/+");

    /** Container used to look up the {@code urlConvertHelper} and {@code encodingHelper} components. */
    @Resource
    protected CrawlerContainer crawlerContainer;

    /** Encoding applied when none is declared in the content and the response has none set. */
    protected String defaultEncoding;

    /** Parser features (name to {@code "true"}/other) applied in {@link #getDomParser()}. */
    protected Map<String, String> featureMap = new HashMap<>();

    /** Parser properties applied in {@link #getDomParser()}. */
    protected Map<String, String> propertyMap = new HashMap<>();

    /** Child URL rules: XPath expression to attribute name, in insertion order. */
    protected Map<String, String> childUrlRuleMap = new LinkedHashMap<>();

    /** Number of leading bytes inspected when looking for a charset declaration. */
    protected int preloadSizeForCharset = 2048;

    /** Attribute values matching this pattern are not treated as crawlable URLs. */
    protected Pattern invalidUrlPattern = Pattern.compile("^\\s*javascript:|^\\s*mailto:|^\\s*irc:|^\\s*skype:|^\\s*about:|^\\s*fscommand:|^\\s*aim:|^\\s*msnim:|^\\s*news:|^\\s*tel:|^\\s*unsaved:|^\\s*data:|^\\s*android-app:|^\\s*ios-app:|^\\s*callto:", Pattern.CASE_INSENSITIVE);

    /** Per-thread XPath helper; created lazily and removed at the end of {@link #transform(ResponseData)}. */
    private final ThreadLocal<XPathAPI> xpathAPI = new ThreadLocal<>();

    /**
     * Transforms a response into a {@link ResultData} holding the body bytes, its encoding,
     * the extracted child URLs and, if present, the URL found under {@link #LOCATION_HEADER}.
     *
     * @param responseData the response to transform
     * @return the populated result data
     * @throws CrawlingAccessException if {@code responseData} is null or has no body
     * @throws CrawlerSystemException if the body cannot be read or parsed
     */
    @Override
    public ResultData transform(ResponseData responseData) {
        if (responseData == null || !responseData.hasResponseBody()) {
            throw new CrawlingAccessException("No response body.");
        }
        updateCharset(responseData);
        ResultData resultData = new ResultData();
        resultData.setTransformerName(getName());
        try {
            storeData(responseData, resultData);
            if (isHtml(responseData) && !responseData.isNoFollow()) {
                storeChildUrls(responseData, resultData);
            }
            Object location = responseData.getMetaDataMap().get(LOCATION_HEADER);
            if (location instanceof String) {
                UrlConvertHelper urlConvertHelper = (UrlConvertHelper) this.crawlerContainer.getComponent("urlConvertHelper");
                String redirectUrl;
                if (urlConvertHelper != null) {
                    redirectUrl = urlConvertHelper.convert(location.toString());
                } else {
                    logger.warn("urlConvertHelper is null.");
                    redirectUrl = location.toString();
                }
                resultData.addUrl(RequestDataBuilder.newRequestData().get().url(redirectUrl).build());
            }
            return resultData;
        } finally {
            // Do not keep the per-thread XPath helper alive beyond this transformation.
            this.xpathAPI.remove();
        }
    }

    /**
     * Checks whether the response is an HTML or XHTML document.
     *
     * @param responseData the response to inspect
     * @return {@code true} if the MIME type is {@code text/html} or {@code application/xhtml+xml}
     */
    protected boolean isHtml(ResponseData responseData) {
        String mimeType = responseData.getMimeType();
        return "text/html".equals(mimeType) || "application/xhtml+xml".equals(mimeType);
    }

    /**
     * Registers a child URL rule. Blank arguments are ignored.
     *
     * @param str the XPath expression selecting the nodes to inspect
     * @param str2 the name of the attribute holding the URL
     */
    public void addChildUrlRule(String str, String str2) {
        if (StringUtil.isNotBlank(str) && StringUtil.isNotBlank(str2)) {
            this.childUrlRuleMap.put(str, str2);
        }
    }

    /**
     * Returns the XPath helper bound to the current thread, creating it on first use.
     *
     * @return the current thread's {@link XPathAPI}
     */
    public XPathAPI getXPathAPI() {
        XPathAPI api = this.xpathAPI.get();
        if (api == null) {
            api = new XPathAPI();
            this.xpathAPI.set(api);
        }
        return api;
    }

    /**
     * Parses the response body as HTML and adds the URLs selected by the child URL rules,
     * plus the response's own child URL set, to {@code resultData}. The request's own URL
     * and its trailing-slash variant are removed from the result afterwards.
     *
     * @param responseData the response whose body is parsed
     * @param resultData the result receiving the child URLs
     * @throws CrawlerSystemException if the body cannot be read or parsed
     */
    protected void storeChildUrls(ResponseData responseData, ResultData resultData) {
        // try-with-resources closes the body stream on failures as well as on success.
        try (InputStream responseBody = responseData.getResponseBody()) {
            DOMParser domParser = getDomParser();
            domParser.parse(new InputSource(responseBody));
            Document document = domParser.getDocument();
            URL url = resolveBaseUrl(responseData.getUrl(), getBaseHref(document));
            getChildUrlRules(responseData, resultData).forEach(pair -> {
                List<RequestData> requestDataList = new ArrayList<>();
                for (String childUrl : getUrlFromTagAttribute(url, document, pair.getFirst(), pair.getSecond(),
                        responseData.getCharSet())) {
                    requestDataList.add(RequestDataBuilder.newRequestData().get().url(childUrl).build());
                }
                resultData.addAllUrl(convertChildUrlList(requestDataList));
            });
            resultData.addAllUrl(responseData.getChildUrlSet());
            RequestData requestData = responseData.getRequestData();
            resultData.removeUrl(requestData);
            resultData.removeUrl(getDuplicateUrl(requestData));
        } catch (CrawlerSystemException e) {
            throw e;
        } catch (Exception e) {
            throw new CrawlerSystemException("Could not store data.", e);
        }
    }

    /**
     * Determines the URL against which links of a page are resolved.
     *
     * <p>An absolute base href is used as is; a relative one is resolved against the page
     * URL. If the base href cannot be turned into a URL, the page URL is used.</p>
     *
     * @param pageUrl the URL of the crawled page
     * @param baseHref the value of the base tag's href, or {@code null} if there is none
     * @return the base URL for link resolution
     * @throws URISyntaxException if {@code pageUrl} is needed and is not a valid URI
     * @throws MalformedURLException if {@code pageUrl} is needed and cannot be converted to a URL
     */
    private static URL resolveBaseUrl(String pageUrl, String baseHref) throws URISyntaxException, MalformedURLException {
        if (baseHref != null) {
            try {
                URI baseUri = new URI(baseHref);
                if (!baseUri.isAbsolute()) {
                    // URI.toURL() rejects relative URIs, so anchor the base href on the page URL first.
                    baseUri = new URI(pageUrl).resolve(baseUri);
                }
                return baseUri.toURL();
            } catch (URISyntaxException | MalformedURLException | IllegalArgumentException e) {
                logger.debug("Invalid base href: {}. Using the page URL: {}", baseHref, pageUrl, e);
            }
        }
        return new URI(pageUrl).toURL();
    }

    /**
     * Supplies the (XPath expression, attribute name) pairs used to extract child URLs.
     * This implementation streams the entries of the child URL rule map and does not use
     * its arguments.
     *
     * @param responseData the response being processed
     * @param resultData the result being built
     * @return a stream of rule pairs
     */
    protected Stream<Pair<String, String>> getChildUrlRules(ResponseData responseData, ResultData resultData) {
        return this.childUrlRuleMap.entrySet().stream().map(entry -> new Pair<String, String>(entry.getKey(), entry.getValue()));
    }

    /**
     * Passes each URL in the list through the {@code urlConvertHelper} component, in place.
     * Conversion is best effort: if the helper is unavailable or fails, the list is
     * returned with whatever conversions were applied up to that point.
     *
     * @param list the request data whose URLs are converted
     * @return the same list instance
     */
    protected List<RequestData> convertChildUrlList(List<RequestData> list) {
        try {
            UrlConvertHelper urlConvertHelper = (UrlConvertHelper) this.crawlerContainer.getComponent("urlConvertHelper");
            if (urlConvertHelper == null) {
                logger.debug("urlConvertHelper is null. Child URLs are not converted.");
                return list;
            }
            for (RequestData requestData : list) {
                requestData.setUrl(urlConvertHelper.convert(requestData.getUrl()));
            }
        } catch (Exception e) {
            logger.debug("Failed to convert child URLs.", e);
        }
        return list;
    }

    /**
     * Copies the response body bytes and the response charset into {@code resultData}.
     *
     * @param responseData the response to read
     * @param resultData the result receiving data and encoding
     * @throws CrawlerSystemException if the body cannot be read
     */
    protected void storeData(ResponseData responseData, ResultData resultData) {
        try (InputStream responseBody = responseData.getResponseBody()) {
            resultData.setData(InputStreamUtil.getBytes(responseBody));
            resultData.setEncoding(responseData.getCharSet());
        } catch (CrawlerSystemException e) {
            throw e;
        } catch (Exception e) {
            throw new CrawlerSystemException("Could not store data.", e);
        }
    }

    /**
     * Sets the charset of the response.
     *
     * <p>A charset declared in the content wins. Otherwise UTF-8 is set when no default
     * encoding is configured, and the default encoding is set when the response has no
     * charset yet. A charset unknown to the JVM is finally replaced by UTF-8.</p>
     *
     * @param responseData the response to update
     * @throws CrawlerSystemException if the body cannot be read
     */
    protected void updateCharset(ResponseData responseData) {
        try (InputStream responseBody = responseData.getResponseBody()) {
            String encoding = loadCharset(responseBody);
            if (encoding != null) {
                responseData.setCharSet(encoding.trim());
            } else if (this.defaultEncoding == null) {
                responseData.setCharSet(Constants.UTF_8);
            } else if (responseData.getCharSet() == null) {
                responseData.setCharSet(this.defaultEncoding);
            }
            if (!isSupportedCharset(responseData.getCharSet())) {
                responseData.setCharSet(Constants.UTF_8);
            }
        } catch (CrawlerSystemException e) {
            throw e;
        } catch (Exception e) {
            throw new CrawlerSystemException("Could not load response data: " + responseData.getUrl(), e);
        }
    }

    /**
     * Tells whether the JVM knows the given charset name.
     *
     * @param str the charset name, may be {@code null}
     * @return {@code true} if {@link Charset#forName(String)} accepts the name
     */
    protected boolean isSupportedCharset(String str) {
        if (str == null) {
            return false;
        }
        try {
            Charset.forName(str);
            return true;
        } catch (Exception e) {
            return false;
        }
    }

    /**
     * Looks for a charset declaration in the first {@code preloadSizeForCharset} bytes of
     * the stream and returns it after normalization.
     *
     * @param inputStream the content stream; it is read but not closed here
     * @return the normalized charset name, or the normalization of {@code null} if none is declared
     * @throws CrawlingAccessException if the stream cannot be read
     */
    protected String loadCharset(InputStream inputStream) {
        String encoding = null;
        try {
            BufferedInputStream in = new BufferedInputStream(inputStream);
            byte[] buffer = new byte[this.preloadSizeForCharset];
            // readNBytes keeps reading until the buffer is full or the stream ends,
            // so a short first read cannot hide a declaration that lies within the preload size.
            int size = in.readNBytes(buffer, 0, buffer.length);
            if (size > 0) {
                // The declaration consists of ASCII characters only; ISO-8859-1 maps every byte
                // to one char, which keeps the result independent of the platform default charset.
                encoding = parseCharset(new String(buffer, 0, size, StandardCharsets.ISO_8859_1));
            }
            return normalizeEncoding(encoding);
        } catch (IOException e) {
            throw new CrawlingAccessException("Could not load a content.", e);
        }
    }

    /**
     * Normalizes an encoding name with the {@code encodingHelper} component. If the helper
     * is unavailable or fails, the name is returned unchanged.
     *
     * @param str the encoding name, may be {@code null}
     * @return the normalized name, or {@code str} when normalization is not possible
     */
    protected String normalizeEncoding(String str) {
        try {
            EncodingHelper encodingHelper = (EncodingHelper) this.crawlerContainer.getComponent("encodingHelper");
            if (encodingHelper == null) {
                return str;
            }
            return encodingHelper.normalize(str);
        } catch (Exception e) {
            logger.debug("Failed to normalize encoding: {}", str, e);
            return str;
        }
    }

    /**
     * Extracts the charset name from the first {@code ; charset=...} occurrence in the text.
     *
     * @param str the text to search
     * @return the charset name, or {@code null} if no declaration is found
     */
    protected String parseCharset(String str) {
        Matcher matcher = CHARSET_PATTERN.matcher(str);
        return matcher.find() ? matcher.group(1) : null;
    }

    /**
     * Toggles the trailing slash of the request URL: removes it if present, appends it otherwise.
     *
     * <p>NOTE(review): this modifies and returns the passed instance rather than a copy, so the
     * caller's request data carries the toggled URL afterwards. Confirm whether callers rely on it.</p>
     *
     * @param requestData the request data to modify
     * @return the same instance with the toggled URL
     */
    protected RequestData getDuplicateUrl(RequestData requestData) {
        String url = requestData.getUrl();
        if (url.endsWith("/")) {
            requestData.setUrl(url.substring(0, url.length() - 1));
        } else {
            requestData.setUrl(url + "/");
        }
        return requestData;
    }

    /**
     * Creates a DOM parser configured with the registered features and properties.
     * A feature is enabled only when its value equals {@code "true"} ignoring case.
     *
     * @return a new configured parser
     * @throws CrawlerSystemException if a feature or property is rejected by the parser
     */
    public DOMParser getDomParser() {
        DOMParser parser = new DOMParser();
        try {
            for (Map.Entry<String, String> feature : this.featureMap.entrySet()) {
                parser.setFeature(feature.getKey(), Boolean.parseBoolean(feature.getValue()));
            }
            for (Map.Entry<String, String> property : this.propertyMap.entrySet()) {
                parser.setProperty(property.getKey(), property.getValue());
            }
            return parser;
        } catch (Exception e) {
            throw new CrawlerSystemException("Invalid parser configuration.", e);
        }
    }

    /**
     * Reads the {@code href} attribute of the first node selected by {@code //BASE}.
     * A value starting with {@code www.} is prefixed with {@code http://}.
     *
     * @param document the parsed document
     * @return the base href, or {@code null} if it is absent, blank or cannot be read
     */
    protected String getBaseHref(Document document) {
        try {
            XPathNodes baseNodes = getXPathAPI().selectNodeList(document, "//BASE");
            if (baseNodes.size() <= 0) {
                return null;
            }
            Node hrefAttribute = baseNodes.get(0).getAttributes().getNamedItem("href");
            if (hrefAttribute == null) {
                return null;
            }
            String href = hrefAttribute.getNodeValue();
            if (!StringUtil.isNotBlank(href)) {
                return null;
            }
            return href.startsWith("www.") ? "http://" + href : href;
        } catch (Exception e) {
            logger.warn("Could not get a base tag. ", e);
            return null;
        }
    }

    /**
     * Collects URLs from the given attribute of every node selected by an XPath expression.
     * Values rejected by {@link #isValidPath(String)} are skipped. An XPath failure is
     * logged and the URLs gathered so far are returned.
     *
     * @param url the base URL used to resolve relative values
     * @param document the parsed document
     * @param str the XPath expression selecting the nodes
     * @param str2 the attribute name holding the URL
     * @param str3 the encoding used to escape non-URL characters
     * @return the resolved, normalized and encoded URLs
     */
    protected List<String> getUrlFromTagAttribute(URL url, Document document, String str, String str2, String str3) {
        logger.debug("Base URL: {}", url);
        List<String> urlList = new ArrayList<>();
        try {
            XPathNodes nodes = getXPathAPI().selectNodeList(document, str);
            for (int i = 0; i < nodes.size(); i++) {
                // getAttributes() returns null for nodes that are not elements.
                org.w3c.dom.NamedNodeMap attributes = nodes.get(i).getAttributes();
                if (attributes == null) {
                    continue;
                }
                Node attribute = attributes.getNamedItem(str2);
                if (attribute == null) {
                    continue;
                }
                String value = attribute.getNodeValue();
                if (isValidPath(value)) {
                    addChildUrlFromTagAttribute(urlList, url, value, str3);
                }
            }
        } catch (XPathException e) {
            logger.warn("Could not get urls: ({}, {})", str, str2, e);
        }
        return urlList;
    }

    /**
     * Resolves an attribute value against the base URL, normalizes and encodes it, and adds
     * it to the list unless the result is blank. Malformed URLs are logged and skipped.
     *
     * <p>NOTE(review): a value starting with {@code ?} is appended to the full external form
     * of the base URL, including any query string the base already has. Confirm this is intended.</p>
     *
     * @param list the list receiving the URL
     * @param url the base URL
     * @param str the raw attribute value
     * @param str2 the encoding used to escape non-URL characters
     */
    protected void addChildUrlFromTagAttribute(List<String> list, URL url, String str, String str2) {
        try {
            String value = str.trim();
            URL childUrl = value.startsWith("?") ? new URL(url.toExternalForm() + value) : new URL(url, value);
            String encodedUrl = encodeUrl(normalizeUrl(childUrl.toExternalForm()), str2);
            logger.debug("{} -> {}", str, encodedUrl);
            if (StringUtil.isNotBlank(encodedUrl)) {
                logger.debug("Add Child: {}", encodedUrl);
                list.add(encodedUrl);
            } else {
                logger.debug("Skip Child: {}", encodedUrl);
            }
        } catch (MalformedURLException e) {
            logger.warn("Malformed URL: {}", str, e);
        }
    }

    /**
     * Percent-encodes every character of the URL that {@link CharUtil#isUrlChar(char)} does
     * not accept, using the given encoding.
     *
     * <p>The URL is processed by code point so that a surrogate pair is encoded as one
     * character. If the encoding name is not supported, UTF-8 is used instead.</p>
     *
     * @param str the URL to encode
     * @param str2 the name of the encoding used for escaping
     * @return the encoded URL, or {@code str} unchanged if either argument is blank
     */
    protected String encodeUrl(String str, String str2) {
        if (StringUtil.isBlank(str) || StringUtil.isBlank(str2)) {
            return str;
        }
        Charset charset;
        try {
            charset = Charset.forName(str2);
        } catch (IllegalArgumentException e) {
            // Covers both illegal and unsupported charset names.
            logger.warn("Invalid charsetName: {}. Changed to UTF-8", str2, e);
            charset = StandardCharsets.UTF_8;
        }
        StringBuilder buf = new StringBuilder(str.length() + 100);
        int index = 0;
        while (index < str.length()) {
            int codePoint = str.codePointAt(index);
            int charCount = Character.charCount(codePoint);
            if (charCount == 1 && CharUtil.isUrlChar((char) codePoint)) {
                buf.append((char) codePoint);
            } else {
                buf.append(URLEncoder.encode(str.substring(index, index + charCount), charset));
            }
            index += charCount;
        }
        return buf.toString();
    }

    /**
     * Normalizes a URL string: trims it, drops the fragment, removes {@code /./} and
     * {@code ;jsessionid=...}, escapes spaces as {@code %20}, collapses {@code segment/../}
     * pairs and reduces runs of slashes that do not follow a colon to a single slash.
     *
     * <p>The host is never consumed as a parent segment: {@code http://host/../a} keeps its
     * host and its unresolved {@code /../}.</p>
     *
     * @param str the URL to normalize, may be {@code null}
     * @return the normalized URL, or {@code null} if {@code str} is {@code null}
     */
    protected String normalizeUrl(String str) {
        if (str == null) {
            return null;
        }
        String url = str.trim();
        int fragmentPos = url.indexOf('#');
        if (fragmentPos >= 0) {
            url = url.substring(0, fragmentPos);
        }
        url = url.replace("/./", "/");
        if (url.contains(";jsessionid")) {
            url = JSESSIONID_PATTERN.matcher(url).replaceFirst("");
        }
        if (url.indexOf(' ') >= 0) {
            url = url.replace(" ", "%20");
        }
        // Remove one "segment/../" per pass; stop as soon as a pass changes nothing.
        String previous = null;
        while (url.contains("/../") && !url.equals(previous)) {
            previous = url;
            url = PARENT_SEGMENT_PATTERN.matcher(url).replaceFirst("/");
        }
        return DUPLICATE_SLASH_PATTERN.matcher(url).replaceAll("$1/");
    }

    /**
     * Tells whether an attribute value should be treated as a crawlable URL.
     *
     * @param str the attribute value
     * @return {@code false} if the value is blank or matches the invalid URL pattern
     */
    protected boolean isValidPath(String str) {
        return !StringUtil.isBlank(str) && !this.invalidUrlPattern.matcher(str).find();
    }

    /**
     * Registers a parser feature.
     *
     * @param str the feature name
     * @param str2 the feature value
     * @throws CrawlerSystemException if either argument is blank
     */
    public void addFeature(String str, String str2) {
        if (StringUtil.isBlank(str) || StringUtil.isBlank(str2)) {
            throw new CrawlerSystemException("key or value is null.");
        }
        this.featureMap.put(str, str2);
    }

    /**
     * Registers a parser property.
     *
     * @param str the property name
     * @param str2 the property value
     * @throws CrawlerSystemException if either argument is blank
     */
    public void addProperty(String str, String str2) {
        if (StringUtil.isBlank(str) || StringUtil.isBlank(str2)) {
            throw new CrawlerSystemException("key or value is null.");
        }
        this.propertyMap.put(str, str2);
    }

    /**
     * Decodes the stored bytes of an access result into a string, using the stored
     * encoding, UTF-8 when none is stored, and UTF-8 when the stored one is unsupported.
     *
     * @param accessResultData the stored result
     * @return the decoded content, or {@code null} if no data is stored
     * @throws CrawlerSystemException if the result was produced by a different transformer
     */
    @Override
    public Object getData(AccessResultData<?> accessResultData) {
        if (!getName().equals(accessResultData.getTransformerName())) {
            throw new CrawlerSystemException("Transformer is invalid. Use " + accessResultData.getTransformerName() + ". This transformer is " + getName() + ".");
        }
        byte[] data = accessResultData.getData();
        if (data == null) {
            return null;
        }
        String encoding = accessResultData.getEncoding();
        try {
            return new String(data, encoding == null ? Constants.UTF_8 : encoding);
        } catch (UnsupportedEncodingException e) {
            logger.info("Invalid charsetName: {}. Changed to UTF-8", encoding, e);
            return new String(data, Constants.UTF_8_CHARSET);
        }
    }

    /** @return the parser feature map (the live instance, not a copy) */
    public Map<String, String> getFeatureMap() {
        return this.featureMap;
    }

    /** @param map the parser feature map to use */
    public void setFeatureMap(Map<String, String> map) {
        this.featureMap = map;
    }

    /** @return the parser property map (the live instance, not a copy) */
    public Map<String, String> getPropertyMap() {
        return this.propertyMap;
    }

    /** @param map the parser property map to use */
    public void setPropertyMap(Map<String, String> map) {
        this.propertyMap = map;
    }

    /** @return the child URL rule map (the live instance, not a copy) */
    public Map<String, String> getChildUrlRuleMap() {
        return this.childUrlRuleMap;
    }

    /** @param map the child URL rule map to use */
    public void setChildUrlRuleMap(Map<String, String> map) {
        this.childUrlRuleMap = map;
    }

    /** @return the default encoding, may be {@code null} */
    public String getDefaultEncoding() {
        return this.defaultEncoding;
    }

    /** @param str the default encoding */
    public void setDefaultEncoding(String str) {
        this.defaultEncoding = str;
    }

    /** @return the number of leading bytes inspected for a charset declaration */
    public int getPreloadSizeForCharset() {
        return this.preloadSizeForCharset;
    }

    /** @param i the number of leading bytes to inspect for a charset declaration */
    public void setPreloadSizeForCharset(int i) {
        this.preloadSizeForCharset = i;
    }

    /** @return the pattern identifying attribute values that are not crawlable URLs */
    public Pattern getInvalidUrlPattern() {
        return this.invalidUrlPattern;
    }

    /** @param pattern the pattern identifying attribute values that are not crawlable URLs */
    public void setInvalidUrlPattern(Pattern pattern) {
        this.invalidUrlPattern = pattern;
    }
}
