package org.codelibs.fess.crawler.extractor.impl;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.ByteOrderMark;
import org.apache.commons.io.input.BOMInputStream;
import org.apache.commons.text.translate.AggregateTranslator;
import org.apache.commons.text.translate.CharSequenceTranslator;
import org.apache.commons.text.translate.EntityArrays;
import org.apache.commons.text.translate.LookupTranslator;
import org.apache.commons.text.translate.NumericEntityUnescaper;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.codelibs.core.io.InputStreamUtil;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.fess.crawler.Constants;
import org.codelibs.fess.crawler.entity.ExtractData;
import org.codelibs.fess.crawler.exception.CrawlerSystemException;
import org.codelibs.fess.crawler.exception.ExtractException;

/* loaded from: input_file:org/codelibs/fess/crawler/extractor/impl/AbstractXmlExtractor.class */
/**
 * Base class for extractors that pull plain text out of XML-like markup.
 *
 * <p>The input is decoded using an encoding sniffed from the head of the stream
 * (byte order mark first, then a declaration matched by {@link #getEncodingPattern()}),
 * HTML entities are unescaped, and every tag matched by {@link #getTagPattern()} is
 * replaced by the values of its double-quoted attributes.</p>
 */
public abstract class AbstractXmlExtractor extends AbstractExtractor {
    protected static final Logger logger = LogManager.getLogger(AbstractXmlExtractor.class);

    /** Byte order mark registered under the name "UTF-7": the ASCII bytes {@code +/v}. */
    protected static final ByteOrderMark BOM_UTF_7 = new ByteOrderMark("UTF-7", new int[]{43, 47, 118});

    /**
     * Unescapes basic, ISO-8859-1 and HTML 4.0 extended named entities as well as
     * numeric entities.
     */
    protected static final CharSequenceTranslator UNESCAPE_HTML4 = new AggregateTranslator(new CharSequenceTranslator[]{new LookupTranslator(EntityArrays.BASIC_UNESCAPE), new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE), new LookupTranslator(EntityArrays.HTML40_EXTENDED_UNESCAPE), new NumericEntityUnescaper(new NumericEntityUnescaper.OPTION[0])});

    /**
     * Matches one whitespace-preceded {@code name="value"} attribute inside a tag;
     * group 1 is the value. Compiled once instead of on every extractString call.
     */
    private static final Pattern ATTRIBUTE_VALUE_PATTERN = Pattern.compile("\\s[^ ]+=\"([^\"]*)\"");

    /** Encoding used when none can be detected from the content. */
    protected String encoding = Constants.UTF_8;

    /** Number of leading bytes inspected when sniffing the encoding. */
    protected int preloadSizeForCharset = 2048;

    /** When true, comments are dropped entirely; otherwise only the comment delimiters are removed. */
    protected boolean ignoreCommentTag = false;

    /**
     * Returns the pattern used to find an encoding declaration in the head of the content.
     *
     * @return a pattern whose group 1 captures the declared charset name
     */
    protected abstract Pattern getEncodingPattern();

    /**
     * Returns the pattern that matches a tag to be replaced by its attribute values.
     *
     * @return the tag pattern
     */
    protected abstract Pattern getTagPattern();

    /**
     * Reads the whole stream, decodes it with the detected encoding, unescapes HTML
     * entities and extracts the text.
     *
     * @param inputStream the content to extract from; must not be null
     * @param map extraction parameters (not used by this implementation)
     * @return the extracted data
     * @throws CrawlerSystemException if {@code inputStream} is null
     * @throws ExtractException if reading, decoding or extraction fails
     */
    @Override
    public ExtractData getText(InputStream inputStream, Map<String, String> map) {
        if (inputStream == null) {
            throw new CrawlerSystemException("The inputstream is null.");
        }
        try {
            BufferedInputStream bufferedInputStream = new BufferedInputStream(inputStream);
            // The encoding MUST be resolved before the bytes are read: getEncoding()
            // marks/resets the stream, whereas getBytes() consumes it. Doing both inside
            // one argument list would read the bytes first (left-to-right evaluation)
            // and leave nothing for the sniffing step.
            String detectedEncoding = getEncoding(bufferedInputStream);
            byte[] bytes = InputStreamUtil.getBytes(bufferedInputStream);
            String content = UNESCAPE_HTML4.translate(new String(bytes, detectedEncoding));
            return createExtractData(content);
        } catch (Exception e) {
            throw new ExtractException(e);
        }
    }

    /**
     * Wraps the text extracted from the given content in an {@link ExtractData}.
     *
     * @param str the decoded, entity-unescaped content
     * @return extract data holding the result of {@link #extractString(String)}
     */
    protected ExtractData createExtractData(String str) {
        return new ExtractData(extractString(str));
    }

    /**
     * Detects the encoding from the first {@link #getPreloadSizeForCharset()} bytes of
     * the stream and rewinds the stream to where it was on entry.
     *
     * <p>A byte order mark takes precedence. Otherwise the head is decoded with the
     * default encoding and searched with {@link #getEncodingPattern()}; the declared
     * name is used only if the JVM supports it. Any failure while sniffing (I/O error,
     * malformed charset name, ...) is logged and the default encoding is returned.</p>
     *
     * @param bufferedInputStream the stream to inspect; it is marked and then reset
     * @return the detected charset name, or the default encoding
     * @throws ExtractException if the stream cannot be reset
     */
    public String getEncoding(BufferedInputStream bufferedInputStream) {
        byte[] head = new byte[this.preloadSizeForCharset];
        try {
            bufferedInputStream.mark(this.preloadSizeForCharset);
            // Intentionally not closed: closing the wrapper would close the caller's stream.
            BOMInputStream bomIn = new BOMInputStream(bufferedInputStream, false, new ByteOrderMark[]{ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE, ByteOrderMark.UTF_32LE, BOM_UTF_7});
            if (bomIn.hasBOM()) {
                String bomCharsetName = bomIn.getBOMCharsetName();
                logger.debug("BOM: {}", bomCharsetName);
                return bomCharsetName;
            }

            int size = bomIn.read(head);
            if (size == -1) {
                // Empty stream: nothing to sniff.
                return this.encoding;
            }

            String headText = new String(head, 0, size, this.encoding);
            if (StringUtil.isBlank(headText)) {
                return this.encoding;
            }

            Matcher matcher = getEncodingPattern().matcher(headText);
            if (matcher.find()) {
                String declared = matcher.group(1);
                // isSupported() throws on syntactically illegal names; the catch below
                // turns that into a fallback to the default encoding.
                if (Charset.isSupported(declared)) {
                    return declared;
                }
            }
        } catch (Exception e) {
            logger.info("Use a default encoding: {}", this.encoding, e);
        } finally {
            // Single rewind point for every exit path, so the caller can read from the start.
            try {
                bufferedInputStream.reset();
            } catch (IOException e) {
                throw new ExtractException(e);
            }
        }
        return this.encoding;
    }

    /**
     * Strips markup from the content.
     *
     * <p>Line breaks become spaces, comments are handled according to
     * {@link #isIgnoreCommentTag()}, each tag matched by {@link #getTagPattern()} is
     * replaced by its double-quoted attribute values (each followed by a space), and
     * finally runs of whitespace are collapsed and the result is trimmed.</p>
     *
     * @param str the content to strip; must not be null
     * @return the extracted text
     */
    public String extractString(String str) {
        String singleLine = str.replaceAll("[\\r\\n]", " ");
        String withoutComments;
        if (this.ignoreCommentTag) {
            // NOTE(review): [^>]+ does not match comments that are empty or contain '>';
            // those are left in place. Kept as-is to preserve existing behaviour.
            withoutComments = singleLine.replaceAll("<!--[^>]+-->", "");
        } else {
            withoutComments = singleLine.replace("<!--", "").replace("-->", "");
        }

        Matcher tagMatcher = getTagPattern().matcher(withoutComments);
        StringBuffer result = new StringBuffer();
        while (tagMatcher.find()) {
            Matcher attributeMatcher = ATTRIBUTE_VALUE_PATTERN.matcher(tagMatcher.group());
            StringBuilder values = new StringBuilder(100);
            while (attributeMatcher.find()) {
                values.append(attributeMatcher.group(1)).append(' ');
            }
            // Attribute values are literal text: escape '\' and '$' so appendReplacement
            // does not interpret them as group references.
            tagMatcher.appendReplacement(result, Matcher.quoteReplacement(values.toString()));
        }
        tagMatcher.appendTail(result);
        return result.toString().replaceAll("\\s+", " ").trim();
    }

    /**
     * @return the encoding used when none can be detected
     */
    public String getEncoding() {
        return this.encoding;
    }

    /**
     * @param str the encoding to use when none can be detected
     */
    public void setEncoding(String str) {
        this.encoding = str;
    }

    /**
     * @return the number of leading bytes inspected when sniffing the encoding
     */
    public int getPreloadSizeForCharset() {
        return this.preloadSizeForCharset;
    }

    /**
     * @param i the number of leading bytes to inspect when sniffing the encoding
     */
    public void setPreloadSizeForCharset(int i) {
        this.preloadSizeForCharset = i;
    }

    /**
     * @return true if comments are dropped entirely during extraction
     */
    public boolean isIgnoreCommentTag() {
        return this.ignoreCommentTag;
    }

    /**
     * @param z true to drop comments entirely; false to remove only the comment delimiters
     */
    public void setIgnoreCommentTag(boolean z) {
        this.ignoreCommentTag = z;
    }
}
