package org.codelibs.fess.crawler.util;

import java.io.IOException;
import java.io.Reader;
import java.util.HashSet;
import java.util.Set;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

/* loaded from: input_file:org/codelibs/fess/crawler/util/TextUtil.class */
public final class TextUtil {
    private static final Logger logger = LogManager.getLogger(TextUtil.class);

    /**
     * Fluent, configurable normalizer that reads characters from a {@link Reader}
     * and produces a cleaned-up string.
     *
     * <p>Normalization performed by {@link #execute()}:</p>
     * <ul>
     * <li>ISO control characters and the configured "space" code points are
     * collapsed into a single ASCII space;</li>
     * <li>runs of ASCII letters/digits can be capped at
     * {@link #maxAlphanumTermSize(int)} characters;</li>
     * <li>runs of ASCII punctuation can be capped at
     * {@link #maxSymbolTermSize(int)} characters;</li>
     * <li>optionally, a run (alphanumeric or symbol) that has already been seen
     * earlier in the same text is dropped
     * ({@link #duplicateTermRemoved(boolean)}).</li>
     * </ul>
     *
     * <p>All other characters are copied through unchanged.</p>
     */
    public static class TextNormalizeContext {
        private final Reader reader;
        private int initialCapacity = 10000;
        /** Maximum kept length of an alphanumeric run; negative means unlimited. */
        private int maxAlphanumTermSize = -1;
        /** Maximum kept length of a symbol run; negative means unlimited. */
        private int maxSymbolTermSize = -1;
        private boolean duplicateTermRemoved = false;
        /** Space, no-break space (U+00A0), ideographic space (U+3000), replacement character (U+FFFD). */
        private int[] spaceChars = { ' ', '\u00a0', '\u3000', '\ufffd' };

        /**
         * Creates a context reading from the given reader.
         *
         * @param reader the character source; may be {@code null}, in which case
         *               {@link #execute()} returns an empty string
         */
        public TextNormalizeContext(final Reader reader) {
            this.reader = reader;
        }

        /**
         * Reads the reader until end of stream and returns the normalized text.
         *
         * <p>The reader is not closed by this method. If reading fails with an
         * {@link IOException}, the failure is logged at debug level and an empty
         * string is returned; text read before the failure is discarded.</p>
         *
         * @return the normalized, trimmed text; never {@code null}
         */
        public String execute() {
            if (reader == null) {
                return "";
            }

            // A negative capacity would make the StringBuilder constructor throw.
            final StringBuilder buf = new StringBuilder(Math.max(0, initialCapacity));
            // Terms (alphanumeric or symbol runs) already emitted; used for duplicate removal.
            final Set<String> termCache = new HashSet<>(1000);
            boolean lastWasSpace = false;
            // Number of characters of the current run that were actually appended to buf.
            int alphanumSize = 0;
            int symbolSize = 0;

            try {
                int c;
                while ((c = reader.read()) != -1) {
                    if (Character.isISOControl(c) || isSpaceChar(c)) {
                        // Separator: finish the pending term and emit at most one space.
                        if (duplicateTermRemoved) {
                            lastWasSpace = removePendingDuplicate(buf, alphanumSize, symbolSize, lastWasSpace, termCache);
                        }
                        if (!lastWasSpace && !isLastSpaceChar(buf)) {
                            buf.append(' ');
                            lastWasSpace = true;
                        }
                        alphanumSize = 0;
                        symbolSize = 0;
                    } else if (isAsciiAlphanum(c)) {
                        // A symbol run ends where an alphanumeric run begins.
                        if (duplicateTermRemoved && symbolSize > 0) {
                            removeLastDuplication(buf, symbolSize, lastWasSpace, termCache);
                        }
                        if (maxAlphanumTermSize < 0 || alphanumSize < maxAlphanumTermSize) {
                            buf.appendCodePoint(c);
                            alphanumSize++;
                        }
                        lastWasSpace = false;
                        symbolSize = 0;
                    } else if (isAsciiSymbol(c)) {
                        // An alphanumeric run ends where a symbol run begins.
                        if (duplicateTermRemoved && alphanumSize > 0) {
                            removeLastDuplication(buf, alphanumSize, lastWasSpace, termCache);
                        }
                        if (maxSymbolTermSize < 0 || symbolSize < maxSymbolTermSize) {
                            buf.appendCodePoint(c);
                            symbolSize++;
                        }
                        lastWasSpace = false;
                        alphanumSize = 0;
                    } else {
                        // Any other character ends the pending term and is copied as-is.
                        if (duplicateTermRemoved) {
                            removePendingDuplicate(buf, alphanumSize, symbolSize, lastWasSpace, termCache);
                        }
                        buf.appendCodePoint(c);
                        lastWasSpace = false;
                        alphanumSize = 0;
                        symbolSize = 0;
                    }
                }
            } catch (final IOException e) {
                if (logger.isDebugEnabled()) {
                    logger.debug("Failed to read data.", e);
                }
                return "";
            }

            // The text may end in the middle of a term that still needs the duplicate check.
            if (duplicateTermRemoved) {
                removePendingDuplicate(buf, alphanumSize, symbolSize, lastWasSpace, termCache);
            }
            return buf.toString().trim();
        }

        /**
         * Applies duplicate removal to whichever run is currently pending at the
         * end of the buffer (alphanumeric first, otherwise symbol).
         *
         * @return the updated "last appended character is a space" flag; the
         *         incoming flag when no run is pending
         */
        private static boolean removePendingDuplicate(final StringBuilder buf, final int alphanumSize, final int symbolSize,
                final boolean lastWasSpace, final Set<String> termCache) {
            if (alphanumSize > 0) {
                return TextUtil.removeLastDuplication(buf, alphanumSize, lastWasSpace, termCache);
            }
            if (symbolSize > 0) {
                return TextUtil.removeLastDuplication(buf, symbolSize, lastWasSpace, termCache);
            }
            return lastWasSpace;
        }

        /** Returns {@code true} for ASCII digits and ASCII upper/lower-case letters. */
        private static boolean isAsciiAlphanum(final int c) {
            return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
        }

        /** Returns {@code true} for printable ASCII characters that are neither space nor alphanumeric. */
        private static boolean isAsciiSymbol(final int c) {
            return (c >= '!' && c <= '/') || (c >= ':' && c <= '@') || (c >= '[' && c <= '`') || (c >= '{' && c <= '~');
        }

        /** Returns {@code true} when the code point is one of the configured space characters. */
        private boolean isSpaceChar(final int c) {
            for (final int spaceChar : spaceChars) {
                if (c == spaceChar) {
                    return true;
                }
            }
            return false;
        }

        /**
         * Sets the initial capacity of the output buffer.
         *
         * @param initialCapacity the capacity hint; negative values are treated as 0 by {@link #execute()}
         * @return this context
         */
        public TextNormalizeContext initialCapacity(final int initialCapacity) {
            this.initialCapacity = initialCapacity;
            return this;
        }

        /**
         * Sets the maximum number of characters kept from one run of ASCII letters/digits.
         *
         * @param maxAlphanumTermSize the limit; a negative value disables the limit
         * @return this context
         */
        public TextNormalizeContext maxAlphanumTermSize(final int maxAlphanumTermSize) {
            this.maxAlphanumTermSize = maxAlphanumTermSize;
            return this;
        }

        /**
         * Sets the maximum number of characters kept from one run of ASCII symbols.
         *
         * @param maxSymbolTermSize the limit; a negative value disables the limit
         * @return this context
         */
        public TextNormalizeContext maxSymbolTermSize(final int maxSymbolTermSize) {
            this.maxSymbolTermSize = maxSymbolTermSize;
            return this;
        }

        /**
         * Enables or disables removal of terms that already occurred earlier in the text.
         *
         * @param duplicateTermRemoved {@code true} to drop repeated terms
         * @return this context
         */
        public TextNormalizeContext duplicateTermRemoved(final boolean duplicateTermRemoved) {
            this.duplicateTermRemoved = duplicateTermRemoved;
            return this;
        }

        /**
         * Sets the code points that are treated as whitespace in addition to ISO
         * control characters.
         *
         * <p>The array is copied, so later changes to the caller's array do not
         * affect this context. {@code null} is treated as an empty array.</p>
         *
         * @param spaceChars the code points to treat as spaces; may be {@code null}
         * @return this context
         */
        public TextNormalizeContext spaceChars(final int[] spaceChars) {
            this.spaceChars = spaceChars == null ? new int[0] : spaceChars.clone();
            return this;
        }
    }

    private TextUtil() {
        // utility class; not instantiable
    }

    /**
     * Starts a normalization of the text supplied by the given reader.
     *
     * @param reader the character source; may be {@code null}
     * @return a new context that can be configured and then run with
     *         {@link TextNormalizeContext#execute()}
     */
    public static TextNormalizeContext normalizeText(final Reader reader) {
        return new TextNormalizeContext(reader);
    }

    /** Returns {@code true} when the buffer is non-empty and ends with an ASCII space. */
    private static boolean isLastSpaceChar(final StringBuilder sb) {
        final int length = sb.length();
        return length != 0 && sb.charAt(length - 1) == ' ';
    }

    /**
     * Checks the last {@code termSize} characters of the buffer against the set
     * of terms seen so far. A new term is recorded and left in place; a repeated
     * term is cut from the buffer and, unless the buffer already ends with a
     * space, replaced by a single space.
     *
     * @param sb           the output buffer
     * @param termSize     number of trailing characters forming the term
     * @param lastWasSpace whether the caller considers the last appended character a space
     * @param termCache    the terms seen so far; updated with new terms
     * @return {@code true} if a space was appended here, otherwise {@code lastWasSpace}
     */
    private static boolean removeLastDuplication(final StringBuilder sb, final int termSize, final boolean lastWasSpace,
            final Set<String> termCache) {
        if (termSize <= 0) {
            // Nothing pending; also avoids growing the buffer through setLength.
            return lastWasSpace;
        }
        final String term = rightString(sb, termSize);
        // Set.add returns true when the term was not present yet, i.e. it is not a duplicate.
        if (termCache.add(term)) {
            return lastWasSpace;
        }
        // Clamp like rightString does so an oversized termSize cannot yield a negative length.
        sb.setLength(Math.max(0, sb.length() - termSize));
        if (lastWasSpace || isLastSpaceChar(sb)) {
            return lastWasSpace;
        }
        sb.append(' ');
        return true;
    }

    /**
     * Returns the last {@code size} characters of the buffer: an empty string
     * for a non-positive size, the whole content when the size is at least the
     * buffer length.
     */
    private static String rightString(final StringBuilder sb, final int size) {
        if (size <= 0) {
            return "";
        }
        final int length = sb.length();
        if (size >= length) {
            return sb.toString();
        }
        return sb.substring(length - size, length);
    }
}
