/*
 * Decompiled with CFR 0.152.
 */
package de.dfki.lt.tools.tokenizer;

import de.dfki.lt.tools.tokenizer.Description;
import de.dfki.lt.tools.tokenizer.FileTools;
import de.dfki.lt.tools.tokenizer.LanguageResource;
import de.dfki.lt.tools.tokenizer.annotate.AnnotatedString;
import de.dfki.lt.tools.tokenizer.annotate.FastAnnotatedString;
import de.dfki.lt.tools.tokenizer.exceptions.LanguageNotSupportedException;
import de.dfki.lt.tools.tokenizer.exceptions.ProcessingException;
import de.dfki.lt.tools.tokenizer.output.Paragraph;
import de.dfki.lt.tools.tokenizer.output.ParagraphOutputter;
import de.dfki.lt.tools.tokenizer.regexp.Match;
import de.dfki.lt.tools.tokenizer.regexp.RegExp;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.StringTokenizer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Tokenizer that annotates an input text with token classes ({@link #CLASS_ANNO})
 * and with text-unit / paragraph borders ({@link #BORDER_ANNO}), driven by
 * per-language {@link LanguageResource}s.
 *
 * <p>The language resources are loaded once at construction time from a
 * configuration that lists the supported languages under the {@code languages}
 * key (comma separated) and maps each language to its resource directory.
 */
public class JTok {
    private static final Logger LOG = LoggerFactory.getLogger(JTok.class);

    /** Configuration key holding the comma separated list of languages. */
    private static final String LANGUAGES_PROP = "languages";

    /** Annotation key under which the token class is stored. */
    public static final String CLASS_ANNO = "class";

    /** Annotation key under which border markers are stored. */
    public static final String BORDER_ANNO = "border";

    /** Border value marking the first character of a text unit. */
    public static final String TU_BORDER = "tu";

    /** Border value marking the first character after a paragraph change. */
    public static final String P_BORDER = "p";

    /**
     * Sentinel (U+FFFF) that the cursor methods of the annotated string return
     * when the end of the text is reached; same value as
     * {@code java.text.CharacterIterator.DONE}.
     */
    private static final char DONE = '\uffff';

    /** Loaded language resources, keyed by language code. */
    private final Map<String, LanguageResource> langResources = new HashMap<String, LanguageResource>();

    /**
     * Creates a tokenizer configured from the resource file {@code jtok/jtok.cfg}.
     *
     * @throws IOException if the configuration cannot be read
     */
    public JTok() throws IOException {
        Properties props = new Properties();
        // Properties.load leaves the stream open, so it has to be closed here.
        InputStream in = FileTools.openResourceFileAsStream("jtok/jtok.cfg");
        try {
            props.load(in);
        } finally {
            if (in != null) {
                in.close();
            }
        }
        this.init(props);
    }

    /**
     * Creates a tokenizer configured from the given properties.
     *
     * @param configProps configuration with a {@code languages} entry and one
     *        entry per language pointing to its resource directory
     */
    public JTok(Properties configProps) {
        this.init(configProps);
    }

    /**
     * Loads one {@link LanguageResource} per language listed in the configuration.
     * A missing {@code languages} entry is reported and leaves the tokenizer
     * without any supported language.
     */
    private void init(Properties configProps) {
        String languages = configProps.getProperty(LANGUAGES_PROP);
        if (languages == null) {
            LOG.warn("no '{}' property configured; no language resources loaded", LANGUAGES_PROP);
            return;
        }
        StringTokenizer st = new StringTokenizer(languages, ",");
        while (st.hasMoreTokens()) {
            // tolerate blanks around the separators, e.g. "de, en"
            String oneLanguage = st.nextToken().trim();
            if (oneLanguage.length() == 0) {
                continue;
            }
            String langDir = configProps.getProperty(oneLanguage);
            LOG.info("loading language resources for {} from {}", oneLanguage, langDir);
            this.langResources.put(oneLanguage, new LanguageResource(oneLanguage, langDir));
        }
    }

    /**
     * Returns the language resource loaded for the given language.
     *
     * @param aLanguage the language code as used in the configuration
     * @return the matching language resource, never {@code null}
     * @throws LanguageNotSupportedException if no resource was loaded for the language
     */
    public LanguageResource getLanguageResource(String aLanguage) throws LanguageNotSupportedException {
        LanguageResource langRes = this.langResources.get(aLanguage);
        if (langRes == null) {
            throw new LanguageNotSupportedException("language " + aLanguage + " not supported");
        }
        return langRes;
    }

    /**
     * Tokenizes the given text. The passes run in a fixed order, each refining
     * the annotations of the previous one: whitespace tokens, punctuation,
     * clitics, numbers, abbreviations and finally text-unit borders.
     *
     * @param anInputText the text to tokenize
     * @param aLanguage the language of the text
     * @return the input text with {@link #CLASS_ANNO} and {@link #BORDER_ANNO} annotations
     * @throws LanguageNotSupportedException if the language is not configured
     * @throws ProcessingException if a token cannot be assigned to any class
     */
    public AnnotatedString tokenize(String anInputText, String aLanguage) {
        LanguageResource langRes = this.getLanguageResource(aLanguage);
        FastAnnotatedString input = new FastAnnotatedString(anInputText);
        this.identifyTokens(input, langRes);
        this.identifyPunct(input, langRes);
        this.identifyClitics(input, langRes);
        this.identifyNumbers(input, langRes);
        this.identifyAbbrev(input, langRes);
        this.identifyTus(input, langRes);
        return input;
    }

    /**
     * Annotates every maximal run of non-whitespace characters with the root
     * token class. The non-breaking space (U+00A0) counts as whitespace.
     */
    private void identifyTokens(AnnotatedString input, LanguageResource langRes) {
        String rootClass = langRes.getClassesRoot().getTagName();
        int tokenStart = 0;
        boolean tokenFound = false;
        for (char c = input.first(); c != DONE; c = input.next()) {
            boolean whitespace = Character.isWhitespace(c) || c == '\u00a0';
            if (whitespace) {
                if (tokenFound) {
                    // the token ends right before this whitespace character
                    input.annotate(CLASS_ANNO, rootClass, tokenStart, input.getIndex());
                    tokenFound = false;
                }
            } else if (!tokenFound) {
                tokenFound = true;
                tokenStart = input.getIndex();
            }
        }
        if (tokenFound) {
            // last token runs up to the end of the text
            input.annotate(CLASS_ANNO, rootClass, tokenStart, input.getIndex());
        }
    }

    /**
     * Splits punctuation off the tokens found so far. Every punctuation match
     * inside a token becomes its own token with a punctuation class; the text
     * between matches keeps the class of the original token.
     *
     * <p>NOTE(review): "internal", "nbr" and "nbl" presumably stand for
     * token-internal, non-breaking-right and non-breaking-left punctuation;
     * their definitions live in the language resources.
     */
    private void identifyPunct(AnnotatedString input, LanguageResource langRes) {
        RegExp allPunctMatcher = langRes.getAllPunctMatcher();
        RegExp internalMatcher = langRes.getInternalMatcher();
        RegExp nbrMatcher = langRes.getNbrMatcher();
        RegExp nblMatcher = langRes.getNblMatcher();

        // position the cursor on the first annotated token
        char c = input.setIndex(0);
        if (null == input.getAnnotation(CLASS_ANNO)) {
            c = input.setIndex(input.findNextAnnotation(CLASS_ANNO));
        }
        while (c != DONE) {
            int tokenEnd = input.getRunLimit(CLASS_ANNO);
            if (null == input.getAnnotation(CLASS_ANNO)) {
                c = input.setIndex(input.findNextAnnotation(CLASS_ANNO));
                continue;
            }
            String tokClass = (String) input.getAnnotation(CLASS_ANNO);
            int tokenStart = input.getIndex();
            // move the cursor on now; everything below works on the extracted image
            c = input.setIndex(input.findNextAnnotation(CLASS_ANNO));
            String image = input.substring(tokenStart, tokenEnd);
            List<?> matches = allPunctMatcher.getAllMatches(image);
            if (matches.isEmpty()) {
                continue;
            }
            // index: start (relative to the image) of the part not yet annotated
            int index = 0;
            for (int i = 0; i < matches.size(); ++i) {
                Match oneMatch = (Match) matches.get(i);
                if (index != oneMatch.getStartIndex()) {
                    // there is non-punctuation text in front of this match
                    if (internalMatcher.matches(oneMatch.toString())
                            && this.isRightContextEnd(oneMatch, matches, image, i)) {
                        // punctuation with text on both sides stays inside the token
                        continue;
                    }
                    if (nbrMatcher.matches(oneMatch.toString())) {
                        // keep the punctuation attached to the text on its left
                        input.annotate(CLASS_ANNO, tokClass, tokenStart + index, tokenStart + oneMatch.getEndIndex());
                        index = oneMatch.getEndIndex();
                        continue;
                    }
                    // close the text in front of the punctuation as its own token
                    input.annotate(CLASS_ANNO, tokClass, tokenStart + index, tokenStart + oneMatch.getStartIndex());
                    index = oneMatch.getStartIndex();
                } else if (nblMatcher.matches(oneMatch.toString())
                        && this.isRightContextEnd(oneMatch, matches, image, i)) {
                    // punctuation at the current start stays attached to the text on its right
                    continue;
                }
                String punctClass = this.identifyPunctClass(oneMatch, null, image, langRes);
                input.annotate(CLASS_ANNO, punctClass, tokenStart + index, tokenStart + oneMatch.getEndIndex());
                index = oneMatch.getEndIndex();
            }
            if (index != image.length()) {
                // remaining text after the last punctuation
                input.annotate(CLASS_ANNO, tokClass, tokenStart + index, tokenStart + image.length());
            }
        }
    }

    /**
     * Tells whether the given match has a right context, i.e. whether it is
     * followed by characters that are not part of the directly following match.
     *
     * @param oneMatch the current match
     * @param matches all matches found in the image
     * @param image the token image the matches refer to
     * @param i position of {@code oneMatch} in {@code matches}
     * @return {@code true} if the next match does not start where this one ends
     *         or, for the last match, if it does not end at the end of the image
     */
    private boolean isRightContextEnd(Match oneMatch, List<?> matches, String image, int i) {
        if (i < matches.size() - 1) {
            Match nextMatch = (Match) matches.get(i + 1);
            return nextMatch.getStartIndex() != oneMatch.getEndIndex();
        }
        return oneMatch.getEndIndex() != image.length();
    }

    /**
     * Determines the class of a punctuation match. The ambiguous class
     * {@code OPENCLOSE_PUNCT} is resolved by the neighbouring characters: no
     * letter to the right means closing, otherwise no letter to the left means
     * opening; with letters on both sides the class stays ambiguous.
     *
     * @param punct the punctuation match; its indices must refer to {@code image}
     * @param regExp the matcher that produced the match, or {@code null}
     * @param image the token image containing the punctuation
     * @param langRes the language resource providing the punctuation description
     * @return the punctuation class
     */
    private String identifyPunctClass(Match punct, RegExp regExp, String image, LanguageResource langRes) {
        String oneClass = this.identifyClass(punct.toString(), regExp, langRes.getPunctDescr());
        if (!oneClass.equals("OPENCLOSE_PUNCT")) {
            return oneClass;
        }
        int nextIndex = punct.getEndIndex();
        if (nextIndex >= image.length() || !Character.isLetter(image.charAt(nextIndex))) {
            return "CLOSE_PUNCT";
        }
        int prevIndex = punct.getStartIndex() - 1;
        if (prevIndex < 0 || !Character.isLetter(image.charAt(prevIndex))) {
            return "OPEN_PUNCT";
        }
        return oneClass;
    }

    /**
     * Splits proclitics and enclitics off tokens that still carry the root
     * class. Punctuation matched by the nbl/nbr matchers is split off as well
     * when a clitic is found next to it.
     */
    private void identifyClitics(AnnotatedString input, LanguageResource langRes) {
        RegExp proclitMatcher = langRes.getProcliticsMatcher();
        RegExp enclitMatcher = langRes.getEncliticsMatcher();
        RegExp nbrMatcher = langRes.getNbrMatcher();
        RegExp nblMatcher = langRes.getNblMatcher();
        String rootClass = langRes.getClassesRoot().getTagName();

        char c = input.setIndex(0);
        if (null == input.getAnnotation(CLASS_ANNO)) {
            c = input.setIndex(input.findNextAnnotation(CLASS_ANNO));
        }
        while (c != DONE) {
            int tokenEnd = input.getRunLimit(CLASS_ANNO);
            String tokClass = (String) input.getAnnotation(CLASS_ANNO);
            // only tokens that have not been classified yet are of interest;
            // compare by value, the class names need not be the same instance
            if (!rootClass.equals(tokClass)) {
                c = input.setIndex(input.findNextAnnotation(CLASS_ANNO));
                continue;
            }
            int tokenStart = input.getIndex();
            c = input.setIndex(input.findNextAnnotation(CLASS_ANNO));
            String image = input.substring(tokenStart, tokenEnd);
            // startIndex/endIndex delimit the part of the image not yet annotated
            int startIndex = 0;
            int endIndex = image.length();

            // proclitics, looked up behind a leading nbl punctuation if there is one
            Match nbl = nblMatcher.contains(image);
            Match proclit;
            if (null != nbl) {
                proclit = proclitMatcher.contains(image.substring(nbl.getEndIndex(), endIndex));
            } else {
                proclit = proclitMatcher.contains(image);
            }
            if (null != nbl && null != proclit) {
                String punctClass = this.identifyPunctClass(nbl, nblMatcher, image, langRes);
                input.annotate(CLASS_ANNO, punctClass, tokenStart + nbl.getStartIndex(), tokenStart + nbl.getEndIndex());
                startIndex = nbl.getEndIndex();
            }
            while (null != proclit) {
                String clitClass = this.identifyClass(proclit.toString(), proclitMatcher, langRes.getClitDescr());
                // match indices are relative to the substring starting at startIndex
                input.annotate(CLASS_ANNO, clitClass, tokenStart + startIndex + proclit.getStartIndex(), tokenStart + startIndex + proclit.getEndIndex());
                startIndex += proclit.getEndIndex();
                proclit = proclitMatcher.contains(image.substring(startIndex, image.length()));
            }

            // enclitics, looked up in front of a trailing nbr punctuation if there is one
            Match nbr = nbrMatcher.contains(image);
            Match enclit;
            if (null != nbr) {
                enclit = enclitMatcher.contains(image.substring(startIndex, nbr.getStartIndex()));
            } else {
                enclit = enclitMatcher.contains(image.substring(startIndex, endIndex));
            }
            if (null != nbr && null != enclit) {
                String punctClass = this.identifyPunctClass(nbr, nbrMatcher, image, langRes);
                input.annotate(CLASS_ANNO, punctClass, tokenStart + nbr.getStartIndex(), tokenStart + nbr.getEndIndex());
            }
            while (null != enclit) {
                String clitClass = this.identifyClass(enclit.toString(), enclitMatcher, langRes.getClitDescr());
                input.annotate(CLASS_ANNO, clitClass, tokenStart + startIndex + enclit.getStartIndex(), tokenStart + startIndex + enclit.getEndIndex());
                // continue searching in the part in front of the enclitic just found
                endIndex = startIndex + enclit.getStartIndex();
                enclit = enclitMatcher.contains(image.substring(startIndex, endIndex));
            }

            if (startIndex != endIndex) {
                // what is left between the clitics keeps the root class
                input.annotate(CLASS_ANNO, rootClass, tokenStart + startIndex, tokenStart + endIndex);
            }
        }
    }

    /**
     * Classifies numbers among the tokens that still carry the root class.
     * A token ending in a period is either an ordinal as a whole, or a number
     * followed by a separate period token.
     */
    private void identifyNumbers(AnnotatedString input, LanguageResource langRes) {
        RegExp simpleDigitsMatcher = langRes.getSimpleDigitsMatcher();
        RegExp ordinalMatcher = langRes.getOrdinalMatcher();
        RegExp digitsMatcher = langRes.getDigitsMatcher();
        String rootClass = langRes.getClassesRoot().getTagName();

        char c = input.setIndex(0);
        if (null == input.getAnnotation(CLASS_ANNO)) {
            c = input.setIndex(input.findNextAnnotation(CLASS_ANNO));
        }
        while (c != DONE) {
            int tokenEnd = input.getRunLimit(CLASS_ANNO);
            String tokClass = (String) input.getAnnotation(CLASS_ANNO);
            // compare by value, the class names need not be the same instance
            if (!rootClass.equals(tokClass)) {
                c = input.setIndex(input.findNextAnnotation(CLASS_ANNO));
                continue;
            }
            int tokenStart = input.getIndex();
            c = input.setIndex(input.findNextAnnotation(CLASS_ANNO));
            String image = input.substring(tokenStart, tokenEnd);
            // cheap pre-check before the more specific matchers are tried
            if (null == simpleDigitsMatcher.contains(image)) {
                continue;
            }
            boolean periodFlag = false;
            if ('.' == image.charAt(image.length() - 1)) {
                periodFlag = true;
                if (ordinalMatcher.matches(image)) {
                    String ordClass = this.identifyClass(image, ordinalMatcher, langRes.getNumbDescr());
                    input.annotate(CLASS_ANNO, ordClass, tokenStart, tokenEnd);
                    continue;
                }
                // not an ordinal: look at the image without the period;
                // tokenEnd now is the position of the period
                image = image.substring(0, image.length() - 1);
                --tokenEnd;
            }
            Match digit = digitsMatcher.contains(image);
            if (null == digit) {
                continue;
            }
            String numbClass = this.identifyClass(digit.toString(), digitsMatcher, langRes.getNumbDescr());
            input.annotate(CLASS_ANNO, numbClass, tokenStart + digit.getStartIndex(), tokenStart + digit.getEndIndex());
            if (periodFlag) {
                // the stripped period becomes a punctuation token of its own
                String punctClass = this.identifyClass(".", null, langRes.getPunctDescr());
                input.annotate(CLASS_ANNO, punctClass, tokenEnd, tokenEnd + 1);
            }
        }
    }

    /**
     * Classifies abbreviations among the tokens that still carry the root class
     * and contain a single period matched by the nbr matcher. The token is
     * looked up in the abbreviation lists first, then checked against the
     * initial and abbreviation patterns; if nothing applies, the period is
     * split off as punctuation.
     */
    private void identifyAbbrev(AnnotatedString input, LanguageResource langRes) {
        RegExp nbrMatcher = langRes.getNbrMatcher();
        RegExp abbrevMatcher = langRes.getAbbrevMatcher();
        RegExp initialMatcher = langRes.getInitialMatcher();
        Map<?, ?> abbrevLists = langRes.getAbbrevLists();
        String rootClass = langRes.getClassesRoot().getTagName();

        char c = input.setIndex(0);
        if (null == input.getAnnotation(CLASS_ANNO)) {
            c = input.setIndex(input.findNextAnnotation(CLASS_ANNO));
        }
        while (c != DONE) {
            int tokenEnd = input.getRunLimit(CLASS_ANNO);
            String tokClass = (String) input.getAnnotation(CLASS_ANNO);
            // compare by value, the class names need not be the same instance
            if (!rootClass.equals(tokClass)) {
                c = input.setIndex(input.findNextAnnotation(CLASS_ANNO));
                continue;
            }
            int tokenStart = input.getIndex();
            c = input.setIndex(input.findNextAnnotation(CLASS_ANNO));
            String image = input.substring(tokenStart, tokenEnd);
            Match nbr = nbrMatcher.contains(image);
            if (null == nbr
                    || nbr.getEndIndex() - nbr.getStartIndex() != 1
                    || input.charAt(tokenStart + nbr.getStartIndex()) != '.') {
                continue;
            }

            // For hyphenated tokens only the part after the last hyphen is
            // looked up, provided it has at least two characters and no digit.
            // The full image is kept because the indices of nbr refer to it.
            String candidate = image;
            int hyphenPos = image.lastIndexOf("-");
            if (hyphenPos != -1) {
                String afterHyphen = image.substring(hyphenPos + 1);
                if (afterHyphen.matches("[^0-9]{2,}")) {
                    candidate = afterHyphen;
                }
            }

            // 1. abbreviation lists: the key is the class, the value the set of abbreviations
            boolean found = false;
            for (Map.Entry<?, ?> entry : abbrevLists.entrySet()) {
                Set<?> oneList = (Set<?>) entry.getValue();
                if (oneList.contains(candidate)) {
                    input.annotate(CLASS_ANNO, (String) entry.getKey(), tokenStart, tokenStart + nbr.getEndIndex());
                    found = true;
                    break;
                }
            }
            if (found) {
                continue;
            }
            // 2. initials; the matcher may be missing for a language
            if (initialMatcher != null && initialMatcher.matches(candidate)) {
                String initialClass = this.identifyClass(candidate, initialMatcher, langRes.getAbbrevDescr());
                input.annotate(CLASS_ANNO, initialClass, tokenStart, tokenStart + nbr.getEndIndex());
                continue;
            }
            // 3. generic abbreviation pattern
            if (abbrevMatcher.matches(candidate)) {
                String abbrevClass = this.identifyClass(candidate, abbrevMatcher, langRes.getAbbrevDescr());
                input.annotate(CLASS_ANNO, abbrevClass, tokenStart, tokenStart + nbr.getEndIndex());
                continue;
            }
            // 4. no abbreviation: the period is a punctuation token of its own
            String punctClass = this.identifyPunctClass(nbr, nbrMatcher, image, langRes);
            input.annotate(CLASS_ANNO, punctClass, tokenStart + nbr.getStartIndex(), tokenStart + nbr.getEndIndex());
        }
    }

    /**
     * Marks the first character of every text unit with {@link #TU_BORDER} and
     * the first character after a paragraph change with {@link #P_BORDER}.
     *
     * <p>The scan keeps two flags: {@code eosMode} is set after terminal
     * punctuation and waits for the first token that can start a new text
     * unit; {@code abbrevMode} is set after a {@code B_ABBREVIATION} token and
     * checks whether the following token starts a new text unit.
     */
    private void identifyTus(AnnotatedString input, LanguageResource langRes) {
        RegExp intPunctMatcher = langRes.getInternalTuMatcher();
        boolean eosMode = false;
        boolean abbrevMode = false;

        char c = input.setIndex(0);
        while (c != DONE) {
            int tokenStart = input.getRunStart(CLASS_ANNO);
            int tokenEnd = input.getRunLimit(CLASS_ANNO);
            String tokClass = (String) input.getAnnotation(CLASS_ANNO);

            if (null == tokClass) {
                // run without class annotation, i.e. whitespace between tokens
                boolean paragraphChange = this.isParagraphChange(input.substring(tokenStart, tokenEnd));
                c = input.setIndex(tokenEnd);
                if (paragraphChange) {
                    eosMode = false;
                    abbrevMode = false;
                    if (c != DONE) {
                        input.annotate(BORDER_ANNO, P_BORDER, input.getIndex(), input.getIndex() + 1);
                    }
                }
                continue;
            }

            if (eosMode) {
                boolean stillClosing = langRes.isAncestor("TERM_PUNCT", tokClass)
                        || langRes.isAncestor("TERM_PUNCT_P", tokClass)
                        || langRes.isAncestor("CLOSE_PUNCT", tokClass)
                        || langRes.isAncestor("CLOSE_BRACKET", tokClass);
                if (!stillClosing) {
                    boolean continuesTextUnit = Character.isLowerCase(c)
                            || intPunctMatcher.matches(input.substring(input.getIndex(), input.getIndex() + 1));
                    if (!continuesTextUnit) {
                        // first token of the next text unit
                        input.annotate(BORDER_ANNO, TU_BORDER, tokenStart, tokenStart + 1);
                    }
                    eosMode = false;
                }
            } else if (abbrevMode) {
                String image = input.substring(tokenStart, tokenEnd);
                if (langRes.getNonCapTerms().contains(image) || langRes.isAncestor("OPEN_PUNCT", tokClass)) {
                    input.annotate(BORDER_ANNO, TU_BORDER, tokenStart, tokenStart + 1);
                }
                abbrevMode = false;
                // deliberately do not advance: the same token is examined again
                // in normal mode, it may itself be terminal punctuation or
                // another abbreviation
                continue;
            } else if (langRes.isAncestor("TERM_PUNCT", tokClass) || langRes.isAncestor("TERM_PUNCT_P", tokClass)) {
                eosMode = true;
            } else if (langRes.isAncestor("B_ABBREVIATION", tokClass)) {
                abbrevMode = true;
            }
            c = input.setIndex(tokenEnd);
        }
    }

    /**
     * Tells whether a whitespace run marks a paragraph change, which is the
     * case when it contains the same line break character (line feed or
     * carriage return) at least twice.
     */
    private boolean isParagraphChange(String wSpaces) {
        return occursTwice(wSpaces, '\n') || occursTwice(wSpaces, '\r');
    }

    /** Returns {@code true} if {@code ch} occurs at least twice in {@code text}. */
    private static boolean occursTwice(String text, char ch) {
        int first = text.indexOf(ch);
        return first != -1 && text.indexOf(ch, first + 1) != -1;
    }

    /**
     * Determines the class of the given string within one description.
     *
     * @param aString the string to classify
     * @param regExp the matcher that recognised the string, or {@code null};
     *        if the description maps it to a class, that class is returned directly
     * @param descr the description holding the class definitions
     * @return the class mapped to {@code regExp}, otherwise the first class
     *         whose definition matches the string
     * @throws ProcessingException if no class matches
     */
    private String identifyClass(String aString, RegExp regExp, Description descr) {
        if (null != regExp) {
            Map<?, ?> regExpMap = descr.getRegExpMap();
            Object mappedClass = regExpMap.get(regExp);
            if (null != mappedClass) {
                return (String) mappedClass;
            }
        }
        Map<?, ?> definitionsMap = descr.getDefinitionsMap();
        for (Map.Entry<?, ?> definition : definitionsMap.entrySet()) {
            RegExp oneRE = (RegExp) definition.getValue();
            if (oneRE.matches(aString)) {
                return (String) definition.getKey();
            }
        }
        throw new ProcessingException("could not find class for " + aString);
    }

    /**
     * Checks the ancestor relation between two token class tags in the class
     * hierarchy of the given language. The tags are translated to class names
     * via the classes map of the language; the check itself is delegated to
     * {@link LanguageResource}.
     *
     * @param tag1 the first token class tag
     * @param tag2 the second token class tag
     * @param aLanguage the language whose class hierarchy is used
     * @return the result of the language resource's ancestor check for the two classes
     * @throws ProcessingException if one of the tags is not defined
     * @throws LanguageNotSupportedException if the language is not configured
     */
    public boolean isAncestor(String tag1, String tag2, String aLanguage) throws ProcessingException {
        LanguageResource langRes = this.getLanguageResource(aLanguage);
        String class1 = (String) langRes.getClassesMap().get(tag1);
        if (null == class1) {
            throw new ProcessingException("undefined token class tag " + tag1);
        }
        String class2 = (String) langRes.getClassesMap().get(tag2);
        if (null == class2) {
            throw new ProcessingException("undefined token class tag " + tag2);
        }
        return langRes.isAncestor(class1, class2);
    }

    /**
     * Command line entry point: tokenizes a file and prints the resulting
     * paragraphs to standard output.
     *
     * @param args file name, language and optionally the file encoding
     *        (default ISO-8859-1)
     */
    public static void main(String[] args) {
        if (args.length != 2 && args.length != 3) {
            System.out.println("This method needs two arguments:\n- a file name for the document to tokenize\n- the language of the document\n- an optional encoding to use (default is ISO-8859-1)\nSupported languages are: de, en, it");
            System.exit(1);
        }
        String encoding = "ISO-8859-1";
        if (args.length == 3) {
            encoding = args[2];
        }
        String text = null;
        try {
            text = FileTools.readFileAsString(new File(args[0]), encoding);
        }
        catch (IOException ioe) {
            System.err.println(ioe.toString());
            System.exit(1);
        }
        try {
            JTok testTok = new JTok();
            AnnotatedString result = testTok.tokenize(text, args[1]);
            Iterator<Paragraph> it = ParagraphOutputter.createParagraphs(result).iterator();
            while (it.hasNext()) {
                System.out.println(it.next());
            }
        }
        catch (LanguageNotSupportedException e) {
            // report a wrong language argument without a stack trace
            System.err.println(e.toString());
            System.exit(1);
        }
        catch (IOException e) {
            LOG.error(e.getLocalizedMessage(), (Throwable)e);
        }
    }
}

