/*
 * Decompiled with CFR 0.152.
 */
package edu.stanford.nlp.process;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.AbstractTokenizer;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.LexedTokenFactory;
import edu.stanford.nlp.process.PTB2TextLexer;
import edu.stanford.nlp.process.PTBLexer;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.process.WordTokenFactory;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.StringReader;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

/**
 * Tokenizer that pulls tokens from a {@link PTBLexer} and exposes them through the
 * {@link AbstractTokenizer} contract.
 *
 * <p>Besides tokenizing, the class offers static helpers that run PTB-style token text
 * back through a {@link PTB2TextLexer} ("untokenizing"), a set of factory methods, and a
 * command-line entry point ({@link #main(String[])}).
 *
 * @param <T> the token type produced by the configured {@link LexedTokenFactory}
 */
public class PTBTokenizer<T extends HasWord> extends AbstractTokenizer<T> {
    private static final Redwood.RedwoodChannels log = Redwood.channels(PTBTokenizer.class);

    /** Token text that the command-line tokenizer treats as a line break when preserving lines. */
    private static final String NEWLINE_TOKEN = "*NL*";

    /** The lexer every token of this tokenizer is read from. */
    private final PTBLexer lexer;

    /**
     * Creates a tokenizer over {@code r} that produces {@link Word} tokens, using an empty
     * option string.
     *
     * @param r the text source
     * @return a new tokenizer
     */
    public static PTBTokenizer<Word> newPTBTokenizer(Reader r) {
        return new PTBTokenizer<Word>(r, new WordTokenFactory(), "");
    }

    /**
     * Creates a tokenizer over {@code r} that produces {@link CoreLabel} tokens.
     *
     * @param r the text source
     * @param tokenizeNLs whether the {@code tokenizeNLs} option is passed to the lexer
     * @param invertible whether the {@code invertible} option is passed to the lexer
     * @return a new tokenizer
     */
    public static PTBTokenizer<CoreLabel> newPTBTokenizer(Reader r, boolean tokenizeNLs, boolean invertible) {
        return new PTBTokenizer<CoreLabel>(r, tokenizeNLs, invertible, false, new CoreLabelTokenFactory());
    }

    /**
     * Builds the lexer option string used by the boolean-flag constructors of this class and
     * of {@link PTBTokenizerFactory}.
     *
     * @param tokenizeNLs appends {@code ",tokenizeNLs"} when true
     * @param invertible appends {@code ",invertible"} when true
     * @param suppressEscaping selects {@code ptb3Escaping=false} instead of {@code ptb3Escaping=true}
     * @return the comma-separated option string
     */
    private static String booleanOptionString(boolean tokenizeNLs, boolean invertible, boolean suppressEscaping) {
        StringBuilder sb = new StringBuilder(suppressEscaping ? "ptb3Escaping=false" : "ptb3Escaping=true");
        if (tokenizeNLs) {
            sb.append(",tokenizeNLs");
        }
        if (invertible) {
            sb.append(",invertible");
        }
        return sb.toString();
    }

    /**
     * Flag-based constructor; translates the flags into an option string and delegates to
     * {@link #PTBTokenizer(Reader, LexedTokenFactory, String)}.
     */
    private PTBTokenizer(Reader r, boolean tokenizeNLs, boolean invertible, boolean suppressEscaping, LexedTokenFactory<T> tokenFactory) {
        this(r, tokenFactory, booleanOptionString(tokenizeNLs, invertible, suppressEscaping));
    }

    /**
     * Creates a tokenizer over {@code r}.
     *
     * @param r the text source
     * @param tokenFactory the factory the lexer uses to build tokens
     * @param options the option string handed to the {@link PTBLexer} unchanged
     */
    public PTBTokenizer(Reader r, LexedTokenFactory<T> tokenFactory, String options) {
        this.lexer = new PTBLexer(r, tokenFactory, options);
    }

    /**
     * Fetches the next token from the lexer.
     *
     * @return whatever the lexer's {@code next()} returns, cast to {@code T}
     * @throws RuntimeIOException wrapping any {@link IOException} raised by the lexer
     */
    @Override
    @SuppressWarnings("unchecked")
    protected T getNext() {
        try {
            // The lexer is not generic; the token type is determined by the factory it was given.
            return (T)((HasWord)this.lexer.next());
        }
        catch (IOException e) {
            throw new RuntimeIOException(e);
        }
    }

    /**
     * @return the token text {@code "*NL*"}, which the command-line tokenizer treats as a line break
     */
    public static String getNewlineToken() {
        return NEWLINE_TOKEN;
    }

    /**
     * Runs {@code ptbText} through a {@link PTB2TextLexer} and concatenates everything it emits.
     *
     * @param ptbText the text to convert; must not be null
     * @return the concatenated lexer output
     * @throws RuntimeIOException wrapping any {@link IOException} raised by the lexer
     */
    public static String ptb2Text(String ptbText) {
        StringBuilder sb = new StringBuilder(ptbText.length());
        PTB2TextLexer lexer = new PTB2TextLexer(new StringReader(ptbText));
        try {
            String token;
            while ((token = lexer.next()) != null) {
                sb.append(token);
            }
        }
        catch (IOException e) {
            throw new RuntimeIOException(e);
        }
        return sb.toString();
    }

    /**
     * Converts a single piece of token text: pads it with one space on each side, applies
     * {@link #ptb2Text(String)}, and trims the result.
     *
     * @param ptbText the token text to convert
     * @return the converted, trimmed text
     */
    public static String ptbToken2Text(String ptbText) {
        return PTBTokenizer.ptb2Text(' ' + ptbText + ' ').trim();
    }

    /**
     * Streams {@code ptbText} through a {@link PTB2TextLexer} and writes each emitted piece to
     * {@code w}. Neither the reader nor the writer is closed or flushed here.
     *
     * @param ptbText the source to read
     * @param w the destination for the converted text
     * @return the number of pieces the lexer returned
     * @throws IOException if reading or writing fails
     */
    public static int ptb2Text(Reader ptbText, Writer w) throws IOException {
        String token;
        int numTokens = 0;
        PTB2TextLexer lexer = new PTB2TextLexer(ptbText);
        while ((token = lexer.next()) != null) {
            ++numTokens;
            w.write(token);
        }
        return numTokens;
    }

    /** Wraps {@code System.out} in a buffered writer that encodes with {@code charset}. */
    private static BufferedWriter stdoutWriter(String charset) throws IOException {
        return new BufferedWriter(new OutputStreamWriter((OutputStream)System.out, charset));
    }

    /**
     * Command-line "untokenize" driver: converts stdin (when no input files are given) or each
     * input file, writing to stdout or to the matching entry of {@code outputFileList}, then
     * reports throughput on {@code System.err}.
     *
     * @param inputFileList input paths; empty means read from stdin
     * @param outputFileList output paths parallel to {@code inputFileList}, or null for stdout
     * @param charset the encoding used for all readers and writers
     * @throws IOException if any stream cannot be opened, read, written, or closed
     */
    private static void untok(List<String> inputFileList, List<String> outputFileList, String charset) throws IOException {
        long start = System.nanoTime();
        int numTokens = 0;
        if (inputFileList.isEmpty()) {
            Reader stdin = new InputStreamReader(System.in, charset);
            try (BufferedWriter stdout = stdoutWriter(charset)) {
                numTokens = PTBTokenizer.ptb2Text(stdin, stdout);
            }
        } else if (outputFileList == null) {
            // One writer for all files: closing a writer over System.out closes System.out itself,
            // so a per-file writer would silently drop the output of every file after the first.
            try (BufferedWriter stdout = stdoutWriter(charset)) {
                for (String inputFile : inputFileList) {
                    try (Reader r = IOUtils.readerFromString(inputFile, charset)) {
                        numTokens += PTBTokenizer.ptb2Text(r, stdout);
                    }
                }
            }
        } else {
            for (int j = 0; j < inputFileList.size(); ++j) {
                // The reader is opened first so a missing input does not create an empty output file.
                try (Reader r = IOUtils.readerFromString(inputFileList.get(j), charset);
                     BufferedWriter writer = new BufferedWriter(new OutputStreamWriter((OutputStream)new FileOutputStream(outputFileList.get(j)), charset))) {
                    numTokens += PTBTokenizer.ptb2Text(r, writer);
                }
            }
        }
        long duration = System.nanoTime() - start;
        double wordsPerSec = (double)numTokens / ((double)duration / 1.0E9);
        System.err.printf("PTBTokenizer untokenized %d tokens at %.2f tokens per second.%n", numTokens, wordsPerSec);
    }

    /**
     * Joins {@code ptbWords} with {@link StringUtils#join} and converts the result with
     * {@link #ptb2Text(String)}.
     *
     * @param ptbWords the token strings
     * @return the converted text
     */
    public static String ptb2Text(List<String> ptbWords) {
        return PTBTokenizer.ptb2Text(StringUtils.join(ptbWords));
    }

    /**
     * Collects {@code word()} of every element and converts the list with
     * {@link #ptb2Text(List)}.
     *
     * @param ptbWords the tokens whose words are converted
     * @return the converted text
     */
    public static String labelList2Text(List<? extends HasWord> ptbWords) {
        List<String> words = new ArrayList<String>(ptbWords.size());
        for (HasWord hasWord : ptbWords) {
            words.add(hasWord.word());
        }
        return PTBTokenizer.ptb2Text(words);
    }

    /**
     * Command-line tokenize driver: tokenizes stdin (when no input files are given) or each
     * input file, writing to stdout or to the matching entry of {@code outputFileList}, then
     * reports throughput on {@code System.err}.
     *
     * @param inputFileList input paths; empty means read from stdin
     * @param outputFileList output paths parallel to {@code inputFileList}, or null for stdout
     * @param charset the encoding used for all readers and writers
     * @param parseInsidePattern see {@link #tokReader}; may be null
     * @param options the lexer option string
     * @param preserveLines see {@link #tokReader}
     * @param dump see {@link #tokReader}
     * @param lowerCase see {@link #tokReader}
     * @throws IOException if any stream cannot be opened, read, or written, or if a per-file
     *         output cannot be flushed and closed
     */
    private static void tok(List<String> inputFileList, List<String> outputFileList, String charset, Pattern parseInsidePattern, String options, boolean preserveLines, boolean dump, boolean lowerCase) throws IOException {
        long start = System.nanoTime();
        long numTokens = 0L;
        if (inputFileList.isEmpty()) {
            BufferedReader stdin = IOUtils.readerFromStdin(charset);
            BufferedWriter stdout = stdoutWriter(charset);
            try {
                numTokens += (long)PTBTokenizer.tokReader(stdin, stdout, parseInsidePattern, options, preserveLines, dump, lowerCase);
            } finally {
                IOUtils.closeIgnoringExceptions(stdout);
            }
        } else if (outputFileList == null) {
            // All files share a single stdout writer, closed once after the last file.
            BufferedWriter stdout = stdoutWriter(charset);
            try {
                for (String inputFile : inputFileList) {
                    try (Reader r = IOUtils.readerFromString(inputFile, charset)) {
                        numTokens += (long)PTBTokenizer.tokReader(r, stdout, parseInsidePattern, options, preserveLines, dump, lowerCase);
                    }
                }
            } finally {
                IOUtils.closeIgnoringExceptions(stdout);
            }
        } else {
            for (int j = 0; j < inputFileList.size(); ++j) {
                // Every input gets its own freshly opened output writer. Reusing the previous
                // (already closed) writer would fail on the second file. Closing through
                // try-with-resources also surfaces a failed final flush instead of hiding it.
                try (Reader r = IOUtils.readerFromString(inputFileList.get(j), charset);
                     BufferedWriter out = new BufferedWriter(new OutputStreamWriter((OutputStream)new FileOutputStream(outputFileList.get(j)), charset))) {
                    numTokens += (long)PTBTokenizer.tokReader(r, out, parseInsidePattern, options, preserveLines, dump, lowerCase);
                }
            }
        }
        long duration = System.nanoTime() - start;
        double wordsPerSec = (double)numTokens / ((double)duration / 1.0E9);
        System.err.printf("PTBTokenizer tokenized %d tokens at %.2f tokens per second.%n", numTokens, wordsPerSec);
    }

    /**
     * Tokenizes {@code r} into {@link CoreLabel}s and writes them to {@code writer}.
     *
     * <p>Output format: one token per line, or, when {@code preserveLines} is set, tokens
     * separated by single spaces with a line break wherever the token text is {@code "*NL*"}.
     *
     * <p>When {@code parseInsidePattern} is non-null, printing starts switched off. A token whose
     * original text fully matches the pattern is never printed; it switches printing on when
     * group 1 is empty and off otherwise.
     *
     * @param r the text source; not closed here
     * @param writer the destination; not flushed or closed here
     * @param parseInsidePattern element pattern controlling which stretches are printed, or null
     *        to print everything
     * @param options the lexer option string
     * @param preserveLines whether to keep the line structure instead of one token per line
     * @param dump whether to print {@code toShorterString} of the token instead of its text
     * @param lowerCase whether to lower-case the token text (English locale) and store it back
     *        on the token before printing
     * @return the number of tokens read, including those that were not printed
     * @throws IOException if writing fails
     */
    private static int tokReader(Reader r, BufferedWriter writer, Pattern parseInsidePattern, String options, boolean preserveLines, boolean dump, boolean lowerCase) throws IOException {
        int numTokens = 0;
        boolean beginLine = true;
        boolean printing = parseInsidePattern == null;
        Matcher m = parseInsidePattern == null ? null : parseInsidePattern.matcher("");
        PTBTokenizer<CoreLabel> tokenizer = new PTBTokenizer<CoreLabel>(r, new CoreLabelTokenFactory(), options);
        while (tokenizer.hasNext()) {
            CoreLabel obj = (CoreLabel)tokenizer.next();
            String origStr = (String)obj.get(CoreAnnotations.TextAnnotation.class);
            String str = origStr;
            if (lowerCase) {
                str = origStr.toLowerCase(Locale.ENGLISH);
                obj.set(CoreAnnotations.TextAnnotation.class, str);
            }
            // Matching always uses the original text, never the lower-cased one.
            if (m != null && m.reset(origStr).matches()) {
                // Group 1 is the optional '/' of a closing element.
                printing = m.group(1).isEmpty();
            } else if (printing) {
                if (dump) {
                    str = obj.toShorterString(new String[0]);
                }
                if (!preserveLines) {
                    writer.write(str);
                    writer.newLine();
                } else if (NEWLINE_TOKEN.equals(origStr)) {
                    beginLine = true;
                    writer.newLine();
                } else {
                    if (beginLine) {
                        beginLine = false;
                    } else {
                        writer.write(' ');
                    }
                    writer.write(str);
                }
            }
            ++numTokens;
        }
        return numTokens;
    }

    /**
     * @return a factory producing {@link Word} tokenizers with an empty option string
     */
    public static TokenizerFactory<Word> factory() {
        return PTBTokenizerFactory.newTokenizerFactory();
    }

    /**
     * @param tokenizeNLs whether the {@code tokenizeNLs} option is set
     * @param invertible whether the {@code invertible} option is set
     * @return a factory producing {@link CoreLabel} tokenizers configured by the two flags
     */
    public static TokenizerFactory<CoreLabel> factory(boolean tokenizeNLs, boolean invertible) {
        return PTBTokenizerFactory.newPTBTokenizerFactory(tokenizeNLs, invertible);
    }

    /**
     * @return a factory producing {@link CoreLabel} tokenizers with an empty option string
     */
    public static TokenizerFactory<CoreLabel> coreLabelFactory() {
        return PTBTokenizer.coreLabelFactory("");
    }

    /**
     * @param options the lexer option string
     * @return a factory producing {@link CoreLabel} tokenizers with the given options
     */
    public static TokenizerFactory<CoreLabel> coreLabelFactory(String options) {
        return PTBTokenizerFactory.newPTBTokenizerFactory(new CoreLabelTokenFactory(), options);
    }

    /**
     * @param factory the token factory the tokenizers will use
     * @param options the lexer option string
     * @param <T> the token type
     * @return a factory producing tokenizers with the given token factory and options
     */
    public static <T extends HasWord> TokenizerFactory<T> factory(LexedTokenFactory<T> factory, String options) {
        return new PTBTokenizerFactory<T>(factory, options);
    }

    /**
     * @return the command-line flags known to {@link #main(String[])}, mapped to the number of
     *         arguments each one takes
     */
    private static Map<String, Integer> optionArgDefs() {
        Map<String, Integer> optionArgDefs = Generics.newHashMap();
        optionArgDefs.put("options", 1);
        optionArgDefs.put("ioFileList", 0);
        optionArgDefs.put("fileList", 0);
        optionArgDefs.put("lowerCase", 0);
        optionArgDefs.put("dump", 0);
        optionArgDefs.put("untok", 0);
        optionArgDefs.put("encoding", 1);
        optionArgDefs.put("parseInside", 1);
        optionArgDefs.put("preserveLines", 0);
        return optionArgDefs;
    }

    /**
     * Compiles the element pattern for the {@code -parseInside} flag.
     *
     * @param parseInsideKey the user-supplied regex for element names, or null
     * @return the compiled pattern, or null when no key was given or the key is not a valid regex
     */
    private static Pattern compileParseInsidePattern(String parseInsideKey) {
        if (parseInsideKey == null) {
            return null;
        }
        try {
            return Pattern.compile("<(/?)(?:" + parseInsideKey + ")(?:(?:\\s|\u00a0)[^>]*?)?>");
        }
        catch (PatternSyntaxException e) {
            // Best effort: carry on without the filter, but tell the user it was dropped.
            log.warn("Ignoring invalid -parseInside regex '" + parseInsideKey + "': " + e.getMessage());
            return null;
        }
    }

    /**
     * Command-line entry point. Reads the flags listed in the usage message, then tokenizes
     * (or, with {@code -untok}, untokenizes) the named files or stdin.
     *
     * <p>With {@code -fileList} or {@code -ioFileList} the remaining arguments name files that
     * list the inputs, one per line: the first whitespace-separated field is the input path and
     * the optional second field the output path (defaulting to the input path plus
     * {@code ".tok"}). With {@code -fileList} the output paths are discarded and everything goes
     * to stdout.
     *
     * @param args command-line arguments
     * @throws IOException if any file cannot be read or written
     */
    public static void main(String[] args) throws IOException {
        Properties options = StringUtils.argsToProperties(args, PTBTokenizer.optionArgDefs());
        boolean showHelp = PropertiesUtils.getBool(options, "help", false);
        showHelp = PropertiesUtils.getBool(options, "h", showHelp);
        if (showHelp) {
            log.info("Usage: java edu.stanford.nlp.process.PTBTokenizer [options]* filename*");
            log.info("  options: -h|-help|-options tokenizerOptions|-preserveLines|-lowerCase|-dump|");
            log.info("           -fileList|-ioFileList|-encoding encoding|-parseInside regex|-untok");
            return;
        }
        StringBuilder optionsSB = new StringBuilder();
        String tokenizerOptions = options.getProperty("options", null);
        if (tokenizerOptions != null) {
            optionsSB.append(tokenizerOptions);
        }
        boolean preserveLines = PropertiesUtils.getBool(options, "preserveLines", false);
        if (preserveLines) {
            // Line breaks can only be reproduced if the lexer emits them as tokens.
            optionsSB.append(",tokenizeNLs");
        }
        boolean inputOutputFileList = PropertiesUtils.getBool(options, "ioFileList", false);
        boolean fileList = PropertiesUtils.getBool(options, "fileList", false);
        boolean lowerCase = PropertiesUtils.getBool(options, "lowerCase", false);
        boolean dump = PropertiesUtils.getBool(options, "dump", false);
        boolean untok = PropertiesUtils.getBool(options, "untok", false);
        String charset = options.getProperty("encoding", "utf-8");
        Pattern parseInsidePattern = compileParseInsidePattern(options.getProperty("parseInside", null));

        // The property with the empty key holds the arguments that belong to no flag.
        String parsedArgStr = options.getProperty("", null);
        String[] parsedArgs = parsedArgStr == null ? null : parsedArgStr.split("\\s+");
        List<String> inputFileList = new ArrayList<String>();
        List<String> outputFileList = null;
        if (parsedArgs != null) {
            if (fileList || inputOutputFileList) {
                outputFileList = new ArrayList<String>();
                for (String fileName : parsedArgs) {
                    try (BufferedReader r = IOUtils.readerFromString(fileName, charset)) {
                        String inLine;
                        while ((inLine = r.readLine()) != null) {
                            String[] fields = inLine.split("\\s+");
                            inputFileList.add(fields[0]);
                            outputFileList.add(fields.length > 1 ? fields[1] : fields[0] + ".tok");
                        }
                    }
                }
                if (fileList) {
                    outputFileList = null;
                }
            } else {
                inputFileList.addAll(Arrays.asList(parsedArgs));
            }
        }
        if (untok) {
            PTBTokenizer.untok(inputFileList, outputFileList, charset);
        } else {
            PTBTokenizer.tok(inputFileList, outputFileList, charset, parseInsidePattern, optionsSB.toString(), preserveLines, dump, lowerCase);
        }
    }

    /**
     * Factory for {@link PTBTokenizer} instances that share one token factory and one option
     * string. The option string can be replaced later through {@link #setOptions(String)}.
     *
     * @param <T> the token type produced by the tokenizers
     */
    public static class PTBTokenizerFactory<T extends HasWord>
    implements TokenizerFactory<T> {
        private static final long serialVersionUID = -8859638719818931606L;

        /** Token factory handed to every tokenizer this factory creates. */
        protected final LexedTokenFactory<T> factory;

        /** Base option string handed to every tokenizer this factory creates. */
        protected String options;

        /**
         * @return a factory for {@link Word} tokenizers with an empty option string
         */
        public static TokenizerFactory<Word> newTokenizerFactory() {
            return PTBTokenizerFactory.newPTBTokenizerFactory(new WordTokenFactory(), "");
        }

        /**
         * @param options the lexer option string
         * @return a factory for {@link Word} tokenizers with the given options
         */
        public static PTBTokenizerFactory<Word> newWordTokenizerFactory(String options) {
            return new PTBTokenizerFactory<Word>(new WordTokenFactory(), options);
        }

        /**
         * @param options the lexer option string
         * @return a factory for {@link CoreLabel} tokenizers with the given options
         */
        public static PTBTokenizerFactory<CoreLabel> newCoreLabelTokenizerFactory(String options) {
            return new PTBTokenizerFactory<CoreLabel>(new CoreLabelTokenFactory(), options);
        }

        /**
         * @param tokenFactory the token factory the tokenizers will use
         * @param options the lexer option string
         * @param <T> the token type
         * @return a factory for tokenizers with the given token factory and options
         */
        public static <T extends HasWord> PTBTokenizerFactory<T> newPTBTokenizerFactory(LexedTokenFactory<T> tokenFactory, String options) {
            return new PTBTokenizerFactory<T>(tokenFactory, options);
        }

        /**
         * @param tokenizeNLs whether the {@code tokenizeNLs} option is set
         * @param invertible whether the {@code invertible} option is set
         * @return a factory for {@link CoreLabel} tokenizers configured by the two flags
         */
        public static PTBTokenizerFactory<CoreLabel> newPTBTokenizerFactory(boolean tokenizeNLs, boolean invertible) {
            return new PTBTokenizerFactory<CoreLabel>(tokenizeNLs, invertible, false, new CoreLabelTokenFactory());
        }

        /** Flag-based constructor; the flags are translated into an option string. */
        private PTBTokenizerFactory(boolean tokenizeNLs, boolean invertible, boolean suppressEscaping, LexedTokenFactory<T> factory) {
            this.factory = factory;
            this.options = PTBTokenizer.booleanOptionString(tokenizeNLs, invertible, suppressEscaping);
        }

        /** Stores the token factory and option string as given. */
        private PTBTokenizerFactory(LexedTokenFactory<T> tokenFactory, String options) {
            this.factory = tokenFactory;
            this.options = options;
        }

        /**
         * @param r the text source
         * @return the same object as {@link #getTokenizer(Reader)}, viewed as an iterator
         */
        @Override
        public Iterator<T> getIterator(Reader r) {
            return this.getTokenizer(r);
        }

        /**
         * @param r the text source
         * @return a new tokenizer using this factory's token factory and current options
         */
        @Override
        public Tokenizer<T> getTokenizer(Reader r) {
            return new PTBTokenizer<T>(r, this.factory, this.options);
        }

        /**
         * Creates a tokenizer whose options are this factory's options followed by
         * {@code extraOptions}, separated by a comma. When either side is null or empty, only
         * the other one is used, so no stray comma or literal {@code "null"} ends up in the
         * option string.
         *
         * @param r the text source
         * @param extraOptions additional options for this tokenizer only; may be null or empty
         * @return a new tokenizer
         */
        @Override
        public Tokenizer<T> getTokenizer(Reader r, String extraOptions) {
            if (this.options == null || this.options.isEmpty()) {
                return new PTBTokenizer<T>(r, this.factory, extraOptions);
            }
            if (extraOptions == null || extraOptions.isEmpty()) {
                return new PTBTokenizer<T>(r, this.factory, this.options);
            }
            return new PTBTokenizer<T>(r, this.factory, this.options + ',' + extraOptions);
        }

        /**
         * Replaces the base option string used for tokenizers created from now on.
         *
         * @param options the new option string
         */
        @Override
        public void setOptions(String options) {
            this.options = options;
        }
    }
}

