/*
 * Decompiled with CFR 0.152.
 */
package edu.stanford.nlp.process;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasTag;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.objectbank.XMLBeginEndIterator;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.process.WhitespaceTokenizer;
import edu.stanford.nlp.util.Function;
import edu.stanford.nlp.util.Generics;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringReader;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.regex.Pattern;

/**
 * Reads a document from a {@link Reader} (or a path) and exposes it as a
 * sequence of sentences, each sentence being a {@code List<HasWord>}.
 *
 * <p>Two document types are handled: {@link DocType#Plain}, where the whole
 * input is tokenized and split into sentences, and {@link DocType#XML}, where
 * only the text of the elements selected by the element delimiter is
 * tokenized and split.
 *
 * <p>The underlying reader is consumed by iteration and is closed once the
 * input is exhausted, so an instance is effectively single-use.
 */
public class DocumentPreprocessor implements Iterable<List<HasWord>> {
    /** Tokens that end a sentence when no explicit sentence delimiter is set. */
    public static final String[] DEFAULT_SENTENCE_DELIMS = new String[]{".", "?", "!"};

    private Reader inputReader;
    private final DocType docType;
    /** Tokenizer factory; {@code null} selects whitespace tokenization. */
    private TokenizerFactory<? extends HasWord> tokenizerFactory = PTBTokenizer.coreLabelFactory();
    private String[] sentenceFinalPuncWords = DEFAULT_SENTENCE_DELIMS;
    /** Optional post-processing applied to every sentence before it is returned. */
    private Function<List<HasWord>, List<HasWord>> escaper = null;
    /** When non-null, this single token replaces the sentence-final punctuation set. */
    private String sentenceDelimiter = null;
    /** When non-null, tokens are split into word and tag at the last occurrence of this delimiter. */
    private String tagDelimiter = null;
    /** Passed to {@link XMLBeginEndIterator} to select the XML elements to read. */
    private String elementDelimiter = ".*";
    private static final Pattern wsPattern = Pattern.compile("\\s+");
    /** Tokens that may trail a sentence-final token and still belong to the same sentence. */
    private final String[] sentenceFinalFollowers = new String[]{")", "]", "\"", "'", "''", "-RRB-", "-RSB-", "-RCB-"};

    /**
     * Creates a preprocessor for plain text read from {@code input}.
     *
     * @param input the reader to consume; must not be {@code null}
     * @throws IllegalArgumentException if {@code input} is {@code null}
     */
    public DocumentPreprocessor(Reader input) {
        this(input, DocType.Plain);
    }

    /**
     * Creates a preprocessor for a document of type {@code t} read from {@code input}.
     *
     * @param input the reader to consume; must not be {@code null}
     * @param t the document type
     * @throws IllegalArgumentException if {@code input} is {@code null}
     */
    public DocumentPreprocessor(Reader input, DocType t) {
        if (input == null) {
            throw new IllegalArgumentException("Cannot read from null object!");
        }
        this.docType = t;
        this.inputReader = input;
    }

    /**
     * Creates a preprocessor for the plain-text document at {@code docPath},
     * read as UTF-8.
     *
     * @param docPath location of the document; must not be {@code null}
     */
    public DocumentPreprocessor(String docPath) {
        this(docPath, DocType.Plain, "UTF-8");
    }

    /**
     * Creates a preprocessor for the document of type {@code t} at
     * {@code docPath}, read as UTF-8.
     *
     * @param docPath location of the document; must not be {@code null}
     * @param t the document type
     */
    public DocumentPreprocessor(String docPath, DocType t) {
        this(docPath, t, "UTF-8");
    }

    /**
     * Creates a preprocessor for the document at {@code docPath}, opened via
     * {@code IOUtils.readerFromString(docPath, encoding)}.
     *
     * @param docPath location of the document; must not be {@code null}
     * @param t the document type
     * @param encoding character encoding used to open the document
     * @throws IllegalArgumentException if {@code docPath} is {@code null}
     * @throws RuntimeIOException if the document cannot be opened
     */
    public DocumentPreprocessor(String docPath, DocType t, String encoding) {
        if (docPath == null) {
            throw new IllegalArgumentException("Cannot open null document path!");
        }
        this.docType = t;
        try {
            this.inputReader = IOUtils.readerFromString(docPath, encoding);
        }
        catch (IOException ioe) {
            System.err.printf("%s: Could not open path %s\n", this.getClass().getName(), docPath);
            throw new RuntimeIOException(ioe);
        }
    }

    /**
     * Sets the tokens that end a sentence. Only consulted when no explicit
     * sentence delimiter has been set; {@code null} means no token ends a sentence.
     *
     * @param sentenceFinalPuncWords the sentence-final tokens
     */
    public void setSentenceFinalPuncWords(String[] sentenceFinalPuncWords) {
        this.sentenceFinalPuncWords = sentenceFinalPuncWords;
    }

    /**
     * Sets the factory used to tokenize the input. Passing {@code null}
     * selects whitespace tokenization.
     *
     * @param newTokenizerFactory the tokenizer factory, or {@code null}
     */
    public void setTokenizerFactory(TokenizerFactory<? extends HasWord> newTokenizerFactory) {
        this.tokenizerFactory = newTokenizerFactory;
    }

    /**
     * Sets a function applied to every non-null sentence before it is returned
     * by the plain-text iterator.
     *
     * @param e the escaper, or {@code null} for none
     */
    public void setEscaper(Function<List<HasWord>, List<HasWord>> e) {
        this.escaper = e;
    }

    /**
     * Sets an explicit sentence delimiter token. When non-null it replaces the
     * sentence-final punctuation set, and no follower tokens are recognised.
     * If the delimiter consists only of whitespace, the token {@code "*NL*"}
     * is treated as a delimiter as well and a non-null tokenizer factory is
     * asked for a tokenizer with the {@code "tokenizeNLs"} option.
     *
     * @param s the delimiter token, or {@code null} to use sentence-final punctuation
     */
    public void setSentenceDelimiter(String s) {
        this.sentenceDelimiter = s;
    }

    /**
     * Sets the delimiter separating a word from its tag inside a token. The
     * value is interpolated into a regular expression without quoting, so
     * regex metacharacters keep their special meaning.
     *
     * @param s the tag delimiter, or {@code null} to disable tag splitting
     */
    public void setTagDelimiter(String s) {
        this.tagDelimiter = s;
    }

    /**
     * Sets the element delimiter handed to {@link XMLBeginEndIterator} for XML
     * documents (presumably a regular expression over element names; confirm
     * against that class).
     *
     * @param s the element delimiter
     */
    public void setElementDelimiter(String s) {
        this.elementDelimiter = s;
    }

    /**
     * Returns an iterator over the sentences of the document.
     *
     * @return a sentence iterator matching the document type
     * @throws IllegalStateException if the document type has no handler
     */
    @Override
    public Iterator<List<HasWord>> iterator() {
        if (this.docType == DocType.Plain) {
            return new PlainTextIterator();
        }
        if (this.docType == DocType.XML) {
            return new XMLIterator();
        }
        throw new IllegalStateException("Someone didn't add a handler for a new docType.");
    }

    /**
     * Command-line entry point: parses leading {@code -option} arguments, then
     * treats every remaining argument as a file to split into sentences,
     * printing one sentence per line to standard output.
     *
     * @param args options followed by one or more file names
     * @throws IOException if the output encoding is not supported
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 1) {
            System.err.println("usage: DocumentPreprocessor OPT* filename");
            System.err.println("    OPT = -xml|-encoding ENC|-tokenizerOptions opts|-tag delim|...");
            return;
        }
        String encoding = "utf-8";
        boolean printSentenceLengths = false;
        DocType docType = DocType.Plain;
        String xmlElementDelimiter = null;
        TokenizerFactory<CoreLabel> tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
        String sentenceDelimiter = null;
        String tagDelimiter = null;
        boolean printOriginalText = false;
        String[] sentenceDelims = null;

        // Options are read until the first argument that is empty or does not start with '-'.
        // Options taking a value are only recognised when the value is present; otherwise
        // they fall through to the "Unknown option" report instead of indexing past the array.
        int i;
        for (i = 0; i < args.length && args[i].length() != 0 && args[i].startsWith("-"); ++i) {
            if (args[i].equals("-xml") && i + 1 < args.length) {
                docType = DocType.XML;
                xmlElementDelimiter = args[++i];
                continue;
            }
            if (args[i].equals("-encoding") && i + 1 < args.length) {
                encoding = args[++i];
                continue;
            }
            if (args[i].equals("-printSentenceLengths")) {
                printSentenceLengths = true;
                continue;
            }
            if (args[i].equals("-suppressEscaping")) {
                tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), "ptb3Escaping=false");
                continue;
            }
            if (args[i].equals("-tokenizerOptions") && i + 1 < args.length) {
                tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), args[++i]);
                continue;
            }
            if (args[i].equals("-noTokenization")) {
                tf = null;
                sentenceDelimiter = System.getProperty("line.separator");
                continue;
            }
            if (args[i].equals("-whitespaceTokenization")) {
                tf = null;
                List<String> whitespaceDelims = new ArrayList<String>(Arrays.asList(DEFAULT_SENTENCE_DELIMS));
                whitespaceDelims.add("\n");
                sentenceDelims = whitespaceDelims.toArray(new String[whitespaceDelims.size()]);
                continue;
            }
            if (args[i].equals("-tag") && i + 1 < args.length) {
                tagDelimiter = args[++i];
                continue;
            }
            if (args[i].equals("-printOriginalText")) {
                printOriginalText = true;
                tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), "invertible=true");
                continue;
            }
            System.err.println("Unknown option: " + args[i]);
        }

        int numSents = 0;
        PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, encoding), true);
        while (i < args.length) {
            DocumentPreprocessor docPreprocessor = new DocumentPreprocessor(args[i], docType, encoding);
            if (docType == DocType.XML) {
                docPreprocessor.setElementDelimiter(xmlElementDelimiter);
            }
            docPreprocessor.setTokenizerFactory(tf);
            if (sentenceDelimiter != null) {
                docPreprocessor.setSentenceDelimiter(sentenceDelimiter);
            }
            if (tagDelimiter != null) {
                // Use the delimiter parsed from "-tag"; do not consume another file argument.
                docPreprocessor.setTagDelimiter(tagDelimiter);
            }
            if (sentenceDelims != null) {
                docPreprocessor.setSentenceFinalPuncWords(sentenceDelims);
            }
            for (List<HasWord> sentence : docPreprocessor) {
                ++numSents;
                if (printSentenceLengths) {
                    System.err.println("Length:\t" + sentence.size());
                }
                boolean printSpace = false;
                for (HasWord word : sentence) {
                    if (printOriginalText) {
                        CoreLabel cl = (CoreLabel)word;
                        // The text before a token is only printed for the first token of a sentence.
                        if (!printSpace) {
                            pw.print((String)cl.get(CoreAnnotations.BeforeAnnotation.class));
                            printSpace = true;
                        }
                        pw.print((String)cl.get(CoreAnnotations.OriginalTextAnnotation.class));
                        pw.print((String)cl.get(CoreAnnotations.AfterAnnotation.class));
                        continue;
                    }
                    if (printSpace) {
                        pw.print(" ");
                    }
                    printSpace = true;
                    pw.print(word.word());
                }
                pw.println();
            }
            ++i;
        }
        pw.close();
        System.err.println("Read in " + numSents + " sentences.");
    }

    /**
     * Sentence iterator for XML documents: pulls text blocks from an
     * {@link XMLBeginEndIterator} and splits each block into sentences with a
     * {@link PlainTextIterator}.
     */
    private class XMLIterator
    implements Iterator<List<HasWord>> {
        private final XMLBeginEndIterator<String> xmlItr;
        /** The document reader, kept because {@code inputReader} is re-pointed at each block. */
        private final Reader originalDocReader;
        private PlainTextIterator plainItr;
        private List<HasWord> nextSent;

        public XMLIterator() {
            this.xmlItr = new XMLBeginEndIterator<String>(DocumentPreprocessor.this.inputReader, DocumentPreprocessor.this.elementDelimiter);
            this.originalDocReader = DocumentPreprocessor.this.inputReader;
            this.primeNext();
        }

        /**
         * Advances to the next sentence, moving on to further XML blocks while
         * the current one yields nothing. Leaves {@code nextSent} null and
         * closes the document reader when no blocks remain.
         */
        private void primeNext() {
            do {
                if (this.plainItr != null && this.plainItr.hasNext()) {
                    this.nextSent = this.plainItr.next();
                    continue;
                }
                if (this.xmlItr.hasNext()) {
                    String block = this.xmlItr.next();
                    // PlainTextIterator reads from the enclosing instance's inputReader,
                    // so point it at the current block before constructing the iterator.
                    DocumentPreprocessor.this.inputReader = new BufferedReader(new StringReader(block));
                    this.plainItr = new PlainTextIterator();
                    if (this.plainItr.hasNext()) {
                        this.nextSent = this.plainItr.next();
                        continue;
                    }
                    this.nextSent = null;
                    continue;
                }
                IOUtils.closeIgnoringExceptions(this.originalDocReader);
                this.nextSent = null;
                break;
            } while (this.nextSent == null);
        }

        @Override
        public boolean hasNext() {
            return this.nextSent != null;
        }

        @Override
        public List<HasWord> next() {
            if (this.nextSent == null) {
                throw new NoSuchElementException();
            }
            List<HasWord> thisSentence = this.nextSent;
            this.primeNext();
            return thisSentence;
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }
    }

    /**
     * Sentence iterator over the enclosing instance's current
     * {@code inputReader}: tokenizes the input and groups the tokens into
     * sentences according to the configured delimiters.
     */
    private class PlainTextIterator
    implements Iterator<List<HasWord>> {
        private final Tokenizer<? extends HasWord> tokenizer;
        private final Set<String> sentDelims;
        private final Set<String> delimFollowers;
        /** Splits a token into {word, tag}; null when no tag delimiter is configured. */
        private Function<String, String[]> splitTag;
        private List<HasWord> nextSent = null;
        /** Holds the token that was read past the end of the previous sentence. */
        private final List<HasWord> nextSentCarryover = new ArrayList<HasWord>();

        public PlainTextIterator() {
            boolean eolIsSignificant = false;
            this.sentDelims = Generics.newHashSet();
            if (DocumentPreprocessor.this.sentenceDelimiter == null) {
                if (DocumentPreprocessor.this.sentenceFinalPuncWords != null) {
                    this.sentDelims.addAll(Arrays.asList(DocumentPreprocessor.this.sentenceFinalPuncWords));
                }
                this.delimFollowers = Generics.newHashSet(Arrays.asList(DocumentPreprocessor.this.sentenceFinalFollowers));
            } else {
                // An explicit delimiter replaces the punctuation set and disables followers.
                this.sentDelims.add(DocumentPreprocessor.this.sentenceDelimiter);
                this.delimFollowers = Generics.newHashSet();
                eolIsSignificant = wsPattern.matcher(DocumentPreprocessor.this.sentenceDelimiter).matches();
                if (eolIsSignificant) {
                    this.sentDelims.add("*NL*");
                }
            }
            if (DocumentPreprocessor.this.tokenizerFactory == null) {
                eolIsSignificant = this.sentDelims.contains("\n");
                this.tokenizer = WhitespaceTokenizer.newWordWhitespaceTokenizer(DocumentPreprocessor.this.inputReader, eolIsSignificant);
            } else {
                this.tokenizer = eolIsSignificant ? DocumentPreprocessor.this.tokenizerFactory.getTokenizer(DocumentPreprocessor.this.inputReader, "tokenizeNLs") : DocumentPreprocessor.this.tokenizerFactory.getTokenizer(DocumentPreprocessor.this.inputReader);
            }
            if (DocumentPreprocessor.this.tagDelimiter != null) {
                this.splitTag = new Function<String, String[]>(){
                    // Matches the delimiter only where no further delimiter follows,
                    // i.e. its last occurrence. The delimiter is used as a regex as-is.
                    private final String splitRegex = String.format("%s(?!.*%s)", DocumentPreprocessor.this.tagDelimiter, DocumentPreprocessor.this.tagDelimiter);

                    @Override
                    public String[] apply(String in) {
                        String[] splits = in.trim().split(this.splitRegex);
                        if (splits.length == 2) {
                            return splits;
                        }
                        // No clean word/tag split: hand back the input unchanged as a single element.
                        return new String[]{in};
                    }
                };
            }
        }

        /**
         * Reads tokens until one sentence has been assembled in
         * {@code nextSent}. When neither a sentence nor a carried-over token is
         * available, closes the reader, clears {@code inputReader} and leaves
         * {@code nextSent} null.
         */
        private void primeNext() {
            this.nextSent = new ArrayList<HasWord>(this.nextSentCarryover);
            this.nextSentCarryover.clear();
            boolean seenBoundary = false;
            while (this.tokenizer.hasNext()) {
                HasWord token = this.tokenizer.next();
                if (this.splitTag != null) {
                    String[] toks = this.splitTag.apply(token.word());
                    token.setWord(toks[0]);
                    if (token instanceof Label) {
                        ((Label)token).setValue(toks[0]);
                    }
                    if (toks.length == 2 && token instanceof HasTag) {
                        ((HasTag)token).setTag(toks[1]);
                    }
                }
                if (this.sentDelims.contains(token.word())) {
                    seenBoundary = true;
                } else if (seenBoundary && !this.delimFollowers.contains(token.word())) {
                    // First token after the boundary that is not a follower: it starts the next sentence.
                    this.nextSentCarryover.add(token);
                    break;
                }
                // Whitespace-only tokens and the "*NL*" marker are never part of a sentence.
                if (!wsPattern.matcher(token.word()).matches() && !token.word().equals("*NL*")) {
                    this.nextSent.add(token);
                }
                // With no follower tokens configured the boundary ends the sentence at once.
                if (!seenBoundary || this.delimFollowers.size() != 0) continue;
                if (this.nextSent.size() > 0) break;
                // Boundary seen before any content: ignore it rather than emit an empty sentence.
                seenBoundary = false;
            }
            if (this.nextSent.size() == 0 && this.nextSentCarryover.size() == 0) {
                IOUtils.closeIgnoringExceptions(DocumentPreprocessor.this.inputReader);
                DocumentPreprocessor.this.inputReader = null;
                this.nextSent = null;
            } else if (DocumentPreprocessor.this.escaper != null) {
                this.nextSent = DocumentPreprocessor.this.escaper.apply(this.nextSent);
            }
        }

        @Override
        public boolean hasNext() {
            if (this.nextSent == null) {
                this.primeNext();
            }
            return this.nextSent != null;
        }

        @Override
        public List<HasWord> next() {
            if (this.nextSent == null) {
                this.primeNext();
            }
            if (this.nextSent == null) {
                throw new NoSuchElementException();
            }
            List<HasWord> thisIteration = this.nextSent;
            this.nextSent = null;
            return thisIteration;
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }
    }

    /** The kinds of document this preprocessor can read. */
    public static enum DocType {
        Plain,
        XML;

    }
}

