/*
 * Decompiled with CFR 0.152.
 */
package edu.stanford.nlp.international.french.pipeline;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.SentenceUtils;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.stats.TwoDimensionalCounter;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeReader;
import edu.stanford.nlp.trees.international.french.FrenchTreeReaderFactory;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;

/**
 * Command-line utility that gathers statistics about multi-word expression
 * (MWE) nodes in a treebank file and then rewrites the file with placeholder
 * labels resolved.
 *
 * <p>The file is read with a {@link FrenchTreeReaderFactory}. Nodes whose label
 * matches {@code /^MW/} are counted, four tab-separated count tables are written
 * to the working directory, and finally every {@code DUMMY} pre-terminal label
 * and {@code DUMMYP} phrasal label is replaced (when a replacement can be found)
 * and the trees are written to {@code <input>.fixed}.
 *
 * <p>The class keeps its running totals in static fields and is therefore not
 * designed to be used from several threads at once.
 */
public final class MWEPreprocessor {
    private static final Redwood.RedwoodChannels log = Redwood.channels(MWEPreprocessor.class);

    /** Whether {@link #main(String[])} performs the DUMMY/DUMMYP resolution pass. */
    private static final boolean RESOLVE_DUMMY_TAGS = true;

    /** Character encoding used for every file this class reads or writes. */
    private static final String ENCODING = "UTF-8";

    /** Placeholder label of a pre-terminal whose POS tag is missing. */
    private static final String DUMMY_TAG = "DUMMY";

    /** Placeholder label of a phrasal node whose category is missing. */
    private static final String DUMMY_PHRASAL = "DUMMYP";

    /** Number of {@code DUMMY} pre-terminals seen by {@link #traverseAndFix}. */
    private static int nMissingPOS = 0;

    /** Number of {@code DUMMYP} nodes seen by {@link #traverseAndFix}. */
    private static int nMissingPhrasal = 0;

    /** Matches every node whose label starts with {@code MW}. */
    static final TregexPattern pMWE = TregexPattern.compile("/^MW/");

    private MWEPreprocessor() {
    }

    /**
     * Writes every (first key, second key, count) triple of {@code cnt} to the
     * file {@code fname}, one tab-separated triple per line, encoded as UTF-8.
     * Counts are truncated to {@code int}.
     *
     * <p>If the file cannot be opened the stack trace is printed and the method
     * returns without writing anything.
     *
     * @param cnt   the counter to dump
     * @param fname path of the output file
     */
    public static void printCounter(TwoDimensionalCounter<String, String> cnt, String fname) {
        // PrintWriter(File, charset) does the char->byte encoding itself. Wrapping a
        // PrintStream instead would encode with the platform default charset, because
        // PrintWriter(OutputStream) ignores the charset of the stream it wraps.
        try (PrintWriter pw = new PrintWriter(new File(fname), ENCODING)) {
            for (String key : cnt.firstKeySet()) {
                for (String val : cnt.getCounter(key).keySet()) {
                    pw.printf("%s\t%s\t%d%n", key, val, (int) cnt.getCount(key, val));
                }
            }
        } catch (UnsupportedEncodingException | FileNotFoundException e) {
            e.printStackTrace();
        }
    }

    /**
     * Adds one (word, tag) observation to {@code tagger} for every tagged leaf
     * of {@code t}, skipping leaves whose tag is the {@code DUMMY} placeholder.
     *
     * @param tagger word-to-tag counts, updated in place
     * @param t      the tree whose tagged yield is counted
     */
    public static void updateTagger(TwoDimensionalCounter<String, String> tagger, Tree t) {
        List<CoreLabel> taggedWords = t.taggedLabeledYield();
        for (CoreLabel cl : taggedWords) {
            if (cl.tag().equals(DUMMY_TAG)) {
                continue;
            }
            tagger.incrementCount(cl.word(), cl.tag());
        }
    }

    /**
     * Recursively replaces placeholder labels in {@code t}, in place.
     *
     * <ul>
     *   <li>A {@code DUMMY} pre-terminal receives the most frequent tag recorded
     *       for its word in {@code unigramTagger}; if the word is not a first key
     *       of that counter, the tag comes from the manual unknown-word model.</li>
     *   <li>A {@code DUMMYP} node receives the most frequent label recorded in
     *       {@code pretermLabel} for the space-joined labels of its children. If
     *       that sequence is unknown the label is left as is and a message is
     *       printed to standard output.</li>
     * </ul>
     *
     * Children are fixed before their parent, so the label sequence used for a
     * {@code DUMMYP} node already reflects any replacements made below it.
     *
     * @param t             the (sub)tree to fix
     * @param pretermLabel  counts of MWE label per child-label sequence
     * @param unigramTagger counts of POS tag per word
     */
    public static void traverseAndFix(Tree t, TwoDimensionalCounter<String, String> pretermLabel, TwoDimensionalCounter<String, String> unigramTagger) {
        if (t.isPreTerminal()) {
            if (t.value().equals(DUMMY_TAG)) {
                ++nMissingPOS;
                String word = t.firstChild().value();
                String tag = unigramTagger.firstKeySet().contains(word)
                        ? Counters.argmax(unigramTagger.getCounter(word))
                        : ManualUWModel.getTag(word);
                t.setValue(tag);
            }
            return;
        }

        for (Tree kid : t.children()) {
            traverseAndFix(kid, pretermLabel, unigramTagger);
        }

        if (!t.value().equals(DUMMY_PHRASAL)) {
            return;
        }
        ++nMissingPhrasal;
        StringBuilder sb = new StringBuilder();
        for (Tree kid : t.children()) {
            sb.append(kid.value()).append(" ");
        }
        String posSequence = sb.toString().trim();
        if (pretermLabel.firstKeySet().contains(posSequence)) {
            t.setValue(Counters.argmax(pretermLabel.getCounter(posSequence)));
        } else {
            System.out.println("No phrasal cat for: " + posSequence);
        }
    }

    /**
     * Opens {@code treeFile} as UTF-8 and wraps it in a French tree reader.
     * The caller owns the returned reader and must close it.
     */
    private static TreeReader openTreeReader(File treeFile) throws IOException {
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(treeFile), ENCODING));
        return new FrenchTreeReaderFactory().newTreeReader(br);
    }

    /**
     * Re-reads {@code treeFile}, applies {@link #traverseAndFix} to every tree
     * and writes the results, one tree per line and encoded as UTF-8, to a
     * sibling file whose name is the input path followed by {@code .fixed}.
     *
     * <p>On an I/O failure the stack trace is printed and the tree count is not
     * reported. Both files are closed whether or not processing succeeds.
     */
    private static void resolveDummyTags(File treeFile, TwoDimensionalCounter<String, String> pretermLabel, TwoDimensionalCounter<String, String> unigramTagger) {
        int nTrees = 0;
        // The input is opened first so that a missing input never creates an output file.
        try (TreeReader tr = openTreeReader(treeFile);
             PrintWriter pw = new PrintWriter(new File(treeFile + ".fixed"), ENCODING)) {
            Tree t;
            while ((t = tr.readTree()) != null) {
                traverseAndFix(t, pretermLabel, unigramTagger);
                pw.println(t.toString());
                ++nTrees;
            }
        } catch (IOException e) {
            e.printStackTrace();
            return;
        }
        System.out.println("Processed " + nTrees + " trees");
    }

    /**
     * Updates all counters with the observations found in a single tree.
     *
     * <p>The word/tag counter is updated from the whole tree. Then, for every
     * node matching {@link #pMWE} (other than one labelled {@code DUMMYP}), the
     * node label is paired with its pre-terminal yield and with its terminal
     * yield, and each pairing is recorded in both directions.
     *
     * @param t             the tree to inspect
     * @param unigramTagger word to POS tag counts
     * @param labelPreterm  MWE label to pre-terminal sequence counts
     * @param pretermLabel  pre-terminal sequence to MWE label counts
     * @param labelTerm     MWE label to terminal sequence counts
     * @param termLabel     terminal sequence to MWE label counts
     */
    public static void countMWEStatistics(Tree t, TwoDimensionalCounter<String, String> unigramTagger, TwoDimensionalCounter<String, String> labelPreterm, TwoDimensionalCounter<String, String> pretermLabel, TwoDimensionalCounter<String, String> labelTerm, TwoDimensionalCounter<String, String> termLabel) {
        updateTagger(unigramTagger, t);

        TregexMatcher m = pMWE.matcher(t);
        while (m.findNextMatchingNode()) {
            Tree match = m.getMatch();
            String label = match.value();
            if (label.equals(DUMMY_PHRASAL)) {
                continue;
            }
            String preterm = SentenceUtils.listToString(match.preTerminalYield());
            String term = SentenceUtils.listToString(match.yield());

            labelPreterm.incrementCount(label, preterm);
            pretermLabel.incrementCount(preterm, label);
            labelTerm.incrementCount(label, term);
            termLabel.incrementCount(term, label);
        }
    }

    /**
     * Entry point. Expects exactly one argument, the path of the treebank file;
     * otherwise prints a usage line and exits with status -1.
     *
     * <p>Writes {@code label_term.csv}, {@code term_label.csv},
     * {@code label_pos.csv} and {@code pos_label.csv} to the working directory,
     * then runs the DUMMY resolution pass and prints summary counts. If the
     * input cannot be read, the stack trace is printed and nothing is generated.
     *
     * @param args command-line arguments
     */
    public static void main(String[] args) {
        if (args.length != 1) {
            System.err.printf("Usage: java %s file%n", MWEPreprocessor.class.getName());
            System.exit(-1);
        }
        File treeFile = new File(args[0]);

        TwoDimensionalCounter<String, String> labelTerm = new TwoDimensionalCounter<String, String>();
        TwoDimensionalCounter<String, String> termLabel = new TwoDimensionalCounter<String, String>();
        TwoDimensionalCounter<String, String> labelPreterm = new TwoDimensionalCounter<String, String>();
        TwoDimensionalCounter<String, String> pretermLabel = new TwoDimensionalCounter<String, String>();
        TwoDimensionalCounter<String, String> unigramTagger = new TwoDimensionalCounter<String, String>();

        // First pass: collect statistics. The reader is closed even when reading fails.
        try (TreeReader tr = openTreeReader(treeFile)) {
            Tree t;
            while ((t = tr.readTree()) != null) {
                countMWEStatistics(t, unigramTagger, labelPreterm, pretermLabel, labelTerm, termLabel);
            }
        } catch (IOException e) {
            e.printStackTrace();
            return;
        }

        System.out.println("Generating {MWE Type -> Terminal}");
        printCounter(labelTerm, "label_term.csv");

        System.out.println("Generating {Terminal -> MWE Type}");
        printCounter(termLabel, "term_label.csv");

        System.out.println("Generating {MWE Type -> POS sequence}");
        printCounter(labelPreterm, "label_pos.csv");

        System.out.println("Generating {POS sequence -> MWE Type}");
        printCounter(pretermLabel, "pos_label.csv");

        if (RESOLVE_DUMMY_TAGS) {
            // Second pass: rewrite the treebank with placeholders resolved.
            System.out.println("Resolving DUMMY tags");
            resolveDummyTags(treeFile, pretermLabel, unigramTagger);
        }

        System.out.println("#Unknown Word Types: " + ManualUWModel.nUnknownWordTypes);
        System.out.println("#Missing POS: " + nMissingPOS);
        System.out.println("#Missing Phrasal: " + nMissingPhrasal);
        System.out.println("Done!");
    }

    /**
     * Hand-built fallback tagger used for words that have no entry in the
     * unigram tagger. It knows three fixed word lists (nouns, adjectives,
     * prepositions) and otherwise answers {@code "N"}.
     */
    private static class ManualUWModel {
        private static final Set<String> nouns = Generics.newHashSet();
        private static final String nStr = "A. Alezais alfa Annick Appliances Ardenne Artois baptiste Bargue Bellanger Bregenz clefs Coeurs ...conomie consumer contr\u00f4leur Coop\u00e9rative Copp\u00e9e cuisson d\u00e9doublement demandeuse d\u00e9fraie Domestic d\u00e9pistage Elektra Elettrodomestici Essonnes Fair Finparcom Gelisim gorge Happy Indesit Italia jockey Lawrence leone Levi machinisme Mc.Donnel MD Merloni Meydan m\u00e9nagers Muenchener Parcel Prost R. sam Sara Si\u00e8ge silos SPA Stateman Valley Vanity VF Vidal Vives Yorker Young Zemment";
        private static final Set<String> adjectives = Generics.newHashSet();
        private static final String aStr = "astral bis bovin gracieux int\u00e9grante italiano sanguin s\u00e8che";
        private static final Set<String> preps = Generics.newHashSet();
        private static final String pStr = "c o t";

        /** Total number of distinct entries across the three manual word lists. */
        private static final int nUnknownWordTypes;

        /** Finds a run of one or more digits anywhere in a word. */
        private static final Pattern digit = Pattern.compile("\\d+");

        static {
            nouns.addAll(Arrays.asList(nStr.split("\\s+")));
            adjectives.addAll(Arrays.asList(aStr.split("\\s+")));
            preps.addAll(Arrays.asList(pStr.split("\\s+")));
            nUnknownWordTypes = nouns.size() + adjectives.size() + preps.size();
        }

        private ManualUWModel() {
        }

        /**
         * Returns a POS tag for {@code word}.
         *
         * <p>Checks are made in this order: a word containing a digit is
         * {@code "N"}; a listed noun is {@code "N"}; a listed adjective is
         * {@code "A"}; a listed preposition is {@code "P"}. Any other word is
         * logged and tagged {@code "N"}.
         *
         * @param word the surface form to tag
         * @return one of {@code "N"}, {@code "A"} or {@code "P"}
         */
        public static String getTag(String word) {
            if (digit.matcher(word).find() || nouns.contains(word)) {
                return "N";
            }
            if (adjectives.contains(word)) {
                return "A";
            }
            if (preps.contains(word)) {
                return "P";
            }
            log.info("No POS tag for " + word);
            return "N";
        }
    }
}

