public class ConstantsAndVariables extends Object implements Serializable
| Modifier and Type | Class and Description |
|---|---|
static class |
ConstantsAndVariables.DataSentsIterator |
static class |
ConstantsAndVariables.PatternForEachTokenWay |
static class |
ConstantsAndVariables.PatternIndexWay |
static class |
ConstantsAndVariables.ScorePhraseMeasures |
| Modifier and Type | Field and Description |
|---|---|
boolean |
addIndvWordsFromPhrasesExceptLastAsNeg
For example, if positive seed dict contains "cancer" and "breast cancer" then "breast" is included as negative
|
Map<String,Set<String>> |
allowedNERsforLabels |
Map<String,Set<String>> |
allowedTagsInitials |
String |
allPatternsDir
Cached file of all patterns for all tokens
|
static String |
backgroundSymbol |
boolean |
batchProcessSents
Use this option if you are limited by memory; ignored if fileFormat is ser.
|
boolean |
clubNeighboringLabeledWords |
String |
commonWordsPatternFiles
Words to be ignored when learning phrases if
removePhrasesWithStopWords or
removeStopWordsFromSelectedPhrases is true. |
boolean |
computeAllPatterns
If all patterns should be computed.
|
int |
debug
Debug flag for learning patterns.
|
Map<String,Counter<CandidatePhrase>> |
dictOddsWeights |
Map<String,Counter<Integer>> |
distSimWeights |
boolean |
doNotApplyPatterns |
boolean |
doNotExtractPhraseAnyWordLabeledOtherClass
Especially useful for multi word phrase extraction.
|
String |
englishWordsFiles
English words that are not labeled when labeling using seed dictionaries
|
Map<String,Env> |
env
Environment for
TokenSequencePattern |
boolean |
evaluate |
boolean |
expandNegativesWhenSampling |
int |
expandPhrasesNumTopSimilar |
boolean |
expandPositivesWhenSampling |
String |
externalFeatureWeightsDir |
static String |
extremedebug |
int |
featureCountThreshold |
List<String> |
functionWords |
boolean |
fuzzyMatch
Whether to do a fuzzy matching when matching seeds to text.
|
static Env |
globalEnv |
String |
goldEntitiesEvalFiles |
String |
identifier
Save this run as ...
|
SentenceIndex |
invertedIndex |
Class<? extends SentenceIndex> |
invertedIndexClass |
String |
invertedIndexDirectory
Where the inverted index (either in memory or lucene) is stored
|
boolean |
justify |
boolean |
loadInvertedIndex
You can load the inverted index using this file.
|
double |
LRSigma
Sigma for L2 regularization in Logistic regression, if a classifier is
used to score phrases
|
static boolean |
matchLowerCaseContext
Lowercase the context words/lemmas
|
int |
maxExtractNumWords
Maximum number of words to learn
|
static String |
minimaldebug |
int |
minLen4FuzzyForPattern
Minimum length of words that can be matched fuzzily
|
int |
minPosPhraseSupportForPat
Remove patterns that have number of positive words less than this.
|
int |
minUnlabPhraseSupportForPat
Remove patterns whose number of unlabeled words is less than this.
|
Integer |
numIterationsForPatterns
Maximum number of iterations to run
|
int |
numPatterns
Maximum number of patterns learned in each iteration
|
int |
numThreads
Number of threads
|
int |
numWordsToAdd
Number of words to learn in each iteration
|
String |
otherSemanticClassesFiles
List of dictionary phrases that are negative for all labels to be learned.
|
String |
outDir
The output directory where the justifications of learning patterns and
phrases would be saved.
|
GetPatternsFromDataMultiClass.PatternScoring |
patternScoring
Pattern Scoring mechanism.
|
PatternFactory.PatternType |
patternType |
double |
perSelectNeg
These are used to learn weights for features if using logistic regression.
|
double |
perSelectRand
These are used to learn weights for features if using logistic regression.
|
double |
positiveSimilarityThresholdLowPrecision |
boolean |
removeOverLappingLabelsFromSeed
Keeps only one label for each token, whichever has the longest
|
boolean |
removePhrasesWithStopWords |
boolean |
removeStopWordsFromSelectedPhrases |
boolean |
restrictToMatched
Currently, does not work correctly.
|
boolean |
saveInvertedIndex
You can save the inverted index.
|
double |
similarityThresholdHighPrecision |
boolean |
sqrtPatScore
If score for a pattern is square rooted
|
String |
stopWordsPatternFiles
Words that are not learned.
|
ConstantsAndVariables.PatternForEachTokenWay |
storePatsForEachToken |
boolean |
subsampleUnkAsNegUsingSim |
String |
targetAllowedNERs
Allowed NERs for labels.
|
String |
targetAllowedTagsInitialsStr
Initials of all POS tags to use if
usePOS4Pattern is true, separated by comma. |
double |
thresholdNumPatternsApplied |
double |
thresholdSelectPattern
Threshold for learning a pattern
|
double |
thresholdWordExtract |
boolean |
tuneThresholdKeepRunning
Reduce pattern threshold (=0.8*current_value) to extract as many patterns
as possible (still restricted by
numPatterns) |
boolean |
useMatchingPhrase
Use the actual dictionary matching phrase(s) instead of the token word or
lemma in calculating the stats
|
boolean |
useOtherLabelsWordsasNegative
use the seed dictionaries and the new words learned for the other labels in
the previous iterations as negative
|
boolean |
usePatternEvalDomainNgram
|
boolean |
usePatternEvalEditDistOther
|
boolean |
usePatternEvalEditDistSame
|
boolean |
usePatternEvalGoogleNgram
|
boolean |
usePatternEvalSemanticOdds
|
boolean |
usePatternEvalWordClass
|
boolean |
usePatternEvalWordShape
|
boolean |
usePatternResultAsLabel
Label words that are learned so that in further iterations we have more
information
|
boolean |
usePhraseEvalDomainNgram
use domain tf-idf for learning phrases
|
boolean |
usePhraseEvalEditDistOther
Edit distance between this phrase and other phrases in other dictionaries
|
boolean |
usePhraseEvalEditDistSame
Edit distance between this phrase and the other phrases in the label
dictionary
|
boolean |
usePhraseEvalGoogleNgram
use google tf-idf for learning phrases.
|
boolean |
usePhraseEvalPatWtByFreq
use \sum_allpat pattern_wt_that_extracted_phrase/phrase_freq for learning
phrases
|
boolean |
usePhraseEvalSemanticOdds
odds of the phrase freq in the label dictionary vs other dictionaries
|
boolean |
usePhraseEvalWordClass
Only works if you have single label.
|
boolean |
usePhraseEvalWordShape |
boolean |
usePhraseEvalWordVector
Only works if you have single label.
|
boolean |
useWordVectorsToComputeSim |
String |
wordIgnoreRegex
Do not learn phrases that match this regex.
|
edu.stanford.nlp.patterns.GetPatternsFromDataMultiClass.WordScoring |
wordScoring |
String |
wordVectorFile |
boolean |
writeMatchedTokensFiles |
boolean |
writeMatchedTokensIdsForEachPhrase |
| Constructor and Description |
|---|
ConstantsAndVariables(Properties props,
Map<String,Set<CandidatePhrase>> labelDictionary,
Map<String,Class<? extends TypesafeMap.Key<String>>> answerClass,
Map<String,Class> generalizeClasses,
Map<String,Map<Class,Object>> ignoreClasses) |
ConstantsAndVariables(Properties props,
Set<String> labels,
Map<String,Class<? extends TypesafeMap.Key<String>>> answerClass) |
ConstantsAndVariables(Properties props,
Set<String> labels,
Map<String,Class<? extends TypesafeMap.Key<String>>> answerClass,
Map<String,Class> generalizeClasses) |
ConstantsAndVariables(Properties props,
Set<String> labels,
Map<String,Class<? extends TypesafeMap.Key<String>>> answerClass,
Map<String,Class> generalizeClasses,
Map<String,Map<Class,Object>> ignoreClasses) |
ConstantsAndVariables(Properties props,
String label,
Class<? extends TypesafeMap.Key<String>> answerClass) |
@Execution.Option(name="numIterationsForPatterns") public Integer numIterationsForPatterns
@Execution.Option(name="numPatterns") public int numPatterns
@Execution.Option(name="outDir") public String outDir
@Execution.Option(name="allPatternsDir") public String allPatternsDir
@Execution.Option(name="computeAllPatterns") public boolean computeAllPatterns
@Execution.Option(name="patternScoring") public GetPatternsFromDataMultiClass.PatternScoring patternScoring
See GetPatternsFromDataMultiClass.PatternScoring for options.
@Execution.Option(name="thresholdSelectPattern") public double thresholdSelectPattern
@Execution.Option(name="restrictToMatched") public boolean restrictToMatched
@Execution.Option(name="usePatternResultAsLabel") public boolean usePatternResultAsLabel
@Execution.Option(name="debug") public int debug
@Execution.Option(name="identifier") public String identifier
@Execution.Option(name="useMatchingPhrase") public boolean useMatchingPhrase
@Execution.Option(name="tuneThresholdKeepRunning") public boolean tuneThresholdKeepRunning
numPatterns)
@Execution.Option(name="maxExtractNumWords") public int maxExtractNumWords
@Execution.Option(name="useOtherLabelsWordsasNegative") public boolean useOtherLabelsWordsasNegative
@Execution.Option(name="matchLowerCaseContext") public static boolean matchLowerCaseContext
@Execution.Option(name="targetAllowedTagsInitialsStr") public String targetAllowedTagsInitialsStr
usePOS4Pattern is true, separated by comma.
@Execution.Option(name="targetAllowedNERs") public String targetAllowedNERs
useTargetNERRestriction flag should be true.
@Execution.Option(name="numWordsToAdd") public int numWordsToAdd
@Execution.Option(name="thresholdNumPatternsApplied") public double thresholdNumPatternsApplied
@Execution.Option(name="wordScoring") public edu.stanford.nlp.patterns.GetPatternsFromDataMultiClass.WordScoring wordScoring
@Execution.Option(name="thresholdWordExtract") public double thresholdWordExtract
public boolean justify
@Execution.Option(name="LRSigma") public double LRSigma
@Execution.Option(name="englishWordsFiles") public String englishWordsFiles
@Execution.Option(name="commonWordsPatternFiles") public String commonWordsPatternFiles
removePhrasesWithStopWords or
removeStopWordsFromSelectedPhrases is true. Also, these words
are considered negative when scoring a pattern (similar to
othersemanticclasses).
@Execution.Option(name="otherSemanticClassesFiles") public String otherSemanticClassesFiles
@Execution.Option(name="minLen4FuzzyForPattern") public int minLen4FuzzyForPattern
@Execution.Option(name="wordIgnoreRegex") public String wordIgnoreRegex
@Execution.Option(name="numThreads") public int numThreads
@Execution.Option(name="stopWordsPatternFiles", gloss="stop words") public String stopWordsPatternFiles
CreatePatterns is true.
public Map<String,Env> env
TokenSequencePattern
public static Env globalEnv
@Execution.Option(name="removeStopWordsFromSelectedPhrases") public boolean removeStopWordsFromSelectedPhrases
@Execution.Option(name="removePhrasesWithStopWords") public boolean removePhrasesWithStopWords
@Execution.Option(name="externalFeatureWeightsFile") public String externalFeatureWeightsDir
@Execution.Option(name="doNotApplyPatterns") public boolean doNotApplyPatterns
@Execution.Option(name="sqrtPatScore") public boolean sqrtPatScore
@Execution.Option(name="minUnlabPhraseSupportForPat") public int minUnlabPhraseSupportForPat
@Execution.Option(name="minPosPhraseSupportForPat") public int minPosPhraseSupportForPat
@Execution.Option(name="addIndvWordsFromPhrasesExceptLastAsNeg") public boolean addIndvWordsFromPhrasesExceptLastAsNeg
public Map<String,Counter<CandidatePhrase>> dictOddsWeights
@Execution.Option(name="invertedIndexClass", gloss="another option is Lucene backed, which is not included in the CoreNLP release. Contact us to get a copy (distributed under Apache License).") public Class<? extends SentenceIndex> invertedIndexClass
@Execution.Option(name="invertedIndexDirectory") public String invertedIndexDirectory
@Execution.Option(name="clubNeighboringLabeledWords") public boolean clubNeighboringLabeledWords
@Execution.Option(name="patternType", required=true) public PatternFactory.PatternType patternType
@Execution.Option(name="subsampleUnkAsNegUsingSim", gloss="When learning a classifier, remove phrases from unknown phrases that are too close to the positive phrases") public boolean subsampleUnkAsNegUsingSim
@Execution.Option(name="expandPositivesWhenSampling", gloss="when sampling for learning feature wts for learning phrases, expand the positives") public boolean expandPositivesWhenSampling
@Execution.Option(name="expandNegativesWhenSampling", gloss="when sampling for learning feature wts for learning phrases, expand the negatives") public boolean expandNegativesWhenSampling
@Execution.Option(name="similarityThresholdHighPrecision", gloss="used for expanding positives") public double similarityThresholdHighPrecision
@Execution.Option(name="positiveSimilarityThresholdLowPrecision", gloss="used for not choosing close unknowns as positives") public double positiveSimilarityThresholdLowPrecision
@Execution.Option(name="wordVectorFile", gloss="if using word vectors for computing similarities") public String wordVectorFile
@Execution.Option(name="useWordVectorsToComputeSim", gloss="use vectors directly instead of word classes for computing similarity") public boolean useWordVectorsToComputeSim
@Execution.Option(name="goldEntitiesEvalFiles", gloss="label1,gold_list_of_entities_file;label2,...") public String goldEntitiesEvalFiles
@Execution.Option(name="evaluate") public boolean evaluate
@Execution.Option(name="featureCountThreshold") public int featureCountThreshold
@Execution.Option(name="expandPhrasesNumTopSimilar", gloss="k in kNN") public int expandPhrasesNumTopSimilar
@Execution.Option(name="fuzzyMatch") public boolean fuzzyMatch
@Execution.Option(name="removeOverLappingLabelsFromSeed") public boolean removeOverLappingLabelsFromSeed
@Execution.Option(name="usePhraseEvalWordClass") public boolean usePhraseEvalWordClass
@Execution.Option(name="usePhraseEvalWordVector") public boolean usePhraseEvalWordVector
@Execution.Option(name="usePhraseEvalGoogleNgram") public boolean usePhraseEvalGoogleNgram
@Execution.Option(name="usePhraseEvalDomainNgram") public boolean usePhraseEvalDomainNgram
@Execution.Option(name="usePhraseEvalPatWtByFreq") public boolean usePhraseEvalPatWtByFreq
@Execution.Option(name="usePhraseEvalSemanticOdds") public boolean usePhraseEvalSemanticOdds
@Execution.Option(name="usePhraseEvalEditDistSame") public boolean usePhraseEvalEditDistSame
@Execution.Option(name="usePhraseEvalEditDistOther") public boolean usePhraseEvalEditDistOther
@Execution.Option(name="usePhraseEvalWordShape") public boolean usePhraseEvalWordShape
@Execution.Option(name="usePatternEvalWordClass") public boolean usePatternEvalWordClass
@Execution.Option(name="usePatternEvalWordShape") public boolean usePatternEvalWordShape
@Execution.Option(name="usePatternEvalGoogleNgram") public boolean usePatternEvalGoogleNgram
@Execution.Option(name="usePatternEvalDomainNgram") public boolean usePatternEvalDomainNgram
patternScoring is PhEvalInPat or
PhEvalInPat. See usePhrase* for meanings. Need to also provide googleNgram_dbname,
googleNgram_username and googleNgram_host.
@Execution.Option(name="usePatternEvalSemanticOdds") public boolean usePatternEvalSemanticOdds
@Execution.Option(name="usePatternEvalEditDistSame") public boolean usePatternEvalEditDistSame
@Execution.Option(name="usePatternEvalEditDistOther") public boolean usePatternEvalEditDistOther
@Execution.Option(name="perSelectRand") public double perSelectRand
@Execution.Option(name="perSelectNeg") public double perSelectNeg
@Execution.Option(name="doNotExtractPhraseAnyWordLabeledOtherClass") public boolean doNotExtractPhraseAnyWordLabeledOtherClass
@Execution.Option(name="saveInvertedIndex") public boolean saveInvertedIndex
invertedIndexDirectory if given.
@Execution.Option(name="loadInvertedIndex") public boolean loadInvertedIndex
@Execution.Option(name="storePatsForEachToken", gloss="used for storing patterns in PSQL/MEMORY/LUCENE") public ConstantsAndVariables.PatternForEachTokenWay storePatsForEachToken
public static String backgroundSymbol
public SentenceIndex invertedIndex
public static String extremedebug
public static String minimaldebug
@Execution.Option(name="batchProcessSents") public boolean batchProcessSents
@Execution.Option(name="writeMatchedTokensFiles") public boolean writeMatchedTokensFiles
@Execution.Option(name="writeMatchedTokensIdsForEachPhrase") public boolean writeMatchedTokensIdsForEachPhrase
public ConstantsAndVariables(Properties props, Set<String> labels, Map<String,Class<? extends TypesafeMap.Key<String>>> answerClass, Map<String,Class> generalizeClasses, Map<String,Map<Class,Object>> ignoreClasses) throws IOException
IOException
public ConstantsAndVariables(Properties props, Map<String,Set<CandidatePhrase>> labelDictionary, Map<String,Class<? extends TypesafeMap.Key<String>>> answerClass, Map<String,Class> generalizeClasses, Map<String,Map<Class,Object>> ignoreClasses) throws IOException
IOException
public ConstantsAndVariables(Properties props, Set<String> labels, Map<String,Class<? extends TypesafeMap.Key<String>>> answerClass) throws IOException
IOException
public ConstantsAndVariables(Properties props, String label, Class<? extends TypesafeMap.Key<String>> answerClass) throws IOException
IOException
public ConstantsAndVariables(Properties props, Set<String> labels, Map<String,Class<? extends TypesafeMap.Key<String>>> answerClass, Map<String,Class> generalizeClasses) throws IOException
IOException
public void addLearnedWords(String trainLabel, Counter<CandidatePhrase> identifiedWords)
public boolean hasSeedWordOrOtherSem(CandidatePhrase p)
public void setUp(Properties props) throws IOException
IOException
public void setWordShapesForLabels(ConcurrentHashMap<String,Counter<String>> wordShapesForLabels)
public static Set<CandidatePhrase> getStopWords()
public void addWordShapes(String label, Set<CandidatePhrase> words)
public void setSeedLabelDictionary(Map<String,Set<CandidatePhrase>> seedSets)
public Map<String,Set<CandidatePhrase>> getSeedLabelDictionary()
public void addSeedLabelDictionary(String label, Set<CandidatePhrase> words)
public Counter<CandidatePhrase> getLearnedWords(String label)
public Map<String,Counter<CandidatePhrase>> getLearnedWords()
public String getLearnedWordsAsJson()
public void setLearnedWords(Counter<CandidatePhrase> words, String label)
public Set<CandidatePhrase> getOtherSemanticClassesWords()
public void setOtherSemanticClassesWords(Set<CandidatePhrase> other)
public Pair<String,Double> getEditDistanceFromThisClass(String label, String ph, int minLen)
public Pair<String,Double> getEditDistanceFromOtherClasses(String label, String ph, int minLen)
public ConcurrentHashMap<String,Double> getEditDistanceFromEnglishWords()
public ConcurrentHashMap<String,String> getEditDistanceFromEnglishWordsMatches()
public double getEditDistanceScoresOtherClass(String label, String g)
public double getEditDistanceScoresOtherClassThreshold(String label, String g)
g -
public double getEditDistanceScoresThisClassThreshold(String label, String g)
public static CandidatePhrase containsFuzzy(Set<CandidatePhrase> words, CandidatePhrase w, int minLen4Fuzzy)
public void setGeneralWordClassClusters(Map<String,Integer> generalWordClassClusters)
public Map<String,Class<? extends TypesafeMap.Key<String>>> getAnswerClass()
public Map<String,Map<Class,Object>> getIgnoreWordswithClassesDuringSelection()
public void addSeedWords(String label, Collection<CandidatePhrase> seeds) throws Exception
Exception