edu.stanford.nlp.ling.tokensregex
Class PhraseTable

java.lang.Object
  extended by edu.stanford.nlp.ling.tokensregex.PhraseTable
All Implemented Interfaces:
java.io.Serializable

public class PhraseTable
extends java.lang.Object
implements java.io.Serializable

Table used to look up multi-word phrases. This class provides functions for efficiently looking up all instances of known phrases in a document. Phrases can be added to the phrase table using the addPhrase, addPhrases, and readPhrases methods.

Author:
Angel Chang
See Also:
Serialized Form

Nested Class Summary
static class PhraseTable.Phrase
          A phrase is a multiword expression
static class PhraseTable.PhraseMatch
          Represents a matched phrase
static class PhraseTable.PhraseStringCollection
           
static class PhraseTable.StringList
           
static class PhraseTable.TokenList
           
static interface PhraseTable.WordList
           
 
Field Summary
 boolean caseInsensitive
           
 boolean ignorePunctuation
           
 boolean ignorePunctuationTokens
           
 boolean normalize
           
static java.util.Comparator<PhraseTable.PhraseMatch> PHRASEMATCH_LENGTH_ENDPOINTS_COMPARATOR
           
 Annotator tokenizer
           
 
Constructor Summary
PhraseTable()
           
PhraseTable(boolean normalize, boolean caseInsensitive, boolean ignorePunctuation)
           
PhraseTable(int initSize)
           
 
Method Summary
 boolean addPhrase(java.util.List<java.lang.String> tokens)
           
 boolean addPhrase(java.util.List<java.lang.String> tokens, java.lang.String tag)
           
 boolean addPhrase(java.util.List<java.lang.String> tokens, java.lang.String tag, java.lang.Object phraseData)
           
 boolean addPhrase(java.lang.String phraseText)
           
 boolean addPhrase(java.lang.String phraseText, java.lang.String tag)
           
 boolean addPhrase(java.lang.String phraseText, java.lang.String tag, java.lang.Object phraseData)
           
 void addPhrases(java.util.Collection<java.lang.String> phraseTexts)
           
 void addPhrases(java.util.Map<java.lang.String,java.lang.String> taggedPhraseTexts)
           
protected  int checkWordListMatch(PhraseTable.Phrase phrase, PhraseTable.WordList tokens, int tokenStart, int tokenEnd, int checkStart, boolean matchEnd)
           
 void clear()
           
 java.util.List<PhraseTable.PhraseMatch> findAllMatches(java.util.List<PhraseTable.Phrase> acceptablePhrases, PhraseTable.WordList tokens)
          Given a list of tokens, returns the list of spans (PhraseMatch) that correspond to phrases in the table (filtered by the list of acceptable phrases)
 java.util.List<PhraseTable.PhraseMatch> findAllMatches(java.util.List<PhraseTable.Phrase> acceptablePhrases, PhraseTable.WordList tokens, int tokenStart, int tokenEnd, boolean needNormalization)
           
 java.util.List<PhraseTable.PhraseMatch> findAllMatches(java.util.List<PhraseTable.Phrase> acceptablePhrases, java.lang.String text)
          Given a segment of text, returns the list of spans (PhraseMatch) that correspond to phrases in the table (filtered by the list of acceptable phrases)
 java.util.List<PhraseTable.PhraseMatch> findAllMatches(PhraseTable.WordList tokens)
          Given a list of tokens, returns the list of spans (PhraseMatch) that correspond to phrases in the table
 java.util.List<PhraseTable.PhraseMatch> findAllMatches(PhraseTable.WordList tokens, int tokenStart, int tokenEnd, boolean needNormalization)
           
 java.util.List<PhraseTable.PhraseMatch> findAllMatches(java.lang.String text)
          Given a segment of text, returns the list of spans (PhraseMatch) that correspond to phrases in the table
protected  java.util.List<PhraseTable.PhraseMatch> findMatches(java.util.Collection<PhraseTable.Phrase> acceptablePhrases, PhraseTable.WordList tokens, int tokenStart, int tokenEnd, boolean needNormalization, boolean findAll, boolean matchEnd)
           
 java.util.List<PhraseTable.PhraseMatch> findMatches(PhraseTable.WordList tokens)
           
 java.util.List<PhraseTable.PhraseMatch> findMatches(PhraseTable.WordList tokens, int tokenStart, int tokenEnd, boolean needNormalization)
           
 java.util.List<PhraseTable.PhraseMatch> findMatches(java.lang.String text)
           
 java.util.List<PhraseTable.PhraseMatch> findMatches(java.lang.String text, int tokenStart, int tokenEnd, boolean needNormalization)
           
protected  java.util.List<PhraseTable.PhraseMatch> findMatchesNormalized(java.util.Collection<PhraseTable.Phrase> acceptablePhrases, PhraseTable.WordList tokens, int tokenStart, int tokenEnd, boolean findAll, boolean matchEnd)
           
 java.util.List<PhraseTable.PhraseMatch> findNonOverlappingPhrases(java.util.List<PhraseTable.PhraseMatch> phraseMatches)
           
static PhraseTable.Phrase getLongestPhrase(java.util.List<PhraseTable.Phrase> phrases)
           
 java.lang.String getNormalizedForm(java.lang.String word)
           
 java.util.Iterator<PhraseTable.Phrase> iterator()
           
 PhraseTable.Phrase lookup(PhraseTable.WordList wordList)
           
 PhraseTable.Phrase lookup(java.lang.String phrase)
           
 PhraseTable.Phrase lookupNormalized(java.lang.String phrase)
           
 void readPhrases(java.lang.String filename, boolean checkTag)
           
 void readPhrases(java.lang.String filename, boolean checkTag, java.util.regex.Pattern delimiterPattern)
           
 void readPhrases(java.lang.String filename, boolean checkTag, java.lang.String delimiterRegex)
           
 void readPhrases(java.lang.String filename, int phraseColIndex, int tagColIndex)
           
 void setNormalizationCacheSize(int cacheSize)
           
 java.lang.String[] splitText(java.lang.String phraseText)
           
 PhraseTable.WordList toNormalizedWordList(java.lang.String phraseText)
           
static java.lang.String toString(PhraseTable.WordList wordList)
           
 PhraseTable.WordList toWordList(java.lang.String phraseText)
           
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

normalize

public boolean normalize

caseInsensitive

public boolean caseInsensitive

ignorePunctuation

public boolean ignorePunctuation

ignorePunctuationTokens

public boolean ignorePunctuationTokens

tokenizer

public Annotator tokenizer

PHRASEMATCH_LENGTH_ENDPOINTS_COMPARATOR

public static final java.util.Comparator<PhraseTable.PhraseMatch> PHRASEMATCH_LENGTH_ENDPOINTS_COMPARATOR
Constructor Detail

PhraseTable

public PhraseTable()

PhraseTable

public PhraseTable(int initSize)

PhraseTable

public PhraseTable(boolean normalize,
                   boolean caseInsensitive,
                   boolean ignorePunctuation)
Method Detail

clear

public void clear()

setNormalizationCacheSize

public void setNormalizationCacheSize(int cacheSize)

readPhrases

public void readPhrases(java.lang.String filename,
                        boolean checkTag)
                 throws java.io.IOException
Throws:
java.io.IOException

readPhrases

public void readPhrases(java.lang.String filename,
                        boolean checkTag,
                        java.lang.String delimiterRegex)
                 throws java.io.IOException
Throws:
java.io.IOException

readPhrases

public void readPhrases(java.lang.String filename,
                        boolean checkTag,
                        java.util.regex.Pattern delimiterPattern)
                 throws java.io.IOException
Throws:
java.io.IOException

readPhrases

public void readPhrases(java.lang.String filename,
                        int phraseColIndex,
                        int tagColIndex)
                 throws java.io.IOException
Throws:
java.io.IOException

getLongestPhrase

public static PhraseTable.Phrase getLongestPhrase(java.util.List<PhraseTable.Phrase> phrases)

splitText

public java.lang.String[] splitText(java.lang.String phraseText)

toWordList

public PhraseTable.WordList toWordList(java.lang.String phraseText)

toNormalizedWordList

public PhraseTable.WordList toNormalizedWordList(java.lang.String phraseText)

addPhrases

public void addPhrases(java.util.Collection<java.lang.String> phraseTexts)

addPhrases

public void addPhrases(java.util.Map<java.lang.String,java.lang.String> taggedPhraseTexts)

addPhrase

public boolean addPhrase(java.lang.String phraseText)

addPhrase

public boolean addPhrase(java.lang.String phraseText,
                         java.lang.String tag)

addPhrase

public boolean addPhrase(java.lang.String phraseText,
                         java.lang.String tag,
                         java.lang.Object phraseData)

addPhrase

public boolean addPhrase(java.util.List<java.lang.String> tokens)

addPhrase

public boolean addPhrase(java.util.List<java.lang.String> tokens,
                         java.lang.String tag)

addPhrase

public boolean addPhrase(java.util.List<java.lang.String> tokens,
                         java.lang.String tag,
                         java.lang.Object phraseData)

getNormalizedForm

public java.lang.String getNormalizedForm(java.lang.String word)

lookup

public PhraseTable.Phrase lookup(java.lang.String phrase)

lookupNormalized

public PhraseTable.Phrase lookupNormalized(java.lang.String phrase)

lookup

public PhraseTable.Phrase lookup(PhraseTable.WordList wordList)

findAllMatches

public java.util.List<PhraseTable.PhraseMatch> findAllMatches(java.lang.String text)
Given a segment of text, returns the list of spans (PhraseMatch) that correspond to phrases in the table

Parameters:
text - Input text to search over
Returns:
List of all matched spans

findAllMatches

public java.util.List<PhraseTable.PhraseMatch> findAllMatches(PhraseTable.WordList tokens)
Given a list of tokens, returns the list of spans (PhraseMatch) that correspond to phrases in the table

Parameters:
tokens - List of tokens to search over
Returns:
List of all matched spans

findAllMatches

public java.util.List<PhraseTable.PhraseMatch> findAllMatches(java.util.List<PhraseTable.Phrase> acceptablePhrases,
                                                              java.lang.String text)
Given a segment of text, returns the list of spans (PhraseMatch) that correspond to phrases in the table (filtered by the list of acceptable phrases)

Parameters:
acceptablePhrases - The phrases to look for (must be a subset of the phrases already in the table)
text - Input text to search over
Returns:
List of all matched spans

findAllMatches

public java.util.List<PhraseTable.PhraseMatch> findAllMatches(java.util.List<PhraseTable.Phrase> acceptablePhrases,
                                                              PhraseTable.WordList tokens)
Given a list of tokens, returns the list of spans (PhraseMatch) that correspond to phrases in the table (filtered by the list of acceptable phrases)

Parameters:
acceptablePhrases - The phrases to look for (must be a subset of the phrases already in the table)
tokens - List of tokens to search over
Returns:
List of all matched spans

findAllMatches

public java.util.List<PhraseTable.PhraseMatch> findAllMatches(PhraseTable.WordList tokens,
                                                              int tokenStart,
                                                              int tokenEnd,
                                                              boolean needNormalization)

findAllMatches

public java.util.List<PhraseTable.PhraseMatch> findAllMatches(java.util.List<PhraseTable.Phrase> acceptablePhrases,
                                                              PhraseTable.WordList tokens,
                                                              int tokenStart,
                                                              int tokenEnd,
                                                              boolean needNormalization)

findMatches

public java.util.List<PhraseTable.PhraseMatch> findMatches(java.lang.String text)

findMatches

public java.util.List<PhraseTable.PhraseMatch> findMatches(PhraseTable.WordList tokens)

findMatches

public java.util.List<PhraseTable.PhraseMatch> findMatches(PhraseTable.WordList tokens,
                                                           int tokenStart,
                                                           int tokenEnd,
                                                           boolean needNormalization)

findMatches

public java.util.List<PhraseTable.PhraseMatch> findMatches(java.lang.String text,
                                                           int tokenStart,
                                                           int tokenEnd,
                                                           boolean needNormalization)

checkWordListMatch

protected int checkWordListMatch(PhraseTable.Phrase phrase,
                                 PhraseTable.WordList tokens,
                                 int tokenStart,
                                 int tokenEnd,
                                 int checkStart,
                                 boolean matchEnd)

findNonOverlappingPhrases

public java.util.List<PhraseTable.PhraseMatch> findNonOverlappingPhrases(java.util.List<PhraseTable.PhraseMatch> phraseMatches)

findMatches

protected java.util.List<PhraseTable.PhraseMatch> findMatches(java.util.Collection<PhraseTable.Phrase> acceptablePhrases,
                                                              PhraseTable.WordList tokens,
                                                              int tokenStart,
                                                              int tokenEnd,
                                                              boolean needNormalization,
                                                              boolean findAll,
                                                              boolean matchEnd)

findMatchesNormalized

protected java.util.List<PhraseTable.PhraseMatch> findMatchesNormalized(java.util.Collection<PhraseTable.Phrase> acceptablePhrases,
                                                                        PhraseTable.WordList tokens,
                                                                        int tokenStart,
                                                                        int tokenEnd,
                                                                        boolean findAll,
                                                                        boolean matchEnd)

iterator

public java.util.Iterator<PhraseTable.Phrase> iterator()

toString

public static java.lang.String toString(PhraseTable.WordList wordList)


Stanford NLP Group