edu.stanford.nlp.ling.tokensregex
Class MultiWordStringMatcher

java.lang.Object
  extended by edu.stanford.nlp.ling.tokensregex.MultiWordStringMatcher

public class MultiWordStringMatcher
extends java.lang.Object

Finds multi word strings in a piece of text

Author:
Angel Chang

Nested Class Summary
static class MultiWordStringMatcher.LongestStringComparator
           
static class MultiWordStringMatcher.MatchType
          if matchType is EXCT: match exact string
if matchType is EXCTWS: match exact string, except whitespace can match multiple whitespaces
if matchType is LWS: match case insensitive string, except whitespace can match multiple whitespaces
if matchType is LNRM: disregards punctuation, does case insensitive match
if matchType is REGEX: interprets string as regex already
 
Field Summary
static java.util.Comparator<java.lang.String> LONGEST_STRING_COMPARATOR
           
 
Constructor Summary
MultiWordStringMatcher(MultiWordStringMatcher.MatchType matchType)
           
MultiWordStringMatcher(java.lang.String matchTypeStr)
           
 
Method Summary
 java.util.regex.Pattern createPattern(java.lang.String targetString)
           
static java.util.List<IntPair> findOffsets(java.util.regex.Pattern pattern, java.lang.String text)
          Finds pattern in text and returns offsets
static java.util.List<IntPair> findOffsets(java.util.regex.Pattern pattern, java.lang.String text, int start, int end)
          Finds pattern in text span from character start to end (exclusive) and returns offsets
 java.util.List<IntPair> findTargetStringOffsets(java.lang.String text, java.lang.String targetString)
          Finds target string in text and returns offsets (matches based on set matchType)
 java.util.List<IntPair> findTargetStringOffsets(java.lang.String text, java.lang.String targetString, int start, int end)
          Finds target string in text span from character start to end (exclusive) and returns offsets (matches based on set matchType)
protected  java.util.List<IntPair> findTargetStringOffsetsExct(java.lang.String text, java.lang.String targetString, int start, int end)
          Finds target string in text span from character start to end (exclusive) and returns offsets (does EXCT string matching)
protected  java.util.List<IntPair> findTargetStringOffsetsRegex(java.lang.String text, java.lang.String targetString, int start, int end)
          Finds target string in text and returns offsets using regular expressions (matches based on set matchType)
 java.lang.String getExctWsRegex(java.lang.String targetString)
           
 java.lang.String getLnrmRegex(java.lang.String targetString)
           
 java.lang.String getLWsRegex(java.lang.String targetString)
           
 MultiWordStringMatcher.MatchType getMatchType()
           
 java.util.regex.Pattern getPattern(java.lang.String targetString)
           
 java.util.regex.Pattern getPattern(java.lang.String[] targetStrings)
           
 java.lang.String getRegex(java.lang.String targetString)
           
 java.lang.String getRegex(java.lang.String[] targetStrings)
           
protected  java.lang.String markTargetString(java.lang.String text, java.lang.String targetString, java.lang.String beginMark, java.lang.String endMark, boolean markOnlyIfSpace)
           
 java.lang.String putSpacesAroundTargetString(java.lang.String text, java.lang.String targetString)
          Finds target string in text and puts spaces around it so that it will be matched when we match against tokens
 void setMatchType(MultiWordStringMatcher.MatchType matchType)
           
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

LONGEST_STRING_COMPARATOR

public static final java.util.Comparator<java.lang.String> LONGEST_STRING_COMPARATOR
Constructor Detail

MultiWordStringMatcher

public MultiWordStringMatcher(MultiWordStringMatcher.MatchType matchType)

MultiWordStringMatcher

public MultiWordStringMatcher(java.lang.String matchTypeStr)
Method Detail

getMatchType

public MultiWordStringMatcher.MatchType getMatchType()

setMatchType

public void setMatchType(MultiWordStringMatcher.MatchType matchType)

putSpacesAroundTargetString

public java.lang.String putSpacesAroundTargetString(java.lang.String text,
                                                    java.lang.String targetString)
Finds target string in text and puts spaces around it so that it will be matched when we match against tokens

Parameters:
text - - String in which to look for the target string
targetString - - Target string to look for
Returns:
Updated text with spaces around target string

markTargetString

protected java.lang.String markTargetString(java.lang.String text,
                                            java.lang.String targetString,
                                            java.lang.String beginMark,
                                            java.lang.String endMark,
                                            boolean markOnlyIfSpace)

findTargetStringOffsetsExct

protected java.util.List<IntPair> findTargetStringOffsetsExct(java.lang.String text,
                                                              java.lang.String targetString,
                                                              int start,
                                                              int end)
Finds target string in text span from character start to end (exclusive) and returns offsets (does EXCT string matching)

Parameters:
text - - String in which to look for the target string
targetString - - Target string to look for
start - - position to start search
end - - position to end search
Returns:
list of integer pairs indicating the character offsets (begin, end - exclusive) at which the targetString can be found

getPattern

public java.util.regex.Pattern getPattern(java.lang.String[] targetStrings)

getRegex

public java.lang.String getRegex(java.lang.String[] targetStrings)

getPattern

public java.util.regex.Pattern getPattern(java.lang.String targetString)

createPattern

public java.util.regex.Pattern createPattern(java.lang.String targetString)

getRegex

public java.lang.String getRegex(java.lang.String targetString)

getExctWsRegex

public java.lang.String getExctWsRegex(java.lang.String targetString)

getLWsRegex

public java.lang.String getLWsRegex(java.lang.String targetString)

getLnrmRegex

public java.lang.String getLnrmRegex(java.lang.String targetString)

findTargetStringOffsetsRegex

protected java.util.List<IntPair> findTargetStringOffsetsRegex(java.lang.String text,
                                                               java.lang.String targetString,
                                                               int start,
                                                               int end)
Finds target string in text and returns offsets using regular expressions (matches based on set matchType)

Parameters:
text - - String in which to find target string
targetString - - Target string to look for
start - - position to start search
end - - position to end search
Returns:
list of integer pairs indicating the character offsets (begin, end - exclusive) at which the target string can be found

findOffsets

public static java.util.List<IntPair> findOffsets(java.util.regex.Pattern pattern,
                                                  java.lang.String text)
Finds pattern in text and returns offsets

Parameters:
pattern - - pattern to look for
text - - String in which to look for the pattern
Returns:
list of integer pairs indicating the character offsets (begin, end - exclusive) at which the pattern can be found

findOffsets

public static java.util.List<IntPair> findOffsets(java.util.regex.Pattern pattern,
                                                  java.lang.String text,
                                                  int start,
                                                  int end)
Finds pattern in text span from character start to end (exclusive) and returns offsets

Parameters:
pattern - - pattern to look for
text - - String in which to look for the pattern
start - - position to start search
end - - position to end search
Returns:
list of integer pairs indicating the character offsets (begin, end - exclusive) at which the pattern can be found

findTargetStringOffsets

public java.util.List<IntPair> findTargetStringOffsets(java.lang.String text,
                                                       java.lang.String targetString)
Finds target string in text and returns offsets (matches based on set matchType)

Parameters:
text - - String in which to look for the target string
targetString - - Target string to look for
Returns:
list of integer pairs indicating the character offsets (begin, end - exclusive) at which the target string can be found

findTargetStringOffsets

public java.util.List<IntPair> findTargetStringOffsets(java.lang.String text,
                                                       java.lang.String targetString,
                                                       int start,
                                                       int end)
Finds target string in text span from character start to end (exclusive) and returns offsets (matches based on set matchType)

Parameters:
text - - String in which to look for the target string
targetString - - Target string to look for
start - - position to start search
end - - position to end search
Returns:
list of integer pairs indicating the character offsets (begin, end - exclusive) at which the target string can be found


Stanford NLP Group