|
|||||||||
| PREV CLASS NEXT CLASS | FRAMES NO FRAMES | ||||||||
| SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD | ||||||||
java.lang.Object
edu.stanford.nlp.classify.GeneralDataset<L,F>
edu.stanford.nlp.classify.Dataset<L,F>
L - Label type
F - Feature type
public class Dataset<L,F>
An interfacing class for ClassifierFactory that incrementally
builds a more memory-efficient representation of a List of
Datum objects for the purposes of training a Classifier
with a ClassifierFactory.
See also getL1NormalizedTFIDFDatum(Datum, Counter) and getL1NormalizedTFIDFDataset().
| Field Summary |
|---|
| Fields inherited from class edu.stanford.nlp.classify.GeneralDataset |
|---|
data, featureIndex, labelIndex, labels, size |
| Constructor Summary | |
|---|---|
Dataset()
|
|
Dataset(Index<F> featureIndex,
Index<L> labelIndex)
|
|
Dataset(Index<L> labelIndex,
int[] labels,
Index<F> featureIndex,
int[][] data)
Constructor that fully specifies a Dataset. |
|
Dataset(Index<L> labelIndex,
int[] labels,
Index<F> featureIndex,
int[][] data,
int size)
Constructor that fully specifies a Dataset. |
|
Dataset(int numDatums)
|
|
Dataset(int numDatums,
Index<F> featureIndex,
Index<L> labelIndex)
|
|
| Method Summary | |
|---|---|
void |
add(java.util.Collection<F> features,
L label)
|
void |
add(java.util.Collection<F> features,
L label,
boolean addNewFeatures)
|
void |
add(Datum<L,F> d)
|
void |
add(int[] features,
int label)
Adds a datum defined by feature indices and a label index. Careful with this one! Make sure that all indices are valid! |
protected void |
addFeatureIndices(int[] features)
|
protected void |
addFeatures(java.util.Collection<F> features)
|
protected void |
addFeatures(java.util.Collection<F> features,
boolean addNewFeatures)
|
protected void |
addLabel(L label)
|
protected void |
addLabelIndex(int label)
|
void |
applyFeatureCountThreshold(java.util.List<Pair<java.util.regex.Pattern,java.lang.Integer>> thresholds)
Applies feature count thresholds to the Dataset. |
void |
changeFeatureIndex(Index<F> newFeatureIndex)
|
void |
changeLabelIndex(Index<L> newLabelIndex)
|
protected void |
ensureSize()
|
Datum<L,F> |
getDatum(int index)
|
Counter<F> |
getFeatureCounter()
Gets the number of datums a given feature appears in. |
double[] |
getInformationGains()
|
RVFDataset<L,F> |
getL1NormalizedTFIDFDataset()
Method to convert this dataset to RVFDataset using L1-normalized TF-IDF features |
RVFDatum<L,F> |
getL1NormalizedTFIDFDatum(Datum<L,F> datum,
Counter<F> featureDocCounts)
Method to convert features from counts to L1-normalized TF-IDF-based features. |
Dataset<L,F> |
getRandomSubDataset(double p,
int seed)
|
RVFDatum<L,F> |
getRVFDatum(int index)
|
double[][] |
getValuesArray()
|
protected void |
initialize(int numDatums)
This method takes care of resetting values of the dataset such that it is empty with an initial capacity of numDatums. |
void |
printFullFeatureMatrix(java.io.PrintWriter pw)
prints the full feature matrix in tab-delimited form. |
void |
printSparseFeatureMatrix()
Prints the sparse feature matrix using printSparseFeatureMatrix(java.io.PrintWriter)
to System.out. |
void |
printSparseFeatureMatrix(java.io.PrintWriter pw)
prints a sparse feature matrix representation of the Dataset. |
static void |
printSVMLightFormat(java.io.PrintWriter pw,
ClassicCounter<java.lang.Integer> c,
int classNo)
Need to sort the counter by feature keys and dump it |
static Dataset<java.lang.String,java.lang.String> |
readSVMLightFormat(java.lang.String filename)
Constructs a Dataset by reading in a file in SVM light format. |
static Dataset<java.lang.String,java.lang.String> |
readSVMLightFormat(java.lang.String filename,
Index<java.lang.String> featureIndex,
Index<java.lang.String> labelIndex)
Constructs a Dataset by reading in a file in SVM light format. |
static Dataset<java.lang.String,java.lang.String> |
readSVMLightFormat(java.lang.String filename,
Index<java.lang.String> featureIndex,
Index<java.lang.String> labelIndex,
java.util.List<java.lang.String> lines)
Constructs a Dataset by reading in a file in SVM light format. |
static Dataset<java.lang.String,java.lang.String> |
readSVMLightFormat(java.lang.String filename,
java.util.List<java.lang.String> lines)
Constructs a Dataset by reading in a file in SVM light format. |
void |
selectFeatures(int numFeatures,
double[] scores)
Generic method to select features based on the feature scores vector provided as an argument. |
void |
selectFeaturesBinaryInformationGain(int numFeatures)
|
Pair<GeneralDataset<L,F>,GeneralDataset<L,F>> |
split(double percentDev)
|
Pair<GeneralDataset<L,F>,GeneralDataset<L,F>> |
split(int start,
int end)
|
void |
summaryStatistics()
Prints some summary statistics to stderr for the Dataset. |
static Datum<java.lang.String,java.lang.String> |
svmLightLineToDatum(java.lang.String l)
|
java.lang.String |
toString()
|
java.lang.String |
toSummaryStatistics()
|
java.lang.String |
toSummaryString()
|
void |
updateLabels(int[] labels)
|
| Methods inherited from class edu.stanford.nlp.classify.GeneralDataset |
|---|
addAll, applyFeatureCountThreshold, applyFeatureMaxCountThreshold, clear, clear, featureIndex, getDataArray, getFeatureCounts, getLabelsArray, iterator, labelIndex, labelIterator, makeSvmLabelMap, mapDataset, mapDataset, mapDatum, numClasses, numFeatures, numFeatureTokens, numFeatureTypes, printSVMLightFormat, printSVMLightFormat, randomize, sampleDataset, size, trimData, trimLabels, trimToSize, trimToSize, trimToSize |
| Methods inherited from class java.lang.Object |
|---|
clone, equals, finalize, getClass, hashCode, notify, notifyAll, wait, wait, wait |
| Constructor Detail |
|---|
public Dataset()
public Dataset(int numDatums)
public Dataset(int numDatums,
Index<F> featureIndex,
Index<L> labelIndex)
public Dataset(Index<F> featureIndex,
Index<L> labelIndex)
public Dataset(Index<L> labelIndex,
int[] labels,
Index<F> featureIndex,
int[][] data)
public Dataset(Index<L> labelIndex,
int[] labels,
Index<F> featureIndex,
int[][] data,
int size)
| Method Detail |
|---|
public Pair<GeneralDataset<L,F>,GeneralDataset<L,F>> split(double percentDev)
split in class GeneralDataset<L,F>
public Pair<GeneralDataset<L,F>,GeneralDataset<L,F>> split(int start,
int end)
split in class GeneralDataset<L,F>
public Dataset<L,F> getRandomSubDataset(double p,
int seed)
public double[][] getValuesArray()
getValuesArray in class GeneralDataset<L,F>
public static Dataset<java.lang.String,java.lang.String> readSVMLightFormat(java.lang.String filename)
public static Dataset<java.lang.String,java.lang.String> readSVMLightFormat(java.lang.String filename,
java.util.List<java.lang.String> lines)
public static Dataset<java.lang.String,java.lang.String> readSVMLightFormat(java.lang.String filename,
Index<java.lang.String> featureIndex,
Index<java.lang.String> labelIndex)
public static Dataset<java.lang.String,java.lang.String> readSVMLightFormat(java.lang.String filename,
Index<java.lang.String> featureIndex,
Index<java.lang.String> labelIndex,
java.util.List<java.lang.String> lines)
public static Datum<java.lang.String,java.lang.String> svmLightLineToDatum(java.lang.String l)
public Counter<F> getFeatureCounter()
public RVFDatum<L,F> getL1NormalizedTFIDFDatum(Datum<L,F> datum,
Counter<F> featureDocCounts)
datum - with a collection of features.
featureDocCounts - a counter of doc-count for each feature.
public RVFDataset<L,F> getL1NormalizedTFIDFDataset()
public void add(Datum<L,F> d)
add in class GeneralDataset<L,F>
public void add(java.util.Collection<F> features,
L label)
public void add(java.util.Collection<F> features,
L label,
boolean addNewFeatures)
public void add(int[] features,
int label)
features -
label -
protected void ensureSize()
protected void addLabel(L label)
protected void addLabelIndex(int label)
protected void addFeatures(java.util.Collection<F> features)
protected void addFeatures(java.util.Collection<F> features,
boolean addNewFeatures)
protected void addFeatureIndices(int[] features)
protected final void initialize(int numDatums)
GeneralDataset
initialize in class GeneralDataset<L,F>
numDatums - initial capacity of dataset
public Datum<L,F> getDatum(int index)
getDatum in class GeneralDataset<L,F>
public RVFDatum<L,F> getRVFDatum(int index)
getRVFDatum in class GeneralDataset<L,F>
public void summaryStatistics()
summaryStatistics in class GeneralDataset<L,F>
public java.lang.String toSummaryStatistics()
public void applyFeatureCountThreshold(java.util.List<Pair<java.util.regex.Pattern,java.lang.Integer>> thresholds)
thresholds - a list of pattern, threshold pairs
public void printFullFeatureMatrix(java.io.PrintWriter pw)
public void printSparseFeatureMatrix()
Prints the sparse feature matrix using printSparseFeatureMatrix(java.io.PrintWriter)
to System.out.
public void printSparseFeatureMatrix(java.io.PrintWriter pw)
Prints a sparse feature matrix representation of the Dataset, using the Object.toString() representations of features.
public void changeLabelIndex(Index<L> newLabelIndex)
public void changeFeatureIndex(Index<F> newFeatureIndex)
public void selectFeaturesBinaryInformationGain(int numFeatures)
public void selectFeatures(int numFeatures,
double[] scores)
numFeatures - number of features to be selected.
scores - a vector of size total number of features in the data.
public double[] getInformationGains()
public void updateLabels(int[] labels)
public java.lang.String toString()
toString in class java.lang.Object
public java.lang.String toSummaryString()
public static void printSVMLightFormat(java.io.PrintWriter pw,
ClassicCounter<java.lang.Integer> c,
int classNo)
|
|||||||||
| PREV CLASS NEXT CLASS | FRAMES NO FRAMES | ||||||||
| SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD | ||||||||