L - The type of the labels in the Dataset
F - The type of the features in the Dataset
public abstract class GeneralDataset<L,F> extends Object implements Serializable, Iterable<RVFDatum<L,F>>
Dataset and RVFDataset.
| Modifier and Type | Field and Description |
|---|---|
protected int[][] |
data |
Index<F> |
featureIndex |
Index<L> |
labelIndex |
protected int[] |
labels |
protected int |
size |
| Constructor and Description |
|---|
GeneralDataset() |
| Modifier and Type | Method and Description |
|---|---|
abstract void |
add(Datum<L,F> d) |
void |
addAll(Iterable<? extends Datum<L,F>> data)
Adds all Datums in the given collection of data to this dataset
|
void |
applyFeatureCountThreshold(int k)
Applies a feature count threshold to the Dataset.
|
void |
applyFeatureMaxCountThreshold(int k)
Applies a max feature count threshold to the Dataset.
|
void |
clear()
Resets the Dataset so that it is empty and ready to collect data.
|
void |
clear(int numDatums)
Resets the Dataset so that it is empty and ready to collect data.
|
Index<F> |
featureIndex() |
int[][] |
getDataArray() |
abstract Datum<L,F> |
getDatum(int index) |
float[] |
getFeatureCounts()
Get the total count (over all data instances) of each feature
|
int[] |
getLabelsArray() |
abstract RVFDatum<L,F> |
getRVFDatum(int index) |
abstract double[][] |
getValuesArray() |
protected abstract void |
initialize(int numDatums)
This method takes care of resetting values of the dataset
such that it is empty with an initial capacity of numDatums.
|
Iterator<RVFDatum<L,F>> |
iterator() |
Index<L> |
labelIndex() |
Iterator<L> |
labelIterator()
Returns an iterator over the class labels of the Dataset
|
String[] |
makeSvmLabelMap()
Maps our labels to labels that are compatible with svm_light
|
GeneralDataset<L,F> |
mapDataset(GeneralDataset<L,F> dataset) |
<L2> GeneralDataset<L2,F> |
mapDataset(GeneralDataset<L,F> dataset,
Index<L2> newLabelIndex,
Map<L,L2> labelMapping,
L2 defaultLabel) |
static <L,L2,F> Datum<L2,F> |
mapDatum(Datum<L,F> d,
Map<L,L2> labelMapping,
L2 defaultLabel) |
int |
numClasses() |
ClassicCounter<L> |
numDatumsPerLabel() |
int |
numFeatures() |
int |
numFeatureTokens()
Returns the number of feature tokens in the Dataset.
|
int |
numFeatureTypes()
Returns the number of distinct feature types in the Dataset.
|
void |
printSVMLightFormat()
Dumps the Dataset as a training/test file for SVMLight.
|
void |
printSVMLightFormat(PrintWriter pw)
Print SVM Light Format file.
|
void |
randomize(int randomSeed)
Randomizes the data array in place.
|
GeneralDataset<L,F> |
sampleDataset(int randomSeed,
double sampleFrac,
boolean sampleWithReplacement) |
int |
size()
Returns the number of examples (Datums) in the Dataset. |
abstract Pair<GeneralDataset<L,F>,GeneralDataset<L,F>> |
split(double p) |
abstract Pair<GeneralDataset<L,F>,GeneralDataset<L,F>> |
split(int start,
int end) |
abstract void |
summaryStatistics()
Print some statistics summarizing the dataset
|
protected void |
trimData() |
protected void |
trimLabels() |
protected double[][] |
trimToSize(double[][] i) |
protected int[] |
trimToSize(int[] i) |
protected int[][] |
trimToSize(int[][] i) |
public int numFeatures()
public int numClasses()
public int[] getLabelsArray()
public int[][] getDataArray()
public abstract double[][] getValuesArray()
public void clear()
public void clear(int numDatums)
numDatums - initial capacity of dataset
protected abstract void initialize(int numDatums)
numDatums - initial capacity of dataset
public float[] getFeatureCounts()
public void applyFeatureCountThreshold(int k)
public void applyFeatureMaxCountThreshold(int k)
public int numFeatureTokens()
public int numFeatureTypes()
public void addAll(Iterable<? extends Datum<L,F>> data)
data - collection of datums you would like to add to the dataset
public abstract Pair<GeneralDataset<L,F>,GeneralDataset<L,F>> split(int start, int end)
public abstract Pair<GeneralDataset<L,F>,GeneralDataset<L,F>> split(double p)
public int size()
Returns the number of examples (Datums) in the Dataset.
protected void trimData()
protected void trimLabels()
protected int[] trimToSize(int[] i)
protected int[][] trimToSize(int[][] i)
protected double[][] trimToSize(double[][] i)
public void randomize(int randomSeed)
randomSeed -
public GeneralDataset<L,F> sampleDataset(int randomSeed, double sampleFrac, boolean sampleWithReplacement)
public abstract void summaryStatistics()
public Iterator<L> labelIterator()
public GeneralDataset<L,F> mapDataset(GeneralDataset<L,F> dataset)
dataset -
public static <L,L2,F> Datum<L2,F> mapDatum(Datum<L,F> d, Map<L,L2> labelMapping, L2 defaultLabel)
public <L2> GeneralDataset<L2,F> mapDataset(GeneralDataset<L,F> dataset, Index<L2> newLabelIndex, Map<L,L2> labelMapping, L2 defaultLabel)
dataset -
public void printSVMLightFormat()
public String[] makeSvmLabelMap()
public void printSVMLightFormat(PrintWriter pw)
public ClassicCounter<L> numDatumsPerLabel()