/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/**
* OneClassClassifier.java
* Copyright (C) 2008 K.Hempstalk, University of Waikato, Hamilton, New Zealand.
*/
package weka.classifiers.meta;
import weka.classifiers.RandomizableSingleClassifierEnhancer;
import weka.classifiers.meta.generators.GaussianGenerator;
import weka.classifiers.meta.generators.Generator;
import weka.classifiers.meta.generators.InstanceHandler;
import weka.classifiers.meta.generators.Mean;
import weka.classifiers.meta.generators.NominalGenerator;
import weka.classifiers.meta.generators.NominalAttributeGenerator;
import weka.classifiers.meta.generators.NumericAttributeGenerator;
import weka.classifiers.meta.generators.Ranged;
import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.Instance;
import weka.core.DenseInstance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.TechnicalInformation;
import weka.core.TechnicalInformationHandler;
import weka.core.Utils;
import weka.core.Capabilities.Capability;
import weka.core.TechnicalInformation.Field;
import weka.core.TechnicalInformation.Type;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.AddValues;
import weka.filters.unsupervised.attribute.MergeManyValues;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;
/**
* Performs one-class classification on a dataset.
*
 * Classifier reduces the class being classified to just a single class, and learns the data without using any information from other classes. The testing stage will classify as 'target' or 'outlier' - so in order to calculate the outlier pass rate the dataset must contain information from more than one class.
*
 * Also, the output varies depending on whether the label 'outlier' exists in the instances used to build the classifier. If so, then 'outlier' will be predicted; if not, the label will be considered missing when the prediction does not favour the target class. The 'outlier' class will not be used to build the model if there are instances of this class in the dataset. It can simply be used as a flag, you do not need to relabel any classes.
*
* For more information, see:
*
* Kathryn Hempstalk, Eibe Frank, Ian H. Witten: One-Class Classification by Combining Density and Class Probability Estimation. In: Proceedings of the 12th European Conference on Principles and Practice of Knowledge Discovery in Databases and 19th European Conference on Machine Learning, ECMLPKDD2008, Berlin, 505--519, 2008.
*
*
* BibTeX:
*
* @conference{Hempstalk2008,
* address = {Berlin},
* author = {Kathryn Hempstalk and Eibe Frank and Ian H. Witten},
* booktitle = {Proceedings of the 12th European Conference on Principles and Practice of Knowledge Discovery in Databases and 19th European Conference on Machine Learning, ECMLPKDD2008},
* month = {September},
* pages = {505--519},
* publisher = {Springer},
* series = {Lecture Notes in Computer Science},
* title = {One-Class Classification by Combining Density and Class Probability Estimation},
* volume = {Vol. 5211},
* year = {2008},
* location = {Antwerp, Belgium}
* }
*
 * Valid options are:
 *
 * -L
 *  Sets whether to correct the number of classes to two,
 *  if omitted no correction will be made.
 *
 * -E
 *  Sets whether to exclusively use the density estimate.
 *
 * -I
 *  Sets whether to use instance weights.
 *
 * -S <num>
 *  Random number seed.
 *  (default 1)
 *
 * -D
 *  If set, classifier is run in debug mode and
 *  may output additional info to the console
 *
 * -W
 *  Full name of base classifier.
 *  (default: weka.classifiers.meta.Bagging)
 *
 * Options specific to classifier weka.classifiers.meta.Bagging:
 *
 * -P
 *  Size of each bag, as a percentage of the
 *  training set size. (default 100)
 *
 * -O
 *  Calculate the out of bag error.
 *
 * -S <num>
 *  Random number seed.
 *  (default 1)
 *
 * -I <num>
 *  Number of iterations.
 *  (default 10)
 *
 * -D
 *  If set, classifier is run in debug mode and
 *  may output additional info to the console
 *
 * -W
 *  Full name of base classifier.
 *  (default: weka.classifiers.trees.REPTree)
 *
 * Options specific to classifier weka.classifiers.trees.REPTree:
 *
 * -M <minimum number of instances>
 *  Set minimum number of instances per leaf (default 2).
 *
 * -V <minimum variance for split>
 *  Set minimum numeric class variance proportion
 *  of train variance for split (default 1e-3).
 *
 * -N <number of folds>
 *  Number of folds for reduced error pruning (default 3).
 *
 * -S <seed>
 *  Seed for random data shuffling (default 1).
 *
 * -P
 *  No pruning.
 *
 * -L
 *  Maximum tree depth (default -1, no maximum)
 *
 * Options after -- are passed to the designated classifier.
*
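 * A minimal usage sketch (not from the original documentation; the file name
 * "data.arff" and the option values are illustrative assumptions, while the
 * Weka API calls themselves are standard):
 *
 *   import weka.classifiers.meta.OneClassClassifier;
 *   import weka.core.Instances;
 *   import weka.core.Utils;
 *   import weka.core.converters.ConverterUtils.DataSource;
 *
 *   // load a dataset whose class attribute contains the label 'target'
 *   Instances data = DataSource.read("data.arff");
 *   data.setClassIndex(data.numAttributes() - 1);
 *
 *   OneClassClassifier occ = new OneClassClassifier();
 *   // reject roughly 10% of genuine target instances when calibrating the
 *   // threshold, and treat the class value 'target' as the target class
 *   occ.setOptions(Utils.splitOptions("-trr 0.1 -tcl target"));
 *   occ.buildClassifier(data);
 *
 *   // membership estimate for a test instance: 'target' vs. 'outlier'/missing
 *   double[] dist = occ.distributionForInstance(data.instance(0));
 *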
* @author Kathryn Hempstalk (kah18 at cs.waikato.ac.nz)
* @author Eibe Frank (eibe at cs.waikato.ac.nz)
* @version $Revision: 5987 $
*/
public class OneClassClassifier
extends RandomizableSingleClassifierEnhancer
implements TechnicalInformationHandler {
/** for serialization. */
private static final long serialVersionUID = 6199125385010158931L;
/**
* The rejection rate of valid target objects (used to set the threshold).
*/
protected double m_TargetRejectionRate = 0.1;
/**
 * The probability threshold (only instances with a target probability above this will be classified as target).
*/
protected double m_Threshold = 0.5;
/**
 * The generators for the attributes.
*/
protected ArrayList m_Generators;
/**
* The value of the class attribute to consider the target class.
*/
protected String m_TargetClassLabel = "target";
/**
* The number of times to repeat cross validation during learning.
*/
protected int m_NumRepeats = 10;
/**
* The percentage of heldout data.
*/
protected double m_PercentHeldout = 10;
/**
* The proportion of the data that will be generated.
*/
protected double m_ProportionGenerated = 0.5;
/**
* The default data generator for numeric attributes.
*/
protected NumericAttributeGenerator m_DefaultNumericGenerator = (NumericAttributeGenerator) new GaussianGenerator();
/**
* The default data generator for nominal attributes.
*/
protected NominalAttributeGenerator m_DefaultNominalGenerator = (NominalAttributeGenerator) new NominalGenerator();
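  /*
   * Illustrative sketch only (an assumption about how the two defaults above
   * are used, not a copy of the actual buildClassifier code): one generator is
   * typically set up per non-class attribute, e.g.
   *
   *   for (int i = 0; i < data.numAttributes(); i++) {
   *     if (i == data.classIndex())
   *       continue;                                   // no generator for the class attribute
   *     if (data.attribute(i).isNumeric())
   *       m_Generators.add(new GaussianGenerator());  // numeric attribute: Gaussian density
   *     else
   *       m_Generators.add(new NominalGenerator());   // nominal attribute: discrete distribution
   *   }
   */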
/**
* Adds the outlier class if it doesn't already exist.
*/
protected AddValues m_AddOutlierFilter;
/**
 * Whether to reduce a class attribute with more than two values down to just
 * two, so that any Laplace correction in the base classifier corrects against
 * a single alternative class rather than several.
*/
protected boolean m_UseLaplaceCorrection = false;
/**
* The filter that merges the instances down to two values.
*/
protected MergeManyValues m_MergeFilter;
/**
* The label for the outlier class.
*/
public static final String OUTLIER_LABEL = "outlier";
/**
* Whether to use only the density estimate, or to include the
* base classifier in the probability estimates.
*/
protected boolean m_UseDensityOnly = false;
/**
* Whether to weight instances based on their prevalence in the
* test set used for calculating P(X|T).
*/
protected boolean m_UseInstanceWeights = false;
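  /*
   * How the pieces above fit together (a sketch following the combination
   * described in Hempstalk et al., 2008, and plain Bayes' rule; not code from
   * this file): artificial data is drawn from a known reference density P(X|A)
   * built by the attribute generators, mixed with the target data according to
   * m_ProportionGenerated, and a class probability estimator P(T|X) is trained
   * to separate the two. The target density is then recovered as
   *
   *   P(X|T) = ((1 - P(T)) / P(T)) * (P(T|X) / (1 - P(T|X))) * P(X|A)
   *
   * where P(T) is the proportion of real target instances in the mixed
   * training data. Predictions above m_Threshold are labelled 'target', and
   * the threshold itself is calibrated from m_TargetRejectionRate on held-out
   * target data.
   */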
/** The random number generator used internally. */
protected Random m_Random;
/**
* Default constructor.
*/
public OneClassClassifier() {
super();
m_Classifier = new weka.classifiers.meta.Bagging();
}
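  // Illustrative note (not from the original source): the base learner
  // defaulted to Bagging above can be replaced programmatically, mirroring
  // the -W option, e.g.:
  //
  //   OneClassClassifier occ = new OneClassClassifier();
  //   occ.setClassifier(new weka.classifiers.functions.Logistic());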
/**
 * Returns a string describing this classifier's ability.
*
* @return A description of the method.
*/
public String globalInfo() {
return
"Performs one-class classification on a dataset.\n\n"
+ "Classifier reduces the class being classified to just a single class, and learns the data"
+ "without using any information from other classes. The testing stage will classify as 'target'"
+ "or 'outlier' - so in order to calculate the outlier pass rate the dataset must contain information"
+ "from more than one class.\n"
+ "\n"
+ "Also, the output varies depending on whether the label 'outlier' exists in the instances used"
+ "to build the classifier. If so, then 'outlier' will be predicted, if not, then the label will"
+ "be considered missing when the prediction does not favour the target class. The 'outlier' class"
+ "will not be used to build the model if there are instances of this class in the dataset. It can"
+ "simply be used as a flag, you do not need to relabel any classes.\n"
+ "\n"
+ "For more information, see:\n"
+ "\n"
+ getTechnicalInformation().toString();
}
/**
* Returns an instance of a TechnicalInformation object, containing
* detailed information about the technical background of this class,
* e.g., paper reference or book this class is based on.
*
* @return the technical information about this class
*/
public TechnicalInformation getTechnicalInformation() {
TechnicalInformation result;
result = new TechnicalInformation(Type.CONFERENCE);
result.setValue(Field.AUTHOR, "Kathryn Hempstalk and Eibe Frank and Ian H. Witten");
result.setValue(Field.YEAR, "2008");
result.setValue(Field.TITLE, "One-Class Classification by Combining Density and Class Probability Estimation");
result.setValue(Field.BOOKTITLE, "Proceedings of the 12th European Conference on Principles and Practice of Knowledge Discovery in Databases and 19th European Conference on Machine Learning, ECMLPKDD2008");
result.setValue(Field.VOLUME, "Vol. 5211");
result.setValue(Field.PAGES, "505--519");
result.setValue(Field.PUBLISHER, "Springer");
result.setValue(Field.ADDRESS, "Berlin");
result.setValue(Field.SERIES, "Lecture Notes in Computer Science");
result.setValue(Field.LOCATION, "Antwerp, Belgium");
result.setValue(Field.MONTH, "September");
return result;
}
/**
* Returns an enumeration describing the available options.
*
* @return An enumeration of all the available options.
*/
public Enumeration listOptions() {
Vector result = new Vector();
result.addElement(new Option(
"\tSets the target rejection rate\n"
+ "\t(default: 0.1)",
"trr", 1, "-trr "));
result.addElement(new Option(
"\tSets the target class label\n"
+ "\t(default: 'target')",
"tcl", 1, "-tcl