Context Navigation

Discretize.java

Last change on this file was 29, checked in by gnappo, 15 years ago
Taggata versione per la demo e aggiunto branch.
File size: 29.9 KB

Line
1	/*
2	* This program is free software; you can redistribute it and/or modify
3	* it under the terms of the GNU General Public License as published by
4	* the Free Software Foundation; either version 2 of the License, or
5	* (at your option) any later version.
6	*
7	* This program is distributed in the hope that it will be useful,
8	* but WITHOUT ANY WARRANTY; without even the implied warranty of
9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10	* GNU General Public License for more details.
11	*
12	* You should have received a copy of the GNU General Public License
13	* along with this program; if not, write to the Free Software
14	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15	*/
16
17	/*
18	* Discretize.java
19	* Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
20	*
21	*/
22
23
24	package weka.filters.supervised.attribute;
25
26	import weka.core.Attribute;
27	import weka.core.Capabilities;
28	import weka.core.ContingencyTables;
29	import weka.core.FastVector;
30	import weka.core.Instance;
31	import weka.core.DenseInstance;
32	import weka.core.Instances;
33	import weka.core.Option;
34	import weka.core.OptionHandler;
35	import weka.core.Range;
36	import weka.core.RevisionUtils;
37	import weka.core.SparseInstance;
38	import weka.core.SpecialFunctions;
39	import weka.core.TechnicalInformation;
40	import weka.core.TechnicalInformationHandler;
41	import weka.core.Utils;
42	import weka.core.WeightedInstancesHandler;
43	import weka.core.Capabilities.Capability;
44	import weka.core.TechnicalInformation.Field;
45	import weka.core.TechnicalInformation.Type;
46	import weka.filters.Filter;
47	import weka.filters.SupervisedFilter;
48
49	import java.util.Enumeration;
50	import java.util.Vector;
51
52	/**
53	<!-- globalinfo-start -->
54	* An instance filter that discretizes a range of numeric attributes in the dataset into nominal attributes. Discretization is by Fayyad & Irani's MDL method (the default).<br/>
55	* <br/>
56	* For more information, see:<br/>
57	* <br/>
58	* Usama M. Fayyad, Keki B. Irani: Multi-interval discretization of continuousvalued attributes for classification learning. In: Thirteenth International Joint Conference on Articial Intelligence, 1022-1027, 1993.<br/>
59	* <br/>
60	* Igor Kononenko: On Biases in Estimating Multi-Valued Attributes. In: 14th International Joint Conference on Articial Intelligence, 1034-1040, 1995.
61	* <p/>
62	<!-- globalinfo-end -->
63	*
64	<!-- technical-bibtex-start -->
65	* BibTeX:
66	* <pre>
67	* @inproceedings{Fayyad1993,
68	* author = {Usama M. Fayyad and Keki B. Irani},
69	* booktitle = {Thirteenth International Joint Conference on Articial Intelligence},
70	* pages = {1022-1027},
71	* publisher = {Morgan Kaufmann Publishers},
72	* title = {Multi-interval discretization of continuousvalued attributes for classification learning},
73	* volume = {2},
74	* year = {1993}
75	* }
76	*
77	* @inproceedings{Kononenko1995,
78	* author = {Igor Kononenko},
79	* booktitle = {14th International Joint Conference on Articial Intelligence},
80	* pages = {1034-1040},
81	* title = {On Biases in Estimating Multi-Valued Attributes},
82	* year = {1995},
83	* PS = {http://ai.fri.uni-lj.si/papers/kononenko95-ijcai.ps.gz}
84	* }
85	* </pre>
86	* <p/>
87	<!-- technical-bibtex-end -->
88	*
89	<!-- options-start -->
90	* Valid options are: <p/>
91	*
92	* <pre> -R <col1,col2-col4,...>
93	* Specifies list of columns to Discretize. First and last are valid indexes.
94	* (default none)</pre>
95	*
96	* <pre> -V
97	* Invert matching sense of column indexes.</pre>
98	*
99	* <pre> -D
100	* Output binary attributes for discretized attributes.</pre>
101	*
102	* <pre> -E
103	* Use better encoding of split point for MDL.</pre>
104	*
105	* <pre> -K
106	* Use Kononenko's MDL criterion.</pre>
107	*
108	<!-- options-end -->
109	*
110	* @author Len Trigg (trigg@cs.waikato.ac.nz)
111	* @author Eibe Frank (eibe@cs.waikato.ac.nz)
112	* @version $Revision: 5987 $
113	*/
114	public class Discretize
115	extends Filter
116	implements SupervisedFilter, OptionHandler, WeightedInstancesHandler,
117	TechnicalInformationHandler {
118
119	/** for serialization */
120	static final long serialVersionUID = -3141006402280129097L;
121
122	/** Stores which columns to Discretize */
123	protected Range m_DiscretizeCols = new Range();
124
125	/** Store the current cutpoints */
126	protected double [][] m_CutPoints = null;
127
128	/** Output binary attributes for discretized attributes. */
129	protected boolean m_MakeBinary = false;
130
131	/** Use better encoding of split point for MDL. */
132	protected boolean m_UseBetterEncoding = false;
133
134	/** Use Kononenko's MDL criterion instead of Fayyad et al.'s */
135	protected boolean m_UseKononenko = false;
136
137	/** Constructor - initialises the filter */
138	public Discretize() {
139
140	setAttributeIndices("first-last");
141	}
142
143
144	/**
145	* Gets an enumeration describing the available options.
146	*
147	* @return an enumeration of all the available options.
148	*/
149	public Enumeration listOptions() {
150
151	Vector newVector = new Vector(7);
152
153	newVector.addElement(new Option(
154	"\tSpecifies list of columns to Discretize. First"
155	+ " and last are valid indexes.\n"
156	+ "\t(default none)",
157	"R", 1, "-R <col1,col2-col4,...>"));
158
159	newVector.addElement(new Option(
160	"\tInvert matching sense of column indexes.",
161	"V", 0, "-V"));
162
163	newVector.addElement(new Option(
164	"\tOutput binary attributes for discretized attributes.",
165	"D", 0, "-D"));
166
167	newVector.addElement(new Option(
168	"\tUse better encoding of split point for MDL.",
169	"E", 0, "-E"));
170
171	newVector.addElement(new Option(
172	"\tUse Kononenko's MDL criterion.",
173	"K", 0, "-K"));
174
175	return newVector.elements();
176	}
177
178
179	/**
180	* Parses a given list of options. <p/>
181	*
182	<!-- options-start -->
183	* Valid options are: <p/>
184	*
185	* <pre> -R <col1,col2-col4,...>
186	* Specifies list of columns to Discretize. First and last are valid indexes.
187	* (default none)</pre>
188	*
189	* <pre> -V
190	* Invert matching sense of column indexes.</pre>
191	*
192	* <pre> -D
193	* Output binary attributes for discretized attributes.</pre>
194	*
195	* <pre> -E
196	* Use better encoding of split point for MDL.</pre>
197	*
198	* <pre> -K
199	* Use Kononenko's MDL criterion.</pre>
200	*
201	<!-- options-end -->
202	*
203	* @param options the list of options as an array of strings
204	* @throws Exception if an option is not supported
205	*/
206	public void setOptions(String[] options) throws Exception {
207
208	setMakeBinary(Utils.getFlag('D', options));
209	setUseBetterEncoding(Utils.getFlag('E', options));
210	setUseKononenko(Utils.getFlag('K', options));
211	setInvertSelection(Utils.getFlag('V', options));
212
213	String convertList = Utils.getOption('R', options);
214	if (convertList.length() != 0) {
215	setAttributeIndices(convertList);
216	} else {
217	setAttributeIndices("first-last");
218	}
219
220	if (getInputFormat() != null) {
221	setInputFormat(getInputFormat());
222	}
223	}
224	/**
225	* Gets the current settings of the filter.
226	*
227	* @return an array of strings suitable for passing to setOptions
228	*/
229	public String [] getOptions() {
230
231	String [] options = new String [12];
232	int current = 0;
233
234	if (getMakeBinary()) {
235	options[current++] = "-D";
236	}
237	if (getUseBetterEncoding()) {
238	options[current++] = "-E";
239	}
240	if (getUseKononenko()) {
241	options[current++] = "-K";
242	}
243	if (getInvertSelection()) {
244	options[current++] = "-V";
245	}
246	if (!getAttributeIndices().equals("")) {
247	options[current++] = "-R"; options[current++] = getAttributeIndices();
248	}
249	while (current < options.length) {
250	options[current++] = "";
251	}
252	return options;
253	}
254
255	/**
256	* Returns the Capabilities of this filter.
257	*
258	* @return the capabilities of this object
259	* @see Capabilities
260	*/
261	public Capabilities getCapabilities() {
262	Capabilities result = super.getCapabilities();
263	result.disableAll();
264
265	// attributes
266	result.enableAllAttributes();
267	result.enable(Capability.MISSING_VALUES);
268
269	// class
270	result.enable(Capability.NOMINAL_CLASS);
271
272	return result;
273	}
274
275	/**
276	* Sets the format of the input instances.
277	*
278	* @param instanceInfo an Instances object containing the input instance
279	* structure (any instances contained in the object are ignored - only the
280	* structure is required).
281	* @return true if the outputFormat may be collected immediately
282	* @throws Exception if the input format can't be set successfully
283	*/
284	public boolean setInputFormat(Instances instanceInfo) throws Exception {
285
286	super.setInputFormat(instanceInfo);
287
288	m_DiscretizeCols.setUpper(instanceInfo.numAttributes() - 1);
289	m_CutPoints = null;
290
291	// If we implement loading cutfiles, then load
292	//them here and set the output format
293	return false;
294	}
295
296
297
298	/**
299	* Input an instance for filtering. Ordinarily the instance is processed
300	* and made available for output immediately. Some filters require all
301	* instances be read before producing output.
302	*
303	* @param instance the input instance
304	* @return true if the filtered instance may now be
305	* collected with output().
306	* @throws IllegalStateException if no input format has been defined.
307	*/
308	public boolean input(Instance instance) {
309
310	if (getInputFormat() == null) {
311	throw new IllegalStateException("No input instance format defined");
312	}
313	if (m_NewBatch) {
314	resetQueue();
315	m_NewBatch = false;
316	}
317
318	if (m_CutPoints != null) {
319	convertInstance(instance);
320	return true;
321	}
322
323	bufferInput(instance);
324	return false;
325	}
326
327
328	/**
329	* Signifies that this batch of input to the filter is finished. If the
330	* filter requires all instances prior to filtering, output() may now
331	* be called to retrieve the filtered instances.
332	*
333	* @return true if there are instances pending output
334	* @throws IllegalStateException if no input structure has been defined
335	*/
336	public boolean batchFinished() {
337
338	if (getInputFormat() == null) {
339	throw new IllegalStateException("No input instance format defined");
340	}
341	if (m_CutPoints == null) {
342	calculateCutPoints();
343
344	setOutputFormat();
345
346	// If we implement saving cutfiles, save the cuts here
347
348	// Convert pending input instances
349	for(int i = 0; i < getInputFormat().numInstances(); i++) {
350	convertInstance(getInputFormat().instance(i));
351	}
352	}
353	flushInput();
354
355	m_NewBatch = true;
356	return (numPendingOutput() != 0);
357	}
358
359	/**
360	* Returns a string describing this filter
361	*
362	* @return a description of the filter suitable for
363	* displaying in the explorer/experimenter gui
364	*/
365	public String globalInfo() {
366
367	return "An instance filter that discretizes a range of numeric"
368	+ " attributes in the dataset into nominal attributes."
369	+ " Discretization is by Fayyad & Irani's MDL method (the default).\n\n"
370	+ "For more information, see:\n\n"
371	+ getTechnicalInformation().toString();
372	}
373
374	/**
375	* Returns an instance of a TechnicalInformation object, containing
376	* detailed information about the technical background of this class,
377	* e.g., paper reference or book this class is based on.
378	*
379	* @return the technical information about this class
380	*/
381	public TechnicalInformation getTechnicalInformation() {
382	TechnicalInformation result;
383	TechnicalInformation additional;
384
385	result = new TechnicalInformation(Type.INPROCEEDINGS);
386	result.setValue(Field.AUTHOR, "Usama M. Fayyad and Keki B. Irani");
387	result.setValue(Field.TITLE, "Multi-interval discretization of continuousvalued attributes for classification learning");
388	result.setValue(Field.BOOKTITLE, "Thirteenth International Joint Conference on Articial Intelligence");
389	result.setValue(Field.YEAR, "1993");
390	result.setValue(Field.VOLUME, "2");
391	result.setValue(Field.PAGES, "1022-1027");
392	result.setValue(Field.PUBLISHER, "Morgan Kaufmann Publishers");
393
394	additional = result.add(Type.INPROCEEDINGS);
395	additional.setValue(Field.AUTHOR, "Igor Kononenko");
396	additional.setValue(Field.TITLE, "On Biases in Estimating Multi-Valued Attributes");
397	additional.setValue(Field.BOOKTITLE, "14th International Joint Conference on Articial Intelligence");
398	additional.setValue(Field.YEAR, "1995");
399	additional.setValue(Field.PAGES, "1034-1040");
400	additional.setValue(Field.PS, "http://ai.fri.uni-lj.si/papers/kononenko95-ijcai.ps.gz");
401
402	return result;
403	}
404
405	/**
406	* Returns the tip text for this property
407	*
408	* @return tip text for this property suitable for
409	* displaying in the explorer/experimenter gui
410	*/
411	public String makeBinaryTipText() {
412
413	return "Make resulting attributes binary.";
414	}
415
416	/**
417	* Gets whether binary attributes should be made for discretized ones.
418	*
419	* @return true if attributes will be binarized
420	*/
421	public boolean getMakeBinary() {
422
423	return m_MakeBinary;
424	}
425
426	/**
427	* Sets whether binary attributes should be made for discretized ones.
428	*
429	* @param makeBinary if binary attributes are to be made
430	*/
431	public void setMakeBinary(boolean makeBinary) {
432
433	m_MakeBinary = makeBinary;
434	}
435
436	/**
437	* Returns the tip text for this property
438	*
439	* @return tip text for this property suitable for
440	* displaying in the explorer/experimenter gui
441	*/
442	public String useKononenkoTipText() {
443
444	return "Use Kononenko's MDL criterion. If set to false"
445	+ " uses the Fayyad & Irani criterion.";
446	}
447
448	/**
449	* Gets whether Kononenko's MDL criterion is to be used.
450	*
451	* @return true if Kononenko's criterion will be used.
452	*/
453	public boolean getUseKononenko() {
454
455	return m_UseKononenko;
456	}
457
458	/**
459	* Sets whether Kononenko's MDL criterion is to be used.
460	*
461	* @param useKon true if Kononenko's one is to be used
462	*/
463	public void setUseKononenko(boolean useKon) {
464
465	m_UseKononenko = useKon;
466	}
467
468	/**
469	* Returns the tip text for this property
470	*
471	* @return tip text for this property suitable for
472	* displaying in the explorer/experimenter gui
473	*/
474	public String useBetterEncodingTipText() {
475
476	return "Uses a more efficient split point encoding.";
477	}
478
479	/**
480	* Gets whether better encoding is to be used for MDL.
481	*
482	* @return true if the better MDL encoding will be used
483	*/
484	public boolean getUseBetterEncoding() {
485
486	return m_UseBetterEncoding;
487	}
488
489	/**
490	* Sets whether better encoding is to be used for MDL.
491	*
492	* @param useBetterEncoding true if better encoding to be used.
493	*/
494	public void setUseBetterEncoding(boolean useBetterEncoding) {
495
496	m_UseBetterEncoding = useBetterEncoding;
497	}
498
499	/**
500	* Returns the tip text for this property
501	*
502	* @return tip text for this property suitable for
503	* displaying in the explorer/experimenter gui
504	*/
505	public String invertSelectionTipText() {
506
507	return "Set attribute selection mode. If false, only selected"
508	+ " (numeric) attributes in the range will be discretized; if"
509	+ " true, only non-selected attributes will be discretized.";
510	}
511
512	/**
513	* Gets whether the supplied columns are to be removed or kept
514	*
515	* @return true if the supplied columns will be kept
516	*/
517	public boolean getInvertSelection() {
518
519	return m_DiscretizeCols.getInvert();
520	}
521
522	/**
523	* Sets whether selected columns should be removed or kept. If true the
524	* selected columns are kept and unselected columns are deleted. If false
525	* selected columns are deleted and unselected columns are kept.
526	*
527	* @param invert the new invert setting
528	*/
529	public void setInvertSelection(boolean invert) {
530
531	m_DiscretizeCols.setInvert(invert);
532	}
533
534	/**
535	* Returns the tip text for this property
536	*
537	* @return tip text for this property suitable for
538	* displaying in the explorer/experimenter gui
539	*/
540	public String attributeIndicesTipText() {
541	return "Specify range of attributes to act on."
542	+ " This is a comma separated list of attribute indices, with"
543	+ " \"first\" and \"last\" valid values. Specify an inclusive"
544	+ " range with \"-\". E.g: \"first-3,5,6-10,last\".";
545	}
546
547	/**
548	* Gets the current range selection
549	*
550	* @return a string containing a comma separated list of ranges
551	*/
552	public String getAttributeIndices() {
553
554	return m_DiscretizeCols.getRanges();
555	}
556
557	/**
558	* Sets which attributes are to be Discretized (only numeric
559	* attributes among the selection will be Discretized).
560	*
561	* @param rangeList a string representing the list of attributes. Since
562	* the string will typically come from a user, attributes are indexed from
563	* 1. <br>
564	* eg: first-3,5,6-last
565	* @throws IllegalArgumentException if an invalid range list is supplied
566	*/
567	public void setAttributeIndices(String rangeList) {
568
569	m_DiscretizeCols.setRanges(rangeList);
570	}
571
572	/**
573	* Sets which attributes are to be Discretized (only numeric
574	* attributes among the selection will be Discretized).
575	*
576	* @param attributes an array containing indexes of attributes to Discretize.
577	* Since the array will typically come from a program, attributes are indexed
578	* from 0.
579	* @throws IllegalArgumentException if an invalid set of ranges
580	* is supplied
581	*/
582	public void setAttributeIndicesArray(int [] attributes) {
583
584	setAttributeIndices(Range.indicesToRangeList(attributes));
585	}
586
587	/**
588	* Gets the cut points for an attribute
589	*
590	* @param attributeIndex the index (from 0) of the attribute to get the cut points of
591	* @return an array containing the cutpoints (or null if the
592	* attribute requested isn't being Discretized
593	*/
594	public double [] getCutPoints(int attributeIndex) {
595
596	if (m_CutPoints == null) {
597	return null;
598	}
599	return m_CutPoints[attributeIndex];
600	}
601
602	/** Generate the cutpoints for each attribute */
603	protected void calculateCutPoints() {
604
605	Instances copy = null;
606
607	m_CutPoints = new double [getInputFormat().numAttributes()] [];
608	for(int i = getInputFormat().numAttributes() - 1; i >= 0; i--) {
609	if ((m_DiscretizeCols.isInRange(i)) &&
610	(getInputFormat().attribute(i).isNumeric())) {
611
612	// Use copy to preserve order
613	if (copy == null) {
614	copy = new Instances(getInputFormat());
615	}
616	calculateCutPointsByMDL(i, copy);
617	}
618	}
619	}
620
621	/**
622	* Set cutpoints for a single attribute using MDL.
623	*
624	* @param index the index of the attribute to set cutpoints for
625	* @param data the data to work with
626	*/
627	protected void calculateCutPointsByMDL(int index,
628	Instances data) {
629
630	// Sort instances
631	data.sort(data.attribute(index));
632
633	// Find first instances that's missing
634	int firstMissing = data.numInstances();
635	for (int i = 0; i < data.numInstances(); i++) {
636	if (data.instance(i).isMissing(index)) {
637	firstMissing = i;
638	break;
639	}
640	}
641	m_CutPoints[index] = cutPointsForSubset(data, index, 0, firstMissing);
642	}
643
644	/**
645	* Test using Kononenko's MDL criterion.
646	*
647	* @param priorCounts
648	* @param bestCounts
649	* @param numInstances
650	* @param numCutPoints
651	* @return true if the split is acceptable
652	*/
653	private boolean KononenkosMDL(double[] priorCounts,
654	double[][] bestCounts,
655	double numInstances,
656	int numCutPoints) {
657
658	double distPrior, instPrior, distAfter = 0, sum, instAfter = 0;
659	double before, after;
660	int numClassesTotal;
661
662	// Number of classes occuring in the set
663	numClassesTotal = 0;
664	for (int i = 0; i < priorCounts.length; i++) {
665	if (priorCounts[i] > 0) {
666	numClassesTotal++;
667	}
668	}
669
670	// Encode distribution prior to split
671	distPrior = SpecialFunctions.log2Binomial(numInstances
672	+ numClassesTotal - 1,
673	numClassesTotal - 1);
674
675	// Encode instances prior to split.
676	instPrior = SpecialFunctions.log2Multinomial(numInstances,
677	priorCounts);
678
679	before = instPrior + distPrior;
680
681	// Encode distributions and instances after split.
682	for (int i = 0; i < bestCounts.length; i++) {
683	sum = Utils.sum(bestCounts[i]);
684	distAfter += SpecialFunctions.log2Binomial(sum + numClassesTotal - 1,
685	numClassesTotal - 1);
686	instAfter += SpecialFunctions.log2Multinomial(sum,
687	bestCounts[i]);
688	}
689
690	// Coding cost after split
691	after = Utils.log2(numCutPoints) + distAfter + instAfter;
692
693	// Check if split is to be accepted
694	return (before > after);
695	}
696
697
698	/**
699	* Test using Fayyad and Irani's MDL criterion.
700	*
701	* @param priorCounts
702	* @param bestCounts
703	* @param numInstances
704	* @param numCutPoints
705	* @return true if the splits is acceptable
706	*/
707	private boolean FayyadAndIranisMDL(double[] priorCounts,
708	double[][] bestCounts,
709	double numInstances,
710	int numCutPoints) {
711
712	double priorEntropy, entropy, gain;
713	double entropyLeft, entropyRight, delta;
714	int numClassesTotal, numClassesRight, numClassesLeft;
715
716	// Compute entropy before split.
717	priorEntropy = ContingencyTables.entropy(priorCounts);
718
719	// Compute entropy after split.
720	entropy = ContingencyTables.entropyConditionedOnRows(bestCounts);
721
722	// Compute information gain.
723	gain = priorEntropy - entropy;
724
725	// Number of classes occuring in the set
726	numClassesTotal = 0;
727	for (int i = 0; i < priorCounts.length; i++) {
728	if (priorCounts[i] > 0) {
729	numClassesTotal++;
730	}
731	}
732
733	// Number of classes occuring in the left subset
734	numClassesLeft = 0;
735	for (int i = 0; i < bestCounts[0].length; i++) {
736	if (bestCounts[0][i] > 0) {
737	numClassesLeft++;
738	}
739	}
740
741	// Number of classes occuring in the right subset
742	numClassesRight = 0;
743	for (int i = 0; i < bestCounts[1].length; i++) {
744	if (bestCounts[1][i] > 0) {
745	numClassesRight++;
746	}
747	}
748
749	// Entropy of the left and the right subsets
750	entropyLeft = ContingencyTables.entropy(bestCounts[0]);
751	entropyRight = ContingencyTables.entropy(bestCounts[1]);
752
753	// Compute terms for MDL formula
754	delta = Utils.log2(Math.pow(3, numClassesTotal) - 2) -
755	(((double) numClassesTotal * priorEntropy) -
756	(numClassesRight * entropyRight) -
757	(numClassesLeft * entropyLeft));
758
759	// Check if split is to be accepted
760	return (gain > (Utils.log2(numCutPoints) + delta) / (double)numInstances);
761	}
762
763
764	/**
765	* Selects cutpoints for sorted subset.
766	*
767	* @param instances
768	* @param attIndex
769	* @param first
770	* @param lastPlusOne
771	* @return
772	*/
773	private double[] cutPointsForSubset(Instances instances, int attIndex,
774	int first, int lastPlusOne) {
775
776	double[][] counts, bestCounts;
777	double[] priorCounts, left, right, cutPoints;
778	double currentCutPoint = -Double.MAX_VALUE, bestCutPoint = -1,
779	currentEntropy, bestEntropy, priorEntropy, gain;
780	int bestIndex = -1, numInstances = 0, numCutPoints = 0;
781
782	// Compute number of instances in set
783	if ((lastPlusOne - first) < 2) {
784	return null;
785	}
786
787	// Compute class counts.
788	counts = new double[2][instances.numClasses()];
789	for (int i = first; i < lastPlusOne; i++) {
790	numInstances += instances.instance(i).weight();
791	counts[1][(int)instances.instance(i).classValue()] +=
792	instances.instance(i).weight();
793	}
794
795	// Save prior counts
796	priorCounts = new double[instances.numClasses()];
797	System.arraycopy(counts[1], 0, priorCounts, 0,
798	instances.numClasses());
799
800	// Entropy of the full set
801	priorEntropy = ContingencyTables.entropy(priorCounts);
802	bestEntropy = priorEntropy;
803
804	// Find best entropy.
805	bestCounts = new double[2][instances.numClasses()];
806	for (int i = first; i < (lastPlusOne - 1); i++) {
807	counts[0][(int)instances.instance(i).classValue()] +=
808	instances.instance(i).weight();
809	counts[1][(int)instances.instance(i).classValue()] -=
810	instances.instance(i).weight();
811	if (instances.instance(i).value(attIndex) <
812	instances.instance(i + 1).value(attIndex)) {
813	currentCutPoint = (instances.instance(i).value(attIndex) +
814	instances.instance(i + 1).value(attIndex)) / 2.0;
815	currentEntropy = ContingencyTables.entropyConditionedOnRows(counts);
816	if (currentEntropy < bestEntropy) {
817	bestCutPoint = currentCutPoint;
818	bestEntropy = currentEntropy;
819	bestIndex = i;
820	System.arraycopy(counts[0], 0,
821	bestCounts[0], 0, instances.numClasses());
822	System.arraycopy(counts[1], 0,
823	bestCounts[1], 0, instances.numClasses());
824	}
825	numCutPoints++;
826	}
827	}
828
829	// Use worse encoding?
830	if (!m_UseBetterEncoding) {
831	numCutPoints = (lastPlusOne - first) - 1;
832	}
833
834	// Checks if gain is zero
835	gain = priorEntropy - bestEntropy;
836	if (gain <= 0) {
837	return null;
838	}
839
840	// Check if split is to be accepted
841	if ((m_UseKononenko && KononenkosMDL(priorCounts, bestCounts,
842	numInstances, numCutPoints)) \|\|
843	(!m_UseKononenko && FayyadAndIranisMDL(priorCounts, bestCounts,
844	numInstances, numCutPoints))) {
845
846	// Select split points for the left and right subsets
847	left = cutPointsForSubset(instances, attIndex, first, bestIndex + 1);
848	right = cutPointsForSubset(instances, attIndex,
849	bestIndex + 1, lastPlusOne);
850
851	// Merge cutpoints and return them
852	if ((left == null) && (right) == null) {
853	cutPoints = new double[1];
854	cutPoints[0] = bestCutPoint;
855	} else if (right == null) {
856	cutPoints = new double[left.length + 1];
857	System.arraycopy(left, 0, cutPoints, 0, left.length);
858	cutPoints[left.length] = bestCutPoint;
859	} else if (left == null) {
860	cutPoints = new double[1 + right.length];
861	cutPoints[0] = bestCutPoint;
862	System.arraycopy(right, 0, cutPoints, 1, right.length);
863	} else {
864	cutPoints = new double[left.length + right.length + 1];
865	System.arraycopy(left, 0, cutPoints, 0, left.length);
866	cutPoints[left.length] = bestCutPoint;
867	System.arraycopy(right, 0, cutPoints, left.length + 1, right.length);
868	}
869
870	return cutPoints;
871	} else
872	return null;
873	}
874
875	/**
876	* Set the output format. Takes the currently defined cutpoints and
877	* m_InputFormat and calls setOutputFormat(Instances) appropriately.
878	*/
879	protected void setOutputFormat() {
880
881	if (m_CutPoints == null) {
882	setOutputFormat(null);
883	return;
884	}
885	FastVector attributes = new FastVector(getInputFormat().numAttributes());
886	int classIndex = getInputFormat().classIndex();
887	for(int i = 0; i < getInputFormat().numAttributes(); i++) {
888	if ((m_DiscretizeCols.isInRange(i))
889	&& (getInputFormat().attribute(i).isNumeric())) {
890	if (!m_MakeBinary) {
891	FastVector attribValues = new FastVector(1);
892	if (m_CutPoints[i] == null) {
893	attribValues.addElement("'All'");
894	} else {
895	for(int j = 0; j <= m_CutPoints[i].length; j++) {
896	if (j == 0) {
897	attribValues.addElement("'(-inf-"
898	+ Utils.doubleToString(m_CutPoints[i][j], 6) + "]'");
899	} else if (j == m_CutPoints[i].length) {
900	attribValues.addElement("'("
901	+ Utils.doubleToString(m_CutPoints[i][j - 1], 6)
902	+ "-inf)'");
903	} else {
904	attribValues.addElement("'("
905	+ Utils.doubleToString(m_CutPoints[i][j - 1], 6) + "-"
906	+ Utils.doubleToString(m_CutPoints[i][j], 6) + "]'");
907	}
908	}
909	}
910	attributes.addElement(new Attribute(getInputFormat().
911	attribute(i).name(),
912	attribValues));
913	} else {
914	if (m_CutPoints[i] == null) {
915	FastVector attribValues = new FastVector(1);
916	attribValues.addElement("'All'");
917	attributes.addElement(new Attribute(getInputFormat().
918	attribute(i).name(),
919	attribValues));
920	} else {
921	if (i < getInputFormat().classIndex()) {
922	classIndex += m_CutPoints[i].length - 1;
923	}
924	for(int j = 0; j < m_CutPoints[i].length; j++) {
925	FastVector attribValues = new FastVector(2);
926	attribValues.addElement("'(-inf-"
927	+ Utils.doubleToString(m_CutPoints[i][j], 6) + "]'");
928	attribValues.addElement("'("
929	+ Utils.doubleToString(m_CutPoints[i][j], 6) + "-inf)'");
930	attributes.addElement(new Attribute(getInputFormat().
931	attribute(i).name(),
932	attribValues));
933	}
934	}
935	}
936	} else {
937	attributes.addElement(getInputFormat().attribute(i).copy());
938	}
939	}
940	Instances outputFormat =
941	new Instances(getInputFormat().relationName(), attributes, 0);
942	outputFormat.setClassIndex(classIndex);
943	setOutputFormat(outputFormat);
944	}
945
946	/**
947	* Convert a single instance over. The converted instance is added to
948	* the end of the output queue.
949	*
950	* @param instance the instance to convert
951	*/
952	protected void convertInstance(Instance instance) {
953
954	int index = 0;
955	double [] vals = new double [outputFormatPeek().numAttributes()];
956	// Copy and convert the values
957	for(int i = 0; i < getInputFormat().numAttributes(); i++) {
958	if (m_DiscretizeCols.isInRange(i) &&
959	getInputFormat().attribute(i).isNumeric()) {
960	int j;
961	double currentVal = instance.value(i);
962	if (m_CutPoints[i] == null) {
963	if (instance.isMissing(i)) {
964	vals[index] = Utils.missingValue();
965	} else {
966	vals[index] = 0;
967	}
968	index++;
969	} else {
970	if (!m_MakeBinary) {
971	if (instance.isMissing(i)) {
972	vals[index] = Utils.missingValue();
973	} else {
974	for (j = 0; j < m_CutPoints[i].length; j++) {
975	if (currentVal <= m_CutPoints[i][j]) {
976	break;
977	}
978	}
979	vals[index] = j;
980	}
981	index++;
982	} else {
983	for (j = 0; j < m_CutPoints[i].length; j++) {
984	if (instance.isMissing(i)) {
985	vals[index] = Utils.missingValue();
986	} else if (currentVal <= m_CutPoints[i][j]) {
987	vals[index] = 0;
988	} else {
989	vals[index] = 1;
990	}
991	index++;
992	}
993	}
994	}
995	} else {
996	vals[index] = instance.value(i);
997	index++;
998	}
999	}
1000
1001	Instance inst = null;
1002	if (instance instanceof SparseInstance) {
1003	inst = new SparseInstance(instance.weight(), vals);
1004	} else {
1005	inst = new DenseInstance(instance.weight(), vals);
1006	}
1007	inst.setDataset(getOutputFormat());
1008	copyValues(inst, false, instance.dataset(), getOutputFormat());
1009	inst.setDataset(getOutputFormat());
1010	push(inst);
1011	}
1012
1013	/**
1014	* Returns the revision string.
1015	*
1016	* @return the revision
1017	*/
1018	public String getRevision() {
1019	return RevisionUtils.extract("$Revision: 5987 $");
1020	}
1021
1022	/**
1023	* Main method for testing this class.
1024	*
1025	* @param argv should contain arguments to the filter: use -h for help
1026	*/
1027	public static void main(String [] argv) {
1028	runFilter(new Discretize(), argv);
1029	}
1030	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: branches/MetisMQI/src/main/java/weka/filters/supervised/attribute/Discretize.java

Download in other formats: