1 | /* |
---|
2 | * This program is free software; you can redistribute it and/or modify |
---|
3 | * it under the terms of the GNU General Public License as published by |
---|
4 | * the Free Software Foundation; either version 2 of the License, or |
---|
5 | * (at your option) any later version. |
---|
6 | * |
---|
7 | * This program is distributed in the hope that it will be useful, |
---|
8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
10 | * GNU General Public License for more details. |
---|
11 | * |
---|
12 | * You should have received a copy of the GNU General Public License |
---|
13 | * along with this program; if not, write to the Free Software |
---|
14 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
---|
15 | */ |
---|
16 | |
---|
17 | /* |
---|
18 | * Instances.java |
---|
19 | * Copyright (C) 1999 University of Waikato, Hamilton, New Zealand |
---|
20 | * |
---|
21 | */ |
---|
22 | |
---|
23 | package weka.core; |
---|
24 | |
---|
25 | import weka.core.converters.ArffLoader.ArffReader; |
---|
26 | import weka.core.converters.ConverterUtils.DataSource; |
---|
27 | |
---|
28 | import java.io.FileReader; |
---|
29 | import java.io.IOException; |
---|
30 | import java.io.Reader; |
---|
31 | import java.io.Serializable; |
---|
32 | import java.util.Enumeration; |
---|
33 | import java.util.Random; |
---|
34 | import java.util.List; |
---|
35 | import java.util.AbstractList; |
---|
36 | import java.util.ArrayList; |
---|
37 | |
---|
38 | /** |
---|
39 | * Class for handling an ordered set of weighted instances. <p> |
---|
40 | * |
---|
41 | * Typical usage: <p> |
---|
42 | * <pre> |
---|
43 | * import weka.core.converters.ConverterUtils.DataSource; |
---|
44 | * ... |
---|
45 | * |
---|
46 | * // Read all the instances in the file (ARFF, CSV, XRFF, ...) |
---|
47 | * DataSource source = new DataSource(filename); |
---|
48 | * Instances instances = source.getDataSet(); |
---|
49 | * |
---|
50 | * // Make the last attribute be the class |
---|
51 | * instances.setClassIndex(instances.numAttributes() - 1); |
---|
52 | * |
---|
53 | * // Print header and instances. |
---|
54 | * System.out.println("\nDataset:\n"); |
---|
55 | * System.out.println(instances); |
---|
56 | * |
---|
57 | * ... |
---|
58 | * </pre><p> |
---|
59 | * |
---|
60 | * All methods that change a set of instances are safe, ie. a change |
---|
61 | * of a set of instances does not affect any other sets of |
---|
62 | * instances. All methods that change a dataset's attribute |
---|
63 | * information clone the dataset before it is changed. |
---|
64 | * |
---|
65 | * @author Eibe Frank (eibe@cs.waikato.ac.nz) |
---|
66 | * @author Len Trigg (trigg@cs.waikato.ac.nz) |
---|
67 | * @author FracPete (fracpete at waikato dot ac dot nz) |
---|
68 | * @version $Revision: 5987 $ |
---|
69 | */ |
---|
70 | public class Instances extends AbstractList<Instance> |
---|
71 | implements Serializable, RevisionHandler { |
---|
72 | |
---|
73 | /** Serial version UID used for serialization. */ |
---|
74 | static final long serialVersionUID = -19412345060742748L; |
---|
75 | |
---|
76 | /** The filename extension that should be used for ARFF files. */ |
---|
77 | public final static String FILE_EXTENSION = ".arff"; |
---|
78 | |
---|
79 | /** The filename extension that should be used for binary serialized instances files. */ |
---|
80 | public final static String SERIALIZED_OBJ_FILE_EXTENSION = ".bsi"; |
---|
81 | |
---|
82 | /** The keyword used to denote the start of an ARFF header. */ |
---|
83 | public final static String ARFF_RELATION = "@relation"; |
---|
84 | |
---|
85 | /** The keyword used to denote the start of the ARFF data section. */ |
---|
86 | public final static String ARFF_DATA = "@data"; |
---|
87 | |
---|
88 | /** The dataset's (relation) name. */ |
---|
89 | protected /*@spec_public non_null@*/ String m_RelationName; |
---|
90 | |
---|
91 | /** The attribute information, one entry per attribute. */ |
---|
92 | protected /*@spec_public non_null@*/ ArrayList<Attribute> m_Attributes; |
---|
93 | /* public invariant (\forall int i; 0 <= i && i < m_Attributes.size(); |
---|
94 | m_Attributes.get(i) != null); |
---|
95 | */ |
---|
96 | |
---|
97 | /** The instances held by this dataset, in order. */ |
---|
98 | protected /*@spec_public non_null@*/ ArrayList<Instance> m_Instances; |
---|
99 | |
---|
100 | /** The class attribute's index; negative if the class is not set. */ |
---|
101 | protected int m_ClassIndex; |
---|
102 | //@ protected invariant classIndex() == m_ClassIndex; |
---|
103 | |
---|
104 | /** The lines read so far in case of incremental loading. Since the |
---|
105 | * StreamTokenizer will be re-initialized with every instance that is read, |
---|
106 | * we have to keep track of the number of lines read so far. |
---|
107 | * @see #readInstance(Reader) */ |
---|
108 | protected int m_Lines = 0; |
---|
109 | |
---|
110 | /** |
---|
111 | * Reads an ARFF file from a reader, and assigns a weight of |
---|
112 | * one to each instance. Lets the index of the class |
---|
113 | * attribute be undefined (negative). |
---|
114 | * |
---|
115 | * @param reader the reader |
---|
116 | * @throws IOException if the ARFF file is not read |
---|
117 | * successfully |
---|
118 | */ |
---|
119 | public Instances(/*@non_null@*/Reader reader) throws IOException { |
---|
120 | ArffReader arff = new ArffReader(reader); |
---|
121 | Instances dataset = arff.getData(); |
---|
122 | initialize(dataset, dataset.numInstances()); |
---|
123 | dataset.copyInstances(0, this, dataset.numInstances()); |
---|
124 | compactify(); |
---|
125 | } |
---|
126 | |
---|
127 | /** |
---|
128 | * Reads the header of an ARFF file from a reader and |
---|
129 | * reserves space for the given number of instances. Lets |
---|
130 | * the class index be undefined (negative). |
---|
131 | * |
---|
132 | * @param reader the reader |
---|
133 | * @param capacity the capacity |
---|
134 | * @throws IllegalArgumentException if the header is not read successfully |
---|
135 | * or the capacity is negative. |
---|
136 | * @throws IOException if there is a problem with the reader. |
---|
137 | * @deprecated instead of using this method in conjunction with the |
---|
138 | * <code>readInstance(Reader)</code> method, one should use the |
---|
139 | * <code>ArffLoader</code> or <code>DataSource</code> class instead. |
---|
140 | * @see weka.core.converters.ArffLoader |
---|
141 | * @see weka.core.converters.ConverterUtils.DataSource |
---|
142 | */ |
---|
143 | //@ requires capacity >= 0; |
---|
144 | //@ ensures classIndex() == -1; |
---|
145 | @Deprecated public Instances(/*@non_null@*/Reader reader, int capacity) |
---|
146 | throws IOException { |
---|
147 | |
---|
148 | ArffReader arff = new ArffReader(reader, 0); |
---|
149 | Instances header = arff.getStructure(); |
---|
150 | initialize(header, capacity); |
---|
151 | m_Lines = arff.getLineNo(); |
---|
152 | } |
---|
153 | |
---|
154 | /** |
---|
155 | * Constructor copying all instances and references to |
---|
156 | * the header information from the given set of instances. |
---|
157 | * |
---|
158 | * @param dataset the set to be copied |
---|
159 | */ |
---|
160 | public Instances(/*@non_null@*/Instances dataset) { |
---|
161 | |
---|
162 | this(dataset, dataset.numInstances()); |
---|
163 | |
---|
164 | dataset.copyInstances(0, this, dataset.numInstances()); |
---|
165 | } |
---|
166 | |
---|
/**
 * Creates an empty set of instances that shares (by reference) the header
 * information of the given dataset. All work is delegated to
 * {@link #initialize(Instances, int)}, which uses a capacity of 0 when the
 * requested capacity is negative.
 *
 * @param dataset the instances from which the header
 * information is to be taken
 * @param capacity the capacity of the new dataset
 */
public Instances(/*@non_null@*/Instances dataset, int capacity) {
  this.initialize(dataset, capacity);
}
---|
179 | |
---|
180 | /** |
---|
181 | * initializes with the header information of the given dataset and sets |
---|
182 | * the capacity of the set of instances. |
---|
183 | * |
---|
184 | * @param dataset the dataset to use as template |
---|
185 | * @param capacity the number of rows to reserve |
---|
186 | */ |
---|
187 | protected void initialize(Instances dataset, int capacity) { |
---|
188 | if (capacity < 0) |
---|
189 | capacity = 0; |
---|
190 | |
---|
191 | // Strings only have to be "shallow" copied because |
---|
192 | // they can't be modified. |
---|
193 | m_ClassIndex = dataset.m_ClassIndex; |
---|
194 | m_RelationName = dataset.m_RelationName; |
---|
195 | m_Attributes = dataset.m_Attributes; |
---|
196 | m_Instances = new ArrayList<Instance>(capacity); |
---|
197 | } |
---|
198 | |
---|
199 | /** |
---|
200 | * Creates a new set of instances by copying a |
---|
201 | * subset of another set. |
---|
202 | * |
---|
203 | * @param source the set of instances from which a subset |
---|
204 | * is to be created |
---|
205 | * @param first the index of the first instance to be copied |
---|
206 | * @param toCopy the number of instances to be copied |
---|
207 | * @throws IllegalArgumentException if first and toCopy are out of range |
---|
208 | */ |
---|
209 | //@ requires 0 <= first; |
---|
210 | //@ requires 0 <= toCopy; |
---|
211 | //@ requires first + toCopy <= source.numInstances(); |
---|
212 | public Instances(/*@non_null@*/Instances source, int first, int toCopy) { |
---|
213 | |
---|
214 | this(source, toCopy); |
---|
215 | |
---|
216 | if ((first < 0) || ((first + toCopy) > source.numInstances())) { |
---|
217 | throw new IllegalArgumentException("Parameters first and/or toCopy out "+ |
---|
218 | "of range"); |
---|
219 | } |
---|
220 | source.copyInstances(first, this, toCopy); |
---|
221 | } |
---|
222 | |
---|
/**
 * Creates an empty set of instances. Uses the given
 * attribute information (by reference) and assigns each attribute its
 * position in the list as index. The class index is left undefined (-1).
 * Sets the capacity of the set of instances to 0 if it is negative.
 * Given attribute information must not be changed after this constructor
 * has been used.
 *
 * @param name the name of the relation
 * @param attInfo the attribute information
 * @param capacity the capacity of the set
 */
public Instances(/*@non_null@*/String name,
                 /*@non_null@*/ArrayList<Attribute> attInfo, int capacity) {

  m_RelationName = name;
  m_ClassIndex = -1;
  m_Attributes = attInfo;
  for (int i = 0; i < numAttributes(); i++) {
    attribute(i).setIndex(i);
  }
  // ArrayList rejects a negative initial capacity with an
  // IllegalArgumentException, so clamp it as documented.
  m_Instances = new ArrayList<Instance>(Math.max(capacity, 0));
}
---|
244 | |
---|
245 | /** |
---|
246 | * Create a copy of the structure if the data has string or |
---|
247 | * relational attributes, "cleanses" string types (i.e. doesn't |
---|
248 | * contain references to the strings seen in the past) and all |
---|
249 | * relational attributes. |
---|
250 | * |
---|
251 | * @return a copy of the instance structure. |
---|
252 | */ |
---|
253 | public Instances stringFreeStructure() { |
---|
254 | |
---|
255 | ArrayList<Attribute> newAtts = new ArrayList<Attribute>(); |
---|
256 | for (int i = 0 ; i < m_Attributes.size(); i++) { |
---|
257 | Attribute att = (Attribute)m_Attributes.get(i); |
---|
258 | if (att.type() == Attribute.STRING) { |
---|
259 | newAtts.add(new Attribute(att.name(), (List<String>)null, i)); |
---|
260 | } else if (att.type() == Attribute.RELATIONAL) { |
---|
261 | newAtts.add(new Attribute(att.name(), new Instances(att.relation(), 0), i)); |
---|
262 | } |
---|
263 | } |
---|
264 | if (newAtts.size() == 0) { |
---|
265 | return new Instances(this, 0); |
---|
266 | } |
---|
267 | ArrayList<Attribute> atts = Utils.cast(m_Attributes.clone()); |
---|
268 | for (int i = 0; i < newAtts.size(); i++) { |
---|
269 | atts.set(((Attribute)newAtts.get(i)).index(), newAtts.get(i)); |
---|
270 | } |
---|
271 | Instances result = new Instances(this, 0); |
---|
272 | result.m_Attributes = atts; |
---|
273 | return result; |
---|
274 | } |
---|
275 | |
---|
276 | /** |
---|
277 | * Adds one instance to the end of the set. |
---|
278 | * Shallow copies instance before it is added. Increases the |
---|
279 | * size of the dataset if it is not large enough. Does not |
---|
280 | * check if the instance is compatible with the dataset. |
---|
281 | * Note: String or relational values are not transferred. |
---|
282 | * |
---|
283 | * @param instance the instance to be added |
---|
284 | */ |
---|
285 | public boolean add(/*@non_null@*/ Instance instance) { |
---|
286 | |
---|
287 | Instance newInstance = (Instance)instance.copy(); |
---|
288 | |
---|
289 | newInstance.setDataset(this); |
---|
290 | m_Instances.add(newInstance); |
---|
291 | |
---|
292 | return true; |
---|
293 | } |
---|
294 | |
---|
295 | /** |
---|
296 | * Adds one instance to the end of the set. |
---|
297 | * Shallow copies instance before it is added. Increases the |
---|
298 | * size of the dataset if it is not large enough. Does not |
---|
299 | * check if the instance is compatible with the dataset. |
---|
300 | * Note: String or relational values are not transferred. |
---|
301 | * |
---|
302 | * @param index position where instance is to be inserted |
---|
303 | * @param instance the instance to be added |
---|
304 | */ |
---|
305 | //@ requires 0 <= index; |
---|
306 | //@ requires index < m_Instances.size(); |
---|
307 | public void add(int index, /*@non_null@*/ Instance instance) { |
---|
308 | |
---|
309 | Instance newInstance = (Instance)instance.copy(); |
---|
310 | |
---|
311 | newInstance.setDataset(this); |
---|
312 | m_Instances.add(index, newInstance); |
---|
313 | } |
---|
314 | |
---|
315 | /** |
---|
316 | * Returns an attribute. |
---|
317 | * |
---|
318 | * @param index the attribute's index (index starts with 0) |
---|
319 | * @return the attribute at the given position |
---|
320 | */ |
---|
321 | //@ requires 0 <= index; |
---|
322 | //@ requires index < m_Attributes.size(); |
---|
323 | //@ ensures \result != null; |
---|
324 | public /*@pure@*/ Attribute attribute(int index) { |
---|
325 | |
---|
326 | return (Attribute) m_Attributes.get(index); |
---|
327 | } |
---|
328 | |
---|
329 | /** |
---|
330 | * Returns an attribute given its name. If there is more than |
---|
331 | * one attribute with the same name, it returns the first one. |
---|
332 | * Returns null if the attribute can't be found. |
---|
333 | * |
---|
334 | * @param name the attribute's name |
---|
335 | * @return the attribute with the given name, null if the |
---|
336 | * attribute can't be found |
---|
337 | */ |
---|
338 | public /*@pure@*/ Attribute attribute(String name) { |
---|
339 | |
---|
340 | for (int i = 0; i < numAttributes(); i++) { |
---|
341 | if (attribute(i).name().equals(name)) { |
---|
342 | return attribute(i); |
---|
343 | } |
---|
344 | } |
---|
345 | return null; |
---|
346 | } |
---|
347 | |
---|
348 | /** |
---|
349 | * Checks for attributes of the given type in the dataset |
---|
350 | * |
---|
351 | * @param attType the attribute type to look for |
---|
352 | * @return true if attributes of the given type are present |
---|
353 | */ |
---|
354 | public boolean checkForAttributeType(int attType) { |
---|
355 | |
---|
356 | int i = 0; |
---|
357 | |
---|
358 | while (i < m_Attributes.size()) { |
---|
359 | if (attribute(i++).type() == attType) { |
---|
360 | return true; |
---|
361 | } |
---|
362 | } |
---|
363 | return false; |
---|
364 | } |
---|
365 | |
---|
366 | /** |
---|
367 | * Checks for string attributes in the dataset |
---|
368 | * |
---|
369 | * @return true if string attributes are present, false otherwise |
---|
370 | */ |
---|
371 | public /*@pure@*/ boolean checkForStringAttributes() { |
---|
372 | return checkForAttributeType(Attribute.STRING); |
---|
373 | } |
---|
374 | |
---|
375 | /** |
---|
376 | * Checks if the given instance is compatible |
---|
377 | * with this dataset. Only looks at the size of |
---|
378 | * the instance and the ranges of the values for |
---|
379 | * nominal and string attributes. |
---|
380 | * |
---|
381 | * @param instance the instance to check |
---|
382 | * @return true if the instance is compatible with the dataset |
---|
383 | */ |
---|
384 | public /*@pure@*/ boolean checkInstance(Instance instance) { |
---|
385 | |
---|
386 | if (instance.numAttributes() != numAttributes()) { |
---|
387 | return false; |
---|
388 | } |
---|
389 | for (int i = 0; i < numAttributes(); i++) { |
---|
390 | if (instance.isMissing(i)) { |
---|
391 | continue; |
---|
392 | } else if (attribute(i).isNominal() || |
---|
393 | attribute(i).isString()) { |
---|
394 | if (!(Utils.eq(instance.value(i), |
---|
395 | (double)(int)instance.value(i)))) { |
---|
396 | return false; |
---|
397 | } else if (Utils.sm(instance.value(i), 0) || |
---|
398 | Utils.gr(instance.value(i), |
---|
399 | attribute(i).numValues())) { |
---|
400 | return false; |
---|
401 | } |
---|
402 | } |
---|
403 | } |
---|
404 | return true; |
---|
405 | } |
---|
406 | |
---|
407 | /** |
---|
408 | * Returns the class attribute. |
---|
409 | * |
---|
410 | * @return the class attribute |
---|
411 | * @throws UnassignedClassException if the class is not set |
---|
412 | */ |
---|
413 | //@ requires classIndex() >= 0; |
---|
414 | public /*@pure@*/ Attribute classAttribute() { |
---|
415 | |
---|
416 | if (m_ClassIndex < 0) { |
---|
417 | throw new UnassignedClassException("Class index is negative (not set)!"); |
---|
418 | } |
---|
419 | return attribute(m_ClassIndex); |
---|
420 | } |
---|
421 | |
---|
422 | /** |
---|
423 | * Returns the class attribute's index. Returns negative number |
---|
424 | * if it's undefined. |
---|
425 | * |
---|
426 | * @return the class index as an integer |
---|
427 | */ |
---|
428 | // ensures \result == m_ClassIndex; |
---|
429 | public /*@pure@*/ int classIndex() { |
---|
430 | |
---|
431 | return m_ClassIndex; |
---|
432 | } |
---|
433 | |
---|
434 | /** |
---|
435 | * Compactifies the set of instances. Decreases the capacity of |
---|
436 | * the set so that it matches the number of instances in the set. |
---|
437 | */ |
---|
438 | public void compactify() { |
---|
439 | |
---|
440 | m_Instances.trimToSize(); |
---|
441 | } |
---|
442 | |
---|
443 | /** |
---|
444 | * Removes all instances from the set. |
---|
445 | */ |
---|
446 | public void delete() { |
---|
447 | |
---|
448 | m_Instances = new ArrayList<Instance>(); |
---|
449 | } |
---|
450 | |
---|
451 | /** |
---|
452 | * Removes an instance at the given position from the set. |
---|
453 | * |
---|
454 | * @param index the instance's position (index starts with 0) |
---|
455 | */ |
---|
456 | //@ requires 0 <= index && index < numInstances(); |
---|
457 | public void delete(int index) { |
---|
458 | |
---|
459 | m_Instances.remove(index); |
---|
460 | } |
---|
461 | |
---|
462 | /** |
---|
463 | * Deletes an attribute at the given position |
---|
464 | * (0 to numAttributes() - 1). A deep copy of the attribute |
---|
465 | * information is performed before the attribute is deleted. |
---|
466 | * |
---|
467 | * @param position the attribute's position (position starts with 0) |
---|
468 | * @throws IllegalArgumentException if the given index is out of range |
---|
469 | * or the class attribute is being deleted |
---|
470 | */ |
---|
471 | //@ requires 0 <= position && position < numAttributes(); |
---|
472 | //@ requires position != classIndex(); |
---|
473 | public void deleteAttributeAt(int position) { |
---|
474 | |
---|
475 | if ((position < 0) || (position >= m_Attributes.size())) { |
---|
476 | throw new IllegalArgumentException("Index out of range"); |
---|
477 | } |
---|
478 | if (position == m_ClassIndex) { |
---|
479 | throw new IllegalArgumentException("Can't delete class attribute"); |
---|
480 | } |
---|
481 | freshAttributeInfo(); |
---|
482 | if (m_ClassIndex > position) { |
---|
483 | m_ClassIndex--; |
---|
484 | } |
---|
485 | m_Attributes.remove(position); |
---|
486 | for (int i = position; i < m_Attributes.size(); i++) { |
---|
487 | Attribute current = (Attribute)m_Attributes.get(i); |
---|
488 | current.setIndex(current.index() - 1); |
---|
489 | } |
---|
490 | for (int i = 0; i < numInstances(); i++) { |
---|
491 | instance(i).setDataset(null); |
---|
492 | instance(i).deleteAttributeAt(position); |
---|
493 | instance(i).setDataset(this); |
---|
494 | } |
---|
495 | } |
---|
496 | |
---|
497 | /** |
---|
498 | * Deletes all attributes of the given type in the dataset. A deep copy of |
---|
499 | * the attribute information is performed before an attribute is deleted. |
---|
500 | * |
---|
501 | * @param attType the attribute type to delete |
---|
502 | * @throws IllegalArgumentException if attribute couldn't be |
---|
503 | * successfully deleted (probably because it is the class attribute). |
---|
504 | */ |
---|
505 | public void deleteAttributeType(int attType) { |
---|
506 | int i = 0; |
---|
507 | while (i < m_Attributes.size()) { |
---|
508 | if (attribute(i).type() == attType) { |
---|
509 | deleteAttributeAt(i); |
---|
510 | } else { |
---|
511 | i++; |
---|
512 | } |
---|
513 | } |
---|
514 | } |
---|
515 | |
---|
516 | /** |
---|
517 | * Deletes all string attributes in the dataset. A deep copy of the attribute |
---|
518 | * information is performed before an attribute is deleted. |
---|
519 | * |
---|
520 | * @throws IllegalArgumentException if string attribute couldn't be |
---|
521 | * successfully deleted (probably because it is the class attribute). |
---|
522 | * @see #deleteAttributeType(int) |
---|
523 | */ |
---|
524 | public void deleteStringAttributes() { |
---|
525 | deleteAttributeType(Attribute.STRING); |
---|
526 | } |
---|
527 | |
---|
528 | /** |
---|
529 | * Removes all instances with missing values for a particular |
---|
530 | * attribute from the dataset. |
---|
531 | * |
---|
532 | * @param attIndex the attribute's index (index starts with 0) |
---|
533 | */ |
---|
534 | //@ requires 0 <= attIndex && attIndex < numAttributes(); |
---|
535 | public void deleteWithMissing(int attIndex) { |
---|
536 | |
---|
537 | ArrayList<Instance> newInstances = new ArrayList<Instance>(numInstances()); |
---|
538 | |
---|
539 | for (int i = 0; i < numInstances(); i++) { |
---|
540 | if (!instance(i).isMissing(attIndex)) { |
---|
541 | newInstances.add(instance(i)); |
---|
542 | } |
---|
543 | } |
---|
544 | m_Instances = newInstances; |
---|
545 | } |
---|
546 | |
---|
547 | /** |
---|
548 | * Removes all instances with missing values for a particular |
---|
549 | * attribute from the dataset. |
---|
550 | * |
---|
551 | * @param att the attribute |
---|
552 | */ |
---|
553 | public void deleteWithMissing(/*@non_null@*/ Attribute att) { |
---|
554 | |
---|
555 | deleteWithMissing(att.index()); |
---|
556 | } |
---|
557 | |
---|
558 | /** |
---|
559 | * Removes all instances with a missing class value |
---|
560 | * from the dataset. |
---|
561 | * |
---|
562 | * @throws UnassignedClassException if class is not set |
---|
563 | */ |
---|
564 | public void deleteWithMissingClass() { |
---|
565 | |
---|
566 | if (m_ClassIndex < 0) { |
---|
567 | throw new UnassignedClassException("Class index is negative (not set)!"); |
---|
568 | } |
---|
569 | deleteWithMissing(m_ClassIndex); |
---|
570 | } |
---|
571 | |
---|
572 | /** |
---|
573 | * Returns an enumeration of all the attributes. |
---|
574 | * |
---|
575 | * @return enumeration of all the attributes. |
---|
576 | */ |
---|
577 | public /*@non_null pure@*/ Enumeration enumerateAttributes() { |
---|
578 | |
---|
579 | return new WekaEnumeration(m_Attributes, m_ClassIndex); |
---|
580 | } |
---|
581 | |
---|
582 | /** |
---|
583 | * Returns an enumeration of all instances in the dataset. |
---|
584 | * |
---|
585 | * @return enumeration of all instances in the dataset |
---|
586 | */ |
---|
587 | public /*@non_null pure@*/ Enumeration enumerateInstances() { |
---|
588 | |
---|
589 | return new WekaEnumeration(m_Instances); |
---|
590 | } |
---|
591 | |
---|
592 | /** |
---|
593 | * Checks if two headers are equivalent. If not, then returns a message why |
---|
594 | * they differ. |
---|
595 | * |
---|
596 | * @param dataset another dataset |
---|
597 | * @return null if the header of the given dataset is equivalent |
---|
598 | * to this header, otherwise a message with details on |
---|
599 | * why they differ |
---|
600 | */ |
---|
601 | public String equalHeadersMsg(Instances dataset) { |
---|
602 | // Check class and all attributes |
---|
603 | if (m_ClassIndex != dataset.m_ClassIndex) |
---|
604 | return "Class index differ: " + (m_ClassIndex+1) + " != " + (dataset.m_ClassIndex+1); |
---|
605 | |
---|
606 | if (m_Attributes.size() != dataset.m_Attributes.size()) |
---|
607 | return "Different number of attributes: " + m_Attributes.size() + " != " + dataset.m_Attributes.size(); |
---|
608 | |
---|
609 | for (int i = 0; i < m_Attributes.size(); i++) { |
---|
610 | String msg = attribute(i).equalsMsg(dataset.attribute(i)); |
---|
611 | if (msg != null) |
---|
612 | return "Attributes differ at position " + (i+1) + ":\n" + msg; |
---|
613 | } |
---|
614 | |
---|
615 | return null; |
---|
616 | } |
---|
617 | |
---|
618 | /** |
---|
619 | * Checks if two headers are equivalent. |
---|
620 | * |
---|
621 | * @param dataset another dataset |
---|
622 | * @return true if the header of the given dataset is equivalent |
---|
623 | * to this header |
---|
624 | */ |
---|
625 | public /*@pure@*/ boolean equalHeaders(Instances dataset){ |
---|
626 | return (equalHeadersMsg(dataset) == null); |
---|
627 | } |
---|
628 | |
---|
629 | /** |
---|
630 | * Returns the first instance in the set. |
---|
631 | * |
---|
632 | * @return the first instance in the set |
---|
633 | */ |
---|
634 | //@ requires numInstances() > 0; |
---|
635 | public /*@non_null pure@*/ Instance firstInstance() { |
---|
636 | |
---|
637 | return (Instance)m_Instances.get(0); |
---|
638 | } |
---|
639 | |
---|
640 | /** |
---|
641 | * Returns a random number generator. The initial seed of the random |
---|
642 | * number generator depends on the given seed and the hash code of |
---|
643 | * a string representation of a instances chosen based on the given |
---|
644 | * seed. |
---|
645 | * |
---|
646 | * @param seed the given seed |
---|
647 | * @return the random number generator |
---|
648 | */ |
---|
649 | public Random getRandomNumberGenerator(long seed) { |
---|
650 | |
---|
651 | Random r = new Random(seed); |
---|
652 | r.setSeed(instance(r.nextInt(numInstances())).toStringNoWeight().hashCode() + seed); |
---|
653 | return r; |
---|
654 | } |
---|
655 | |
---|
656 | /** |
---|
657 | * Inserts an attribute at the given position (0 to |
---|
658 | * numAttributes()) and sets all values to be missing. |
---|
659 | * Shallow copies the attribute before it is inserted, and performs |
---|
660 | * a deep copy of the existing attribute information. |
---|
661 | * |
---|
662 | * @param att the attribute to be inserted |
---|
663 | * @param position the attribute's position (position starts with 0) |
---|
664 | * @throws IllegalArgumentException if the given index is out of range |
---|
665 | */ |
---|
666 | //@ requires 0 <= position; |
---|
667 | //@ requires position <= numAttributes(); |
---|
668 | public void insertAttributeAt(/*@non_null@*/ Attribute att, int position) { |
---|
669 | |
---|
670 | if ((position < 0) || |
---|
671 | (position > m_Attributes.size())) { |
---|
672 | throw new IllegalArgumentException("Index out of range"); |
---|
673 | } |
---|
674 | att = (Attribute)att.copy(); |
---|
675 | freshAttributeInfo(); |
---|
676 | att.setIndex(position); |
---|
677 | m_Attributes.add(position, att); |
---|
678 | for (int i = position + 1; i < m_Attributes.size(); i++) { |
---|
679 | Attribute current = (Attribute)m_Attributes.get(i); |
---|
680 | current.setIndex(current.index() + 1); |
---|
681 | } |
---|
682 | for (int i = 0; i < numInstances(); i++) { |
---|
683 | instance(i).setDataset(null); |
---|
684 | instance(i).insertAttributeAt(position); |
---|
685 | instance(i).setDataset(this); |
---|
686 | } |
---|
687 | if (m_ClassIndex >= position) { |
---|
688 | m_ClassIndex++; |
---|
689 | } |
---|
690 | } |
---|
691 | |
---|
692 | /** |
---|
693 | * Returns the instance at the given position. |
---|
694 | * |
---|
695 | * @param index the instance's index (index starts with 0) |
---|
696 | * @return the instance at the given position |
---|
697 | */ |
---|
698 | //@ requires 0 <= index; |
---|
699 | //@ requires index < numInstances(); |
---|
700 | public /*@non_null pure@*/ Instance instance(int index) { |
---|
701 | |
---|
702 | return m_Instances.get(index); |
---|
703 | } |
---|
704 | |
---|
705 | /** |
---|
706 | * Returns the instance at the given position. |
---|
707 | * |
---|
708 | * @param index the instance's index (index starts with 0) |
---|
709 | * @return the instance at the given position |
---|
710 | */ |
---|
711 | //@ requires 0 <= index; |
---|
712 | //@ requires index < numInstances(); |
---|
713 | public /*@non_null pure@*/ Instance get(int index) { |
---|
714 | |
---|
715 | return m_Instances.get(index); |
---|
716 | } |
---|
717 | |
---|
718 | /** |
---|
719 | * Returns the kth-smallest attribute value of a numeric attribute. |
---|
720 | * Note that calling this method will change the order of the data! |
---|
721 | * |
---|
722 | * @param att the Attribute object |
---|
723 | * @param k the value of k |
---|
724 | * @return the kth-smallest value |
---|
725 | */ |
---|
726 | public double kthSmallestValue(Attribute att, int k) { |
---|
727 | |
---|
728 | return kthSmallestValue(att.index(), k); |
---|
729 | } |
---|
730 | |
---|
731 | /** |
---|
732 | * Returns the kth-smallest attribute value of a numeric attribute. |
---|
733 | * Note that calling this method will change the order of the data! |
---|
734 | * The number of non-missing values in the data must be as least |
---|
735 | * as last as k for this to work. |
---|
736 | * |
---|
737 | * @param attIndex the attribute's index |
---|
738 | * @param k the value of k |
---|
739 | * @return the kth-smallest value |
---|
740 | */ |
---|
741 | public double kthSmallestValue(int attIndex, int k) { |
---|
742 | |
---|
743 | if (!attribute(attIndex).isNumeric()) { |
---|
744 | throw new IllegalArgumentException("Instances: attribute must be numeric to compute kth-smallest value."); |
---|
745 | } |
---|
746 | |
---|
747 | int i,j; |
---|
748 | |
---|
749 | // move all instances with missing values to end |
---|
750 | j = numInstances() - 1; |
---|
751 | i = 0; |
---|
752 | while (i <= j) { |
---|
753 | if (instance(j).isMissing(attIndex)) { |
---|
754 | j--; |
---|
755 | } else { |
---|
756 | if (instance(i).isMissing(attIndex)) { |
---|
757 | swap(i,j); |
---|
758 | j--; |
---|
759 | } |
---|
760 | i++; |
---|
761 | } |
---|
762 | } |
---|
763 | |
---|
764 | if ((k < 1) || (k > j+1)) { |
---|
765 | throw new IllegalArgumentException("Instances: value for k for computing kth-smallest value too large."); |
---|
766 | } |
---|
767 | |
---|
768 | return instance(select(attIndex, 0, j, k)).value(attIndex); |
---|
769 | } |
---|
770 | |
---|
771 | /** |
---|
772 | * Returns the last instance in the set. |
---|
773 | * |
---|
774 | * @return the last instance in the set |
---|
775 | */ |
---|
776 | //@ requires numInstances() > 0; |
---|
777 | public /*@non_null pure@*/ Instance lastInstance() { |
---|
778 | |
---|
779 | return (Instance)m_Instances.get(m_Instances.size() - 1); |
---|
780 | } |
---|
781 | |
---|
782 | /** |
---|
783 | * Returns the mean (mode) for a numeric (nominal) attribute as |
---|
784 | * a floating-point value. Returns 0 if the attribute is neither nominal nor |
---|
785 | * numeric. If all values are missing it returns zero. |
---|
786 | * |
---|
787 | * @param attIndex the attribute's index (index starts with 0) |
---|
788 | * @return the mean or the mode |
---|
789 | */ |
---|
790 | public /*@pure@*/ double meanOrMode(int attIndex) { |
---|
791 | |
---|
792 | double result, found; |
---|
793 | int [] counts; |
---|
794 | |
---|
795 | if (attribute(attIndex).isNumeric()) { |
---|
796 | result = found = 0; |
---|
797 | for (int j = 0; j < numInstances(); j++) { |
---|
798 | if (!instance(j).isMissing(attIndex)) { |
---|
799 | found += instance(j).weight(); |
---|
800 | result += instance(j).weight()*instance(j).value(attIndex); |
---|
801 | } |
---|
802 | } |
---|
803 | if (found <= 0) { |
---|
804 | return 0; |
---|
805 | } else { |
---|
806 | return result / found; |
---|
807 | } |
---|
808 | } else if (attribute(attIndex).isNominal()) { |
---|
809 | counts = new int[attribute(attIndex).numValues()]; |
---|
810 | for (int j = 0; j < numInstances(); j++) { |
---|
811 | if (!instance(j).isMissing(attIndex)) { |
---|
812 | counts[(int) instance(j).value(attIndex)] += instance(j).weight(); |
---|
813 | } |
---|
814 | } |
---|
815 | return (double)Utils.maxIndex(counts); |
---|
816 | } else { |
---|
817 | return 0; |
---|
818 | } |
---|
819 | } |
---|
820 | |
---|
821 | /** |
---|
822 | * Returns the mean (mode) for a numeric (nominal) attribute as a |
---|
823 | * floating-point value. Returns 0 if the attribute is neither |
---|
824 | * nominal nor numeric. If all values are missing it returns zero. |
---|
825 | * |
---|
826 | * @param att the attribute |
---|
827 | * @return the mean or the mode |
---|
828 | */ |
---|
829 | public /*@pure@*/ double meanOrMode(Attribute att) { |
---|
830 | |
---|
831 | return meanOrMode(att.index()); |
---|
832 | } |
---|
833 | |
---|
834 | /** |
---|
835 | * Returns the number of attributes. |
---|
836 | * |
---|
837 | * @return the number of attributes as an integer |
---|
838 | */ |
---|
839 | //@ ensures \result == m_Attributes.size(); |
---|
840 | public /*@pure@*/ int numAttributes() { |
---|
841 | |
---|
842 | return m_Attributes.size(); |
---|
843 | } |
---|
844 | |
---|
845 | /** |
---|
846 | * Returns the number of class labels. |
---|
847 | * |
---|
848 | * @return the number of class labels as an integer if the class |
---|
849 | * attribute is nominal, 1 otherwise. |
---|
850 | * @throws UnassignedClassException if the class is not set |
---|
851 | */ |
---|
852 | //@ requires classIndex() >= 0; |
---|
853 | public /*@pure@*/ int numClasses() { |
---|
854 | |
---|
855 | if (m_ClassIndex < 0) { |
---|
856 | throw new UnassignedClassException("Class index is negative (not set)!"); |
---|
857 | } |
---|
858 | if (!classAttribute().isNominal()) { |
---|
859 | return 1; |
---|
860 | } else { |
---|
861 | return classAttribute().numValues(); |
---|
862 | } |
---|
863 | } |
---|
864 | |
---|
865 | /** |
---|
866 | * Returns the number of distinct values of a given attribute. |
---|
867 | * Returns the number of instances if the attribute is a |
---|
868 | * string attribute. The value 'missing' is not counted. |
---|
869 | * |
---|
870 | * @param attIndex the attribute (index starts with 0) |
---|
871 | * @return the number of distinct values of a given attribute |
---|
872 | */ |
---|
873 | //@ requires 0 <= attIndex; |
---|
874 | //@ requires attIndex < numAttributes(); |
---|
875 | public /*@pure@*/ int numDistinctValues(int attIndex) { |
---|
876 | |
---|
877 | if (attribute(attIndex).isNumeric()) { |
---|
878 | double [] attVals = attributeToDoubleArray(attIndex); |
---|
879 | int [] sorted = Utils.sort(attVals); |
---|
880 | double prev = 0; |
---|
881 | int counter = 0; |
---|
882 | for (int i = 0; i < sorted.length; i++) { |
---|
883 | Instance current = instance(sorted[i]); |
---|
884 | if (current.isMissing(attIndex)) { |
---|
885 | break; |
---|
886 | } |
---|
887 | if ((i == 0) || |
---|
888 | (current.value(attIndex) > prev)) { |
---|
889 | prev = current.value(attIndex); |
---|
890 | counter++; |
---|
891 | } |
---|
892 | } |
---|
893 | return counter; |
---|
894 | } else { |
---|
895 | return attribute(attIndex).numValues(); |
---|
896 | } |
---|
897 | } |
---|
898 | |
---|
899 | /** |
---|
900 | * Returns the number of distinct values of a given attribute. |
---|
901 | * Returns the number of instances if the attribute is a |
---|
902 | * string attribute. The value 'missing' is not counted. |
---|
903 | * |
---|
904 | * @param att the attribute |
---|
905 | * @return the number of distinct values of a given attribute |
---|
906 | */ |
---|
907 | public /*@pure@*/ int numDistinctValues(/*@non_null@*/Attribute att) { |
---|
908 | |
---|
909 | return numDistinctValues(att.index()); |
---|
910 | } |
---|
911 | |
---|
912 | /** |
---|
913 | * Returns the number of instances in the dataset. |
---|
914 | * |
---|
915 | * @return the number of instances in the dataset as an integer |
---|
916 | */ |
---|
917 | //@ ensures \result == m_Instances.size(); |
---|
918 | public /*@pure@*/ int numInstances() { |
---|
919 | |
---|
920 | return m_Instances.size(); |
---|
921 | } |
---|
922 | |
---|
923 | /** |
---|
924 | * Returns the number of instances in the dataset. |
---|
925 | * |
---|
926 | * @return the number of instances in the dataset as an integer |
---|
927 | */ |
---|
928 | //@ ensures \result == m_Instances.size(); |
---|
929 | public /*@pure@*/ int size() { |
---|
930 | |
---|
931 | return m_Instances.size(); |
---|
932 | } |
---|
933 | |
---|
934 | /** |
---|
935 | * Shuffles the instances in the set so that they are ordered |
---|
936 | * randomly. |
---|
937 | * |
---|
938 | * @param random a random number generator |
---|
939 | */ |
---|
940 | public void randomize(Random random) { |
---|
941 | |
---|
942 | for (int j = numInstances() - 1; j > 0; j--) |
---|
943 | swap(j, random.nextInt(j+1)); |
---|
944 | } |
---|
945 | |
---|
946 | /** |
---|
947 | * Reads a single instance from the reader and appends it |
---|
948 | * to the dataset. Automatically expands the dataset if it |
---|
949 | * is not large enough to hold the instance. This method does |
---|
950 | * not check for carriage return at the end of the line. |
---|
951 | * |
---|
952 | * @param reader the reader |
---|
953 | * @return false if end of file has been reached |
---|
954 | * @throws IOException if the information is not read |
---|
955 | * successfully |
---|
956 | * @deprecated instead of using this method in conjunction with the |
---|
957 | * <code>readInstance(Reader)</code> method, one should use the |
---|
958 | * <code>ArffLoader</code> or <code>DataSource</code> class instead. |
---|
959 | * @see weka.core.converters.ArffLoader |
---|
960 | * @see weka.core.converters.ConverterUtils.DataSource |
---|
961 | */ |
---|
962 | @Deprecated public boolean readInstance(Reader reader) throws IOException { |
---|
963 | |
---|
964 | ArffReader arff = new ArffReader(reader, this, m_Lines, 1); |
---|
965 | Instance inst = arff.readInstance(arff.getData(), false); |
---|
966 | m_Lines = arff.getLineNo(); |
---|
967 | if (inst != null) { |
---|
968 | add(inst); |
---|
969 | return true; |
---|
970 | } |
---|
971 | else { |
---|
972 | return false; |
---|
973 | } |
---|
974 | } |
---|
975 | |
---|
976 | /** |
---|
977 | * Returns the relation's name. |
---|
978 | * |
---|
979 | * @return the relation's name as a string |
---|
980 | */ |
---|
981 | //@ ensures \result == m_RelationName; |
---|
982 | public /*@pure@*/ String relationName() { |
---|
983 | |
---|
984 | return m_RelationName; |
---|
985 | } |
---|
986 | |
---|
987 | /** |
---|
988 | * Removes the instance at the given position. |
---|
989 | * |
---|
990 | * @param index the instance's index (index starts with 0) |
---|
991 | * @return the instance at the given position |
---|
992 | */ |
---|
993 | //@ requires 0 <= index; |
---|
994 | //@ requires index < numInstances(); |
---|
995 | public Instance remove(int index) { |
---|
996 | |
---|
997 | return m_Instances.remove(index); |
---|
998 | } |
---|
999 | |
---|
1000 | /** |
---|
1001 | * Renames an attribute. This change only affects this |
---|
1002 | * dataset. |
---|
1003 | * |
---|
1004 | * @param att the attribute's index (index starts with 0) |
---|
1005 | * @param name the new name |
---|
1006 | */ |
---|
1007 | public void renameAttribute(int att, String name) { |
---|
1008 | |
---|
1009 | Attribute newAtt = attribute(att).copy(name); |
---|
1010 | ArrayList<Attribute> newVec = new ArrayList<Attribute>(numAttributes()); |
---|
1011 | |
---|
1012 | for (int i = 0; i < numAttributes(); i++) { |
---|
1013 | if (i == att) { |
---|
1014 | newVec.add(newAtt); |
---|
1015 | } else { |
---|
1016 | newVec.add(attribute(i)); |
---|
1017 | } |
---|
1018 | } |
---|
1019 | m_Attributes = newVec; |
---|
1020 | } |
---|
1021 | |
---|
1022 | /** |
---|
1023 | * Renames an attribute. This change only affects this |
---|
1024 | * dataset. |
---|
1025 | * |
---|
1026 | * @param att the attribute |
---|
1027 | * @param name the new name |
---|
1028 | */ |
---|
1029 | public void renameAttribute(Attribute att, String name) { |
---|
1030 | |
---|
1031 | renameAttribute(att.index(), name); |
---|
1032 | } |
---|
1033 | |
---|
1034 | /** |
---|
1035 | * Renames the value of a nominal (or string) attribute value. This |
---|
1036 | * change only affects this dataset. |
---|
1037 | * |
---|
1038 | * @param att the attribute's index (index starts with 0) |
---|
1039 | * @param val the value's index (index starts with 0) |
---|
1040 | * @param name the new name |
---|
1041 | */ |
---|
1042 | public void renameAttributeValue(int att, int val, String name) { |
---|
1043 | |
---|
1044 | Attribute newAtt = (Attribute)attribute(att).copy(); |
---|
1045 | ArrayList<Attribute> newVec = new ArrayList<Attribute>(numAttributes()); |
---|
1046 | |
---|
1047 | newAtt.setValue(val, name); |
---|
1048 | for (int i = 0; i < numAttributes(); i++) { |
---|
1049 | if (i == att) { |
---|
1050 | newVec.add(newAtt); |
---|
1051 | } else { |
---|
1052 | newVec.add(attribute(i)); |
---|
1053 | } |
---|
1054 | } |
---|
1055 | m_Attributes = newVec; |
---|
1056 | } |
---|
1057 | |
---|
1058 | /** |
---|
1059 | * Renames the value of a nominal (or string) attribute value. This |
---|
1060 | * change only affects this dataset. |
---|
1061 | * |
---|
1062 | * @param att the attribute |
---|
1063 | * @param val the value |
---|
1064 | * @param name the new name |
---|
1065 | */ |
---|
1066 | public void renameAttributeValue(Attribute att, String val, |
---|
1067 | String name) { |
---|
1068 | |
---|
1069 | int v = att.indexOfValue(val); |
---|
1070 | if (v == -1) throw new IllegalArgumentException(val + " not found"); |
---|
1071 | renameAttributeValue(att.index(), v, name); |
---|
1072 | } |
---|
1073 | |
---|
1074 | /** |
---|
1075 | * Creates a new dataset of the same size using random sampling |
---|
1076 | * with replacement. |
---|
1077 | * |
---|
1078 | * @param random a random number generator |
---|
1079 | * @return the new dataset |
---|
1080 | */ |
---|
1081 | public Instances resample(Random random) { |
---|
1082 | |
---|
1083 | Instances newData = new Instances(this, numInstances()); |
---|
1084 | while (newData.numInstances() < numInstances()) { |
---|
1085 | newData.add(instance(random.nextInt(numInstances()))); |
---|
1086 | } |
---|
1087 | return newData; |
---|
1088 | } |
---|
1089 | |
---|
1090 | /** |
---|
1091 | * Creates a new dataset of the same size using random sampling |
---|
1092 | * with replacement according to the current instance weights. The |
---|
1093 | * weights of the instances in the new dataset are set to one. |
---|
1094 | * |
---|
1095 | * @param random a random number generator |
---|
1096 | * @return the new dataset |
---|
1097 | */ |
---|
1098 | public Instances resampleWithWeights(Random random) { |
---|
1099 | |
---|
1100 | double [] weights = new double[numInstances()]; |
---|
1101 | for (int i = 0; i < weights.length; i++) { |
---|
1102 | weights[i] = instance(i).weight(); |
---|
1103 | } |
---|
1104 | return resampleWithWeights(random, weights); |
---|
1105 | } |
---|
1106 | |
---|
1107 | |
---|
1108 | /** |
---|
1109 | * Creates a new dataset of the same size using random sampling |
---|
1110 | * with replacement according to the given weight vector. The |
---|
1111 | * weights of the instances in the new dataset are set to one. |
---|
1112 | * The length of the weight vector has to be the same as the |
---|
1113 | * number of instances in the dataset, and all weights have to |
---|
1114 | * be positive. |
---|
1115 | * |
---|
1116 | * @param random a random number generator |
---|
1117 | * @param weights the weight vector |
---|
1118 | * @return the new dataset |
---|
1119 | * @throws IllegalArgumentException if the weights array is of the wrong |
---|
1120 | * length or contains negative weights. |
---|
1121 | */ |
---|
1122 | public Instances resampleWithWeights(Random random, |
---|
1123 | double[] weights) { |
---|
1124 | |
---|
1125 | if (weights.length != numInstances()) { |
---|
1126 | throw new IllegalArgumentException("weights.length != numInstances."); |
---|
1127 | } |
---|
1128 | Instances newData = new Instances(this, numInstances()); |
---|
1129 | if (numInstances() == 0) { |
---|
1130 | return newData; |
---|
1131 | } |
---|
1132 | double[] probabilities = new double[numInstances()]; |
---|
1133 | double sumProbs = 0, sumOfWeights = Utils.sum(weights); |
---|
1134 | for (int i = 0; i < numInstances(); i++) { |
---|
1135 | sumProbs += random.nextDouble(); |
---|
1136 | probabilities[i] = sumProbs; |
---|
1137 | } |
---|
1138 | Utils.normalize(probabilities, sumProbs / sumOfWeights); |
---|
1139 | |
---|
1140 | // Make sure that rounding errors don't mess things up |
---|
1141 | probabilities[numInstances() - 1] = sumOfWeights; |
---|
1142 | int k = 0; int l = 0; |
---|
1143 | sumProbs = 0; |
---|
1144 | while ((k < numInstances() && (l < numInstances()))) { |
---|
1145 | if (weights[l] < 0) { |
---|
1146 | throw new IllegalArgumentException("Weights have to be positive."); |
---|
1147 | } |
---|
1148 | sumProbs += weights[l]; |
---|
1149 | while ((k < numInstances()) && |
---|
1150 | (probabilities[k] <= sumProbs)) { |
---|
1151 | newData.add(instance(l)); |
---|
1152 | newData.instance(k).setWeight(1); |
---|
1153 | k++; |
---|
1154 | } |
---|
1155 | l++; |
---|
1156 | } |
---|
1157 | return newData; |
---|
1158 | } |
---|
1159 | |
---|
1160 | /** |
---|
1161 | * Replaces the instance at the given position. |
---|
1162 | * Shallow copies instance before it is added. Does not |
---|
1163 | * check if the instance is compatible with the dataset. |
---|
1164 | * Note: String or relational values are not transferred. |
---|
1165 | * |
---|
1166 | * @param index position where instance is to be inserted |
---|
1167 | * @param instance the instance to be inserted |
---|
1168 | * @return the instance previously at that position |
---|
1169 | */ |
---|
1170 | //@ requires 0 <= index; |
---|
1171 | //@ requires index < m_Instances.size(); |
---|
1172 | public Instance set(int index, /*@non_null@*/ Instance instance) { |
---|
1173 | |
---|
1174 | Instance newInstance = (Instance)instance.copy(); |
---|
1175 | Instance oldInstance = m_Instances.get(index); |
---|
1176 | |
---|
1177 | newInstance.setDataset(this); |
---|
1178 | m_Instances.set(index, newInstance); |
---|
1179 | |
---|
1180 | return oldInstance; |
---|
1181 | } |
---|
1182 | |
---|
1183 | /** |
---|
1184 | * Sets the class attribute. |
---|
1185 | * |
---|
1186 | * @param att attribute to be the class |
---|
1187 | */ |
---|
1188 | public void setClass(Attribute att) { |
---|
1189 | |
---|
1190 | m_ClassIndex = att.index(); |
---|
1191 | } |
---|
1192 | |
---|
1193 | /** |
---|
1194 | * Sets the class index of the set. |
---|
1195 | * If the class index is negative there is assumed to be no class. |
---|
1196 | * (ie. it is undefined) |
---|
1197 | * |
---|
1198 | * @param classIndex the new class index (index starts with 0) |
---|
1199 | * @throws IllegalArgumentException if the class index is too big or < 0 |
---|
1200 | */ |
---|
1201 | public void setClassIndex(int classIndex) { |
---|
1202 | |
---|
1203 | if (classIndex >= numAttributes()) { |
---|
1204 | throw new IllegalArgumentException("Invalid class index: " + classIndex); |
---|
1205 | } |
---|
1206 | m_ClassIndex = classIndex; |
---|
1207 | } |
---|
1208 | |
---|
1209 | /** |
---|
1210 | * Sets the relation's name. |
---|
1211 | * |
---|
1212 | * @param newName the new relation name. |
---|
1213 | */ |
---|
1214 | public void setRelationName(/*@non_null@*/String newName) { |
---|
1215 | |
---|
1216 | m_RelationName = newName; |
---|
1217 | } |
---|
1218 | |
---|
1219 | /** |
---|
1220 | * Sorts the instances based on an attribute. For numeric attributes, |
---|
1221 | * instances are sorted in ascending order. For nominal attributes, |
---|
1222 | * instances are sorted based on the attribute label ordering |
---|
1223 | * specified in the header. Instances with missing values for the |
---|
1224 | * attribute are placed at the end of the dataset. |
---|
1225 | * |
---|
1226 | * @param attIndex the attribute's index (index starts with 0) |
---|
1227 | */ |
---|
1228 | public void sort(int attIndex) { |
---|
1229 | |
---|
1230 | int i,j; |
---|
1231 | |
---|
1232 | // move all instances with missing values to end |
---|
1233 | j = numInstances() - 1; |
---|
1234 | i = 0; |
---|
1235 | while (i <= j) { |
---|
1236 | if (instance(j).isMissing(attIndex)) { |
---|
1237 | j--; |
---|
1238 | } else { |
---|
1239 | if (instance(i).isMissing(attIndex)) { |
---|
1240 | swap(i,j); |
---|
1241 | j--; |
---|
1242 | } |
---|
1243 | i++; |
---|
1244 | } |
---|
1245 | } |
---|
1246 | quickSort(attIndex, 0, j); |
---|
1247 | } |
---|
1248 | |
---|
1249 | /** |
---|
1250 | * Sorts the instances based on an attribute. For numeric attributes, |
---|
1251 | * instances are sorted into ascending order. For nominal attributes, |
---|
1252 | * instances are sorted based on the attribute label ordering |
---|
1253 | * specified in the header. Instances with missing values for the |
---|
1254 | * attribute are placed at the end of the dataset. |
---|
1255 | * |
---|
1256 | * @param att the attribute |
---|
1257 | */ |
---|
1258 | public void sort(Attribute att) { |
---|
1259 | |
---|
1260 | sort(att.index()); |
---|
1261 | } |
---|
1262 | |
---|
1263 | /** |
---|
1264 | * Stratifies a set of instances according to its class values |
---|
1265 | * if the class attribute is nominal (so that afterwards a |
---|
1266 | * stratified cross-validation can be performed). |
---|
1267 | * |
---|
1268 | * @param numFolds the number of folds in the cross-validation |
---|
1269 | * @throws UnassignedClassException if the class is not set |
---|
1270 | */ |
---|
1271 | public void stratify(int numFolds) { |
---|
1272 | |
---|
1273 | if (numFolds <= 1) { |
---|
1274 | throw new IllegalArgumentException("Number of folds must be greater than 1"); |
---|
1275 | } |
---|
1276 | if (m_ClassIndex < 0) { |
---|
1277 | throw new UnassignedClassException("Class index is negative (not set)!"); |
---|
1278 | } |
---|
1279 | if (classAttribute().isNominal()) { |
---|
1280 | |
---|
1281 | // sort by class |
---|
1282 | int index = 1; |
---|
1283 | while (index < numInstances()) { |
---|
1284 | Instance instance1 = instance(index - 1); |
---|
1285 | for (int j = index; j < numInstances(); j++) { |
---|
1286 | Instance instance2 = instance(j); |
---|
1287 | if ((instance1.classValue() == instance2.classValue()) || |
---|
1288 | (instance1.classIsMissing() && |
---|
1289 | instance2.classIsMissing())) { |
---|
1290 | swap(index,j); |
---|
1291 | index++; |
---|
1292 | } |
---|
1293 | } |
---|
1294 | index++; |
---|
1295 | } |
---|
1296 | stratStep(numFolds); |
---|
1297 | } |
---|
1298 | } |
---|
1299 | |
---|
1300 | /** |
---|
1301 | * Computes the sum of all the instances' weights. |
---|
1302 | * |
---|
1303 | * @return the sum of all the instances' weights as a double |
---|
1304 | */ |
---|
1305 | public /*@pure@*/ double sumOfWeights() { |
---|
1306 | |
---|
1307 | double sum = 0; |
---|
1308 | |
---|
1309 | for (int i = 0; i < numInstances(); i++) { |
---|
1310 | sum += instance(i).weight(); |
---|
1311 | } |
---|
1312 | return sum; |
---|
1313 | } |
---|
1314 | |
---|
1315 | /** |
---|
1316 | * Creates the test set for one fold of a cross-validation on |
---|
1317 | * the dataset. |
---|
1318 | * |
---|
1319 | * @param numFolds the number of folds in the cross-validation. Must |
---|
1320 | * be greater than 1. |
---|
1321 | * @param numFold 0 for the first fold, 1 for the second, ... |
---|
1322 | * @return the test set as a set of weighted instances |
---|
1323 | * @throws IllegalArgumentException if the number of folds is less than 2 |
---|
1324 | * or greater than the number of instances. |
---|
1325 | */ |
---|
1326 | //@ requires 2 <= numFolds && numFolds < numInstances(); |
---|
1327 | //@ requires 0 <= numFold && numFold < numFolds; |
---|
1328 | public Instances testCV(int numFolds, int numFold) { |
---|
1329 | |
---|
1330 | int numInstForFold, first, offset; |
---|
1331 | Instances test; |
---|
1332 | |
---|
1333 | if (numFolds < 2) { |
---|
1334 | throw new IllegalArgumentException("Number of folds must be at least 2!"); |
---|
1335 | } |
---|
1336 | if (numFolds > numInstances()) { |
---|
1337 | throw new IllegalArgumentException("Can't have more folds than instances!"); |
---|
1338 | } |
---|
1339 | numInstForFold = numInstances() / numFolds; |
---|
1340 | if (numFold < numInstances() % numFolds){ |
---|
1341 | numInstForFold++; |
---|
1342 | offset = numFold; |
---|
1343 | }else |
---|
1344 | offset = numInstances() % numFolds; |
---|
1345 | test = new Instances(this, numInstForFold); |
---|
1346 | first = numFold * (numInstances() / numFolds) + offset; |
---|
1347 | copyInstances(first, test, numInstForFold); |
---|
1348 | return test; |
---|
1349 | } |
---|
1350 | |
---|
1351 | /** |
---|
1352 | * Returns the dataset as a string in ARFF format. Strings |
---|
1353 | * are quoted if they contain whitespace characters, or if they |
---|
1354 | * are a question mark. |
---|
1355 | * |
---|
1356 | * @return the dataset in ARFF format as a string |
---|
1357 | */ |
---|
1358 | public String toString() { |
---|
1359 | |
---|
1360 | StringBuffer text = new StringBuffer(); |
---|
1361 | |
---|
1362 | text.append(ARFF_RELATION).append(" "). |
---|
1363 | append(Utils.quote(m_RelationName)).append("\n\n"); |
---|
1364 | for (int i = 0; i < numAttributes(); i++) { |
---|
1365 | text.append(attribute(i)).append("\n"); |
---|
1366 | } |
---|
1367 | text.append("\n").append(ARFF_DATA).append("\n"); |
---|
1368 | |
---|
1369 | text.append(stringWithoutHeader()); |
---|
1370 | return text.toString(); |
---|
1371 | } |
---|
1372 | |
---|
1373 | /** |
---|
1374 | * Returns the instances in the dataset as a string in ARFF format. Strings |
---|
1375 | * are quoted if they contain whitespace characters, or if they |
---|
1376 | * are a question mark. |
---|
1377 | * |
---|
1378 | * @return the dataset in ARFF format as a string |
---|
1379 | */ |
---|
1380 | protected String stringWithoutHeader() { |
---|
1381 | |
---|
1382 | StringBuffer text = new StringBuffer(); |
---|
1383 | |
---|
1384 | for (int i = 0; i < numInstances(); i++) { |
---|
1385 | text.append(instance(i)); |
---|
1386 | if (i < numInstances() - 1) { |
---|
1387 | text.append('\n'); |
---|
1388 | } |
---|
1389 | } |
---|
1390 | return text.toString(); |
---|
1391 | } |
---|
1392 | |
---|
1393 | /** |
---|
1394 | * Creates the training set for one fold of a cross-validation |
---|
1395 | * on the dataset. |
---|
1396 | * |
---|
1397 | * @param numFolds the number of folds in the cross-validation. Must |
---|
1398 | * be greater than 1. |
---|
1399 | * @param numFold 0 for the first fold, 1 for the second, ... |
---|
1400 | * @return the training set |
---|
1401 | * @throws IllegalArgumentException if the number of folds is less than 2 |
---|
1402 | * or greater than the number of instances. |
---|
1403 | */ |
---|
1404 | //@ requires 2 <= numFolds && numFolds < numInstances(); |
---|
1405 | //@ requires 0 <= numFold && numFold < numFolds; |
---|
1406 | public Instances trainCV(int numFolds, int numFold) { |
---|
1407 | |
---|
1408 | int numInstForFold, first, offset; |
---|
1409 | Instances train; |
---|
1410 | |
---|
1411 | if (numFolds < 2) { |
---|
1412 | throw new IllegalArgumentException("Number of folds must be at least 2!"); |
---|
1413 | } |
---|
1414 | if (numFolds > numInstances()) { |
---|
1415 | throw new IllegalArgumentException("Can't have more folds than instances!"); |
---|
1416 | } |
---|
1417 | numInstForFold = numInstances() / numFolds; |
---|
1418 | if (numFold < numInstances() % numFolds) { |
---|
1419 | numInstForFold++; |
---|
1420 | offset = numFold; |
---|
1421 | }else |
---|
1422 | offset = numInstances() % numFolds; |
---|
1423 | train = new Instances(this, numInstances() - numInstForFold); |
---|
1424 | first = numFold * (numInstances() / numFolds) + offset; |
---|
1425 | copyInstances(0, train, first); |
---|
1426 | copyInstances(first + numInstForFold, train, |
---|
1427 | numInstances() - first - numInstForFold); |
---|
1428 | |
---|
1429 | return train; |
---|
1430 | } |
---|
1431 | |
---|
1432 | /** |
---|
1433 | * Creates the training set for one fold of a cross-validation |
---|
1434 | * on the dataset. The data is subsequently randomized based |
---|
1435 | * on the given random number generator. |
---|
1436 | * |
---|
1437 | * @param numFolds the number of folds in the cross-validation. Must |
---|
1438 | * be greater than 1. |
---|
1439 | * @param numFold 0 for the first fold, 1 for the second, ... |
---|
1440 | * @param random the random number generator |
---|
1441 | * @return the training set |
---|
1442 | * @throws IllegalArgumentException if the number of folds is less than 2 |
---|
1443 | * or greater than the number of instances. |
---|
1444 | */ |
---|
1445 | //@ requires 2 <= numFolds && numFolds < numInstances(); |
---|
1446 | //@ requires 0 <= numFold && numFold < numFolds; |
---|
1447 | public Instances trainCV(int numFolds, int numFold, Random random) { |
---|
1448 | |
---|
1449 | Instances train = trainCV(numFolds, numFold); |
---|
1450 | train.randomize(random); |
---|
1451 | return train; |
---|
1452 | } |
---|
1453 | |
---|
1454 | /** |
---|
1455 | * Computes the variance for a numeric attribute. |
---|
1456 | * |
---|
1457 | * @param attIndex the numeric attribute (index starts with 0) |
---|
1458 | * @return the variance if the attribute is numeric |
---|
1459 | * @throws IllegalArgumentException if the attribute is not numeric |
---|
1460 | */ |
---|
1461 | public /*@pure@*/ double variance(int attIndex) { |
---|
1462 | |
---|
1463 | double sum = 0, sumSquared = 0, sumOfWeights = 0; |
---|
1464 | |
---|
1465 | if (!attribute(attIndex).isNumeric()) { |
---|
1466 | throw new IllegalArgumentException("Can't compute variance because attribute is " + |
---|
1467 | "not numeric!"); |
---|
1468 | } |
---|
1469 | for (int i = 0; i < numInstances(); i++) { |
---|
1470 | if (!instance(i).isMissing(attIndex)) { |
---|
1471 | sum += instance(i).weight() * |
---|
1472 | instance(i).value(attIndex); |
---|
1473 | sumSquared += instance(i).weight() * |
---|
1474 | instance(i).value(attIndex) * |
---|
1475 | instance(i).value(attIndex); |
---|
1476 | sumOfWeights += instance(i).weight(); |
---|
1477 | } |
---|
1478 | } |
---|
1479 | if (sumOfWeights <= 1) { |
---|
1480 | return 0; |
---|
1481 | } |
---|
1482 | double result = (sumSquared - (sum * sum / sumOfWeights)) / |
---|
1483 | (sumOfWeights - 1); |
---|
1484 | |
---|
1485 | // We don't like negative variance |
---|
1486 | if (result < 0) { |
---|
1487 | return 0; |
---|
1488 | } else { |
---|
1489 | return result; |
---|
1490 | } |
---|
1491 | } |
---|
1492 | |
---|
1493 | /** |
---|
1494 | * Computes the variance for a numeric attribute. |
---|
1495 | * |
---|
1496 | * @param att the numeric attribute |
---|
1497 | * @return the variance if the attribute is numeric |
---|
1498 | * @throws IllegalArgumentException if the attribute is not numeric |
---|
1499 | */ |
---|
1500 | public /*@pure@*/ double variance(Attribute att) { |
---|
1501 | |
---|
1502 | return variance(att.index()); |
---|
1503 | } |
---|
1504 | |
---|
1505 | /** |
---|
1506 | * Calculates summary statistics on the values that appear in this |
---|
1507 | * set of instances for a specified attribute. |
---|
1508 | * |
---|
1509 | * @param index the index of the attribute to summarize (index starts with 0) |
---|
1510 | * @return an AttributeStats object with it's fields calculated. |
---|
1511 | */ |
---|
1512 | //@ requires 0 <= index && index < numAttributes(); |
---|
1513 | public AttributeStats attributeStats(int index) { |
---|
1514 | |
---|
1515 | AttributeStats result = new AttributeStats(); |
---|
1516 | if (attribute(index).isNominal()) { |
---|
1517 | result.nominalCounts = new int [attribute(index).numValues()]; |
---|
1518 | result.nominalWeights = new double[attribute(index).numValues()]; |
---|
1519 | } |
---|
1520 | if (attribute(index).isNumeric()) { |
---|
1521 | result.numericStats = new weka.experiment.Stats(); |
---|
1522 | } |
---|
1523 | result.totalCount = numInstances(); |
---|
1524 | |
---|
1525 | double [] attVals = attributeToDoubleArray(index); |
---|
1526 | int [] sorted = Utils.sort(attVals); |
---|
1527 | int currentCount = 0; |
---|
1528 | double currentWeight = 0; |
---|
1529 | double prev = Double.NaN; |
---|
1530 | for (int j = 0; j < numInstances(); j++) { |
---|
1531 | Instance current = instance(sorted[j]); |
---|
1532 | if (current.isMissing(index)) { |
---|
1533 | result.missingCount = numInstances() - j; |
---|
1534 | break; |
---|
1535 | } |
---|
1536 | if (current.value(index) == prev) { |
---|
1537 | currentCount++; |
---|
1538 | currentWeight += current.weight(); |
---|
1539 | } else { |
---|
1540 | result.addDistinct(prev, currentCount, currentWeight); |
---|
1541 | currentCount = 1; |
---|
1542 | currentWeight = current.weight(); |
---|
1543 | prev = current.value(index); |
---|
1544 | } |
---|
1545 | } |
---|
1546 | result.addDistinct(prev, currentCount, currentWeight); |
---|
1547 | result.distinctCount--; // So we don't count "missing" as a value |
---|
1548 | return result; |
---|
1549 | } |
---|
1550 | |
---|
1551 | /** |
---|
1552 | * Gets the value of all instances in this dataset for a particular |
---|
1553 | * attribute. Useful in conjunction with Utils.sort to allow iterating |
---|
1554 | * through the dataset in sorted order for some attribute. |
---|
1555 | * |
---|
1556 | * @param index the index of the attribute. |
---|
1557 | * @return an array containing the value of the desired attribute for |
---|
1558 | * each instance in the dataset. |
---|
1559 | */ |
---|
1560 | //@ requires 0 <= index && index < numAttributes(); |
---|
1561 | public /*@pure@*/ double [] attributeToDoubleArray(int index) { |
---|
1562 | |
---|
1563 | double [] result = new double[numInstances()]; |
---|
1564 | for (int i = 0; i < result.length; i++) { |
---|
1565 | result[i] = instance(i).value(index); |
---|
1566 | } |
---|
1567 | return result; |
---|
1568 | } |
---|
1569 | |
---|
1570 | /** |
---|
1571 | * Generates a string summarizing the set of instances. Gives a breakdown |
---|
1572 | * for each attribute indicating the number of missing/discrete/unique |
---|
1573 | * values and other information. |
---|
1574 | * |
---|
1575 | * @return a string summarizing the dataset |
---|
1576 | */ |
---|
1577 | public String toSummaryString() { |
---|
1578 | |
---|
1579 | StringBuffer result = new StringBuffer(); |
---|
1580 | result.append("Relation Name: ").append(relationName()).append('\n'); |
---|
1581 | result.append("Num Instances: ").append(numInstances()).append('\n'); |
---|
1582 | result.append("Num Attributes: ").append(numAttributes()).append('\n'); |
---|
1583 | result.append('\n'); |
---|
1584 | |
---|
1585 | result.append(Utils.padLeft("", 5)).append(Utils.padRight("Name", 25)); |
---|
1586 | result.append(Utils.padLeft("Type", 5)).append(Utils.padLeft("Nom", 5)); |
---|
1587 | result.append(Utils.padLeft("Int", 5)).append(Utils.padLeft("Real", 5)); |
---|
1588 | result.append(Utils.padLeft("Missing", 12)); |
---|
1589 | result.append(Utils.padLeft("Unique", 12)); |
---|
1590 | result.append(Utils.padLeft("Dist", 6)).append('\n'); |
---|
1591 | for (int i = 0; i < numAttributes(); i++) { |
---|
1592 | Attribute a = attribute(i); |
---|
1593 | AttributeStats as = attributeStats(i); |
---|
1594 | result.append(Utils.padLeft("" + (i + 1), 4)).append(' '); |
---|
1595 | result.append(Utils.padRight(a.name(), 25)).append(' '); |
---|
1596 | long percent; |
---|
1597 | switch (a.type()) { |
---|
1598 | case Attribute.NOMINAL: |
---|
1599 | result.append(Utils.padLeft("Nom", 4)).append(' '); |
---|
1600 | percent = Math.round(100.0 * as.intCount / as.totalCount); |
---|
1601 | result.append(Utils.padLeft("" + percent, 3)).append("% "); |
---|
1602 | result.append(Utils.padLeft("" + 0, 3)).append("% "); |
---|
1603 | percent = Math.round(100.0 * as.realCount / as.totalCount); |
---|
1604 | result.append(Utils.padLeft("" + percent, 3)).append("% "); |
---|
1605 | break; |
---|
1606 | case Attribute.NUMERIC: |
---|
1607 | result.append(Utils.padLeft("Num", 4)).append(' '); |
---|
1608 | result.append(Utils.padLeft("" + 0, 3)).append("% "); |
---|
1609 | percent = Math.round(100.0 * as.intCount / as.totalCount); |
---|
1610 | result.append(Utils.padLeft("" + percent, 3)).append("% "); |
---|
1611 | percent = Math.round(100.0 * as.realCount / as.totalCount); |
---|
1612 | result.append(Utils.padLeft("" + percent, 3)).append("% "); |
---|
1613 | break; |
---|
1614 | case Attribute.DATE: |
---|
1615 | result.append(Utils.padLeft("Dat", 4)).append(' '); |
---|
1616 | result.append(Utils.padLeft("" + 0, 3)).append("% "); |
---|
1617 | percent = Math.round(100.0 * as.intCount / as.totalCount); |
---|
1618 | result.append(Utils.padLeft("" + percent, 3)).append("% "); |
---|
1619 | percent = Math.round(100.0 * as.realCount / as.totalCount); |
---|
1620 | result.append(Utils.padLeft("" + percent, 3)).append("% "); |
---|
1621 | break; |
---|
1622 | case Attribute.STRING: |
---|
1623 | result.append(Utils.padLeft("Str", 4)).append(' '); |
---|
1624 | percent = Math.round(100.0 * as.intCount / as.totalCount); |
---|
1625 | result.append(Utils.padLeft("" + percent, 3)).append("% "); |
---|
1626 | result.append(Utils.padLeft("" + 0, 3)).append("% "); |
---|
1627 | percent = Math.round(100.0 * as.realCount / as.totalCount); |
---|
1628 | result.append(Utils.padLeft("" + percent, 3)).append("% "); |
---|
1629 | break; |
---|
1630 | case Attribute.RELATIONAL: |
---|
1631 | result.append(Utils.padLeft("Rel", 4)).append(' '); |
---|
1632 | percent = Math.round(100.0 * as.intCount / as.totalCount); |
---|
1633 | result.append(Utils.padLeft("" + percent, 3)).append("% "); |
---|
1634 | result.append(Utils.padLeft("" + 0, 3)).append("% "); |
---|
1635 | percent = Math.round(100.0 * as.realCount / as.totalCount); |
---|
1636 | result.append(Utils.padLeft("" + percent, 3)).append("% "); |
---|
1637 | break; |
---|
1638 | default: |
---|
1639 | result.append(Utils.padLeft("???", 4)).append(' '); |
---|
1640 | result.append(Utils.padLeft("" + 0, 3)).append("% "); |
---|
1641 | percent = Math.round(100.0 * as.intCount / as.totalCount); |
---|
1642 | result.append(Utils.padLeft("" + percent, 3)).append("% "); |
---|
1643 | percent = Math.round(100.0 * as.realCount / as.totalCount); |
---|
1644 | result.append(Utils.padLeft("" + percent, 3)).append("% "); |
---|
1645 | break; |
---|
1646 | } |
---|
1647 | result.append(Utils.padLeft("" + as.missingCount, 5)).append(" /"); |
---|
1648 | percent = Math.round(100.0 * as.missingCount / as.totalCount); |
---|
1649 | result.append(Utils.padLeft("" + percent, 3)).append("% "); |
---|
1650 | result.append(Utils.padLeft("" + as.uniqueCount, 5)).append(" /"); |
---|
1651 | percent = Math.round(100.0 * as.uniqueCount / as.totalCount); |
---|
1652 | result.append(Utils.padLeft("" + percent, 3)).append("% "); |
---|
1653 | result.append(Utils.padLeft("" + as.distinctCount, 5)).append(' '); |
---|
1654 | result.append('\n'); |
---|
1655 | } |
---|
1656 | return result.toString(); |
---|
1657 | } |
---|
1658 | |
---|
1659 | /** |
---|
1660 | * Copies instances from one set to the end of another |
---|
1661 | * one. |
---|
1662 | * |
---|
1663 | * @param from the position of the first instance to be copied |
---|
1664 | * @param dest the destination for the instances |
---|
1665 | * @param num the number of instances to be copied |
---|
1666 | */ |
---|
1667 | //@ requires 0 <= from && from <= numInstances() - num; |
---|
1668 | //@ requires 0 <= num; |
---|
1669 | protected void copyInstances(int from, /*@non_null@*/ Instances dest, int num) { |
---|
1670 | |
---|
1671 | for (int i = 0; i < num; i++) { |
---|
1672 | dest.add(instance(from + i)); |
---|
1673 | } |
---|
1674 | } |
---|
1675 | |
---|
1676 | /** |
---|
1677 | * Replaces the attribute information by a clone of |
---|
1678 | * itself. |
---|
1679 | */ |
---|
1680 | protected void freshAttributeInfo() { |
---|
1681 | |
---|
1682 | ArrayList<Attribute> newList = new ArrayList<Attribute>(m_Attributes.size()); |
---|
1683 | for (Attribute att : m_Attributes) { |
---|
1684 | newList.add((Attribute)att.copy()); |
---|
1685 | } |
---|
1686 | m_Attributes = newList; |
---|
1687 | } |
---|
1688 | |
---|
1689 | /** |
---|
1690 | * Returns string including all instances, their weights and |
---|
1691 | * their indices in the original dataset. |
---|
1692 | * |
---|
1693 | * @return description of instance and its weight as a string |
---|
1694 | */ |
---|
1695 | protected /*@pure@*/ String instancesAndWeights(){ |
---|
1696 | |
---|
1697 | StringBuffer text = new StringBuffer(); |
---|
1698 | |
---|
1699 | for (int i = 0; i < numInstances(); i++) { |
---|
1700 | text.append(instance(i) + " " + instance(i).weight()); |
---|
1701 | if (i < numInstances() - 1) { |
---|
1702 | text.append("\n"); |
---|
1703 | } |
---|
1704 | } |
---|
1705 | return text.toString(); |
---|
1706 | } |
---|
1707 | |
---|
1708 | /** |
---|
1709 | * Partitions the instances around a pivot. Used by quicksort and |
---|
1710 | * kthSmallestValue. |
---|
1711 | * |
---|
1712 | * @param attIndex the attribute's index (index starts with 0) |
---|
1713 | * @param l the first index of the subset (index starts with 0) |
---|
1714 | * @param r the last index of the subset (index starts with 0) |
---|
1715 | * |
---|
1716 | * @return the index of the middle element |
---|
1717 | */ |
---|
1718 | //@ requires 0 <= attIndex && attIndex < numAttributes(); |
---|
1719 | //@ requires 0 <= left && left <= right && right < numInstances(); |
---|
1720 | protected int partition(int attIndex, int l, int r) { |
---|
1721 | |
---|
1722 | double pivot = instance((l + r) / 2).value(attIndex); |
---|
1723 | |
---|
1724 | while (l < r) { |
---|
1725 | while ((instance(l).value(attIndex) < pivot) && (l < r)) { |
---|
1726 | l++; |
---|
1727 | } |
---|
1728 | while ((instance(r).value(attIndex) > pivot) && (l < r)) { |
---|
1729 | r--; |
---|
1730 | } |
---|
1731 | if (l < r) { |
---|
1732 | swap(l, r); |
---|
1733 | l++; |
---|
1734 | r--; |
---|
1735 | } |
---|
1736 | } |
---|
1737 | if ((l == r) && (instance(r).value(attIndex) > pivot)) { |
---|
1738 | r--; |
---|
1739 | } |
---|
1740 | |
---|
1741 | return r; |
---|
1742 | } |
---|
1743 | |
---|
1744 | /** |
---|
1745 | * Implements quicksort according to Manber's "Introduction to |
---|
1746 | * Algorithms". |
---|
1747 | * |
---|
1748 | * @param attIndex the attribute's index (index starts with 0) |
---|
1749 | * @param left the first index of the subset to be sorted (index starts with 0) |
---|
1750 | * @param right the last index of the subset to be sorted (index starts with 0) |
---|
1751 | */ |
---|
1752 | //@ requires 0 <= attIndex && attIndex < numAttributes(); |
---|
1753 | //@ requires 0 <= first && first <= right && right < numInstances(); |
---|
1754 | protected void quickSort(int attIndex, int left, int right) { |
---|
1755 | |
---|
1756 | if (left < right) { |
---|
1757 | int middle = partition(attIndex, left, right); |
---|
1758 | quickSort(attIndex, left, middle); |
---|
1759 | quickSort(attIndex, middle + 1, right); |
---|
1760 | } |
---|
1761 | } |
---|
1762 | |
---|
1763 | /** |
---|
1764 | * Implements computation of the kth-smallest element according |
---|
1765 | * to Manber's "Introduction to Algorithms". |
---|
1766 | * |
---|
1767 | * @param attIndex the attribute's index (index starts with 0) |
---|
1768 | * @param left the first index of the subset (index starts with 0) |
---|
1769 | * @param right the last index of the subset (index starts with 0) |
---|
1770 | * @param k the value of k |
---|
1771 | * |
---|
1772 | * @return the index of the kth-smallest element |
---|
1773 | */ |
---|
1774 | //@ requires 0 <= attIndex && attIndex < numAttributes(); |
---|
1775 | //@ requires 0 <= first && first <= right && right < numInstances(); |
---|
1776 | protected int select(int attIndex, int left, int right, int k) { |
---|
1777 | |
---|
1778 | if (left == right) { |
---|
1779 | return left; |
---|
1780 | } else { |
---|
1781 | int middle = partition(attIndex, left, right); |
---|
1782 | if ((middle - left + 1) >= k) { |
---|
1783 | return select(attIndex, left, middle, k); |
---|
1784 | } else { |
---|
1785 | return select(attIndex, middle + 1, right, k - (middle - left + 1)); |
---|
1786 | } |
---|
1787 | } |
---|
1788 | } |
---|
1789 | |
---|
1790 | /** |
---|
1791 | * Help function needed for stratification of set. |
---|
1792 | * |
---|
1793 | * @param numFolds the number of folds for the stratification |
---|
1794 | */ |
---|
1795 | protected void stratStep (int numFolds){ |
---|
1796 | |
---|
1797 | ArrayList<Instance> newVec = new ArrayList<Instance>(m_Instances.size()); |
---|
1798 | int start = 0, j; |
---|
1799 | |
---|
1800 | // create stratified batch |
---|
1801 | while (newVec.size() < numInstances()) { |
---|
1802 | j = start; |
---|
1803 | while (j < numInstances()) { |
---|
1804 | newVec.add(instance(j)); |
---|
1805 | j = j + numFolds; |
---|
1806 | } |
---|
1807 | start++; |
---|
1808 | } |
---|
1809 | m_Instances = newVec; |
---|
1810 | } |
---|
1811 | |
---|
1812 | /** |
---|
1813 | * Swaps two instances in the set. |
---|
1814 | * |
---|
1815 | * @param i the first instance's index (index starts with 0) |
---|
1816 | * @param j the second instance's index (index starts with 0) |
---|
1817 | */ |
---|
1818 | //@ requires 0 <= i && i < numInstances(); |
---|
1819 | //@ requires 0 <= j && j < numInstances(); |
---|
1820 | public void swap(int i, int j){ |
---|
1821 | |
---|
1822 | Instance in = m_Instances.get(i); |
---|
1823 | m_Instances.set(i, m_Instances.get(j)); |
---|
1824 | m_Instances.set(j, in); |
---|
1825 | } |
---|
1826 | |
---|
1827 | /** |
---|
1828 | * Merges two sets of Instances together. The resulting set will have |
---|
1829 | * all the attributes of the first set plus all the attributes of the |
---|
1830 | * second set. The number of instances in both sets must be the same. |
---|
1831 | * |
---|
1832 | * @param first the first set of Instances |
---|
1833 | * @param second the second set of Instances |
---|
1834 | * @return the merged set of Instances |
---|
1835 | * @throws IllegalArgumentException if the datasets are not the same size |
---|
1836 | */ |
---|
1837 | public static Instances mergeInstances(Instances first, Instances second) { |
---|
1838 | |
---|
1839 | if (first.numInstances() != second.numInstances()) { |
---|
1840 | throw new IllegalArgumentException("Instance sets must be of the same size"); |
---|
1841 | } |
---|
1842 | |
---|
1843 | // Create the vector of merged attributes |
---|
1844 | ArrayList<Attribute> newAttributes = new ArrayList<Attribute>(); |
---|
1845 | for (int i = 0; i < first.numAttributes(); i++) { |
---|
1846 | newAttributes.add(first.attribute(i)); |
---|
1847 | } |
---|
1848 | for (int i = 0; i < second.numAttributes(); i++) { |
---|
1849 | newAttributes.add(second.attribute(i)); |
---|
1850 | } |
---|
1851 | |
---|
1852 | // Create the set of Instances |
---|
1853 | Instances merged = new Instances(first.relationName() + '_' |
---|
1854 | + second.relationName(), |
---|
1855 | newAttributes, |
---|
1856 | first.numInstances()); |
---|
1857 | // Merge each instance |
---|
1858 | for (int i = 0; i < first.numInstances(); i++) { |
---|
1859 | merged.add(first.instance(i).mergeInstance(second.instance(i))); |
---|
1860 | } |
---|
1861 | return merged; |
---|
1862 | } |
---|
1863 | |
---|
1864 | /** |
---|
1865 | * Method for testing this class. |
---|
1866 | * |
---|
1867 | * @param argv should contain one element: the name of an ARFF file |
---|
1868 | */ |
---|
1869 | //@ requires argv != null; |
---|
1870 | //@ requires argv.length == 1; |
---|
1871 | //@ requires argv[0] != null; |
---|
1872 | public static void test(String [] argv) { |
---|
1873 | |
---|
1874 | Instances instances, secondInstances, train, test, empty; |
---|
1875 | Random random = new Random(2); |
---|
1876 | Reader reader; |
---|
1877 | int start, num; |
---|
1878 | ArrayList<Attribute> testAtts; |
---|
1879 | ArrayList<String> testVals; |
---|
1880 | int i,j; |
---|
1881 | |
---|
1882 | try{ |
---|
1883 | if (argv.length > 1) { |
---|
1884 | throw (new Exception("Usage: Instances [<filename>]")); |
---|
1885 | } |
---|
1886 | |
---|
1887 | // Creating set of instances from scratch |
---|
1888 | testVals = new ArrayList<String>(2); |
---|
1889 | testVals.add("first_value"); |
---|
1890 | testVals.add("second_value"); |
---|
1891 | testAtts = new ArrayList<Attribute>(2); |
---|
1892 | testAtts.add(new Attribute("nominal_attribute", testVals)); |
---|
1893 | testAtts.add(new Attribute("numeric_attribute")); |
---|
1894 | instances = new Instances("test_set", testAtts, 10); |
---|
1895 | instances.add(new DenseInstance(instances.numAttributes())); |
---|
1896 | instances.add(new DenseInstance(instances.numAttributes())); |
---|
1897 | instances.add(new DenseInstance(instances.numAttributes())); |
---|
1898 | instances.setClassIndex(0); |
---|
1899 | System.out.println("\nSet of instances created from scratch:\n"); |
---|
1900 | System.out.println(instances); |
---|
1901 | |
---|
1902 | if (argv.length == 1) { |
---|
1903 | String filename = argv[0]; |
---|
1904 | reader = new FileReader(filename); |
---|
1905 | |
---|
1906 | // Read first five instances and print them |
---|
1907 | System.out.println("\nFirst five instances from file:\n"); |
---|
1908 | instances = new Instances(reader, 1); |
---|
1909 | instances.setClassIndex(instances.numAttributes() - 1); |
---|
1910 | i = 0; |
---|
1911 | while ((i < 5) && (instances.readInstance(reader))) { |
---|
1912 | i++; |
---|
1913 | } |
---|
1914 | System.out.println(instances); |
---|
1915 | |
---|
1916 | // Read all the instances in the file |
---|
1917 | reader = new FileReader(filename); |
---|
1918 | instances = new Instances(reader); |
---|
1919 | |
---|
1920 | // Make the last attribute be the class |
---|
1921 | instances.setClassIndex(instances.numAttributes() - 1); |
---|
1922 | |
---|
1923 | // Print header and instances. |
---|
1924 | System.out.println("\nDataset:\n"); |
---|
1925 | System.out.println(instances); |
---|
1926 | System.out.println("\nClass index: "+instances.classIndex()); |
---|
1927 | } |
---|
1928 | |
---|
1929 | // Test basic methods based on class index. |
---|
1930 | System.out.println("\nClass name: "+instances.classAttribute().name()); |
---|
1931 | System.out.println("\nClass index: "+instances.classIndex()); |
---|
1932 | System.out.println("\nClass is nominal: " + |
---|
1933 | instances.classAttribute().isNominal()); |
---|
1934 | System.out.println("\nClass is numeric: " + |
---|
1935 | instances.classAttribute().isNumeric()); |
---|
1936 | System.out.println("\nClasses:\n"); |
---|
1937 | for (i = 0; i < instances.numClasses(); i++) { |
---|
1938 | System.out.println(instances.classAttribute().value(i)); |
---|
1939 | } |
---|
1940 | System.out.println("\nClass values and labels of instances:\n"); |
---|
1941 | for (i = 0; i < instances.numInstances(); i++) { |
---|
1942 | Instance inst = instances.instance(i); |
---|
1943 | System.out.print(inst.classValue() + "\t"); |
---|
1944 | System.out.print(inst.toString(inst.classIndex())); |
---|
1945 | if (instances.instance(i).classIsMissing()) { |
---|
1946 | System.out.println("\tis missing"); |
---|
1947 | } else { |
---|
1948 | System.out.println(); |
---|
1949 | } |
---|
1950 | } |
---|
1951 | |
---|
1952 | // Create random weights. |
---|
1953 | System.out.println("\nCreating random weights for instances."); |
---|
1954 | for (i = 0; i < instances.numInstances(); i++) { |
---|
1955 | instances.instance(i).setWeight(random.nextDouble()); |
---|
1956 | } |
---|
1957 | |
---|
1958 | // Print all instances and their weights (and the sum of weights). |
---|
1959 | System.out.println("\nInstances and their weights:\n"); |
---|
1960 | System.out.println(instances.instancesAndWeights()); |
---|
1961 | System.out.print("\nSum of weights: "); |
---|
1962 | System.out.println(instances.sumOfWeights()); |
---|
1963 | |
---|
1964 | // Insert an attribute |
---|
1965 | secondInstances = new Instances(instances); |
---|
1966 | Attribute testAtt = new Attribute("Inserted"); |
---|
1967 | secondInstances.insertAttributeAt(testAtt, 0); |
---|
1968 | System.out.println("\nSet with inserted attribute:\n"); |
---|
1969 | System.out.println(secondInstances); |
---|
1970 | System.out.println("\nClass name: " |
---|
1971 | + secondInstances.classAttribute().name()); |
---|
1972 | |
---|
1973 | // Delete the attribute |
---|
1974 | secondInstances.deleteAttributeAt(0); |
---|
1975 | System.out.println("\nSet with attribute deleted:\n"); |
---|
1976 | System.out.println(secondInstances); |
---|
1977 | System.out.println("\nClass name: " |
---|
1978 | + secondInstances.classAttribute().name()); |
---|
1979 | |
---|
1980 | // Test if headers are equal |
---|
1981 | System.out.println("\nHeaders equal: "+ |
---|
1982 | instances.equalHeaders(secondInstances) + "\n"); |
---|
1983 | |
---|
1984 | // Print data in internal format. |
---|
1985 | System.out.println("\nData (internal values):\n"); |
---|
1986 | for (i = 0; i < instances.numInstances(); i++) { |
---|
1987 | for (j = 0; j < instances.numAttributes(); j++) { |
---|
1988 | if (instances.instance(i).isMissing(j)) { |
---|
1989 | System.out.print("? "); |
---|
1990 | } else { |
---|
1991 | System.out.print(instances.instance(i).value(j) + " "); |
---|
1992 | } |
---|
1993 | } |
---|
1994 | System.out.println(); |
---|
1995 | } |
---|
1996 | |
---|
1997 | // Just print header |
---|
1998 | System.out.println("\nEmpty dataset:\n"); |
---|
1999 | empty = new Instances(instances, 0); |
---|
2000 | System.out.println(empty); |
---|
2001 | System.out.println("\nClass name: "+empty.classAttribute().name()); |
---|
2002 | |
---|
2003 | // Create copy and rename an attribute and a value (if possible) |
---|
2004 | if (empty.classAttribute().isNominal()) { |
---|
2005 | Instances copy = new Instances(empty, 0); |
---|
2006 | copy.renameAttribute(copy.classAttribute(), "new_name"); |
---|
2007 | copy.renameAttributeValue(copy.classAttribute(), |
---|
2008 | copy.classAttribute().value(0), |
---|
2009 | "new_val_name"); |
---|
2010 | System.out.println("\nDataset with names changed:\n" + copy); |
---|
2011 | System.out.println("\nOriginal dataset:\n" + empty); |
---|
2012 | } |
---|
2013 | |
---|
2014 | // Create and prints subset of instances. |
---|
2015 | start = instances.numInstances() / 4; |
---|
2016 | num = instances.numInstances() / 2; |
---|
2017 | System.out.print("\nSubset of dataset: "); |
---|
2018 | System.out.println(num + " instances from " + (start + 1) |
---|
2019 | + ". instance"); |
---|
2020 | secondInstances = new Instances(instances, start, num); |
---|
2021 | System.out.println("\nClass name: " |
---|
2022 | + secondInstances.classAttribute().name()); |
---|
2023 | |
---|
2024 | // Print all instances and their weights (and the sum of weights). |
---|
2025 | System.out.println("\nInstances and their weights:\n"); |
---|
2026 | System.out.println(secondInstances.instancesAndWeights()); |
---|
2027 | System.out.print("\nSum of weights: "); |
---|
2028 | System.out.println(secondInstances.sumOfWeights()); |
---|
2029 | |
---|
2030 | // Create and print training and test sets for 3-fold |
---|
2031 | // cross-validation. |
---|
2032 | System.out.println("\nTrain and test folds for 3-fold CV:"); |
---|
2033 | if (instances.classAttribute().isNominal()) { |
---|
2034 | instances.stratify(3); |
---|
2035 | } |
---|
2036 | for (j = 0; j < 3; j++) { |
---|
2037 | train = instances.trainCV(3,j, new Random(1)); |
---|
2038 | test = instances.testCV(3,j); |
---|
2039 | |
---|
2040 | // Print all instances and their weights (and the sum of weights). |
---|
2041 | System.out.println("\nTrain: "); |
---|
2042 | System.out.println("\nInstances and their weights:\n"); |
---|
2043 | System.out.println(train.instancesAndWeights()); |
---|
2044 | System.out.print("\nSum of weights: "); |
---|
2045 | System.out.println(train.sumOfWeights()); |
---|
2046 | System.out.println("\nClass name: "+train.classAttribute().name()); |
---|
2047 | System.out.println("\nTest: "); |
---|
2048 | System.out.println("\nInstances and their weights:\n"); |
---|
2049 | System.out.println(test.instancesAndWeights()); |
---|
2050 | System.out.print("\nSum of weights: "); |
---|
2051 | System.out.println(test.sumOfWeights()); |
---|
2052 | System.out.println("\nClass name: "+test.classAttribute().name()); |
---|
2053 | } |
---|
2054 | |
---|
2055 | // Randomize instances and print them. |
---|
2056 | System.out.println("\nRandomized dataset:"); |
---|
2057 | instances.randomize(random); |
---|
2058 | |
---|
2059 | // Print all instances and their weights (and the sum of weights). |
---|
2060 | System.out.println("\nInstances and their weights:\n"); |
---|
2061 | System.out.println(instances.instancesAndWeights()); |
---|
2062 | System.out.print("\nSum of weights: "); |
---|
2063 | System.out.println(instances.sumOfWeights()); |
---|
2064 | |
---|
2065 | // Sort instances according to first attribute and |
---|
2066 | // print them. |
---|
2067 | System.out.print("\nInstances sorted according to first attribute:\n "); |
---|
2068 | instances.sort(0); |
---|
2069 | |
---|
2070 | // Print all instances and their weights (and the sum of weights). |
---|
2071 | System.out.println("\nInstances and their weights:\n"); |
---|
2072 | System.out.println(instances.instancesAndWeights()); |
---|
2073 | System.out.print("\nSum of weights: "); |
---|
2074 | System.out.println(instances.sumOfWeights()); |
---|
2075 | } catch (Exception e) { |
---|
2076 | e.printStackTrace(); |
---|
2077 | } |
---|
2078 | } |
---|
2079 | |
---|
2080 | /** |
---|
2081 | * Main method for this class. The following calls are possible: |
---|
2082 | * <ul> |
---|
2083 | * <li> |
---|
2084 | * <code>weka.core.Instances</code> help<br/> |
---|
2085 | * prints a short list of possible commands. |
---|
2086 | * </li> |
---|
2087 | * <li> |
---|
2088 | * <code>weka.core.Instances</code> <filename><br/> |
---|
2089 | * prints a summary of a set of instances. |
---|
2090 | * </li> |
---|
2091 | * <li> |
---|
2092 | * <code>weka.core.Instances</code> merge <filename1> <filename2><br/> |
---|
2093 | * merges the two datasets (must have same number of instances) and |
---|
2094 | * outputs the results on stdout. |
---|
2095 | * </li> |
---|
2096 | * <li> |
---|
2097 | * <code>weka.core.Instances</code> append <filename1> <filename2><br/> |
---|
2098 | * appends the second dataset to the first one (must have same headers) and |
---|
2099 | * outputs the results on stdout. |
---|
2100 | * </li> |
---|
2101 | * <li> |
---|
2102 | * <code>weka.core.Instances</code> headers <filename1> <filename2><br/> |
---|
2103 | * Compares the headers of the two datasets and prints whether they match |
---|
2104 | * or not. |
---|
2105 | * </li> |
---|
2106 | * <li> |
---|
2107 | * <code>weka.core.Instances</code> randomize <seed> <filename><br/> |
---|
2108 | * randomizes the dataset with the given seed and outputs the result on stdout. |
---|
2109 | * </li> |
---|
2110 | * </ul> |
---|
2111 | * |
---|
2112 | * @param args the commandline parameters |
---|
2113 | */ |
---|
2114 | public static void main(String[] args) { |
---|
2115 | |
---|
2116 | try { |
---|
2117 | Instances i; |
---|
2118 | // read from stdin and print statistics |
---|
2119 | if (args.length == 0) { |
---|
2120 | DataSource source = new DataSource(System.in); |
---|
2121 | i = source.getDataSet(); |
---|
2122 | System.out.println(i.toSummaryString()); |
---|
2123 | } |
---|
2124 | // read file and print statistics |
---|
2125 | else if ((args.length == 1) && (!args[0].equals("-h")) && (!args[0].equals("help"))) { |
---|
2126 | DataSource source = new DataSource(args[0]); |
---|
2127 | i = source.getDataSet(); |
---|
2128 | System.out.println(i.toSummaryString()); |
---|
2129 | } |
---|
2130 | // read two files, merge them and print result to stdout |
---|
2131 | else if ((args.length == 3) && (args[0].toLowerCase().equals("merge"))) { |
---|
2132 | DataSource source1 = new DataSource(args[1]); |
---|
2133 | DataSource source2 = new DataSource(args[2]); |
---|
2134 | i = Instances.mergeInstances(source1.getDataSet(), source2.getDataSet()); |
---|
2135 | System.out.println(i); |
---|
2136 | } |
---|
2137 | // read two files, append them and print result to stdout |
---|
2138 | else if ((args.length == 3) && (args[0].toLowerCase().equals("append"))) { |
---|
2139 | DataSource source1 = new DataSource(args[1]); |
---|
2140 | DataSource source2 = new DataSource(args[2]); |
---|
2141 | String msg = source1.getStructure().equalHeadersMsg(source2.getStructure()); |
---|
2142 | if (msg != null) |
---|
2143 | throw new Exception("The two datasets have different headers:\n" + msg); |
---|
2144 | Instances structure = source1.getStructure(); |
---|
2145 | System.out.println(source1.getStructure()); |
---|
2146 | while (source1.hasMoreElements(structure)) |
---|
2147 | System.out.println(source1.nextElement(structure)); |
---|
2148 | structure = source2.getStructure(); |
---|
2149 | while (source2.hasMoreElements(structure)) |
---|
2150 | System.out.println(source2.nextElement(structure)); |
---|
2151 | } |
---|
2152 | // read two files and compare their headers |
---|
2153 | else if ((args.length == 3) && (args[0].toLowerCase().equals("headers"))) { |
---|
2154 | DataSource source1 = new DataSource(args[1]); |
---|
2155 | DataSource source2 = new DataSource(args[2]); |
---|
2156 | String msg = source1.getStructure().equalHeadersMsg(source2.getStructure()); |
---|
2157 | if (msg == null) |
---|
2158 | System.out.println("Headers match"); |
---|
2159 | else |
---|
2160 | System.out.println("Headers don't match:\n" + msg); |
---|
2161 | } |
---|
2162 | // read file and seed value, randomize data and print result to stdout |
---|
2163 | else if ((args.length == 3) && (args[0].toLowerCase().equals("randomize"))) { |
---|
2164 | DataSource source = new DataSource(args[2]); |
---|
2165 | i = source.getDataSet(); |
---|
2166 | i.randomize(new Random(Integer.parseInt(args[1]))); |
---|
2167 | System.out.println(i); |
---|
2168 | } |
---|
2169 | // wrong parameters or help |
---|
2170 | else { |
---|
2171 | System.err.println( |
---|
2172 | "\nUsage:\n" |
---|
2173 | // help |
---|
2174 | + "\tweka.core.Instances help\n" |
---|
2175 | + "\t\tPrints this help\n" |
---|
2176 | // stats |
---|
2177 | + "\tweka.core.Instances <filename>\n" |
---|
2178 | + "\t\tOutputs dataset statistics\n" |
---|
2179 | // merge |
---|
2180 | + "\tweka.core.Instances merge <filename1> <filename2>\n" |
---|
2181 | + "\t\tMerges the datasets (must have same number of rows).\n" |
---|
2182 | + "\t\tGenerated dataset gets output on stdout.\n" |
---|
2183 | // append |
---|
2184 | + "\tweka.core.Instances append <filename1> <filename2>\n" |
---|
2185 | + "\t\tAppends the second dataset to the first (must have same number of attributes).\n" |
---|
2186 | + "\t\tGenerated dataset gets output on stdout.\n" |
---|
2187 | // headers |
---|
2188 | + "\tweka.core.Instances headers <filename1> <filename2>\n" |
---|
2189 | + "\t\tCompares the structure of the two datasets and outputs whether they\n" |
---|
2190 | + "\t\tdiffer or not.\n" |
---|
2191 | // randomize |
---|
2192 | + "\tweka.core.Instances randomize <seed> <filename>\n" |
---|
2193 | + "\t\tRandomizes the dataset and outputs it on stdout.\n" |
---|
2194 | ); |
---|
2195 | } |
---|
2196 | } |
---|
2197 | catch (Exception ex) { |
---|
2198 | ex.printStackTrace(); |
---|
2199 | System.err.println(ex.getMessage()); |
---|
2200 | } |
---|
2201 | } |
---|
2202 | |
---|
2203 | /** |
---|
2204 | * Returns the revision string. |
---|
2205 | * |
---|
2206 | * @return the revision |
---|
2207 | */ |
---|
2208 | public String getRevision() { |
---|
2209 | return RevisionUtils.extract("$Revision: 5987 $"); |
---|
2210 | } |
---|
2211 | } |
---|