source: branches/MetisMQI/src/main/java/weka/filters/unsupervised/attribute/InterquartileRange.java

Last change on this file was 29, checked in by gnappo, 15 years ago

Taggata versione per la demo e aggiunto branch.

File size: 28.5 KB
Line 
1/*
2 *    This program is free software; you can redistribute it and/or modify
3 *    it under the terms of the GNU General Public License as published by
4 *    the Free Software Foundation; either version 2 of the License, or
5 *    (at your option) any later version.
6 *
7 *    This program is distributed in the hope that it will be useful,
8 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10 *    GNU General Public License for more details.
11 *
12 *    You should have received a copy of the GNU General Public License
13 *    along with this program; if not, write to the Free Software
14 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 * InterquartileRange.java
19 * Copyright (C) 2006 University of Waikato, Hamilton, New Zealand
20 */
21
22package weka.filters.unsupervised.attribute;
23
24import weka.core.Attribute;
25import weka.core.Capabilities;
26import weka.core.FastVector;
27import weka.core.Instance; 
28import weka.core.DenseInstance;
29import weka.core.Instances;
30import weka.core.Option;
31import weka.core.Range;
32import weka.core.RevisionUtils;
33import weka.core.Utils;
34import weka.core.Capabilities.Capability;
35import weka.filters.SimpleBatchFilter;
36
37import java.util.Enumeration;
38import java.util.Vector;
39
40/**
41 <!-- globalinfo-start -->
42 * A filter for detecting outliers and extreme values based on interquartile ranges. The filter skips the class attribute.<br/>
43 * <br/>
44 * Outliers:<br/>
45 *   Q3 + OF*IQR &lt; x &lt;= Q3 + EVF*IQR<br/>
46 *   or<br/>
47 *   Q1 - EVF*IQR &lt;= x &lt; Q1 - OF*IQR<br/>
48 * <br/>
49 * Extreme values:<br/>
50 *   x &gt; Q3 + EVF*IQR<br/>
51 *   or<br/>
52 *   x &lt; Q1 - EVF*IQR<br/>
53 * <br/>
54 * Key:<br/>
55 *   Q1  = 25% quartile<br/>
56 *   Q3  = 75% quartile<br/>
57 *   IQR = Interquartile Range, difference between Q1 and Q3<br/>
58 *   OF  = Outlier Factor<br/>
59 *   EVF = Extreme Value Factor
60 * <p/>
61 <!-- globalinfo-end -->
62 *
63 <!-- options-start -->
64 * Valid options are: <p/>
65 *
66 * <pre> -D
67 *  Turns on output of debugging information.</pre>
68 *
69 * <pre> -R &lt;col1,col2-col4,...&gt;
70 *  Specifies list of columns to base outlier/extreme value detection
71 *  on. If an instance is considered in at least one of those
72 *  attributes an outlier/extreme value, it is tagged accordingly.
73 *  'first' and 'last' are valid indexes.
74 *  (default none)</pre>
75 *
76 * <pre> -O &lt;num&gt;
77 *  The factor for outlier detection.
78 *  (default: 3)</pre>
79 *
80 * <pre> -E &lt;num&gt;
81 *  The factor for extreme values detection.
82 *  (default: 2*Outlier Factor)</pre>
83 *
84 * <pre> -E-as-O
85 *  Tags extreme values also as outliers.
86 *  (default: off)</pre>
87 *
88 * <pre> -P
89 *  Generates Outlier/ExtremeValue pair for each numeric attribute in
90 *  the range, not just a single indicator pair for all the attributes.
91 *  (default: off)</pre>
92 *
93 * <pre> -M
94 *  Generates an additional attribute 'Offset' per Outlier/ExtremeValue
95 *  pair that contains the multiplier that the value is off the median.
96 *     value = median + 'multiplier' * IQR
97 * Note: implicitely sets '-P'. (default: off)</pre>
98 *
99 <!-- options-end -->
100 *
101 * Thanks to Dale for a few brainstorming sessions.
102 *
103 * @author  Dale Fletcher (dale at cs dot waikato dot ac dot nz)
104 * @author  fracpete (fracpete at waikato dot ac dot nz)
105 * @version $Revision: 5987 $
106 */
107public class InterquartileRange
108  extends SimpleBatchFilter {
109
110  /** for serialization */
111  private static final long serialVersionUID = -227879653639723030L;
112
113  /** indicator for non-numeric attributes */
114  public final static int NON_NUMERIC = -1;
115 
116  /** the attribute range to work on */
117  protected Range m_Attributes = new Range("first-last");
118 
119  /** the generated indices (only for performance reasons) */
120  protected int[] m_AttributeIndices = null;
121
122  /** the factor for detecting outliers */
123  protected double m_OutlierFactor = 3;
124 
125  /** the factor for detecting extreme values, by default 2*m_OutlierFactor */
126  protected double m_ExtremeValuesFactor = 2*m_OutlierFactor;
127 
128  /** whether extreme values are also tagged as outliers */
129  protected boolean m_ExtremeValuesAsOutliers = false;
130
131  /** the upper extreme value threshold (= Q3 + EVF*IQR) */
132  protected double[] m_UpperExtremeValue = null;
133
134  /** the upper outlier threshold (= Q3 + OF*IQR) */
135  protected double[] m_UpperOutlier = null;
136
137  /** the lower outlier threshold (= Q1 - OF*IQR) */
138  protected double[] m_LowerOutlier = null;
139
140  /** the interquartile range  */
141  protected double[] m_IQR = null;
142
143  /** the median  */
144  protected double[] m_Median = null;
145
146  /** the lower extreme value threshold (= Q1 - EVF*IQR) */
147  protected double[] m_LowerExtremeValue = null;
148 
149  /** whether to generate Outlier/ExtremeValue attributes for each attribute
150   * instead of a general one */
151  protected boolean m_DetectionPerAttribute = false;
152
153  /** the position of the outlier attribute */
154  protected int[] m_OutlierAttributePosition = null;
155
156  /** whether to add another attribute called "Offset", that lists the
157   * 'multiplier' by which the outlier/extreme value is away from the median,
158   * i.e., value = median + 'multiplier' * IQR <br/>
159   * automatically enables m_DetectionPerAttribute!
160   */
161  protected boolean m_OutputOffsetMultiplier = false;
162 
163  /**
164   * Returns a string describing this filter
165   *
166   * @return            a description of the filter suitable for
167   *                    displaying in the explorer/experimenter gui
168   */
169  public String globalInfo() {
170    return 
171        "A filter for detecting outliers and extreme values based on "
172      + "interquartile ranges. The filter skips the class attribute.\n\n"
173      + "Outliers:\n"
174      + "  Q3 + OF*IQR < x <= Q3 + EVF*IQR\n"
175      + "  or\n"
176      + "  Q1 - EVF*IQR <= x < Q1 - OF*IQR\n"
177      + "\n"
178      + "Extreme values:\n"
179      + "  x > Q3 + EVF*IQR\n"
180      + "  or\n"
181      + "  x < Q1 - EVF*IQR\n"
182      + "\n"
183      + "Key:\n"
184      + "  Q1  = 25% quartile\n"
185      + "  Q3  = 75% quartile\n"
186      + "  IQR = Interquartile Range, difference between Q1 and Q3\n"
187      + "  OF  = Outlier Factor\n"
188      + "  EVF = Extreme Value Factor";
189  }
190
191  /**
192   * Returns an enumeration describing the available options.
193   *
194   * @return            an enumeration of all the available options.
195   */
196  public Enumeration listOptions() {
197    Vector result = new Vector();
198    Enumeration enm = super.listOptions();
199    while (enm.hasMoreElements())
200      result.add(enm.nextElement());
201     
202    result.addElement(new Option(
203        "\tSpecifies list of columns to base outlier/extreme value detection\n"
204        + "\ton. If an instance is considered in at least one of those\n"
205        + "\tattributes an outlier/extreme value, it is tagged accordingly.\n"
206        + " 'first' and 'last' are valid indexes.\n"
207        + "\t(default none)",
208        "R", 1, "-R <col1,col2-col4,...>"));
209
210    result.addElement(new Option(
211        "\tThe factor for outlier detection.\n"
212        + "\t(default: 3)",
213        "O", 1, "-O <num>"));
214
215    result.addElement(new Option(
216        "\tThe factor for extreme values detection.\n"
217        + "\t(default: 2*Outlier Factor)",
218        "E", 1, "-E <num>"));
219
220    result.addElement(new Option(
221        "\tTags extreme values also as outliers.\n"
222        + "\t(default: off)",
223        "E-as-O", 0, "-E-as-O"));
224
225    result.addElement(new Option(
226        "\tGenerates Outlier/ExtremeValue pair for each numeric attribute in\n"
227        + "\tthe range, not just a single indicator pair for all the attributes.\n"
228        + "\t(default: off)",
229        "P", 0, "-P"));
230
231    result.addElement(new Option(
232        "\tGenerates an additional attribute 'Offset' per Outlier/ExtremeValue\n"
233        + "\tpair that contains the multiplier that the value is off the median.\n"
234        + "\t   value = median + 'multiplier' * IQR\n"
235        + "Note: implicitely sets '-P'."
236        + "\t(default: off)",
237        "M", 0, "-M"));
238
239    return result.elements();
240  }
241
242  /**
243   * Parses a list of options for this object. <p/>
244   *
245   <!-- options-start -->
246   * Valid options are: <p/>
247   *
248   * <pre> -D
249   *  Turns on output of debugging information.</pre>
250   *
251   * <pre> -R &lt;col1,col2-col4,...&gt;
252   *  Specifies list of columns to base outlier/extreme value detection
253   *  on. If an instance is considered in at least one of those
254   *  attributes an outlier/extreme value, it is tagged accordingly.
255   *  'first' and 'last' are valid indexes.
256   *  (default none)</pre>
257   *
258   * <pre> -O &lt;num&gt;
259   *  The factor for outlier detection.
260   *  (default: 3)</pre>
261   *
262   * <pre> -E &lt;num&gt;
263   *  The factor for extreme values detection.
264   *  (default: 2*Outlier Factor)</pre>
265   *
266   * <pre> -E-as-O
267   *  Tags extreme values also as outliers.
268   *  (default: off)</pre>
269   *
270   * <pre> -P
271   *  Generates Outlier/ExtremeValue pair for each numeric attribute in
272   *  the range, not just a single indicator pair for all the attributes.
273   *  (default: off)</pre>
274   *
275   * <pre> -M
276   *  Generates an additional attribute 'Offset' per Outlier/ExtremeValue
277   *  pair that contains the multiplier that the value is off the median.
278   *     value = median + 'multiplier' * IQR
279   * Note: implicitely sets '-P'. (default: off)</pre>
280   *
281   <!-- options-end -->
282   *
283   * @param options     the list of options as an array of strings
284   * @throws Exception  if an option is not supported
285   */
286  public void setOptions(String[] options) throws Exception {
287    String        tmpStr;
288
289    super.setOptions(options);
290
291    tmpStr = Utils.getOption("R", options);
292    if (tmpStr.length() != 0)
293      setAttributeIndices(tmpStr);
294    else
295      setAttributeIndices("first-last");
296
297    tmpStr = Utils.getOption("O", options);
298    if (tmpStr.length() != 0)
299      setOutlierFactor(Double.parseDouble(tmpStr));
300    else
301      setOutlierFactor(3);
302
303    tmpStr = Utils.getOption("E", options);
304    if (tmpStr.length() != 0)
305      setExtremeValuesFactor(Double.parseDouble(tmpStr));
306    else
307      setExtremeValuesFactor(2*getOutlierFactor());
308   
309    setExtremeValuesAsOutliers(Utils.getFlag("E-as-O", options));
310   
311    setDetectionPerAttribute(Utils.getFlag("P", options));
312
313    setOutputOffsetMultiplier(Utils.getFlag("M", options));
314  }
315
316  /**
317   * Gets the current settings of the filter.
318   *
319   * @return            an array of strings suitable for passing to setOptions
320   */
321  public String[] getOptions() {
322    Vector        result;
323    String[]      options;
324    int           i;
325
326    result = new Vector();
327
328    options = super.getOptions();
329    for (i = 0; i < options.length; i++)
330      result.add(options[i]);
331
332    result.add("-R");
333    if (!getAttributeIndices().equals(""))
334      result.add(getAttributeIndices());
335    else
336      result.add("first-last");
337   
338    result.add("-O");
339    result.add("" + getOutlierFactor());
340
341    result.add("-E");
342    result.add("" + getExtremeValuesFactor());
343
344    if (getExtremeValuesAsOutliers())
345      result.add("-E-as-O");
346   
347    if (getDetectionPerAttribute())
348      result.add("-P");
349   
350    if (getOutputOffsetMultiplier())
351      result.add("-M");
352   
353    return (String[]) result.toArray(new String[result.size()]);
354  }
355
356  /**
357   * Returns the tip text for this property
358   *
359   * @return tip text for this property suitable for
360   * displaying in the explorer/experimenter gui
361   */
362  public String attributeIndicesTipText() {
363    return 
364        "Specify range of attributes to act on; "
365      + " this is a comma separated list of attribute indices, with"
366      + " \"first\" and \"last\" valid values; specify an inclusive"
367      + " range with \"-\", eg: \"first-3,5,6-10,last\".";
368  }
369
370  /**
371   * Gets the current range selection
372   *
373   * @return            a string containing a comma separated list of ranges
374   */
375  public String getAttributeIndices() {
376    return m_Attributes.getRanges();
377  }
378
379  /**
380   * Sets which attributes are to be used for interquartile calculations and
381   * outlier/extreme value detection (only numeric attributes among the
382   * selection will be used).
383   *
384   * @param value       a string representing the list of attributes. Since
385   *                    the string will typically come from a user, attributes
386   *                    are indexed from 1. <br> eg: first-3,5,6-last
387   * @throws IllegalArgumentException if an invalid range list is supplied
388   */
389  public void setAttributeIndices(String value) {
390    m_Attributes.setRanges(value);
391  }
392
393  /**
394   * Sets which attributes are to be used for interquartile calculations and
395   * outlier/extreme value detection (only numeric attributes among the
396   * selection will be used).
397   *
398   * @param value       an array containing indexes of attributes to work on.
399   *                    Since the array will typically come from a program,
400   *                    attributes are indexed from 0.
401   * @throws IllegalArgumentException if an invalid set of ranges is supplied
402   */
403  public void setAttributeIndicesArray(int[] value) {
404    setAttributeIndices(Range.indicesToRangeList(value));
405  }
406
407  /**
408   * Returns the tip text for this property
409   *
410   * @return            tip text for this property suitable for
411   *                    displaying in the explorer/experimenter gui
412   */
413  public String outlierFactorTipText() {
414    return "The factor for determining the thresholds for outliers.";
415  }
416
417  /**
418   * Sets the factor for determining the thresholds for outliers.
419   *
420   * @param value       the factor.
421   */
422  public void setOutlierFactor(double value) {
423    if (value >= getExtremeValuesFactor())
424      System.err.println("OutlierFactor must be smaller than ExtremeValueFactor");
425    else
426      m_OutlierFactor = value;
427  }
428
429  /**
430   * Gets the factor for determining the thresholds for outliers.
431   *
432   * @return            the factor.
433   */
434  public double getOutlierFactor() {
435    return m_OutlierFactor;
436  }
437
438  /**
439   * Returns the tip text for this property
440   *
441   * @return            tip text for this property suitable for
442   *                    displaying in the explorer/experimenter gui
443   */
444  public String extremeValuesFactorTipText() {
445    return "The factor for determining the thresholds for extreme values.";
446  }
447
448  /**
449   * Sets the factor for determining the thresholds for extreme values.
450   *
451   * @param value       the factor.
452   */
453  public void setExtremeValuesFactor(double value) {
454    if (value <= getOutlierFactor())
455      System.err.println("ExtremeValuesFactor must be greater than OutlierFactor!");
456    else
457      m_ExtremeValuesFactor = value;
458  }
459
460  /**
461   * Gets the factor for determining the thresholds for extreme values.
462   *
463   * @return            the factor.
464   */
465  public double getExtremeValuesFactor() {
466    return m_ExtremeValuesFactor;
467  }
468
469  /**
470   * Returns the tip text for this property
471   *
472   * @return            tip text for this property suitable for
473   *                    displaying in the explorer/experimenter gui
474   */
475  public String extremeValuesAsOutliersTipText() {
476    return "Whether to tag extreme values also as outliers.";
477  }
478
479  /**
480   * Set whether extreme values are also tagged as outliers.
481   *
482   * @param value       whether or not to tag extreme values also as outliers.
483   */
484  public void setExtremeValuesAsOutliers(boolean value) {
485    m_ExtremeValuesAsOutliers = value;
486  }
487
488  /**
489   * Get whether extreme values are also tagged as outliers.
490   *
491   * @return            true if extreme values are also tagged as outliers.
492   */
493  public boolean getExtremeValuesAsOutliers() {
494    return m_ExtremeValuesAsOutliers;
495  }
496
497  /**
498   * Returns the tip text for this property
499   *
500   * @return            tip text for this property suitable for
501   *                    displaying in the explorer/experimenter gui
502   */
503  public String detectionPerAttributeTipText() {
504    return 
505        "Generates Outlier/ExtremeValue attribute pair for each numeric "
506      + "attribute, not just a single pair for all numeric attributes together.";
507  }
508
509  /**
510   * Set whether an Outlier/ExtremeValue attribute pair is generated for
511   * each numeric attribute ("true") or just one pair for all numeric
512   * attributes together ("false").
513   *
514   * @param value       whether or not to generate indicator attribute pairs
515   *                    for each numeric attribute.
516   */
517  public void setDetectionPerAttribute(boolean value) {
518    m_DetectionPerAttribute = value;
519    if (!m_DetectionPerAttribute)
520      m_OutputOffsetMultiplier = false;
521  }
522
523  /**
524   * Gets whether an Outlier/ExtremeValue attribute pair is generated for
525   * each numeric attribute ("true") or just one pair for all numeric
526   * attributes together ("false").
527   *
528   * @return            true if indicator attribute pairs are generated for
529   *                    each numeric attribute.
530   */
531  public boolean getDetectionPerAttribute() {
532    return m_DetectionPerAttribute;
533  }
534
535  /**
536   * Returns the tip text for this property
537   *
538   * @return            tip text for this property suitable for
539   *                    displaying in the explorer/experimenter gui
540   */
541  public String outputOffsetMultiplierTipText() {
542    return 
543        "Generates an additional attribute 'Offset' that contains the "
544      + "multiplier the value is off the median: "
545      + "value = median + 'multiplier' * IQR";
546  }
547
548  /**
549   * Set whether an additional attribute "Offset" is generated per
550   * Outlier/ExtremeValue attribute pair that lists the multiplier the value
551   * is off the median: value = median + 'multiplier' * IQR.
552   *
553   * @param value       whether or not to generate the additional attribute.
554   */
555  public void setOutputOffsetMultiplier(boolean value) {
556    m_OutputOffsetMultiplier = value;
557    if (m_OutputOffsetMultiplier)
558      m_DetectionPerAttribute = true;
559  }
560
561  /**
562   * Gets whether an additional attribute "Offset" is generated per
563   * Outlier/ExtremeValue attribute pair that lists the multiplier the value
564   * is off the median: value = median + 'multiplier' * IQR.
565   *
566   * @return            true if the additional attribute is generated.
567   */
568  public boolean getOutputOffsetMultiplier() {
569    return m_OutputOffsetMultiplier;
570  }
571
572  /**
573   * Returns the Capabilities of this filter.
574   *
575   * @return            the capabilities of this object
576   * @see               Capabilities
577   */
578  public Capabilities getCapabilities() {
579    Capabilities result = super.getCapabilities();
580    result.disableAll();
581
582    // attributes
583    result.enableAllAttributes();
584    result.enable(Capability.MISSING_VALUES);
585   
586    // class
587    result.enableAllClasses();
588    result.enable(Capability.MISSING_CLASS_VALUES);
589    result.enable(Capability.NO_CLASS);
590   
591    return result;
592  }
593
594  /**
595   * Determines the output format based on the input format and returns
596   * this. In case the output format cannot be returned immediately, i.e.,
597   * hasImmediateOutputFormat() returns false, then this method will called
598   * from batchFinished() after the call of preprocess(Instances), in which,
599   * e.g., statistics for the actual processing step can be gathered.
600   *
601   * @param inputFormat     the input format to base the output format on
602   * @return                the output format
603   * @throws Exception      in case the determination goes wrong
604   * @see                   #hasImmediateOutputFormat()
605   * @see                   #batchFinished()
606   */
607  protected Instances determineOutputFormat(Instances inputFormat)
608      throws Exception {
609   
610    FastVector          atts;
611    FastVector          values;
612    Instances           result;
613    int                 i;
614
615    // attributes must be numeric
616    m_Attributes.setUpper(inputFormat.numAttributes() - 1);
617    m_AttributeIndices = m_Attributes.getSelection();
618    for (i = 0; i < m_AttributeIndices.length; i++) {
619      // ignore class
620      if (m_AttributeIndices[i] == inputFormat.classIndex()) {
621        m_AttributeIndices[i] = NON_NUMERIC;
622        continue;
623      }
624      // not numeric -> ignore it
625      if (!inputFormat.attribute(m_AttributeIndices[i]).isNumeric())
626        m_AttributeIndices[i] = NON_NUMERIC;
627    }
628   
629    // get old attributes
630    atts = new FastVector();
631    for (i = 0; i < inputFormat.numAttributes(); i++)
632      atts.addElement(inputFormat.attribute(i));
633   
634    if (!getDetectionPerAttribute()) {
635      m_OutlierAttributePosition    = new int[1];
636      m_OutlierAttributePosition[0] = atts.size();
637     
638      // add 2 new attributes
639      values = new FastVector();
640      values.addElement("no");
641      values.addElement("yes");
642      atts.addElement(new Attribute("Outlier", values));
643     
644      values = new FastVector();
645      values.addElement("no");
646      values.addElement("yes");
647      atts.addElement(new Attribute("ExtremeValue", values));
648    }
649    else {
650      m_OutlierAttributePosition = new int[m_AttributeIndices.length];
651     
652      for (i = 0; i < m_AttributeIndices.length; i++) {
653        if (m_AttributeIndices[i] == NON_NUMERIC)
654          continue;
655       
656        m_OutlierAttributePosition[i] = atts.size();
657
658        // add new attributes
659        values = new FastVector();
660        values.addElement("no");
661        values.addElement("yes");
662        atts.addElement(
663            new Attribute(
664                inputFormat.attribute(
665                    m_AttributeIndices[i]).name() + "_Outlier", values));
666       
667        values = new FastVector();
668        values.addElement("no");
669        values.addElement("yes");
670        atts.addElement(
671            new Attribute(
672                inputFormat.attribute(
673                    m_AttributeIndices[i]).name() + "_ExtremeValue", values));
674
675        if (getOutputOffsetMultiplier())
676          atts.addElement(
677              new Attribute(
678                  inputFormat.attribute(
679                      m_AttributeIndices[i]).name() + "_Offset"));
680      }
681    }
682
683    // generate header
684    result = new Instances(inputFormat.relationName(), atts, 0);
685    result.setClassIndex(inputFormat.classIndex());
686   
687    return result;
688  }
689
690  /**
691   * computes the thresholds for outliers and extreme values
692   *
693   * @param instances   the data to work on
694   */
695  protected void computeThresholds(Instances instances) {
696    int         i;
697    double[]    values;
698    int[]       sortedIndices;
699    int         half;
700    int         quarter;
701    double      q1;
702    double      q2;
703    double      q3;
704   
705    m_UpperExtremeValue = new double[m_AttributeIndices.length];
706    m_UpperOutlier      = new double[m_AttributeIndices.length];
707    m_LowerOutlier      = new double[m_AttributeIndices.length];
708    m_LowerExtremeValue = new double[m_AttributeIndices.length];
709    m_Median            = new double[m_AttributeIndices.length];
710    m_IQR               = new double[m_AttributeIndices.length];
711   
712    for (i = 0; i < m_AttributeIndices.length; i++) {
713      // non-numeric attribute?
714      if (m_AttributeIndices[i] == NON_NUMERIC)
715        continue;
716     
717      // sort attribute data
718      values        = instances.attributeToDoubleArray(m_AttributeIndices[i]);
719      sortedIndices = Utils.sort(values);
720     
721      // determine indices
722      half    = sortedIndices.length / 2;
723      quarter = half / 2;
724     
725      if (sortedIndices.length % 2 == 1) {
726        q2 = values[sortedIndices[half]];
727      }
728      else {
729        q2 = (values[sortedIndices[half]] + values[sortedIndices[half + 1]]) / 2;
730      }
731     
732      if (half % 2 == 1) {
733        q1 = values[sortedIndices[quarter]];
734        q3 = values[sortedIndices[sortedIndices.length - quarter - 1]];
735      }
736      else {
737        q1 = (values[sortedIndices[quarter]] + values[sortedIndices[quarter + 1]]) / 2;
738        q3 = (values[sortedIndices[sortedIndices.length - quarter - 1]] + values[sortedIndices[sortedIndices.length - quarter]]) / 2;
739      }
740     
741      // determine thresholds and other values
742      m_Median[i]            = q2;
743      m_IQR[i]               = q3 - q1;
744      m_UpperExtremeValue[i] = q3 + getExtremeValuesFactor() * m_IQR[i];
745      m_UpperOutlier[i]      = q3 + getOutlierFactor()       * m_IQR[i];
746      m_LowerOutlier[i]      = q1 - getOutlierFactor()       * m_IQR[i];
747      m_LowerExtremeValue[i] = q1 - getExtremeValuesFactor() * m_IQR[i];
748    }
749  }
750 
751  /**
752   * returns whether the instance has an outlier in the specified attribute
753   * or not
754   *
755   * @param inst        the instance to test
756   * @param index       the attribute index
757   * @return            true if the instance is an outlier
758   */
759  protected boolean isOutlier(Instance inst, int index) {
760    boolean     result;
761    double      value;
762
763    value  = inst.value(m_AttributeIndices[index]);
764    result =    ((m_UpperOutlier[index]      <  value) && (value <= m_UpperExtremeValue[index]))
765             || ((m_LowerExtremeValue[index] <= value) && (value <  m_LowerOutlier[index]));
766   
767    return result;
768  }
769 
770  /**
771   * returns whether the instance is an outlier or not
772   *
773   * @param inst        the instance to test
774   * @return            true if the instance is an outlier
775   */
776  protected boolean isOutlier(Instance inst) {
777    boolean     result;
778    int         i;
779
780    result = false;
781   
782    for (i = 0; i < m_AttributeIndices.length; i++) {
783      // non-numeric attribute?
784      if (m_AttributeIndices[i] == NON_NUMERIC)
785        continue;
786
787      result = isOutlier(inst, i);
788     
789      if (result)
790        break;
791    }
792   
793    return result;
794  }
795 
796  /**
797   * returns whether the instance has an extreme value in the specified
798   * attribute or not
799   *
800   * @param inst        the instance to test
801   * @param index       the attribute index
802   * @return            true if the instance is an extreme value
803   */
804  protected boolean isExtremeValue(Instance inst, int index) {
805    boolean     result;
806    double      value;
807
808    value  = inst.value(m_AttributeIndices[index]);
809    result =    (value > m_UpperExtremeValue[index]) 
810             || (value < m_LowerExtremeValue[index]);
811     
812    return result;
813  }
814 
815  /**
816   * returns whether the instance is an extreme value or not
817   *
818   * @param inst        the instance to test
819   * @return            true if the instance is an extreme value
820   */
821  protected boolean isExtremeValue(Instance inst) {
822    boolean     result;
823    int         i;
824
825    result = false;
826   
827    for (i = 0; i < m_AttributeIndices.length; i++) {
828      // non-numeric attribute?
829      if (m_AttributeIndices[i] == NON_NUMERIC)
830        continue;
831     
832      result = isExtremeValue(inst, i);
833     
834      if (result)
835        break;
836    }
837   
838    return result;
839  }
840 
841  /**
842   * returns the mulitplier of the IQR the instance is off the median for this
843   * particular attribute.
844   *
845   * @param inst        the instance to test
846   * @param index       the attribute index
847   * @return            the multiplier
848   */
849  protected double calculateMultiplier(Instance inst, int index) {
850    double      result;
851    double      value;
852
853    value  = inst.value(m_AttributeIndices[index]);
854    result = (value - m_Median[index]) / m_IQR[index];
855     
856    return result;
857  }
858 
859  /**
860   * Processes the given data (may change the provided dataset) and returns
861   * the modified version. This method is called in batchFinished().
862   * This implementation only calls process(Instance) for each instance
863   * in the given dataset.
864   *
865   * @param instances   the data to process
866   * @return            the modified data
867   * @throws Exception  in case the processing goes wrong
868   * @see               #batchFinished()
869   */
870  protected Instances process(Instances instances) throws Exception {
871    Instances   result;
872    Instance    instOld;
873    Instance    instNew;
874    int         i;
875    int         n;
876    double[]    values;
877    int         numAttNew;
878    int         numAttOld;
879   
880    if (!isFirstBatchDone())
881      computeThresholds(instances);
882   
883    result    = getOutputFormat();
884    numAttOld = instances.numAttributes();
885    numAttNew = result.numAttributes();
886   
887    for (n = 0; n < instances.numInstances(); n++) {
888      instOld = instances.instance(n);
889      values  = new double[numAttNew];
890      System.arraycopy(instOld.toDoubleArray(), 0, values, 0, numAttOld);
891     
892      // generate new instance
893      instNew = new DenseInstance(1.0, values);
894      instNew.setDataset(result);
895
896      // per attribute?
897      if (!getDetectionPerAttribute()) {
898        // outlier?
899        if (isOutlier(instOld))
900          instNew.setValue(m_OutlierAttributePosition[0], 1);
901        // extreme value?
902        if (isExtremeValue(instOld)) {
903          instNew.setValue(m_OutlierAttributePosition[0] + 1, 1);
904          // tag extreme values also as outliers?
905          if (getExtremeValuesAsOutliers())
906            instNew.setValue(m_OutlierAttributePosition[0], 1);
907        }
908      }
909      else {
910        for (i = 0; i < m_AttributeIndices.length; i++) {
911          // non-numeric attribute?
912          if (m_AttributeIndices[i] == NON_NUMERIC)
913            continue;
914         
915          // outlier?
916          if (isOutlier(instOld, m_AttributeIndices[i]))
917            instNew.setValue(m_OutlierAttributePosition[i], 1);
918          // extreme value?
919          if (isExtremeValue(instOld, m_AttributeIndices[i])) {
920            instNew.setValue(m_OutlierAttributePosition[i] + 1, 1);
921            // tag extreme values also as outliers?
922            if (getExtremeValuesAsOutliers())
923              instNew.setValue(m_OutlierAttributePosition[i], 1);
924          }
925          // add multiplier?
926          if (getOutputOffsetMultiplier())
927            instNew.setValue(
928                m_OutlierAttributePosition[i] + 2, 
929                calculateMultiplier(instOld, m_AttributeIndices[i]));
930        }
931      }
932     
933      // copy possible strings, relational values...
934      copyValues(instNew, false, instOld.dataset(), getOutputFormat());
935     
936      // add to output
937      result.add(instNew);
938    }
939   
940    return result;
941  }
942 
943  /**
944   * Returns the revision string.
945   *
946   * @return            the revision
947   */
948  public String getRevision() {
949    return RevisionUtils.extract("$Revision: 5987 $");
950  }
951
952  /**
953   * Main method for testing this class.
954   *
955   * @param args should contain arguments to the filter: use -h for help
956   */
957  public static void main(String[] args) {
958    runFilter(new InterquartileRange(), args);
959  }
960}
Note: See TracBrowser for help on using the repository browser.