Usage examples for the weka.core.Instance method isMissing(Attribute), collected from open-source projects.
public boolean isMissing(Attribute att);
From source file:org.pentaho.di.scoring.WekaScoringData.java
License:Open Source License
/** * Generates a prediction (more specifically, an output row containing all * input Kettle fields plus new fields that hold the prediction(s)) for an * incoming Kettle row given a Weka model. * * @param inputMeta the meta data for the incoming rows * @param outputMeta the meta data for the output rows * @param inputRow the values of the incoming row * @param meta meta data for this step * @return a Kettle row containing all incoming fields along with new ones * that hold the prediction(s)//from w ww . j a v a 2s .c om * @throws Exception if an error occurs */ public Object[] generatePrediction(RowMetaInterface inputMeta, RowMetaInterface outputMeta, Object[] inputRow, WekaScoringMeta meta) throws Exception { int[] mappingIndexes = m_mappingIndexes; WekaScoringModel model = getModel(); boolean outputProbs = meta.getOutputProbabilities(); boolean supervised = model.isSupervisedLearningModel(); Attribute classAtt = null; if (supervised) { classAtt = model.getHeader().classAttribute(); } // need to construct an Instance to represent this // input row Instance toScore = constructInstance(inputMeta, inputRow, mappingIndexes, model, false); double[] prediction = model.distributionForInstance(toScore); // Update the model?? if (meta.getUpdateIncrementalModel() && model.isUpdateableModel() && !toScore.isMissing(toScore.classIndex())) { model.update(toScore); } // First copy the input data to the new result... 
Object[] resultRow = RowDataUtil.resizeArray(inputRow, outputMeta.size()); int index = inputMeta.size(); // output for numeric class or discrete class value if (prediction.length == 1 || !outputProbs) { if (supervised) { if (classAtt.isNumeric()) { Double newVal = new Double(prediction[0]); resultRow[index++] = newVal; } else { int maxProb = Utils.maxIndex(prediction); if (prediction[maxProb] > 0) { String newVal = classAtt.value(maxProb); resultRow[index++] = newVal; } else { String newVal = BaseMessages.getString(WekaScoringMeta.PKG, "WekaScoringData.Message.UnableToPredict"); //$NON-NLS-1$ resultRow[index++] = newVal; } } } else { int maxProb = Utils.maxIndex(prediction); if (prediction[maxProb] > 0) { Double newVal = new Double(maxProb); resultRow[index++] = newVal; } else { String newVal = BaseMessages.getString(WekaScoringMeta.PKG, "WekaScoringData.Message.UnableToPredictCluster"); //$NON-NLS-1$ resultRow[index++] = newVal; } } } else { // output probability distribution for (int i = 0; i < prediction.length; i++) { Double newVal = new Double(prediction[i]); resultRow[index++] = newVal; } } return resultRow; }
From source file:org.scripps.branch.classifier.ManualTree.java
License:Open Source License
/**
 * Recursively backfits data into the tree: recomputes the branch proportions
 * and class distributions of this subtree from the supplied instances.
 *
 * @param data the data to work with
 * @param classProbs the class distribution for this node
 * @throws Exception if generation fails
 */
protected void backfitData(Instances data, double[] classProbs) throws Exception {
    // Make leaf if there are no training instances
    if (data.numInstances() == 0) {
        m_Attribute = -1;
        m_ClassDistribution = null;
        m_Prop = null;
        return;
    }

    // Check if node doesn't contain enough instances or is pure
    // or maximum depth reached
    m_ClassDistribution = classProbs.clone();

    /*
     * if (Utils.sum(m_ClassDistribution) < 2 * m_MinNum ||
     * Utils.eq(m_ClassDistribution[Utils.maxIndex(m_ClassDistribution)], Utils
     * .sum(m_ClassDistribution))) {
     *
     * // Make leaf m_Attribute = -1; m_Prop = null; return; }
     */

    // Are we at an inner node
    if (m_Attribute > -1) {
        // Compute new weights for subsets based on backfit data
        m_Prop = new double[m_Successors.length];
        for (int i = 0; i < data.numInstances(); i++) {
            Instance inst = data.instance(i);
            if (!inst.isMissing(m_Attribute)) {
                if (data.attribute(m_Attribute).isNominal()) {
                    // Nominal split: weight goes to the branch for this value
                    m_Prop[(int) inst.value(m_Attribute)] += inst.weight();
                } else {
                    // Numeric split: branch 0 below the split point, 1 otherwise
                    m_Prop[(inst.value(m_Attribute) < m_SplitPoint) ? 0 : 1] += inst.weight();
                }
            }
        }

        // If we only have missing values we can make this node into a leaf
        if (Utils.sum(m_Prop) <= 0) {
            m_Attribute = -1;
            m_Prop = null;
            return;
        }

        // Otherwise normalize the proportions
        Utils.normalize(m_Prop);

        // Split data
        Instances[] subsets = splitData(data);

        // Go through subsets
        for (int i = 0; i < subsets.length; i++) {
            // Compute distribution for current subset
            double[] dist = new double[data.numClasses()];
            for (int j = 0; j < subsets[i].numInstances(); j++) {
                dist[(int) subsets[i].instance(j).classValue()] += subsets[i].instance(j).weight();
            }

            // Backfit subset
            m_Successors[i].backfitData(subsets[i], dist);
        }

        // If unclassified instances are allowed, we don't need to store the
        // class distribution
        if (getAllowUnclassifiedInstances()) {
            m_ClassDistribution = null;
            return;
        }

        // Otherwise, if all successors are non-empty, we don't need to
        // store the class distribution; an empty successor means we must
        // keep it (the early return below keeps m_ClassDistribution intact)
        boolean emptySuccessor = false;
        for (int i = 0; i < subsets.length; i++) {
            if (m_Successors[i].m_ClassDistribution == null) {
                emptySuccessor = true;
                return;
            }
        }
        m_ClassDistribution = null;

        // If we have a least two non-empty successors, we should keep this
        // tree
        /*
         * int nonEmptySuccessors = 0; for (int i = 0; i < subsets.length;
         * i++) { if (m_Successors[i].m_ClassDistribution != null) {
         * nonEmptySuccessors++; if (nonEmptySuccessors > 1) { return; } } }
         *
         * // Otherwise, this node is a leaf or should become a leaf
         * m_Successors = null; m_Attribute = -1; m_Prop = null; return;
         */
    }
}
From source file:org.scripps.branch.classifier.ManualTree.java
License:Open Source License
/**
 * Computes class distribution for an attribute, a custom classifier, or a
 * custom (polygon-constrained) set, depending on the value of {@code att}:
 * indices below data.numAttributes() are real attributes, the next
 * custom_classifiers.size() indices select a custom classifier, and higher
 * indices select a custom set.
 *
 * @param props output: per-branch weight proportions, written to props[att]
 * @param dists output: per-branch class distributions, written to dists[att]
 * @param att the attribute index (may address a classifier or custom set, see above)
 * @param data the data to work with
 * @param givenSplitPoint split point to use for numeric attributes; NaN means
 *        search all candidate split points for the best one
 * @param custom_classifiers map of classifier id to trained classifier
 * @return map with keys "split_point" (split used) and "orig_split_point"
 *         (best split found during the scan)
 * @throws Exception if something goes wrong
 */
protected HashMap<String, Double> distribution(double[][] props, double[][][] dists, int att,
        Instances data, double givenSplitPoint, HashMap<String, Classifier> custom_classifiers)
        throws Exception {
    HashMap<String, Double> mp = new HashMap<String, Double>();
    double splitPoint = givenSplitPoint;
    double origSplitPoint = 0;
    Attribute attribute = null;
    double[][] dist = null;
    int indexOfFirstMissingValue = -1;
    String CustomClassifierId = null;
    CustomSet cSet = null;

    // Decode what kind of "attribute" att addresses
    if (att >= data.numAttributes() && att < data.numAttributes() + custom_classifiers.size()) {
        CustomClassifierId = getKeyinMap(custom_classifiers, att, data);
    } else if (att >= data.numAttributes() + custom_classifiers.size()) {
        cSet = getReqCustomSet(att - (data.numAttributes() - 1 + custom_classifiers.size()), cSetList);
    } else {
        attribute = data.attribute(att);
    }

    if (CustomClassifierId == null && cSet == null) {
        if (attribute.isNominal()) {
            // For nominal attributes
            dist = new double[attribute.numValues()][data.numClasses()];
            for (int i = 0; i < data.numInstances(); i++) {
                Instance inst = data.instance(i);
                if (inst.isMissing(att)) {
                    // Skip missing values at this stage
                    if (indexOfFirstMissingValue < 0) {
                        indexOfFirstMissingValue = i;
                    }
                    continue;
                }
                dist[(int) inst.value(att)][(int) inst.classValue()] += inst.weight();
            }
        } else {
            // For numeric attributes
            double[][] currDist = new double[2][data.numClasses()];
            dist = new double[2][data.numClasses()];

            // Sort data
            data.sort(att);

            // Move all instances into second subset
            for (int j = 0; j < data.numInstances(); j++) {
                Instance inst = data.instance(j);
                if (inst.isMissing(att)) {
                    // Can stop as soon as we hit a missing value
                    // (data is sorted, so missing values come last)
                    indexOfFirstMissingValue = j;
                    break;
                }
                currDist[1][(int) inst.classValue()] += inst.weight();
            }

            // Value before splitting
            double priorVal = priorVal(currDist);

            // Save initial distribution
            for (int j = 0; j < currDist.length; j++) {
                System.arraycopy(currDist[j], 0, dist[j], 0, dist[j].length);
            }

            if (Double.isNaN(splitPoint)) {
                // Try all possible split points
                double currSplit = data.instance(0).value(att);
                double currVal, bestVal = -Double.MAX_VALUE;
                for (int i = 0; i < data.numInstances(); i++) {
                    Instance inst = data.instance(i);
                    if (inst.isMissing(att)) {
                        // Can stop as soon as we hit a missing value
                        break;
                    }

                    // Can we place a sensible split point here?
                    if (inst.value(att) > currSplit) {
                        // Compute gain for split point
                        currVal = gain(currDist, priorVal);

                        // Is the current split point the best point so far?
                        if (currVal > bestVal) {
                            // Store value of current point
                            bestVal = currVal;

                            // Save split point (midpoint between adjacent values)
                            splitPoint = (inst.value(att) + currSplit) / 2.0;
                            origSplitPoint = splitPoint;

                            // Save distribution
                            for (int j = 0; j < currDist.length; j++) {
                                System.arraycopy(currDist[j], 0, dist[j], 0, dist[j].length);
                            }
                        }
                    }
                    currSplit = inst.value(att);

                    // Shift over the weight
                    currDist[0][(int) inst.classValue()] += inst.weight();
                    currDist[1][(int) inst.classValue()] -= inst.weight();
                }
            } else {
                double currSplit = data.instance(0).value(att);
                double currVal, bestVal = -Double.MAX_VALUE;

                // Split data set using given split point; still track the best
                // split found, reported as "orig_split_point"
                for (int i = 0; i < data.numInstances(); i++) {
                    Instance inst = data.instance(i);
                    if (inst.isMissing(att)) {
                        // Can stop as soon as we hit a missing value
                        break;
                    }
                    if (inst.value(att) > currSplit) {
                        // Compute gain for split point
                        currVal = gain(currDist, priorVal);

                        // Is the current split point the best point so far?
                        if (currVal > bestVal) {
                            // Store value of current point
                            bestVal = currVal;

                            // Save computed split point
                            origSplitPoint = (inst.value(att) + currSplit) / 2.0;
                        }
                    }
                    currSplit = inst.value(att);

                    // Shift over the weight
                    currDist[0][(int) inst.classValue()] += inst.weight();
                    currDist[1][(int) inst.classValue()] -= inst.weight();

                    if (inst.value(att) <= splitPoint) {
                        // Save distribution since split point is specified
                        for (int j = 0; j < currDist.length; j++) {
                            System.arraycopy(currDist[j], 0, dist[j], 0, dist[j].length);
                        }
                    }
                }
            }
        }
    } else if (CustomClassifierId != null) {
        // Branch on the predictions of a custom classifier:
        // dist[predicted class][actual class] accumulates instance weight
        Classifier fc = custom_classifiers.get(CustomClassifierId);
        dist = new double[data.numClasses()][data.numClasses()];
        Instance inst;
        for (int i = 0; i < data.numInstances(); i++) {
            inst = data.instance(i);
            double predictedClass = fc.classifyInstance(inst);
            if (predictedClass != Instance.missingValue()) {
                dist[(int) predictedClass][(int) inst.classValue()] += inst.weight();
            }
        }
    } else if (cSet != null) {
        // Branch on point-in-polygon membership for a custom set defined by
        // two attributes and a JSON list of polygon vertices
        dist = new double[data.numClasses()][data.numClasses()];
        JsonNode vertices = mapper.readTree(cSet.getConstraints());
        ArrayList<double[]> attrVertices = generateVerticesList(vertices);
        List<Attribute> aList = generateAttributeList(cSet, data, d);
        double[] testPoint = new double[2];
        int ctr = 0;
        for (int k = 0; k < data.numInstances(); k++) {
            testPoint = new double[2];
            ctr = 0;
            for (Attribute a : aList) {
                if (!data.instance(k).isMissing(a)) {
                    testPoint[ctr] = data.instance(k).value(a);
                    ctr++;
                }
            }
            int check = checkPointInPolygon(attrVertices, testPoint);
            dist[check][(int) data.instance(k).classValue()] += data.instance(k).weight();
        }
    }

    // Compute weights for subsets
    props[att] = new double[dist.length];
    for (int k = 0; k < props[att].length; k++) {
        props[att][k] = Utils.sum(dist[k]);
    }
    if (Utils.eq(Utils.sum(props[att]), 0)) {
        // No weight anywhere: fall back to uniform proportions
        for (int k = 0; k < props[att].length; k++) {
            props[att][k] = 1.0 / props[att].length;
        }
    } else {
        Utils.normalize(props[att]);
    }

    // Any instances with missing values?
    if (indexOfFirstMissingValue > -1) {
        // Distribute weights for instances with missing values
        for (int i = indexOfFirstMissingValue; i < data.numInstances(); i++) {
            Instance inst = data.instance(i);
            if (attribute.isNominal()) {
                // Need to check if attribute value is missing
                if (inst.isMissing(att)) {
                    for (int j = 0; j < dist.length; j++) {
                        dist[j][(int) inst.classValue()] += props[att][j] * inst.weight();
                    }
                }
            } else {
                // Can be sure that value is missing, so no test required
                // (numeric scan above stopped at the first missing value of
                // the sorted data)
                for (int j = 0; j < dist.length; j++) {
                    dist[j][(int) inst.classValue()] += props[att][j] * inst.weight();
                }
            }
        }
    }

    // Return distribution and split point
    dists[att] = dist;
    mp.put("split_point", splitPoint);
    mp.put("orig_split_point", origSplitPoint);
    return mp;
}
From source file:org.scripps.branch.classifier.ManualTree.java
License:Open Source License
/**
 * Computes class distribution of an instance using the decision tree.
 * Handles three node kinds: ordinary attribute splits, custom-classifier
 * splits, and custom-set (polygon) splits, distinguished by the range of
 * m_Attribute relative to m_Info.numAttributes().
 *
 * @param instance the instance to compute the distribution for
 * @return the computed class distribution, or null if the node is empty and
 *         unclassified instances are not allowed
 * @throws Exception if computation fails
 */
@Override
public double[] distributionForInstance(Instance instance) throws Exception {
    // default model?
    if (m_ZeroR != null) {
        return m_ZeroR.distributionForInstance(instance);
    }

    double[] returnedDist = null;

    // Set Parent Node to set m_pred in case custom set occurs.
    if (m_Successors != null) {
        for (int i = 0; i < m_Successors.length; i++) {
            m_Successors[i].setParentNode(this.parentNode);
        }
    }

    if (m_Info != null) {
        if (m_Attribute > -1 && m_Attribute < m_Info.numAttributes()) {
            // Node is not a leaf
            if (instance.isMissing(m_Attribute)) {
                LOGGER.debug("Missing attribute");

                // Value is missing: split the instance across all branches,
                // weighting each branch's distribution by m_Prop
                returnedDist = new double[m_Info.numClasses()];

                // Split instance up
                for (int i = 0; i < m_Successors.length; i++) {
                    double[] help = m_Successors[i].distributionForInstance(instance);
                    if (help != null) {
                        for (int j = 0; j < help.length; j++) {
                            returnedDist[j] += m_Prop[i] * help[j];
                        }
                    }
                }
                LOGGER.debug("Missing Instance");
            } else if (m_Info.attribute(m_Attribute).isNominal()) {
                // For nominal attributes
                returnedDist = m_Successors[(int) instance.value(m_Attribute)]
                        .distributionForInstance(instance);
            } else {
                // For numeric attributes
                if (instance.value(m_Attribute) < m_SplitPoint) {
                    returnedDist = m_Successors[0].distributionForInstance(instance);
                } else {
                    returnedDist = m_Successors[1].distributionForInstance(instance);
                }
            }
        } else if (m_Attribute >= m_Info.numAttributes() - 1) {
            if (m_Attribute >= (listOfFc.size() + m_Info.numAttributes()) - 1) {
                // Custom-set node: route by point-in-polygon membership
                CustomSet cSet = getReqCustomSet(m_Attribute - (listOfFc.size() - 1 + m_Info.numAttributes()),
                        cSetList);
                JsonNode vertices = mapper.readTree(cSet.getConstraints());
                ArrayList<double[]> attrVertices = generateVerticesList(vertices);
                List<Attribute> aList = generateAttributeList(cSet, m_Info, d);
                double[] testPoint = new double[2];
                testPoint[0] = instance.value(aList.get(0));
                testPoint[1] = instance.value(aList.get(1));
                int check = checkPointInPolygon(attrVertices, testPoint);
                if (m_Successors[check].getM_Attribute() == -1) {
                    // Successor is a leaf: record the predicted side on the parent
                    parentNode.setM_pred(m_ClassAssignment.get((check == 0) ? "Outside" : "Inside"));
                }
                returnedDist = m_Successors[check].distributionForInstance(instance);
            } else {
                // Custom-classifier node: route by the classifier's prediction
                String classifierId = "";
                classifierId = getKeyinMap(listOfFc, m_Attribute, m_Info);
                Classifier fc = listOfFc.get(classifierId);
                double predictedClass = fc.classifyInstance(instance);
                if (predictedClass != Instance.missingValue()) {
                    returnedDist = m_Successors[(int) predictedClass].distributionForInstance(instance);
                }
            }
        }
    }

    // Node is a leaf or successor is empty?
    if ((m_Attribute == -1) || (returnedDist == null)) {
        // Is node empty?
        if (m_ClassDistribution == null) {
            if (getAllowUnclassifiedInstances()) {
                return new double[m_Info.numClasses()];
            } else {
                return null;
            }
        }

        // Else return normalized distribution
        double[] normalizedDistribution = m_ClassDistribution.clone();
        if (this.parentNode != null) {
            this.parentNode.setJsonnode(this.getJsonnode());
        }
        try {
            Utils.normalize(normalizedDistribution);
        } catch (Exception e) {
            // All-zero distribution cannot be normalized; return it as-is
            LOGGER.error("Sum is 0. Coudln't Normalize");
        }
        return normalizedDistribution;
    } else {
        return returnedDist;
    }
}
From source file:org.scripps.branch.classifier.ManualTree.java
License:Open Source License
/**
 * Computes class distribution of an instance using the decision tree.
 * NOTE(review): near-duplicate of {@link #distributionForInstance(Instance)}
 * minus the parent-node bookkeeping and logging; consider consolidating.
 *
 * @param instance the instance to compute the distribution for
 * @return the computed class distribution, or null if the node is empty and
 *         unclassified instances are not allowed
 * @throws Exception if computation fails
 */
public double[] predForInstance(Instance instance) throws Exception {
    // default model?
    if (m_ZeroR != null) {
        return m_ZeroR.distributionForInstance(instance);
    }

    double[] returnedDist = null;
    if (m_Attribute > -1 && m_Attribute < m_Info.numAttributes()) {
        // Node is not a leaf
        if (instance.isMissing(m_Attribute)) {
            // Value is missing: split the instance across all branches,
            // weighting each branch's distribution by m_Prop
            returnedDist = new double[m_Info.numClasses()];

            // Split instance up
            for (int i = 0; i < m_Successors.length; i++) {
                double[] help = m_Successors[i].distributionForInstance(instance);
                if (help != null) {
                    for (int j = 0; j < help.length; j++) {
                        returnedDist[j] += m_Prop[i] * help[j];
                    }
                }
            }
        } else if (m_Info.attribute(m_Attribute).isNominal()) {
            // For nominal attributes
            returnedDist = m_Successors[(int) instance.value(m_Attribute)].distributionForInstance(instance);
        } else {
            // For numeric attributes
            if (instance.value(m_Attribute) < m_SplitPoint) {
                returnedDist = m_Successors[0].distributionForInstance(instance);
            } else {
                returnedDist = m_Successors[1].distributionForInstance(instance);
            }
        }
    } else if (m_Attribute >= m_Info.numAttributes() - 1) {
        if (m_Attribute >= (listOfFc.size() + m_Info.numAttributes()) - 1) {
            // Custom-set node: route by point-in-polygon membership
            CustomSet cSet = getReqCustomSet(m_Attribute - (listOfFc.size() - 1 + m_Info.numAttributes()),
                    cSetList);
            JsonNode vertices = mapper.readTree(cSet.getConstraints());
            ArrayList<double[]> attrVertices = generateVerticesList(vertices);
            List<Attribute> aList = generateAttributeList(cSet, m_Info, d);
            double[] testPoint = new double[2];
            testPoint[0] = instance.value(aList.get(0));
            testPoint[1] = instance.value(aList.get(1));
            int check = checkPointInPolygon(attrVertices, testPoint);
            returnedDist = m_Successors[check].distributionForInstance(instance);
        } else {
            // Custom-classifier node: route by the classifier's prediction
            String classifierId = "";
            classifierId = getKeyinMap(listOfFc, m_Attribute, m_Info);
            Classifier fc = listOfFc.get(classifierId);
            double predictedClass = fc.classifyInstance(instance);
            if (predictedClass != Instance.missingValue()) {
                returnedDist = m_Successors[(int) predictedClass].distributionForInstance(instance);
            }
        }
    }

    // Node is a leaf or successor is empty?
    if ((m_Attribute == -1) || (returnedDist == null)) {
        // Is node empty?
        if (m_ClassDistribution == null) {
            if (getAllowUnclassifiedInstances()) {
                return new double[m_Info.numClasses()];
            } else {
                return null;
            }
        }

        // Else return normalized distribution
        double[] normalizedDistribution = m_ClassDistribution.clone();
        Utils.normalize(normalizedDistribution);
        return normalizedDistribution;
    } else {
        return returnedDist;
    }
}
From source file:org.scripps.branch.classifier.ManualTree.java
License:Open Source License
/**
 * Splits instances into subsets based on the given split. Supports ordinary
 * attribute splits (nominal and numeric), custom-classifier splits, and
 * custom-set (polygon) splits, distinguished by the range of m_Attribute.
 * Instances with a missing split value are distributed across all subsets
 * with weight proportional to m_Prop.
 *
 * @param data the data to work with
 * @return the subsets of instances, one per branch
 * @throws Exception if something goes wrong
 */
protected Instances[] splitData(Instances data) throws Exception {
    // Allocate array of Instances objects
    Instances[] subsets = new Instances[m_Prop.length];
    for (int i = 0; i < m_Prop.length; i++) {
        subsets[i] = new Instances(data, data.numInstances());
    }

    if (m_Attribute >= data.numAttributes()) {
        if (m_Attribute >= listOfFc.size() + data.numAttributes() - 1) {
            // Custom-set split: route each instance by point-in-polygon test
            CustomSet cSet = getReqCustomSet(m_Attribute - (data.numAttributes() - 1 + listOfFc.size()),
                    cSetList);
            JsonNode vertices = mapper.readTree(cSet.getConstraints());
            ArrayList<double[]> attrVertices = generateVerticesList(vertices);
            List<Attribute> aList = generateAttributeList(cSet, data, d);
            double[] testPoint = new double[2];
            int ctr = 0;
            for (int k = 0; k < data.numInstances(); k++) {
                ctr = 0;
                for (Attribute a : aList) {
                    testPoint[ctr] = data.instance(k).value(a);
                    ctr++;
                }
                int check = checkPointInPolygon(attrVertices, testPoint);
                subsets[check].add(data.instance(k));
                continue;
            }
        } else {
            // Custom-classifier split: route each instance by its prediction
            Classifier fc;
            double predictedClass;
            // Go through the data
            for (int i = 0; i < data.numInstances(); i++) {
                // Get instance
                Instance inst = data.instance(i);
                String classifierId = getKeyinMap(listOfFc, m_Attribute, data);
                fc = listOfFc.get(classifierId);
                predictedClass = fc.classifyInstance(inst);
                if (predictedClass != Instance.missingValue()) {
                    subsets[(int) predictedClass].add(inst);
                    continue;
                }

                // Else throw an exception
                throw new IllegalArgumentException("Unknown attribute type");
            }
        }
    } else {
        // Go through the data
        for (int i = 0; i < data.numInstances(); i++) {
            // Get instance
            Instance inst = data.instance(i);

            // Does the instance have a missing value?
            if (inst.isMissing(m_Attribute)) {
                // Split instance up: a weighted copy goes to every branch
                // that has positive proportion
                for (int k = 0; k < m_Prop.length; k++) {
                    if (m_Prop[k] > 0) {
                        Instance copy = (Instance) inst.copy();
                        copy.setWeight(m_Prop[k] * inst.weight());
                        subsets[k].add(copy);
                    }
                }

                // Proceed to next instance
                continue;
            }

            // Do we have a nominal attribute?
            if (data.attribute(m_Attribute).isNominal()) {
                subsets[(int) inst.value(m_Attribute)].add(inst);

                // Proceed to next instance
                continue;
            }

            // Do we have a numeric attribute?
            if (data.attribute(m_Attribute).isNumeric()) {
                subsets[(inst.value(m_Attribute) < m_SplitPoint) ? 0 : 1].add(inst);

                // Proceed to next instance
                continue;
            }

            // Else throw an exception
            throw new IllegalArgumentException("Unknown attribute type");
        }
    }

    // Save memory
    for (int i = 0; i < m_Prop.length; i++) {
        subsets[i].compactify();
    }

    // Return the subsets
    return subsets;
}
From source file:org.stream_gpu.float_knn.float_search.NormalizableDistance.java
License:Open Source License
/** * Used to initialize the ranges. For this the values of the first * instance is used to save time./*from w w w.ja v a 2s .c o m*/ * Sets low and high to the values of the first instance and * width to zero. * * @param instance the new instance * @param numAtt number of attributes in the model * @param ranges low, high and width values for all attributes */ public void updateRangesFirst(Instance instance, int numAtt, float[][] ranges) { for (int j = 0; j < numAtt; j++) { if (!instance.isMissing(j)) { ranges[j][R_MIN] = (float) instance.value(j); ranges[j][R_MAX] = (float) instance.value(j); ranges[j][R_WIDTH] = 0.0F; } else { // if value was missing ranges[j][R_MIN] = Float.POSITIVE_INFINITY; ranges[j][R_MAX] = -Float.POSITIVE_INFINITY; ranges[j][R_WIDTH] = Float.POSITIVE_INFINITY; } } }
From source file:org.stream_gpu.float_knn.float_search.NormalizableDistance.java
License:Open Source License
/** * Updates the minimum and maximum and width values for all the attributes * based on a new instance./* www . j av a2 s . c o m*/ * * @param instance the new instance * @param numAtt number of attributes in the model * @param ranges low, high and width values for all attributes */ public void updateRanges(Instance instance, int numAtt, float[][] ranges) { // updateRangesFirst must have been called on ranges for (int j = 0; j < numAtt; j++) { float value = (float) instance.value(j); if (!instance.isMissing(j)) { if (value < ranges[j][R_MIN]) { ranges[j][R_MIN] = value; ranges[j][R_WIDTH] = ranges[j][R_MAX] - ranges[j][R_MIN]; if (value > ranges[j][R_MAX]) { //if this is the first value that is ranges[j][R_MAX] = value; //not missing. The,0 ranges[j][R_WIDTH] = ranges[j][R_MAX] - ranges[j][R_MIN]; } } else { if (value > ranges[j][R_MAX]) { ranges[j][R_MAX] = value; ranges[j][R_WIDTH] = ranges[j][R_MAX] - ranges[j][R_MIN]; } } } } }
From source file:org.stream_gpu.float_knn.float_search.NormalizableDistance.java
License:Open Source License
/** * Updates the ranges given a new instance. * // w ww. j av a 2s . co m * @param instance the new instance * @param ranges low, high and width values for all attributes * @return the updated ranges */ public float[][] updateRanges(Instance instance, float[][] ranges) { // updateRangesFirst must have been called on ranges for (int j = 0; j < ranges.length; j++) { float value = (float) instance.value(j); if (!instance.isMissing(j)) { if (value < ranges[j][R_MIN]) { ranges[j][R_MIN] = value; ranges[j][R_WIDTH] = ranges[j][R_MAX] - ranges[j][R_MIN]; } else { if (instance.value(j) > ranges[j][R_MAX]) { ranges[j][R_MAX] = value; ranges[j][R_WIDTH] = ranges[j][R_MAX] - ranges[j][R_MIN]; } } } } return ranges; }
From source file:org.stream_gpu.float_knn.float_search.NormalizableDistance.java
License:Open Source License
/** * Test if an instance is within the given ranges. * // w w w .j a va 2s . com * @param instance the instance * @param ranges the ranges the instance is tested to be in * @return true if instance is within the ranges */ public boolean inRanges(Instance instance, float[][] ranges) { boolean isIn = true; // updateRangesFirst must have been called on ranges for (int j = 0; isIn && (j < ranges.length); j++) { if (!instance.isMissing(j)) { float value = (float) instance.value(j); isIn = value <= ranges[j][R_MAX]; if (isIn) isIn = value >= ranges[j][R_MIN]; } } return isIn; }