List of usage examples for weka.core Instance value
public double value(Attribute att);
From source file:org.opentox.jaqpot3.qsar.util.WekaInstancesProcess.java
License:Open Source License
public static Map<String, Double> getInstanceAttributeValues(Instance inst, int numAttributes) { //numAttributes need to be set before adding the new attributes Map<String, Double> featureMap = new HashMap(); if (numAttributes > 0) { double res; for (int i = 0; i < numAttributes; ++i) { res = (!Double.isNaN(inst.value(i))) ? inst.value(i) : 0; res = (!Double.isInfinite(res)) ? res : 0; featureMap.put(inst.attribute(i).name(), res); }//ww w. j a v a 2s . c om } return featureMap; }
From source file:org.opentox.jaqpot3.qsar.util.WekaInstancesProcess.java
License:Open Source License
public static PMMLEvaluationContext getInstanceAttributeFieldRefValues(Instance inst, int numAttributes, PMMLEvaluationContext context, List<DataField> dataFields) { //numAttributes need to be set before adding the new attributes for (DataField dataField : dataFields) { for (int i = 0; i < numAttributes; ++i) { if (StringUtils.equals(inst.attribute(i).name(), dataField.getName().toString())) { context.declare(dataField.getName(), inst.value(i)); break; }//from ww w .j a va2 s.c om } } return context; }
From source file:org.opentox.toxotis.factory.DatasetFactory.java
License:Open Source License
/**
 * Create a {@link DataEntry data entry} from a single instance.
 *
 * <p>An attribute named {@code Dataset.COMPOUND_URI} or {@code "URI"} supplies the conformer of
 * the entry; every other attribute becomes a feature value. Numeric attributes yield a double
 * literal, string/date attributes a string literal, and nominal attributes a string literal
 * whose feature also lists all admissible values. For any other attribute type the feature
 * value is added with a {@code null} literal.
 *
 * @param instance
 *      The instance to convert.
 * @return
 *      A Data Entry that corresponds to the provided instance.
 * @throws ToxOtisException
 *      If a URI built from an attribute name or value is syntactically invalid.
 */
public DataEntry createDataEntry(Instance instance) throws ToxOtisException {
    Enumeration attrs = instance.enumerateAttributes();
    DataEntry entry = new DataEntry();
    try {
        while (attrs.hasMoreElements()) {
            Attribute attr = (Attribute) attrs.nextElement();
            String attrName = attr.name();

            // The compound URI column identifies the conformer rather than a feature.
            if (attrName.equals(Dataset.COMPOUND_URI) || attrName.equals("URI")) {
                entry.setConformer(new Compound(new VRI(instance.stringValue(attr))));
                continue;
            }

            FeatureValue featureValue = new FeatureValue();
            Feature feature = new Feature(new VRI(attrName));
            LiteralValue literal = null;
            if (attr.isNumeric()) {
                literal = new LiteralValue<Double>(instance.value(attr), XSDDatatype.XSDdouble);
                feature.getOntologicalClasses().add(OTClasses.numericFeature());
            } else if (attr.isString() || attr.isDate()) {
                literal = new LiteralValue<String>(instance.stringValue(attr), XSDDatatype.XSDstring);
                feature.getOntologicalClasses().add(OTClasses.stringFeature());
            } else if (attr.isNominal()) {
                literal = new LiteralValue<String>(instance.stringValue(attr), XSDDatatype.XSDstring);
                Enumeration admissible = attr.enumerateValues();
                feature.getOntologicalClasses().add(OTClasses.nominalFeature());
                // Record every label the nominal attribute may take.
                while (admissible.hasMoreElements()) {
                    String label = (String) admissible.nextElement();
                    feature.getAdmissibleValues().add(new LiteralValue<String>(label, XSDDatatype.XSDstring));
                }
            }
            featureValue.setFeature(feature);
            featureValue.setValue(literal);
            entry.addFeatureValue(featureValue);
        }
    } catch (URISyntaxException ex) {
        throw new ToxOtisException(ex);
    }
    return entry;
}
From source file:org.packDataMining.SMOTE.java
License:Open Source License
/** * The procedure implementing the SMOTE algorithm. The output * instances are pushed onto the output queue for collection. * //from www. j a v a2 s .c o m * @throws Exception if provided options cannot be executed * on input instances */ protected void doSMOTE() throws Exception { int minIndex = 0; int min = Integer.MAX_VALUE; if (m_DetectMinorityClass) { // find minority class int[] classCounts = getInputFormat().attributeStats(getInputFormat().classIndex()).nominalCounts; for (int i = 0; i < classCounts.length; i++) { if (classCounts[i] != 0 && classCounts[i] < min) { min = classCounts[i]; minIndex = i; } } } else { String classVal = getClassValue(); if (classVal.equalsIgnoreCase("first")) { minIndex = 1; } else if (classVal.equalsIgnoreCase("last")) { minIndex = getInputFormat().numClasses(); } else { minIndex = Integer.parseInt(classVal); } if (minIndex > getInputFormat().numClasses()) { throw new Exception("value index must be <= the number of classes"); } minIndex--; // make it an index } int nearestNeighbors; if (min <= getNearestNeighbors()) { nearestNeighbors = min - 1; } else { nearestNeighbors = getNearestNeighbors(); } if (nearestNeighbors < 1) throw new Exception("Cannot use 0 neighbors!"); // compose minority class dataset // also push all dataset instances Instances sample = getInputFormat().stringFreeStructure(); Enumeration instanceEnum = getInputFormat().enumerateInstances(); while (instanceEnum.hasMoreElements()) { Instance instance = (Instance) instanceEnum.nextElement(); push((Instance) instance.copy()); if ((int) instance.classValue() == minIndex) { sample.add(instance); } } // compute Value Distance Metric matrices for nominal features Map vdmMap = new HashMap(); Enumeration attrEnum = getInputFormat().enumerateAttributes(); while (attrEnum.hasMoreElements()) { Attribute attr = (Attribute) attrEnum.nextElement(); if (!attr.equals(getInputFormat().classAttribute())) { if (attr.isNominal() || attr.isString()) { double[][] vdm = new 
double[attr.numValues()][attr.numValues()]; vdmMap.put(attr, vdm); int[] featureValueCounts = new int[attr.numValues()]; int[][] featureValueCountsByClass = new int[getInputFormat().classAttribute().numValues()][attr .numValues()]; instanceEnum = getInputFormat().enumerateInstances(); while (instanceEnum.hasMoreElements()) { Instance instance = (Instance) instanceEnum.nextElement(); int value = (int) instance.value(attr); int classValue = (int) instance.classValue(); featureValueCounts[value]++; featureValueCountsByClass[classValue][value]++; } for (int valueIndex1 = 0; valueIndex1 < attr.numValues(); valueIndex1++) { for (int valueIndex2 = 0; valueIndex2 < attr.numValues(); valueIndex2++) { double sum = 0; for (int classValueIndex = 0; classValueIndex < getInputFormat() .numClasses(); classValueIndex++) { double c1i = (double) featureValueCountsByClass[classValueIndex][valueIndex1]; double c2i = (double) featureValueCountsByClass[classValueIndex][valueIndex2]; double c1 = (double) featureValueCounts[valueIndex1]; double c2 = (double) featureValueCounts[valueIndex2]; double term1 = c1i / c1; double term2 = c2i / c2; sum += Math.abs(term1 - term2); } vdm[valueIndex1][valueIndex2] = sum; } } } } } // use this random source for all required randomness Random rand = new Random(getRandomSeed()); // find the set of extra indices to use if the percentage is not evenly divisible by 100 List extraIndices = new LinkedList(); double percentageRemainder = (getPercentage() / 100) - Math.floor(getPercentage() / 100.0); int extraIndicesCount = (int) (percentageRemainder * sample.numInstances()); if (extraIndicesCount >= 1) { for (int i = 0; i < sample.numInstances(); i++) { extraIndices.add(i); } } Collections.shuffle(extraIndices, rand); extraIndices = extraIndices.subList(0, extraIndicesCount); Set extraIndexSet = new HashSet(extraIndices); // the main loop to handle computing nearest neighbors and generating SMOTE // examples from each instance in the original minority class 
data Instance[] nnArray = new Instance[nearestNeighbors]; for (int i = 0; i < sample.numInstances(); i++) { Instance instanceI = sample.instance(i); // find k nearest neighbors for each instance List distanceToInstance = new LinkedList(); for (int j = 0; j < sample.numInstances(); j++) { Instance instanceJ = sample.instance(j); if (i != j) { double distance = 0; attrEnum = getInputFormat().enumerateAttributes(); while (attrEnum.hasMoreElements()) { Attribute attr = (Attribute) attrEnum.nextElement(); if (!attr.equals(getInputFormat().classAttribute())) { double iVal = instanceI.value(attr); double jVal = instanceJ.value(attr); if (attr.isNumeric()) { distance += Math.pow(iVal - jVal, 2); } else { distance += ((double[][]) vdmMap.get(attr))[(int) iVal][(int) jVal]; } } } distance = Math.pow(distance, .5); distanceToInstance.add(new Object[] { distance, instanceJ }); } } // sort the neighbors according to distance Collections.sort(distanceToInstance, new Comparator() { public int compare(Object o1, Object o2) { double distance1 = (Double) ((Object[]) o1)[0]; double distance2 = (Double) ((Object[]) o2)[0]; return (int) Math.ceil(distance1 - distance2); } }); // populate the actual nearest neighbor instance array Iterator entryIterator = distanceToInstance.iterator(); int j = 0; while (entryIterator.hasNext() && j < nearestNeighbors) { nnArray[j] = (Instance) ((Object[]) entryIterator.next())[1]; j++; } // create synthetic examples int n = (int) Math.floor(getPercentage() / 100); while (n > 0 || extraIndexSet.remove(i)) { double[] values = new double[sample.numAttributes()]; int nn = rand.nextInt(nearestNeighbors); attrEnum = getInputFormat().enumerateAttributes(); while (attrEnum.hasMoreElements()) { Attribute attr = (Attribute) attrEnum.nextElement(); if (!attr.equals(getInputFormat().classAttribute())) { if (attr.isNumeric()) { double dif = nnArray[nn].value(attr) - instanceI.value(attr); double gap = rand.nextDouble(); values[attr.index()] = (double) 
(instanceI.value(attr) + gap * dif); } else if (attr.isDate()) { double dif = nnArray[nn].value(attr) - instanceI.value(attr); double gap = rand.nextDouble(); values[attr.index()] = (long) (instanceI.value(attr) + gap * dif); } else { int[] valueCounts = new int[attr.numValues()]; int iVal = (int) instanceI.value(attr); valueCounts[iVal]++; for (int nnEx = 0; nnEx < nearestNeighbors; nnEx++) { int val = (int) nnArray[nnEx].value(attr); valueCounts[val]++; } int maxIndex = 0; int max = Integer.MIN_VALUE; for (int index = 0; index < attr.numValues(); index++) { if (valueCounts[index] > max) { max = valueCounts[index]; maxIndex = index; } } values[attr.index()] = maxIndex; } } } values[sample.classIndex()] = minIndex; Instance synthetic = new Instance(1.0, values); push(synthetic); n--; } } }
From source file:org.scripps.branch.classifier.ManualTree.java
License:Open Source License
/** * Trying to get generate distribution of classes * //from www . j a va2 s. c om * @param Instances * @Param Attribute index to get distribution of * @Param HashMap to put data into * * @return HashMap of class distribution data */ protected HashMap addDistributionData(Instances instances, int attIndex, HashMap distMap) throws Exception { Map<String, Comparable> temp = new HashMap<String, Comparable>(); ArrayList<Object> distData = new ArrayList(); // GenerateCSV csv = new GenerateCSV(); // String data = ""; boolean isNominal = false; instances.sort(attIndex); for (int i = 0; i < instances.numInstances(); i++) { Instance inst = instances.instance(i); if (!Double.isNaN(inst.value(attIndex))) { temp = new HashMap<String, Comparable>(); if (inst.attribute(attIndex).isNominal()) { temp.put("value", inst.attribute(attIndex).value((int) inst.value(attIndex))); isNominal = true; // data+=inst.attribute(m_Attribute).value((int)inst.value(m_Attribute))+","; } else { temp.put("value", inst.value(attIndex)); // data+=inst.value(att)+","; } temp.put("classprob", inst.classAttribute().value((int) inst.classValue())); // data+=inst.classAttribute().value((int) // inst.classValue())+"\n"; distData.add(temp); } } if (!distData.isEmpty()) { distMap.put("dataArray", distData); distMap.put("isNominal", isNominal); setDistributionData(distMap); } return distMap; // To check if data is being generated right. // csv.generateCsvFile("/home/karthik/Documents/distribution.csv", // data); }
From source file:org.scripps.branch.classifier.ManualTree.java
License:Open Source License
/** * Recursively backfits data into the tree. * /*from w ww . j av a 2s . c o m*/ * @param data * the data to work with * @param classProbs * the class distribution * @throws Exception * if generation fails */ protected void backfitData(Instances data, double[] classProbs) throws Exception { // Make leaf if there are no training instances if (data.numInstances() == 0) { m_Attribute = -1; m_ClassDistribution = null; m_Prop = null; return; } // Check if node doesn't contain enough instances or is pure // or maximum depth reached m_ClassDistribution = classProbs.clone(); /* * if (Utils.sum(m_ClassDistribution) < 2 * m_MinNum || * Utils.eq(m_ClassDistribution[Utils.maxIndex(m_ClassDistribution)], * Utils .sum(m_ClassDistribution))) { * * // Make leaf m_Attribute = -1; m_Prop = null; return; } */ // Are we at an inner node if (m_Attribute > -1) { // Compute new weights for subsets based on backfit data m_Prop = new double[m_Successors.length]; for (int i = 0; i < data.numInstances(); i++) { Instance inst = data.instance(i); if (!inst.isMissing(m_Attribute)) { if (data.attribute(m_Attribute).isNominal()) { m_Prop[(int) inst.value(m_Attribute)] += inst.weight(); } else { m_Prop[(inst.value(m_Attribute) < m_SplitPoint) ? 
0 : 1] += inst.weight(); } } } // If we only have missing values we can make this node into a leaf if (Utils.sum(m_Prop) <= 0) { m_Attribute = -1; m_Prop = null; return; } // Otherwise normalize the proportions Utils.normalize(m_Prop); // Split data Instances[] subsets = splitData(data); // Go through subsets for (int i = 0; i < subsets.length; i++) { // Compute distribution for current subset double[] dist = new double[data.numClasses()]; for (int j = 0; j < subsets[i].numInstances(); j++) { dist[(int) subsets[i].instance(j).classValue()] += subsets[i].instance(j).weight(); } // Backfit subset m_Successors[i].backfitData(subsets[i], dist); } // If unclassified instances are allowed, we don't need to store the // class distribution if (getAllowUnclassifiedInstances()) { m_ClassDistribution = null; return; } // Otherwise, if all successors are non-empty, we don't need to // store the class distribution boolean emptySuccessor = false; for (int i = 0; i < subsets.length; i++) { if (m_Successors[i].m_ClassDistribution == null) { emptySuccessor = true; return; } } m_ClassDistribution = null; // If we have a least two non-empty successors, we should keep this // tree /* * int nonEmptySuccessors = 0; for (int i = 0; i < subsets.length; * i++) { if (m_Successors[i].m_ClassDistribution != null) { * nonEmptySuccessors++; if (nonEmptySuccessors > 1) { return; } } } * * // Otherwise, this node is a leaf or should become a leaf * m_Successors = null; m_Attribute = -1; m_Prop = null; return; */ } }
From source file:org.scripps.branch.classifier.ManualTree.java
License:Open Source License
/** * Recursively generates a tree./*from w ww .j a va2 s. c o m*/ * * @param data * the data to work with * @param classProbs * the class distribution * @param header * the header of the data * @param minNum * the minimum number of instances per leaf * @param debug * whether debugging is on * @param attIndicesWindow * the attribute window to choose attributes from * @param random * random number generator for choosing random attributes * @param depth * the current depth * @param determineStructure * whether to determine structure * @param m_distributionData * HashMap to put distribution data if getSplitData is true in * any node * @throws Exception * if generation fails */ protected void buildTree(Instances data, double[] classProbs, Instances header, boolean debug, int depth, JsonNode node, int parent_index, HashMap m_distributionData, Instances requiredInstances, LinkedHashMap<String, Classifier> custom_classifiers, List<CustomSet> cSList, CustomClassifierService ccService, Dataset ds) throws Exception { if (mapper == null) { mapper = new ObjectMapper(); } // Store structure of dataset, set minimum number of instances m_Info = header; m_Debug = debug; // if in dead json return if (node == null) { m_Attribute = -1; m_ClassDistribution = null; m_Prop = null; return; } // Make leaf if there are no training instances if (data.numInstances() == 0) { m_Attribute = -1; m_ClassDistribution = null; m_Prop = null; return; } // Check if node doesn't contain enough instances or is pure // or maximum depth reached m_ClassDistribution = classProbs.clone(); cSetList = cSList; ccSer = ccService; d = ds; // if (Utils.sum(m_ClassDistribution) < 2 * m_MinNum // || Utils.eq(m_ClassDistribution[Utils.maxIndex(m_ClassDistribution)], // Utils // .sum(m_ClassDistribution)) // || ((getMaxDepth() > 0) && (depth >= getMaxDepth()))) { // // Make leaf // m_Attribute = -1; // m_Prop = null; // return; // } // Investigate the selected attribute int attIndex = parent_index; // options child 
added by web client developer // TODO work with him to make a more meaningful structure... JsonNode options = node.get("options"); if (options == null) { return; } String kind = options.get("kind").asText(); JsonNode att_name = options.get("attribute_name"); Boolean getSplitData = false; Boolean getInstanceData = false; // this allows me to modify the json tree structure to add data about // the evaluation ObjectNode evalresults = (ObjectNode) options; ObjectNode _node = (ObjectNode) node; //For Roc - Node Match _node.set("roc_uid_0", null); _node.set("roc_uid_1", null); Map<String, JsonNode> sons = new HashMap<String, JsonNode>(); // String name = node_name.asText(); if (kind != null && kind.equals("split_node") && att_name != null) { // // attIndex = data.attribute(node_id.asText()).index(); if (!att_name.asText().equals("") && !att_name.asText().contains("custom_classifier") && !att_name.asText().contains("custom_tree") && !att_name.asText().contains("custom_set")) { attIndex = data.attribute(att_name.asText()).index(); } else { if (att_name.asText().contains("custom_set")) { int ctr = 0; for (CustomSet c : cSList) { if (c.getId() == Long.valueOf(att_name.asText().replace("custom_set_", ""))) { break; } ctr++; } attIndex = (data.numAttributes() - 1) + custom_classifiers.size() + ctr; } else { if (att_name.asText().contains("custom_classifier_new")) { HashMap mp = ccSer.buildCustomClasifier(data, Long.valueOf(att_name.asText().replace("custom_classifier_new_", ""))); Classifier fc = (Classifier) mp.get("classifier"); custom_classifiers.put("custom_classifier_" + mp.get("id"), fc); evalresults.put("unique_id", "custom_classifier_" + mp.get("id")); evalresults.put("attribute_name", "custom_classifier_" + mp.get("id")); att_name = evalresults.get("attribute_name"); } int ctr = 0; for (String key : custom_classifiers.keySet()) { if (key.equals(att_name.asText())) { break; } ctr++; } attIndex = (data.numAttributes() - 1) + ctr; } } if (node.get("getSplitData") != 
null) { getSplitData = node.get("getSplitData").asBoolean(); } JsonNode split_values = node.get("children"); int c = 0; if (split_values != null && split_values.size() > 0) { for (JsonNode svalue : split_values) { String key = svalue.get("name").asText(); JsonNode son = svalue.get("children").get(0); if (key.contains("<")) { key = "low"; } else if (key.contains(">")) { key = "high"; } sons.put(key, son); c++; } } // LOGGER.debug("Id name "+att_name+" index "+attIndex+" type "+kind+" sons "+c); } else { // LOGGER.debug("non split node, name "+att_name+" type "+kind); } double[] vals = new double[data.numAttributes() + custom_classifiers.size() + cSetList.size()]; double[][][] dists = new double[data.numAttributes() + custom_classifiers.size() + cSetList.size()][0][0]; double[][] props = new double[data.numAttributes() + custom_classifiers.size() + cSetList.size()][0]; double[] splits = new double[data.numAttributes() + custom_classifiers.size() + cSetList.size()]; listOfFc = custom_classifiers; // Compute class distributions and value of splitting // criterion for each attribute HashMap<String, Double> mp = new HashMap<String, Double>(); if (attIndex >= data.numAttributes() && attIndex < data.numAttributes() + custom_classifiers.size()) { mp = distribution(props, dists, attIndex, data, Double.NaN, custom_classifiers); } else if (attIndex >= data.numAttributes() + custom_classifiers.size() - 1) { mp = distribution(props, dists, attIndex, data, Double.NaN, custom_classifiers); } else { if (options.get("split_point") != null) { mp = distribution(props, dists, attIndex, data, options.get("split_point").asDouble(), custom_classifiers); } else { mp = distribution(props, dists, attIndex, data, Double.NaN, custom_classifiers); } } splits[attIndex] = mp.get("split_point"); vals[attIndex] = gain(dists[attIndex], priorVal(dists[attIndex])); m_Attribute = attIndex; double[][] distribution = dists[m_Attribute]; // stop if input json tree does not contain any more children // 
replacing Utils.gr(vals[m_Attribute], 0)&& if (kind != null && kind.equals("split_node") && att_name != null) { //Assign Classes for custom sets(visual splits). m_ClassAssignment.put("Inside", Utils.maxIndex(dists[m_Attribute][1])); m_ClassAssignment.put("Outside", (Utils.maxIndex(dists[m_Attribute][1]) == 1) ? 0 : 1); // Build subtrees m_SplitPoint = splits[m_Attribute]; m_Prop = props[m_Attribute]; Instances[] subsets = splitData(data); m_Successors = new ManualTree[distribution.length]; // record quantity and quality measures for node int quantity = 0; for (int i = 0; i < distribution.length; i++) { quantity += subsets[i].numInstances(); } evalresults.put("bin_size", quantity); evalresults.put("infogain", vals[m_Attribute]); evalresults.put("majClass", m_Info.classAttribute().value(Utils.maxIndex(m_ClassDistribution))); evalresults.put("split_point", m_SplitPoint); evalresults.put("orig_split_point", mp.get("orig_split_point")); if (Boolean.TRUE.equals(getSplitData)) { addDistributionData(data, m_Attribute, m_distributionData); } int maxIndex = 0; double maxCount = 0; double errors = 0; double[] classDist = new double[2]; double pct_correct = 0; double bin_size = 0; for (int i = 0; i < distribution.length; i++) { m_Successors[i] = new ManualTree(); m_Successors[i].setKValue(m_KValue); m_Successors[i].setMaxDepth(getMaxDepth()); //To compute class distribution for split node. for (int j = 0; j < distribution[i].length; j++) { classDist[j] += distribution[i][j]; } // test an instance to see which child node to send its subset // down. 
// after split, should hold for all in set String child_name = ""; Instances subset = subsets[i]; if (subset == null || subset.numInstances() == 0) { continue; } Instance inst = subset.instance(0); if (m_Attribute >= data.numAttributes() && m_Attribute < data.numAttributes() + custom_classifiers.size()) { double predictedClass = custom_classifiers.get(att_name.asText()).classifyInstance(inst); child_name = m_Info.classAttribute().value((int) predictedClass); } else if (m_Attribute >= data.numAttributes() + custom_classifiers.size() - 1) { CustomSet cSet = getReqCustomSet( m_Attribute - (data.numAttributes() - 1 + custom_classifiers.size()), cSetList); JsonNode vertices = mapper.readTree(cSet.getConstraints()); ArrayList<double[]> attrVertices = generateVerticesList(vertices); List<Attribute> aList = generateAttributeList(cSet, data, ds); double[] testPoint = new double[2]; testPoint[0] = inst.value(aList.get(0)); testPoint[1] = inst.value(aList.get(1)); int check = checkPointInPolygon(attrVertices, testPoint); if (check == 0) { child_name = "Outside"; } else { child_name = "Inside"; } } else { // which nominal attribute is this split linked to? if (subset.attribute(m_Attribute).isNominal()) { child_name = inst.attribute(m_Attribute).value((int) inst.value(m_Attribute)); } // otherwise, if we have a numeric attribute, are we going // high or low? 
else if (data.attribute(m_Attribute).isNumeric()) { if (inst.value(m_Attribute) < m_SplitPoint) { child_name = "low"; } else { child_name = "high"; } } } m_Successors[i].setM_ClassAssignment((HashMap<String, Integer>) m_ClassAssignment.clone()); JsonNode son = sons.get(child_name); if (son != null) { m_Successors[i].buildTree(subsets[i], distribution[i], header, m_Debug, depth + 1, son, attIndex, m_distributionData, requiredInstances, custom_classifiers, cSList, ccService, ds); } else { // if we are a split node with no input children, we need to // add them into the tree // JsonNode split_values = node.get("children"); if (kind != null && kind.equals("split_node")) { ArrayNode children = (ArrayNode) node.get("children"); if (children == null) { children = mapper.createArrayNode(); } ObjectNode child = mapper.createObjectNode(); child.put("name", child_name); ObjectNode c_options = mapper.createObjectNode(); c_options.put("attribute_name", child_name); c_options.put("kind", "split_value"); child.put("options", c_options); children.add(child); _node.put("children", children); m_Successors[i].buildTree(subsets[i], distribution[i], header, m_Debug, depth + 1, child, attIndex, m_distributionData, requiredInstances, custom_classifiers, cSList, ccService, ds); } else { // for leaf nodes, calling again ends the cycle and // fills up the bins appropriately m_Successors[i].buildTree(subsets[i], distribution[i], header, m_Debug, depth + 1, node, attIndex, m_distributionData, requiredInstances, custom_classifiers, cSList, ccService, ds); } } } // Compute pct_correct from distributions and send to split_node bin_size = Utils.sum(classDist); maxIndex = Utils.maxIndex(classDist); maxCount = classDist[maxIndex]; String class_name = m_Info.classAttribute().value(maxIndex); _node.put("majClass", class_name); errors += bin_size - maxCount; pct_correct = (quantity - errors) / quantity; evalresults.put("pct_correct", pct_correct); // If all successors are non-empty, we don't need to 
store the class // distribution boolean emptySuccessor = false; for (int i = 0; i < subsets.length; i++) { if (m_Successors[i].m_ClassDistribution == null) { emptySuccessor = true; break; } } if (!emptySuccessor) { m_ClassDistribution = null; } } else { m_Attribute = -1; if (kind != null && kind.equals("leaf_node")) { double bin_size = 0, maxCount = 0; int maxIndex = 0; double errors = 0; double pct_correct = 0; if (m_ClassDistribution != null) { bin_size = Utils.sum(m_ClassDistribution); maxIndex = Utils.maxIndex(m_ClassDistribution); // this is // where it // decides // what // class the // leaf is.. // takes the // majority. maxCount = m_ClassDistribution[maxIndex]; errors = bin_size - maxCount; pct_correct = (bin_size - errors) / bin_size; } if (node.get("pickInst") != null) { getInstanceData = node.get("pickInst").asBoolean(); } if (Boolean.TRUE.equals(getInstanceData)) { requiredInstances.delete(); for (int k = 0; k < data.numInstances(); k++) { requiredInstances.add(data.instance(k)); } } String class_name = m_Info.classAttribute().value(maxIndex); _node.put("majClass", class_name); if (node.get("setClass") != null) { String setClass = node.get("setClass").asText(); class_name = m_Info.classAttribute().value(m_ClassAssignment.get(setClass)); } _node.put("name", class_name); evalresults.put("attribute_name", class_name); evalresults.put("kind", "leaf_node"); evalresults.put("bin_size", Utils.doubleToString(bin_size, 2)); evalresults.put("errors", Utils.doubleToString(errors, 2)); evalresults.put("pct_correct", Utils.doubleToString(pct_correct, 2)); this.setJsonnode(_node); } else { // Make leaf // add the data to the json object double bin_size = 0, maxCount = 0; int maxIndex = 0; double errors = 0; double pct_correct = 0; if (m_ClassDistribution != null) { bin_size = Utils.sum(m_ClassDistribution); maxIndex = Utils.maxIndex(m_ClassDistribution); // this is // where it // decides // what // class the // leaf is.. // takes the // majority. 
maxCount = m_ClassDistribution[maxIndex]; errors = bin_size - maxCount; pct_correct = (bin_size - errors) / bin_size; } ArrayNode children = (ArrayNode) node.get("children"); if (children == null) { children = mapper.createArrayNode(); } ObjectNode child = mapper.createObjectNode(); String class_name = m_Info.classAttribute().value(maxIndex); child.put("majClass", class_name); String nodeName = node.get("name").asText(); if (nodeName.equals("Inside") || nodeName.equals("Outside")) { child.put("setClass", nodeName); class_name = m_Info.classAttribute().value(m_ClassAssignment.get(nodeName)); } child.put("name", class_name); ObjectNode c_options = mapper.createObjectNode(); c_options.put("attribute_name", class_name); c_options.put("kind", "leaf_node"); c_options.put("bin_size", Utils.doubleToString(bin_size, 2)); c_options.put("errors", Utils.doubleToString(errors, 2)); c_options.put("pct_correct", Utils.doubleToString(pct_correct, 2)); child.put("options", c_options); children.add(child); _node.put("children", children); this.setJsonnode(child); } } }
From source file:org.scripps.branch.classifier.ManualTree.java
License:Open Source License
/** * Computes class distribution for an attribute. * //from www . j av a 2 s . c o m * @param props * @param dists * @param att * the attribute index * @param data * the data to work with * @throws Exception * if something goes wrong */ protected HashMap<String, Double> distribution(double[][] props, double[][][] dists, int att, Instances data, double givenSplitPoint, HashMap<String, Classifier> custom_classifiers) throws Exception { HashMap<String, Double> mp = new HashMap<String, Double>(); double splitPoint = givenSplitPoint; double origSplitPoint = 0; Attribute attribute = null; double[][] dist = null; int indexOfFirstMissingValue = -1; String CustomClassifierId = null; CustomSet cSet = null; if (att >= data.numAttributes() && att < data.numAttributes() + custom_classifiers.size()) { CustomClassifierId = getKeyinMap(custom_classifiers, att, data); } else if (att >= data.numAttributes() + custom_classifiers.size()) { cSet = getReqCustomSet(att - (data.numAttributes() - 1 + custom_classifiers.size()), cSetList); } else { attribute = data.attribute(att); } if (CustomClassifierId == null && cSet == null) { if (attribute.isNominal()) { // For nominal attributes dist = new double[attribute.numValues()][data.numClasses()]; for (int i = 0; i < data.numInstances(); i++) { Instance inst = data.instance(i); if (inst.isMissing(att)) { // Skip missing values at this stage if (indexOfFirstMissingValue < 0) { indexOfFirstMissingValue = i; } continue; } dist[(int) inst.value(att)][(int) inst.classValue()] += inst.weight(); } } else { // For numeric attributes double[][] currDist = new double[2][data.numClasses()]; dist = new double[2][data.numClasses()]; // Sort data data.sort(att); // Move all instances into second subset for (int j = 0; j < data.numInstances(); j++) { Instance inst = data.instance(j); if (inst.isMissing(att)) { // Can stop as soon as we hit a missing value indexOfFirstMissingValue = j; break; } currDist[1][(int) inst.classValue()] += inst.weight(); } // 
Value before splitting double priorVal = priorVal(currDist); // Save initial distribution for (int j = 0; j < currDist.length; j++) { System.arraycopy(currDist[j], 0, dist[j], 0, dist[j].length); } if (Double.isNaN(splitPoint)) { // Try all possible split points double currSplit = data.instance(0).value(att); double currVal, bestVal = -Double.MAX_VALUE; for (int i = 0; i < data.numInstances(); i++) { Instance inst = data.instance(i); if (inst.isMissing(att)) { // Can stop as soon as we hit a missing value break; } // Can we place a sensible split point here? if (inst.value(att) > currSplit) { // Compute gain for split point currVal = gain(currDist, priorVal); // Is the current split point the best point so far? if (currVal > bestVal) { // Store value of current point bestVal = currVal; // Save split point splitPoint = (inst.value(att) + currSplit) / 2.0; origSplitPoint = splitPoint; // Save distribution for (int j = 0; j < currDist.length; j++) { System.arraycopy(currDist[j], 0, dist[j], 0, dist[j].length); } } } currSplit = inst.value(att); // Shift over the weight currDist[0][(int) inst.classValue()] += inst.weight(); currDist[1][(int) inst.classValue()] -= inst.weight(); } } else { double currSplit = data.instance(0).value(att); double currVal, bestVal = -Double.MAX_VALUE; // Split data set using given split point. for (int i = 0; i < data.numInstances(); i++) { Instance inst = data.instance(i); if (inst.isMissing(att)) { // Can stop as soon as we hit a missing value break; } if (inst.value(att) > currSplit) { // Compute gain for split point currVal = gain(currDist, priorVal); // Is the current split point the best point so far? 
if (currVal > bestVal) { // Store value of current point bestVal = currVal; // Save computed split point origSplitPoint = (inst.value(att) + currSplit) / 2.0; } } currSplit = inst.value(att); // Shift over the weight currDist[0][(int) inst.classValue()] += inst.weight(); currDist[1][(int) inst.classValue()] -= inst.weight(); if (inst.value(att) <= splitPoint) { // Save distribution since split point is specified for (int j = 0; j < currDist.length; j++) { System.arraycopy(currDist[j], 0, dist[j], 0, dist[j].length); } } } } } } else if (CustomClassifierId != null) { Classifier fc = custom_classifiers.get(CustomClassifierId); dist = new double[data.numClasses()][data.numClasses()]; Instance inst; for (int i = 0; i < data.numInstances(); i++) { inst = data.instance(i); double predictedClass = fc.classifyInstance(inst); if (predictedClass != Instance.missingValue()) { dist[(int) predictedClass][(int) inst.classValue()] += inst.weight(); } } } else if (cSet != null) { dist = new double[data.numClasses()][data.numClasses()]; JsonNode vertices = mapper.readTree(cSet.getConstraints()); ArrayList<double[]> attrVertices = generateVerticesList(vertices); List<Attribute> aList = generateAttributeList(cSet, data, d); double[] testPoint = new double[2]; int ctr = 0; for (int k = 0; k < data.numInstances(); k++) { testPoint = new double[2]; ctr = 0; for (Attribute a : aList) { if (!data.instance(k).isMissing(a)) { testPoint[ctr] = data.instance(k).value(a); ctr++; } } int check = checkPointInPolygon(attrVertices, testPoint); dist[check][(int) data.instance(k).classValue()] += data.instance(k).weight(); } } // Compute weights for subsetsCustomClassifierIndex props[att] = new double[dist.length]; for (int k = 0; k < props[att].length; k++) { props[att][k] = Utils.sum(dist[k]); } if (Utils.eq(Utils.sum(props[att]), 0)) { for (int k = 0; k < props[att].length; k++) { props[att][k] = 1.0 / props[att].length; } } else { Utils.normalize(props[att]); } // Any instances with missing 
values ? if (indexOfFirstMissingValue > -1) { // Distribute weights for instances with missing values for (int i = indexOfFirstMissingValue; i < data.numInstances(); i++) { Instance inst = data.instance(i); if (attribute.isNominal()) { // Need to check if attribute value is missing if (inst.isMissing(att)) { for (int j = 0; j < dist.length; j++) { dist[j][(int) inst.classValue()] += props[att][j] * inst.weight(); } } } else { // Can be sure that value is missing, so no test required for (int j = 0; j < dist.length; j++) { dist[j][(int) inst.classValue()] += props[att][j] * inst.weight(); } } } } // Return distribution and split point dists[att] = dist; mp.put("split_point", splitPoint); mp.put("orig_split_point", origSplitPoint); return mp; }
From source file:org.scripps.branch.classifier.ManualTree.java
License:Open Source License
/** * Computes class distribution of an instance using the decision tree. * /*from ww w . ja v a 2 s. c o m*/ * @param instance * the instance to compute the distribution for * @return the computed class distribution * @throws Exception * if computation fails */ @Override public double[] distributionForInstance(Instance instance) throws Exception { // default model? if (m_ZeroR != null) { return m_ZeroR.distributionForInstance(instance); } double[] returnedDist = null; //Set Parent Node to set m_pred in case custom set occurs. if (m_Successors != null) { for (int i = 0; i < m_Successors.length; i++) { m_Successors[i].setParentNode(this.parentNode); } } if (m_Info != null) { if (m_Attribute > -1 && m_Attribute < m_Info.numAttributes()) { // Node is not a leaf if (instance.isMissing(m_Attribute)) { LOGGER.debug("Missing attribute"); // Value is missing returnedDist = new double[m_Info.numClasses()]; // Split instance up for (int i = 0; i < m_Successors.length; i++) { double[] help = m_Successors[i].distributionForInstance(instance); if (help != null) { for (int j = 0; j < help.length; j++) { returnedDist[j] += m_Prop[i] * help[j]; } } } LOGGER.debug("Missing Instance"); } else if (m_Info.attribute(m_Attribute).isNominal()) { // For nominal attributes returnedDist = m_Successors[(int) instance.value(m_Attribute)] .distributionForInstance(instance); } else { // For numeric attributes if (instance.value(m_Attribute) < m_SplitPoint) { returnedDist = m_Successors[0].distributionForInstance(instance); } else { returnedDist = m_Successors[1].distributionForInstance(instance); } } } else if (m_Attribute >= m_Info.numAttributes() - 1) { if (m_Attribute >= (listOfFc.size() + m_Info.numAttributes()) - 1) { CustomSet cSet = getReqCustomSet(m_Attribute - (listOfFc.size() - 1 + m_Info.numAttributes()), cSetList); JsonNode vertices = mapper.readTree(cSet.getConstraints()); ArrayList<double[]> attrVertices = generateVerticesList(vertices); List<Attribute> aList = 
generateAttributeList(cSet, m_Info, d); double[] testPoint = new double[2]; testPoint[0] = instance.value(aList.get(0)); testPoint[1] = instance.value(aList.get(1)); int check = checkPointInPolygon(attrVertices, testPoint); if (m_Successors[check].getM_Attribute() == -1) { parentNode.setM_pred(m_ClassAssignment.get((check == 0) ? "Outside" : "Inside")); } returnedDist = m_Successors[check].distributionForInstance(instance); } else { String classifierId = ""; classifierId = getKeyinMap(listOfFc, m_Attribute, m_Info); Classifier fc = listOfFc.get(classifierId); double predictedClass = fc.classifyInstance(instance); if (predictedClass != Instance.missingValue()) { returnedDist = m_Successors[(int) predictedClass].distributionForInstance(instance); } } } } // Node is a leaf or successor is empty? if ((m_Attribute == -1) || (returnedDist == null)) { // Is node empty? if (m_ClassDistribution == null) { if (getAllowUnclassifiedInstances()) { return new double[m_Info.numClasses()]; } else { return null; } } // Else return normalized distribution double[] normalizedDistribution = m_ClassDistribution.clone(); if (this.parentNode != null) { this.parentNode.setJsonnode(this.getJsonnode()); } try { Utils.normalize(normalizedDistribution); } catch (Exception e) { LOGGER.error("Sum is 0. Coudln't Normalize"); } return normalizedDistribution; } else { return returnedDist; } }
From source file:org.scripps.branch.classifier.ManualTree.java
License:Open Source License
/** * Computes class distribution of an instance using the decision tree. * /*w ww.j a va2 s. c om*/ * @param instance * the instance to compute the distribution for * @return the computed class distribution * @throws Exception * if computation fails */ public double[] predForInstance(Instance instance) throws Exception { // default model? if (m_ZeroR != null) { return m_ZeroR.distributionForInstance(instance); } double[] returnedDist = null; if (m_Attribute > -1 && m_Attribute < m_Info.numAttributes()) { // Node is not a leaf if (instance.isMissing(m_Attribute)) { // Value is missing returnedDist = new double[m_Info.numClasses()]; // Split instance up for (int i = 0; i < m_Successors.length; i++) { double[] help = m_Successors[i].distributionForInstance(instance); if (help != null) { for (int j = 0; j < help.length; j++) { returnedDist[j] += m_Prop[i] * help[j]; } } } } else if (m_Info.attribute(m_Attribute).isNominal()) { // For nominal attributes returnedDist = m_Successors[(int) instance.value(m_Attribute)].distributionForInstance(instance); } else { // For numeric attributes if (instance.value(m_Attribute) < m_SplitPoint) { returnedDist = m_Successors[0].distributionForInstance(instance); } else { returnedDist = m_Successors[1].distributionForInstance(instance); } } } else if (m_Attribute >= m_Info.numAttributes() - 1) { if (m_Attribute >= (listOfFc.size() + m_Info.numAttributes()) - 1) { CustomSet cSet = getReqCustomSet(m_Attribute - (listOfFc.size() - 1 + m_Info.numAttributes()), cSetList); JsonNode vertices = mapper.readTree(cSet.getConstraints()); ArrayList<double[]> attrVertices = generateVerticesList(vertices); List<Attribute> aList = generateAttributeList(cSet, m_Info, d); double[] testPoint = new double[2]; testPoint[0] = instance.value(aList.get(0)); testPoint[1] = instance.value(aList.get(1)); int check = checkPointInPolygon(attrVertices, testPoint); returnedDist = m_Successors[check].distributionForInstance(instance); } else { String classifierId = 
""; classifierId = getKeyinMap(listOfFc, m_Attribute, m_Info); Classifier fc = listOfFc.get(classifierId); double predictedClass = fc.classifyInstance(instance); if (predictedClass != Instance.missingValue()) { returnedDist = m_Successors[(int) predictedClass].distributionForInstance(instance); } } } // Node is a leaf or successor is empty? if ((m_Attribute == -1) || (returnedDist == null)) { // Is node empty? if (m_ClassDistribution == null) { if (getAllowUnclassifiedInstances()) { return new double[m_Info.numClasses()]; } else { return null; } } // Else return normalized distribution double[] normalizedDistribution = m_ClassDistribution.clone(); Utils.normalize(normalizedDistribution); return normalizedDistribution; } else { return returnedDist; } }