List of usage examples for weka.core.Instance value(Attribute)
public double value(Attribute att);
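Before the source-file examples, here is a minimal, self-contained sketch of how value(Attribute) is typically called. The file name data.arff, the class name ValueExample, and the attribute positions are placeholders, not taken from the listings below.

import java.io.BufferedReader;
import java.io.FileReader;
import weka.core.Attribute;
import weka.core.Instance;
import weka.core.Instances;

public class ValueExample {
    public static void main(String[] args) throws Exception {
        // Load a dataset (placeholder file name) and mark the last attribute as the class.
        Instances data = new Instances(new BufferedReader(new FileReader("data.arff")));
        data.setClassIndex(data.numAttributes() - 1);

        Instance inst = data.instance(0);
        Attribute att = data.attribute(0);

        // For a numeric attribute this is the numeric value; for a nominal attribute
        // it is the zero-based index of the instance's label in att's list of values.
        double v = inst.value(att);
        System.out.println(att.name() + " = " + v);
    }
}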
From source file:org.wkwk.classifier.MyC45.java
@Override
public double classifyInstance(Instance data) {
    if (splitAttribute == null) {
        return classValue;
    } else {
        if (splitAttribute.isNominal()) {
            return successors[(int) data.value(splitAttribute)].classifyInstance(data);
        } else if (splitAttribute.isNumeric()) {
            if (data.value(splitAttribute) < attrThreshold) {
                return successors[0].classifyInstance(data);
            } else {
                return successors[1].classifyInstance(data);
            }
        } else {
            return -1;
        }
    }
}
From source file:org.wkwk.classifier.MyC45.java
public Instances[] splitData(Instances data, Attribute attr) {
    Instances[] splitData = new Instances[attr.numValues()];
    for (int i = 0; i < attr.numValues(); i++) {
        splitData[i] = new Instances(data, data.numInstances());
    }
    Enumeration instEnum = data.enumerateInstances();
    while (instEnum.hasMoreElements()) {
        Instance inst = (Instance) instEnum.nextElement();
        splitData[(int) inst.value(attr)].add(inst);
    }
    return splitData;
}
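In both MyC45 snippets above, value() is called on a nominal split attribute; for a nominal attribute the returned double is the zero-based index of the instance's label in the attribute's declared values, which is why it can be cast to int and used as an array index. A minimal sketch of that behaviour, assuming a loaded dataset data with a hypothetical nominal attribute named "outlook":

Attribute outlook = data.attribute("outlook");  // hypothetical nominal attribute
Instance first = data.instance(0);
int idx = (int) first.value(outlook);           // zero-based index of the label
String label = outlook.value(idx);              // same label as first.stringValue(outlook)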
From source file:org.wkwk.classifier.MyC45.java
public double bestThreshold(Instances data, Attribute attr) {
    data.sort(attr);
    double m_ig = 0;
    double bestThr = 0;
    double classTemp = data.get(0).classValue();
    double valueTemp = data.get(0).value(attr);
    Enumeration instEnum = data.enumerateInstances();
    double dt;
    while (instEnum.hasMoreElements()) {
        Instance inst = (Instance) instEnum.nextElement();
        if (classTemp != inst.classValue()) {
            classTemp = inst.classValue();
            dt = valueTemp;
            valueTemp = inst.value(attr);
            double threshold = dt + ((valueTemp - dt) / 2);
            double igTemp = computeInfoGainCont(data, attr, threshold);
            if (m_ig < igTemp) {
                m_ig = igTemp;
                bestThr = threshold;
            }
        }
    }
    return bestThr;
}
From source file:Part2.HierarchicalClusterer.java
License:Open Source License
/**
 * Calculate the distance between two clusters.
 * @param cluster1 list of indices of instances in the first cluster
 * @param cluster2 ditto for the second cluster
 * @return distance between clusters based on link type
 */
double getDistance(double[][] fDistance, Vector<Integer> cluster1, Vector<Integer> cluster2) {
    double fBestDist = Double.MAX_VALUE;
    //double SemiDist = m_DistanceFunction.Semi_distance(fDistance, cluster1, cluster2);
    switch (m_nLinkType) {
    case SINGLE:
        // find single link distance aka minimum link, which is the closest distance between
        // any item in cluster1 and any item in cluster2
        fBestDist = Double.MAX_VALUE;
        for (int i = 0; i < cluster1.size(); i++) {
            int i1 = cluster1.elementAt(i);
            for (int j = 0; j < cluster2.size(); j++) {
                int i2 = cluster2.elementAt(j);
                double fDist = fDistance[i1][i2];
                if (fBestDist > fDist) { fBestDist = fDist; }
            }
        }
        break;
    case COMPLETE:
    case ADJCOMLPETE:
        // find complete link distance aka maximum link, which is the largest distance between
        // any item in cluster1 and any item in cluster2
        fBestDist = 0;
        for (int i = 0; i < cluster1.size(); i++) {
            int i1 = cluster1.elementAt(i);
            for (int j = 0; j < cluster2.size(); j++) {
                int i2 = cluster2.elementAt(j);
                double fDist = fDistance[i1][i2];
                if (fBestDist < fDist) { fBestDist = fDist; }
            }
        }
        if (m_nLinkType == COMPLETE) { break; }
        // calculate adjustment, which is the largest within-cluster distance
        double fMaxDist = 0;
        for (int i = 0; i < cluster1.size(); i++) {
            int i1 = cluster1.elementAt(i);
            for (int j = i + 1; j < cluster1.size(); j++) {
                int i2 = cluster1.elementAt(j);
                double fDist = fDistance[i1][i2];
                if (fMaxDist < fDist) { fMaxDist = fDist; }
            }
        }
        for (int i = 0; i < cluster2.size(); i++) {
            int i1 = cluster2.elementAt(i);
            for (int j = i + 1; j < cluster2.size(); j++) {
                int i2 = cluster2.elementAt(j);
                double fDist = fDistance[i1][i2];
                if (fMaxDist < fDist) { fMaxDist = fDist; }
            }
        }
        fBestDist -= fMaxDist;
        break;
    case AVERAGE:
        // finds the average distance between the elements of the two clusters
        fBestDist = 0;
        for (int i = 0; i < cluster1.size(); i++) {
            int i1 = cluster1.elementAt(i);
            for (int j = 0; j < cluster2.size(); j++) {
                int i2 = cluster2.elementAt(j);
                fBestDist += fDistance[i1][i2];
            }
        }
        fBestDist /= (cluster1.size() * cluster2.size());
        break;
    case MEAN: {
        // calculates the mean distance of the merged cluster (aka group-average agglomerative clustering)
        Vector<Integer> merged = new Vector<Integer>();
        merged.addAll(cluster1);
        merged.addAll(cluster2);
        fBestDist = 0;
        for (int i = 0; i < merged.size(); i++) {
            int i1 = merged.elementAt(i);
            for (int j = i + 1; j < merged.size(); j++) {
                int i2 = merged.elementAt(j);
                fBestDist += fDistance[i1][i2];
            }
        }
        int n = merged.size();
        fBestDist /= (n * (n - 1.0) / 2.0);
    }
        break;
    case CENTROID:
        // finds the distance between the centroids of the clusters
        double[] fValues1 = new double[m_instances.numAttributes()];
        for (int i = 0; i < cluster1.size(); i++) {
            Instance instance = m_instances.instance(cluster1.elementAt(i));
            for (int j = 0; j < m_instances.numAttributes(); j++) {
                fValues1[j] += instance.value(j);
            }
        }
        double[] fValues2 = new double[m_instances.numAttributes()];
        for (int i = 0; i < cluster2.size(); i++) {
            Instance instance = m_instances.instance(cluster2.elementAt(i));
            for (int j = 0; j < m_instances.numAttributes(); j++) {
                fValues2[j] += instance.value(j);
            }
        }
        for (int j = 0; j < m_instances.numAttributes(); j++) {
            fValues1[j] /= cluster1.size();
            fValues2[j] /= cluster2.size();
        }
        // set up two instances for the distance function
        Instance instance1 = (Instance) m_instances.instance(0).copy();
        Instance instance2 = (Instance) m_instances.instance(0).copy();
        for (int j = 0; j < m_instances.numAttributes(); j++) {
            instance1.setValue(j, fValues1[j]);
            instance2.setValue(j, fValues2[j]);
        }
        fBestDist = m_DistanceFunction.distance(instance1, instance2);
        break;
    case WARD: {
        // finds the change in information caused by merging the clusters.
        // The information of a cluster is calculated as the error sum of squares of the
        // centroid of the cluster and its members.
        double ESS1 = calcESS(cluster1);
        double ESS2 = calcESS(cluster2);
        Vector<Integer> merged = new Vector<Integer>();
        merged.addAll(cluster1);
        merged.addAll(cluster2);
        double ESS = calcESS(merged);
        fBestDist = ESS * merged.size() - ESS1 * cluster1.size() - ESS2 * cluster2.size();
    }
        break;
    }
    //double alpha = 1;
    //return alpha*SemiDist + (1-alpha)*fBestDist;
    return fBestDist;
}
From source file:Part2.HierarchicalClustererEx.java
License:Open Source License
/**
 * Calculate the distance between two clusters.
 * @param cluster1 list of indices of instances in the first cluster
 * @param cluster2 ditto for the second cluster
 * @return distance between clusters based on link type
 */
double getDistance(double[][] fDistance, Vector<Integer> cluster1, Vector<Integer> cluster2) {
    double fBestDist = Double.MAX_VALUE;
    //double SemiDist = m_DistanceFunction.Semi_distance(fDistance, cluster1, cluster2);
    switch (m_nLinkType) {
    case SINGLE:
        // find single link distance aka minimum link, which is the closest distance between
        // any item in cluster1 and any item in cluster2
        fBestDist = Double.MAX_VALUE;
        for (int i = 0; i < cluster1.size(); i++) {
            int i1 = cluster1.elementAt(i);
            for (int j = 0; j < cluster2.size(); j++) {
                int i2 = cluster2.elementAt(j);
                double fDist = fDistance[i1][i2];
                if (fBestDist > fDist) { fBestDist = fDist; }
            }
        }
        break;
    case COMPLETE:
    case ADJCOMLPETE:
        // find complete link distance aka maximum link, which is the largest distance between
        // any item in cluster1 and any item in cluster2
        fBestDist = 0;
        for (int i = 0; i < cluster1.size(); i++) {
            int i1 = cluster1.elementAt(i);
            for (int j = 0; j < cluster2.size(); j++) {
                int i2 = cluster2.elementAt(j);
                double fDist = fDistance[i1][i2];
                if (fBestDist < fDist) { fBestDist = fDist; }
            }
        }
        if (m_nLinkType == COMPLETE) { break; }
        // calculate adjustment, which is the largest within-cluster distance
        double fMaxDist = 0;
        for (int i = 0; i < cluster1.size(); i++) {
            int i1 = cluster1.elementAt(i);
            for (int j = i + 1; j < cluster1.size(); j++) {
                int i2 = cluster1.elementAt(j);
                double fDist = fDistance[i1][i2];
                if (fMaxDist < fDist) { fMaxDist = fDist; }
            }
        }
        for (int i = 0; i < cluster2.size(); i++) {
            int i1 = cluster2.elementAt(i);
            for (int j = i + 1; j < cluster2.size(); j++) {
                int i2 = cluster2.elementAt(j);
                double fDist = fDistance[i1][i2];
                if (fMaxDist < fDist) { fMaxDist = fDist; }
            }
        }
        fBestDist -= fMaxDist;
        break;
    case AVERAGE:
        // finds the average distance between the elements of the two clusters
        fBestDist = 0;
        for (int i = 0; i < cluster1.size(); i++) {
            int i1 = cluster1.elementAt(i);
            for (int j = 0; j < cluster2.size(); j++) {
                int i2 = cluster2.elementAt(j);
                fBestDist += fDistance[i1][i2];
            }
        }
        fBestDist /= (cluster1.size() * cluster2.size());
        break;
    case MEAN: {
        // calculates the mean distance of the merged cluster (aka group-average agglomerative clustering)
        Vector<Integer> merged = new Vector<Integer>();
        merged.addAll(cluster1);
        merged.addAll(cluster2);
        fBestDist = 0;
        for (int i = 0; i < merged.size(); i++) {
            int i1 = merged.elementAt(i);
            for (int j = i + 1; j < merged.size(); j++) {
                int i2 = merged.elementAt(j);
                fBestDist += fDistance[i1][i2];
            }
        }
        int n = merged.size();
        fBestDist /= (n * (n - 1.0) / 2.0);
    }
        break;
    case CENTROID:
        // finds the distance between the centroids of the clusters
        double[] fValues1 = new double[m_instances.numAttributes()];
        for (int i = 0; i < cluster1.size(); i++) {
            Instance instance = m_instances.instance(cluster1.elementAt(i));
            for (int j = 0; j < m_instances.numAttributes(); j++) {
                fValues1[j] += instance.value(j);
            }
        }
        double[] fValues2 = new double[m_instances.numAttributes()];
        for (int i = 0; i < cluster2.size(); i++) {
            Instance instance = m_instances.instance(cluster2.elementAt(i));
            for (int j = 0; j < m_instances.numAttributes(); j++) {
                fValues2[j] += instance.value(j);
            }
        }
        for (int j = 0; j < m_instances.numAttributes(); j++) {
            fValues1[j] /= cluster1.size();
            fValues2[j] /= cluster2.size();
        }
        // set up two instances for the distance function
        Instance instance1 = (Instance) m_instances.instance(0).copy();
        Instance instance2 = (Instance) m_instances.instance(0).copy();
        for (int j = 0; j < m_instances.numAttributes(); j++) {
            instance1.setValue(j, fValues1[j]);
            instance2.setValue(j, fValues2[j]);
        }
        fBestDist = m_DistanceFunction.distance(instance1, instance2);
        break;
    case WARD: {
        // finds the change in information caused by merging the clusters.
        // The information of a cluster is calculated as the error sum of squares of the
        // centroid of the cluster and its members.
        double ESS1 = calcESS(cluster1);
        double ESS2 = calcESS(cluster2);
        Vector<Integer> merged = new Vector<Integer>();
        merged.addAll(cluster1);
        merged.addAll(cluster2);
        double ESS = calcESS(merged);
        fBestDist = ESS * merged.size() - ESS1 * cluster1.size() - ESS2 * cluster2.size();
    }
        break;
    }
    double alpha = 0.5;
    //return alpha*SemiDist + (1-alpha)*fBestDist;
    return fBestDist;
}
From source file:preprocess.StringToWordVector.java
License:Open Source License
/**
 * Converts the instance without normalization.
 *
 * @param instance the instance to convert
 * @param v
 * @return the number of attributes copied directly (firstCopy); the converted instance is appended to v
 */
private int convertInstancewoDocNorm(Instance instance, FastVector v) {
    // Convert the instance into a sorted set of indexes
    TreeMap contained = new TreeMap();

    // Copy all non-converted attributes from input to output
    int firstCopy = 0;
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (!m_SelectedRange.isInRange(i)) {
            if (getInputFormat().attribute(i).type() != Attribute.STRING) {
                // Add simple nominal and numeric attributes directly
                if (instance.value(i) != 0.0) {
                    contained.put(new Integer(firstCopy), new Double(instance.value(i)));
                }
            } else {
                if (instance.isMissing(i)) {
                    contained.put(new Integer(firstCopy), new Double(Instance.missingValue()));
                } else {
                    // If this is a string attribute, we have to first add
                    // this value to the range of possible values, then add
                    // its new internal index.
                    if (outputFormatPeek().attribute(firstCopy).numValues() == 0) {
                        // Note that the first string value in a
                        // SparseInstance doesn't get printed.
                        outputFormatPeek().attribute(firstCopy)
                                .addStringValue("Hack to defeat SparseInstance bug");
                    }
                    int newIndex = outputFormatPeek().attribute(firstCopy)
                            .addStringValue(instance.stringValue(i));
                    contained.put(new Integer(firstCopy), new Double(newIndex));
                }
            }
            firstCopy++;
        }
    }

    for (int j = 0; j < instance.numAttributes(); j++) {
        //if ((getInputFormat().attribute(j).type() == Attribute.STRING)
        if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {
            m_Tokenizer.tokenize(instance.stringValue(j));
            while (m_Tokenizer.hasMoreElements()) {
                String word = (String) m_Tokenizer.nextElement();
                if (this.m_lowerCaseTokens == true)
                    word = word.toLowerCase();
                word = m_Stemmer.stem(word);
                Integer index = (Integer) m_Dictionary.get(word);
                if (index != null) {
                    if (m_OutputCounts) {
                        // Separate if here rather than two lines down to avoid hashtable lookup
                        Double count = (Double) contained.get(index);
                        if (count != null) {
                            contained.put(index, new Double(count.doubleValue() + 1.0));
                        } else {
                            contained.put(index, new Double(1));
                        }
                    } else {
                        contained.put(index, new Double(1));
                    }
                }
            }
        }
    }

    // Doing TFTransform
    if (m_TFTransform == true) {
        Iterator it = contained.keySet().iterator();
        for (int i = 0; it.hasNext(); i++) {
            Integer index = (Integer) it.next();
            if (index.intValue() >= firstCopy) {
                double val = ((Double) contained.get(index)).doubleValue();
                val = Math.log(val + 1);
                contained.put(index, new Double(val));
            }
        }
    }

    // Doing IDFTransform
    if (m_IDFTransform == true) {
        Iterator it = contained.keySet().iterator();
        for (int i = 0; it.hasNext(); i++) {
            Integer index = (Integer) it.next();
            if (index.intValue() >= firstCopy) {
                double val = ((Double) contained.get(index)).doubleValue();
                val = val * Math.log(m_NumInstances / (double) m_DocsCounts[index.intValue()]);
                contained.put(index, new Double(val));
            }
        }
    }

    // Convert the set to structures needed to create a sparse instance.
    double[] values = new double[contained.size()];
    int[] indices = new int[contained.size()];
    Iterator it = contained.keySet().iterator();
    for (int i = 0; it.hasNext(); i++) {
        Integer index = (Integer) it.next();
        Double value = (Double) contained.get(index);
        values[i] = value.doubleValue();
        indices[i] = index.intValue();
    }
    Instance inst = new SparseInstance(instance.weight(), values, indices, outputFormatPeek().numAttributes());
    inst.setDataset(outputFormatPeek());
    v.addElement(inst);
    return firstCopy;
}
From source file:probcog.bayesnets.core.WEKADiscretizationFilter.java
License:Open Source License
public String getValueForContinuous(double continuous) {
    Instance inst = new Instance(1);
    inst.setValue(0, continuous);
    try {
        filter.input(inst);
        filter.batchFinished();
        Instance newInst = filter.output();
        int value = (int) newInst.value(0);
        return outputValues[value];
    } catch (Exception e) {
        e.printStackTrace();
        return null;
    }
}
From source file:probcog.bayesnets.learning.CPTLearner.java
License:Open Source License
/**
 * Learns from all the examples in the given instances; each instance represents one example.
 * All the random variables (nodes) in the network need to be found in each instance as
 * attributes that are named accordingly, i.e. for each random variable, there must be an
 * attribute with a matching name in the instance.
 * @param instances the instances
 * @throws Exception if the set of instances is empty
 * @throws SQLException particularly if there is no matching column for one of the node names
 */
public void learn(Instances instances) throws Exception {
    if (!initialized)
        init();
    // if it's an empty result set, throw exception
    if (instances.numInstances() == 0)
        throw new Exception("empty result set!");
    BeliefNode[] nodes = bn.bn.getNodes();
    int numAttributes = instances.numAttributes();
    // Now we can get many more nodes than attributes
    // if (numAttributes != nodes.length)
    //     throw new Exception("Result does not contain suitable data (attribute count = " + numAttributes + "; node count = " + nodes.length + ")");

    // map node indices to attribute indices
    int[] nodeIdx2colIdx = new int[nodes.length];
    Arrays.fill(nodeIdx2colIdx, -1);
    for (int i = 0; i < numAttributes; i++) {
        Set<String> nodeNames = bn.getNodeNamesForAttribute(instances.attribute(i).name());
        //logger.debug("Nodes for attribute " + instances.attribute(i).name() + ": " + nodeNames);
        if (nodeNames == null)
            continue;
        for (String nodeName : nodeNames) {
            int node_idx = bn.getNodeIndex(nodeName);
            if (node_idx == -1)
                throw new Exception("Unknown node referenced in result set: " + instances.attribute(i).name());
            nodeIdx2colIdx[node_idx] = i;
        }
    }

    // gather data, iterating over the instances
    int[] domainIndices = new int[nodes.length];
    @SuppressWarnings("unchecked")
    Enumeration<Instance> instanceEnum = instances.enumerateInstances();
    while (instanceEnum.hasMoreElements()) {
        Instance instance = instanceEnum.nextElement();
        // for each row...
        // - get the indices into the domains of each node that correspond to the current
        //   row of data (sorted in the same order as the nodes are ordered in the BeliefNetwork)
        for (int node_idx = 0; node_idx < nodes.length; node_idx++) {
            int domain_idx;
            if (clusterers[node_idx] == null) {
                Discrete domain = (Discrete) nodes[node_idx].getDomain();
                String strValue;
                if (domain instanceof Discretized) {
                    // If we have a discretized domain we discretize first...
                    int colIdx = nodeIdx2colIdx[node_idx];
                    if (colIdx < 0) {
                        //bn.dump();
                        /* for (int i = 0; i < numAttributes; i++) { logger.debug("Attribute " + i + ": " + instances.attribute(i).name()); }
                           StringBuffer sb = new StringBuffer();
                           for (int i = 0; i < nodeIdx2colIdx.length; i++) { sb.append(i + "\t"); }
                           sb.append("\n");
                           for (int i = 0; i < nodeIdx2colIdx.length; i++) { sb.append(nodeIdx2colIdx[i] + "\t"); }
                           logger.debug(sb); */
                        throw new Exception("No attribute specified for " + bn.bn.getNodes()[node_idx].getName());
                    }
                    double value = instance.value(colIdx);
                    strValue = ((Discretized) domain).getNameFromContinuous(value);
                    /* if (domain.findName(strValue) == -1) { logger.debug(domain); logger.debug(strValue); } */
                } else {
                    int colIdx = nodeIdx2colIdx[node_idx];
                    if (colIdx < 0) {
                        throw new Exception("No attribute specified for " + bn.bn.getNodes()[node_idx].getName());
                    }
                    strValue = instance.stringValue(nodeIdx2colIdx[node_idx]);
                }
                domain_idx = domain.findName(strValue);
                if (domain_idx == -1) {
                    /* String[] myDomain = bn.getDiscreteDomainAsArray(bn.bn.getNodes()[node_idx].getName());
                       for (int i = 0; i < myDomain.length; i++) { logger.debug(myDomain[i]); } */
                    throw new Exception(strValue + " not found in domain of " + nodes[node_idx].getName());
                }
            } else {
                Instance inst = new Instance(1);
                inst.setValue(0, instance.value(nodeIdx2colIdx[node_idx]));
                domain_idx = clusterers[node_idx].clusterInstance(inst);
            }
            domainIndices[node_idx] = domain_idx;
        }
        // - update each node's CPT
        for (int i = 0; i < nodes.length; i++) {
            counters[i].count(domainIndices);
        }
    }
}
From source file:probcog.bayesnets.learning.DomainLearner.java
License:Open Source License
/**
 * Learns from all the examples in the given instances; each instance represents one example.
 * All the random variables (nodes) that have been scheduled for learning in the constructor
 * need to be found in each instance as attributes that are named accordingly, i.e. for each
 * random variable for which the domain is to be learnt, there must be an attribute with a
 * matching name.
 *
 * @param instances the instances to learn from
 * @throws Exception if the set of instances is empty
 * @throws SQLException particularly if there is no matching attribute for one of the node names
 */
public void learn(Instances instances) throws Exception, SQLException {
    // if it's an empty result set, throw exception
    if (instances.numInstances() == 0)
        throw new Exception("empty result set!");

    // gather domain data
    int numDirectDomains = directDomains != null ? directDomains.length : 0;
    int numClusteredDomains = clusteredDomains != null ? clusteredDomains.length : 0;
    @SuppressWarnings("unchecked")
    Enumeration<Instance> instanceEnum = instances.enumerateInstances();
    while (instanceEnum.hasMoreElements()) {
        Instance instance = instanceEnum.nextElement();
        // for direct learning, add outcomes to the set of outcomes
        for (int i = 0; i < numDirectDomains; i++) {
            directDomainData.get(i).add(instance.stringValue(instances.attribute(directDomains[i].getName())));
        }
        // for clustering, gather all instances
        for (int i = 0; i < numClusteredDomains; i++) {
            Instance inst = new Instance(1);
            inst.setValue(attrValue, instance.value(instances.attribute(clusteredDomains[i].nodeName)));
            clusterData[i].add(inst);
        }
    }
}