List of usage examples for weka.core Instances enumerateAttributes
public Enumeration&lt;Attribute&gt; enumerateAttributes()
From source file:dewaweebtreeclassifier.veranda.VerandaTree.java
/**
 * Recursively builds an ID3-style decision tree from the given training data.
 * Chooses the attribute with the highest information gain at each node; if no
 * attribute yields any gain, the node becomes a leaf labelled with the
 * majority class.
 *
 * @param data training instances routed to this node
 */
public void buildTree(Instances data) {
    // Exit if there is no data left in the dataset.
    if (data.numInstances() == 0) {
        mChild = null;
        return;
    }
    // Information gain per attribute, indexed by attribute position.
    double[] informationGains = new double[data.numAttributes()];
    Enumeration enumAttrs = data.enumerateAttributes();
    while (enumAttrs.hasMoreElements()) {
        Attribute attr = (Attribute) enumAttrs.nextElement();
        informationGains[attr.index()] = computeGain(data, attr);
    }
    int maxIdx = Utils.maxIndex(informationGains);
    if (Utils.eq(informationGains[maxIdx], 0)) {
        // No attribute yields any gain: make this node a leaf holding the
        // class distribution, labelled with the majority class.
        mClassDistribution = new int[data.numClasses()];
        Enumeration enumInst = data.enumerateInstances();
        while (enumInst.hasMoreElements()) {
            Instance instance = (Instance) enumInst.nextElement();
            mClassDistribution[(int) instance.classValue()]++;
        }
        mClassValue = Utils.maxIndex(mClassDistribution);
    } else {
        // Split on the best attribute and grow one child per attribute value.
        mSplitAttribute = data.attribute(maxIdx);
        Instances[] splitInstances = splitInstancesOnAttribute(data, mSplitAttribute);
        mChild = new VerandaTree[mSplitAttribute.numValues()];
        for (int i = 0; i < mChild.length; i++) {
            mChild[i] = new VerandaTree();
            mChild[i].buildTree(splitInstances[i]);
        }
    }
}
From source file:dewaweebtreeclassifier.Veranda.java
/** * /* w w w .ja v a 2 s . c om*/ * @param data * @return */ private boolean isAllNominalAttributes(Instances data) { Enumeration enumAttr = data.enumerateAttributes(); while (enumAttr.hasMoreElements()) { Attribute attr = (Attribute) enumAttr.nextElement(); if (!attr.isNominal()) { return false; } } return true; }
From source file:ergasia2pkg.LP_ROS.java
/** * Creates a new MultiLabelInstances object given a list of Instance * * @param Hashmap<String,List<Instance> map from which to create instances * @param MultiLabelInstances used just to get the Label metadata * @return MultiLabelInstances new MultiLabelInstances Object */// ww w . j a v a 2 s. c o m private MultiLabelInstances createNewMultilabelInstance(HashMap<String, List<Instance>> labelsetGroup, MultiLabelInstances mlData) throws InvalidDataFormatException { Instances in = mlData.getDataSet(); Enumeration enumeration = in.enumerateAttributes(); ArrayList attlist = Collections.list(enumeration); int capacity = 0; for (String labelset : labelsetGroup.keySet()) { capacity += labelsetGroup.get(labelset).size(); } Instances newInstances = new Instances("sampledDataset", attlist, capacity); for (String labelset : labelsetGroup.keySet()) { List<Instance> instanceList = (ArrayList<Instance>) labelsetGroup.get(labelset); for (Instance inst : instanceList) { newInstances.add(inst); } } MultiLabelInstances newData = new MultiLabelInstances(newInstances, mlData.getLabelsMetaData()); return newData; }
From source file:gov.va.chir.tagline.dao.DatasetUtil.java
License:Open Source License
/**
 * Builds a Weka dataset from a header template and a collection of documents,
 * one instance per document line.
 *
 * @param header    template Instances defining the attribute layout
 * @param documents documents whose lines become instances
 * @return populated Instances with the class index set to the last attribute
 * @throws Exception if the AddValues filter or instance construction fails
 */
@SuppressWarnings("unchecked")
public static Instances createDataset(final Instances header, final Collection<Document> documents)
        throws Exception {
    // Update header to include all docIDs from the passed in documents
    // (Weka requires all values for nominal features)
    final Set<String> docIds = new TreeSet<String>();
    for (Document document : documents) {
        docIds.add(document.getName());
    }
    final AddValues avf = new AddValues();
    avf.setLabels(StringUtils.join(docIds, ","));
    // Have to add 1 because SingleIndex.setValue() has a bug, expecting
    // the passed in index to be 1-based rather than 0-based.
    // Calling path: AddValues.setInputFormat() -->
    //               SingleIndex.setUpper() -->
    //               SingleIndex.setValue()
    avf.setAttributeIndex(String.valueOf(header.attribute(DOC_ID).index() + 1));
    avf.setInputFormat(header);
    final Instances newHeader = Filter.useFilter(header, avf);
    final Instances instances = new Instances(newHeader, documents.size());
    // Map attribute names to the attributes of the filtered header so that
    // feature values can be set by name below.
    final Map<String, Attribute> attrMap = new HashMap<String, Attribute>();
    final Enumeration<Attribute> en = newHeader.enumerateAttributes();
    while (en.hasMoreElements()) {
        final Attribute attr = en.nextElement();
        attrMap.put(attr.name(), attr);
    }
    // enumerateAttributes() skips the class attribute, so add it explicitly.
    attrMap.put(newHeader.classAttribute().name(), newHeader.classAttribute());
    final Attribute docId = attrMap.get(DOC_ID);
    final Attribute lineId = attrMap.get(LINE_ID);
    final Attribute classAttr = attrMap.get(LABEL);
    // Add one instance per line; line features override/extend doc features.
    for (Document document : documents) {
        final Map<String, Object> docFeatures = document.getFeatures();
        for (Line line : document.getLines()) {
            final Instance instance = new DenseInstance(attrMap.size());
            final Map<String, Object> lineFeatures = line.getFeatures();
            lineFeatures.putAll(docFeatures);
            instance.setValue(docId, document.getName());
            instance.setValue(lineId, line.getLineId());
            if (line.getLabel() == null) {
                // Unlabeled line: leave the class missing (e.g. for scoring).
                instance.setMissing(classAttr);
            } else {
                instance.setValue(classAttr, line.getLabel());
            }
            // Fill every remaining feature from the merged feature map,
            // converting numerics to double and everything else to string.
            for (Attribute attribute : attrMap.values()) {
                if (!attribute.equals(docId) && !attribute.equals(lineId) && !attribute.equals(classAttr)) {
                    final String name = attribute.name();
                    final Object obj = lineFeatures.get(name);
                    if (obj instanceof Double) {
                        instance.setValue(attribute, ((Double) obj).doubleValue());
                    } else if (obj instanceof Integer) {
                        instance.setValue(attribute, ((Integer) obj).doubleValue());
                    } else {
                        // NOTE(review): obj == null here would NPE on
                        // toString() — assumes every attribute has a feature
                        // value; confirm against feature extraction.
                        instance.setValue(attribute, obj.toString());
                    }
                }
            }
            instances.add(instance);
        }
    }
    // Set last attribute as class
    instances.setClassIndex(attrMap.size() - 1);
    return instances;
}
From source file:id3.MyID3.java
/** * Membuat pohon keputusan// w w w . jav a 2 s .co m * @param instances data train * @throws Exception */ @Override public void buildClassifier(Instances instances) throws Exception { // Check if classifier can handle the data getCapabilities().testWithFail(instances); // Remove missing value instance from instances instances = new Instances(instances); instances.deleteWithMissingClass(); // Gather list of attribute in instances ArrayList<Attribute> remainingAttributes = new ArrayList<>(); Enumeration enumAttributes = instances.enumerateAttributes(); while (enumAttributes.hasMoreElements()) { remainingAttributes.add((Attribute) enumAttributes.nextElement()); } // Start build classifier ID3 buildMyID3(instances, remainingAttributes); }
From source file:id3.MyID3.java
/**
 * Demo driver: loads the nominal weather ARFF, trains MyID3, and prints the
 * class distribution, per-attribute information gain, the tree, a
 * full-training evaluation summary, and the confusion matrix.
 *
 * @param args arguments (unused)
 */
public static void main(String[] args) {
    Instances instances;
    try {
        // NOTE(review): hard-coded Windows path — only runs where this
        // exact Weka install exists.
        BufferedReader reader = new BufferedReader(new FileReader("D:\\Weka-3-6\\data\\weather.nominal.arff"));
        try {
            instances = new Instances(reader);
            // Last attribute is the class, per standard ARFF convention.
            instances.setClassIndex(instances.numAttributes() - 1);
            MyID3 id3 = new MyID3();
            try {
                id3.buildClassifier(instances);
            } catch (Exception e) {
                e.printStackTrace();
            }
            // Test class distribution
            double[] classDistribution = id3.classDistribution(instances);
            for (int i = 0; i < classDistribution.length; i++) {
                System.out.println(classDistribution[i]);
            }
            // Test entrophy and information gain for each attribute
            System.out.println(id3.computeEntropy(instances));
            Enumeration attributes = instances.enumerateAttributes();
            while (attributes.hasMoreElements()) {
                System.out.println(id3.computeIG(instances, (Attribute) attributes.nextElement()));
            }
            // Test build classifier (second build; exercises retraining)
            try {
                id3.buildClassifier(instances);
            } catch (Exception e) {
                e.printStackTrace();
            }
            System.out.println(id3.toString());
            // Evaluate model from build classifier (full training)
            Evaluation eval = null;
            try {
                eval = new Evaluation(instances);
            } catch (Exception e) {
                e.printStackTrace();
            }
            try {
                System.out.println(instances);
                // Evaluating on the training set — optimistic estimate only.
                eval.evaluateModel(id3, instances);
            } catch (Exception e) {
                e.printStackTrace();
            }
            System.out.println(eval.toSummaryString("\nResults Full-Training\n\n", false));
            // Test Confusion Matrix
            System.out.println("Confusion Matrix : ");
            double[][] cmMatrix = eval.confusionMatrix();
            for (int row_i = 0; row_i < cmMatrix.length; row_i++) {
                for (int col_i = 0; col_i < cmMatrix.length; col_i++) {
                    System.out.print(cmMatrix[row_i][col_i]);
                    System.out.print("|");
                }
                System.out.println();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    }
}
From source file:iris.ID3.java
public void makeLikeAWhat(Instances instances) { // Create storage for different info gains double[] infoGains = new double[instances.numAttributes()]; // Enumerate through attributes to find the best gain Enumeration attributeEnum = instances.enumerateAttributes(); while (attributeEnum.hasMoreElements()) { // Loop through attributes, adding gain to infoGains array Attribute att = (Attribute) attributeEnum.nextElement(); infoGains[att.index()] = infoGain(instances, att); }//from ww w . ja va 2 s. c o m // Use maxIndex to find the highest info gain in the array highestInfoGain = instances.attribute(Utils.maxIndex(infoGains)); // Make a leaf if there is no more info to gain // Otherwise, create children // Check if there is no more info to gain if (Utils.eq(infoGains[highestInfoGain.index()], 0)) { highestInfoGain = null; // Instantiate maxDistribution maxDistribution = new double[instances.numClasses()]; // Set up enumerator for instances Enumeration instanceEnum = instances.enumerateInstances(); // Tally classes while (instanceEnum.hasMoreElements()) { Instance instance = (Instance) instanceEnum.nextElement(); maxDistribution[(int) instance.classValue()]++; } // Normalize data for easier manipulation Utils.normalize(maxDistribution); // Get the max index of the distrubtion classValue = Utils.maxIndex(maxDistribution); // Save class attribute classAttribute = instances.classAttribute(); } // Create children else { // Split best attribute into bins Instances[] bins = makeBins(instances, highestInfoGain); // Create nodes children = new ID3[highestInfoGain.numValues()]; for (int i = 0; i < highestInfoGain.numValues(); i++) { children[i] = new ID3(); children[i].makeLikeAWhat(bins[i]); } } }
From source file:j48.BinC45ModelSelection.java
License:Open Source License
/**
 * Selects a binary C4.5-type split for the given dataset: builds a candidate
 * BinC45Split per attribute, then picks the one with the best gain ratio
 * among splits whose info gain is at least the (slightly relaxed) average.
 *
 * @param data instances to find a split for
 * @return the best split model, a NoSplit if none is useful, or null on error
 */
public final ClassifierSplitModel selectModel(Instances data) {
    double minResult;
    double currentResult;
    BinC45Split[] currentModel;
    BinC45Split bestModel = null;
    NoSplit noSplitModel = null;
    double averageInfoGain = 0;
    int validModels = 0;
    boolean multiVal = true;
    Distribution checkDistribution;
    double sumOfWeights;
    int i;
    try {
        // Check if all Instances belong to one class or if not
        // enough Instances to split.
        checkDistribution = new Distribution(data);
        noSplitModel = new NoSplit(checkDistribution);
        if (Utils.sm(checkDistribution.total(), 2 * m_minNoObj) || Utils.eq(checkDistribution.total(),
                checkDistribution.perClass(checkDistribution.maxClass())))
            return noSplitModel;
        // Check if all attributes are nominal and have a
        // lot of values (multiVal stays true only in that case).
        Enumeration enu = data.enumerateAttributes();
        while (enu.hasMoreElements()) {
            Attribute attribute = (Attribute) enu.nextElement();
            if ((attribute.isNumeric()) || (Utils.sm((double) attribute.numValues(),
                    (0.3 * (double) m_allData.numInstances())))) {
                multiVal = false;
                break;
            }
        }
        currentModel = new BinC45Split[data.numAttributes()];
        sumOfWeights = data.sumOfWeights();
        // For each attribute.
        for (i = 0; i < data.numAttributes(); i++) {
            // Apart from class attribute.
            if (i != (data).classIndex()) {
                // Get models for current attribute.
                currentModel[i] = new BinC45Split(i, m_minNoObj, sumOfWeights);
                currentModel[i].buildClassifier(data);
                // Check if useful split for current attribute
                // exists and check for enumerated attributes with
                // a lot of values.
                if (currentModel[i].checkModel())
                    if ((data.attribute(i).isNumeric()) || (multiVal || Utils.sm(
                            (double) data.attribute(i).numValues(),
                            (0.3 * (double) m_allData.numInstances())))) {
                        averageInfoGain = averageInfoGain + currentModel[i].infoGain();
                        validModels++;
                    }
            } else
                currentModel[i] = null;
        }
        // Check if any useful split was found.
        if (validModels == 0)
            return noSplitModel;
        averageInfoGain = averageInfoGain / (double) validModels;
        // Find "best" attribute to split on: best gain ratio among splits
        // whose info gain reaches the average (minus a small tolerance).
        minResult = 0;
        for (i = 0; i < data.numAttributes(); i++) {
            if ((i != (data).classIndex()) && (currentModel[i].checkModel()))
                // Use 1E-3 here to get a closer approximation to the
                // original
                // implementation.
                if ((currentModel[i].infoGain() >= (averageInfoGain - 1E-3))
                        && Utils.gr(currentModel[i].gainRatio(), minResult)) {
                    bestModel = currentModel[i];
                    minResult = currentModel[i].gainRatio();
                }
        }
        // Check if useful split was found.
        if (Utils.eq(minResult, 0))
            return noSplitModel;
        // Add all Instances with unknown values for the corresponding
        // attribute to the distribution for the model, so that
        // the complete distribution is stored with the model.
        bestModel.distribution().addInstWithUnknown(data, bestModel.attIndex());
        // Set the split point analogue to C45 if attribute numeric.
        bestModel.setSplitPoint(m_allData);
        return bestModel;
    } catch (Exception e) {
        e.printStackTrace();
    }
    return null;
}
From source file:j48.C45ModelSelection.java
License:Open Source License
/**
 * Selects a C4.5-type split for the given dataset: builds a candidate
 * C45Split per attribute, then picks the one with the best gain ratio among
 * splits whose info gain is at least the (slightly relaxed) average.
 * Multi-value/min-instance heuristics are skipped when m_allData is null.
 *
 * @param data instances to find a split for
 * @return the best split model, a NoSplit if none is useful, or null on error
 */
public final ClassifierSplitModel selectModel(Instances data) {
    double minResult;
    double currentResult;
    C45Split[] currentModel;
    C45Split bestModel = null;
    NoSplit noSplitModel = null;
    double averageInfoGain = 0;
    int validModels = 0;
    boolean multiVal = true;
    Distribution checkDistribution;
    Attribute attribute;
    double sumOfWeights;
    int i;
    try {
        // Check if all Instances belong to one class or if not
        // enough Instances to split.
        checkDistribution = new Distribution(data);
        noSplitModel = new NoSplit(checkDistribution);
        if (Utils.sm(checkDistribution.total(), 2 * m_minNoObj) || Utils.eq(checkDistribution.total(),
                checkDistribution.perClass(checkDistribution.maxClass())))
            return noSplitModel;
        // Check if all attributes are nominal and have a
        // lot of values (multiVal stays true only in that case).
        if (m_allData != null) {
            Enumeration enu = data.enumerateAttributes();
            while (enu.hasMoreElements()) {
                attribute = (Attribute) enu.nextElement();
                if ((attribute.isNumeric()) || (Utils.sm((double) attribute.numValues(),
                        (0.3 * (double) m_allData.numInstances())))) {
                    multiVal = false;
                    break;
                }
            }
        }
        currentModel = new j48.C45Split[data.numAttributes()];
        sumOfWeights = data.sumOfWeights();
        // For each attribute.
        for (i = 0; i < data.numAttributes(); i++) {
            // Apart from class attribute.
            if (i != (data).classIndex()) {
                // Get models for current attribute.
                currentModel[i] = new j48.C45Split(i, m_minNoObj, sumOfWeights);
                currentModel[i].buildClassifier(data);
                // Check if useful split for current attribute
                // exists and check for enumerated attributes with
                // a lot of values.
                if (currentModel[i].checkModel())
                    if (m_allData != null) {
                        if ((data.attribute(i).isNumeric()) || (multiVal || Utils.sm(
                                (double) data.attribute(i).numValues(),
                                (0.3 * (double) m_allData.numInstances())))) {
                            averageInfoGain = averageInfoGain + currentModel[i].infoGain();
                            validModels++;
                        }
                    } else {
                        averageInfoGain = averageInfoGain + currentModel[i].infoGain();
                        validModels++;
                    }
            } else
                currentModel[i] = null;
        }
        // Check if any useful split was found.
        if (validModels == 0)
            return noSplitModel;
        averageInfoGain = averageInfoGain / (double) validModels;
        // Find "best" attribute to split on: best gain ratio among splits
        // whose info gain reaches the average (minus a small tolerance).
        minResult = 0;
        for (i = 0; i < data.numAttributes(); i++) {
            if ((i != (data).classIndex()) && (currentModel[i].checkModel()))
                // Use 1E-3 here to get a closer approximation to the
                // original
                // implementation.
                if ((currentModel[i].infoGain() >= (averageInfoGain - 1E-3))
                        && Utils.gr(currentModel[i].gainRatio(), minResult)) {
                    bestModel = currentModel[i];
                    minResult = currentModel[i].gainRatio();
                }
        }
        // Check if useful split was found.
        if (Utils.eq(minResult, 0))
            return noSplitModel;
        // Add all Instances with unknown values for the corresponding
        // attribute to the distribution for the model, so that
        // the complete distribution is stored with the model.
        bestModel.distribution().addInstWithUnknown(data, bestModel.attIndex());
        // Set the split point analogue to C45 if attribute numeric.
        if (m_allData != null)
            bestModel.setSplitPoint(m_allData);
        return bestModel;
    } catch (Exception e) {
        e.printStackTrace();
    }
    return null;
}
From source file:j48.NBTreeModelSelection.java
License:Open Source License
/**
 * Selects an NBTree-type split for the given dataset: builds a global
 * naive-Bayes model at this node, then a candidate NBTreeSplit per attribute,
 * and keeps the split only if it reduces the global error by at least 5%.
 *
 * @param data instances to find a split for
 * @return the best split model, an NBTreeNoSplit if none is useful enough,
 *         or null on error
 */
public final ClassifierSplitModel selectModel(Instances data) {
    double globalErrors = 0;
    double minResult;
    double currentResult;
    NBTreeSplit[] currentModel;
    NBTreeSplit bestModel = null;
    NBTreeNoSplit noSplitModel = null;
    int validModels = 0;
    boolean multiVal = true;
    Distribution checkDistribution;
    Attribute attribute;
    double sumOfWeights;
    int i;
    try {
        // build the global model at this node
        noSplitModel = new NBTreeNoSplit();
        noSplitModel.buildClassifier(data);
        // Too few instances to consider splitting at all.
        if (data.numInstances() < 5) {
            return noSplitModel;
        }
        // evaluate it
        globalErrors = noSplitModel.getErrors();
        if (globalErrors == 0) {
            return noSplitModel;
        }
        // Check if all Instances belong to one class or if not
        // enough Instances to split.
        checkDistribution = new Distribution(data);
        if (Utils.sm(checkDistribution.total(), m_minNoObj) || Utils.eq(checkDistribution.total(),
                checkDistribution.perClass(checkDistribution.maxClass()))) {
            return noSplitModel;
        }
        // Check if all attributes are nominal and have a
        // lot of values (multiVal stays true only in that case).
        if (m_allData != null) {
            Enumeration enu = data.enumerateAttributes();
            while (enu.hasMoreElements()) {
                attribute = (Attribute) enu.nextElement();
                if ((attribute.isNumeric()) || (Utils.sm((double) attribute.numValues(),
                        (0.3 * (double) m_allData.numInstances())))) {
                    multiVal = false;
                    break;
                }
            }
        }
        currentModel = new NBTreeSplit[data.numAttributes()];
        sumOfWeights = data.sumOfWeights();
        // For each attribute.
        for (i = 0; i < data.numAttributes(); i++) {
            // Apart from class attribute.
            if (i != (data).classIndex()) {
                // Get models for current attribute.
                currentModel[i] = new NBTreeSplit(i, m_minNoObj, sumOfWeights);
                currentModel[i].setGlobalModel(noSplitModel);
                currentModel[i].buildClassifier(data);
                // Check if useful split for current attribute exists.
                if (currentModel[i].checkModel()) {
                    validModels++;
                }
            } else {
                currentModel[i] = null;
            }
        }
        // Check if any useful split was found.
        if (validModels == 0) {
            return noSplitModel;
        }
        // Find "best" attribute to split on: lowest cross-validated error.
        minResult = globalErrors;
        for (i = 0; i < data.numAttributes(); i++) {
            if ((i != (data).classIndex()) && (currentModel[i].checkModel())) {
                if (currentModel[i].getErrors() < minResult) {
                    bestModel = currentModel[i];
                    minResult = currentModel[i].getErrors();
                }
            }
        }
        // Check if useful split was found: require at least a 5% relative
        // reduction of the global error, otherwise keep the leaf model.
        if (((globalErrors - minResult) / globalErrors) < 0.05) {
            return noSplitModel;
        }
        return bestModel;
    } catch (Exception e) {
        e.printStackTrace();
    }
    return null;
}