List of usage examples for weka.core Instances sort
public void sort(Attribute att)
From source file:lu.lippmann.cdb.ext.hydviga.gaps.GapFiller.java
License:Open Source License
/**
 * Fills the gaps (missing values) of a dataset, using a discretized representation of time
 * as helper attributes while the gaps are being filled.
 *
 * <p>The method works on a copy of {@code ds}. It adds discretized-time attributes and a
 * numeric "_fake" copy of the date, removes every date attribute, converts nominal
 * attributes to binary ones, renames attributes whose names contain {@code =}, {@code <},
 * {@code >} or {@code .}, delegates the actual gap filling to {@code fillGaps0}, then
 * rebuilds a date attribute named {@code <original date name>_new}, sorts by it and drops
 * the helper attributes.
 *
 * @param ds the dataset to process; it must contain at least one date attribute
 * @return a new dataset with the gaps filled, sorted by the reconstructed date
 * @throws Exception if {@code ds} has no date attribute, or if one of the helpers fails
 */
private Instances fillAllGapsWithDiscretizedTime(final Instances ds) throws Exception {
    int firstDateIdx = WekaDataStatsUtil.getFirstDateAttributeIdx(ds);
    // Validate BEFORE the index is used: looking up attribute(-1) first would fail with an
    // unrelated error and this explicit message would never be reached.
    if (firstDateIdx == -1) {
        throw new Exception("No date attribute in this dataset!");
    }
    final String datename = ds.attribute(firstDateIdx).name();

    Instances newds = new Instances(ds);

    /* add discretized time */
    newds = WekaTimeSeriesUtil.buildDataSetWithDiscretizedTime(newds);

    /* add fake numerical time */
    newds.insertAttributeAt(new Attribute(datename + "_fake"), newds.numAttributes());
    for (int i = 0; i < newds.numInstances(); i++) {
        newds.instance(i).setValue(newds.numAttributes() - 1, newds.instance(i).value(firstDateIdx));
    }

    /* remove 'true' date */
    while (firstDateIdx != -1) {
        newds.deleteAttributeAt(firstDateIdx);
        firstDateIdx = WekaDataStatsUtil.getFirstDateAttributeIdx(newds);
    }

    /* transform nominal as binaries */
    // NOTE(review): the index list is computed once, before any conversion; confirm that
    // buildDataSetWithNominalAsBinary keeps the remaining attribute positions stable.
    for (int iidx : WekaDataStatsUtil.getNominalAttributesIndexes(newds)) {
        newds = WekaDataProcessingUtil.buildDataSetWithNominalAsBinary(newds, iidx);
    }

    /* rename attributes for which the name can occur issues in tree evaluation */
    final String[] forbidden = { "=", "<", ">", "." };
    for (int k = 0; k < newds.numAttributes(); k++) {
        String atn = newds.attribute(k).name();
        for (final String symbol : forbidden) {
            if (atn.contains(symbol)) {
                // Literal replacement; each offending symbol gets its own random number.
                atn = atn.replace(symbol, (int) (Math.random() * 1000) + "");
            }
        }
        newds = WekaDataProcessingUtil.renameAttribute(newds, k, atn);
    }

    /* replace missing values */
    newds = fillGaps0(newds);

    /* reconstruct date according to discretized time */
    final String df = ds.attribute(WekaDataStatsUtil.getFirstDateAttributeIdx(ds)).getDateFormat();
    newds.insertAttributeAt(new Attribute(datename + "_new", df), newds.numAttributes());
    final int newfirstDateIdx = WekaDataStatsUtil.getFirstDateAttributeIdx(newds);
    for (int i = 0; i < newds.numInstances(); i++) {
        final Instance inst = newds.instance(i);
        // The second-to-last attribute is copied into the freshly added date attribute.
        inst.setValue(newfirstDateIdx, newds.instance(i).value(newds.numAttributes() - 2));
    }

    /* sort by date ! */
    newds.sort(newfirstDateIdx);

    /* remove discretized time */
    // Names are collected first because deleting while scanning would shift the indexes.
    final Set<String> toRemove = new HashSet<String>();
    for (int i = 0; i < newds.numAttributes(); i++) {
        if (newds.attribute(i).name().startsWith("t_")) {
            toRemove.add(newds.attribute(i).name());
        }
    }
    for (final String tr : toRemove) {
        newds.deleteAttributeAt(newds.attribute(tr).index());
    }

    /* delete the fake attribute time */
    newds.deleteAttributeAt(newds.numAttributes() - 2);

    return newds;
}
From source file:machine_learing_clasifier.MyC45.java
public double BestContinousAttribute(Instances i, Attribute att) { i.sort(att); Enumeration enumForMissingAttr = i.enumerateInstances(); double temp = i.get(0).classValue(); double igtemp = 0; double bestthreshold = 0; double a;// ww w.j a v a2 s . co m double b = i.get(0).value(att); while (enumForMissingAttr.hasMoreElements()) { Instance inst = (Instance) enumForMissingAttr.nextElement(); if (temp != inst.classValue()) { temp = inst.classValue(); a = b; b = inst.value(att); double threshold = a + ((b - a) / 2); double igtemp2 = computeInformationGainContinous(i, att, threshold); if (igtemp < igtemp2) { bestthreshold = threshold; igtemp = igtemp2; } } } return bestthreshold; }
From source file:myclassifier.myC45Pack.SplitModel.java
public void buildClassifier(Instances dataSet) throws Exception { // Initialize the remaining instance variables. numSubsets = 0;/*from ww w . j a v a 2s . co m*/ splitPointValue = Double.MAX_VALUE; infoGain = 0; gainRatio = 0; // Different treatment for enumerated and numeric attributes. if (dataSet.attribute(attribIndex).isNominal()) { numOfBranches = dataSet.attribute(attribIndex).numValues(); numOfSplitPoints = dataSet.attribute(attribIndex).numValues(); handleNominalAttribute(dataSet); } else { //attribute numeric numOfBranches = 2; numOfSplitPoints = 0; dataSet.sort(dataSet.attribute(attribIndex)); handleNumericAttribute(dataSet); } }
From source file:myid3andc45classifier.Model.MyC45.java
@Override public void buildClassifier(Instances data) throws Exception { getCapabilities().testWithFail(data); data = new Instances(data); data.deleteWithMissingClass();// ww w .ja v a2 s .c o m Enumeration enumAtt = data.enumerateAttributes(); while (enumAtt.hasMoreElements()) { Attribute attr = (Attribute) enumAtt.nextElement(); if (attr.isNumeric()) { ArrayList<Double> mid = new ArrayList<Double>(); Instances savedData = null; double temp, max = Double.NEGATIVE_INFINITY; // TODO: split nominal data.sort(attr); for (int i = 0; i < data.numInstances() - 1; i++) { if (data.instance(i).classValue() != data.instance(i + 1).classValue()) { if (data.attribute(attr.name() + " " + (data.instance(i + 1).value(attr) + data.instance(i).value(attr)) / 2) == null) { data = convertInstances(data, attr, (data.instance(i + 1).value(attr) + data.instance(i).value(attr)) / 2); //temp = computeInfoGainRatio(newData, newData.attribute(newData.numAttributes()-1)); //System.out.println("attribute "+newData.attribute(newData.numAttributes()-1).name()); //if (temp > max) { // max = temp; // savedData = newData; //} } } } //Penanganan Missing Value AttributeStats attributeStats = data.attributeStats(attr.index()); double mean = attributeStats.numericStats.mean; if (Double.isNaN(mean)) mean = 0; // Replace missing value with mean Enumeration instEnumerate = data.enumerateInstances(); while (instEnumerate.hasMoreElements()) { Instance instance = (Instance) instEnumerate.nextElement(); if (instance.isMissing(attr.index())) { instance.setValue(attr.index(), mean); } } //data = new Instances(savedData); } else { //Penanganan Missing Value AttributeStats attributeStats = data.attributeStats(attr.index()); int maxIndex = 0; for (int i = 1; i < attr.numValues(); i++) { if (attributeStats.nominalCounts[maxIndex] < attributeStats.nominalCounts[i]) { maxIndex = i; } } // Replace missing value with max index Enumeration instEnumerate = data.enumerateInstances(); while 
(instEnumerate.hasMoreElements()) { Instance instance = (Instance) instEnumerate.nextElement(); if (instance.isMissing(attr.index())) { instance.setValue(attr.index(), maxIndex); } } } } makeMyC45Tree(data); }
From source file:newdtl.NewJ48.java
/**
 * Computes the gain ratio for an attribute.
 *
 * <p>For a numeric attribute the data is sorted in place on {@code att}, and the attribute
 * value of every instance except the last one is tried as a threshold (a dataset with a
 * single instance tries that instance's value). The best-scoring threshold is reported. For
 * any other attribute the gain ratio of the whole attribute is returned.
 *
 * @param data the data for which the gain ratio is to be computed; re-ordered for numeric
 *             attributes
 * @param att  the attribute
 * @return a two-element array: index 0 is the gain ratio, index 1 the chosen threshold
 *         (always 0 for non-numeric attributes)
 * @throws IllegalArgumentException if {@code att} is numeric and {@code data} is empty
 */
private double[] computeGainRatio(Instances data, Attribute att) {
    if (!att.isNumeric()) {
        double infoGain = computeInfoGain(data, att);
        double splitInfo = computeSplitInformation(data, att);
        return new double[] { splitInfo > 0 ? infoGain / splitInfo : splitInfo, 0 };
    }

    data.sort(att);

    final int numInstances = data.numInstances();
    if (numInstances == 0) {
        // Previously this surfaced as a NegativeArraySizeException from the array allocation.
        throw new IllegalArgumentException("Cannot compute a gain ratio on an empty dataset");
    }

    // One candidate per instance except the last; a lone instance is still one candidate.
    final int candidateCount = Math.max(1, numInstances - 1);
    double[] threshold = new double[candidateCount];
    double[] gainRatios = new double[candidateCount];
    for (int i = 0; i < candidateCount; i++) {
        threshold[i] = data.instance(i).value(att);
        double infoGain = computeInfoGain(data, att, threshold[i]);
        double splitInfo = computeSplitInformation(data, att, threshold[i]);
        gainRatios[i] = infoGain > 0 ? infoGain / splitInfo : infoGain;
    }

    // Look the winner up once instead of scanning the array twice.
    final int best = maxIndex(gainRatios);
    return new double[] { gainRatios[best], threshold[best] };
}
From source file:org.scripps.branch.classifier.ManualTree.java
License:Open Source License
/** * Trying to get generate distribution of classes * //from w ww .j a va 2 s. c o m * @param Instances * @Param Attribute index to get distribution of * @Param HashMap to put data into * * @return HashMap of class distribution data */ protected HashMap addDistributionData(Instances instances, int attIndex, HashMap distMap) throws Exception { Map<String, Comparable> temp = new HashMap<String, Comparable>(); ArrayList<Object> distData = new ArrayList(); // GenerateCSV csv = new GenerateCSV(); // String data = ""; boolean isNominal = false; instances.sort(attIndex); for (int i = 0; i < instances.numInstances(); i++) { Instance inst = instances.instance(i); if (!Double.isNaN(inst.value(attIndex))) { temp = new HashMap<String, Comparable>(); if (inst.attribute(attIndex).isNominal()) { temp.put("value", inst.attribute(attIndex).value((int) inst.value(attIndex))); isNominal = true; // data+=inst.attribute(m_Attribute).value((int)inst.value(m_Attribute))+","; } else { temp.put("value", inst.value(attIndex)); // data+=inst.value(att)+","; } temp.put("classprob", inst.classAttribute().value((int) inst.classValue())); // data+=inst.classAttribute().value((int) // inst.classValue())+"\n"; distData.add(temp); } } if (!distData.isEmpty()) { distMap.put("dataArray", distData); distMap.put("isNominal", isNominal); setDistributionData(distMap); } return distMap; // To check if data is being generated right. // csv.generateCsvFile("/home/karthik/Documents/distribution.csv", // data); }
From source file:org.scripps.branch.classifier.ManualTree.java
License:Open Source License
/** * Computes class distribution for an attribute. * /*from ww w. j a va 2 s . co m*/ * @param props * @param dists * @param att * the attribute index * @param data * the data to work with * @throws Exception * if something goes wrong */ protected HashMap<String, Double> distribution(double[][] props, double[][][] dists, int att, Instances data, double givenSplitPoint, HashMap<String, Classifier> custom_classifiers) throws Exception { HashMap<String, Double> mp = new HashMap<String, Double>(); double splitPoint = givenSplitPoint; double origSplitPoint = 0; Attribute attribute = null; double[][] dist = null; int indexOfFirstMissingValue = -1; String CustomClassifierId = null; CustomSet cSet = null; if (att >= data.numAttributes() && att < data.numAttributes() + custom_classifiers.size()) { CustomClassifierId = getKeyinMap(custom_classifiers, att, data); } else if (att >= data.numAttributes() + custom_classifiers.size()) { cSet = getReqCustomSet(att - (data.numAttributes() - 1 + custom_classifiers.size()), cSetList); } else { attribute = data.attribute(att); } if (CustomClassifierId == null && cSet == null) { if (attribute.isNominal()) { // For nominal attributes dist = new double[attribute.numValues()][data.numClasses()]; for (int i = 0; i < data.numInstances(); i++) { Instance inst = data.instance(i); if (inst.isMissing(att)) { // Skip missing values at this stage if (indexOfFirstMissingValue < 0) { indexOfFirstMissingValue = i; } continue; } dist[(int) inst.value(att)][(int) inst.classValue()] += inst.weight(); } } else { // For numeric attributes double[][] currDist = new double[2][data.numClasses()]; dist = new double[2][data.numClasses()]; // Sort data data.sort(att); // Move all instances into second subset for (int j = 0; j < data.numInstances(); j++) { Instance inst = data.instance(j); if (inst.isMissing(att)) { // Can stop as soon as we hit a missing value indexOfFirstMissingValue = j; break; } currDist[1][(int) inst.classValue()] += inst.weight(); } // 
Value before splitting double priorVal = priorVal(currDist); // Save initial distribution for (int j = 0; j < currDist.length; j++) { System.arraycopy(currDist[j], 0, dist[j], 0, dist[j].length); } if (Double.isNaN(splitPoint)) { // Try all possible split points double currSplit = data.instance(0).value(att); double currVal, bestVal = -Double.MAX_VALUE; for (int i = 0; i < data.numInstances(); i++) { Instance inst = data.instance(i); if (inst.isMissing(att)) { // Can stop as soon as we hit a missing value break; } // Can we place a sensible split point here? if (inst.value(att) > currSplit) { // Compute gain for split point currVal = gain(currDist, priorVal); // Is the current split point the best point so far? if (currVal > bestVal) { // Store value of current point bestVal = currVal; // Save split point splitPoint = (inst.value(att) + currSplit) / 2.0; origSplitPoint = splitPoint; // Save distribution for (int j = 0; j < currDist.length; j++) { System.arraycopy(currDist[j], 0, dist[j], 0, dist[j].length); } } } currSplit = inst.value(att); // Shift over the weight currDist[0][(int) inst.classValue()] += inst.weight(); currDist[1][(int) inst.classValue()] -= inst.weight(); } } else { double currSplit = data.instance(0).value(att); double currVal, bestVal = -Double.MAX_VALUE; // Split data set using given split point. for (int i = 0; i < data.numInstances(); i++) { Instance inst = data.instance(i); if (inst.isMissing(att)) { // Can stop as soon as we hit a missing value break; } if (inst.value(att) > currSplit) { // Compute gain for split point currVal = gain(currDist, priorVal); // Is the current split point the best point so far? 
if (currVal > bestVal) { // Store value of current point bestVal = currVal; // Save computed split point origSplitPoint = (inst.value(att) + currSplit) / 2.0; } } currSplit = inst.value(att); // Shift over the weight currDist[0][(int) inst.classValue()] += inst.weight(); currDist[1][(int) inst.classValue()] -= inst.weight(); if (inst.value(att) <= splitPoint) { // Save distribution since split point is specified for (int j = 0; j < currDist.length; j++) { System.arraycopy(currDist[j], 0, dist[j], 0, dist[j].length); } } } } } } else if (CustomClassifierId != null) { Classifier fc = custom_classifiers.get(CustomClassifierId); dist = new double[data.numClasses()][data.numClasses()]; Instance inst; for (int i = 0; i < data.numInstances(); i++) { inst = data.instance(i); double predictedClass = fc.classifyInstance(inst); if (predictedClass != Instance.missingValue()) { dist[(int) predictedClass][(int) inst.classValue()] += inst.weight(); } } } else if (cSet != null) { dist = new double[data.numClasses()][data.numClasses()]; JsonNode vertices = mapper.readTree(cSet.getConstraints()); ArrayList<double[]> attrVertices = generateVerticesList(vertices); List<Attribute> aList = generateAttributeList(cSet, data, d); double[] testPoint = new double[2]; int ctr = 0; for (int k = 0; k < data.numInstances(); k++) { testPoint = new double[2]; ctr = 0; for (Attribute a : aList) { if (!data.instance(k).isMissing(a)) { testPoint[ctr] = data.instance(k).value(a); ctr++; } } int check = checkPointInPolygon(attrVertices, testPoint); dist[check][(int) data.instance(k).classValue()] += data.instance(k).weight(); } } // Compute weights for subsetsCustomClassifierIndex props[att] = new double[dist.length]; for (int k = 0; k < props[att].length; k++) { props[att][k] = Utils.sum(dist[k]); } if (Utils.eq(Utils.sum(props[att]), 0)) { for (int k = 0; k < props[att].length; k++) { props[att][k] = 1.0 / props[att].length; } } else { Utils.normalize(props[att]); } // Any instances with missing 
values ? if (indexOfFirstMissingValue > -1) { // Distribute weights for instances with missing values for (int i = indexOfFirstMissingValue; i < data.numInstances(); i++) { Instance inst = data.instance(i); if (attribute.isNominal()) { // Need to check if attribute value is missing if (inst.isMissing(att)) { for (int j = 0; j < dist.length; j++) { dist[j][(int) inst.classValue()] += props[att][j] * inst.weight(); } } } else { // Can be sure that value is missing, so no test required for (int j = 0; j < dist.length; j++) { dist[j][(int) inst.classValue()] += props[att][j] * inst.weight(); } } } } // Return distribution and split point dists[att] = dist; mp.put("split_point", splitPoint); mp.put("orig_split_point", origSplitPoint); return mp; }
From source file:org.wkwk.classifier.MyC45.java
public double bestThreshold(Instances data, Attribute attr) { data.sort(attr); double m_ig = 0; double bestThr = 0; double classTemp = data.get(0).classValue(); double valueTemp = data.get(0).value(attr); Enumeration instEnum = data.enumerateInstances(); double dt;//from ww w . j a v a 2 s. c om while (instEnum.hasMoreElements()) { Instance inst = (Instance) instEnum.nextElement(); if (classTemp != inst.classValue()) { classTemp = inst.classValue(); dt = valueTemp; valueTemp = inst.value(attr); double threshold = dt + ((valueTemp - dt) / 2); double igTemp = computeInfoGainCont(data, attr, threshold); if (m_ig < igTemp) { m_ig = igTemp; bestThr = threshold; } } } return bestThr; }
From source file:Prediccion.PrecidePasoNodo.java
License:Open Source License
Instances cargarDatos(int hora) throws ParseException { //Declaramos los atributos de las instancias Attribute a0 = new Attribute("Intervalo", "yyyy-MM-dd HH:mm:ss"); Attribute a1 = new Attribute("Total"); ArrayList<Attribute> c = new ArrayList<>(); c.add(a0);//from www .j av a 2 s . c o m c.add(a1); //Creamos el conjunto de instancias Instances instances = new Instances(nodo, c, 1000); //Instanciamos conexion con FT cFT = new conectarFusionTables(); Sqlresponse r = cFT.select(TABLAID, "Intervalo, Total", "idNodo = " + nodo + " and Intervalo ENDS WITH '00:00:00'", "ORDER BY \'Intervalo\' DESC LIMIT 10000"); for (List<Object> a : r.getRows()) { Instance i = new DenseInstance(2); String s0 = (String) a.get(0); String s1 = (String) a.get(1); System.err.println(s0 + " ->" + s1); i.setValue(instances.attribute(0), instances.attribute(0).parseDate(s0)); i.setValue(instances.attribute(1), Integer.parseInt(s1)); instances.add(i); } instances.sort(0); return instances; }