List of usage examples for weka.core Instances add
@Override public boolean add(Instance instance)
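Before the full project examples below, a minimal self-contained sketch of the call itself. It assumes the current Weka API (3.7+), where DenseInstance is the concrete Instance implementation and add(Instance) has the boolean signature shown above; the relation name, attribute names, and values are purely illustrative.

import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;

public class AddBasicsSketch {
    public static void main(String[] args) {
        // two numeric attributes and an empty dataset with initial capacity 10
        ArrayList<Attribute> atts = new ArrayList<Attribute>();
        atts.add(new Attribute("x"));
        atts.add(new Attribute("y"));
        Instances data = new Instances("demo", atts, 10);

        // add(Instance) appends a shallow copy of the instance and returns true
        Instance inst = new DenseInstance(1.0, new double[] { 1.5, 2.5 });
        data.add(inst);

        System.out.println(data);
    }
}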
From source file:intensityclustering.IntensityClustering.java
/** * Draws the 2D Histogram Plot in the IntensityClustering. X-Axsis is * intensity value of chanel 2 image (where the stained nuclei are). Y-axis * are relative frequencies of present nuclei. * * @param tss The TMAspots whose nuclei are considered (both gold-standard * and estimated nuclei).//from w w w . ja v a 2s .c o m * @param doAlsoClustering If true, the TMApoints are also clustered * according to the histogram. */ void drawNucleiIntensities2D(List<TMAspot> tss, boolean doAlsoClustering) { // draw the plot Plot2DPanel plot; if (((java.awt.BorderLayout) (jPanel9.getLayout())) .getLayoutComponent(java.awt.BorderLayout.CENTER) != null) { plot = (Plot2DPanel) ((java.awt.BorderLayout) (jPanel9.getLayout())) .getLayoutComponent(java.awt.BorderLayout.CENTER); plot.removeAllPlots(); plot.removeAllPlotables(); } else { plot = new Plot2DPanel(PlotPanel.SOUTH); plot.setAxisLabels("Intensity", "Frequency"); plot.plotCanvas.setBackground(jPanel9.getBackground()); plot.plotLegend.setBackground(jPanel9.getBackground()); plot.plotToolBar.setBackground(plot.plotCanvas.getBackground()); } if (((java.awt.BorderLayout) (jPanel9.getLayout())) .getLayoutComponent(java.awt.BorderLayout.CENTER) == null) { jPanel9.add(plot, java.awt.BorderLayout.CENTER); jPanel15.setBackground(plot.plotCanvas.getBackground()); jPanel15.setVisible(true); validate(); pack(); } if (tss.size() > 0) { try { this.setCursor(Cursor.getPredefinedCursor(Cursor.WAIT_CURSOR)); List<Integer> intensities = new ArrayList<>(); int intensity; int min = Integer.parseInt(jTextField1.getText()); int max = Integer.parseInt(jTextField16.getText()); for (TMAspot ts : tss) { //TODO: GET THE CHANNEL 2 Image //BufferedImage img = ts.getBufferedImage(TMAspot.SHOW_CHANNEL2_IMAGE, false); BufferedImage img = ts.getBufferedImage(false); // img can be null if color deconvolution has not been performed, yet. if (img != null) { List<TMApoint> tps = ts.getPoints(); for (TMALabel tp : tps) { intensity = TMAspot.getAverageColorAtPoint(img, tp.x, tp.y, ts.getParam_r(), false) .getRed(); if (intensity >= min && intensity <= max) { intensities.add(intensity); } } } } double[] intensities_array = new double[intensities.size()]; for (int i = 0; i < intensities.size(); i++) { intensities_array[i] = intensities.get(i); } int nbins = jSlider7.getValue(); if (intensities_array.length > 0) { plot.addHistogramPlot("TMA points", intensities_array, 0, 256, nbins); } //else { // JOptionPane.showMessageDialog(this, "No TMA points have been found.", "No TMA points found.", JOptionPane.WARNING_MESSAGE); //} //// Cluster Points according to histograms if (doAlsoClustering) { // Find Clusters int n = getParam_nClusters(); // Create ARFF Data FastVector atts; Instances data; int i; // 1. create arff data format atts = new FastVector(1); for (i = 0; i < 1; i++) { atts.addElement(new Attribute(Integer.toString(i))); } // 2. create Instances object data = new Instances("TMA points", atts, tmarker.getNumberNuclei(tss)); // 3. fill with data for (i = 0; i < intensities_array.length; i++) { // add the instance Instance inst = new Instance(1.0, new double[] { intensities_array[i] }); inst.setDataset(data); data.add(inst); } // 4. 
set data class index (last attribute is the class) //data.setClassIndex(data.numAttributes() - 1); // not for weka 3.5.X if (tmarker.DEBUG > 4) { java.util.logging.Logger.getLogger(getClass().getName()).log(java.util.logging.Level.INFO, data.toString()); } Clusterer clusterer = getClusterer(); String[] options = getClustererOptions(); if (tmarker.DEBUG > 3) { if (options.length > 0) { String info = "Clusterer should have options:\n"; for (String o : options) { info += o + " "; } info += "\n"; java.util.logging.Logger.getLogger(getClass().getName()) .log(java.util.logging.Level.INFO, info); } } clusterer.setOptions(options); // set the clusterer options clusterer.buildClusterer(data); // build the clusterer // order the clusters according to the brightness // The most bright cluster should be 0, then 1, then 2,... ArrayList<ArrayList<Double>> values = new ArrayList<>(); for (i = 0; i < n; i++) { values.add(new ArrayList<Double>()); } int z; double value; for (i = 0; i < data.numInstances(); i++) { z = clusterer.clusterInstance(data.instance(i)); value = data.instance(i).value(0); values.get(z).add(value); } double[] means = new double[n]; double[] stds = new double[n]; for (i = 0; i < n; i++) { means[i] = Misc.mean(values.get(i).toArray(new Double[values.get(i).size()])); stds[i] = Misc.std(values.get(i).toArray(new Double[values.get(i).size()])); } int[] ordering = Misc.orderArray(means, true); for (i = 0; i < n; i++) { int ind = Misc.IndexOf(ordering, i); plot.addPlotable(new Line(getParam_ColorOfClassK(i), new double[] { means[ind], plot.plotCanvas.base.roundXmin[1] }, new double[] { means[ind], plot.plotCanvas.base.roundXmax[1] }, 2 * stds[ind])); plot.addPlot(Plot2DPanel.LINE, "Staining " + i, getParam_ColorOfClassK(i), new double[][] { new double[] { means[ind], plot.plotCanvas.base.roundXmin[1] }, new double[] { means[ind], plot.plotCanvas.base.roundXmax[1] } }); } String clusterInfo = ""; for (String o : clusterer.getOptions()) { clusterInfo += o + " "; } clusterInfo += "\n\n"; clusterInfo += clusterer.toString().trim(); if (getParam_AutomaticClustererString().equalsIgnoreCase("Hierarchical")) { try { clusterInfo += ((HierarchicalClusterer) clusterer).graph(); HierarchyVisualizer a = new HierarchyVisualizer( ((HierarchicalClusterer) clusterer).graph()); a.setSize(800, 600); if (clusterVisualizer == null) { clusterVisualizer = new JFrame("Hierarchical Clusterer Dendrogram"); clusterVisualizer.setIconImage(getIconImage()); clusterVisualizer.setDefaultCloseOperation(JFrame.DISPOSE_ON_CLOSE); clusterVisualizer.setSize(800, 600); } Container contentPane = clusterVisualizer.getContentPane(); contentPane.removeAll(); contentPane.add(a); } catch (Exception e) { clusterVisualizer = null; } } jTextArea1.setText(clusterInfo); if (tmarker.DEBUG > 3) { String info = "Clusterer has options\n"; for (String o : clusterer.getOptions()) { info += o + " "; } info += "\n"; info += clusterer.toString() + "\n"; // info += (clusterer).globalInfo() + "\n"; info += "\n"; info += clusterInfo + "\n"; java.util.logging.Logger.getLogger(getClass().getName()).log(java.util.logging.Level.INFO, info); } // cluster all TMAspots and assign the corresponding class to them // Cluster the points List<List<Integer>> clustered_points = new ArrayList<>(); for (i = 0; i < n; i++) { clustered_points.add(new ArrayList<Integer>()); } int k; for (TMAspot ts : tss) { //TODO: GET THE CHANNEL 2 IMAGE //BufferedImage img = ts.getBufferedImage(TMAspot.SHOW_CHANNEL2_IMAGE, false); BufferedImage img = ts.getBufferedImage(false); 
List<TMApoint> tps = ts.getPoints(); for (TMApoint tp : tps) { intensity = TMAspot.getAverageColorAtPoint(img, tp.x, tp.y, ts.getParam_r(), false) .getRed(); // add the instance Instance inst = new Instance(1.0, new double[] { intensity }); inst.setDataset(data); k = ordering[clusterer.clusterInstance(inst)]; // store the color for later visualization clustered_points.get(k).add(intensity); // set the staining of the TMApoint switch (k) { case 0: tp.setStaining(TMALabel.STAINING_0); break; case 1: tp.setStaining(TMALabel.STAINING_1); break; case 2: tp.setStaining(TMALabel.STAINING_2); break; default: tp.setStaining(TMALabel.STAINING_3); break; } } ts.dispStainingInfo(); if (manager.getVisibleTMAspot() == ts) { manager.repaintVisibleTMAspot(); } } // Write the description String description = "Nuclei clustered with " + getParam_AutomaticClustererString(); if (getParam_AutomaticClustererString().equalsIgnoreCase("Hierarchical")) { description += " (" + getParam_HierarchicalClusteringMethod() + ")"; } description += ", n=" + getParam_nClusters() + ", channel 2 intensity."; jLabel42.setText(description); jLabel41.setText(" "); } } catch (Exception e) { e.printStackTrace(); } finally { this.setCursor(Cursor.getPredefinedCursor(Cursor.DEFAULT_CURSOR)); } } }
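The IntensityClustering example above targets the old pre-3.6 Weka API, in which weka.core.Instance is a concrete class and attribute lists are FastVectors. Reduced to just the dataset-building part, its add pattern looks roughly like the following hedged sketch (the attribute name and intensity values are illustrative):

import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;

public class IntensityDatasetSketch {
    public static void main(String[] args) {
        // one numeric attribute, as in the histogram clustering above
        FastVector atts = new FastVector(1);
        atts.addElement(new Attribute("intensity"));
        Instances data = new Instances("TMA points", atts, 3);

        for (double intensity : new double[] { 12, 37, 200 }) {
            Instance inst = new Instance(1.0, new double[] { intensity }); // weight 1.0
            inst.setDataset(data);  // attach the header, as the example does before adding
            data.add(inst);         // Instances.add stores a copy of the instance
        }
        System.out.println(data.numInstances() + " instances added");
    }
}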
From source file:intensityclustering.IntensityClustering.java
/** * Clusters the TMApoints on given TMAspots according to their staining * intensity (color). All parameters (e.g. clusterer and parameters) are * selected by the user. Features are simple color features. * * @param tss The TMAspots of which all nuclei (gold-standard and estimated) * are clustered according to color./*from w w w. j a va2 s .c o m*/ */ private void clusterPointsAutomaticallyColorSpace(List<TMAspot> tss) { if (tss.size() > 0) { try { this.setCursor(Cursor.getPredefinedCursor(Cursor.WAIT_CURSOR)); int n = getParam_nClusters(); // Create ARFF Data FastVector atts; Instances data; int i; // 1. create arff data format atts = new FastVector(3); for (i = 0; i < 3; i++) { atts.addElement(new Attribute(Integer.toString(i))); } // 2. create Instances object data = new Instances("TMA points", atts, tmarker.getNumberNuclei(tss)); // 3. fill with data BufferedImage img; Color c; float[] features = new float[3]; String colorSpace = getParam_ColorSpace(); for (TMAspot ts : tss) { img = ts.getBufferedImage(); List<TMApoint> tps = ts.getPoints(); for (TMApoint tp : tps) { Color2Feature(TMAspot.getAverageColorAtPoint(img, tp.x, tp.y, ts.getParam_r(), false), colorSpace, features); // add the instance Instance inst = new Instance(1.0, new double[] { features[0], features[1], features[2] }); inst.setDataset(data); data.add(inst); } } // 4. set data class index (last attribute is the class) //data.setClassIndex(data.numAttributes() - 1); // not for weka 3.5.X if (tmarker.DEBUG > 4) { java.util.logging.Logger.getLogger(getClass().getName()).log(java.util.logging.Level.INFO, data.toString()); } Clusterer clusterer = getClusterer(); String[] options = getClustererOptions(); if (false && colorSpace.equalsIgnoreCase("hsb")) { String[] newoptions = new String[options.length + 2]; System.arraycopy(options, 0, newoptions, 0, options.length); newoptions[options.length] = "-A"; newoptions[options.length + 1] = "weka.core.MyHSBDistance"; options = newoptions; } if (tmarker.DEBUG > 3) { if (options.length > 0) { String info = "Clusterer should have options\n"; for (String o : options) { info += o + " "; } info += "\n"; java.util.logging.Logger.getLogger(getClass().getName()).log(java.util.logging.Level.INFO, info); } } clusterer.setOptions(options); // set the clusterer options clusterer.buildClusterer(data); // build the clusterer // order the clusters according to the brightness // The most bright cluster should be 0, then 1, then 2,... ArrayList<ArrayList<Double>> values = new ArrayList<>(); for (i = 0; i < clusterer.numberOfClusters(); i++) { values.add(new ArrayList<Double>()); } int z; double value; for (i = 0; i < data.numInstances(); i++) { z = clusterer.clusterInstance(data.instance(i)); value = getParam_ColorSpace().equalsIgnoreCase("hsb") ? 
data.instance(i).value(2) : Misc.RGBToGray(data.instance(i).value(0), data.instance(i).value(1), data.instance(i).value(2)); values.get(z).add(value); } double[] means = new double[clusterer.numberOfClusters()]; for (i = 0; i < clusterer.numberOfClusters(); i++) { means[i] = Misc.mean(values.get(i).toArray(new Double[values.get(i).size()])); } int[] ordering = Misc.orderArray(means, !getParam_ColorSpace().equalsIgnoreCase("rtp")); String clusterInfo = ""; for (String o : clusterer.getOptions()) { clusterInfo += o + " "; } clusterInfo += "\n\n"; clusterInfo += clusterer.toString().trim(); if (getParam_AutomaticClustererString().equalsIgnoreCase("Hierarchical")) { try { clusterInfo += ((HierarchicalClusterer) clusterer).graph(); HierarchyVisualizer a = new HierarchyVisualizer( ((HierarchicalClusterer) clusterer).graph()); a.setSize(800, 600); if (clusterVisualizer == null) { clusterVisualizer = new JFrame("Hierarchical Clusterer Dendrogram"); clusterVisualizer.setIconImage(getIconImage()); clusterVisualizer.setDefaultCloseOperation(JFrame.DISPOSE_ON_CLOSE); clusterVisualizer.setSize(800, 600); } Container contentPane = clusterVisualizer.getContentPane(); contentPane.removeAll(); contentPane.add(a); } catch (Exception e) { clusterVisualizer = null; } } jTextArea1.setText(clusterInfo); if (tmarker.DEBUG > 3) { String info = "Clusterer has options\n"; for (String o : clusterer.getOptions()) { info += o + " "; } info += "\n"; info += clusterer.toString() + "\n"; // info += (clusterer).globalInfo() + "\n"; info += "\n"; info += clusterInfo + "\n"; java.util.logging.Logger.getLogger(getClass().getName()).log(java.util.logging.Level.INFO, info); } // cluster all TMAspots and assign the corresponding class to them // Cluster the points List<List<Color>> clustered_points = new ArrayList<>(); for (i = 0; i < clusterer.numberOfClusters(); i++) { clustered_points.add(new ArrayList<Color>()); } int k; for (TMAspot ts : tss) { img = ts.getBufferedImage(); List<TMApoint> tps = ts.getPoints(); for (TMApoint tp : tps) { c = TMAspot.getAverageColorAtPoint(img, tp.x, tp.y, ts.getParam_r(), false); Color2Feature(c, colorSpace, features); // add the instance Instance inst = new Instance(1.0, new double[] { features[0], features[1], features[2] }); inst.setDataset(data); k = ordering[clusterer.clusterInstance(inst)]; // store the color for later visualization clustered_points.get(k).add(c); // set the staining of the TMApoint switch (k) { case 0: tp.setStaining(TMALabel.STAINING_0); break; case 1: tp.setStaining(TMALabel.STAINING_1); break; case 2: tp.setStaining(TMALabel.STAINING_2); break; default: tp.setStaining(TMALabel.STAINING_3); break; } } ts.dispStainingInfo(); if (manager.getVisibleTMAspot() == ts) { manager.repaintVisibleTMAspot(); } } // draw the points Plot3DPanel plot; if (((java.awt.BorderLayout) (jPanel2.getLayout())) .getLayoutComponent(java.awt.BorderLayout.CENTER) != null) { plot = (Plot3DPanel) ((java.awt.BorderLayout) (jPanel2.getLayout())) .getLayoutComponent(java.awt.BorderLayout.CENTER); plot.removeAllPlots(); } else { plot = new Plot3DPanel(); plot.plotCanvas.setBackground(jPanel2.getBackground()); plot.addLegend(PlotPanel.SOUTH); plot.plotLegend.setBackground(jPanel2.getBackground()); } if (colorSpace.equalsIgnoreCase("hsb")) { plot.setAxisLabels("Hue", "Saturation", "Brightness"); } else if (colorSpace.equalsIgnoreCase("rtp")) { plot.setAxisLabels("R", "Theta", "Phi"); } else { plot.setAxisLabels("Red", "Green", "Blue"); } for (i = 0; i < clusterer.numberOfClusters(); i++) { double[] xs 
= new double[clustered_points.get(i).size()]; double[] ys = new double[clustered_points.get(i).size()]; double[] zs = new double[clustered_points.get(i).size()]; for (int j = 0; j < clustered_points.get(i).size(); j++) { Color2Feature(clustered_points.get(i).get(j), colorSpace, features); xs[j] = features[0]; ys[j] = features[1]; zs[j] = features[2]; } if (xs.length > 0) { c = getParam_ColorOfClassK(i); plot.addScatterPlot("Staining " + i, c, xs, ys, zs); } } // Write the description String description = "Nuclei clustered with " + getParam_AutomaticClustererString(); if (getParam_AutomaticClustererString().equalsIgnoreCase("Hierarchical")) { description += " (" + getParam_HierarchicalClusteringMethod() + ")"; } description += ", n=" + getParam_nClusters() + ", color space " + getParam_ColorSpace() + "."; jLabel41.setText(description); jLabel42.setText(" "); if (((java.awt.BorderLayout) (jPanel2.getLayout())) .getLayoutComponent(java.awt.BorderLayout.CENTER) == null) { jPanel2.add(plot, java.awt.BorderLayout.CENTER); validate(); pack(); } } catch (Exception | OutOfMemoryError e) { java.util.logging.Logger.getLogger(getClass().getName()).log(java.util.logging.Level.SEVERE, null, e); JOptionPane.showMessageDialog(this, "The clustering could not be performed.\n\n" + "A possible reasons is:\n" + "- Not enough memory (too many points), \n\n" + "You might want to try a different clustering method or less TMAspots.\n\n" + "The error message is: \n" + e.getMessage(), "Error at Nucleus clustering", JOptionPane.WARNING_MESSAGE); } finally { this.setCursor(Cursor.getPredefinedCursor(Cursor.DEFAULT_CURSOR)); } } }
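Both IntensityClustering methods above hand the dataset built via Instances.add to a weka.clusterers.Clusterer returned by getClusterer(). A stripped-down sketch of that flow, using SimpleKMeans purely as a stand-in for whatever clusterer the user selected (old pre-3.6 Instance API, illustrative data):

import weka.clusterers.SimpleKMeans;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;

public class ClusterAddedInstancesSketch {
    public static void main(String[] args) throws Exception {
        FastVector atts = new FastVector(1);
        atts.addElement(new Attribute("intensity"));
        Instances data = new Instances("TMA points", atts, 6);
        for (double v : new double[] { 10, 12, 11, 200, 210, 205 }) {
            data.add(new Instance(1.0, new double[] { v }));
        }

        SimpleKMeans clusterer = new SimpleKMeans();
        clusterer.setNumClusters(2);
        clusterer.buildClusterer(data);      // learn clusters from the added instances

        // assign a fresh point to a cluster, as the example does for each TMApoint
        Instance query = new Instance(1.0, new double[] { 15 });
        query.setDataset(data);
        System.out.println("cluster: " + clusterer.clusterInstance(query));
    }
}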
From source file:j48.BinC45Split.java
License:Open Source License
/**
 * Sets distribution associated with model.
 */
public void resetDistribution(Instances data) throws Exception {
    Instances insts = new Instances(data, data.numInstances());
    for (int i = 0; i < data.numInstances(); i++) {
        if (whichSubset(data.instance(i)) > -1) {
            insts.add(data.instance(i));
        }
    }
    Distribution newD = new Distribution(insts, this);
    newD.addInstWithUnknown(data, m_attIndex);
    m_distribution = newD;
}
From source file:j48.C45PruneableClassifierTreeG.java
License:Open Source License
/** * finds new nodes that improve accuracy and grafts them onto the tree * * @param fulldata the instances in whole trainset * @param iindex records num tests each instance has failed up to this node * @param limits the upper/lower limits for numeric attributes * @param parent the node immediately before the current one * @param pLaplace laplace for leaf, calculated by parent (in case leaf empty) * @param pLeafClass class of leaf, determined by parent (in case leaf empty) */// ww w. j a v a 2s . com private void findGraft(Instances fulldata, double[][] iindex, double[][] limits, ClassifierTree parent, double pLaplace, int pLeafClass) throws Exception { // get the class for this leaf int leafClass = (m_isEmpty) ? pLeafClass : localModel().distribution().maxClass(); // get the laplace value for this leaf double leafLaplace = (m_isEmpty) ? pLaplace : laplaceLeaf(leafClass); // sort the instances into those at the leaf, those in atbop, and discarded Instances l = new Instances(fulldata, fulldata.numInstances()); Instances n = new Instances(fulldata, fulldata.numInstances()); int lcount = 0; int acount = 0; for (int x = 0; x < fulldata.numInstances(); x++) { if (iindex[0][x] <= 0 && iindex[1][x] <= 0) continue; if (iindex[0][x] != 0) { l.add(fulldata.instance(x)); l.instance(lcount).setWeight(iindex[0][x]); // move instance's weight in iindex to same index as in l iindex[0][lcount++] = iindex[0][x]; } if (iindex[1][x] > 0) { n.add(fulldata.instance(x)); n.instance(acount).setWeight(iindex[1][x]); // move instance's weight in iindex to same index as in n iindex[1][acount++] = iindex[1][x]; } } boolean graftPossible = false; double[] classDist = new double[n.numClasses()]; for (int x = 0; x < n.numInstances(); x++) { if (iindex[1][x] > 0 && !n.instance(x).classIsMissing()) classDist[(int) n.instance(x).classValue()] += iindex[1][x]; } for (int cVal = 0; cVal < n.numClasses(); cVal++) { double theLaplace = (classDist[cVal] + 1.0) / (classDist[cVal] + 2.0); if (cVal != leafClass && (theLaplace > leafLaplace) && (biprob(classDist[cVal], classDist[cVal], leafLaplace) > m_BiProbCrit)) { graftPossible = true; break; } } if (!graftPossible) { return; } // 1. Initialize to {} a set of tuples t containing potential tests ArrayList t = new ArrayList(); // go through each attribute for (int a = 0; a < n.numAttributes(); a++) { if (a == n.classIndex()) continue; // skip the class // sort instances in atbop by $a int[] sorted = sortByAttribute(n, a); // 2. For each continuous attribute $a: if (n.attribute(a).isNumeric()) { // find min and max values for this attribute at the leaf boolean prohibited = false; double minLeaf = Double.POSITIVE_INFINITY; double maxLeaf = Double.NEGATIVE_INFINITY; for (int i = 0; i < l.numInstances(); i++) { if (l.instance(i).isMissing(a)) { if (l.instance(i).classValue() == leafClass) { prohibited = true; break; } } double value = l.instance(i).value(a); if (!m_relabel || l.instance(i).classValue() == leafClass) { if (value < minLeaf) minLeaf = value; if (value > maxLeaf) maxLeaf = value; } } if (prohibited) { continue; } // (a) find values of // $n: instances in atbop (already have that, actually) // $v: a value for $a that exists for a case in the atbop, where // $v is < the min value for $a for a case at the leaf which // has the class $c, and $v is > the lowerlimit of $a at // the leaf. // (note: error in original paper stated that $v must be // smaller OR EQUAL TO the min value). 
// $k: $k is a class // that maximize L' = Laplace({$x: $x contained in cases($n) // & value($a,$x) <= $v & value($a,$x) > lowerlim($l,$a)}, $k). double minBestClass = Double.NaN; double minBestLaplace = leafLaplace; double minBestVal = Double.NaN; double minBestPos = Double.NaN; double minBestTotal = Double.NaN; double[][] minBestCounts = null; double[][] counts = new double[2][n.numClasses()]; for (int x = 0; x < n.numInstances(); x++) { if (n.instance(sorted[x]).isMissing(a)) break; // missing are sorted to end: no more valid vals double theval = n.instance(sorted[x]).value(a); if (m_Debug) System.out.println("\t " + theval); if (theval <= limits[a][0]) { if (m_Debug) System.out.println("\t <= lowerlim: continuing..."); continue; } // note: error in paper would have this read "theVal > minLeaf) if (theval >= minLeaf) { if (m_Debug) System.out.println("\t >= minLeaf; breaking..."); break; } counts[0][(int) n.instance(sorted[x]).classValue()] += iindex[1][sorted[x]]; if (x != n.numInstances() - 1) { int z = x + 1; while (z < n.numInstances() && n.instance(sorted[z]).value(a) == theval) { z++; x++; counts[0][(int) n.instance(sorted[x]).classValue()] += iindex[1][sorted[x]]; } } // work out the best laplace/class (for <= theval) double total = Utils.sum(counts[0]); for (int c = 0; c < n.numClasses(); c++) { double temp = (counts[0][c] + 1.0) / (total + 2.0); if (temp > minBestLaplace) { minBestPos = counts[0][c]; minBestTotal = total; minBestLaplace = temp; minBestClass = c; minBestCounts = copyCounts(counts); minBestVal = (x == n.numInstances() - 1) ? theval : ((theval + n.instance(sorted[x + 1]).value(a)) / 2.0); } } } // (b) add to t tuple <n,a,v,k,L',"<="> if (!Double.isNaN(minBestVal) && biprob(minBestPos, minBestTotal, leafLaplace) > m_BiProbCrit) { GraftSplit gsplit = null; try { gsplit = new GraftSplit(a, minBestVal, 0, leafClass, minBestCounts); } catch (Exception e) { System.err.println("graftsplit error: " + e.getMessage()); System.exit(1); } t.add(gsplit); } // free space minBestCounts = null; // (c) find values of // n: instances in atbop (already have that, actually) // $v: a value for $a that exists for a case in the atbop, where // $v is > the max value for $a for a case at the leaf which // has the class $c, and $v is <= the upperlimit of $a at // the leaf. // k: k is a class // that maximize L' = Laplace({x: x contained in cases(n) // & value(a,x) > v & value(a,x) <= upperlim(l,a)}, k). double maxBestClass = -1; double maxBestLaplace = leafLaplace; double maxBestVal = Double.NaN; double maxBestPos = Double.NaN; double maxBestTotal = Double.NaN; double[][] maxBestCounts = null; for (int c = 0; c < n.numClasses(); c++) { // zero the counts counts[0][c] = 0; counts[1][c] = 0; // shouldn't need to do this ... 
} // check smallest val for a in atbop is < upper limit if (n.numInstances() >= 1 && n.instance(sorted[0]).value(a) < limits[a][1]) { for (int x = n.numInstances() - 1; x >= 0; x--) { if (n.instance(sorted[x]).isMissing(a)) continue; double theval = n.instance(sorted[x]).value(a); if (m_Debug) System.out.println("\t " + theval); if (theval > limits[a][1]) { if (m_Debug) System.out.println("\t >= upperlim; continuing..."); continue; } if (theval <= maxLeaf) { if (m_Debug) System.out.println("\t < maxLeaf; breaking..."); break; } // increment counts counts[1][(int) n.instance(sorted[x]).classValue()] += iindex[1][sorted[x]]; if (x != 0 && !n.instance(sorted[x - 1]).isMissing(a)) { int z = x - 1; while (z >= 0 && n.instance(sorted[z]).value(a) == theval) { z--; x--; counts[1][(int) n.instance(sorted[x]).classValue()] += iindex[1][sorted[x]]; } } // work out best laplace for > theval double total = Utils.sum(counts[1]); for (int c = 0; c < n.numClasses(); c++) { double temp = (counts[1][c] + 1.0) / (total + 2.0); if (temp > maxBestLaplace) { maxBestPos = counts[1][c]; maxBestTotal = total; maxBestLaplace = temp; maxBestClass = c; maxBestCounts = copyCounts(counts); maxBestVal = (x == 0) ? theval : ((theval + n.instance(sorted[x - 1]).value(a)) / 2.0); } } } // (d) add to t tuple <n,a,v,k,L',">"> if (!Double.isNaN(maxBestVal) && biprob(maxBestPos, maxBestTotal, leafLaplace) > m_BiProbCrit) { GraftSplit gsplit = null; try { gsplit = new GraftSplit(a, maxBestVal, 1, leafClass, maxBestCounts); } catch (Exception e) { System.err.println("graftsplit error:" + e.getMessage()); System.exit(1); } t.add(gsplit); } } } else { // must be a nominal attribute // 3. for each discrete attribute a for which there is no // test at an ancestor of l // skip if this attribute has already been used if (limits[a][1] == 1) { continue; } boolean[] prohibit = new boolean[l.attribute(a).numValues()]; for (int aval = 0; aval < n.attribute(a).numValues(); aval++) { for (int x = 0; x < l.numInstances(); x++) { if ((l.instance(x).isMissing(a) || l.instance(x).value(a) == aval) && (!m_relabel || (l.instance(x).classValue() == leafClass))) { prohibit[aval] = true; break; } } } // (a) find values of // $n: instances in atbop (already have that, actually) // $v: $v is a value for $a // $k: $k is a class // that maximize L' = Laplace({$x: $x contained in cases($n) // & value($a,$x) = $v}, $k). 
double bestVal = Double.NaN; double bestClass = Double.NaN; double bestLaplace = leafLaplace; double[][] bestCounts = null; double[][] counts = new double[2][n.numClasses()]; for (int x = 0; x < n.numInstances(); x++) { if (n.instance(sorted[x]).isMissing(a)) continue; // zero the counts for (int c = 0; c < n.numClasses(); c++) counts[0][c] = 0; double theval = n.instance(sorted[x]).value(a); counts[0][(int) n.instance(sorted[x]).classValue()] += iindex[1][sorted[x]]; if (x != n.numInstances() - 1) { int z = x + 1; while (z < n.numInstances() && n.instance(sorted[z]).value(a) == theval) { z++; x++; counts[0][(int) n.instance(sorted[x]).classValue()] += iindex[1][sorted[x]]; } } if (!prohibit[(int) theval]) { // work out best laplace for > theval double total = Utils.sum(counts[0]); bestLaplace = leafLaplace; bestClass = Double.NaN; for (int c = 0; c < n.numClasses(); c++) { double temp = (counts[0][c] + 1.0) / (total + 2.0); if (temp > bestLaplace && biprob(counts[0][c], total, leafLaplace) > m_BiProbCrit) { bestLaplace = temp; bestClass = c; bestVal = theval; bestCounts = copyCounts(counts); } } // add to graft list if (!Double.isNaN(bestClass)) { GraftSplit gsplit = null; try { gsplit = new GraftSplit(a, bestVal, 2, leafClass, bestCounts); } catch (Exception e) { System.err.println("graftsplit error: " + e.getMessage()); System.exit(1); } t.add(gsplit); } } } // (b) add to t tuple <n,a,v,k,L',"="> // done this already } } // 4. remove from t all tuples <n,a,v,c,L,x> such that L <= // Laplace(cases(l),c) or prob(x,n,Laplace(cases(l),c) <= 0.05 // -- checked this constraint prior to adding a tuple -- // *** step six done before step five for efficiency *** // 6. for each <n,a,v,k,L,x> in t ordered on L from highest to lowest // order the tuples from highest to lowest laplace // (this actually orders lowest to highest) Collections.sort(t); // 5. remove from t all tuples <n,a,v,c,L,x> such that there is // no tuple <n',a',v',k',L',x'> such that k' != c & L' < L. for (int x = 0; x < t.size(); x++) { GraftSplit gs = (GraftSplit) t.get(x); if (gs.maxClassForSubsetOfInterest() != leafClass) { break; // reached a graft with class != leafClass, so stop deleting } else { t.remove(x); x--; } } // if no potential grafts were found, do nothing and return if (t.size() < 1) { return; } // create the distributions for each graft for (int x = t.size() - 1; x >= 0; x--) { GraftSplit gs = (GraftSplit) t.get(x); try { gs.buildClassifier(l); gs.deleteGraftedCases(l); // so they don't go down the other branch } catch (Exception e) { System.err.println("graftsplit build error: " + e.getMessage()); } } // add this stuff to the tree ((C45PruneableClassifierTreeG) parent).setDescendents(t, this); }
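A pattern that recurs in findGraft above is the weighted filtered copy: create an empty Instances that shares the header of the full training set, add only the instances that reach this node, and re-apply their weights. A minimal hedged sketch of just that idiom (weightedSubset and the weights array are illustrative; in the grafting code the weights come from the iindex array):

import weka.core.Instances;

public class WeightedCopySketch {
    static Instances weightedSubset(Instances fulldata, double[] weights) {
        Instances subset = new Instances(fulldata, fulldata.numInstances()); // empty copy, same header
        int count = 0;
        for (int x = 0; x < fulldata.numInstances(); x++) {
            if (weights[x] > 0) {
                subset.add(fulldata.instance(x));               // add(Instance) stores a copy
                subset.instance(count++).setWeight(weights[x]); // so set the weight on the stored copy
            }
        }
        return subset;
    }
}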
From source file:jjj.asap.sas.datasets.job.Import.java
License:Open Source License
private void buildDataset(int k, String input, String output) {
    if (IOUtils.exists(output)) {
        Job.log("NOTE", output + " already exists - nothing to do.");
        return;
    }

    // create empty dataset
    final DatasetBuilder builder = new DatasetBuilder();
    builder.addVariable("id");
    if (Contest.isMultiChoice(k)) {
        builder.addNominalVariable("color", Contest.COLORS);
    }
    builder.addStringVariable("text");
    builder.addNominalVariable("score", Contest.getRubrics(k));

    Instances dataset = builder.getDataset(IOUtils.getName(output));

    // now add obs
    Iterator<String> it = new FileIterator(input);
    while (it.hasNext()) {

        // parse data
        String[] data = StringUtils.safeSplit(it.next(), "\t", 6);
        double id = Double.parseDouble(data[0]);
        String score = data[2];
        String color = data[4];
        String text = data[5];

        // add to dataset
        dataset.add(new DenseInstance(dataset.numAttributes()));
        Instance ob = dataset.lastInstance();
        ob.setValue(dataset.attribute("id"), id);
        if (Contest.isMultiChoice(k)) {
            ob.setValue(dataset.attribute("color"), color);
        }
        ob.setValue(dataset.attribute("text"), text);
        if ("?".equals(score)) {
            ob.setValue(dataset.attribute("score"), Utils.missingValue());
        } else {
            ob.setValue(dataset.attribute("score"), score);
        }
    }

    Dataset.save(output, dataset);
}
From source file:jjj.asap.sas.ensemble.impl.CrossValidatedEnsemble.java
License:Open Source License
@Override
public StrongLearner build(int essaySet, String ensembleName, List<WeakLearner> learners) {

    // can't handle empty case
    if (learners.isEmpty()) {
        return this.ensemble.build(essaySet, ensembleName, learners);
    }

    // create a dummy dataset.
    DatasetBuilder builder = new DatasetBuilder();
    builder.addVariable("id");
    builder.addNominalVariable("class", Contest.getRubrics(essaySet));
    Instances dummy = builder.getDataset("dummy");

    // add data
    Map<Double, Double> groundTruth = Contest.getGoldStandard(essaySet);
    for (double id : learners.get(0).getPreds().keySet()) {
        dummy.add(new DenseInstance(1.0, new double[] { id, groundTruth.get(id) }));
    }

    // stratify
    dummy.sort(0);
    dummy.randomize(new Random(1));
    dummy.setClassIndex(1);
    dummy.stratify(nFolds);

    // now evaluate each fold
    Map<Double, Double> preds = new HashMap<Double, Double>();
    for (int k = 0; k < nFolds; k++) {
        Instances train = dummy.trainCV(nFolds, k);
        Instances test = dummy.testCV(nFolds, k);

        List<WeakLearner> cvLeaners = new ArrayList<WeakLearner>();
        for (WeakLearner learner : learners) {
            WeakLearner copy = learner.copyOf();
            for (int i = 0; i < test.numInstances(); i++) {
                copy.getPreds().remove(test.instance(i).value(0));
                copy.getProbs().remove(test.instance(i).value(0));
            }
            cvLeaners.add(copy);
        }

        // train on fold
        StrongLearner cv = this.ensemble.build(essaySet, ensembleName, cvLeaners);

        List<WeakLearner> testLeaners = new ArrayList<WeakLearner>();
        for (WeakLearner learner : cv.getLearners()) {
            WeakLearner copy = learner.copyOf();
            copy.getPreds().clear();
            copy.getProbs().clear();
            WeakLearner source = find(copy.getName(), learners);
            for (int i = 0; i < test.numInstances(); i++) {
                double id = test.instance(i).value(0);
                copy.getPreds().put(id, source.getPreds().get(id));
                copy.getProbs().put(id, source.getProbs().get(id));
            }
            testLeaners.add(copy);
        }

        preds.putAll(this.ensemble.classify(essaySet, ensembleName, testLeaners, cv.getContext()));
    }

    // now prepare final result
    StrongLearner strong = this.ensemble.build(essaySet, ensembleName, learners);
    double trainingError = strong.getKappa();
    double cvError = Calc.kappa(essaySet, preds, groundTruth);
    // Job.log(essaySet+"-"+ensembleName, "XVAL: training error = " + trainingError + " cv error = " + cvError);
    strong.setKappa(cvError);
    return strong;
}
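The fold handling in this build method is plain Weka: randomize and stratify the dataset, then let Instances itself produce the per-fold train/test splits. In isolation, and under the assumption that the class index has already been set to a nominal attribute, it reduces to roughly this sketch:

import java.util.Random;
import weka.core.Instances;

public class FoldSketch {
    static void folds(Instances data, int nFolds) {
        data.randomize(new Random(1));
        data.stratify(nFolds);  // requires a nominal class attribute to be set
        for (int k = 0; k < nFolds; k++) {
            Instances train = data.trainCV(nFolds, k);
            Instances test = data.testCV(nFolds, k);
            // build/evaluate per fold here, as the ensemble code does
            System.out.println("fold " + k + ": " + train.numInstances() + " train / "
                    + test.numInstances() + " test");
        }
    }
}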
From source file:jjj.asap.sas.ensemble.impl.StackedClassifier.java
License:Open Source License
/**
 * Returns a dataset representing the learners.
 */
private Instances getMetaDataset(int essaySet, List<WeakLearner> learners) {

    // create dataset headers
    DatasetBuilder builder = new DatasetBuilder();
    builder.addVariable("id");
    for (int i = 0; i < learners.size(); i++) {
        if (useNumericVariables) {
            builder.addVariable("x" + i);
        } else {
            builder.addNominalVariable("x" + i, Contest.getRubrics(essaySet));
        }
    }
    builder.addNominalVariable("score", Contest.getRubrics(essaySet));
    Instances dataset = builder.getDataset(this.getClass().getCanonicalName());

    Map<Double, Double> labels = Contest.getGoldStandard(essaySet);

    // now add the data
    for (double id : learners.get(0).getPreds().keySet()) {
        double[] data = new double[dataset.numAttributes()];
        data[0] = id;
        for (int i = 0; i < learners.size(); i++) {
            data[i + 1] = learners.get(i).getPreds().get(id);
        }
        data[dataset.numAttributes() - 1] = labels.containsKey(id) ? labels.get(id) : Utils.missingValue();
        dataset.add(new DenseInstance(1.0, data));
    }

    return dataset;
}
From source file:jjj.asap.sas.parser.job.ImportParserData.java
License:Open Source License
private void process(final String parent, int essaySet, Map<Double, List<String>> tags, Map<Double, List<String>> parseTrees, Map<Double, List<String>> depends) { // check if output exists boolean any = false; if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-extra-stats.arff")) any = true;/*from w w w . j av a 2 s . c o m*/ if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-pos-tags.arff")) any = true; if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-parse-tree.arff")) any = true; if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends0.arff")) any = true; if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends1.arff")) any = true; if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends2.arff")) any = true; if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends3.arff")) any = true; if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends4.arff")) any = true; if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends5.arff")) any = true; if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends6.arff")) any = true; if (!any) { Job.log("NOTE", "work/datasets/" + parent + "/" + essaySet + "-*.arff returns all required datasets - nothing to do"); return; } // Load an existing dataset to use as a template. Instances dataset = Dataset.load("work/datasets/" + parent + "/" + essaySet + "-spell-checked.arff"); // create the output datasets here. except for the extra statistics, // the format is the same as 'dataset'. Instances tagsData = new Instances(dataset, 0); tagsData.setRelationName(essaySet + "-pos-tags.arff"); Instances treeData = new Instances(dataset, 0); treeData.setRelationName(essaySet + "-parse-tree.arff"); Instances dependsData[] = new Instances[7]; for (int j = 0; j < 7; j++) { dependsData[j] = new Instances(dataset, 0); dependsData[j].setRelationName(essaySet + "-depends" + j + ".arff"); } // extra stats DatasetBuilder builder = new DatasetBuilder(); builder.addVariable("id"); if (Contest.isMultiChoice(essaySet)) { builder.addNominalVariable("color", Contest.COLORS); } builder.addVariable("x_sent"); builder.addVariable("x_para"); builder.addVariable("x_length"); builder.addVariable("x_words"); builder.addVariable("x_unique_words"); builder.addNominalVariable("score", Contest.getRubrics(essaySet)); Instances extraStats = builder.getDataset(essaySet + "-extra-stats.arff"); // now add rows for each instance for (int i = 0; i < dataset.numInstances(); i++) { // common variables Instance ob = dataset.instance(i); double id = ob.value(0); String y = ob.isMissing(dataset.numAttributes() - 1) ? null : ob.stringValue(dataset.numAttributes() - 1); String color = Contest.isMultiChoice(essaySet) ? ob.stringValue(dataset.attribute("color")) : null; String str = ob.stringValue(dataset.attribute("text")); // // Extra stats // int nSent = tags.containsKey(id) ? 
tags.get(id).size() : 0; int nPara = 0; for (int a = 0; a < str.length(); a++) { if (str.charAt(a) == '^') nPara++; } int nLength = str.length(); int nWords = 0; int nUniqueWords = 0; String[] words = str.toLowerCase().split(" "); nWords = words.length; Set<String> u = new HashSet<String>(); for (String w : words) { u.add(w); } nUniqueWords = u.size(); extraStats.add(new DenseInstance(extraStats.numAttributes())); Instance extra = extraStats.lastInstance(); extra.setValue(0, id); if (Contest.isMultiChoice(essaySet)) { extra.setValue(1, color); } extra.setValue(extraStats.attribute("x_sent"), nSent); extra.setValue(extraStats.attribute("x_para"), nPara); extra.setValue(extraStats.attribute("x_length"), nLength); extra.setValue(extraStats.attribute("x_words"), nWords); extra.setValue(extraStats.attribute("x_unique_words"), nUniqueWords); if (y == null) extra.setValue(extraStats.numAttributes() - 1, Utils.missingValue()); else extra.setValue(extraStats.numAttributes() - 1, y); // // POS tags // String tagsText = ""; List<String> tagsList = tags.get(id); if (tagsList == null || tagsList.isEmpty()) { Job.log("WARNING", "no tags for " + id); tagsText = "x"; } else { for (String tagsItem : tagsList) { tagsText += tagsItem; } } tagsData.add(new DenseInstance(ob.numAttributes())); Instance tagsOb = tagsData.lastInstance(); tagsOb.setValue(0, id); if (Contest.isMultiChoice(essaySet)) { tagsOb.setValue(1, color); tagsOb.setValue(2, tagsText.trim()); if (y == null) { tagsOb.setValue(3, Utils.missingValue()); } else { tagsOb.setValue(3, y); } } else { tagsOb.setValue(1, tagsText.trim()); if (y == null) { tagsOb.setValue(2, Utils.missingValue()); } else { tagsOb.setValue(2, y); } } // // Parse Tree // String treeText = ""; List<String> treeList = parseTrees.get(id); if (treeList == null || treeList.isEmpty()) { Job.log("WARNING", "no parse tree for " + id); treeText = "x"; } else { for (String treeItem : treeList) { treeText += treeItem; } } treeData.add(new DenseInstance(ob.numAttributes())); Instance treeOb = treeData.lastInstance(); treeOb.setValue(0, id); if (Contest.isMultiChoice(essaySet)) { treeOb.setValue(1, color); treeOb.setValue(2, treeText.trim()); if (y == null) { treeOb.setValue(3, Utils.missingValue()); } else { treeOb.setValue(3, y); } } else { treeOb.setValue(1, treeText.trim()); if (y == null) { treeOb.setValue(2, Utils.missingValue()); } else { treeOb.setValue(2, y); } } // // Depends data // for (int j = 0; j < 7; j++) { String text = ""; List<String> list = depends.get(id); if (list == null || list.isEmpty()) { Job.log("WARNING", "no depends for " + id); text = "x"; } else { for (String item : list) { String[] term = StringUtils.safeSplit(item, "/", 3); switch (j) { case 0: text += item; break; case 1: text += term[1] + "/" + term[2]; break; case 2: text += term[0] + "/" + term[2]; break; case 3: text += term[0] + "/" + term[1]; break; case 4: text += term[0]; break; case 5: text += term[1]; break; case 6: text += term[2]; break; } text += " "; } } dependsData[j].add(new DenseInstance(ob.numAttributes())); Instance dependsOb = dependsData[j].lastInstance(); dependsOb.setValue(0, id); if (Contest.isMultiChoice(essaySet)) { dependsOb.setValue(1, color); dependsOb.setValue(2, text.trim()); if (y == null) { dependsOb.setValue(3, Utils.missingValue()); } else { dependsOb.setValue(3, y); } } else { dependsOb.setValue(1, text.trim()); if (y == null) { dependsOb.setValue(2, Utils.missingValue()); } else { dependsOb.setValue(2, y); } } } // j } // dataset // Now save the new datasets 
Dataset.save("work/datasets/" + parent + "/" + tagsData.relationName(), tagsData); Dataset.save("work/datasets/" + parent + "/" + treeData.relationName(), treeData); for (int j = 0; j < 7; j++) { Dataset.save("work/datasets/" + parent + "/" + dependsData[j].relationName(), dependsData[j]); } Dataset.save("work/datasets/" + parent + "/" + extraStats.relationName(), extraStats); }
From source file:joelib2.algo.datamining.yale.ExampleSetHelper.java
License:Open Source License
public static Instances createMolInstances(MoleculeVector molecules, String[] attributes, int[] attributeTypes) throws WekaException { // load descriptor binning DescriptorBinning binning = DescriptorBinning.getDescBinning(molecules); int length = molecules.getSize(); if (attributes.length != attributeTypes.length) { throw new WekaException("Different number of attributes and attribute types."); //return null; }// w w w .ja v a 2 s .c o m Enumeration enumeration = binning.getDescriptors(); FastVector attributesV = new FastVector(binning.numberOfDescriptors()); Molecule mol; BasicPairData pairData; for (int i = 0; i < attributes.length; i++) { if (attributeTypes[i] == Attribute.NUMERIC) { // numeric attributesV.addElement(new Attribute((String) enumeration.nextElement(), attributesV.size())); } else if (attributeTypes[i] == Attribute.NOMINAL) { // nominal // create a list with all nominal values Hashtable hashed = new Hashtable(); for (int j = 0; j < length; j++) { mol = molecules.getMol(j); // get unparsed data pairData = (BasicPairData) mol.getData(attributes[i], false); if (pairData != null) { if (pairData.getKeyValue() instanceof String) { hashed.put(pairData.getKeyValue(), ""); } else { hashed.put(pairData.toString(), ""); } } } // store list of nominal values in the Weka data structure FastVector attributeValues = new FastVector(hashed.size()); String tmp; for (Enumeration e = hashed.keys(); e.hasMoreElements();) { tmp = (String) e.nextElement(); attributeValues.addElement(tmp); //System.out.println("NOMINAL " + tmp); } attributesV.addElement(new Attribute(attributes[i], attributeValues, attributesV.size())); } } int size = attributesV.size(); Attribute attribute; // create molecule instances Instances instances = new Instances("MoleculeInstances", attributesV, attributesV.size()); // iterate over all instances (to generate them) double[] instance; for (int i = 0; i < length; i++) { mol = molecules.getMol(i); instance = new double[size]; for (int j = 0; j < size; j++) { attribute = (Attribute) attributesV.elementAt(j); // get parsed data pairData = (BasicPairData) mol.getData(attribute.name(), true); // add nominal or numeric or missing value if (pairData == null) { instance[attribute.index()] = Instance.missingValue(); } else { if (attribute.isNominal()) { // nominal String tmpS = pairData.toString().trim(); if (tmpS.indexOf("\n") != -1) { throw new WekaException("Descriptor " + attribute.name() + " contains multiple lines and is not a valid nominal value."); } else { instance[attribute.index()] = attribute.indexOfValue(pairData.toString()); if (instance[attribute.index()] == -1) { // invalid nominal value logger.error("Invalid nominal value."); return null; } } } else { // numeric if (pairData instanceof NativeValue) { double tmpD = ((NativeValue) pairData).getDoubleNV(); if (Double.isNaN(tmpD)) { instance[attribute.index()] = Instance.missingValue(); } else { instance[attribute.index()] = tmpD; } } else { throw new WekaException("Descriptor " + attribute.name() + " is not a native value."); } } } attribute.index(); } // add created molecule instance to molecule instances instances.add(new Instance(1, instance)); } return instances; }
From source file:joelib2.algo.datamining.yale.ExampleSetHelper.java
License:Open Source License
public static Instances matrix2instances(double[][] matrix, String[] descriptors, int[] attributeTypes) {
    FastVector attributesV = new FastVector(descriptors.length);
    int molecules = matrix[0].length;

    for (int i = 0; i < descriptors.length; i++) {
        if (attributeTypes[i] == Attribute.NUMERIC) {
            // numeric
            attributesV.addElement(new Attribute(descriptors[i], attributesV.size()));
        } else if (attributeTypes[i] == Attribute.NOMINAL) {
            // nominal
            // create a list with all nominal values
            Hashtable hashed = new Hashtable();
            for (int j = 0; j < molecules; j++) {
                hashed.put(new Double(matrix[i][j]), "");
            }

            // store list of nominal values in the Weka data structure
            FastVector attributeValues = new FastVector(hashed.size());
            Double tmp;
            for (Enumeration e = hashed.keys(); e.hasMoreElements();) {
                tmp = (Double) e.nextElement();
                attributeValues.addElement(tmp.toString());
                //System.out.println("NOMINAL " + tmp);
            }
            attributesV.addElement(new Attribute(descriptors[i], attributeValues, attributesV.size()));
        }
    }

    int descriptorSize = attributesV.size();
    Attribute attribute = null;

    // create molecule instances
    Instances instances = new Instances("MatrixInstances", attributesV, attributesV.size());

    // iterate over all instances (to generate them)
    double[] instance;
    for (int i = 0; i < molecules; i++) {
        instance = new double[descriptorSize];
        for (int j = 0; j < descriptorSize; j++) {
            attribute = (Attribute) attributesV.elementAt(j);
            if (Double.isNaN(matrix[j][i])) {
                instance[attribute.index()] = Instance.missingValue();
            } else {
                if (attributeTypes[j] == Attribute.NUMERIC) {
                    // numeric
                    instance[attribute.index()] = matrix[j][i];
                } else if (attributeTypes[j] == Attribute.NOMINAL) {
                    // nominal
                    instance[attribute.index()] = attribute.indexOfValue(Double.toString(matrix[j][i]));
                    if (instance[attribute.index()] == -1) {
                        // invalid nominal value
                        logger.error("Invalid nominal value.");
                        return null;
                    }
                }
            }
            attribute.index();
        }

        // add created molecule instance to molecule instances
        Instance inst = new Instance(1, instance);
        instances.add(inst);
        //System.out.println("instance (attr.:"+inst.numAttributes()+", vals:"+inst.numValues()+"): "+inst);
    }

    //System.out.println(instances.toString());
    return instances;
}
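Both joelib2 ExampleSetHelper methods above map raw values into the old-API Instance by hand: a NaN becomes Instance.missingValue(), and a nominal label is translated to its index with Attribute.indexOfValue before the row is added. Reduced to its core, under a hypothetical two-attribute layout (pre-3.6 API, names illustrative):

import weka.core.Attribute;
import weka.core.Instance;
import weka.core.Instances;

public class NominalRowSketch {
    static void addRow(Instances data, Attribute nominalAtt, double numeric, String label) {
        double[] vals = new double[data.numAttributes()];
        vals[0] = Double.isNaN(numeric) ? Instance.missingValue() : numeric;
        int idx = nominalAtt.indexOfValue(label);      // -1 if the label is not a declared value
        vals[1] = (idx == -1) ? Instance.missingValue() : idx;
        data.add(new Instance(1.0, vals));             // weight 1.0, as in the examples above
    }
}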