List of usage examples for weka.core.Instance.weight()
public double weight();
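All of the examples below use the same basic pattern: weight() returns the instance's (possibly fractional) weight, and callers accumulate it in place of a raw count of 1. A minimal standalone sketch of that pattern, assuming any ARFF file with a nominal class attribute (the data.arff path and class-index choice are placeholders):

import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class WeightedClassCounts {
    public static void main(String[] args) throws Exception {
        // Placeholder path; any ARFF file with a nominal class works.
        Instances data = DataSource.read("data.arff");
        data.setClassIndex(data.numAttributes() - 1);

        // Accumulate instance weights per class instead of counting 1 each.
        double[] classProbs = new double[data.numClasses()];
        for (int i = 0; i < data.numInstances(); i++) {
            Instance inst = data.instance(i);
            classProbs[(int) inst.classValue()] += inst.weight();
        }
        for (int c = 0; c < classProbs.length; c++) {
            System.out.println(data.classAttribute().value(c) + ": " + classProbs[c]);
        }
    }
}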
From source file:org.scripps.branch.classifier.ManualTree.java
License:Open Source License
/**
 * Backfits the given data into the tree.
 */
public void backfitData(Instances data) throws Exception {
    // Compute initial class counts
    double[] classProbs = new double[data.numClasses()];
    for (int i = 0; i < data.numInstances(); i++) {
        Instance inst = data.instance(i);
        classProbs[(int) inst.classValue()] += inst.weight();
    }
    // Fit data into tree
    backfitData(data, classProbs);
}
From source file:org.scripps.branch.classifier.ManualTree.java
License:Open Source License
/**
 * Recursively backfits data into the tree.
 *
 * @param data the data to work with
 * @param classProbs the class distribution
 * @throws Exception if generation fails
 */
protected void backfitData(Instances data, double[] classProbs) throws Exception {
    // Make leaf if there are no training instances
    if (data.numInstances() == 0) {
        m_Attribute = -1;
        m_ClassDistribution = null;
        m_Prop = null;
        return;
    }

    // Check if node doesn't contain enough instances or is pure
    // or maximum depth reached
    m_ClassDistribution = classProbs.clone();

    /*
     * if (Utils.sum(m_ClassDistribution) < 2 * m_MinNum ||
     *     Utils.eq(m_ClassDistribution[Utils.maxIndex(m_ClassDistribution)],
     *              Utils.sum(m_ClassDistribution))) {
     *     // Make leaf
     *     m_Attribute = -1;
     *     m_Prop = null;
     *     return;
     * }
     */

    // Are we at an inner node?
    if (m_Attribute > -1) {
        // Compute new weights for subsets based on backfit data
        m_Prop = new double[m_Successors.length];
        for (int i = 0; i < data.numInstances(); i++) {
            Instance inst = data.instance(i);
            if (!inst.isMissing(m_Attribute)) {
                if (data.attribute(m_Attribute).isNominal()) {
                    m_Prop[(int) inst.value(m_Attribute)] += inst.weight();
                } else {
                    m_Prop[(inst.value(m_Attribute) < m_SplitPoint) ? 0 : 1] += inst.weight();
                }
            }
        }

        // If we only have missing values we can make this node into a leaf
        if (Utils.sum(m_Prop) <= 0) {
            m_Attribute = -1;
            m_Prop = null;
            return;
        }

        // Otherwise normalize the proportions
        Utils.normalize(m_Prop);

        // Split data
        Instances[] subsets = splitData(data);

        // Go through subsets
        for (int i = 0; i < subsets.length; i++) {
            // Compute distribution for current subset
            double[] dist = new double[data.numClasses()];
            for (int j = 0; j < subsets[i].numInstances(); j++) {
                dist[(int) subsets[i].instance(j).classValue()] += subsets[i].instance(j).weight();
            }
            // Backfit subset
            m_Successors[i].backfitData(subsets[i], dist);
        }

        // If unclassified instances are allowed, we don't need to store the
        // class distribution
        if (getAllowUnclassifiedInstances()) {
            m_ClassDistribution = null;
            return;
        }

        // Otherwise, if all successors are non-empty, we don't need to
        // store the class distribution
        for (int i = 0; i < subsets.length; i++) {
            if (m_Successors[i].m_ClassDistribution == null) {
                // An empty successor: keep this node's distribution
                return;
            }
        }
        m_ClassDistribution = null;

        // If we have at least two non-empty successors, we should keep this tree
        /*
         * int nonEmptySuccessors = 0;
         * for (int i = 0; i < subsets.length; i++) {
         *     if (m_Successors[i].m_ClassDistribution != null) {
         *         nonEmptySuccessors++;
         *         if (nonEmptySuccessors > 1) {
         *             return;
         *         }
         *     }
         * }
         *
         * // Otherwise, this node is a leaf or should become a leaf
         * m_Successors = null;
         * m_Attribute = -1;
         * m_Prop = null;
         * return;
         */
    }
}
From source file:org.scripps.branch.classifier.ManualTree.java
License:Open Source License
/**
 * Builds classifier.
 *
 * @param data the data to train with
 * @throws Exception if something goes wrong or the data doesn't fit
 */
@Override
public void buildClassifier(Instances data) throws Exception {
    // Make sure K value is in range
    if (m_KValue > data.numAttributes() - 1)
        m_KValue = data.numAttributes() - 1;
    if (m_KValue < 1)
        m_KValue = (int) Utils.log2(data.numAttributes()) + 1;

    // can classifier handle the data?
    getCapabilities().testWithFail(data);

    // remove instances with missing class
    data = new Instances(data);
    data.deleteWithMissingClass();

    // only class? -> build ZeroR model
    if (data.numAttributes() == 1) {
        System.err.println("Cannot build model (only class attribute present in data!), "
                + "using ZeroR model instead!");
        m_ZeroR = new weka.classifiers.rules.ZeroR();
        m_ZeroR.buildClassifier(data);
        return;
    } else {
        m_ZeroR = null;
    }

    // Figure out appropriate datasets
    Instances train = null;
    Instances backfit = null;
    Random rand = data.getRandomNumberGenerator(m_randomSeed);
    if (m_NumFolds <= 0) {
        train = data;
    } else {
        data.randomize(rand);
        data.stratify(m_NumFolds);
        train = data.trainCV(m_NumFolds, 1, rand);
        backfit = data.testCV(m_NumFolds, 1);
    }

    // Set default instances for selection
    setRequiredInst(data);

    // Create the attribute indices window
    int[] attIndicesWindow = new int[data.numAttributes() - 1];
    int j = 0;
    for (int i = 0; i < attIndicesWindow.length; i++) {
        if (j == data.classIndex())
            j++; // do not include the class
        attIndicesWindow[i] = j++;
    }

    // Compute initial class counts
    double[] classProbs = new double[train.numClasses()];
    for (int i = 0; i < train.numInstances(); i++) {
        Instance inst = train.instance(i);
        classProbs[(int) inst.classValue()] += inst.weight();
    }

    Instances requiredInstances = getRequiredInst();

    // Build tree
    if (jsontree != null) {
        buildTree(train, classProbs, new Instances(data, 0), m_Debug, 0, jsontree, 0,
                m_distributionData, requiredInstances, listOfFc, cSetList, ccSer, d);
    } else {
        System.out.println("No json tree specified, failing to process tree");
    }
    setRequiredInst(requiredInstances);

    // Backfit if required
    if (backfit != null) {
        backfitData(backfit);
    }
}
From source file:org.scripps.branch.classifier.ManualTree.java
License:Open Source License
/**
 * Computes class distribution for an attribute.
 *
 * @param props
 * @param dists
 * @param att the attribute index
 * @param data the data to work with
 * @param givenSplitPoint the split point to use, or NaN to search for one
 * @param custom_classifiers map of custom classifiers keyed by id
 * @return a map with the chosen ("split_point") and original ("orig_split_point") split points
 * @throws Exception if something goes wrong
 */
protected HashMap<String, Double> distribution(double[][] props, double[][][] dists, int att,
        Instances data, double givenSplitPoint, HashMap<String, Classifier> custom_classifiers)
        throws Exception {

    HashMap<String, Double> mp = new HashMap<String, Double>();
    double splitPoint = givenSplitPoint;
    double origSplitPoint = 0;
    Attribute attribute = null;
    double[][] dist = null;
    int indexOfFirstMissingValue = -1;
    String CustomClassifierId = null;
    CustomSet cSet = null;

    if (att >= data.numAttributes() && att < data.numAttributes() + custom_classifiers.size()) {
        CustomClassifierId = getKeyinMap(custom_classifiers, att, data);
    } else if (att >= data.numAttributes() + custom_classifiers.size()) {
        cSet = getReqCustomSet(att - (data.numAttributes() - 1 + custom_classifiers.size()), cSetList);
    } else {
        attribute = data.attribute(att);
    }

    if (CustomClassifierId == null && cSet == null) {
        if (attribute.isNominal()) {
            // For nominal attributes
            dist = new double[attribute.numValues()][data.numClasses()];
            for (int i = 0; i < data.numInstances(); i++) {
                Instance inst = data.instance(i);
                if (inst.isMissing(att)) {
                    // Skip missing values at this stage
                    if (indexOfFirstMissingValue < 0) {
                        indexOfFirstMissingValue = i;
                    }
                    continue;
                }
                dist[(int) inst.value(att)][(int) inst.classValue()] += inst.weight();
            }
        } else {
            // For numeric attributes
            double[][] currDist = new double[2][data.numClasses()];
            dist = new double[2][data.numClasses()];

            // Sort data
            data.sort(att);

            // Move all instances into second subset
            for (int j = 0; j < data.numInstances(); j++) {
                Instance inst = data.instance(j);
                if (inst.isMissing(att)) {
                    // Can stop as soon as we hit a missing value
                    indexOfFirstMissingValue = j;
                    break;
                }
                currDist[1][(int) inst.classValue()] += inst.weight();
            }

            // Value before splitting
            double priorVal = priorVal(currDist);

            // Save initial distribution
            for (int j = 0; j < currDist.length; j++) {
                System.arraycopy(currDist[j], 0, dist[j], 0, dist[j].length);
            }

            if (Double.isNaN(splitPoint)) {
                // Try all possible split points
                double currSplit = data.instance(0).value(att);
                double currVal, bestVal = -Double.MAX_VALUE;
                for (int i = 0; i < data.numInstances(); i++) {
                    Instance inst = data.instance(i);
                    if (inst.isMissing(att)) {
                        // Can stop as soon as we hit a missing value
                        break;
                    }
                    // Can we place a sensible split point here?
                    if (inst.value(att) > currSplit) {
                        // Compute gain for split point
                        currVal = gain(currDist, priorVal);
                        // Is the current split point the best point so far?
                        if (currVal > bestVal) {
                            // Store value of current point
                            bestVal = currVal;
                            // Save split point
                            splitPoint = (inst.value(att) + currSplit) / 2.0;
                            origSplitPoint = splitPoint;
                            // Save distribution
                            for (int j = 0; j < currDist.length; j++) {
                                System.arraycopy(currDist[j], 0, dist[j], 0, dist[j].length);
                            }
                        }
                    }
                    currSplit = inst.value(att);
                    // Shift over the weight
                    currDist[0][(int) inst.classValue()] += inst.weight();
                    currDist[1][(int) inst.classValue()] -= inst.weight();
                }
            } else {
                // Split data set using given split point
                double currSplit = data.instance(0).value(att);
                double currVal, bestVal = -Double.MAX_VALUE;
                for (int i = 0; i < data.numInstances(); i++) {
                    Instance inst = data.instance(i);
                    if (inst.isMissing(att)) {
                        // Can stop as soon as we hit a missing value
                        break;
                    }
                    if (inst.value(att) > currSplit) {
                        // Compute gain for split point
                        currVal = gain(currDist, priorVal);
                        // Is the current split point the best point so far?
                        if (currVal > bestVal) {
                            // Store value of current point
                            bestVal = currVal;
                            // Save computed split point
                            origSplitPoint = (inst.value(att) + currSplit) / 2.0;
                        }
                    }
                    currSplit = inst.value(att);
                    // Shift over the weight
                    currDist[0][(int) inst.classValue()] += inst.weight();
                    currDist[1][(int) inst.classValue()] -= inst.weight();
                    if (inst.value(att) <= splitPoint) {
                        // Save distribution since split point is specified
                        for (int j = 0; j < currDist.length; j++) {
                            System.arraycopy(currDist[j], 0, dist[j], 0, dist[j].length);
                        }
                    }
                }
            }
        }
    } else if (CustomClassifierId != null) {
        Classifier fc = custom_classifiers.get(CustomClassifierId);
        dist = new double[data.numClasses()][data.numClasses()];
        Instance inst;
        for (int i = 0; i < data.numInstances(); i++) {
            inst = data.instance(i);
            double predictedClass = fc.classifyInstance(inst);
            if (predictedClass != Instance.missingValue()) {
                dist[(int) predictedClass][(int) inst.classValue()] += inst.weight();
            }
        }
    } else if (cSet != null) {
        dist = new double[data.numClasses()][data.numClasses()];
        JsonNode vertices = mapper.readTree(cSet.getConstraints());
        ArrayList<double[]> attrVertices = generateVerticesList(vertices);
        List<Attribute> aList = generateAttributeList(cSet, data, d);
        double[] testPoint = new double[2];
        int ctr = 0;
        for (int k = 0; k < data.numInstances(); k++) {
            testPoint = new double[2];
            ctr = 0;
            for (Attribute a : aList) {
                if (!data.instance(k).isMissing(a)) {
                    testPoint[ctr] = data.instance(k).value(a);
                    ctr++;
                }
            }
            int check = checkPointInPolygon(attrVertices, testPoint);
            dist[check][(int) data.instance(k).classValue()] += data.instance(k).weight();
        }
    }

    // Compute weights for subsets
    props[att] = new double[dist.length];
    for (int k = 0; k < props[att].length; k++) {
        props[att][k] = Utils.sum(dist[k]);
    }
    if (Utils.eq(Utils.sum(props[att]), 0)) {
        for (int k = 0; k < props[att].length; k++) {
            props[att][k] = 1.0 / props[att].length;
        }
    } else {
        Utils.normalize(props[att]);
    }

    // Any instances with missing values?
    if (indexOfFirstMissingValue > -1) {
        // Distribute weights for instances with missing values
        for (int i = indexOfFirstMissingValue; i < data.numInstances(); i++) {
            Instance inst = data.instance(i);
            if (attribute.isNominal()) {
                // Need to check if attribute value is missing
                if (inst.isMissing(att)) {
                    for (int j = 0; j < dist.length; j++) {
                        dist[j][(int) inst.classValue()] += props[att][j] * inst.weight();
                    }
                }
            } else {
                // Can be sure that value is missing, so no test required
                for (int j = 0; j < dist.length; j++) {
                    dist[j][(int) inst.classValue()] += props[att][j] * inst.weight();
                }
            }
        }
    }

    // Return distribution and split point
    dists[att] = dist;
    mp.put("split_point", splitPoint);
    mp.put("orig_split_point", origSplitPoint);
    return mp;
}
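The weight-shifting idiom above (adding an instance's weight to the left distribution and subtracting it from the right as the candidate split point advances) is what makes the split search linear after sorting. A minimal sketch of the same idiom in isolation; it assumes the data is sorted on the attribute with no missing values, and replaces the class's gain() helper with a toy purity score:

import weka.core.Instance;
import weka.core.Instances;

public class WeightShiftSketch {
    // Returns the best split point for attribute att, or NaN if none found.
    static double bestSplit(Instances data, int att) {
        // Start with all weight in the right subset.
        double[][] currDist = new double[2][data.numClasses()];
        for (int i = 0; i < data.numInstances(); i++) {
            Instance inst = data.instance(i);
            currDist[1][(int) inst.classValue()] += inst.weight();
        }
        double bestVal = -Double.MAX_VALUE, splitPoint = Double.NaN;
        double currSplit = data.instance(0).value(att);
        for (int i = 0; i < data.numInstances(); i++) {
            Instance inst = data.instance(i);
            if (inst.value(att) > currSplit) {
                double currVal = purity(currDist); // stand-in for the gain() helper
                if (currVal > bestVal) {
                    bestVal = currVal;
                    splitPoint = (inst.value(att) + currSplit) / 2.0;
                }
            }
            currSplit = inst.value(att);
            // Shift this instance's weight from the right subset to the left.
            currDist[0][(int) inst.classValue()] += inst.weight();
            currDist[1][(int) inst.classValue()] -= inst.weight();
        }
        return splitPoint;
    }

    // Toy purity score: summed weight of each subset's majority class.
    static double purity(double[][] dist) {
        double val = 0;
        for (double[] d : dist) {
            double max = 0;
            for (double w : d) max = Math.max(max, w);
            val += max;
        }
        return val;
    }
}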
From source file:org.scripps.branch.classifier.ManualTree.java
License:Open Source License
/**
 * Splits instances into subsets based on the given split.
 *
 * @param data the data to work with
 * @return the subsets of instances
 * @throws Exception if something goes wrong
 */
protected Instances[] splitData(Instances data) throws Exception {
    // Allocate array of Instances objects
    Instances[] subsets = new Instances[m_Prop.length];
    for (int i = 0; i < m_Prop.length; i++) {
        subsets[i] = new Instances(data, data.numInstances());
    }

    if (m_Attribute >= data.numAttributes()) {
        if (m_Attribute >= listOfFc.size() + data.numAttributes() - 1) {
            CustomSet cSet = getReqCustomSet(
                    m_Attribute - (data.numAttributes() - 1 + listOfFc.size()), cSetList);
            JsonNode vertices = mapper.readTree(cSet.getConstraints());
            ArrayList<double[]> attrVertices = generateVerticesList(vertices);
            List<Attribute> aList = generateAttributeList(cSet, data, d);
            double[] testPoint = new double[2];
            int ctr = 0;
            for (int k = 0; k < data.numInstances(); k++) {
                ctr = 0;
                for (Attribute a : aList) {
                    testPoint[ctr] = data.instance(k).value(a);
                    ctr++;
                }
                int check = checkPointInPolygon(attrVertices, testPoint);
                subsets[check].add(data.instance(k));
            }
        } else {
            Classifier fc;
            double predictedClass;
            // Go through the data
            for (int i = 0; i < data.numInstances(); i++) {
                // Get instance
                Instance inst = data.instance(i);
                String classifierId = getKeyinMap(listOfFc, m_Attribute, data);
                fc = listOfFc.get(classifierId);
                predictedClass = fc.classifyInstance(inst);
                if (predictedClass != Instance.missingValue()) {
                    subsets[(int) predictedClass].add(inst);
                    continue;
                }
                // Else throw an exception
                throw new IllegalArgumentException("Unknown attribute type");
            }
        }
    } else {
        // Go through the data
        for (int i = 0; i < data.numInstances(); i++) {
            // Get instance
            Instance inst = data.instance(i);

            // Does the instance have a missing value?
            if (inst.isMissing(m_Attribute)) {
                // Split instance up
                for (int k = 0; k < m_Prop.length; k++) {
                    if (m_Prop[k] > 0) {
                        Instance copy = (Instance) inst.copy();
                        copy.setWeight(m_Prop[k] * inst.weight());
                        subsets[k].add(copy);
                    }
                }
                // Proceed to next instance
                continue;
            }

            // Do we have a nominal attribute?
            if (data.attribute(m_Attribute).isNominal()) {
                subsets[(int) inst.value(m_Attribute)].add(inst);
                // Proceed to next instance
                continue;
            }

            // Do we have a numeric attribute?
            if (data.attribute(m_Attribute).isNumeric()) {
                subsets[(inst.value(m_Attribute) < m_SplitPoint) ? 0 : 1].add(inst);
                // Proceed to next instance
                continue;
            }

            // Else throw an exception
            throw new IllegalArgumentException("Unknown attribute type");
        }
    }

    // Save memory
    for (int i = 0; i < m_Prop.length; i++) {
        subsets[i].compactify();
    }

    // Return the subsets
    return subsets;
}
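The missing-value branch above is the C4.5-style trick: rather than dropping an instance whose split attribute is missing, a weighted copy is sent down every branch. A minimal sketch of just that step, assuming prop holds the normalized branch proportions (as m_Prop does in the example):

import weka.core.Instance;
import weka.core.Instances;

class MissingValueSplit {
    // Send a weighted copy of inst down every branch; prop must sum to 1.
    static void distributeMissing(Instance inst, double[] prop, Instances[] subsets) {
        for (int k = 0; k < prop.length; k++) {
            if (prop[k] > 0) {
                Instance copy = (Instance) inst.copy();
                copy.setWeight(prop[k] * inst.weight()); // fractional weight
                subsets[k].add(copy);
            }
        }
    }
}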
From source file:preprocess.StringToWordVector.java
License:Open Source License
/**
 * Converts the instance w/o normalization.
 *
 * @param instance the instance to convert
 * @param v
 * @return the number of attributes copied directly (the converted
 *         instance itself is added to v)
 */
private int convertInstancewoDocNorm(Instance instance, FastVector v) {
    // Convert the instance into a sorted set of indexes
    TreeMap contained = new TreeMap();

    // Copy all non-converted attributes from input to output
    int firstCopy = 0;
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (!m_SelectedRange.isInRange(i)) {
            if (getInputFormat().attribute(i).type() != Attribute.STRING) {
                // Add simple nominal and numeric attributes directly
                if (instance.value(i) != 0.0) {
                    contained.put(new Integer(firstCopy), new Double(instance.value(i)));
                }
            } else {
                if (instance.isMissing(i)) {
                    contained.put(new Integer(firstCopy), new Double(Instance.missingValue()));
                } else {
                    // If this is a string attribute, we have to first add
                    // this value to the range of possible values, then add
                    // its new internal index.
                    if (outputFormatPeek().attribute(firstCopy).numValues() == 0) {
                        // Note that the first string value in a
                        // SparseInstance doesn't get printed.
                        outputFormatPeek().attribute(firstCopy)
                                .addStringValue("Hack to defeat SparseInstance bug");
                    }
                    int newIndex = outputFormatPeek().attribute(firstCopy)
                            .addStringValue(instance.stringValue(i));
                    contained.put(new Integer(firstCopy), new Double(newIndex));
                }
            }
            firstCopy++;
        }
    }

    for (int j = 0; j < instance.numAttributes(); j++) {
        // if ((getInputFormat().attribute(j).type() == Attribute.STRING)
        if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {
            m_Tokenizer.tokenize(instance.stringValue(j));
            while (m_Tokenizer.hasMoreElements()) {
                String word = (String) m_Tokenizer.nextElement();
                if (this.m_lowerCaseTokens == true)
                    word = word.toLowerCase();
                word = m_Stemmer.stem(word);
                Integer index = (Integer) m_Dictionary.get(word);
                if (index != null) {
                    if (m_OutputCounts) {
                        // Separate if here rather than two lines down
                        // to avoid a hashtable lookup
                        Double count = (Double) contained.get(index);
                        if (count != null) {
                            contained.put(index, new Double(count.doubleValue() + 1.0));
                        } else {
                            contained.put(index, new Double(1));
                        }
                    } else {
                        contained.put(index, new Double(1));
                    }
                }
            }
        }
    }

    // Doing TFTransform
    if (m_TFTransform == true) {
        Iterator it = contained.keySet().iterator();
        for (int i = 0; it.hasNext(); i++) {
            Integer index = (Integer) it.next();
            if (index.intValue() >= firstCopy) {
                double val = ((Double) contained.get(index)).doubleValue();
                val = Math.log(val + 1);
                contained.put(index, new Double(val));
            }
        }
    }

    // Doing IDFTransform
    if (m_IDFTransform == true) {
        Iterator it = contained.keySet().iterator();
        for (int i = 0; it.hasNext(); i++) {
            Integer index = (Integer) it.next();
            if (index.intValue() >= firstCopy) {
                double val = ((Double) contained.get(index)).doubleValue();
                val = val * Math.log(m_NumInstances / (double) m_DocsCounts[index.intValue()]);
                contained.put(index, new Double(val));
            }
        }
    }

    // Convert the set to structures needed to create a sparse instance.
    double[] values = new double[contained.size()];
    int[] indices = new int[contained.size()];
    Iterator it = contained.keySet().iterator();
    for (int i = 0; it.hasNext(); i++) {
        Integer index = (Integer) it.next();
        Double value = (Double) contained.get(index);
        values[i] = value.doubleValue();
        indices[i] = index.intValue();
    }

    // The original instance's weight is carried into the sparse copy
    Instance inst = new SparseInstance(instance.weight(), values, indices,
            outputFormatPeek().numAttributes());
    inst.setDataset(outputFormatPeek());
    v.addElement(inst);
    return firstCopy;
}
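The key point for weight() in this filter is the SparseInstance constructor: its first argument carries the source instance's weight into the converted instance, so weighted corpora keep their weights through vectorization. A minimal sketch of that hand-off, assuming the Weka 3.6-era API used in this example and an already-computed sparse representation:

import weka.core.Instance;
import weka.core.Instances;
import weka.core.SparseInstance;

class WeightPreservingConversion {
    // Build a sparse copy in outFormat that keeps the source instance's weight.
    static Instance toSparse(Instance source, double[] values, int[] indices,
            Instances outFormat) {
        Instance converted = new SparseInstance(source.weight(), values, indices,
                outFormat.numAttributes());
        converted.setDataset(outFormat);
        return converted;
    }
}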
From source file:resample.OverSubsample.java
License:Open Source License
/**
 * Creates a subsample of the current set of input instances. The output
 * instances are pushed onto the output queue for collection.
 */
private void createSubsample() {
    int classI = getInputFormat().classIndex();

    // Sort according to class attribute.
    getInputFormat().sort(classI);

    // Determine where each class starts in the sorted dataset
    int[] classIndices = getClassIndices();

    // Get the existing class distribution
    int[] counts = new int[getInputFormat().numClasses()];
    double[] weights = new double[getInputFormat().numClasses()];
    int max = -1;
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
        Instance current = getInputFormat().instance(i);
        if (current.classIsMissing() == false) {
            counts[(int) current.classValue()]++;
            weights[(int) current.classValue()] += current.weight();
        }
    }

    // Convert from total weight to average weight
    for (int i = 0; i < counts.length; i++) {
        if (counts[i] > 0) {
            weights[i] = weights[i] / counts[i];
        }
    }

    // Find the class with the maximum number of instances
    int maxIndex = -1;
    for (int i = 0; i < counts.length; i++) {
        if ((max < 0) && (counts[i] > 0)) {
            max = counts[i];
            maxIndex = i;
        } else if ((counts[i] > max) && (counts[i] > 0)) {
            max = counts[i];
            maxIndex = i;
        }
    }

    if (max < 0) {
        System.err.println("SpreadSubsample: *warning* none of the classes have any values in them.");
        return;
    }

    // Determine the new distribution
    int[] new_counts = new int[getInputFormat().numClasses()];
    for (int i = 0; i < counts.length; i++) {
        new_counts[i] = (int) Math.abs(Math.max(counts[i], max * m_DistributionSpread));
        if (i == maxIndex) {
            if (m_DistributionSpread > 0 && m_DistributionSpread < 1.0) {
                // don't undersample the majority class!
                new_counts[i] = counts[i];
            }
        }
        if (m_DistributionSpread == 0) {
            new_counts[i] = counts[i];
        }
        if (m_MaxCount > 0) {
            new_counts[i] = Math.min(new_counts[i], m_MaxCount);
        }
    }

    // Sample with replacement
    Random random = new Random(m_RandomSeed);
    for (int j = 0; j < new_counts.length; j++) {
        double newWeight = 1.0;
        if (m_AdjustWeights && (new_counts[j] > 0)) {
            // Rescale so the class's total weight is unchanged after resampling
            newWeight = weights[j] * counts[j] / new_counts[j];
        }
        int index = -1;
        for (int k = 0; k < new_counts[j]; k++) {
            index = classIndices[j]
                    + (Math.abs(random.nextInt()) % (classIndices[j + 1] - classIndices[j]));
            if (index >= 0) {
                Instance newInst = (Instance) getInputFormat().instance(index).copy();
                if (m_AdjustWeights) {
                    newInst.setWeight(newWeight);
                }
                push(newInst);
            }
        }
    }
}
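When m_AdjustWeights is set, each sampled copy is rescaled so that a class's total weight is invariant under resampling: newWeight = avgWeight * oldCount / newCount. A quick worked check of that invariant with made-up numbers:

class WeightAdjustCheck {
    public static void main(String[] args) {
        int oldCount = 20, newCount = 60;       // class oversampled 3x
        double avgWeight = 1.5;                  // average weight before resampling
        double oldTotal = avgWeight * oldCount;  // 30.0
        double newWeight = avgWeight * oldCount / newCount; // 0.5 per sampled copy
        // Total class weight is preserved: 30.0 == 60 * 0.5
        System.out.println(oldTotal + " == " + newWeight * newCount);
    }
}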
From source file:svmal.SVMStrategy.java
public static Instances InstancesToInstances2(Instances insts) {
    Instances result = new Instances(insts, 0, 0);
    for (int i = 0; i < insts.numInstances(); i++) {
        Instance orig = insts.get(i);
        // Carry the original weight over into the new Instance2
        Instance2 inst2 = new Instance2(orig.weight(), orig.toDoubleArray());
        inst2.setDataset(result);
        result.add(inst2);
    }
    return result;
}
From source file:test.org.moa.opencl.IBk.java
License:Open Source License
/**
 * Turn the list of nearest neighbors into a probability distribution.
 *
 * @param neighbours the list of nearest neighboring instances
 * @param distances the distances of the neighbors
 * @return the probability distribution
 * @throws Exception if computation goes wrong or has no class attribute
 */
protected double[] makeDistribution(Instances neighbours, double[] distances) throws Exception {
    double total = 0, weight;
    double[] distribution = new double[m_NumClasses];

    // Set up a correction to the estimator
    if (m_ClassType == Attribute.NOMINAL) {
        for (int i = 0; i < m_NumClasses; i++) {
            distribution[i] = 1.0 / Math.max(1, m_Train.numInstances());
        }
        total = (double) m_NumClasses / Math.max(1, m_Train.numInstances());
    }

    for (int i = 0; i < neighbours.numInstances(); i++) {
        // Collect class counts
        Instance current = neighbours.instance(i);
        distances[i] = distances[i] * distances[i];
        distances[i] = Math.sqrt(distances[i] / m_NumAttributesUsed);
        switch (m_DistanceWeighting) {
        case WEIGHT_INVERSE:
            weight = 1.0 / (distances[i] + 0.001); // to avoid div by zero
            break;
        case WEIGHT_SIMILARITY:
            weight = 1.0 - distances[i];
            break;
        default: // WEIGHT_NONE
            weight = 1.0;
            break;
        }
        weight *= current.weight();
        try {
            switch (m_ClassType) {
            case Attribute.NOMINAL:
                distribution[(int) current.classValue()] += weight;
                break;
            case Attribute.NUMERIC:
                distribution[0] += current.classValue() * weight;
                break;
            }
        } catch (Exception ex) {
            throw new Error("Data has no class attribute!");
        }
        total += weight;
    }

    // Normalise distribution
    if (total > 0) {
        Utils.normalize(distribution, total);
    }
    return distribution;
}
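Note the order of operations above: the distance-based weight is computed first and then multiplied by current.weight(), so a pre-weighted training instance counts proportionally more in the vote. A minimal sketch of that combination for a nominal class, assuming inverse distance weighting:

import weka.core.Instance;
import weka.core.Instances;

class WeightedVote {
    // Combine distance weighting with per-instance weights.
    static double[] vote(Instances neighbours, double[] distances, int numClasses) {
        double[] distribution = new double[numClasses];
        double total = 0;
        for (int i = 0; i < neighbours.numInstances(); i++) {
            Instance current = neighbours.instance(i);
            double w = 1.0 / (distances[i] + 0.001); // inverse distance, avoids div by zero
            w *= current.weight();                   // scale by the instance's own weight
            distribution[(int) current.classValue()] += w;
            total += w;
        }
        if (total > 0) {
            for (int c = 0; c < numClasses; c++) distribution[c] /= total;
        }
        return distribution;
    }
}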
From source file:themeextractor.filters.MauiFilter.java
License:Open Source License
/**
 * Builds the classifier.
 */
private void buildClassifier() throws Exception {
    // Generate input format for classifier
    FastVector atts = new FastVector();
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (i == documentAtt) {
            atts.addElement(new Attribute("Term_frequency")); // 2
            atts.addElement(new Attribute("IDF")); //
            atts.addElement(new Attribute("TFxIDF")); //
            atts.addElement(new Attribute("First_occurrence")); //
            atts.addElement(new Attribute("Last_occurrence")); //
            atts.addElement(new Attribute("Spread")); //
            atts.addElement(new Attribute("Domain_keyphraseness")); //
            atts.addElement(new Attribute("Length")); //
            atts.addElement(new Attribute("Generality")); //
            atts.addElement(new Attribute("Node_degree")); //
            atts.addElement(new Attribute("Semantic_relatedness")); //
            atts.addElement(new Attribute("Wikipedia_keyphraseness")); //
            atts.addElement(new Attribute("Inverse_Wikip_frequency")); //
            atts.addElement(new Attribute("Total_Wikip_keyphraseness")); // 13
        } else if (i == keyphrasesAtt) {
            if (nominalClassValue) {
                FastVector vals = new FastVector(2);
                vals.addElement("False");
                vals.addElement("True");
                atts.addElement(new Attribute("Keyphrase?", vals));
            } else {
                atts.addElement(new Attribute("Keyphrase?"));
            }
        }
    }

    classifierData = new Instances("ClassifierData", atts, 0);
    classifierData.setClassIndex(numFeatures);

    if (debugMode) {
        System.err.println("--- Converting instances for classifier");
    }
    int totalDocuments = getInputFormat().numInstances();

    // Convert pending input instances into data for classifier
    for (int i = 0; i < totalDocuments; i++) {
        Instance current = getInputFormat().instance(i);

        // Get the key phrases for the document
        String keyphrases = current.stringValue(keyphrasesAtt);
        HashMap<String, Counter> hashKeyphrases = getGivenKeyphrases(keyphrases);

        // Get the phrases for the document
        HashMap<String, Candidate> candidateList = allCandidates.get(current);

        // Compute the feature values for each phrase and
        // add the instance to the data for the classifier
        int countPos = 0;
        int countNeg = 0;
        if (debugMode) {
            System.err.println("--- Computing features for document " + i + " out of "
                    + totalDocuments + "...");
        }
        for (Candidate candidate : candidateList.values()) {
            // ignore all candidates that appear less than a threshold
            if (candidate.getFrequency() < minOccurFrequency) {
                continue;
            }
            // compute feature values
            double[] vals = computeFeatureValues(candidate, true, hashKeyphrases, candidateList);
            if (vals[vals.length - 1] == 0) {
                countNeg++;
            } else {
                countPos++;
            }
            // The new training instance inherits the document's weight
            Instance inst = new Instance(current.weight(), vals);
            // System.out.println(candidate + "\t" + inst);
            classifierData.add(inst);
        }
        if (debugMode) {
            System.err.println(countPos + " positive; " + countNeg + " negative instances");
        }
    }

    if (debugMode) {
        System.err.println("--- Building classifier");
    }
    if (classifier == null) {
        // Build classifier
        if (nominalClassValue) {
            // FilteredClassifier fclass = new FilteredClassifier();
            // fclass.setClassifier(new NaiveBayesSimple());
            // fclass.setFilter(new Discretize());
            // classifier = fclass;
            classifier = new Bagging();
            // try also:
            // classifier.setOptions(Utils.splitOptions(
            //     "-P 10 -S 1 -I 10 -W weka.classifiers.trees.J48 -- -U -M 2"));
        } else {
            classifier = new Bagging();
            String optionsString = "-P 100 -S 1 -I 10 -W weka.classifiers.trees.M5P -- -U -M 7.0";
            String[] options = Utils.splitOptions(optionsString);
            classifier.setOptions(options);
        }
    }
    classifier.buildClassifier(classifierData);
    if (debugMode) {
        System.err.println(classifier);
    }

    // Save space
    classifierData = new Instances(classifierData, 0);
}