List of usage examples for weka.core Instance isMissing
public boolean isMissing(Attribute att);
public boolean isMissing(int attIndex);  // overload used by most examples below
From source file:etc.aloe.filters.StringToDictionaryVector.java
License:Open Source License
/**
 * Converts the instance without document-length normalization.
 *
 * Copies all non-string input attributes through, re-registers string values in
 * the output header, then appends one term-count feature per selected dictionary
 * term (optionally TF/IDF transformed).
 *
 * @param instance  the instance to convert
 * @param converted the output list the converted instance is appended to
 * @return the document "length" (sqrt of the sum of squared feature values),
 *         used by the caller for normalization
 */
private double convertInstancewoDocNorm(Instance instance, ArrayList<Instance> converted) {
    if (stringAttributeIndex < 0) {
        throw new IllegalStateException("String attribute index not valid");
    }
    int numOldValues = instance.numAttributes();
    double[] newValues = new double[numOldValues + m_selectedTerms.size()];

    // Copy all attributes from input to output
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (getInputFormat().attribute(i).type() != Attribute.STRING) {
            // Add simple nominal and numeric attributes directly; zeros are left
            // implicit because the result is a SparseInstance.
            if (instance.value(i) != 0.0) {
                newValues[i] = instance.value(i);
            }
        } else {
            if (instance.isMissing(i)) {
                newValues[i] = Utils.missingValue();
            } else {
                // If this is a string attribute, we have to first add this value
                // to the range of possible values, then add its new internal index.
                if (outputFormatPeek().attribute(i).numValues() == 0) {
                    // Note that the first string value in a SparseInstance
                    // doesn't get printed, so burn index 0 with a dummy value.
                    outputFormatPeek().attribute(i).addStringValue("Hack to defeat SparseInstance bug");
                }
                int newIndex = outputFormatPeek().attribute(i).addStringValue(instance.stringValue(i));
                newValues[i] = newIndex;
            }
        }
    }

    String stringValue = instance.stringValue(stringAttributeIndex);
    double docLength = 0;

    // Count non-overlapping occurrences of every selected term in the document.
    HashMap<String, Integer> termMatches = m_selectedTermsTrie.countNonoverlappingMatches(stringValue);
    for (Map.Entry<String, Integer> entry : termMatches.entrySet()) {
        String term = entry.getKey();
        int termIdx = m_selectedTermIndices.get(term);
        double matches = entry.getValue();
        // Binary presence/absence unless counts were requested.
        if (!m_OutputCounts && matches > 0) {
            matches = 1;
        }
        if (matches > 0) {
            if (m_TFTransform == true) {
                // log(1 + tf) damping
                matches = Math.log(matches + 1);
            }
            if (m_IDFTransform == true) {
                // tf * log(N / df)
                matches = matches * Math.log(m_NumInstances / (double) m_DocsCounts[termIdx]);
            }
            newValues[numOldValues + termIdx] = matches;
            docLength += matches * matches;
        }
    }

    Instance result = new SparseInstance(instance.weight(), newValues);
    converted.add(result);
    return Math.sqrt(docLength);
}
From source file:etc.aloe.filters.WordFeaturesExtractor.java
License:Open Source License
protected List<List<String>> tokenizeDocuments(Instances instances) { //Convert all instances into term lists List<List<String>> documents = new ArrayList<List<String>>(); for (int i = 0; i < instances.size(); i++) { Instance instance = instances.get(i); if (instance.isMissing(selectedAttributeIndex) == false) { List<String> words = tokenizeDocument(instance); documents.add(words);/* ww w .j a va 2s .co m*/ } } return documents; }
From source file:etc.aloe.filters.WordFeaturesExtractor.java
License:Open Source License
@Override protected Instance process(Instance instance) throws Exception { if (selectedAttributeIndex < 0) { throw new IllegalStateException("String attribute not set"); }// ww w .ja v a 2 s . com int numOldValues = instance.numAttributes(); int numNewFeatures = unigrams.size() + bigrams.size(); double[] newValues = new double[numOldValues + numNewFeatures]; // Copy all attributes from input to output for (int i = 0; i < getInputFormat().numAttributes(); i++) { if (getInputFormat().attribute(i).type() != Attribute.STRING) { // Add simple nominal and numeric attributes directly if (instance.value(i) != 0.0) { newValues[i] = instance.value(i); } } else { if (instance.isMissing(i)) { newValues[i] = Utils.missingValue(); } else { // If this is a string attribute, we have to first add // this value to the range of possible values, then add // its new internal index. if (outputFormatPeek().attribute(i).numValues() == 0) { // Note that the first string value in a // SparseInstance doesn't get printed. 
outputFormatPeek().attribute(i).addStringValue("Hack to defeat SparseInstance bug"); } int newIndex = outputFormatPeek().attribute(i).addStringValue(instance.stringValue(i)); newValues[i] = newIndex; } } } String stringValue = instance.stringValue(selectedAttributeIndex); if (instance.isMissing(selectedAttributeIndex) == false) { List<String> words = tokenizeDocument(instance); Set<String> wordSet = new HashSet<String>(words); for (int i = 0; i < unigrams.size(); i++) { String unigram = unigrams.get(i); int count = 0; if (wordSet.contains(unigram)) { //Count the times the word is in the document for (int w = 0; w < words.size(); w++) { if (words.get(w).equals(unigram)) { count += 1; } } } int featureIndex = numOldValues + i; newValues[featureIndex] = count; } for (int i = 0; i < bigrams.size(); i++) { Bigram bigram = bigrams.get(i); int count = bigram.getTimesInDocument(words); int featureIndex = numOldValues + unigrams.size() + i; newValues[featureIndex] = count; } } Instance result = new SparseInstance(instance.weight(), newValues); return result; }
From source file:filters.MauiFilter.java
License:Open Source License
/**
 * Converts a document instance into one output instance per candidate topic.
 *
 * For each candidate phrase extracted from the document text, computes its
 * feature vector, scores it with the trained classifier, and builds an output
 * instance containing the phrase, its features, its key-phrase probability and
 * (initially missing) rank. The resulting vector is then stable-sorted by
 * distance, TFxIDF and probability, and ranks are assigned in that order.
 *
 * @param instance the document instance (string attributes for file name,
 *                 text and, optionally, manually assigned key phrases)
 * @param training true when converting a training document
 * @return a FastVector of per-candidate output instances, sorted by probability
 * @throws Exception if classification or feature computation fails
 */
private FastVector convertInstance(Instance instance, boolean training) throws Exception {

    FastVector vector = new FastVector();

    String fileName = instance.stringValue(fileNameAtt);
    if (debugMode) {
        System.err.println("-- Converting instance for document " + fileName);
    }

    // Get the key phrases for the document (only present for training data).
    HashMap<String, Counter> hashKeyphrases = null;
    if (!instance.isMissing(keyphrasesAtt)) {
        String keyphrases = instance.stringValue(keyphrasesAtt);
        hashKeyphrases = getGivenKeyphrases(keyphrases);
    }

    // Get the document text
    String documentText = instance.stringValue(documentAtt);

    // Compute the candidate topics, reusing a precomputed list when available.
    HashMap<String, Candidate> candidateList;
    if (allCandidates != null && allCandidates.containsKey(instance)) {
        candidateList = allCandidates.get(instance);
    } else {
        candidateList = getCandidates(documentText);
    }
    if (debugMode) {
        System.err.println(candidateList.size() + " candidates ");
    }

    // Set indices for key attributes in the OUTPUT format.
    int tfidfAttIndex = documentAtt + 2;
    int distAttIndex = documentAtt + 3;
    // NOTE(review): probsAttIndex is reassigned inside the loop below to the
    // actual output position of the probability value; this initial value is
    // only a placeholder.
    int probsAttIndex = documentAtt + numFeatures;

    int countPos = 0;
    int countNeg = 0;

    // Go through the phrases and convert them into instances
    for (Candidate candidate : candidateList.values()) {

        // Skip rare candidates.
        if (candidate.getFrequency() < minOccurFrequency) {
            continue;
        }

        String name = candidate.getName();
        String orig = candidate.getBestFullForm();
        if (!vocabularyName.equals("none")) {
            orig = candidate.getTitle();
        }

        double[] vals = computeFeatureValues(candidate, training, hashKeyphrases, candidateList);

        Instance inst = new Instance(instance.weight(), vals);
        inst.setDataset(classifierData);

        // Get probability of a phrase being key phrase
        double[] probs = classifier.distributionForInstance(inst);
        double prob = probs[0];
        if (nominalClassValue) {
            prob = probs[1];
        }

        // Compute attribute values for final instance
        double[] newInst = new double[instance.numAttributes() + numFeatures + 2];
        int pos = 0;
        for (int i = 1; i < instance.numAttributes(); i++) {
            if (i == documentAtt) {
                // output of values for a given phrase:

                // Add phrase
                int index = outputFormatPeek().attribute(pos).addStringValue(name);
                newInst[pos++] = index;

                // Add original version (fall back to the phrase name itself)
                if (orig != null) {
                    index = outputFormatPeek().attribute(pos).addStringValue(orig);
                } else {
                    index = outputFormatPeek().attribute(pos).addStringValue(name);
                }
                newInst[pos++] = index;

                // Add features
                newInst[pos++] = inst.value(tfIndex);
                newInst[pos++] = inst.value(idfIndex);
                newInst[pos++] = inst.value(tfidfIndex);
                newInst[pos++] = inst.value(firstOccurIndex);
                newInst[pos++] = inst.value(lastOccurIndex);
                newInst[pos++] = inst.value(spreadOccurIndex);
                newInst[pos++] = inst.value(domainKeyphIndex);
                newInst[pos++] = inst.value(lengthIndex);
                newInst[pos++] = inst.value(generalityIndex);
                newInst[pos++] = inst.value(nodeDegreeIndex);
                newInst[pos++] = inst.value(semRelIndex);
                newInst[pos++] = inst.value(wikipKeyphrIndex);
                newInst[pos++] = inst.value(invWikipFreqIndex);
                newInst[pos++] = inst.value(totalWikipKeyphrIndex);

                // Add probability
                probsAttIndex = pos;
                newInst[pos++] = prob;

                // Set rank to missing (computed below)
                newInst[pos++] = Instance.missingValue();

            } else if (i == keyphrasesAtt) {
                newInst[pos++] = inst.classValue();
            } else {
                newInst[pos++] = instance.value(i);
            }
        }

        Instance ins = new Instance(instance.weight(), newInst);
        ins.setDataset(outputFormatPeek());
        vector.addElement(ins);

        if (inst.classValue() == 0) {
            countNeg++;
        } else {
            countPos++;
        }
    }
    if (debugMode) {
        System.err.println(countPos + " positive; " + countNeg + " negative instances");
    }

    // Sort phrases according to their distance (stable sort)
    double[] vals = new double[vector.size()];
    for (int i = 0; i < vals.length; i++) {
        vals[i] = ((Instance) vector.elementAt(i)).value(distAttIndex);
    }
    FastVector newVector = new FastVector(vector.size());
    int[] sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their tfxidf value (stable sort), descending
    // via negation.
    for (int i = 0; i < vals.length; i++) {
        vals[i] = -((Instance) vector.elementAt(i)).value(tfidfAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their probability (stable sort), descending
    // via 1 - p.
    for (int i = 0; i < vals.length; i++) {
        vals[i] = 1 - ((Instance) vector.elementAt(i)).value(probsAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Compute rank of phrases. Check for subphrases that are ranked
    // lower than superphrases and assign probability -1 and set the
    // rank to Integer.MAX_VALUE
    int rank = 1;
    for (int i = 0; i < vals.length; i++) {
        Instance currentInstance = (Instance) vector.elementAt(i);
        // Short cut: if phrase very unlikely make rank very low and continue
        // (vals[i] >= 1 means probability <= 0).
        if (Utils.grOrEq(vals[i], 1.0)) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
            continue;
        }

        // Otherwise look for super phrase starting with first phrase
        // in list that has same probability, TFxIDF value, and distance as
        // current phrase. We do this to catch all superphrases
        // that have same probability, TFxIDF value and distance as current
        // phrase.
        // NOTE(review): in this version startInd is computed but never used
        // afterwards — the superphrase handling appears to have been removed
        // or lost; confirm against the upstream MauiFilter source.
        int startInd = i;
        while (startInd < vals.length) {
            Instance inst = (Instance) vector.elementAt(startInd);
            if ((inst.value(tfidfAttIndex) != currentInstance.value(tfidfAttIndex))
                    || (inst.value(probsAttIndex) != currentInstance.value(probsAttIndex))
                    || (inst.value(distAttIndex) != currentInstance.value(distAttIndex))) {
                break;
            }
            startInd++;
        }
        currentInstance.setValue(probsAttIndex + 1, rank++);
    }
    return vector;
}
From source file:hr.irb.fastRandomForest.FastRandomTree.java
License:Open Source License
/** * Computes class distribution of an instance using the FastRandomTree.<p> * * In Weka's RandomTree, the distributions were normalized so that all * probabilities sum to 1; this would abolish the effect of instance weights * on voting. In FastRandomForest 0.97 onwards, the distributions are * normalized by dividing with the number of instances going into a leaf.<p> * /*from www . j ava 2 s.co m*/ * @param instance the instance to compute the distribution for * @return the computed class distribution * @throws Exception if computation fails */ @Override public double[] distributionForInstance(Instance instance) throws Exception { double[] returnedDist = null; if (m_Attribute > -1) { // ============================ node is not a leaf if (instance.isMissing(m_Attribute)) { // ---------------- missing value returnedDist = new double[m_MotherForest.m_Info.numClasses()]; // split instance up for (int i = 0; i < m_Successors.length; i++) { double[] help = m_Successors[i].distributionForInstance(instance); if (help != null) { for (int j = 0; j < help.length; j++) { returnedDist[j] += m_Prop[i] * help[j]; } } } } else if (m_MotherForest.m_Info.attribute(m_Attribute).isNominal()) { // ------ nominal //returnedDist = m_Successors[(int) instance.value(m_Attribute)] // .distributionForInstance(instance); // 0.99: new - binary splits (also) for nominal attributes if (instance.value(m_Attribute) == m_SplitPoint) { returnedDist = m_Successors[0].distributionForInstance(instance); } else { returnedDist = m_Successors[1].distributionForInstance(instance); } } else { // ------------------------------------------ numeric attributes if (instance.value(m_Attribute) < m_SplitPoint) { returnedDist = m_Successors[0].distributionForInstance(instance); } else { returnedDist = m_Successors[1].distributionForInstance(instance); } } return returnedDist; } else { // =============================================== node is a leaf return m_ClassProbs; } }
From source file:hr.irb.fastRandomForest.NakedFastRandomTree.java
License:Open Source License
public NakedFastRandomTree getNodeForInstance(final Instance i, final int max_depth) { if (max_depth == 0) { return this; }// www.j a va 2s . c o m if (m_Attribute == -1) { // Leaf node return this; } if (i.isMissing(m_Attribute)) { throw new IllegalStateException("NakedFastRandomTree does not support missing attributes"); } final int next_depth = max_depth - 1; final NakedFastRandomTree succ; if (m_MotherForest.m_Info.attribute(m_Attribute).isNominal()) { // nominal // 0.99: new - binary splits (also) for nominal attributes if (i.value(m_Attribute) == m_SplitPoint) { succ = (NakedFastRandomTree) m_Successors[0]; } else { succ = (NakedFastRandomTree) m_Successors[1]; } } else { // numeric if (i.value(m_Attribute) < m_SplitPoint) { succ = (NakedFastRandomTree) m_Successors[0]; } else { succ = (NakedFastRandomTree) m_Successors[1]; } } return succ.getNodeForInstance(i, next_depth); }
From source file:j48.BinC45Split.java
License:Open Source License
/** * Creates split on enumerated attribute. * * @exception Exception if something goes wrong */// w w w.j ava2 s . co m private void handleEnumeratedAttribute(Instances trainInstances) throws Exception { Distribution newDistribution, secondDistribution; int numAttValues; double currIG, currGR; Instance instance; int i; numAttValues = trainInstances.attribute(m_attIndex).numValues(); newDistribution = new Distribution(numAttValues, trainInstances.numClasses()); // Only Instances with known values are relevant. Enumeration enu = trainInstances.enumerateInstances(); while (enu.hasMoreElements()) { instance = (Instance) enu.nextElement(); if (!instance.isMissing(m_attIndex)) newDistribution.add((int) instance.value(m_attIndex), instance); } m_distribution = newDistribution; // For all values for (i = 0; i < numAttValues; i++) { if (Utils.grOrEq(newDistribution.perBag(i), m_minNoObj)) { secondDistribution = new Distribution(newDistribution, i); // Check if minimum number of Instances in the two // subsets. if (secondDistribution.check(m_minNoObj)) { m_numSubsets = 2; currIG = m_infoGainCrit.splitCritValue(secondDistribution, m_sumOfWeights); currGR = m_gainRatioCrit.splitCritValue(secondDistribution, m_sumOfWeights, currIG); if ((i == 0) || Utils.gr(currGR, m_gainRatio)) { m_gainRatio = currGR; m_infoGain = currIG; m_splitPoint = (double) i; m_distribution = secondDistribution; } } } } }
From source file:j48.BinC45Split.java
License:Open Source License
/**
 * Creates the best binary split on a numeric attribute.
 *
 * Scans all candidate cut points between consecutive distinct attribute
 * values, keeps the one with the highest information gain, then applies
 * C4.5's log2(#candidates)/totalWeight correction before accepting it.
 *
 * NOTE(review): assumes trainInstances is sorted on the attribute with
 * missing values at the end — confirm against the caller.
 *
 * @exception Exception if something goes wrong
 */
private void handleNumericAttribute(Instances trainInstances) throws Exception {

    int firstMiss;
    int next = 1;
    int last = 0;
    int index = 0;          // number of admissible candidate splits found
    int splitIndex = -1;    // index of the best split's left-most right neighbor
    double currentInfoGain;
    double defaultEnt;
    double minSplit;
    Instance instance;
    int i;

    // Current attribute is a numeric attribute.
    m_distribution = new Distribution(2, trainInstances.numClasses());

    // Only Instances with known values are relevant; all of them start in bag 1.
    Enumeration enu = trainInstances.enumerateInstances();
    i = 0;
    while (enu.hasMoreElements()) {
        instance = (Instance) enu.nextElement();
        if (instance.isMissing(m_attIndex))
            break;  // relies on missing values being sorted to the end
        m_distribution.add(1, instance);
        i++;
    }
    firstMiss = i;  // index of the first instance with a missing value

    // Compute minimum number of Instances required in each subset
    // (10% of the per-class average, clamped to [m_minNoObj, 25]).
    minSplit = 0.1 * (m_distribution.total()) / ((double) trainInstances.numClasses());
    if (Utils.smOrEq(minSplit, m_minNoObj))
        minSplit = m_minNoObj;
    else if (Utils.gr(minSplit, 25))
        minSplit = 25;

    // Enough Instances with known values?
    if (Utils.sm((double) firstMiss, 2 * minSplit))
        return;

    // Compute values of criteria for all possible split indices.
    defaultEnt = m_infoGainCrit.oldEnt(m_distribution);
    while (next < firstMiss) {

        // Only positions between genuinely different values (1e-5 tolerance)
        // are candidate cut points.
        if (trainInstances.instance(next - 1).value(m_attIndex) + 1e-5 < trainInstances.instance(next)
                .value(m_attIndex)) {

            // Move class values for all Instances up to next possible split point.
            m_distribution.shiftRange(1, 0, trainInstances, last, next);

            // Check if enough Instances in each subset and compute values for criteria.
            if (Utils.grOrEq(m_distribution.perBag(0), minSplit)
                    && Utils.grOrEq(m_distribution.perBag(1), minSplit)) {
                currentInfoGain = m_infoGainCrit.splitCritValue(m_distribution, m_sumOfWeights, defaultEnt);
                if (Utils.gr(currentInfoGain, m_infoGain)) {
                    m_infoGain = currentInfoGain;
                    splitIndex = next - 1;
                }
                index++;  // counts admissible splits; used for the correction below
            }
            last = next;
        }
        next++;
    }

    // Was there any useful split?
    if (index == 0)
        return;

    // Compute modified information gain for best split (C4.5 penalizes
    // the gain by log2(#candidates) spread over the total weight).
    m_infoGain = m_infoGain - (Utils.log2(index) / m_sumOfWeights);
    if (Utils.smOrEq(m_infoGain, 0))
        return;

    // Set instance variables' values to values for best split.
    m_numSubsets = 2;
    m_splitPoint = (trainInstances.instance(splitIndex + 1).value(m_attIndex)
            + trainInstances.instance(splitIndex).value(m_attIndex)) / 2;

    // In case we have a numerical precision problem we need to choose the
    // smaller value (the midpoint must not equal the right neighbor).
    if (m_splitPoint == trainInstances.instance(splitIndex + 1).value(m_attIndex)) {
        m_splitPoint = trainInstances.instance(splitIndex).value(m_attIndex);
    }

    // Restore distribution for best split.
    m_distribution = new Distribution(2, trainInstances.numClasses());
    m_distribution.addRange(0, trainInstances, 0, splitIndex + 1);
    m_distribution.addRange(1, trainInstances, splitIndex + 1, firstMiss);

    // Compute modified gain ratio for best split.
    m_gainRatio = m_gainRatioCrit.splitCritValue(m_distribution, m_sumOfWeights, m_infoGain);
}
From source file:j48.BinC45Split.java
License:Open Source License
/**
 * Moves the split point down to the greatest attribute value present in the
 * given data that is smaller than or equal to the current split point.
 * (C4.5 does this so the threshold coincides with an observed value.)
 *
 * @param allInstances the data to scan for candidate split values
 */
public final void setSplitPoint(Instances allInstances) {
    // Only meaningful for numeric attributes on which we actually split.
    if (allInstances.attribute(m_attIndex).isNominal() || (m_numSubsets <= 1)) {
        return;
    }
    double greatestBelow = -Double.MAX_VALUE;
    Enumeration enu = allInstances.enumerateInstances();
    while (enu.hasMoreElements()) {
        Instance inst = (Instance) enu.nextElement();
        if (inst.isMissing(m_attIndex)) {
            continue;
        }
        double candidate = inst.value(m_attIndex);
        if (Utils.gr(candidate, greatestBelow) && Utils.smOrEq(candidate, m_splitPoint)) {
            greatestBelow = candidate;
        }
    }
    m_splitPoint = greatestBelow;
}
From source file:j48.BinC45Split.java
License:Open Source License
/** * Returns weights if instance is assigned to more than one subset. * Returns null if instance is only assigned to one subset. *///from ww w. jav a2 s .co m public final double[] weights(Instance instance) { double[] weights; int i; if (instance.isMissing(m_attIndex)) { weights = new double[m_numSubsets]; for (i = 0; i < m_numSubsets; i++) weights[i] = m_distribution.perBag(i) / m_distribution.total(); return weights; } else { return null; } }