List of usage examples for weka.core Instance isMissing
public boolean isMissing(Attribute att);
public boolean isMissing(int attIndex);  // overload used by most examples below
From source file:etc.aloe.filters.StringToDictionaryVector.java
License:Open Source License
/**
 * Converts the instance without document-length normalization.
 *
 * Copies all non-string input attributes through, re-registers string values in
 * the output header, then appends one term-count feature per selected dictionary
 * term (optionally TF/IDF transformed).
 *
 * @param instance  the instance to convert
 * @param converted the output list the converted instance is appended to
 * @return the document "length" (sqrt of the sum of squared feature values),
 *         used by the caller for normalization
 */
private double convertInstancewoDocNorm(Instance instance, ArrayList<Instance> converted) {
    if (stringAttributeIndex < 0) {
        throw new IllegalStateException("String attribute index not valid");
    }
    int numOldValues = instance.numAttributes();
    double[] newValues = new double[numOldValues + m_selectedTerms.size()];

    // Copy all attributes from input to output
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (getInputFormat().attribute(i).type() != Attribute.STRING) {
            // Add simple nominal and numeric attributes directly; zeros are left
            // implicit because the result is a SparseInstance.
            if (instance.value(i) != 0.0) {
                newValues[i] = instance.value(i);
            }
        } else {
            if (instance.isMissing(i)) {
                newValues[i] = Utils.missingValue();
            } else {
                // If this is a string attribute, we have to first add this value
                // to the range of possible values, then add its new internal index.
                if (outputFormatPeek().attribute(i).numValues() == 0) {
                    // Note that the first string value in a SparseInstance
                    // doesn't get printed, so burn index 0 with a dummy value.
                    outputFormatPeek().attribute(i).addStringValue("Hack to defeat SparseInstance bug");
                }
                int newIndex = outputFormatPeek().attribute(i).addStringValue(instance.stringValue(i));
                newValues[i] = newIndex;
            }
        }
    }

    String stringValue = instance.stringValue(stringAttributeIndex);
    double docLength = 0;

    // Count non-overlapping occurrences of every selected term in the document.
    HashMap<String, Integer> termMatches = m_selectedTermsTrie.countNonoverlappingMatches(stringValue);
    for (Map.Entry<String, Integer> entry : termMatches.entrySet()) {
        String term = entry.getKey();
        int termIdx = m_selectedTermIndices.get(term);
        double matches = entry.getValue();
        // Binary presence/absence unless counts were requested.
        if (!m_OutputCounts && matches > 0) {
            matches = 1;
        }
        if (matches > 0) {
            if (m_TFTransform == true) {
                // log(1 + tf) damping
                matches = Math.log(matches + 1);
            }
            if (m_IDFTransform == true) {
                // tf * log(N / df)
                matches = matches * Math.log(m_NumInstances / (double) m_DocsCounts[termIdx]);
            }
            newValues[numOldValues + termIdx] = matches;
            docLength += matches * matches;
        }
    }

    Instance result = new SparseInstance(instance.weight(), newValues);
    converted.add(result);
    return Math.sqrt(docLength);
}
From source file:etc.aloe.filters.WordFeaturesExtractor.java
License:Open Source License
protected List<List<String>> tokenizeDocuments(Instances instances) { //Convert all instances into term lists List<List<String>> documents = new ArrayList<List<String>>(); for (int i = 0; i < instances.size(); i++) { Instance instance = instances.get(i); if (instance.isMissing(selectedAttributeIndex) == false) { List<String> words = tokenizeDocument(instance); documents.add(words);/* ww w .j a va 2s .co m*/ } } return documents; }
From source file:etc.aloe.filters.WordFeaturesExtractor.java
License:Open Source License
@Override protected Instance process(Instance instance) throws Exception { if (selectedAttributeIndex < 0) { throw new IllegalStateException("String attribute not set"); }// ww w .ja v a 2 s . com int numOldValues = instance.numAttributes(); int numNewFeatures = unigrams.size() + bigrams.size(); double[] newValues = new double[numOldValues + numNewFeatures]; // Copy all attributes from input to output for (int i = 0; i < getInputFormat().numAttributes(); i++) { if (getInputFormat().attribute(i).type() != Attribute.STRING) { // Add simple nominal and numeric attributes directly if (instance.value(i) != 0.0) { newValues[i] = instance.value(i); } } else { if (instance.isMissing(i)) { newValues[i] = Utils.missingValue(); } else { // If this is a string attribute, we have to first add // this value to the range of possible values, then add // its new internal index. if (outputFormatPeek().attribute(i).numValues() == 0) { // Note that the first string value in a // SparseInstance doesn't get printed. 
outputFormatPeek().attribute(i).addStringValue("Hack to defeat SparseInstance bug"); } int newIndex = outputFormatPeek().attribute(i).addStringValue(instance.stringValue(i)); newValues[i] = newIndex; } } } String stringValue = instance.stringValue(selectedAttributeIndex); if (instance.isMissing(selectedAttributeIndex) == false) { List<String> words = tokenizeDocument(instance); Set<String> wordSet = new HashSet<String>(words); for (int i = 0; i < unigrams.size(); i++) { String unigram = unigrams.get(i); int count = 0; if (wordSet.contains(unigram)) { //Count the times the word is in the document for (int w = 0; w < words.size(); w++) { if (words.get(w).equals(unigram)) { count += 1; } } } int featureIndex = numOldValues + i; newValues[featureIndex] = count; } for (int i = 0; i < bigrams.size(); i++) { Bigram bigram = bigrams.get(i); int count = bigram.getTimesInDocument(words); int featureIndex = numOldValues + unigrams.size() + i; newValues[featureIndex] = count; } } Instance result = new SparseInstance(instance.weight(), newValues); return result; }
From source file:filters.MauiFilter.java
License:Open Source License
/**
 * Converts a document instance into one output instance per candidate topic.
 *
 * For each candidate phrase extracted from the document text, computes its
 * feature vector, scores it with the trained classifier, and builds an output
 * instance containing the phrase, its features, its key-phrase probability and
 * (initially missing) rank. The resulting vector is then stable-sorted by
 * distance, TFxIDF and probability, and ranks are assigned in that order.
 *
 * @param instance the document instance (string attributes for file name,
 *                 text and, optionally, manually assigned key phrases)
 * @param training true when converting a training document
 * @return a FastVector of per-candidate output instances, sorted by probability
 * @throws Exception if classification or feature computation fails
 */
private FastVector convertInstance(Instance instance, boolean training) throws Exception {

    FastVector vector = new FastVector();

    String fileName = instance.stringValue(fileNameAtt);
    if (debugMode) {
        System.err.println("-- Converting instance for document " + fileName);
    }

    // Get the key phrases for the document (only present for training data).
    HashMap<String, Counter> hashKeyphrases = null;
    if (!instance.isMissing(keyphrasesAtt)) {
        String keyphrases = instance.stringValue(keyphrasesAtt);
        hashKeyphrases = getGivenKeyphrases(keyphrases);
    }

    // Get the document text
    String documentText = instance.stringValue(documentAtt);

    // Compute the candidate topics, reusing a precomputed list when available.
    HashMap<String, Candidate> candidateList;
    if (allCandidates != null && allCandidates.containsKey(instance)) {
        candidateList = allCandidates.get(instance);
    } else {
        candidateList = getCandidates(documentText);
    }
    if (debugMode) {
        System.err.println(candidateList.size() + " candidates ");
    }

    // Set indices for key attributes in the OUTPUT format.
    int tfidfAttIndex = documentAtt + 2;
    int distAttIndex = documentAtt + 3;
    // NOTE(review): probsAttIndex is reassigned inside the loop below to the
    // actual output position of the probability value; this initial value is
    // only a placeholder.
    int probsAttIndex = documentAtt + numFeatures;

    int countPos = 0;
    int countNeg = 0;

    // Go through the phrases and convert them into instances
    for (Candidate candidate : candidateList.values()) {

        // Skip rare candidates.
        if (candidate.getFrequency() < minOccurFrequency) {
            continue;
        }

        String name = candidate.getName();
        String orig = candidate.getBestFullForm();
        if (!vocabularyName.equals("none")) {
            orig = candidate.getTitle();
        }

        double[] vals = computeFeatureValues(candidate, training, hashKeyphrases, candidateList);

        Instance inst = new Instance(instance.weight(), vals);
        inst.setDataset(classifierData);

        // Get probability of a phrase being key phrase
        double[] probs = classifier.distributionForInstance(inst);
        double prob = probs[0];
        if (nominalClassValue) {
            prob = probs[1];
        }

        // Compute attribute values for final instance
        double[] newInst = new double[instance.numAttributes() + numFeatures + 2];
        int pos = 0;
        for (int i = 1; i < instance.numAttributes(); i++) {
            if (i == documentAtt) {
                // output of values for a given phrase:

                // Add phrase
                int index = outputFormatPeek().attribute(pos).addStringValue(name);
                newInst[pos++] = index;

                // Add original version (fall back to the phrase name itself)
                if (orig != null) {
                    index = outputFormatPeek().attribute(pos).addStringValue(orig);
                } else {
                    index = outputFormatPeek().attribute(pos).addStringValue(name);
                }
                newInst[pos++] = index;

                // Add features
                newInst[pos++] = inst.value(tfIndex);
                newInst[pos++] = inst.value(idfIndex);
                newInst[pos++] = inst.value(tfidfIndex);
                newInst[pos++] = inst.value(firstOccurIndex);
                newInst[pos++] = inst.value(lastOccurIndex);
                newInst[pos++] = inst.value(spreadOccurIndex);
                newInst[pos++] = inst.value(domainKeyphIndex);
                newInst[pos++] = inst.value(lengthIndex);
                newInst[pos++] = inst.value(generalityIndex);
                newInst[pos++] = inst.value(nodeDegreeIndex);
                newInst[pos++] = inst.value(semRelIndex);
                newInst[pos++] = inst.value(wikipKeyphrIndex);
                newInst[pos++] = inst.value(invWikipFreqIndex);
                newInst[pos++] = inst.value(totalWikipKeyphrIndex);

                // Add probability
                probsAttIndex = pos;
                newInst[pos++] = prob;

                // Set rank to missing (computed below)
                newInst[pos++] = Instance.missingValue();

            } else if (i == keyphrasesAtt) {
                newInst[pos++] = inst.classValue();
            } else {
                newInst[pos++] = instance.value(i);
            }
        }

        Instance ins = new Instance(instance.weight(), newInst);
        ins.setDataset(outputFormatPeek());
        vector.addElement(ins);

        if (inst.classValue() == 0) {
            countNeg++;
        } else {
            countPos++;
        }
    }
    if (debugMode) {
        System.err.println(countPos + " positive; " + countNeg + " negative instances");
    }

    // Sort phrases according to their distance (stable sort)
    double[] vals = new double[vector.size()];
    for (int i = 0; i < vals.length; i++) {
        vals[i] = ((Instance) vector.elementAt(i)).value(distAttIndex);
    }
    FastVector newVector = new FastVector(vector.size());
    int[] sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their tfxidf value (stable sort), descending
    // via negation.
    for (int i = 0; i < vals.length; i++) {
        vals[i] = -((Instance) vector.elementAt(i)).value(tfidfAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their probability (stable sort), descending
    // via 1 - p.
    for (int i = 0; i < vals.length; i++) {
        vals[i] = 1 - ((Instance) vector.elementAt(i)).value(probsAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Compute rank of phrases. Check for subphrases that are ranked
    // lower than superphrases and assign probability -1 and set the
    // rank to Integer.MAX_VALUE
    int rank = 1;
    for (int i = 0; i < vals.length; i++) {
        Instance currentInstance = (Instance) vector.elementAt(i);
        // Short cut: if phrase very unlikely make rank very low and continue
        // (vals[i] >= 1 means probability <= 0).
        if (Utils.grOrEq(vals[i], 1.0)) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
            continue;
        }

        // Otherwise look for super phrase starting with first phrase
        // in list that has same probability, TFxIDF value, and distance as
        // current phrase. We do this to catch all superphrases
        // that have same probability, TFxIDF value and distance as current
        // phrase.
        // NOTE(review): in this version startInd is computed but never used
        // afterwards — the superphrase handling appears to have been removed
        // or lost; confirm against the upstream MauiFilter source.
        int startInd = i;
        while (startInd < vals.length) {
            Instance inst = (Instance) vector.elementAt(startInd);
            if ((inst.value(tfidfAttIndex) != currentInstance.value(tfidfAttIndex))
                    || (inst.value(probsAttIndex) != currentInstance.value(probsAttIndex))
                    || (inst.value(distAttIndex) != currentInstance.value(distAttIndex))) {
                break;
            }
            startInd++;
        }
        currentInstance.setValue(probsAttIndex + 1, rank++);
    }
    return vector;
}
From source file:hr.irb.fastRandomForest.FastRandomTree.java
License:Open Source License
/** * Computes class distribution of an instance using the FastRandomTree.<p> * * In Weka's RandomTree, the distributions were normalized so that all * probabilities sum to 1; this would abolish the effect of instance weights * on voting. In FastRandomForest 0.97 onwards, the distributions are * normalized by dividing with the number of instances going into a leaf.<p> * /*from www . j ava 2 s.co m*/ * @param instance the instance to compute the distribution for * @return the computed class distribution * @throws Exception if computation fails */ @Override public double[] distributionForInstance(Instance instance) throws Exception { double[] returnedDist = null; if (m_Attribute > -1) { // ============================ node is not a leaf if (instance.isMissing(m_Attribute)) { // ---------------- missing value returnedDist = new double[m_MotherForest.m_Info.numClasses()]; // split instance up for (int i = 0; i < m_Successors.length; i++) { double[] help = m_Successors[i].distributionForInstance(instance); if (help != null) { for (int j = 0; j < help.length; j++) { returnedDist[j] += m_Prop[i] * help[j]; } } } } else if (m_MotherForest.m_Info.attribute(m_Attribute).isNominal()) { // ------ nominal //returnedDist = m_Successors[(int) instance.value(m_Attribute)] // .distributionForInstance(instance); // 0.99: new - binary splits (also) for nominal attributes if (instance.value(m_Attribute) == m_SplitPoint) { returnedDist = m_Successors[0].distributionForInstance(instance); } else { returnedDist = m_Successors[1].distributionForInstance(instance); } } else { // ------------------------------------------ numeric attributes if (instance.value(m_Attribute) < m_SplitPoint) { returnedDist = m_Successors[0].distributionForInstance(instance); } else { returnedDist = m_Successors[1].distributionForInstance(instance); } } return returnedDist; } else { // =============================================== node is a leaf return m_ClassProbs; } }
From source file:hr.irb.fastRandomForest.NakedFastRandomTree.java
License:Open Source License
public NakedFastRandomTree getNodeForInstance(final Instance i, final int max_depth) { if (max_depth == 0) { return this; }// www.j a va 2s . c o m if (m_Attribute == -1) { // Leaf node return this; } if (i.isMissing(m_Attribute)) { throw new IllegalStateException("NakedFastRandomTree does not support missing attributes"); } final int next_depth = max_depth - 1; final NakedFastRandomTree succ; if (m_MotherForest.m_Info.attribute(m_Attribute).isNominal()) { // nominal // 0.99: new - binary splits (also) for nominal attributes if (i.value(m_Attribute) == m_SplitPoint) { succ = (NakedFastRandomTree) m_Successors[0]; } else { succ = (NakedFastRandomTree) m_Successors[1]; } } else { // numeric if (i.value(m_Attribute) < m_SplitPoint) { succ = (NakedFastRandomTree) m_Successors[0]; } else { succ = (NakedFastRandomTree) m_Successors[1]; } } return succ.getNodeForInstance(i, next_depth); }
From source file:j48.BinC45Split.java
License:Open Source License
/** * Creates split on enumerated attribute. * * @exception Exception if something goes wrong */// w w w.j ava2 s . co m private void handleEnumeratedAttribute(Instances trainInstances) throws Exception { Distribution newDistribution, secondDistribution; int numAttValues; double currIG, currGR; Instance instance; int i; numAttValues = trainInstances.attribute(m_attIndex).numValues(); newDistribution = new Distribution(numAttValues, trainInstances.numClasses()); // Only Instances with known values are relevant. Enumeration enu = trainInstances.enumerateInstances(); while (enu.hasMoreElements()) { instance = (Instance) enu.nextElement(); if (!instance.isMissing(m_attIndex)) newDistribution.add((int) instance.value(m_attIndex), instance); } m_distribution = newDistribution; // For all values for (i = 0; i < numAttValues; i++) { if (Utils.grOrEq(newDistribution.perBag(i), m_minNoObj)) { secondDistribution = new Distribution(newDistribution, i); // Check if minimum number of Instances in the two // subsets. if (secondDistribution.check(m_minNoObj)) { m_numSubsets = 2; currIG = m_infoGainCrit.splitCritValue(secondDistribution, m_sumOfWeights); currGR = m_gainRatioCrit.splitCritValue(secondDistribution, m_sumOfWeights, currIG); if ((i == 0) || Utils.gr(currGR, m_gainRatio)) { m_gainRatio = currGR; m_infoGain = currIG; m_splitPoint = (double) i; m_distribution = secondDistribution; } } } } }
From source file:j48.BinC45Split.java
License:Open Source License
/**
 * Creates the best binary split on a numeric attribute.
 *
 * Scans all candidate cut points between consecutive distinct attribute
 * values, keeps the one with the highest information gain, then applies
 * C4.5's log2(#candidates)/totalWeight correction before accepting it.
 *
 * NOTE(review): assumes trainInstances is sorted on the attribute with
 * missing values at the end — confirm against the caller.
 *
 * @exception Exception if something goes wrong
 */
private void handleNumericAttribute(Instances trainInstances) throws Exception {

    int firstMiss;
    int next = 1;
    int last = 0;
    int index = 0;          // number of admissible candidate splits found
    int splitIndex = -1;    // index of the best split's left-most right neighbor
    double currentInfoGain;
    double defaultEnt;
    double minSplit;
    Instance instance;
    int i;

    // Current attribute is a numeric attribute.
    m_distribution = new Distribution(2, trainInstances.numClasses());

    // Only Instances with known values are relevant; all of them start in bag 1.
    Enumeration enu = trainInstances.enumerateInstances();
    i = 0;
    while (enu.hasMoreElements()) {
        instance = (Instance) enu.nextElement();
        if (instance.isMissing(m_attIndex))
            break;  // relies on missing values being sorted to the end
        m_distribution.add(1, instance);
        i++;
    }
    firstMiss = i;  // index of the first instance with a missing value

    // Compute minimum number of Instances required in each subset
    // (10% of the per-class average, clamped to [m_minNoObj, 25]).
    minSplit = 0.1 * (m_distribution.total()) / ((double) trainInstances.numClasses());
    if (Utils.smOrEq(minSplit, m_minNoObj))
        minSplit = m_minNoObj;
    else if (Utils.gr(minSplit, 25))
        minSplit = 25;

    // Enough Instances with known values?
    if (Utils.sm((double) firstMiss, 2 * minSplit))
        return;

    // Compute values of criteria for all possible split indices.
    defaultEnt = m_infoGainCrit.oldEnt(m_distribution);
    while (next < firstMiss) {

        // Only positions between genuinely different values (1e-5 tolerance)
        // are candidate cut points.
        if (trainInstances.instance(next - 1).value(m_attIndex) + 1e-5 < trainInstances.instance(next)
                .value(m_attIndex)) {

            // Move class values for all Instances up to next possible split point.
            m_distribution.shiftRange(1, 0, trainInstances, last, next);

            // Check if enough Instances in each subset and compute values for criteria.
            if (Utils.grOrEq(m_distribution.perBag(0), minSplit)
                    && Utils.grOrEq(m_distribution.perBag(1), minSplit)) {
                currentInfoGain = m_infoGainCrit.splitCritValue(m_distribution, m_sumOfWeights, defaultEnt);
                if (Utils.gr(currentInfoGain, m_infoGain)) {
                    m_infoGain = currentInfoGain;
                    splitIndex = next - 1;
                }
                index++;  // counts admissible splits; used for the correction below
            }
            last = next;
        }
        next++;
    }

    // Was there any useful split?
    if (index == 0)
        return;

    // Compute modified information gain for best split (C4.5 penalizes
    // the gain by log2(#candidates) spread over the total weight).
    m_infoGain = m_infoGain - (Utils.log2(index) / m_sumOfWeights);
    if (Utils.smOrEq(m_infoGain, 0))
        return;

    // Set instance variables' values to values for best split.
    m_numSubsets = 2;
    m_splitPoint = (trainInstances.instance(splitIndex + 1).value(m_attIndex)
            + trainInstances.instance(splitIndex).value(m_attIndex)) / 2;

    // In case we have a numerical precision problem we need to choose the
    // smaller value (the midpoint must not equal the right neighbor).
    if (m_splitPoint == trainInstances.instance(splitIndex + 1).value(m_attIndex)) {
        m_splitPoint = trainInstances.instance(splitIndex).value(m_attIndex);
    }

    // Restore distribution for best split.
    m_distribution = new Distribution(2, trainInstances.numClasses());
    m_distribution.addRange(0, trainInstances, 0, splitIndex + 1);
    m_distribution.addRange(1, trainInstances, splitIndex + 1, firstMiss);

    // Compute modified gain ratio for best split.
    m_gainRatio = m_gainRatioCrit.splitCritValue(m_distribution, m_sumOfWeights, m_infoGain);
}
From source file:j48.BinC45Split.java
License:Open Source License
/**
 * Moves the split point down to the greatest attribute value present in the
 * given data that is smaller than or equal to the current split point.
 * (C4.5 does this so the threshold coincides with an observed value.)
 *
 * @param allInstances the data to scan for candidate split values
 */
public final void setSplitPoint(Instances allInstances) {
    // Only meaningful for numeric attributes on which we actually split.
    if (allInstances.attribute(m_attIndex).isNominal() || (m_numSubsets <= 1)) {
        return;
    }
    double greatestBelow = -Double.MAX_VALUE;
    Enumeration enu = allInstances.enumerateInstances();
    while (enu.hasMoreElements()) {
        Instance inst = (Instance) enu.nextElement();
        if (inst.isMissing(m_attIndex)) {
            continue;
        }
        double candidate = inst.value(m_attIndex);
        if (Utils.gr(candidate, greatestBelow) && Utils.smOrEq(candidate, m_splitPoint)) {
            greatestBelow = candidate;
        }
    }
    m_splitPoint = greatestBelow;
}
From source file:j48.BinC45Split.java
License:Open Source License
/** * Returns weights if instance is assigned to more than one subset. * Returns null if instance is only assigned to one subset. *///from ww w. jav a2 s .co m public final double[] weights(Instance instance) { double[] weights; int i; if (instance.isMissing(m_attIndex)) { weights = new double[m_numSubsets]; for (i = 0; i < m_numSubsets; i++) weights[i] = m_distribution.perBag(i) / m_distribution.total(); return weights; } else { return null; } }