Example usage for weka.core Instance isMissing

List of usage examples for weka.core Instance isMissing

Introduction

On this page you can find example usage of weka.core Instance isMissing.

Prototype

public boolean isMissing(Attribute att);

Document

Tests whether a specific attribute value is "missing". Besides the Attribute-based prototype above, weka.core.Instance also provides an index-based overload, isMissing(int attIndex), which is what the examples below call.
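
For orientation, here is a minimal, self-contained sketch of the call. It is not taken from any of the projects below; the file name data.arff and the class name MissingValueCounter are placeholders. It loads a dataset and counts the missing values per attribute with isMissing:

import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class MissingValueCounter {
    public static void main(String[] args) throws Exception {
        // "data.arff" is a placeholder path; any ARFF/CSV file Weka can read works.
        Instances data = DataSource.read("data.arff");

        int[] missing = new int[data.numAttributes()];
        for (int i = 0; i < data.numInstances(); i++) {
            Instance inst = data.instance(i);
            for (int j = 0; j < inst.numAttributes(); j++) {
                // isMissing(int attIndex) tests the value at attribute index j;
                // inst.isMissing(data.attribute(j)) is the equivalent Attribute-based call.
                if (inst.isMissing(j)) {
                    missing[j]++;
                }
            }
        }
        for (int j = 0; j < data.numAttributes(); j++) {
            System.out.println(data.attribute(j).name() + ": " + missing[j] + " missing");
        }
    }
}

The projects below follow the same pattern: check isMissing before reading a value (substituting a default or NaN when it is missing), or use it to detect a state change before calling setMissing.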

Usage

From source file:wekimini.DataManager.java

public void setOutputValue(int index, int whichOutput, double val) {
    Instance i = allInstances.instance(index);
    if (i == null) {
        return;
    }

    boolean changesNumberOfInstances = i.isMissing(numMetaData + numInputs + whichOutput);

    if (isDiscrete[whichOutput]) {
        int v = (int) val;
        Attribute a = i.attribute(numMetaData + numInputs + whichOutput);
        if (a.isNominal() && v >= 0 && v <= numClasses[whichOutput]) {
            i.setValue(numMetaData + numInputs + whichOutput, v);
        } else {
            logger.log(Level.SEVERE, "Attribute value out of range");
            //TODO: Check this
        }
    } else {
        //TODO insert error checking / range limiting for this version!
        i.setValue(numMetaData + numInputs + whichOutput, val);
    }
    if (changesNumberOfInstances) {
        setNumExamplesPerOutput(whichOutput, getNumExamplesPerOutput(whichOutput) + 1);
    }
}

From source file:wekimini.DataManager.java

public void setOutputMissing(int index, int outputNum) {
    //if (paramNum >= 0 && paramNum < numParams) {
    Instance i = allInstances.instance(index);
    if (!i.isMissing(numMetaData + numInputs + outputNum)) {
        i.setMissing(numMetaData + numInputs + outputNum);
        setNumExamplesPerOutput(outputNum, getNumExamplesPerOutput(outputNum) - 1);
    }

    //Need to recompute numOutputs!
    //}
}

From source file:wekimini.DataManager.java

public boolean isOutputMissing(int index, int outputNum) {
    Instance i = allInstances.instance(index);
    return (i.isMissing(numMetaData + numInputs + outputNum));
}

From source file:wekimini.DataManager.java

public double getOutputValue(int index, int whichOutput) {
    Instance i = allInstances.instance(index);
    if (i == null || i.numAttributes() <= (numInputs + numMetaData + whichOutput)) {
        return Double.NaN;
    }
    if (i.isMissing(numMetaData + numInputs + whichOutput)) {
        return Double.NaN;
    }
    return i.value(numMetaData + numInputs + whichOutput);
    /* if (i.attribute(numMetaData + numInputs + whichOutput).isNumeric()) {
     return i.value(numMetaData + numInputs + whichOutput);
     } else {
     //What we need to do if we allow classes that don't start at 1:
     //return Double.parseDouble(i.attribute(numMetaData + numInputs + whichOutput).value((int)i.value(numMetaData + numInputs + whichOutput)));
     return i.value(numMetaData + numInputs + whichOutput) + 1;
     } */
}

From source file:wekimini.gui.WekiArffLoader.java

private void receivedConfiguration(int[] selectedIndices, boolean overwrite, boolean ignoreWithNoOutputs) {
    //Now load the data. TODO
    //For each instance: 

    //Slow, not great, but should work:
    //addImportedData(double[] inputs, double[][] outputs, boolean[] inputMask, boolean[] outputMask) {
    //w.getSupervisedLearningManager().addBundleToTraining(null, outputs, recordingMask);
    //w.getDataManager().addToTraining(inputs, outputs, recordingMask, recordingRound);
    w.getSupervisedLearningManager().incrementRecordingRound();

    if (overwrite) {
        w.getSupervisedLearningManager().deleteAllExamples();
    }

    boolean[] inputMaskForSet = createInputMaskForSet(selectedIndices);
    boolean[] outputMaskForSet = createOutputMaskForSet(selectedIndices);

    try {
        //Get enumerator for instances...
        Instance nextInstance = af.getNextInstance(structure);
        int numInputs = inputMaskForSet.length;
        int numOutputs = outputMaskForSet.length;

        while (nextInstance != null) {
            double[] inputs = new double[inputMaskForSet.length];
            double[] outputs = new double[outputMaskForSet.length];
            boolean[] inputMask = new boolean[inputMaskForSet.length];
            System.arraycopy(inputMaskForSet, 0, inputMask, 0, inputMask.length);
            boolean[] outputMask = new boolean[outputMaskForSet.length];
            System.arraycopy(outputMaskForSet, 0, outputMask, 0, outputMask.length);

            int numOutputsMissing = 0;
            for (int i = 0; i < selectedIndices.length; i++) {
                int projectIndexForCol = projectIndicesPerColumn.get(i).get(selectedIndices[i]);
                //selectedIndices[i] : says which input/output corresponds to the ith attribute
                if (projectIndexForCol == 0) {
                    //do nothing: ignore it
                } else if (projectIndexForCol <= inputs.length) { //it's an input
                    if (nextInstance.isMissing(i)) {
                        inputs[projectIndexForCol - 1] = 0;
                        inputMask[projectIndexForCol - 1] = false;
                    } else {
                        inputs[projectIndexForCol - 1] = nextInstance.value(i);
                    }
                } else { //it's an output
                    if (nextInstance.isMissing(i)) {
                        outputs[projectIndexForCol - 1 - numInputs] = 0;
                        outputMask[projectIndexForCol - 1 - numInputs] = false;
                        numOutputsMissing++;
                    } else {
                        double val = nextInstance.value(i);
                        outputs[projectIndexForCol - 1 - numInputs] = val;
                    }
                }
            }
            if (!ignoreWithNoOutputs || numOutputsMissing < numOutputs) {
                w.getSupervisedLearningManager().addToTraining(inputs, outputs, inputMask, outputMask);
            }
            nextInstance = af.getNextInstance(structure);
        }

    } catch (IOException ex) {
        w.getStatusUpdateCenter().warn(this, "Encountered error in reading from ARFF file.");
        Logger.getLogger(WekiArffLoader.class.getName()).log(Level.SEVERE, null, ex);
        recv.completed();
    }

    //TODO: Prevent this from being available when in DTW mode.
    recv.completed();
}

From source file:zhaop.textmining.proj.MyStringToWordVector.java

License:Open Source License

/**
 * determines the dictionary.
 */
private void determineDictionary() {
    // initialize stopwords
    Stopwords stopwords = new Stopwords();
    if (getUseStoplist()) {
        try {
            if (getStopwords().exists() && !getStopwords().isDirectory())
                stopwords.read(getStopwords());
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    // Operate on a per-class basis if class attribute is set
    int classInd = getInputFormat().classIndex();
    int values = 1;
    if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
        values = getInputFormat().attribute(classInd).numValues();
    }

    // TreeMap dictionaryArr [] = new TreeMap[values];
    TreeMap[] dictionaryArr = new TreeMap[values];
    for (int i = 0; i < values; i++) {
        dictionaryArr[i] = new TreeMap();
    }

    // Make sure we know which fields to convert
    determineSelectedRange();

    // Tokenize all training text into an orderedMap of "words".
    long pruneRate = Math.round((m_PeriodicPruningRate / 100.0) * getInputFormat().numInstances());
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
        Instance instance = getInputFormat().instance(i);
        int vInd = 0;
        if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
            vInd = (int) instance.classValue();
        }

        // Iterate through all relevant string attributes of the current instance
        Hashtable h = new Hashtable();
        for (int j = 0; j < instance.numAttributes(); j++) {
            if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {

                // Get tokenizer
                m_Tokenizer.tokenize(instance.stringValue(j));

                // Iterate through tokens, perform stemming, and remove stopwords
                // (if required)
                while (m_Tokenizer.hasMoreElements()) {
                    String word = ((String) m_Tokenizer.nextElement()).intern();

                    if (this.m_lowerCaseTokens == true)
                        word = word.toLowerCase();

                    // stop first.
                    if (this.m_useStoplist == true)
                        if (stopwords.is(word)) {
                            //                System.out.println("a stop word: " + word);
                            continue;
                        }

                    // stem next
                    word = m_Stemmer.stem(word);

                    if (!(h.containsKey(word)))
                        h.put(word, new Integer(0));

                    Count count = (Count) dictionaryArr[vInd].get(word);
                    if (count == null) {
                        dictionaryArr[vInd].put(word, new Count(1));
                    } else {
                        count.count++;
                    }
                }
            }
        }

        // updating the docCount for the words that have occurred in this
        // instance(document).
        Enumeration e = h.keys();
        while (e.hasMoreElements()) {
            String word = (String) e.nextElement();
            Count c = (Count) dictionaryArr[vInd].get(word);
            if (c != null) {
                c.docCount++;
            } else
                System.err.println(
                        "Warning: A word should definitely be in the dictionary. Please check the code.");
        }

        if (pruneRate > 0) {
            if (i % pruneRate == 0 && i > 0) {
                for (int z = 0; z < values; z++) {
                    Vector d = new Vector(1000);
                    Iterator it = dictionaryArr[z].keySet().iterator();
                    while (it.hasNext()) {
                        String word = (String) it.next();
                        Count count = (Count) dictionaryArr[z].get(word);
                        if (count.count <= 1) {
                            d.add(word);
                        }
                    }
                    Iterator iter = d.iterator();
                    while (iter.hasNext()) {
                        String word = (String) iter.next();
                        dictionaryArr[z].remove(word);
                    }
                }
            }
        }
    }

    // Figure out the minimum required word frequency
    int totalsize = 0;
    int prune[] = new int[values];
    for (int z = 0; z < values; z++) {
        totalsize += dictionaryArr[z].size();

        int array[] = new int[dictionaryArr[z].size()];
        int pos = 0;
        Iterator it = dictionaryArr[z].keySet().iterator();
        while (it.hasNext()) {
            String word = (String) it.next();
            Count count = (Count) dictionaryArr[z].get(word);
            array[pos] = count.count;
            pos++;
        }

        // sort the array
        sortArray(array);
        if (array.length < m_WordsToKeep) {
            // if there aren't enough words, set the threshold to
            // minFreq
            prune[z] = m_minTermFreq;
        } else {
            // otherwise set it to be at least minFreq
            prune[z] = Math.max(m_minTermFreq, array[array.length - m_WordsToKeep]);
        }
    }

    // Convert the dictionary into an attribute index
    // and create one attribute per word
    FastVector attributes = new FastVector(totalsize + getInputFormat().numAttributes());

    // Add the non-converted attributes
    int classIndex = -1;
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (!m_SelectedRange.isInRange(i)) {
            if (getInputFormat().classIndex() == i) {
                classIndex = attributes.size();
            }
            attributes.addElement(getInputFormat().attribute(i).copy());
        }
    }

    // Add the word vector attributes (eliminating duplicates
    // that occur in multiple classes)
    TreeMap newDictionary = new TreeMap();
    int index = attributes.size();
    for (int z = 0; z < values; z++) {
        Iterator it = dictionaryArr[z].keySet().iterator();
        while (it.hasNext()) {
            String word = (String) it.next();
            Count count = (Count) dictionaryArr[z].get(word);
            if (count.count >= prune[z]) {
                if (newDictionary.get(word) == null) {
                    newDictionary.put(word, new Integer(index++));
                    attributes.addElement(new Attribute(m_Prefix + word));
                }
            }
        }
    }

    // Compute document frequencies
    m_DocsCounts = new int[attributes.size()];
    Iterator it = newDictionary.keySet().iterator();
    while (it.hasNext()) {
        String word = (String) it.next();
        int idx = ((Integer) newDictionary.get(word)).intValue();
        int docsCount = 0;
        for (int j = 0; j < values; j++) {
            Count c = (Count) dictionaryArr[j].get(word);
            if (c != null)
                docsCount += c.docCount;
        }
        m_DocsCounts[idx] = docsCount;
    }

    // Trim vector and set instance variables
    attributes.trimToSize();
    m_Dictionary = newDictionary;
    m_NumInstances = getInputFormat().numInstances();

    // Set the filter's output format
    Instances outputFormat = new Instances(getInputFormat().relationName(), attributes, 0);
    outputFormat.setClassIndex(classIndex);
    setOutputFormat(outputFormat);
}

From source file:zhaop.textmining.proj.MyStringToWordVector.java

License:Open Source License

/**
 * Converts the instance w/o normalization.
 *
 * @param instance the instance to convert
 * @param v the vector the converted (sparse) instance is added to
 * @return the number of non-converted attributes that were copied
 */
private int convertInstancewoDocNorm(Instance instance, FastVector v) {

    // Convert the instance into a sorted set of indexes
    TreeMap contained = new TreeMap();

    // Copy all non-converted attributes from input to output
    int firstCopy = 0;
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (!m_SelectedRange.isInRange(i)) {
            if (getInputFormat().attribute(i).type() != Attribute.STRING) {
                // Add simple nominal and numeric attributes directly
                if (instance.value(i) != 0.0) {
                    contained.put(new Integer(firstCopy), new Double(instance.value(i)));
                }
            } else {
                if (instance.isMissing(i)) {
                    contained.put(new Integer(firstCopy), new Double(Instance.missingValue()));
                } else {

                    // If this is a string attribute, we have to first add
                    // this value to the range of possible values, then add
                    // its new internal index.
                    if (outputFormatPeek().attribute(firstCopy).numValues() == 0) {
                        // Note that the first string value in a
                        // SparseInstance doesn't get printed.
                        outputFormatPeek().attribute(firstCopy)
                                .addStringValue("Hack to defeat SparseInstance bug");
                    }
                    int newIndex = outputFormatPeek().attribute(firstCopy)
                            .addStringValue(instance.stringValue(i));
                    contained.put(new Integer(firstCopy), new Double(newIndex));
                }
            }
            firstCopy++;
        }
    }

    for (int j = 0; j < instance.numAttributes(); j++) {
        // if ((getInputFormat().attribute(j).type() == Attribute.STRING)
        if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {

            m_Tokenizer.tokenize(instance.stringValue(j));

            while (m_Tokenizer.hasMoreElements()) {
                String word = (String) m_Tokenizer.nextElement();
                if (this.m_lowerCaseTokens == true)
                    word = word.toLowerCase();
                word = m_Stemmer.stem(word);
                Integer index = (Integer) m_Dictionary.get(word);
                if (index != null) {
                    if (m_OutputCounts) { // Separate if here rather than two lines down
                                          // to avoid hashtable lookup
                        Double count = (Double) contained.get(index);
                        if (count != null) {
                            contained.put(index, new Double(count.doubleValue() + 1.0));
                        } else {
                            contained.put(index, new Double(1));
                        }
                    } else {
                        contained.put(index, new Double(1));
                    }
                }
            }
        }
    }

    // Doing TFTransform
    if (m_TFTransform == true) {
        Iterator it = contained.keySet().iterator();
        for (int i = 0; it.hasNext(); i++) {
            Integer index = (Integer) it.next();
            if (index.intValue() >= firstCopy) {
                double val = ((Double) contained.get(index)).doubleValue();
                val = Math.log(val + 1);
                contained.put(index, new Double(val));
            }
        }
    }

    // Doing IDFTransform
    if (m_IDFTransform == true) {
        Iterator it = contained.keySet().iterator();
        for (int i = 0; it.hasNext(); i++) {
            Integer index = (Integer) it.next();
            if (index.intValue() >= firstCopy) {
                double val = ((Double) contained.get(index)).doubleValue();
                val = val * Math.log(m_NumInstances / (double) m_DocsCounts[index.intValue()]);
                contained.put(index, new Double(val));
            }
        }
    }

    // Convert the set to structures needed to create a sparse instance.
    double[] values = new double[contained.size()];
    int[] indices = new int[contained.size()];
    Iterator it = contained.keySet().iterator();
    for (int i = 0; it.hasNext(); i++) {
        Integer index = (Integer) it.next();
        Double value = (Double) contained.get(index);
        values[i] = value.doubleValue();
        indices[i] = index.intValue();
    }

    Instance inst = new SparseInstance(instance.weight(), values, indices, outputFormatPeek().numAttributes());
    inst.setDataset(outputFormatPeek());

    v.addElement(inst);

    return firstCopy;
}