Example usage for weka.core Instance isMissing

Introduction

In this page you can find the example usage for weka.core Instance isMissing.

Prototype

public boolean isMissing(Attribute att);

Source Link

Document

Tests if a specific value is "missing".

Usage

From source file:com.esda.util.StringToWordVector.java

License:Open Source License

/**
 * Converts the instance w/o normalization.
 *
 * @oaram instance the instance to convert
 * @param v/*from  w  w w. j  a  v  a2 s. c  o m*/
 * @return the conerted instance
 */
private int convertInstancewoDocNorm(Instance instance, FastVector v) {

    // Convert the instance into a sorted set of indexes
    TreeMap contained = new TreeMap();

    // Copy all non-converted attributes from input to output
    int firstCopy = 0;
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (!m_SelectedRange.isInRange(i)) {
            if (getInputFormat().attribute(i).type() != Attribute.STRING
                    && getInputFormat().attribute(i).type() != Attribute.RELATIONAL) {
                // Add simple nominal and numeric attributes directly
                if (instance.value(i) != 0.0) {
                    contained.put(new Integer(firstCopy), new Double(instance.value(i)));
                }
            } else {
                if (instance.isMissing(i)) {
                    contained.put(new Integer(firstCopy), new Double(Double.NaN));
                } else if (getInputFormat().attribute(i).type() == Attribute.STRING) {

                    // If this is a string attribute, we have to first add
                    // this value to the range of possible values, then add
                    // its new internal index.
                    if (outputFormatPeek().attribute(firstCopy).numValues() == 0) {
                        // Note that the first string value in a
                        // SparseInstance doesn't get printed.
                        outputFormatPeek().attribute(firstCopy)
                                .addStringValue("Hack to defeat SparseInstance bug");
                    }
                    int newIndex = outputFormatPeek().attribute(firstCopy)
                            .addStringValue(instance.stringValue(i));
                    contained.put(new Integer(firstCopy), new Double(newIndex));
                } else {
                    // relational
                    if (outputFormatPeek().attribute(firstCopy).numValues() == 0) {
                        Instances relationalHeader = outputFormatPeek().attribute(firstCopy).relation();

                        // hack to defeat sparse instances bug
                        outputFormatPeek().attribute(firstCopy).addRelation(relationalHeader);
                    }
                    int newIndex = outputFormatPeek().attribute(firstCopy)
                            .addRelation(instance.relationalValue(i));
                    contained.put(new Integer(firstCopy), new Double(newIndex));
                }
            }
            firstCopy++;
        }
    }

    for (int j = 0; j < instance.numAttributes(); j++) {
        // if ((getInputFormat().attribute(j).type() == Attribute.STRING)
        if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {

            m_Tokenizer.tokenize(instance.stringValue(j));

            while (m_Tokenizer.hasMoreElements()) {
                String word = (String) m_Tokenizer.nextElement();
                if (this.m_lowerCaseTokens == true)
                    word = word.toLowerCase();
                word = m_Stemmer.stem(word);
                Integer index = (Integer) m_Dictionary.get(word);
                if (index != null) {
                    if (m_OutputCounts) { // Separate if here rather than
                        // two lines down to avoid
                        // hashtable lookup
                        Double count = (Double) contained.get(index);
                        if (count != null) {
                            contained.put(index, new Double(count.doubleValue() + 1.0));
                        } else {
                            contained.put(index, new Double(1));
                        }
                    } else {
                        contained.put(index, new Double(1));
                    }
                }
            }
        }
    }

    // Doing TFTransform
    if (m_TFTransform == true) {
        Iterator it = contained.keySet().iterator();
        for (int i = 0; it.hasNext(); i++) {
            Integer index = (Integer) it.next();
            if (index.intValue() >= firstCopy) {
                double val = ((Double) contained.get(index)).doubleValue();
                val = Math.log(val + 1);
                contained.put(index, new Double(val));
            }
        }
    }

    // Doing IDFTransform
    if (m_IDFTransform == true) {
        Iterator it = contained.keySet().iterator();
        for (int i = 0; it.hasNext(); i++) {
            Integer index = (Integer) it.next();
            if (index.intValue() >= firstCopy) {
                double val = ((Double) contained.get(index)).doubleValue();
                val = val * Math.log(m_NumInstances / (double) m_DocsCounts[index.intValue()]);
                contained.put(index, new Double(val));
            }
        }
    }

    // Convert the set to structures needed to create a sparse instance.
    double[] values = new double[contained.size()];
    int[] indices = new int[contained.size()];
    Iterator it = contained.keySet().iterator();
    for (int i = 0; it.hasNext(); i++) {
        Integer index = (Integer) it.next();
        Double value = (Double) contained.get(index);
        values[i] = value.doubleValue();
        indices[i] = index.intValue();
    }

    Instance inst = new SparseInstance(instance.weight(), values, indices, outputFormatPeek().numAttributes());
    inst.setDataset(outputFormatPeek());

    v.addElement(inst);

    return firstCopy;
}

From source file:com.openkm.kea.filter.KEAFilter.java

License:Open Source License

/**
 * Converts an instance.//from w ww  . ja  v  a2  s. c om
 */
private FastVector convertInstance(Instance instance, boolean training) throws Exception {

    FastVector vector = new FastVector();

    if (m_Debug) {
        log.info("-- Converting instance");
    }

    // Get the key phrases for the document
    HashMap<String, Counter> hashKeyphrases = null;
    HashMap<String, Counter> hashKeysEval = null;
    if (!instance.isMissing(m_KeyphrasesAtt)) {
        String keyphrases = instance.stringValue(m_KeyphrasesAtt);
        hashKeyphrases = getGivenKeyphrases(keyphrases, false);
        hashKeysEval = getGivenKeyphrases(keyphrases, true);
    }

    // Get the phrases for the document
    HashMap<String, FastVector> hash = new HashMap<String, FastVector>();
    int length = getPhrases(hash, instance.stringValue(m_DocumentAtt));
    //   hash = getComposits(hash);

    /* Experimental:
     To compute how many of the manual keyphrases appear in the documents:
            
    log.info("Doc phrases found " + hash.size());
    log.info("Manual keyphrases: ");
    Iterator iter = hashKeyphrases.keySet().iterator();
    int count = 0;
    while (iter.hasNext()) {
       String id = (String)iter.next();
       if (hash.containsKey(id)) {
    count++;
       }
    }
            
    double max_recall = (double)count/(double)hashKeyphrases.size();
            
            
    m_max_recall += max_recall;
    doc++;
    double avg_m_max_recall = m_max_recall/(double)doc;
            
    String file = instance.stringValue(2);
    log.info(count + " out of " + hashKeyphrases.size() + " are in the document ");
    log.info("Max recall : " + avg_m_max_recall + " on " + doc + " documents ");
    */

    // Compute number of extra attributes
    int numFeatures = 5;
    if (m_Debug) {
        if (m_KFused) {
            numFeatures = numFeatures + 1;
        }
    }
    if (m_STDEVfeature) {
        numFeatures = numFeatures + 1;
    }
    if (m_NODEfeature) {
        numFeatures = numFeatures + 1;
    }
    if (m_LENGTHfeature) {
        numFeatures = numFeatures + 1;
    }

    // Set indices of key attributes
    //int phraseAttIndex = m_DocumentAtt;
    int tfidfAttIndex = m_DocumentAtt + 2;
    int distAttIndex = m_DocumentAtt + 3;
    int probsAttIndex = m_DocumentAtt + numFeatures - 1;
    //int classAttIndex = numFeatures;

    // Go through the phrases and convert them into instances
    Iterator<String> it = hash.keySet().iterator();
    while (it.hasNext()) {
        String id = it.next();
        FastVector phraseInfo = (FastVector) hash.get(id);

        double[] vals = featVals(id, phraseInfo, training, hashKeysEval, hashKeyphrases, length, hash);

        Instance inst = new Instance(instance.weight(), vals);

        inst.setDataset(m_ClassifierData);

        // Get probability of a phrase being key phrase
        double[] probs = m_Classifier.distributionForInstance(inst);

        // If simple Naive Bayes used, change here to
        //double prob = probs[1];
        double prob = probs[0];

        // Compute attribute values for final instance
        double[] newInst = new double[instance.numAttributes() + numFeatures];
        int pos = 0;
        for (int i = 0; i < instance.numAttributes(); i++) {
            if (i == m_DocumentAtt) {

                // output of values for a given phrase:

                // Add phrase
                int index = outputFormatPeek().attribute(pos).addStringValue(id);
                newInst[pos++] = index;

                // Add original version
                String orig = (String) phraseInfo.elementAt(2);

                if (orig != null) {
                    index = outputFormatPeek().attribute(pos).addStringValue(orig);
                } else {
                    index = outputFormatPeek().attribute(pos).addStringValue(id);
                }
                newInst[pos++] = index;

                // Add TFxIDF
                newInst[pos++] = inst.value(m_TfidfIndex);

                // Add distance
                newInst[pos++] = inst.value(m_FirstOccurIndex);

                // Add other features
                if (m_Debug) {
                    if (m_KFused) {
                        newInst[pos++] = inst.value(m_KeyFreqIndex);
                    }
                }
                if (m_STDEVfeature) {
                    newInst[pos++] = inst.value(m_STDEVIndex);
                }
                if (m_NODEfeature) {
                    newInst[pos++] = inst.value(m_NodeIndex);
                }
                if (m_LENGTHfeature) {
                    newInst[pos++] = inst.value(m_LengthIndex);
                }

                // Add probability 
                probsAttIndex = pos;
                newInst[pos++] = prob;

                // Set rank to missing (computed below)
                newInst[pos++] = Instance.missingValue();

            } else if (i == m_KeyphrasesAtt) {
                newInst[pos++] = inst.classValue();
            } else {
                newInst[pos++] = instance.value(i);
            }
        }
        Instance ins = new Instance(instance.weight(), newInst);
        ins.setDataset(outputFormatPeek());
        vector.addElement(ins);
    }

    // Add dummy instances for keyphrases that don't occur
    // in the document
    if (hashKeysEval != null) {
        Iterator<String> phrases = hashKeysEval.keySet().iterator();
        while (phrases.hasNext()) {
            String phrase = phrases.next();
            double[] newInst = new double[instance.numAttributes() + numFeatures];
            int pos = 0;
            for (int i = 0; i < instance.numAttributes(); i++) {
                if (i == m_DocumentAtt) {
                    // log.info("Here: " + phrase);
                    // Add phrase
                    int index = outputFormatPeek().attribute(pos).addStringValue(phrase);
                    newInst[pos++] = (double) index;

                    // Add original version
                    index = outputFormatPeek().attribute(pos).addStringValue(phrase);
                    newInst[pos++] = (double) index;

                    // Add TFxIDF
                    newInst[pos++] = Instance.missingValue();

                    // Add distance
                    newInst[pos++] = Instance.missingValue();

                    // Add other features
                    if (m_Debug) {
                        if (m_KFused) {
                            newInst[pos++] = Instance.missingValue();
                        }
                    }
                    if (m_STDEVfeature) {
                        newInst[pos++] = Instance.missingValue();
                    }
                    if (m_NODEfeature) {
                        newInst[pos++] = Instance.missingValue();
                    }
                    if (m_LENGTHfeature) {
                        newInst[pos++] = Instance.missingValue();
                    }

                    // Add probability and rank
                    newInst[pos++] = -Double.MAX_VALUE;
                    // newInst[pos++] = Instance.missingValue();
                } else if (i == m_KeyphrasesAtt) {
                    newInst[pos++] = 1; // Keyphrase
                } else {
                    newInst[pos++] = instance.value(i);
                }

                Instance inst = new Instance(instance.weight(), newInst);
                inst.setDataset(outputFormatPeek());
                vector.addElement(inst);
            }

        }
    }

    // Sort phrases according to their distance (stable sort)
    double[] vals = new double[vector.size()];
    for (int i = 0; i < vals.length; i++) {
        vals[i] = ((Instance) vector.elementAt(i)).value(distAttIndex);
    }
    FastVector newVector = new FastVector(vector.size());
    int[] sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their tfxidf value (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = -((Instance) vector.elementAt(i)).value(tfidfAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their probability (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = 1 - ((Instance) vector.elementAt(i)).value(probsAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Compute rank of phrases. Check for subphrases that are ranked
    // lower than superphrases and assign probability -1 and set the
    // rank to Integer.MAX_VALUE
    int rank = 1;
    for (int i = 0; i < vals.length; i++) {
        Instance currentInstance = (Instance) vector.elementAt(i);
        // Short cut: if phrase very unlikely make rank very low and continue
        if (Utils.grOrEq(vals[i], 1.0)) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
            continue;
        }

        // Otherwise look for super phrase starting with first phrase
        // in list that has same probability, TFxIDF value, and distance as
        // current phrase. We do this to catch all superphrases
        // that have same probability, TFxIDF value and distance as current phrase.
        int startInd = i;
        while (startInd < vals.length) {
            Instance inst = (Instance) vector.elementAt(startInd);
            if ((inst.value(tfidfAttIndex) != currentInstance.value(tfidfAttIndex))
                    || (inst.value(probsAttIndex) != currentInstance.value(probsAttIndex))
                    || (inst.value(distAttIndex) != currentInstance.value(distAttIndex))) {
                break;
            }
            startInd++;
        }
        currentInstance.setValue(probsAttIndex + 1, rank++);

    }
    return vector;
}

From source file:com.openkm.kea.filter.KEAPhraseFilter.java

License:Open Source License

/** 
 * Converts an instance by removing all non-alphanumeric characters
 * from its string attribute values./*  w  w w . j  av  a 2s .c  o  m*/
 */
private void convertInstance(Instance instance) throws Exception {

    double[] instVals = new double[instance.numAttributes()];

    for (int i = 0; i < instance.numAttributes(); i++) {
        if (!instance.attribute(i).isString() || instance.isMissing(i)) {
            instVals[i] = instance.value(i);
        } else {
            if (!m_SelectCols.isInRange(i)) {
                int index = getOutputFormat().attribute(i).addStringValue(instance.stringValue(i));
                instVals[i] = (double) index;
                continue;
            }
            // aly: str = text of the document
            String str = instance.stringValue(i);

            String tokenized = tokenize(str);

            // aly: resultStr is the clean version of str
            // log.info(resultStr.toString());
            int index = getOutputFormat().attribute(i).addStringValue(tokenized);
            instVals[i] = (double) index;
        }
    }
    Instance inst = new Instance(instance.weight(), instVals);
    inst.setDataset(getOutputFormat());
    push(inst);
}

From source file:com.openkm.kea.filter.NumbersFilter.java

License:Open Source License

/** 
 * Converts an instance. A phrase boundary is inserted where
 * a number is found./*from  w ww  . jav a 2 s .  c om*/
 */
private void convertInstance(Instance instance) throws Exception {

    double[] instVals = new double[instance.numAttributes()];

    for (int i = 0; i < instance.numAttributes(); i++) {
        if ((!instance.attribute(i).isString()) || instance.isMissing(i)) {
            instVals[i] = instance.value(i);
        } else {
            String str = instance.stringValue(i);

            StringBuffer resultStr = new StringBuffer();
            StringTokenizer tok = new StringTokenizer(str, " \t\n", true);
            while (tok.hasMoreTokens()) {
                String token = tok.nextToken();

                // Everything that doesn't contain at least
                // one letter is considered to be a number
                boolean isNumber = true;
                for (int j = 0; j < token.length(); j++) {
                    if (Character.isLetter(token.charAt(j))) {
                        isNumber = false;
                        break;
                    }
                }
                if (!isNumber) {
                    resultStr.append(token);
                } else {
                    if (token.equals(" ") || token.equals("\t") || token.equals("\n")) {
                        resultStr.append(token);
                    } else {
                        resultStr.append(" \n ");
                    }
                }
            }
            int index = getOutputFormat().attribute(i).addStringValue(resultStr.toString());
            instVals[i] = (double) index;
        }
    }
    Instance inst = new Instance(instance.weight(), instVals);
    inst.setDataset(getOutputFormat());
    push(inst);
}

From source file:com.tum.classifiertest.FastRandomTree.java

License:Open Source License

/**
 * Computes class distribution of an instance using the FastRandomTree.<p>
 *
 * In Weka's RandomTree, the distributions were normalized so that all
 * probabilities sum to 1; this would abolish the effect of instance weights
 * on voting. In FastRandomForest 0.97 onwards, the distributions are
 * normalized by dividing with the number of instances going into a leaf.<p>
 * /*w  ww  . j  a  v a2 s.c om*/
 * @param instance the instance to compute the distribution for
 * @return the computed class distribution
 * @throws Exception if computation fails
 */
@Override
public double[] distributionForInstance(Instance instance) throws Exception {

    double[] returnedDist = null;

    if (m_Attribute > -1) { // ============================ node is not a leaf

        if (instance.isMissing(m_Attribute)) { // ---------------- missing value

            returnedDist = new double[m_MotherForest.getM_Info().numClasses()];
            // split instance up
            for (int i = 0; i < m_Successors.length; i++) {
                double[] help = m_Successors[i].distributionForInstance(instance);
                if (help != null) {
                    for (int j = 0; j < help.length; j++) {
                        returnedDist[j] += m_Prop[i] * help[j];
                    }
                }
            }

        } else if (m_MotherForest.getM_Info().attribute(m_Attribute).isNominal()) { // ------ nominal

            //returnedDist = m_Successors[(int) instance.value(m_Attribute)]
            //        .distributionForInstance(instance);

            // 0.99: new - binary splits (also) for nominal attributes
            if (instance.value(m_Attribute) == m_SplitPoint) {
                returnedDist = m_Successors[0].distributionForInstance(instance);
            } else {
                returnedDist = m_Successors[1].distributionForInstance(instance);
            }

        } else { // ------------------------------------------ numeric attributes

            if (instance.value(m_Attribute) < m_SplitPoint) {
                returnedDist = m_Successors[0].distributionForInstance(instance);
            } else {
                returnedDist = m_Successors[1].distributionForInstance(instance);
            }
        }

        return returnedDist;

    } else { // =============================================== node is a leaf

        return m_ClassProbs;

    }

}

From source file:core.DatabaseSaverEx.java

License:Open Source License

/**
 * inserts the given instance into the table.
 * //from  ww  w  . j av a 2  s .co m
 * @param inst the instance to insert
 * @throws Exception if something goes wrong
 */
public void writeInstance(Instance inst) throws Exception {

    StringBuffer insert = new StringBuffer();
    insert.append("INSERT INTO ");
    insert.append(m_tableName);
    insert.append(" VALUES ( ");
    if (m_id) {
        insert.append(m_count);
        insert.append(", ");
        m_count++;
    }
    for (int j = 0; j < inst.numAttributes(); j++) {
        if (inst.isMissing(j))
            insert.append("NULL");
        else {
            if ((inst.attribute(j)).isDate())
                insert.append("'" + m_DateFormat.format((long) inst.value(j)) + "'");
            else if ((inst.attribute(j)).isNumeric())
                insert.append(inst.value(j));
            else {
                String stringInsert = "'" + inst.stringValue(j) + "'";
                if (stringInsert.length() > 2)
                    stringInsert = stringInsert.replaceAll("''", "'");
                insert.append(stringInsert);
            }
        }
        if (j != inst.numAttributes() - 1)
            insert.append(", ");
    }
    insert.append(" )");
    //System.out.println(insert.toString());
    if (m_DataBaseConnection.update(insert.toString()) < 1) {
        throw new IOException("Tuple cannot be inserted.");
    } else {
        m_DataBaseConnection.close();
    }
}

From source file:de.ugoe.cs.cpdp.loader.NetgeneLoader.java

License:Apache License

@Override
public Instances load(File fileMetricsFile) {
    // first determine all files
    String path = fileMetricsFile.getParentFile().getAbsolutePath();
    String project = fileMetricsFile.getName().split("_")[0];
    File bugsFile = new File(path + "/" + project + "_bugs_per_file.csv");
    File networkMetrics = new File(path + "/" + project + "_network_metrics.csv");
    Instances metricsData = null;/*  w w  w.  j a v  a  2 s .com*/

    try {
        CSVLoader wekaCsvLoader = new CSVLoader();
        wekaCsvLoader.setSource(fileMetricsFile);
        metricsData = wekaCsvLoader.getDataSet();
        wekaCsvLoader.setSource(bugsFile);
        Instances bugsData = wekaCsvLoader.getDataSet();
        wekaCsvLoader.setSource(networkMetrics);
        Instances networkData = wekaCsvLoader.getDataSet();

        metricsData.setRelationName(project);

        // fix nominal attributes (i.e., NA values)
        for (int j = 2; j < networkData.numAttributes(); j++) {
            if (networkData.attribute(j).isNominal()) {
                String attributeName = networkData.attribute(j).name();
                double[] tmpVals = new double[networkData.size()];
                // get temporary values
                for (int i = 0; i < networkData.size(); i++) {
                    Instance inst = networkData.instance(i);
                    if (!inst.isMissing(j)) {
                        String val = networkData.instance(i).stringValue(j);
                        try {
                            tmpVals[i] = Double.parseDouble(val);
                        } catch (NumberFormatException e) {
                            // not a number, using 0.0;
                            tmpVals[i] = 0.0;
                        }
                    } else {
                        tmpVals[i] = 0.0;
                    }
                }
                // replace attribute
                networkData.deleteAttributeAt(j);
                networkData.insertAttributeAt(new Attribute(attributeName), j);
                for (int i = 0; i < networkData.size(); i++) {
                    networkData.instance(i).setValue(j, tmpVals[i]);
                }
            }
        }
        // fix string attributes
        for (int j = 2; j < networkData.numAttributes(); j++) {
            if (networkData.attribute(j).isString()) {
                String attributeName = networkData.attribute(j).name();
                double[] tmpVals = new double[networkData.size()];
                // get temporary values
                for (int i = 0; i < networkData.size(); i++) {
                    Instance inst = networkData.instance(i);
                    if (!inst.isMissing(j)) {
                        String val = networkData.instance(i).stringValue(j);
                        try {
                            tmpVals[i] = Double.parseDouble(val);
                        } catch (NumberFormatException e) {
                            // not a number, using 0.0;
                            tmpVals[i] = 0.0;
                        }
                    } else {
                        tmpVals[i] = 0.0;
                    }
                }
                // replace attribute
                networkData.deleteAttributeAt(j);
                networkData.insertAttributeAt(new Attribute(attributeName), j);
                for (int i = 0; i < networkData.size(); i++) {
                    networkData.instance(i).setValue(j, tmpVals[i]);
                }
            }
        }

        Map<String, Integer> filenames = new HashMap<>();
        for (int j = 0; j < metricsData.size(); j++) {
            filenames.put(metricsData.instance(j).stringValue(0), j);
        }
        // merge with network data
        int attributeIndex;
        for (int j = 2; j < networkData.numAttributes(); j++) {
            attributeIndex = metricsData.numAttributes();
            metricsData.insertAttributeAt(networkData.attribute(j), attributeIndex);
            for (int i = 0; i < networkData.size(); i++) {
                Integer instanceIndex = filenames.get(networkData.instance(i).stringValue(1));
                if (instanceIndex != null) {
                    metricsData.instance(instanceIndex).setValue(attributeIndex,
                            networkData.instance(i).value(j));
                }
            }
        }

        // add bug information
        attributeIndex = metricsData.numAttributes();
        final ArrayList<String> classAttVals = new ArrayList<String>();
        classAttVals.add("0");
        classAttVals.add("1");
        final Attribute classAtt = new Attribute("bug", classAttVals);
        metricsData.insertAttributeAt(classAtt, attributeIndex);
        for (int i = 0; i < bugsData.size(); i++) {
            if (bugsData.instance(i).value(2) > 0.0d) {
                Integer instanceIndex = filenames.get(bugsData.instance(i).stringValue(1));
                if (instanceIndex != null) {
                    metricsData.instance(instanceIndex).setValue(attributeIndex, 1.0);
                }
            }
        }

        // remove filenames
        metricsData.deleteAttributeAt(0);
        Attribute eigenvector = metricsData.attribute("eigenvector");
        if (eigenvector != null) {
            for (int j = 0; j < metricsData.numAttributes(); j++) {
                if (metricsData.attribute(j) == eigenvector) {
                    metricsData.deleteAttributeAt(j);
                }
            }
        }

        metricsData.setClassIndex(metricsData.numAttributes() - 1);

        // set all missing values to 0
        for (int i = 0; i < metricsData.size(); i++) {
            for (int j = 0; j < metricsData.numAttributes(); j++) {
                if (metricsData.instance(i).isMissing(j)) {
                    metricsData.instance(i).setValue(j, 0.0d);
                }
            }
        }
    } catch (IOException e) {
        Console.traceln(Level.SEVERE, "failure reading file: " + e.getMessage());
        metricsData = null;
    }
    return metricsData;
}

From source file:edu.drexel.psal.jstylo.verifiers.WLSVM.java

License:Open Source License

/**
 * Converts an ARFF Instance into a string in the sparse format accepted by
 * LIBSVM/*from   w ww .j  a va2  s .c  om*/
 * 
 * @param instance
 * @return
 */
protected String InstanceToSparse(Instance instance) {
    String line = new String();
    int c = (int) instance.classValue();
    if (c == 0)
        c = -1;
    line = c + " ";
    for (int j = 1; j < instance.numAttributes(); j++) {
        if (j - 1 == instance.classIndex()) {
            continue;
        }
        if (instance.isMissing(j - 1))
            continue;
        if (instance.value(j - 1) != 0)
            line += " " + j + ":" + instance.value(j - 1);
    }
    // LOG.info(line); 
    return (line + "\n");
}

From source file:en_deep.mlprocess.manipulation.featmodif.ReplaceMissing.java

License:Open Source License

/**
 * Convert a single instance over if the class is nominal. The converted
 * instance is added to the end of the output queue.
 *
 * @param instance the instance to convert
 */// w  w  w. j a va 2  s .c o m
private void convertInstance(Instance instance) {

    // create a copy of the input instance
    Instance inst = null;

    if (instance instanceof SparseInstance) {
        inst = new SparseInstance(instance.weight(), instance.toDoubleArray());
    } else {
        inst = new DenseInstance(instance.weight(), instance.toDoubleArray());
    }

    // copy the string values from this instance as well (only the existing ones)
    inst.setDataset(getOutputFormat());
    copyValues(inst, false, instance.dataset(), getOutputFormat()); // beware of weird behavior of this function (see source)!!
    inst.setDataset(getOutputFormat());

    // find the missing values to be filled + the double values for the new "missing" label and store it
    double[] vals = instance.toDoubleArray();

    for (int j = 0; j < getInputFormat().numAttributes(); j++) {

        Attribute att = instance.attribute(j);

        if (m_Columns.isInRange(j) && instance.isMissing(j)) {
            // find the "missing" value in the output nominal attribute
            if (att.isNominal()) {
                vals[j] = inst.dataset().attribute(j).indexOfValue(m_ReplVal);
            }
            // add a string value for the new "missing" label
            else if (att.isString()) {
                vals[j] = inst.dataset().attribute(j).addStringValue(m_ReplVal);
            }
        }
    }

    // fill in the missing values found
    inst.replaceMissingValues(vals);

    push(inst);
}

From source file:etc.aloe.filters.AbstractRegexFilter.java

License:Open Source License

@Override
protected Instance process(Instance instance) throws Exception {
    if (stringAttributeIndex < 0) {
        throw new IllegalStateException("String attribute not set");
    }//from www  .j a v  a  2s.  c o  m

    String stringValue = instance.stringValue(stringAttributeIndex);
    NamedRegex[] regexFeatures = getRegexFeatures();

    int numOldValues = instance.numAttributes();
    int numNewFeatures = regexFeatures.length;
    if (countRegexLengths) {
        numNewFeatures = regexFeatures.length * 2;
    }
    double[] newValues = new double[numOldValues + numNewFeatures];

    // Copy all attributes from input to output
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (getInputFormat().attribute(i).type() != Attribute.STRING) {
            // Add simple nominal and numeric attributes directly
            if (instance.value(i) != 0.0) {
                newValues[i] = instance.value(i);
            }
        } else {
            if (instance.isMissing(i)) {
                newValues[i] = Utils.missingValue();
            } else {

                // If this is a string attribute, we have to first add
                // this value to the range of possible values, then add
                // its new internal index.
                if (outputFormatPeek().attribute(i).numValues() == 0) {
                    // Note that the first string value in a
                    // SparseInstance doesn't get printed.
                    outputFormatPeek().attribute(i).addStringValue("Hack to defeat SparseInstance bug");
                }
                int newIndex = outputFormatPeek().attribute(i).addStringValue(instance.stringValue(i));
                newValues[i] = newIndex;
            }
        }
    }

    for (int i = 0; i < regexFeatures.length; i++) {
        Pattern pattern = regexFeatures[i].getPattern();

        Matcher matches = pattern.matcher(stringValue);
        int count = 0;
        int maxLength = 0;
        while (matches.find()) {
            count++;
            int len = matches.group().length();
            if (len > maxLength) {
                maxLength = len;
            }
        }

        int index = numOldValues + i;
        if (countRegexLengths) {
            index = numOldValues + 2 * i;
        }
        newValues[index] = count;

        if (countRegexLengths) {
            newValues[numOldValues + 2 * i + 1] = maxLength;
        }
    }

    Instance result = new SparseInstance(instance.weight(), newValues);
    return result;
}