Example usage for weka.core Instances attribute

List of usage examples for weka.core Instances attribute

Introduction

In this page you can find the example usage for weka.core Instances attribute.

Prototype

public Attribute attribute(String name)

Source Link

Document

Returns an attribute given its name.

Usage

From source file:es.bsc.autonomic.powermodeller.tools.classifiers.WekaWrapper.java

License:Apache License

/**
 * Derives new variables in a {@link DataSet} according to the given
 * {@link VariableParser}, keeps only the columns listed by the parser
 * (plus the independent variable), and writes the result to a fresh CSV file.
 *
 * @param ds     input data set; its independent variable must be set
 * @param parser supplies the new-variable expressions and the columns to keep
 * @return a new DataSet backed by the generated CSV, with the same
 *         independent variable as the input
 * @throws WekaWrapperException if the independent variable is unset or any
 *         filtering/saving step fails
 */
public static DataSet processDataSet(DataSet ds, VariableParser parser) {

    String independent = ds.getIndependent();

    if (independent == null)
        throw new WekaWrapperException("Independent variable is not set in dataset.");

    HashMap<String, String> expression_list = parser.getNewMetrics();
    Instances data = convertDataSetToInstances(ds);

    try {
        // Apply an AddExpression filter for each derived variable.
        for (Map.Entry<String, String> entry : expression_list.entrySet()) {
            String key = entry.getKey();
            String value = entry.getValue();
            logger.debug("Generating new variable " + key + " as " + value);

            AddExpression add_filter = new AddExpression();
            add_filter.setName(key);
            add_filter.setExpression(value);
            add_filter.setInputFormat(data);

            data = useFilter(data, add_filter);
        }

    } catch (Exception e) {
        logger.error("Error while processing new variables", e);
        throw new WekaWrapperException("Error while processing new variables");
    }

    // Iterate over all the columns and keep only the ones contained in variables list
    List<String> variables = parser.getColumns();

    // Append independent variable to the list of variables to keep
    variables.add(independent);

    // Remove unneeded attributes
    try {
        // It's important to iterate from last to first, because deleting an
        // attribute shifts the indices of all attributes after it.
        for (int i = data.numAttributes() - 1; i >= 0; i--) {
            String name = data.attribute(i).name();
            if (!variables.contains(name)) {
                logger.trace("Deleting unnecessary attribute " + name);
                data.deleteAttributeAt(i);
            }
        }
    } catch (Exception e) {
        logger.error("Error while removing unneeded variables", e);
        throw new WekaWrapperException("Error while removing unneeded variables");
    }

    // Convert Instances to CSV and return the new DataSet
    String new_path = CoreConfiguration.getNewCSVFileName();
    try {
        CSVSaver saver = new CSVSaver();
        saver.setInstances(data);
        saver.setFile(new File(new_path));
        saver.writeBatch();
    } catch (Exception e) {
        // Fixed copy-paste error: this failure concerns writing the CSV,
        // not removing variables.
        logger.error("Error while writing filtered data set to CSV", e);
        throw new WekaWrapperException("Error while writing filtered data set to CSV");
    }

    DataSet ret = new DataSet(new_path);
    ret.setIndependent(independent);
    return ret;
}

From source file:es.bsc.autonomic.powermodeller.tools.classifiers.WekaWrapper.java

License:Apache License

/**
 * Loads the file behind the given {@link DataSet} into a Weka
 * {@link Instances} object and marks the data set's independent variable
 * as the class attribute.
 *
 * @param ds data set whose file path and independent variable are used
 * @return the loaded instances with the class index set
 * @throws WekaWrapperException if the file cannot be read or parsed
 */
public static Instances convertDataSetToInstances(DataSet ds) {

    Instances result;
    try {
        // Read every instance from the backing file.
        ConverterUtils.DataSource dataSource = new ConverterUtils.DataSource(ds.getFilePath());
        result = dataSource.getDataSet();

        // The independent variable acts as the class attribute.
        result.setClassIndex(result.attribute(ds.getIndependent()).index());

    } catch (Exception e) {
        logger.error("Error while reading input DataSet", e);
        throw new WekaWrapperException("Error while reading input DataSet");
    }

    return result;
}

From source file:es.jarias.FMC.FMC.java

License:Open Source License

/**
 * Computes the pairwise empirical mutual information MI(X_i; X_j) between
 * the nominal attributes at the given indexes. Diagonal entries of the
 * returned matrix are left at 0.
 *
 * @param data    instances to count over
 * @param indexes attribute indexes (attributes must be nominal)
 * @return matrix I with I[i][j] = MI between attributes indexes[i] and indexes[j]
 */
public static double[][] mutualInfo(Instances data, int[] indexes) {

    int k = indexes.length;

    double[][] m_counts = new double[k][];       // marginal counts per attribute value
    double[][][] m_2counts = new double[k][k][]; // joint counts, flattened value pair -> count
    double[] nValues = new double[k];            // cardinality of each attribute
    double[][] I = new double[k][k];

    for (int i = 0; i < k; i++) {
        nValues[i] = data.attribute(indexes[i]).numValues();
        m_counts[i] = new double[(int) nValues[i]];
    }

    // Allocate the flattened joint-count tables for every ordered pair.
    for (int i = 0; i < k; i++) {
        for (int j = 0; j < k; j++) {
            if (i != j) {
                m_2counts[i][j] = new double[(int) (nValues[i] * nValues[j])];
            }
        }
    }

    // Accumulate marginal and joint counts over all instances.
    for (Instance d : data) {
        for (int i = 0; i < k; i++) {
            m_counts[i][(int) d.value(indexes[i])]++;
            for (int j = 0; j < k; j++) {
                if (i != j) {
                    // Flattened index: value_j * |values_i| + value_i.
                    int index = (int) (d.value(indexes[j]) * nValues[i] + d.value(indexes[i]));
                    m_2counts[i][j][index]++;
                }
            }
        }
    }

    // MI(X_i; X_j) = sum over value pairs of p(vi,vj) * log(p(vi,vj) / (p(vi) p(vj))),
    // computed here from raw counts: (N * joint) / (marg_i * marg_j).
    double n = data.numInstances();
    for (int i = 0; i < k; i++) {
        for (int j = 0; j < k; j++) {
            if (i != j) {
                double mi = 0.0;
                for (int v_i = 0; v_i < nValues[i]; v_i++) {
                    for (int v_j = 0; v_j < nValues[j]; v_j++) {
                        double joint = m_2counts[i][j][(int) (v_j * nValues[i] + v_i)];
                        double ratio = (1.0 * n * joint) / (1.0 * m_counts[i][v_i] * m_counts[j][v_j]);
                        // Zero joint counts give ratio 0 (or NaN when a marginal
                        // is also 0) and are skipped, matching 0*log(0) = 0.
                        if (ratio > 0) {
                            mi += joint * Math.log(ratio);
                        }
                    }
                }
                I[i][j] = mi / n;
            }
        }
    }

    return I;
}

From source file:etc.aloe.cscw2013.SMOFeatureWeighting.java

License:Open Source License

@Override
public List<Entry<String, Double>> getFeatureWeights(ExampleSet trainingExamples, Model model) {
    WekaModel wekaModel = (WekaModel) model;
    Classifier classifier = wekaModel.getClassifier();
    Instances dataFormat = trainingExamples.getInstances();

    SMO smo = getSMO(classifier);/*ww w. j  ava 2 s  .c  o  m*/

    double[] sparseWeights = smo.sparseWeights()[0][1];
    int[] sparseIndices = smo.sparseIndices()[0][1];

    Map<String, Double> weights = new HashMap<String, Double>();
    for (int i = 0; i < sparseWeights.length; i++) {
        int index = sparseIndices[i];
        double weight = sparseWeights[i];
        String name = dataFormat.attribute(index).name();
        weights.put(name, weight);
    }

    List<Map.Entry<String, Double>> entries = new ArrayList<Map.Entry<String, Double>>(weights.entrySet());

    Collections.sort(entries, new Comparator<Map.Entry<String, Double>>() {
        @Override
        public int compare(Map.Entry<String, Double> o1, Map.Entry<String, Double> o2) {
            return o1.getKey().compareTo(o2.getKey());
        }
    });

    return entries;
}

From source file:etc.aloe.data.SegmentSet.java

License:Open Source License

/**
 * Convert the segment set into an ExampleSet (ready for feature
 * extraction). The returned example set includes an id attribute, the
 * message text, a label attribute, and several basic features extracted
 * from the segment.
 *
 * @return
 */
/**
 * Converts this segment set into an {@link ExampleSet} ready for feature
 * extraction: one instance per segment carrying an id, the concatenated
 * message text, the true/false label (class attribute), the concatenated
 * participants, and the basic duration/length/cps/rate features.
 *
 * @return an ExampleSet wrapping one instance per segment
 */
public ExampleSet getBasicExamples() {
    ArrayList<Attribute> attributes = new ArrayList<Attribute>();

    attributes.add(new Attribute(ExampleSet.ID_ATTR_NAME));
    attributes.add(new Attribute(ExampleSet.MESSAGE_ATTR_NAME, (List<String>) null));
    attributes.add(new Attribute(ExampleSet.LABEL_ATTR_NAME, Arrays.asList(new String[] { "false", "true" })));
    attributes.add(new Attribute(ExampleSet.PARTICIPANT_ATTR_NAME, (List<String>) null));
    attributes.add(new Attribute(DURATION_ATTR_NAME));
    attributes.add(new Attribute(LENGTH_ATTR_NAME));
    attributes.add(new Attribute(CPS_ATTR_NAME));
    attributes.add(new Attribute(RATE_ATTR_NAME));

    Instances examples = new Instances("BasicExamples", attributes, 0);
    examples.setClassIndex(2); // the label attribute is the class

    // Look the attributes back up by name so the value assignments below
    // do not depend on positional indices.
    Attribute idAttr = examples.attribute(ExampleSet.ID_ATTR_NAME);
    Attribute messageAttr = examples.attribute(ExampleSet.MESSAGE_ATTR_NAME);
    Attribute labelAttr = examples.attribute(ExampleSet.LABEL_ATTR_NAME);
    Attribute participantAttr = examples.attribute(ExampleSet.PARTICIPANT_ATTR_NAME);
    Attribute durationAttr = examples.attribute(DURATION_ATTR_NAME);
    Attribute lengthAttr = examples.attribute(LENGTH_ATTR_NAME);
    Attribute cpsAttr = examples.attribute(CPS_ATTR_NAME);
    Attribute rateAttr = examples.attribute(RATE_ATTR_NAME);

    for (int idx = 0; idx < size(); idx++) {
        Segment segment = get(idx);
        Instance example = new DenseInstance(examples.numAttributes());

        String messageText = segment.concatMessages();
        String participantText = segment.concatParticipants();

        example.setValue(idAttr, segment.getId());
        example.setValue(messageAttr, messageText);
        example.setValue(participantAttr, participantText);

        // Unlabeled segments leave the class value missing.
        if (segment.hasTrueLabel()) {
            example.setValue(labelAttr, segment.getTrueLabel() ? "true" : "false");
        }

        computeRateValues(segment, example, messageText, durationAttr, lengthAttr, cpsAttr, rateAttr);

        examples.add(example);
    }

    return new ExampleSet(examples);
}

From source file:etc.aloe.filters.AbstractRegexFilter.java

License:Open Source License

@Override
protected Instances determineOutputFormat(Instances inputFormat) throws Exception {
    if (stringAttributeName == null) {
        throw new IllegalStateException("String attribute name not set");
    }//from  w ww  . j  a  v  a 2  s . co  m

    Instances outputFormat = new Instances(inputFormat, 0);

    Attribute stringAttr = inputFormat.attribute(stringAttributeName);
    stringAttributeIndex = stringAttr.index();

    //Add the new columns. There is one for each regex feature.
    NamedRegex[] regexFeatures = getRegexFeatures();
    for (int i = 0; i < regexFeatures.length; i++) {
        String name = regexFeatures[i].getName();
        Attribute attr = new Attribute(name);
        outputFormat.insertAttributeAt(attr, outputFormat.numAttributes());

        if (countRegexLengths) {
            name = name + "_L";
            attr = new Attribute(name);
            outputFormat.insertAttributeAt(attr, outputFormat.numAttributes());
        }

    }

    return outputFormat;
}

From source file:etc.aloe.filters.SimpleStringToWordVector.java

License:Open Source License

@Override
public boolean setInputFormat(Instances instanceInfo) throws Exception {
    if (stringAttributeName == null) {
        throw new IllegalStateException("String attribute name was not set");
    }/*from   ww w  .  java2  s  . co m*/

    Attribute stringAttr = instanceInfo.attribute(stringAttributeName);
    if (stringAttr == null) {
        throw new IllegalStateException("Attribute " + stringAttributeName + " does not exist");
    }

    this.setAttributeIndicesArray(new int[] { stringAttr.index() });

    return super.setInputFormat(instanceInfo);
}

From source file:etc.aloe.filters.StringToDictionaryVector.java

License:Open Source License

@Override
protected Instances determineOutputFormat(Instances inputFormat) throws Exception {
    if (getStringAttribute() == null) {
        throw new IllegalStateException("String attribute name not set");
    }/*from w  w  w . ja  va  2  s . c o m*/

    stringAttributeIndex = inputFormat.attribute(getStringAttribute()).index();

    inputFormat = getInputFormat();
    //This generates m_selectedTerms and m_DocsCounts
    int[] docsCountsByTermIdx = determineDictionary(inputFormat);

    //Initialize the output format to be just like the input
    Instances outputFormat = new Instances(inputFormat, 0);

    //Set up the map from attr index to document frequency
    m_DocsCounts = new int[m_selectedTerms.size()];
    //And add the new attributes
    for (int i = 0; i < m_selectedTerms.size(); i++) {
        int attrIdx = outputFormat.numAttributes();
        int docsCount = docsCountsByTermIdx[i];
        m_DocsCounts[i] = docsCount;

        outputFormat.insertAttributeAt(new Attribute(m_Prefix + m_selectedTerms.get(i)), attrIdx);
    }

    return outputFormat;
}

From source file:etc.aloe.filters.StringToDictionaryVector.java

License:Open Source License

/**
 * Counts term occurrences over all instances (optionally on a per-class
 * basis), prunes terms whose count falls below the frequency threshold,
 * stores the surviving terms in m_selectedTerms / m_selectedTermsTrie /
 * m_selectedTermIndices, and returns the document frequency of each
 * selected term.
 *
 * @param instances the instances whose string attribute is scanned
 * @return document counts indexed by position in m_selectedTerms
 * @throws IllegalStateException if stringAttributeIndex has not been set
 */
private int[] determineDictionary(Instances instances) {
    if (stringAttributeIndex < 0) {
        throw new IllegalStateException("String attribute index not valid");
    }

    // Operate on a per-class basis if class attribute is set
    int classInd = instances.classIndex();
    int values = 1;
    if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
        values = instances.attribute(classInd).numValues();
    }

    // Map each dictionary term to its index in termList.
    HashMap<String, Integer> termIndices = new HashMap<String, Integer>();
    for (int i = 0; i < termList.size(); i++) {
        termIndices.put(termList.get(i), i);
    }

    //Create the trie for matching terms
    Trie termTrie = new Trie(termList);

    //Initialize the dictionary/count map (one map per class value)
    ArrayList<HashMap<Integer, Count>> termCounts = new ArrayList<HashMap<Integer, Count>>();
    for (int z = 0; z < values; z++) {
        termCounts.add(new HashMap<Integer, Count>());
    }

    //Go through all the instances and count the emoticons
    for (int i = 0; i < instances.numInstances(); i++) {
        Instance instance = instances.instance(i);
        // vInd selects which per-class count map to update (0 when not
        // operating per class).
        int vInd = 0;
        if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
            vInd = (int) instance.classValue();
        }

        //Get the string attribute to examine
        String stringValue = instance.stringValue(stringAttributeIndex);

        HashMap<Integer, Count> termCountsForClass = termCounts.get(vInd);

        HashMap<String, Integer> termMatches = termTrie.countNonoverlappingMatches(stringValue);
        for (Map.Entry<String, Integer> entry : termMatches.entrySet()) {
            String term = entry.getKey();
            int termIdx = termIndices.get(term);

            int matches = entry.getValue();

            Count count = termCountsForClass.get(termIdx);
            if (count == null) {
                count = new Count(0);
                termCountsForClass.put(termIdx, count);
            }

            if (matches > 0) {
                // docCount: number of documents containing the term;
                // count: total number of occurrences.
                count.docCount += 1;
                count.count += matches;
            }
        }
    }

    // Figure out the minimum required word frequency, per class value
    int prune[] = new int[values];
    for (int z = 0; z < values; z++) {
        HashMap<Integer, Count> termCountsForClass = termCounts.get(z);

        int array[] = new int[termCountsForClass.size()];
        int pos = 0;
        for (Map.Entry<Integer, Count> entry : termCountsForClass.entrySet()) {
            array[pos] = entry.getValue().count;
            pos++;
        }

        // sort the array
        sortArray(array);

        if (array.length < m_WordsToKeep) {
            // if there aren't enough words, set the threshold to
            // minFreq
            prune[z] = m_minTermFreq;
        } else {
            // otherwise set it to the count of the m_WordsToKeep-th most
            // frequent term, but at least minFreq
            prune[z] = Math.max(m_minTermFreq, array[array.length - m_WordsToKeep]);
        }
    }

    // Add the word vector attributes (eliminating duplicates
    // that occur in multiple classes)
    HashSet<String> selectedTerms = new HashSet<String>();
    for (int z = 0; z < values; z++) {
        HashMap<Integer, Count> termCountsForClass = termCounts.get(z);

        for (Map.Entry<Integer, Count> entry : termCountsForClass.entrySet()) {
            int termIndex = entry.getKey();
            String term = termList.get(termIndex);
            Count count = entry.getValue();
            if (count.count >= prune[z]) {
                selectedTerms.add(term);
            }
        }
    }

    //Save the selected terms as a list
    this.m_selectedTerms = new ArrayList<String>(selectedTerms);
    this.m_selectedTermsTrie = new Trie(this.m_selectedTerms);
    this.m_NumInstances = instances.size();

    //Construct the selected terms to index map
    this.m_selectedTermIndices = new HashMap<String, Integer>();
    for (int i = 0; i < m_selectedTerms.size(); i++) {
        m_selectedTermIndices.put(m_selectedTerms.get(i), i);
    }

    // Compute document frequencies, organized by selected term index (not original term index)
    int[] docsCounts = new int[m_selectedTerms.size()];
    for (int i = 0; i < m_selectedTerms.size(); i++) {
        String term = m_selectedTerms.get(i);
        int termIndex = termIndices.get(term);
        int docsCount = 0;
        // Sum the term's document count across all class partitions.
        for (int z = 0; z < values; z++) {
            HashMap<Integer, Count> termCountsForClass = termCounts.get(z);

            Count count = termCountsForClass.get(termIndex);
            if (count != null) {
                docsCount += count.docCount;
            }
        }
        docsCounts[i] = docsCount;
    }
    return docsCounts;
}

From source file:etc.aloe.filters.StringToDictionaryVector.java

License:Open Source License

/**
 * Small demo: builds a four-message data set, applies
 * StringToDictionaryVector with an emoticon dictionary twice, and prints
 * both filtered results so they can be compared.
 */
public static void main(String[] args) {

    // Build a test dataset: a string message, a numeric id, a binary class.
    ArrayList<Attribute> attributes = new ArrayList<Attribute>();
    attributes.add(new Attribute("message", (ArrayList<String>) null));
    attributes.add(new Attribute("id"));
    {
        ArrayList<String> classValues = new ArrayList<String>();
        classValues.add("0");
        classValues.add("1");
        attributes.add(new Attribute("class", classValues));
    }

    Instances dataset = new Instances("test", attributes, 0);
    dataset.setClassIndex(2);

    String[] messages = new String[] { "No emoticons here", "I have a smiley :)",
            "Two smileys and a frownie :) :) :(", "Several emoticons :( :-( :) :-) ;-) 8-) :-/ :-P" };

    for (int row = 0; row < messages.length; row++) {
        Instance example = new DenseInstance(dataset.numAttributes());
        example.setValue(dataset.attribute(0), messages[row]);
        example.setValue(dataset.attribute(1), row);
        example.setValue(dataset.attribute(2), Integer.toString(row % 2));
        dataset.add(example);
    }

    System.out.println("Before filter:");
    for (int row = 0; row < dataset.size(); row++) {
        System.out.println(dataset.instance(row).toString());
    }

    try {
        String dictionaryName = "emoticons.txt";
        StringToDictionaryVector filter = new StringToDictionaryVector();
        List<String> termList = StringToDictionaryVector.readDictionaryFile(new File(dictionaryName));
        filter.setTermList(termList);
        filter.setMinTermFreq(1);
        filter.setTFTransform(true);
        filter.setIDFTransform(true);
        filter.setNormalizeDocLength(new SelectedTag(FILTER_NORMALIZE_TEST_ONLY, TAGS_FILTER));
        filter.setOutputWordCounts(true);
        filter.setStringAttribute("message");

        filter.setInputFormat(dataset);
        Instances firstPass = Filter.useFilter(dataset, filter);
        Instances secondPass = Filter.useFilter(dataset, filter);

        System.out.println("\nFirst application:");
        System.out.println(firstPass.toString());

        System.out.println("\nSecond application:");
        System.out.println(secondPass.toString());

    } catch (Exception e) {
        e.printStackTrace();
    }
}