List of usage examples for weka.core Instances attribute
public Attribute attribute(String name)
From source file:es.bsc.autonomic.powermodeller.tools.classifiers.WekaWrapper.java
License:Apache License
public static DataSet processDataSet(DataSet ds, VariableParser parser) { String independent = ds.getIndependent(); if (independent == null) throw new WekaWrapperException("Independent variable is not set in dataset."); HashMap<String, String> expression_list = parser.getNewMetrics(); Instances data = convertDataSetToInstances(ds); try {/*from w w w .java2s. c o m*/ // Apply filters for all the new variables for (Map.Entry<String, String> entry : expression_list.entrySet()) { String key = entry.getKey(); String value = entry.getValue(); logger.debug("Generating new variable " + key + " as " + value); AddExpression add_filter = new AddExpression(); add_filter.setName(key); add_filter.setExpression(value); add_filter.setInputFormat(data); data = useFilter(data, add_filter); } } catch (Exception e) { logger.error("Error while processing new variables", e); throw new WekaWrapperException("Error while processing new variables"); } // Iterate over all the columns and keep only the ones contained in variables list List<String> variables = parser.getColumns(); // Append independent variable to the list of variables to keep variables.add(independent); // Remove unneeded attributes try { // it's important to iterate from last to first, because when we remove // an instance, the rest shifts by one position. 
for (int i = data.numAttributes() - 1; i >= 0; i--) { String n = data.attribute(i).name(); if (!variables.contains(data.attribute(i).name())) { logger.trace("Deleting unnecessary attribute " + data.attribute(i).name()); data.deleteAttributeAt(i); } } data.toString(); } catch (Exception e) { logger.error("Error while removing unneeded variables", e); throw new WekaWrapperException("Error while removing unneeded variables"); } // Convert Instances in csv and return the new DataSet String new_path = CoreConfiguration.getNewCSVFileName(); try { CSVSaver saver = new CSVSaver(); saver.setInstances(data); saver.setFile(new File(new_path)); saver.writeBatch(); } catch (Exception e) { logger.error("Error while removing unneeded variables", e); throw new WekaWrapperException("Error while removing unneeded variables"); } DataSet ret = new DataSet(new_path); ret.setIndependent(independent); return ret; }
From source file:es.bsc.autonomic.powermodeller.tools.classifiers.WekaWrapper.java
License:Apache License
public static Instances convertDataSetToInstances(DataSet ds) { Instances instances; try {// w w w .j a v a2 s .c o m // Read all the instances in the file and initialize data ConverterUtils.DataSource source = new ConverterUtils.DataSource(ds.getFilePath()); instances = source.getDataSet(); instances.setClassIndex(instances.attribute(ds.getIndependent()).index()); } catch (Exception e) { logger.error("Error while reading input DataSet", e); throw new WekaWrapperException("Error while reading input DataSet"); } return instances; }
From source file:es.jarias.FMC.FMC.java
License:Open Source License
/**
 * Computes the pairwise mutual information matrix between the attributes of
 * {@code data} selected by {@code indexes}.
 *
 * Counts marginal and joint value frequencies over all instances, then
 * evaluates I(X_i; X_j) = sum p(v_i, v_j) * log( p(v_i, v_j) / (p(v_i) p(v_j)) )
 * using raw counts (the division by numInstances is factored out and applied
 * once at the end; the log term is left un-normalized by the same factor).
 *
 * NOTE(review): attributes are assumed nominal — values are used directly as
 * array indices via (int) d.value(...); verify callers only pass nominal
 * attribute indexes.
 *
 * @param data    instances to measure over
 * @param indexes attribute indices to compare pairwise
 * @return matrix I where I[i][j] holds the MI score for the pair; the
 *         diagonal is left at 0.0
 */
public static double[][] mutualInfo(Instances data, int[] indexes) {
    // m_counts[i][v]: number of instances where attribute indexes[i] has value v.
    double[][] m_counts = new double[indexes.length][];
    // m_2counts[i][j][v_j * nValues[i] + v_i]: joint count for the pair (i, j),
    // with the two value indices packed into a single flat index.
    double[][][] m_2counts = new double[indexes.length][indexes.length][];
    double[] nValues = new double[indexes.length];
    double[][] I = new double[indexes.length][indexes.length];

    // Size the marginal count arrays by each attribute's cardinality.
    for (int i = 0; i < indexes.length; i++) {
        nValues[i] = data.attribute(indexes[i]).numValues();
        m_counts[i] = new double[(int) nValues[i]];
    }

    // Size the joint count arrays (cardinality_i * cardinality_j per pair).
    for (int i = 0; i < indexes.length; i++) {
        for (int j = 0; j < indexes.length; j++) {
            if (i != j) {
                double cardinality = nValues[i] * nValues[j];
                m_2counts[i][j] = new double[(int) cardinality];
            }
        }
    }

    // Compute counts: one pass over the data fills both marginal and joint counts.
    for (Instance d : data) {
        for (int i = 0; i < indexes.length; i++) {
            m_counts[i][(int) d.value(indexes[i])]++;
            for (int j = 0; j < indexes.length; j++) {
                if (i != j) {
                    // Flat joint index: value_j * |values_i| + value_i.
                    int index = (int) (d.value(indexes[j]) * nValues[i] + d.value(indexes[i]));
                    m_2counts[i][j][index]++;
                }
            }
        }
    }

    // Calculate MI(X_i; X_j) for every ordered pair.
    for (int i = 0; i < indexes.length; i++) {
        for (int j = 0; j < indexes.length; j++) {
            if (i != j) {
                double mi = 0.0;
                for (int v_i = 0; v_i < nValues[i]; v_i++) {
                    for (int v_j = 0; v_j < nValues[j]; v_j++) {
                        // The ratio test (> 0) skips zero joint counts, which
                        // would otherwise produce log(0); a zero marginal count
                        // implies a zero joint count, so 0/0 is also skipped.
                        if ((1.0 * data.numInstances()
                                * m_2counts[i][j][(int) (v_j * nValues[i] + v_i)])
                                / (1.0 * m_counts[i][v_i] * m_counts[j][v_j]) > 0)
                            mi += m_2counts[i][j][(int) (v_j * nValues[i] + v_i)]
                                    * Math.log((1.0 * data.numInstances()
                                            * m_2counts[i][j][(int) (v_j * nValues[i] + v_i)])
                                            / (1.0 * m_counts[i][v_i] * m_counts[j][v_j]));
                    }
                }
                // Normalize by the number of instances (counts -> probabilities).
                I[i][j] = mi / data.numInstances();
            }
        }
    }
    return I;
}
From source file:etc.aloe.cscw2013.SMOFeatureWeighting.java
License:Open Source License
/**
 * Extracts the per-feature weights of the trained SMO classifier and returns
 * them as (attribute name, weight) entries sorted alphabetically by name.
 *
 * @param trainingExamples supplies the instance header used to resolve
 *        attribute names by index
 * @param model            a WekaModel wrapping an SMO-based classifier
 * @return entries sorted by attribute name
 */
@Override
public List<Entry<String, Double>> getFeatureWeights(ExampleSet trainingExamples, Model model) {
    WekaModel wekaModel = (WekaModel) model;
    Classifier classifier = wekaModel.getClassifier();
    Instances dataFormat = trainingExamples.getInstances();
    SMO smo = getSMO(classifier);

    // Sparse weight vector for the first binary sub-problem (class pair 0/1).
    double[] sparseWeights = smo.sparseWeights()[0][1];
    int[] sparseIndices = smo.sparseIndices()[0][1];

    // Resolve each sparse index to its attribute name.
    Map<String, Double> weightsByName = new HashMap<String, Double>();
    for (int pos = 0; pos < sparseWeights.length; pos++) {
        String attributeName = dataFormat.attribute(sparseIndices[pos]).name();
        weightsByName.put(attributeName, sparseWeights[pos]);
    }

    // Sort the entries alphabetically by attribute name.
    List<Map.Entry<String, Double>> sorted =
            new ArrayList<Map.Entry<String, Double>>(weightsByName.entrySet());
    Collections.sort(sorted, new Comparator<Map.Entry<String, Double>>() {
        @Override
        public int compare(Map.Entry<String, Double> a, Map.Entry<String, Double> b) {
            return a.getKey().compareTo(b.getKey());
        }
    });
    return sorted;
}
From source file:etc.aloe.data.SegmentSet.java
License:Open Source License
/**
 * Converts the segment set into an ExampleSet ready for feature extraction.
 *
 * Each example carries an id, the concatenated message text, a nominal
 * true/false label (set only when the segment has a true label), the
 * concatenated participants, and four basic rate features (duration, length,
 * characters-per-second, rate) filled in by {@code computeRateValues}.
 *
 * @return the basic example set
 */
public ExampleSet getBasicExamples() {
    // Header: id, message, label (class), participant, plus four numeric features.
    ArrayList<Attribute> attributes = new ArrayList<Attribute>();
    attributes.add(new Attribute(ExampleSet.ID_ATTR_NAME));
    attributes.add(new Attribute(ExampleSet.MESSAGE_ATTR_NAME, (List<String>) null));
    attributes.add(new Attribute(ExampleSet.LABEL_ATTR_NAME, Arrays.asList(new String[] { "false", "true" })));
    attributes.add(new Attribute(ExampleSet.PARTICIPANT_ATTR_NAME, (List<String>) null));
    attributes.add(new Attribute(DURATION_ATTR_NAME));
    attributes.add(new Attribute(LENGTH_ATTR_NAME));
    attributes.add(new Attribute(CPS_ATTR_NAME));
    attributes.add(new Attribute(RATE_ATTR_NAME));

    Instances instances = new Instances("BasicExamples", attributes, 0);
    instances.setClassIndex(2); // the label attribute is the class

    // Look the attributes back up by name so value-setting below is readable.
    Attribute idAttr = instances.attribute(ExampleSet.ID_ATTR_NAME);
    Attribute messageAttr = instances.attribute(ExampleSet.MESSAGE_ATTR_NAME);
    Attribute labelAttr = instances.attribute(ExampleSet.LABEL_ATTR_NAME);
    Attribute participantAttr = instances.attribute(ExampleSet.PARTICIPANT_ATTR_NAME);
    Attribute durationAttr = instances.attribute(DURATION_ATTR_NAME);
    Attribute lengthAttr = instances.attribute(LENGTH_ATTR_NAME);
    Attribute cpsAttr = instances.attribute(CPS_ATTR_NAME);
    Attribute rateAttr = instances.attribute(RATE_ATTR_NAME);

    for (int idx = 0; idx < size(); idx++) {
        Segment segment = get(idx);
        Instance example = new DenseInstance(instances.numAttributes());

        String messageStr = segment.concatMessages();
        String participantStr = segment.concatParticipants();

        example.setValue(idAttr, segment.getId());
        example.setValue(messageAttr, messageStr);
        example.setValue(participantAttr, participantStr);

        // Label is optional: leave it missing when no true label exists.
        if (segment.hasTrueLabel()) {
            String label = segment.getTrueLabel() ? "true" : "false";
            example.setValue(labelAttr, label);
        }

        computeRateValues(segment, example, messageStr, durationAttr, lengthAttr, cpsAttr, rateAttr);

        instances.add(example);
    }

    return new ExampleSet(instances);
}
From source file:etc.aloe.filters.AbstractRegexFilter.java
License:Open Source License
@Override protected Instances determineOutputFormat(Instances inputFormat) throws Exception { if (stringAttributeName == null) { throw new IllegalStateException("String attribute name not set"); }//from w ww . j a v a 2 s . co m Instances outputFormat = new Instances(inputFormat, 0); Attribute stringAttr = inputFormat.attribute(stringAttributeName); stringAttributeIndex = stringAttr.index(); //Add the new columns. There is one for each regex feature. NamedRegex[] regexFeatures = getRegexFeatures(); for (int i = 0; i < regexFeatures.length; i++) { String name = regexFeatures[i].getName(); Attribute attr = new Attribute(name); outputFormat.insertAttributeAt(attr, outputFormat.numAttributes()); if (countRegexLengths) { name = name + "_L"; attr = new Attribute(name); outputFormat.insertAttributeAt(attr, outputFormat.numAttributes()); } } return outputFormat; }
From source file:etc.aloe.filters.SimpleStringToWordVector.java
License:Open Source License
/**
 * Validates that the configured string attribute exists in the input and
 * restricts the parent filter to operate on exactly that attribute.
 *
 * @param instanceInfo the incoming instance format
 * @return whatever the superclass reports for the input format
 * @throws Exception             per the Filter contract
 * @throws IllegalStateException if no attribute name was configured or the
 *                               named attribute is absent from the input
 */
@Override
public boolean setInputFormat(Instances instanceInfo) throws Exception {
    if (stringAttributeName == null) {
        throw new IllegalStateException("String attribute name was not set");
    }

    Attribute target = instanceInfo.attribute(stringAttributeName);
    if (target == null) {
        throw new IllegalStateException("Attribute " + stringAttributeName + " does not exist");
    }

    // Limit the inherited word-vector machinery to the single string column.
    int[] indices = { target.index() };
    setAttributeIndicesArray(indices);
    return super.setInputFormat(instanceInfo);
}
From source file:etc.aloe.filters.StringToDictionaryVector.java
License:Open Source License
@Override protected Instances determineOutputFormat(Instances inputFormat) throws Exception { if (getStringAttribute() == null) { throw new IllegalStateException("String attribute name not set"); }/*from w w w . ja va 2 s . c o m*/ stringAttributeIndex = inputFormat.attribute(getStringAttribute()).index(); inputFormat = getInputFormat(); //This generates m_selectedTerms and m_DocsCounts int[] docsCountsByTermIdx = determineDictionary(inputFormat); //Initialize the output format to be just like the input Instances outputFormat = new Instances(inputFormat, 0); //Set up the map from attr index to document frequency m_DocsCounts = new int[m_selectedTerms.size()]; //And add the new attributes for (int i = 0; i < m_selectedTerms.size(); i++) { int attrIdx = outputFormat.numAttributes(); int docsCount = docsCountsByTermIdx[i]; m_DocsCounts[i] = docsCount; outputFormat.insertAttributeAt(new Attribute(m_Prefix + m_selectedTerms.get(i)), attrIdx); } return outputFormat; }
From source file:etc.aloe.filters.StringToDictionaryVector.java
License:Open Source License
/**
 * Counts dictionary-term occurrences across all instances (per class value
 * when a class is set), prunes to the top {@code m_WordsToKeep} terms per
 * class subject to {@code m_minTermFreq}, and records the surviving terms.
 *
 * Side effects: populates m_selectedTerms, m_selectedTermsTrie,
 * m_selectedTermIndices, and m_NumInstances.
 *
 * @param instances the data to build the dictionary from
 * @return document frequencies indexed by selected-term index (parallel to
 *         m_selectedTerms, not to the original termList)
 * @throws IllegalStateException if stringAttributeIndex has not been set
 */
private int[] determineDictionary(Instances instances) {
    if (stringAttributeIndex < 0) {
        throw new IllegalStateException("String attribute index not valid");
    }

    // Operate on a per-class basis if a class attribute is set: "values" is
    // the number of class buckets counts are kept in (1 when disabled).
    int classInd = instances.classIndex();
    int values = 1;
    if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
        values = instances.attribute(classInd).numValues();
    }

    // Map each dictionary term to its position in termList.
    HashMap<String, Integer> termIndices = new HashMap<String, Integer>();
    for (int i = 0; i < termList.size(); i++) {
        termIndices.put(termList.get(i), i);
    }

    // Create the trie for matching terms.
    Trie termTrie = new Trie(termList);

    // Initialize one term-index -> Count map per class bucket.
    ArrayList<HashMap<Integer, Count>> termCounts = new ArrayList<HashMap<Integer, Count>>();
    for (int z = 0; z < values; z++) {
        termCounts.add(new HashMap<Integer, Count>());
    }

    // Go through all the instances and count term matches (e.g. emoticons).
    for (int i = 0; i < instances.numInstances(); i++) {
        Instance instance = instances.instance(i);
        int vInd = 0;
        if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
            vInd = (int) instance.classValue();
        }

        // Get the string attribute to examine.
        String stringValue = instance.stringValue(stringAttributeIndex);

        HashMap<Integer, Count> termCountsForClass = termCounts.get(vInd);

        // Count.count accumulates total matches; Count.docCount counts the
        // documents (instances) containing at least one match.
        HashMap<String, Integer> termMatches = termTrie.countNonoverlappingMatches(stringValue);
        for (Map.Entry<String, Integer> entry : termMatches.entrySet()) {
            String term = entry.getKey();
            int termIdx = termIndices.get(term);
            int matches = entry.getValue();
            Count count = termCountsForClass.get(termIdx);
            if (count == null) {
                count = new Count(0);
                termCountsForClass.put(termIdx, count);
            }
            if (matches > 0) {
                count.docCount += 1;
                count.count += matches;
            }
        }
    }

    // Figure out the minimum required word frequency per class bucket.
    int prune[] = new int[values];
    for (int z = 0; z < values; z++) {
        HashMap<Integer, Count> termCountsForClass = termCounts.get(z);
        int array[] = new int[termCountsForClass.size()];
        int pos = 0;
        for (Map.Entry<Integer, Count> entry : termCountsForClass.entrySet()) {
            array[pos] = entry.getValue().count;
            pos++;
        }
        // Sort the counts so the m_WordsToKeep-th largest can be read off.
        sortArray(array);
        if (array.length < m_WordsToKeep) {
            // If there aren't enough words, set the threshold to minFreq.
            prune[z] = m_minTermFreq;
        } else {
            // Otherwise set it to be at least minFreq.
            prune[z] = Math.max(m_minTermFreq, array[array.length - m_WordsToKeep]);
        }
    }

    // Select the surviving terms, eliminating duplicates that pass the
    // threshold in more than one class bucket.
    HashSet<String> selectedTerms = new HashSet<String>();
    for (int z = 0; z < values; z++) {
        HashMap<Integer, Count> termCountsForClass = termCounts.get(z);
        for (Map.Entry<Integer, Count> entry : termCountsForClass.entrySet()) {
            int termIndex = entry.getKey();
            String term = termList.get(termIndex);
            Count count = entry.getValue();
            if (count.count >= prune[z]) {
                selectedTerms.add(term);
            }
        }
    }

    // Save the selected terms as a list and rebuild the matching trie.
    this.m_selectedTerms = new ArrayList<String>(selectedTerms);
    this.m_selectedTermsTrie = new Trie(this.m_selectedTerms);
    this.m_NumInstances = instances.size();

    // Construct the selected-term -> selected-index map.
    this.m_selectedTermIndices = new HashMap<String, Integer>();
    for (int i = 0; i < m_selectedTerms.size(); i++) {
        m_selectedTermIndices.put(m_selectedTerms.get(i), i);
    }

    // Compute document frequencies, organized by selected term index (not
    // original term index), summing across all class buckets.
    int[] docsCounts = new int[m_selectedTerms.size()];
    for (int i = 0; i < m_selectedTerms.size(); i++) {
        String term = m_selectedTerms.get(i);
        int termIndex = termIndices.get(term);
        int docsCount = 0;
        for (int z = 0; z < values; z++) {
            HashMap<Integer, Count> termCountsForClass = termCounts.get(z);
            Count count = termCountsForClass.get(termIndex);
            if (count != null) {
                docsCount += count.docCount;
            }
        }
        docsCounts[i] = docsCount;
    }
    return docsCounts;
}
From source file:etc.aloe.filters.StringToDictionaryVector.java
License:Open Source License
public static void main(String[] args) { //Create a test dataset ArrayList<Attribute> attributes = new ArrayList<Attribute>(); attributes.add(new Attribute("message", (ArrayList<String>) null)); attributes.add(new Attribute("id")); {/*from www. jav a2s . c o m*/ ArrayList<String> classValues = new ArrayList<String>(); classValues.add("0"); classValues.add("1"); attributes.add(new Attribute("class", classValues)); } Instances instances = new Instances("test", attributes, 0); instances.setClassIndex(2); String[] messages = new String[] { "No emoticons here", "I have a smiley :)", "Two smileys and a frownie :) :) :(", "Several emoticons :( :-( :) :-) ;-) 8-) :-/ :-P" }; for (int i = 0; i < messages.length; i++) { Instance instance = new DenseInstance(instances.numAttributes()); instance.setValue(instances.attribute(0), messages[i]); instance.setValue(instances.attribute(1), i); instance.setValue(instances.attribute(2), Integer.toString(i % 2)); instances.add(instance); } System.out.println("Before filter:"); for (int i = 0; i < instances.size(); i++) { System.out.println(instances.instance(i).toString()); } try { String dictionaryName = "emoticons.txt"; StringToDictionaryVector filter = new StringToDictionaryVector(); List<String> termList = StringToDictionaryVector.readDictionaryFile(new File(dictionaryName)); filter.setTermList(termList); filter.setMinTermFreq(1); filter.setTFTransform(true); filter.setIDFTransform(true); filter.setNormalizeDocLength(new SelectedTag(FILTER_NORMALIZE_TEST_ONLY, TAGS_FILTER)); filter.setOutputWordCounts(true); filter.setStringAttribute("message"); filter.setInputFormat(instances); Instances trans1 = Filter.useFilter(instances, filter); Instances trans2 = Filter.useFilter(instances, filter); System.out.println("\nFirst application:"); System.out.println(trans1.toString()); System.out.println("\nSecond application:"); System.out.println(trans2.toString()); } catch (Exception e) { e.printStackTrace(); } }