List of usage examples for weka.core Instance stringValue
public String stringValue(Attribute att);
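Before the examples, a minimal self-contained sketch of what stringValue does: it maps the numeric code an Instance stores for a nominal or string attribute back to its label, and the index-based overload stringValue(int attIndex), used heavily in the examples below, performs the same lookup by attribute position. The sketch targets the Weka 3.7+ API (DenseInstance; several of the examples below use the older 3.6 "new Instance(...)" constructor instead), and the relation and attribute names are invented for illustration.

import java.util.ArrayList;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;

public class StringValueDemo {
    public static void main(String[] args) {
        // Hypothetical nominal attribute "color" and numeric attribute "weight"
        ArrayList<String> colors = new ArrayList<String>();
        colors.add("red");
        colors.add("blue");
        Attribute color = new Attribute("color", colors);
        Attribute weight = new Attribute("weight");

        ArrayList<Attribute> atts = new ArrayList<Attribute>();
        atts.add(color);
        atts.add(weight);
        Instances data = new Instances("demo", atts, 1);

        Instance inst = new DenseInstance(2);
        inst.setDataset(data);            // the index-based overload needs a dataset for the labels
        inst.setValue(color, "blue");
        inst.setValue(weight, 1.5);

        System.out.println(inst.stringValue(color)); // prints "blue"
        System.out.println(inst.stringValue(0));     // same lookup by attribute index
    }
}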
From source file:edu.brandeis.wisedb.scheduler.training.decisiontree.DTSearcher.java
License:Open Source License
@Override
public List<Action> schedule(Set<ModelQuery> toSched) {
    SingularMachineState start = new SingularMachineState(toSched, qtp, sla);
    List<Action> toR = new LinkedList<Action>();

    applyLoop: while (!start.isGoalState()) {
        log.fine("Current state: " + start);

        SortedMap<String, String> features = start.getFeatures();
        Instance toClassify = new Instance(attributes.length);
        toClassify.setDataset(wekaDataSet);

        for (Attribute a : attributes) {
            if (a.name().equals("action")) {
                //toClassify.setValue(a, "N");
                continue;
            }

            try {
                if (features.get(a.name()).equals("?")) {
                    toClassify.setMissing(a);
                    continue;
                }

                try {
                    double d = Double.valueOf(features.get(a.name()));
                    toClassify.setValue(a, d);
                } catch (NumberFormatException e) {
                    toClassify.setValue(a, features.get(a.name()));
                }
            } catch (IllegalArgumentException e) {
                e.printStackTrace();
                log.warning("Encountered previously unseen attribute value! Might need better training data... making random selection.");
                log.warning("Value for attribute " + a.name() + " was " + features.get(a.name()));
                Action rand = getPUAction(start);
                log.warning("Random action selected: " + rand);
                toR.add(rand);
                start.applyAction(rand);
                continue applyLoop;
            }
        }

        toClassify.setClassMissing();
        log.finer("Going to classify: " + toClassify);

        try {
            double d = tree.classifyInstance(toClassify);
            toClassify.setClassValue(d);
            String action = toClassify.stringValue(toClassify.classIndex());
            log.finer("Got action string: " + action);

            Action selected = null;
            for (Action a : start.getPossibleActions()) {
                if (actionMatches(a, action)) {
                    selected = a;
                    break;
                }
            }

            if (selected == null) {
                //log.warning("Could not find applicable action for string: " + action + " ... picking random action");
                Action a = getPUAction(start);
                start.applyAction(a);
                toR.add(a);
                continue;
            }

            log.fine("Selected action: " + selected);
            start.applyAction(selected);
            toR.add(selected);
        } catch (Exception e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
            return null;
        }
    }

    return toR;
}
From source file:edu.umbc.cs.maple.utils.WekaUtils.java
License:Open Source License
/**
 * Converts the instances in the given dataset to binary, setting the specified label to positive.
 * Note this method is destructive to data, directly modifying its contents.
 * @param data the multiclass dataset to be converted to binary.
 * @param positiveClassValue the class value to treat as positive.
 */
public static void convertMulticlassToBinary(Instances data, String positiveClassValue) {

    // ensure that data is nominal
    if (!data.classAttribute().isNominal())
        throw new IllegalArgumentException("Instances must have a nominal class.");

    // create the new class attribute
    FastVector newClasses = new FastVector(2);
    newClasses.addElement("Y");
    newClasses.addElement("N");
    Attribute newClassAttribute = new Attribute("class", newClasses);

    // alter the class attribute to be binary
    int newClassAttIdx = data.classIndex();
    data.insertAttributeAt(newClassAttribute, newClassAttIdx);
    int classAttIdx = data.classIndex();

    // set the instances classes to be binary, with the labels [Y,N] (indices 0 and 1 respectively)
    int numInstances = data.numInstances();
    for (int instIdx = 0; instIdx < numInstances; instIdx++) {
        Instance inst = data.instance(instIdx);
        if (inst.stringValue(classAttIdx).equals(positiveClassValue)) {
            inst.setValue(newClassAttIdx, 0); // set it to the first class, which will be Y
        } else {
            inst.setValue(newClassAttIdx, 1); // set it to the second class, which will be N
        }
    }

    // switch the class index to the new class and delete the old class
    data.setClassIndex(newClassAttIdx);
    data.deleteAttributeAt(classAttIdx);

    // alter the dataset name
    data.setRelationName(data.relationName() + "-" + positiveClassValue);
}
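A hypothetical call site for the helper above, showing the effect of the stringValue comparison on the class labels. The ARFF path and the positive label are placeholders, and the last attribute is assumed to be the nominal class.

import java.io.BufferedReader;
import java.io.FileReader;
import edu.umbc.cs.maple.utils.WekaUtils;
import weka.core.Instances;

public class BinaryConversionDemo {
    public static void main(String[] args) throws Exception {
        // Placeholder dataset and positive label
        Instances data = new Instances(new BufferedReader(new FileReader("iris.arff")));
        data.setClassIndex(data.numAttributes() - 1); // class attribute must be nominal
        WekaUtils.convertMulticlassToBinary(data, "Iris-setosa");
        // Every instance whose class was "Iris-setosa" is now "Y", all others "N",
        // and the relation is renamed to "<oldName>-Iris-setosa".
        System.out.println(data.classAttribute());
    }
}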
From source file:elh.eus.absa.CLI.java
License:Open Source License
/**
 * Main access to the train-atc functionalities. Train ATC using a double one vs. all classifier
 * (E and A) for E#A aspect categories.
 * @throws Exception
 */
public final void trainATC2(final InputStream inputStream) throws IOException {
    // load training parameters file
    String paramFile = parsedArguments.getString("params");
    String testFile = parsedArguments.getString("testset");
    String paramFile2 = parsedArguments.getString("params2");
    String corpusFormat = parsedArguments.getString("corpusFormat");
    //String validation = parsedArguments.getString("validation");
    String lang = parsedArguments.getString("language");
    //int foldNum = Integer.parseInt(parsedArguments.getString("foldNum"));
    //boolean printPreds = parsedArguments.getBoolean("printPreds");
    boolean nullSentenceOpinions = parsedArguments.getBoolean("nullSentences");
    boolean onlyTest = parsedArguments.getBoolean("testOnly");
    double threshold = 0.5;
    double threshold2 = 0.5;
    String modelsPath = "/home/inaki/elixa-atp/ovsaModels";

    CorpusReader reader = new CorpusReader(inputStream, corpusFormat, nullSentenceOpinions, lang);
    Features atcTrain = new Features(reader, paramFile, "3");
    Instances traindata = atcTrain.loadInstances(true, "atc");

    if (onlyTest) {
        if (FileUtilsElh.checkFile(testFile)) {
            System.err.println("read from test file");
            reader = new CorpusReader(new FileInputStream(new File(testFile)), corpusFormat,
                    nullSentenceOpinions, lang);
            atcTrain.setCorpus(reader);
            traindata = atcTrain.loadInstances(true, "atc");
        }
    }

    //setting class attribute (entCat|attCat|entAttCat|polarityCat)
    //HashMap<String, Integer> opInst = atcTrain.getOpinInst();
    //WekaWrapper classifyAtts;
    WekaWrapper onevsall;
    try {
        //classify.printMultilabelPredictions(classify.multiLabelPrediction());

        //onevsall
        Instances entdata = new Instances(traindata);
        entdata.deleteAttributeAt(entdata.attribute("attCat").index());
        entdata.deleteAttributeAt(entdata.attribute("entAttCat").index());
        entdata.setClassIndex(entdata.attribute("entCat").index());

        onevsall = new WekaWrapper(entdata, true);

        if (!onlyTest) {
            onevsall.trainOneVsAll(modelsPath, paramFile + "entCat");
            System.out.println("trainATC: one vs all models ready");
        }
        onevsall.setTestdata(entdata);
        HashMap<Integer, HashMap<String, Double>> ovsaRes =
                onevsall.predictOneVsAll(modelsPath, paramFile + "entCat");
        System.out.println("trainATC: one vs all predictions ready");

        HashMap<Integer, String> instOps = new HashMap<Integer, String>();
        for (String oId : atcTrain.getOpinInst().keySet()) {
            instOps.put(atcTrain.getOpinInst().get(oId), oId);
        }

        atcTrain = new Features(reader, paramFile2, "3");
        entdata = atcTrain.loadInstances(true, "attTrain2_data");
        entdata.deleteAttributeAt(entdata.attribute("entAttCat").index());
        //entdata.setClassIndex(entdata.attribute("entCat").index());

        Attribute insAtt = entdata.attribute("instanceId");
        double maxInstId = entdata.kthSmallestValue(insAtt, entdata.numDistinctValues(insAtt) - 1);
        System.err.println("last instance has index: " + maxInstId);

        for (int ins = 0; ins < entdata.numInstances(); ins++) {
            System.err.println("ins" + ins);
            int i = (int) entdata.instance(ins).value(insAtt);
            Instance currentInst = entdata.instance(ins);
            //System.err.println("instance "+i+" oid "+kk.get(i+1)+"kk contains key i?"+kk.containsKey(i));
            String sId = reader.getOpinion(instOps.get(i)).getsId();
            String oId = instOps.get(i);
            reader.removeSentenceOpinions(sId);
            int oSubId = 0;
            for (String cl : ovsaRes.get(i).keySet()) {
                //System.err.println("instance: "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));
                if (ovsaRes.get(i).get(cl) > threshold) {
                    //System.err.println("one got through ! instance "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));
                    // for the first one update the instances
                    if (oSubId >= 1) {
                        Instance newIns = new SparseInstance(currentInst);
                        newIns.setDataset(entdata);
                        entdata.add(newIns);
                        newIns.setValue(insAtt, maxInstId + oSubId);
                        newIns.setClassValue(cl);
                        instOps.put((int) maxInstId + oSubId, oId);
                    }
                    // if there are more create new instances
                    else {
                        currentInst.setClassValue(cl);
                        //create and add opinion to the structure
                        // trgt, offsetFrom, offsetTo, polarity, cat, sId);
                        //Opinion op = new Opinion(instOps.get(i)+"_"+oSubId, "", 0, 0, "", cl, sId);
                        //reader.addOpinion(op);
                    }
                    oSubId++;
                }
            } //finished updating instances data
        }

        entdata.setClass(entdata.attribute("attCat"));
        onevsall = new WekaWrapper(entdata, true);

        /**
         * Second classifier
         */
        if (!onlyTest) {
            onevsall.trainOneVsAll(modelsPath, paramFile + "attCat");
            System.out.println("trainATC: one vs all attcat models ready");
        }

        ovsaRes = onevsall.predictOneVsAll(modelsPath, paramFile + "entAttCat");

        insAtt = entdata.attribute("instanceId");
        maxInstId = entdata.kthSmallestValue(insAtt, insAtt.numValues());
        System.err.println("last instance has index: " + maxInstId);

        for (int ins = 0; ins < entdata.numInstances(); ins++) {
            System.err.println("ins: " + ins);
            int i = (int) entdata.instance(ins).value(insAtt);
            Instance currentInst = entdata.instance(ins);
            //System.err.println("instance "+i+" oid "+kk.get(i+1)+"kk contains key i?"+kk.containsKey(i));
            String sId = reader.getOpinion(instOps.get(i)).getsId();
            String oId = instOps.get(i);
            reader.removeSentenceOpinions(sId);
            int oSubId = 0;
            for (String cl : ovsaRes.get(i).keySet()) {
                //System.err.println("instance: "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));
                if (ovsaRes.get(i).get(cl) > threshold2) {
                    ///System.err.println("instance: "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));
                    if (ovsaRes.get(i).get(cl) > threshold) {
                        //System.err.println("one got through ! instance "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));
                        // for the first one update the instances
                        if (oSubId >= 1) {
                            String label = currentInst.stringValue(entdata.attribute("entAtt")) + "#" + cl;
                            //create and add opinion to the structure
                            // trgt, offsetFrom, offsetTo, polarity, cat, sId);
                            Opinion op = new Opinion(oId + "_" + oSubId, "", 0, 0, "", label, sId);
                            reader.addOpinion(op);
                        }
                        // if there are more create new instances
                        else {
                            String label = currentInst.stringValue(entdata.attribute("entAtt")) + "#" + cl;
                            //create and add opinion to the structure
                            // trgt, offsetFrom, offsetTo, polarity, cat, sId);
                            reader.removeOpinion(oId);
                            Opinion op = new Opinion(oId + "_" + oSubId, "", 0, 0, "", label, sId);
                            reader.addOpinion(op);
                        }
                        oSubId++;
                    }
                }
            } //finished updating instances data
        }

        reader.print2Semeval2015format(paramFile + "entAttCat.xml");
    } catch (Exception e) {
        e.printStackTrace();
    }
    //traindata.setClass(traindata.attribute("entAttCat"));
    System.err.println("DONE CLI train-atc2 (oneVsAll)");
}
From source file:elh.eus.absa.WekaWrapper.java
License:Open Source License
/**
 * Simple function to print the results of a multilabel prediction.
 *
 * @param pred hashmap containing a set of instances and their corresponding
 *        multilabel prediction, as computed by the multiLabelPrediction() function in this class.
 */
public void printMultilabelPredictions(HashMap<Instance, double[]> pred) {
    for (Instance i : pred.keySet()) {
        double[] kk = pred.get(i);
        int c = 0;
        System.out.print("instance " + (int) i.value(0) + " (" + i.classValue() + "|"
                + i.stringValue(i.classIndex()) + ") --> ");
        for (double d : kk) {
            System.out.print("cl_" + c + "=" + d + "; ");
            c++;
        }
        System.out.print("\n");
    }
}
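A sketch of how this printer might be driven. The WekaWrapper(Instances, boolean) constructor and the zero-argument multiLabelPrediction() call are taken from the CLI example above and from the javadoc, not verified against the full elh.eus.absa API, so treat both signatures as assumptions; testInstances stands for any already-loaded, class-labelled dataset.

import java.util.HashMap;
import elh.eus.absa.WekaWrapper;
import weka.core.Instance;
import weka.core.Instances;

public class MultilabelReportDemo {
    // testInstances: an already-loaded Instances object with its class attribute set.
    static void report(Instances testInstances) throws Exception {
        WekaWrapper wrapper = new WekaWrapper(testInstances, true);
        HashMap<Instance, double[]> predictions = wrapper.multiLabelPrediction();
        wrapper.printMultilabelPredictions(predictions);
        // Each printed line has the form:
        // instance 12 (1.0|positive) --> cl_0=0.1; cl_1=0.9;
    }
}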
From source file:elh.eus.absa.WekaWrapper.java
License:Open Source License
/**
 * Train one vs all models over the given training data.
 *
 * @param modelpath directory to store each model for the one vs. all method
 * @param prefix prefix the models should have (each model will have the name of its class appended)
 * @throws Exception
 */
public void trainOneVsAll(String modelpath, String prefix) throws Exception {
    Instances orig = new Instances(traindata);
    Enumeration<Object> classValues = traindata.classAttribute().enumerateValues();
    String classAtt = traindata.classAttribute().name();
    while (classValues.hasMoreElements()) {
        String v = (String) classValues.nextElement();
        System.err.println("trainer onevsall for class " + v + " classifier");
        //needed because of weka's sparse data format problems THIS IS TROUBLE! ...
        if (v.equalsIgnoreCase("dummy")) {
            continue;
        }
        // copy instances and set the same class value
        Instances ovsa = new Instances(orig);
        //create a new class attribute
        // Declare the class attribute along with its values
        ArrayList<String> classVal = new ArrayList<String>();
        classVal.add("dummy"); //needed because of weka's sparse data format problems...
        classVal.add(v);
        classVal.add("UNKNOWN");
        ovsa.insertAttributeAt(new Attribute(classAtt + "2", classVal), ovsa.numAttributes());

        //change all instance labels that do not have the current class value to "UNKNOWN"
        for (int i = 0; i < ovsa.numInstances(); i++) {
            Instance inst = ovsa.instance(i);
            String instClass = inst.stringValue(ovsa.attribute(classAtt).index());
            if (instClass.equalsIgnoreCase(v)) {
                inst.setValue(ovsa.attribute(classAtt + "2").index(), v);
            } else {
                inst.setValue(ovsa.attribute(classAtt + "2").index(), "UNKNOWN");
            }
        }

        //delete the old class attribute and set the new one
        ovsa.setClassIndex(ovsa.attribute(classAtt + "2").index());
        ovsa.deleteAttributeAt(ovsa.attribute(classAtt).index());
        ovsa.renameAttribute(ovsa.attribute(classAtt + "2").index(), classAtt);
        ovsa.setClassIndex(ovsa.attribute(classAtt).index());

        //build the classifier, crossvalidate and store the model
        setTraindata(ovsa);
        saveModel(modelpath + File.separator + prefix + "_" + v + ".model");
        setTestdata(ovsa);
        testModel(modelpath + File.separator + prefix + "_" + v + ".model");

        System.err.println("trained onevsall " + v + " classifier");
    }
    setTraindata(orig);
}
From source file:es.upm.dit.gsi.barmas.dataset.utils.DatasetSplitter.java
License:Open Source License
/**
 * @param folds
 * @param minAgents
 * @param maxAgents
 * @param originalDatasetPath
 * @param outputDir
 * @param scenario
 * @param logger
 */
public void splitDataset(int folds, int minAgents, int maxAgents, String originalDatasetPath, String outputDir,
        String scenario, Logger logger) {

    int ratioint = (int) ((1 / (double) folds) * 100);
    double roundedratio = ((double) ratioint) / 100;

    // Look for essentials
    List<String[]> essentials = this.getEssentials(originalDatasetPath, logger);

    for (int fold = 0; fold < folds; fold++) {
        String outputDirWithRatio = outputDir + "/" + roundedratio + "testRatio/iteration-" + fold;
        File dir = new File(outputDirWithRatio);
        if (!dir.exists() || !dir.isDirectory()) {
            dir.mkdirs();
        }

        logger.finer("--> splitDataset()");
        logger.fine("Creating experiment.info...");

        try {
            Instances originalData = this.getDataFromCSV(originalDatasetPath);
            originalData.randomize(new Random());
            originalData.stratify(folds);

            // TestDataSet
            Instances testData = originalData.testCV(folds, fold);
            CSVSaver saver = new CSVSaver();
            ArffSaver arffsaver = new ArffSaver();
            File file = new File(outputDirWithRatio + File.separator + "test-dataset.csv");
            if (!file.exists()) {
                saver.resetOptions();
                saver.setInstances(testData);
                saver.setFile(file);
                saver.writeBatch();
            }

            file = new File(outputDirWithRatio + File.separator + "test-dataset.arff");
            if (!file.exists()) {
                arffsaver.resetOptions();
                arffsaver.setInstances(testData);
                arffsaver.setFile(file);
                arffsaver.writeBatch();
            }

            // BayesCentralDataset
            Instances trainData = originalData.trainCV(folds, fold);
            file = new File(outputDirWithRatio + File.separator + "bayes-central-dataset.csv");
            if (!file.exists()) {
                saver.resetOptions();
                saver.setInstances(trainData);
                saver.setFile(file);
                saver.writeBatch();
                this.copyFileUsingApacheCommonsIO(file,
                        new File(outputDirWithRatio + File.separator + "bayes-central-dataset-noEssentials.csv"),
                        logger);
                CsvWriter w = new CsvWriter(new FileWriter(file, true), ',');
                for (String[] essential : essentials) {
                    w.writeRecord(essential);
                }
                w.close();
            }

            file = new File(outputDirWithRatio + File.separator + "bayes-central-dataset.arff");
            if (!file.exists()) {
                arffsaver.resetOptions();
                arffsaver.setInstances(trainData);
                arffsaver.setFile(file);
                arffsaver.writeBatch();
                this.copyFileUsingApacheCommonsIO(file,
                        new File(outputDirWithRatio + File.separator + "bayes-central-dataset-noEssentials.arff"),
                        logger);
                CsvWriter w = new CsvWriter(new FileWriter(file, true), ',');
                for (String[] essential : essentials) {
                    w.writeRecord(essential);
                }
                w.close();
            }

            // Agent datasets
            CsvReader csvreader = new CsvReader(new FileReader(new File(originalDatasetPath)));
            csvreader.readHeaders();
            String[] headers = csvreader.getHeaders();
            csvreader.close();

            for (int agents = minAgents; agents <= maxAgents; agents++) {
                this.createExperimentInfoFile(folds, agents, originalDatasetPath, outputDirWithRatio, scenario,
                        logger);
                HashMap<String, CsvWriter> writers = new HashMap<String, CsvWriter>();
                String agentsDatasetsDir = outputDirWithRatio + File.separator + agents + "agents";
                HashMap<String, CsvWriter> arffWriters = new HashMap<String, CsvWriter>();
                File f = new File(agentsDatasetsDir);
                if (!f.isDirectory()) {
                    f.mkdirs();
                }
                Instances copy = new Instances(trainData);
                copy.delete();
                for (int i = 0; i < agents; i++) {
                    String fileName = agentsDatasetsDir + File.separator + "agent-" + i + "-dataset.csv";
                    file = new File(fileName);
                    if (!file.exists()) {
                        CsvWriter writer = new CsvWriter(new FileWriter(fileName), ',');
                        writer.writeRecord(headers);
                        writers.put("AGENT" + i, writer);
                    }
                    fileName = agentsDatasetsDir + File.separator + "agent-" + i + "-dataset.arff";
                    file = new File(fileName);
                    if (!file.exists()) {
                        arffsaver.resetOptions();
                        arffsaver.setInstances(copy);
                        arffsaver.setFile(new File(fileName));
                        arffsaver.writeBatch();
                        CsvWriter arffwriter = new CsvWriter(new FileWriter(fileName, true), ',');
                        arffWriters.put("AGENT" + i, arffwriter);
                    }
                    logger.fine("AGENT" + i + " dataset created in csv and arff formats.");
                }

                // Append essentials to all
                for (String[] essential : essentials) {
                    for (CsvWriter wr : writers.values()) {
                        wr.writeRecord(essential);
                    }
                    for (CsvWriter arffwr : arffWriters.values()) {
                        arffwr.writeRecord(essential);
                    }
                }

                int agentCounter = 0;
                for (int j = 0; j < trainData.numInstances(); j++) {
                    Instance instance = trainData.instance(j);
                    CsvWriter writer = writers.get("AGENT" + agentCounter);
                    CsvWriter arffwriter = arffWriters.get("AGENT" + agentCounter);
                    String[] row = new String[instance.numAttributes()];
                    for (int a = 0; a < instance.numAttributes(); a++) {
                        row[a] = instance.stringValue(a);
                    }
                    if (writer != null) {
                        writer.writeRecord(row);
                    }
                    if (arffwriter != null) {
                        arffwriter.writeRecord(row);
                    }
                    agentCounter++;
                    if (agentCounter == agents) {
                        agentCounter = 0;
                    }
                }

                for (CsvWriter wr : writers.values()) {
                    wr.close();
                }
                for (CsvWriter arffwr : arffWriters.values()) {
                    arffwr.close();
                }
            }
        } catch (Exception e) {
            logger.severe("Exception while splitting dataset. ->");
            logger.severe(e.getMessage());
            System.exit(1);
        }
        logger.finest("Dataset for fold " + fold + " created.");
    }
    logger.finer("<-- splitDataset()");
}
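A hypothetical invocation of the splitter above, assuming a no-argument DatasetSplitter constructor; the fold count, agent range, paths and scenario name are placeholders.

import java.util.logging.Logger;
import es.upm.dit.gsi.barmas.dataset.utils.DatasetSplitter;

public class SplitDemo {
    public static void main(String[] args) {
        // Placeholder arguments: 10 folds, agent datasets for 2..5 agents.
        DatasetSplitter splitter = new DatasetSplitter();
        splitter.splitDataset(10, 2, 5, "data/original-dataset.csv", "output/splits",
                "demo-scenario", Logger.getLogger("DatasetSplitter"));
        // Produces, per fold: test-dataset.{csv,arff}, bayes-central-dataset.{csv,arff},
        // and one agent-<i>-dataset.{csv,arff} per agent, with "essential" rows appended to all.
    }
}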
From source file:etc.aloe.filters.AbstractRegexFilter.java
License:Open Source License
@Override
protected Instance process(Instance instance) throws Exception {
    if (stringAttributeIndex < 0) {
        throw new IllegalStateException("String attribute not set");
    }

    String stringValue = instance.stringValue(stringAttributeIndex);
    NamedRegex[] regexFeatures = getRegexFeatures();

    int numOldValues = instance.numAttributes();
    int numNewFeatures = regexFeatures.length;
    if (countRegexLengths) {
        numNewFeatures = regexFeatures.length * 2;
    }
    double[] newValues = new double[numOldValues + numNewFeatures];

    // Copy all attributes from input to output
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (getInputFormat().attribute(i).type() != Attribute.STRING) {
            // Add simple nominal and numeric attributes directly
            if (instance.value(i) != 0.0) {
                newValues[i] = instance.value(i);
            }
        } else {
            if (instance.isMissing(i)) {
                newValues[i] = Utils.missingValue();
            } else {
                // If this is a string attribute, we have to first add
                // this value to the range of possible values, then add
                // its new internal index.
                if (outputFormatPeek().attribute(i).numValues() == 0) {
                    // Note that the first string value in a
                    // SparseInstance doesn't get printed.
                    outputFormatPeek().attribute(i).addStringValue("Hack to defeat SparseInstance bug");
                }
                int newIndex = outputFormatPeek().attribute(i).addStringValue(instance.stringValue(i));
                newValues[i] = newIndex;
            }
        }
    }

    for (int i = 0; i < regexFeatures.length; i++) {
        Pattern pattern = regexFeatures[i].getPattern();
        Matcher matches = pattern.matcher(stringValue);
        int count = 0;
        int maxLength = 0;
        while (matches.find()) {
            count++;
            int len = matches.group().length();
            if (len > maxLength) {
                maxLength = len;
            }
        }
        int index = numOldValues + i;
        if (countRegexLengths) {
            index = numOldValues + 2 * i;
        }
        newValues[index] = count;
        if (countRegexLengths) {
            newValues[numOldValues + 2 * i + 1] = maxLength;
        }
    }

    Instance result = new SparseInstance(instance.weight(), newValues);
    return result;
}
From source file:etc.aloe.filters.StringToDictionaryVector.java
License:Open Source License
private int[] determineDictionary(Instances instances) {
    if (stringAttributeIndex < 0) {
        throw new IllegalStateException("String attribute index not valid");
    }

    // Operate on a per-class basis if class attribute is set
    int classInd = instances.classIndex();
    int values = 1;
    if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
        values = instances.attribute(classInd).numValues();
    }

    HashMap<String, Integer> termIndices = new HashMap<String, Integer>();
    for (int i = 0; i < termList.size(); i++) {
        termIndices.put(termList.get(i), i);
    }

    //Create the trie for matching terms
    Trie termTrie = new Trie(termList);

    //Initialize the dictionary/count map
    ArrayList<HashMap<Integer, Count>> termCounts = new ArrayList<HashMap<Integer, Count>>();
    for (int z = 0; z < values; z++) {
        termCounts.add(new HashMap<Integer, Count>());
    }

    //Go through all the instances and count the emoticons
    for (int i = 0; i < instances.numInstances(); i++) {
        Instance instance = instances.instance(i);
        int vInd = 0;
        if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
            vInd = (int) instance.classValue();
        }

        //Get the string attribute to examine
        String stringValue = instance.stringValue(stringAttributeIndex);

        HashMap<Integer, Count> termCountsForClass = termCounts.get(vInd);

        HashMap<String, Integer> termMatches = termTrie.countNonoverlappingMatches(stringValue);
        for (Map.Entry<String, Integer> entry : termMatches.entrySet()) {
            String term = entry.getKey();
            int termIdx = termIndices.get(term);
            int matches = entry.getValue();
            Count count = termCountsForClass.get(termIdx);
            if (count == null) {
                count = new Count(0);
                termCountsForClass.put(termIdx, count);
            }
            if (matches > 0) {
                count.docCount += 1;
                count.count += matches;
            }
        }
    }

    // Figure out the minimum required word frequency
    int prune[] = new int[values];
    for (int z = 0; z < values; z++) {
        HashMap<Integer, Count> termCountsForClass = termCounts.get(z);
        int array[] = new int[termCountsForClass.size()];
        int pos = 0;
        for (Map.Entry<Integer, Count> entry : termCountsForClass.entrySet()) {
            array[pos] = entry.getValue().count;
            pos++;
        }

        // sort the array
        sortArray(array);

        if (array.length < m_WordsToKeep) {
            // if there aren't enough words, set the threshold to minFreq
            prune[z] = m_minTermFreq;
        } else {
            // otherwise set it to be at least minFreq
            prune[z] = Math.max(m_minTermFreq, array[array.length - m_WordsToKeep]);
        }
    }

    // Add the word vector attributes (eliminating duplicates
    // that occur in multiple classes)
    HashSet<String> selectedTerms = new HashSet<String>();
    for (int z = 0; z < values; z++) {
        HashMap<Integer, Count> termCountsForClass = termCounts.get(z);
        for (Map.Entry<Integer, Count> entry : termCountsForClass.entrySet()) {
            int termIndex = entry.getKey();
            String term = termList.get(termIndex);
            Count count = entry.getValue();
            if (count.count >= prune[z]) {
                selectedTerms.add(term);
            }
        }
    }

    //Save the selected terms as a list
    this.m_selectedTerms = new ArrayList<String>(selectedTerms);
    this.m_selectedTermsTrie = new Trie(this.m_selectedTerms);
    this.m_NumInstances = instances.size();

    //Construct the selected terms to index map
    this.m_selectedTermIndices = new HashMap<String, Integer>();
    for (int i = 0; i < m_selectedTerms.size(); i++) {
        m_selectedTermIndices.put(m_selectedTerms.get(i), i);
    }

    // Compute document frequencies, organized by selected term index (not original term index)
    int[] docsCounts = new int[m_selectedTerms.size()];
    for (int i = 0; i < m_selectedTerms.size(); i++) {
        String term = m_selectedTerms.get(i);
        int termIndex = termIndices.get(term);
        int docsCount = 0;
        for (int z = 0; z < values; z++) {
            HashMap<Integer, Count> termCountsForClass = termCounts.get(z);
            Count count = termCountsForClass.get(termIndex);
            if (count != null) {
                docsCount += count.docCount;
            }
        }
        docsCounts[i] = docsCount;
    }

    return docsCounts;
}
From source file:etc.aloe.filters.StringToDictionaryVector.java
License:Open Source License
/**
 * Converts the instance w/o normalization.
 *
 * @param instance the instance to convert
 * @param converted the list of instances to which the converted instance is added
 * @return the document length
 */
private double convertInstancewoDocNorm(Instance instance, ArrayList<Instance> converted) {
    if (stringAttributeIndex < 0) {
        throw new IllegalStateException("String attribute index not valid");
    }

    int numOldValues = instance.numAttributes();
    double[] newValues = new double[numOldValues + m_selectedTerms.size()];

    // Copy all attributes from input to output
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (getInputFormat().attribute(i).type() != Attribute.STRING) {
            // Add simple nominal and numeric attributes directly
            if (instance.value(i) != 0.0) {
                newValues[i] = instance.value(i);
            }
        } else {
            if (instance.isMissing(i)) {
                newValues[i] = Utils.missingValue();
            } else {
                // If this is a string attribute, we have to first add
                // this value to the range of possible values, then add
                // its new internal index.
                if (outputFormatPeek().attribute(i).numValues() == 0) {
                    // Note that the first string value in a
                    // SparseInstance doesn't get printed.
                    outputFormatPeek().attribute(i).addStringValue("Hack to defeat SparseInstance bug");
                }
                int newIndex = outputFormatPeek().attribute(i).addStringValue(instance.stringValue(i));
                newValues[i] = newIndex;
            }
        }
    }

    String stringValue = instance.stringValue(stringAttributeIndex);
    double docLength = 0;

    HashMap<String, Integer> termMatches = m_selectedTermsTrie.countNonoverlappingMatches(stringValue);
    for (Map.Entry<String, Integer> entry : termMatches.entrySet()) {
        String term = entry.getKey();
        int termIdx = m_selectedTermIndices.get(term);
        double matches = entry.getValue();
        if (!m_OutputCounts && matches > 0) {
            matches = 1;
        }

        if (matches > 0) {
            if (m_TFTransform == true) {
                matches = Math.log(matches + 1);
            }

            if (m_IDFTransform == true) {
                matches = matches * Math.log(m_NumInstances / (double) m_DocsCounts[termIdx]);
            }

            newValues[numOldValues + termIdx] = matches;
            docLength += matches * matches;
        }
    }

    Instance result = new SparseInstance(instance.weight(), newValues);
    converted.add(result);

    return Math.sqrt(docLength);
}
From source file:etc.aloe.filters.WordFeaturesExtractor.java
License:Open Source License
protected List<String> tokenizeDocument(Instance instance) {
    List<String> words = new ArrayList<String>();

    // Use tokenizer
    tokenizer.tokenize(instance.stringValue(selectedAttributeIndex));

    // Iterate through tokens, perform stemming, and remove stopwords (if required)
    while (tokenizer.hasMoreElements()) {
        String word = ((String) tokenizer.nextElement()).intern();

        if (this.lowerCaseTokens == true) {
            word = word.toLowerCase();
        }

        word = stemmer.stem(word);

        if (stopwords.contains(word.toLowerCase())) {
            continue;
        }
        if (word.length() == 0) {
            continue;
        }

        words.add(word);
    }
    return words;
}