List of usage examples for weka.core Instances attribute
public Attribute attribute(String name)
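The method looks up an attribute by its name and returns null when no attribute matches, so callers should null-check the result before using it. Before the project examples below, a minimal self-contained sketch (the file name "weather.arff" and the attribute name "temperature" are placeholders, not taken from the examples):

import weka.core.Attribute;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class AttributeLookupDemo {
    public static void main(String[] args) throws Exception {
        // "weather.arff" and "temperature" are illustrative placeholders
        Instances data = DataSource.read("weather.arff");
        Attribute temp = data.attribute("temperature");
        if (temp == null) {
            System.err.println("No attribute named 'temperature'");
            return;
        }
        // index() is zero-based; many Weka filters expect one-based index strings
        System.out.println("temperature is attribute #" + (temp.index() + 1));
    }
}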
From source file:mao.datamining.DataSetPair.java
/**
 * Drop from the test dataset (if it exists) every column that was not
 * kept in the training set.
 */
private void processTestDataSet() {
    if (!new File(testSourceFileName).exists())
        return;
    try {
        Instances orangeTestDataSet = ConverterUtils.DataSource.read(testSourceFileName);
        Remove remove = new Remove();
        StringBuilder indexBuffer = new StringBuilder();
        for (String attrName : finalTrainAttrList) {
            // look up each retained attribute by name; index() is zero-based,
            // while Remove expects one-based indices
            int attrIndex = orangeTestDataSet.attribute(attrName).index();
            indexBuffer.append(attrIndex + 1).append(",");
        }
        // trim the trailing comma to keep the index list clean
        if (indexBuffer.length() > 0)
            indexBuffer.setLength(indexBuffer.length() - 1);
        Main.logging("Attribute Indices: \n" + indexBuffer.toString());
        remove.setAttributeIndices(indexBuffer.toString());
        remove.setInvertSelection(true); // keep the listed attributes, drop the rest
        remove.setInputFormat(orangeTestDataSet);
        Instances testNewDataSet = Filter.useFilter(orangeTestDataSet, remove);
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(this.testFileName)))) {
            writer.write(testNewDataSet.toString());
        }
        // set the final test dataset
        finalTestDataSet = testNewDataSet;
        finalTestDataSet.setClassIndex(finalTestDataSet.numAttributes() - 1);
        Main.logging("test dataset class attr: " + finalTestDataSet.classAttribute().toString());
    } catch (Exception e) {
        Main.logging(null, e);
    }
}
From source file:mao.datamining.RemoveUselessColumnsByMissingValues.java
License:Open Source License
/**
 * Signify that this batch of input to the filter is finished.
 *
 * @return true if there are instances pending output
 * @throws Exception if no input format defined
 */
public boolean batchFinished() throws Exception {
    if (getInputFormat() == null) {
        throw new IllegalStateException("No input instance format defined");
    }
    if (m_removeFilter == null) {
        // establish attributes to remove from first batch
        Instances toFilter = getInputFormat();
        int[] attsToDelete = new int[toFilter.numAttributes()];
        int numToDelete = 0;
        for (int i = 0; i < toFilter.numAttributes(); i++) {
            if (i == toFilter.classIndex())
                continue; // skip class
            AttributeStats stats = toFilter.attributeStats(i);
            // remove attributes with a high ratio of missing values
            if ((stats.missingCount * 100) / stats.totalCount > m_maxMissingPercentage) {
                attsToDelete[numToDelete++] = i;
                continue; // avoid listing the same index twice below
            }
            // remove columns flagged for deletion by manual inspection
            if (this.column2DeleteSet.contains(toFilter.attribute(i).name())) {
                attsToDelete[numToDelete++] = i;
            }
        }
        int[] finalAttsToDelete = new int[numToDelete];
        System.arraycopy(attsToDelete, 0, finalAttsToDelete, 0, numToDelete);
        m_removeFilter = new Remove();
        m_removeFilter.setAttributeIndicesArray(finalAttsToDelete);
        m_removeFilter.setInvertSelection(false);
        m_removeFilter.setInputFormat(toFilter);
        for (int i = 0; i < toFilter.numInstances(); i++) {
            m_removeFilter.input(toFilter.instance(i));
        }
        m_removeFilter.batchFinished();
        Instance processed;
        Instances outputDataset = m_removeFilter.getOutputFormat();
        // restore old relation name to hide attribute filter stamp
        outputDataset.setRelationName(toFilter.relationName());
        setOutputFormat(outputDataset);
        while ((processed = m_removeFilter.output()) != null) {
            processed.setDataset(outputDataset);
            push(processed);
        }
    }
    flushInput();
    m_NewBatch = true;
    return (numPendingOutput() != 0);
}
From source file:mao.datamining.Util.java
/**
 * Convert the listed attributes from numeric to nominal type, equivalent to
 * weka.filters.unsupervised.attribute.NumericToNominal -R <indices>.
 * @param newData the dataset to transform
 * @param columns2Nominal names of the attributes to convert
 * @return the transformed dataset
 */
public static Instances transformNum2Nominal(Instances newData, String[] columns2Nominal) {
    StringBuilder indexArrayStr = new StringBuilder();
    for (int i = 0; i < columns2Nominal.length; i++) {
        String attrName = columns2Nominal[i];
        // attribute(name) returns null if the attribute is absent, so check first
        Attribute attr = newData.attribute(attrName);
        if (attr != null) {
            indexArrayStr.append(attr.index() + 1).append(",");
        }
    }
    // trim the trailing comma to keep the index list clean
    if (indexArrayStr.length() > 0)
        indexArrayStr.setLength(indexArrayStr.length() - 1);
    try {
        NumericToNominal transform = new NumericToNominal();
        transform.setInputFormat(newData);
        transform.setAttributeIndices(indexArrayStr.toString());
        newData = Filter.useFilter(newData, transform);
    } catch (Exception e) {
        Main.logging(null, e);
    }
    return newData;
}
From source file:marytts.tools.voiceimport.PauseDurationTrainer.java
License:Open Source License
private Instance createInstance(Instances data, FeatureDefinition fd, FeatureVector fv) {
    // relevant features + one target
    Instance currInst = new DenseInstance(data.numAttributes());
    currInst.setDataset(data);
    // read only relevant features
    for (String attName : this.featureNames) {
        int featNr = fd.getFeatureIndex(attName);
        String value = fv.getFeatureAsString(featNr, fd);
        currInst.setValue(data.attribute(attName), value);
    }
    return currInst;
}
From source file:maui.main.MauiModelBuilder.java
License:Open Source License
/**
 * Builds the model from the training data.
 */
public void buildModel(HashSet<String> fileNames, VocabularyStore store) throws Exception {
    // Check whether there is actually any data
    if (fileNames.size() == 0) {
        throw new Exception("Couldn't find any data in " + inputDirectoryName);
    }
    System.err.println("-- Building the model... ");
    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("filename", (FastVector) null));
    atts.addElement(new Attribute("document", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);
    // Build model
    mauiFilter = new MauiFilter();
    mauiFilter.setDebug(getDebug());
    mauiFilter.setMaxPhraseLength(getMaxPhraseLength());
    mauiFilter.setMinPhraseLength(getMinPhraseLength());
    mauiFilter.setMinNumOccur(getMinNumOccur());
    mauiFilter.setStemmer(getStemmer());
    mauiFilter.setDocumentLanguage(getDocumentLanguage());
    mauiFilter.setVocabularyName(getVocabularyName());
    mauiFilter.setVocabularyFormat(getVocabularyFormat());
    mauiFilter.setStopwords(getStopwords());
    if (wikipedia != null) {
        mauiFilter.setWikipedia(wikipedia);
    } else if (wikipediaServer.equals("localhost") && wikipediaDatabase.equals("database")) {
        mauiFilter.setWikipedia(wikipedia);
    } else {
        mauiFilter.setWikipedia(wikipediaServer, wikipediaDatabase, cacheWikipediaData,
                wikipediaDataDirectory);
    }
    if (classifier != null) {
        mauiFilter.setClassifier(classifier);
    }
    mauiFilter.setInputFormat(data);
    // set feature configuration
    mauiFilter.setBasicFeatures(useBasicFeatures);
    mauiFilter.setKeyphrasenessFeature(useKeyphrasenessFeature);
    mauiFilter.setFrequencyFeatures(useFrequencyFeatures);
    mauiFilter.setPositionsFeatures(usePositionsFeatures);
    mauiFilter.setLengthFeature(useLengthFeature);
    mauiFilter.setThesaurusFeatures(useNodeDegreeFeature);
    mauiFilter.setBasicWikipediaFeatures(useBasicWikipediaFeatures);
    mauiFilter.setAllWikipediaFeatures(useAllWikipediaFeatures);
    mauiFilter.setClassifier(classifier);
    mauiFilter.setContextSize(contextSize);
    mauiFilter.setMinKeyphraseness(minKeyphraseness);
    mauiFilter.setMinSenseProbability(minSenseProbability);
    if (!vocabularyName.equals("none") && !vocabularyName.equals("wikipedia")) {
        mauiFilter.loadThesaurus(getStemmer(), getStopwords(), store);
    }
    System.err.println("-- Reading the input documents... ");
    for (String fileName : fileNames) {
        double[] newInst = new double[3];
        // string attributes store values by index; addStringValue returns that index
        newInst[0] = (double) data.attribute(0).addStringValue(fileName);
        File documentTextFile = new File(inputDirectoryName + "/" + fileName + ".txt");
        File documentTopicsFile = new File(inputDirectoryName + "/" + fileName + ".key");
        try {
            InputStreamReader is;
            if (!documentEncoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(documentTextFile), documentEncoding);
            } else {
                is = new InputStreamReader(new FileInputStream(documentTextFile));
            }
            // Read the file content
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            is.close();
            // Add the text of the document to the instance
            newInst[1] = (double) data.attribute(1).addStringValue(txtStr.toString());
        } catch (Exception e) {
            System.err.println("Problem with reading " + documentTextFile);
            e.printStackTrace();
            newInst[1] = Instance.missingValue();
        }
        try {
            InputStreamReader is;
            if (!documentEncoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(documentTopicsFile), documentEncoding);
            } else {
                is = new InputStreamReader(new FileInputStream(documentTopicsFile));
            }
            // Read the content of the keyphrase file
            StringBuffer keyStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }
            // Add the topics to the instance
            newInst[2] = (double) data.attribute(2).addStringValue(keyStr.toString());
        } catch (Exception e) {
            System.err.println("Problem with reading " + documentTopicsFile);
            e.printStackTrace();
            newInst[2] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));
        mauiFilter.input(data.instance(0));
        data = data.stringFreeStructure();
    }
    mauiFilter.batchFinished();
    while (mauiFilter.output() != null) {
        // drain the filter's output queue
    }
}
From source file:maui.main.MauiTopicExtractor.java
License:Open Source License
/**
 * Extracts keyphrases from the given files and evaluates the results
 * against any existing .key files.
 */
public void extractKeyphrases(HashSet<String> fileNames, VocabularyStore store) throws Exception {
    // Check whether there is actually any data
    if (fileNames.size() == 0) {
        throw new Exception("Couldn't find any data in " + inputDirectoryName);
    }
    mauiFilter.setVocabularyName(getVocabularyName());
    mauiFilter.setVocabularyFormat(getVocabularyFormat());
    mauiFilter.setDocumentLanguage(getDocumentLanguage());
    mauiFilter.setStemmer(getStemmer());
    mauiFilter.setStopwords(getStopwords());
    if (wikipedia != null) {
        mauiFilter.setWikipedia(wikipedia);
    } else if (wikipediaServer.equals("localhost") && wikipediaDatabase.equals("database")) {
        mauiFilter.setWikipedia(wikipedia);
    } else {
        mauiFilter.setWikipedia(wikipediaServer, wikipediaDatabase, cacheWikipediaData,
                wikipediaDataDirectory);
    }
    if (!vocabularyName.equals("none") && !vocabularyName.equals("wikipedia")) {
        mauiFilter.loadThesaurus(getStemmer(), getStopwords(), store);
    }
    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("filename", (FastVector) null));
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);
    System.err.println("-- Extracting keyphrases... ");
    Vector<Double> correctStatistics = new Vector<Double>();
    Vector<Double> precisionStatistics = new Vector<Double>();
    Vector<Double> recallStatistics = new Vector<Double>();
    for (String fileName : fileNames) {
        double[] newInst = new double[3];
        newInst[0] = (double) data.attribute(0).addStringValue(fileName);
        File documentTextFile = new File(inputDirectoryName + "/" + fileName + ".txt");
        File documentTopicsFile = new File(inputDirectoryName + "/" + fileName + ".key");
        try {
            InputStreamReader is;
            if (!documentEncoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(documentTextFile), documentEncoding);
            } else {
                is = new InputStreamReader(new FileInputStream(documentTextFile));
            }
            // Read the file content
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            is.close();
            // Add the text of the document to the instance
            newInst[1] = (double) data.attribute(1).addStringValue(txtStr.toString());
        } catch (Exception e) {
            System.err.println("Problem with reading " + documentTextFile);
            e.printStackTrace();
            newInst[1] = Instance.missingValue();
        }
        try {
            InputStreamReader is;
            if (!documentEncoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(documentTopicsFile), documentEncoding);
            } else {
                is = new InputStreamReader(new FileInputStream(documentTopicsFile));
            }
            // Read the content of the keyphrase file
            StringBuffer keyStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }
            // Add the topics to the instance
            newInst[2] = (double) data.attribute(2).addStringValue(keyStr.toString());
        } catch (Exception e) {
            if (debugMode) {
                System.err.println("No existing topics for " + documentTextFile);
            }
            newInst[2] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));
        mauiFilter.input(data.instance(0));
        data = data.stringFreeStructure();
        if (debugMode) {
            System.err.println("-- Processing document: " + fileName);
        }
        Instance[] topRankedInstances = new Instance[topicsPerDocument];
        Instance inst;
        // Iterate over all extracted keyphrases (inst)
        while ((inst = mauiFilter.output()) != null) {
            int index = (int) inst.value(mauiFilter.getRankIndex()) - 1;
            if (index < topicsPerDocument) {
                topRankedInstances[index] = inst;
            }
        }
        if (debugMode) {
            System.err.println("-- Keyphrases and feature values:");
        }
        FileOutputStream out = null;
        PrintWriter printer = null;
        if (!documentTopicsFile.exists()) {
            out = new FileOutputStream(documentTopicsFile);
            if (!documentEncoding.equals("default")) {
                printer = new PrintWriter(new OutputStreamWriter(out, documentEncoding));
            } else {
                printer = new PrintWriter(out);
            }
        }
        double numExtracted = 0, numCorrect = 0;
        wikipedia = mauiFilter.getWikipedia();
        HashMap<Article, Integer> topics = null;
        if (printGraph) {
            topics = new HashMap<Article, Integer>();
        }
        int p = 0;
        String root = "";
        for (int i = 0; i < topicsPerDocument; i++) {
            if (topRankedInstances[i] != null) {
                if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
                    numExtracted += 1.0;
                }
                if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == 1) {
                    numCorrect += 1.0;
                }
                if (printer != null) {
                    String topic = topRankedInstances[i].stringValue(mauiFilter.getOutputFormIndex());
                    printer.print(topic);
                    if (printGraph) {
                        Article article = wikipedia.getArticleByTitle(topic);
                        if (article == null) {
                            article = wikipedia.getMostLikelyArticle(topic, new CaseFolder());
                        }
                        if (article != null) {
                            // compare string contents, not references
                            if (root.isEmpty()) {
                                root = article.getTitle();
                            }
                            topics.put(article, Integer.valueOf(p));
                        } else if (debugMode) {
                            System.err.println(
                                    "Couldn't find article for " + topic + " in " + documentTopicsFile);
                        }
                        p++;
                    }
                    if (additionalInfo) {
                        printer.print("\t");
                        printer.print(topRankedInstances[i].stringValue(mauiFilter.getNormalizedFormIndex()));
                        printer.print("\t");
                        printer.print(Utils.doubleToString(
                                topRankedInstances[i].value(mauiFilter.getProbabilityIndex()), 4));
                    }
                    printer.println();
                }
                if (debugMode) {
                    System.err.println(topRankedInstances[i]);
                }
            }
        }
        if (printGraph) {
            String graphFile = documentTopicsFile.getAbsolutePath().replace(".key", ".gv");
            computeGraph(topics, root, graphFile);
        }
        if (numExtracted > 0) {
            if (debugMode) {
                System.err.println("-- " + numCorrect + " correct");
            }
            double totalCorrect = mauiFilter.getTotalCorrect();
            correctStatistics.addElement(new Double(numCorrect));
            precisionStatistics.addElement(new Double(numCorrect / numExtracted));
            recallStatistics.addElement(new Double(numCorrect / totalCorrect));
        }
        if (printer != null) {
            printer.flush();
            printer.close();
            out.close();
        }
    }
    if (correctStatistics.size() != 0) {
        double[] st = new double[correctStatistics.size()];
        for (int i = 0; i < correctStatistics.size(); i++) {
            st[i] = correctStatistics.elementAt(i).doubleValue();
        }
        double avg = Utils.mean(st);
        double stdDev = Math.sqrt(Utils.variance(st));
        if (correctStatistics.size() == 1) {
            System.err.println("\n-- Evaluation results based on 1 document:");
        } else {
            System.err.println("\n-- Evaluation results based on " + correctStatistics.size() + " documents:");
        }
        System.err.println("Avg. number of correct keyphrases per document: "
                + Utils.doubleToString(avg, 2) + " +/- " + Utils.doubleToString(stdDev, 2));
        st = new double[precisionStatistics.size()];
        for (int i = 0; i < precisionStatistics.size(); i++) {
            st[i] = precisionStatistics.elementAt(i).doubleValue();
        }
        double avgPrecision = Utils.mean(st);
        double stdDevPrecision = Math.sqrt(Utils.variance(st));
        System.err.println("Precision: " + Utils.doubleToString(avgPrecision * 100, 2) + " +/- "
                + Utils.doubleToString(stdDevPrecision * 100, 2));
        st = new double[recallStatistics.size()];
        for (int i = 0; i < recallStatistics.size(); i++) {
            st[i] = recallStatistics.elementAt(i).doubleValue();
        }
        double avgRecall = Utils.mean(st);
        double stdDevRecall = Math.sqrt(Utils.variance(st));
        System.err.println("Recall: " + Utils.doubleToString(avgRecall * 100, 2) + " +/- "
                + Utils.doubleToString(stdDevRecall * 100, 2));
        double fMeasure = 2 * avgRecall * avgPrecision / (avgRecall + avgPrecision);
        System.err.println("F-Measure: " + Utils.doubleToString(fMeasure * 100, 2));
        System.err.println("");
    }
    mauiFilter.batchFinished();
}
From source file:meddle.PredictByDomainOS.java
License:Open Source License
public static boolean loadAllModels(String className) {
    domainOSModel = new HashMap<String, Classifier>();
    domainOSFeature = new HashMap<String, Map<String, Integer>>();
    domainOSStruct = new HashMap<String, Instances>();
    try {
        File modelFolder = new File(RConfig.modelFolder);
        File[] models = modelFolder.listFiles();
        if (models != null) {
            for (int i = 0; i < models.length; i++) {
                String fn = models[i].getName();
                if (!fn.endsWith(className + ".model"))
                    continue;
                String domainOS = fn.substring(0, fn.length() - className.length() - ".model".length() - 1);
                // deserialize the trained classifier for this domain/OS pair
                Classifier classifier = (Classifier) SerializationHelper.read(RConfig.modelFolder + fn);
                domainOSModel.put(domainOS, classifier);
                // load the matching ARFF header to recover the attribute structure
                ArffLoader loader = new ArffLoader();
                String arffStructureFile = RConfig.arffFolder + domainOS + ".arff";
                File af = new File(arffStructureFile);
                if (!af.exists())
                    continue;
                loader.setFile(new File(arffStructureFile));
                Instances structure;
                try {
                    structure = loader.getStructure();
                } catch (Exception e) {
                    continue;
                }
                structure.setClassIndex(structure.numAttributes() - 1);
                domainOSStruct.put(domainOS, structure);
                // map attribute names to their indices for fast feature lookup
                Map<String, Integer> fi = new HashMap<String, Integer>();
                for (int j = 0; j < structure.numAttributes(); j++) {
                    fi.put(structure.attribute(j).name(), j);
                }
                domainOSFeature.put(domainOS, fi);
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
        return false;
    }
    isModelLoaded = true;
    return true;
}
From source file:meka.classifiers.multilabel.AbstractMultiLabelClassifier.java
License:Open Source License
/**
 * TestCapabilities.
 * Make sure the training data is suitable.
 * @param D the data
 */
public void testCapabilities(Instances D) throws Exception {
    // get the classifier's capabilities, enable all class attributes and do the usual test
    Capabilities cap = getCapabilities();
    cap.enableAllClasses();
    //getCapabilities().testWithFail(D);
    // get the capabilities again, test class attributes individually
    int L = D.classIndex();
    for (int j = 0; j < L; j++) {
        Attribute c = D.attribute(j);
        cap.testWithFail(c, true);
    }
}
From source file:meka.classifiers.multilabel.Evaluation.java
License:Open Source License
/**
 * IsMT - see if dataset D is multi-target (else only multi-label).
 * @param D data
 * @return true iff D is multi-target (else false)
 */
public static boolean isMT(Instances D) {
    int L = D.classIndex();
    for (int j = 0; j < L; j++) {
        if (D.attribute(j).isNominal()) {
            // Classification
            if (D.attribute(j).numValues() > 2) {
                // a multi-class label makes the dataset multi-target
                return true;
            }
        } else {
            // Regression?
            System.err.println("[Warning] Found a non-nominal class -- not sure how this happened?");
        }
    }
    return false;
}
From source file:meka.classifiers.multilabel.PCC.java
License:Open Source License
/**
 * GetKs - return [K_1,K_2,...,K_L] where each Y_j \in {1,...,K_j}.
 * In the multi-label case, K[j] = 2 for all j = 1,...,L.
 * @param D a dataset
 * @return an array of the number of values that each label can take
 */
private static int[] getKs(Instances D) {
    int L = D.classIndex();
    int[] K = new int[L];
    for (int k = 0; k < L; k++) {
        K[k] = D.attribute(k).numValues();
    }
    return K;
}