List of usage examples for weka.core.Instances.instance

public Instance instance(int index)
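Before the project examples below, a minimal sketch of the call itself: load a dataset and walk its rows by zero-based index with instance(int). This is illustrative only; the file name "data.arff" is a placeholder assumption, not taken from any example on this page.

import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class InstanceAccessDemo {
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("data.arff"); // placeholder path
        data.setClassIndex(data.numAttributes() - 1);  // common convention: last attribute is the class
        for (int i = 0; i < data.numInstances(); i++) {
            Instance row = data.instance(i);           // zero-based access to the i-th row
            System.out.println(row + " -> class value: " + row.classValue());
        }
    }
}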
From source file:machine_learing_clasifier.MyID3.java
public void makeTree(Instances data) throws Exception {
    if (data.numInstances() == 0) {
        return;
    }

    // Compute information gain for every non-class attribute
    double[] infoGains = new double[data.numAttributes()];
    for (int i = 0; i < data.numAttributes(); i++) {
        Attribute att = data.attribute(i);
        if (data.classIndex() != att.index()) {
            infoGains[att.index()] = computeInformationGain(data, att);
        }
    }
    m_Attribute = data.attribute(Utils.maxIndex(infoGains));

    if (Utils.eq(infoGains[m_Attribute.index()], 0)) {
        // No gain: make a leaf storing the normalized class distribution
        m_Attribute = null;
        m_Distribution = new double[data.numClasses()];
        for (int i = 0; i < data.numInstances(); i++) {
            int inst = (int) data.instance(i).value(data.classAttribute());
            m_Distribution[inst]++;
        }
        Utils.normalize(m_Distribution);
        m_ClassValue = Utils.maxIndex(m_Distribution);
        m_ClassAttribute = data.classAttribute();
    } else {
        // Split on the best attribute and grow one subtree per attribute value
        Instances[] splitData = splitData(data, m_Attribute);
        m_Successors = new MyID3[m_Attribute.numValues()];
        for (int j = 0; j < m_Attribute.numValues(); j++) {
            m_Successors[j] = new MyID3();
            m_Successors[j].buildClassifier(splitData[j]);
        }
    }
}
From source file:mao.datamining.RemoveUselessColumnsByMissingValues.java
License:Open Source License
/**
 * Signify that this batch of input to the filter is finished.
 *
 * @return true if there are instances pending output
 * @throws Exception if no input format defined
 */
public boolean batchFinished() throws Exception {
    if (getInputFormat() == null) {
        throw new IllegalStateException("No input instance format defined");
    }
    if (m_removeFilter == null) {
        // establish attributes to remove from first batch
        Instances toFilter = getInputFormat();
        int[] attsToDelete = new int[toFilter.numAttributes()];
        int numToDelete = 0;
        for (int i = 0; i < toFilter.numAttributes(); i++) {
            if (i == toFilter.classIndex())
                continue; // skip class
            AttributeStats stats = toFilter.attributeStats(i);
            // remove attributes with a high ratio of missing values
            if ((stats.missingCount * 100) / stats.totalCount > m_maxMissingPercentage) {
                attsToDelete[numToDelete++] = i;
            }
            // remove columns listed for deletion after manual checking
            if (this.column2DeleteSet.contains(toFilter.attribute(i).name())) {
                attsToDelete[numToDelete++] = i;
            }
        }
        int[] finalAttsToDelete = new int[numToDelete];
        System.arraycopy(attsToDelete, 0, finalAttsToDelete, 0, numToDelete);

        m_removeFilter = new Remove();
        m_removeFilter.setAttributeIndicesArray(finalAttsToDelete);
        m_removeFilter.setInvertSelection(false);
        m_removeFilter.setInputFormat(toFilter);

        for (int i = 0; i < toFilter.numInstances(); i++) {
            m_removeFilter.input(toFilter.instance(i));
        }
        m_removeFilter.batchFinished();

        Instance processed;
        Instances outputDataset = m_removeFilter.getOutputFormat();
        // restore old relation name to hide attribute filter stamp
        outputDataset.setRelationName(toFilter.relationName());
        setOutputFormat(outputDataset);
        while ((processed = m_removeFilter.output()) != null) {
            processed.setDataset(outputDataset);
            push(processed);
        }
    }
    flushInput();
    m_NewBatch = true;
    return (numPendingOutput() != 0);
}
From source file:marytts.tools.voiceimport.PauseDurationTrainer.java
License:Open Source License
private Instances enterDurations(Instances data, List<Integer> durs) {
    // discretize and set target attributes (= pause durations);
    // for that, first train the discretizer
    GmmDiscretizer discr = GmmDiscretizer.trainDiscretizer(durs, 6, true);

    // used to store the collected values
    ArrayList<String> targetVals = new ArrayList<String>();
    for (int mappedDur : discr.getPossibleValues()) {
        targetVals.add(mappedDur + "ms");
    }

    // attribute declaration finished
    data.insertAttributeAt(new Attribute("target", targetVals), data.numAttributes());
    for (int i = 0; i < durs.size(); i++) {
        Instance currInst = data.instance(i);
        int dur = durs.get(i);
        currInst.setValue(data.numAttributes() - 1, discr.discretize(dur) + "ms");
    }

    // Make the last attribute be the class
    data.setClassIndex(data.numAttributes() - 1);
    return data;
}
From source file:matres.MatResUI.java
private void doClassification() {
    J48 m_treeResiko;
    J48 m_treeAksi;
    NaiveBayes m_nbResiko;
    NaiveBayes m_nbAksi;
    FastVector m_fvInstanceRisks;
    FastVector m_fvInstanceActions;

    InputStream isRiskTree = getClass().getResourceAsStream("data/ResikoTree.model");
    InputStream isRiskNB = getClass().getResourceAsStream("data/ResikoNB.model");
    InputStream isActionTree = getClass().getResourceAsStream("data/AksiTree.model");
    InputStream isActionNB = getClass().getResourceAsStream("data/AksiNB.model");

    m_treeResiko = new J48();
    m_treeAksi = new J48();
    m_nbResiko = new NaiveBayes();
    m_nbAksi = new NaiveBayes();
    try {
        m_treeResiko = (J48) weka.core.SerializationHelper.read(isRiskTree);
        m_nbResiko = (NaiveBayes) weka.core.SerializationHelper.read(isRiskNB);
        m_treeAksi = (J48) weka.core.SerializationHelper.read(isActionTree);
        m_nbAksi = (NaiveBayes) weka.core.SerializationHelper.read(isActionNB);
    } catch (Exception ex) {
        Logger.getLogger(MatResUI.class.getName()).log(Level.SEVERE, null, ex);
    }

    System.out.println("Setting up an Instance...");

    // Values for LIKELIHOOD OF OCCURRENCE
    FastVector fvLO = new FastVector(5);
    fvLO.addElement("> 10 in 1 year");
    fvLO.addElement("1 - 10 in 1 year");
    fvLO.addElement("1 in 1 year to 1 in 10 years");
    fvLO.addElement("1 in 10 years to 1 in 100 years");
    fvLO.addElement("1 in more than 100 years");
    // Values for SAFETY
    FastVector fvSafety = new FastVector(5);
    fvSafety.addElement("near miss");
    fvSafety.addElement("first aid injury, medical aid injury");
    fvSafety.addElement("lost time injury / temporary disability");
    fvSafety.addElement("permanent disability");
    fvSafety.addElement("fatality");
    // Values for EXTRA FUEL COST
    FastVector fvEFC = new FastVector(5);
    fvEFC.addElement("< 100 million rupiah");
    fvEFC.addElement("0,1 - 1 billion rupiah");
    fvEFC.addElement("1 - 10 billion rupiah");
    fvEFC.addElement("10 - 100 billion rupiah");
    fvEFC.addElement("> 100 billion rupiah");
    // Values for SYSTEM RELIABILITY
    FastVector fvSR = new FastVector(5);
    fvSR.addElement("< 100 MWh");
    fvSR.addElement("0,1 - 1 GWh");
    fvSR.addElement("1 - 10 GWh");
    fvSR.addElement("10 - 100 GWh");
    fvSR.addElement("> 100 GWh");
    // Values for EQUIPMENT COST
    FastVector fvEC = new FastVector(5);
    fvEC.addElement("< 50 million rupiah");
    fvEC.addElement("50 - 500 million rupiah");
    fvEC.addElement("0,5 - 5 billion rupiah");
    fvEC.addElement("5 -50 billion rupiah");
    fvEC.addElement("> 50 billion rupiah");
    // Values for CUSTOMER SATISFACTION SOCIAL FACTOR
    FastVector fvCSSF = new FastVector(5);
    fvCSSF.addElement("Complaint from the VIP customer");
    fvCSSF.addElement("Complaint from industrial customer");
    fvCSSF.addElement("Complaint from community");
    fvCSSF.addElement("Complaint from community that have potential riot");
    fvCSSF.addElement("High potential riot");
    // Values for RISK
    FastVector fvRisk = new FastVector(4);
    fvRisk.addElement("Low");
    fvRisk.addElement("Moderate");
    fvRisk.addElement("High");
    fvRisk.addElement("Extreme");
    // Values for ACTION
    FastVector fvAction = new FastVector(3);
    fvAction.addElement("Life Extension Program");
    fvAction.addElement("Repair/Refurbish");
    fvAction.addElement("Replace/Run to Fail + Investment");

    // Defining attributes, including the class attributes
    Attribute attrLO = new Attribute("LO", fvLO);
    Attribute attrSafety = new Attribute("Safety", fvSafety);
    Attribute attrEFC = new Attribute("EFC", fvEFC);
    Attribute attrSR = new Attribute("SR", fvSR);
    Attribute attrEC = new Attribute("EC", fvEC);
    Attribute attrCSSF = new Attribute("CSSF", fvCSSF);
    Attribute attrRisk = new Attribute("Risk", fvRisk);
    Attribute attrAction = new Attribute("Action", fvAction);

    m_fvInstanceRisks = new FastVector(7);
    m_fvInstanceRisks.addElement(attrLO);
    m_fvInstanceRisks.addElement(attrSafety);
    m_fvInstanceRisks.addElement(attrEFC);
    m_fvInstanceRisks.addElement(attrSR);
    m_fvInstanceRisks.addElement(attrEC);
    m_fvInstanceRisks.addElement(attrCSSF);
    m_fvInstanceRisks.addElement(attrRisk);

    m_fvInstanceActions = new FastVector(7);
    m_fvInstanceActions.addElement(attrLO);
    m_fvInstanceActions.addElement(attrSafety);
    m_fvInstanceActions.addElement(attrEFC);
    m_fvInstanceActions.addElement(attrSR);
    m_fvInstanceActions.addElement(attrEC);
    m_fvInstanceActions.addElement(attrCSSF);
    m_fvInstanceActions.addElement(attrAction);

    Instances dataRisk = new Instances("A-Risk-instance-to-classify", m_fvInstanceRisks, 0);
    Instances dataAction = new Instances("An-Action-instance-to-classify", m_fvInstanceActions, 0);

    String strLO = (String) m_cmbLO.getSelectedItem();
    String strSafety = (String) m_cmbSafety.getSelectedItem();
    String strEFC = (String) m_cmbEFC.getSelectedItem();
    String strSR = (String) m_cmbSR.getSelectedItem();
    String strEC = (String) m_cmbEC.getSelectedItem();
    String strCSSF = (String) m_cmbCSSF.getSelectedItem();

    Instance instRisk = new DenseInstance(7);
    Instance instAction = new DenseInstance(7);
    if (strLO.equals("-- none --")) {
        instRisk.setMissing(0);
        instAction.setMissing(0);
    } else {
        instRisk.setValue((Attribute) m_fvInstanceRisks.elementAt(0), strLO);
        instAction.setValue((Attribute) m_fvInstanceActions.elementAt(0), strLO);
    }
    if (strSafety.equals("-- none --")) {
        instRisk.setMissing(1);
        instAction.setMissing(1);
    } else {
        instRisk.setValue((Attribute) m_fvInstanceRisks.elementAt(1), strSafety);
        instAction.setValue((Attribute) m_fvInstanceActions.elementAt(1), strSafety);
    }
    if (strEFC.equals("-- none --")) {
        instRisk.setMissing(2);
        instAction.setMissing(2);
    } else {
        instRisk.setValue((Attribute) m_fvInstanceRisks.elementAt(2), strEFC);
        instAction.setValue((Attribute) m_fvInstanceActions.elementAt(2), strEFC);
    }
    if (strSR.equals("-- none --")) {
        instRisk.setMissing(3);
        instAction.setMissing(3);
    } else {
        instRisk.setValue((Attribute) m_fvInstanceRisks.elementAt(3), strSR);
        instAction.setValue((Attribute) m_fvInstanceActions.elementAt(3), strSR);
    }
    if (strEC.equals("-- none --")) {
        instRisk.setMissing(4);
        instAction.setMissing(4);
    } else {
        instRisk.setValue((Attribute) m_fvInstanceRisks.elementAt(4), strEC);
        instAction.setValue((Attribute) m_fvInstanceActions.elementAt(4), strEC);
    }
    if (strCSSF.equals("-- none --")) {
        instRisk.setMissing(5);
        instAction.setMissing(5);
    } else {
        instAction.setValue((Attribute) m_fvInstanceActions.elementAt(5), strCSSF);
        instRisk.setValue((Attribute) m_fvInstanceRisks.elementAt(5), strCSSF);
    }
    instRisk.setMissing(6);
    instAction.setMissing(6);

    dataRisk.add(instRisk);
    instRisk.setDataset(dataRisk);
    dataRisk.setClassIndex(dataRisk.numAttributes() - 1);
    dataAction.add(instAction);
    instAction.setDataset(dataAction);
    dataAction.setClassIndex(dataAction.numAttributes() - 1);

    System.out.println("Instance Resiko: " + dataRisk.instance(0));
    System.out.println("\tNum Attributes : " + dataRisk.numAttributes());
    System.out.println("\tNum instances : " + dataRisk.numInstances());
    System.out.println("Instance Action: " + dataAction.instance(0));
    System.out.println("\tNum Attributes : " + dataAction.numAttributes());
    System.out.println("\tNum instances : " + dataAction.numInstances());

    int classIndexRisk = 0;
    int classIndexAction = 0;
    String strClassRisk = null;
    String strClassAction = null;
    try {
        classIndexRisk = (int) m_treeResiko.classifyInstance(instRisk);
        classIndexAction = (int) m_treeAksi.classifyInstance(instAction);
    } catch (Exception ex) {
        Logger.getLogger(MatResUI.class.getName()).log(Level.SEVERE, null, ex);
    }
    strClassRisk = (String) fvRisk.elementAt(classIndexRisk);
    strClassAction = (String) fvAction.elementAt(classIndexAction);
    System.out.println("[Risk Class Index: " + classIndexRisk + " Class Label: " + strClassRisk + "]");
    System.out.println("[Action Class Index: " + classIndexAction + " Class Label: " + strClassAction + "]");
    if (strClassRisk != null) {
        m_txtRisk.setText(strClassRisk);
    }

    double[] riskDist = null;
    double[] actionDist = null;
    try {
        riskDist = m_nbResiko.distributionForInstance(dataRisk.instance(0));
        actionDist = m_nbAksi.distributionForInstance(dataAction.instance(0));
        // set up RISK progress bars
        m_jBarRiskLow.setValue((int) (100 * riskDist[0]));
        m_jBarRiskLow.setString(String.format("%6.3f%%", 100 * riskDist[0]));
        m_jBarRiskModerate.setValue((int) (100 * riskDist[1]));
        m_jBarRiskModerate.setString(String.format("%6.3f%%", 100 * riskDist[1]));
        m_jBarRiskHigh.setValue((int) (100 * riskDist[2]));
        m_jBarRiskHigh.setString(String.format("%6.3f%%", 100 * riskDist[2]));
        m_jBarRiskExtreme.setValue((int) (100 * riskDist[3]));
        m_jBarRiskExtreme.setString(String.format("%6.3f%%", 100 * riskDist[3]));
    } catch (Exception ex) {
        Logger.getLogger(MatResUI.class.getName()).log(Level.SEVERE, null, ex);
    }

    double predictedProb = 0.0;
    String predictedClass = "";
    // Loop over all the prediction labels in the distribution.
    for (int predictionDistributionIndex = 0; predictionDistributionIndex < riskDist.length; predictionDistributionIndex++) {
        // Get this distribution index's class label.
        String predictionDistributionIndexAsClassLabel = dataRisk.classAttribute()
                .value(predictionDistributionIndex);
        int classIndex = dataRisk.classAttribute().indexOfValue(predictionDistributionIndexAsClassLabel);
        // Get the probability.
        double predictionProbability = riskDist[predictionDistributionIndex];
        if (predictionProbability > predictedProb) {
            predictedProb = predictionProbability;
            predictedClass = predictionDistributionIndexAsClassLabel;
        }
        System.out.printf("[%2d %10s : %6.3f]", classIndex, predictionDistributionIndexAsClassLabel,
                predictionProbability);
    }
    m_txtRiskNB.setText(predictedClass);
}
From source file:maui.main.MauiModelBuilder.java
License:Open Source License
/**
 * Builds the model from the training data
 */
public void buildModel(HashSet<String> fileNames, VocabularyStore store) throws Exception {
    // Check whether there is actually any data
    if (fileNames.size() == 0) {
        throw new Exception("Couldn't find any data in " + inputDirectoryName);
    }
    System.err.println("-- Building the model... ");

    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("filename", (FastVector) null));
    atts.addElement(new Attribute("document", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    // Build model
    mauiFilter = new MauiFilter();
    mauiFilter.setDebug(getDebug());
    mauiFilter.setMaxPhraseLength(getMaxPhraseLength());
    mauiFilter.setMinPhraseLength(getMinPhraseLength());
    mauiFilter.setMinNumOccur(getMinNumOccur());
    mauiFilter.setStemmer(getStemmer());
    mauiFilter.setDocumentLanguage(getDocumentLanguage());
    mauiFilter.setVocabularyName(getVocabularyName());
    mauiFilter.setVocabularyFormat(getVocabularyFormat());
    mauiFilter.setStopwords(getStopwords());
    if (wikipedia != null) {
        mauiFilter.setWikipedia(wikipedia);
    } else if (wikipediaServer.equals("localhost") && wikipediaDatabase.equals("database")) {
        mauiFilter.setWikipedia(wikipedia);
    } else {
        mauiFilter.setWikipedia(wikipediaServer, wikipediaDatabase, cacheWikipediaData, wikipediaDataDirectory);
    }
    if (classifier != null) {
        mauiFilter.setClassifier(classifier);
    }
    mauiFilter.setInputFormat(data);

    // set feature configurations
    mauiFilter.setBasicFeatures(useBasicFeatures);
    mauiFilter.setKeyphrasenessFeature(useKeyphrasenessFeature);
    mauiFilter.setFrequencyFeatures(useFrequencyFeatures);
    mauiFilter.setPositionsFeatures(usePositionsFeatures);
    mauiFilter.setLengthFeature(useLengthFeature);
    mauiFilter.setThesaurusFeatures(useNodeDegreeFeature);
    mauiFilter.setBasicWikipediaFeatures(useBasicWikipediaFeatures);
    mauiFilter.setAllWikipediaFeatures(useAllWikipediaFeatures);
    mauiFilter.setThesaurusFeatures(useNodeDegreeFeature);
    mauiFilter.setClassifier(classifier);
    mauiFilter.setContextSize(contextSize);
    mauiFilter.setMinKeyphraseness(minKeyphraseness);
    mauiFilter.setMinSenseProbability(minSenseProbability);

    if (!vocabularyName.equals("none") && !vocabularyName.equals("wikipedia")) {
        mauiFilter.loadThesaurus(getStemmer(), getStopwords(), store);
    }

    System.err.println("-- Reading the input documents... ");
    for (String fileName : fileNames) {
        double[] newInst = new double[3];
        newInst[0] = (double) data.attribute(0).addStringValue(fileName);
        File documentTextFile = new File(inputDirectoryName + "/" + fileName + ".txt");
        File documentTopicsFile = new File(inputDirectoryName + "/" + fileName + ".key");
        try {
            InputStreamReader is;
            if (!documentEncoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(documentTextFile), documentEncoding);
            } else {
                is = new InputStreamReader(new FileInputStream(documentTextFile));
            }
            // Reading the file content
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            is.close();
            // Adding the text of the document to the instance
            newInst[1] = (double) data.attribute(1).addStringValue(txtStr.toString());
        } catch (Exception e) {
            System.err.println("Problem with reading " + documentTextFile);
            e.printStackTrace();
            newInst[1] = Instance.missingValue();
        }
        try {
            InputStreamReader is;
            if (!documentEncoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(documentTopicsFile), documentEncoding);
            } else {
                is = new InputStreamReader(new FileInputStream(documentTopicsFile));
            }
            // Reading the content of the keyphrase file
            StringBuffer keyStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }
            // Adding the topics to the instance
            newInst[2] = (double) data.attribute(2).addStringValue(keyStr.toString());
        } catch (Exception e) {
            System.err.println("Problem with reading " + documentTopicsFile);
            e.printStackTrace();
            newInst[2] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));
        mauiFilter.input(data.instance(0));
        data = data.stringFreeStructure();
    }
    mauiFilter.batchFinished();
    // Drain the filter's output queue; training only needs the side effects
    while (mauiFilter.output() != null) {
    }
}
From source file:maui.main.MauiTopicExtractor.java
License:Open Source License
/**
 * Builds the model from the files
 */
public void extractKeyphrases(HashSet<String> fileNames, VocabularyStore store) throws Exception {
    // Check whether there is actually any data
    if (fileNames.size() == 0) {
        throw new Exception("Couldn't find any data in " + inputDirectoryName);
    }

    mauiFilter.setVocabularyName(getVocabularyName());
    mauiFilter.setVocabularyFormat(getVocabularyFormat());
    mauiFilter.setDocumentLanguage(getDocumentLanguage());
    mauiFilter.setStemmer(getStemmer());
    mauiFilter.setStopwords(getStopwords());
    if (wikipedia != null) {
        mauiFilter.setWikipedia(wikipedia);
    } else if (wikipediaServer.equals("localhost") && wikipediaDatabase.equals("database")) {
        mauiFilter.setWikipedia(wikipedia);
    } else {
        mauiFilter.setWikipedia(wikipediaServer, wikipediaDatabase, cacheWikipediaData, wikipediaDataDirectory);
    }
    if (!vocabularyName.equals("none") && !vocabularyName.equals("wikipedia")) {
        mauiFilter.loadThesaurus(getStemmer(), getStopwords(), store);
    }

    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("filename", (FastVector) null));
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    System.err.println("-- Extracting keyphrases... ");

    Vector<Double> correctStatistics = new Vector<Double>();
    Vector<Double> precisionStatistics = new Vector<Double>();
    Vector<Double> recallStatistics = new Vector<Double>();

    for (String fileName : fileNames) {
        double[] newInst = new double[3];
        newInst[0] = (double) data.attribute(0).addStringValue(fileName);
        File documentTextFile = new File(inputDirectoryName + "/" + fileName + ".txt");
        File documentTopicsFile = new File(inputDirectoryName + "/" + fileName + ".key");
        try {
            InputStreamReader is;
            if (!documentEncoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(documentTextFile), documentEncoding);
            } else {
                is = new InputStreamReader(new FileInputStream(documentTextFile));
            }
            // Reading the file content
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            is.close();
            // Adding the text of the document to the instance
            newInst[1] = (double) data.attribute(1).addStringValue(txtStr.toString());
        } catch (Exception e) {
            System.err.println("Problem with reading " + documentTextFile);
            e.printStackTrace();
            newInst[1] = Instance.missingValue();
        }
        try {
            InputStreamReader is;
            if (!documentEncoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(documentTopicsFile), documentEncoding);
            } else {
                is = new InputStreamReader(new FileInputStream(documentTopicsFile));
            }
            // Reading the content of the keyphrase file
            StringBuffer keyStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }
            // Adding the topics to the instance
            newInst[2] = (double) data.attribute(2).addStringValue(keyStr.toString());
        } catch (Exception e) {
            if (debugMode) {
                System.err.println("No existing topics for " + documentTextFile);
            }
            newInst[2] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));
        mauiFilter.input(data.instance(0));
        data = data.stringFreeStructure();
        if (debugMode) {
            System.err.println("-- Processing document: " + fileName);
        }

        Instance[] topRankedInstances = new Instance[topicsPerDocument];
        Instance inst;
        // Iterating over all extracted keyphrases (inst)
        while ((inst = mauiFilter.output()) != null) {
            int index = (int) inst.value(mauiFilter.getRankIndex()) - 1;
            if (index < topicsPerDocument) {
                topRankedInstances[index] = inst;
            }
        }
        if (debugMode) {
            System.err.println("-- Keyphrases and feature values:");
        }

        FileOutputStream out = null;
        PrintWriter printer = null;
        if (!documentTopicsFile.exists()) {
            out = new FileOutputStream(documentTopicsFile);
            if (!documentEncoding.equals("default")) {
                printer = new PrintWriter(new OutputStreamWriter(out, documentEncoding));
            } else {
                printer = new PrintWriter(out);
            }
        }

        double numExtracted = 0, numCorrect = 0;
        wikipedia = mauiFilter.getWikipedia();
        HashMap<Article, Integer> topics = null;
        if (printGraph) {
            topics = new HashMap<Article, Integer>();
        }
        int p = 0;
        String root = "";
        for (int i = 0; i < topicsPerDocument; i++) {
            if (topRankedInstances[i] != null) {
                if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
                    numExtracted += 1.0;
                }
                if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == 1) {
                    numCorrect += 1.0;
                }
                if (printer != null) {
                    String topic = topRankedInstances[i].stringValue(mauiFilter.getOutputFormIndex());
                    printer.print(topic);
                    if (printGraph) {
                        Article article = wikipedia.getArticleByTitle(topic);
                        if (article == null) {
                            article = wikipedia.getMostLikelyArticle(topic, new CaseFolder());
                        }
                        if (article != null) {
                            if (root.isEmpty()) {
                                root = article.getTitle();
                            }
                            topics.put(article, new Integer(p));
                        } else {
                            if (debugMode) {
                                System.err.println("Couldn't find article for " + topic + " in " + documentTopicsFile);
                            }
                        }
                        p++;
                    }
                    if (additionalInfo) {
                        printer.print("\t");
                        printer.print(topRankedInstances[i].stringValue(mauiFilter.getNormalizedFormIndex()));
                        printer.print("\t");
                        printer.print(Utils.doubleToString(
                                topRankedInstances[i].value(mauiFilter.getProbabilityIndex()), 4));
                    }
                    printer.println();
                }
                if (debugMode) {
                    System.err.println(topRankedInstances[i]);
                }
            }
        }
        if (printGraph) {
            String graphFile = documentTopicsFile.getAbsolutePath().replace(".key", ".gv");
            computeGraph(topics, root, graphFile);
        }
        if (numExtracted > 0) {
            if (debugMode) {
                System.err.println("-- " + numCorrect + " correct");
            }
            double totalCorrect = mauiFilter.getTotalCorrect();
            correctStatistics.addElement(new Double(numCorrect));
            precisionStatistics.addElement(new Double(numCorrect / numExtracted));
            recallStatistics.addElement(new Double(numCorrect / totalCorrect));
        }
        if (printer != null) {
            printer.flush();
            printer.close();
            out.close();
        }
    }

    if (correctStatistics.size() != 0) {
        double[] st = new double[correctStatistics.size()];
        for (int i = 0; i < correctStatistics.size(); i++) {
            st[i] = correctStatistics.elementAt(i).doubleValue();
        }
        double avg = Utils.mean(st);
        double stdDev = Math.sqrt(Utils.variance(st));
        if (correctStatistics.size() == 1) {
            System.err.println("\n-- Evaluation results based on 1 document:");
        } else {
            System.err.println("\n-- Evaluation results based on " + correctStatistics.size() + " documents:");
        }
        System.err.println("Avg. number of correct keyphrases per document: " + Utils.doubleToString(avg, 2)
                + " +/- " + Utils.doubleToString(stdDev, 2));

        st = new double[precisionStatistics.size()];
        for (int i = 0; i < precisionStatistics.size(); i++) {
            st[i] = precisionStatistics.elementAt(i).doubleValue();
        }
        double avgPrecision = Utils.mean(st);
        double stdDevPrecision = Math.sqrt(Utils.variance(st));
        System.err.println("Precision: " + Utils.doubleToString(avgPrecision * 100, 2) + " +/- "
                + Utils.doubleToString(stdDevPrecision * 100, 2));

        st = new double[recallStatistics.size()];
        for (int i = 0; i < recallStatistics.size(); i++) {
            st[i] = recallStatistics.elementAt(i).doubleValue();
        }
        double avgRecall = Utils.mean(st);
        double stdDevRecall = Math.sqrt(Utils.variance(st));
        System.err.println("Recall: " + Utils.doubleToString(avgRecall * 100, 2) + " +/- "
                + Utils.doubleToString(stdDevRecall * 100, 2));

        double fMeasure = 2 * avgRecall * avgPrecision / (avgRecall + avgPrecision);
        System.err.println("F-Measure: " + Utils.doubleToString(fMeasure * 100, 2));
        System.err.println("");
    }
    mauiFilter.batchFinished();
}
From source file:meansagnes.MyKMeans.java
@Override
public void buildClusterer(Instances data) throws Exception {
    currentIteration = 0;
    replaceMissingFilter = new ReplaceMissingValues();
    instances = new Instances(data);
    instances.setClassIndex(-1);
    replaceMissingFilter.setInputFormat(instances);
    instances = Filter.useFilter(instances, replaceMissingFilter);
    distanceFunction.setInstances(instances);

    clusterCentroids = new Instances(instances, numCluster);
    clusterAssignments = new int[instances.numInstances()];

    // randomly pick distinct instances as the initial centroids
    Random randomizer = new Random(getSeed());
    int[] instanceAsCentroid = new int[numCluster];
    for (int i = 0; i < numCluster; i++) {
        instanceAsCentroid[i] = -1;
    }
    for (int i = 0; i < numCluster; i++) {
        int centroidCluster = randomizer.nextInt(instances.numInstances());
        boolean found = false;
        for (int j = 0; j < i && !found; j++) {
            if (instanceAsCentroid[j] == centroidCluster) {
                i--; // already used as a centroid; retry this slot
                found = true;
            }
        }
        if (!found) {
            clusterCentroids.add(instances.instance(centroidCluster));
            instanceAsCentroid[i] = centroidCluster;
        }
    }

    double[][] distancesToCentroid = new double[numCluster][instances.numInstances()];
    double[] minDistancesToCentroid = new double[instances.numInstances()];
    boolean converged = false;
    Instances prevCentroids;
    while (!converged) {
        currentIteration++;
        // compute the distance from every instance to every centroid
        for (int i = 0; i < numCluster; i++) { // i is the cluster index
            for (int j = 0; j < instances.numInstances(); j++) { // j is the instance index
                distancesToCentroid[i][j] = distanceFunction.distance(clusterCentroids.instance(i),
                        instances.instance(j));
            }
        }
        // assign each instance to its nearest centroid
        for (int j = 0; j < instances.numInstances(); j++) {
            minDistancesToCentroid[j] = distancesToCentroid[0][j];
            clusterAssignments[j] = 0;
        }
        for (int j = 0; j < instances.numInstances(); j++) {
            for (int i = 1; i < numCluster; i++) {
                if (minDistancesToCentroid[j] > distancesToCentroid[i][j]) {
                    minDistancesToCentroid[j] = distancesToCentroid[i][j];
                    clusterAssignments[j] = i;
                }
            }
        }
        for (int i = 0; i < numCluster; i++) {
            System.out.println(clusterCentroids.instance(i));
        }

        // update centroids
        prevCentroids = clusterCentroids;
        clusterCentroids = new Instances(instances, numCluster);
        clusteredInstances = new Instances[numCluster];
        for (int i = 0; i < numCluster; i++) {
            clusteredInstances[i] = new Instances(instances, 0);
        }
        for (int i = 0; i < instances.numInstances(); i++) {
            clusteredInstances[clusterAssignments[i]].add(instances.instance(i));
            System.out.println(instances.instance(i).toString() + " : " + clusterAssignments[i]);
        }
        if (currentIteration == maxIterations) {
            converged = true;
        }
        Instances newCentroids = new Instances(instances, numCluster);
        for (int i = 0; i < numCluster; i++) {
            newCentroids.add(moveCentroid(clusteredInstances[i]));
        }
        clusterCentroids = newCentroids;

        // stop once no centroid has moved
        boolean centroidChanged = false;
        for (int i = 0; i < numCluster; i++) {
            if (distanceFunction.distance(prevCentroids.instance(i), clusterCentroids.instance(i)) > 0) {
                centroidChanged = true;
            }
        }
        if (!centroidChanged) {
            converged = true;
        }
        System.out.println("\n\n");
    }

    clusterSizes = new int[numCluster];
    for (int i = 0; i < numCluster; i++) {
        clusterSizes[i] = clusteredInstances[i].numInstances();
    }
    distanceFunction.clean();
}
From source file:meka.classifiers.multilabel.BRq.java
License:Open Source License
@Override
public void buildClassifier(Instances data) throws Exception {
    testCapabilities(data);

    int c = data.classIndex();
    if (getDebug())
        System.out.print("-: Creating " + c + " models (" + m_Classifier.getClass().getName() + "): ");
    m_MultiClassifiers = AbstractClassifier.makeCopies(m_Classifier, c);

    Instances sub_data = null;
    for (int i = 0; i < c; i++) {
        int indices[][] = new int[c][c - 1];
        for (int j = 0, k = 0; j < c; j++) {
            if (j != i) {
                indices[i][k++] = j;
            }
        }

        // Select only class attribute 'i'
        Remove FilterRemove = new Remove();
        FilterRemove.setAttributeIndicesArray(indices[i]);
        FilterRemove.setInputFormat(data);
        FilterRemove.setInvertSelection(true);
        sub_data = Filter.useFilter(data, FilterRemove);
        sub_data.setClassIndex(0);

        /* BEGIN downsample for this link */
        sub_data.randomize(m_Random);
        int numToRemove = sub_data.numInstances()
                - (int) Math.round(sub_data.numInstances() * m_DownSampleRatio);
        for (int m = 0, removed = 0; m < sub_data.numInstances(); m++) {
            if (sub_data.instance(m).classValue() <= 0.0) {
                sub_data.instance(m).setClassMissing();
                if (++removed >= numToRemove)
                    break;
            }
        }
        sub_data.deleteWithMissingClass();
        /* END downsample for this link */

        // Build the classifier for that class
        m_MultiClassifiers[i].buildClassifier(sub_data);
        if (getDebug())
            System.out.print(" " + (i + 1));
    }
    if (getDebug())
        System.out.println(" :-");
    m_InstancesTemplate = new Instances(sub_data, 0);
}
From source file:meka.classifiers.multilabel.Evaluation.java
License:Open Source License
/**
 * TestClassifier - test classifier h on D_test
 * @param h a multi-dim. classifier, ALREADY BUILT
 * @param D_test test data
 * @return Result with raw prediction data ONLY
 */
public static Result testClassifier(MultiLabelClassifier h, Instances D_test) throws Exception {
    int L = D_test.classIndex();
    Result result = new Result(D_test.numInstances(), L);
    if (h.getDebug())
        System.out.print(":- Evaluate ");
    for (int i = 0, c = 0; i < D_test.numInstances(); i++) {
        if (h.getDebug()) {
            int t = i * 50 / D_test.numInstances();
            if (t > c) {
                System.out.print("#");
                c = t;
            }
        }
        // No cheating allowed; clear all class information
        AbstractInstance x = (AbstractInstance) ((AbstractInstance) D_test.instance(i)).copy();
        for (int v = 0; v < D_test.classIndex(); v++)
            x.setValue(v, 0.0);
        // Get and store ranking
        double y[] = h.distributionForInstance(x);
        // Cut off any [no-longer-needed] probabilistic information from MT classifiers.
        if (h instanceof MultiTargetClassifier)
            y = Arrays.copyOf(y, L);
        // Store the result
        result.addResult(y, D_test.instance(i));
    }
    if (h.getDebug())
        System.out.println(":-");
    return result;
}
From source file:meka.classifiers.multilabel.Evaluation.java
License:Open Source License
/**
 * Test Classifier but threaded (Multiple)
 * @param h a multi-dim. classifier, ALREADY BUILT (threaded, implements MultiLabelThreaded)
 * @param D_test test data
 * @return Result with raw prediction data ONLY
 */
public static Result testClassifierM(MultiLabelClassifier h, Instances D_test) throws Exception {
    int L = D_test.classIndex();
    Result result = new Result(D_test.numInstances(), L);
    if (h.getDebug())
        System.out.print(":- Evaluate ");
    if (h instanceof MultiLabelClassifierThreaded) {
        ((MultiLabelClassifierThreaded) h).setThreaded(true);
        // Predict all test instances in one threaded batch call
        double y[][] = ((MultiLabelClassifierThreaded) h).distributionForInstanceM(D_test);
        for (int i = 0; i < D_test.numInstances(); i++) {
            // Store the result
            result.addResult(y[i], D_test.instance(i));
        }
        if (h.getDebug())
            System.out.println(":-");
    }
    return result;
}