List of usage examples for weka.core Instances instance
publicInstance instance(int index)
From source file:j48.NBTreeSplit.java
License:Open Source License
/** * Creates split on enumerated attribute. * * @exception Exception if something goes wrong *///from w w w .j a v a2 s. com private void handleEnumeratedAttribute(Instances trainInstances) throws Exception { m_c45S = new C45Split(m_attIndex, 2, m_sumOfWeights); m_c45S.buildClassifier(trainInstances); if (m_c45S.numSubsets() == 0) { return; } m_errors = 0; Instance instance; Instances[] trainingSets = new Instances[m_complexityIndex]; for (int i = 0; i < m_complexityIndex; i++) { trainingSets[i] = new Instances(trainInstances, 0); } /* m_distribution = new Distribution(m_complexityIndex, trainInstances.numClasses()); */ int subset; for (int i = 0; i < trainInstances.numInstances(); i++) { instance = trainInstances.instance(i); subset = m_c45S.whichSubset(instance); if (subset > -1) { trainingSets[subset].add((Instance) instance.copy()); } else { double[] weights = m_c45S.weights(instance); for (int j = 0; j < m_complexityIndex; j++) { try { Instance temp = (Instance) instance.copy(); if (weights.length == m_complexityIndex) { temp.setWeight(temp.weight() * weights[j]); } else { temp.setWeight(temp.weight() / m_complexityIndex); } trainingSets[j].add(temp); } catch (Exception ex) { ex.printStackTrace(); System.err.println("*** " + m_complexityIndex); System.err.println(weights.length); System.exit(1); } } } } /* // compute weights (weights of instances per subset m_weights = new double [m_complexityIndex]; for (int i = 0; i < m_complexityIndex; i++) { m_weights[i] = trainingSets[i].sumOfWeights(); } Utils.normalize(m_weights); */ /* // Only Instances with known values are relevant. Enumeration enu = trainInstances.enumerateInstances(); while (enu.hasMoreElements()) { instance = (Instance) enu.nextElement(); if (!instance.isMissing(m_attIndex)) { // m_distribution.add((int)instance.value(m_attIndex),instance); trainingSets[(int)instances.value(m_attIndex)].add(instance); } else { // add these to the error count m_errors += instance.weight(); } } */ Random r = new Random(1); int minNumCount = 0; for (int i = 0; i < m_complexityIndex; i++) { if (trainingSets[i].numInstances() >= 5) { minNumCount++; // Discretize the sets Discretize disc = new Discretize(); disc.setInputFormat(trainingSets[i]); trainingSets[i] = Filter.useFilter(trainingSets[i], disc); trainingSets[i].randomize(r); trainingSets[i].stratify(5); NaiveBayesUpdateable fullModel = new NaiveBayesUpdateable(); fullModel.buildClassifier(trainingSets[i]); // add the errors for this branch of the split m_errors += NBTreeNoSplit.crossValidate(fullModel, trainingSets[i], r); } else { // if fewer than min obj then just count them as errors for (int j = 0; j < trainingSets[i].numInstances(); j++) { m_errors += trainingSets[i].instance(j).weight(); } } } // Check if there are at least five instances in at least two of the subsets // subsets. if (minNumCount > 1) { m_numSubsets = m_complexityIndex; } }
From source file:j48.NBTreeSplit.java
License:Open Source License
/** * Creates split on numeric attribute.//from ww w. j a va2s. c o m * * @exception Exception if something goes wrong */ private void handleNumericAttribute(Instances trainInstances) throws Exception { m_c45S = new C45Split(m_attIndex, 2, m_sumOfWeights); m_c45S.buildClassifier(trainInstances); if (m_c45S.numSubsets() == 0) { return; } m_errors = 0; Instances[] trainingSets = new Instances[m_complexityIndex]; trainingSets[0] = new Instances(trainInstances, 0); trainingSets[1] = new Instances(trainInstances, 0); int subset = -1; // populate the subsets for (int i = 0; i < trainInstances.numInstances(); i++) { Instance instance = trainInstances.instance(i); subset = m_c45S.whichSubset(instance); if (subset != -1) { trainingSets[subset].add((Instance) instance.copy()); } else { double[] weights = m_c45S.weights(instance); for (int j = 0; j < m_complexityIndex; j++) { Instance temp = (Instance) instance.copy(); if (weights.length == m_complexityIndex) { temp.setWeight(temp.weight() * weights[j]); } else { temp.setWeight(temp.weight() / m_complexityIndex); } trainingSets[j].add(temp); } } } /* // compute weights (weights of instances per subset m_weights = new double [m_complexityIndex]; for (int i = 0; i < m_complexityIndex; i++) { m_weights[i] = trainingSets[i].sumOfWeights(); } Utils.normalize(m_weights); */ Random r = new Random(1); int minNumCount = 0; for (int i = 0; i < m_complexityIndex; i++) { if (trainingSets[i].numInstances() > 5) { minNumCount++; // Discretize the sets Discretize disc = new Discretize(); disc.setInputFormat(trainingSets[i]); trainingSets[i] = Filter.useFilter(trainingSets[i], disc); trainingSets[i].randomize(r); trainingSets[i].stratify(5); NaiveBayesUpdateable fullModel = new NaiveBayesUpdateable(); fullModel.buildClassifier(trainingSets[i]); // add the errors for this branch of the split m_errors += NBTreeNoSplit.crossValidate(fullModel, trainingSets[i], r); } else { for (int j = 0; j < trainingSets[i].numInstances(); j++) { m_errors += trainingSets[i].instance(j).weight(); } } } // Check if minimum number of Instances in at least two // subsets. if (minNumCount > 1) { m_numSubsets = m_complexityIndex; } }
From source file:jjj.asap.sas.ensemble.impl.CrossValidatedEnsemble.java
License:Open Source License
@Override public StrongLearner build(int essaySet, String ensembleName, List<WeakLearner> learners) { // can't handle empty case if (learners.isEmpty()) { return this.ensemble.build(essaySet, ensembleName, learners); }/*w w w .j a v a 2s .c o m*/ // create a dummy dataset. DatasetBuilder builder = new DatasetBuilder(); builder.addVariable("id"); builder.addNominalVariable("class", Contest.getRubrics(essaySet)); Instances dummy = builder.getDataset("dummy"); // add data Map<Double, Double> groundTruth = Contest.getGoldStandard(essaySet); for (double id : learners.get(0).getPreds().keySet()) { dummy.add(new DenseInstance(1.0, new double[] { id, groundTruth.get(id) })); } // stratify dummy.sort(0); dummy.randomize(new Random(1)); dummy.setClassIndex(1); dummy.stratify(nFolds); // now evaluate each fold Map<Double, Double> preds = new HashMap<Double, Double>(); for (int k = 0; k < nFolds; k++) { Instances train = dummy.trainCV(nFolds, k); Instances test = dummy.testCV(nFolds, k); List<WeakLearner> cvLeaners = new ArrayList<WeakLearner>(); for (WeakLearner learner : learners) { WeakLearner copy = learner.copyOf(); for (int i = 0; i < test.numInstances(); i++) { copy.getPreds().remove(test.instance(i).value(0)); copy.getProbs().remove(test.instance(i).value(0)); } cvLeaners.add(copy); } // train on fold StrongLearner cv = this.ensemble.build(essaySet, ensembleName, cvLeaners); List<WeakLearner> testLeaners = new ArrayList<WeakLearner>(); for (WeakLearner learner : cv.getLearners()) { WeakLearner copy = learner.copyOf(); copy.getPreds().clear(); copy.getProbs().clear(); WeakLearner source = find(copy.getName(), learners); for (int i = 0; i < test.numInstances(); i++) { double id = test.instance(i).value(0); copy.getPreds().put(id, source.getPreds().get(id)); copy.getProbs().put(id, source.getProbs().get(id)); } testLeaners.add(copy); } preds.putAll(this.ensemble.classify(essaySet, ensembleName, testLeaners, cv.getContext())); } // now prepare final result StrongLearner strong = this.ensemble.build(essaySet, ensembleName, learners); double trainingError = strong.getKappa(); double cvError = Calc.kappa(essaySet, preds, groundTruth); // Job.log(essaySet+"-"+ensembleName, "XVAL: training error = " + trainingError + " cv error = " + cvError); strong.setKappa(cvError); return strong; }
From source file:jjj.asap.sas.parser.job.ImportParserData.java
License:Open Source License
private void process(final String parent, int essaySet, Map<Double, List<String>> tags, Map<Double, List<String>> parseTrees, Map<Double, List<String>> depends) { // check if output exists boolean any = false; if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-extra-stats.arff")) any = true;//from w w w . j ava 2 s .com if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-pos-tags.arff")) any = true; if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-parse-tree.arff")) any = true; if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends0.arff")) any = true; if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends1.arff")) any = true; if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends2.arff")) any = true; if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends3.arff")) any = true; if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends4.arff")) any = true; if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends5.arff")) any = true; if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends6.arff")) any = true; if (!any) { Job.log("NOTE", "work/datasets/" + parent + "/" + essaySet + "-*.arff returns all required datasets - nothing to do"); return; } // Load an existing dataset to use as a template. Instances dataset = Dataset.load("work/datasets/" + parent + "/" + essaySet + "-spell-checked.arff"); // create the output datasets here. except for the extra statistics, // the format is the same as 'dataset'. Instances tagsData = new Instances(dataset, 0); tagsData.setRelationName(essaySet + "-pos-tags.arff"); Instances treeData = new Instances(dataset, 0); treeData.setRelationName(essaySet + "-parse-tree.arff"); Instances dependsData[] = new Instances[7]; for (int j = 0; j < 7; j++) { dependsData[j] = new Instances(dataset, 0); dependsData[j].setRelationName(essaySet + "-depends" + j + ".arff"); } // extra stats DatasetBuilder builder = new DatasetBuilder(); builder.addVariable("id"); if (Contest.isMultiChoice(essaySet)) { builder.addNominalVariable("color", Contest.COLORS); } builder.addVariable("x_sent"); builder.addVariable("x_para"); builder.addVariable("x_length"); builder.addVariable("x_words"); builder.addVariable("x_unique_words"); builder.addNominalVariable("score", Contest.getRubrics(essaySet)); Instances extraStats = builder.getDataset(essaySet + "-extra-stats.arff"); // now add rows for each instance for (int i = 0; i < dataset.numInstances(); i++) { // common variables Instance ob = dataset.instance(i); double id = ob.value(0); String y = ob.isMissing(dataset.numAttributes() - 1) ? null : ob.stringValue(dataset.numAttributes() - 1); String color = Contest.isMultiChoice(essaySet) ? ob.stringValue(dataset.attribute("color")) : null; String str = ob.stringValue(dataset.attribute("text")); // // Extra stats // int nSent = tags.containsKey(id) ? tags.get(id).size() : 0; int nPara = 0; for (int a = 0; a < str.length(); a++) { if (str.charAt(a) == '^') nPara++; } int nLength = str.length(); int nWords = 0; int nUniqueWords = 0; String[] words = str.toLowerCase().split(" "); nWords = words.length; Set<String> u = new HashSet<String>(); for (String w : words) { u.add(w); } nUniqueWords = u.size(); extraStats.add(new DenseInstance(extraStats.numAttributes())); Instance extra = extraStats.lastInstance(); extra.setValue(0, id); if (Contest.isMultiChoice(essaySet)) { extra.setValue(1, color); } extra.setValue(extraStats.attribute("x_sent"), nSent); extra.setValue(extraStats.attribute("x_para"), nPara); extra.setValue(extraStats.attribute("x_length"), nLength); extra.setValue(extraStats.attribute("x_words"), nWords); extra.setValue(extraStats.attribute("x_unique_words"), nUniqueWords); if (y == null) extra.setValue(extraStats.numAttributes() - 1, Utils.missingValue()); else extra.setValue(extraStats.numAttributes() - 1, y); // // POS tags // String tagsText = ""; List<String> tagsList = tags.get(id); if (tagsList == null || tagsList.isEmpty()) { Job.log("WARNING", "no tags for " + id); tagsText = "x"; } else { for (String tagsItem : tagsList) { tagsText += tagsItem; } } tagsData.add(new DenseInstance(ob.numAttributes())); Instance tagsOb = tagsData.lastInstance(); tagsOb.setValue(0, id); if (Contest.isMultiChoice(essaySet)) { tagsOb.setValue(1, color); tagsOb.setValue(2, tagsText.trim()); if (y == null) { tagsOb.setValue(3, Utils.missingValue()); } else { tagsOb.setValue(3, y); } } else { tagsOb.setValue(1, tagsText.trim()); if (y == null) { tagsOb.setValue(2, Utils.missingValue()); } else { tagsOb.setValue(2, y); } } // // Parse Tree // String treeText = ""; List<String> treeList = parseTrees.get(id); if (treeList == null || treeList.isEmpty()) { Job.log("WARNING", "no parse tree for " + id); treeText = "x"; } else { for (String treeItem : treeList) { treeText += treeItem; } } treeData.add(new DenseInstance(ob.numAttributes())); Instance treeOb = treeData.lastInstance(); treeOb.setValue(0, id); if (Contest.isMultiChoice(essaySet)) { treeOb.setValue(1, color); treeOb.setValue(2, treeText.trim()); if (y == null) { treeOb.setValue(3, Utils.missingValue()); } else { treeOb.setValue(3, y); } } else { treeOb.setValue(1, treeText.trim()); if (y == null) { treeOb.setValue(2, Utils.missingValue()); } else { treeOb.setValue(2, y); } } // // Depends data // for (int j = 0; j < 7; j++) { String text = ""; List<String> list = depends.get(id); if (list == null || list.isEmpty()) { Job.log("WARNING", "no depends for " + id); text = "x"; } else { for (String item : list) { String[] term = StringUtils.safeSplit(item, "/", 3); switch (j) { case 0: text += item; break; case 1: text += term[1] + "/" + term[2]; break; case 2: text += term[0] + "/" + term[2]; break; case 3: text += term[0] + "/" + term[1]; break; case 4: text += term[0]; break; case 5: text += term[1]; break; case 6: text += term[2]; break; } text += " "; } } dependsData[j].add(new DenseInstance(ob.numAttributes())); Instance dependsOb = dependsData[j].lastInstance(); dependsOb.setValue(0, id); if (Contest.isMultiChoice(essaySet)) { dependsOb.setValue(1, color); dependsOb.setValue(2, text.trim()); if (y == null) { dependsOb.setValue(3, Utils.missingValue()); } else { dependsOb.setValue(3, y); } } else { dependsOb.setValue(1, text.trim()); if (y == null) { dependsOb.setValue(2, Utils.missingValue()); } else { dependsOb.setValue(2, y); } } } // j } // dataset // Now save the new datasets Dataset.save("work/datasets/" + parent + "/" + tagsData.relationName(), tagsData); Dataset.save("work/datasets/" + parent + "/" + treeData.relationName(), treeData); for (int j = 0; j < 7; j++) { Dataset.save("work/datasets/" + parent + "/" + dependsData[j].relationName(), dependsData[j]); } Dataset.save("work/datasets/" + parent + "/" + extraStats.relationName(), extraStats); }
From source file:kea.KEAKeyphraseExtractor.java
License:Open Source License
/** * Builds the model from the files/*from w ww . j a v a 2 s.c om*/ */ public void extractKeyphrases(Hashtable stems) throws Exception { Vector stats = new Vector(); // Check whether there is actually any data if (stems.size() == 0) { throw new Exception("Couldn't find any data!"); } FastVector atts = new FastVector(2); atts.addElement(new Attribute("doc", (FastVector) null)); atts.addElement(new Attribute("keyphrases", (FastVector) null)); Instances data = new Instances("keyphrase_training_data", atts, 0); // Extract keyphrases Enumeration elem = stems.keys(); while (elem.hasMoreElements()) { String str = (String) elem.nextElement(); double[] newInst = new double[2]; try { File txt = new File(m_dirName + "/" + str + ".txt"); Reader is; if (!m_encoding.equals("default")) { is = new BomStrippingInputStreamReader(new FileInputStream(txt), m_encoding); } else { is = new BomStrippingInputStreamReader(new FileInputStream(txt)); } StringBuffer txtStr = new StringBuffer(); int c; while ((c = is.read()) != -1) { txtStr.append((char) c); } newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString()); } catch (Exception e) { if (m_debug) { System.err.println("Can't read document " + str + ".txt"); } newInst[0] = Instance.missingValue(); } try { File key = new File(m_dirName + "/" + str + ".key"); Reader is; if (!m_encoding.equals("default")) { is = new BomStrippingInputStreamReader(new FileInputStream(key), m_encoding); } else { is = new BomStrippingInputStreamReader(new FileInputStream(key)); } StringBuffer keyStr = new StringBuffer(); int c; while ((c = is.read()) != -1) { keyStr.append((char) c); } newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString()); } catch (Exception e) { if (m_debug) { System.err.println("No keyphrases for stem " + str + "."); } newInst[1] = Instance.missingValue(); } data.add(new Instance(1.0, newInst)); m_KEAFilter.input(data.instance(0)); data = data.stringFreeStructure(); if (m_debug) { System.err.println("-- Document: " + str); } Instance[] topRankedInstances = new Instance[m_numPhrases]; Instance inst; while ((inst = m_KEAFilter.output()) != null) { int index = (int) inst.value(m_KEAFilter.getRankIndex()) - 1; if (index < m_numPhrases) { topRankedInstances[index] = inst; } } if (m_debug) { System.err.println("-- Keyphrases and feature values:"); } FileOutputStream out = null; PrintWriter printer = null; File key = new File(m_dirName + "/" + str + ".key"); if (!key.exists()) { out = new FileOutputStream(m_dirName + "/" + str + ".key"); if (!m_encoding.equals("default")) { printer = new PrintWriter(new OutputStreamWriter(out, m_encoding)); } else { printer = new PrintWriter(out); } } double numExtracted = 0, numCorrect = 0; for (int i = 0; i < m_numPhrases; i++) { if (topRankedInstances[i] != null) { if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) { numExtracted += 1.0; } if ((int) topRankedInstances[i] .value(topRankedInstances[i].numAttributes() - 1) == topRankedInstances[i] .attribute(topRankedInstances[i].numAttributes() - 1).indexOfValue("True")) { numCorrect += 1.0; } if (printer != null) { printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getUnstemmedPhraseIndex())); if (m_AdditionalInfo) { printer.print("\t"); printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getStemmedPhraseIndex())); printer.print("\t"); printer.print(Utils.doubleToString( topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex()), 4)); } printer.println(); } if (m_debug) { System.err.println(topRankedInstances[i]); } } } if (numExtracted > 0) { if (m_debug) { System.err.println("-- " + numCorrect + " correct"); } stats.addElement(new Double(numCorrect)); } if (printer != null) { printer.flush(); printer.close(); out.close(); } } double[] st = new double[stats.size()]; for (int i = 0; i < stats.size(); i++) { st[i] = ((Double) stats.elementAt(i)).doubleValue(); } double avg = Utils.mean(st); double stdDev = Math.sqrt(Utils.variance(st)); System.err.println("Avg. number of correct keyphrases: " + Utils.doubleToString(avg, 2) + " +/- " + Utils.doubleToString(stdDev, 2)); System.err.println("Based on " + stats.size() + " documents"); m_KEAFilter.batchFinished(); }
From source file:kea.KEAModelBuilder.java
License:Open Source License
/** * Builds the model from the files//from ww w . j a v a 2 s .c o m */ public void buildModel(Hashtable stems) throws Exception { // Check whether there is actually any data if (stems.size() == 0) { throw new Exception("Couldn't find any data!"); } FastVector atts = new FastVector(2); atts.addElement(new Attribute("doc", (FastVector) null)); atts.addElement(new Attribute("keyphrases", (FastVector) null)); Instances data = new Instances("keyphrase_training_data", atts, 0); // Build model m_KEAFilter = new KEAFilter(); m_KEAFilter.setDebug(m_debug); m_KEAFilter.setDisallowInternalPeriods(getDisallowIPeriods()); m_KEAFilter.setKFused(getUseKFrequency()); m_KEAFilter.setMaxPhraseLength(getMaxPhraseLength()); m_KEAFilter.setMinPhraseLength(getMinPhraseLength()); m_KEAFilter.setMinNumOccur(getMinNumOccur()); m_KEAFilter.setInputFormat(data); m_KEAFilter.setStemmer(getStemmer()); m_KEAFilter.setStopwords(getStopwords()); m_KEAFilter.setCheckForProperNouns(getCheckForProperNouns()); Enumeration elem = stems.keys(); while (elem.hasMoreElements()) { String str = (String) elem.nextElement(); double[] newInst = new double[2]; try { File txt = new File(m_dirName + "/" + str + ".txt"); BufferedReader is; if (!m_encoding.equals("default")) { is = new BomStrippingInputStreamReader(new FileInputStream(txt), m_encoding); } else { is = new BomStrippingInputStreamReader(new FileInputStream(txt)); } StringBuffer txtStr = new StringBuffer(); int c; while ((c = is.read()) != -1) { txtStr.append((char) c); } newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString()); } catch (Exception e) { if (m_debug) { System.err.println("Can't find document for stem " + str + "."); } newInst[0] = Instance.missingValue(); } try { File key = new File(m_dirName + "/" + str + ".key"); BufferedReader is; if (!m_encoding.equals("default")) { is = new BomStrippingInputStreamReader(new FileInputStream(key), m_encoding); } else { is = new BomStrippingInputStreamReader(new FileInputStream(key)); } StringBuffer keyStr = new StringBuffer(); int c; while ((c = is.read()) != -1) { keyStr.append((char) c); } newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString()); } catch (Exception e) { if (m_debug) { System.err.println("Can't find keyphrases for stem " + str + "."); } newInst[1] = Instance.missingValue(); } data.add(new Instance(1.0, newInst)); m_KEAFilter.input(data.instance(0)); data = data.stringFreeStructure(); } m_KEAFilter.batchFinished(); // Get rid of instances in filter while (m_KEAFilter.output() != null) { } ; }
From source file:kea.main.KEAKeyphraseExtractor.java
License:Open Source License
/** * Builds the model from the files// w w w.ja v a 2s .co m */ public synchronized void extractKeyphrases(Hashtable stems) throws Exception { Vector stats = new Vector(); // Check whether there is actually any data // = if there any files in the directory if (stems.size() == 0) { throw new Exception("Couldn't find any data!"); } this.m_KEAFilter.setNumPhrases(m_numPhrases); this.m_KEAFilter.setVocabulary(m_vocabulary); this.m_KEAFilter.setVocabularyFormat(m_vocabularyFormat); this.m_KEAFilter.setDocumentLanguage(getDocumentLanguage()); this.m_KEAFilter.setStemmer(m_Stemmer); this.m_KEAFilter.setStopwords(m_Stopwords); if (getVocabulary().equals("none")) { this.m_KEAFilter.m_NODEfeature = false; } else { // Know thesaurus is loaded in the constructor //m_KEAFilter.loadThesaurus(m_Stemmer, m_Stopwords, vocabularyDir, manager); } FastVector atts = new FastVector(3); atts.addElement(new Attribute("doc", (FastVector) null)); atts.addElement(new Attribute("keyphrases", (FastVector) null)); atts.addElement(new Attribute("filename", (String) null)); Instances data = new Instances("keyphrase_training_data", atts, 0); if (this.m_KEAFilter.m_Dictionary == null) { buildGlobalDictionaries(stems); } System.out.println("-- Extracting Keyphrases... "); // Extract keyphrases Enumeration elem = stems.keys(); // Enumeration over all files in the directory (now in the hash): while (elem.hasMoreElements()) { String str = (String) elem.nextElement(); double[] newInst = new double[2]; try { File txt = new File(m_dirName + "/" + str + ".txt"); InputStreamReader is; if (!m_encoding.equals("default")) { is = new InputStreamReader(new FileInputStream(txt), m_encoding); } else { is = new InputStreamReader(new FileInputStream(txt)); } StringBuffer txtStr = new StringBuffer(); int c; while ((c = is.read()) != -1) { txtStr.append((char) c); } is.close(); newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString()); } catch (Exception e) { if (m_debug) { System.err.println("Can't read document " + str + ".txt"); } newInst[0] = Instance.missingValue(); } try { File key = new File(m_dirName + "/" + str + ".key"); InputStreamReader is; if (!m_encoding.equals("default")) { is = new InputStreamReader(new FileInputStream(key), m_encoding); } else { is = new InputStreamReader(new FileInputStream(key)); } StringBuffer keyStr = new StringBuffer(); int c; // keyStr = keyphrases in the str.key file // Kea assumes, that these keyphrases were assigned by the // author // and evaluates extracted keyphrases againse these while ((c = is.read()) != -1) { keyStr.append((char) c); } is.close(); newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString()); } catch (Exception e) { if (m_debug) { System.err.println("No existing keyphrases for stem " + str + "."); } newInst[1] = Instance.missingValue(); } data.add(new Instance(1.0, newInst)); this.m_KEAFilter.input(data.instance(0), vocabulary); data = data.stringFreeStructure(); if (m_debug) { System.err.println("-- Document: " + str); } Instance[] topRankedInstances = new Instance[m_numPhrases]; Instance inst; // Iterating over all extracted keyphrases (inst) while ((inst = this.m_KEAFilter.output()) != null) { int index = (int) inst.value(this.m_KEAFilter.getRankIndex()) - 1; if (index < m_numPhrases) { topRankedInstances[index] = inst; } } if (m_debug) { System.err.println("-- Keyphrases and feature values:"); } FileOutputStream out = null; PrintWriter printer = null; File key = new File(m_dirName + "/" + str + ".key"); if (!key.exists()) { out = new FileOutputStream(m_dirName + "/" + str + ".key"); if (!m_encoding.equals("default")) { printer = new PrintWriter(new OutputStreamWriter(out, m_encoding)); } else { printer = new PrintWriter(out); } } double numExtracted = 0, numCorrect = 0; for (int i = 0; i < m_numPhrases; i++) { if (topRankedInstances[i] != null) { if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) { numExtracted += 1.0; } if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == 1) { numCorrect += 1.0; } if (printer != null) { printer.print( topRankedInstances[i].stringValue(this.m_KEAFilter.getUnstemmedPhraseIndex())); if (m_AdditionalInfo) { printer.print("\t"); printer.print( topRankedInstances[i].stringValue(this.m_KEAFilter.getStemmedPhraseIndex())); printer.print("\t"); printer.print(Utils.doubleToString( topRankedInstances[i].value(this.m_KEAFilter.getProbabilityIndex()), 4)); } printer.println(); } if (m_debug) { System.err.println(topRankedInstances[i]); } } } if (numExtracted > 0) { if (m_debug) { System.err.println("-- " + numCorrect + " correct"); } stats.addElement(new Double(numCorrect)); } if (printer != null) { printer.flush(); printer.close(); out.close(); } } double[] st = new double[stats.size()]; for (int i = 0; i < stats.size(); i++) { st[i] = ((Double) stats.elementAt(i)).doubleValue(); } double avg = Utils.mean(st); double stdDev = Math.sqrt(Utils.variance(st)); System.out.println("Avg. number of matching keyphrases compared to existing ones : " + Utils.doubleToString(avg, 2) + " +/- " + Utils.doubleToString(stdDev, 2)); System.out.println("Based on " + stats.size() + " documents"); // m_KEAFilter.batchFinished(); }
From source file:lattice.Lattice.java
License:Open Source License
/** * Constructor of a lattice over the given variables of the dataset. * /*from w w w . j a v a 2s .c om*/ * @param dataset */ public Lattice(Instances dataset) { // ~ initialise internal structure for counting (TID sets) this.nbInstances = dataset.numInstances(); this.nbVariables = dataset.numAttributes(); BitSet[][] presence = new BitSet[nbVariables][]; TreeSet<Integer> allAttributesNumbers = new TreeSet<Integer>(); int[] nbValuesForAttribute = new int[nbVariables]; for (int a = 0; a < nbVariables; a++) { nbValuesForAttribute[a] = dataset.numDistinctValues(a) + 1; //+1 for missing presence[a] = new BitSet[nbValuesForAttribute[a]]; allAttributesNumbers.add(a); for (int v = 0; v < presence[a].length; v++) { presence[a][v] = new BitSet(); } } for (int i = 0; i < nbInstances; i++) { Instance row = dataset.instance(i); for (int a = 0; a < nbVariables; a++) { int indexOfValue; if (row.isMissing(a)) { // indexOfValue = (int) dataset.meanOrMode(a); indexOfValue = dataset.numDistinctValues(a); //missing at the end } else { String value = row.stringValue(a); indexOfValue = row.attribute(a).indexOfValue(value); } presence[a][indexOfValue].set(i); } } // initialise the first nodes of the lattice (i.e., the ones // corresponding to single variables this.all = new LatticeNode(this, nbValuesForAttribute); this.singleNodes = new LatticeNode[nbVariables]; for (int a = 0; a < nbVariables; a++) { int[] variablesNumbers = { a }; LatticeNode node = new LatticeNode(this, variablesNumbers, nbValuesForAttribute, presence[a], all); singleNodes[a] = node; } }
From source file:lector.Analizador.java
public static void clasificador() { BufferedReader reader1;/*from w w w . j a va 2 s .c o m*/ BufferedReader reader2; try { reader1 = new BufferedReader(new FileReader("/Users/danieltapia/Google Drive/EPN/MAESTRIA/MSW128 BI/" + "proyecto/compartida/DataSetAnalisisSentimientos.arff")); reader2 = new BufferedReader(new FileReader("/Users/danieltapia/Google Drive/EPN/MAESTRIA/MSW128 BI/" + "proyecto/compartida/DataSetAnalisisSentimientos_inc.arff")); Instances train = new Instances(reader1); train.setClassIndex(train.numAttributes() - 1); System.out.println(train.classIndex() + " " + train.numAttributes()); Instances test = new Instances(reader2); test.setClassIndex(train.numAttributes() - 1); System.out.println(test.classIndex() + " " + test.numAttributes()); NaiveBayes model = new NaiveBayes(); model.buildClassifier(train); //classify Instances labeled = new Instances(test); for (int i = 0; i < test.numInstances(); i++) { double clsLabel = model.classifyInstance(test.instance(i)); labeled.instance(i).setClassValue(clsLabel); } // https://youtu.be/JY_x5zKTfyo?list=PLJbE6j2EG1pZnBhOg3_Rb63WLCprtyJag Evaluation eval_train = new Evaluation(test); eval_train.evaluateModel(model, test); reader1.close(); reader2.close(); //System.out.println(eval_train.toSummaryString("\nResults\n======\n", false)); String[] options = new String[4]; options[0] = "-t"; //name of training file options[1] = "/Users/danieltapia/Google Drive/EPN/MAESTRIA/MSW128 BI/proyecto/" + "compartida/DataSetAnalisisSentimientos.arff"; options[2] = "-T"; options[3] = "/Users/danieltapia/Google Drive/EPN/MAESTRIA/MSW128 BI/proyecto/" + "compartida/DataSetAnalisisSentimientos_inc.arff"; System.out.println(Evaluation.evaluateModel(model, options)); try ( // print classification results to file BufferedWriter writer = new BufferedWriter( new FileWriter("/Users/danieltapia/Google Drive/EPN/MAESTRIA/MSW128 BI/" + "proyecto/compartida/DataSetAnalisisSentimientos_labeled.arff"))) { writer.write(labeled.toString()); } } catch (Exception e) { } }
From source file:LeerArchivo.Leer.java
public String leerModelo() { try {/*w ww . j a v a 2 s . co m*/ String[] valoresAtributos = { "0", "1" }; Classifier clasificador = (Classifier) weka.core.SerializationHelper.read("./KStar.model"); ConverterUtils.DataSource source = new ConverterUtils.DataSource("./test.arff"); Instances data = source.getDataSet(); data.setClassIndex(5); System.out.println(data.instance(0)); return valoresAtributos[(int) clasificador.classifyInstance(data.instance(0))]; } catch (Exception ex) { Logger.getLogger(Leer.class.getName()).log(Level.SEVERE, null, ex); } return null; }