List of usage examples for weka.core.Instances.stringFreeStructure()
public Instances stringFreeStructure()
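stringFreeStructure() returns a new, empty Instances object with the same attribute structure as this dataset but without any stored string attribute values. Most examples below use the same idiom: each document is added as an instance whose string attributes hold the full document text, the instance is pushed into a filter, and the dataset is then replaced by a string-free copy so that memory use stays constant no matter how many documents are processed. Below is a minimal sketch of that idiom (hypothetical class and variable names; assumes the pre-3.7 WEKA API with FastVector and Instance, as used throughout this page):

import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;

public class StringFreeStructureSketch {
    public static void main(String[] args) {
        // Header with a single string attribute
        // (a null FastVector marks the attribute as a string attribute).
        FastVector atts = new FastVector(1);
        atts.addElement(new Attribute("doc", (FastVector) null));
        Instances data = new Instances("docs", atts, 0);

        String[] documents = { "first document text", "second document text" };
        for (String text : documents) {
            double[] vals = new double[1];
            // addStringValue stores the string in the header and returns its index.
            vals[0] = data.attribute(0).addStringValue(text);
            data.add(new Instance(1.0, vals));

            // ... hand data.instance(0) to a filter here ...

            // Swap in an empty copy of the structure so the accumulated
            // string values can be garbage-collected.
            data = data.stringFreeStructure();
        }
    }
}

In WEKA 3.7 and later the same pattern applies, with ArrayList<Attribute> and DenseInstance in place of FastVector and Instance.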
From source file: com.entopix.maui.main.MauiModelBuilder.java
License: Open Source License
/**
 * Builds the model from the training data
 *
 * @throws MauiFilterException
 */
public MauiFilter buildModel(List<MauiDocument> documents) throws MauiFilterException {
    log.info("-- Building the model... ");
    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("filename", (FastVector) null));
    atts.addElement(new Attribute("document", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    mauiFilter = new MauiFilter();
    mauiFilter.setMaxPhraseLength(maxPhraseLength);
    mauiFilter.setMinPhraseLength(minPhraseLength);
    mauiFilter.setMinNumOccur(minNumOccur);
    mauiFilter.setStemmer(stemmer);
    mauiFilter.setDocumentLanguage(documentLanguage);
    mauiFilter.setVocabularyName(vocabularyName);
    mauiFilter.setVocabularyFormat(vocabularyFormat);
    mauiFilter.setStopwords(stopwords);
    mauiFilter.setVocabulary(vocabulary);
    if (classifier != null) {
        mauiFilter.setClassifier(classifier);
    }
    mauiFilter.setInputFormat(data);

    // set feature configurations
    mauiFilter.setBasicFeatures(useBasicFeatures);
    mauiFilter.setKeyphrasenessFeature(useKeyphrasenessFeature);
    mauiFilter.setFrequencyFeatures(useFrequencyFeatures);
    mauiFilter.setPositionsFeatures(usePositionsFeatures);
    mauiFilter.setLengthFeature(useLengthFeature);
    mauiFilter.setThesaurusFeatures(useThesaurusFeatures);
    mauiFilter.setWikipediaFeatures(useWikipediaFeatures, wikiFeatures);
    mauiFilter.setClassifier(classifier);

    if (!vocabularyName.equals("none")) {
        loadVocabulary();
        mauiFilter.setVocabulary(vocabulary);
    }

    log.info("-- Adding documents as instances... ");
    for (MauiDocument document : documents) {
        double[] newInst = new double[3];
        newInst[0] = data.attribute(0).addStringValue(document.getFileName());

        // Adding the text and the topics for the document to the instance
        if (document.getTextContent().length() > 0) {
            newInst[1] = data.attribute(1).addStringValue(document.getTextContent());
        } else {
            newInst[1] = Instance.missingValue();
        }
        if (document.getTopicsString().length() > 0) {
            newInst[2] = data.attribute(2).addStringValue(document.getTopicsString());
        } else {
            newInst[2] = Instance.missingValue();
        }

        data.add(new Instance(1.0, newInst));
        mauiFilter.input(data.instance(0));
        data = data.stringFreeStructure();
    }
    log.info("-- Building the model... ");
    mauiFilter.batchFinished();
    while ((mauiFilter.output()) != null) {
    }
    return mauiFilter;
}
From source file: com.github.polarisation.kea.main.KEAKeyphraseExtractor.java
License: Open Source License
/**
 * Builds the model from the files
 */
public void extractKeyphrases(Hashtable stems) throws Exception {
    Vector stats = new Vector();

    // Check whether there is actually any data,
    // i.e. whether there are any files in the directory
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }
    m_KEAFilter.setNumPhrases(m_numPhrases);
    m_KEAFilter.setVocabulary(m_vocabulary);
    m_KEAFilter.setVocabularyFormat(m_vocabularyFormat);
    m_KEAFilter.setDocumentLanguage(getDocumentLanguage());
    m_KEAFilter.setStemmer(m_Stemmer);
    m_KEAFilter.setStopwords(m_Stopwords);
    if (getVocabulary().equals("none")) {
        m_KEAFilter.m_NODEfeature = false;
    } else {
        m_KEAFilter.loadThesaurus(m_Stemmer, m_Stopwords);
    }

    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    atts.addElement(new Attribute("filename", (String) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    if (m_KEAFilter.m_Dictionary == null) {
        buildGlobalDictionaries(stems);
    }

    System.err.println("-- Extracting Keyphrases... ");
    // Extract keyphrases
    Enumeration elem = stems.keys();
    // Enumeration over all files in the directory (now in the hash):
    while (elem.hasMoreElements()) {
        String str = (String) elem.nextElement();
        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("Can't read document " + str + ".txt");
            }
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(key));
            }
            StringBuffer keyStr = new StringBuffer();
            int c;
            // keyStr = keyphrases in the str.key file.
            // KEA assumes that these keyphrases were assigned by the author
            // and evaluates extracted keyphrases against them.
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }
            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("No existing keyphrases for stem " + str + ".");
            }
            newInst[1] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));
        m_KEAFilter.input(data.instance(0));
        data = data.stringFreeStructure();
        if (m_debug) {
            System.err.println("-- Document: " + str);
        }
        Instance[] topRankedInstances = new Instance[m_numPhrases];
        Instance inst;
        // Iterating over all extracted keyphrases (inst)
        while ((inst = m_KEAFilter.output()) != null) {
            int index = (int) inst.value(m_KEAFilter.getRankIndex()) - 1;
            if (index < m_numPhrases) {
                topRankedInstances[index] = inst;
            }
        }
        if (m_debug) {
            System.err.println("-- Keyphrases and feature values:");
        }
        FileOutputStream out = null;
        PrintWriter printer = null;
        File key = new File(m_dirName + "/" + str + ".key");
        if (!key.exists()) {
            out = new FileOutputStream(m_dirName + "/" + str + ".key");
            if (!m_encoding.equals("default")) {
                printer = new PrintWriter(new OutputStreamWriter(out, m_encoding));
            } else {
                printer = new PrintWriter(out);
            }
        }
        double numExtracted = 0, numCorrect = 0;
        for (int i = 0; i < m_numPhrases; i++) {
            if (topRankedInstances[i] != null) {
                if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
                    numExtracted += 1.0;
                }
                if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == 1) {
                    numCorrect += 1.0;
                }
                if (printer != null) {
                    printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getUnstemmedPhraseIndex()));
                    if (m_AdditionalInfo) {
                        printer.print("\t");
                        printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getStemmedPhraseIndex()));
                        printer.print("\t");
                        printer.print(Utils.doubleToString(
                                topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex()), 4));
                    }
                    printer.println();
                }
                if (m_debug) {
                    System.err.println(topRankedInstances[i]);
                }
            }
        }
        if (numExtracted > 0) {
            if (m_debug) {
                System.err.println("-- " + numCorrect + " correct");
            }
            stats.addElement(new Double(numCorrect));
        }
        if (printer != null) {
            printer.flush();
            printer.close();
            out.close();
        }
    }
    double[] st = new double[stats.size()];
    for (int i = 0; i < stats.size(); i++) {
        st[i] = ((Double) stats.elementAt(i)).doubleValue();
    }
    double avg = Utils.mean(st);
    double stdDev = Math.sqrt(Utils.variance(st));
    System.err.println("Avg. number of matching keyphrases compared to existing ones : "
            + Utils.doubleToString(avg, 2) + " +/- " + Utils.doubleToString(stdDev, 2));
    System.err.println("Based on " + stats.size() + " documents");
    // m_KEAFilter.batchFinished();
}
From source file: com.openkm.kea.metadata.SubjectExtractor.java
License: Open Source License
/**
 * extractSuggestedSubjects
 *
 * @param documentText
 * @return
 */
public List<String> extractSuggestedSubjects(String documentText) {
    Date start, stop;
    start = new Date();
    List<String> subjects = new ArrayList<String>();

    // no idea what this is ....
    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    atts.addElement(new Attribute("filename", (String) null));
    Instances unknownDataStructure = new Instances("keyphrase_training_data", atts, 0);
    try {
        // this is the extraction process part - not too well understood yet
        // "unknownDataStructure" is called "instances" in the original KEA code
        double[] unknownStructure = new double[2];
        unknownStructure[0] = (double) unknownDataStructure.attribute(0).addStringValue(documentText);
        unknownStructure[1] = Instance.missingValue(); // this part used for existing subjects - we have none
        unknownDataStructure.add(new Instance(1.0, unknownStructure));
        filter.input(unknownDataStructure.instance(0));
        unknownDataStructure.stringFreeStructure(); //??**&%%!!!??

        // this is getting the results out - better understood
        Instance[] rankedSubjects = new Instance[this.subjectNumLimit];
        Instance subject;
        while ((subject = filter.output()) != null) {
            int index = (int) subject.value(filter.getRankIndex()) - 1;
            if (index < subjectNumLimit) {
                rankedSubjects[index] = subject;
            }
        }
        for (int i = 0; i < subjectNumLimit; i++) {
            if (rankedSubjects[i] != null) {
                subjects.add(rankedSubjects[i].stringValue(filter.getUnstemmedPhraseIndex()));
            }
        }
    } catch (Exception e) {
        log.error("problem in subject extraction: ", e);
    } finally {
        stop = new Date();
        long time = (stop.getTime() - start.getTime());
        log.info("Subject extraction completed in " + time + "ms");
    }
    return subjects;
}
From source file: com.openkm.kea.modelcreator.KEAKeyphraseExtractor.java
License: Open Source License
/**
 * Builds the model from the files
 */
public void extractKeyphrases(Hashtable<String, Double> stems) throws Exception {
    Vector<Double> stats = new Vector<Double>();

    // Check whether there is actually any data,
    // i.e. whether there are any files in the directory
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }
    m_KEAFilter.setNumPhrases(m_numPhrases);
    m_KEAFilter.setVocabulary(m_vocabulary);
    m_KEAFilter.setVocabularyFormat(m_vocabularyFormat);
    m_KEAFilter.setDocumentLanguage(getDocumentLanguage());
    m_KEAFilter.setStemmer(m_Stemmer);
    m_KEAFilter.setStopwords(m_Stopwords);
    if (getVocabulary().equals("none")) {
        m_KEAFilter.m_NODEfeature = false;
    } else {
        m_KEAFilter.loadThesaurus(m_Stemmer, m_Stopwords);
    }

    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    atts.addElement(new Attribute("filename", (String) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    if (m_KEAFilter.m_Dictionary == null) {
        buildGlobalDictionaries(stems);
    }

    log.info("-- Extracting Keyphrases... ");
    // Extract keyphrases
    Enumeration<String> elem = stems.keys();
    // Enumeration over all files in the directory (now in the hash):
    while (elem.hasMoreElements()) {
        String str = elem.nextElement();
        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                log.debug("Can't read document " + str + ".txt");
            }
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(key));
            }
            StringBuffer keyStr = new StringBuffer();
            int c;
            // keyStr = keyphrases in the str.key file.
            // KEA assumes that these keyphrases were assigned by the author
            // and evaluates extracted keyphrases against them.
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }
            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                log.debug("No existing keyphrases for stem " + str + ".");
            }
            newInst[1] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));
        m_KEAFilter.input(data.instance(0));
        data = data.stringFreeStructure();
        if (m_debug) {
            log.debug("-- Document: " + str);
        }
        Instance[] topRankedInstances = new Instance[m_numPhrases];
        Instance inst;
        // Iterating over all extracted keyphrases (inst)
        while ((inst = m_KEAFilter.output()) != null) {
            int index = (int) inst.value(m_KEAFilter.getRankIndex()) - 1;
            if (index < m_numPhrases) {
                topRankedInstances[index] = inst;
            }
        }
        if (m_debug) {
            log.debug("-- Keyphrases and feature values:");
        }
        FileOutputStream out = null;
        PrintWriter printer = null;
        File key = new File(m_dirName + "/" + str + ".key");
        if (!key.exists()) {
            out = new FileOutputStream(m_dirName + "/" + str + ".key");
            if (!m_encoding.equals("default")) {
                printer = new PrintWriter(new OutputStreamWriter(out, m_encoding));
            } else {
                printer = new PrintWriter(out);
            }
        }
        double numExtracted = 0, numCorrect = 0;
        for (int i = 0; i < m_numPhrases; i++) {
            if (topRankedInstances[i] != null) {
                // My addition: to exclude low-ranking phrases
                double rank = topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex());
                if (rank >= 0.00) {
                    if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
                        numExtracted += 1.0;
                    }
                    if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == 1) {
                        numCorrect += 1.0;
                    }
                    if (printer != null) {
                        printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getUnstemmedPhraseIndex()));
                        if (m_AdditionalInfo) {
                            printer.print("\t");
                            printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getStemmedPhraseIndex()));
                            printer.print("\t");
                            printer.print(Utils.doubleToString(
                                    topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex()), 4));
                        }
                        printer.println();
                    }
                    if (m_debug) {
                        log.debug("" + topRankedInstances[i]);
                    }
                }
            }
        }
        if (numExtracted > 0) {
            if (m_debug) {
                log.debug("-- " + numCorrect + " correct");
            }
            stats.addElement(new Double(numCorrect));
        }
        if (printer != null) {
            printer.flush();
            printer.close();
            out.close();
        }
    }
    double[] st = new double[stats.size()];
    for (int i = 0; i < stats.size(); i++) {
        st[i] = ((Double) stats.elementAt(i)).doubleValue();
    }
    double avg = Utils.mean(st);
    double stdDev = Math.sqrt(Utils.variance(st));
    log.info("Avg. number of matching keyphrases compared to existing ones : "
            + Utils.doubleToString(avg, 2) + " +/- " + Utils.doubleToString(stdDev, 2));
    log.info("Based on " + stats.size() + " documents");
    // m_KEAFilter.batchFinished();
}
From source file: com.openkm.kea.modelcreator.KEAModelBuilder.java
License: Open Source License
/**
 * Builds the model from the files
 */
public void buildModel(Hashtable<String, Double> stems, Stopwords stopwords) throws Exception {
    // Check whether there is actually any data
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }
    FastVector atts = new FastVector(2);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    // Build model
    m_KEAFilter = new KEAFilter(stopwords);
    m_KEAFilter.setDebug(m_debug);
    m_KEAFilter.setDisallowInternalPeriods(getDisallowIPeriods());
    m_KEAFilter.setKFused(getUseKFrequency());
    m_KEAFilter.setMaxPhraseLength(getMaxPhraseLength());
    m_KEAFilter.setMinPhraseLength(getMinPhraseLength());
    m_KEAFilter.setMinNumOccur(getMinNumOccur());
    m_KEAFilter.setStemmer(getStemmer());
    m_KEAFilter.setDocumentLanguage(getDocumentLanguage());
    m_KEAFilter.setVocabulary(getVocabulary());
    m_KEAFilter.setVocabularyFormat(getVocabularyFormat());
    m_KEAFilter.setStopwords(getStopwords());
    m_KEAFilter.setCheckForProperNouns(getCheckForProperNouns());
    m_KEAFilter.setInputFormat(data);
    if (getVocabulary().equals("none")) {
        m_KEAFilter.m_NODEfeature = false;
    } else {
        m_KEAFilter.loadThesaurus(getStemmer(), getStopwords());
    }
    m_KEAFilter.setNumFeature();

    log.info("-- Reading the Documents... ");
    Enumeration<String> elem = stems.keys();
    while (elem.hasMoreElements()) {
        String str = elem.nextElement();
        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            is.close();
            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
        } catch (Exception e) {
            log.error("Can't find document for stem " + str + ".");
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(key));
            }
            StringBuffer keyStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }
            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            log.error("Can't find keyphrases for stem " + str + ".");
            newInst[1] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));
        m_KEAFilter.input(data.instance(0));
        data = data.stringFreeStructure();
    }
    m_KEAFilter.batchFinished();
    while ((m_KEAFilter.output()) != null) {
    }
}
From source file: com.spread.experiment.tempuntilofficialrelease.ClassificationViaClustering108.java
License: Open Source License
/**
 * builds the classifier
 *
 * @param data the training instances
 * @throws Exception if something goes wrong
 */
@Override
public void buildClassifier(Instances data) throws Exception {
    // can classifier handle the data?
    getCapabilities().testWithFail(data);

    // save original header (needed for clusters to classes output)
    m_OriginalHeader = data.stringFreeStructure();

    // remove class attribute for clusterer
    Instances clusterData = new Instances(data);
    clusterData.setClassIndex(-1);
    clusterData.deleteAttributeAt(data.classIndex());
    m_ClusteringHeader = clusterData.stringFreeStructure();

    if (m_ClusteringHeader.numAttributes() == 0) {
        System.err.println("Data contains only class attribute, defaulting to ZeroR model.");
        m_ZeroR = new ZeroR();
        m_ZeroR.buildClassifier(data);
    } else {
        m_ZeroR = null;

        // build clusterer
        m_ActualClusterer = AbstractClusterer.makeCopy(m_Clusterer);
        m_ActualClusterer.buildClusterer(clusterData);

        if (!getLabelAllClusters()) {
            // determine classes-to-clusters mapping
            ClusterEvaluation eval = new ClusterEvaluation();
            eval.setClusterer(m_ActualClusterer);
            eval.evaluateClusterer(clusterData);
            double[] clusterAssignments = eval.getClusterAssignments();
            int[][] counts = new int[eval.getNumClusters()][m_OriginalHeader.numClasses()];
            int[] clusterTotals = new int[eval.getNumClusters()];
            double[] best = new double[eval.getNumClusters() + 1];
            double[] current = new double[eval.getNumClusters() + 1];
            for (int i = 0; i < data.numInstances(); i++) {
                Instance instance = data.instance(i);
                if (!instance.classIsMissing()) {
                    counts[(int) clusterAssignments[i]][(int) instance.classValue()]++;
                    clusterTotals[(int) clusterAssignments[i]]++;
                }
            }
            best[eval.getNumClusters()] = Double.MAX_VALUE;
            ClusterEvaluation.mapClasses(eval.getNumClusters(), 0, counts, clusterTotals, current, best, 0);
            m_ClustersToClasses = new double[best.length];
            System.arraycopy(best, 0, m_ClustersToClasses, 0, best.length);
        } else {
            m_ClusterClassProbs = new double[m_ActualClusterer.numberOfClusters()][data.numClasses()];
            for (int i = 0; i < data.numInstances(); i++) {
                Instance clusterInstance = clusterData.instance(i);
                Instance originalInstance = data.instance(i);
                if (!originalInstance.classIsMissing()) {
                    double[] probs = m_ActualClusterer.distributionForInstance(clusterInstance);
                    for (int j = 0; j < probs.length; j++) {
                        m_ClusterClassProbs[j][(int) originalInstance.classValue()] += probs[j];
                    }
                }
            }
            for (int i = 0; i < m_ClusterClassProbs.length; i++) {
                Utils.normalize(m_ClusterClassProbs[i]);
            }
        }
    }
}
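Note that this example does not use stringFreeStructure() to reset a growing dataset. It produces lightweight header-only copies (m_OriginalHeader, m_ClusteringHeader) that the classifier keeps around for the later clusters-to-classes mapping, without retaining the training data's string values.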
From source file: core.ClusterEvaluationEX.java
License: Open Source License
public Instances DeleteNoise(Instances data) {
    noise = data.stringFreeStructure();
    for (int i = 0; i < data.numInstances(); i++) {
        if (data.instance(i).value(1) == -1) {
            noise.add(data.instance(i));
            data.delete(i);
            i--;
        }
    }
    return data;
}
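Here stringFreeStructure() serves yet another purpose: it creates an empty dataset (noise) whose header is compatible with data, into which the noise instances (those with value -1 in attribute 1) can be moved.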
From source file: kea.KEAKeyphraseExtractor.java
License: Open Source License
/**
 * Builds the model from the files
 */
public void extractKeyphrases(Hashtable stems) throws Exception {
    Vector stats = new Vector();

    // Check whether there is actually any data
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }
    FastVector atts = new FastVector(2);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    // Extract keyphrases
    Enumeration elem = stems.keys();
    while (elem.hasMoreElements()) {
        String str = (String) elem.nextElement();
        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            Reader is;
            if (!m_encoding.equals("default")) {
                is = new BomStrippingInputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new BomStrippingInputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("Can't read document " + str + ".txt");
            }
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            Reader is;
            if (!m_encoding.equals("default")) {
                is = new BomStrippingInputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new BomStrippingInputStreamReader(new FileInputStream(key));
            }
            StringBuffer keyStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }
            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("No keyphrases for stem " + str + ".");
            }
            newInst[1] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));
        m_KEAFilter.input(data.instance(0));
        data = data.stringFreeStructure();
        if (m_debug) {
            System.err.println("-- Document: " + str);
        }
        Instance[] topRankedInstances = new Instance[m_numPhrases];
        Instance inst;
        while ((inst = m_KEAFilter.output()) != null) {
            int index = (int) inst.value(m_KEAFilter.getRankIndex()) - 1;
            if (index < m_numPhrases) {
                topRankedInstances[index] = inst;
            }
        }
        if (m_debug) {
            System.err.println("-- Keyphrases and feature values:");
        }
        FileOutputStream out = null;
        PrintWriter printer = null;
        File key = new File(m_dirName + "/" + str + ".key");
        if (!key.exists()) {
            out = new FileOutputStream(m_dirName + "/" + str + ".key");
            if (!m_encoding.equals("default")) {
                printer = new PrintWriter(new OutputStreamWriter(out, m_encoding));
            } else {
                printer = new PrintWriter(out);
            }
        }
        double numExtracted = 0, numCorrect = 0;
        for (int i = 0; i < m_numPhrases; i++) {
            if (topRankedInstances[i] != null) {
                if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
                    numExtracted += 1.0;
                }
                if ((int) topRankedInstances[i]
                        .value(topRankedInstances[i].numAttributes() - 1) == topRankedInstances[i]
                        .attribute(topRankedInstances[i].numAttributes() - 1).indexOfValue("True")) {
                    numCorrect += 1.0;
                }
                if (printer != null) {
                    printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getUnstemmedPhraseIndex()));
                    if (m_AdditionalInfo) {
                        printer.print("\t");
                        printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getStemmedPhraseIndex()));
                        printer.print("\t");
                        printer.print(Utils.doubleToString(
                                topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex()), 4));
                    }
                    printer.println();
                }
                if (m_debug) {
                    System.err.println(topRankedInstances[i]);
                }
            }
        }
        if (numExtracted > 0) {
            if (m_debug) {
                System.err.println("-- " + numCorrect + " correct");
            }
            stats.addElement(new Double(numCorrect));
        }
        if (printer != null) {
            printer.flush();
            printer.close();
            out.close();
        }
    }
    double[] st = new double[stats.size()];
    for (int i = 0; i < stats.size(); i++) {
        st[i] = ((Double) stats.elementAt(i)).doubleValue();
    }
    double avg = Utils.mean(st);
    double stdDev = Math.sqrt(Utils.variance(st));
    System.err.println("Avg. number of correct keyphrases: " + Utils.doubleToString(avg, 2)
            + " +/- " + Utils.doubleToString(stdDev, 2));
    System.err.println("Based on " + stats.size() + " documents");
    m_KEAFilter.batchFinished();
}
From source file: kea.KEAModelBuilder.java
License: Open Source License
/**
 * Builds the model from the files
 */
public void buildModel(Hashtable stems) throws Exception {
    // Check whether there is actually any data
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }
    FastVector atts = new FastVector(2);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    // Build model
    m_KEAFilter = new KEAFilter();
    m_KEAFilter.setDebug(m_debug);
    m_KEAFilter.setDisallowInternalPeriods(getDisallowIPeriods());
    m_KEAFilter.setKFused(getUseKFrequency());
    m_KEAFilter.setMaxPhraseLength(getMaxPhraseLength());
    m_KEAFilter.setMinPhraseLength(getMinPhraseLength());
    m_KEAFilter.setMinNumOccur(getMinNumOccur());
    m_KEAFilter.setInputFormat(data);
    m_KEAFilter.setStemmer(getStemmer());
    m_KEAFilter.setStopwords(getStopwords());
    m_KEAFilter.setCheckForProperNouns(getCheckForProperNouns());

    Enumeration elem = stems.keys();
    while (elem.hasMoreElements()) {
        String str = (String) elem.nextElement();
        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            BufferedReader is;
            if (!m_encoding.equals("default")) {
                is = new BomStrippingInputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new BomStrippingInputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("Can't find document for stem " + str + ".");
            }
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            BufferedReader is;
            if (!m_encoding.equals("default")) {
                is = new BomStrippingInputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new BomStrippingInputStreamReader(new FileInputStream(key));
            }
            StringBuffer keyStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }
            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("Can't find keyphrases for stem " + str + ".");
            }
            newInst[1] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));
        m_KEAFilter.input(data.instance(0));
        data = data.stringFreeStructure();
    }
    m_KEAFilter.batchFinished();

    // Get rid of instances in filter
    while (m_KEAFilter.output() != null) {
    }
}
From source file: kea.main.KEAKeyphraseExtractor.java
License: Open Source License
/**
 * Builds the model from the files
 */
public synchronized void extractKeyphrases(Hashtable stems) throws Exception {
    Vector stats = new Vector();

    // Check whether there is actually any data,
    // i.e. whether there are any files in the directory
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }
    this.m_KEAFilter.setNumPhrases(m_numPhrases);
    this.m_KEAFilter.setVocabulary(m_vocabulary);
    this.m_KEAFilter.setVocabularyFormat(m_vocabularyFormat);
    this.m_KEAFilter.setDocumentLanguage(getDocumentLanguage());
    this.m_KEAFilter.setStemmer(m_Stemmer);
    this.m_KEAFilter.setStopwords(m_Stopwords);
    if (getVocabulary().equals("none")) {
        this.m_KEAFilter.m_NODEfeature = false;
    } else {
        // Note: the thesaurus is loaded in the constructor
        // m_KEAFilter.loadThesaurus(m_Stemmer, m_Stopwords, vocabularyDir, manager);
    }

    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    atts.addElement(new Attribute("filename", (String) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    if (this.m_KEAFilter.m_Dictionary == null) {
        buildGlobalDictionaries(stems);
    }

    System.out.println("-- Extracting Keyphrases... ");
    // Extract keyphrases
    Enumeration elem = stems.keys();
    // Enumeration over all files in the directory (now in the hash):
    while (elem.hasMoreElements()) {
        String str = (String) elem.nextElement();
        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            is.close();
            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("Can't read document " + str + ".txt");
            }
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(key));
            }
            StringBuffer keyStr = new StringBuffer();
            int c;
            // keyStr = keyphrases in the str.key file.
            // KEA assumes that these keyphrases were assigned by the author
            // and evaluates extracted keyphrases against them.
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }
            is.close();
            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("No existing keyphrases for stem " + str + ".");
            }
            newInst[1] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));
        this.m_KEAFilter.input(data.instance(0), vocabulary);
        data = data.stringFreeStructure();
        if (m_debug) {
            System.err.println("-- Document: " + str);
        }
        Instance[] topRankedInstances = new Instance[m_numPhrases];
        Instance inst;
        // Iterating over all extracted keyphrases (inst)
        while ((inst = this.m_KEAFilter.output()) != null) {
            int index = (int) inst.value(this.m_KEAFilter.getRankIndex()) - 1;
            if (index < m_numPhrases) {
                topRankedInstances[index] = inst;
            }
        }
        if (m_debug) {
            System.err.println("-- Keyphrases and feature values:");
        }
        FileOutputStream out = null;
        PrintWriter printer = null;
        File key = new File(m_dirName + "/" + str + ".key");
        if (!key.exists()) {
            out = new FileOutputStream(m_dirName + "/" + str + ".key");
            if (!m_encoding.equals("default")) {
                printer = new PrintWriter(new OutputStreamWriter(out, m_encoding));
            } else {
                printer = new PrintWriter(out);
            }
        }
        double numExtracted = 0, numCorrect = 0;
        for (int i = 0; i < m_numPhrases; i++) {
            if (topRankedInstances[i] != null) {
                if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
                    numExtracted += 1.0;
                }
                if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == 1) {
                    numCorrect += 1.0;
                }
                if (printer != null) {
                    printer.print(topRankedInstances[i].stringValue(this.m_KEAFilter.getUnstemmedPhraseIndex()));
                    if (m_AdditionalInfo) {
                        printer.print("\t");
                        printer.print(topRankedInstances[i].stringValue(this.m_KEAFilter.getStemmedPhraseIndex()));
                        printer.print("\t");
                        printer.print(Utils.doubleToString(
                                topRankedInstances[i].value(this.m_KEAFilter.getProbabilityIndex()), 4));
                    }
                    printer.println();
                }
                if (m_debug) {
                    System.err.println(topRankedInstances[i]);
                }
            }
        }
        if (numExtracted > 0) {
            if (m_debug) {
                System.err.println("-- " + numCorrect + " correct");
            }
            stats.addElement(new Double(numCorrect));
        }
        if (printer != null) {
            printer.flush();
            printer.close();
            out.close();
        }
    }
    double[] st = new double[stats.size()];
    for (int i = 0; i < stats.size(); i++) {
        st[i] = ((Double) stats.elementAt(i)).doubleValue();
    }
    double avg = Utils.mean(st);
    double stdDev = Math.sqrt(Utils.variance(st));
    System.out.println("Avg. number of matching keyphrases compared to existing ones : "
            + Utils.doubleToString(avg, 2) + " +/- " + Utils.doubleToString(stdDev, 2));
    System.out.println("Based on " + stats.size() + " documents");
    // m_KEAFilter.batchFinished();
}