List of usage examples for weka.core.Instance.value()
public double value(Attribute att);
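value(Attribute) (and its value(int) overload) returns the instance's value for the given attribute as a double: numeric attributes yield the number itself, nominal and string attributes yield the index of the value in the attribute's definition, and missing values come back as Double.NaN. A minimal, self-contained sketch, assuming a local iris.arff file (the path is a placeholder):

import weka.core.Attribute;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class ValueDemo {
    public static void main(String[] args) throws Exception {
        // Load any ARFF file; the path is a placeholder.
        Instances data = DataSource.read("iris.arff");
        data.setClassIndex(data.numAttributes() - 1);

        Instance first = data.instance(0);
        for (int i = 0; i < data.numAttributes(); i++) {
            Attribute att = data.attribute(i);
            // Always a double: for nominal attributes this is the index
            // into the attribute's list of possible values.
            double v = first.value(att); // equivalent to first.value(i)
            System.out.println(att.name() + " = " + v);
        }
    }
}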
From source file:com.mycompany.knnclassifier.kNNClassifier.java
private static double getClassification(List<Instance> instances) {
    int index = instances.get(0).classIndex();
    HashMap<Double, Integer> counts = new HashMap<>();
    for (Instance instance : instances) {
        double val = instance.value(index);
        if (!counts.containsKey(val)) {
            counts.put(val, 1);
        } else {
            counts.put(val, counts.get(val) + 1);
        }
    }
    int maxCount = 0;
    double maxValue = 0;
    for (Entry<Double, Integer> entry : counts.entrySet()) {
        if (entry.getValue() > maxCount) {
            maxCount = entry.getValue();
            maxValue = entry.getKey();
        }
    }
    return maxValue;
}
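The helper above takes the class value of each neighbor via value(classIndex) and returns the majority vote. A hypothetical call site (the neighbor search and the findNearestNeighbors helper are assumptions, not part of the source above):

// Hypothetical usage: predict by majority vote over the k nearest neighbors.
// findNearestNeighbors is an assumed helper, not shown in the source above.
List<Instance> neighbors = findNearestNeighbors(testInstance, k);
double predictedClass = getClassification(neighbors);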
From source file:com.mycompany.neuralnetwork.NeuralNetworkClassifier.java
@Override
public void buildClassifier(Instances instances) throws Exception {
    int inputCount = instances.numAttributes() - 1;
    List<Integer> nodesPerLayer = new ArrayList<>();
    for (int i = 0; i < layers - 1; i++) {
        nodesPerLayer.add(inputCount);
    }
    nodesPerLayer.add(instances.numDistinctValues(instances.classIndex()));
    network = new Network(inputCount, nodesPerLayer);
    ArrayList<Double> errorsPerIteration = new ArrayList<>();
    for (int j = 0; j < iterations; j++) {
        double errorsPer = 0;
        for (int k = 0; k < instances.numInstances(); k++) {
            Instance instance = instances.instance(k);
            List<Double> input = new ArrayList<>();
            for (int i = 0; i < instance.numAttributes(); i++) {
                if (Double.isNaN(instance.value(i)) && i != instance.classIndex())
                    input.add(0.0);
                else if (i != instance.classIndex())
                    input.add(instance.value(i));
            }
            errorsPer += network.train(input, instance.value(instance.classIndex()), learningFactor);
        }
        errorsPerIteration.add(errorsPer);
    }
    // Display errors: used to collect the data for the graph
    //for (Double d : errorsPerIteration) {
    //    System.out.println(d);
    //}
}
From source file:com.mycompany.neuralnetwork.NeuralNetworkClassifier.java
@Override
public double classifyInstance(Instance instance) throws Exception {
    List<Double> input = new ArrayList<>();
    for (int i = 0; i < instance.numAttributes(); i++) {
        if (Double.isNaN(instance.value(i)) && i != instance.classIndex())
            input.add(0.0);
        else if (i != instance.classIndex())
            input.add(instance.value(i));
    }
    List<Double> outputs = network.getOutputs(input);
    double largeVal = -1;
    int index = 0;
    for (int i = 0; i < outputs.size(); i++) {
        double temp = outputs.get(i);
        if (temp > largeVal) {
            largeVal = temp;
            index = i;
        }
    }
    return index;
}
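A note on the Double.isNaN checks in the two examples above: Weka encodes a missing value as Double.NaN, so the test works, but Instance.isMissing(int) states the intent directly. A minimal sketch of the same input-building loop using the API's own check:

// Same imputation as above, written with Weka's own missing-value check.
// isMissing(i) is true exactly when value(i) returns Double.NaN.
for (int i = 0; i < instance.numAttributes(); i++) {
    if (i == instance.classIndex()) {
        continue; // skip the class attribute, as the examples above do
    }
    if (instance.isMissing(i)) {
        input.add(0.0);               // impute zero for missing inputs
    } else {
        input.add(instance.value(i)); // raw double value of attribute i
    }
}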
From source file:com.mycompany.tubesann.MyANN.java
public double classifyInstance(Instance instance) throws Exception {
    double result = 0;
    for (int i = 0; i < instance.numAttributes() - 1; i++) {
        startNode[i].setInput(instance.value(i));
    }
    List<Double> output = new ArrayList<Double>();
    for (int i = 0; i < finalNode.length; i++) {
        output.add(finalNode[i].calculate());
        // System.out.println("Output " + i + " " + output.get(i));
    }
    if (rule == 1) {
        boolean found = false;
        int i = 0;
        while (!found && i < output.size()) {
            if (output.get(i) == 1) {
                result = (double) i;
                found = true;
            }
            i++;
        }
    } else {
        int imax = 0;
        //System.out.println("output i= " + 0 + " output= " + output.get(0));
        for (int i = 1; i < output.size(); i++) {
            //System.out.println("output i= " + i + " output= " + output.get(i));
            if (output.get(i) > output.get(imax)) {
                imax = i;
            }
        }
        result = (double) imax;
        //double max = Collections.max(output);
        //result = (double) output.indexOf(max);
    }
    return result;
}
From source file:com.openkm.kea.filter.KEAFilter.java
License:Open Source License
/**
 * Converts an instance.
 */
private FastVector convertInstance(Instance instance, boolean training) throws Exception {
    FastVector vector = new FastVector();
    if (m_Debug) {
        log.info("-- Converting instance");
    }

    // Get the key phrases for the document
    HashMap<String, Counter> hashKeyphrases = null;
    HashMap<String, Counter> hashKeysEval = null;
    if (!instance.isMissing(m_KeyphrasesAtt)) {
        String keyphrases = instance.stringValue(m_KeyphrasesAtt);
        hashKeyphrases = getGivenKeyphrases(keyphrases, false);
        hashKeysEval = getGivenKeyphrases(keyphrases, true);
    }

    // Get the phrases for the document
    HashMap<String, FastVector> hash = new HashMap<String, FastVector>();
    int length = getPhrases(hash, instance.stringValue(m_DocumentAtt));
    // hash = getComposits(hash);

    /* Experimental: to compute how many of the manual keyphrases
       appear in the documents:
    log.info("Doc phrases found " + hash.size());
    log.info("Manual keyphrases: ");
    Iterator iter = hashKeyphrases.keySet().iterator();
    int count = 0;
    while (iter.hasNext()) {
        String id = (String) iter.next();
        if (hash.containsKey(id)) {
            count++;
        }
    }
    double max_recall = (double) count / (double) hashKeyphrases.size();
    m_max_recall += max_recall;
    doc++;
    double avg_m_max_recall = m_max_recall / (double) doc;
    String file = instance.stringValue(2);
    log.info(count + " out of " + hashKeyphrases.size() + " are in the document ");
    log.info("Max recall : " + avg_m_max_recall + " on " + doc + " documents ");
    */

    // Compute number of extra attributes
    int numFeatures = 5;
    if (m_Debug) {
        if (m_KFused) {
            numFeatures = numFeatures + 1;
        }
    }
    if (m_STDEVfeature) {
        numFeatures = numFeatures + 1;
    }
    if (m_NODEfeature) {
        numFeatures = numFeatures + 1;
    }
    if (m_LENGTHfeature) {
        numFeatures = numFeatures + 1;
    }

    // Set indices of key attributes
    //int phraseAttIndex = m_DocumentAtt;
    int tfidfAttIndex = m_DocumentAtt + 2;
    int distAttIndex = m_DocumentAtt + 3;
    int probsAttIndex = m_DocumentAtt + numFeatures - 1;
    //int classAttIndex = numFeatures;

    // Go through the phrases and convert them into instances
    Iterator<String> it = hash.keySet().iterator();
    while (it.hasNext()) {
        String id = it.next();
        FastVector phraseInfo = (FastVector) hash.get(id);
        double[] vals = featVals(id, phraseInfo, training, hashKeysEval, hashKeyphrases, length, hash);
        Instance inst = new Instance(instance.weight(), vals);
        inst.setDataset(m_ClassifierData);

        // Get probability of a phrase being key phrase
        double[] probs = m_Classifier.distributionForInstance(inst);
        // If simple Naive Bayes is used, change here to
        //double prob = probs[1];
        double prob = probs[0];

        // Compute attribute values for final instance
        double[] newInst = new double[instance.numAttributes() + numFeatures];
        int pos = 0;
        for (int i = 0; i < instance.numAttributes(); i++) {
            if (i == m_DocumentAtt) {
                // Output of values for a given phrase:

                // Add phrase
                int index = outputFormatPeek().attribute(pos).addStringValue(id);
                newInst[pos++] = index;

                // Add original version
                String orig = (String) phraseInfo.elementAt(2);
                if (orig != null) {
                    index = outputFormatPeek().attribute(pos).addStringValue(orig);
                } else {
                    index = outputFormatPeek().attribute(pos).addStringValue(id);
                }
                newInst[pos++] = index;

                // Add TFxIDF
                newInst[pos++] = inst.value(m_TfidfIndex);

                // Add distance
                newInst[pos++] = inst.value(m_FirstOccurIndex);

                // Add other features
                if (m_Debug) {
                    if (m_KFused) {
                        newInst[pos++] = inst.value(m_KeyFreqIndex);
                    }
                }
                if (m_STDEVfeature) {
                    newInst[pos++] = inst.value(m_STDEVIndex);
                }
                if (m_NODEfeature) {
                    newInst[pos++] = inst.value(m_NodeIndex);
                }
                if (m_LENGTHfeature) {
                    newInst[pos++] = inst.value(m_LengthIndex);
                }

                // Add probability
                probsAttIndex = pos;
                newInst[pos++] = prob;

                // Set rank to missing (computed below)
                newInst[pos++] = Instance.missingValue();
            } else if (i == m_KeyphrasesAtt) {
                newInst[pos++] = inst.classValue();
            } else {
                newInst[pos++] = instance.value(i);
            }
        }
        Instance ins = new Instance(instance.weight(), newInst);
        ins.setDataset(outputFormatPeek());
        vector.addElement(ins);
    }

    // Add dummy instances for keyphrases that don't occur
    // in the document
    if (hashKeysEval != null) {
        Iterator<String> phrases = hashKeysEval.keySet().iterator();
        while (phrases.hasNext()) {
            String phrase = phrases.next();
            double[] newInst = new double[instance.numAttributes() + numFeatures];
            int pos = 0;
            for (int i = 0; i < instance.numAttributes(); i++) {
                if (i == m_DocumentAtt) {
                    // log.info("Here: " + phrase);

                    // Add phrase
                    int index = outputFormatPeek().attribute(pos).addStringValue(phrase);
                    newInst[pos++] = (double) index;

                    // Add original version
                    index = outputFormatPeek().attribute(pos).addStringValue(phrase);
                    newInst[pos++] = (double) index;

                    // Add TFxIDF
                    newInst[pos++] = Instance.missingValue();

                    // Add distance
                    newInst[pos++] = Instance.missingValue();

                    // Add other features
                    if (m_Debug) {
                        if (m_KFused) {
                            newInst[pos++] = Instance.missingValue();
                        }
                    }
                    if (m_STDEVfeature) {
                        newInst[pos++] = Instance.missingValue();
                    }
                    if (m_NODEfeature) {
                        newInst[pos++] = Instance.missingValue();
                    }
                    if (m_LENGTHfeature) {
                        newInst[pos++] = Instance.missingValue();
                    }

                    // Add probability and rank
                    newInst[pos++] = -Double.MAX_VALUE;
                    // newInst[pos++] = Instance.missingValue();
                } else if (i == m_KeyphrasesAtt) {
                    newInst[pos++] = 1; // Keyphrase
                } else {
                    newInst[pos++] = instance.value(i);
                }
            }
            // Create the dummy instance once all attribute values are filled in
            Instance inst = new Instance(instance.weight(), newInst);
            inst.setDataset(outputFormatPeek());
            vector.addElement(inst);
        }
    }

    // Sort phrases according to their distance (stable sort)
    double[] vals = new double[vector.size()];
    for (int i = 0; i < vals.length; i++) {
        vals[i] = ((Instance) vector.elementAt(i)).value(distAttIndex);
    }
    FastVector newVector = new FastVector(vector.size());
    int[] sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their TFxIDF value (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = -((Instance) vector.elementAt(i)).value(tfidfAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their probability (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = 1 - ((Instance) vector.elementAt(i)).value(probsAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Compute rank of phrases. Check for subphrases that are ranked
    // lower than superphrases and assign probability -1 and set the
    // rank to Integer.MAX_VALUE
    int rank = 1;
    for (int i = 0; i < vals.length; i++) {
        Instance currentInstance = (Instance) vector.elementAt(i);
        // Shortcut: if phrase very unlikely make rank very low and continue
        if (Utils.grOrEq(vals[i], 1.0)) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
            continue;
        }

        // Otherwise look for superphrase starting with first phrase
        // in list that has same probability, TFxIDF value, and distance as
        // current phrase. We do this to catch all superphrases
        // that have same probability, TFxIDF value and distance as current phrase.
        int startInd = i;
        while (startInd < vals.length) {
            Instance inst = (Instance) vector.elementAt(startInd);
            if ((inst.value(tfidfAttIndex) != currentInstance.value(tfidfAttIndex))
                    || (inst.value(probsAttIndex) != currentInstance.value(probsAttIndex))
                    || (inst.value(distAttIndex) != currentInstance.value(distAttIndex))) {
                break;
            }
            startInd++;
        }
        currentInstance.setValue(probsAttIndex + 1, rank++);
    }
    return vector;
}
From source file:com.openkm.kea.filter.KEAPhraseFilter.java
License:Open Source License
/**
 * Converts an instance by removing all non-alphanumeric characters
 * from its string attribute values.
 */
private void convertInstance(Instance instance) throws Exception {
    double[] instVals = new double[instance.numAttributes()];
    for (int i = 0; i < instance.numAttributes(); i++) {
        if (!instance.attribute(i).isString() || instance.isMissing(i)) {
            instVals[i] = instance.value(i);
        } else {
            if (!m_SelectCols.isInRange(i)) {
                int index = getOutputFormat().attribute(i).addStringValue(instance.stringValue(i));
                instVals[i] = (double) index;
                continue;
            }
            // aly: str = text of the document
            String str = instance.stringValue(i);
            String tokenized = tokenize(str);
            // aly: tokenized is the clean version of str
            // log.info(tokenized);
            int index = getOutputFormat().attribute(i).addStringValue(tokenized);
            instVals[i] = (double) index;
        }
    }
    Instance inst = new Instance(instance.weight(), instVals);
    inst.setDataset(getOutputFormat());
    push(inst);
}
From source file:com.openkm.kea.filter.NumbersFilter.java
License:Open Source License
/**
 * Converts an instance. A phrase boundary is inserted where
 * a number is found.
 */
private void convertInstance(Instance instance) throws Exception {
    double[] instVals = new double[instance.numAttributes()];
    for (int i = 0; i < instance.numAttributes(); i++) {
        if ((!instance.attribute(i).isString()) || instance.isMissing(i)) {
            instVals[i] = instance.value(i);
        } else {
            String str = instance.stringValue(i);
            StringBuffer resultStr = new StringBuffer();
            StringTokenizer tok = new StringTokenizer(str, " \t\n", true);
            while (tok.hasMoreTokens()) {
                String token = tok.nextToken();
                // Everything that doesn't contain at least
                // one letter is considered to be a number
                boolean isNumber = true;
                for (int j = 0; j < token.length(); j++) {
                    if (Character.isLetter(token.charAt(j))) {
                        isNumber = false;
                        break;
                    }
                }
                if (!isNumber) {
                    resultStr.append(token);
                } else {
                    if (token.equals(" ") || token.equals("\t") || token.equals("\n")) {
                        resultStr.append(token);
                    } else {
                        resultStr.append(" \n ");
                    }
                }
            }
            int index = getOutputFormat().attribute(i).addStringValue(resultStr.toString());
            instVals[i] = (double) index;
        }
    }
    Instance inst = new Instance(instance.weight(), instVals);
    inst.setDataset(getOutputFormat());
    push(inst);
}
From source file:com.openkm.kea.metadata.SubjectExtractor.java
License:Open Source License
/**
 * extractSuggestedSubjects
 *
 * @param documentText
 * @return
 */
public List<String> extractSuggestedSubjects(String documentText) {
    Date start, stop;
    start = new Date();
    List<String> subjects = new ArrayList<String>();

    // Build the three-attribute structure KEA expects (doc, keyphrases, filename)
    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    atts.addElement(new Attribute("filename", (String) null));
    Instances unknownDataStructure = new Instances("keyphrase_training_data", atts, 0);

    try {
        // The extraction step: "unknownDataStructure" is called "instances"
        // in the original KEA code.
        double[] unknownStructure = new double[2];
        unknownStructure[0] = (double) unknownDataStructure.attribute(0).addStringValue(documentText);
        unknownStructure[1] = Instance.missingValue(); // used for existing subjects - we have none
        unknownDataStructure.add(new Instance(1.0, unknownStructure));
        filter.input(unknownDataStructure.instance(0));
        // NB: stringFreeStructure() returns a copy; its result is unused here
        unknownDataStructure.stringFreeStructure();

        // Getting the results out: collect the filter's output in rank order
        Instance[] rankedSubjects = new Instance[this.subjectNumLimit];
        Instance subject;
        while ((subject = filter.output()) != null) {
            int index = (int) subject.value(filter.getRankIndex()) - 1;
            if (index < subjectNumLimit) {
                rankedSubjects[index] = subject;
            }
        }
        for (int i = 0; i < subjectNumLimit; i++) {
            if (rankedSubjects[i] != null) {
                subjects.add(rankedSubjects[i].stringValue(filter.getUnstemmedPhraseIndex()));
            }
        }
    } catch (Exception e) {
        log.error("problem in subject extraction: ", e);
    } finally {
        stop = new Date();
        long time = (stop.getTime() - start.getTime());
        log.info("Subject extraction completed in " + time + "ms");
    }
    return subjects;
}
From source file:com.openkm.kea.modelcreator.KEAKeyphraseExtractor.java
License:Open Source License
/**
 * Builds the model from the files
 */
public void extractKeyphrases(Hashtable<String, Double> stems) throws Exception {
    Vector<Double> stats = new Vector<Double>();

    // Check whether there is actually any data,
    // i.e. whether there are any files in the directory
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }

    m_KEAFilter.setNumPhrases(m_numPhrases);
    m_KEAFilter.setVocabulary(m_vocabulary);
    m_KEAFilter.setVocabularyFormat(m_vocabularyFormat);
    m_KEAFilter.setDocumentLanguage(getDocumentLanguage());
    m_KEAFilter.setStemmer(m_Stemmer);
    m_KEAFilter.setStopwords(m_Stopwords);
    if (getVocabulary().equals("none")) {
        m_KEAFilter.m_NODEfeature = false;
    } else {
        m_KEAFilter.loadThesaurus(m_Stemmer, m_Stopwords);
    }

    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    atts.addElement(new Attribute("filename", (String) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    if (m_KEAFilter.m_Dictionary == null) {
        buildGlobalDictionaries(stems);
    }

    log.info("-- Extracting Keyphrases... ");
    // Extract keyphrases
    Enumeration<String> elem = stems.keys();

    // Enumeration over all files in the directory (now in the hash):
    while (elem.hasMoreElements()) {
        String str = elem.nextElement();
        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                log.debug("Can't read document " + str + ".txt");
            }
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(key));
            }
            StringBuffer keyStr = new StringBuffer();
            int c;
            // keyStr = keyphrases in the str.key file.
            // KEA assumes that these keyphrases were assigned by the author
            // and evaluates extracted keyphrases against these.
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }
            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                log.debug("No existing keyphrases for stem " + str + ".");
            }
            newInst[1] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));
        m_KEAFilter.input(data.instance(0));
        data = data.stringFreeStructure();
        if (m_debug) {
            log.debug("-- Document: " + str);
        }
        Instance[] topRankedInstances = new Instance[m_numPhrases];
        Instance inst;

        // Iterating over all extracted keyphrases (inst)
        while ((inst = m_KEAFilter.output()) != null) {
            int index = (int) inst.value(m_KEAFilter.getRankIndex()) - 1;
            if (index < m_numPhrases) {
                topRankedInstances[index] = inst;
            }
        }
        if (m_debug) {
            log.debug("-- Keyphrases and feature values:");
        }
        FileOutputStream out = null;
        PrintWriter printer = null;
        File key = new File(m_dirName + "/" + str + ".key");
        if (!key.exists()) {
            out = new FileOutputStream(m_dirName + "/" + str + ".key");
            if (!m_encoding.equals("default")) {
                printer = new PrintWriter(new OutputStreamWriter(out, m_encoding));
            } else {
                printer = new PrintWriter(out);
            }
        }
        double numExtracted = 0, numCorrect = 0;
        for (int i = 0; i < m_numPhrases; i++) {
            if (topRankedInstances[i] != null) {
                // My addition: to exclude low-ranking phrases
                double rank = topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex());
                if (rank >= 0.00) {
                    if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
                        numExtracted += 1.0;
                    }
                    if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == 1) {
                        numCorrect += 1.0;
                    }
                    if (printer != null) {
                        printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getUnstemmedPhraseIndex()));
                        if (m_AdditionalInfo) {
                            printer.print("\t");
                            printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getStemmedPhraseIndex()));
                            printer.print("\t");
                            printer.print(Utils.doubleToString(
                                    topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex()), 4));
                        }
                        printer.println();
                    }
                    if (m_debug) {
                        log.debug("" + topRankedInstances[i]);
                    }
                }
            }
        }
        if (numExtracted > 0) {
            if (m_debug) {
                log.debug("-- " + numCorrect + " correct");
            }
            stats.addElement(new Double(numCorrect));
        }
        if (printer != null) {
            printer.flush();
            printer.close();
            out.close();
        }
    }
    double[] st = new double[stats.size()];
    for (int i = 0; i < stats.size(); i++) {
        st[i] = ((Double) stats.elementAt(i)).doubleValue();
    }
    double avg = Utils.mean(st);
    double stdDev = Math.sqrt(Utils.variance(st));
    log.info("Avg. number of matching keyphrases compared to existing ones : "
            + Utils.doubleToString(avg, 2) + " +/- " + Utils.doubleToString(stdDev, 2));
    log.info("Based on " + stats.size() + " documents");
    // m_KEAFilter.batchFinished();
}
From source file:com.rapidminer.tools.WekaTools.java
License:Open Source License
/**
 * Creates a RapidMiner example set from Weka instances. Only a label can be
 * used as a special attribute; other types of special attributes are not
 * supported. If <code>attributeNamePrefix</code> is not null, the given
 * string prefix plus a number is used as the attribute name.
 */
public static ExampleSet toRapidMinerExampleSet(Instances instances, String attributeNamePrefix,
        int datamanagement) {
    int classIndex = instances.classIndex();

    // Create example table

    // 1. Extract attributes
    List<Attribute> attributes = new ArrayList<Attribute>();
    int number = 1; // used for attribute names
    for (int i = 0; i < instances.numAttributes(); i++) {
        weka.core.Attribute wekaAttribute = instances.attribute(i);
        int rapidMinerAttributeValueType = Ontology.REAL;
        if (wekaAttribute.isNominal())
            rapidMinerAttributeValueType = Ontology.NOMINAL;
        else if (wekaAttribute.isString())
            rapidMinerAttributeValueType = Ontology.STRING;
        Attribute attribute = AttributeFactory.createAttribute(wekaAttribute.name(),
                rapidMinerAttributeValueType);
        if ((i != classIndex) && (attributeNamePrefix != null) && (attributeNamePrefix.length() > 0)) {
            attribute.setName(attributeNamePrefix + "_" + (number++));
        }
        if (wekaAttribute.isNominal()) {
            for (int a = 0; a < wekaAttribute.numValues(); a++) {
                String nominalValue = wekaAttribute.value(a);
                attribute.getMapping().mapString(nominalValue);
            }
        }
        attributes.add(attribute);
    }
    Attribute label = null;
    if (classIndex >= 0) {
        label = attributes.get(classIndex);
        label.setName("label");
    }

    // 2. Guarantee alphabetical mapping to numbers
    for (int j = 0; j < attributes.size(); j++) {
        Attribute attribute = attributes.get(j);
        if (attribute.isNominal())
            attribute.getMapping().sortMappings();
    }

    // 3. Read data
    MemoryExampleTable table = new MemoryExampleTable(attributes);
    DataRowFactory factory = new DataRowFactory(datamanagement, '.');

    // Create data
    List<DataRow> dataList = new LinkedList<DataRow>();
    int numberOfRapidMinerAttributes = instances.numAttributes();
    for (int i = 0; i < instances.numInstances(); i++) {
        Instance instance = instances.instance(i);
        DataRow dataRow = factory.create(numberOfRapidMinerAttributes);
        for (int a = 0; a < instances.numAttributes(); a++) {
            Attribute attribute = table.getAttribute(a);
            double wekaValue = instance.value(a);
            if (attribute.isNominal()) {
                // For nominal attributes, value() returns the index of the
                // nominal value, which is mapped back to its string form
                String nominalValue = instances.attribute(a).value((int) wekaValue);
                dataRow.set(attribute, attribute.getMapping().mapString(nominalValue));
            } else {
                dataRow.set(attribute, wekaValue);
            }
        }
        dataRow.trim();
        dataList.add(dataRow);
    }

    // Handle label extra
    table.readExamples(new ListDataRowReader(dataList.iterator()));

    // Create and return example set
    return table.createExampleSet(label);
}