List of usage examples for the weka.core.Instances constructor
public Instances(String name, ArrayList<Attribute> attInfo, int capacity)
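Before the collected examples below, here is a minimal, self-contained sketch of the constructor itself (class and attribute names are illustrative, assuming Weka 3.7+ where this ArrayList-based signature replaced the older FastVector one):

import java.util.ArrayList;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instances;

public class InstancesDemo {
    public static void main(String[] args) {
        // Two numeric attributes; the capacity argument (10) only pre-sizes
        // the backing list, it does not limit how many rows can be added.
        ArrayList<Attribute> attrs = new ArrayList<Attribute>();
        attrs.add(new Attribute("x"));
        attrs.add(new Attribute("y"));
        Instances demo = new Instances("demo", attrs, 10);
        demo.add(new DenseInstance(1.0, new double[] { 1.5, 2.5 }));
        System.out.println(demo);
    }
}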
From source file:com.kdcloud.lib.domain.DataSpecification.java
License:Open Source License
public static Instances newInstances(String name, int columns) {
    ArrayList<Attribute> info = new ArrayList<Attribute>(columns);
    for (int i = 0; i < columns; i++) {
        info.add(new Attribute("attr" + i));
    }
    return new Instances(name, info, 0);
}
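A call such as DataSpecification.newInstances("mydata", 3) (the arguments here are illustrative) would return an empty three-column dataset whose numeric attributes are named attr0..attr2; the capacity argument of 0 is only an initial size hint, so rows can still be appended with add(...).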
From source file:com.mechaglot_Alpha2.controller.Calculate.java
License:Creative Commons License
/**
 * @param in String representing the calculated String-metric distances,
 *           comma separated.
 * @return Instance The inputted series of numbers (comma separated) as an
 *         Instance.
 */
private Instance instanceMaker(String in) {
    String[] s = in.split(",");
    double[] r = new double[s.length];
    for (int t = 0; t < r.length; t++) {
        r[t] = Double.parseDouble(s[t]);
    }
    int sz = r.length - 1;
    ArrayList<Attribute> atts = new ArrayList<Attribute>(sz);
    for (int t = 0; t < sz + 1; t++) {
        atts.add(new Attribute("number" + t, t));
    }
    Instances dataRaw = new Instances("TestInstances", atts, sz);
    dataRaw.add(new DenseInstance(1.0, r));
    Instance first = dataRaw.firstInstance();
    int cIdx = dataRaw.numAttributes() - 1;
    dataRaw.setClassIndex(cIdx);
    return first;
}
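Design note: a freshly built DenseInstance carries no dataset reference, so header-dependent operations (class index, attribute metadata) are unavailable until it is added to an Instances object; the throwaway "TestInstances" dataset exists purely to attach that header before the single row is handed back via firstInstance().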
From source file:com.mycompany.knnclassifier.kNNShell.java
public static void main(String[] args) throws Exception {
    ConverterUtils.DataSource source = new ConverterUtils.DataSource("carData.csv");
    Instances dataSet = source.getDataSet();

    Standardize standardize = new Standardize();
    standardize.setInputFormat(dataSet);
    dataSet = Filter.useFilter(dataSet, standardize);
    dataSet.setClassIndex(dataSet.numAttributes() - 1);
    dataSet.randomize(new Random(9001)); // It's over 9000!!

    int trainingSize = (int) Math.round(dataSet.numInstances() * .7);
    int testSize = dataSet.numInstances() - trainingSize;
    Instances trainingData = new Instances(dataSet, 0, trainingSize);
    Instances testData = new Instances(dataSet, trainingSize, testSize);

    kNNClassifier classifier = new kNNClassifier(3);
    classifier.buildClassifier(trainingData);

    // Used to compare to Weka's built-in KNN algorithm
    //Classifier classifier = new IBk(1);
    //classifier.buildClassifier(trainingData);

    Evaluation eval = new Evaluation(trainingData);
    eval.evaluateModel(classifier, testData);
    System.out.println(eval.toSummaryString("\nResults:\n", false));
}
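Note that the train/test split relies on a different constructor, Instances(Instances dataset, int first, int toCopy), which copies a contiguous block of rows; the earlier randomize(...) call is what makes the resulting 70/30 split random rather than ordered. The same pattern appears in the next example.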
From source file:com.mycompany.neuralnetwork.NeuralNetworkShell.java
public static void main(String[] args) throws Exception {
    ConverterUtils.DataSource source = new ConverterUtils.DataSource("irisData.csv");
    Instances dataSet = source.getDataSet();

    Standardize standardize = new Standardize();
    standardize.setInputFormat(dataSet);
    dataSet = Filter.useFilter(dataSet, standardize);
    dataSet.setClassIndex(dataSet.numAttributes() - 1);
    dataSet.randomize(new Random(9001)); // It's over 9000!!

    int trainingSize = (int) Math.round(dataSet.numInstances() * .7);
    int testSize = dataSet.numInstances() - trainingSize;
    Instances trainingData = new Instances(dataSet, 0, trainingSize);
    Instances testData = new Instances(dataSet, trainingSize, testSize);

    //MultilayerPerceptron classifier = new MultilayerPerceptron();
    NeuralNetworkClassifier classifier = new NeuralNetworkClassifier(3, 20000, 0.1);
    classifier.buildClassifier(trainingData);

    Evaluation eval = new Evaluation(trainingData);
    eval.evaluateModel(classifier, testData);
    System.out.println(eval.toSummaryString("\nResults:\n", false));
}
From source file:com.openkm.kea.filter.KEAFilter.java
License:Open Source License
/**
 * Builds the classifier.
 */
// aly: The main function, where everything important happens
private void buildClassifier() throws Exception {
    // Generate input format for classifier
    FastVector atts = new FastVector();
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (i == m_DocumentAtt) {
            atts.addElement(new Attribute("TFxIDF"));
            atts.addElement(new Attribute("First_occurrence"));
            if (m_KFused) {
                atts.addElement(new Attribute("Keyphrase_frequency"));
            }
            if (m_STDEVfeature) {
                atts.addElement(new Attribute("Standard_deviation"));
            }
            if (m_NODEfeature) {
                atts.addElement(new Attribute("Relations_number"));
            }
            if (m_LENGTHfeature) {
                atts.addElement(new Attribute("Phrase_length"));
            }
        } else if (i == m_KeyphrasesAtt) {
            FastVector vals = new FastVector(2);
            vals.addElement("False");
            vals.addElement("True");
            //atts.addElement(new Attribute("Keyphrase?", vals));
            atts.addElement(new Attribute("Keyphrase?"));
        }
    }
    m_ClassifierData = new Instances("ClassifierData", atts, 0);
    m_ClassifierData.setClassIndex(m_NumFeatures);

    if (m_Debug) {
        log.info("--- Converting instances for classifier");
    }

    // Convert pending input instances into data for classifier
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
        Instance current = getInputFormat().instance(i);

        // Get the key phrases for the document
        String keyphrases = current.stringValue(m_KeyphrasesAtt);
        HashMap<String, Counter> hashKeyphrases = getGivenKeyphrases(keyphrases, false);
        HashMap<String, Counter> hashKeysEval = getGivenKeyphrases(keyphrases, true);

        // Get the phrases for the document
        HashMap<String, FastVector> hash = new HashMap<String, FastVector>();
        int length = getPhrases(hash, current.stringValue(m_DocumentAtt));
        // hash = getComposits(hash);

        // Compute the feature values for each phrase and
        // add the instance to the data for the classifier
        Iterator<String> it = hash.keySet().iterator();
        while (it.hasNext()) {
            String phrase = it.next();
            FastVector phraseInfo = (FastVector) hash.get(phrase);
            double[] vals = featVals(phrase, phraseInfo, true, hashKeysEval, hashKeyphrases, length, hash);
            // log.info(vals);
            Instance inst = new Instance(current.weight(), vals);
            // System.err.println(phrase + "\t" + inst.toString());
            m_ClassifierData.add(inst);
        }
    }

    if (m_Debug) {
        log.info("--- Building classifier");
    }

    // Build classifier
    // Uncomment if you want to use a different classifier
    // Caution: Other places in the code will have to be adjusted!!
    /* I. Naive Bayes:
    FilteredClassifier fclass = new FilteredClassifier();
    fclass.setClassifier(new weka.classifiers.bayes.NaiveBayesSimple());
    fclass.setFilter(new Discretize());
    m_Classifier = fclass;
    */
    //NaiveBayes nb = new NaiveBayes();
    //nb.setUseSupervisedDiscretization(true);
    //m_Classifier = nb;
    /* II. Linear Regression:
    LinearRegression lr = new LinearRegression();
    lr.setAttributeSelectionMethod(new weka.core.SelectedTag(1, LinearRegression.TAGS_SELECTION));
    lr.setEliminateColinearAttributes(false);
    lr.setDebug(false);
    m_Classifier = lr;
    */
    /* III. Bagging with REPTrees:
    Bagging bagging = new Bagging();
    String[] ops_bagging = { "-P", "100", "-S", "1", "-I", "50" };
    REPTree rept = new REPTree(); // results are worse!
    rept.setNoPruning(true);
    String[] ops_rept = { "-M", "2", "-V", "0.0010", "-N", "3", "-S", "1", "-L", "1" };
    rept.setOptions(ops_rept);
    bagging.setClassifier(rept);
    bagging.setOptions(ops_bagging);
    // or wrap in a FilteredClassifier:
    //FilteredClassifier fclass = new FilteredClassifier();
    //fclass.setClassifier(new REPTree());
    //fclass.setFilter(new Discretize());
    //bagging.setClassifier(fclass);
    m_Classifier = bagging;
    */

    RegressionByDiscretization rvd = new RegressionByDiscretization();
    FilteredClassifier fclass = new FilteredClassifier();
    fclass.setClassifier(new weka.classifiers.bayes.NaiveBayesSimple());
    fclass.setFilter(new Discretize());
    rvd.setClassifier(fclass);
    rvd.setNumBins(m_Indexers + 1);
    m_Classifier = rvd;

    // log.info(m_ClassifierData);
    // System.exit(1);
    m_Classifier.buildClassifier(m_ClassifierData);

    if (m_Debug) {
        log.info("" + m_Classifier);
    }

    // Save space
    m_ClassifierData = new Instances(m_ClassifierData, 0);
}
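A detail worth flagging: the closing new Instances(m_ClassifierData, 0) calls the copy constructor with capacity 0, producing a header-only copy so the converted training instances can be garbage-collected once the classifier has been built.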
From source file:com.openkm.kea.filter.KEAFilter.java
License:Open Source License
/**
 * Sets output format and converts pending input instances.
 */
@SuppressWarnings("unchecked")
private void convertPendingInstances() throws Exception {
    if (m_Debug) {
        log.info("--- Converting pending instances");
    }

    // Create output format for filter
    FastVector atts = new FastVector();
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (i == m_DocumentAtt) {
            // string attributes
            atts.addElement(new Attribute("N-gram", (FastVector) null));
            atts.addElement(new Attribute("N-gram-original", (FastVector) null));
            // numeric attributes
            atts.addElement(new Attribute("TFxIDF"));
            atts.addElement(new Attribute("First_occurrence"));
            // optional attributes
            if (m_Debug) {
                if (m_KFused) {
                    atts.addElement(new Attribute("Keyphrase_frequency"));
                }
            }
            if (m_STDEVfeature) {
                //FastVector rvals = new FastVector(2);
                //rvals.addElement("False");
                //rvals.addElement("True");
                atts.addElement(new Attribute("Standard_deviation"));
            }
            if (m_NODEfeature) {
                atts.addElement(new Attribute("Relations_number"));
            }
            if (m_LENGTHfeature) {
                atts.addElement(new Attribute("Phrase_length"));
            }
            atts.addElement(new Attribute("Probability"));
            atts.addElement(new Attribute("Rank"));
        } else if (i == m_KeyphrasesAtt) {
            FastVector vals = new FastVector(2);
            vals.addElement("False");
            vals.addElement("True");
            //atts.addElement(new Attribute("Keyphrase?", vals));
            atts.addElement(new Attribute("Keyphrase?"));
        } else {
            atts.addElement(getInputFormat().attribute(i));
        }
    }
    Instances outFormat = new Instances("KEAdata", atts, 0);
    setOutputFormat(outFormat);

    // Convert pending input instances into output data
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
        Instance current = getInputFormat().instance(i);
        FastVector vector = convertInstance(current, true);
        Enumeration<Instance> en = vector.elements();
        while (en.hasMoreElements()) {
            Instance inst = en.nextElement();
            push(inst);
        }
    }
}
From source file:com.openkm.kea.metadata.SubjectExtractor.java
License:Open Source License
/**
 * extractSuggestedSubjects
 *
 * @param documentText
 * @return
 */
public List<String> extractSuggestedSubjects(String documentText) {
    Date start, stop;
    start = new Date();
    List<String> subjects = new ArrayList<String>();

    // Dataset header expected by the KEA filter: two string attributes
    // plus a filename attribute.
    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    atts.addElement(new Attribute("filename", (String) null));
    Instances unknownDataStructure = new Instances("keyphrase_training_data", atts, 0);

    try {
        // The extraction step: "unknownDataStructure" is simply called
        // "instances" in the original KEA code.
        double[] unknownStructure = new double[2];
        unknownStructure[0] = (double) unknownDataStructure.attribute(0).addStringValue(documentText);
        unknownStructure[1] = Instance.missingValue(); // slot for existing subjects - we have none
        unknownDataStructure.add(new Instance(1.0, unknownStructure));
        filter.input(unknownDataStructure.instance(0));
        unknownDataStructure.stringFreeStructure(); // note: the return value is discarded here

        // Collect the ranked results from the filter
        Instance[] rankedSubjects = new Instance[this.subjectNumLimit];
        Instance subject;
        while ((subject = filter.output()) != null) {
            int index = (int) subject.value(filter.getRankIndex()) - 1;
            if (index < subjectNumLimit) {
                rankedSubjects[index] = subject;
            }
        }
        for (int i = 0; i < subjectNumLimit; i++) {
            if (rankedSubjects[i] != null) {
                subjects.add(rankedSubjects[i].stringValue(filter.getUnstemmedPhraseIndex()));
            }
        }
    } catch (Exception e) {
        log.error("problem in subject extraction: ", e);
    } finally {
        stop = new Date();
        long time = (stop.getTime() - start.getTime());
        log.info("Subject extraction completed in " + time + "ms");
    }
    return subjects;
}
From source file:com.openkm.kea.modelcreator.KEAKeyphraseExtractor.java
License:Open Source License
/**
 * Builds the model from the files.
 */
public void extractKeyphrases(Hashtable<String, Double> stems) throws Exception {
    Vector<Double> stats = new Vector<Double>();

    // Check whether there is actually any data
    // (i.e. whether there are any files in the directory)
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }

    m_KEAFilter.setNumPhrases(m_numPhrases);
    m_KEAFilter.setVocabulary(m_vocabulary);
    m_KEAFilter.setVocabularyFormat(m_vocabularyFormat);
    m_KEAFilter.setDocumentLanguage(getDocumentLanguage());
    m_KEAFilter.setStemmer(m_Stemmer);
    m_KEAFilter.setStopwords(m_Stopwords);
    if (getVocabulary().equals("none")) {
        m_KEAFilter.m_NODEfeature = false;
    } else {
        m_KEAFilter.loadThesaurus(m_Stemmer, m_Stopwords);
    }

    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    atts.addElement(new Attribute("filename", (String) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    if (m_KEAFilter.m_Dictionary == null) {
        buildGlobalDictionaries(stems);
    }

    log.info("-- Extracting Keyphrases... ");

    // Extract keyphrases
    Enumeration<String> elem = stems.keys();
    // Enumeration over all files in the directory (now in the hash):
    while (elem.hasMoreElements()) {
        String str = elem.nextElement();
        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                log.debug("Can't read document " + str + ".txt");
            }
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(key));
            }
            StringBuffer keyStr = new StringBuffer();
            int c;
            // keyStr = keyphrases in the str.key file.
            // KEA assumes that these keyphrases were assigned by the author
            // and evaluates extracted keyphrases against these.
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }
            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                log.debug("No existing keyphrases for stem " + str + ".");
            }
            newInst[1] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));
        m_KEAFilter.input(data.instance(0));
        data = data.stringFreeStructure();
        if (m_debug) {
            log.debug("-- Document: " + str);
        }

        Instance[] topRankedInstances = new Instance[m_numPhrases];
        Instance inst;
        // Iterating over all extracted keyphrases (inst)
        while ((inst = m_KEAFilter.output()) != null) {
            int index = (int) inst.value(m_KEAFilter.getRankIndex()) - 1;
            if (index < m_numPhrases) {
                topRankedInstances[index] = inst;
            }
        }
        if (m_debug) {
            log.debug("-- Keyphrases and feature values:");
        }

        FileOutputStream out = null;
        PrintWriter printer = null;
        File key = new File(m_dirName + "/" + str + ".key");
        if (!key.exists()) {
            out = new FileOutputStream(m_dirName + "/" + str + ".key");
            if (!m_encoding.equals("default")) {
                printer = new PrintWriter(new OutputStreamWriter(out, m_encoding));
            } else {
                printer = new PrintWriter(out);
            }
        }

        double numExtracted = 0, numCorrect = 0;
        for (int i = 0; i < m_numPhrases; i++) {
            if (topRankedInstances[i] != null) {
                // My addition: to exclude low ranking phrases
                double rank = topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex());
                if (rank >= 0.00) {
                    if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
                        numExtracted += 1.0;
                    }
                    if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == 1) {
                        numCorrect += 1.0;
                    }
                    if (printer != null) {
                        printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getUnstemmedPhraseIndex()));
                        if (m_AdditionalInfo) {
                            printer.print("\t");
                            printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getStemmedPhraseIndex()));
                            printer.print("\t");
                            printer.print(Utils.doubleToString(
                                    topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex()), 4));
                        }
                        printer.println();
                    }
                    if (m_debug) {
                        log.debug("" + topRankedInstances[i]);
                    }
                }
            }
        }
        if (numExtracted > 0) {
            if (m_debug) {
                log.debug("-- " + numCorrect + " correct");
            }
            stats.addElement(new Double(numCorrect));
        }
        if (printer != null) {
            printer.flush();
            printer.close();
            out.close();
        }
    }

    double[] st = new double[stats.size()];
    for (int i = 0; i < stats.size(); i++) {
        st[i] = ((Double) stats.elementAt(i)).doubleValue();
    }
    double avg = Utils.mean(st);
    double stdDev = Math.sqrt(Utils.variance(st));
    log.info("Avg. number of matching keyphrases compared to existing ones : "
            + Utils.doubleToString(avg, 2) + " +/- " + Utils.doubleToString(stdDev, 2));
    log.info("Based on " + stats.size() + " documents");
    // m_KEAFilter.batchFinished();
}
From source file:com.openkm.kea.modelcreator.KEAModelBuilder.java
License:Open Source License
/**
 * Builds the model from the files.
 */
public void buildModel(Hashtable<String, Double> stems, Stopwords stopwords) throws Exception {
    // Check whether there is actually any data
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }

    FastVector atts = new FastVector(2);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    // Build model
    m_KEAFilter = new KEAFilter(stopwords);
    m_KEAFilter.setDebug(m_debug);
    m_KEAFilter.setDisallowInternalPeriods(getDisallowIPeriods());
    m_KEAFilter.setKFused(getUseKFrequency());
    m_KEAFilter.setMaxPhraseLength(getMaxPhraseLength());
    m_KEAFilter.setMinPhraseLength(getMinPhraseLength());
    m_KEAFilter.setMinNumOccur(getMinNumOccur());
    m_KEAFilter.setStemmer(getStemmer());
    m_KEAFilter.setDocumentLanguage(getDocumentLanguage());
    m_KEAFilter.setVocabulary(getVocabulary());
    m_KEAFilter.setVocabularyFormat(getVocabularyFormat());
    m_KEAFilter.setStopwords(getStopwords());
    m_KEAFilter.setCheckForProperNouns(getCheckForProperNouns());
    m_KEAFilter.setInputFormat(data);
    if (getVocabulary().equals("none")) {
        m_KEAFilter.m_NODEfeature = false;
    } else {
        m_KEAFilter.loadThesaurus(getStemmer(), getStopwords());
    }
    m_KEAFilter.setNumFeature();

    log.info("-- Reading the Documents... ");

    Enumeration<String> elem = stems.keys();
    while (elem.hasMoreElements()) {
        String str = elem.nextElement();
        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            is.close();
            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
        } catch (Exception e) {
            log.error("Can't find document for stem " + str + ".");
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(key));
            }
            StringBuffer keyStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }
            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            log.error("Can't find keyphrases for stem " + str + ".");
            newInst[1] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));
        m_KEAFilter.input(data.instance(0));
        data = data.stringFreeStructure();
    }
    m_KEAFilter.batchFinished();

    // Drain any remaining output
    while ((m_KEAFilter.output()) != null) {
    }
}
From source file:com.reactivetechnologies.analytics.core.eval.AdaBoostM1WithBuiltClassifiers.java
License:Open Source License
@Override
protected void buildClassifierWithWeights(Instances data) throws Exception {
    Instances training;
    double epsilon, reweight;
    Evaluation evaluation;
    int numInstances = data.numInstances();

    // Initialize data
    m_Betas = new double[m_Classifiers.length];
    m_NumIterationsPerformed = 0;

    // Create a copy of the data so that when the weights are diddled
    // with it doesn't mess up the weights for anyone else
    training = new Instances(data, 0, numInstances);

    // Do bootstrap iterations
    for (m_NumIterationsPerformed = 0; m_NumIterationsPerformed < m_Classifiers.length; m_NumIterationsPerformed++) {
        if (m_Debug) {
            System.err.println("Training classifier " + (m_NumIterationsPerformed + 1));
        }

        // Select instances to train the classifier on
        if (m_WeightThreshold < 100) {
            selectWeightQuantile(training, (double) m_WeightThreshold / 100);
        } else {
            new Instances(training, 0, numInstances);
        }

        /** Changed here: DO NOT build the classifier! */
        /*if (m_Classifiers[m_NumIterationsPerformed] instanceof Randomizable)
            ((Randomizable) m_Classifiers[m_NumIterationsPerformed]).setSeed(randomInstance.nextInt());
        m_Classifiers[m_NumIterationsPerformed].buildClassifier(trainData);*/
        /** End change */

        // Evaluate the classifier
        evaluation = new Evaluation(data);
        evaluation.evaluateModel(m_Classifiers[m_NumIterationsPerformed], training);
        epsilon = evaluation.errorRate();

        // Stop if error too small or error too big and ignore this model
        if (Utils.grOrEq(epsilon, 0.5) || Utils.eq(epsilon, 0)) {
            if (m_NumIterationsPerformed == 0) {
                m_NumIterationsPerformed = 1; // If we're the first we have to use it
            }
            break;
        }

        // Determine the weight to assign to this model
        m_Betas[m_NumIterationsPerformed] = Math.log((1 - epsilon) / epsilon);
        reweight = (1 - epsilon) / epsilon;
        if (m_Debug) {
            System.err.println("\terror rate = " + epsilon
                    + " beta = " + m_Betas[m_NumIterationsPerformed]);
        }

        // Update instance weights
        setWeights(training, reweight);
    }
}
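One caveat, based on my reading of the upstream Weka source (treat it as an assumption): stock AdaBoostM1 assigns this expression to a training variable, roughly trainData = new Instances(training, 0, numInstances), whereas this modified copy discards the result in the else branch, making that statement a no-op. That is presumably harmless here, since the pre-built classifiers are only evaluated rather than retrained.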