List of usage examples for weka.core Instance stringValue
public String stringValue(Attribute att);
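Before the project examples below, here is a minimal, self-contained sketch of the call itself. It is not taken from any of the listed projects: the class name StringValueDemo and the attribute name "color" are made up, and it assumes Weka 3.7+, where DenseInstance replaced the old Instance constructor seen in some examples below.

import java.util.ArrayList;
import java.util.Arrays;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;

public class StringValueDemo {
    public static void main(String[] args) {
        // Hypothetical nominal attribute; internally Weka stores its value
        // as a double index into the list of labels.
        Attribute color = new Attribute("color", Arrays.asList("red", "green", "blue"));
        ArrayList<Attribute> attrs = new ArrayList<Attribute>();
        attrs.add(color);
        Instances data = new Instances("demo", attrs, 0);

        Instance inst = new DenseInstance(data.numAttributes());
        inst.setDataset(data); // stringValue needs the dataset to resolve labels
        inst.setValue(color, "green");

        // stringValue maps the stored numeric index back to the label text.
        System.out.println(inst.stringValue(color)); // prints "green"
    }
}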
From source file:com.openkm.kea.filter.KEAPhraseFilter.java
License:Open Source License
/**
 * Converts an instance by removing all non-alphanumeric characters
 * from its string attribute values.
 */
private void convertInstance(Instance instance) throws Exception {
    double[] instVals = new double[instance.numAttributes()];

    for (int i = 0; i < instance.numAttributes(); i++) {
        if (!instance.attribute(i).isString() || instance.isMissing(i)) {
            instVals[i] = instance.value(i);
        } else {
            if (!m_SelectCols.isInRange(i)) {
                int index = getOutputFormat().attribute(i).addStringValue(instance.stringValue(i));
                instVals[i] = (double) index;
                continue;
            }
            // aly: str = text of the document
            String str = instance.stringValue(i);
            // aly: tokenized is the clean version of str
            String tokenized = tokenize(str);
            // log.info(tokenized);
            int index = getOutputFormat().attribute(i).addStringValue(tokenized);
            instVals[i] = (double) index;
        }
    }
    Instance inst = new Instance(instance.weight(), instVals);
    inst.setDataset(getOutputFormat());
    push(inst);
}
From source file:com.openkm.kea.filter.NumbersFilter.java
License:Open Source License
/**
 * Converts an instance. A phrase boundary is inserted where
 * a number is found.
 */
private void convertInstance(Instance instance) throws Exception {
    double[] instVals = new double[instance.numAttributes()];

    for (int i = 0; i < instance.numAttributes(); i++) {
        if ((!instance.attribute(i).isString()) || instance.isMissing(i)) {
            instVals[i] = instance.value(i);
        } else {
            String str = instance.stringValue(i);
            StringBuffer resultStr = new StringBuffer();
            StringTokenizer tok = new StringTokenizer(str, " \t\n", true);
            while (tok.hasMoreTokens()) {
                String token = tok.nextToken();

                // Everything that doesn't contain at least
                // one letter is considered to be a number
                boolean isNumber = true;
                for (int j = 0; j < token.length(); j++) {
                    if (Character.isLetter(token.charAt(j))) {
                        isNumber = false;
                        break;
                    }
                }
                if (!isNumber) {
                    resultStr.append(token);
                } else {
                    if (token.equals(" ") || token.equals("\t") || token.equals("\n")) {
                        resultStr.append(token);
                    } else {
                        resultStr.append(" \n ");
                    }
                }
            }
            int index = getOutputFormat().attribute(i).addStringValue(resultStr.toString());
            instVals[i] = (double) index;
        }
    }
    Instance inst = new Instance(instance.weight(), instVals);
    inst.setDataset(getOutputFormat());
    push(inst);
}
From source file:com.spread.experiment.tempuntilofficialrelease.ClassificationViaClustering108.java
License:Open Source License
/**
 * Returns class probability distribution for the given instance.
 *
 * @param instance the instance to be classified
 * @return the class probabilities
 * @throws Exception if an error occurred during the prediction
 */
@Override
public double[] distributionForInstance(Instance instance) throws Exception {
    if (m_ZeroR != null) {
        return m_ZeroR.distributionForInstance(instance);
    } else {
        double[] result = new double[instance.numClasses()];

        if (m_ActualClusterer != null) {
            // build new instance
            Instances tempData = m_ClusteringHeader.stringFreeStructure();
            double[] values = new double[tempData.numAttributes()];
            int n = 0;
            for (int i = 0; i < instance.numAttributes(); i++) {
                if (i == instance.classIndex()) {
                    continue;
                }
                if (instance.attribute(i).isString()) {
                    values[n] = tempData.attribute(n).addStringValue(instance.stringValue(i));
                } else if (instance.attribute(i).isRelationValued()) {
                    values[n] = tempData.attribute(n).addRelation(instance.relationalValue(i));
                } else {
                    values[n] = instance.value(i);
                }
                n++;
            }
            Instance newInst = new DenseInstance(instance.weight(), values);
            newInst.setDataset(tempData);

            if (!getLabelAllClusters()) {
                // determine cluster/class
                double r = m_ClustersToClasses[m_ActualClusterer.clusterInstance(newInst)];
                if (r == -1) {
                    return result; // Unclassified
                } else {
                    result[(int) r] = 1.0;
                    return result;
                }
            } else {
                double[] classProbs = new double[instance.numClasses()];
                double[] dist = m_ActualClusterer.distributionForInstance(newInst);
                for (int i = 0; i < dist.length; i++) {
                    for (int j = 0; j < instance.numClasses(); j++) {
                        classProbs[j] += dist[i] * m_ClusterClassProbs[i][j];
                    }
                }
                Utils.normalize(classProbs);
                return classProbs;
            }
        } else {
            return result; // Unclassified
        }
    }
}
From source file:com.zooclassifier.Model.FileLoader.java
public FileLoader(String filename) throws FileNotFoundException, IOException {
    BufferedReader reader = new BufferedReader(new FileReader(filename));
    ArffLoader.ArffReader arff = new ArffLoader.ArffReader(reader);
    Instances data = arff.getData();
    data.setClassIndex(data.numAttributes() - 1);

    attributes = new String[data.numInstances()][data.numAttributes() - 1];
    labels = new String[data.numInstances()];
    for (int i = 0; i < data.numInstances(); i++) {
        Instance instance = data.instance(i);
        for (int j = 0; j < instance.numAttributes() - 1; j++) {
            attributes[i][j] = instance.stringValue(j);
        }
        labels[i] = instance.stringValue(instance.numAttributes() - 1);
    }

    attributesLegalValues = new String[data.numAttributes() - 1][];
    for (int i = 0; i < data.numAttributes() - 1; i++) {
        attributesLegalValues[i] = (String[]) Collections.list(data.attribute(i).enumerateValues())
                .toArray(new String[data.attribute(i).numValues()]);
    }
    labelsLegalValues = (String[]) Collections.list(data.attribute(data.numAttributes() - 1).enumerateValues())
            .toArray(new String[data.attribute(data.numAttributes() - 1).numValues()]);
}
From source file:core.DatabaseSaverEx.java
License:Open Source License
/**
 * Inserts the given instance into the table.
 *
 * @param inst the instance to insert
 * @throws Exception if something goes wrong
 */
public void writeInstance(Instance inst) throws Exception {
    StringBuffer insert = new StringBuffer();
    insert.append("INSERT INTO ");
    insert.append(m_tableName);
    insert.append(" VALUES ( ");
    if (m_id) {
        insert.append(m_count);
        insert.append(", ");
        m_count++;
    }
    for (int j = 0; j < inst.numAttributes(); j++) {
        if (inst.isMissing(j)) {
            insert.append("NULL");
        } else {
            if (inst.attribute(j).isDate()) {
                insert.append("'" + m_DateFormat.format((long) inst.value(j)) + "'");
            } else if (inst.attribute(j).isNumeric()) {
                insert.append(inst.value(j));
            } else {
                String stringInsert = "'" + inst.stringValue(j) + "'";
                // NOTE: this collapses doubled quotes; standard SQL escaping
                // would instead double embedded single quotes
                if (stringInsert.length() > 2) {
                    stringInsert = stringInsert.replaceAll("''", "'");
                }
                insert.append(stringInsert);
            }
        }
        if (j != inst.numAttributes() - 1) {
            insert.append(", ");
        }
    }
    insert.append(" )");
    // System.out.println(insert.toString());

    if (m_DataBaseConnection.update(insert.toString()) < 1) {
        throw new IOException("Tuple cannot be inserted.");
    } else {
        m_DataBaseConnection.close();
    }
}
From source file:data.statistics.MLStatistics.java
License:Open Source License
/**
 * This method calculates a matrix with the co-occurrences of pairs of labels.
 * It requires the method calculateStats to be previously called.
 *
 * @param mlDataSet A multi-label dataset.
 * @return A co-occurrence matrix of pairs of labels.
 */
public double[][] calculateCoocurrence(MultiLabelInstances mlDataSet) {
    coocurrenceMatrix = new double[numLabels][numLabels];
    int[] labelIndices = mlDataSet.getLabelIndices();
    for (int k = 0; k < numExamples; k++) {
        Instance temp = mlDataSet.getDataSet().instance(k);
        for (int i = 0; i < numLabels; i++) {
            for (int j = 0; j < numLabels; j++) {
                if ((i < j) && (temp.stringValue(labelIndices[i]).equals("1")
                        && temp.stringValue(labelIndices[j]).equals("1"))) {
                    coocurrenceMatrix[i][j]++;
                }
            }
        }
    }
    return coocurrenceMatrix;
}
From source file:de.tudarmstadt.ukp.similarity.experiments.coling2012.util.Evaluator.java
License:Open Source License
public static void runClassifierCV(WekaClassifier wekaClassifier, Dataset dataset) throws Exception {
    // Set parameters
    int folds = 10;
    Classifier baseClassifier = getClassifier(wekaClassifier);

    // Set up the random number generator
    long seed = new Date().getTime();
    Random random = new Random(seed);

    // Add IDs to the instances
    AddID.main(new String[] { "-i", MODELS_DIR + "/" + dataset.toString() + ".arff", "-o",
            MODELS_DIR + "/" + dataset.toString() + "-plusIDs.arff" });
    Instances data = DataSource.read(MODELS_DIR + "/" + dataset.toString() + "-plusIDs.arff");
    data.setClassIndex(data.numAttributes() - 1);

    // Instantiate the Remove filter
    Remove removeIDFilter = new Remove();
    removeIDFilter.setAttributeIndices("first");

    // Randomize the data
    data.randomize(random);

    // Perform cross-validation
    Instances predictedData = null;
    Evaluation eval = new Evaluation(data);

    for (int n = 0; n < folds; n++) {
        Instances train = data.trainCV(folds, n, random);
        Instances test = data.testCV(folds, n);

        // Apply log filter
        // Filter logFilter = new LogFilter();
        // logFilter.setInputFormat(train);
        // train = Filter.useFilter(train, logFilter);
        // logFilter.setInputFormat(test);
        // test = Filter.useFilter(test, logFilter);

        // Copy the classifier
        Classifier classifier = AbstractClassifier.makeCopy(baseClassifier);

        // Instantiate the FilteredClassifier
        FilteredClassifier filteredClassifier = new FilteredClassifier();
        filteredClassifier.setFilter(removeIDFilter);
        filteredClassifier.setClassifier(classifier);

        // Build the classifier
        filteredClassifier.buildClassifier(train);

        // Evaluate
        eval.evaluateModel(filteredClassifier, test);

        // Add predictions
        AddClassification filter = new AddClassification();
        filter.setClassifier(filteredClassifier);
        filter.setOutputClassification(true);
        filter.setOutputDistribution(false);
        filter.setOutputErrorFlag(true);
        filter.setInputFormat(train);
        Filter.useFilter(train, filter); // trains the classifier
        Instances pred = Filter.useFilter(test, filter); // performs predictions on test set

        if (predictedData == null) {
            predictedData = new Instances(pred, 0);
        }
        for (int j = 0; j < pred.numInstances(); j++) {
            predictedData.add(pred.instance(j));
        }
    }

    // Prepare output classification
    String[] scores = new String[predictedData.numInstances()];
    for (Instance predInst : predictedData) {
        int id = new Double(predInst.value(predInst.attribute(0))).intValue() - 1;
        int valueIdx = predictedData.numAttributes() - 2;
        String value = predInst.stringValue(predInst.attribute(valueIdx));
        scores[id] = value;
    }

    // Output
    StringBuilder sb = new StringBuilder();
    for (String score : scores) {
        sb.append(score + LF);
    }
    FileUtils.writeStringToFile(
            new File(OUTPUT_DIR + "/" + dataset.toString() + "/" + wekaClassifier.toString() + "/output.csv"),
            sb.toString());
}
From source file:de.unidue.langtech.grading.tc.ClusteringTask.java
License:Open Source License
@Override
public void execute(TaskContext aContext) throws Exception {
    if (learningMode.equals(Constants.LM_MULTI_LABEL)) {
        throw new IllegalArgumentException("Cannot use multi-label setup in clustering.");
    }
    boolean multiLabel = false;

    File arffFileTrain = new File(
            aContext.getStorageLocation(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY).getPath() + "/"
                    + TRAINING_DATA_FILENAME);
    Instances trainData = TaskUtils.getInstances(arffFileTrain, multiLabel);

    // get number of outcomes
    List<String> trainOutcomeValues = TaskUtils.getClassLabels(trainData, multiLabel);

    Clusterer clusterer = AbstractClusterer.forName(clusteringArguments.get(0),
            clusteringArguments.subList(1, clusteringArguments.size()).toArray(new String[0]));

    Instances copyTrainData = new Instances(trainData);
    trainData = WekaUtils.removeOutcomeId(trainData, multiLabel);

    // generate data for clusterer (w/o class)
    Remove filter = new Remove();
    filter.setAttributeIndices("" + (trainData.classIndex() + 1));
    filter.setInputFormat(trainData);
    Instances clusterTrainData = Filter.useFilter(trainData, filter);

    clusterer.buildClusterer(clusterTrainData);

    // get a mapping from clusterIDs to instance offsets in the ARFF
    Map<Integer, Set<Integer>> clusterMap = getClusterMap(clusterTrainData, clusterer);

    Map<String, String> instanceId2TextMap = getInstanceId2TextMap(aContext);

    ConditionalFrequencyDistribution<Integer, String> clusterAssignments =
            new ConditionalFrequencyDistribution<Integer, String>();
    for (Integer clusterId : clusterMap.keySet()) {
        System.out.println("CLUSTER: " + clusterId);
        for (Integer offset : clusterMap.get(clusterId)) {

            // get instance ID from instance
            Instance instance = copyTrainData.get(offset);
            Double classOffset = new Double(instance.value(copyTrainData.classAttribute()));
            String label = (String) trainOutcomeValues.get(classOffset.intValue());

            clusterAssignments.addSample(clusterId, label);

            String instanceId = instance
                    .stringValue(copyTrainData.attribute(AddIdFeatureExtractor.ID_FEATURE_NAME).index());
            System.out.println(label + "\t" + instanceId2TextMap.get(instanceId));
        }
        System.out.println();
    }

    System.out.println("ID\tSIZE\tPURITY\tRMSE");
    for (Integer clusterId : clusterMap.keySet()) {
        FrequencyDistribution<String> fd = clusterAssignments.getFrequencyDistribution(clusterId);
        double purity = (double) fd.getCount(fd.getSampleWithMaxFreq()) / fd.getN();
        String purityString = String.format("%.2f", purity);
        double rmse = getRMSE(fd, trainOutcomeValues);
        String rmseString = String.format("%.2f", rmse);
        System.out.println(
                clusterId + "\t" + clusterMap.get(clusterId).size() + "\t" + purityString + "\t" + rmseString);
    }
    System.out.println();
}
From source file:de.uniheidelberg.cl.swp.mlprocess.MLProcess.java
License:Apache License
/**
 * Creates the classifications of the test {@link CoreferencePair}s by using the classifier
 * trained on the training {@link CoreferencePair}s.
 *
 * @param testCorefs {@link CoreferencePair}s extracted from the test corpus by the ACR systems.
 * @return {@link CoreferencePair}s which are predicted by our classifier to be correct.
 */
private List<CoreferencePair> createPrediction(Map<String, List<CoreferencePair>> testCorefs)
        throws Exception {
    List<CoreferencePair> predictions = new ArrayList<CoreferencePair>();

    for (String s : testCorefs.keySet()) {
        for (final CoreferencePair cp : testCorefs.get(s)) {
            Instance ini = ic.addCorefInstance(cp, s);
            ini.setDataset(ic.getInstances());

            /* use the classifier to select a label */
            if (wr.labelUnknownInstance(ini) == 0.0) {
                cp.setAcrSystem(ini.stringValue(ini.numAttributes() - 2));
                predictions.add(cp);
            }
        }
    }
    predictions = removeDuplicates(predictions);
    return predictions;
}
From source file:dkpro.similarity.experiments.rte.util.Evaluator.java
License:Open Source License
public static void runClassifierCV(WekaClassifier wekaClassifier, Dataset dataset) throws Exception {
    // Set parameters
    int folds = 10;
    Classifier baseClassifier = ClassifierSimilarityMeasure.getClassifier(wekaClassifier);

    // Set up the random number generator
    long seed = new Date().getTime();
    Random random = new Random(seed);

    // Add IDs to the instances
    AddID.main(new String[] { "-i", MODELS_DIR + "/" + dataset.toString() + ".arff", "-o",
            MODELS_DIR + "/" + dataset.toString() + "-plusIDs.arff" });
    Instances data = DataSource.read(MODELS_DIR + "/" + dataset.toString() + "-plusIDs.arff");
    data.setClassIndex(data.numAttributes() - 1);

    // Instantiate the Remove filter
    Remove removeIDFilter = new Remove();
    removeIDFilter.setAttributeIndices("first");

    // Randomize the data
    data.randomize(random);

    // Perform cross-validation
    Instances predictedData = null;
    Evaluation eval = new Evaluation(data);

    for (int n = 0; n < folds; n++) {
        Instances train = data.trainCV(folds, n, random);
        Instances test = data.testCV(folds, n);

        // Apply log filter
        // Filter logFilter = new LogFilter();
        // logFilter.setInputFormat(train);
        // train = Filter.useFilter(train, logFilter);
        // logFilter.setInputFormat(test);
        // test = Filter.useFilter(test, logFilter);

        // Copy the classifier
        Classifier classifier = AbstractClassifier.makeCopy(baseClassifier);

        // Instantiate the FilteredClassifier
        FilteredClassifier filteredClassifier = new FilteredClassifier();
        filteredClassifier.setFilter(removeIDFilter);
        filteredClassifier.setClassifier(classifier);

        // Build the classifier
        filteredClassifier.buildClassifier(train);

        // Evaluate
        eval.evaluateModel(filteredClassifier, test);

        // Add predictions (the filtered classifier is used so the ID
        // attribute is removed before prediction, as in the example above)
        AddClassification filter = new AddClassification();
        filter.setClassifier(filteredClassifier);
        filter.setOutputClassification(true);
        filter.setOutputDistribution(false);
        filter.setOutputErrorFlag(true);
        filter.setInputFormat(train);
        Filter.useFilter(train, filter); // trains the classifier
        Instances pred = Filter.useFilter(test, filter); // performs predictions on test set

        if (predictedData == null) {
            predictedData = new Instances(pred, 0);
        }
        for (int j = 0; j < pred.numInstances(); j++) {
            predictedData.add(pred.instance(j));
        }
    }

    System.out.println(eval.toSummaryString());
    System.out.println(eval.toMatrixString());

    // Prepare output scores
    String[] scores = new String[predictedData.numInstances()];
    for (Instance predInst : predictedData) {
        int id = new Double(predInst.value(predInst.attribute(0))).intValue() - 1;
        int valueIdx = predictedData.numAttributes() - 2;
        String value = predInst.stringValue(predInst.attribute(valueIdx));
        scores[id] = value;
    }

    // Output classifications
    StringBuilder sb = new StringBuilder();
    for (String score : scores) {
        sb.append(score + LF);
    }
    FileUtils.writeStringToFile(new File(OUTPUT_DIR + "/" + dataset.toString() + "/" + wekaClassifier.toString()
            + "/" + dataset.toString() + ".csv"), sb.toString());

    // Output prediction arff
    DataSink.write(OUTPUT_DIR + "/" + dataset.toString() + "/" + wekaClassifier.toString() + "/"
            + dataset.toString() + ".predicted.arff", predictedData);

    // Output meta information
    sb = new StringBuilder();
    sb.append(baseClassifier.toString() + LF);
    sb.append(eval.toSummaryString() + LF);
    sb.append(eval.toMatrixString() + LF);
    FileUtils.writeStringToFile(new File(OUTPUT_DIR + "/" + dataset.toString() + "/" + wekaClassifier.toString()
            + "/" + dataset.toString() + ".meta.txt"), sb.toString());
}