List of usage examples for weka.core Instance stringValue
public String stringValue(Attribute att);
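For a nominal or string attribute, Weka stores each value internally as a double index; stringValue(Attribute att) maps that index back to its label. Several examples below use the equivalent int-index overload, stringValue(int attIndex). A minimal, self-contained sketch of the call, using hypothetical attribute and dataset names and the pre-3.6 API (FastVector, concrete Instance) that the examples below are written against:

import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;

public class StringValueDemo {
    public static void main(String[] args) {
        // Hypothetical nominal attribute "color" with two admissible labels.
        FastVector colorLabels = new FastVector(2);
        colorLabels.addElement("red");
        colorLabels.addElement("blue");

        FastVector attrs = new FastVector(1);
        attrs.addElement(new Attribute("color", colorLabels));
        Instances data = new Instances("demo", attrs, 1);
        Attribute color = data.attribute("color"); // fetch the dataset's own attribute

        Instance inst = new Instance(1); // one attribute, weight 1.0, all values missing
        inst.setDataset(data);           // stringValue needs dataset access to resolve labels
        inst.setValue(color, "blue");    // stored internally as the double index 1.0

        System.out.println(inst.stringValue(color)); // -> "blue"
        System.out.println(inst.value(color));       // -> 1.0 (the internal index)
    }
}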
From source file:net.sf.bddbddb.OrderClassifier.java
License:LGPL
public double importance(weka.core.Attribute attribute, String attrValue) { //, String classValue){
    int count = 0;
    int goodCount = 0, badCount = 0;
    List newInstances = new LinkedList();
    for (Iterator it = orders.iterator(); it.hasNext();) {
        Instance instance = (Instance) it.next();
        if (//!instance.stringValue(instance.classIndex()).equals(classValue) ||
            !instance.stringValue(attribute).equals(attrValue))
            continue;
        if (goodClusters.contains(instance.stringValue(instance.classIndex())))
            ++goodCount;
        else
            ++badCount;
        Instance newInstance = new Instance(instance);
        newInstance.setDataset(instance.dataset());
        newInstances.add(newInstance);
    }
    goodCount *= attrOptions.size() - 1;
    badCount *= attrOptions.size() - 1;
    for (Iterator it = newInstances.iterator(); it.hasNext();) {
        Instance instance = (Instance) it.next();
        /* if (//!instance.stringValue(instance.classIndex()).equals(classValue) ||
               !instance.stringValue(attribute).equals(attrValue))
               continue; */
        String classValue = instance.stringValue(instance.classIndex());
        FastVector newOptions = new FastVector();
        newOptions.appendElements(attrOptions);
        newOptions.removeElementAt(newOptions.indexOf(instance.stringValue(attribute)));
        //int index = Math.abs(LearnedOrder.randomNumGen.nextInt()) % newOptions.size();
        int index = 0;
        while (index < newOptions.size()) {
            instance.setValue(attribute, attrOptions.indexOf(newOptions.elementAt(index)));
            String value = classify(instance);
            if (goodClusters.contains(classValue)) {
                if (goodClusters.contains(value))
                    --goodCount;
            } else if (!goodClusters.contains(classValue)) {
                if (!goodClusters.contains(value))
                    --badCount;
            }
            ++index;
        }
        //if (value.equals(classValue)) --count;
    }
    count = goodCount - badCount;
    count /= attrOptions.size() - 1;
    double importance = ((double) count) / newInstances.size();
    if (Double.isNaN(importance))
        return 0;
    return importance;
}
From source file:net.sf.bddbddb.OrderClassifier.java
License:LGPL
public double vote(weka.core.Attribute attribute, String attrValue, String classValue) {
    int count = 0;
    int numOrders = 0;
    for (Iterator it = orders.iterator(); it.hasNext();) {
        Instance instance = (Instance) it.next();
        if (!instance.stringValue(instance.classIndex()).equals(classValue))
            continue;
        ++numOrders;
        if (instance.stringValue(attribute).equals(attrValue))
            ++count;
    }
    return ((double) count) / numOrders;
}
From source file:net.sf.jclal.activelearning.oracle.SimulatedOracle.java
License:Open Source License
public void showSingleLabelInstance(Instance instance) {
    System.out.println("\nInstance selected: " + instance.toString());
    System.out.println("Class: " + instance.stringValue(instance.classIndex()));
}
From source file:net.sf.jclal.activelearning.oracle.SimulatedOracle.java
License:Open Source License
public void showMultiLabelInstance(Instance instance, int[] labels) {
    System.out.println("\nInstance selected: " + instance.toString());
    System.out.println("Labels: ");
    for (int l = 0; l < labels.length; l++) {
        System.out.println(l + ":" + instance.stringValue(labels[l]));
    }
}
From source file:nl.bioinf.roelen.thema11.classifier_tools.ClassifierUser.java
License:Open Source License
/**
 * Use the classifier to test the sequences in a GenBank or FASTA file for boundaries.
 * @param fileLocation the location of the GenBank or FASTA file
 * @param classifier the classifier to use
 * @return the classified nucleotides with their boundary likelihoods
 */
public static ArrayList<ClassifiedNucleotide> getPossibleBoundaries(String fileLocation, Classifier classifier) {
    ArrayList<Gene> genesFromFile = new ArrayList<>();
    ArrayList<ClassifiedNucleotide> classifiedNucleotides = new ArrayList<>();
    // read from fasta
    if (fileLocation.toUpperCase().endsWith(".FASTA") || fileLocation.toUpperCase().endsWith(".FA")
            || fileLocation.toUpperCase().endsWith(".FAN")) {
        genesFromFile.addAll(readFasta(fileLocation));
    }
    // read from genbank
    else if (fileLocation.toUpperCase().endsWith(".GENBANK") || fileLocation.toUpperCase().endsWith(".GB")) {
        GenBankReader gbr = new GenBankReader();
        gbr.readFile(fileLocation);
        GenbankResult gbresult = gbr.getResult();
        genesFromFile = gbresult.getGenes();
    }
    // get the test data
    HashMap<String, ArrayList<IntronExonBoundaryTesterResult>> geneTestResults;
    geneTestResults = TestGenes.testForIntronExonBoundaries(genesFromFile, 1);
    ArrayList<InstanceToClassify> instanceNucs = new ArrayList<>();
    try {
        // write our results to a temporary file
        File tempArrf = File.createTempFile("realSet", ".arff");
        ArffWriter.write(tempArrf.getAbsolutePath(), geneTestResults, null);
        // get data
        ConverterUtils.DataSource source = new ConverterUtils.DataSource(tempArrf.getAbsolutePath());
        // SET DATA AND OPTIONS
        Instances data = source.getDataSet();
        for (int i = 0; i < data.numInstances(); i++) {
            Instance in = data.instance(i);
            // get the name of the gene or sequence tested
            String nameOfInstance = in.stringValue(in.numAttributes() - 3);
            // get the tested position
            int testedPosition = (int) in.value(in.numAttributes() - 2);
            // set the class as missing, because we want to find it
            in.setMissing(in.numAttributes() - 1);
            Instance instanceNoExtras = new Instance(in);
            // delete the name and position, they are irrelevant for classifying
            instanceNoExtras.deleteAttributeAt(instanceNoExtras.numAttributes() - 2);
            instanceNoExtras.deleteAttributeAt(instanceNoExtras.numAttributes() - 2);
            InstanceToClassify ic = new InstanceToClassify(instanceNoExtras, testedPosition, nameOfInstance);
            instanceNucs.add(ic);
        }
        for (InstanceToClassify ic : instanceNucs) {
            Instance in = ic.getInstance();
            in.setDataset(data);
            data.setClassIndex(data.numAttributes() - 1);
            // classify our instance
            classifier.classifyInstance(in);
            // save the likelihood that the position is, or is not, a boundary
            double likelyhoodBoundary = classifier.distributionForInstance(in)[0];
            double likelyhoodNotBoundary = classifier.distributionForInstance(in)[1];
            // create a classified nucleotide and give it the added data
            ClassifiedNucleotide cn = new ClassifiedNucleotide(likelyhoodBoundary, likelyhoodNotBoundary,
                    ic.getName(), ic.getPosition());
            classifiedNucleotides.add(cn);
        }
    } catch (IOException ex) {
        Logger.getLogger(ClassifierUser.class.getName()).log(Level.SEVERE, null, ex);
    } catch (Exception ex) {
        Logger.getLogger(ClassifierUser.class.getName()).log(Level.SEVERE, null, ex);
    }
    return classifiedNucleotides;
}
From source file:org.deidentifier.arx.ARFF2ARX.java
License:Open Source License
/**
 * Converts a Weka instance into a row of string values.
 * @param instance the instance to convert
 * @return the attribute values of the instance as strings
 */
protected String[] convertRow(Instance instance) {
    String[] row = new String[instance.numAttributes()];
    for (int i = 0; i < instance.numAttributes(); i++) {
        if (instance.attribute(i).type() == Attribute.NOMINAL
                || instance.attribute(i).type() == Attribute.STRING) {
            row[i] = instance.stringValue(i);
        } else {
            row[i] = String.valueOf((int) instance.value(i));
        }
    }
    return row;
}
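A caveat in convertRow above: String.valueOf((int) instance.value(i)) silently truncates fractional numeric values. A minimal variant, using a hypothetical helper name that is not part of the ARX source, which keeps decimals intact:

// Hypothetical variant of convertRow: preserve fractional numeric values
// instead of truncating them to int.
protected String[] convertRowKeepingDecimals(Instance instance) {
    String[] row = new String[instance.numAttributes()];
    for (int i = 0; i < instance.numAttributes(); i++) {
        int type = instance.attribute(i).type();
        if (type == Attribute.NOMINAL || type == Attribute.STRING) {
            row[i] = instance.stringValue(i); // the label text, not the internal index
        } else {
            double v = instance.value(i);
            // Emit an integer literal only when the value really is integral.
            row[i] = (v == Math.rint(v)) ? String.valueOf((long) v) : String.valueOf(v);
        }
    }
    return row;
}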
From source file:org.opentox.toxotis.factory.DatasetFactory.java
License:Open Source License
/**
 * Create a {@link DataEntry data entry} from a single instance.
 * @param instance the instance to convert
 * @return
 *      A Data Entry that corresponds to the provided instance.
 * @throws ToxOtisException
 */
public DataEntry createDataEntry(Instance instance) throws ToxOtisException {
    Enumeration attributes = instance.enumerateAttributes();
    DataEntry de = new DataEntry();
    try {
        while (attributes.hasMoreElements()) {
            Attribute attribute = (Attribute) attributes.nextElement();
            if (attribute.name().equals(Dataset.COMPOUND_URI) || attribute.name().equals("URI")) {
                de.setConformer(new Compound(new VRI(instance.stringValue(attribute))));
            } else {
                FeatureValue fv = new FeatureValue();
                Feature feature = new Feature(new VRI(attribute.name()));
                LiteralValue value = null;
                if (attribute.isNumeric()) {
                    value = new LiteralValue<Double>(instance.value(attribute), XSDDatatype.XSDdouble);
                    feature.getOntologicalClasses().add(OTClasses.numericFeature());
                } else if (attribute.isString() || attribute.isDate()) {
                    value = new LiteralValue<String>(instance.stringValue(attribute), XSDDatatype.XSDstring);
                    feature.getOntologicalClasses().add(OTClasses.stringFeature());
                } else if (attribute.isNominal()) {
                    value = new LiteralValue<String>(instance.stringValue(attribute), XSDDatatype.XSDstring);
                    Enumeration nominalValues = attribute.enumerateValues();
                    feature.getOntologicalClasses().add(OTClasses.nominalFeature());
                    while (nominalValues.hasMoreElements()) {
                        String nomValue = (String) nominalValues.nextElement();
                        feature.getAdmissibleValues()
                                .add(new LiteralValue<String>(nomValue, XSDDatatype.XSDstring));
                    }
                }
                fv.setFeature(feature);
                fv.setValue(value);
                de.addFeatureValue(fv);
            }
        }
    } catch (URISyntaxException ex) {
        throw new ToxOtisException(ex);
    }
    return de;
}
From source file:preprocess.StringToWordVector.java
License:Open Source License
/**
 * Determines the dictionary.
 */
private void determineDictionary() {
    // initialize stopwords
    Stopwords stopwords = new Stopwords();
    if (getUseStoplist()) {
        try {
            if (getStopwords().exists() && !getStopwords().isDirectory())
                stopwords.read(getStopwords());
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    // Operate on a per-class basis if class attribute is set
    int classInd = getInputFormat().classIndex();
    int values = 1;
    if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
        values = getInputFormat().attribute(classInd).numValues();
    }

    //TreeMap dictionaryArr [] = new TreeMap[values];
    TreeMap[] dictionaryArr = new TreeMap[values];
    for (int i = 0; i < values; i++) {
        dictionaryArr[i] = new TreeMap();
    }

    // Make sure we know which fields to convert
    determineSelectedRange();

    // Tokenize all training text into an orderedMap of "words".
    long pruneRate = Math.round((m_PeriodicPruningRate / 100.0) * getInputFormat().numInstances());
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
        Instance instance = getInputFormat().instance(i);
        int vInd = 0;
        if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
            vInd = (int) instance.classValue();
        }

        // Iterate through all relevant string attributes of the current instance
        Hashtable h = new Hashtable();
        for (int j = 0; j < instance.numAttributes(); j++) {
            if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {
                // Get tokenizer
                m_Tokenizer.tokenize(instance.stringValue(j));

                // Iterate through tokens, perform stemming, and remove stopwords (if required)
                while (m_Tokenizer.hasMoreElements()) {
                    String word = ((String) m_Tokenizer.nextElement()).intern();
                    if (this.m_lowerCaseTokens == true)
                        word = word.toLowerCase();
                    word = m_Stemmer.stem(word);
                    if (this.m_useStoplist == true)
                        if (stopwords.is(word))
                            continue;
                    if (!(h.contains(word)))
                        h.put(word, new Integer(0));
                    Count count = (Count) dictionaryArr[vInd].get(word);
                    if (count == null) {
                        dictionaryArr[vInd].put(word, new Count(1));
                    } else {
                        count.count++;
                    }
                }
            }
        }

        // updating the docCount for the words that have occurred in this instance (document)
        Enumeration e = h.keys();
        while (e.hasMoreElements()) {
            String word = (String) e.nextElement();
            Count c = (Count) dictionaryArr[vInd].get(word);
            if (c != null) {
                c.docCount++;
            } else
                System.err.println("Warning: A word should definitely be in the "
                        + "dictionary. Please check the code");
        }

        if (pruneRate > 0) {
            if (i % pruneRate == 0 && i > 0) {
                for (int z = 0; z < values; z++) {
                    Vector d = new Vector(1000);
                    Iterator it = dictionaryArr[z].keySet().iterator();
                    while (it.hasNext()) {
                        String word = (String) it.next();
                        Count count = (Count) dictionaryArr[z].get(word);
                        if (count.count <= 1) {
                            d.add(word);
                        }
                    }
                    Iterator iter = d.iterator();
                    while (iter.hasNext()) {
                        String word = (String) iter.next();
                        dictionaryArr[z].remove(word);
                    }
                }
            }
        }
    }

    // Figure out the minimum required word frequency
    int totalsize = 0;
    int prune[] = new int[values];
    for (int z = 0; z < values; z++) {
        totalsize += dictionaryArr[z].size();

        int array[] = new int[dictionaryArr[z].size()];
        int pos = 0;
        Iterator it = dictionaryArr[z].keySet().iterator();
        while (it.hasNext()) {
            String word = (String) it.next();
            Count count = (Count) dictionaryArr[z].get(word);
            array[pos] = count.count;
            pos++;
        }

        // sort the array
        sortArray(array);
        if (array.length < m_WordsToKeep) {
            // if there aren't enough words, set the threshold to minFreq
            prune[z] = m_minTermFreq;
        } else {
            // otherwise set it to be at least minFreq
            prune[z] = Math.max(m_minTermFreq, array[array.length - m_WordsToKeep]);
        }
    }

    // Convert the dictionary into an attribute index and create one attribute per word
    FastVector attributes = new FastVector(totalsize + getInputFormat().numAttributes());

    // Add the non-converted attributes
    int classIndex = -1;
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (!m_SelectedRange.isInRange(i)) {
            if (getInputFormat().classIndex() == i) {
                classIndex = attributes.size();
            }
            attributes.addElement(getInputFormat().attribute(i).copy());
        }
    }

    // Add the word vector attributes (eliminating duplicates that occur in multiple classes)
    TreeMap newDictionary = new TreeMap();
    int index = attributes.size();
    for (int z = 0; z < values; z++) {
        Iterator it = dictionaryArr[z].keySet().iterator();
        while (it.hasNext()) {
            String word = (String) it.next();
            Count count = (Count) dictionaryArr[z].get(word);
            if (count.count >= prune[z]) {
                if (newDictionary.get(word) == null) {
                    newDictionary.put(word, new Integer(index++));
                    attributes.addElement(new Attribute(m_Prefix + word));
                }
            }
        }
    }

    // Compute document frequencies
    m_DocsCounts = new int[attributes.size()];
    Iterator it = newDictionary.keySet().iterator();
    while (it.hasNext()) {
        String word = (String) it.next();
        int idx = ((Integer) newDictionary.get(word)).intValue();
        int docsCount = 0;
        for (int j = 0; j < values; j++) {
            Count c = (Count) dictionaryArr[j].get(word);
            if (c != null)
                docsCount += c.docCount;
        }
        m_DocsCounts[idx] = docsCount;
    }

    // Trim vector and set instance variables
    attributes.trimToSize();
    m_Dictionary = newDictionary;
    m_NumInstances = getInputFormat().numInstances();

    // Set the filter's output format
    Instances outputFormat = new Instances(getInputFormat().relationName(), attributes, 0);
    outputFormat.setClassIndex(classIndex);
    setOutputFormat(outputFormat);
}
From source file:preprocess.StringToWordVector.java
License:Open Source License
/**
 * Converts the instance w/o normalization.
 *
 * @param instance the instance to convert
 * @param v the vector the converted (sparse) instance is appended to
 * @return the number of attributes copied directly, i.e. the index at which the word attributes start
 */
private int convertInstancewoDocNorm(Instance instance, FastVector v) {
    // Convert the instance into a sorted set of indexes
    TreeMap contained = new TreeMap();

    // Copy all non-converted attributes from input to output
    int firstCopy = 0;
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (!m_SelectedRange.isInRange(i)) {
            if (getInputFormat().attribute(i).type() != Attribute.STRING) {
                // Add simple nominal and numeric attributes directly
                if (instance.value(i) != 0.0) {
                    contained.put(new Integer(firstCopy), new Double(instance.value(i)));
                }
            } else {
                if (instance.isMissing(i)) {
                    contained.put(new Integer(firstCopy), new Double(Instance.missingValue()));
                } else {
                    // If this is a string attribute, we have to first add
                    // this value to the range of possible values, then add
                    // its new internal index.
                    if (outputFormatPeek().attribute(firstCopy).numValues() == 0) {
                        // Note that the first string value in a
                        // SparseInstance doesn't get printed.
                        outputFormatPeek().attribute(firstCopy)
                                .addStringValue("Hack to defeat SparseInstance bug");
                    }
                    int newIndex = outputFormatPeek().attribute(firstCopy)
                            .addStringValue(instance.stringValue(i));
                    contained.put(new Integer(firstCopy), new Double(newIndex));
                }
            }
            firstCopy++;
        }
    }

    for (int j = 0; j < instance.numAttributes(); j++) {
        //if ((getInputFormat().attribute(j).type() == Attribute.STRING)
        if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {
            m_Tokenizer.tokenize(instance.stringValue(j));
            while (m_Tokenizer.hasMoreElements()) {
                String word = (String) m_Tokenizer.nextElement();
                if (this.m_lowerCaseTokens == true)
                    word = word.toLowerCase();
                word = m_Stemmer.stem(word);
                Integer index = (Integer) m_Dictionary.get(word);
                if (index != null) {
                    if (m_OutputCounts) {
                        // Separate if here rather than two lines down to avoid hashtable lookup
                        Double count = (Double) contained.get(index);
                        if (count != null) {
                            contained.put(index, new Double(count.doubleValue() + 1.0));
                        } else {
                            contained.put(index, new Double(1));
                        }
                    } else {
                        contained.put(index, new Double(1));
                    }
                }
            }
        }
    }

    // Doing TFTransform
    if (m_TFTransform == true) {
        Iterator it = contained.keySet().iterator();
        for (int i = 0; it.hasNext(); i++) {
            Integer index = (Integer) it.next();
            if (index.intValue() >= firstCopy) {
                double val = ((Double) contained.get(index)).doubleValue();
                val = Math.log(val + 1);
                contained.put(index, new Double(val));
            }
        }
    }

    // Doing IDFTransform
    if (m_IDFTransform == true) {
        Iterator it = contained.keySet().iterator();
        for (int i = 0; it.hasNext(); i++) {
            Integer index = (Integer) it.next();
            if (index.intValue() >= firstCopy) {
                double val = ((Double) contained.get(index)).doubleValue();
                val = val * Math.log(m_NumInstances / (double) m_DocsCounts[index.intValue()]);
                contained.put(index, new Double(val));
            }
        }
    }

    // Convert the set to structures needed to create a sparse instance.
    double[] values = new double[contained.size()];
    int[] indices = new int[contained.size()];
    Iterator it = contained.keySet().iterator();
    for (int i = 0; it.hasNext(); i++) {
        Integer index = (Integer) it.next();
        Double value = (Double) contained.get(index);
        values[i] = value.doubleValue();
        indices[i] = index.intValue();
    }
    Instance inst = new SparseInstance(instance.weight(), values, indices, outputFormatPeek().numAttributes());
    inst.setDataset(outputFormatPeek());
    v.addElement(inst);
    return firstCopy;
}
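For reference, when both m_TFTransform and m_IDFTransform are enabled, the two loops above compose into a standard TF-IDF weight. A small sketch of that arithmetic, as a hypothetical helper (natural logarithms, as Math.log uses):

// Sketch of the weighting convertInstancewoDocNorm applies to each word attribute
// when both m_TFTransform and m_IDFTransform are enabled:
//   tf    = log(rawCount + 1)
//   tfidf = tf * log(numInstances / docCount)
static double tfIdf(double rawCount, int numInstances, int docCount) {
    double tf = Math.log(rawCount + 1);
    return tf * Math.log(numInstances / (double) docCount);
}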
From source file:probcog.bayesnets.learning.CPTLearner.java
License:Open Source License
/**
 * Learns all the examples in the given instances; each instance represents one example.
 * All the random variables (nodes) in the network need to be found in each instance
 * as columns that are named accordingly, i.e. for each random variable, there must be
 * an attribute with a matching name in the instance.
 * @param instances the instances
 * @throws Exception if the result set is empty
 * @throws SQLException particularly if there is no matching column for one of the node names
 */
public void learn(Instances instances) throws Exception {
    if (!initialized)
        init();
    // if it's an empty result set, throw exception
    if (instances.numInstances() == 0)
        throw new Exception("empty result set!");
    BeliefNode[] nodes = bn.bn.getNodes();
    int numAttributes = instances.numAttributes();
    // Now we can get many more nodes than attributes
    //if (numAttributes != nodes.length)
    //    throw new Exception("Result does not contain suitable data (attribute count = "
    //            + numAttributes + "; node count = " + nodes.length + ")");

    // map node indices to attribute index
    int[] nodeIdx2colIdx = new int[nodes.length];
    Arrays.fill(nodeIdx2colIdx, -1);
    for (int i = 0; i < numAttributes; i++) {
        Set<String> nodeNames = bn.getNodeNamesForAttribute(instances.attribute(i).name());
        //logger.debug("Nodes for attribute " + instances.attribute(i).name() + ": " + nodeNames);
        if (nodeNames == null)
            continue;
        for (String nodeName : nodeNames) {
            int node_idx = bn.getNodeIndex(nodeName);
            if (node_idx == -1)
                throw new Exception("Unknown node referenced in result set: " + instances.attribute(i).name());
            nodeIdx2colIdx[node_idx] = i;
        }
    }

    // gather data, iterating over the result set
    int[] domainIndices = new int[nodes.length];
    @SuppressWarnings("unchecked")
    Enumeration<Instance> instanceEnum = instances.enumerateInstances();
    while (instanceEnum.hasMoreElements()) {
        Instance instance = instanceEnum.nextElement();
        // for each row...
        // - get the indices into the domains of each node that correspond to the
        //   current row of data (sorted in the same order as the nodes are ordered
        //   in the BeliefNetwork)
        for (int node_idx = 0; node_idx < nodes.length; node_idx++) {
            int domain_idx;
            if (clusterers[node_idx] == null) {
                Discrete domain = (Discrete) nodes[node_idx].getDomain();
                String strValue;
                if (domain instanceof Discretized) {
                    // If we have a discretized domain we discretize first...
                    int colIdx = nodeIdx2colIdx[node_idx];
                    if (colIdx < 0) {
                        throw new Exception("No attribute specified for " + bn.bn.getNodes()[node_idx].getName());
                    }
                    double value = instance.value(colIdx);
                    strValue = ((Discretized) domain).getNameFromContinuous(value);
                } else {
                    int colIdx = nodeIdx2colIdx[node_idx];
                    if (colIdx < 0) {
                        throw new Exception("No attribute specified for " + bn.bn.getNodes()[node_idx].getName());
                    }
                    strValue = instance.stringValue(nodeIdx2colIdx[node_idx]);
                }
                domain_idx = domain.findName(strValue);
                if (domain_idx == -1) {
                    throw new Exception(strValue + " not found in domain of " + nodes[node_idx].getName());
                }
            } else {
                Instance inst = new Instance(1);
                inst.setValue(0, instance.value(nodeIdx2colIdx[node_idx]));
                domain_idx = clusterers[node_idx].clusterInstance(inst);
            }
            domainIndices[node_idx] = domain_idx;
        }
        // - update each node's CPT
        for (int i = 0; i < nodes.length; i++) {
            counters[i].count(domainIndices);
        }
    }
}