List of usage examples for weka.core.Instance.isMissing
public boolean isMissing(int attIndex);
public boolean isMissing(Attribute att);

Tests whether a specific value is "missing"; the attribute can be addressed either by its index or by its Attribute object.
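A minimal sketch exercising both overloads, assuming the Weka 3.6-era API used by the examples below (where Instance is still a concrete class); the attribute names are illustrative:

import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;

public class IsMissingDemo {
    public static void main(String[] args) {
        // two numeric attributes
        FastVector attrs = new FastVector();
        attrs.addElement(new Attribute("temperature"));
        attrs.addElement(new Attribute("humidity"));
        Instances data = new Instances("demo", attrs, 1);

        Instance inst = new Instance(2); // all values start out missing
        inst.setDataset(data);
        inst.setValue(0, 21.5);          // humidity (index 1) is left missing

        System.out.println(inst.isMissing(0));                  // false
        System.out.println(inst.isMissing(data.attribute(1)));  // true
    }
}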
From source file:org.wkwk.classifier.MyC45.java
@Override
public void buildClassifier(Instances data) throws Exception {
    getCapabilities().testWithFail(data);
    // Replace missing values in every training instance before building the tree.
    for (int i = 0; i < data.numAttributes(); i++) {
        Attribute attr = data.attribute(i);
        for (int j = 0; j < data.numInstances(); j++) {
            Instance instance = data.instance(j);
            if (instance.isMissing(attr)) {
                instance.setValue(attr, fillMissingValue(data, attr));
            }
        }
    }
    data.deleteWithMissingClass();
    makeTree(data);
}
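The fillMissingValue helper is not shown in this excerpt. One plausible sketch, assuming the usual mean/mode imputation via Weka's built-in statistics (an illustration, not the original author's code):

private double fillMissingValue(Instances data, Attribute attr) {
    // Instances.meanOrMode returns the mean for numeric attributes and the
    // index of the most frequent value for nominal ones, ignoring missing
    // values in the computation.
    return data.meanOrMode(attr);
}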
From source file:predictors.HelixIndexer.java
License:Open Source License
/**
 * Predicts transmembrane residues for a given protein.
 *
 * @param protein
 */
public void predict(Protein protein) {
    if (protein == null || protein.getPssm() == null) {
        return;
    }
    Pssm pssm = protein.getPssm();
    int length = pssm.getLength();
    int[] scoresSol = new int[length];
    int[] scoresTmh = new int[length];
    int[] scoresSig = new int[length];
    this.globalComposition(pssm);
    // slide window along the sequence
    for (int i = 0; i < length; ++i) {
        try {
            Instance window = this.buildInstance(pssm, i);
            window.isMissing((Attribute) this.attributes.get(this.attributes.size() - 1));
            window.setDataset(this.dataset);
            double[] probabilities = this.classifier.distributionForInstance(window);
            scoresSol[i] = (int) (1000 * probabilities[HelixIndexer.indexNotTmh]);
            scoresTmh[i] = (int) (1000 * probabilities[HelixIndexer.indexTmh]);
            // signal-peptide scores are only meaningful near the N-terminus
            scoresSig[i] = (i < 40) ? (int) (1000 * probabilities[HelixIndexer.indexSignal]) : 0;
        } catch (Exception e) {
            ErrorUtils.printError(HelixIndexer.class, "Prediction failed for " + protein.getHeader(), e);
            return;
        }
    }
    // save scores into the protein
    protein.setSolRaw(scoresSol);
    protein.setTmhRaw(scoresTmh);
    protein.setSigRaw(scoresSig);
}
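Note: in this and the following predictor examples the boolean returned by isMissing is discarded, so the call itself has no effect on the instance. If the intent is to blank the class attribute before calling distributionForInstance, setMissing would be the mutating counterpart.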
From source file:predictors.HelixPredictor.java
License:Open Source License
/**
 * Adjusts predicted transmembrane helices within a given protein.
 *
 * @param protein
 * @param cutoff
 * @return
 */
private boolean adjustTMHs(Protein protein, double cutoff) {
    boolean adjust = false;
    Pssm pssm = protein.getPssm();
    char[] structure = protein.getPrediction();
    int[] segmentRaw = protein.getSegmentRaw();
    for (int i = 0; i < structure.length; ++i) {
        try {
            if (Mappings.ssToInt(structure[i]) == Mappings.indexTmh) {
                int start = i;
                char type = structure[i];
                // go to end of transmembrane helix
                while (i < structure.length && structure[i] == type) {
                    ++i;
                }
                --i;
                int end = i;
                Instance window = this.buildInstance(pssm, start, end);
                window.isMissing((Attribute) this.attributes.get(this.attributes.size() - 1));
                window.setDataset(this.dataset);
                double[] probabilities = this.classifier.distributionForInstance(window);
                double bestProb = probabilities[Mappings.indexTmh];
                int bestStart = -1;
                int bestEnd = -1;
                // shift TMH start/end around and find the best position
                for (int newStart = start - Globals.PREDICTOR_MAX_SHIFT; newStart <= start + Globals.PREDICTOR_MAX_SHIFT; ++newStart) {
                    if (newStart < 0) {
                        continue;
                    }
                    for (int newEnd = end - Globals.PREDICTOR_MAX_SHIFT; newEnd <= end + Globals.PREDICTOR_MAX_SHIFT; ++newEnd) {
                        if (newEnd >= structure.length) {
                            break;
                        }
                        window = this.buildInstance(pssm, newStart, newEnd);
                        window.isMissing((Attribute) this.attributes.get(this.attributes.size() - 1));
                        window.setDataset(this.dataset);
                        probabilities = this.classifier.distributionForInstance(window);
                        if (probabilities[Mappings.indexTmh] > bestProb) {
                            bestProb = probabilities[Mappings.indexTmh];
                            bestStart = newStart;
                            bestEnd = newEnd;
                        }
                    }
                }
                // adjust the TMH!
                if (bestProb < cutoff) {
                    for (int j = start; j <= end; ++j) {
                        structure[j] = Mappings.intToSs(Mappings.indexNotTmh);
                        segmentRaw[j] = 0;
                    }
                } else if (bestStart != -1 && bestEnd != -1) {
                    start = Math.min(start, bestStart);
                    end = Math.max(end, bestEnd);
                    for (int j = start; j <= end; ++j) {
                        if (j >= bestStart && j <= bestEnd) {
                            structure[j] = Mappings.intToSs(Mappings.indexTmh);
                            segmentRaw[j] = (int) (1000 * bestProb);
                        } else {
                            structure[j] = Mappings.intToSs(Mappings.indexNotTmh);
                            segmentRaw[j] = 0;
                        }
                    }
                    adjust = true;
                    i = end;
                } else {
                    for (int j = start; j <= end; ++j) {
                        segmentRaw[j] = (int) (1000 * bestProb);
                    }
                }
            } else {
                segmentRaw[i] = 0;
            }
        } catch (Exception e) {
            ErrorUtils.printError(HelixPredictor.class, "Prediction failed for " + protein.getHeader(), e);
            return false;
        }
    }
    return adjust;
}
From source file:predictors.HelixPredictor.java
License:Open Source License
/**
 * Splits predicted transmembrane helices within a given protein.
 *
 * @param protein
 * @param cutoff
 * @return
 */
private boolean splitTMHs(Protein protein, double cutoff) {
    boolean split = false;
    Pssm pssm = protein.getPssm();
    char[] structure = protein.getPrediction();
    int[] segmentRaw = protein.getSegmentRaw();
    int minLength = 2 * Globals.PREDICTOR_HELIX_MIN_SIZE + Globals.PREDICTOR_GAP_MIN_SIZE;
    for (int i = 0; i < structure.length; ++i) {
        try {
            if (Mappings.ssToInt(structure[i]) == Mappings.indexTmh) {
                int start = i;
                char type = structure[i];
                // go to end of transmembrane helix
                while (i < structure.length && structure[i] == type) {
                    ++i;
                }
                --i;
                int end = i;
                // if the TMH is too short, jump to the next one
                if (end - start + 1 < minLength) {
                    continue;
                }
                Instance window = this.buildInstance(pssm, start, end);
                window.isMissing((Attribute) this.attributes.get(this.attributes.size() - 1));
                window.setDataset(this.dataset);
                double[] probabilities = this.classifier.distributionForInstance(window);
                double bestProb = probabilities[Mappings.indexTmh];
                double bestProb1 = 0;
                double bestProb2 = 0;
                int bestBreak1 = -1;
                int bestBreak2 = -1;
                // insert a variable gap into the TMH and find the best constellation
                for (int break1 = start + (Globals.PREDICTOR_HELIX_MIN_SIZE - 1); break1 < end; ++break1) {
                    for (int break2 = break1 + Globals.PREDICTOR_GAP_MIN_SIZE; break2 < end - (Globals.PREDICTOR_HELIX_MIN_SIZE - 1); ++break2) {
                        if (break2 == break1) {
                            continue;
                        }
                        Instance window1 = this.buildInstance(pssm, start, break1);
                        Instance window2 = this.buildInstance(pssm, break2 + 1, end);
                        window1.isMissing((Attribute) this.attributes.get(this.attributes.size() - 1));
                        window2.isMissing((Attribute) this.attributes.get(this.attributes.size() - 1));
                        window1.setDataset(this.dataset);
                        window2.setDataset(this.dataset);
                        double prob1 = this.classifier.distributionForInstance(window1)[Mappings.indexTmh];
                        double prob2 = this.classifier.distributionForInstance(window2)[Mappings.indexTmh];
                        if (prob1 >= cutoff && prob2 >= cutoff) {
                            double avgProb = (prob1 + prob2) / 2.0;
                            if (avgProb > bestProb) {
                                bestProb = avgProb;
                                bestProb1 = prob1;
                                bestProb2 = prob2;
                                bestBreak1 = break1;
                                bestBreak2 = break2;
                            }
                        }
                    }
                }
                // split the TMH!
                if (bestBreak1 != -1 && bestBreak2 != -1) {
                    for (int j = start; j <= bestBreak1; ++j) {
                        structure[j] = Mappings.intToSs(Mappings.indexTmh);
                        segmentRaw[j] = (int) (1000 * bestProb1);
                    }
                    for (int j = bestBreak1 + 1; j <= bestBreak2; ++j) {
                        structure[j] = Mappings.intToSs(Mappings.indexNotTmh);
                        segmentRaw[j] = 0;
                    }
                    for (int j = bestBreak2 + 1; j <= end; ++j) {
                        structure[j] = Mappings.intToSs(Mappings.indexTmh);
                        segmentRaw[j] = (int) (1000 * bestProb2);
                    }
                    split = true;
                }
            }
        } catch (Exception e) {
            ErrorUtils.printError(HelixPredictor.class, "Prediction failed for " + protein.getHeader(), e);
            return false;
        }
    }
    return split;
}
From source file:predictors.HelixPredictor.java
License:Open Source License
/**
 * Analyzes a given segment and returns the TMH probability.
 *
 * @param pssm
 * @param start
 * @param end
 * @return
 */
public double getSegmentProbability(Pssm pssm, int start, int end) {
    double tmhProbability = -1;
    try {
        Instance window = this.buildInstance(pssm, start, end);
        window.isMissing((Attribute) this.attributes.get(this.attributes.size() - 1));
        window.setDataset(this.dataset);
        tmhProbability = this.classifier.distributionForInstance(window)[Mappings.indexTmh];
    } catch (Exception e) {
        ErrorUtils.printError(HelixPredictor.class, "Prediction failed for segment (" + start + "-" + end + ")", e);
        return -1.0;
    }
    return tmhProbability;
}
From source file:predictors.TopologyPredictor.java
License:Open Source License
/**
 * Predicts the N-terminal topology for a given protein.
 *
 * @param protein
 * @param cutoff
 */
public void predict(Protein protein, double cutoff) {
    if (protein == null) {
        return;
    }
    if (protein.getPssm() == null) {
        return;
    }
    if (protein.getPrediction() == null) {
        return;
    }
    if (!protein.isPredTmp()) {
        return;
    }
    Pssm pssm = protein.getPssm();
    char[] prediction = protein.getPrediction();
    try {
        ArrayList<Segment> solSegments = findSegments(prediction);
        Instance instance = this.buildInstance(pssm, prediction, solSegments, 0);
        instance.isMissing((Attribute) this.attributes.get(this.attributes.size() - 1));
        instance.setDataset(this.dataset);
        double[] probabilities = this.classifier.distributionForInstance(instance);
        char top = Character.UNASSIGNED;
        protein.setTopologyRaw((int) (1000 * probabilities[TopologyPredictor.indexInside]));
        if (!protein.hasPredSigP() && probabilities[TopologyPredictor.indexInside] >= cutoff) {
            top = Mappings.intToTop(Mappings.indexInside);
        } else {
            top = Mappings.intToTop(Mappings.indexOutside);
        }
        for (int i = 0; i < prediction.length; ++i) {
            char type = prediction[i];
            if (Mappings.ssToInt(type) == Mappings.indexNotTmh) {
                prediction[i] = top;
            } else if (Mappings.ssToInt(type) == Mappings.indexTmh) {
                // flip the topology on each membrane crossing
                if (top == Mappings.intToTop(Mappings.indexInside)) {
                    top = Mappings.intToTop(Mappings.indexOutside);
                } else {
                    top = Mappings.intToTop(Mappings.indexInside);
                }
                while (i < prediction.length && type == prediction[i]) {
                    ++i;
                }
                --i;
            }
        }
    } catch (Exception e) {
        ErrorUtils.printError(TopologyPredictor.class, "Prediction failed for " + protein.getHeader(), e);
        return;
    }
}
From source file:preprocess.StringToWordVector.java
License:Open Source License
/**
 * Determines the dictionary.
 */
private void determineDictionary() {
    // initialize stopwords
    Stopwords stopwords = new Stopwords();
    if (getUseStoplist()) {
        try {
            if (getStopwords().exists() && !getStopwords().isDirectory())
                stopwords.read(getStopwords());
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    // Operate on a per-class basis if the class attribute is set
    int classInd = getInputFormat().classIndex();
    int values = 1;
    if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
        values = getInputFormat().attribute(classInd).numValues();
    }

    TreeMap[] dictionaryArr = new TreeMap[values];
    for (int i = 0; i < values; i++) {
        dictionaryArr[i] = new TreeMap();
    }

    // Make sure we know which fields to convert
    determineSelectedRange();

    // Tokenize all training text into an ordered map of "words".
    long pruneRate = Math.round((m_PeriodicPruningRate / 100.0) * getInputFormat().numInstances());
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
        Instance instance = getInputFormat().instance(i);
        int vInd = 0;
        if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
            vInd = (int) instance.classValue();
        }

        // Iterate through all relevant string attributes of the current instance
        Hashtable h = new Hashtable();
        for (int j = 0; j < instance.numAttributes(); j++) {
            if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {
                // Get tokenizer
                m_Tokenizer.tokenize(instance.stringValue(j));

                // Iterate through tokens, perform stemming, and remove stopwords (if required)
                while (m_Tokenizer.hasMoreElements()) {
                    String word = ((String) m_Tokenizer.nextElement()).intern();
                    if (this.m_lowerCaseTokens == true)
                        word = word.toLowerCase();
                    word = m_Stemmer.stem(word);
                    if (this.m_useStoplist == true)
                        if (stopwords.is(word))
                            continue;
                    if (!(h.contains(word)))
                        h.put(word, new Integer(0));
                    Count count = (Count) dictionaryArr[vInd].get(word);
                    if (count == null) {
                        dictionaryArr[vInd].put(word, new Count(1));
                    } else {
                        count.count++;
                    }
                }
            }
        }

        // Update the docCount for the words that occurred in this instance (document).
        Enumeration e = h.keys();
        while (e.hasMoreElements()) {
            String word = (String) e.nextElement();
            Count c = (Count) dictionaryArr[vInd].get(word);
            if (c != null) {
                c.docCount++;
            } else {
                System.err.println("Warning: A word should definitely be in the dictionary. Please check the code.");
            }
        }

        // Periodically prune words that have been seen only once
        if (pruneRate > 0) {
            if (i % pruneRate == 0 && i > 0) {
                for (int z = 0; z < values; z++) {
                    Vector d = new Vector(1000);
                    Iterator it = dictionaryArr[z].keySet().iterator();
                    while (it.hasNext()) {
                        String word = (String) it.next();
                        Count count = (Count) dictionaryArr[z].get(word);
                        if (count.count <= 1) {
                            d.add(word);
                        }
                    }
                    Iterator iter = d.iterator();
                    while (iter.hasNext()) {
                        String word = (String) iter.next();
                        dictionaryArr[z].remove(word);
                    }
                }
            }
        }
    }

    // Figure out the minimum required word frequency
    int totalsize = 0;
    int[] prune = new int[values];
    for (int z = 0; z < values; z++) {
        totalsize += dictionaryArr[z].size();
        int[] array = new int[dictionaryArr[z].size()];
        int pos = 0;
        Iterator it = dictionaryArr[z].keySet().iterator();
        while (it.hasNext()) {
            String word = (String) it.next();
            Count count = (Count) dictionaryArr[z].get(word);
            array[pos] = count.count;
            pos++;
        }

        // sort the array
        sortArray(array);
        if (array.length < m_WordsToKeep) {
            // if there aren't enough words, set the threshold to minFreq
            prune[z] = m_minTermFreq;
        } else {
            // otherwise set it to be at least minFreq
            prune[z] = Math.max(m_minTermFreq, array[array.length - m_WordsToKeep]);
        }
    }

    // Convert the dictionary into an attribute index and create one attribute per word
    FastVector attributes = new FastVector(totalsize + getInputFormat().numAttributes());

    // Add the non-converted attributes
    int classIndex = -1;
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (!m_SelectedRange.isInRange(i)) {
            if (getInputFormat().classIndex() == i) {
                classIndex = attributes.size();
            }
            attributes.addElement(getInputFormat().attribute(i).copy());
        }
    }

    // Add the word vector attributes (eliminating duplicates that occur in multiple classes)
    TreeMap newDictionary = new TreeMap();
    int index = attributes.size();
    for (int z = 0; z < values; z++) {
        Iterator it = dictionaryArr[z].keySet().iterator();
        while (it.hasNext()) {
            String word = (String) it.next();
            Count count = (Count) dictionaryArr[z].get(word);
            if (count.count >= prune[z]) {
                if (newDictionary.get(word) == null) {
                    newDictionary.put(word, new Integer(index++));
                    attributes.addElement(new Attribute(m_Prefix + word));
                }
            }
        }
    }

    // Compute document frequencies
    m_DocsCounts = new int[attributes.size()];
    Iterator it = newDictionary.keySet().iterator();
    while (it.hasNext()) {
        String word = (String) it.next();
        int idx = ((Integer) newDictionary.get(word)).intValue();
        int docsCount = 0;
        for (int j = 0; j < values; j++) {
            Count c = (Count) dictionaryArr[j].get(word);
            if (c != null)
                docsCount += c.docCount;
        }
        m_DocsCounts[idx] = docsCount;
    }

    // Trim vector and set instance variables
    attributes.trimToSize();
    m_Dictionary = newDictionary;
    m_NumInstances = getInputFormat().numInstances();

    // Set the filter's output format
    Instances outputFormat = new Instances(getInputFormat().relationName(), attributes, 0);
    outputFormat.setClassIndex(classIndex);
    setOutputFormat(outputFormat);
}
From source file:preprocess.StringToWordVector.java
License:Open Source License
/**
 * Converts the instance w/o normalization.
 *
 * @param instance the instance to convert
 * @param v
 * @return the converted instance
 */
private int convertInstancewoDocNorm(Instance instance, FastVector v) {
    // Convert the instance into a sorted set of indexes
    TreeMap contained = new TreeMap();

    // Copy all non-converted attributes from input to output
    int firstCopy = 0;
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (!m_SelectedRange.isInRange(i)) {
            if (getInputFormat().attribute(i).type() != Attribute.STRING) {
                // Add simple nominal and numeric attributes directly
                if (instance.value(i) != 0.0) {
                    contained.put(new Integer(firstCopy), new Double(instance.value(i)));
                }
            } else {
                if (instance.isMissing(i)) {
                    contained.put(new Integer(firstCopy), new Double(Instance.missingValue()));
                } else {
                    // If this is a string attribute, we have to first add this value
                    // to the range of possible values, then add its new internal index.
                    if (outputFormatPeek().attribute(firstCopy).numValues() == 0) {
                        // Note that the first string value in a SparseInstance doesn't get printed.
                        outputFormatPeek().attribute(firstCopy).addStringValue("Hack to defeat SparseInstance bug");
                    }
                    int newIndex = outputFormatPeek().attribute(firstCopy).addStringValue(instance.stringValue(i));
                    contained.put(new Integer(firstCopy), new Double(newIndex));
                }
            }
            firstCopy++;
        }
    }

    for (int j = 0; j < instance.numAttributes(); j++) {
        if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {
            m_Tokenizer.tokenize(instance.stringValue(j));
            while (m_Tokenizer.hasMoreElements()) {
                String word = (String) m_Tokenizer.nextElement();
                if (this.m_lowerCaseTokens == true)
                    word = word.toLowerCase();
                word = m_Stemmer.stem(word);
                Integer index = (Integer) m_Dictionary.get(word);
                if (index != null) {
                    if (m_OutputCounts) {
                        // Separate if here rather than two lines down to avoid a hashtable lookup
                        Double count = (Double) contained.get(index);
                        if (count != null) {
                            contained.put(index, new Double(count.doubleValue() + 1.0));
                        } else {
                            contained.put(index, new Double(1));
                        }
                    } else {
                        contained.put(index, new Double(1));
                    }
                }
            }
        }
    }

    // Doing TFTransform
    if (m_TFTransform == true) {
        Iterator it = contained.keySet().iterator();
        for (int i = 0; it.hasNext(); i++) {
            Integer index = (Integer) it.next();
            if (index.intValue() >= firstCopy) {
                double val = ((Double) contained.get(index)).doubleValue();
                val = Math.log(val + 1);
                contained.put(index, new Double(val));
            }
        }
    }

    // Doing IDFTransform
    if (m_IDFTransform == true) {
        Iterator it = contained.keySet().iterator();
        for (int i = 0; it.hasNext(); i++) {
            Integer index = (Integer) it.next();
            if (index.intValue() >= firstCopy) {
                double val = ((Double) contained.get(index)).doubleValue();
                val = val * Math.log(m_NumInstances / (double) m_DocsCounts[index.intValue()]);
                contained.put(index, new Double(val));
            }
        }
    }

    // Convert the set to the structures needed to create a sparse instance.
    double[] values = new double[contained.size()];
    int[] indices = new int[contained.size()];
    Iterator it = contained.keySet().iterator();
    for (int i = 0; it.hasNext(); i++) {
        Integer index = (Integer) it.next();
        Double value = (Double) contained.get(index);
        values[i] = value.doubleValue();
        indices[i] = index.intValue();
    }
    Instance inst = new SparseInstance(instance.weight(), values, indices, outputFormatPeek().numAttributes());
    inst.setDataset(outputFormatPeek());
    v.addElement(inst);
    return firstCopy;
}
From source file:tr.gov.ulakbim.jDenetX.streams.filters.AddNoiseFilter.java
License:Open Source License
public Instance nextInstance() {
    Instance inst = (Instance) this.inputStream.nextInstance().copy();
    for (int i = 0; i < inst.numAttributes(); i++) {
        double noiseFrac = i == inst.classIndex() ? this.classNoiseFractionOption.getValue()
                : this.attNoiseFractionOption.getValue();
        if (inst.attribute(i).isNominal()) {
            DoubleVector obs = (DoubleVector) this.attValObservers.get(i);
            if (obs == null) {
                obs = new DoubleVector();
                this.attValObservers.set(i, obs);
            }
            int originalVal = (int) inst.value(i);
            if (!inst.isMissing(i)) {
                obs.addToValue(originalVal, inst.weight());
            }
            if ((this.random.nextDouble() < noiseFrac) && (obs.numNonZeroEntries() > 1)) {
                do {
                    inst.setValue(i, this.random.nextInt(obs.numValues()));
                } while (((int) inst.value(i) == originalVal) || (obs.getValue((int) inst.value(i)) == 0.0));
            }
        } else {
            GaussianEstimator obs = (GaussianEstimator) this.attValObservers.get(i);
            if (obs == null) {
                obs = new GaussianEstimator();
                this.attValObservers.set(i, obs);
            }
            obs.addObservation(inst.value(i), inst.weight());
            inst.setValue(i, inst.value(i) + this.random.nextGaussian() * obs.getStdDev() * noiseFrac);
        }
    }
    return inst;
}
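The isMissing(i) guard matters here: a missing value is stored internally as NaN, and the cast (int) inst.value(i) turns NaN into 0, so without the check missing values would be folded into value index 0 and distort the frequency counts used to draw replacement noise values.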
From source file:wekimini.DataManager.java
public void reAddDeletedTrainingRound() {
    if (deletedTrainingRound != null) {
        for (Instance in : deletedTrainingRound) {
            for (int j = 0; j < numOutputs; j++) {
                if (!in.isMissing(numMetaData + numInputs + j)) {
                    setNumExamplesPerOutput(j, getNumExamplesPerOutput(j) + 1);
                }
            }
            in.setDataset(allInstances);
            allInstances.add(in);
            setHasInstances(true);
            fireStateChanged();
        }
        // Could get interesting behavior if we allow multiple re-adds; don't do this now.
        deletedTrainingRound = null;
    }
}