Usage examples for weka.core.Instance.setDataset(Instances)
public void setDataset(Instances instances);
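Before the project-specific examples below, here is a minimal, self-contained sketch of the pattern they all share: build an Instances object that defines the attributes, attach it to a free-standing Instance with setDataset, and only then query attribute/class metadata or hand the instance to a classifier. The sketch assumes the Weka 3.7+ API (DenseInstance, List-based constructors); the attribute names and values are invented for illustration.

import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;

public class SetDatasetExample {

    public static void main(String[] args) {
        // Define the header: two numeric attributes and a nominal class attribute.
        ArrayList<String> classValues = new ArrayList<String>();
        classValues.add("yes");
        classValues.add("no");

        ArrayList<Attribute> attributes = new ArrayList<Attribute>();
        attributes.add(new Attribute("length"));
        attributes.add(new Attribute("weight"));
        attributes.add(new Attribute("class", classValues));

        Instances header = new Instances("example", attributes, 0);
        header.setClassIndex(header.numAttributes() - 1);

        // A free-standing instance carries values but no attribute metadata.
        double[] values = new double[] { 5.1, 0.7, 0.0 }; // 0.0 -> first class value ("yes")
        Instance instance = new DenseInstance(1.0, values);

        // setDataset gives the instance access to the header's attribute information;
        // it does NOT add the instance to 'header' -- call header.add(instance) for that.
        instance.setDataset(header);

        System.out.println("class attribute: " + instance.classAttribute().name());
        System.out.println("class value:     " + instance.stringValue(instance.classIndex()));
        System.out.println(instance);
    }
}

The examples that follow perform this same handshake before calling classifyInstance or distributionForInstance.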
From source file:org.wikipedia.miner.annotation.Disambiguator.java
License:Open Source License
private Result<Integer> test(Article article, int snippetLength, RelatednessCache rc) throws Exception {
    System.out.println(" - testing " + article);

    Vector<Anchor> unambigAnchors = new Vector<Anchor>();
    Vector<TopicReference> ambigRefs = new Vector<TopicReference>();

    String content = cleaner.getMarkupLinksOnly(article, snippetLength);

    Pattern linkPattern = Pattern.compile("\\[\\[(.*?)\\]\\]");
    Matcher linkMatcher = linkPattern.matcher(content);

    HashSet<Integer> goldStandard = new HashSet<Integer>();
    HashSet<Integer> disambiguatedLinks = new HashSet<Integer>();

    while (linkMatcher.find()) {
        String linkText = content.substring(linkMatcher.start() + 2, linkMatcher.end() - 2);

        String anchorText = linkText;
        String destText = linkText;

        int pos = linkText.lastIndexOf('|');
        if (pos > 0) {
            destText = linkText.substring(0, pos);
            anchorText = linkText.substring(pos + 1);
        }

        destText = Character.toUpperCase(destText.charAt(0)) + destText.substring(1); // capitalize first character

        Anchor anchor = new Anchor(anchorText, tp, wikipedia.getDatabase());
        int senseCount = anchor.getSenses().size();
        Article dest = wikipedia.getArticleByTitle(destText);

        if (senseCount > 0 && dest != null) {
            goldStandard.add(dest.getId());

            if (senseCount == 1 || anchor.getSenses().first().getProbability() >= (1 - minSenseProbability)) {
                unambigAnchors.add(anchor);
                disambiguatedLinks.add(dest.getId());
            } else {
                TopicReference ref = new TopicReference(anchor, dest.getId(), null);
                ambigRefs.add(ref);
            }
        }
    }

    // use all terms as context
    Context context = getContext(article, snippetLength, rc);
    // only use links
    //Context context = new Context(unambigAnchors, rc, maxContextSize);

    // resolve senses
    for (TopicReference ref : ambigRefs) {
        TreeSet<Article> validSenses = new TreeSet<Article>();

        for (Sense sense : ref.getAnchor().getSenses()) {
            if (sense.getProbability() < minSenseProbability)
                break;

            double[] values = new double[attributes.size()];
            values[0] = sense.getProbability();
            values[1] = context.getRelatednessTo(sense);
            values[2] = context.getQuality();
            values[3] = Instance.missingValue();

            Instance i = new Instance(1.0, values);
            i.setDataset(header);

            double prob = classifier.distributionForInstance(i)[0];

            if (prob > 0.5) {
                Article art = new Article(wikipedia.getDatabase(), sense.getId());
                art.setWeight(prob);
                validSenses.add(art);
            }
        }

        // use most valid sense
        if (!validSenses.isEmpty())
            disambiguatedLinks.add(validSenses.first().getId());
    }

    Result<Integer> result = new Result<Integer>(disambiguatedLinks, goldStandard);
    System.out.println(" " + result);

    return result;
}
From source file:org.wikipedia.miner.annotation.weighting.LinkDetector.java
License:Open Source License
/**
 * Weights the given list of topics according to how likely they are to be Wikipedia links if the
 * document they were extracted from was a Wikipedia article.
 *
 * @param topics
 * @return a sorted vector of the same topics, where the weight of each topic is the probability that it is a link
 * @throws Exception
 */
public SortedVector<Topic> getWeightedTopics(Collection<Topic> topics) throws Exception {
    if (classifier == null)
        throw new Exception("You must train the link detector first.");

    SortedVector<Topic> weightedTopics = new SortedVector<Topic>();

    for (Topic topic : topics) {
        double[] values = new double[header.numAttributes()];

        values[0] = topic.getOccurances();
        values[1] = topic.getMaxDisambigConfidence();
        values[2] = topic.getAverageDisambigConfidence();
        values[3] = topic.getRelatednessToOtherTopics();
        values[4] = topic.getMaxLinkProbability();
        values[5] = topic.getAverageLinkProbability();

        if (topic.getGenerality() >= 0)
            values[6] = topic.getGenerality();
        else
            values[6] = Instance.missingValue(); // mark generality as missing

        values[7] = topic.getFirstOccurance();
        values[8] = topic.getLastOccurance();
        values[9] = topic.getSpread();

        //values[10] = topic.getRelatednessToContext();
        values[10] = Instance.missingValue();

        Instance instance = new Instance(1.0, values);
        instance.setDataset(header);

        double prob = classifier.distributionForInstance(instance)[0];
        topic.setWeight(prob);

        weightedTopics.add(topic, false);
    }

    return weightedTopics;
}
From source file:oxis.yologp.YOLogPDescriptor.java
License:Open Source License
/**
 * Predict the LogP.
 */
private void predict() throws Exception {
    Instances instances = buildDataset();

    Map<Object, Object> properties;
    for (DrugStruct drugStruct : listDrug) {
        if (drugStruct.drug.getProperty("flag")) {
            properties = drugStruct.drug.getProperties();

            Instance instance = new DenseInstance(instances.numAttributes()); // 28 + 1024
            instance.setDataset(instances);

            for (Object propKey : properties.keySet()) {
                if (!(propKey.equals("hash") || propKey.equals("flag") || propKey.equals("smiles"))) {
                    try {
                        instance.setValue(instances.attribute(propKey.toString()),
                                Double.parseDouble(properties.get(propKey).toString()));
                    } catch (NullPointerException ex) {
                        Logger.getLogger(YOLogPDescriptor.class.getName()).log(Level.WARNING,
                                "Property not used: {0}", propKey.toString());
                    }
                }
            }

            double predicted = model.classifyInstance(instance);
            predicted = Math.round(predicted * 100) / 100.0d;

            instance.setClassValue(predicted);
            instances.add(instance);

            drugStruct.drug.setProperty("predicted", predicted);
        }
    }
}
From source file:oxis.yologp.YOLogPDescriptor.java
License:Open Source License
/**
 * Train a model, erasing the previous one.
 *
 * @param name name of the model file to save
 */
public void train(String name) throws Exception {
    compute();

    Instances instances = buildDataset();
    model = new RandomForest();

    Map<Object, Object> properties;
    for (DrugStruct drugStruct : listDrug) {
        if (drugStruct.drug.getProperty("flag")) {
            properties = drugStruct.drug.getProperties();

            Instance instance = new DenseInstance(instances.numAttributes()); // 28 + 1024
            instance.setDataset(instances);

            for (Object propKey : properties.keySet()) {
                if (!(propKey.equals("hash") || propKey.equals("flag") || propKey.equals("smiles"))) {
                    try {
                        instance.setValue(instances.attribute(propKey.toString()),
                                Double.parseDouble(properties.get(propKey).toString()));
                    } catch (NullPointerException ex) {
                        Logger.getLogger(YOLogPDescriptor.class.getName()).log(Level.WARNING,
                                "Property not used: {0}", propKey.toString());
                    }
                }
            }

            instance.setClassValue(drugStruct.getLogP());
            instances.add(instance);
        }
    }

    model.setNumFeatures(200);
    model.setNumTrees(400);
    model.setMaxDepth(0);

    model.buildClassifier(instances);

    weka.core.SerializationHelper.write(path + name, model);
}
From source file:pl.nask.hsn2.service.analysis.JSWekaAnalyzer.java
License:Open Source License
public final JSClass classifyString(File file) {
    String ngrams = NGramsCalc.getNgramsForFile(file.getPath(), ngramsLength, ngramsQuantity);

    if (ngrams == null) {
        LOGGER.info("No ngrams extracted, probably JS source is too short");
    } else {
        StringTokenizer st = new StringTokenizer(ngrams, " ");
        if (st.countTokens() >= ngramsQuantity) {
            Instance t = new Instance(2);
            t.setDataset(trainingSet);
            t.setValue(0, ngrams);
            try {
                double dd = fc.classifyInstance(t);
                return JSClass.valueOf(trainingSet.classAttribute().value((int) dd).toUpperCase());
            } catch (Exception e) {
                LOGGER.error(e.getMessage(), e);
            }
        }
    }
    return JSClass.UNCLASSIFIED;
}
From source file:predictors.HelixIndexer.java
License:Open Source License
/**
 * Predicts transmembrane residues for a given protein.
 *
 * @param protein
 */
public void predict(Protein protein) {
    if (protein == null || protein.getPssm() == null) {
        return;
    }

    Pssm pssm = protein.getPssm();
    int length = pssm.getLength();

    int[] scoresSol = new int[length];
    int[] scoresTmh = new int[length];
    int[] scoresSig = new int[length];

    this.globalComposition(pssm);

    // slide window along the sequence
    for (int i = 0; i < length; ++i) {
        try {
            Instance window = this.buildInstance(pssm, i);
            window.isMissing((Attribute) this.attributes.get(this.attributes.size() - 1));
            window.setDataset(this.dataset);

            double[] probabilities = this.classifier.distributionForInstance(window);

            if (i < 40) {
                scoresSol[i] = (int) (1000 * probabilities[HelixIndexer.indexNotTmh]);
                scoresTmh[i] = (int) (1000 * probabilities[HelixIndexer.indexTmh]);
                scoresSig[i] = (int) (1000 * probabilities[HelixIndexer.indexSignal]);
            } else {
                scoresSol[i] = (int) (1000 * probabilities[HelixIndexer.indexNotTmh]);
                scoresTmh[i] = (int) (1000 * probabilities[HelixIndexer.indexTmh]);
                scoresSig[i] = 0;
            }
        } catch (Exception e) {
            ErrorUtils.printError(HelixIndexer.class, "Prediction failed for " + protein.getHeader(), e);
            return;
        }
    }

    // save scores into the protein
    protein.setSolRaw(scoresSol);
    protein.setTmhRaw(scoresTmh);
    protein.setSigRaw(scoresSig);
}
From source file:predictors.HelixIndexer.java
License:Open Source License
/**
 * Analyzes a given window and saves it in the database.
 *
 * @param pssm
 * @param windowCenter
 * @param structure
 */
private void addWindowToDatabase(Pssm pssm, int windowCenter, char[] structure) {
    int index = Mappings.ssToInt(structure[windowCenter]);
    Instance window = this.buildInstance(pssm, windowCenter);

    if (index == Mappings.indexTmh) {
        index = HelixIndexer.indexTmh;
    } else if (index == Mappings.indexSignal) {
        index = HelixIndexer.indexSignal;
    } else {
        index = HelixIndexer.indexNotTmh;
    }

    window.setValue((Attribute) this.attributes.get(this.attributes.size() - 1), index);
    window.setDataset(this.dataset);

    this.dataset.add(window);
}
From source file:predictors.HelixPredictor.java
License:Open Source License
/**
 * Adjusts predicted transmembrane helices within a given protein.
 *
 * @param protein
 * @param cutoff
 * @return
 */
private boolean adjustTMHs(Protein protein, double cutoff) {
    boolean adjust = false;

    Pssm pssm = protein.getPssm();
    char[] structure = protein.getPrediction();
    int[] segmentRaw = protein.getSegmentRaw();

    for (int i = 0; i < structure.length; ++i) {
        try {
            if (Mappings.ssToInt(structure[i]) == Mappings.indexTmh) {
                int start = i;
                char type = structure[i];

                // go to end of transmembrane helix
                while (i < structure.length && structure[i] == type) {
                    ++i;
                }
                --i;
                int end = i;

                Instance window = this.buildInstance(pssm, start, end);
                window.isMissing((Attribute) this.attributes.get(this.attributes.size() - 1));
                window.setDataset(this.dataset);

                double[] probabilities = this.classifier.distributionForInstance(window);

                double bestProb = probabilities[Mappings.indexTmh];
                int bestStart = -1;
                int bestEnd = -1;

                // shift TMH start/end around and find best position
                for (int newStart = start - Globals.PREDICTOR_MAX_SHIFT; newStart <= start + Globals.PREDICTOR_MAX_SHIFT; ++newStart) {
                    if (newStart < 0) {
                        continue;
                    }

                    for (int newEnd = end - Globals.PREDICTOR_MAX_SHIFT; newEnd <= end + Globals.PREDICTOR_MAX_SHIFT; ++newEnd) {
                        if (newEnd >= structure.length) {
                            break;
                        }

                        window = this.buildInstance(pssm, newStart, newEnd);
                        window.isMissing((Attribute) this.attributes.get(this.attributes.size() - 1));
                        window.setDataset(this.dataset);

                        probabilities = this.classifier.distributionForInstance(window);

                        if (probabilities[Mappings.indexTmh] > bestProb) {
                            bestProb = probabilities[Mappings.indexTmh];
                            bestStart = newStart;
                            bestEnd = newEnd;
                        }
                    }
                }

                // adjust the TMH!
                if (bestProb < cutoff) {
                    for (int j = start; j <= end; ++j) {
                        structure[j] = Mappings.intToSs(Mappings.indexNotTmh);
                        segmentRaw[j] = 0;
                    }
                } else if (bestStart != -1 && bestEnd != -1) {
                    start = Math.min(start, bestStart);
                    end = Math.max(end, bestEnd);

                    for (int j = start; j <= end; ++j) {
                        if (j >= bestStart && j <= bestEnd) {
                            structure[j] = Mappings.intToSs(Mappings.indexTmh);
                            segmentRaw[j] = (int) (1000 * bestProb);
                        } else {
                            structure[j] = Mappings.intToSs(Mappings.indexNotTmh);
                            segmentRaw[j] = 0;
                        }
                    }

                    adjust = true;
                    i = end;
                } else {
                    for (int j = start; j <= end; ++j) {
                        segmentRaw[j] = (int) (1000 * bestProb);
                    }
                }
            } else {
                segmentRaw[i] = 0;
            }
        } catch (Exception e) {
            ErrorUtils.printError(HelixPredictor.class, "Prediction failed for " + protein.getHeader(), e);
            return false;
        }
    }

    return adjust;
}
From source file:predictors.HelixPredictor.java
License:Open Source License
/**
 * Splits predicted transmembrane helices within a given protein.
 *
 * @param protein
 * @param cutoff
 * @return
 */
private boolean splitTMHs(Protein protein, double cutoff) {
    boolean split = false;

    Pssm pssm = protein.getPssm();
    char[] structure = protein.getPrediction();
    int[] segmentRaw = protein.getSegmentRaw();

    int minLength = 2 * Globals.PREDICTOR_HELIX_MIN_SIZE + Globals.PREDICTOR_GAP_MIN_SIZE;

    for (int i = 0; i < structure.length; ++i) {
        try {
            if (Mappings.ssToInt(structure[i]) == Mappings.indexTmh) {
                int start = i;
                char type = structure[i];

                // go to end of transmembrane helix
                while (i < structure.length && structure[i] == type) {
                    ++i;
                }
                --i;
                int end = i;

                // if TMH is too short jump to the next one
                if (end - start + 1 < minLength) {
                    continue;
                }

                Instance window = this.buildInstance(pssm, start, end);
                window.isMissing((Attribute) this.attributes.get(this.attributes.size() - 1));
                window.setDataset(this.dataset);

                double[] probabilities = this.classifier.distributionForInstance(window);

                double bestProb = probabilities[Mappings.indexTmh];
                double bestProb1 = 0;
                double bestProb2 = 0;
                int bestBreak1 = -1;
                int bestBreak2 = -1;

                // insert a variable gap into the TMH and find best constellation
                for (int break1 = start + (Globals.PREDICTOR_HELIX_MIN_SIZE - 1); break1 < end; ++break1) {
                    for (int break2 = break1 + Globals.PREDICTOR_GAP_MIN_SIZE; break2 < end - (Globals.PREDICTOR_HELIX_MIN_SIZE - 1); ++break2) {
                        if (break2 == break1) {
                            continue;
                        }

                        Instance window1 = this.buildInstance(pssm, start, break1);
                        Instance window2 = this.buildInstance(pssm, break2 + 1, end);

                        window1.isMissing((Attribute) this.attributes.get(this.attributes.size() - 1));
                        window2.isMissing((Attribute) this.attributes.get(this.attributes.size() - 1));

                        window1.setDataset(this.dataset);
                        window2.setDataset(this.dataset);

                        double prob1 = this.classifier.distributionForInstance(window1)[Mappings.indexTmh];
                        double prob2 = this.classifier.distributionForInstance(window2)[Mappings.indexTmh];

                        if (prob1 >= cutoff && prob2 >= cutoff) {
                            double avgProb = (prob1 + prob2) / 2.0;

                            if (avgProb > bestProb) {
                                bestProb = avgProb;
                                bestProb1 = prob1;
                                bestProb2 = prob2;
                                bestBreak1 = break1;
                                bestBreak2 = break2;
                            }
                        }
                    }
                }

                // split the TMH!
                if (bestBreak1 != -1 && bestBreak2 != -1) {
                    for (int j = start; j <= bestBreak1; ++j) {
                        structure[j] = Mappings.intToSs(Mappings.indexTmh);
                        segmentRaw[j] = (int) (1000 * bestProb1);
                    }
                    for (int j = bestBreak1 + 1; j <= bestBreak2; ++j) {
                        structure[j] = Mappings.intToSs(Mappings.indexNotTmh);
                        segmentRaw[j] = 0;
                    }
                    for (int j = bestBreak2 + 1; j <= end; ++j) {
                        structure[j] = Mappings.intToSs(Mappings.indexTmh);
                        segmentRaw[j] = (int) (1000 * bestProb2);
                    }

                    split = true;
                }
            }
        } catch (Exception e) {
            ErrorUtils.printError(HelixPredictor.class, "Prediction failed for " + protein.getHeader(), e);
            return false;
        }
    }

    return split;
}
From source file:predictors.HelixPredictor.java
License:Open Source License
/**
 * Analyzes a given segment (TMH or not) and saves it in the database.
 *
 * @param pssm
 * @param start
 * @param end
 * @param structureIndex
 */
private void addSegmentToDatabse(Pssm pssm, int start, int end, int structureIndex) {
    Instance segment = this.buildInstance(pssm, start, end);

    segment.setValue((Attribute) this.attributes.get(this.attributes.size() - 1), structureIndex);
    segment.setDataset(this.dataset);

    this.dataset.add(segment);
}