Usage examples for weka.core.Instance.setDataset(Instances)
public void setDataset(Instances instances);
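Before the project-specific examples below, here is a minimal, self-contained sketch of the pattern they all share: build an Instances object that defines the attributes, attach it to a free-standing Instance with setDataset, and only then query attribute/class metadata or hand the instance to a classifier. The sketch assumes the Weka 3.7+ API (DenseInstance, List-based constructors); the attribute names and values are invented for illustration.

import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;

public class SetDatasetExample {

    public static void main(String[] args) {
        // Define the header: two numeric attributes and a nominal class attribute.
        ArrayList<String> classValues = new ArrayList<String>();
        classValues.add("yes");
        classValues.add("no");

        ArrayList<Attribute> attributes = new ArrayList<Attribute>();
        attributes.add(new Attribute("length"));
        attributes.add(new Attribute("weight"));
        attributes.add(new Attribute("class", classValues));

        Instances header = new Instances("example", attributes, 0);
        header.setClassIndex(header.numAttributes() - 1);

        // A free-standing instance carries values but no attribute metadata.
        double[] values = new double[] { 5.1, 0.7, 0.0 }; // 0.0 -> first class value ("yes")
        Instance instance = new DenseInstance(1.0, values);

        // setDataset gives the instance access to the header's attribute information;
        // it does NOT add the instance to 'header' -- call header.add(instance) for that.
        instance.setDataset(header);

        System.out.println("class attribute: " + instance.classAttribute().name());
        System.out.println("class value:     " + instance.stringValue(instance.classIndex()));
        System.out.println(instance);
    }
}

The examples that follow perform this same handshake before calling classifyInstance or distributionForInstance.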
From source file:org.wikipedia.miner.annotation.Disambiguator.java
License:Open Source License
private Result<Integer> test(Article article, int snippetLength, RelatednessCache rc) throws Exception {
    System.out.println(" - testing " + article);

    Vector<Anchor> unambigAnchors = new Vector<Anchor>();
    Vector<TopicReference> ambigRefs = new Vector<TopicReference>();

    String content = cleaner.getMarkupLinksOnly(article, snippetLength);

    Pattern linkPattern = Pattern.compile("\\[\\[(.*?)\\]\\]");
    Matcher linkMatcher = linkPattern.matcher(content);

    HashSet<Integer> goldStandard = new HashSet<Integer>();
    HashSet<Integer> disambiguatedLinks = new HashSet<Integer>();

    while (linkMatcher.find()) {
        String linkText = content.substring(linkMatcher.start() + 2, linkMatcher.end() - 2);

        String anchorText = linkText;
        String destText = linkText;

        int pos = linkText.lastIndexOf('|');
        if (pos > 0) {
            destText = linkText.substring(0, pos);
            anchorText = linkText.substring(pos + 1);
        }

        destText = Character.toUpperCase(destText.charAt(0)) + destText.substring(1); // capitalize first character

        Anchor anchor = new Anchor(anchorText, tp, wikipedia.getDatabase());
        int senseCount = anchor.getSenses().size();
        Article dest = wikipedia.getArticleByTitle(destText);

        if (senseCount > 0 && dest != null) {
            goldStandard.add(dest.getId());

            if (senseCount == 1 || anchor.getSenses().first().getProbability() >= (1 - minSenseProbability)) {
                unambigAnchors.add(anchor);
                disambiguatedLinks.add(dest.getId());
            } else {
                TopicReference ref = new TopicReference(anchor, dest.getId(), null);
                ambigRefs.add(ref);
            }
        }
    }

    // use all terms as context
    Context context = getContext(article, snippetLength, rc);
    // only use links
    //Context context = new Context(unambigAnchors, rc, maxContextSize);

    // resolve senses
    for (TopicReference ref : ambigRefs) {
        TreeSet<Article> validSenses = new TreeSet<Article>();

        for (Sense sense : ref.getAnchor().getSenses()) {
            if (sense.getProbability() < minSenseProbability)
                break;

            double[] values = new double[attributes.size()];
            values[0] = sense.getProbability();
            values[1] = context.getRelatednessTo(sense);
            values[2] = context.getQuality();
            values[3] = Instance.missingValue();

            Instance i = new Instance(1.0, values);
            i.setDataset(header);

            double prob = classifier.distributionForInstance(i)[0];

            if (prob > 0.5) {
                Article art = new Article(wikipedia.getDatabase(), sense.getId());
                art.setWeight(prob);
                validSenses.add(art);
            }
        }

        // use most valid sense
        if (!validSenses.isEmpty())
            disambiguatedLinks.add(validSenses.first().getId());
    }

    Result<Integer> result = new Result<Integer>(disambiguatedLinks, goldStandard);
    System.out.println(" " + result);

    return result;
}
From source file:org.wikipedia.miner.annotation.weighting.LinkDetector.java
License:Open Source License
/**
 * Weights the given list of topics according to how likely they are to be Wikipedia links if the
 * document they were extracted from was a Wikipedia article.
 *
 * @param topics
 * @return a sorted vector of the same topics, where the weight of each topic is the probability that it is a link
 * @throws Exception
 */
public SortedVector<Topic> getWeightedTopics(Collection<Topic> topics) throws Exception {
    if (classifier == null)
        throw new Exception("You must train the link detector first.");

    SortedVector<Topic> weightedTopics = new SortedVector<Topic>();

    for (Topic topic : topics) {
        double[] values = new double[header.numAttributes()];

        values[0] = topic.getOccurances();
        values[1] = topic.getMaxDisambigConfidence();
        values[2] = topic.getAverageDisambigConfidence();
        values[3] = topic.getRelatednessToOtherTopics();
        values[4] = topic.getMaxLinkProbability();
        values[5] = topic.getAverageLinkProbability();

        if (topic.getGenerality() >= 0)
            values[6] = topic.getGenerality();
        else
            values[6] = Instance.missingValue(); // mark generality as missing

        values[7] = topic.getFirstOccurance();
        values[8] = topic.getLastOccurance();
        values[9] = topic.getSpread();

        //values[10] = topic.getRelatednessToContext();
        values[10] = Instance.missingValue();

        Instance instance = new Instance(1.0, values);
        instance.setDataset(header);

        double prob = classifier.distributionForInstance(instance)[0];
        topic.setWeight(prob);

        weightedTopics.add(topic, false);
    }

    return weightedTopics;
}
From source file:oxis.yologp.YOLogPDescriptor.java
License:Open Source License
/**
 * Predict the LogP.
 */
private void predict() throws Exception {
    Instances instances = buildDataset();

    Map<Object, Object> properties;
    for (DrugStruct drugStruct : listDrug) {
        if (drugStruct.drug.getProperty("flag")) {
            properties = drugStruct.drug.getProperties();

            Instance instance = new DenseInstance(instances.numAttributes()); // 28 + 1024
            instance.setDataset(instances);

            for (Object propKey : properties.keySet()) {
                if (!(propKey.equals("hash") || propKey.equals("flag") || propKey.equals("smiles"))) {
                    try {
                        instance.setValue(instances.attribute(propKey.toString()),
                                Double.parseDouble(properties.get(propKey).toString()));
                    } catch (NullPointerException ex) {
                        Logger.getLogger(YOLogPDescriptor.class.getName()).log(Level.WARNING,
                                "Property not used: {0}", propKey.toString());
                    }
                }
            }

            double predicted = model.classifyInstance(instance);
            predicted = Math.round(predicted * 100) / 100.0d;

            instance.setClassValue(predicted);
            instances.add(instance);

            drugStruct.drug.setProperty("predicted", predicted);
        }
    }
}
From source file:oxis.yologp.YOLogPDescriptor.java
License:Open Source License
/**
 * Train a model, erasing the previous one.
 *
 * @param name name of the model file to save
 */
public void train(String name) throws Exception {
    compute();

    Instances instances = buildDataset();
    model = new RandomForest();

    Map<Object, Object> properties;
    for (DrugStruct drugStruct : listDrug) {
        if (drugStruct.drug.getProperty("flag")) {
            properties = drugStruct.drug.getProperties();

            Instance instance = new DenseInstance(instances.numAttributes()); // 28 + 1024
            instance.setDataset(instances);

            for (Object propKey : properties.keySet()) {
                if (!(propKey.equals("hash") || propKey.equals("flag") || propKey.equals("smiles"))) {
                    try {
                        instance.setValue(instances.attribute(propKey.toString()),
                                Double.parseDouble(properties.get(propKey).toString()));
                    } catch (NullPointerException ex) {
                        Logger.getLogger(YOLogPDescriptor.class.getName()).log(Level.WARNING,
                                "Property not used: {0}", propKey.toString());
                    }
                }
            }

            instance.setClassValue(drugStruct.getLogP());
            instances.add(instance);
        }
    }

    model.setNumFeatures(200);
    model.setNumTrees(400);
    model.setMaxDepth(0);

    model.buildClassifier(instances);

    weka.core.SerializationHelper.write(path + name, model);
}
From source file:pl.nask.hsn2.service.analysis.JSWekaAnalyzer.java
License:Open Source License
public final JSClass classifyString(File file) {
    String ngrams = NGramsCalc.getNgramsForFile(file.getPath(), ngramsLength, ngramsQuantity);

    if (ngrams == null) {
        LOGGER.info("No ngrams extracted, probably JS source is too short");
    } else {
        StringTokenizer st = new StringTokenizer(ngrams, " ");
        if (st.countTokens() >= ngramsQuantity) {
            Instance t = new Instance(2);
            t.setDataset(trainingSet);
            t.setValue(0, ngrams);
            try {
                double dd = fc.classifyInstance(t);
                return JSClass.valueOf(trainingSet.classAttribute().value((int) dd).toUpperCase());
            } catch (Exception e) {
                LOGGER.error(e.getMessage(), e);
            }
        }
    }
    return JSClass.UNCLASSIFIED;
}
From source file:predictors.HelixIndexer.java
License:Open Source License
/**
 * Predicts transmembrane residues for a given protein.
 *
 * @param protein
 */
public void predict(Protein protein) {
    if (protein == null || protein.getPssm() == null) {
        return;
    }

    Pssm pssm = protein.getPssm();
    int length = pssm.getLength();

    int[] scoresSol = new int[length];
    int[] scoresTmh = new int[length];
    int[] scoresSig = new int[length];

    this.globalComposition(pssm);

    // slide window along the sequence
    for (int i = 0; i < length; ++i) {
        try {
            Instance window = this.buildInstance(pssm, i);
            window.isMissing((Attribute) this.attributes.get(this.attributes.size() - 1));
            window.setDataset(this.dataset);

            double[] probabilities = this.classifier.distributionForInstance(window);

            if (i < 40) {
                scoresSol[i] = (int) (1000 * probabilities[HelixIndexer.indexNotTmh]);
                scoresTmh[i] = (int) (1000 * probabilities[HelixIndexer.indexTmh]);
                scoresSig[i] = (int) (1000 * probabilities[HelixIndexer.indexSignal]);
            } else {
                scoresSol[i] = (int) (1000 * probabilities[HelixIndexer.indexNotTmh]);
                scoresTmh[i] = (int) (1000 * probabilities[HelixIndexer.indexTmh]);
                scoresSig[i] = 0;
            }
        } catch (Exception e) {
            ErrorUtils.printError(HelixIndexer.class, "Prediction failed for " + protein.getHeader(), e);
            return;
        }
    }

    // save scores into the protein
    protein.setSolRaw(scoresSol);
    protein.setTmhRaw(scoresTmh);
    protein.setSigRaw(scoresSig);
}
From source file:predictors.HelixIndexer.java
License:Open Source License
/**
 * Analyzes a given window and saves it in the database.
 *
 * @param pssm
 * @param windowCenter
 * @param structure
 */
private void addWindowToDatabase(Pssm pssm, int windowCenter, char[] structure) {
    int index = Mappings.ssToInt(structure[windowCenter]);
    Instance window = this.buildInstance(pssm, windowCenter);

    if (index == Mappings.indexTmh) {
        index = HelixIndexer.indexTmh;
    } else if (index == Mappings.indexSignal) {
        index = HelixIndexer.indexSignal;
    } else {
        index = HelixIndexer.indexNotTmh;
    }

    window.setValue((Attribute) this.attributes.get(this.attributes.size() - 1), index);
    window.setDataset(this.dataset);

    this.dataset.add(window);
}
From source file:predictors.HelixPredictor.java
License:Open Source License
/**
 * Adjusts predicted transmembrane helices within a given protein.
 *
 * @param protein
 * @param cutoff
 * @return
 */
private boolean adjustTMHs(Protein protein, double cutoff) {
    boolean adjust = false;

    Pssm pssm = protein.getPssm();
    char[] structure = protein.getPrediction();
    int[] segmentRaw = protein.getSegmentRaw();

    for (int i = 0; i < structure.length; ++i) {
        try {
            if (Mappings.ssToInt(structure[i]) == Mappings.indexTmh) {
                int start = i;
                char type = structure[i];

                // go to end of transmembrane helix
                while (i < structure.length && structure[i] == type) {
                    ++i;
                }
                --i;
                int end = i;

                Instance window = this.buildInstance(pssm, start, end);
                window.isMissing((Attribute) this.attributes.get(this.attributes.size() - 1));
                window.setDataset(this.dataset);

                double[] probabilities = this.classifier.distributionForInstance(window);

                double bestProb = probabilities[Mappings.indexTmh];
                int bestStart = -1;
                int bestEnd = -1;

                // shift TMH start/end around and find best position
                for (int newStart = start - Globals.PREDICTOR_MAX_SHIFT; newStart <= start + Globals.PREDICTOR_MAX_SHIFT; ++newStart) {
                    if (newStart < 0) {
                        continue;
                    }

                    for (int newEnd = end - Globals.PREDICTOR_MAX_SHIFT; newEnd <= end + Globals.PREDICTOR_MAX_SHIFT; ++newEnd) {
                        if (newEnd >= structure.length) {
                            break;
                        }

                        window = this.buildInstance(pssm, newStart, newEnd);
                        window.isMissing((Attribute) this.attributes.get(this.attributes.size() - 1));
                        window.setDataset(this.dataset);

                        probabilities = this.classifier.distributionForInstance(window);

                        if (probabilities[Mappings.indexTmh] > bestProb) {
                            bestProb = probabilities[Mappings.indexTmh];
                            bestStart = newStart;
                            bestEnd = newEnd;
                        }
                    }
                }

                // adjust the TMH!
                if (bestProb < cutoff) {
                    for (int j = start; j <= end; ++j) {
                        structure[j] = Mappings.intToSs(Mappings.indexNotTmh);
                        segmentRaw[j] = 0;
                    }
                } else if (bestStart != -1 && bestEnd != -1) {
                    start = Math.min(start, bestStart);
                    end = Math.max(end, bestEnd);

                    for (int j = start; j <= end; ++j) {
                        if (j >= bestStart && j <= bestEnd) {
                            structure[j] = Mappings.intToSs(Mappings.indexTmh);
                            segmentRaw[j] = (int) (1000 * bestProb);
                        } else {
                            structure[j] = Mappings.intToSs(Mappings.indexNotTmh);
                            segmentRaw[j] = 0;
                        }
                    }

                    adjust = true;
                    i = end;
                } else {
                    for (int j = start; j <= end; ++j) {
                        segmentRaw[j] = (int) (1000 * bestProb);
                    }
                }
            } else {
                segmentRaw[i] = 0;
            }
        } catch (Exception e) {
            ErrorUtils.printError(HelixPredictor.class, "Prediction failed for " + protein.getHeader(), e);
            return false;
        }
    }

    return adjust;
}
From source file:predictors.HelixPredictor.java
License:Open Source License
/**
 * Splits predicted transmembrane helices within a given protein.
 *
 * @param protein
 * @param cutoff
 * @return
 */
private boolean splitTMHs(Protein protein, double cutoff) {
    boolean split = false;

    Pssm pssm = protein.getPssm();
    char[] structure = protein.getPrediction();
    int[] segmentRaw = protein.getSegmentRaw();

    int minLength = 2 * Globals.PREDICTOR_HELIX_MIN_SIZE + Globals.PREDICTOR_GAP_MIN_SIZE;

    for (int i = 0; i < structure.length; ++i) {
        try {
            if (Mappings.ssToInt(structure[i]) == Mappings.indexTmh) {
                int start = i;
                char type = structure[i];

                // go to end of transmembrane helix
                while (i < structure.length && structure[i] == type) {
                    ++i;
                }
                --i;
                int end = i;

                // if TMH is too short jump to the next one
                if (end - start + 1 < minLength) {
                    continue;
                }

                Instance window = this.buildInstance(pssm, start, end);
                window.isMissing((Attribute) this.attributes.get(this.attributes.size() - 1));
                window.setDataset(this.dataset);

                double[] probabilities = this.classifier.distributionForInstance(window);

                double bestProb = probabilities[Mappings.indexTmh];
                double bestProb1 = 0;
                double bestProb2 = 0;
                int bestBreak1 = -1;
                int bestBreak2 = -1;

                // insert a variable gap into the TMH and find best constellation
                for (int break1 = start + (Globals.PREDICTOR_HELIX_MIN_SIZE - 1); break1 < end; ++break1) {
                    for (int break2 = break1 + Globals.PREDICTOR_GAP_MIN_SIZE; break2 < end - (Globals.PREDICTOR_HELIX_MIN_SIZE - 1); ++break2) {
                        if (break2 == break1) {
                            continue;
                        }

                        Instance window1 = this.buildInstance(pssm, start, break1);
                        Instance window2 = this.buildInstance(pssm, break2 + 1, end);

                        window1.isMissing((Attribute) this.attributes.get(this.attributes.size() - 1));
                        window2.isMissing((Attribute) this.attributes.get(this.attributes.size() - 1));

                        window1.setDataset(this.dataset);
                        window2.setDataset(this.dataset);

                        double prob1 = this.classifier.distributionForInstance(window1)[Mappings.indexTmh];
                        double prob2 = this.classifier.distributionForInstance(window2)[Mappings.indexTmh];

                        if (prob1 >= cutoff && prob2 >= cutoff) {
                            double avgProb = (prob1 + prob2) / 2.0;

                            if (avgProb > bestProb) {
                                bestProb = avgProb;
                                bestProb1 = prob1;
                                bestProb2 = prob2;
                                bestBreak1 = break1;
                                bestBreak2 = break2;
                            }
                        }
                    }
                }

                // split the TMH!
                if (bestBreak1 != -1 && bestBreak2 != -1) {
                    for (int j = start; j <= bestBreak1; ++j) {
                        structure[j] = Mappings.intToSs(Mappings.indexTmh);
                        segmentRaw[j] = (int) (1000 * bestProb1);
                    }
                    for (int j = bestBreak1 + 1; j <= bestBreak2; ++j) {
                        structure[j] = Mappings.intToSs(Mappings.indexNotTmh);
                        segmentRaw[j] = 0;
                    }
                    for (int j = bestBreak2 + 1; j <= end; ++j) {
                        structure[j] = Mappings.intToSs(Mappings.indexTmh);
                        segmentRaw[j] = (int) (1000 * bestProb2);
                    }

                    split = true;
                }
            }
        } catch (Exception e) {
            ErrorUtils.printError(HelixPredictor.class, "Prediction failed for " + protein.getHeader(), e);
            return false;
        }
    }

    return split;
}
From source file:predictors.HelixPredictor.java
License:Open Source License
/**
 * Analyzes a given segment (TMH or not) and saves it in the database.
 *
 * @param pssm
 * @param start
 * @param end
 * @param structureIndex
 */
private void addSegmentToDatabse(Pssm pssm, int start, int end, int structureIndex) {
    Instance segment = this.buildInstance(pssm, start, end);

    segment.setValue((Attribute) this.attributes.get(this.attributes.size() - 1), structureIndex);
    segment.setDataset(this.dataset);

    this.dataset.add(segment);
}