Example usage for weka.core Instance setDataset

List of usage examples for weka.core Instance setDataset

Introduction

On this page you can find examples showing how to use weka.core Instance setDataset.

Prototype

public void setDataset(Instances instances);

Document

Sets the reference to the dataset. The method only stores the reference: it does not check that the instance is compatible with the dataset's header, and the dataset itself does not know about the instance (you still have to call Instances.add() if you want to store it).
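
The typical pattern is to build (or reuse) an Instances object that defines the attribute schema, create an Instance whose values match that schema, and attach the schema with setDataset before handing the instance to a classifier. Below is a minimal sketch using the Weka 3.6-era API that most of the examples on this page use; the attribute names are illustrative. (In Weka 3.7+, Instance is an interface: you would create a DenseInstance and build the header from a java.util.ArrayList<Attribute> instead of a FastVector.)

import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;

// build a header with two numeric attributes (illustrative names)
FastVector attributes = new FastVector();
attributes.addElement(new Attribute("length"));
attributes.addElement(new Attribute("class"));
Instances header = new Instances("example", attributes, 0);
header.setClassIndex(header.numAttributes() - 1);

// a freshly created instance carries values but no schema
double[] values = new double[] { 5.1, Instance.missingValue() };
Instance instance = new Instance(1.0, values);

// attach the header; without it, any call that needs attribute or class
// metadata (e.g. classifier.distributionForInstance(instance)) will fail
instance.setDataset(header);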

Usage

From source file:org.wikipedia.miner.annotation.Disambiguator.java

License:Open Source License

private Result<Integer> test(Article article, int snippetLength, RelatednessCache rc) throws Exception {

    System.out.println(" - testing " + article);

    Vector<Anchor> unambigAnchors = new Vector<Anchor>();
    Vector<TopicReference> ambigRefs = new Vector<TopicReference>();

    String content = cleaner.getMarkupLinksOnly(article, snippetLength);

    Pattern linkPattern = Pattern.compile("\\[\\[(.*?)\\]\\]");
    Matcher linkMatcher = linkPattern.matcher(content);

    HashSet<Integer> goldStandard = new HashSet<Integer>();
    HashSet<Integer> disambiguatedLinks = new HashSet<Integer>();

    while (linkMatcher.find()) {
        String linkText = content.substring(linkMatcher.start() + 2, linkMatcher.end() - 2);

        String anchorText = linkText;
        String destText = linkText;

        int pos = linkText.lastIndexOf('|');
        if (pos > 0) {
            destText = linkText.substring(0, pos);
            anchorText = linkText.substring(pos + 1);
        }

        destText = Character.toUpperCase(destText.charAt(0)) + destText.substring(1); // Get first char and capitalize

        Anchor anchor = new Anchor(anchorText, tp, wikipedia.getDatabase());
        int senseCount = anchor.getSenses().size();
        Article dest = wikipedia.getArticleByTitle(destText);

        if (senseCount > 0 && dest != null) {

            goldStandard.add(dest.getId());

            if (senseCount == 1 || anchor.getSenses().first().getProbability() >= (1 - minSenseProbability)) {
                unambigAnchors.add(anchor);
                disambiguatedLinks.add(dest.getId());
            } else {
                TopicReference ref = new TopicReference(anchor, dest.getId(), null);
                ambigRefs.add(ref);
            }
        }
    }

    // use all terms as context
    Context context = getContext(article, snippetLength, rc);

    //only use links
    //Context context = new Context(unambigAnchors, rc, maxContextSize) ;

    // resolve senses      
    for (TopicReference ref : ambigRefs) {

        TreeSet<Article> validSenses = new TreeSet<Article>();

        for (Sense sense : ref.getAnchor().getSenses()) {

            if (sense.getProbability() < minSenseProbability)
                break;

            double[] values = new double[attributes.size()];

            values[0] = sense.getProbability();
            values[1] = context.getRelatednessTo(sense);
            values[2] = context.getQuality();
            values[3] = Instance.missingValue();

            Instance i = new Instance(1.0, values);
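            // attach the training header so the classifier can resolve attribute and class metadata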
            i.setDataset(header);

            double prob = classifier.distributionForInstance(i)[0];

            if (prob > 0.5) {
                Article art = new Article(wikipedia.getDatabase(), sense.getId());
                art.setWeight(prob);
                validSenses.add(art);
            }
        }

        //use the most likely valid sense
        if (!validSenses.isEmpty())
            disambiguatedLinks.add(validSenses.first().getId());
    }

    Result<Integer> result = new Result<Integer>(disambiguatedLinks, goldStandard);

    System.out.println("   " + result);

    return result;
}

From source file:org.wikipedia.miner.annotation.weighting.LinkDetector.java

License:Open Source License

/**
 * Weights the given list of topics according to how likely they are to be Wikipedia links if the 
 * document they were extracted from was a Wikipedia article. 
 *
 * @param topics the topics to be weighted
 * @return a sorted vector of the same topics, where the weight of each topic is the probability that it is a link
 * @throws Exception
 */
public SortedVector<Topic> getWeightedTopics(Collection<Topic> topics) throws Exception {

    if (classifier == null)
        throw new Exception("You must train the link detector first.");

    SortedVector<Topic> weightedTopics = new SortedVector<Topic>();

    for (Topic topic : topics) {

        double[] values = new double[header.numAttributes()];

        values[0] = topic.getOccurances();
        values[1] = topic.getMaxDisambigConfidence();
        values[2] = topic.getAverageDisambigConfidence();
        values[3] = topic.getRelatednessToOtherTopics();
        values[4] = topic.getMaxLinkProbability();
        values[5] = topic.getAverageLinkProbability();

        if (topic.getGenerality() >= 0)
            values[6] = topic.getGenerality();
        else
            values[6] = Instance.missingValue();

        values[7] = topic.getFirstOccurance();
        values[8] = topic.getLastOccurance();
        values[9] = topic.getSpread();

        //values[10] = topic.getRelatednessToContext() ;

        values[10] = Instance.missingValue();

        Instance instance = new Instance(1.0, values);
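        // the instance needs the header's schema before distributionForInstance() can interpret it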
        instance.setDataset(header);

        double prob = classifier.distributionForInstance(instance)[0];
        topic.setWeight(prob);
        weightedTopics.add(topic, false);
    }

    return weightedTopics;
}

From source file:oxis.yologp.YOLogPDescriptor.java

License:Open Source License

/**
 * Predict the LogP.
 *
 */
private void predict() throws Exception {

    Instances instances = buildDataset();

    Map<Object, Object> properties;
    for (DrugStruct drugStruct : listDrug) {

        if (drugStruct.drug.getProperty("flag")) {
            properties = drugStruct.drug.getProperties();
            Instance instance = new DenseInstance(instances.numAttributes()); //28 + 1024
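            // attach the dataset up front; classifyInstance() and setClassValue() below rely on the header's class index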
            instance.setDataset(instances);
            for (Object propKey : properties.keySet()) {
                if (!(propKey.equals("hash") || propKey.equals("flag") || propKey.equals("smiles"))) {
                    try {
                        instance.setValue(instances.attribute(propKey.toString()),
                                Double.parseDouble(properties.get(propKey).toString()));
                    } catch (NullPointerException ex) {
                        Logger.getLogger(YOLogPDescriptor.class.getName()).log(Level.WARNING,
                                "Property not used: {0}", propKey.toString());
                    }
                }
            }

            double predicted = model.classifyInstance(instance);
            predicted = Math.round(predicted * 100) / 100.0d;
            instance.setClassValue(predicted);
            instances.add(instance);
            drugStruct.drug.setProperty("predicted", predicted);
        }
    }
}

From source file:oxis.yologp.YOLogPDescriptor.java

License:Open Source License

/**
 * Trains a model, erasing any existing one.
 *
 * @param name name of the model file to save
 */
public void train(String name) throws Exception {

    compute();

    Instances instances = buildDataset();

    model = new RandomForest();

    Map<Object, Object> properties;
    for (DrugStruct drugStruct : listDrug) {

        if (drugStruct.drug.getProperty("flag")) {
            properties = drugStruct.drug.getProperties();
            Instance instance = new DenseInstance(instances.numAttributes()); //28 + 1024
            instance.setDataset(instances);
            for (Object propKey : properties.keySet()) {
                if (!(propKey.equals("hash") || propKey.equals("flag") || propKey.equals("smiles"))) {
                    try {
                        instance.setValue(instances.attribute(propKey.toString()),
                                Double.parseDouble(properties.get(propKey).toString()));
                    } catch (NullPointerException ex) {
                        Logger.getLogger(YOLogPDescriptor.class.getName()).log(Level.WARNING,
                                "Property not used: {0}", propKey.toString());
                    }
                }
            }
            instance.setClassValue(drugStruct.getLogP());
            instances.add(instance);
        }
    }
    model.setNumFeatures(200);
    model.setNumTrees(400);
    model.setMaxDepth(0);
    model.buildClassifier(instances);

    weka.core.SerializationHelper.write(path + name, model);
}

From source file:pl.nask.hsn2.service.analysis.JSWekaAnalyzer.java

License:Open Source License

public final JSClass classifyString(File file) {
    String ngrams = NGramsCalc.getNgramsForFile(file.getPath(), ngramsLength, ngramsQuantity);

    if (ngrams == null) {
        LOGGER.info("No ngrams extracted, probably JS source is too short");
    } else {
        StringTokenizer st = new StringTokenizer(ngrams, " ");
        if (st.countTokens() >= ngramsQuantity) {

            Instance t = new Instance(2);
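            // attach the header before setValue(): storing a string value needs the attribute metadata from the dataset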
            t.setDataset(trainingSet);
            t.setValue(0, ngrams);

            try {
                double dd = fc.classifyInstance(t);
                return JSClass.valueOf(trainingSet.classAttribute().value((int) dd).toUpperCase());
            } catch (Exception e) {
                LOGGER.error(e.getMessage(), e);
            }
        }
    }
    return JSClass.UNCLASSIFIED;
}

From source file:predictors.HelixIndexer.java

License:Open Source License

/**
 * Predicts transmembrane residues for a given protein.
 *
 * @param protein
 */
public void predict(Protein protein) {
    if (protein == null || protein.getPssm() == null) {
        return;
    }

    Pssm pssm = protein.getPssm();
    int length = pssm.getLength();
    int[] scoresSol = new int[length];
    int[] scoresTmh = new int[length];
    int[] scoresSig = new int[length];

    this.globalComposition(pssm);

    //slide window along the sequence
    for (int i = 0; i < length; ++i) {
        try {
            Instance window = this.buildInstance(pssm, i);

            // flag the class attribute as missing, then attach the header
            window.setMissing((Attribute) this.attributes.get(this.attributes.size() - 1));
            window.setDataset(this.dataset);

            double[] probabilities = this.classifier.distributionForInstance(window);

            if (i < 40) {
                scoresSol[i] = (int) (1000 * probabilities[HelixIndexer.indexNotTmh]);
                scoresTmh[i] = (int) (1000 * probabilities[HelixIndexer.indexTmh]);
                scoresSig[i] = (int) (1000 * probabilities[HelixIndexer.indexSignal]);
            } else {
                scoresSol[i] = (int) (1000 * probabilities[HelixIndexer.indexNotTmh]);
                scoresTmh[i] = (int) (1000 * probabilities[HelixIndexer.indexTmh]);
                scoresSig[i] = 0;
            }
        } catch (Exception e) {
            ErrorUtils.printError(HelixIndexer.class, "Prediction failed for " + protein.getHeader(), e);

            return;
        }
    }

    //save scores into the protein
    protein.setSolRaw(scoresSol);
    protein.setTmhRaw(scoresTmh);
    protein.setSigRaw(scoresSig);
}

From source file:predictors.HelixIndexer.java

License:Open Source License

/**
 * Analyzes a given window and saves it in the database.
 *
 * @param pssm
 * @param windowCenter
 * @param structure
 */
private void addWindowToDatabase(Pssm pssm, int windowCenter, char[] structure) {
    int index = Mappings.ssToInt(structure[windowCenter]);
    Instance window = this.buildInstance(pssm, windowCenter);

    if (index == Mappings.indexTmh) {
        index = HelixIndexer.indexTmh;
    } else if (index == Mappings.indexSignal) {
        index = HelixIndexer.indexSignal;
    } else {
        index = HelixIndexer.indexNotTmh;
    }

    window.setValue((Attribute) this.attributes.get(this.attributes.size() - 1), index);
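    // attach the header before adding; Instances.add() stores a copy of the instance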
    window.setDataset(this.dataset);

    this.dataset.add(window);
}

From source file:predictors.HelixPredictor.java

License:Open Source License

/**
 * Adjusts predicted transmembrane helices within a given protein.
 *
 * @param protein
 * @param cutoff
 * @return true if at least one helix was adjusted
 */
private boolean adjustTMHs(Protein protein, double cutoff) {
    boolean adjust = false;
    Pssm pssm = protein.getPssm();
    char[] structure = protein.getPrediction();
    int[] segmentRaw = protein.getSegmentRaw();

    for (int i = 0; i < structure.length; ++i) {
        try {
            if (Mappings.ssToInt(structure[i]) == Mappings.indexTmh) {
                int start = i;
                char type = structure[i];

                //go to end of transmembrane helix
                while (i < structure.length && structure[i] == type) {
                    ++i;
                }

                --i;

                int end = i;

                Instance window = this.buildInstance(pssm, start, end);

                window.setMissing((Attribute) this.attributes.get(this.attributes.size() - 1));
                window.setDataset(this.dataset);

                double[] probabilities = this.classifier.distributionForInstance(window);

                double bestProb = probabilities[Mappings.indexTmh];
                int bestStart = -1;
                int bestEnd = -1;

                //shift TMH start/end around and find best position
                for (int newStart = start - Globals.PREDICTOR_MAX_SHIFT; newStart <= start
                        + Globals.PREDICTOR_MAX_SHIFT; ++newStart) {
                    if (newStart < 0) {
                        continue;
                    }

                    for (int newEnd = end - Globals.PREDICTOR_MAX_SHIFT; newEnd <= end
                            + Globals.PREDICTOR_MAX_SHIFT; ++newEnd) {
                        if (newEnd >= structure.length) {
                            break;
                        }

                        window = this.buildInstance(pssm, newStart, newEnd);

                        window.setMissing((Attribute) this.attributes.get(this.attributes.size() - 1));
                        window.setDataset(this.dataset);

                        probabilities = this.classifier.distributionForInstance(window);

                        if (probabilities[Mappings.indexTmh] > bestProb) {
                            bestProb = probabilities[Mappings.indexTmh];
                            bestStart = newStart;
                            bestEnd = newEnd;
                        }
                    }
                }

                //adjust the TMH!
                if (bestProb < cutoff) {
                    for (int j = start; j <= end; ++j) {
                        structure[j] = Mappings.intToSs(Mappings.indexNotTmh);
                        segmentRaw[j] = 0;
                    }
                } else if (bestStart != -1 && bestEnd != -1) {
                    start = Math.min(start, bestStart);
                    end = Math.max(end, bestEnd);

                    for (int j = start; j <= end; ++j) {
                        if (j >= bestStart && j <= bestEnd) {
                            structure[j] = Mappings.intToSs(Mappings.indexTmh);
                            segmentRaw[j] = (int) (1000 * bestProb);
                        } else {
                            structure[j] = Mappings.intToSs(Mappings.indexNotTmh);
                            segmentRaw[j] = 0;
                        }
                    }

                    adjust = true;
                    i = end;
                } else {
                    for (int j = start; j <= end; ++j) {
                        segmentRaw[j] = (int) (1000 * bestProb);
                    }
                }
            } else {
                segmentRaw[i] = 0;
            }
        } catch (Exception e) {
            ErrorUtils.printError(HelixPredictor.class, "Prediction failed for " + protein.getHeader(), e);

            return false;
        }
    }

    return adjust;
}

From source file:predictors.HelixPredictor.java

License:Open Source License

/**
 * Splits predicted transmembrane helices within a given protein.
 *
 * @param protein
 * @param cutoff
 * @return true if at least one helix was split
 */
private boolean splitTMHs(Protein protein, double cutoff) {
    boolean split = false;
    Pssm pssm = protein.getPssm();
    char[] structure = protein.getPrediction();
    int[] segmentRaw = protein.getSegmentRaw();
    int minLength = 2 * Globals.PREDICTOR_HELIX_MIN_SIZE + Globals.PREDICTOR_GAP_MIN_SIZE;

    for (int i = 0; i < structure.length; ++i) {
        try {
            if (Mappings.ssToInt(structure[i]) == Mappings.indexTmh) {
                int start = i;
                char type = structure[i];

                //go to end of transmembrane helix
                while (i < structure.length && structure[i] == type) {
                    ++i;
                }

                --i;

                int end = i;

                //if TMH is too short jump to the next one
                if (end - start + 1 < minLength) {
                    continue;
                }

                Instance window = this.buildInstance(pssm, start, end);

                window.setMissing((Attribute) this.attributes.get(this.attributes.size() - 1));
                window.setDataset(this.dataset);

                double[] probabilities = this.classifier.distributionForInstance(window);

                double bestProb = probabilities[Mappings.indexTmh];
                double bestProb1 = 0;
                double bestProb2 = 0;
                int bestBreak1 = -1;
                int bestBreak2 = -1;

                //insert a variable-size gap into the TMH and find the best configuration
                for (int break1 = start + (Globals.PREDICTOR_HELIX_MIN_SIZE - 1); break1 < end; ++break1) {
                    for (int break2 = break1 + Globals.PREDICTOR_GAP_MIN_SIZE; break2 < end
                            - (Globals.PREDICTOR_HELIX_MIN_SIZE - 1); ++break2) {
                        if (break2 == break1) {
                            continue;
                        }

                        Instance window1 = this.buildInstance(pssm, start, break1);
                        Instance window2 = this.buildInstance(pssm, break2 + 1, end);

                        window1.setMissing((Attribute) this.attributes.get(this.attributes.size() - 1));
                        window2.setMissing((Attribute) this.attributes.get(this.attributes.size() - 1));
                        window1.setDataset(this.dataset);
                        window2.setDataset(this.dataset);

                        double prob1 = this.classifier.distributionForInstance(window1)[Mappings.indexTmh];
                        double prob2 = this.classifier.distributionForInstance(window2)[Mappings.indexTmh];

                        if (prob1 >= cutoff && prob2 >= cutoff) {
                            double avgProb = (prob1 + prob2) / 2.0;

                            if (avgProb > bestProb) {
                                bestProb = avgProb;
                                bestProb1 = prob1;
                                bestProb2 = prob2;
                                bestBreak1 = break1;
                                bestBreak2 = break2;
                            }
                        }
                    }
                }

                //split the TMH!
                if (bestBreak1 != -1 && bestBreak2 != -1) {
                    for (int j = start; j <= bestBreak1; ++j) {
                        structure[j] = Mappings.intToSs(Mappings.indexTmh);
                        segmentRaw[j] = (int) (1000 * bestProb1);
                    }

                    for (int j = bestBreak1 + 1; j <= bestBreak2; ++j) {
                        structure[j] = Mappings.intToSs(Mappings.indexNotTmh);
                        segmentRaw[j] = 0;
                    }

                    for (int j = bestBreak2 + 1; j <= end; ++j) {
                        structure[j] = Mappings.intToSs(Mappings.indexTmh);
                        segmentRaw[j] = (int) (1000 * bestProb2);
                    }

                    split = true;
                }
            }
        } catch (Exception e) {
            ErrorUtils.printError(HelixPredictor.class, "Prediction failed for " + protein.getHeader(), e);

            return false;
        }
    }

    return split;
}

From source file:predictors.HelixPredictor.java

License:Open Source License

/**
 * Analyzes a given segment (TMH or not) and saves it in the database.
 *
 * @param pssm
 * @param start
 * @param end
 * @param structureIndex
 */
private void addSegmentToDatabse(Pssm pssm, int start, int end, int structureIndex) {
    Instance segment = this.buildInstance(pssm, start, end);

    segment.setValue((Attribute) this.attributes.get(this.attributes.size() - 1), structureIndex);

    segment.setDataset(this.dataset);

    this.dataset.add(segment);
}