Example usage for weka.core Instances instance

Introduction

In this page you can find the example usage for weka.core Instances instance.

Prototype



publicInstance instance(int index)

Source Link

Document

Returns the instance at the given position.

Usage

From source file:j48.NBTreeSplit.java

License:Open Source License

/**
 * Creates split on enumerated attribute.
 *
 * @exception Exception if something goes wrong
 *///from  w w w  .j  a  v a2 s. com
private void handleEnumeratedAttribute(Instances trainInstances) throws Exception {

    m_c45S = new C45Split(m_attIndex, 2, m_sumOfWeights);
    m_c45S.buildClassifier(trainInstances);
    if (m_c45S.numSubsets() == 0) {
        return;
    }
    m_errors = 0;
    Instance instance;

    Instances[] trainingSets = new Instances[m_complexityIndex];
    for (int i = 0; i < m_complexityIndex; i++) {
        trainingSets[i] = new Instances(trainInstances, 0);
    }
    /*    m_distribution = new Distribution(m_complexityIndex,
     trainInstances.numClasses()); */
    int subset;
    for (int i = 0; i < trainInstances.numInstances(); i++) {
        instance = trainInstances.instance(i);
        subset = m_c45S.whichSubset(instance);
        if (subset > -1) {
            trainingSets[subset].add((Instance) instance.copy());
        } else {
            double[] weights = m_c45S.weights(instance);
            for (int j = 0; j < m_complexityIndex; j++) {
                try {
                    Instance temp = (Instance) instance.copy();
                    if (weights.length == m_complexityIndex) {
                        temp.setWeight(temp.weight() * weights[j]);
                    } else {
                        temp.setWeight(temp.weight() / m_complexityIndex);
                    }
                    trainingSets[j].add(temp);
                } catch (Exception ex) {
                    ex.printStackTrace();
                    System.err.println("*** " + m_complexityIndex);
                    System.err.println(weights.length);
                    System.exit(1);
                }
            }
        }
    }

    /*    // compute weights (weights of instances per subset
    m_weights = new double [m_complexityIndex];
    for (int i = 0; i < m_complexityIndex; i++) {
      m_weights[i] = trainingSets[i].sumOfWeights();
    }
    Utils.normalize(m_weights); */

    /*
    // Only Instances with known values are relevant.
    Enumeration enu = trainInstances.enumerateInstances();
    while (enu.hasMoreElements()) {
      instance = (Instance) enu.nextElement();
      if (!instance.isMissing(m_attIndex)) {
    //   m_distribution.add((int)instance.value(m_attIndex),instance);
    trainingSets[(int)instances.value(m_attIndex)].add(instance);
      } else {
    // add these to the error count
    m_errors += instance.weight();
      }
      } */

    Random r = new Random(1);
    int minNumCount = 0;
    for (int i = 0; i < m_complexityIndex; i++) {
        if (trainingSets[i].numInstances() >= 5) {
            minNumCount++;
            // Discretize the sets
            Discretize disc = new Discretize();
            disc.setInputFormat(trainingSets[i]);
            trainingSets[i] = Filter.useFilter(trainingSets[i], disc);

            trainingSets[i].randomize(r);
            trainingSets[i].stratify(5);
            NaiveBayesUpdateable fullModel = new NaiveBayesUpdateable();
            fullModel.buildClassifier(trainingSets[i]);

            // add the errors for this branch of the split
            m_errors += NBTreeNoSplit.crossValidate(fullModel, trainingSets[i], r);
        } else {
            // if fewer than min obj then just count them as errors
            for (int j = 0; j < trainingSets[i].numInstances(); j++) {
                m_errors += trainingSets[i].instance(j).weight();
            }
        }
    }

    // Check if there are at least five instances in at least two of the subsets
    // subsets.
    if (minNumCount > 1) {
        m_numSubsets = m_complexityIndex;
    }
}

From source file:j48.NBTreeSplit.java

License:Open Source License

/**
 * Creates split on numeric attribute.//from ww  w.  j  a va2s.  c o  m
 *
 * @exception Exception if something goes wrong
 */
private void handleNumericAttribute(Instances trainInstances) throws Exception {

    m_c45S = new C45Split(m_attIndex, 2, m_sumOfWeights);
    m_c45S.buildClassifier(trainInstances);
    if (m_c45S.numSubsets() == 0) {
        return;
    }
    m_errors = 0;

    Instances[] trainingSets = new Instances[m_complexityIndex];
    trainingSets[0] = new Instances(trainInstances, 0);
    trainingSets[1] = new Instances(trainInstances, 0);
    int subset = -1;

    // populate the subsets
    for (int i = 0; i < trainInstances.numInstances(); i++) {
        Instance instance = trainInstances.instance(i);
        subset = m_c45S.whichSubset(instance);
        if (subset != -1) {
            trainingSets[subset].add((Instance) instance.copy());
        } else {
            double[] weights = m_c45S.weights(instance);
            for (int j = 0; j < m_complexityIndex; j++) {
                Instance temp = (Instance) instance.copy();
                if (weights.length == m_complexityIndex) {
                    temp.setWeight(temp.weight() * weights[j]);
                } else {
                    temp.setWeight(temp.weight() / m_complexityIndex);
                }
                trainingSets[j].add(temp);
            }
        }
    }

    /*    // compute weights (weights of instances per subset
    m_weights = new double [m_complexityIndex];
    for (int i = 0; i < m_complexityIndex; i++) {
      m_weights[i] = trainingSets[i].sumOfWeights();
    }
    Utils.normalize(m_weights); */

    Random r = new Random(1);
    int minNumCount = 0;
    for (int i = 0; i < m_complexityIndex; i++) {
        if (trainingSets[i].numInstances() > 5) {
            minNumCount++;
            // Discretize the sets
            Discretize disc = new Discretize();
            disc.setInputFormat(trainingSets[i]);
            trainingSets[i] = Filter.useFilter(trainingSets[i], disc);

            trainingSets[i].randomize(r);
            trainingSets[i].stratify(5);
            NaiveBayesUpdateable fullModel = new NaiveBayesUpdateable();
            fullModel.buildClassifier(trainingSets[i]);

            // add the errors for this branch of the split
            m_errors += NBTreeNoSplit.crossValidate(fullModel, trainingSets[i], r);
        } else {
            for (int j = 0; j < trainingSets[i].numInstances(); j++) {
                m_errors += trainingSets[i].instance(j).weight();
            }
        }
    }

    // Check if minimum number of Instances in at least two
    // subsets.
    if (minNumCount > 1) {
        m_numSubsets = m_complexityIndex;
    }
}

From source file:jjj.asap.sas.ensemble.impl.CrossValidatedEnsemble.java

License:Open Source License

@Override
public StrongLearner build(int essaySet, String ensembleName, List<WeakLearner> learners) {

    // can't handle empty case
    if (learners.isEmpty()) {
        return this.ensemble.build(essaySet, ensembleName, learners);
    }/*w  w w .j  a v  a 2s .c o  m*/

    // create a dummy dataset.
    DatasetBuilder builder = new DatasetBuilder();
    builder.addVariable("id");
    builder.addNominalVariable("class", Contest.getRubrics(essaySet));
    Instances dummy = builder.getDataset("dummy");

    // add data
    Map<Double, Double> groundTruth = Contest.getGoldStandard(essaySet);
    for (double id : learners.get(0).getPreds().keySet()) {
        dummy.add(new DenseInstance(1.0, new double[] { id, groundTruth.get(id) }));
    }

    // stratify
    dummy.sort(0);
    dummy.randomize(new Random(1));
    dummy.setClassIndex(1);
    dummy.stratify(nFolds);

    // now evaluate each fold
    Map<Double, Double> preds = new HashMap<Double, Double>();
    for (int k = 0; k < nFolds; k++) {
        Instances train = dummy.trainCV(nFolds, k);
        Instances test = dummy.testCV(nFolds, k);

        List<WeakLearner> cvLeaners = new ArrayList<WeakLearner>();
        for (WeakLearner learner : learners) {
            WeakLearner copy = learner.copyOf();
            for (int i = 0; i < test.numInstances(); i++) {
                copy.getPreds().remove(test.instance(i).value(0));
                copy.getProbs().remove(test.instance(i).value(0));
            }
            cvLeaners.add(copy);
        }

        // train on fold
        StrongLearner cv = this.ensemble.build(essaySet, ensembleName, cvLeaners);

        List<WeakLearner> testLeaners = new ArrayList<WeakLearner>();
        for (WeakLearner learner : cv.getLearners()) {
            WeakLearner copy = learner.copyOf();
            copy.getPreds().clear();
            copy.getProbs().clear();
            WeakLearner source = find(copy.getName(), learners);
            for (int i = 0; i < test.numInstances(); i++) {
                double id = test.instance(i).value(0);
                copy.getPreds().put(id, source.getPreds().get(id));
                copy.getProbs().put(id, source.getProbs().get(id));
            }
            testLeaners.add(copy);
        }

        preds.putAll(this.ensemble.classify(essaySet, ensembleName, testLeaners, cv.getContext()));
    }

    // now prepare final result

    StrongLearner strong = this.ensemble.build(essaySet, ensembleName, learners);

    double trainingError = strong.getKappa();
    double cvError = Calc.kappa(essaySet, preds, groundTruth);
    //   Job.log(essaySet+"-"+ensembleName, "XVAL: training error = " + trainingError + " cv error = " + cvError);      

    strong.setKappa(cvError);
    return strong;
}

From source file:jjj.asap.sas.parser.job.ImportParserData.java

License:Open Source License

private void process(final String parent, int essaySet, Map<Double, List<String>> tags,
        Map<Double, List<String>> parseTrees, Map<Double, List<String>> depends) {

    // check if output exists
    boolean any = false;

    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-extra-stats.arff"))
        any = true;//from  w  w  w .  j ava  2 s .com
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-pos-tags.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-parse-tree.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends0.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends1.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends2.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends3.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends4.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends5.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends6.arff"))
        any = true;

    if (!any) {
        Job.log("NOTE", "work/datasets/" + parent + "/" + essaySet
                + "-*.arff returns all required datasets - nothing to do");
        return;
    }

    // Load an existing dataset to use as a template.
    Instances dataset = Dataset.load("work/datasets/" + parent + "/" + essaySet + "-spell-checked.arff");

    // create the output datasets here. except for the extra statistics, 
    // the format is the same as 'dataset'.

    Instances tagsData = new Instances(dataset, 0);
    tagsData.setRelationName(essaySet + "-pos-tags.arff");
    Instances treeData = new Instances(dataset, 0);
    treeData.setRelationName(essaySet + "-parse-tree.arff");

    Instances dependsData[] = new Instances[7];
    for (int j = 0; j < 7; j++) {
        dependsData[j] = new Instances(dataset, 0);
        dependsData[j].setRelationName(essaySet + "-depends" + j + ".arff");
    }

    // extra stats
    DatasetBuilder builder = new DatasetBuilder();
    builder.addVariable("id");
    if (Contest.isMultiChoice(essaySet)) {
        builder.addNominalVariable("color", Contest.COLORS);
    }
    builder.addVariable("x_sent");
    builder.addVariable("x_para");
    builder.addVariable("x_length");
    builder.addVariable("x_words");
    builder.addVariable("x_unique_words");
    builder.addNominalVariable("score", Contest.getRubrics(essaySet));

    Instances extraStats = builder.getDataset(essaySet + "-extra-stats.arff");

    // now add rows for each instance

    for (int i = 0; i < dataset.numInstances(); i++) {

        // common variables
        Instance ob = dataset.instance(i);
        double id = ob.value(0);
        String y = ob.isMissing(dataset.numAttributes() - 1) ? null
                : ob.stringValue(dataset.numAttributes() - 1);
        String color = Contest.isMultiChoice(essaySet) ? ob.stringValue(dataset.attribute("color")) : null;
        String str = ob.stringValue(dataset.attribute("text"));

        //
        // Extra stats
        //

        int nSent = tags.containsKey(id) ? tags.get(id).size() : 0;
        int nPara = 0;
        for (int a = 0; a < str.length(); a++) {
            if (str.charAt(a) == '^')
                nPara++;
        }
        int nLength = str.length();
        int nWords = 0;
        int nUniqueWords = 0;
        String[] words = str.toLowerCase().split(" ");
        nWords = words.length;
        Set<String> u = new HashSet<String>();
        for (String w : words) {
            u.add(w);
        }
        nUniqueWords = u.size();

        extraStats.add(new DenseInstance(extraStats.numAttributes()));
        Instance extra = extraStats.lastInstance();
        extra.setValue(0, id);
        if (Contest.isMultiChoice(essaySet)) {
            extra.setValue(1, color);
        }

        extra.setValue(extraStats.attribute("x_sent"), nSent);
        extra.setValue(extraStats.attribute("x_para"), nPara);
        extra.setValue(extraStats.attribute("x_length"), nLength);
        extra.setValue(extraStats.attribute("x_words"), nWords);
        extra.setValue(extraStats.attribute("x_unique_words"), nUniqueWords);

        if (y == null)
            extra.setValue(extraStats.numAttributes() - 1, Utils.missingValue());
        else
            extra.setValue(extraStats.numAttributes() - 1, y);

        //
        // POS tags
        //

        String tagsText = "";
        List<String> tagsList = tags.get(id);
        if (tagsList == null || tagsList.isEmpty()) {
            Job.log("WARNING", "no tags for " + id);
            tagsText = "x";
        } else {
            for (String tagsItem : tagsList) {
                tagsText += tagsItem;
            }
        }

        tagsData.add(new DenseInstance(ob.numAttributes()));
        Instance tagsOb = tagsData.lastInstance();
        tagsOb.setValue(0, id);
        if (Contest.isMultiChoice(essaySet)) {
            tagsOb.setValue(1, color);
            tagsOb.setValue(2, tagsText.trim());
            if (y == null) {
                tagsOb.setValue(3, Utils.missingValue());
            } else {
                tagsOb.setValue(3, y);
            }
        } else {
            tagsOb.setValue(1, tagsText.trim());
            if (y == null) {
                tagsOb.setValue(2, Utils.missingValue());
            } else {
                tagsOb.setValue(2, y);
            }
        }

        //
        // Parse Tree
        //

        String treeText = "";
        List<String> treeList = parseTrees.get(id);
        if (treeList == null || treeList.isEmpty()) {
            Job.log("WARNING", "no parse tree for " + id);
            treeText = "x";
        } else {
            for (String treeItem : treeList) {
                treeText += treeItem;
            }
        }

        treeData.add(new DenseInstance(ob.numAttributes()));
        Instance treeOb = treeData.lastInstance();
        treeOb.setValue(0, id);
        if (Contest.isMultiChoice(essaySet)) {
            treeOb.setValue(1, color);
            treeOb.setValue(2, treeText.trim());
            if (y == null) {
                treeOb.setValue(3, Utils.missingValue());
            } else {
                treeOb.setValue(3, y);
            }
        } else {
            treeOb.setValue(1, treeText.trim());
            if (y == null) {
                treeOb.setValue(2, Utils.missingValue());
            } else {
                treeOb.setValue(2, y);
            }
        }

        //
        // Depends data
        //

        for (int j = 0; j < 7; j++) {

            String text = "";
            List<String> list = depends.get(id);
            if (list == null || list.isEmpty()) {
                Job.log("WARNING", "no depends for " + id);
                text = "x";
            } else {
                for (String item : list) {
                    String[] term = StringUtils.safeSplit(item, "/", 3);
                    switch (j) {
                    case 0:
                        text += item;
                        break;
                    case 1:
                        text += term[1] + "/" + term[2];
                        break;
                    case 2:
                        text += term[0] + "/" + term[2];
                        break;
                    case 3:
                        text += term[0] + "/" + term[1];
                        break;
                    case 4:
                        text += term[0];
                        break;
                    case 5:
                        text += term[1];
                        break;
                    case 6:
                        text += term[2];
                        break;
                    }
                    text += " ";
                }
            }

            dependsData[j].add(new DenseInstance(ob.numAttributes()));
            Instance dependsOb = dependsData[j].lastInstance();
            dependsOb.setValue(0, id);
            if (Contest.isMultiChoice(essaySet)) {
                dependsOb.setValue(1, color);
                dependsOb.setValue(2, text.trim());
                if (y == null) {
                    dependsOb.setValue(3, Utils.missingValue());
                } else {
                    dependsOb.setValue(3, y);
                }
            } else {
                dependsOb.setValue(1, text.trim());
                if (y == null) {
                    dependsOb.setValue(2, Utils.missingValue());
                } else {
                    dependsOb.setValue(2, y);
                }
            }

        } // j
    } // dataset

    // Now save the new datasets

    Dataset.save("work/datasets/" + parent + "/" + tagsData.relationName(), tagsData);
    Dataset.save("work/datasets/" + parent + "/" + treeData.relationName(), treeData);
    for (int j = 0; j < 7; j++) {
        Dataset.save("work/datasets/" + parent + "/" + dependsData[j].relationName(), dependsData[j]);
    }
    Dataset.save("work/datasets/" + parent + "/" + extraStats.relationName(), extraStats);

}

From source file:kea.KEAKeyphraseExtractor.java

License:Open Source License

/**
 * Builds the model from the files/*from  w  ww  .  j a  v a 2 s.c om*/
 */
public void extractKeyphrases(Hashtable stems) throws Exception {

    Vector stats = new Vector();

    // Check whether there is actually any data
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }

    FastVector atts = new FastVector(2);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    // Extract keyphrases
    Enumeration elem = stems.keys();
    while (elem.hasMoreElements()) {
        String str = (String) elem.nextElement();
        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            Reader is;
            if (!m_encoding.equals("default")) {
                is = new BomStrippingInputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new BomStrippingInputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("Can't read document " + str + ".txt");
            }
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            Reader is;
            if (!m_encoding.equals("default")) {
                is = new BomStrippingInputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new BomStrippingInputStreamReader(new FileInputStream(key));
            }
            StringBuffer keyStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }
            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("No keyphrases for stem " + str + ".");
            }
            newInst[1] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));
        m_KEAFilter.input(data.instance(0));
        data = data.stringFreeStructure();
        if (m_debug) {
            System.err.println("-- Document: " + str);
        }
        Instance[] topRankedInstances = new Instance[m_numPhrases];
        Instance inst;
        while ((inst = m_KEAFilter.output()) != null) {
            int index = (int) inst.value(m_KEAFilter.getRankIndex()) - 1;
            if (index < m_numPhrases) {
                topRankedInstances[index] = inst;
            }
        }
        if (m_debug) {
            System.err.println("-- Keyphrases and feature values:");
        }
        FileOutputStream out = null;
        PrintWriter printer = null;
        File key = new File(m_dirName + "/" + str + ".key");
        if (!key.exists()) {
            out = new FileOutputStream(m_dirName + "/" + str + ".key");
            if (!m_encoding.equals("default")) {
                printer = new PrintWriter(new OutputStreamWriter(out, m_encoding));
            } else {
                printer = new PrintWriter(out);
            }
        }
        double numExtracted = 0, numCorrect = 0;
        for (int i = 0; i < m_numPhrases; i++) {
            if (topRankedInstances[i] != null) {
                if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
                    numExtracted += 1.0;
                }
                if ((int) topRankedInstances[i]
                        .value(topRankedInstances[i].numAttributes() - 1) == topRankedInstances[i]
                                .attribute(topRankedInstances[i].numAttributes() - 1).indexOfValue("True")) {
                    numCorrect += 1.0;
                }
                if (printer != null) {
                    printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getUnstemmedPhraseIndex()));
                    if (m_AdditionalInfo) {
                        printer.print("\t");
                        printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getStemmedPhraseIndex()));
                        printer.print("\t");
                        printer.print(Utils.doubleToString(
                                topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex()), 4));
                    }
                    printer.println();
                }
                if (m_debug) {
                    System.err.println(topRankedInstances[i]);
                }
            }
        }
        if (numExtracted > 0) {
            if (m_debug) {
                System.err.println("-- " + numCorrect + " correct");
            }
            stats.addElement(new Double(numCorrect));
        }
        if (printer != null) {
            printer.flush();
            printer.close();
            out.close();
        }
    }
    double[] st = new double[stats.size()];
    for (int i = 0; i < stats.size(); i++) {
        st[i] = ((Double) stats.elementAt(i)).doubleValue();
    }
    double avg = Utils.mean(st);
    double stdDev = Math.sqrt(Utils.variance(st));
    System.err.println("Avg. number of correct keyphrases: " + Utils.doubleToString(avg, 2) + " +/- "
            + Utils.doubleToString(stdDev, 2));
    System.err.println("Based on " + stats.size() + " documents");
    m_KEAFilter.batchFinished();
}

From source file:kea.KEAModelBuilder.java

License:Open Source License

/**
 * Builds the model from the files//from  ww  w . j  a  v a  2 s .c o m
 */
public void buildModel(Hashtable stems) throws Exception {

    // Check whether there is actually any data
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }

    FastVector atts = new FastVector(2);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    // Build model
    m_KEAFilter = new KEAFilter();
    m_KEAFilter.setDebug(m_debug);
    m_KEAFilter.setDisallowInternalPeriods(getDisallowIPeriods());
    m_KEAFilter.setKFused(getUseKFrequency());
    m_KEAFilter.setMaxPhraseLength(getMaxPhraseLength());
    m_KEAFilter.setMinPhraseLength(getMinPhraseLength());
    m_KEAFilter.setMinNumOccur(getMinNumOccur());
    m_KEAFilter.setInputFormat(data);
    m_KEAFilter.setStemmer(getStemmer());
    m_KEAFilter.setStopwords(getStopwords());
    m_KEAFilter.setCheckForProperNouns(getCheckForProperNouns());
    Enumeration elem = stems.keys();
    while (elem.hasMoreElements()) {
        String str = (String) elem.nextElement();
        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            BufferedReader is;
            if (!m_encoding.equals("default")) {
                is = new BomStrippingInputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new BomStrippingInputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("Can't find document for stem " + str + ".");
            }
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            BufferedReader is;
            if (!m_encoding.equals("default")) {
                is = new BomStrippingInputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new BomStrippingInputStreamReader(new FileInputStream(key));
            }
            StringBuffer keyStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }
            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("Can't find keyphrases for stem " + str + ".");
            }
            newInst[1] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));
        m_KEAFilter.input(data.instance(0));
        data = data.stringFreeStructure();
    }
    m_KEAFilter.batchFinished();

    // Get rid of instances in filter
    while (m_KEAFilter.output() != null) {
    }
    ;
}

From source file:kea.main.KEAKeyphraseExtractor.java

License:Open Source License

/**
 * Builds the model from the files//  w  w w.ja  v a  2s  .co  m
 */
public synchronized void extractKeyphrases(Hashtable stems) throws Exception {

    Vector stats = new Vector();

    // Check whether there is actually any data
    // = if there any files in the directory
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }
    this.m_KEAFilter.setNumPhrases(m_numPhrases);
    this.m_KEAFilter.setVocabulary(m_vocabulary);
    this.m_KEAFilter.setVocabularyFormat(m_vocabularyFormat);
    this.m_KEAFilter.setDocumentLanguage(getDocumentLanguage());
    this.m_KEAFilter.setStemmer(m_Stemmer);
    this.m_KEAFilter.setStopwords(m_Stopwords);

    if (getVocabulary().equals("none")) {
        this.m_KEAFilter.m_NODEfeature = false;
    } else {
        // Know thesaurus is loaded in the constructor
        //m_KEAFilter.loadThesaurus(m_Stemmer, m_Stopwords, vocabularyDir, manager);
    }

    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    atts.addElement(new Attribute("filename", (String) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    if (this.m_KEAFilter.m_Dictionary == null) {
        buildGlobalDictionaries(stems);
    }

    System.out.println("-- Extracting Keyphrases... ");
    // Extract keyphrases
    Enumeration elem = stems.keys();
    // Enumeration over all files in the directory (now in the hash):
    while (elem.hasMoreElements()) {
        String str = (String) elem.nextElement();

        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            is.close();

            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());

        } catch (Exception e) {
            if (m_debug) {
                System.err.println("Can't read document " + str + ".txt");
            }
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(key));
            }
            StringBuffer keyStr = new StringBuffer();
            int c;

            // keyStr = keyphrases in the str.key file
            // Kea assumes, that these keyphrases were assigned by the
            // author
            // and evaluates extracted keyphrases againse these

            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }

            is.close();

            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("No existing keyphrases for stem " + str + ".");
            }
            newInst[1] = Instance.missingValue();
        }

        data.add(new Instance(1.0, newInst));

        this.m_KEAFilter.input(data.instance(0), vocabulary);

        data = data.stringFreeStructure();
        if (m_debug) {
            System.err.println("-- Document: " + str);
        }
        Instance[] topRankedInstances = new Instance[m_numPhrases];
        Instance inst;

        // Iterating over all extracted keyphrases (inst)
        while ((inst = this.m_KEAFilter.output()) != null) {

            int index = (int) inst.value(this.m_KEAFilter.getRankIndex()) - 1;

            if (index < m_numPhrases) {
                topRankedInstances[index] = inst;
            }
        }

        if (m_debug) {
            System.err.println("-- Keyphrases and feature values:");
        }
        FileOutputStream out = null;
        PrintWriter printer = null;
        File key = new File(m_dirName + "/" + str + ".key");
        if (!key.exists()) {
            out = new FileOutputStream(m_dirName + "/" + str + ".key");
            if (!m_encoding.equals("default")) {
                printer = new PrintWriter(new OutputStreamWriter(out, m_encoding));

            } else {
                printer = new PrintWriter(out);
            }
        }
        double numExtracted = 0, numCorrect = 0;

        for (int i = 0; i < m_numPhrases; i++) {
            if (topRankedInstances[i] != null) {
                if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
                    numExtracted += 1.0;
                }
                if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == 1) {
                    numCorrect += 1.0;
                }
                if (printer != null) {
                    printer.print(
                            topRankedInstances[i].stringValue(this.m_KEAFilter.getUnstemmedPhraseIndex()));

                    if (m_AdditionalInfo) {
                        printer.print("\t");
                        printer.print(
                                topRankedInstances[i].stringValue(this.m_KEAFilter.getStemmedPhraseIndex()));
                        printer.print("\t");
                        printer.print(Utils.doubleToString(
                                topRankedInstances[i].value(this.m_KEAFilter.getProbabilityIndex()), 4));
                    }
                    printer.println();
                }
                if (m_debug) {
                    System.err.println(topRankedInstances[i]);
                }
            }
        }
        if (numExtracted > 0) {
            if (m_debug) {
                System.err.println("-- " + numCorrect + " correct");
            }
            stats.addElement(new Double(numCorrect));
        }
        if (printer != null) {
            printer.flush();
            printer.close();
            out.close();
        }
    }
    double[] st = new double[stats.size()];
    for (int i = 0; i < stats.size(); i++) {
        st[i] = ((Double) stats.elementAt(i)).doubleValue();
    }
    double avg = Utils.mean(st);
    double stdDev = Math.sqrt(Utils.variance(st));

    System.out.println("Avg. number of matching keyphrases compared to existing ones : "
            + Utils.doubleToString(avg, 2) + " +/- " + Utils.doubleToString(stdDev, 2));
    System.out.println("Based on " + stats.size() + " documents");
    // m_KEAFilter.batchFinished();
}

From source file:lattice.Lattice.java

License:Open Source License

/**
 * Constructor of a lattice over the given variables of the dataset.
 * /*from  w  w  w .  j  a  v  a 2s .c  om*/
 * @param dataset
 */
public Lattice(Instances dataset) {

    // ~ initialise internal structure for counting (TID sets)
    this.nbInstances = dataset.numInstances();
    this.nbVariables = dataset.numAttributes();

    BitSet[][] presence = new BitSet[nbVariables][];

    TreeSet<Integer> allAttributesNumbers = new TreeSet<Integer>();
    int[] nbValuesForAttribute = new int[nbVariables];
    for (int a = 0; a < nbVariables; a++) {
        nbValuesForAttribute[a] = dataset.numDistinctValues(a) + 1; //+1 for missing
        presence[a] = new BitSet[nbValuesForAttribute[a]];
        allAttributesNumbers.add(a);
        for (int v = 0; v < presence[a].length; v++) {
            presence[a][v] = new BitSet();
        }
    }

    for (int i = 0; i < nbInstances; i++) {
        Instance row = dataset.instance(i);
        for (int a = 0; a < nbVariables; a++) {

            int indexOfValue;
            if (row.isMissing(a)) {
                //               indexOfValue = (int) dataset.meanOrMode(a);
                indexOfValue = dataset.numDistinctValues(a); //missing at the end
            } else {
                String value = row.stringValue(a);
                indexOfValue = row.attribute(a).indexOfValue(value);
            }
            presence[a][indexOfValue].set(i);

        }
    }

    // initialise the first nodes of the lattice (i.e., the ones
    // corresponding to single variables
    this.all = new LatticeNode(this, nbValuesForAttribute);
    this.singleNodes = new LatticeNode[nbVariables];
    for (int a = 0; a < nbVariables; a++) {
        int[] variablesNumbers = { a };
        LatticeNode node = new LatticeNode(this, variablesNumbers, nbValuesForAttribute, presence[a], all);
        singleNodes[a] = node;
    }

}

From source file:lector.Analizador.java

public static void clasificador() {

    BufferedReader reader1;/*from w  w  w .  j a  va 2  s .c o m*/
    BufferedReader reader2;
    try {
        reader1 = new BufferedReader(new FileReader("/Users/danieltapia/Google Drive/EPN/MAESTRIA/MSW128 BI/"
                + "proyecto/compartida/DataSetAnalisisSentimientos.arff"));

        reader2 = new BufferedReader(new FileReader("/Users/danieltapia/Google Drive/EPN/MAESTRIA/MSW128 BI/"
                + "proyecto/compartida/DataSetAnalisisSentimientos_inc.arff"));
        Instances train = new Instances(reader1);
        train.setClassIndex(train.numAttributes() - 1);
        System.out.println(train.classIndex() + " " + train.numAttributes());

        Instances test = new Instances(reader2);
        test.setClassIndex(train.numAttributes() - 1);
        System.out.println(test.classIndex() + " " + test.numAttributes());

        NaiveBayes model = new NaiveBayes();
        model.buildClassifier(train);

        //classify
        Instances labeled = new Instances(test);

        for (int i = 0; i < test.numInstances(); i++) {
            double clsLabel = model.classifyInstance(test.instance(i));
            labeled.instance(i).setClassValue(clsLabel);
        }

        // https://youtu.be/JY_x5zKTfyo?list=PLJbE6j2EG1pZnBhOg3_Rb63WLCprtyJag
        Evaluation eval_train = new Evaluation(test);
        eval_train.evaluateModel(model, test);

        reader1.close();
        reader2.close();

        //System.out.println(eval_train.toSummaryString("\nResults\n======\n", false));
        String[] options = new String[4];
        options[0] = "-t"; //name of training file
        options[1] = "/Users/danieltapia/Google Drive/EPN/MAESTRIA/MSW128 BI/proyecto/"
                + "compartida/DataSetAnalisisSentimientos.arff";
        options[2] = "-T";
        options[3] = "/Users/danieltapia/Google Drive/EPN/MAESTRIA/MSW128 BI/proyecto/"
                + "compartida/DataSetAnalisisSentimientos_inc.arff";
        System.out.println(Evaluation.evaluateModel(model, options));

        try ( // print classification results to file
                BufferedWriter writer = new BufferedWriter(
                        new FileWriter("/Users/danieltapia/Google Drive/EPN/MAESTRIA/MSW128 BI/"
                                + "proyecto/compartida/DataSetAnalisisSentimientos_labeled.arff"))) {
            writer.write(labeled.toString());
        }

    } catch (Exception e) {
    }
}

From source file:LeerArchivo.Leer.java

public String leerModelo() {
    try {/*w  ww .  j a v a 2 s  .  co  m*/
        String[] valoresAtributos = { "0", "1" };
        Classifier clasificador = (Classifier) weka.core.SerializationHelper.read("./KStar.model");
        ConverterUtils.DataSource source = new ConverterUtils.DataSource("./test.arff");
        Instances data = source.getDataSet();
        data.setClassIndex(5);
        System.out.println(data.instance(0));
        return valoresAtributos[(int) clasificador.classifyInstance(data.instance(0))];
    } catch (Exception ex) {
        Logger.getLogger(Leer.class.getName()).log(Level.SEVERE, null, ex);
    }
    return null;
}