List of usage examples for the weka.core.Instance method value
public double value(Attribute att);
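value(Attribute) returns the instance's value for the given attribute as a double: the stored number for a numeric attribute, or the zero-based index of the selected label for a nominal attribute. Before the examples from real projects below, here is a minimal sketch of the call itself (assuming the Weka 3.7+ API with DenseInstance; the attribute names are made up for illustration):

import java.util.ArrayList;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;

public class ValueDemo {
    public static void main(String[] args) {
        // Hypothetical two-attribute dataset: one numeric, one nominal.
        ArrayList<String> labels = new ArrayList<>();
        labels.add("red");
        labels.add("blue");
        Attribute length = new Attribute("length");       // numeric
        Attribute color = new Attribute("color", labels); // nominal
        ArrayList<Attribute> atts = new ArrayList<>();
        atts.add(length);
        atts.add(color);
        Instances data = new Instances("demo", atts, 0);

        Instance inst = new DenseInstance(2);
        inst.setDataset(data);
        inst.setValue(length, 4.2);
        inst.setValue(color, "blue");

        double len = inst.value(length); // 4.2
        double col = inst.value(color);  // 1.0 -- index of "blue" in the label list
        System.out.println(len + " " + col);
    }
}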
From source file:j48.C45Split.java
License:Open Source License
/**
 * Creates split on enumerated attribute.
 *
 * @exception Exception if something goes wrong
 */
private void handleEnumeratedAttribute(Instances trainInstances) throws Exception {

    Instance instance;

    m_distribution = new Distribution(m_complexityIndex, trainInstances.numClasses());

    // Only Instances with known values are relevant.
    Enumeration enu = trainInstances.enumerateInstances();
    while (enu.hasMoreElements()) {
        instance = (Instance) enu.nextElement();
        if (!instance.isMissing(m_attIndex))
            m_distribution.add((int) instance.value(m_attIndex), instance);
    }

    // Check if minimum number of Instances in at least two
    // subsets.
    if (m_distribution.check(m_minNoObj)) {
        m_numSubsets = m_complexityIndex;
        m_infoGain = infoGainCrit.splitCritValue(m_distribution, m_sumOfWeights);
        m_gainRatio = gainRatioCrit.splitCritValue(m_distribution, m_sumOfWeights, m_infoGain);
    }
}
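The isMissing() guard in this example matters whenever value() feeds an integer cast: Weka represents a missing value internally as NaN, and (int) Double.NaN evaluates to 0 in Java, so an unguarded cast would silently count the instance under the first nominal label. A stand-alone sketch of the same guard (the helper class and method names are illustrative, not from the source):

import weka.core.Instance;
import weka.core.Instances;

public class MissingGuard {
    // Illustrative helper: sums a numeric column, skipping missing entries.
    // value() returns NaN for a missing value, so the guard keeps NaN
    // (or a bogus 0 after an int cast) out of the aggregate.
    static double columnSum(Instances data, int attIndex) {
        double sum = 0;
        for (int i = 0; i < data.numInstances(); i++) {
            Instance inst = data.instance(i);
            if (!inst.isMissing(attIndex)) {
                sum += inst.value(attIndex);
            }
        }
        return sum;
    }
}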
From source file:j48.C45Split.java
License:Open Source License
/**
 * Sets split point to greatest value in given data smaller or equal to old
 * split point. (C4.5 does this for some strange reason.)
 */
public final void setSplitPoint(Instances allInstances) {

    double newSplitPoint = -Double.MAX_VALUE;
    double tempValue;
    Instance instance;

    if ((allInstances.attribute(m_attIndex).isNumeric()) && (m_numSubsets > 1)) {
        Enumeration enu = allInstances.enumerateInstances();
        while (enu.hasMoreElements()) {
            instance = (Instance) enu.nextElement();
            if (!instance.isMissing(m_attIndex)) {
                tempValue = instance.value(m_attIndex);
                if (Utils.gr(tempValue, newSplitPoint) && Utils.smOrEq(tempValue, m_splitPoint))
                    newSplitPoint = tempValue;
            }
        }
        m_splitPoint = newSplitPoint;
    }
}
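Utils.gr and Utils.smOrEq (from weka.core.Utils) are Weka's tolerance-based double comparisons, so the scan above tracks the largest observed value that does not exceed the current split point without being tripped up by floating-point noise. A stand-alone sketch of the same scan over a plain double[] (the class and method names are illustrative):

import weka.core.Utils;

public class SplitSnap {
    // Illustrative stand-alone version of the scan: returns the greatest
    // observed value that is <= splitPoint, or -Double.MAX_VALUE if none.
    static double snapToData(double[] values, double splitPoint) {
        double newSplitPoint = -Double.MAX_VALUE;
        for (double v : values) {
            if (Utils.gr(v, newSplitPoint) && Utils.smOrEq(v, splitPoint)) {
                newSplitPoint = v;
            }
        }
        return newSplitPoint;
    }

    public static void main(String[] args) {
        double[] vals = { 1.0, 2.5, 3.1, 4.8 };
        System.out.println(snapToData(vals, 3.5)); // 3.1
    }
}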
From source file:j48.C45Split.java
License:Open Source License
/**
 * Returns index of subset instance is assigned to. Returns -1 if instance
 * is assigned to more than one subset.
 *
 * @exception Exception if something goes wrong
 */
public final int whichSubset(Instance instance) throws Exception {

    if (instance.isMissing(m_attIndex))
        return -1;
    else {
        if (instance.attribute(m_attIndex).isNominal())
            return (int) instance.value(m_attIndex);
        else if (Utils.smOrEq(instance.value(m_attIndex), m_splitPoint))
            return 0;
        else
            return 1;
    }
}
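The cast (int) instance.value(m_attIndex) works because, for a nominal attribute, value() stores the zero-based index of the instance's label as an exact double; the same fact lets GraftSplit (next example) compare the value directly with ==. A minimal round-trip sketch (the attribute setup is made up for illustration):

import java.util.ArrayList;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;

public class NominalIndexDemo {
    public static void main(String[] args) {
        // Hypothetical nominal attribute with three labels.
        ArrayList<String> labels = new ArrayList<>();
        labels.add("low");
        labels.add("medium");
        labels.add("high");
        ArrayList<Attribute> atts = new ArrayList<>();
        atts.add(new Attribute("grade", labels));
        Instances data = new Instances("demo", atts, 0);

        Instance inst = new DenseInstance(1);
        inst.setDataset(data);
        inst.setValue(0, "medium");

        int idx = (int) inst.value(0);               // 1
        String label = inst.attribute(0).value(idx); // "medium"
        System.out.println(idx + " -> " + label);
    }
}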
From source file:j48.GraftSplit.java
License:Open Source License
/**
 * @param instance the instance for which to determine the subset
 * @return an int indicating the subset this instance belongs to
 */
public int whichSubset(Instance instance) {

    if (instance.isMissing(m_attIndex))
        return -1;

    if (instance.attribute(m_attIndex).isNominal()) {
        // in the case of nominal, m_splitPoint is the = value, all else is !=
        if (instance.value(m_attIndex) == m_splitPoint)
            return 0;
        else
            return 1;
    } else {
        if (Utils.smOrEq(instance.value(m_attIndex), m_splitPoint))
            return 0;
        else
            return 1;
    }
}
From source file:javaapplication1.JavaApplication1.java
/**
 * @param args the command line arguments
 */
public static void main(String[] args) throws Exception {
    int numInstances = 20000;
    double curr_acc = 0;
    double curr_oba = 0;
    double curr_NB = 0;
    double[] acc_adacc = new double[numInstances];
    double[] acc_oba = new double[numInstances];
    double[] acc_NB = new double[numInstances];
    double[] pred_NB = new double[numInstances];
    Instance[] window = new Instance[numInstances];

    Classifier tr = new DecisionStump();
    Classifier lbay = new NaiveBayes();
    Classifier learner = new HoeffdingTree();
    ADWIN adw = new ADWIN(0.1);
    Classifier knn = new kNN();
    Classifier adacc = new ADACC();
    Classifier oba = new OzaBagAdwin();
    //Evaluator ev = new Accuracy();
    Classifier rep = new repro();
    Classifier tut = new DecisionStumpTutorial();
    //J48 tree = new J48();
    SMO svm = new SMO();

    ArffFileStream readfile = new ArffFileStream();
    readfile.arffFileOption.setValue("E:/PhD/data/SEA/comb_short.arff");
    readfile.prepareForUse();

    lbay.setModelContext(readfile.getHeader());
    learner.setModelContext(readfile.getHeader());
    tut.setModelContext(readfile.getHeader());
    knn.setModelContext(readfile.getHeader());
    adacc.setModelContext(readfile.getHeader());
    oba.setModelContext(readfile.getHeader());

    learner.prepareForUse();
    lbay.prepareForUse();
    tut.prepareForUse();
    knn.prepareForUse();
    adacc.prepareForUse();
    oba.prepareForUse();

    int numberSamplesCorrect = 0;
    int numberSamplesCorrectN = 0;
    int numberSamplesCorrectTR = 0;
    int numberSamplesCorrectknn = 0;
    int numberSamplesCorrectadacc = 0;
    int numberSamplesCorrectoba = 0;
    int numberSamples = 0;
    boolean isTesting = true;

    while (readfile.hasMoreInstances() && numberSamples < numInstances) {
        Instance trainInst = readfile.nextInstance();
        if (isTesting) {
            // ADWIN drift detection:
            // if a change is detected, the learners are trained
            if (adw.setInput(trainInst.value(0))) { // feed data into ADWIN
                System.out.println("Change Detected at " + numberSamples);
                System.out.println("window width is " + adw.getWidth());
                oba.trainOnInstance(trainInst);
                adacc.trainOnInstance(trainInst);
                lbay.trainOnInstance(trainInst);
            }
            if (learner.correctlyClassifies(trainInst)) {
                numberSamplesCorrect++;
            }
            if (lbay.correctlyClassifies(trainInst)) {
                numberSamplesCorrectN++;
                curr_NB = 1;
            } else {
                curr_NB = 0;
            }
            if (tut.correctlyClassifies(trainInst)) {
                numberSamplesCorrectTR++;
            }
            if (knn.correctlyClassifies(trainInst)) {
                numberSamplesCorrectknn++;
            }
            if (adacc.correctlyClassifies(trainInst)) {
                numberSamplesCorrectadacc++;
                curr_acc = 1;
            } else {
                curr_acc = 0;
            }
            if (oba.correctlyClassifies(trainInst)) {
                numberSamplesCorrectoba++;
                curr_oba = 1;
            } else {
                curr_oba = 0;
            }
        }
        if (numberSamples == 0) {
            acc_adacc[numberSamples] = curr_acc;
            acc_oba[numberSamples] = curr_oba;
            acc_NB[numberSamples] = curr_NB;
        }
        if (numberSamples > 0) {
            acc_adacc[numberSamples] = acc_adacc[numberSamples - 1]
                    + ((curr_acc - acc_adacc[numberSamples - 1]) / numberSamples);
            acc_oba[numberSamples] = acc_oba[numberSamples - 1]
                    + ((curr_oba - acc_oba[numberSamples - 1]) / numberSamples);
            acc_NB[numberSamples] = acc_NB[numberSamples - 1]
                    + ((curr_NB - acc_NB[numberSamples - 1]) / numberSamples);
        }
        numberSamples++;
        if (numberSamples < 5000) {
            oba.trainOnInstance(trainInst);
            adacc.trainOnInstance(trainInst);
            lbay.trainOnInstance(trainInst);
        }
        knn.trainOnInstance(trainInst);
        lbay.trainOnInstance(trainInst);
        learner.trainOnInstance(trainInst);
        tut.trainOnInstance(trainInst);
    }

    double accuracy = 100.0 * (double) numberSamplesCorrect / (double) numberSamples;
    System.out.println(numberSamples + " instances processed with HoeffdingTree " + accuracy + "% accuracy");
    double accuracyN = 100.0 * (double) numberSamplesCorrectN / (double) numberSamples;
    System.out.println(numberSamples + " instances processed with NaiveBayes " + accuracyN + "% accuracy");
    double accuracyTR = 100.0 * (double) numberSamplesCorrectTR / (double) numberSamples;
    System.out.println(numberSamples + " instances processed with DecisionStump " + accuracyTR + "% accuracy");
    double accuracyADACC = 100.0 * (double) numberSamplesCorrectadacc / (double) numberSamples;
    System.out.println(numberSamples + " instances processed with ADACC " + accuracyADACC + "% accuracy");
    double accuracyoba = 100.0 * (double) numberSamplesCorrectoba / (double) numberSamples;
    System.out.println(numberSamples + " instances processed with OzaBagAdwin " + accuracyoba + "% accuracy");
    double accuracyknn = 100.0 * (double) numberSamplesCorrectknn / (double) numberSamples;
    System.out.println(numberSamples + " instances processed with kNN " + accuracyknn + "% accuracy");

    //System.out.println("Mean:" + adw.getEstimation());
    //System.out.println("Variance:" + adw.getVariance());
    //System.out.println("Stand. dev:" + Math.sqrt(adw.getVariance()));
    System.out.println("Number of ADWIN drift detections: " + adw.getNumberDetections());

    String s = Arrays.toString(acc_adacc);
    s = s.substring(1, s.length() - 2);
    String s2 = Arrays.toString(pred_NB);
    s2 = s2.substring(1, s2.length() - 1);
    String sNB = Arrays.toString(acc_NB);
    sNB = sNB.substring(1, sNB.length() - 1);

    String csv = "E:/PhD/data/SEA/accres.csv";
    CSVWriter writer = new CSVWriter(new FileWriter(csv));
    List<String[]> data = new ArrayList<>();
    data.add(new String[] { s });
    //data.add(new String[] { s2 });
    writer.writeAll(data);
    System.out.println("CSV written successfully.");
    writer.close();
}
From source file:jjj.asap.sas.parser.job.ImportParserData.java
License:Open Source License
private void process(final String parent, int essaySet, Map<Double, List<String>> tags,
        Map<Double, List<String>> parseTrees, Map<Double, List<String>> depends) {

    // check if output exists
    boolean any = false;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-extra-stats.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-pos-tags.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-parse-tree.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends0.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends1.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends2.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends3.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends4.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends5.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends6.arff"))
        any = true;

    if (!any) {
        Job.log("NOTE", "work/datasets/" + parent + "/" + essaySet
                + "-*.arff returns all required datasets - nothing to do");
        return;
    }

    // Load an existing dataset to use as a template.
    Instances dataset = Dataset.load("work/datasets/" + parent + "/" + essaySet + "-spell-checked.arff");

    // create the output datasets here. except for the extra statistics,
    // the format is the same as 'dataset'.
    Instances tagsData = new Instances(dataset, 0);
    tagsData.setRelationName(essaySet + "-pos-tags.arff");
    Instances treeData = new Instances(dataset, 0);
    treeData.setRelationName(essaySet + "-parse-tree.arff");
    Instances dependsData[] = new Instances[7];
    for (int j = 0; j < 7; j++) {
        dependsData[j] = new Instances(dataset, 0);
        dependsData[j].setRelationName(essaySet + "-depends" + j + ".arff");
    }

    // extra stats
    DatasetBuilder builder = new DatasetBuilder();
    builder.addVariable("id");
    if (Contest.isMultiChoice(essaySet)) {
        builder.addNominalVariable("color", Contest.COLORS);
    }
    builder.addVariable("x_sent");
    builder.addVariable("x_para");
    builder.addVariable("x_length");
    builder.addVariable("x_words");
    builder.addVariable("x_unique_words");
    builder.addNominalVariable("score", Contest.getRubrics(essaySet));
    Instances extraStats = builder.getDataset(essaySet + "-extra-stats.arff");

    // now add rows for each instance
    for (int i = 0; i < dataset.numInstances(); i++) {

        // common variables
        Instance ob = dataset.instance(i);
        double id = ob.value(0);
        String y = ob.isMissing(dataset.numAttributes() - 1) ? null
                : ob.stringValue(dataset.numAttributes() - 1);
        String color = Contest.isMultiChoice(essaySet) ? ob.stringValue(dataset.attribute("color")) : null;
        String str = ob.stringValue(dataset.attribute("text"));

        //
        // Extra stats
        //
        int nSent = tags.containsKey(id) ? tags.get(id).size() : 0;
        int nPara = 0;
        for (int a = 0; a < str.length(); a++) {
            if (str.charAt(a) == '^')
                nPara++;
        }
        int nLength = str.length();
        int nWords = 0;
        int nUniqueWords = 0;
        String[] words = str.toLowerCase().split(" ");
        nWords = words.length;
        Set<String> u = new HashSet<String>();
        for (String w : words) {
            u.add(w);
        }
        nUniqueWords = u.size();

        extraStats.add(new DenseInstance(extraStats.numAttributes()));
        Instance extra = extraStats.lastInstance();
        extra.setValue(0, id);
        if (Contest.isMultiChoice(essaySet)) {
            extra.setValue(1, color);
        }
        extra.setValue(extraStats.attribute("x_sent"), nSent);
        extra.setValue(extraStats.attribute("x_para"), nPara);
        extra.setValue(extraStats.attribute("x_length"), nLength);
        extra.setValue(extraStats.attribute("x_words"), nWords);
        extra.setValue(extraStats.attribute("x_unique_words"), nUniqueWords);
        if (y == null)
            extra.setValue(extraStats.numAttributes() - 1, Utils.missingValue());
        else
            extra.setValue(extraStats.numAttributes() - 1, y);

        //
        // POS tags
        //
        String tagsText = "";
        List<String> tagsList = tags.get(id);
        if (tagsList == null || tagsList.isEmpty()) {
            Job.log("WARNING", "no tags for " + id);
            tagsText = "x";
        } else {
            for (String tagsItem : tagsList) {
                tagsText += tagsItem;
            }
        }
        tagsData.add(new DenseInstance(ob.numAttributes()));
        Instance tagsOb = tagsData.lastInstance();
        tagsOb.setValue(0, id);
        if (Contest.isMultiChoice(essaySet)) {
            tagsOb.setValue(1, color);
            tagsOb.setValue(2, tagsText.trim());
            if (y == null) {
                tagsOb.setValue(3, Utils.missingValue());
            } else {
                tagsOb.setValue(3, y);
            }
        } else {
            tagsOb.setValue(1, tagsText.trim());
            if (y == null) {
                tagsOb.setValue(2, Utils.missingValue());
            } else {
                tagsOb.setValue(2, y);
            }
        }

        //
        // Parse Tree
        //
        String treeText = "";
        List<String> treeList = parseTrees.get(id);
        if (treeList == null || treeList.isEmpty()) {
            Job.log("WARNING", "no parse tree for " + id);
            treeText = "x";
        } else {
            for (String treeItem : treeList) {
                treeText += treeItem;
            }
        }
        treeData.add(new DenseInstance(ob.numAttributes()));
        Instance treeOb = treeData.lastInstance();
        treeOb.setValue(0, id);
        if (Contest.isMultiChoice(essaySet)) {
            treeOb.setValue(1, color);
            treeOb.setValue(2, treeText.trim());
            if (y == null) {
                treeOb.setValue(3, Utils.missingValue());
            } else {
                treeOb.setValue(3, y);
            }
        } else {
            treeOb.setValue(1, treeText.trim());
            if (y == null) {
                treeOb.setValue(2, Utils.missingValue());
            } else {
                treeOb.setValue(2, y);
            }
        }

        //
        // Depends data
        //
        for (int j = 0; j < 7; j++) {
            String text = "";
            List<String> list = depends.get(id);
            if (list == null || list.isEmpty()) {
                Job.log("WARNING", "no depends for " + id);
                text = "x";
            } else {
                for (String item : list) {
                    String[] term = StringUtils.safeSplit(item, "/", 3);
                    switch (j) {
                    case 0:
                        text += item;
                        break;
                    case 1:
                        text += term[1] + "/" + term[2];
                        break;
                    case 2:
                        text += term[0] + "/" + term[2];
                        break;
                    case 3:
                        text += term[0] + "/" + term[1];
                        break;
                    case 4:
                        text += term[0];
                        break;
                    case 5:
                        text += term[1];
                        break;
                    case 6:
                        text += term[2];
                        break;
                    }
                    text += " ";
                }
            }
            dependsData[j].add(new DenseInstance(ob.numAttributes()));
            Instance dependsOb = dependsData[j].lastInstance();
            dependsOb.setValue(0, id);
            if (Contest.isMultiChoice(essaySet)) {
                dependsOb.setValue(1, color);
                dependsOb.setValue(2, text.trim());
                if (y == null) {
                    dependsOb.setValue(3, Utils.missingValue());
                } else {
                    dependsOb.setValue(3, y);
                }
            } else {
                dependsOb.setValue(1, text.trim());
                if (y == null) {
                    dependsOb.setValue(2, Utils.missingValue());
                } else {
                    dependsOb.setValue(2, y);
                }
            }
        } // j

    } // dataset

    // Now save the new datasets
    Dataset.save("work/datasets/" + parent + "/" + tagsData.relationName(), tagsData);
    Dataset.save("work/datasets/" + parent + "/" + treeData.relationName(), treeData);
    for (int j = 0; j < 7; j++) {
        Dataset.save("work/datasets/" + parent + "/" + dependsData[j].relationName(), dependsData[j]);
    }
    Dataset.save("work/datasets/" + parent + "/" + extraStats.relationName(), extraStats);
}
From source file:kea.KEAFilter.java
License:Open Source License
/**
 * Converts an instance.
 */
private FastVector convertInstance(Instance instance, boolean training) throws Exception {

    FastVector vector = new FastVector();

    if (m_Debug) {
        System.err.println("-- Converting instance");
    }

    // Get the key phrases for the document
    HashMap hashKeyphrases = null;
    HashMap hashKeysEval = null;
    if (!instance.isMissing(m_KeyphrasesAtt)) {
        String keyphrases = instance.stringValue(m_KeyphrasesAtt);
        hashKeyphrases = getGivenKeyphrases(keyphrases, false);
        hashKeysEval = getGivenKeyphrases(keyphrases, true);
    }

    // Get the phrases for the document
    HashMap hash = new HashMap();
    int length = getPhrases(hash, instance.stringValue(m_DocumentAtt));

    // Compute number of extra attributes
    int numFeatures = 5;
    if (m_Debug) {
        if (m_KFused) {
            numFeatures = numFeatures + 1;
        }
    }

    // Set indices of key attributes
    int phraseAttIndex = m_DocumentAtt;
    int tfidfAttIndex = m_DocumentAtt + 2;
    int distAttIndex = m_DocumentAtt + 3;
    int probsAttIndex = m_DocumentAtt + numFeatures - 1;

    // Go through the phrases and convert them into instances
    Iterator it = hash.keySet().iterator();
    while (it.hasNext()) {
        String phrase = (String) it.next();
        FastVector phraseInfo = (FastVector) hash.get(phrase);
        double[] vals = featVals(phrase, phraseInfo, training, hashKeysEval, hashKeyphrases, length);
        Instance inst = new Instance(instance.weight(), vals);
        inst.setDataset(m_ClassifierData);

        // Get probability of phrase being key phrase
        double[] probs = m_Classifier.distributionForInstance(inst);
        double prob = probs[1];

        // Compute attribute values for final instance
        double[] newInst = new double[instance.numAttributes() + numFeatures];
        int pos = 0;
        for (int i = 0; i < instance.numAttributes(); i++) {
            if (i == m_DocumentAtt) {

                // Add phrase
                int index = outputFormatPeek().attribute(pos).addStringValue(phrase);
                newInst[pos++] = index;

                // Add original version
                index = outputFormatPeek().attribute(pos).addStringValue((String) phraseInfo.elementAt(2));
                newInst[pos++] = index;

                // Add TFxIDF
                newInst[pos++] = inst.value(m_TfidfIndex);

                // Add distance
                newInst[pos++] = inst.value(m_FirstOccurIndex);

                // Add other features
                if (m_Debug) {
                    if (m_KFused) {
                        newInst[pos++] = inst.value(m_KeyFreqIndex);
                    }
                }

                // Add probability
                probsAttIndex = pos;
                newInst[pos++] = prob;

                // Set rank to missing (computed below)
                newInst[pos++] = Instance.missingValue();
            } else if (i == m_KeyphrasesAtt) {
                newInst[pos++] = inst.classValue();
            } else {
                newInst[pos++] = instance.value(i);
            }
        }
        Instance ins = new Instance(instance.weight(), newInst);
        ins.setDataset(outputFormatPeek());
        vector.addElement(ins);
    }

    // Add dummy instances for keyphrases that don't occur
    // in the document
    if (hashKeysEval != null) {
        Iterator phrases = hashKeysEval.keySet().iterator();
        while (phrases.hasNext()) {
            String phrase = (String) phrases.next();
            double[] newInst = new double[instance.numAttributes() + numFeatures];
            int pos = 0;
            for (int i = 0; i < instance.numAttributes(); i++) {
                if (i == m_DocumentAtt) {

                    // Add phrase
                    int index = outputFormatPeek().attribute(pos).addStringValue(phrase);
                    newInst[pos++] = (double) index;

                    // Add original version
                    index = outputFormatPeek().attribute(pos).addStringValue((String) hashKeysEval.get(phrase));
                    newInst[pos++] = (double) index;

                    // Add TFxIDF
                    newInst[pos++] = Instance.missingValue();

                    // Add distance
                    newInst[pos++] = Instance.missingValue();

                    // Add other features
                    if (m_Debug) {
                        if (m_KFused) {
                            newInst[pos++] = Instance.missingValue();
                        }
                    }

                    // Add probability and rank
                    newInst[pos++] = -Double.MAX_VALUE;
                    newInst[pos++] = Instance.missingValue();
                } else if (i == m_KeyphrasesAtt) {
                    newInst[pos++] = 1; // Keyphrase
                } else {
                    newInst[pos++] = instance.value(i);
                }
            }
            Instance inst = new Instance(instance.weight(), newInst);
            inst.setDataset(outputFormatPeek());
            vector.addElement(inst);
        }
    }

    // Sort phrases according to their distance (stable sort)
    double[] vals = new double[vector.size()];
    for (int i = 0; i < vals.length; i++) {
        vals[i] = ((Instance) vector.elementAt(i)).value(distAttIndex);
    }
    FastVector newVector = new FastVector(vector.size());
    int[] sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their tfxidf value (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = -((Instance) vector.elementAt(i)).value(tfidfAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their probability (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = 1 - ((Instance) vector.elementAt(i)).value(probsAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Compute rank of phrases. Check for subphrases that are ranked
    // lower than superphrases and assign probability -1 and set the
    // rank to Integer.MAX_VALUE
    int rank = 1;
    for (int i = 0; i < vals.length; i++) {
        Instance currentInstance = (Instance) vector.elementAt(i);

        // Short cut: if phrase very unlikely make rank very low and continue
        if (Utils.grOrEq(vals[i], 1.0)) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
            continue;
        }

        // Otherwise look for super phrase starting with first phrase
        // in list that has same probability, TFxIDF value, and distance as
        // current phrase. We do this to catch all superphrases
        // that have same probability, TFxIDF value and distance as current phrase.
        int startInd = i;
        while (startInd < vals.length) {
            Instance inst = (Instance) vector.elementAt(startInd);
            if ((inst.value(tfidfAttIndex) != currentInstance.value(tfidfAttIndex))
                    || (inst.value(probsAttIndex) != currentInstance.value(probsAttIndex))
                    || (inst.value(distAttIndex) != currentInstance.value(distAttIndex))) {
                break;
            }
            startInd++;
        }
        String val = currentInstance.stringValue(phraseAttIndex);
        boolean foundSuperphrase = false;
        for (int j = startInd - 1; j >= 0; j--) {
            if (j != i) {
                Instance candidate = (Instance) vector.elementAt(j);
                String potSuperphrase = candidate.stringValue(phraseAttIndex);
                if (val.length() <= potSuperphrase.length()) {
                    if (KEAFilter.contains(val, potSuperphrase)) {
                        foundSuperphrase = true;
                        break;
                    }
                }
            }
        }
        if (foundSuperphrase) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
        } else {
            currentInstance.setValue(probsAttIndex + 1, rank++);
        }
    }
    return vector;
}
From source file:kea.KEAKeyphraseExtractor.java
License:Open Source License
/**
 * Builds the model from the files
 */
public void extractKeyphrases(Hashtable stems) throws Exception {

    Vector stats = new Vector();

    // Check whether there is actually any data
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }

    FastVector atts = new FastVector(2);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    // Extract keyphrases
    Enumeration elem = stems.keys();
    while (elem.hasMoreElements()) {
        String str = (String) elem.nextElement();
        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            Reader is;
            if (!m_encoding.equals("default")) {
                is = new BomStrippingInputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new BomStrippingInputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("Can't read document " + str + ".txt");
            }
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            Reader is;
            if (!m_encoding.equals("default")) {
                is = new BomStrippingInputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new BomStrippingInputStreamReader(new FileInputStream(key));
            }
            StringBuffer keyStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }
            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("No keyphrases for stem " + str + ".");
            }
            newInst[1] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));
        m_KEAFilter.input(data.instance(0));
        data = data.stringFreeStructure();
        if (m_debug) {
            System.err.println("-- Document: " + str);
        }
        Instance[] topRankedInstances = new Instance[m_numPhrases];
        Instance inst;
        while ((inst = m_KEAFilter.output()) != null) {
            int index = (int) inst.value(m_KEAFilter.getRankIndex()) - 1;
            if (index < m_numPhrases) {
                topRankedInstances[index] = inst;
            }
        }
        if (m_debug) {
            System.err.println("-- Keyphrases and feature values:");
        }
        FileOutputStream out = null;
        PrintWriter printer = null;
        File key = new File(m_dirName + "/" + str + ".key");
        if (!key.exists()) {
            out = new FileOutputStream(m_dirName + "/" + str + ".key");
            if (!m_encoding.equals("default")) {
                printer = new PrintWriter(new OutputStreamWriter(out, m_encoding));
            } else {
                printer = new PrintWriter(out);
            }
        }
        double numExtracted = 0, numCorrect = 0;
        for (int i = 0; i < m_numPhrases; i++) {
            if (topRankedInstances[i] != null) {
                if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
                    numExtracted += 1.0;
                }
                if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1)
                        == topRankedInstances[i].attribute(topRankedInstances[i].numAttributes() - 1)
                                .indexOfValue("True")) {
                    numCorrect += 1.0;
                }
                if (printer != null) {
                    printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getUnstemmedPhraseIndex()));
                    if (m_AdditionalInfo) {
                        printer.print("\t");
                        printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getStemmedPhraseIndex()));
                        printer.print("\t");
                        printer.print(Utils.doubleToString(
                                topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex()), 4));
                    }
                    printer.println();
                }
                if (m_debug) {
                    System.err.println(topRankedInstances[i]);
                }
            }
        }
        if (numExtracted > 0) {
            if (m_debug) {
                System.err.println("-- " + numCorrect + " correct");
            }
            stats.addElement(new Double(numCorrect));
        }
        if (printer != null) {
            printer.flush();
            printer.close();
            out.close();
        }
    }
    double[] st = new double[stats.size()];
    for (int i = 0; i < stats.size(); i++) {
        st[i] = ((Double) stats.elementAt(i)).doubleValue();
    }
    double avg = Utils.mean(st);
    double stdDev = Math.sqrt(Utils.variance(st));
    System.err.println("Avg. number of correct keyphrases: " + Utils.doubleToString(avg, 2) + " +/- "
            + Utils.doubleToString(stdDev, 2));
    System.err.println("Based on " + stats.size() + " documents");
    m_KEAFilter.batchFinished();
}
From source file:kea.KEAPhraseFilter.java
License:Open Source License
/**
 * Converts an instance by removing all non-alphanumeric characters
 * from its string attribute values.
 */
private void convertInstance(Instance instance) throws Exception {

    double[] instVals = new double[instance.numAttributes()];

    for (int i = 0; i < instance.numAttributes(); i++) {
        if (!instance.attribute(i).isString() || instance.isMissing(i)) {
            instVals[i] = instance.value(i);
        } else {
            if (!m_SelectCols.isInRange(i)) {
                int index = getOutputFormat().attribute(i).addStringValue(instance.stringValue(i));
                instVals[i] = (double) index;
                continue;
            }
            String str = instance.stringValue(i);

            StringBuffer resultStr = new StringBuffer();
            int j = 0;
            boolean phraseStart = true;
            boolean seenNewLine = false;
            boolean haveSeenHyphen = false;
            boolean haveSeenSlash = false;
            while (j < str.length()) {
                boolean isWord = false;
                boolean potNumber = false;
                int startj = j;
                while (j < str.length()) {
                    char ch = str.charAt(j);
                    if (Character.isLetterOrDigit(ch)) {
                        potNumber = true;
                        if (Character.isLetter(ch)) {
                            isWord = true;
                        }
                        j++;
                    } else if ((!m_DisallowInternalPeriods && (ch == '.')) || (ch == '@') || (ch == '_')
                            || (ch == '&') || (ch == '/') || (ch == '-')) {
                        if ((j > 0) && (j + 1 < str.length()) && Character.isLetterOrDigit(str.charAt(j - 1))
                                && Character.isLetterOrDigit(str.charAt(j + 1))) {
                            j++;
                        } else {
                            break;
                        }
                    } else if (ch == '\'') {
                        if ((j > 0) && Character.isLetterOrDigit(str.charAt(j - 1))) {
                            j++;
                        } else {
                            break;
                        }
                    } else {
                        break;
                    }
                }
                if (isWord == true) {
                    if (!phraseStart) {
                        if (haveSeenHyphen) {
                            resultStr.append('-');
                        } else if (haveSeenSlash) {
                            resultStr.append('/');
                        } else {
                            resultStr.append(' ');
                        }
                    }
                    resultStr.append(str.substring(startj, j));
                    if (j == str.length()) {
                        break;
                    }
                    phraseStart = false;
                    seenNewLine = false;
                    haveSeenHyphen = false;
                    haveSeenSlash = false;
                    if (Character.isWhitespace(str.charAt(j))) {
                        if (str.charAt(j) == '\n') {
                            seenNewLine = true;
                        }
                    } else if (str.charAt(j) == '-') {
                        haveSeenHyphen = true;
                    } else if (str.charAt(j) == '/') {
                        haveSeenSlash = true;
                    } else {
                        phraseStart = true;
                        resultStr.append('\n');
                    }
                    j++;
                } else if (j == str.length()) {
                    break;
                } else if (str.charAt(j) == '\n') {
                    if (seenNewLine) {
                        if (phraseStart == false) {
                            resultStr.append('\n');
                            phraseStart = true;
                        }
                    } else if (potNumber) {
                        if (phraseStart == false) {
                            phraseStart = true;
                            resultStr.append('\n');
                        }
                    }
                    seenNewLine = true;
                    j++;
                } else if (Character.isWhitespace(str.charAt(j))) {
                    if (potNumber) {
                        if (phraseStart == false) {
                            phraseStart = true;
                            resultStr.append('\n');
                        }
                    }
                    j++;
                } else {
                    if (phraseStart == false) {
                        resultStr.append('\n');
                        phraseStart = true;
                    }
                    j++;
                }
            }
            int index = getOutputFormat().attribute(i).addStringValue(resultStr.toString());
            instVals[i] = (double) index;
        }
    }
    Instance inst = new Instance(instance.weight(), instVals);
    inst.setDataset(getOutputFormat());
    push(inst);
}
From source file:kea.main.KEAKeyphraseExtractor.java
License:Open Source License
/**
 * Builds the model from the files
 */
public synchronized void extractKeyphrases(Hashtable stems) throws Exception {

    Vector stats = new Vector();

    // Check whether there is actually any data,
    // i.e. whether there are any files in the directory
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }
    this.m_KEAFilter.setNumPhrases(m_numPhrases);
    this.m_KEAFilter.setVocabulary(m_vocabulary);
    this.m_KEAFilter.setVocabularyFormat(m_vocabularyFormat);
    this.m_KEAFilter.setDocumentLanguage(getDocumentLanguage());
    this.m_KEAFilter.setStemmer(m_Stemmer);
    this.m_KEAFilter.setStopwords(m_Stopwords);

    if (getVocabulary().equals("none")) {
        this.m_KEAFilter.m_NODEfeature = false;
    } else {
        // The thesaurus is already loaded in the constructor
        //m_KEAFilter.loadThesaurus(m_Stemmer, m_Stopwords, vocabularyDir, manager);
    }

    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    atts.addElement(new Attribute("filename", (String) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    if (this.m_KEAFilter.m_Dictionary == null) {
        buildGlobalDictionaries(stems);
    }

    System.out.println("-- Extracting Keyphrases... ");

    // Extract keyphrases
    Enumeration elem = stems.keys();

    // Enumeration over all files in the directory (now in the hash):
    while (elem.hasMoreElements()) {
        String str = (String) elem.nextElement();
        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            is.close();
            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("Can't read document " + str + ".txt");
            }
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(key));
            }
            StringBuffer keyStr = new StringBuffer();
            int c;
            // keyStr = keyphrases in the str.key file.
            // KEA assumes that these keyphrases were assigned by the author
            // and evaluates extracted keyphrases against these.
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }
            is.close();
            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("No existing keyphrases for stem " + str + ".");
            }
            newInst[1] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));
        this.m_KEAFilter.input(data.instance(0), vocabulary);
        data = data.stringFreeStructure();
        if (m_debug) {
            System.err.println("-- Document: " + str);
        }
        Instance[] topRankedInstances = new Instance[m_numPhrases];
        Instance inst;

        // Iterating over all extracted keyphrases (inst)
        while ((inst = this.m_KEAFilter.output()) != null) {
            int index = (int) inst.value(this.m_KEAFilter.getRankIndex()) - 1;
            if (index < m_numPhrases) {
                topRankedInstances[index] = inst;
            }
        }
        if (m_debug) {
            System.err.println("-- Keyphrases and feature values:");
        }
        FileOutputStream out = null;
        PrintWriter printer = null;
        File key = new File(m_dirName + "/" + str + ".key");
        if (!key.exists()) {
            out = new FileOutputStream(m_dirName + "/" + str + ".key");
            if (!m_encoding.equals("default")) {
                printer = new PrintWriter(new OutputStreamWriter(out, m_encoding));
            } else {
                printer = new PrintWriter(out);
            }
        }
        double numExtracted = 0, numCorrect = 0;
        for (int i = 0; i < m_numPhrases; i++) {
            if (topRankedInstances[i] != null) {
                if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
                    numExtracted += 1.0;
                }
                if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == 1) {
                    numCorrect += 1.0;
                }
                if (printer != null) {
                    printer.print(topRankedInstances[i].stringValue(this.m_KEAFilter.getUnstemmedPhraseIndex()));
                    if (m_AdditionalInfo) {
                        printer.print("\t");
                        printer.print(topRankedInstances[i].stringValue(this.m_KEAFilter.getStemmedPhraseIndex()));
                        printer.print("\t");
                        printer.print(Utils.doubleToString(
                                topRankedInstances[i].value(this.m_KEAFilter.getProbabilityIndex()), 4));
                    }
                    printer.println();
                }
                if (m_debug) {
                    System.err.println(topRankedInstances[i]);
                }
            }
        }
        if (numExtracted > 0) {
            if (m_debug) {
                System.err.println("-- " + numCorrect + " correct");
            }
            stats.addElement(new Double(numCorrect));
        }
        if (printer != null) {
            printer.flush();
            printer.close();
            out.close();
        }
    }
    double[] st = new double[stats.size()];
    for (int i = 0; i < stats.size(); i++) {
        st[i] = ((Double) stats.elementAt(i)).doubleValue();
    }
    double avg = Utils.mean(st);
    double stdDev = Math.sqrt(Utils.variance(st));
    System.out.println("Avg. number of matching keyphrases compared to existing ones : "
            + Utils.doubleToString(avg, 2) + " +/- " + Utils.doubleToString(stdDev, 2));
    System.out.println("Based on " + stats.size() + " documents");
    // m_KEAFilter.batchFinished();
}