Example usage for weka.core Instance weight

List of usage examples for weka.core Instance weight

Introduction

On this page you can find example usages of the weka.core Instance.weight() method.

Prototype

public double weight();

Source Link

Document

Returns the instance's weight.

Usage

From source file:j48.Distribution.java

License:Open Source License

/**
 * Moves a single instance's weight from one bag to another, updating both
 * the per-class-per-bag counts and the per-bag totals.
 *
 * @param from index of the bag the instance is taken out of
 * @param to index of the bag the instance is moved into
 * @param instance the instance whose class and weight determine the shift
 * @exception Exception if something goes wrong
 */
public final void shift(int from, int to, Instance instance) throws Exception {

    final int cls = (int) instance.classValue();
    final double w = instance.weight();

    m_perClassPerBag[from][cls] -= w;
    m_perClassPerBag[to][cls] += w;
    m_perBag[from] -= w;
    m_perBag[to] += w;
}

From source file:j48.Distribution.java

License:Open Source License

/**
 * Moves every instance with index in [startIndex, lastPlusOne) from one bag
 * to another, updating the per-class-per-bag counts and the per-bag totals.
 *
 * @param from index of the source bag
 * @param to index of the destination bag
 * @param source the dataset holding the instances to shift
 * @param startIndex first instance index to shift (inclusive)
 * @param lastPlusOne last instance index to shift (exclusive)
 * @exception Exception if something goes wrong
 */
public final void shiftRange(int from, int to, Instances source, int startIndex, int lastPlusOne)
        throws Exception {

    for (int idx = startIndex; idx < lastPlusOne; idx++) {
        Instance inst = source.instance(idx);
        int cls = (int) inst.classValue();
        double w = inst.weight();

        m_perClassPerBag[from][cls] -= w;
        m_perClassPerBag[to][cls] += w;
        m_perBag[from] -= w;
        m_perBag[to] += w;
    }
}

From source file:j48.GraftSplit.java

License:Open Source License

/**
 * Builds m_graftdistro (a two-bag distribution) from the passed data.
 *
 * Instances with a known value of the split attribute are added with their
 * original weight; instances missing that value are added afterwards with
 * their weight scaled by the fraction of known-value weight that fell into
 * the subset of interest.
 *
 * @param data the instances to use when creating the distribution
 */
public void buildClassifier(Instances data) throws Exception {

    // distribution for the graft, not counting cases in atbop, only orig leaf
    m_graftdistro = new Distribution(2, data.numClasses());

    // which subset are we looking at for the graft?
    int subset = subsetOfInterest(); // this is the subset for m_leaf

    double thisNodeCount = 0;   // total weight that landed in the subset of interest
    double knownCases = 0;      // total weight with a known split-attribute value
    boolean allKnown = true;    // false once any instance is missing the split value
    // populate distribution
    for (int x = 0; x < data.numInstances(); x++) {
        Instance instance = data.instance(x);
        if (instance.isMissing(m_attIndex)) {
            allKnown = false;
            continue;
        }
        knownCases += instance.weight();
        int subst = whichSubset(instance);
        if (subst == -1)
            continue;
        m_graftdistro.add(subst, instance);
        if (subst == subset) { // instance belongs at m_leaf
            thisNodeCount += instance.weight();
        }
    }
    // Scale factor for missing-value cases: share of known weight in the
    // subset of interest, or an even 1/2 split when nothing is known.
    double factor = (knownCases == 0) ? (1.0 / (double) 2.0) : (thisNodeCount / knownCases);
    if (!allKnown) {
        for (int x = 0; x < data.numInstances(); x++) {
            if (data.instance(x).isMissing(m_attIndex)) {
                Instance instance = data.instance(x);
                int subst = whichSubset(instance);
                if (subst == -1)
                    continue;
                // NOTE(review): setWeight mutates the instance inside `data`
                // in place, so the caller's dataset keeps the rescaled
                // weights — confirm callers expect this side effect.
                instance.setWeight(instance.weight() * factor);
                m_graftdistro.add(subst, instance);
            }
        }
    }

    // if there are no cases at the leaf, make sure the desired
    // class is chosen, by setting counts to 0.01
    if (m_graftdistro.perBag(subset) == 0) {
        double[] counts = new double[data.numClasses()];
        counts[m_maxClass] = 0.01;
        m_graftdistro.add(subset, counts);
    }
    // Same safeguard for the other bag, using that leaf's majority class.
    if (m_graftdistro.perBag((subset == 0) ? 1 : 0) == 0) {
        double[] counts = new double[data.numClasses()];
        counts[(int) m_otherLeafMaxClass] = 0.01;
        m_graftdistro.add((subset == 0) ? 1 : 0, counts);
    }
}

From source file:j48.NBTreeSplit.java

License:Open Source License

/**
 * Creates the split on an enumerated (nominal) attribute and estimates the
 * split's error by cross-validating a discretized naive Bayes model on each
 * resulting subset.
 *
 * Instances with a missing split value are copied into every branch with
 * their weight scaled by the branch weights supplied by the C4.5 split.
 * Subsets holding fewer than five instances are not modelled; their total
 * instance weight is counted directly as errors. The split is accepted
 * (m_numSubsets set) only when at least two subsets were large enough to be
 * cross-validated.
 *
 * @param trainInstances the training data to split
 * @exception Exception if something goes wrong
 */
private void handleEnumeratedAttribute(Instances trainInstances) throws Exception {

    m_c45S = new C45Split(m_attIndex, 2, m_sumOfWeights);
    m_c45S.buildClassifier(trainInstances);
    if (m_c45S.numSubsets() == 0) {
        return;
    }
    m_errors = 0;
    Instance instance;

    // One (initially empty) training set per branch of the candidate split.
    Instances[] trainingSets = new Instances[m_complexityIndex];
    for (int i = 0; i < m_complexityIndex; i++) {
        trainingSets[i] = new Instances(trainInstances, 0);
    }

    int subset;
    for (int i = 0; i < trainInstances.numInstances(); i++) {
        instance = trainInstances.instance(i);
        subset = m_c45S.whichSubset(instance);
        if (subset > -1) {
            trainingSets[subset].add((Instance) instance.copy());
        } else {
            // Missing split value: distribute a weighted copy to every branch.
            double[] weights = m_c45S.weights(instance);
            for (int j = 0; j < m_complexityIndex; j++) {
                try {
                    Instance temp = (Instance) instance.copy();
                    if (weights.length == m_complexityIndex) {
                        temp.setWeight(temp.weight() * weights[j]);
                    } else {
                        // Fallback: split the weight evenly across branches.
                        temp.setWeight(temp.weight() / m_complexityIndex);
                    }
                    trainingSets[j].add(temp);
                } catch (Exception ex) {
                    // Rethrow with context instead of printing a stack trace
                    // and calling System.exit(1), which would kill the host
                    // JVM and swallow the original cause.
                    throw new Exception("Could not distribute instance over " + m_complexityIndex
                            + " subsets (" + weights.length + " branch weights available)", ex);
                }
            }
        }
    }

    Random r = new Random(1);
    int minNumCount = 0;
    for (int i = 0; i < m_complexityIndex; i++) {
        if (trainingSets[i].numInstances() >= 5) {
            minNumCount++;
            // Discretize the sets
            Discretize disc = new Discretize();
            disc.setInputFormat(trainingSets[i]);
            trainingSets[i] = Filter.useFilter(trainingSets[i], disc);

            trainingSets[i].randomize(r);
            trainingSets[i].stratify(5);
            NaiveBayesUpdateable fullModel = new NaiveBayesUpdateable();
            fullModel.buildClassifier(trainingSets[i]);

            // add the errors for this branch of the split
            m_errors += NBTreeNoSplit.crossValidate(fullModel, trainingSets[i], r);
        } else {
            // if fewer than min obj then just count them as errors
            for (int j = 0; j < trainingSets[i].numInstances(); j++) {
                m_errors += trainingSets[i].instance(j).weight();
            }
        }
    }

    // Check if there are at least five instances in at least two of the
    // subsets.
    if (minNumCount > 1) {
        m_numSubsets = m_complexityIndex;
    }
}

From source file:j48.NBTreeSplit.java

License:Open Source License

/**
 * Creates split on numeric attribute./*from w  w w  .  j  av a 2s . c  om*/
 *
 * @exception Exception if something goes wrong
 */
private void handleNumericAttribute(Instances trainInstances) throws Exception {

    m_c45S = new C45Split(m_attIndex, 2, m_sumOfWeights);
    m_c45S.buildClassifier(trainInstances);
    if (m_c45S.numSubsets() == 0) {
        return;
    }
    m_errors = 0;

    Instances[] trainingSets = new Instances[m_complexityIndex];
    trainingSets[0] = new Instances(trainInstances, 0);
    trainingSets[1] = new Instances(trainInstances, 0);
    int subset = -1;

    // populate the subsets
    for (int i = 0; i < trainInstances.numInstances(); i++) {
        Instance instance = trainInstances.instance(i);
        subset = m_c45S.whichSubset(instance);
        if (subset != -1) {
            trainingSets[subset].add((Instance) instance.copy());
        } else {
            double[] weights = m_c45S.weights(instance);
            for (int j = 0; j < m_complexityIndex; j++) {
                Instance temp = (Instance) instance.copy();
                if (weights.length == m_complexityIndex) {
                    temp.setWeight(temp.weight() * weights[j]);
                } else {
                    temp.setWeight(temp.weight() / m_complexityIndex);
                }
                trainingSets[j].add(temp);
            }
        }
    }

    /*    // compute weights (weights of instances per subset
    m_weights = new double [m_complexityIndex];
    for (int i = 0; i < m_complexityIndex; i++) {
      m_weights[i] = trainingSets[i].sumOfWeights();
    }
    Utils.normalize(m_weights); */

    Random r = new Random(1);
    int minNumCount = 0;
    for (int i = 0; i < m_complexityIndex; i++) {
        if (trainingSets[i].numInstances() > 5) {
            minNumCount++;
            // Discretize the sets
            Discretize disc = new Discretize();
            disc.setInputFormat(trainingSets[i]);
            trainingSets[i] = Filter.useFilter(trainingSets[i], disc);

            trainingSets[i].randomize(r);
            trainingSets[i].stratify(5);
            NaiveBayesUpdateable fullModel = new NaiveBayesUpdateable();
            fullModel.buildClassifier(trainingSets[i]);

            // add the errors for this branch of the split
            m_errors += NBTreeNoSplit.crossValidate(fullModel, trainingSets[i], r);
        } else {
            for (int j = 0; j < trainingSets[i].numInstances(); j++) {
                m_errors += trainingSets[i].instance(j).weight();
            }
        }
    }

    // Check if minimum number of Instances in at least two
    // subsets.
    if (minNumCount > 1) {
        m_numSubsets = m_complexityIndex;
    }
}

From source file:kea.KEAFilter.java

License:Open Source License

/**
 * Builds the internal keyphrase classifier from the pending input instances.
 *
 * First assembles the classifier's input format (TFxIDF, first occurrence,
 * optionally keyphrase frequency, and the binary "Keyphrase?" class), then
 * converts every candidate phrase of every pending document into a training
 * instance, and finally trains a discretized naive Bayes model on them.
 */
private void buildClassifier() throws Exception {

    // Assemble the attribute list for the classifier's input format.
    FastVector classifierAtts = new FastVector();
    int numInputAtts = getInputFormat().numAttributes();
    for (int attIdx = 0; attIdx < numInputAtts; attIdx++) {
        if (attIdx == m_DocumentAtt) {
            classifierAtts.addElement(new Attribute("TFxIDF"));
            classifierAtts.addElement(new Attribute("First_occurrence"));
            if (m_KFused) {
                classifierAtts.addElement(new Attribute("Keyphrase_frequency"));
            }
        } else if (attIdx == m_KeyphrasesAtt) {
            FastVector classValues = new FastVector(2);
            classValues.addElement("False");
            classValues.addElement("True");
            classifierAtts.addElement(new Attribute("Keyphrase?", classValues));
        }
    }
    m_ClassifierData = new Instances("ClassifierData", classifierAtts, 0);
    m_ClassifierData.setClassIndex(m_NumFeatures);

    if (m_Debug) {
        System.err.println("--- Converting instances for classifier");
    }

    // Turn every candidate phrase of every pending document into a
    // training instance for the classifier.
    int numPending = getInputFormat().numInstances();
    for (int docIdx = 0; docIdx < numPending; docIdx++) {
        Instance current = getInputFormat().instance(docIdx);

        // Author-supplied key phrases for this document.
        String keyphrases = current.stringValue(m_KeyphrasesAtt);
        HashMap hashKeyphrases = getGivenKeyphrases(keyphrases, false);
        HashMap hashKeysEval = getGivenKeyphrases(keyphrases, true);

        // Candidate phrases extracted from the document text.
        HashMap hash = new HashMap();
        int length = getPhrases(hash, current.stringValue(m_DocumentAtt));

        // One feature vector per candidate phrase.
        Iterator it = hash.keySet().iterator();
        while (it.hasNext()) {
            String phrase = (String) it.next();
            FastVector phraseInfo = (FastVector) hash.get(phrase);
            double[] vals = featVals(phrase, phraseInfo, true, hashKeysEval, hashKeyphrases, length);
            m_ClassifierData.add(new Instance(current.weight(), vals));
        }
    }

    if (m_Debug) {
        System.err.println("--- Building classifier");
    }

    // Discretized naive Bayes as the underlying model.
    FilteredClassifier fclass = new FilteredClassifier();
    fclass.setClassifier(new NaiveBayesSimple());
    fclass.setFilter(new Discretize());
    m_Classifier = fclass;
    m_Classifier.buildClassifier(m_ClassifierData);

    if (m_Debug) {
        System.err.println(m_Classifier);
    }

    // Keep only the (empty) header to save space.
    m_ClassifierData = new Instances(m_ClassifierData, 0);
}

From source file:kea.KEAFilter.java

License:Open Source License

/**
 * Converts a single document instance into a vector of output instances,
 * one per candidate phrase found in the document (plus dummy instances for
 * known keyphrases that do not occur in the document text).
 *
 * The resulting instances are stably sorted by distance, then TFxIDF, then
 * predicted probability, and finally assigned ranks; subphrases ranked
 * below a superphrase with identical scores get rank Integer.MAX_VALUE.
 *
 * @param instance the document instance to convert
 * @param training true when converting training data (forwarded to featVals)
 * @return the ranked phrase instances as a FastVector
 * @exception Exception if something goes wrong
 */
private FastVector convertInstance(Instance instance, boolean training) throws Exception {

    FastVector vector = new FastVector();

    if (m_Debug) {
        System.err.println("-- Converting instance");
    }

    // Get the key phrases for the document (absent for unlabeled data)
    HashMap hashKeyphrases = null;
    HashMap hashKeysEval = null;
    if (!instance.isMissing(m_KeyphrasesAtt)) {
        String keyphrases = instance.stringValue(m_KeyphrasesAtt);
        hashKeyphrases = getGivenKeyphrases(keyphrases, false);
        hashKeysEval = getGivenKeyphrases(keyphrases, true);
    }

    // Get the phrases for the document
    HashMap hash = new HashMap();
    int length = getPhrases(hash, instance.stringValue(m_DocumentAtt));

    // Compute number of extra attributes
    // NOTE(review): numFeatures is only incremented for m_KFused when
    // m_Debug is on — that looks like a debug-only side effect leaking into
    // the output layout; confirm this is intended.
    int numFeatures = 5;
    if (m_Debug) {
        if (m_KFused) {
            numFeatures = numFeatures + 1;
        }
    }

    // Set indices of key attributes in the output format
    int phraseAttIndex = m_DocumentAtt;
    int tfidfAttIndex = m_DocumentAtt + 2;
    int distAttIndex = m_DocumentAtt + 3;
    int probsAttIndex = m_DocumentAtt + numFeatures - 1;  // recomputed below

    // Go through the phrases and convert them into instances
    Iterator it = hash.keySet().iterator();
    while (it.hasNext()) {
        String phrase = (String) it.next();
        FastVector phraseInfo = (FastVector) hash.get(phrase);
        double[] vals = featVals(phrase, phraseInfo, training, hashKeysEval, hashKeyphrases, length);
        Instance inst = new Instance(instance.weight(), vals);
        inst.setDataset(m_ClassifierData);

        // Get probability of phrase being key phrase
        double[] probs = m_Classifier.distributionForInstance(inst);
        double prob = probs[1];

        // Compute attribute values for final instance
        double[] newInst = new double[instance.numAttributes() + numFeatures];
        int pos = 0;
        for (int i = 0; i < instance.numAttributes(); i++) {
            if (i == m_DocumentAtt) {

                // Add phrase
                int index = outputFormatPeek().attribute(pos).addStringValue(phrase);
                newInst[pos++] = index;

                // Add original version
                index = outputFormatPeek().attribute(pos).addStringValue((String) phraseInfo.elementAt(2));
                newInst[pos++] = index;

                // Add TFxIDF
                newInst[pos++] = inst.value(m_TfidfIndex);

                // Add distance
                newInst[pos++] = inst.value(m_FirstOccurIndex);

                // Add other features
                if (m_Debug) {
                    if (m_KFused) {
                        newInst[pos++] = inst.value(m_KeyFreqIndex);
                    }
                }

                // Add probability 
                probsAttIndex = pos;
                newInst[pos++] = prob;

                // Set rank to missing (computed below)
                newInst[pos++] = Instance.missingValue();
            } else if (i == m_KeyphrasesAtt) {
                newInst[pos++] = inst.classValue();
            } else {
                // Copy every other attribute through unchanged.
                newInst[pos++] = instance.value(i);
            }
        }
        Instance ins = new Instance(instance.weight(), newInst);
        ins.setDataset(outputFormatPeek());
        vector.addElement(ins);
    }

    // Add dummy instances for keyphrases that don't occur
    // in the document
    if (hashKeysEval != null) {
        Iterator phrases = hashKeysEval.keySet().iterator();
        while (phrases.hasNext()) {
            String phrase = (String) phrases.next();
            double[] newInst = new double[instance.numAttributes() + numFeatures];
            int pos = 0;
            for (int i = 0; i < instance.numAttributes(); i++) {
                if (i == m_DocumentAtt) {

                    // Add phrase
                    int index = outputFormatPeek().attribute(pos).addStringValue(phrase);
                    newInst[pos++] = (double) index;

                    // Add original version
                    index = outputFormatPeek().attribute(pos).addStringValue((String) hashKeysEval.get(phrase));
                    newInst[pos++] = (double) index;

                    // Add TFxIDF (unknown: phrase not in document)
                    newInst[pos++] = Instance.missingValue();

                    // Add distance (unknown: phrase not in document)
                    newInst[pos++] = Instance.missingValue();

                    // Add other features
                    if (m_Debug) {
                        if (m_KFused) {
                            newInst[pos++] = Instance.missingValue();
                        }
                    }

                    // Add probability and rank; -MAX_VALUE sorts these last
                    newInst[pos++] = -Double.MAX_VALUE;
                    newInst[pos++] = Instance.missingValue();
                } else if (i == m_KeyphrasesAtt) {
                    newInst[pos++] = 1; // Keyphrase
                } else {
                    newInst[pos++] = instance.value(i);
                }
            }
            Instance inst = new Instance(instance.weight(), newInst);
            inst.setDataset(outputFormatPeek());
            vector.addElement(inst);
        }
    }

    // Sort phrases according to their distance (stable sort)
    double[] vals = new double[vector.size()];
    for (int i = 0; i < vals.length; i++) {
        vals[i] = ((Instance) vector.elementAt(i)).value(distAttIndex);
    }
    FastVector newVector = new FastVector(vector.size());
    int[] sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their tfxidf value (stable sort, descending)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = -((Instance) vector.elementAt(i)).value(tfidfAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their probability (stable sort, descending;
    // vals[i] = 1 - p so the most probable phrase comes first)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = 1 - ((Instance) vector.elementAt(i)).value(probsAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Compute rank of phrases. Check for subphrases that are ranked
    // lower than superphrases and assign probability -1 and set the
    // rank to Integer.MAX_VALUE
    int rank = 1;
    for (int i = 0; i < vals.length; i++) {
        Instance currentInstance = (Instance) vector.elementAt(i);

        // Short cut: if phrase very unlikely make rank very low and continue
        if (Utils.grOrEq(vals[i], 1.0)) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
            continue;
        }

        // Otherwise look for super phrase starting with first phrase
        // in list that has same probability, TFxIDF value, and distance as
        // current phrase. We do this to catch all superphrases
        // that have same probability, TFxIDF value and distance as current phrase.
        int startInd = i;
        while (startInd < vals.length) {
            Instance inst = (Instance) vector.elementAt(startInd);
            if ((inst.value(tfidfAttIndex) != currentInstance.value(tfidfAttIndex))
                    || (inst.value(probsAttIndex) != currentInstance.value(probsAttIndex))
                    || (inst.value(distAttIndex) != currentInstance.value(distAttIndex))) {
                break;
            }
            startInd++;
        }
        String val = currentInstance.stringValue(phraseAttIndex);
        boolean foundSuperphrase = false;
        for (int j = startInd - 1; j >= 0; j--) {
            if (j != i) {
                Instance candidate = (Instance) vector.elementAt(j);
                String potSuperphrase = candidate.stringValue(phraseAttIndex);
                if (val.length() <= potSuperphrase.length()) {
                    if (KEAFilter.contains(val, potSuperphrase)) {
                        foundSuperphrase = true;
                        break;
                    }
                }
            }
        }
        if (foundSuperphrase) {
            // Subsumed by a better-ranked superphrase: demote to the bottom.
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
        } else {
            currentInstance.setValue(probsAttIndex + 1, rank++);
        }
    }
    return vector;
}

From source file:kea.KEAPhraseFilter.java

License:Open Source License

/** 
 * Converts an instance by removing all non-alphanumeric characters
 * from its string attribute values.
 *
 * String attributes selected by m_SelectCols are rewritten word by word:
 * words within a phrase are joined by ' ', '-' or '/', and phrase
 * boundaries are emitted as '\n'. Non-string, missing, and unselected
 * attributes are copied through unchanged. The converted instance is
 * pushed onto the output queue.
 */
private void convertInstance(Instance instance) throws Exception {

    double[] instVals = new double[instance.numAttributes()];

    for (int i = 0; i < instance.numAttributes(); i++) {
        if (!instance.attribute(i).isString() || instance.isMissing(i)) {
            // Non-string or missing values pass through unchanged.
            instVals[i] = instance.value(i);
        } else {
            if (!m_SelectCols.isInRange(i)) {
                // String attribute not selected for filtering: copy as-is.
                int index = getOutputFormat().attribute(i).addStringValue(instance.stringValue(i));
                instVals[i] = (double) index;
                continue;
            }
            String str = instance.stringValue(i);
            StringBuffer resultStr = new StringBuffer();
            int j = 0;
            // State flags for the word scanner:
            // phraseStart    - next word begins a new phrase
            // seenNewLine    - previous whitespace contained a newline
            // haveSeenHyphen - last word was followed by '-'
            // haveSeenSlash  - last word was followed by '/'
            boolean phraseStart = true;
            boolean seenNewLine = false;
            boolean haveSeenHyphen = false;
            boolean haveSeenSlash = false;
            while (j < str.length()) {
                boolean isWord = false;
                boolean potNumber = false;
                int startj = j;
                // Scan one token: letters/digits, plus certain internal
                // punctuation when surrounded by alphanumerics.
                while (j < str.length()) {
                    char ch = str.charAt(j);
                    if (Character.isLetterOrDigit(ch)) {
                        potNumber = true;
                        if (Character.isLetter(ch)) {
                            isWord = true;
                        }
                        j++;
                    } else if ((!m_DisallowInternalPeriods && (ch == '.')) || (ch == '@') || (ch == '_')
                            || (ch == '&') || (ch == '/') || (ch == '-')) {
                        // Internal connector: keep it only between two
                        // alphanumeric characters.
                        if ((j > 0) && (j + 1 < str.length()) && Character.isLetterOrDigit(str.charAt(j - 1))
                                && Character.isLetterOrDigit(str.charAt(j + 1))) {
                            j++;
                        } else {
                            break;
                        }
                    } else if (ch == '\'') {
                        // Apostrophe allowed after an alphanumeric (e.g. "it's").
                        if ((j > 0) && Character.isLetterOrDigit(str.charAt(j - 1))) {
                            j++;
                        } else {
                            break;
                        }
                    } else {
                        break;
                    }
                }
                if (isWord == true) {
                    // Join to the previous word with the remembered separator.
                    if (!phraseStart) {
                        if (haveSeenHyphen) {
                            resultStr.append('-');
                        } else if (haveSeenSlash) {
                            resultStr.append('/');
                        } else {
                            resultStr.append(' ');
                        }
                    }
                    resultStr.append(str.substring(startj, j));
                    if (j == str.length()) {
                        break;
                    }
                    phraseStart = false;
                    seenNewLine = false;
                    haveSeenHyphen = false;
                    haveSeenSlash = false;
                    // Classify the character that ended the token.
                    if (Character.isWhitespace(str.charAt(j))) {
                        if (str.charAt(j) == '\n') {
                            seenNewLine = true;
                        }
                    } else if (str.charAt(j) == '-') {
                        haveSeenHyphen = true;
                    } else if (str.charAt(j) == '/') {
                        haveSeenSlash = true;
                    } else {
                        // Any other character ends the current phrase.
                        phraseStart = true;
                        resultStr.append('\n');
                    }
                    j++;
                } else if (j == str.length()) {
                    break;
                } else if (str.charAt(j) == '\n') {
                    // A blank line (two newlines) or a number before the
                    // newline ends the current phrase.
                    if (seenNewLine) {
                        if (phraseStart == false) {
                            resultStr.append('\n');
                            phraseStart = true;
                        }
                    } else if (potNumber) {
                        if (phraseStart == false) {
                            phraseStart = true;
                            resultStr.append('\n');
                        }
                    }
                    seenNewLine = true;
                    j++;
                } else if (Character.isWhitespace(str.charAt(j))) {
                    // Whitespace after a number also ends the phrase.
                    if (potNumber) {
                        if (phraseStart == false) {
                            phraseStart = true;
                            resultStr.append('\n');
                        }
                    }
                    j++;
                } else {
                    // Any other non-word character ends the phrase.
                    if (phraseStart == false) {
                        resultStr.append('\n');
                        phraseStart = true;
                    }
                    j++;
                }
            }
            int index = getOutputFormat().attribute(i).addStringValue(resultStr.toString());
            instVals[i] = (double) index;
        }
    }
    Instance inst = new Instance(instance.weight(), instVals);
    inst.setDataset(getOutputFormat());
    push(inst);
}

From source file:kea.NumbersFilter.java

License:Open Source License

/** 
 * Converts an instance. A phrase boundary is inserted where
 * a number is found./*from   w  w w  . j ava  2 s. c om*/
 */
private void convertInstance(Instance instance) throws Exception {

    double[] instVals = new double[instance.numAttributes()];

    for (int i = 0; i < instance.numAttributes(); i++) {
        if ((!instance.attribute(i).isString()) || instance.isMissing(i)) {
            instVals[i] = instance.value(i);
        } else {
            String str = instance.stringValue(i);
            StringBuffer resultStr = new StringBuffer();
            StringTokenizer tok = new StringTokenizer(str, " \t\n", true);
            while (tok.hasMoreTokens()) {
                String token = tok.nextToken();

                // Everything that doesn't contain at least
                // one letter is considered to be a number
                boolean isNumber = true;
                for (int j = 0; j < token.length(); j++) {
                    if (Character.isLetter(token.charAt(j))) {
                        isNumber = false;
                        break;
                    }
                }
                if (!isNumber) {
                    resultStr.append(token);
                } else {
                    if (token.equals(" ") || token.equals("\t") || token.equals("\n")) {
                        resultStr.append(token);
                    } else {
                        resultStr.append(" \n ");
                    }
                }
            }
            int index = getOutputFormat().attribute(i).addStringValue(resultStr.toString());
            instVals[i] = (double) index;
        }
    }
    Instance inst = new Instance(instance.weight(), instVals);
    inst.setDataset(getOutputFormat());
    push(inst);
}

From source file:lascer.WekaClassifier.java

License:Open Source License

/**
 * Generates the classifier.//from  w w  w.  j  ava  2 s  .  com
 *
 * @param data  the data to be used.
 *
 * @exception Exception  if the classifier can't built successfully.
 */
public void buildClassifier(Instances data) throws Exception {
    weka.coreExtended.Instances extendedInstances;
    weka.coreExtended.BasicInstance extInst;
    weka.coreExtended.BasicAttribute classAttribut;
    de.unistuttgart.commandline.Option formelnArtOption;
    de.unistuttgart.commandline.Option formelnKlasseOption;
    de.unistuttgart.commandline.Option loggingSwitch;
    Instance readInst;
    Beispieldaten invDatensatz;
    StringReader stringReader;
    Enumeration instEnum;
    Enumeration attribEnum;
    PraedErzParameter praedErzParameter = null;
    KonzErzParameter konzErzParameter = null;
    Pruning pruning;
    String formelArt;
    String formelKlasse;
    String optionWert;
    float posPruneAnt, negPruneAnt;
    int instNumber;
    boolean unbekannteWertBsp;

    Steuerung.parseArguments(parser);

    formelArt = Konstanten.WEKA_FORMEL_ART;
    formelnArtOption = parser.getOption("formelArt");
    if (parser.isEnabled(formelnArtOption)) {
        optionWert = parser.getParameter(formelnArtOption);
        if (!optionWert.equals("dis") && !optionWert.equals("kon") && !optionWert.equals("beste")) {

            System.err.println("Wert der Option formelArt unzulssig");
            System.err.println("Zulssig: " + formelnArtOption.toString());
            throw (new RuntimeException("Wert von Option unzulssig."));
        }
        formelArt = optionWert;
    }

    formelKlasse = Konstanten.WEKA_FORMEL_KLASSE;
    formelnKlasseOption = parser.getOption("formelKlasse");
    if (parser.isEnabled(formelnKlasseOption)) {
        optionWert = parser.getParameter(formelnKlasseOption);
        if (!optionWert.equals("pos") && !optionWert.equals("neg") && !optionWert.equals("beste")
                && !optionWert.equals("beide")) {

            System.err.println("Wert der Option formelKlasse unzulssig");
            System.err.println("Zulssig: " + formelnKlasseOption.toString());
            throw (new RuntimeException("Wert von Option unzulssig."));
        }
        formelKlasse = optionWert;
    }

    loggingSwitch = parser.getOption("logging");
    if (debugMode || parser.isEnabled(loggingSwitch)) {
        Steuerung.setLogLevel(Konstanten.LOGGING_LEVEL);
    }

    // Ermittlung der Parameter.
    unbekannteWertBsp = Steuerung.unbekannteWertBeispiele(parser);
    posPruneAnt = Steuerung.posPruneAnteil(parser);
    negPruneAnt = Steuerung.negPruneAnteil(parser);
    praedErzParameter = Steuerung.praedErzParameter(parser);
    konzErzParameter = Steuerung.konzErzParameter(parser);

    // Einlesen der Daten und Erzeugung des Instanzen-Objekts.
    instNumber = data.numInstances();
    stringReader = new StringReader(data.toString());
    extendedInstances = new weka.coreExtended.Instances(stringReader, instNumber);
    instEnum = data.enumerateInstances();
    while (instEnum.hasMoreElements()) {
        readInst = (Instance) instEnum.nextElement();
        extInst = new weka.coreExtended.BasicInstance(readInst.weight(), readInst.toDoubleArray());
        extendedInstances.addBasicInstance(extInst);
    }

    // Erzeugung der Datenstze.
    posDatensatz = ArffDateiEinlesen.beispieldaten(extendedInstances, unbekannteWertBsp);
    negDatensatz = posDatensatz.kopie(true);

    // Erzeugung der Liste der Attribute.
    attributListe = new LinkedList();
    attribEnum = extendedInstances.enumerateBasicAttributes();
    while (attribEnum.hasMoreElements()) {
        attributListe.add(attribEnum.nextElement());
    }

    // Ermittlung der Werte der Klassifikation.
    classAttribut = extendedInstances.basicClassAttribute();
    wekaClassTrue = classAttribut.indexOfValue("true");
    wekaClassFalse = classAttribut.indexOfValue("false");

    // Die Formel zur Klasse der positiven Beispiele erzeugen.
    if (formelKlasse.equals("pos") || formelKlasse.equals("beste") || formelKlasse.equals("beide")) {

        posFormel = generatedFormula(posDatensatz, praedErzParameter, konzErzParameter, formelArt);
    }

    // Die Formel zur Klasse der negativen Beispiele erzeugen.
    if (formelKlasse.equals("neg") || formelKlasse.equals("beste") || formelKlasse.equals("beide")) {

        negFormel = generatedFormula(negDatensatz, praedErzParameter, konzErzParameter, formelArt);
    }

    if (formelKlasse.equals("beste")) {
        // Die schlechtere Formel lschen.
        if (negFormel.istBesser(posFormel)) {
            posFormel = null;
        } else {
            negFormel = null;
        }
    }

    if ((posPruneAnt > 0) || (negPruneAnt > 0)) {
        pruning = new Pruning();

        if (posFormel != null) {
            posDatensatz = pruning.reduzierteDaten(posDatensatz, posFormel, posPruneAnt, negPruneAnt);
            posFormel = generatedFormula(posDatensatz, praedErzParameter, konzErzParameter, formelArt);
        }

        if (negFormel != null) {
            negDatensatz = pruning.reduzierteDaten(negDatensatz, negFormel, negPruneAnt, posPruneAnt);
            negFormel = generatedFormula(negDatensatz, praedErzParameter, konzErzParameter, formelArt);
        }
    }
}