Example usage for weka.core Instances sumOfWeights

Introduction

On this page you can find example usages of weka.core.Instances.sumOfWeights() drawn from open-source projects.

Prototype

public double sumOfWeights()

Document

Computes the sum of all the instances' weights.
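
Before the project examples, here is a minimal self-contained sketch of the call (the ARFF path is a placeholder; any dataset works). Weka assigns every instance a default weight of 1.0, so on a freshly loaded dataset sumOfWeights() equals numInstances():

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class SumOfWeightsDemo {
    public static void main(String[] args) throws Exception {
        // Placeholder path; substitute any ARFF file.
        Instances data = DataSource.read("data/weather.arff");

        // Every instance defaults to weight 1.0, so this equals numInstances().
        System.out.println("Total weight: " + data.sumOfWeights());

        // After reweighting an instance, the sum reflects the change.
        data.instance(0).setWeight(2.5);
        System.out.println("Total weight now: " + data.sumOfWeights());
    }
}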

Usage

From source file: Pair.java

License: Open Source License
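
In this example, sumOfWeights() supplies the normalizing constant that turns the current instance weights into a sampling distribution for resampleWithWeights():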

private void doCV(Instances targetData) throws Exception {
    System.out.println();
    System.out.flush();
    int numSourceInstances = m_SourceInstances.numInstances();
    int numInstances = targetData.numInstances() + numSourceInstances;
    numTargetInstances = numInstances - numSourceInstances;
    double weightSource, weightTarget;
    double initialSourceFraction;
    double[] weights = new double[numInstances];
    Random randomInstance = new Random(1);

    Instances data = new Instances(m_SourceInstances, 0, numSourceInstances);
    // Now add the target data; Instances.add() copies each instance as it is
    // added, so reweighting here doesn't affect anyone else's weights
    Enumeration enumer = targetData.enumerateInstances();
    while (enumer.hasMoreElements()) {
        Instance instance = (Instance) enumer.nextElement();
        data.add(instance);
    }

    if (sourceRatio < 0) { //weight all equally
        weightSource = weightTarget = 1.0/*/numInstances*/;
        initialSourceFraction = numSourceInstances / (double) numInstances;
    } else {
        double totalWeight = 1 + sourceRatio;
        weightSource = sourceRatio / totalWeight/*/numSourceInstances*/;
        weightTarget = 1.0 / totalWeight/*/numTargetInstances*/;
        initialSourceFraction = weightSource;
    }
    for (int j = 0; j < numInstances; j++) {
        Instance instance = data.instance(j);
        if (j < numSourceInstances)
            instance.setWeight(weightSource);
        else
            instance.setWeight(weightTarget);
    }

    if (doFraction) {
        for (int it = 0; it < sourceIterations/*m_NumIterations*/; it++) {

            sourceFraction = (1 - (it / (double) m_NumIterations)) * initialSourceFraction; //[same weights as regular]
            if (sourceFraction > .995)
                sourceFraction = .995;
            //double sourceWeight = (sourceFraction * numInstances) / numSourceInstances;
            double sourceWeight = (sourceFraction * numTargetInstances)
                    / (numSourceInstances * (1 - sourceFraction));
            for (int j = 0; j < numInstances; j++) {
                Instance instance = data.instance(j);
                if (j < numSourceInstances)
                    instance.setWeight(sourceWeight);
                else
                    instance.setWeight(1);
            }
            buildClassifierWithWeights(data);
            System.out.println("Iteration " + it + ":" + getTestError());
        }
    } else {

        for (int i = 0; i < numInstances; i++)
            weights[i] = data.instance(i).weight();
        buildClassifierWithWeights(data);
        System.out.println("Iteration -1:" + getTestError());
        for (int i = 0; i < numInstances; i++)
            data.instance(i).setWeight(weights[i]);

        for (int it = 0; it < sourceIterations; it++) {

            Instances sample = null;
            if (!resample || m_NumIterationsPerformed == 0) {
                sample = data;
            } else {
                double sum = data.sumOfWeights();
                double[] sweights = new double[data.numInstances()];
                for (int i = 0; i < sweights.length; i++) {
                    sweights[i] = data.instance(i).weight() / sum;
                }
                sample = data.resampleWithWeights(randomInstance, sweights);
            }

            try {
                m_Classifiers[it].buildClassifier(sample);
            } catch (Exception e) {
                e.printStackTrace();
                System.out.println("E: " + e);
            }

            sourceFraction = initialSourceFraction * (1 - (it + 1) / (double) m_NumIterations);
            setWeights(data, m_Classifiers[it], sourceFraction, numSourceInstances, false);

            for (int i = 0; i < numInstances; i++)
                weights[i] = data.instance(i).weight();

            buildClassifierWithWeights(data);

            System.out.println("Iteration " + it + ":" + getTestError());

            for (int i = 0; i < numInstances; i++)
                data.instance(i).setWeight(weights[i]);

        }

    }

}

From source file: Pair.java

License: Open Source License
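
The companion boosting method normalizes the weights the same way before each resampling step: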

/**
 * Boosting method. Boosts any classifier that can handle weighted
 * instances.
 *
 * @param data the training data to be used for generating the
 * boosted classifier.
 * @exception Exception if the classifier could not be built successfully
 */
protected void buildClassifierWithWeights(Instances data) throws Exception {

    Random randomInstance = new Random(0);
    double epsilon, reweight, beta = 0;
    Evaluation evaluation;
    Instances sample;
    // Initialize data
    m_Betas = new double[m_Classifiers.length];
    m_NumIterationsPerformed = 0;
    int numSourceInstances = m_SourceInstances.numInstances();

    // Do bootstrap iterations
    for (m_NumIterationsPerformed = 0; m_NumIterationsPerformed < m_Classifiers.length; m_NumIterationsPerformed++) {
        // Build the classifier
        sample = null;
        if (!resample || m_NumIterationsPerformed == 0) {
            sample = data;
        } else {
            double sum = data.sumOfWeights();
            double[] weights = new double[data.numInstances()];
            for (int i = 0; i < weights.length; i++) {
                weights[i] = data.instance(i).weight() / sum;
            }
            sample = data.resampleWithWeights(randomInstance, weights);

            if (doSampleSize) {
                int effectiveInstances = (int) (sourceFraction * weights.length + numTargetInstances);
                if (effectiveInstances > numSourceInstances + numTargetInstances)
                    effectiveInstances = numSourceInstances + numTargetInstances;
                //System.out.println(effectiveInstances);               
                sample.randomize(randomInstance);
                Instances q = new Instances(sample, 0, effectiveInstances);
                sample = q;
            }
        }
        try {
            m_Classifiers[m_NumIterationsPerformed].buildClassifier(sample);
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("E: " + e);
        }

        if (doBagging)
            beta = 0.4 / .6; //always same beta
        else
            beta = setWeights(data, m_Classifiers[m_NumIterationsPerformed], -1, numSourceInstances, true);

        // Stop if error too small or error too big and ignore this model
        if (beta < 0) { //setWeights indicates a problem with negative beta
            if (m_NumIterationsPerformed == 0) {
                m_NumIterationsPerformed = 1; // If we're the first we have to use it
            }
            break;
        }

        // Determine the weight to assign to this model

        m_Betas[m_NumIterationsPerformed] = Math.log(1 / beta);

    }

    betaSum = 0;

    for (int i = 0; i < m_NumIterationsPerformed; i++)
        betaSum += m_Betas[i];
}

From source file: MultiClassClassifier.java

License: Open Source License
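
Here the one-against-one branch stores each pairwise training subset's sumOfWeights() in m_SumOfWeights, for later use when combining the pairwise predictions: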

/**
 * Builds the classifiers.
 *
 * @param insts the training data.
 * @throws Exception if a classifier can't be built
 */
public void buildClassifier(Instances insts) throws Exception {

    Instances newInsts;

    // can classifier handle the data?
    getCapabilities().testWithFail(insts);

    // remove instances with missing class
    insts = new Instances(insts);
    insts.deleteWithMissingClass();

    if (m_Classifier == null) {
        throw new Exception("No base classifier has been set!");
    }
    m_ZeroR = new ZeroR();
    m_ZeroR.buildClassifier(insts);

    m_TwoClassDataset = null;

    int numClassifiers = insts.numClasses();
    if (numClassifiers <= 2) {

        m_Classifiers = Classifier.makeCopies(m_Classifier, 1);
        m_Classifiers[0].buildClassifier(insts);

        m_ClassFilters = null;

    } else if (m_Method == METHOD_1_AGAINST_1) {
        // generate fastvector of pairs
        FastVector pairs = new FastVector();
        for (int i = 0; i < insts.numClasses(); i++) {
            for (int j = 0; j < insts.numClasses(); j++) {
                if (j <= i)
                    continue;
                int[] pair = new int[2];
                pair[0] = i;
                pair[1] = j;
                pairs.addElement(pair);
            }
        }

        numClassifiers = pairs.size();
        m_Classifiers = Classifier.makeCopies(m_Classifier, numClassifiers);
        m_ClassFilters = new Filter[numClassifiers];
        m_SumOfWeights = new double[numClassifiers];

        // generate the classifiers
        for (int i = 0; i < numClassifiers; i++) {
            RemoveWithValues classFilter = new RemoveWithValues();
            classFilter.setAttributeIndex("" + (insts.classIndex() + 1));
            classFilter.setModifyHeader(true);
            classFilter.setInvertSelection(true);
            classFilter.setNominalIndicesArr((int[]) pairs.elementAt(i));
            Instances tempInstances = new Instances(insts, 0);
            tempInstances.setClassIndex(-1);
            classFilter.setInputFormat(tempInstances);
            newInsts = Filter.useFilter(insts, classFilter);
            if (newInsts.numInstances() > 0) {
                newInsts.setClassIndex(insts.classIndex());
                m_Classifiers[i].buildClassifier(newInsts);
                m_ClassFilters[i] = classFilter;
                m_SumOfWeights[i] = newInsts.sumOfWeights();
            } else {
                m_Classifiers[i] = null;
                m_ClassFilters[i] = null;
            }
        }

        // construct a two-class header version of the dataset
        m_TwoClassDataset = new Instances(insts, 0);
        int classIndex = m_TwoClassDataset.classIndex();
        m_TwoClassDataset.setClassIndex(-1);
        m_TwoClassDataset.deleteAttributeAt(classIndex);
        FastVector classLabels = new FastVector();
        classLabels.addElement("class0");
        classLabels.addElement("class1");
        m_TwoClassDataset.insertAttributeAt(new Attribute("class", classLabels), classIndex);
        m_TwoClassDataset.setClassIndex(classIndex);

    } else {
        // use error correcting code style methods
        Code code = null;
        switch (m_Method) {
        case METHOD_ERROR_EXHAUSTIVE:
            code = new ExhaustiveCode(numClassifiers);
            break;
        case METHOD_ERROR_RANDOM:
            code = new RandomCode(numClassifiers, (int) (numClassifiers * m_RandomWidthFactor), insts);
            break;
        case METHOD_1_AGAINST_ALL:
            code = new StandardCode(numClassifiers);
            break;
        default:
            throw new Exception("Unrecognized correction code type");
        }
        numClassifiers = code.size();
        m_Classifiers = Classifier.makeCopies(m_Classifier, numClassifiers);
        m_ClassFilters = new MakeIndicator[numClassifiers];
        for (int i = 0; i < m_Classifiers.length; i++) {
            m_ClassFilters[i] = new MakeIndicator();
            MakeIndicator classFilter = (MakeIndicator) m_ClassFilters[i];
            classFilter.setAttributeIndex("" + (insts.classIndex() + 1));
            classFilter.setValueIndices(code.getIndices(i));
            classFilter.setNumeric(false);
            classFilter.setInputFormat(insts);
            newInsts = Filter.useFilter(insts, m_ClassFilters[i]);
            m_Classifiers[i].buildClassifier(newInsts);
        }
    }
    m_ClassAttribute = insts.classAttribute();
}

From source file: clusterer.SimpleKMeansWithSilhouette.java

License: Open Source License
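
This k-means variant uses sumOfWeights() twice: on the full dataset, to detect attributes whose values are entirely missing, and per cluster, to record weighted cluster sizes: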

/**
 * Generates a clusterer. Has to initialize all fields of the clusterer that
 * are not being set via options.
 * 
 * @param data set of instances serving as training data
 * @throws Exception if the clusterer has not been generated successfully
 */
@Override
public void buildClusterer(Instances data) throws Exception {

    m_canopyClusters = null;

    // can clusterer handle the data?
    getCapabilities().testWithFail(data);

    m_Iterations = 0;

    m_ReplaceMissingFilter = new ReplaceMissingValues();
    Instances instances = new Instances(data);

    instances.setClassIndex(-1);
    if (!m_dontReplaceMissing) {
        m_ReplaceMissingFilter.setInputFormat(instances);
        instances = Filter.useFilter(instances, m_ReplaceMissingFilter);
    }

    m_ClusterNominalCounts = new double[m_NumClusters][instances.numAttributes()][];
    m_ClusterMissingCounts = new double[m_NumClusters][instances.numAttributes()];
    if (m_displayStdDevs) {
        m_FullStdDevs = instances.variances();
    }

    m_FullMeansOrMediansOrModes = moveCentroid(0, instances, true, false);

    m_FullMissingCounts = m_ClusterMissingCounts[0];
    m_FullNominalCounts = m_ClusterNominalCounts[0];
    double sumOfWeights = instances.sumOfWeights();
    for (int i = 0; i < instances.numAttributes(); i++) {
        if (instances.attribute(i).isNumeric()) {
            if (m_displayStdDevs) {
                m_FullStdDevs[i] = Math.sqrt(m_FullStdDevs[i]);
            }
            if (m_FullMissingCounts[i] == sumOfWeights) {
                m_FullMeansOrMediansOrModes[i] = Double.NaN; // mark missing as mean
            }
        } else {
            if (m_FullMissingCounts[i] > m_FullNominalCounts[i][Utils.maxIndex(m_FullNominalCounts[i])]) {
                m_FullMeansOrMediansOrModes[i] = -1; // mark missing as most common
                                                     // value
            }
        }
    }

    m_ClusterCentroids = new Instances(instances, m_NumClusters);
    int[] clusterAssignments = new int[instances.numInstances()];

    if (m_PreserveOrder) {
        m_Assignments = clusterAssignments;
    }

    m_DistanceFunction.setInstances(instances);

    Random RandomO = new Random(getSeed());
    int instIndex;
    HashMap<DecisionTableHashKey, Integer> initC = new HashMap<DecisionTableHashKey, Integer>();
    DecisionTableHashKey hk = null;

    Instances initInstances = null;
    if (m_PreserveOrder) {
        initInstances = new Instances(instances);
    } else {
        initInstances = instances;
    }

    if (m_speedUpDistanceCompWithCanopies) {
        m_canopyClusters = new Canopy();
        m_canopyClusters.setNumClusters(m_NumClusters);
        m_canopyClusters.setSeed(getSeed());
        m_canopyClusters.setT2(getCanopyT2());
        m_canopyClusters.setT1(getCanopyT1());
        m_canopyClusters.setMaxNumCandidateCanopiesToHoldInMemory(getCanopyMaxNumCanopiesToHoldInMemory());
        m_canopyClusters.setPeriodicPruningRate(getCanopyPeriodicPruningRate());
        m_canopyClusters.setMinimumCanopyDensity(getCanopyMinimumCanopyDensity());
        m_canopyClusters.setDebug(getDebug());
        m_canopyClusters.buildClusterer(initInstances);
        // System.err.println(m_canopyClusters);
        m_centroidCanopyAssignments = new ArrayList<long[]>();
        m_dataPointCanopyAssignments = new ArrayList<long[]>();
    }

    if (m_initializationMethod == KMEANS_PLUS_PLUS) {
        kMeansPlusPlusInit(initInstances);

        m_initialStartPoints = new Instances(m_ClusterCentroids);
    } else if (m_initializationMethod == CANOPY) {
        canopyInit(initInstances);

        m_initialStartPoints = new Instances(m_canopyClusters.getCanopies());
    } else if (m_initializationMethod == FARTHEST_FIRST) {
        farthestFirstInit(initInstances);

        m_initialStartPoints = new Instances(m_ClusterCentroids);
    } else {
        // random
        for (int j = initInstances.numInstances() - 1; j >= 0; j--) {
            instIndex = RandomO.nextInt(j + 1);
            hk = new DecisionTableHashKey(initInstances.instance(instIndex), initInstances.numAttributes(),
                    true);
            if (!initC.containsKey(hk)) {
                m_ClusterCentroids.add(initInstances.instance(instIndex));
                initC.put(hk, null);
            }
            initInstances.swap(j, instIndex);

            if (m_ClusterCentroids.numInstances() == m_NumClusters) {
                break;
            }
        }

        m_initialStartPoints = new Instances(m_ClusterCentroids);
    }

    if (m_speedUpDistanceCompWithCanopies) {
        // assign canopies to training data
        for (int i = 0; i < instances.numInstances(); i++) {
            m_dataPointCanopyAssignments.add(m_canopyClusters.assignCanopies(instances.instance(i)));
        }
    }

    m_NumClusters = m_ClusterCentroids.numInstances();

    // removing reference
    initInstances = null;

    int i;
    boolean converged = false;
    int emptyClusterCount;
    Instances[] tempI = new Instances[m_NumClusters];
    m_squaredErrors = new double[m_NumClusters];
    m_ClusterNominalCounts = new double[m_NumClusters][instances.numAttributes()][0];
    m_ClusterMissingCounts = new double[m_NumClusters][instances.numAttributes()];
    startExecutorPool();

    while (!converged) {
        if (m_speedUpDistanceCompWithCanopies) {
            // re-assign canopies to the current cluster centers
            m_centroidCanopyAssignments.clear();
            for (int kk = 0; kk < m_ClusterCentroids.numInstances(); kk++) {
                m_centroidCanopyAssignments
                        .add(m_canopyClusters.assignCanopies(m_ClusterCentroids.instance(kk)));
            }
        }

        emptyClusterCount = 0;
        m_Iterations++;
        converged = true;

        if (m_executionSlots <= 1 || instances.numInstances() < 2 * m_executionSlots) {
            for (i = 0; i < instances.numInstances(); i++) {
                Instance toCluster = instances.instance(i);
                int newC = clusterProcessedInstance(toCluster, false, true,
                        m_speedUpDistanceCompWithCanopies ? m_dataPointCanopyAssignments.get(i) : null);
                if (newC != clusterAssignments[i]) {
                    converged = false;
                }
                clusterAssignments[i] = newC;
            }
        } else {
            converged = launchAssignToClusters(instances, clusterAssignments);
        }

        // update centroids
        m_ClusterCentroids = new Instances(instances, m_NumClusters);
        for (i = 0; i < m_NumClusters; i++) {
            tempI[i] = new Instances(instances, 0);
        }
        for (i = 0; i < instances.numInstances(); i++) {
            tempI[clusterAssignments[i]].add(instances.instance(i));
        }
        if (m_executionSlots <= 1 || instances.numInstances() < 2 * m_executionSlots) {
            for (i = 0; i < m_NumClusters; i++) {
                if (tempI[i].numInstances() == 0) {
                    // empty cluster
                    emptyClusterCount++;
                } else {
                    moveCentroid(i, tempI[i], true, true);
                }
            }
        } else {
            emptyClusterCount = launchMoveCentroids(tempI);
        }

        if (m_Iterations == m_MaxIterations) {
            converged = true;
        }

        if (emptyClusterCount > 0) {
            m_NumClusters -= emptyClusterCount;
            if (converged) {
                Instances[] t = new Instances[m_NumClusters];
                int index = 0;
                for (int k = 0; k < tempI.length; k++) {
                    if (tempI[k].numInstances() > 0) {
                        t[index] = tempI[k];

                        for (i = 0; i < tempI[k].numAttributes(); i++) {
                            m_ClusterNominalCounts[index][i] = m_ClusterNominalCounts[k][i];
                        }
                        index++;
                    }
                }
                tempI = t;
            } else {
                tempI = new Instances[m_NumClusters];
            }
        }

        if (!converged) {
            m_ClusterNominalCounts = new double[m_NumClusters][instances.numAttributes()][0];
        }
    }

    // calculate errors
    if (!m_FastDistanceCalc) {
        for (i = 0; i < instances.numInstances(); i++) {
            clusterProcessedInstance(instances.instance(i), true, false, null);
        }
    }

    if (m_displayStdDevs) {
        m_ClusterStdDevs = new Instances(instances, m_NumClusters);
    }
    m_ClusterSizes = new double[m_NumClusters];
    for (i = 0; i < m_NumClusters; i++) {
        if (m_displayStdDevs) {
            double[] vals2 = tempI[i].variances();
            for (int j = 0; j < instances.numAttributes(); j++) {
                if (instances.attribute(j).isNumeric()) {
                    vals2[j] = Math.sqrt(vals2[j]);
                } else {
                    vals2[j] = Utils.missingValue();
                }
            }
            m_ClusterStdDevs.add(new DenseInstance(1.0, vals2));
        }
        m_ClusterSizes[i] = tempI[i].sumOfWeights();
    }

    m_executorPool.shutdown();

    // save memory!
    m_DistanceFunction.clean();

    // Calculate Silhouette Coefficient
    SilCoeff = new double[instances.numInstances()];
    AvgSilCoeff = 0;
    for (int z = 0; z < instances.numInstances(); z++) {
        double[] distance = new double[m_NumClusters];
        Arrays.fill(distance, 0.0);
        //Sum
        for (int y = 0; y < instances.numInstances(); y++) {
            distance[clusterAssignments[y]] += m_DistanceFunction.distance(instances.get(z), instances.get(y));
        }
        //Average
        for (int x = 0; x < m_NumClusters; x++) {
            distance[x] = distance[x] / m_ClusterSizes[x];
        }
        double a = distance[clusterAssignments[z]];
        distance[clusterAssignments[z]] = Double.MAX_VALUE;
        Arrays.sort(distance);
        double b = distance[0];
        SilCoeff[z] = (b - a) / Math.max(a, b);
        AvgSilCoeff += SilCoeff[z];
    }
    AvgSilCoeff = AvgSilCoeff / instances.numInstances();
    //System.out.println("AvgSilCoeff: " + AvgSilCoeff);
}

From source file: gyc.OverBoostM1.java

License: Open Source License
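
This AdaBoost-style reweighting calls sumOfWeights() before and after the update so that the total weight can be renormalized to its previous value: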

/**
 * Sets the weights for the next iteration.
 *
 * @param training the training instances
 * @param reweight the reweighting factor
 * @throws Exception if something goes wrong
 */
protected void setWeights(Instances training, double reweight) throws Exception {

    double oldSumOfWeights, newSumOfWeights;

    oldSumOfWeights = training.sumOfWeights();
    Enumeration enu = training.enumerateInstances();
    while (enu.hasMoreElements()) {
        Instance instance = (Instance) enu.nextElement();
        if (!Utils.eq(m_Classifiers[m_NumIterationsPerformed].classifyInstance(instance),
                instance.classValue()))
            instance.setWeight(instance.weight() * reweight);
    }

    // Renormalize weights
    newSumOfWeights = training.sumOfWeights();
    enu = training.enumerateInstances();
    while (enu.hasMoreElements()) {
        Instance instance = (Instance) enu.nextElement();
        instance.setWeight(instance.weight() * oldSumOfWeights / newSumOfWeights);
    }
}

From source file: j48.BinC45ModelSelection.java

License: Open Source License
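
The split selector passes data.sumOfWeights() into each candidate BinC45Split it evaluates: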

/**
 * Selects C4.5-type split for the given dataset.
 */
public final ClassifierSplitModel selectModel(Instances data) {

    double minResult;
    double currentResult;
    BinC45Split[] currentModel;
    BinC45Split bestModel = null;
    NoSplit noSplitModel = null;
    double averageInfoGain = 0;
    int validModels = 0;
    boolean multiVal = true;
    Distribution checkDistribution;
    double sumOfWeights;
    int i;

    try {

        // Check if all Instances belong to one class or if not
        // enough Instances to split.
        checkDistribution = new Distribution(data);
        noSplitModel = new NoSplit(checkDistribution);
        if (Utils.sm(checkDistribution.total(), 2 * m_minNoObj) || Utils.eq(checkDistribution.total(),
                checkDistribution.perClass(checkDistribution.maxClass())))
            return noSplitModel;

        // Check if all attributes are nominal and have a
        // lot of values.
        Enumeration enu = data.enumerateAttributes();
        while (enu.hasMoreElements()) {
            Attribute attribute = (Attribute) enu.nextElement();
            if ((attribute.isNumeric())
                    || (Utils.sm((double) attribute.numValues(), (0.3 * (double) m_allData.numInstances())))) {
                multiVal = false;
                break;
            }
        }
        currentModel = new BinC45Split[data.numAttributes()];
        sumOfWeights = data.sumOfWeights();

        // For each attribute.
        for (i = 0; i < data.numAttributes(); i++) {

            // Apart from class attribute.
            if (i != (data).classIndex()) {

                // Get models for current attribute.
                currentModel[i] = new BinC45Split(i, m_minNoObj, sumOfWeights);
                currentModel[i].buildClassifier(data);

                // Check if useful split for current attribute
                // exists and check for enumerated attributes with
                // a lot of values.
                if (currentModel[i].checkModel())
                    if ((data.attribute(i).isNumeric())
                            || (multiVal || Utils.sm((double) data.attribute(i).numValues(),
                                    (0.3 * (double) m_allData.numInstances())))) {
                        averageInfoGain = averageInfoGain + currentModel[i].infoGain();
                        validModels++;
                    }
            } else
                currentModel[i] = null;
        }

        // Check if any useful split was found.
        if (validModels == 0)
            return noSplitModel;
        averageInfoGain = averageInfoGain / (double) validModels;

        // Find "best" attribute to split on.
        minResult = 0;
        for (i = 0; i < data.numAttributes(); i++) {
            if ((i != (data).classIndex()) && (currentModel[i].checkModel()))

                // Use 1E-3 here to get a closer approximation to the
                // original
                // implementation.
                if ((currentModel[i].infoGain() >= (averageInfoGain - 1E-3))
                        && Utils.gr(currentModel[i].gainRatio(), minResult)) {
                    bestModel = currentModel[i];
                    minResult = currentModel[i].gainRatio();
                }
        }

        // Check if useful split was found.
        if (Utils.eq(minResult, 0))
            return noSplitModel;

        // Add all Instances with unknown values for the corresponding
        // attribute to the distribution for the model, so that
        // the complete distribution is stored with the model.
        bestModel.distribution().addInstWithUnknown(data, bestModel.attIndex());

        // Set the split point analogue to C45 if attribute numeric.
        bestModel.setSplitPoint(m_allData);
        return bestModel;
    } catch (Exception e) {
        e.printStackTrace();
    }
    return null;
}

From source file: j48.C45ModelSelection.java

License: Open Source License
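
C45ModelSelection follows the same pattern, handing data.sumOfWeights() to each candidate C45Split: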

/**
 * Selects C4.5-type split for the given dataset.
 */
public final ClassifierSplitModel selectModel(Instances data) {

    double minResult;
    double currentResult;
    C45Split[] currentModel;
    C45Split bestModel = null;
    NoSplit noSplitModel = null;
    double averageInfoGain = 0;
    int validModels = 0;
    boolean multiVal = true;
    Distribution checkDistribution;
    Attribute attribute;
    double sumOfWeights;
    int i;

    try {

        // Check if all Instances belong to one class or if not
        // enough Instances to split.
        checkDistribution = new Distribution(data);
        noSplitModel = new NoSplit(checkDistribution);
        if (Utils.sm(checkDistribution.total(), 2 * m_minNoObj) || Utils.eq(checkDistribution.total(),
                checkDistribution.perClass(checkDistribution.maxClass())))
            return noSplitModel;

        // Check if all attributes are nominal and have a
        // lot of values.
        if (m_allData != null) {
            Enumeration enu = data.enumerateAttributes();
            while (enu.hasMoreElements()) {
                attribute = (Attribute) enu.nextElement();
                if ((attribute.isNumeric()) || (Utils.sm((double) attribute.numValues(),
                        (0.3 * (double) m_allData.numInstances())))) {
                    multiVal = false;
                    break;
                }
            }
        }

        currentModel = new j48.C45Split[data.numAttributes()];
        sumOfWeights = data.sumOfWeights();

        // For each attribute.
        for (i = 0; i < data.numAttributes(); i++) {

            // Apart from class attribute.
            if (i != (data).classIndex()) {

                // Get models for current attribute.
                currentModel[i] = new j48.C45Split(i, m_minNoObj, sumOfWeights);
                currentModel[i].buildClassifier(data);

                // Check if useful split for current attribute
                // exists and check for enumerated attributes with
                // a lot of values.
                if (currentModel[i].checkModel())
                    if (m_allData != null) {
                        if ((data.attribute(i).isNumeric())
                                || (multiVal || Utils.sm((double) data.attribute(i).numValues(),
                                        (0.3 * (double) m_allData.numInstances())))) {
                            averageInfoGain = averageInfoGain + currentModel[i].infoGain();
                            validModels++;
                        }
                    } else {
                        averageInfoGain = averageInfoGain + currentModel[i].infoGain();
                        validModels++;
                    }
            } else
                currentModel[i] = null;
        }

        // Check if any useful split was found.
        if (validModels == 0)
            return noSplitModel;
        averageInfoGain = averageInfoGain / (double) validModels;

        // Find "best" attribute to split on.
        minResult = 0;
        for (i = 0; i < data.numAttributes(); i++) {
            if ((i != (data).classIndex()) && (currentModel[i].checkModel()))

                // Use 1E-3 here to get a closer approximation to the
                // original
                // implementation.
                if ((currentModel[i].infoGain() >= (averageInfoGain - 1E-3))
                        && Utils.gr(currentModel[i].gainRatio(), minResult)) {
                    bestModel = currentModel[i];
                    minResult = currentModel[i].gainRatio();
                }
        }

        // Check if useful split was found.
        if (Utils.eq(minResult, 0))
            return noSplitModel;

        // Add all Instances with unknown values for the corresponding
        // attribute to the distribution for the model, so that
        // the complete distribution is stored with the model.
        bestModel.distribution().addInstWithUnknown(data, bestModel.attIndex());

        // Set the split point analogue to C45 if attribute numeric.
        if (m_allData != null)
            bestModel.setSplitPoint(m_allData);
        return bestModel;
    } catch (Exception e) {
        e.printStackTrace();
    }
    return null;
}

From source file: j48.C45PruneableClassifierTree.java

License: Open Source License
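
Here a zero result from sumOfWeights() identifies leaves that receive no (weighted) instances: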

/**
 * Computes new distributions of instances for nodes
 * in tree.
 *
 * @param data the data to compute the distributions for
 * @throws Exception if something goes wrong
 */
private void newDistribution(Instances data) throws Exception {

    Instances[] localInstances;

    localModel().resetDistribution(data);
    m_train = data;
    if (!m_isLeaf) {
        localInstances = (Instances[]) localModel().split(data);
        for (int i = 0; i < m_sons.length; i++)
            son(i).newDistribution(localInstances[i]);
    } else {

        // Check whether there are some instances at the leaf now!
        if (!Utils.eq(data.sumOfWeights(), 0)) {
            m_isEmpty = false;
        }
    }
}

From source file: j48.ClassifierTree.java

License: Open Source License
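
buildTree() applies the same zero-weight test when a node becomes a leaf: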

/**
 * Builds the tree structure.
 *
 * @param data the data for which the tree structure is to be
 * generated.
 * @param keepData is training data to be kept?
 * @throws Exception if something goes wrong
 */
public void buildTree(Instances data, boolean keepData) throws Exception {

    Instances[] localInstances;

    if (keepData) {
        m_train = data;
    }
    m_test = null;
    m_isLeaf = false;
    m_isEmpty = false;
    m_sons = null;
    m_localModel = m_toSelectModel.selectModel(data);
    if (m_localModel.numSubsets() > 1) {
        localInstances = m_localModel.split(data);
        data = null;
        m_sons = new ClassifierTree[m_localModel.numSubsets()];
        for (int i = 0; i < m_sons.length; i++) {
            m_sons[i] = getNewTree(localInstances[i]);
            localInstances[i] = null;
        }
    } else {
        m_isLeaf = true;
        if (Utils.eq(data.sumOfWeights(), 0))
            m_isEmpty = true;
        data = null;
    }
}

From source file: j48.ClassifierTree.java

License: Open Source License
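
The hold-out variant performs the identical check on the training split: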

/**
 * Builds the tree structure with hold out set
 *
 * @param train the data for which the tree structure is to be
 * generated.
 * @param test the test data for potential pruning
 * @param keepData is training Data to be kept?
 * @throws Exception if something goes wrong
 */
public void buildTree(Instances train, Instances test, boolean keepData) throws Exception {

    Instances[] localTrain, localTest;
    int i;

    if (keepData) {
        m_train = train;
    }
    m_isLeaf = false;
    m_isEmpty = false;
    m_sons = null;
    m_localModel = m_toSelectModel.selectModel(train, test);
    m_test = new Distribution(test, m_localModel);
    if (m_localModel.numSubsets() > 1) {
        localTrain = m_localModel.split(train);
        localTest = m_localModel.split(test);
        train = test = null;
        m_sons = new ClassifierTree[m_localModel.numSubsets()];
        for (i = 0; i < m_sons.length; i++) {
            m_sons[i] = getNewTree(localTrain[i], localTest[i]);
            localTrain[i] = null;
            localTest[i] = null;
        }
    } else {
        m_isLeaf = true;
        if (Utils.eq(train.sumOfWeights(), 0))
            m_isEmpty = true;
        train = test = null;
    }
}