Example usage for weka.core Instances classIndex

List of usage examples for weka.core Instances classIndex

Introduction

In this page you can find the example usage for weka.core Instances classIndex.

Prototype


public int classIndex()

Source Link

Document

Returns the class attribute's index.

Usage

From source file:com.tum.classifiertest.FastRfBagging.java

License:Open Source License

/**
 * Bagging method. Produces DataCache objects with bootstrap samples of
 * the original data, and feeds them to the base classifier (which can only
 * be a FastRandomTree). Trees are trained in parallel on a thread pool;
 * optionally computes the out-of-bag (OOB) error and permutation-based
 * feature importances afterwards.
 *
 * @param data         The training set to be used for generating the
 *                     bagged classifier.
 * @param numThreads   The number of simultaneous threads to use for
 *                     computation. Pass zero (0) for autodetection.
 * @param motherForest A reference to the FastRandomForest object that
 *                     invoked this.
 *
 * @throws Exception if the classifier could not be built successfully
 */
public void buildClassifier(Instances data, int numThreads, FastRandomForest motherForest) throws Exception {

    // can classifier handle the data?
    getCapabilities().testWithFail(data);

    // remove instances with missing class; copy first so the caller's dataset is untouched
    data = new Instances(data);
    data.deleteWithMissingClass();

    if (!(m_Classifier instanceof FastRandomTree))
        throw new IllegalArgumentException(
                "The FastRfBagging class accepts " + "only FastRandomTree as its base classifier.");

    /* We fill the m_Classifiers array by creating lots of trees with new()
     * because this is much faster than using serialization to deep-copy the
     * one tree in m_Classifier - this is what the super.buildClassifier(data)
     * normally does. */
    m_Classifiers = new Classifier[m_NumIterations];
    for (int i = 0; i < m_Classifiers.length; i++) {
        FastRandomTree curTree = new FastRandomTree();
        // all parameters for training will be looked up in the motherForest (maxDepth, k_Value)
        curTree.m_MotherForest = motherForest;
        // 0.99: reference to these arrays will get passed down all nodes so the array can be re-used 
        // 0.99: this array is of size two as now all splits are binary - even categorical ones
        curTree.tempProps = new double[2];
        curTree.tempDists = new double[2][];
        curTree.tempDists[0] = new double[data.numClasses()];
        curTree.tempDists[1] = new double[data.numClasses()];
        curTree.tempDistsOther = new double[2][];
        curTree.tempDistsOther[0] = new double[data.numClasses()];
        curTree.tempDistsOther[1] = new double[data.numClasses()];
        m_Classifiers[i] = curTree;
    }

    // this was SLOW.. takes approx 1/2 time as training the forest afterwards (!!!)
    // super.buildClassifier(data);

    if (m_CalcOutOfBag && (m_BagSizePercent != 100)) {
        throw new IllegalArgumentException(
                "Bag size needs to be 100% if " + "out-of-bag error is to be calculated!");
    }

    // sorting is performed inside this constructor
    DataCache myData = new DataCache(data);

    int bagSize = data.numInstances() * m_BagSizePercent / 100;
    Random random = new Random(m_Seed);

    // inBag[t][i] == true iff instance i was sampled into tree t's bootstrap bag;
    // needed later for OOB error / importance computation
    boolean[][] inBag = new boolean[m_Classifiers.length][];

    // thread management: one pool shared by training, OOB and importance passes
    ExecutorService threadPool = Executors
            .newFixedThreadPool(numThreads > 0 ? numThreads : Runtime.getRuntime().availableProcessors());
    List<Future<?>> futures = new ArrayList<Future<?>>(m_Classifiers.length);

    try {

        for (int treeIdx = 0; treeIdx < m_Classifiers.length; treeIdx++) {

            // create the in-bag dataset (and be sure to remember what's in bag)
            // for computing the out-of-bag error later
            DataCache bagData = myData.resample(bagSize, random);
            bagData.reusableRandomGenerator = bagData.getRandomNumberGenerator(random.nextInt());
            inBag[treeIdx] = bagData.inBag; // store later for OOB error calculation

            // build the classifier
            if (m_Classifiers[treeIdx] instanceof FastRandomTree) {

                FastRandomTree aTree = (FastRandomTree) m_Classifiers[treeIdx];
                aTree.data = bagData;

                // the tree trains itself asynchronously when run by the pool
                Future<?> future = threadPool.submit(aTree);
                futures.add(future);

            } else {
                throw new IllegalArgumentException(
                        "The FastRfBagging class accepts " + "only FastRandomTree as its base classifier.");
            }

        }

        // make sure all trees have been trained before proceeding
        for (int treeIdx = 0; treeIdx < m_Classifiers.length; treeIdx++) {
            futures.get(treeIdx).get();

        }

        // calc OOB error?
        if (getCalcOutOfBag() || getComputeImportances()) {
            //m_OutOfBagError = computeOOBError(data, inBag, threadPool);
            m_OutOfBagError = computeOOBError(myData, inBag, threadPool);
        } else {
            m_OutOfBagError = 0;
        }

        // calc feature importances (permutation importance: importance of attribute j is
        // the increase in OOB error after scrambling column j)
        m_FeatureImportances = null;
        //m_FeatureNames = null;
        if (getComputeImportances()) {
            m_FeatureImportances = new double[data.numAttributes()];
            ///m_FeatureNames = new String[data.numAttributes()];
            //Instances dataCopy = new Instances(data); //To scramble
            //int[] permutation = FastRfUtils.randomPermutation(data.numInstances(), random);
            for (int j = 0; j < data.numAttributes(); j++) {
                if (j != data.classIndex()) {
                    //double sError = computeOOBError(FastRfUtils.scramble(data, dataCopy, j, permutation), inBag, threadPool);
                    //double sError = computeOOBError(data, inBag, threadPool, j, 0);
                    // scrambleOneAttribute permutes column j in-place and returns the original values
                    float[] unscrambled = myData.scrambleOneAttribute(j, random);
                    double sError = computeOOBError(myData, inBag, threadPool);
                    myData.vals[j] = unscrambled; // restore the original state
                    m_FeatureImportances[j] = sError - m_OutOfBagError;
                }
                //m_FeatureNames[j] = data.attribute(j).name();
            }
        }

        threadPool.shutdown();

    } finally {
        // shutdownNow() is a no-op after a completed shutdown(), but guarantees the pool
        // dies if any of the above threw
        threadPool.shutdownNow();
    }
}

From source file:com.tum.classifiertest.FastRfUtils.java

License:Open Source License

/**
 * Load a dataset into memory./*ww  w  . j ava 2  s. c  om*/
 *
 * @param location the location of the dataset
 *
 * @return the dataset
 */
public static Instances readInstances(String location) throws Exception {
    Instances data = new weka.core.converters.ConverterUtils.DataSource(location).getDataSet();
    if (data.classIndex() == -1)
        data.setClassIndex(data.numAttributes() - 1);
    return data;
}

From source file:com.yahoo.labs.samoa.instances.WekaToSamoaInstanceConverter.java

License:Apache License

/**
* Samoa instances information./*from   w  ww  .jav a2s. c o m*/
*
* @param instances the instances
* @return the instances
*/
public Instances samoaInstancesInformation(weka.core.Instances instances) {
    Instances samoaInstances;
    List<Attribute> attInfo = new ArrayList<Attribute>();
    for (int i = 0; i < instances.numAttributes(); i++) {
        attInfo.add(samoaAttribute(i, instances.attribute(i)));
    }
    samoaInstances = new Instances(instances.relationName(), attInfo, 0);
    samoaInstances.setClassIndex(instances.classIndex());
    return samoaInstances;
}

From source file:com.zazhu.BlueHub.BlueHub.java

License:Apache License

/**
 * receives the last reads from the sensors and creates the features we use
 * only the acc x,y,z (either from internal or external sensor)
 * /*w  w  w  .j  a  va  2  s .c o m*/
 * @param sensorQueue
 * @throws Exception 
 */
private Instance processingSenseData(Queue<String> sensorQueue, char whatSensor) throws Exception {

    BufferedReader reader;
    Instances format;
    Instance newInstance = null;

    Log.d(TAG, "Queue size = " + mQueueSize);

    if (sensorQueue.size() <= 0)
        throw new Exception("Queue empty");

    // create the arrays that will contain the accelerometer data
    // s.x s.y s.z
    double[] sx = new double[sensorQueue.size()];
    double[] sy = new double[sensorQueue.size()];
    double[] sz = new double[sensorQueue.size()];

    String rawReading;
    StringTokenizer st;

    int index;

    if (D)
        Log.e(TAG, "+++ COMPUTING FEATURES +++");

    // 1. collect raw data. what kind of sensing data? external vs. internal
    switch (whatSensor) {
    case EXTERNAL:
        index = 0;
        while ((rawReading = sensorQueue.poll()) != null) {
            // FORMAT:
            // "Time_SensorName_SensorNumber_Counter_Xacc_Yacc_Zacc_Xgyro_Ygyro_checksum"
            // position of the values needed: s.x = 4, s.y = 5, s.z = 6
            st = new StringTokenizer(rawReading, FIELD_SEP);
            // not needed data
            for (int i = 0; i < 4; i++)
                st.nextToken();
            // s.x, s.y, s.z
            sx[index] = Double.valueOf(st.nextToken());
            sy[index] = Double.valueOf(st.nextToken());
            sz[index] = Double.valueOf(st.nextToken());

            index += 1;
        }

        // 2. process raw data
        // 2.1 read the input format for the instance (TODO must be changed to
        // use weka classes)
        reader = new BufferedReader(new InputStreamReader(getResources().openRawResource(R.raw.format_extern)));

        try {
            format = new Instances(reader);

            if (format.classIndex() == -1)
                format.setClassIndex(format.numAttributes() - 1);

            // 2.2 create a new instance
            newInstance = new DenseInstance(7);
            newInstance.setDataset(format);
            // set attributes
            newInstance.setValue(format.attribute(0), Feature.getStd(sx));
            newInstance.setValue(format.attribute(1), Feature.getStd(sy));
            newInstance.setValue(format.attribute(2), Feature.getStd(sz));
            newInstance.setValue(format.attribute(3), Feature.getMean(sx));
            newInstance.setValue(format.attribute(4), Feature.getMean(sy));
            newInstance.setValue(format.attribute(5), Feature.getMean(sz));
            // set unknown class
            newInstance.setMissing(format.attribute(6));

        } catch (IOException e) {
            e.printStackTrace();
        }

        break;
    case INTERNAL:

        index = 0;
        while ((rawReading = sensorQueue.poll()) != null) {

            // FORMAT "Xacc_Yacc_Zacc"
            // position of the values needed: s.x = 0, s.y = 1, s.z = 2
            st = new StringTokenizer(rawReading, FIELD_SEP);

            // s.x, s.y, s.z
            sx[index] = Double.valueOf(st.nextToken());
            sy[index] = Double.valueOf(st.nextToken());
            sz[index] = Double.valueOf(st.nextToken());

            index += 1;
        }

        // 2. process raw data
        // 2.1 read the input format for the instance (TODO must be changed to
        // use weka classes)
        reader = new BufferedReader(new InputStreamReader(getResources().openRawResource(R.raw.format_intern)));

        try {
            format = new Instances(reader);

            if (format.classIndex() == -1)
                format.setClassIndex(format.numAttributes() - 1);

            // 2.2 create a new instance
            newInstance = new DenseInstance(7);
            newInstance.setDataset(format);
            // set attributes
            newInstance.setValue(format.attribute(0), Feature.getStd(sx));
            newInstance.setValue(format.attribute(1), Feature.getStd(sy));
            newInstance.setValue(format.attribute(2), Feature.getStd(sz));
            newInstance.setValue(format.attribute(3), Feature.getMean(sx));
            newInstance.setValue(format.attribute(4), Feature.getMean(sy));
            newInstance.setValue(format.attribute(5), Feature.getMean(sz));
            // set unknown class
            newInstance.setMissing(format.attribute(6));

        } catch (IOException e) {
            e.printStackTrace();
        }

        break;
    default:
        if (D)
            Log.e(TAG, "+++ COMPUTING FEATURES: NO VALUE FOR THE SENSOR READING +++");
        break;
    }

    return newInstance;

}

From source file:core.classifier.MyFirstClassifier.java

License:Open Source License

/**
 * Method for building the classifier. Implements a one-against-one
 * wrapper for multi-class problems: one BinarySMO is trained for each
 * unordered pair of classes, on the union of those two classes' instances.
 *
 * <p>Before training, the data is (optionally) cleaned of missing-class and
 * zero-weight instances, missing values are replaced, nominal attributes are
 * binarized if necessary, and the chosen normalization/standardization filter
 * is applied — in that order, since each filter is fitted on the output of
 * the previous one.</p>
 *
 * @param insts the set of training instances
 * @throws Exception if the classifier can't be built successfully
 */
public void buildClassifier(Instances insts) throws Exception {

    if (!m_checksTurnedOff) {
        // can classifier handle the data?
        getCapabilities().testWithFail(insts);

        // remove instances with missing class
        insts = new Instances(insts);
        insts.deleteWithMissingClass();

        /* Removes all the instances with weight equal to 0.
         MUST be done since condition (8) of Keerthi's paper
         is made with the assertion Ci > 0 (See equation (3a). */
        Instances data = new Instances(insts, insts.numInstances());
        for (int i = 0; i < insts.numInstances(); i++) {
            if (insts.instance(i).weight() > 0)
                data.add(insts.instance(i));
        }
        if (data.numInstances() == 0) {
            throw new Exception("No training instances left after removing " + "instances with weight 0!");
        }
        insts = data;
    }

    // replace missing values (skipped entirely when checks are off)
    if (!m_checksTurnedOff) {
        m_Missing = new ReplaceMissingValues();
        m_Missing.setInputFormat(insts);
        insts = Filter.useFilter(insts, m_Missing);
    } else {
        m_Missing = null;
    }

    // binarize nominal attributes, but only if some non-class attribute
    // actually is non-numeric (scan skipped when checks are off)
    if (getCapabilities().handles(Capability.NUMERIC_ATTRIBUTES)) {
        boolean onlyNumeric = true;
        if (!m_checksTurnedOff) {
            for (int i = 0; i < insts.numAttributes(); i++) {
                if (i != insts.classIndex()) {
                    if (!insts.attribute(i).isNumeric()) {
                        onlyNumeric = false;
                        break;
                    }
                }
            }
        }

        if (!onlyNumeric) {
            m_NominalToBinary = new NominalToBinary();
            m_NominalToBinary.setInputFormat(insts);
            insts = Filter.useFilter(insts, m_NominalToBinary);
        } else {
            m_NominalToBinary = null;
        }
    } else {
        m_NominalToBinary = null;
    }

    // apply the configured attribute-scaling filter, if any
    if (m_filterType == FILTER_STANDARDIZE) {
        m_Filter = new Standardize();
        m_Filter.setInputFormat(insts);
        insts = Filter.useFilter(insts, m_Filter);
    } else if (m_filterType == FILTER_NORMALIZE) {
        m_Filter = new Normalize();
        m_Filter.setInputFormat(insts);
        insts = Filter.useFilter(insts, m_Filter);
    } else {
        m_Filter = null;
    }

    m_classIndex = insts.classIndex();
    m_classAttribute = insts.classAttribute();
    // linear kernel <=> polynomial kernel with exponent 1.0
    m_KernelIsLinear = (m_kernel instanceof PolyKernel) && (((PolyKernel) m_kernel).getExponent() == 1.0);

    // Generate subsets representing each class
    Instances[] subsets = new Instances[insts.numClasses()];
    for (int i = 0; i < insts.numClasses(); i++) {
        subsets[i] = new Instances(insts, insts.numInstances());
    }
    for (int j = 0; j < insts.numInstances(); j++) {
        Instance inst = insts.instance(j);
        subsets[(int) inst.classValue()].add(inst);
    }
    for (int i = 0; i < insts.numClasses(); i++) {
        subsets[i].compactify();
    }

    // Build the binary classifiers: one per unordered pair (i, j), i < j,
    // each trained on the union of class i's and class j's instances
    Random rand = new Random(m_randomSeed);
    m_classifiers = new BinarySMO[insts.numClasses()][insts.numClasses()];
    for (int i = 0; i < insts.numClasses(); i++) {
        for (int j = i + 1; j < insts.numClasses(); j++) {
            m_classifiers[i][j] = new BinarySMO();
            m_classifiers[i][j].setKernel(Kernel.makeCopy(getKernel()));
            Instances data = new Instances(insts, insts.numInstances());
            for (int k = 0; k < subsets[i].numInstances(); k++) {
                data.add(subsets[i].instance(k));
            }
            for (int k = 0; k < subsets[j].numInstances(); k++) {
                data.add(subsets[j].instance(k));
            }
            data.compactify();
            data.randomize(rand);
            m_classifiers[i][j].buildClassifier(data, i, j, m_fitLogisticModels, m_numFolds, m_randomSeed);
        }
    }
}

From source file:core.ClusterEvaluationEX.java

License:Open Source License

/**
 * Evaluate the clusterer on a set of instances. Calculates clustering
 * statistics and stores cluster assigments for the instances in
 * m_clusterAssignments. Instances are streamed one at a time (so an
 * incremental test file can be used); the class attribute, if set, is
 * removed before clustering and used afterwards for class-based evaluation.
 * 
 * @param test the set of instances to cluster
 * @param testFileName the name of the test file for incremental testing, 
 * if "" or null then not used
 * @param outputModel true if the clustering model is to be output as well
 * as the stats
 * 
 * @throws Exception if something goes wrong
 */
public void evaluateClusterer(Instances test, String testFileName, boolean outputModel) throws Exception {
    int i = 0;
    int cnum;
    double loglk = 0.0;
    int cc = m_Clusterer.numberOfClusters();
    m_numClusters = cc;
    double[] instanceStats = new double[cc];   // per-cluster instance counts
    Instances testRaw = null;
    boolean hasClass = (test.classIndex() >= 0);
    int unclusteredInstances = 0;
    Vector<Double> clusterAssignments = new Vector<Double>();
    Filter filter = null;
    DataSource source = null;
    Instance inst;

    if (testFileName == null)
        testFileName = "";

    // load data: prefer the file (incremental) source, otherwise wrap the
    // in-memory instances
    if (testFileName.length() != 0)
        source = new DataSource(testFileName);
    else
        source = new DataSource(test);
    testRaw = source.getStructure(test.classIndex());

    // If class is set then do class based evaluation as well
    if (hasClass) {
        if (testRaw.classAttribute().isNumeric())
            throw new Exception("ClusterEvaluation: Class must be nominal!");

        // the class attribute must not be visible to the clusterer
        // (Remove uses 1-based attribute indices, hence the +1)
        filter = new Remove();
        ((Remove) filter).setAttributeIndices("" + (testRaw.classIndex() + 1));
        ((Remove) filter).setInvertSelection(false);
        filter.setInputFormat(testRaw);
    }

    i = 0;
    while (source.hasMoreElements(testRaw)) {
        // next instance
        inst = source.nextElement(testRaw);
        if (filter != null) {
            filter.input(inst);
            filter.batchFinished();
            inst = filter.output();
        }

        cnum = -1;
        try {
            if (m_Clusterer instanceof DensityBasedClusterer) {
                // accumulate log likelihood in addition to the assignment
                loglk += ((DensityBasedClusterer) m_Clusterer).logDensityForInstance(inst);
                cnum = m_Clusterer.clusterInstance(inst);
                clusterAssignments.add((double) cnum);
            } else {
                cnum = m_Clusterer.clusterInstance(inst);
                clusterAssignments.add((double) cnum);
            }
        } catch (Exception e) {
            // instance could not be clustered: record it as -1 ("unclustered")
            clusterAssignments.add(-1.0);
            unclusteredInstances++;
        }

        if (cnum != -1) {
            instanceStats[cnum]++;
        }
    }

    // normalize the log likelihood by the number of clustered instances
    double sum = Utils.sum(instanceStats);
    loglk /= sum;
    m_logL = loglk;
    m_clusterAssignments = new double[clusterAssignments.size()];
    for (i = 0; i < clusterAssignments.size(); i++) {
        m_clusterAssignments[i] = clusterAssignments.get(i);
    }
    // column widths for the report, sized from the decimal magnitude
    int numInstFieldWidth = (int) ((Math.log(clusterAssignments.size()) / Math.log(10)) + 1);

    if (outputModel) {
        m_clusteringResults.append(m_Clusterer.toString());
    }
    m_clusteringResults.append("Clustered Instances\n\n");
    int clustFieldWidth = (int) ((Math.log(cc) / Math.log(10)) + 1);
    for (i = 0; i < cc; i++) {
        if (instanceStats[i] > 0)
            m_clusteringResults.append(Utils.doubleToString((double) i, clustFieldWidth, 0) + "      "
                    + Utils.doubleToString(instanceStats[i], numInstFieldWidth, 0) + " ("
                    + Utils.doubleToString((instanceStats[i] / sum * 100.0), 3, 0) + "%)\n");
    }

    if (unclusteredInstances > 0)
        m_clusteringResults.append("\nUnclustered instances : " + unclusteredInstances);

    if (m_Clusterer instanceof DensityBasedClusterer)
        m_clusteringResults.append("\n\nLog likelihood: " + Utils.doubleToString(loglk, 1, 5) + "\n");

    if (hasClass) {
        evaluateClustersWithRespectToClass(test, testFileName);
    }
}

From source file:core.ClusterEvaluationEX.java

License:Open Source License

/**
 * Evaluates cluster assignments with respect to actual class labels.
 * Assumes that m_Clusterer has been trained and tested on 
 * inst (minus the class), i.e. m_clusterAssignments is already populated
 * and aligned one-to-one with the instances streamed here.
 * 
 * @param inst the instances (including class) to evaluate with respect to
 * @param fileName the name of the test file for incremental testing, 
 * if "" or null then not used
 * @throws Exception if something goes wrong
 */
private void evaluateClustersWithRespectToClass(Instances inst, String fileName) throws Exception {

    int numClasses = inst.classAttribute().numValues();
    // counts[c][k]: number of instances assigned to cluster c whose true class is k
    int[][] counts = new int[m_numClusters][numClasses];
    int[] clusterTotals = new int[m_numClusters];
    // best/current hold a candidate cluster->class mapping; the extra slot
    // [m_numClusters] carries the mapping's error count
    double[] best = new double[m_numClusters + 1];
    double[] current = new double[m_numClusters + 1];
    DataSource source = null;
    Instances instances = null;
    Instance instance = null;
    int i;
    int numInstances;

    if (fileName == null)
        fileName = "";

    // re-stream the same data the clusterer was tested on
    if (fileName.length() != 0) {
        source = new DataSource(fileName);
    } else
        source = new DataSource(inst);
    instances = source.getStructure(inst.classIndex());

    i = 0;
    while (source.hasMoreElements(instances)) {
        instance = source.nextElement(instances);
        // assignments of -1 mark instances that could not be clustered; skip them
        if (m_clusterAssignments[i] >= 0) {
            counts[(int) m_clusterAssignments[i]][(int) instance.classValue()]++;
            clusterTotals[(int) m_clusterAssignments[i]]++;
        }
        i++;
    }
    numInstances = i;

    // search for the minimum-error cluster->class assignment
    best[m_numClusters] = Double.MAX_VALUE;
    mapClasses(m_numClusters, 0, counts, clusterTotals, current, best, 0);

    m_clusteringResults.append("\n\nClass attribute: " + inst.classAttribute().name() + "\n");
    m_clusteringResults.append("Classes to Clusters:\n");
    String matrixString = toMatrixString(counts, clusterTotals, new Instances(inst, 0));
    m_clusteringResults.append(matrixString).append("\n");

    int Cwidth = 1 + (int) (Math.log(m_numClusters) / Math.log(10));
    // add the minimum error assignment
    for (i = 0; i < m_numClusters; i++) {
        if (clusterTotals[i] > 0) {
            m_clusteringResults.append("Cluster " + Utils.doubleToString((double) i, Cwidth, 0));
            m_clusteringResults.append(" <-- ");

            // best[i] < 0 means no class was mapped to this cluster
            if (best[i] < 0) {
                m_clusteringResults.append("No class\n");
            } else {
                m_clusteringResults.append(inst.classAttribute().value((int) best[i])).append("\n");
            }
        }
    }
    m_clusteringResults.append("\nIncorrectly clustered instances :\t" + best[m_numClusters] + "\t"
            + (Utils.doubleToString((best[m_numClusters] / numInstances * 100.0), 8, 4)) + " %\n");

    // copy the class assignments
    m_classToCluster = new int[m_numClusters];
    for (i = 0; i < m_numClusters; i++) {
        m_classToCluster[i] = (int) best[i];
    }
}

From source file:core.ClusterEvaluationEX.java

License:Open Source License

/**
 * Evaluates a clusterer with the options given in an array of
 * strings. It takes the string indicated by "-t" as training file, the
 * string indicated by "-T" as test file.
 * If the test file is missing, a stratified ten-fold
 * cross-validation is performed (distribution clusterers only).
 * Using "-x" you can change the number of
 * folds to be used, and using "-s" the random seed.
 * If the "-p" option is present it outputs the classification for
 * each test instance. If you provide the name of an object file using
 * "-l", a clusterer will be loaded from the given file. If you provide the
 * name of an object file using "-d", the clusterer built from the
 * training data will be saved to the given file.
 *
 * @param clusterer machine learning clusterer
 * @param options the array of string containing the options
 * @throws Exception if model could not be evaluated successfully
 * @return a string describing the results 
 */
public static String evaluateClusterer(Clusterer clusterer, String[] options) throws Exception {

    int seed = 1, folds = 10;
    boolean doXval = false;
    Instances train = null;
    Random random;
    String trainFileName, testFileName, seedString, foldsString;
    String objectInputFileName, objectOutputFileName, attributeRangeString;
    String graphFileName;
    String[] savedOptions = null;
    boolean printClusterAssignments = false;
    Range attributesToOutput = null;
    StringBuffer text = new StringBuffer();
    int theClass = -1; // class based evaluation of clustering; 1-based, -1 = none
    boolean updateable = (clusterer instanceof UpdateableClusterer);
    DataSource source = null;
    Instance inst;

    if (Utils.getFlag('h', options) || Utils.getFlag("help", options)) {

        // global info requested as well?
        boolean globalInfo = Utils.getFlag("synopsis", options) || Utils.getFlag("info", options);

        throw new Exception("Help requested." + makeOptionString(clusterer, globalInfo));
    }

    try {
        // Get basic options (options the same for all clusterers
        //printClusterAssignments = Utils.getFlag('p', options);
        objectInputFileName = Utils.getOption('l', options);
        objectOutputFileName = Utils.getOption('d', options);
        trainFileName = Utils.getOption('t', options);
        testFileName = Utils.getOption('T', options);
        graphFileName = Utils.getOption('g', options);

        // Check -p option: it takes an attribute range, not a bare flag
        try {
            attributeRangeString = Utils.getOption('p', options);
        } catch (Exception e) {
            throw new Exception(e.getMessage() + "\nNOTE: the -p option has changed. "
                    + "It now expects a parameter specifying a range of attributes "
                    + "to list with the predictions. Use '-p 0' for none.");
        }
        if (attributeRangeString.length() != 0) {
            printClusterAssignments = true;
            // "-p 0" means: print assignments but list no attributes
            if (!attributeRangeString.equals("0"))
                attributesToOutput = new Range(attributeRangeString);
        }

        // sanity checks on the combination of input files
        if (trainFileName.length() == 0) {
            if (objectInputFileName.length() == 0) {
                throw new Exception("No training file and no object " + "input file given.");
            }

            if (testFileName.length() == 0) {
                throw new Exception("No training file and no test file given.");
            }
        } else {
            if ((objectInputFileName.length() != 0) && (printClusterAssignments == false)) {
                throw new Exception("Can't use both train and model file " + "unless -p specified.");
            }
        }

        seedString = Utils.getOption('s', options);

        if (seedString.length() != 0) {
            seed = Integer.parseInt(seedString);
        }

        foldsString = Utils.getOption('x', options);

        if (foldsString.length() != 0) {
            folds = Integer.parseInt(foldsString);
            doXval = true;
        }
    } catch (Exception e) {
        throw new Exception('\n' + e.getMessage() + makeOptionString(clusterer, false));
    }

    try {
        if (trainFileName.length() != 0) {
            source = new DataSource(trainFileName);
            train = source.getStructure();

            // resolve the -c option (class attribute, 1-based; "first"/"last" accepted)
            String classString = Utils.getOption('c', options);
            if (classString.length() != 0) {
                if (classString.compareTo("last") == 0)
                    theClass = train.numAttributes();
                else if (classString.compareTo("first") == 0)
                    theClass = 1;
                else
                    theClass = Integer.parseInt(classString);

                // class-based evaluation is only valid on training data with a
                // freshly built clusterer
                if (theClass != -1) {
                    if (doXval || testFileName.length() != 0)
                        throw new Exception("Can only do class based evaluation on the " + "training data");

                    if (objectInputFileName.length() != 0)
                        throw new Exception("Can't load a clusterer and do class based " + "evaluation");

                    if (objectOutputFileName.length() != 0)
                        throw new Exception("Can't do class based evaluation and save clusterer");
                }
            } else {
                // if the dataset defines a class attribute, use it
                if (train.classIndex() != -1) {
                    theClass = train.classIndex() + 1;
                    System.err
                            .println("Note: using class attribute from dataset, i.e., attribute #" + theClass);
                }
            }

            if (theClass != -1) {
                if (theClass < 1 || theClass > train.numAttributes())
                    throw new Exception("Class is out of range!");

                if (!train.attribute(theClass - 1).isNominal())
                    throw new Exception("Class must be nominal!");

                train.setClassIndex(theClass - 1);
            }
        }
    } catch (Exception e) {
        throw new Exception("ClusterEvaluation: " + e.getMessage() + '.');
    }

    // Save options so cross-validation can re-apply them later
    if (options != null) {
        savedOptions = new String[options.length];
        System.arraycopy(options, 0, savedOptions, 0, options.length);
    }

    if (objectInputFileName.length() != 0)
        Utils.checkForRemainingOptions(options);

    // Set options for clusterer
    if (clusterer instanceof OptionHandler)
        ((OptionHandler) clusterer).setOptions(options);

    Utils.checkForRemainingOptions(options);

    Instances trainHeader = train;
    if (objectInputFileName.length() != 0) {
        // Load the clusterer from file
        //      clusterer = (Clusterer) SerializationHelper.read(objectInputFileName);
        java.io.ObjectInputStream ois = new java.io.ObjectInputStream(
                new java.io.BufferedInputStream(new java.io.FileInputStream(objectInputFileName)));
        clusterer = (Clusterer) ois.readObject();
        // try and get the training header (may not be present in older model files)
        try {
            trainHeader = (Instances) ois.readObject();
        } catch (Exception ex) {
            // don't moan if we cant
        }
    } else {
        // Build the clusterer if no object file provided
        if (theClass == -1) {
            // incremental clusterers are fed one instance at a time
            if (updateable) {
                clusterer.buildClusterer(source.getStructure());
                while (source.hasMoreElements(train)) {
                    inst = source.nextElement(train);
                    ((UpdateableClusterer) clusterer).updateClusterer(inst);
                }
                ((UpdateableClusterer) clusterer).updateFinished();
            } else {
                clusterer.buildClusterer(source.getDataSet());
            }
        } else {
            // class-based evaluation: train with the class attribute removed,
            // then evaluate the assignments against the class labels
            Remove removeClass = new Remove();
            removeClass.setAttributeIndices("" + theClass);
            removeClass.setInvertSelection(false);
            removeClass.setInputFormat(train);
            if (updateable) {
                Instances clusterTrain = Filter.useFilter(train, removeClass);
                clusterer.buildClusterer(clusterTrain);
                trainHeader = clusterTrain;
                while (source.hasMoreElements(train)) {
                    inst = source.nextElement(train);
                    removeClass.input(inst);
                    removeClass.batchFinished();
                    Instance clusterTrainInst = removeClass.output();
                    ((UpdateableClusterer) clusterer).updateClusterer(clusterTrainInst);
                }
                ((UpdateableClusterer) clusterer).updateFinished();
            } else {
                Instances clusterTrain = Filter.useFilter(source.getDataSet(), removeClass);
                clusterer.buildClusterer(clusterTrain);
                trainHeader = clusterTrain;
            }
            ClusterEvaluationEX ce = new ClusterEvaluationEX();
            ce.setClusterer(clusterer);
            ce.evaluateClusterer(train, trainFileName);

            // class-based evaluation stops here; no test-set/xval path applies
            return "\n\n=== Clustering stats for training data ===\n\n" + ce.clusterResultsToString();
        }
    }

    /* Output cluster predictions only (for the test data if specified,
       otherwise for the training data */
    if (printClusterAssignments) {
        return printClusterings(clusterer, trainFileName, testFileName, attributesToOutput);
    }

    text.append(clusterer.toString());
    text.append(
            "\n\n=== Clustering stats for training data ===\n\n" + printClusterStats(clusterer, trainFileName));

    if (testFileName.length() != 0) {
        // check header compatibility
        DataSource test = new DataSource(testFileName);
        Instances testStructure = test.getStructure();
        if (!trainHeader.equalHeaders(testStructure)) {
            throw new Exception("Training and testing data are not compatible\n");
        }

        text.append("\n\n=== Clustering stats for testing data ===\n\n"
                + printClusterStats(clusterer, testFileName));
    }

    // no test set: cross-validate the log likelihood (density clusterers only)
    if ((clusterer instanceof DensityBasedClusterer) && (doXval == true) && (testFileName.length() == 0)
            && (objectInputFileName.length() == 0)) {
        // cross validate the log likelihood on the training data
        random = new Random(seed);
        random.setSeed(seed);
        train = source.getDataSet();
        train.randomize(random);
        text.append(crossValidateModel(clusterer.getClass().getName(), train, folds, savedOptions, random));
    }

    // Save the clusterer if an object output file is provided
    if (objectOutputFileName.length() != 0) {
        //SerializationHelper.write(objectOutputFileName, clusterer);
        saveClusterer(objectOutputFileName, clusterer, trainHeader);
    }

    // If classifier is drawable output string describing graph
    if ((clusterer instanceof Drawable) && (graphFileName.length() != 0)) {
        BufferedWriter writer = new BufferedWriter(new FileWriter(graphFileName));
        writer.write(((Drawable) clusterer).graph());
        writer.newLine();
        writer.flush();
        writer.close();
    }

    return text.toString();
}

From source file:core.TextDirectoryLoader.java

License:Open Source License

/**
 * Return the full data set: one instance per text file found under the class
 * sub-directories of the source directory. If the structure hasn't yet been
 * determined by a call to getStructure, this method does so before processing
 * the rest of the data set.
 *
 * @return the dataset containing one string (+ optional filename) attribute
 *         and the class attribute per document
 * @throws IOException if there is no source or parsing fails
 */
@Override
public Instances getDataSet() throws IOException {
    if (getDirectory() == null) {
        throw new IOException("No directory/source has been specified");
    }

    String directoryPath = getDirectory().getAbsolutePath();

    // Collect the class labels (sub-directory names) from the class attribute.
    ArrayList<String> classes = new ArrayList<String>();
    Enumeration<Object> enm = getStructure().classAttribute().enumerateValues();
    while (enm.hasMoreElements()) {
        Object oo = enm.nextElement();
        if (oo instanceof SerializedObject) {
            classes.add(((SerializedObject) oo).getObject().toString());
        } else {
            classes.add(oo.toString());
        }
    }

    Instances data = getStructure();
    int fileCount = 0;
    for (int k = 0; k < classes.size(); k++) {
        String subdirPath = classes.get(k);
        File subdir = new File(directoryPath + File.separator + subdirPath);
        String[] files = subdir.list();
        if (files == null) {
            // sub-directory is missing or unreadable; skip it instead of NPE-ing
            continue;
        }
        for (String file : files) {
            try {
                fileCount++;
                if (getDebug()) {
                    System.err.println("processing " + fileCount + " : " + subdirPath + " : " + file);
                }

                // attribute 0 = document text, optional attribute 1 = filename,
                // last slot (class index) = class value
                double[] newInst = m_OutputFilename ? new double[3] : new double[2];
                File txt = new File(directoryPath + File.separator + subdirPath + File.separator + file);

                // Read the whole document, honouring the configured charset.
                // (The original opened a charset-aware reader but then read the
                // file through a second, platform-default FileReader — ignoring
                // m_charSet and leaking both readers.)
                BufferedReader is;
                if (m_charSet == null || m_charSet.length() == 0) {
                    is = new BufferedReader(new InputStreamReader(new FileInputStream(txt)));
                } else {
                    is = new BufferedReader(new InputStreamReader(new FileInputStream(txt), m_charSet));
                }
                StringBuffer txtStr = new StringBuffer();
                try {
                    String line;
                    while ((line = is.readLine()) != null) {
                        txtStr.append(line + System.getProperty("line.separator"));
                    }
                } finally {
                    is.close(); // always release the file handle
                }

                newInst[0] = data.attribute(0).addStringValue(txtStr.toString());
                if (m_OutputFilename) {
                    newInst[1] = data.attribute(1).addStringValue(subdirPath + File.separator + file);
                }
                newInst[data.classIndex()] = k;
                data.add(new DenseInstance(1.0, newInst));
            } catch (Exception e) {
                // best-effort conversion: report and keep going with other files
                System.err.println("failed to convert file: " + directoryPath + File.separator + subdirPath
                        + File.separator + file);
            }
        }
    }

    return data;
}

From source file:core.TextDirectoryLoader.java

License:Open Source License

/**
 * Process input directories/files incrementally, cycling round-robin through
 * the class sub-directories and returning one document instance per call.
 *
 * @param structure the dataset header; its class attribute values name the
 *                  class sub-directories
 * @return the next instance, or null once every document has been consumed
 * @throws IOException if a problem occurs reading a document
 */
@Override
public Instance getNextInstance(Instances structure) throws IOException {
    String directoryPath = getDirectory().getAbsolutePath();
    Attribute classAtt = structure.classAttribute();

    // First call: build one queue of pending document names per class value.
    if (m_filesByClass == null) {
        m_filesByClass = new ArrayList<LinkedList<String>>();
        for (int i = 0; i < classAtt.numValues(); i++) {
            File classDir = new File(directoryPath + File.separator + classAtt.value(i));
            String[] files = classDir.list();
            LinkedList<String> classDocs = new LinkedList<String>();
            if (files != null) { // missing/unreadable dir -> empty queue, not NPE
                for (String cd : files) {
                    File txt = new File(
                            directoryPath + File.separator + classAtt.value(i) + File.separator + cd);
                    if (txt.isFile()) {
                        classDocs.add(cd);
                    }
                }
            }
            m_filesByClass.add(classDocs);
        }
    }

    // Cycle through the classes looking for one with documents remaining.
    int count = 0;
    LinkedList<String> classContents = m_filesByClass.get(m_lastClassDir);
    boolean found = (classContents.size() > 0);
    while (classContents.size() == 0) {
        m_lastClassDir++;
        count++;
        if (m_lastClassDir == structure.classAttribute().numValues()) {
            m_lastClassDir = 0; // wrap around to the first class
        }
        classContents = m_filesByClass.get(m_lastClassDir);
        if (classContents.size() > 0) {
            found = true; // we have an instance we can create
            break;
        }
        if (count == structure.classAttribute().numValues()) {
            break; // inspected every class queue; all empty -> finished
        }
    }

    if (!found) {
        return null; // done!
    }

    String nextDoc = classContents.poll();
    File txt = new File(
            directoryPath + File.separator + classAtt.value(m_lastClassDir) + File.separator + nextDoc);

    // Read the whole document, honouring the configured charset. Close the
    // reader even when read() throws (the original leaked it on IOException).
    BufferedReader is;
    if (m_charSet == null || m_charSet.length() == 0) {
        is = new BufferedReader(new InputStreamReader(new FileInputStream(txt)));
    } else {
        is = new BufferedReader(new InputStreamReader(new FileInputStream(txt), m_charSet));
    }
    StringBuffer txtStr = new StringBuffer();
    try {
        int c;
        while ((c = is.read()) != -1) {
            txtStr.append((char) c);
        }
    } finally {
        is.close();
    }

    // attribute 0 = document text, optional attribute 1 = filename,
    // last slot (class index) = class value
    double[] newInst = m_OutputFilename ? new double[3] : new double[2];

    newInst[0] = 0;
    structure.attribute(0).setStringValue(txtStr.toString());

    if (m_OutputFilename) {
        newInst[1] = 0;
        structure.attribute(1).setStringValue(txt.getAbsolutePath());
    }
    newInst[structure.classIndex()] = m_lastClassDir;
    Instance inst = new DenseInstance(1.0, newInst);
    inst.setDataset(structure);

    // Advance to the next class directory for the following call.
    m_lastClassDir++;
    if (m_lastClassDir == structure.classAttribute().numValues()) {
        m_lastClassDir = 0;
    }

    return inst;
}