Example usage for weka.core Instances attribute

List of usage examples for weka.core Instances attribute

Introduction

In this page you can find the example usage for weka.core Instances attribute.

Prototype

publicAttribute attribute(String name) 

Source Link

Document

Returns an attribute given its name.

Usage

From source file:ExperimentDemo.java

License:Open Source License

/**
 * Expects the following parameters: // ww w .  j  a  v  a  2s .  c om
 * <ul>
 *   <li>-classifier "classifier incl. parameters"</li>
 *   <li>-exptype "classification|regression"</li>
 *   <li>-splittype "crossvalidation|randomsplit"</li>
 *   <li>-runs "# of runs"</li>
 *   <li>-folds "# of cross-validation folds"</li>
 *   <li>-percentage "percentage for randomsplit"</li>
 *   <li>-result "arff file for storing the results"</li>
 *   <li>-t "dataset" (can be supplied multiple times)</li>
 * </ul>
 * 
 * @param args   the commandline arguments
 * @throws Exception   if something goes wrong
 */
public static void main(String[] args) throws Exception {
    // parameters provided?
    if (args.length == 0) {
        System.out.println("\nUsage: weka.examples.experiment.ExperimentDemo\n"
                + "\t   -classifier <classifier incl. parameters>\n"
                + "\t   -exptype <classification|regression>\n"
                + "\t   -splittype <crossvalidation|randomsplit>\n" + "\t   -runs <# of runs>\n"
                + "\t   -folds <folds for CV>\n" + "\t   -percentage <percentage for randomsplit>\n"
                + "\t   -result <ARFF file for storing the results>\n"
                + "\t   -t dataset (can be supplied multiple times)\n");
        System.exit(1);
    }

    // 1. setup the experiment
    System.out.println("Setting up...");
    Experiment exp = new Experiment();
    exp.setPropertyArray(new Classifier[0]);
    exp.setUsePropertyIterator(true);

    String option;

    // classification or regression
    option = Utils.getOption("exptype", args);
    if (option.length() == 0)
        throw new IllegalArgumentException("No experiment type provided!");

    SplitEvaluator se = null;
    /*
     * Interface to objects able to generate a fixed set of results for a particular split of a dataset.
     * The set of results should contain fields related to any settings of the SplitEvaluator (not including the dataset name.
     * For example, one field for the classifier used to get the results, another for the classifier options, etc).
     * Possible implementations of SplitEvaluator: StdClassification results, StdRegression results.
     */
    Classifier sec = null;
    boolean classification = false;
    if (option.equals("classification")) {
        classification = true;
        se = new ClassifierSplitEvaluator();
        /*
         * A SplitEvaluator that produces results for a classification scheme on a nominal class attribute. 
         */
        sec = ((ClassifierSplitEvaluator) se).getClassifier();
    } else if (option.equals("regression")) {
        se = new RegressionSplitEvaluator();
        sec = ((RegressionSplitEvaluator) se).getClassifier();
    } else {
        throw new IllegalArgumentException("Unknown experiment type '" + option + "'!");
    }

    // crossvalidation or randomsplit
    option = Utils.getOption("splittype", args);
    if (option.length() == 0)
        throw new IllegalArgumentException("No split type provided!");

    if (option.equals("crossvalidation")) {
        CrossValidationResultProducer cvrp = new CrossValidationResultProducer();
        /*
         * Generates for each run, carries out an n-fold cross-validation, using the set SplitEvaluator to generate some results.
         * If the class attribute is nominal, the dataset is stratified. Results for each fold are generated, so you may wish to use
         * this in addition with an AveragingResultProducer to obtain averages for each run. 
         */
        option = Utils.getOption("folds", args);
        if (option.length() == 0)
            throw new IllegalArgumentException("No folds provided!");
        cvrp.setNumFolds(Integer.parseInt(option));
        cvrp.setSplitEvaluator(se);

        PropertyNode[] propertyPath = new PropertyNode[2];
        /*
         * Stores information on a property of an object: the class of the object with the property;
         * the property descriptor, and the current value.
         */
        try {
            propertyPath[0] = new PropertyNode(se,
                    new PropertyDescriptor("splitEvaluator", CrossValidationResultProducer.class),
                    CrossValidationResultProducer.class);
            propertyPath[1] = new PropertyNode(sec, new PropertyDescriptor("classifier", se.getClass()),
                    se.getClass());
        } catch (IntrospectionException e) {
            e.printStackTrace();
        }

        exp.setResultProducer(cvrp);
        exp.setPropertyPath(propertyPath);

    } else if (option.equals("randomsplit")) {
        RandomSplitResultProducer rsrp = new RandomSplitResultProducer();
        rsrp.setRandomizeData(true);
        option = Utils.getOption("percentage", args);
        if (option.length() == 0)
            throw new IllegalArgumentException("No percentage provided!");
        rsrp.setTrainPercent(Double.parseDouble(option));
        rsrp.setSplitEvaluator(se);

        PropertyNode[] propertyPath = new PropertyNode[2];
        try {
            propertyPath[0] = new PropertyNode(se,
                    new PropertyDescriptor("splitEvaluator", RandomSplitResultProducer.class),
                    RandomSplitResultProducer.class);
            propertyPath[1] = new PropertyNode(sec, new PropertyDescriptor("classifier", se.getClass()),
                    se.getClass());
        } catch (IntrospectionException e) {
            e.printStackTrace();
        }

        exp.setResultProducer(rsrp);
        exp.setPropertyPath(propertyPath);
    } else {
        throw new IllegalArgumentException("Unknown split type '" + option + "'!");
    }

    // runs
    option = Utils.getOption("runs", args);
    if (option.length() == 0)
        throw new IllegalArgumentException("No runs provided!");
    exp.setRunLower(1);
    exp.setRunUpper(Integer.parseInt(option));

    // classifier
    option = Utils.getOption("classifier", args);
    if (option.length() == 0)
        throw new IllegalArgumentException("No classifier provided!");
    String[] options = Utils.splitOptions(option);
    String classname = options[0];
    options[0] = "";
    Classifier c = (Classifier) Utils.forName(Classifier.class, classname, options);
    exp.setPropertyArray(new Classifier[] { c });

    // datasets
    boolean data = false;
    DefaultListModel model = new DefaultListModel();
    do {
        option = Utils.getOption("t", args);
        if (option.length() > 0) {
            File file = new File(option);
            if (!file.exists())
                throw new IllegalArgumentException("File '" + option + "' does not exist!");
            data = true;
            model.addElement(file);
        }
    } while (option.length() > 0);
    if (!data)
        throw new IllegalArgumentException("No data files provided!");
    exp.setDatasets(model);

    // result
    option = Utils.getOption("result", args);
    if (option.length() == 0)
        throw new IllegalArgumentException("No result file provided!");
    InstancesResultListener irl = new InstancesResultListener();
    irl.setOutputFile(new File(option));
    exp.setResultListener(irl);

    // 2. run experiment
    System.out.println("Initializing...");
    exp.initialize();
    System.out.println("Running...");
    exp.runExperiment();
    System.out.println("Finishing...");
    exp.postProcess();

    // 3. calculate statistics and output them
    System.out.println("Evaluating...");
    PairedTTester tester = new PairedCorrectedTTester();
    /*
     * Calculates T-Test statistics on data stored in a set of instances. 
     */
    Instances result = new Instances(new BufferedReader(new FileReader(irl.getOutputFile())));
    tester.setInstances(result);
    tester.setSortColumn(-1);
    tester.setRunColumn(result.attribute("Key_Run").index());
    if (classification)
        tester.setFoldColumn(result.attribute("Key_Fold").index());
    tester.setDatasetKeyColumns(new Range("" + (result.attribute("Key_Dataset").index() + 1)));
    tester.setResultsetKeyColumns(new Range("" + (result.attribute("Key_Scheme").index() + 1) + ","
            + (result.attribute("Key_Scheme_options").index() + 1) + ","
            + (result.attribute("Key_Scheme_version_ID").index() + 1)));
    tester.setResultMatrix(new ResultMatrixPlainText());
    tester.setDisplayedResultsets(null);
    tester.setSignificanceLevel(0.05);
    tester.setShowStdDevs(true);
    // fill result matrix (but discarding the output)
    if (classification)
        tester.multiResultsetFull(0, result.attribute("Percent_correct").index());
    else
        tester.multiResultsetFull(0, result.attribute("Correlation_coefficient").index());
    // output results for reach dataset
    System.out.println("\nResult:");
    ResultMatrix matrix = tester.getResultMatrix();
    for (int i = 0; i < matrix.getColCount(); i++) {
        System.out.println(matrix.getColName(i));
        System.out.println("    Perc. correct: " + matrix.getMean(i, 0));
        System.out.println("    StdDev: " + matrix.getStdDev(i, 0));
    }
}

From source file:PrincipalComponents.java

License:Open Source License

/**
 * Return a summary of the analysis/*from w w  w  .j av  a2 s  .  c  o  m*/
 *
 * @return a summary of the analysis.
 */
private String principalComponentsSummary() {
    StringBuffer result = new StringBuffer();
    double cumulative = 0.0;
    Instances output = null;
    int numVectors = 0;

    try {
        output = setOutputFormat();
        numVectors = (output.classIndex() < 0) ? output.numAttributes() : output.numAttributes() - 1;
    } catch (Exception ex) {
    }
    // tomorrow
    String corrCov = (m_center) ? "Covariance " : "Correlation ";
    result.append(corrCov + "matrix\n" + matrixToString(Matrices.getArray(m_correlation)) + "\n\n");
    result.append("eigenvalue\tproportion\tcumulative\n");
    for (int i = m_numAttribs - 1; i > (m_numAttribs - numVectors - 1); i--) {
        cumulative += m_eigenvalues[m_sortedEigens[i]];
        result.append(Utils.doubleToString(m_eigenvalues[m_sortedEigens[i]], 9, 5) + "\t"
                + Utils.doubleToString((m_eigenvalues[m_sortedEigens[i]] / m_sumOfEigenValues), 9, 5) + "\t"
                + Utils.doubleToString((cumulative / m_sumOfEigenValues), 9, 5) + "\t"
                + output.attribute(m_numAttribs - i - 1).name() + "\n");
    }

    result.append("\nEigenvectors\n");
    for (int j = 1; j <= numVectors; j++) {
        result.append(" V" + j + '\t');
    }
    result.append("\n");
    for (int j = 0; j < m_numAttribs; j++) {

        for (int i = m_numAttribs - 1; i > (m_numAttribs - numVectors - 1); i--) {
            result.append(Utils.doubleToString(m_eigenvectors[j][m_sortedEigens[i]], 7, 4) + "\t");
        }
        result.append(m_trainInstances.attribute(j).name() + '\n');
    }

    if (m_transBackToOriginal) {
        result.append("\nPC space transformed back to original space.\n"
                + "(Note: can't evaluate attributes in the original " + "space)\n");
    }
    return result.toString();
}

From source file:TextDirectoryLoader.java

License:Open Source License

/**
 * Return the full data set. If the structure hasn't yet been determined
 * by a call to getStructure then method should do so before processing
 * the rest of the data set./*from   ww  w .jav a2 s  . c  o  m*/
 *
 * @return the structure of the data set as an empty set of Instances
 * @throws IOException if there is no source or parsing fails
 */
public Instances getDataSet() throws IOException {
    if (getDirectory() == null)
        throw new IOException("No directory/source has been specified");

    String directoryPath = getDirectory().getAbsolutePath();
    FastVector classes = new FastVector();
    Enumeration enm = getStructure().classAttribute().enumerateValues();
    while (enm.hasMoreElements())
        classes.addElement(enm.nextElement());

    Instances data = getStructure();
    int fileCount = 0;
    for (int k = 0; k < classes.size(); k++) {
        String subdirPath = (String) classes.elementAt(k);
        File subdir = new File(directoryPath + File.separator + subdirPath);
        String[] files = subdir.list();
        for (int j = 0; j < files.length; j++) {
            try {
                fileCount++;
                if (getDebug())
                    System.err.println("processing " + fileCount + " : " + subdirPath + " : " + files[j]);

                double[] newInst = null;
                if (m_OutputFilename)
                    newInst = new double[3];
                else
                    newInst = new double[2];
                File txt = new File(directoryPath + File.separator + subdirPath + File.separator + files[j]);
                BufferedReader is;
                if (m_charSet == null || m_charSet.length() == 0) {
                    is = new BufferedReader(new InputStreamReader(new FileInputStream(txt)));
                } else {
                    is = new BufferedReader(new InputStreamReader(new FileInputStream(txt), m_charSet));
                }

                StringBuffer txtStr = new StringBuffer();
                int c;
                while ((c = is.read()) != -1) {
                    txtStr.append((char) c);
                }

                newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
                if (m_OutputFilename)
                    newInst[1] = (double) data.attribute(1)
                            .addStringValue(subdirPath + File.separator + files[j]);
                newInst[data.classIndex()] = (double) k;
                data.add(new Instance(1.0, newInst));
                is.close();
            } catch (Exception e) {
                System.err.println("failed to convert file: " + directoryPath + File.separator + subdirPath
                        + File.separator + files[j]);
            }
        }
    }

    return data;
}

From source file:ArrayLoader.java

License:Open Source License

/**
 * Return the full data set. If the structure hasn't yet been determined
 * by a call to getStructure then method should do so before processing
 * the rest of the data set.// www .  j  a  va 2  s.c o  m
 *
 * @return the structure of the data set as an empty set of Instances
 * @exception IOException if there is no source or parsing fails
 */
public Instances getDataSet() throws IOException {
    if (m_data == null) {
        throw new IOException("No source has been specified");
    }

    if (m_structure == null) {
        getStructure();
    }

    m_cumulativeStructure = new FastVector(m_structure.numAttributes());
    for (int i = 0; i < m_structure.numAttributes(); i++) {
        m_cumulativeStructure.addElement(new Hashtable());
    }

    m_cumulativeInstances = new FastVector();
    FastVector current;

    for (int i = 0; i < m_data.length; i++) {
        current = getInstance(m_data[i]);

        m_cumulativeInstances.addElement(current);
    }

    FastVector atts = new FastVector(m_structure.numAttributes());
    for (int i = 0; i < m_structure.numAttributes(); i++) {
        String attname = m_structure.attribute(i).name();
        Hashtable tempHash = ((Hashtable) m_cumulativeStructure.elementAt(i));
        if (tempHash.size() == 0) {
            atts.addElement(new Attribute(attname));
        } else {
            if (m_StringAttributes.isInRange(i)) {
                atts.addElement(new Attribute(attname, (FastVector) null));
            } else {
                FastVector values = new FastVector(tempHash.size());
                // add dummy objects in order to make the FastVector's size == capacity
                for (int z = 0; z < tempHash.size(); z++) {
                    values.addElement("dummy");
                }
                Enumeration e = tempHash.keys();
                while (e.hasMoreElements()) {
                    Object ob = e.nextElement();
                    //     if (ob instanceof Double) {
                    int index = ((Integer) tempHash.get(ob)).intValue();
                    String s = ob.toString();
                    if (s.startsWith("'") || s.startsWith("\""))
                        s = s.substring(1, s.length() - 1);
                    values.setElementAt(new String(s), index);
                    //     }
                }
                atts.addElement(new Attribute(attname, values));
            }
        }
    }

    // make the instances
    String relationName;
    relationName = "ArrayData";
    Instances dataSet = new Instances(relationName, atts, m_cumulativeInstances.size());

    for (int i = 0; i < m_cumulativeInstances.size(); i++) {
        current = ((FastVector) m_cumulativeInstances.elementAt(i));
        double[] vals = new double[dataSet.numAttributes()];
        for (int j = 0; j < current.size(); j++) {
            Object cval = current.elementAt(j);
            if (cval instanceof String) {
                if (((String) cval).compareTo(m_MissingValue) == 0) {
                    vals[j] = Instance.missingValue();
                } else {
                    if (dataSet.attribute(j).isString()) {
                        vals[j] = dataSet.attribute(j).addStringValue((String) cval);
                    } else if (dataSet.attribute(j).isNominal()) {
                        // find correct index
                        Hashtable lookup = (Hashtable) m_cumulativeStructure.elementAt(j);
                        int index = ((Integer) lookup.get(cval)).intValue();
                        vals[j] = index;
                    } else {
                        throw new IllegalStateException("Wrong attribute type at position " + (i + 1) + "!!!");
                    }
                }
            } else if (dataSet.attribute(j).isNominal()) {
                // find correct index
                Hashtable lookup = (Hashtable) m_cumulativeStructure.elementAt(j);
                int index = ((Integer) lookup.get(cval)).intValue();
                vals[j] = index;
            } else if (dataSet.attribute(j).isString()) {
                vals[j] = dataSet.attribute(j).addStringValue("" + cval);
            } else {
                vals[j] = ((Double) cval).doubleValue();
            }
        }
        dataSet.add(new Instance(1.0, vals));
    }
    m_structure = new Instances(dataSet, 0);
    m_cumulativeStructure = null; // conserve memory

    return dataSet;
}

From source file:REPTree.java

License:Open Source License

/**
 * Builds classifier.//from   ww  w  . j  a  v  a2 s.  com
 * 
 * @param data the data to train with
 * @throws Exception if building fails
 */
public void buildClassifier(Instances data) throws Exception {

    // can classifier handle the data?
    getCapabilities().testWithFail(data);

    // remove instances with missing class
    data = new Instances(data);
    data.deleteWithMissingClass();

    Random random = new Random(m_Seed);

    m_zeroR = null;
    if (data.numAttributes() == 1) {
        m_zeroR = new ZeroR();
        m_zeroR.buildClassifier(data);
        return;
    }

    // Randomize and stratify
    data.randomize(random);
    if (data.classAttribute().isNominal()) {
        data.stratify(m_NumFolds);
    }

    // Split data into training and pruning set
    Instances train = null;
    Instances prune = null;
    if (!m_NoPruning) {
        train = data.trainCV(m_NumFolds, 0, random);
        prune = data.testCV(m_NumFolds, 0);
    } else {
        train = data;
    }

    // Create array of sorted indices and weights
    int[][][] sortedIndices = new int[1][train.numAttributes()][0];
    double[][][] weights = new double[1][train.numAttributes()][0];
    double[] vals = new double[train.numInstances()];
    for (int j = 0; j < train.numAttributes(); j++) {
        if (j != train.classIndex()) {
            weights[0][j] = new double[train.numInstances()];
            if (train.attribute(j).isNominal()) {

                // Handling nominal attributes. Putting indices of
                // instances with missing values at the end.
                sortedIndices[0][j] = new int[train.numInstances()];
                int count = 0;
                for (int i = 0; i < train.numInstances(); i++) {
                    Instance inst = train.instance(i);
                    if (!inst.isMissing(j)) {
                        sortedIndices[0][j][count] = i;
                        weights[0][j][count] = inst.weight();
                        count++;
                    }
                }
                for (int i = 0; i < train.numInstances(); i++) {
                    Instance inst = train.instance(i);
                    if (inst.isMissing(j)) {
                        sortedIndices[0][j][count] = i;
                        weights[0][j][count] = inst.weight();
                        count++;
                    }
                }
            } else {

                // Sorted indices are computed for numeric attributes
                for (int i = 0; i < train.numInstances(); i++) {
                    Instance inst = train.instance(i);
                    vals[i] = inst.value(j);
                }
                sortedIndices[0][j] = Utils.sort(vals);
                for (int i = 0; i < train.numInstances(); i++) {
                    weights[0][j][i] = train.instance(sortedIndices[0][j][i]).weight();
                }
            }
        }
    }

    // Compute initial class counts
    double[] classProbs = new double[train.numClasses()];
    double totalWeight = 0, totalSumSquared = 0;
    for (int i = 0; i < train.numInstances(); i++) {
        Instance inst = train.instance(i);
        if (data.classAttribute().isNominal()) {
            classProbs[(int) inst.classValue()] += inst.weight();
            totalWeight += inst.weight();
        } else {
            classProbs[0] += inst.classValue() * inst.weight();
            totalSumSquared += inst.classValue() * inst.classValue() * inst.weight();
            totalWeight += inst.weight();
        }
    }
    m_Tree = new Tree();
    double trainVariance = 0;
    if (data.classAttribute().isNumeric()) {
        trainVariance = m_Tree.singleVariance(classProbs[0], totalSumSquared, totalWeight) / totalWeight;
        classProbs[0] /= totalWeight;
    }

    // Build tree
    m_Tree.buildTree(sortedIndices, weights, train, totalWeight, classProbs, new Instances(train, 0), m_MinNum,
            m_MinVarianceProp * trainVariance, 0, m_MaxDepth);

    // Insert pruning data and perform reduced error pruning
    if (!m_NoPruning) {
        m_Tree.insertHoldOutSet(prune);
        m_Tree.reducedErrorPrune();
        m_Tree.backfitHoldOutSet();
    }
}

From source file:REPRandomTree.java

License:Open Source License

/**
 * Builds classifier./*  ww  w .  ja  va2 s.  c o m*/
 * 
 * @param data the data to train with
 * @throws Exception if building fails
 */
public void buildClassifier(Instances data) throws Exception {

    // can classifier handle the data?
    getCapabilities().testWithFail(data);

    // remove instances with missing class
    data = new Instances(data);
    data.deleteWithMissingClass();

    Random random = new Random(m_Seed);

    m_zeroR = null;
    if (data.numAttributes() == 1) {
        m_zeroR = new ZeroR();
        m_zeroR.buildClassifier(data);
        return;
    }

    // Randomize and stratify
    data.randomize(random);
    if (data.classAttribute().isNominal()) {
        data.stratify(m_NumFolds);
    }

    // Split data into training and pruning set
    Instances train = null;
    Instances prune = null;
    if (!m_NoPruning) {
        train = data.trainCV(m_NumFolds, 0, random);
        prune = data.testCV(m_NumFolds, 0);
    } else {
        train = data;
    }

    // Create array of sorted indices and weights
    int[][][] sortedIndices = new int[1][train.numAttributes()][0];
    double[][][] weights = new double[1][train.numAttributes()][0];
    double[] vals = new double[train.numInstances()];
    for (int j = 0; j < train.numAttributes(); j++) {
        if (j != train.classIndex()) {
            weights[0][j] = new double[train.numInstances()];
            if (train.attribute(j).isNominal()) {

                // Handling nominal attributes. Putting indices of
                // instances with missing values at the end.
                sortedIndices[0][j] = new int[train.numInstances()];
                int count = 0;
                for (int i = 0; i < train.numInstances(); i++) {
                    Instance inst = train.instance(i);
                    if (!inst.isMissing(j)) {
                        sortedIndices[0][j][count] = i;
                        weights[0][j][count] = inst.weight();
                        count++;
                    }
                }
                for (int i = 0; i < train.numInstances(); i++) {
                    Instance inst = train.instance(i);
                    if (inst.isMissing(j)) {
                        sortedIndices[0][j][count] = i;
                        weights[0][j][count] = inst.weight();
                        count++;
                    }
                }
            } else {

                // Sorted indices are computed for numeric attributes
                for (int i = 0; i < train.numInstances(); i++) {
                    Instance inst = train.instance(i);
                    vals[i] = inst.value(j);
                }
                sortedIndices[0][j] = Utils.sort(vals);
                for (int i = 0; i < train.numInstances(); i++) {
                    weights[0][j][i] = train.instance(sortedIndices[0][j][i]).weight();
                }
            }
        }
    }

    // Compute initial class counts
    double[] classProbs = new double[train.numClasses()];
    double totalWeight = 0, totalSumSquared = 0;
    for (int i = 0; i < train.numInstances(); i++) {
        Instance inst = train.instance(i);
        if (data.classAttribute().isNominal()) {
            classProbs[(int) inst.classValue()] += inst.weight();
            totalWeight += inst.weight();
        } else {
            classProbs[0] += inst.classValue() * inst.weight();
            totalSumSquared += inst.classValue() * inst.classValue() * inst.weight();
            totalWeight += inst.weight();
        }
    }
    m_Tree = new Tree();
    double trainVariance = 0;
    if (data.classAttribute().isNumeric()) {
        trainVariance = m_Tree.singleVariance(classProbs[0], totalSumSquared, totalWeight) / totalWeight;
        classProbs[0] /= totalWeight;
    }

    // Build tree
    m_Tree.buildTree(sortedIndices, weights, train, totalWeight, classProbs, new Instances(train, 0), m_MinNum,
            m_MinVarianceProp * trainVariance, 0, m_MaxDepth, m_FeatureFrac, random);

    // Insert pruning data and perform reduced error pruning
    if (!m_NoPruning) {
        m_Tree.insertHoldOutSet(prune);
        m_Tree.reducedErrorPrune();
        m_Tree.backfitHoldOutSet();
    }
}

From source file:LabeledItemSet.java

License:Open Source License

/**
 * Converts the header info of the given set of instances into a set
 * of item sets (singletons). The ordering of values in the header file
 * determines the lexicographic order. Each item set knows its class label.
 * @return a set of item sets, each containing a single item
 * @param instancesNoClass instances without the class attribute
 * @param classes the values of the class attribute sorted according to instances
 * @exception Exception if singletons can't be generated successfully
 *///from  w ww . j a v a  2 s  .c om
public static FastVector singletons(Instances instancesNoClass, Instances classes) throws Exception {

    FastVector cSet, setOfItemSets = new FastVector();
    LabeledItemSet current;

    //make singletons
    for (int i = 0; i < instancesNoClass.numAttributes(); i++) {
        if (instancesNoClass.attribute(i).isNumeric())
            throw new Exception("Can't handle numeric attributes!");
        for (int j = 0; j < instancesNoClass.attribute(i).numValues(); j++) {
            for (int k = 0; k < (classes.attribute(0)).numValues(); k++) {
                current = new LabeledItemSet(instancesNoClass.numInstances(), k);
                current.m_items = new int[instancesNoClass.numAttributes()];
                for (int l = 0; l < instancesNoClass.numAttributes(); l++)
                    current.m_items[l] = -1;
                current.m_items[i] = j;
                setOfItemSets.addElement(current);
            }
        }
    }
    return setOfItemSets;
}

From source file:TextDirectoryToArff.java

License:Open Source License

public Instances createDataset(String directoryPath) throws Exception {

    FastVector atts = new FastVector(2);
    atts.addElement(new Attribute("filename", (FastVector) null));
    atts.addElement(new Attribute("contents", (FastVector) null));
    Instances data = new Instances("text_files_in_" + directoryPath, atts, 0);

    File dir = new File(directoryPath);
    String[] files = dir.list();//  w w  w .  j a  v  a 2s  . c o  m
    for (int i = 0; i < files.length; i++) {
        if (files[i].endsWith(".txt")) {
            try {
                double[] newInst = new double[2];
                newInst[0] = (double) data.attribute(0).addStringValue(files[i]);
                File txt = new File(directoryPath + File.separator + files[i]);
                InputStreamReader is;
                is = new InputStreamReader(new FileInputStream(txt));
                StringBuffer txtStr = new StringBuffer();
                int c;
                while ((c = is.read()) != -1) {
                    txtStr.append((char) c);
                }
                newInst[1] = (double) data.attribute(1).addStringValue(txtStr.toString());
                data.add(new Instance(1.0, newInst));
            } catch (Exception e) {
                //System.err.println("failed to convert file: " + directoryPath + File.separator + files[i]);
            }
        }
    }
    return data;
}

From source file:SMO.java

License:Open Source License

/**
 * Method for building the classifier. Implements a one-against-one
 * wrapper for multi-class problems.//from w  ww.jav a 2 s  .c  o  m
 *
 * @param insts the set of training instances
 * @throws Exception if the classifier can't be built successfully
 */
public void buildClassifier(Instances insts) throws Exception {

    if (!m_checksTurnedOff) {
        // can classifier handle the data?
        getCapabilities().testWithFail(insts);

        // remove instances with missing class
        insts = new Instances(insts);
        insts.deleteWithMissingClass();

        /* Removes all the instances with weight equal to 0.
         MUST be done since condition (8) of Keerthi's paper 
         is made with the assertion Ci > 0 (See equation (3a). */
        Instances data = new Instances(insts, insts.numInstances());
        for (int i = 0; i < insts.numInstances(); i++) {
            if (insts.instance(i).weight() > 0)
                data.add(insts.instance(i));
        }
        if (data.numInstances() == 0) {
            throw new Exception("No training instances left after removing " + "instances with weight 0!");
        }
        insts = data;
    }

    if (!m_checksTurnedOff) {
        m_Missing = new ReplaceMissingValues();
        m_Missing.setInputFormat(insts);
        insts = Filter.useFilter(insts, m_Missing);
    } else {
        m_Missing = null;
    }

    if (getCapabilities().handles(Capability.NUMERIC_ATTRIBUTES)) {
        boolean onlyNumeric = true;
        if (!m_checksTurnedOff) {
            for (int i = 0; i < insts.numAttributes(); i++) {
                if (i != insts.classIndex()) {
                    if (!insts.attribute(i).isNumeric()) {
                        onlyNumeric = false;
                        break;
                    }
                }
            }
        }

        if (!onlyNumeric) {
            m_NominalToBinary = new NominalToBinary();
            m_NominalToBinary.setInputFormat(insts);
            insts = Filter.useFilter(insts, m_NominalToBinary);
        } else {
            m_NominalToBinary = null;
        }
    } else {
        m_NominalToBinary = null;
    }

    if (m_filterType == FILTER_STANDARDIZE) {
        m_Filter = new Standardize();
        m_Filter.setInputFormat(insts);
        insts = Filter.useFilter(insts, m_Filter);
    } else if (m_filterType == FILTER_NORMALIZE) {
        m_Filter = new Normalize();
        m_Filter.setInputFormat(insts);
        insts = Filter.useFilter(insts, m_Filter);
    } else {
        m_Filter = null;
    }

    m_classIndex = insts.classIndex();
    m_classAttribute = insts.classAttribute();
    m_KernelIsLinear = (m_kernel instanceof PolyKernel) && (((PolyKernel) m_kernel).getExponent() == 1.0);

    // Generate subsets representing each class
    Instances[] subsets = new Instances[insts.numClasses()];
    for (int i = 0; i < insts.numClasses(); i++) {
        subsets[i] = new Instances(insts, insts.numInstances());
    }
    for (int j = 0; j < insts.numInstances(); j++) {
        Instance inst = insts.instance(j);
        subsets[(int) inst.classValue()].add(inst);
    }
    for (int i = 0; i < insts.numClasses(); i++) {
        subsets[i].compactify();
    }

    // Build the binary classifiers
    Random rand = new Random(m_randomSeed);
    m_classifiers = new BinarySMO[insts.numClasses()][insts.numClasses()];
    for (int i = 0; i < insts.numClasses(); i++) {
        for (int j = i + 1; j < insts.numClasses(); j++) {
            m_classifiers[i][j] = new BinarySMO();
            m_classifiers[i][j].setKernel(Kernel.makeCopy(getKernel()));
            Instances data = new Instances(insts, insts.numInstances());
            for (int k = 0; k < subsets[i].numInstances(); k++) {
                data.add(subsets[i].instance(k));
            }
            for (int k = 0; k < subsets[j].numInstances(); k++) {
                data.add(subsets[j].instance(k));
            }
            data.compactify();
            data.randomize(rand);
            m_classifiers[i][j].buildClassifier(data, i, j, m_fitLogisticModels, m_numFolds, m_randomSeed);
        }
    }
}

From source file:BetterRemoveByName.java

License:Open Source License

/**
 * Determines the output format based on the input format and returns this. In
 * case the output format cannot be returned immediately, i.e.,
 * immediateOutputFormat() returns false, then this method will be called from
 * batchFinished()./*from  w  w  w.j  a  v a2  s  . c o  m*/
 *
 * @param inputFormat the input format to base the output format on
 * @return the output format
 * @throws Exception in case the determination goes wrong
 */
@Override
protected Instances determineOutputFormat(Instances inputFormat) throws Exception {
    Vector<Integer> indices;
    int[] attributes;
    int i;

    // determine indices
    indices = new Vector<Integer>();
    for (i = 0; i < inputFormat.numAttributes(); i++) {
        // skip class
        // if (i == inputFormat.classIndex()) {
        //   continue;
        // }
        if (inputFormat.attribute(i).name().matches(m_Expression)) {
            indices.add(i);
        }
    }
    attributes = new int[indices.size()];
    for (i = 0; i < indices.size(); i++) {
        attributes[i] = indices.get(i);
    }

    m_Remove = new Remove();
    m_Remove.setAttributeIndicesArray(attributes);
    m_Remove.setInvertSelection(getInvertSelection());
    m_Remove.setInputFormat(inputFormat);

    return m_Remove.getOutputFormat();
}