Example usage for weka.core Instances numAttributes

Introduction

This page lists usage examples for the weka.core Instances method numAttributes().

Prototype


public int numAttributes()

Document

Returns the number of attributes.
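
Note that the count includes the class attribute when one is set. As a quick orientation, here is a minimal, self-contained sketch of the typical call pattern (the ARFF path is a placeholder): load a dataset, make the last attribute the class, and iterate over the remaining attributes.

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class NumAttributesDemo {
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("iris.arff"); // placeholder path
        data.setClassIndex(data.numAttributes() - 1);  // last attribute = class

        // numAttributes() counts the class attribute as well
        for (int i = 0; i < data.numAttributes(); i++) {
            if (i == data.classIndex()) {
                continue; // skip the class attribute
            }
            System.out.println(data.attribute(i).name());
        }
    }
}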

Usage

From source file: SMO.java

License: Open Source License

/**
 * Method for building the classifier. Implements a one-against-one
 * wrapper for multi-class problems.
 *
 * @param insts the set of training instances
 * @throws Exception if the classifier can't be built successfully
 */
public void buildClassifier(Instances insts) throws Exception {

    if (!m_checksTurnedOff) {
        // can classifier handle the data?
        getCapabilities().testWithFail(insts);

        // remove instances with missing class
        insts = new Instances(insts);
        insts.deleteWithMissingClass();

        /* Removes all the instances with weight equal to 0.
         This MUST be done since condition (8) of Keerthi's paper
         assumes Ci > 0 (see equation (3a)). */
        Instances data = new Instances(insts, insts.numInstances());
        for (int i = 0; i < insts.numInstances(); i++) {
            if (insts.instance(i).weight() > 0)
                data.add(insts.instance(i));
        }
        if (data.numInstances() == 0) {
            throw new Exception("No training instances left after removing instances with weight 0!");
        }
        insts = data;
    }

    if (!m_checksTurnedOff) {
        m_Missing = new ReplaceMissingValues();
        m_Missing.setInputFormat(insts);
        insts = Filter.useFilter(insts, m_Missing);
    } else {
        m_Missing = null;
    }

    if (getCapabilities().handles(Capability.NUMERIC_ATTRIBUTES)) {
        boolean onlyNumeric = true;
        if (!m_checksTurnedOff) {
            for (int i = 0; i < insts.numAttributes(); i++) {
                if (i != insts.classIndex()) {
                    if (!insts.attribute(i).isNumeric()) {
                        onlyNumeric = false;
                        break;
                    }
                }
            }
        }

        if (!onlyNumeric) {
            m_NominalToBinary = new NominalToBinary();
            m_NominalToBinary.setInputFormat(insts);
            insts = Filter.useFilter(insts, m_NominalToBinary);
        } else {
            m_NominalToBinary = null;
        }
    } else {
        m_NominalToBinary = null;
    }

    if (m_filterType == FILTER_STANDARDIZE) {
        m_Filter = new Standardize();
        m_Filter.setInputFormat(insts);
        insts = Filter.useFilter(insts, m_Filter);
    } else if (m_filterType == FILTER_NORMALIZE) {
        m_Filter = new Normalize();
        m_Filter.setInputFormat(insts);
        insts = Filter.useFilter(insts, m_Filter);
    } else {
        m_Filter = null;
    }

    m_classIndex = insts.classIndex();
    m_classAttribute = insts.classAttribute();
    m_KernelIsLinear = (m_kernel instanceof PolyKernel) && (((PolyKernel) m_kernel).getExponent() == 1.0);

    // Generate subsets representing each class
    Instances[] subsets = new Instances[insts.numClasses()];
    for (int i = 0; i < insts.numClasses(); i++) {
        subsets[i] = new Instances(insts, insts.numInstances());
    }
    for (int j = 0; j < insts.numInstances(); j++) {
        Instance inst = insts.instance(j);
        subsets[(int) inst.classValue()].add(inst);
    }
    for (int i = 0; i < insts.numClasses(); i++) {
        subsets[i].compactify();
    }

    // Build the binary classifiers
    Random rand = new Random(m_randomSeed);
    m_classifiers = new BinarySMO[insts.numClasses()][insts.numClasses()];
    for (int i = 0; i < insts.numClasses(); i++) {
        for (int j = i + 1; j < insts.numClasses(); j++) {
            m_classifiers[i][j] = new BinarySMO();
            m_classifiers[i][j].setKernel(Kernel.makeCopy(getKernel()));
            Instances data = new Instances(insts, insts.numInstances());
            for (int k = 0; k < subsets[i].numInstances(); k++) {
                data.add(subsets[i].instance(k));
            }
            for (int k = 0; k < subsets[j].numInstances(); k++) {
                data.add(subsets[j].instance(k));
            }
            data.compactify();
            data.randomize(rand);
            m_classifiers[i][j].buildClassifier(data, i, j, m_fitLogisticModels, m_numFolds, m_randomSeed);
        }
    }
}
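
With k classes, the nested loop above trains k(k-1)/2 pairwise BinarySMO models. A minimal calling sketch, assuming this SMO variant keeps the standard Weka classifier contract (the ARFF path is a placeholder):

Instances insts = new Instances(new java.io.BufferedReader(new java.io.FileReader("train.arff")));
insts.setClassIndex(insts.numAttributes() - 1);

SMO smo = new SMO();
smo.buildClassifier(insts); // trains numClasses*(numClasses-1)/2 binary classifiers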

From source file: SpectralClusterer.java

License: Open Source License

/**
 * Generates a clusterer by means of the spectral clustering algorithm.
 *
 * @param data
 *            set of instances serving as training data
 * @exception Exception
 *                if the clusterer has not been generated successfully
 */
public void buildClusterer(Instances data) throws java.lang.Exception {
    int n = data.numInstances();
    int k = data.numAttributes();
    DoubleMatrix2D w;
    if (useSparseMatrix)
        w = DoubleFactory2D.sparse.make(n, n);
    else
        w = DoubleFactory2D.dense.make(n, n);
    double[][] v1 = new double[n][];
    for (int i = 0; i < n; i++)
        v1[i] = data.instance(i).toDoubleArray();
    v = DoubleFactory2D.dense.make(v1);
    double sigma_sq = sigma * sigma;
    // Sets up similarity matrix
    for (int i = 0; i < n; i++)
        for (int j = i; j < n; j++) {
            double dist = distnorm2(v.viewRow(i), v.viewRow(j));
            if ((r == -1) || (dist < r)) {
                double sim = Math.exp(-(dist * dist) / (2 * sigma_sq));
                w.set(i, j, sim);
                w.set(j, i, sim);
            }
        }

    // Partitions points
    int[][] p = partition(w, alpha_star);

    // Deploys results
    numOfClusters = p.length;
    cluster = new int[n];
    for (int i = 0; i < p.length; i++)
        for (int j = 0; j < p[i].length; j++)
            cluster[p[i][j]] = i;
}
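
The similarity used above is a Gaussian (RBF) kernel on the pairwise distance, truncated at radius r. Extracted into a standalone helper for clarity (the method name is ours; r == -1 disables truncation, exactly as in the loop above):

static double affinity(double dist, double sigma, double r) {
    if (r != -1 && dist >= r) {
        return 0.0; // outside the neighbourhood: no edge in the similarity graph
    }
    return Math.exp(-(dist * dist) / (2 * sigma * sigma));
}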

From source file: BetterRemoveByName.java

License: Open Source License

/**
 * Determines the output format based on the input format and returns this. In
 * case the output format cannot be returned immediately, i.e.,
 * immediateOutputFormat() returns false, then this method will be called from
 * batchFinished().
 *
 * @param inputFormat the input format to base the output format on
 * @return the output format
 * @throws Exception in case the determination goes wrong
 */
@Override
protected Instances determineOutputFormat(Instances inputFormat) throws Exception {
    Vector<Integer> indices;
    int[] attributes;
    int i;

    // determine indices
    indices = new Vector<Integer>();
    for (i = 0; i < inputFormat.numAttributes(); i++) {
        // skip class
        // if (i == inputFormat.classIndex()) {
        //   continue;
        // }
        if (inputFormat.attribute(i).name().matches(m_Expression)) {
            indices.add(i);
        }
    }
    attributes = new int[indices.size()];
    for (i = 0; i < indices.size(); i++) {
        attributes[i] = indices.get(i);
    }

    m_Remove = new Remove();
    m_Remove.setAttributeIndicesArray(attributes);
    m_Remove.setInvertSelection(getInvertSelection());
    m_Remove.setInputFormat(inputFormat);

    return m_Remove.getOutputFormat();
}
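
For comparison, stock Weka ships a RemoveByName filter that covers much of this. A minimal sketch (the regular expression is a placeholder; note that, unlike the variant above, the stock filter leaves the class attribute in place):

import weka.core.Instances;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.RemoveByName;

public static Instances removeByPattern(Instances data, String regex) throws Exception {
    RemoveByName remove = new RemoveByName();
    remove.setExpression(regex);      // attributes whose names match are removed
    remove.setInvertSelection(false); // true keeps only the matching attributes
    remove.setInputFormat(data);
    return Filter.useFilter(data, remove);
}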

From source file: ID3Chi.java

License: Open Source License

/**
 * Method for building an ID3Chi tree.
 *
 * @param data
 *            the training data
 * @exception Exception
 *                if decision tree can't be built successfully
 */
private void makeTree(Instances data) throws Exception {

    // Check if no instances have reached this node.
    if (data.numInstances() == 0) {
        SetNullDistribution(data);
        return; // without this, entropy would be computed over an empty set
    }

    // Compute attribute with maximum information gain.
    double[] infoGains = new double[data.numAttributes()];
    Enumeration attEnum = data.enumerateAttributes();
    double entropyOfAllData = computeEntropy(data);

    while (attEnum.hasMoreElements()) {
        Attribute att = (Attribute) attEnum.nextElement();
        infoGains[att.index()] = computeInfoGain(data, att, entropyOfAllData);
    }
    m_Attribute = data.attribute(Utils.maxIndex(infoGains));

    double chiSquare = computeChiSquare(data, m_Attribute);

    int degreesOfFreedom = m_Attribute.numValues() - 1;
    ChiSquaredDistribution chi = new ChiSquaredDistribution(degreesOfFreedom);
    double threshold = chi.inverseCumulativeProbability(m_confidenceLevel);

    // Make leaf if information gain is zero.
    // Otherwise create successors.
    if (Utils.eq(infoGains[m_Attribute.index()], 0)) {
        MakeALeaf(data);
    } else {
        // Discard unknown values for selected attribute
        //data.deleteWithMissing(m_Attribute);
        Instances[] subset = splitData(data, m_Attribute);

        if (CheckIfCanApplyChiSquare(subset) && (chiSquare <= threshold)) {
            MakeALeaf(data);
            return;
        }

        m_Successors = new ID3Chi[m_Attribute.numValues()];
        for (int j = 0; j < m_Attribute.numValues(); j++) {
            m_Successors[j] = new ID3Chi(this.m_confidenceLevel);
            m_Successors[j].m_Ratio = (double) subset[j].numInstances() / (double) data.numInstances();
            m_Successors[j].makeTree(subset[j]);
        }
    }
}
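
The chi-square pre-pruning above boils down to a single comparison: split only when the observed statistic exceeds the critical value for numValues - 1 degrees of freedom at the configured confidence level. In isolation (assuming Apache Commons Math 3, which provides ChiSquaredDistribution; the method name is ours):

import org.apache.commons.math3.distribution.ChiSquaredDistribution;

static boolean splitIsSignificant(double chiSquare, int numValues, double confidenceLevel) {
    ChiSquaredDistribution chi = new ChiSquaredDistribution(numValues - 1);
    double threshold = chi.inverseCumulativeProbability(confidenceLevel);
    return chiSquare > threshold; // the node becomes a leaf when this is false
}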

From source file: RunBestFirstSearch.java

License: Open Source License

public static void main(String[] arg) throws Exception {
    // Load data.
    System.out.println("\nLoading sample file...");
    DataSource source = new DataSource(arg[0]);
    Instances data = source.getDataSet();

    if (data.classIndex() == -1)
        data.setClassIndex(data.numAttributes() - 1);

    int n = Integer.parseInt(arg[1]);

    System.out.println("Instances with " + n + " features!");
    System.out.println("\nRunning BFS algorithm with CFS cost function...");

    // Run feature selection algorithm BFS using CFS cost function.
    //
    runAttributeSelection(data, n);
}
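
The runAttributeSelection helper is not shown in this snippet. A plausible sketch using Weka's standard attribute-selection API (how n is meant to steer the search is not shown, so the sketch only reports it):

import weka.attributeSelection.AttributeSelection;
import weka.attributeSelection.BestFirst;
import weka.attributeSelection.CfsSubsetEval;

static void runAttributeSelection(Instances data, int n) throws Exception {
    System.out.println("Requested number of features: " + n);
    AttributeSelection selector = new AttributeSelection();
    selector.setEvaluator(new CfsSubsetEval()); // CFS merit function
    selector.setSearch(new BestFirst());        // best-first search
    selector.SelectAttributes(data);            // note: capital S in Weka's API
    System.out.println(selector.toResultsString());
}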

From source file: MachinLearningInterface.java

private void jButton7ActionPerformed(java.awt.event.ActionEvent evt) {
    Instances data;
    try {
        data = new Instances(new BufferedReader(new FileReader(this.name3 + ".arff")));
        Instances newData = null;
        Add filter;
        newData = new Instances(data);
        filter = new Add();
        filter.setAttributeIndex("last");
        filter.setNominalLabels("rods,punctua,networks");
        filter.setAttributeName("target");
        filter.setInputFormat(newData);
        newData = Filter.useFilter(newData, filter);
        System.out.print(newData);
        Vector vec = new Vector();
        newData.setClassIndex(newData.numAttributes() - 1);
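        // note: equalHeaders(newData) compares the dataset with itself and can
        // never fail; a meaningful check would compare against the header of
        // the data the serialized model was trained on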
        if (!newData.equalHeaders(newData)) {
            throw new IllegalArgumentException("Train and test are not compatible!");
        }

        URL urlToModel = this.getClass().getResource("/" + "Final.model");
        InputStream stream = urlToModel.openStream();

        Classifier cls = (Classifier) weka.core.SerializationHelper.read(stream);
        System.out.println("Testing model.classifyInstance");
        for (int i = 0; i < newData.numInstances(); i++) {
            double pred = cls.classifyInstance(newData.instance(i));
            double[] dist = cls.distributionForInstance(newData.instance(i));
            System.out.print((i + 1) + " - ");
            System.out.print(newData.classAttribute().value((int) pred) + " - ");
            //txtarea2.setText(Utils.arrayToString(dist));

            System.out.println(Utils.arrayToString(dist));

            vec.add(newData.classAttribute().value((int) pred));

        }
        int p = 0, n = 0, r = 0;

        //txtarea2.append(Utils.arrayToString(this.target));
        for (Object vec1 : vec) {
            if ("rods".equals(vec1.toString())) {
                r = r + 1;
            }
            if ("punctua".equals(vec1.toString())) {
                p = p + 1;
            }
            if ("networks".equals(vec1.toString())) {
                n = n + 1;
            }

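            // note: this rewrites the whole output file on every iteration;
            // writing vec once after the loop would produce the same file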
            PrintWriter out = null;
            try {

                out = new PrintWriter(this.name3 + "_morphology.txt");
                out.println(vec);
                out.close();
            } catch (Exception ex) {
                ex.printStackTrace();
            }
            //System.out.println(vec.get(i));
        }
        System.out.println("VECTOR-> punctua: " + p + ", rods: " + r + ", networks: " + n);
        IJ.showMessage(
                "Your file: " + this.name3 + ".arff" + "\nhas been analysed; it is composed of -> punctua: "
                        + p + ", rods: " + r + ", networks: " + n);

    } catch (IOException ex) {
        Logger.getLogger(MachinLearningInterface.class.getName()).log(Level.SEVERE, null, ex);
    } catch (Exception ex) {
        Logger.getLogger(MachinLearningInterface.class.getName()).log(Level.SEVERE, null, ex);
    }

    IJ.showMessage("Analysis complete");
}

From source file: MachinLearningInterface.java

private void jButton10ActionPerformed(java.awt.event.ActionEvent evt) {
    Instances data;
    try {
        data = new Instances(new BufferedReader(new FileReader(this.name3 + ".arff")));
        Instances newData = null;
        Add filter;
        newData = new Instances(data);
        filter = new Add();
        filter.setAttributeIndex("last");
        filter.setNominalLabels(this.liststring);
        filter.setAttributeName("target");
        filter.setInputFormat(newData);
        newData = Filter.useFilter(newData, filter);
        System.out.print(newData);
        Vector vec = new Vector();
        newData.setClassIndex(newData.numAttributes() - 1);

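        // note: equalHeaders(newData) compares the dataset with itself and can
        // never fail; a meaningful check would compare against the header of
        // the data the serialized model was trained on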
        if (!newData.equalHeaders(newData)) {
            throw new IllegalArgumentException("Train and test are not compatible!");
        }

        Classifier cls = (Classifier) weka.core.SerializationHelper.read(this.model);
        System.out.println("Testing model.classifyInstance");
        for (int i = 0; i < newData.numInstances(); i++) {
            double pred = cls.classifyInstance(newData.instance(i));
            double[] dist = cls.distributionForInstance(newData.instance(i));
            System.out.print((i + 1) + " - ");
            System.out.print(newData.classAttribute().value((int) pred) + " - ");
            //txtarea2.setText(Utils.arrayToString(dist));

            System.out.println(Utils.arrayToString(dist));

            vec.add(newData.classAttribute().value((int) pred));
            //txtarea2.append(Utils.arrayToString(dist));

        }
        URL urlToModel = this.getClass().getResource("/" + "Final.model");
        InputStream stream = urlToModel.openStream();

        Classifier cls2 = (Classifier) weka.core.SerializationHelper.read(stream);
        System.out.println("Testing model.classifyInstance");
        for (int i = 0; i < newData.numInstances(); i++) {
            double pred = cls2.classifyInstance(newData.instance(i));
            double[] dist = cls2.distributionForInstance(newData.instance(i));
            System.out.print((i + 1) + " - ");
            System.out.print(newData.classAttribute().value((int) pred) + " - ");
            //txtarea2.setText(Utils.arrayToString(dist));

            System.out.println(Utils.arrayToString(dist));

            vec.add(newData.classAttribute().value((int) pred));

        }
        int p = 0, n = 0, r = 0;

        //txtarea2.append(Utils.arrayToString(this.target));
        for (Object vec1 : vec) {
            if ("rods".equals(vec1.toString())) {
                r = r + 1;
            }
            if ("punctua".equals(vec1.toString())) {
                p = p + 1;
            }
            if ("networks".equals(vec1.toString())) {
                n = n + 1;
            }

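            // note: this rewrites the whole output file on every iteration;
            // writing vec once after the loop would produce the same file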
            PrintWriter out = null;
            try {

                out = new PrintWriter(this.name3 + "_morphology.txt");
                out.println(vec);
                out.close();
            } catch (Exception ex) {
                ex.printStackTrace();
            }
            //System.out.println(vec.get(i));
        }
        System.out.println("VECTOR-> punctua: " + p + ", rods: " + r + ", networks: " + n);
        IJ.showMessage(
                "Your file: " + this.name3 + ".arff" + "\nhas been analysed; it is composed of -> punctua: "
                        + p + ", rods: " + r + ", networks: " + n);
        //txtarea2.setText("Your file:" + this.name3 + ".arff"
        //+ "\nhas been analysed, and it is composed by-> punctua: " + p + ", rods: " + r + ", networks: " + n
        //+ "\n"
        //+ "\nAnalyse complete");
        //txtarea.setText("Analyse complete");

    } catch (IOException ex) {
        Logger.getLogger(MachinLearningInterface.class.getName()).log(Level.SEVERE, null, ex);
    } catch (Exception ex) {
        Logger.getLogger(MachinLearningInterface.class.getName()).log(Level.SEVERE, null, ex);
    }

    IJ.showMessage("Analysis complete");
}

From source file: MPCKMeans.java

License: Open Source License

/** Sets training instances */
public void setInstances(Instances instances) {
    m_Instances = instances;

    // create the checksum coefficients
    m_checksumCoeffs = new double[instances.numAttributes()];
    for (int i = 0; i < m_checksumCoeffs.length; i++) {
        m_checksumCoeffs[i] = m_RandomNumberGenerator.nextDouble();
    }

    // hash the instance checksums
    m_checksumHash = new HashMap(instances.numInstances());
    int classIdx = instances.classIndex();
    for (int i = 0; i < instances.numInstances(); i++) {
        Instance instance = instances.instance(i);
        double[] values = instance.toDoubleArray();
        double checksum = 0;

        for (int j = 0; j < values.length; j++) {
            if (j != classIdx) {
                checksum += m_checksumCoeffs[j] * values[j];
            }
        }

        // take care of chaining
        Object list = m_checksumHash.get(new Double((float) checksum));
        ArrayList idxList = null;
        if (list == null) {
            idxList = new ArrayList();
            m_checksumHash.put(new Double((float) checksum), idxList);
        } else { // chaining
            idxList = (ArrayList) list;
        }
        idxList.add(new Integer(i));
    }
}
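
How the checksum hash is consumed is not shown here. A hypothetical companion lookup (the method name and exact-match policy are our assumptions): recompute a probe instance's checksum and take the first index on its collision chain.

private int findInstanceIndex(Instance probe) {
    double[] values = probe.toDoubleArray();
    double checksum = 0;
    for (int j = 0; j < values.length; j++) {
        if (j != m_Instances.classIndex()) {
            checksum += m_checksumCoeffs[j] * values[j]; // same weights as above
        }
    }
    ArrayList idxList = (ArrayList) m_checksumHash.get(new Double((float) checksum));
    return (idxList == null) ? -1 : ((Integer) idxList.get(0)).intValue();
}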

From source file: MPCKMeans.java

License: Open Source License

public static void runFromCommandLine(String[] args) {
    MPCKMeans mpckmeans = new MPCKMeans();
    Instances data = null, clusterData = null;
    ArrayList labeledPairs = null;

    try {
        String optionString = Utils.getOption('D', args);
        if (optionString.length() != 0) {
            FileReader reader = new FileReader(optionString);
            data = new Instances(reader);
            System.out.println("Reading dataset: " + data.relationName());
        }

        int classIndex = data.numAttributes() - 1;
        optionString = Utils.getOption('K', args);
        if (optionString.length() != 0) {
            classIndex = Integer.parseInt(optionString);
            if (classIndex >= 0) {
                data.setClassIndex(classIndex); // starts with 0
                // Remove the class labels before clustering
                clusterData = new Instances(data);
                mpckmeans.setNumClusters(clusterData.numClasses());
                clusterData.deleteClassAttribute();
                System.out.println("Setting classIndex: " + classIndex);
            } else {
                clusterData = new Instances(data);
            }
        } else {
            data.setClassIndex(classIndex); // starts with 0
            // Remove the class labels before clustering
            clusterData = new Instances(data);
            mpckmeans.setNumClusters(clusterData.numClasses());
            clusterData.deleteClassAttribute();
            System.out.println("Setting classIndex: " + classIndex);
        }

        optionString = Utils.getOption('C', args);
        if (optionString.length() != 0) {
            labeledPairs = mpckmeans.readConstraints(optionString);
            System.out.println("Reading constraints from: " + optionString);
        } else {
            labeledPairs = new ArrayList(0);
        }

        mpckmeans.setTotalTrainWithLabels(data);
        mpckmeans.setOptions(args);
        System.out.println();
        mpckmeans.buildClusterer(labeledPairs, clusterData, data, mpckmeans.getNumClusters(),
                data.numInstances());
        mpckmeans.printClusterAssignments();

        if (mpckmeans.m_TotalTrainWithLabels.classIndex() > -1) {
            double nCorrect = 0;
            for (int i = 0; i < mpckmeans.m_TotalTrainWithLabels.numInstances(); i++) {
                for (int j = i + 1; j < mpckmeans.m_TotalTrainWithLabels.numInstances(); j++) {
                    int cluster_i = mpckmeans.m_ClusterAssignments[i];
                    int cluster_j = mpckmeans.m_ClusterAssignments[j];
                    double class_i = (mpckmeans.m_TotalTrainWithLabels.instance(i)).classValue();
                    double class_j = (mpckmeans.m_TotalTrainWithLabels.instance(j)).classValue();
                    //         System.out.println(cluster_i + "," + cluster_j + ":" + class_i + "," + class_j);
                    if (cluster_i == cluster_j && class_i == class_j
                            || cluster_i != cluster_j && class_i != class_j) {
                        nCorrect++;
                        //        System.out.println("nCorrect:" + nCorrect);
                    }
                }
            }
            int numInstances = mpckmeans.m_TotalTrainWithLabels.numInstances();
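            // nCorrect counts instance pairs on which clustering and class
            // labels agree; the Rand index is that fraction over all
            // unordered pairs, reported as a percentage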
            double RandIndex = 100 * nCorrect / (numInstances * (numInstances - 1) / 2);
            System.err.println("Acc\t" + RandIndex);
        }

        //      if (mpckmeans.getTotalTrainWithLabels().classIndex() >= 0) {
        //    SemiSupClustererEvaluation eval = new SemiSupClustererEvaluation(mpckmeans.m_TotalTrainWithLabels,
        //                             mpckmeans.m_TotalTrainWithLabels.numClasses(),
        //                             mpckmeans.m_TotalTrainWithLabels.numClasses());
        //    eval.evaluateModel(mpckmeans, mpckmeans.m_TotalTrainWithLabels, mpckmeans.m_Instances);
        //    eval.mutualInformation();
        //    eval.pairwiseFMeasure();
        //      }
    } catch (Exception e) {
        System.out.println("Option not specified");
        e.printStackTrace();
    }
}

From source file: MPCKMeans.java

License: Open Source License

public static void testCase() {
    try {
        String dataset = "lowd";
        //String dataset = "highd";
        if (dataset.equals("lowd")) {
            //////// Low-D data

            //   String datafile = "/u/ml/data/bio/arffFromPhylo/ecoli_K12-100.arff";
            //   String datafile = "/u/sugato/weka/data/digits-0.1-389.arff";
            String datafile = "/u/sugato/weka/data/iris.arff";
            int numPairs = 200, num = 0;

            // set up the data
            FileReader reader = new FileReader(datafile);
            Instances data = new Instances(reader);

            // Make the last attribute be the class 
            int classIndex = data.numAttributes() - 1;
            data.setClassIndex(classIndex); // starts with 0
            System.out.println("ClassIndex is: " + classIndex);

            // Remove the class labels before clustering
            Instances clusterData = new Instances(data);
            clusterData.deleteClassAttribute();

            // create the pairs
            ArrayList labeledPair = InstancePair.getPairs(data, numPairs);

            System.out.println("Finished initializing constraint matrix");

            MPCKMeans mpckmeans = new MPCKMeans();
            mpckmeans.setUseMultipleMetrics(false);
            System.out.println("\nClustering the data using MPCKmeans...\n");

            WeightedEuclidean metric = new WeightedEuclidean();
            WEuclideanLearner metricLearner = new WEuclideanLearner();

            //     LearnableMetric metric = new WeightedDotP();
            //     MPCKMeansMetricLearner metricLearner = new DotPGDLearner();

            //     KL metric = new KL();
            //     KLGDLearner metricLearner = new KLGDLearner();
            //   ((KL)metric).setUseIDivergence(true);

            //   BarHillelMetric metric = new BarHillelMetric();
            //   BarHillelMetricMatlab metric = new BarHillelMetricMatlab();
            //     XingMetric metric = new XingMetric();
            //   WeightedMahalanobis metric = new WeightedMahalanobis(); 

            mpckmeans.setMetric(metric);
            mpckmeans.setMetricLearner(metricLearner);
            mpckmeans.setVerbose(false);
            mpckmeans.setRegularize(false);
            mpckmeans.setTrainable(new SelectedTag(TRAINING_INTERNAL, TAGS_TRAINING));
            mpckmeans.setSeedable(true);
            mpckmeans.buildClusterer(labeledPair, clusterData, data, data.numClasses(), data.numInstances());
            mpckmeans.getIndexClusters();
            mpckmeans.printIndexClusters();

            SemiSupClustererEvaluation eval = new SemiSupClustererEvaluation(mpckmeans.m_TotalTrainWithLabels,
                    mpckmeans.m_TotalTrainWithLabels.numClasses(),
                    mpckmeans.m_TotalTrainWithLabels.numClasses());
            eval.evaluateModel(mpckmeans, mpckmeans.m_TotalTrainWithLabels, mpckmeans.m_Instances);
            System.out.println("MI=" + eval.mutualInformation());
            System.out.print("FM=" + eval.pairwiseFMeasure());
            System.out.print("\tP=" + eval.pairwisePrecision());
            System.out.print("\tR=" + eval.pairwiseRecall());
        } else if (dataset.equals("highd")) {
            //////// Newsgroup data
            String datafile = "/u/ml/users/sugato/groupcode/weka335/data/arffFromCCS/sanitized/different-1000_sanitized.arff";
            //String datafile = "/u/ml/users/sugato/groupcode/weka335/data/20newsgroups/small-newsgroup_fromCCS.arff";
            //String datafile = "/u/ml/users/sugato/groupcode/weka335/data/20newsgroups/same-100_fromCCS.arff";

            // set up the data
            FileReader reader = new FileReader(datafile);
            Instances data = new Instances(reader);

            // Make the last attribute be the class 
            int classIndex = data.numAttributes() - 1;
            data.setClassIndex(classIndex); // starts with 0
            System.out.println("ClassIndex is: " + classIndex);

            // Remove the class labels before clustering
            Instances clusterData = new Instances(data);
            clusterData.deleteClassAttribute();

            // create the pairs
            int numPairs = 0, num = 0;
            ArrayList labeledPair = new ArrayList(numPairs);
            Random rand = new Random(42);
            System.out.println("Initializing constraint matrix:");
            while (num < numPairs) {
                int i = (int) (data.numInstances() * rand.nextFloat());
                int j = (int) (data.numInstances() * rand.nextFloat());
                int first = (i < j) ? i : j;
                int second = (i >= j) ? i : j;
                int linkType = (data.instance(first).classValue() == data.instance(second).classValue())
                        ? InstancePair.MUST_LINK
                        : InstancePair.CANNOT_LINK;
                InstancePair pair = new InstancePair(first, second, linkType);
                if (first != second && !labeledPair.contains(pair)) {
                    labeledPair.add(pair);
                    //System.out.println(num + "th entry is: " + pair);
                    num++;
                }
            }
            System.out.println("Finished initializing constraint matrix");

            MPCKMeans mpckmeans = new MPCKMeans();
            mpckmeans.setUseMultipleMetrics(false);
            System.out.println("\nClustering the highd data using MPCKmeans...\n");

            LearnableMetric metric = new WeightedDotP();
            MPCKMeansMetricLearner metricLearner = new DotPGDLearner();

            //     KL metric = new KL();
            //     KLGDLearner metricLearner = new KLGDLearner();

            mpckmeans.setMetric(metric);
            mpckmeans.setMetricLearner(metricLearner);
            mpckmeans.setVerbose(false);
            mpckmeans.setRegularize(true);
            mpckmeans.setTrainable(new SelectedTag(TRAINING_INTERNAL, TAGS_TRAINING));
            mpckmeans.setSeedable(true);
            mpckmeans.buildClusterer(labeledPair, clusterData, data, data.numClasses(), data.numInstances());
            mpckmeans.getIndexClusters();

            SemiSupClustererEvaluation eval = new SemiSupClustererEvaluation(mpckmeans.m_TotalTrainWithLabels,
                    mpckmeans.m_TotalTrainWithLabels.numClasses(),
                    mpckmeans.m_TotalTrainWithLabels.numClasses());

            mpckmeans.getMetric().resetMetric(); // Vital: to reset m_attrWeights to 1 for proper normalization
            eval.evaluateModel(mpckmeans, mpckmeans.m_TotalTrainWithLabels, mpckmeans.m_Instances);
            System.out.println("MI=" + eval.mutualInformation());
            System.out.print("FM=" + eval.pairwiseFMeasure());
            System.out.print("\tP=" + eval.pairwisePrecision());
            System.out.print("\tR=" + eval.pairwiseRecall());
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}