Example usage for the weka.clusterers.HierarchicalClusterer constructor HierarchicalClusterer()

Introduction

On this page you can find example usages of the HierarchicalClusterer() constructor of the class weka.clusterers.HierarchicalClusterer.

Prototype

HierarchicalClusterer

Source Link

Usage

From source file:Clustering.WekaHierarchicalClustererWrapper.java

/**
 * Clusters the given data hierarchically and returns the resulting tree as a
 * string.
 *
 * <p>The data is exported to an ARFF file, loaded as Weka instances and handed
 * to a {@link HierarchicalClusterer} that is configured with a single cluster,
 * the wrapper's link type ({@code m_LinkType}) and distance function
 * ({@code m_DistanceFunction}). The exported file is removed again whether or
 * not clustering succeeds.
 *
 * @param data the data passed to the ARFF exporter
 * @return the string produced by {@code HierarchicalClusterer.graph()}, or
 *         {@code null} when the exporter yields no file or any step fails; in
 *         the failure case the exception message is stored in
 *         {@code m_LastErrorMessage}
 */
@Override
public String cluster(HashMap<String, List> data) {

    File arff = null;
    try {
        arff = m_ArffExporter.getArff(data);
        if (arff == null)
            return null;

        // Close the stream even when reading the ARFF content fails.
        Instances instances;
        FileInputStream is = new FileInputStream(arff.getAbsolutePath());
        try {
            instances = ConverterUtils.DataSource.read(is);
        } finally {
            is.close();
        }

        HierarchicalClusterer cl = new HierarchicalClusterer();

        String[] options = new String[6];
        options[0] = "-N"; // number of clusters should be "1"
        options[1] = "1";
        options[2] = "-L"; // linking type
        options[3] = m_LinkType;
        options[4] = "-A"; // distance function
        options[5] = m_DistanceFunction;

        cl.setOptions(options);

        cl.buildClusterer(instances);

        return cl.graph();

    } catch (Exception ex) {
        m_LastErrorMessage = ex.getMessage();
        return null;
    } finally {
        // Remove the exported file on every path, not only on success.
        if (arff != null) {
            try {
                if (!arff.delete())
                    arff.deleteOnExit();
            } catch (SecurityException ignored) {
                // Best-effort cleanup: a denied delete must not replace the
                // clustering result or escape from a method that reports
                // failures through its return value.
            }
        }
    }
}

From source file:eu.cassandra.appliance.IsolatedApplianceExtractor.java

License:Apache License

/**
 * Auxiliary function that turns the isolated events into a Weka data set and
 * appends a cluster assignment to every instance.
 *
 * <p>Each event becomes one instance holding its id plus the active/reactive
 * power differences of its first rising point and its first reduction point.
 * The data set is then passed through Weka's AddCluster filter with the
 * attribute index "1" (the position at which the id attribute was added)
 * ignored. K-means is used for non-empty data sets of at most
 * {@code Constants.KMEANS_LIMIT_NUMBER} instances; otherwise average-link
 * hierarchical clustering is used.
 *
 * @param isolated
 *          The list of the events containing an isolated appliance.
 * @return The instances of the data, as returned by the AddCluster filter
 * @throws Exception
 *           if building a clusterer or applying the filter fails
 */
private Instances createInstances(ArrayList<Event> isolated) throws Exception {
    // Attributes that make up the schema of the data set
    Attribute idAttr = new Attribute("id");
    Attribute pRiseAttr = new Attribute("pDiffRise");
    Attribute qRiseAttr = new Attribute("qDiffRise");
    Attribute pReduceAttr = new Attribute("pDiffReduce");
    Attribute qReduceAttr = new Attribute("qDiffReduce");

    ArrayList<Attribute> schema = new ArrayList<Attribute>();
    schema.add(idAttr);
    schema.add(pRiseAttr);
    schema.add(qRiseAttr);
    schema.add(pReduceAttr);
    schema.add(qReduceAttr);

    Instances dataset = new Instances("Isolated", schema, 0);

    // One instance per event, filled from its first rising/reduction point
    for (Event event : isolated) {
        Instance row = new DenseInstance(5);
        row.setValue(idAttr, event.getId());
        row.setValue(pRiseAttr, event.getRisingPoints().get(0).getPDiff());
        row.setValue(qRiseAttr, event.getRisingPoints().get(0).getQDiff());
        row.setValue(pReduceAttr, event.getReductionPoints().get(0).getPDiff());
        row.setValue(qReduceAttr, event.getReductionPoints().get(0).getQDiff());
        dataset.add(row);
    }

    final int maxClusters = Constants.MAX_CLUSTERS_NUMBER;

    System.out.println("Instances: " + dataset.toSummaryString());
    System.out.println("Max Clusters: " + maxClusters);

    // The AddCluster filter of Weka appends the cluster assignment.
    AddCluster clusterFilter = new AddCluster();

    boolean useKMeans = dataset.size() != 0 && dataset.size() <= Constants.KMEANS_LIMIT_NUMBER;

    if (useKMeans) {
        SimpleKMeans kmeans = new SimpleKMeans();
        kmeans.setSeed(10);
        // This is the important parameter to set
        kmeans.setPreserveInstancesOrder(true);
        kmeans.setNumClusters(maxClusters);
        kmeans.buildClusterer(dataset);

        clusterFilter.setClusterer(kmeans);
    } else {
        HierarchicalClusterer hierarchical = new HierarchicalClusterer();
        String[] hierarchicalOptions = { "-N", String.valueOf(maxClusters), "-P", "-D", "-L", "AVERAGE" };

        hierarchical.setDistanceFunction(new EuclideanDistance());
        hierarchical.setNumClusters(maxClusters);
        hierarchical.setOptions(hierarchicalOptions);
        hierarchical.setPrintNewick(true);
        hierarchical.setDebug(true);

        clusterFilter.setClusterer(hierarchical);
    }

    // Cluster the data set
    clusterFilter.setInputFormat(dataset);
    clusterFilter.setIgnoredAttributeIndices("1");
    return Filter.useFilter(dataset, clusterFilter);
}

From source file:eu.cassandra.appliance.IsolatedEventsExtractor.java

License:Apache License

/**
 * Auxiliary function that turns the isolated events into a Weka data set and
 * appends a cluster assignment to every instance.
 *
 * <p>Each event becomes one instance holding its id, the active/reactive
 * power differences of its first rising point and its first reduction point,
 * and its duration (end minute minus start minute). The data set is then
 * passed through Weka's AddCluster filter with the attribute index "1" (the
 * position at which the id attribute was added) ignored. K-means is used for
 * non-empty data sets of at most {@code Constants.KMEANS_LIMIT_NUMBER}
 * instances; otherwise average-link hierarchical clustering is used.
 *
 * @param isolated
 *          The list of the events containing an isolated appliance.
 * @return The instances of the data, as returned by the AddCluster filter
 * @throws Exception
 *           if building a clusterer or applying the filter fails
 */
private Instances createInstances(ArrayList<Event> isolated) throws Exception {
    // Attributes that make up the schema of the data set
    Attribute idAttr = new Attribute("id");
    Attribute pRiseAttr = new Attribute("pDiffRise");
    Attribute qRiseAttr = new Attribute("qDiffRise");
    Attribute pReduceAttr = new Attribute("pDiffReduce");
    Attribute qReduceAttr = new Attribute("qDiffReduce");
    Attribute durationAttr = new Attribute("duration");

    ArrayList<Attribute> schema = new ArrayList<Attribute>();
    schema.add(idAttr);
    schema.add(pRiseAttr);
    schema.add(qRiseAttr);
    schema.add(pReduceAttr);
    schema.add(qReduceAttr);
    schema.add(durationAttr);

    Instances dataset = new Instances("Isolated", schema, 0);

    // One instance per event, filled from its first rising/reduction point
    for (Event event : isolated) {
        Instance row = new DenseInstance(6);
        row.setValue(idAttr, event.getId());
        row.setValue(pRiseAttr, event.getRisingPoints().get(0).getPDiff());
        row.setValue(qRiseAttr, event.getRisingPoints().get(0).getQDiff());
        row.setValue(pReduceAttr, event.getReductionPoints().get(0).getPDiff());
        row.setValue(qReduceAttr, event.getReductionPoints().get(0).getQDiff());
        row.setValue(durationAttr, event.getEndMinute() - event.getStartMinute());
        dataset.add(row);
    }

    final int maxClusters = Constants.MAX_CLUSTERS_NUMBER;

    log.info("Instances: " + dataset.toSummaryString());
    log.info("Max Clusters: " + maxClusters);

    // The AddCluster filter of Weka appends the cluster assignment.
    AddCluster clusterFilter = new AddCluster();

    boolean useKMeans = dataset.size() != 0 && dataset.size() <= Constants.KMEANS_LIMIT_NUMBER;

    if (useKMeans) {
        SimpleKMeans kmeans = new SimpleKMeans();
        kmeans.setSeed(10);
        // This is the important parameter to set
        kmeans.setPreserveInstancesOrder(true);
        kmeans.setNumClusters(maxClusters);
        kmeans.buildClusterer(dataset);

        clusterFilter.setClusterer(kmeans);
    } else {
        HierarchicalClusterer hierarchical = new HierarchicalClusterer();
        String[] hierarchicalOptions = { "-N", String.valueOf(maxClusters), "-P", "-D", "-L", "AVERAGE" };

        hierarchical.setDistanceFunction(new EuclideanDistance());
        hierarchical.setNumClusters(maxClusters);
        hierarchical.setOptions(hierarchicalOptions);
        hierarchical.setPrintNewick(true);
        hierarchical.setDebug(true);

        clusterFilter.setClusterer(hierarchical);
    }

    // Cluster the data set
    clusterFilter.setInputFormat(dataset);
    clusterFilter.setIgnoredAttributeIndices("1");
    return Filter.useFilter(dataset, clusterFilter);
}

From source file:eu.cassandra.server.mongo.csn.MongoCluster.java

License:Apache License

/**
 * Groups the nodes of a graph with average-link hierarchical clustering over
 * Euclidean distance and stores the resulting clusters.
 *
 * <p>Instances are obtained through {@code getInstances(clusterBasedOn, graph_id)};
 * the i-th instance is paired with the i-th entry of {@code nodeIDs}. A node is
 * placed in every cluster whose entry in the clusterer's distribution for that
 * instance equals exactly 1.0.
 *
 * @param message          message forwarded to the JSON error response on failure
 * @param graph_id         id of the graph whose nodes are clustered
 * @param run_id           id of the run, forwarded to {@code saveClusters}
 * @param clusterBasedOn   selector passed to {@code getInstances}
 * @param numberOfClusters requested number of clusters; applied only when positive
 * @param name             name forwarded to {@code saveClusters}
 * @param clusterbasedon   value forwarded to {@code saveClusters}
 * @return the result of {@code saveClusters}, or a JSON error object when fewer
 *         than two instances are available or an exception is thrown
 */
public DBObject clusterHierarchical(String message, String graph_id, String run_id, String clusterBasedOn,
        int numberOfClusters, String name, String clusterbasedon) {
    try {
        Instances instances = getInstances(clusterBasedOn, graph_id);
        final int total = instances.numInstances();
        if (total < 2) {
            return new JSONtoReturn().createJSONError(message, new Exception("Number of CSN Nodes is < 2"));
        }

        HierarchicalClusterer clusterer = new HierarchicalClusterer();
        clusterer.setOptions(new String[] { "-L", "AVERAGE" });
        clusterer.setDistanceFunction(new EuclideanDistance());
        if (numberOfClusters > 0) {
            clusterer.setNumClusters(numberOfClusters);
        }
        clusterer.buildClusterer(instances);

        // cluster index -> ids of the nodes assigned to that cluster
        HashMap<Integer, Vector<String>> clusters = new HashMap<Integer, Vector<String>>();
        for (int row = 0; row < total; row++) {
            String nodeId = nodeIDs.get(row);
            double[] membership = clusterer.distributionForInstance(instances.instance(row));
            for (int cluster = 0; cluster < membership.length; cluster++) {
                if (membership[cluster] != 1.0) {
                    continue;
                }
                Vector<String> members = clusters.get(cluster);
                if (members == null) {
                    members = new Vector<String>();
                    clusters.put(cluster, members);
                }
                members.add(nodeId);
            }
        }
        return saveClusters(graph_id, run_id, "hierarchical", clusters, null, name, clusterbasedon);
    } catch (Exception e) {
        e.printStackTrace();
        return new JSONtoReturn().createJSONError(message, e);
    }
}

From source file:gr.iit.demokritos.cru.cps.ai.KeyphraseClustering.java

License:Open Source License

/**
 * Clusters the instances in {@code data} hierarchically and returns, per
 * cluster, the concatenation of its members' first attribute values.
 *
 * <p>The clusterer is asked for {@code this.clusters} clusters and uses the
 * distance function that matches {@code language} ("en", "de" or "el"); for
 * any other language no distance function is set explicitly.
 *
 * @return a list with one entry per cluster; each entry is the string value of
 *         attribute 0 of every instance assigned to that cluster, each
 *         followed by ";" (an empty string for a cluster without members)
 * @throws Exception if building the clusterer or assigning an instance fails
 */
public ArrayList<String> getClusters() throws Exception {
    System.out.println("Clustering......");

    HierarchicalClusterer hierarchical = new HierarchicalClusterer();
    hierarchical.setNumClusters(this.clusters);

    // Language-specific distance function
    if (language.equals("en")) {
        hierarchical.setDistanceFunction(wd);
    } else if (language.equals("de")) {
        hierarchical.setDistanceFunction(wdde);
    } else if (language.equals("el")) {
        hierarchical.setDistanceFunction(wdel);
    }
    hierarchical.buildClusterer(data);

    // Start every cluster with an empty member string
    final int clusterCount = hierarchical.numberOfClusters();
    ArrayList<String> clustersList = new ArrayList<String>();
    for (int c = 0; c < clusterCount; c++) {
        clustersList.add("");
    }

    // Append each instance's term to the entry of the cluster it falls into
    final int instanceCount = data.numInstances();
    for (int row = 0; row < instanceCount; row++) {
        String term = data.instance(row).stringValue(0);
        int assigned = hierarchical.clusterInstance(data.instance(row));
        String membersSoFar = clustersList.get(assigned);
        clustersList.set(assigned, membersSoFar + term + ";");
    }
    return clustersList;
}

From source file:guineu.modules.dataanalysis.clustering.hierarchical.HierarClusterer.java

License:Open Source License

/**
 * Builds a hierarchical clustering of the data set and returns its graph
 * string.
 *
 * <p>The link type and distance type are read from {@code parameters} and
 * passed to the clusterer through the "-L" and "-A" options, together with
 * "-P".
 *
 * @param dataset the instances to cluster
 * @return the string returned by {@code HierarchicalClusterer.graph()}, or
 *         {@code null} when configuring or building the clusterer throws (the
 *         exception is logged at SEVERE level)
 */
public String getHierarchicalCluster(Instances dataset) {
    LinkType link = parameters.getParameter(HierarClustererParameters.linkType).getValue();
    DistanceType distanceType = parameters.getParameter(HierarClustererParameters.distanceType).getValue();
    String linkName = link.name();

    // Fully qualified Weka class implementing the selected distance; stays
    // null when the distance type matches none of the cases below.
    String distanceClass = null;
    switch (distanceType) {
    case EUCLIDIAN:
        distanceClass = "weka.core.EuclideanDistance";
        break;
    case CHEBYSHEV:
        distanceClass = "weka.core.ChebyshevDistance";
        break;
    case MANHATTAN:
        distanceClass = "weka.core.ManhattanDistance";
        break;
    case MINKOWSKI:
        distanceClass = "weka.core.MinkowskiDistance";
        break;
    }

    String[] options = { "-L", linkName, "-A", distanceClass, "-P" };

    HierarchicalClusterer hierarchical = new HierarchicalClusterer();
    try {
        hierarchical.setOptions(options);
        hierarchical.buildClusterer(dataset);
        return hierarchical.graph();
    } catch (Exception ex) {
        Logger.getLogger(HierarClusterer.class.getName()).log(Level.SEVERE, null, ex);
        return null;
    }
}

From source file:intensityclustering.IntensityClustering.java

/**
 * Returns a new weka clusterer used for nucleus staining intensity
 * clustering. The kind of clusterer is determined by the user.
 *
 * @return A new weka clusterer./*from   w  w w.j  a  v  a 2  s  .c  om*/
 */
private Clusterer getClusterer() {
    String clustername = getParam_AutomaticClustererString();
    Clusterer clusterer = null;
    if (clustername.equalsIgnoreCase("K-Means")) {
        clusterer = new SimpleKMeans();
    } else if (clustername.equalsIgnoreCase("Hierarchical")) {
        clusterer = new HierarchicalClusterer();
    } else if (clustername.equalsIgnoreCase("EM")) {
        clusterer = new EM();
    } else {
        clusterer = new FarthestFirst();
    }
    return clusterer;
}

From source file:jmetal.problems.SurvivalAnalysis.java

License:Open Source License

/** 
 * Evaluates a solution /*from w w  w.jav  a2 s  . com*/
 * @param solution The solution to evaluate
 */
public void evaluate(Solution solution) {
    Binary variable;
    int counterSelectedFeatures;

    DataSource source;

    double testStatistic = Double.MAX_VALUE;
    double pValue = Double.MAX_VALUE;
    double ArithmeticHarmonicCutScore = Double.MAX_VALUE;
    //double statScore;
    REXP x;

    variable = ((Binary) solution.getDecisionVariables()[0]);

    counterSelectedFeatures = 0;

    try {
        // read the data file 
        source = new DataSource(this.dataFileName);
        Instances data = source.getDataSet();
        //System.out.print("Data read successfully. ");
        //System.out.print("Number of attributes: " + data.numAttributes());
        //System.out.println(". Number of instances: " + data.numInstances());

        // save the attribute 'T' and 'Censor'
        attTime = data.attribute(data.numAttributes() - 2);
        attCensor = data.attribute(data.numAttributes() - 1);

        // First filter the attributes based on chromosome
        Instances tmpData = this.filterByChromosome(data, solution);

        // Now filter the attribute 'T' and 'Censor'
        Remove filter = new Remove();
        // remove the two last attributes : 'T' and 'Censor'
        filter.setAttributeIndices("" + (tmpData.numAttributes() - 1) + "," + tmpData.numAttributes());
        //System.out.println("After chromosome filtering no of attributes: " + tmpData.numAttributes());
        filter.setInputFormat(tmpData);
        Instances dataClusterer = Filter.useFilter(tmpData, filter);

        // filtering complete

        /*
        // debug: write the filtered dataset
                
         ArffSaver saver = new ArffSaver();
         saver.setInstances(dataClusterer);
         saver.setFile(new File("filteered-data.arff"));
         saver.writeBatch();
        // end debug
                
        */

        // train hierarchical clusterer

        HierarchicalClusterer clusterer = new HierarchicalClusterer();
        clusterer.setOptions(new String[] { "-L", this.HC_LinkType }); // complete linkage clustering
        //Link type (Single, Complete, Average, Mean, Centroid, Ward, Adjusted complete, Neighbor Joining)
        //[SINGLE|COMPLETE|AVERAGE|MEAN|CENTROID|WARD|ADJCOMPLETE|NEIGHBOR_JOINING]

        //clusterer.setDebug(true);
        clusterer.setNumClusters(2);
        clusterer.setDistanceFunction(new EuclideanDistance());
        clusterer.setDistanceIsBranchLength(false); // ?? Should it be changed to false? (Noman)

        clusterer.buildClusterer(dataClusterer);

        double[][] distanceMatrix = clusterer.getDistanceMatrix();
        // save the cluster assignments

        if (this.re == null) { // we are not calling R functions. Therefore parallelization possible

            int[] clusterAssignment = new int[dataClusterer.numInstances()];
            int classOneCnt = 0;
            int classTwoCnt = 0;
            for (int i = 0; i < dataClusterer.numInstances(); ++i) {
                clusterAssignment[i] = clusterer.clusterInstance(dataClusterer.get(i));
                if (clusterAssignment[i] == 0) {
                    ++classOneCnt;
                } else if (clusterAssignment[i] == 1) {
                    ++classTwoCnt;
                }
                //System.out.println("Instance " + i + ": " + clusterAssignment[i]);
            }

            //System.out.println("Class 1 cnt: " + classOneCnt + " Class 2 cnt: " + classTwoCnt);

            // create arrays with time (event occurrence time) and censor data for use with jstat LogRankTest
            double[] time1 = new double[classOneCnt];
            double[] censor1 = new double[classOneCnt];
            double[] time2 = new double[classTwoCnt];
            double[] censor2 = new double[classTwoCnt];

            //data = source.getDataSet();
            for (int i = 0, cnt1 = 0, cnt2 = 0; i < dataClusterer.numInstances(); ++i) {
                //clusterAssignment[i] = clusterer.clusterInstance(dataClusterer.get(i));
                if (clusterAssignment[i] == 0) {
                    time1[cnt1] = data.get(i).value(attTime);
                    censor1[cnt1++] = data.get(i).value(attCensor);
                    //System.out.println("i: " + i + " T: " + time1[cnt1-1]);
                } else if (clusterAssignment[i] == 1) {
                    time2[cnt2] = data.get(i).value(attTime);
                    //System.out.println("i: " + i + " T: " + time2[cnt2-1]);
                    censor2[cnt2++] = data.get(i).value(attCensor);
                    ;
                }
                //System.out.println("Instance " + i + ": " + clusterAssignment[i]);
            }

            //Instances[] classInstances = separateClassInstances(clusterAssignment, this.dataFileName,solution);
            //System.out.println("Class instances seperated");

            // calculate log rank test and p values

            LogRankTest testclass1 = new LogRankTest(time1, time2, censor1, censor2);
            double[] scores = testclass1.logRank();
            testStatistic = scores[0];
            pValue = scores[2];

            ArithmeticHarmonicCutScore = this.getArithmeticHarmonicCutScore(distanceMatrix, clusterAssignment);
            //debug:
            //System.out.println("Calculation by myLibrary: testStatistic: " + scores[0] + " pValue: " + scores[2]);
            //end debug
            //WilcoxonTest testclass1 = new WilcoxonTest(time1, censor1, time2, censor2);
            //testStatistic = testclass1.testStatistic;
            //pValue = testclass1.pValue;true
        } else { // We are calling R for Log Rank test, Parallelization not possible

            String strT = "time <- c(";
            String strC = "censor <- c(";
            String strG = "group <- c(";

            for (int i = 0; i < dataClusterer.numInstances() - 1; ++i) {
                strT = strT + (int) data.get(i).value(attTime) + ",";
                strG = strG + clusterer.clusterInstance(dataClusterer.get(i)) + ",";
                strC = strC + (int) data.get(i).value(attCensor) + ",";
            }

            int tmpi = dataClusterer.numInstances() - 1;
            strT = strT + (int) data.get(tmpi).value(attTime) + ")";
            strG = strG + clusterer.clusterInstance(dataClusterer.get(tmpi)) + ")";
            strC = strC + (int) data.get(tmpi).value(attCensor) + ")";

            this.re.eval(strT);
            this.re.eval(strC);
            this.re.eval(strG);

            //debug
            //System.out.println(strT);
            //System.out.println(strC);
            //System.out.println(strG);
            //end debug

            /** If you are calling surv_test from coin library */
            /*v
            re.eval("library(coin)");
            re.eval("grp <- factor (group)");
            re.eval("result <- surv_test(Surv(time,censor)~grp,distribution=\"exact\")");
                    
            x=re.eval("statistic(result)");
            testStatistic = x.asDouble();
            //x=re.eval("pvalue(result)");
            //pValue = x.asDouble();
            //System.out.println("StatScore: " + statScore + "pValue: " + pValue);
             */

            /** If you are calling survdiff from survival library (much faster) */
            re.eval("library(survival)");
            re.eval("res2 <- survdiff(Surv(time,censor)~group,rho=0)");
            x = re.eval("res2$chisq");
            testStatistic = x.asDouble();
            //System.out.println(x);
            x = re.eval("pchisq(res2$chisq, df=1, lower.tail = FALSE)");
            //x = re.eval("1.0 - pchisq(res2$chisq, df=1)");
            pValue = x.asDouble();
            //debug:
            //System.out.println("Calculation by R: StatScore: " + testStatistic + "pValue: " + pValue);
            //end debug

        }

    } catch (Exception e) {
        // TODO Auto-generated catch block
        System.err.println("Can't open the data file.");
        e.printStackTrace();
        System.exit(1);
    }

    /**********
     *  Current Implementation considers two objectives
     *  1. pvalue to be minimized / statistical score to be maximized
     *  2. Number of Features to be maximized/minimized
     */

    // Currently this section implements the OneZeroMax problem - need to modify it
    for (int i = 0; i < variable.getNumberOfBits(); i++)
        if (variable.bits_.get(i))
            counterSelectedFeatures++;

    // OneZeroMax is a maximization problem: multiply by -1 to minimize
    /*
    if (Double.isNaN(testStatistic)){
       solution.setObjective(0,Double.MAX_VALUE);
    }
    else{
       solution.setObjective(0, testStatistic);
    }
    */

    if (this.pValueFlag) {
        solution.setObjective(0, pValue); // pValue to be minimized
    } else {
        solution.setObjective(0, -1.0 * testStatistic); // statistic score to be maximized
    }
    if (this.featureMax) {
        solution.setObjective(1, -1.0 * counterSelectedFeatures); // feature maximized
    } else {
        solution.setObjective(1, counterSelectedFeatures); // feature minimized
    }
    if (this.numberOfObjectives_ == 3) {
        solution.setObjective(2, -1.0 * ArithmeticHarmonicCutScore); // feature maximized
    }
}

From source file:jmetal.test.survivalanalysis.GenerateSurvivalGraph.java

License:Open Source License

/**
 * Evaluates a solution and produces its survival plot.
 *
 * <p>The data file is loaded and reduced to the attributes selected by the
 * solution (via {@code filterByChromosome}); the two trailing attributes,
 * 'T' (time) and 'Censor', are then removed and the remaining data is split
 * into two groups by hierarchical clustering. When a test data file is
 * configured, the clusterer is additionally evaluated on it. The two groups
 * are scored with a log-rank test twice, first in Java and then through R's
 * {@code survdiff}, and R is used to write the survival curves to
 * {@code SurvivalPlot-<SolutionID>.jpg}. Intermediate results are printed to
 * stdout.
 *
 * <p>Any exception thrown along the way is reported on stderr and terminates
 * the JVM with exit status 1.
 *
 * @param solution The solution to evaluate
 */
public void evaluate(Solution solution) {
    Binary variable;

    DataSource source;

    double testStatistic = Double.MAX_VALUE;
    double pValue = Double.MAX_VALUE;
    double ArithmeticHarmonicCutScore = Double.MAX_VALUE;
    REXP x;

    variable = ((Binary) solution.getDecisionVariables()[0]);

    try {
        // read the data file
        source = new DataSource(this.dataFileName);
        Instances data = source.getDataSet();

        // save the attribute 'T' and 'Censor'
        attTime = data.attribute(data.numAttributes() - 2);
        attCensor = data.attribute(data.numAttributes() - 1);

        // First filter the attributes based on chromosome
        Instances tmpData = this.filterByChromosome(data, solution);

        // Now filter the attribute 'T' and 'Censor'
        Remove filter = new Remove();
        // remove the two last attributes : 'T' and 'Censor'
        filter.setAttributeIndices("" + (tmpData.numAttributes() - 1) + "," + tmpData.numAttributes());
        filter.setInputFormat(tmpData);
        Instances dataClusterer = Filter.useFilter(tmpData, filter);

        // filtering complete

        // List the selected features/attributes
        Enumeration<Attribute> attributeList = dataClusterer.enumerateAttributes();
        System.out.println("Selected attributes/features: ");
        while (attributeList.hasMoreElements()) {
            Attribute att = attributeList.nextElement();
            System.out.print(att.name() + ",");
        }

        System.out.println();

        // train hierarchical clusterer

        HierarchicalClusterer clusterer = new HierarchicalClusterer();
        // Link type is configurable through HC_LinkType:
        // [SINGLE|COMPLETE|AVERAGE|MEAN|CENTROID|WARD|ADJCOMPLETE|NEIGHBOR_JOINING]
        clusterer.setOptions(new String[] { "-L", this.HC_LinkType });

        clusterer.setNumClusters(2);
        clusterer.setDistanceFunction(new EuclideanDistance());
        clusterer.setDistanceIsBranchLength(false);

        clusterer.buildClusterer(dataClusterer);

        double[][] distanceMatrix = clusterer.getDistanceMatrix();

        // Cluster evaluation:
        ClusterEvaluation eval = new ClusterEvaluation();
        eval.setClusterer(clusterer);

        if (this.testDataFileName != null) {

            DataSource testSource = new DataSource(this.testDataFileName);

            Instances tmpTestData = testSource.getDataSet();
            tmpTestData.setClassIndex(tmpTestData.numAttributes() - 1);

            // First filter the attributes based on chromosome
            // NOTE(review): unlike the training data, the test data is not passed
            // through the Remove filter for 'T' and 'Censor' - confirm that the
            // attribute layout still matches what the clusterer was built on.
            Instances testData = this.filterByChromosome(tmpTestData, solution);
            eval.evaluateClusterer(testData);
            System.out.println("\nCluster evluation for this solution(" + this.testDataFileName + "): "
                    + eval.clusterResultsToString());
        }

        // First analyze using my library function

        // save the cluster assignments and count the members of each cluster
        int[] clusterAssignment = new int[dataClusterer.numInstances()];
        int classOneCnt = 0;
        int classTwoCnt = 0;
        for (int i = 0; i < dataClusterer.numInstances(); ++i) {
            clusterAssignment[i] = clusterer.clusterInstance(dataClusterer.get(i));
            if (clusterAssignment[i] == 0) {
                ++classOneCnt;
            } else if (clusterAssignment[i] == 1) {
                ++classTwoCnt;
            }
        }

        System.out.println("Class 1 cnt: " + classOneCnt + " Class 2 cnt: " + classTwoCnt);

        // create arrays with time (event occurrence time) and censor data for use with jstat LogRankTest
        double[] time1 = new double[classOneCnt];
        double[] censor1 = new double[classOneCnt];
        double[] time2 = new double[classTwoCnt];
        double[] censor2 = new double[classTwoCnt];

        // Time and censor values are read from the unfiltered data; row i of
        // 'data' is paired with row i of 'dataClusterer'.
        for (int i = 0, cnt1 = 0, cnt2 = 0; i < dataClusterer.numInstances(); ++i) {
            if (clusterAssignment[i] == 0) {
                time1[cnt1] = data.get(i).value(attTime);
                censor1[cnt1++] = data.get(i).value(attCensor);
            } else if (clusterAssignment[i] == 1) {
                time2[cnt2] = data.get(i).value(attTime);
                censor2[cnt2++] = data.get(i).value(attCensor);
            }
        }

        // calculate log rank test and p values

        LogRankTest testclass1 = new LogRankTest(time1, time2, censor1, censor2);
        double[] scores = testclass1.logRank();
        testStatistic = scores[0];
        pValue = scores[2];

        ArithmeticHarmonicCutScore = this.getArithmeticHarmonicCutScore(distanceMatrix, clusterAssignment);
        System.out.println("Calculation by myLibrary:\n testStatistic: " + scores[0] + " pValue: " + scores[2]
                + " Arithmetic Harmonic Cut Score: " + ArithmeticHarmonicCutScore);

        // Now analyze calling R for Log Rank test, Parallelization not possible
        // NOTE(review): this.re is used without a null check; a missing R engine
        // ends up in the catch block below and exits the JVM.

        String strT = "time <- c(";
        String strC = "censor <- c(";
        String strG = "group <- c(";

        // All instances but the last are followed by a comma ...
        for (int i = 0; i < dataClusterer.numInstances() - 1; ++i) {
            strT = strT + (int) data.get(i).value(attTime) + ",";
            strG = strG + clusterer.clusterInstance(dataClusterer.get(i)) + ",";
            strC = strC + (int) data.get(i).value(attCensor) + ",";
        }

        // ... and the last one closes the R vector.
        int tmpi = dataClusterer.numInstances() - 1;
        strT = strT + (int) data.get(tmpi).value(attTime) + ")";
        strG = strG + clusterer.clusterInstance(dataClusterer.get(tmpi)) + ")";
        strC = strC + (int) data.get(tmpi).value(attCensor) + ")";

        this.re.eval(strT);
        this.re.eval(strC);
        this.re.eval(strG);

        // Log-rank test through survdiff from the survival library
        re.eval("library(survival)");
        re.eval("res2 <- survdiff(Surv(time,censor)~group,rho=0)");
        x = re.eval("res2$chisq");
        testStatistic = x.asDouble();
        x = re.eval("pchisq(res2$chisq, df=1, lower.tail = FALSE)");
        pValue = x.asDouble();

        System.out.println("Calculation by R:");
        System.out.println("StatScore: " + testStatistic + "  pValue: " + pValue);

        // Plot the per-group and the overall survival curves into a JPEG file
        re.eval("timestrata1.surv <- survfit( Surv(time, censor)~ strata(group), conf.type=\"log-log\")");
        re.eval("timestrata1.surv1 <- survfit( Surv(time, censor)~ 1, conf.type=\"none\")");
        String evalStr = "jpeg('SurvivalPlot-" + this.SolutionID + ".jpg')";
        re.eval(evalStr);
        re.eval("plot(timestrata1.surv, col=c(2,3), xlab=\"Time\", ylab=\"Survival Probability\")");
        re.eval("par(new=T)");
        re.eval("plot(timestrata1.surv1,col=1)");
        re.eval("legend(0.2, c(\"Group1\",\"Group2\",\"Whole\"))");
        re.eval("dev.off()");

        System.out.println("\nCluster Assignments:");
        for (int i = 0; i < dataClusterer.numInstances(); ++i) {
            System.out.println("Instance " + i + ": " + clusterAssignment[i]);
        }

    } catch (Exception e) {
        // The try block covers loading, filtering, clustering, scoring and
        // plotting, so report a general failure rather than only a file-open
        // problem.
        System.err.println("Failed to evaluate the solution using data file '" + this.dataFileName + "'.");
        e.printStackTrace();
        System.exit(1);
    }

}

From source file:jmetal.test.survivalanalysis.GenerateSurvivalGraphOld.java

License:Open Source License

/**
 * Evaluates a solution by generating its survival graph.
 *
 * <p>The method performs these steps:
 * <ol>
 *   <li>loads the data set named by {@code dataFileName} and remembers its two
 *       last attributes as the time ('T') and censor attributes;</li>
 *   <li>keeps only the attributes selected by the solution's chromosome and
 *       removes the time and censor attributes from the clustering input;</li>
 *   <li>builds a complete-linkage hierarchical clustering with two clusters
 *       and Euclidean distance, optionally evaluating it on
 *       {@code testDataFileName} when that field is set;</li>
 *   <li>sends time, censor and group vectors to R, runs {@code survdiff} from
 *       the {@code survival} package and writes the survival curves to
 *       {@code SurvivalPlot-<SolutionID>.jpg};</li>
 *   <li>runs the Java {@code LogRankTest} on the same two groups and prints
 *       its results next to the values obtained from R.</li>
 * </ol>
 *
 * <p>Any exception is reported on standard error and terminates the JVM with
 * exit status 1.
 *
 * <p>NOTE: the author's notes for this method mention two objectives (p-value
 * to be minimized / statistical score to be maximized, and the number of
 * features to be maximized/minimized). This method only prints the statistics;
 * it does not store any objective value in the solution.
 *
 * @param solution The solution to evaluate
 */
public void evaluate(Solution solution) {
    System.out.println("\nSolution ID " + this.SolutionID);

    try {
        // read the data file
        DataSource source = new DataSource(this.dataFileName);
        Instances data = source.getDataSet();

        // save the attribute 'T' and 'Censor' (the two last attributes)
        attTime = data.attribute(data.numAttributes() - 2);
        attCensor = data.attribute(data.numAttributes() - 1);

        // First filter the attributes based on chromosome
        Instances tmpData = this.filterByChromosome(data, solution);

        // Now remove the two last attributes, 'T' and 'Censor', so that they
        // do not take part in the clustering
        Remove filter = new Remove();
        filter.setAttributeIndices("" + (tmpData.numAttributes() - 1) + "," + tmpData.numAttributes());
        filter.setInputFormat(tmpData);
        Instances dataClusterer = Filter.useFilter(tmpData, filter);

        Enumeration<Attribute> attributeList = dataClusterer.enumerateAttributes();
        System.out.println("Selected attributes: ");
        while (attributeList.hasMoreElements()) {
            Attribute att = attributeList.nextElement();
            System.out.print(att.name() + ",");
        }
        System.out.println();

        // Fail with a meaningful message instead of an index error further down.
        int numInstances = dataClusterer.numInstances();
        if (numInstances == 0) {
            throw new IllegalStateException("No instances to cluster in " + this.dataFileName);
        }

        // train hierarchical clusterer
        HierarchicalClusterer clusterer = new HierarchicalClusterer();
        clusterer.setOptions(new String[] { "-L", "COMPLETE" }); // complete linkage clustering
        clusterer.setNumClusters(2);
        clusterer.setDistanceFunction(new EuclideanDistance());
        clusterer.setDistanceIsBranchLength(false);

        clusterer.buildClusterer(dataClusterer);

        // Cluster evaluation:
        ClusterEvaluation eval = new ClusterEvaluation();
        eval.setClusterer(clusterer);

        if (this.testDataFileName != null) {
            DataSource testSource = new DataSource(this.testDataFileName);

            Instances tmpTestData = testSource.getDataSet();
            tmpTestData.setClassIndex(tmpTestData.numAttributes() - 1);

            // First filter the attributes based on chromosome
            Instances testData = this.filterByChromosome(tmpTestData, solution);
            eval.evaluateClusterer(testData);
            System.out.println("\nCluster evluation for this solution: " + eval.clusterResultsToString());
        }

        // Assign every instance to a cluster once and count the group sizes.
        int[] clusterAssignment = new int[numInstances];
        int classOneCnt = 0;
        int classTwoCnt = 0;
        for (int i = 0; i < numInstances; ++i) {
            clusterAssignment[i] = clusterer.clusterInstance(dataClusterer.get(i));
            if (clusterAssignment[i] == 0) {
                ++classOneCnt;
            } else if (clusterAssignment[i] == 1) {
                ++classTwoCnt;
            }
        }

        System.out.println("Class 1 cnt: " + classOneCnt + " Class 2 cnt: " + classTwoCnt);

        // Collect, in a single pass, the vectors sent to R and the per-group
        // arrays used by the Java log-rank test. Time and censor values are read
        // from the unfiltered data set, which is indexed like dataClusterer.
        // NOTE(review): R receives the values truncated to int while the Java
        // test uses the raw doubles - confirm this difference is intended.
        int[] rTime = new int[numInstances];
        int[] rCensor = new int[numInstances];
        double[] time1 = new double[classOneCnt];
        double[] time2 = new double[classTwoCnt];
        double[] censor1 = new double[classOneCnt];
        double[] censor2 = new double[classTwoCnt];

        int i1 = 0;
        int i2 = 0;
        for (int i = 0; i < numInstances; ++i) {
            double time = data.get(i).value(attTime);
            double censor = data.get(i).value(attCensor);
            rTime[i] = (int) time;
            rCensor[i] = (int) censor;

            // The clusterer is configured for two clusters, so every non-zero
            // assignment is treated as the second group.
            if (clusterAssignment[i] == 0) {
                time1[i1] = time;
                censor1[i1] = censor;
                ++i1;
            } else {
                time2[i2] = time;
                censor2[i2] = censor;
                ++i2;
            }
        }

        this.re.eval(buildRIntVector("time1", rTime));
        this.re.eval(buildRIntVector("censor1", rCensor));
        this.re.eval(buildRIntVector("group1", clusterAssignment));

        /** Calling survdiff from the survival library (much faster than surv_test from coin) */
        re.eval("library(survival)");
        re.eval("res21 <- survdiff(Surv(time1,censor1)~group1,rho=0)");
        REXP x = re.eval("res21$chisq");
        double testStatistic = x.asDouble();
        x = re.eval("pchisq(res21$chisq, df=1, lower.tail = FALSE)");
        double pValue = x.asDouble();
        System.out.println("Results from R:");
        System.out.println("StatScore: " + testStatistic + "  pValue: " + pValue);

        // Plot the per-group curves and the curve of the whole population into
        // one JPEG file named after the solution.
        re.eval("timestrata1.surv <- survfit( Surv(time1, censor1)~ strata(group1), conf.type=\"log-log\")");
        re.eval("timestrata1.surv1 <- survfit( Surv(time1, censor1)~ 1, conf.type=\"none\")");
        String evalStr = "jpeg('SurvivalPlot-" + this.SolutionID + ".jpg')";
        re.eval(evalStr);
        re.eval("plot(timestrata1.surv, col=c(2,3), xlab=\"Time\", ylab=\"Survival Probability\")");
        re.eval("par(new=T)");
        re.eval("plot(timestrata1.surv1,col=1)");
        re.eval("legend(0.2, c(\"Group1\",\"Group2\",\"Whole\"))");
        re.eval("dev.off()");

        System.out.println("Results from my code: ");
        LogRankTest lrt = new LogRankTest(time1, time2, censor1, censor2);
        double[] results = lrt.logRank();
        System.out.println("Statistics: " + results[0] + " variance: " + results[1] + " pValue: " + results[2]);

    } catch (Exception e) {
        // The try block covers loading, clustering and the R calls, so the
        // message must not blame the data file alone.
        System.err.println("Failed to generate the survival graph for solution " + this.SolutionID + ".");
        e.printStackTrace();
        System.exit(1);
    }
}

/**
 * Builds an R statement that assigns an integer vector to a variable, in the
 * form {@code name <- c(v0,v1,...)} with no spaces between the elements.
 *
 * @param name   the R variable name to assign to
 * @param values the elements of the vector, in order
 * @return the R assignment statement
 */
private static String buildRIntVector(String name, int[] values) {
    StringBuilder command = new StringBuilder(name).append(" <- c(");
    for (int i = 0; i < values.length; ++i) {
        if (i > 0) {
            command.append(',');
        }
        command.append(values[i]);
    }
    return command.append(')').toString();
}