List of usage examples for weka.classifiers.bayes NaiveBayesUpdateable buildClassifier
@Override public void buildClassifier(Instances instances) throws Exception
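For orientation, here is a minimal, self-contained sketch of the usual call pattern (it is not taken from any of the projects below): buildClassifier is given only the dataset structure, and the instances are then streamed in one at a time via updateClassifier. The ARFF path is a placeholder.

import java.io.File;
import weka.classifiers.bayes.NaiveBayesUpdateable;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ArffLoader;

public class IncrementalNBExample {
  public static void main(String[] args) throws Exception {
    // load only the header (attribute definitions) first
    ArffLoader loader = new ArffLoader();
    loader.setFile(new File("data.arff"));        // placeholder path
    Instances structure = loader.getStructure();
    structure.setClassIndex(structure.numAttributes() - 1);

    // buildClassifier only needs the structure; data is streamed in afterwards
    NaiveBayesUpdateable nb = new NaiveBayesUpdateable();
    nb.buildClassifier(structure);

    Instance current;
    while ((current = loader.getNextInstance(structure)) != null) {
      nb.updateClassifier(current);               // incremental update, one instance at a time
    }
    System.out.println(nb);                       // print the learned model
  }
}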
From source file:gr.demokritos.iit.cpgislanddetection.analysis.VectorSequenceDetector.java
License:Apache License
public VectorSequenceDetector(List<BaseSequence> sequences, List<String> labels)
    throws FileNotFoundException, IOException, Exception {
  // For every sequence: get its vector from the analyzer, attach the label,
  // and update the classifier incrementally.

  // load data
  ArffLoader loader = new ArffLoader();
  loader.setFile(new File("/Desktop/filesForWeka/2o_peirama/dataForWeka.arff"));
  Instances structure = loader.getStructure();

  // set the class attribute
  structure.setClassIndex(structure.numAttributes() - 1);

  // train NaiveBayes incrementally
  NaiveBayesUpdateable nb = new NaiveBayesUpdateable();
  nb.buildClassifier(structure);
  Instance current;
  while ((current = loader.getNextInstance(structure)) != null)
    nb.updateClassifier(current);
}
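The constructor above trains the model but never queries it. As a hedged follow-up sketch (not part of the original source), the trained classifier could be used like this, assuming nb is kept in scope (e.g. stored in a field) and query is an Instance built against the same attribute header:

// Hedged sketch: scoring a new Instance with the trained model.
double predictedIndex = nb.classifyInstance(query);     // index of the predicted class value
double[] dist = nb.distributionForInstance(query);      // per-class probability estimates
String label = query.dataset().classAttribute().value((int) predictedIndex);
System.out.println("Predicted: " + label + ", distribution: " + java.util.Arrays.toString(dist));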
From source file:gr.demokritos.iit.cpgislanddetection.CpGIslandDetection.java
License:Apache License
/**
 * @param args the command line arguments
 */
public static void main(String[] args) throws IOException, ParseException, Exception {
  // String sFileNameArgs = args[0];
  // String[] fileNames = null;

  // Read file
  //IGenomicSequenceFileReader reader = new SequenceListFileReader();
  // String seq = "GCTCTTGACTTTCAGACTTCCTGAAAACAACGTTCTGGTAAGGACAAGGGTT";
  //
  // CpGIslandIdentification iClass = new CpGIslandIdentification();
  // boolean b = iClass.identify(seq);
  // System.out.println("This sequence is a CpG island: " + b);

  // SequenceListFileReader s = new SequenceListFileReader();
  // ArrayList<BaseSequence> alRes = new ArrayList<>();
  //
  // alRes = s.getSequencesFromFile("C:\\Users\\Xenia\\Desktop\\files\\posSamples.txt");
  // for (int i = 0; i < alRes.size(); i++)
  //   System.out.println("alRes = " + i + alRes.get(i));

  // VectorAnalyzer vA = new VectorAnalyzer();
  // List<Vector<Integer>> listVector = new ArrayList<>();
  // Vector<Vector<Integer>> list =
  // listVector = vA.analyze(alRes);
  // for (int i = 0; i < listVector.size(); i++)
  //   System.out.println(i + " " + listVector.get(i));

  //IGenomicSequenceFileReader reader = new FASTAFileReader();

  // If no input file has been given
  /*
  if (args.length == 0) { // Use default
    fileNames[0] = "C:\\Users\\Xenia\\Desktop\\files\\posSamples.txt";
    fileNames[1] = "C:\\Users\\Xenia\\Desktop\\files\\negSamples.txt";
    fileNames[2] = "C:\\Users\\Xenia\\Desktop\\files\\newsamples.txt";
  } else { // else use the provided one
    fileNames = sFileNameArgs.split(";");
  }
  */

  //----------------- VECTOR ANALYSIS STARTS HERE --------------------------------------
  // read sequences from txt files
  SequenceListFileReader reader = new SequenceListFileReader();
  ArrayList<BaseSequence> lSeqs1 = new ArrayList<>();
  ArrayList<BaseSequence> lSeqs2 = new ArrayList<>();
  lSeqs1 = reader.getSequencesFromFile("C:\\Users\\Xenia\\Desktop\\files\\posSamples.txt");
  lSeqs2 = reader.getSequencesFromFile("C:\\Users\\Xenia\\Desktop\\files\\negSamples.txt");

  // create vectors for every sequence
  List<Vector<Integer>> listVectorForPositiveSamples = new ArrayList<>();
  List<Vector<Integer>> listVectorForNegativeSamples = new ArrayList<>();
  VectorAnalyzer v = new VectorAnalyzer();
  listVectorForPositiveSamples = v.analyze(lSeqs1);
  listVectorForNegativeSamples = v.analyze(lSeqs2);

  // create ARFF data for positive and negative samples
  FileCreatorARFF fc = new FileCreatorARFF();
  Instances positiveInstances = fc.createARFF(listVectorForPositiveSamples, "yes");
  Instances negativeInstances = fc.createARFF(listVectorForNegativeSamples, "no");
  //System.out.println(positiveInstances);

  // build and train classifier
  // set the class attribute
  positiveInstances.setClassIndex(positiveInstances.numAttributes() - 1);
  negativeInstances.setClassIndex(negativeInstances.numAttributes() - 1);

  // train NaiveBayes
  NaiveBayesUpdateable nb = new NaiveBayesUpdateable();
  nb.buildClassifier(positiveInstances);
  nb.buildClassifier(negativeInstances);
  Instance current;
  for (int i = 0; i < positiveInstances.numInstances(); i++) {
    current = positiveInstances.instance(i);
    nb.updateClassifier(current);
  }

  // Test the model
  Evaluation eTest = new Evaluation(positiveInstances);
  Instances isTestingSet = fc.createARFF(listVectorForNegativeSamples, "?");
  isTestingSet.setClassIndex(isTestingSet.numAttributes() - 1);
  eTest.evaluateModel(nb, isTestingSet);
  //------------------ VECTOR ANALYSIS ENDS HERE ---------------------------------------

  //---------------------------- HMM CLASSIFIER STARTS HERE ----------------------------------
  // Init classifier
  /*
  ISequenceClassifier<List<ObservationDiscrete<HmmSequence.Packet>>> classifier = new HmmClassifier();
  */

  // WARNING: Remember to change when you have normal data!!!
  // Obfuscation in negative training file?
  // final boolean bObfuscateNeg = true;
  // FASTAObfuscatorReader r = new FASTAObfuscatorReader();

  // for each file do the same work: train
  // for (int i = 0; i < 3; i++) {
  // Read the sequences
  // If obfuscation is on and we are dealing with the negative training file
  /*
  if ((i == 2) && (bObfuscateNeg)) {
    //FASTAObfuscatorReader r = new FASTAObfuscatorReader();
    lSeqs = r.getSequencesFromFile(fileNames[i]);
    fileNames[1] = "Not" + fileNames[1]; // Update to indicate different class
  } else // else read normally
    lSeqs = reader.getSequencesFromFile(fileNames[i]);
  System.out.println("lSeqs size=" + lSeqs.size());
  */

  // Create HMM sequences
  /*
  ISequenceAnalyst<List<ObservationDiscrete<HmmSequence.Packet>>> analyst = new HmmAnalyzer();
  List<List<ObservationDiscrete<HmmSequence.Packet>>> lHmmSeqs = analyst.analyze(lSeqs);
  // Train classifier with the observations
  classifier.train(lHmmSeqs, new File(fileNames[i]).getName());
  }

  // Classify the test file
  // First: Read the sequences
  lSeqs = r.getSequencesFromFile(fileNames[2]);
  //System.out.println("file name= " + fileNames[2]);
  // Then: Create HMM sequences
  ISequenceAnalyst<List<ObservationDiscrete<HmmSequence.Packet>>> analyst = new HmmAnalyzer();
  List<List<ObservationDiscrete<HmmSequence.Packet>>> lHmmSeqs = analyst.analyze(lSeqs);
  */
  //------------------------------- HMM CLASSIFIER ENDS HERE -----------------------------------------

  /*
  //---------------------------- HMM EVALUATION STARTS -----------------------------------------------
  //System.out.println("size of lHmmSeqs=" + lHmmSeqs.size());
  String str = null;
  String[] savedResults = new String[lHmmSeqs.size()];
  // create a 2x2 array to store successes and failures for each class
  int[][] matrix = new int[2][2];
  int successForCpG = 0, failForCpG = 0, successForNotCpG = 0, failForNotCpG = 0;

  // Init identifier
  // CpGIslandIdentification identifier = new CpGIslandIdentification();
  CpGIslandIdentification identifier = new CpGIslandIdentificationByList("CpG_hg18.fa");

  for (int i = 0; i < lHmmSeqs.size(); i++) {
    // DEBUG
    System.err.print(".");
    if (i % 10 == 0)
      System.err.println();
    ////////
    str = classifier.classify(lHmmSeqs.get(i));
    // System.out.println("i=" + i);
    System.out.println("Determined class:" + str);
    // savedResults[i] = str;
    // call a function that checks whether the sequence satisfies the CpG criteria
    if (identifier.identify(lSeqs.get(i).getSymbolSequence()) && str.equals(fileNames[0])) {
      // Success for CpG class
      successForCpG++;
      System.out.println("successForCpG" + successForCpG);
    } else if (identifier.identify(lSeqs.get(i).getSymbolSequence()) && str.equals(fileNames[1])) {
      // fail for CpG class
      failForCpG++;
      System.out.println("failForCpG" + failForCpG);
    } else if (identifier.identify(lSeqs.get(i).getSymbolSequence()) == false && str.equals(fileNames[1])) {
      //System.out.println(i);
      // Success for Not CpG class
      successForNotCpG++;
      System.out.println("successForNotCpG" + successForNotCpG);
    } else if (identifier.identify(lSeqs.get(i).getSymbolSequence()) == false && str.equals(fileNames[0])) {
      // fail for Not CpG class
      failForNotCpG++;
      System.out.println("failForNotCpG" + failForNotCpG);
    }
  }

  // Evaluation: calculation of classification rate and accuracy
  double totalAccuracy = (successForNotCpG + successForCpG)
      / (successForCpG + failForCpG + failForNotCpG + successForNotCpG);
  // misclassification rate for CpG class
  double rate1 = (failForCpG + successForCpG) != 0 ? failForCpG / (failForCpG + successForCpG) : 0.0;
  // misclassification rate for Not CpG class
  double rate2 = (failForNotCpG + successForNotCpG) != 0 ? failForNotCpG / (failForNotCpG + successForNotCpG) : 0.0;
  System.out.println(totalAccuracy + " " + rate1 + " " + rate2);

  NGramGraphClassifier nGramGraphClassifier = new NGramGraphClassifier();
  List<List<DocumentNGramGraph>> representation;
  NGramGraphAnalyzer myAnalyst = new NGramGraphAnalyzer();
  representation = myAnalyst.analyze(lSeqs);
  for (int i = 0; i < representation.size(); i++)
    nGramGraphClassifier.classify(representation.get(i));
  */
}
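Note that the example above calls buildClassifier twice in a row; in Weka, each call to buildClassifier re-initializes the model, so only the second training set is actually learned before the updateClassifier loop. A hedged sketch (not from the original project) of pooling both sets into a single training set first, assuming positiveInstances and negativeInstances share the same header:

// Hedged sketch: pool both label sets into one training set before a single buildClassifier call.
Instances training = new Instances(positiveInstances);        // copy of the positive set
for (int i = 0; i < negativeInstances.numInstances(); i++) {
  training.add(negativeInstances.instance(i));                // append the negative examples
}
training.setClassIndex(training.numAttributes() - 1);

NaiveBayesUpdateable model = new NaiveBayesUpdateable();
model.buildClassifier(training);                              // one build over the pooled data
// later, new labelled instances can still be folded in:
// model.updateClassifier(someNewInstance);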
From source file:j48.NBTreeSplit.java
License:Open Source License
/**
 * Creates split on enumerated attribute.
 *
 * @exception Exception if something goes wrong
 */
private void handleEnumeratedAttribute(Instances trainInstances) throws Exception {

  m_c45S = new C45Split(m_attIndex, 2, m_sumOfWeights);
  m_c45S.buildClassifier(trainInstances);
  if (m_c45S.numSubsets() == 0) {
    return;
  }
  m_errors = 0;
  Instance instance;

  Instances[] trainingSets = new Instances[m_complexityIndex];
  for (int i = 0; i < m_complexityIndex; i++) {
    trainingSets[i] = new Instances(trainInstances, 0);
  }
  /* m_distribution = new Distribution(m_complexityIndex, trainInstances.numClasses()); */
  int subset;
  for (int i = 0; i < trainInstances.numInstances(); i++) {
    instance = trainInstances.instance(i);
    subset = m_c45S.whichSubset(instance);
    if (subset > -1) {
      trainingSets[subset].add((Instance) instance.copy());
    } else {
      double[] weights = m_c45S.weights(instance);
      for (int j = 0; j < m_complexityIndex; j++) {
        try {
          Instance temp = (Instance) instance.copy();
          if (weights.length == m_complexityIndex) {
            temp.setWeight(temp.weight() * weights[j]);
          } else {
            temp.setWeight(temp.weight() / m_complexityIndex);
          }
          trainingSets[j].add(temp);
        } catch (Exception ex) {
          ex.printStackTrace();
          System.err.println("*** " + m_complexityIndex);
          System.err.println(weights.length);
          System.exit(1);
        }
      }
    }
  }

  /*
  // compute weights (weights of instances per subset)
  m_weights = new double[m_complexityIndex];
  for (int i = 0; i < m_complexityIndex; i++) {
    m_weights[i] = trainingSets[i].sumOfWeights();
  }
  Utils.normalize(m_weights);
  */

  /*
  // Only Instances with known values are relevant.
  Enumeration enu = trainInstances.enumerateInstances();
  while (enu.hasMoreElements()) {
    instance = (Instance) enu.nextElement();
    if (!instance.isMissing(m_attIndex)) {
      // m_distribution.add((int) instance.value(m_attIndex), instance);
      trainingSets[(int) instances.value(m_attIndex)].add(instance);
    } else {
      // add these to the error count
      m_errors += instance.weight();
    }
  }
  */

  Random r = new Random(1);
  int minNumCount = 0;
  for (int i = 0; i < m_complexityIndex; i++) {
    if (trainingSets[i].numInstances() >= 5) {
      minNumCount++;
      // Discretize the sets
      Discretize disc = new Discretize();
      disc.setInputFormat(trainingSets[i]);
      trainingSets[i] = Filter.useFilter(trainingSets[i], disc);

      trainingSets[i].randomize(r);
      trainingSets[i].stratify(5);
      NaiveBayesUpdateable fullModel = new NaiveBayesUpdateable();
      fullModel.buildClassifier(trainingSets[i]);

      // add the errors for this branch of the split
      m_errors += NBTreeNoSplit.crossValidate(fullModel, trainingSets[i], r);
    } else {
      // if fewer than min obj then just count them as errors
      for (int j = 0; j < trainingSets[i].numInstances(); j++) {
        m_errors += trainingSets[i].instance(j).weight();
      }
    }
  }

  // Check if there are at least five instances in at least two of the subsets.
  if (minNumCount > 1) {
    m_numSubsets = m_complexityIndex;
  }
}
From source file:j48.NBTreeSplit.java
License:Open Source License
/**
 * Creates split on numeric attribute.
 *
 * @exception Exception if something goes wrong
 */
private void handleNumericAttribute(Instances trainInstances) throws Exception {

  m_c45S = new C45Split(m_attIndex, 2, m_sumOfWeights);
  m_c45S.buildClassifier(trainInstances);
  if (m_c45S.numSubsets() == 0) {
    return;
  }
  m_errors = 0;

  Instances[] trainingSets = new Instances[m_complexityIndex];
  trainingSets[0] = new Instances(trainInstances, 0);
  trainingSets[1] = new Instances(trainInstances, 0);
  int subset = -1;

  // populate the subsets
  for (int i = 0; i < trainInstances.numInstances(); i++) {
    Instance instance = trainInstances.instance(i);
    subset = m_c45S.whichSubset(instance);
    if (subset != -1) {
      trainingSets[subset].add((Instance) instance.copy());
    } else {
      double[] weights = m_c45S.weights(instance);
      for (int j = 0; j < m_complexityIndex; j++) {
        Instance temp = (Instance) instance.copy();
        if (weights.length == m_complexityIndex) {
          temp.setWeight(temp.weight() * weights[j]);
        } else {
          temp.setWeight(temp.weight() / m_complexityIndex);
        }
        trainingSets[j].add(temp);
      }
    }
  }

  /*
  // compute weights (weights of instances per subset)
  m_weights = new double[m_complexityIndex];
  for (int i = 0; i < m_complexityIndex; i++) {
    m_weights[i] = trainingSets[i].sumOfWeights();
  }
  Utils.normalize(m_weights);
  */

  Random r = new Random(1);
  int minNumCount = 0;
  for (int i = 0; i < m_complexityIndex; i++) {
    if (trainingSets[i].numInstances() > 5) {
      minNumCount++;
      // Discretize the sets
      Discretize disc = new Discretize();
      disc.setInputFormat(trainingSets[i]);
      trainingSets[i] = Filter.useFilter(trainingSets[i], disc);

      trainingSets[i].randomize(r);
      trainingSets[i].stratify(5);
      NaiveBayesUpdateable fullModel = new NaiveBayesUpdateable();
      fullModel.buildClassifier(trainingSets[i]);

      // add the errors for this branch of the split
      m_errors += NBTreeNoSplit.crossValidate(fullModel, trainingSets[i], r);
    } else {
      for (int j = 0; j < trainingSets[i].numInstances(); j++) {
        m_errors += trainingSets[i].instance(j).weight();
      }
    }
  }

  // Check if there is a minimum number of Instances in at least two subsets.
  if (minNumCount > 1) {
    m_numSubsets = m_complexityIndex;
  }
}