List of usage examples for weka.classifiers.bayes NaiveBayesUpdateable buildClassifier
@Override public void buildClassifier(Instances instances) throws Exception
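For orientation, here is a minimal, self-contained sketch of the usual call pattern (it is not taken from any of the projects below): buildClassifier is given only the dataset structure, and the instances are then streamed in one at a time via updateClassifier. The ARFF path is a placeholder.

import java.io.File;
import weka.classifiers.bayes.NaiveBayesUpdateable;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ArffLoader;

public class IncrementalNBExample {
  public static void main(String[] args) throws Exception {
    // load only the header (attribute definitions) first
    ArffLoader loader = new ArffLoader();
    loader.setFile(new File("data.arff"));        // placeholder path
    Instances structure = loader.getStructure();
    structure.setClassIndex(structure.numAttributes() - 1);

    // buildClassifier only needs the structure; data is streamed in afterwards
    NaiveBayesUpdateable nb = new NaiveBayesUpdateable();
    nb.buildClassifier(structure);

    Instance current;
    while ((current = loader.getNextInstance(structure)) != null) {
      nb.updateClassifier(current);               // incremental update, one instance at a time
    }
    System.out.println(nb);                       // print the learned model
  }
}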
From source file:gr.demokritos.iit.cpgislanddetection.analysis.VectorSequenceDetector.java
License:Apache License
public VectorSequenceDetector(List<BaseSequence> sequences, List<String> labels)
    throws FileNotFoundException, IOException, Exception {
  // For every sequence: get its vector from the analyzer, attach the label,
  // and update the classifier incrementally.

  // load data
  ArffLoader loader = new ArffLoader();
  loader.setFile(new File("/Desktop/filesForWeka/2o_peirama/dataForWeka.arff"));
  Instances structure = loader.getStructure();

  // set the class attribute
  structure.setClassIndex(structure.numAttributes() - 1);

  // train NaiveBayes incrementally
  NaiveBayesUpdateable nb = new NaiveBayesUpdateable();
  nb.buildClassifier(structure);
  Instance current;
  while ((current = loader.getNextInstance(structure)) != null)
    nb.updateClassifier(current);
}
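The constructor above trains the model but never queries it. As a hedged follow-up sketch (not part of the original source), the trained classifier could be used like this, assuming nb is kept in scope (e.g. stored in a field) and query is an Instance built against the same attribute header:

// Hedged sketch: scoring a new Instance with the trained model.
double predictedIndex = nb.classifyInstance(query);     // index of the predicted class value
double[] dist = nb.distributionForInstance(query);      // per-class probability estimates
String label = query.dataset().classAttribute().value((int) predictedIndex);
System.out.println("Predicted: " + label + ", distribution: " + java.util.Arrays.toString(dist));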
From source file:gr.demokritos.iit.cpgislanddetection.CpGIslandDetection.java
License:Apache License
/**
 * @param args the command line arguments
 */
public static void main(String[] args) throws IOException, ParseException, Exception {
  // String sFileNameArgs = args[0];
  // String[] fileNames = null;

  // Read file
  //IGenomicSequenceFileReader reader = new SequenceListFileReader();
  // String seq = "GCTCTTGACTTTCAGACTTCCTGAAAACAACGTTCTGGTAAGGACAAGGGTT";
  //
  // CpGIslandIdentification iClass = new CpGIslandIdentification();
  // boolean b = iClass.identify(seq);
  // System.out.println("This sequence is a CpG island: " + b);

  // SequenceListFileReader s = new SequenceListFileReader();
  // ArrayList<BaseSequence> alRes = new ArrayList<>();
  //
  // alRes = s.getSequencesFromFile("C:\\Users\\Xenia\\Desktop\\files\\posSamples.txt");
  // for (int i = 0; i < alRes.size(); i++)
  //   System.out.println("alRes = " + i + alRes.get(i));

  // VectorAnalyzer vA = new VectorAnalyzer();
  // List<Vector<Integer>> listVector = new ArrayList<>();
  // Vector<Vector<Integer>> list =
  // listVector = vA.analyze(alRes);
  // for (int i = 0; i < listVector.size(); i++)
  //   System.out.println(i + " " + listVector.get(i));

  //IGenomicSequenceFileReader reader = new FASTAFileReader();

  // If no input file has been given
  /*
  if (args.length == 0) { // Use default
    fileNames[0] = "C:\\Users\\Xenia\\Desktop\\files\\posSamples.txt";
    fileNames[1] = "C:\\Users\\Xenia\\Desktop\\files\\negSamples.txt";
    fileNames[2] = "C:\\Users\\Xenia\\Desktop\\files\\newsamples.txt";
  } else { // else use the provided one
    fileNames = sFileNameArgs.split(";");
  }
  */

  //----------------- VECTOR ANALYSIS STARTS HERE --------------------------------------
  // read sequences from txt files
  SequenceListFileReader reader = new SequenceListFileReader();
  ArrayList<BaseSequence> lSeqs1 = new ArrayList<>();
  ArrayList<BaseSequence> lSeqs2 = new ArrayList<>();
  lSeqs1 = reader.getSequencesFromFile("C:\\Users\\Xenia\\Desktop\\files\\posSamples.txt");
  lSeqs2 = reader.getSequencesFromFile("C:\\Users\\Xenia\\Desktop\\files\\negSamples.txt");

  // create vectors for every sequence
  List<Vector<Integer>> listVectorForPositiveSamples = new ArrayList<>();
  List<Vector<Integer>> listVectorForNegativeSamples = new ArrayList<>();
  VectorAnalyzer v = new VectorAnalyzer();
  listVectorForPositiveSamples = v.analyze(lSeqs1);
  listVectorForNegativeSamples = v.analyze(lSeqs2);

  // create ARFF data for positive and negative samples
  FileCreatorARFF fc = new FileCreatorARFF();
  Instances positiveInstances = fc.createARFF(listVectorForPositiveSamples, "yes");
  Instances negativeInstances = fc.createARFF(listVectorForNegativeSamples, "no");
  //System.out.println(positiveInstances);

  // build and train classifier
  // set the class attribute
  positiveInstances.setClassIndex(positiveInstances.numAttributes() - 1);
  negativeInstances.setClassIndex(negativeInstances.numAttributes() - 1);

  // train NaiveBayes
  NaiveBayesUpdateable nb = new NaiveBayesUpdateable();
  nb.buildClassifier(positiveInstances);
  nb.buildClassifier(negativeInstances);
  Instance current;
  for (int i = 0; i < positiveInstances.numInstances(); i++) {
    current = positiveInstances.instance(i);
    nb.updateClassifier(current);
  }

  // Test the model
  Evaluation eTest = new Evaluation(positiveInstances);
  Instances isTestingSet = fc.createARFF(listVectorForNegativeSamples, "?");
  isTestingSet.setClassIndex(isTestingSet.numAttributes() - 1);
  eTest.evaluateModel(nb, isTestingSet);
  //------------------ VECTOR ANALYSIS ENDS HERE ---------------------------------------

  //---------------------------- HMM CLASSIFIER STARTS HERE ----------------------------------
  // Init classifier
  /*
  ISequenceClassifier<List<ObservationDiscrete<HmmSequence.Packet>>> classifier = new HmmClassifier();
  */

  // WARNING: Remember to change when you have normal data!!!
  // Obfuscation in negative training file?
  // final boolean bObfuscateNeg = true;
  // FASTAObfuscatorReader r = new FASTAObfuscatorReader();

  // for each file do the same work: train
  // for (int i = 0; i < 3; i++) {
  // Read the sequences
  // If obfuscation is on and we are dealing with the negative training file
  /*
  if ((i == 2) && (bObfuscateNeg)) {
    //FASTAObfuscatorReader r = new FASTAObfuscatorReader();
    lSeqs = r.getSequencesFromFile(fileNames[i]);
    fileNames[1] = "Not" + fileNames[1]; // Update to indicate different class
  } else // else read normally
    lSeqs = reader.getSequencesFromFile(fileNames[i]);
  System.out.println("lSeqs size=" + lSeqs.size());
  */

  // Create HMM sequences
  /*
  ISequenceAnalyst<List<ObservationDiscrete<HmmSequence.Packet>>> analyst = new HmmAnalyzer();
  List<List<ObservationDiscrete<HmmSequence.Packet>>> lHmmSeqs = analyst.analyze(lSeqs);
  // Train classifier with the observations
  classifier.train(lHmmSeqs, new File(fileNames[i]).getName());
  }

  // Classify the test file
  // First: Read the sequences
  lSeqs = r.getSequencesFromFile(fileNames[2]);
  //System.out.println("file name= " + fileNames[2]);
  // Then: Create HMM sequences
  ISequenceAnalyst<List<ObservationDiscrete<HmmSequence.Packet>>> analyst = new HmmAnalyzer();
  List<List<ObservationDiscrete<HmmSequence.Packet>>> lHmmSeqs = analyst.analyze(lSeqs);
  */
  //------------------------------- HMM CLASSIFIER ENDS HERE -----------------------------------------

  /*
  //---------------------------- HMM EVALUATION STARTS -----------------------------------------------
  //System.out.println("size of lHmmSeqs=" + lHmmSeqs.size());
  String str = null;
  String[] savedResults = new String[lHmmSeqs.size()];
  // create a 2x2 array to store successes and failures for each class
  int[][] matrix = new int[2][2];
  int successForCpG = 0, failForCpG = 0, successForNotCpG = 0, failForNotCpG = 0;

  // Init identifier
  // CpGIslandIdentification identifier = new CpGIslandIdentification();
  CpGIslandIdentification identifier = new CpGIslandIdentificationByList("CpG_hg18.fa");

  for (int i = 0; i < lHmmSeqs.size(); i++) {
    // DEBUG
    System.err.print(".");
    if (i % 10 == 0)
      System.err.println();
    ////////
    str = classifier.classify(lHmmSeqs.get(i));
    // System.out.println("i=" + i);
    System.out.println("Determined class:" + str);
    // savedResults[i] = str;
    // call a function that checks whether the sequence satisfies the CpG criteria
    if (identifier.identify(lSeqs.get(i).getSymbolSequence()) && str.equals(fileNames[0])) {
      // Success for CpG class
      successForCpG++;
      System.out.println("successForCpG" + successForCpG);
    } else if (identifier.identify(lSeqs.get(i).getSymbolSequence()) && str.equals(fileNames[1])) {
      // fail for CpG class
      failForCpG++;
      System.out.println("failForCpG" + failForCpG);
    } else if (identifier.identify(lSeqs.get(i).getSymbolSequence()) == false && str.equals(fileNames[1])) {
      //System.out.println(i);
      // Success for Not CpG class
      successForNotCpG++;
      System.out.println("successForNotCpG" + successForNotCpG);
    } else if (identifier.identify(lSeqs.get(i).getSymbolSequence()) == false && str.equals(fileNames[0])) {
      // fail for Not CpG class
      failForNotCpG++;
      System.out.println("failForNotCpG" + failForNotCpG);
    }
  }

  // Evaluation: calculation of classification rate and accuracy
  double totalAccuracy = (successForNotCpG + successForCpG)
      / (successForCpG + failForCpG + failForNotCpG + successForNotCpG);
  // misclassification rate for CpG class
  double rate1 = (failForCpG + successForCpG) != 0 ? failForCpG / (failForCpG + successForCpG) : 0.0;
  // misclassification rate for Not CpG class
  double rate2 = (failForNotCpG + successForNotCpG) != 0 ? failForNotCpG / (failForNotCpG + successForNotCpG) : 0.0;
  System.out.println(totalAccuracy + " " + rate1 + " " + rate2);

  NGramGraphClassifier nGramGraphClassifier = new NGramGraphClassifier();
  List<List<DocumentNGramGraph>> representation;
  NGramGraphAnalyzer myAnalyst = new NGramGraphAnalyzer();
  representation = myAnalyst.analyze(lSeqs);
  for (int i = 0; i < representation.size(); i++)
    nGramGraphClassifier.classify(representation.get(i));
  */
}
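Note that the example above calls buildClassifier twice in a row; in Weka, each call to buildClassifier re-initializes the model, so only the second training set is actually learned before the updateClassifier loop. A hedged sketch (not from the original project) of pooling both sets into a single training set first, assuming positiveInstances and negativeInstances share the same header:

// Hedged sketch: pool both label sets into one training set before a single buildClassifier call.
Instances training = new Instances(positiveInstances);        // copy of the positive set
for (int i = 0; i < negativeInstances.numInstances(); i++) {
  training.add(negativeInstances.instance(i));                // append the negative examples
}
training.setClassIndex(training.numAttributes() - 1);

NaiveBayesUpdateable model = new NaiveBayesUpdateable();
model.buildClassifier(training);                              // one build over the pooled data
// later, new labelled instances can still be folded in:
// model.updateClassifier(someNewInstance);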
From source file:j48.NBTreeSplit.java
License:Open Source License
/**
 * Creates split on enumerated attribute.
 *
 * @exception Exception if something goes wrong
 */
private void handleEnumeratedAttribute(Instances trainInstances) throws Exception {

  m_c45S = new C45Split(m_attIndex, 2, m_sumOfWeights);
  m_c45S.buildClassifier(trainInstances);
  if (m_c45S.numSubsets() == 0) {
    return;
  }
  m_errors = 0;
  Instance instance;

  Instances[] trainingSets = new Instances[m_complexityIndex];
  for (int i = 0; i < m_complexityIndex; i++) {
    trainingSets[i] = new Instances(trainInstances, 0);
  }
  /* m_distribution = new Distribution(m_complexityIndex, trainInstances.numClasses()); */
  int subset;
  for (int i = 0; i < trainInstances.numInstances(); i++) {
    instance = trainInstances.instance(i);
    subset = m_c45S.whichSubset(instance);
    if (subset > -1) {
      trainingSets[subset].add((Instance) instance.copy());
    } else {
      double[] weights = m_c45S.weights(instance);
      for (int j = 0; j < m_complexityIndex; j++) {
        try {
          Instance temp = (Instance) instance.copy();
          if (weights.length == m_complexityIndex) {
            temp.setWeight(temp.weight() * weights[j]);
          } else {
            temp.setWeight(temp.weight() / m_complexityIndex);
          }
          trainingSets[j].add(temp);
        } catch (Exception ex) {
          ex.printStackTrace();
          System.err.println("*** " + m_complexityIndex);
          System.err.println(weights.length);
          System.exit(1);
        }
      }
    }
  }

  /*
  // compute weights (weights of instances per subset)
  m_weights = new double[m_complexityIndex];
  for (int i = 0; i < m_complexityIndex; i++) {
    m_weights[i] = trainingSets[i].sumOfWeights();
  }
  Utils.normalize(m_weights);
  */

  /*
  // Only Instances with known values are relevant.
  Enumeration enu = trainInstances.enumerateInstances();
  while (enu.hasMoreElements()) {
    instance = (Instance) enu.nextElement();
    if (!instance.isMissing(m_attIndex)) {
      // m_distribution.add((int) instance.value(m_attIndex), instance);
      trainingSets[(int) instances.value(m_attIndex)].add(instance);
    } else {
      // add these to the error count
      m_errors += instance.weight();
    }
  }
  */

  Random r = new Random(1);
  int minNumCount = 0;
  for (int i = 0; i < m_complexityIndex; i++) {
    if (trainingSets[i].numInstances() >= 5) {
      minNumCount++;
      // Discretize the sets
      Discretize disc = new Discretize();
      disc.setInputFormat(trainingSets[i]);
      trainingSets[i] = Filter.useFilter(trainingSets[i], disc);

      trainingSets[i].randomize(r);
      trainingSets[i].stratify(5);
      NaiveBayesUpdateable fullModel = new NaiveBayesUpdateable();
      fullModel.buildClassifier(trainingSets[i]);

      // add the errors for this branch of the split
      m_errors += NBTreeNoSplit.crossValidate(fullModel, trainingSets[i], r);
    } else {
      // if fewer than min obj then just count them as errors
      for (int j = 0; j < trainingSets[i].numInstances(); j++) {
        m_errors += trainingSets[i].instance(j).weight();
      }
    }
  }

  // Check if there are at least five instances in at least two of the subsets.
  if (minNumCount > 1) {
    m_numSubsets = m_complexityIndex;
  }
}
From source file:j48.NBTreeSplit.java
License:Open Source License
/**
 * Creates split on numeric attribute.
 *
 * @exception Exception if something goes wrong
 */
private void handleNumericAttribute(Instances trainInstances) throws Exception {

  m_c45S = new C45Split(m_attIndex, 2, m_sumOfWeights);
  m_c45S.buildClassifier(trainInstances);
  if (m_c45S.numSubsets() == 0) {
    return;
  }
  m_errors = 0;

  Instances[] trainingSets = new Instances[m_complexityIndex];
  trainingSets[0] = new Instances(trainInstances, 0);
  trainingSets[1] = new Instances(trainInstances, 0);
  int subset = -1;

  // populate the subsets
  for (int i = 0; i < trainInstances.numInstances(); i++) {
    Instance instance = trainInstances.instance(i);
    subset = m_c45S.whichSubset(instance);
    if (subset != -1) {
      trainingSets[subset].add((Instance) instance.copy());
    } else {
      double[] weights = m_c45S.weights(instance);
      for (int j = 0; j < m_complexityIndex; j++) {
        Instance temp = (Instance) instance.copy();
        if (weights.length == m_complexityIndex) {
          temp.setWeight(temp.weight() * weights[j]);
        } else {
          temp.setWeight(temp.weight() / m_complexityIndex);
        }
        trainingSets[j].add(temp);
      }
    }
  }

  /*
  // compute weights (weights of instances per subset)
  m_weights = new double[m_complexityIndex];
  for (int i = 0; i < m_complexityIndex; i++) {
    m_weights[i] = trainingSets[i].sumOfWeights();
  }
  Utils.normalize(m_weights);
  */

  Random r = new Random(1);
  int minNumCount = 0;
  for (int i = 0; i < m_complexityIndex; i++) {
    if (trainingSets[i].numInstances() > 5) {
      minNumCount++;
      // Discretize the sets
      Discretize disc = new Discretize();
      disc.setInputFormat(trainingSets[i]);
      trainingSets[i] = Filter.useFilter(trainingSets[i], disc);

      trainingSets[i].randomize(r);
      trainingSets[i].stratify(5);
      NaiveBayesUpdateable fullModel = new NaiveBayesUpdateable();
      fullModel.buildClassifier(trainingSets[i]);

      // add the errors for this branch of the split
      m_errors += NBTreeNoSplit.crossValidate(fullModel, trainingSets[i], r);
    } else {
      for (int j = 0; j < trainingSets[i].numInstances(); j++) {
        m_errors += trainingSets[i].instance(j).weight();
      }
    }
  }

  // Check if there is a minimum number of Instances in at least two subsets.
  if (minNumCount > 1) {
    m_numSubsets = m_complexityIndex;
  }
}