Java tutorial
/* * Copyright (C) 2014 roelen * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package nl.bioinf.roelen.thema11.classifier_tools; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.Map; import java.util.logging.Level; import java.util.logging.Logger; import nl.bioinf.roelen.thema11.file_readers.FastaReader; import nl.bioinf.roelen.thema11.file_readers.GenBankReader; import nl.bioinf.roelen.thema11.file_readers.GenbankResult; import nl.bioinf.roelen.thema11.file_writers.ArffWriter; import nl.bioinf.roelen.thema11.sequence_tools.sequence_modders.SequenceReplacer; import nl.bioinf.roelen.thema11.sequence_tools.sequence_objects.FastaSequence; import nl.bioinf.roelen.thema11.sequence_tools.sequence_objects.Gene; import nl.bioinf.roelen.thema11.sequence_tools.sequence_testers.IntronExonBoundaryTesterResult; import nl.bioinf.roelen.thema11.sequence_tools.sequence_testers.TestGenes; import weka.classifiers.Classifier; import weka.core.Instance; import weka.core.Instances; import weka.core.converters.ConverterUtils; /** * * @author roelen */ public class ClassifierUser { /** * use the classifier to test the sequences in a genbank or fasta file for boundaries * @param fileLocation the location of the genbank of fasta file * @param classifier the classifier to use * @return */ public static ArrayList<ClassifiedNucleotide> getPossibleBoundaries(String fileLocation, Classifier classifier) { ArrayList<Gene> genesFromFile = new ArrayList<>(); ArrayList<ClassifiedNucleotide> classifiedNucleotides = new ArrayList<>(); //read from fasta if (fileLocation.toUpperCase().endsWith(".FASTA") || fileLocation.toUpperCase().endsWith(".FA") || fileLocation.toUpperCase().endsWith(".FAN")) { genesFromFile.addAll(readFasta(fileLocation)); } //read from genbank else if (fileLocation.toUpperCase().endsWith(".GENBANK") || fileLocation.toUpperCase().endsWith(".GB")) { GenBankReader gbr = new GenBankReader(); gbr.readFile(fileLocation); GenbankResult gbresult = gbr.getResult(); genesFromFile = gbresult.getGenes(); } //get the test data HashMap<String, ArrayList<IntronExonBoundaryTesterResult>> geneTestResults; geneTestResults = TestGenes.testForIntronExonBoundaries(genesFromFile, 1); ArrayList<InstanceToClassify> instanceNucs = new ArrayList<>(); try { //write our results to a temporary file File tempArrf = File.createTempFile("realSet", ".arff"); ArffWriter.write(tempArrf.getAbsolutePath(), geneTestResults, null); //get data ConverterUtils.DataSource source = new ConverterUtils.DataSource(tempArrf.getAbsolutePath()); //SET DATA AND OPTIONS Instances data = source.getDataSet(); for (int i = 0; i < data.numInstances(); i++) { Instance in = data.instance(i); //get the name of the gene or sequence tested String nameOfInstance = in.stringValue(in.numAttributes() - 3); //get the tested position int testedPosition = (int) in.value(in.numAttributes() - 2); //set the class as missing, because we want to find it in.setMissing((in.numAttributes() - 1)); Instance instanceNoExtras = new Instance(in); //delete the name and position, they are irrelevant for classifying instanceNoExtras.deleteAttributeAt(instanceNoExtras.numAttributes() - 2); instanceNoExtras.deleteAttributeAt(instanceNoExtras.numAttributes() - 2); InstanceToClassify ic = new InstanceToClassify(instanceNoExtras, testedPosition, nameOfInstance); instanceNucs.add(ic); } for (InstanceToClassify ic : instanceNucs) { Instance in = ic.getInstance(); in.setDataset(data); data.setClassIndex(data.numAttributes() - 1); //classify our instance classifier.classifyInstance(in); //save the likelyhood something is part of something double likelyhoodBoundary = classifier.distributionForInstance(in)[0]; double likelyhoodNotBoundary = classifier.distributionForInstance(in)[1]; //create a classified nucleotide and give it the added data ClassifiedNucleotide cn = new ClassifiedNucleotide(likelyhoodBoundary, likelyhoodNotBoundary, ic.getName(), ic.getPosition()); classifiedNucleotides.add(cn); } } catch (IOException ex) { Logger.getLogger(ClassifierUser.class.getName()).log(Level.SEVERE, null, ex); } catch (Exception ex) { Logger.getLogger(ClassifierUser.class.getName()).log(Level.SEVERE, null, ex); } return classifiedNucleotides; } /** * method to read fasta * @param fileLocation the location to read fasta files from * @return an ArrayList of Gene objects */ public static ArrayList<Gene> readFasta(String fileLocation) { ArrayList<Gene> genesFromFile = new ArrayList<>(); //use the fastareader object FastaReader fr = new FastaReader(); fr.readFile(fileLocation); HashMap<Integer, FastaSequence> sequencesFromFasta = fr.getResult(); //check all entries for (Map.Entry<Integer, FastaSequence> entry : sequencesFromFasta.entrySet()) { String geneID = entry.getValue().getHeader().concat("_".concat(String.valueOf(entry.getKey()))); int stopPos = entry.getValue().getSequence().length() - 1; String definition = entry.getValue().getHeader(); String organism = entry.getValue().getHeader(); //create a gene and add it Gene g = new Gene(geneID, 0, stopPos, organism, definition, false); g.setSequenceStraight(entry.getValue().getSequence()); genesFromFile.add(g); //take the previously entered gene, and add the reverse complement version Gene g2 = new Gene(geneID.concat("_R"), 0, stopPos, organism, definition, true); g2.setSequenceStraight(SequenceReplacer.reverseComplementSequence(entry.getValue().getSequence())); genesFromFile.add(g2); } return genesFromFile; } }