Java tutorial
/* * Copyright (C) 2014 roelen * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package nl.bioinf.roelen.thema11.query_handlers; import java.util.ArrayList; import java.util.HashMap; import nl.bioinf.roelen.thema11.classifier_tools.BoundaryClassifier; import nl.bioinf.roelen.thema11.classifier_tools.ClassifiedNucleotide; import nl.bioinf.roelen.thema11.classifier_tools.ClassifierTester; import nl.bioinf.roelen.thema11.classifier_tools.ClassifierUser; import nl.bioinf.roelen.thema11.classifier_tools.ConsistencyChecker; import nl.bioinf.roelen.thema11.file_readers.GenBankReader; import nl.bioinf.roelen.thema11.file_readers.GenbankResult; import nl.bioinf.roelen.thema11.file_writers.ArffWriter; import nl.bioinf.roelen.thema11.file_writers.XmlWriter; import nl.bioinf.roelen.thema11.sequence_tools.sequence_objects.Gene; import nl.bioinf.roelen.thema11.sequence_tools.sequence_objects.MRNA; import nl.bioinf.roelen.thema11.sequence_tools.sequence_testers.IntronExonBoundaryTesterResult; import nl.bioinf.roelen.thema11.sequence_tools.sequence_testers.TestGenes; import nl.bioinf.roelen.thema11.sequence_tools.sequence_testers.TestMRNA; import weka.classifiers.Classifier; import weka.classifiers.Evaluation; /** * query handler which is sent requests from CLI or GUI * @author roelen */ public class QueryHandler { HashMap<String, ArrayList<IntronExonBoundaryTesterResult>> AllMrnaTestResults; HashMap<String, ArrayList<IntronExonBoundaryTesterResult>> AllGeneTestResultsNoFalseNegatives; ArrayList<HashMap<String, ArrayList<IntronExonBoundaryTesterResult>>> mrnaTestSets; ArrayList<HashMap<String, ArrayList<IntronExonBoundaryTesterResult>>> geneTestSets; int jump; String locationOfTrainingData = ""; Classifier classifier; public enum TestOrTraining { TRAINING, TEST } public QueryHandler() { AllMrnaTestResults = new HashMap<>(); AllGeneTestResultsNoFalseNegatives = new HashMap(); mrnaTestSets = new ArrayList<>(); geneTestSets = new ArrayList<>(); jump = 500; } public void setJump(int givenJump) { jump = givenJump; } /** * read a genbank file to get a positive or negative dataset * @param files genbank files to read * @param testOrTraining if the set is a training or test set */ public void readGenbanks(ArrayList<String> files, TestOrTraining testOrTraining) { HashMap<String, ArrayList<IntronExonBoundaryTesterResult>> groupedMrnaTestResults = new HashMap<>(); HashMap<String, ArrayList<IntronExonBoundaryTesterResult>> groupedGeneTestResultsNoFalseNegatives = new HashMap<>(); for (String fileName : files) { GenBankReader gbr = new GenBankReader(); gbr.readFile(fileName); GenbankResult gbresult = gbr.getResult(); ArrayList<Gene> genes = gbresult.getGenes(); ArrayList<MRNA> mrnas = gbresult.getMrnas(); //Get the mRNA results HashMap<String, ArrayList<IntronExonBoundaryTesterResult>> mrnaTestResults; mrnaTestResults = TestMRNA.testForIntronExonBoundaries(mrnas, genes); //Get the gene results HashMap<String, ArrayList<IntronExonBoundaryTesterResult>> geneTestResults; geneTestResults = TestGenes.testForIntronExonBoundaries(genes, jump); //check for consistency HashMap<String, ArrayList<IntronExonBoundaryTesterResult>> geneTestResultsNoFalseNegatives; geneTestResultsNoFalseNegatives = ConsistencyChecker.removePostivesFromNegativeSet(geneTestResults, mrnas); //add the reads to the final file groupedMrnaTestResults.putAll(mrnaTestResults); groupedGeneTestResultsNoFalseNegatives.putAll(geneTestResultsNoFalseNegatives); } //see it this was training or test data switch (testOrTraining) { case TRAINING: //add the reads to the final file AllMrnaTestResults.putAll(groupedMrnaTestResults); AllGeneTestResultsNoFalseNegatives.putAll(groupedGeneTestResultsNoFalseNegatives); break; case TEST: //add them to the lists of test sets mrnaTestSets.add(groupedMrnaTestResults); geneTestSets.add(groupedGeneTestResultsNoFalseNegatives); break; } } /** * write to an ARRF file * @param fileLocation the complete path to write the ARRF file to * @param testOrTraining * @param which */ public void toArrf(String fileLocation, TestOrTraining testOrTraining, int which) { switch (testOrTraining) { case TRAINING: //if no positive or negative dataset were defined, no ARRF file can be written if (AllMrnaTestResults.isEmpty() || AllGeneTestResultsNoFalseNegatives.isEmpty()) { System.out.println("cannot write an empty ARRF file!"); } else { ArffWriter.write(fileLocation, AllMrnaTestResults, AllGeneTestResultsNoFalseNegatives); } case TEST: //if there are not test sets, we can't write any ARRF files if (geneTestSets.isEmpty() || mrnaTestSets.isEmpty()) { System.out.println("not test sets have been defined"); } else { try { ArffWriter.write(fileLocation, geneTestSets.get(which), mrnaTestSets.get(which)); } catch (IndexOutOfBoundsException ex) { System.out.println("test set doesn't exist" + ex.getMessage()); } } } } /** * build a classifier from an ARRF file * @param aarfLocation the location of the ARRF file to write a classifier from * @param method which method to use for building the classifier */ public void buildClassifier(String aarfLocation, String method) { locationOfTrainingData = aarfLocation; classifier = BoundaryClassifier.build(aarfLocation, method); } /** * test classifier on the training data by using 10-folds * @param fileLocation lccation of the training set Arrf file * @param classifier the classifier to use * @return String representation of how well it performed */ public String testTenFold(String fileLocation, Classifier classifier) { Evaluation eval = ClassifierTester.testTenFold(fileLocation, classifier); return eval.toSummaryString(); } /** * test classifier on the training data by a test set * @param fileLocation lccation of the training set Arrf file * @param classifier the classifier to use * @return String representation of how well it performed */ public String testTestSet(String fileLocation, Classifier classifier) { Evaluation eval = ClassifierTester.testTenFold(fileLocation, classifier); return eval.toSummaryString(); } /** * use a created classifier to classify real data * @param fileLocationInput the location of the file containing the sequences to classify * @param fileLocationOutput the location of the file where to output the results * @param minimunLikelyhoodPositivePercentage the minimum likelyhood the boundary is an actual boundary * @param maximumLikelyhoodNegativePercentage the maximu likelyhood the boundary is not an actual boundary * @param classifier the classifier to classify the real data * @return a String representation of the nucleotides and the possible boundaries */ public String useClassifier(String fileLocationInput, String fileLocationOutput, double minimunLikelyhoodPositivePercentage, double maximumLikelyhoodNegativePercentage, Classifier classifier) { double maximumLikelyhoodNegative = 0; double minimunLikelyhoodPositive = 0; //have to check we don't devide by zero if (minimunLikelyhoodPositivePercentage > 0) { minimunLikelyhoodPositive = minimunLikelyhoodPositivePercentage / 100; } if (maximumLikelyhoodNegativePercentage > 0) { maximumLikelyhoodNegative = maximumLikelyhoodNegativePercentage / 100; } String result = ""; //classify the nucleotides ArrayList<ClassifiedNucleotide> classifiedNucleotides = ClassifierUser .getPossibleBoundaries(fileLocationInput, classifier); ArrayList<ClassifiedNucleotide> bestClassifiedNucleotides = new ArrayList<>(); //get only the nucleotides that have have the minumum and maximum likelyhoods for (ClassifiedNucleotide nuc : classifiedNucleotides) { if (nuc.getLikelyhoodBoundary() >= minimunLikelyhoodPositive && nuc.getLikelyhoodNotBoundary() <= maximumLikelyhoodNegative) { bestClassifiedNucleotides.add(nuc); } } //write the results to an XML XmlWriter.writeClassifiedNucleotides(fileLocationOutput, bestClassifiedNucleotides); //show the user how many hits were found StringBuilder strBui = new StringBuilder(); strBui.append("Found "); strBui.append(bestClassifiedNucleotides.size()); strBui.append(" possible boundaries"); result = strBui.toString(); return result; } /** * get the built classifier * @return the built classifier, returns null if there is no classifier */ public Classifier getClassifier() { return classifier; } /** * returns the arff that was used to create the classifier * @return */ public String getTrainingArffLocation() { return locationOfTrainingData; } /** * return the mRNA test sets currently in memory * @return the mRNA test sets currently in memory */ public ArrayList<HashMap<String, ArrayList<IntronExonBoundaryTesterResult>>> getMrnaTestSets() { return mrnaTestSets; } /** * return the gene test sets currently in memory * @return the gene test sets currently in memory */ public ArrayList<HashMap<String, ArrayList<IntronExonBoundaryTesterResult>>> getGeneTestSets() { return geneTestSets; } /** * remove a test set * @param index the index of the test set */ public void removeTestSet(int index) { mrnaTestSets.remove(index); geneTestSets.remove(index); } /** * utility method to convert a FASTA file to an arff file * @param fastaLocation the location of the fasta to convert * @param arffLocation the location of the arff to write */ public static void fastaToArff(String fastaLocation, String arffLocation) { ArrayList<Gene> genes = ClassifierUser.readFasta(fastaLocation); //get the test data HashMap<String, ArrayList<IntronExonBoundaryTesterResult>> geneTestResults; geneTestResults = TestGenes.testForIntronExonBoundaries(genes, 1); ArffWriter.write(arffLocation, geneTestResults, null); } }