nl.bioinf.roelen.thema11.query_handlers.QueryHandler.java Source code

Java tutorial

Introduction

Here is the source code for nl.bioinf.roelen.thema11.query_handlers.QueryHandler.java

Source

/*
 * Copyright (C) 2014 roelen
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package nl.bioinf.roelen.thema11.query_handlers;

import java.util.ArrayList;
import java.util.HashMap;
import nl.bioinf.roelen.thema11.classifier_tools.BoundaryClassifier;
import nl.bioinf.roelen.thema11.classifier_tools.ClassifiedNucleotide;
import nl.bioinf.roelen.thema11.classifier_tools.ClassifierTester;
import nl.bioinf.roelen.thema11.classifier_tools.ClassifierUser;
import nl.bioinf.roelen.thema11.classifier_tools.ConsistencyChecker;
import nl.bioinf.roelen.thema11.file_readers.GenBankReader;
import nl.bioinf.roelen.thema11.file_readers.GenbankResult;
import nl.bioinf.roelen.thema11.file_writers.ArffWriter;
import nl.bioinf.roelen.thema11.file_writers.XmlWriter;
import nl.bioinf.roelen.thema11.sequence_tools.sequence_objects.Gene;
import nl.bioinf.roelen.thema11.sequence_tools.sequence_objects.MRNA;
import nl.bioinf.roelen.thema11.sequence_tools.sequence_testers.IntronExonBoundaryTesterResult;
import nl.bioinf.roelen.thema11.sequence_tools.sequence_testers.TestGenes;
import nl.bioinf.roelen.thema11.sequence_tools.sequence_testers.TestMRNA;
import weka.classifiers.Classifier;
import weka.classifiers.Evaluation;

/**
 * Query handler which is sent requests from the CLI or GUI.
 * <p>
 * It keeps the intron/exon boundary test results that were read from GenBank
 * files in memory (one accumulated training set and a list of separate test
 * sets), can write them to ARFF files, and builds, tests and applies a Weka
 * classifier.
 * @author roelen
 */
public class QueryHandler {
    /** accumulated mRNA test results of all training files read so far */
    HashMap<String, ArrayList<IntronExonBoundaryTesterResult>> AllMrnaTestResults;
    /** accumulated gene test results (after the consistency check) of all training files read so far */
    HashMap<String, ArrayList<IntronExonBoundaryTesterResult>> AllGeneTestResultsNoFalseNegatives;

    /** mRNA results of the test sets, one entry per readGenbanks call with TEST */
    ArrayList<HashMap<String, ArrayList<IntronExonBoundaryTesterResult>>> mrnaTestSets;
    /** gene results of the test sets, kept index-aligned with mrnaTestSets */
    ArrayList<HashMap<String, ArrayList<IntronExonBoundaryTesterResult>>> geneTestSets;

    /** value handed to TestGenes.testForIntronExonBoundaries; defaults to 500 */
    int jump;
    /** location of the ARFF file the current classifier was built from, empty if none was built */
    String locationOfTrainingData = "";
    /** the classifier built by buildClassifier, null until one has been built */
    Classifier classifier;

    /**
     * marks whether data that is read or written belongs to the training set
     * or to one of the test sets
     */
    public enum TestOrTraining {
        TRAINING, TEST
    }

    /**
     * create a handler without any data and with a jump of 500
     */
    public QueryHandler() {
        AllMrnaTestResults = new HashMap<>();
        AllGeneTestResultsNoFalseNegatives = new HashMap<>();
        mrnaTestSets = new ArrayList<>();
        geneTestSets = new ArrayList<>();
        jump = 500;
    }

    /**
     * set the jump that is used when testing genes in readGenbanks
     * @param givenJump the new jump value
     */
    public void setJump(int givenJump) {
        jump = givenJump;
    }

    /**
     * read genbank files to get a positive or negative dataset
     * <p>
     * The results of all given files are grouped together. For TRAINING they
     * are merged into the accumulated training data, for TEST they are stored
     * as one new test set.
     * @param files genbank files to read
     * @param testOrTraining if the set is a training or test set
     */
    public void readGenbanks(ArrayList<String> files, TestOrTraining testOrTraining) {
        HashMap<String, ArrayList<IntronExonBoundaryTesterResult>> groupedMrnaTestResults = new HashMap<>();
        HashMap<String, ArrayList<IntronExonBoundaryTesterResult>> groupedGeneTestResultsNoFalseNegatives = new HashMap<>();
        for (String fileName : files) {
            GenBankReader gbr = new GenBankReader();
            gbr.readFile(fileName);
            GenbankResult gbresult = gbr.getResult();

            ArrayList<Gene> genes = gbresult.getGenes();
            ArrayList<MRNA> mrnas = gbresult.getMrnas();

            //get the mRNA results
            HashMap<String, ArrayList<IntronExonBoundaryTesterResult>> mrnaTestResults;
            mrnaTestResults = TestMRNA.testForIntronExonBoundaries(mrnas, genes);

            //get the gene results
            HashMap<String, ArrayList<IntronExonBoundaryTesterResult>> geneTestResults;
            geneTestResults = TestGenes.testForIntronExonBoundaries(genes, jump);

            //check for consistency between the gene results and the mRNAs
            HashMap<String, ArrayList<IntronExonBoundaryTesterResult>> geneTestResultsNoFalseNegatives;
            geneTestResultsNoFalseNegatives = ConsistencyChecker.removePostivesFromNegativeSet(geneTestResults,
                    mrnas);

            //add the results of this file to the grouped results;
            //putAll replaces entries of earlier files that share a key
            groupedMrnaTestResults.putAll(mrnaTestResults);
            groupedGeneTestResultsNoFalseNegatives.putAll(geneTestResultsNoFalseNegatives);
        }
        //see if this was training or test data
        switch (testOrTraining) {
        case TRAINING:
            //merge the grouped results into the accumulated training data
            AllMrnaTestResults.putAll(groupedMrnaTestResults);
            AllGeneTestResultsNoFalseNegatives.putAll(groupedGeneTestResultsNoFalseNegatives);
            break;
        case TEST:
            //add them to the lists of test sets, both lists stay index-aligned
            mrnaTestSets.add(groupedMrnaTestResults);
            geneTestSets.add(groupedGeneTestResultsNoFalseNegatives);
            break;
        }
    }

    /**
     * write the training data or one of the test sets to an ARFF file
     * <p>
     * Nothing is written (a message is printed instead) when the requested
     * data is empty or the requested test set does not exist.
     * @param fileLocation the complete path to write the ARFF file to
     * @param testOrTraining whether to write the training data or a test set
     * @param which the index of the test set to write, only used for TEST
     */
    public void toArrf(String fileLocation, TestOrTraining testOrTraining, int which) {
        switch (testOrTraining) {
        case TRAINING:
            //if no positive or negative dataset were defined, no ARFF file can be written
            if (AllMrnaTestResults.isEmpty() || AllGeneTestResultsNoFalseNegatives.isEmpty()) {
                System.out.println("cannot write an empty ARRF file!");
            } else {
                ArffWriter.write(fileLocation, AllMrnaTestResults, AllGeneTestResultsNoFalseNegatives);
            }
            //without this break the TEST branch below would also run and
            //could overwrite the training file that was just written
            break;
        case TEST:
            //if there are no test sets, we can't write any ARFF files
            if (geneTestSets.isEmpty() || mrnaTestSets.isEmpty()) {
                System.out.println("not test sets have been defined");
            } else {
                try {
                    //NOTE(review): the argument order here is (gene, mRNA) while the
                    //TRAINING branch passes (mRNA, gene) - confirm against ArffWriter.write
                    ArffWriter.write(fileLocation, geneTestSets.get(which), mrnaTestSets.get(which));
                } catch (IndexOutOfBoundsException ex) {
                    System.out.println("test set doesn't exist" + ex.getMessage());
                }
            }
            break;
        }
    }

    /**
     * build a classifier from an ARFF file and remember both the classifier
     * and the location of the ARFF file it was built from
     * @param aarfLocation the location of the ARFF file to build a classifier from
     * @param method which method to use for building the classifier
     */
    public void buildClassifier(String aarfLocation, String method) {
        locationOfTrainingData = aarfLocation;
        classifier = BoundaryClassifier.build(aarfLocation, method);
    }

    /**
     * test classifier on the training data by using 10-folds
     * @param fileLocation location of the training set ARFF file
     * @param classifier the classifier to use
     * @return String representation of how well it performed
     */
    public String testTenFold(String fileLocation, Classifier classifier) {
        Evaluation eval = ClassifierTester.testTenFold(fileLocation, classifier);
        return eval.toSummaryString();
    }

    /**
     * test classifier by a test set
     * <p>
     * NOTE(review): this currently delegates to ClassifierTester.testTenFold,
     * exactly like {@link #testTenFold(String, Classifier)}; this looks like a
     * copy-paste slip - confirm whether ClassifierTester offers a dedicated
     * test-set evaluation that should be called here instead.
     * @param fileLocation location of the ARFF file to evaluate on
     * @param classifier the classifier to use
     * @return String representation of how well it performed
     */
    public String testTestSet(String fileLocation, Classifier classifier) {
        Evaluation eval = ClassifierTester.testTenFold(fileLocation, classifier);
        return eval.toSummaryString();
    }

    /**
     * use a created classifier to classify real data
     * <p>
     * Only nucleotides whose boundary likelihood is at least the given minimum
     * and whose non-boundary likelihood is at most the given maximum are
     * written to the output file.
     * @param fileLocationInput the location of the file containing the sequences to classify
     * @param fileLocationOutput the location of the file where to output the results
     * @param minimunLikelyhoodPositivePercentage the minimum likelihood (as a percentage) the boundary is an actual boundary
     * @param maximumLikelyhoodNegativePercentage the maximum likelihood (as a percentage) the boundary is not an actual boundary
     * @param classifier the classifier to classify the real data
     * @return a String stating how many possible boundaries were found
     */
    public String useClassifier(String fileLocationInput, String fileLocationOutput,
            double minimunLikelyhoodPositivePercentage, double maximumLikelyhoodNegativePercentage,
            Classifier classifier) {
        //convert the percentages to fractions; anything that is not a
        //positive percentage is treated as a threshold of 0
        double minimunLikelyhoodPositive = 0;
        double maximumLikelyhoodNegative = 0;
        if (minimunLikelyhoodPositivePercentage > 0) {
            minimunLikelyhoodPositive = minimunLikelyhoodPositivePercentage / 100;
        }
        if (maximumLikelyhoodNegativePercentage > 0) {
            maximumLikelyhoodNegative = maximumLikelyhoodNegativePercentage / 100;
        }
        //classify the nucleotides
        ArrayList<ClassifiedNucleotide> classifiedNucleotides = ClassifierUser
                .getPossibleBoundaries(fileLocationInput, classifier);
        ArrayList<ClassifiedNucleotide> bestClassifiedNucleotides = new ArrayList<>();
        //keep only the nucleotides that satisfy both likelihood thresholds
        for (ClassifiedNucleotide nuc : classifiedNucleotides) {
            if (nuc.getLikelyhoodBoundary() >= minimunLikelyhoodPositive
                    && nuc.getLikelyhoodNotBoundary() <= maximumLikelyhoodNegative) {
                bestClassifiedNucleotides.add(nuc);
            }
        }
        //write the results to an XML
        XmlWriter.writeClassifiedNucleotides(fileLocationOutput, bestClassifiedNucleotides);
        //show the user how many hits were found
        return "Found " + bestClassifiedNucleotides.size() + " possible boundaries";
    }

    /**
     * get the built classifier
     * @return the built classifier, returns null if there is no classifier
     */
    public Classifier getClassifier() {
        return classifier;
    }

    /**
     * returns the arff that was used to create the classifier
     * @return the location passed to the last buildClassifier call, an empty
     * String if no classifier has been built yet
     */
    public String getTrainingArffLocation() {
        return locationOfTrainingData;
    }

    /**
     * return the mRNA test sets currently in memory
     * @return the mRNA test sets currently in memory (the internal list, not a copy)
     */
    public ArrayList<HashMap<String, ArrayList<IntronExonBoundaryTesterResult>>> getMrnaTestSets() {
        return mrnaTestSets;
    }

    /**
     * return the gene test sets currently in memory
     * @return the gene test sets currently in memory (the internal list, not a copy)
     */
    public ArrayList<HashMap<String, ArrayList<IntronExonBoundaryTesterResult>>> getGeneTestSets() {
        return geneTestSets;
    }

    /**
     * remove a test set, both its mRNA and its gene part
     * @param index the index of the test set
     * @throws IndexOutOfBoundsException if no test set exists at the index
     */
    public void removeTestSet(int index) {
        mrnaTestSets.remove(index);
        geneTestSets.remove(index);
    }

    /**
     * utility method to convert a FASTA file to an arff file
     * <p>
     * The genes are tested with a jump of 1 and written without a second
     * dataset (null is passed to the writer).
     * @param fastaLocation the location of the fasta to convert
     * @param arffLocation the location of the arff to write
     */
    public static void fastaToArff(String fastaLocation, String arffLocation) {
        ArrayList<Gene> genes = ClassifierUser.readFasta(fastaLocation);
        //get the test data
        HashMap<String, ArrayList<IntronExonBoundaryTesterResult>> geneTestResults;
        geneTestResults = TestGenes.testForIntronExonBoundaries(genes, 1);
        ArffWriter.write(arffLocation, geneTestResults, null);
    }
}