nl.bioinf.roelen.thema11.classifier_tools.ClassifierUser.java Source code

Introduction

Here is the source code for nl.bioinf.roelen.thema11.classifier_tools.ClassifierUser.java
Source

/*
 * Copyright (C) 2014 roelen
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package nl.bioinf.roelen.thema11.classifier_tools;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import nl.bioinf.roelen.thema11.file_readers.FastaReader;
import nl.bioinf.roelen.thema11.file_readers.GenBankReader;
import nl.bioinf.roelen.thema11.file_readers.GenbankResult;
import nl.bioinf.roelen.thema11.file_writers.ArffWriter;
import nl.bioinf.roelen.thema11.sequence_tools.sequence_modders.SequenceReplacer;
import nl.bioinf.roelen.thema11.sequence_tools.sequence_objects.FastaSequence;
import nl.bioinf.roelen.thema11.sequence_tools.sequence_objects.Gene;
import nl.bioinf.roelen.thema11.sequence_tools.sequence_testers.IntronExonBoundaryTesterResult;
import nl.bioinf.roelen.thema11.sequence_tools.sequence_testers.TestGenes;
import weka.classifiers.Classifier;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ConverterUtils;

/**
 * Static helpers that run a trained Weka classifier over the sequences of a
 * FASTA or GenBank file in order to score candidate intron/exon boundaries.
 *
 * @author roelen
 */
public class ClassifierUser {
    /** Shared logger; avoids looking the logger up again on every failure. */
    private static final Logger LOGGER = Logger.getLogger(ClassifierUser.class.getName());

    /**
     * Use the classifier to test the sequences in a GenBank or FASTA file for boundaries.
     * <p>
     * The sequences are read, tested with
     * {@link TestGenes#testForIntronExonBoundaries}, written to a temporary ARFF file,
     * loaded back as Weka instances and then classified one by one. The temporary
     * file is removed again before this method returns.
     * <p>
     * Failures while writing, reading or classifying are logged and not rethrown;
     * whatever was classified up to that point is returned.
     *
     * @param fileLocation the location of the GenBank or FASTA file; the type is derived from
     *                     the (case-insensitive) extension: .fasta/.fa/.fan or .genbank/.gb
     * @param classifier the classifier to use
     * @return one {@link ClassifiedNucleotide} per classified instance; empty when the file type
     *         is not recognised, the file contains no sequences, or processing failed early
     */
    public static ArrayList<ClassifiedNucleotide> getPossibleBoundaries(String fileLocation,
            Classifier classifier) {
        ArrayList<ClassifiedNucleotide> classifiedNucleotides = new ArrayList<>();

        ArrayList<Gene> genesFromFile = loadGenes(fileLocation);
        if (genesFromFile == null || genesFromFile.isEmpty()) {
            //nothing to test: skip the ARFF round trip instead of silently processing an empty set
            LOGGER.log(Level.WARNING,
                    "No sequences could be read from {0}; nothing to classify", fileLocation);
            return classifiedNucleotides;
        }

        //get the test data
        HashMap<String, ArrayList<IntronExonBoundaryTesterResult>> geneTestResults
                = TestGenes.testForIntronExonBoundaries(genesFromFile, 1);

        File tempArff = null;
        try {
            //write our results to a temporary file so Weka can load them as a data set
            tempArff = File.createTempFile("realSet", ".arff");
            ArffWriter.write(tempArff.getAbsolutePath(), geneTestResults, null);
            ConverterUtils.DataSource source = new ConverterUtils.DataSource(tempArff.getAbsolutePath());
            Instances data = source.getDataSet();

            //attribute layout relied on below: [..., name, position, class]
            ArrayList<InstanceToClassify> instancesToClassify = new ArrayList<>();
            for (int i = 0; i < data.numInstances(); i++) {
                Instance in = data.instance(i);
                int attributeCount = in.numAttributes();
                //get the name of the gene or sequence tested
                String nameOfInstance = in.stringValue(attributeCount - 3);
                //get the tested position
                int testedPosition = (int) in.value(attributeCount - 2);
                //set the class as missing, because we want to find it
                in.setMissing(attributeCount - 1);

                Instance instanceNoExtras = new Instance(in);
                //delete the name and position, they are irrelevant for classifying;
                //after the first delete the position has moved into the same index
                instanceNoExtras.deleteAttributeAt(instanceNoExtras.numAttributes() - 2);
                instanceNoExtras.deleteAttributeAt(instanceNoExtras.numAttributes() - 2);
                instancesToClassify.add(
                        new InstanceToClassify(instanceNoExtras, testedPosition, nameOfInstance));
            }

            //the class index is the same for every instance, so set it once
            data.setClassIndex(data.numAttributes() - 1);
            for (InstanceToClassify ic : instancesToClassify) {
                Instance in = ic.getInstance();
                //NOTE(review): the instance has two attributes fewer than 'data' at this point;
                //confirm the classifier copes with that header mismatch
                in.setDataset(data);
                //compute the class distribution once: index 0 is read as "boundary",
                //index 1 as "not a boundary"
                double[] distribution = classifier.distributionForInstance(in);
                double likelihoodBoundary = distribution[0];
                double likelihoodNotBoundary = distribution[1];

                //create a classified nucleotide and give it the added data
                classifiedNucleotides.add(new ClassifiedNucleotide(likelihoodBoundary,
                        likelihoodNotBoundary, ic.getName(), ic.getPosition()));
            }
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, "Could not write or read the temporary ARFF file", ex);
        } catch (Exception ex) {
            LOGGER.log(Level.SEVERE, "Could not classify the sequences from " + fileLocation, ex);
        } finally {
            //do not leave the temporary file behind; fall back to removal at JVM exit
            //when it cannot be deleted right away
            if (tempArff != null && !tempArff.delete()) {
                tempArff.deleteOnExit();
            }
        }
        return classifiedNucleotides;
    }

    /**
     * Read the genes from a file, picking the reader from the file extension.
     *
     * @param fileLocation the location of the GenBank or FASTA file
     * @return the genes found in the file; an empty list when the extension is not recognised
     */
    private static ArrayList<Gene> loadGenes(String fileLocation) {
        String upperCaseLocation = fileLocation.toUpperCase();
        //read from fasta
        if (upperCaseLocation.endsWith(".FASTA") || upperCaseLocation.endsWith(".FA")
                || upperCaseLocation.endsWith(".FAN")) {
            return readFasta(fileLocation);
        }
        //read from genbank
        if (upperCaseLocation.endsWith(".GENBANK") || upperCaseLocation.endsWith(".GB")) {
            GenBankReader gbr = new GenBankReader();
            gbr.readFile(fileLocation);
            GenbankResult gbresult = gbr.getResult();
            return gbresult.getGenes();
        }
        return new ArrayList<>();
    }

    /**
     * Read a FASTA file and turn every sequence into two {@link Gene} objects: one holding the
     * sequence as read and one holding its reverse complement.
     * <p>
     * The gene id is the FASTA header followed by "_" and the key the reader stored the
     * sequence under; the reverse complement gets an extra "_R" suffix. The header is also used
     * as both the organism and the definition of the gene.
     *
     * @param fileLocation the location to read the FASTA file from
     * @return an ArrayList of Gene objects, two per sequence in the file
     */
    public static ArrayList<Gene> readFasta(String fileLocation) {
        ArrayList<Gene> genesFromFile = new ArrayList<>();
        //use the fastareader object
        FastaReader fr = new FastaReader();
        fr.readFile(fileLocation);
        HashMap<Integer, FastaSequence> sequencesFromFasta = fr.getResult();
        //check all entries
        for (Map.Entry<Integer, FastaSequence> entry : sequencesFromFasta.entrySet()) {
            FastaSequence fastaSequence = entry.getValue();
            String header = fastaSequence.getHeader();
            String sequence = fastaSequence.getSequence();

            String geneID = header.concat("_".concat(String.valueOf(entry.getKey())));
            //the gene spans the whole sequence: index 0 up to and including the last nucleotide
            int stopPos = sequence.length() - 1;
            String definition = header;
            String organism = header;

            //create a gene and add it
            Gene g = new Gene(geneID, 0, stopPos, organism, definition, false);
            g.setSequenceStraight(sequence);
            genesFromFile.add(g);

            //add the reverse complement version of the same sequence as a second gene
            //(last constructor argument presumably flags the reverse strand - verify in Gene)
            Gene g2 = new Gene(geneID.concat("_R"), 0, stopPos, organism, definition, true);
            g2.setSequenceStraight(SequenceReplacer.reverseComplementSequence(sequence));
            genesFromFile.add(g2);
        }
        return genesFromFile;
    }
}