com.act.utils.parser.GenbankInterpreter.java Source code

Java tutorial

Introduction

Here is the source code for com.act.utils.parser.GenbankInterpreter.java

Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.utils.parser;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.biojava.nbio.core.sequence.DNASequence;
import org.biojava.nbio.core.sequence.ProteinSequence;
import org.biojava.nbio.core.sequence.compound.AmbiguityDNACompoundSet;
import org.biojava.nbio.core.sequence.features.AbstractFeature;
import org.biojava.nbio.core.sequence.features.DBReferenceInfo;
import org.biojava.nbio.core.sequence.features.FeatureInterface;
import org.biojava.nbio.core.sequence.features.Qualifier;
import org.biojava.nbio.core.sequence.io.DNASequenceCreator;
import org.biojava.nbio.core.sequence.io.GenbankReader;
import org.biojava.nbio.core.sequence.io.GenbankReaderHelper;
import org.biojava.nbio.core.sequence.io.GenericGenbankHeaderParser;
import org.biojava.nbio.core.sequence.template.AbstractSequence;
import org.biojava.nbio.core.sequence.template.Compound;

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.zip.GZIPInputStream;

public class GenbankInterpreter {
    private static final Logger LOGGER = LogManager.getFormatterLogger(GenbankInterpreter.class);
    private static final String OPTION_GENBANK_PATH = "p";
    private static final String OPTION_SEQ_TYPE = "s";
    private static final String PROTEIN = "Protein";
    private static final String DNA = "DNA";

    public static final String HELP_MESSAGE = StringUtils.join(new String[] {
            "This class parses Genbank Protein sequence files. It can be used on the command line with ",
            "a file path as a parameter." }, "");

    public static final List<Option.Builder> OPTION_BUILDERS = new ArrayList<Option.Builder>() {
        {
            add(Option.builder(OPTION_GENBANK_PATH).argName("genbank file")
                    .desc("genbank dna or protein sequence file containing sequence and annotations").hasArg()
                    .longOpt("genbank").required());
            add(Option.builder(OPTION_SEQ_TYPE).argName("sequence type")
                    .desc("declares whether the sequence type is DNA or Protein").hasArg().longOpt("sequence")
                    .required());
            add(Option.builder("h").argName("help").desc("Example of usage: -p filepath.gb -s DNA")
                    .longOpt("help"));
        }
    };

    public static final HelpFormatter HELP_FORMATTER = new HelpFormatter();

    static {
        HELP_FORMATTER.setWidth(100);
    }

    private File protFile;
    private String seq_type;
    private ArrayList<AbstractSequence> sequences = new ArrayList<>();

    /**
     * Parses every sequence object from the Genbank File
     * @throws Exception
     */
    public void init() throws Exception {
        if (seq_type.equals(PROTEIN)) {
            Map<String, ProteinSequence> sequences;

            if (protFile.getName().endsWith(".gz")) {
                try (InputStream is = new GZIPInputStream(new FileInputStream(protFile))) {
                    sequences = GenbankReaderHelper.readGenbankProteinSequence(is);
                }
            } else {
                sequences = GenbankReaderHelper.readGenbankProteinSequence(protFile);
            }

            for (AbstractSequence sequence : sequences.values()) {
                this.sequences.add(sequence);
            }

        } else if (seq_type.equals(DNA)) {
            Map<String, DNASequence> sequences;

            if (protFile.getName().endsWith(".gz")) {
                try (InputStream is = new GZIPInputStream(new FileInputStream(protFile))) {
                    /* the AmbiguityDNACompoundSet is necessary due to the presence of ambiguous nucleotide (non-ATCG) compounds
                    in the parsed DNA sequences */
                    GenbankReader genbankReader = new GenbankReader(is, new GenericGenbankHeaderParser<>(),
                            new DNASequenceCreator(AmbiguityDNACompoundSet.getDNACompoundSet()));

                    sequences = genbankReader.process();
                }
            } else {
                /* the AmbiguityDNACompoundSet is necessary due to the presence of ambiguous nucleotide (non-ATCG) compounds
                  in the parsed DNA sequences */
                GenbankReader genbankReader = new GenbankReader(protFile, new GenericGenbankHeaderParser<>(),
                        new DNASequenceCreator(AmbiguityDNACompoundSet.getDNACompoundSet()));

                sequences = genbankReader.process();
            }

            for (AbstractSequence sequence : sequences.values()) {
                this.sequences.add(sequence);
            }

        } else {
            String msg = "No proper sequence type given; must be either DNA or Protein";
            LOGGER.error(msg);
            throw new RuntimeException(msg);
        }
    }

    /**
     * Checks if sequence object has been initialized, throws RuntimeException if not
     */
    private void checkInit() {
        if (sequences == null) {
            String msg = "Class hasn't been appropriately initialized, no sequence object";
            LOGGER.error(msg);
            throw new RuntimeException(msg);
        }
    }

    public GenbankInterpreter(File GenbankFile, String type) {
        protFile = GenbankFile;
        seq_type = type;
    }

    /**
     * Prints the genetic sequences extracted from the sequence objects
     */
    public void printSequences() {
        checkInit();

        for (AbstractSequence sequence : this.sequences) {
            System.out.println("Sequence:");
            System.out.println(sequence.getSequenceAsString());
            System.out.println("\n");
        }
    }

    /**
     * Extracts the genetic sequence as a string from the sequence objects
     * @return A list of genetic sequences as strings
     */
    public ArrayList<String> getSequenceStrings() {
        checkInit();

        ArrayList<String> sequences = new ArrayList<>();

        for (AbstractSequence sequence : this.sequences) {
            sequences.add(sequence.getSequenceAsString());
        }

        return sequences;
    }

    /**
     * Prints all the Features and corresponding Qualifiers for each sequence object
     */
    public void printFeaturesAndQualifiers() {
        checkInit();

        for (AbstractSequence sequence : sequences) {
            List<FeatureInterface<AbstractSequence<Compound>, Compound>> features = sequence.getFeatures();

            for (FeatureInterface<AbstractSequence<Compound>, Compound> feature : features) {
                System.out.println("Type: " + feature.getType() + "; Source: " + feature.getSource() + "\n");

                Map<String, List<Qualifier>> qualifiers = feature.getQualifiers();

                for (List<Qualifier> qual_list : qualifiers.values()) {
                    for (Qualifier qual : qual_list) {
                        if (qual.getName().equals("dbxref")) {
                            System.out.println("/" + qual.getName() + "=\"" + ((DBReferenceInfo) qual).getDatabase()
                                    + ":" + ((DBReferenceInfo) qual).getId() + "\" |");
                        } else {
                            System.out.println("/" + qual.getName() + "=\"" + qual.getValue() + "\" |");
                        }
                    }
                }

                System.out.println("=======================\n");
            }
        }
    }

    /**
     * Extracts feature types from the sequence object
     * @return list of all feature types in the Genbank file
     */
    public ArrayList<ArrayList<String>> getFeatures() {
        checkInit();

        ArrayList<ArrayList<String>> all_feature_types = new ArrayList<>();

        for (AbstractSequence sequence : sequences) {
            ArrayList<String> feature_types = new ArrayList<String>();

            List<FeatureInterface<AbstractSequence<Compound>, Compound>> features = sequence.getFeatures();

            for (FeatureInterface<AbstractSequence<Compound>, Compound> feature : features) {
                feature_types.add(feature.getType());
            }

            all_feature_types.add(feature_types);
        }

        return all_feature_types;
    }

    /**
     * Extracts qualifiers for a particular feature in the sequence object
     * @param sequence_index the index of the sequence object of interest in the sequences list
     * @param feature_type i.e. "source", "gene", "CDS", etc
     * @param feature_source i.e. "1..678"
     * @return Map of the corresponding qualifiers with the key being the Qualifier name (i.e. organism, mol_type, etc)
     * and the value being the list of Qualifiers that have the same name as the key
     */
    public Map<String, List<Qualifier>> getQualifiers(int sequence_index, String feature_type,
            String feature_source) {
        checkInit();

        List<FeatureInterface<AbstractSequence<Compound>, Compound>> features = sequences.get(sequence_index)
                .getFeatures();

        for (FeatureInterface<AbstractSequence<Compound>, Compound> feature : features) {
            if (feature.getType().equals(feature_type) && feature.getSource().equals(feature_source)) {
                return feature.getQualifiers();
            }
        }

        return null;
    }

    /**
     * Adds a Qualifier to a particular Feature i.e. /organism="Escherichia Coli"
     * @param feature the feature object you'd like to add the qualifier to
     * @param qual_name e.g. "organism"
     * @param qual_value e.g. "Escherichia Coli"
     */
    public void addQualifier(AbstractFeature<AbstractSequence<Compound>, Compound> feature, String qual_name,
            String qual_value) {
        feature.addQualifier(qual_name, new Qualifier(qual_name, qual_value));
    }

    /**
     * Constructs a Feature with a particular type (i.e. gene) and source (i.e. 1..678)
     * @param type e.g. "gene", "source", "CDS"
     * @param source e,g. "1..678"
     * @return the constructed Feature object
     */
    public AbstractFeature<AbstractSequence<Compound>, Compound> constructFeature(String type, String source) {
        return new AbstractFeature<AbstractSequence<Compound>, Compound>(type, source) {
        };
    }

    /**
     * prints the description string for each sequence
     */
    public void printDescription() {
        for (AbstractSequence sequence : sequences) {
            System.out.println(sequence.getDescription());
        }
    }

    /**
     * prints the Accession ID for each sequence
     */
    public void printAccessionID() {
        for (AbstractSequence sequence : sequences) {
            System.out.println(sequence.getAccession().getID());
        }
    }

    public void printHeader() {
        for (AbstractSequence sequence : sequences) {
            System.out.println(sequence.getOriginalHeader());
        }
    }

    public List<AbstractSequence> getSequences() {
        return this.sequences;
    }

    /**
     * Once the Feature has been constructed and all the qualifiers have been added, this method adds the feature to
     * a specific sequence
     * @param bioStart the start index of the source of the feature
     * @param bioEnd the end index of the source of the feature
     * @param feature the feature object to be added
     * @param sequence_index the index of the sequence of interest in the sequences list
     */
    public void addFeature(int bioStart, int bioEnd, AbstractFeature<AbstractSequence<Compound>, Compound> feature,
            int sequence_index) {
        checkInit();
        sequences.get(sequence_index).addFeature(bioStart, bioEnd, feature);
    }

    public static void main(String[] args) throws Exception {
        Options opts = new Options();
        for (Option.Builder b : OPTION_BUILDERS) {
            opts.addOption(b.build());
        }

        CommandLine cl = null;
        try {
            CommandLineParser parser = new DefaultParser();
            cl = parser.parse(opts, args);
        } catch (ParseException e) {
            LOGGER.error("Argument parsing failed: %s", e.getMessage());
            HELP_FORMATTER.printHelp(GenbankInterpreter.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
            System.exit(1);
        }

        if (cl.hasOption("help")) {
            HELP_FORMATTER.printHelp(GenbankInterpreter.class.getCanonicalName(), HELP_MESSAGE, opts, null, true);
            System.exit(1);
        }

        File genbankFile = new File(cl.getOptionValue(OPTION_GENBANK_PATH));
        String seq_type = cl.getOptionValue(OPTION_SEQ_TYPE);

        if (!genbankFile.exists()) {
            String msg = "Genbank file path is null";
            LOGGER.error(msg);
            throw new RuntimeException(msg);
        } else {
            GenbankInterpreter reader = new GenbankInterpreter(genbankFile, seq_type);
            reader.init();
            reader.printSequences();
        }
    }
}