com.act.utils.parser.GenbankInterpreterTest.java Source code

Java tutorial

Introduction

Here is the source code for com.act.utils.parser.GenbankInterpreterTest.java

Source

/*************************************************************************
*                                                                        *
*  This file is part of the 20n/act project.                             *
*  20n/act enables DNA prediction for synthetic biology/bioengineering.  *
*  Copyright (C) 2017 20n Labs, Inc.                                     *
*                                                                        *
*  Please direct all queries to act@20n.com.                             *
*                                                                        *
*  This program is free software: you can redistribute it and/or modify  *
*  it under the terms of the GNU General Public License as published by  *
*  the Free Software Foundation, either version 3 of the License, or     *
*  (at your option) any later version.                                   *
*                                                                        *
*  This program is distributed in the hope that it will be useful,       *
*  but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*  GNU General Public License for more details.                          *
*                                                                        *
*  You should have received a copy of the GNU General Public License     *
*  along with this program.  If not, see <http://www.gnu.org/licenses/>. *
*                                                                        *
*************************************************************************/

package com.act.utils.parser;

import org.apache.commons.lang3.tuple.Pair;
import org.biojava.nbio.core.sequence.features.AbstractFeature;
import org.biojava.nbio.core.sequence.features.DBReferenceInfo;
import org.biojava.nbio.core.sequence.features.Qualifier;
import org.biojava.nbio.core.sequence.template.AbstractSequence;
import org.biojava.nbio.core.sequence.template.Compound;
import org.junit.Before;
import org.junit.Test;

import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

public class GenbankInterpreterTest {
    /** Interpreter that {@code setUp()} builds from the {@code genbank_test_protein.gb} fixture. */
    protected GenbankInterpreter giProtein;
    /** Interpreter that {@code setUp()} builds from the {@code genbank_test_dna.gb} fixture. */
    protected GenbankInterpreter giDna;

    /*
      Reference sequence that testReadSequence compares against the first sequence string
      returned by the protein interpreter.

      http://www.ncbi.nlm.nih.gov/protein/CUB13083
      ACCESSION CUB13083
     */
    final String protein_seq = "MMTNLQKEFFKRLKIPAKEITFNDLDEILLKMGLTLPYENLDIMAGTIKDISKNNLVEKILIQKRGGLCYELNSLLYYFLMDCG"
            + "FQVYKVAGTVYDLYDNKWKPDDGHVIIVLTHNNKDYVIDAGFASHLPLHPVPFNGEVISSQTGEYRIRKRTTRKGTHILEMRKGANGES"
            + "TNFLQSEPSHEWKVGYAFTLDPIDEKKVNNIQKVIVEHKESPFNKGAITCKLTDYGHVSLTNKNYTETFKGTKNKRPIESKDYAHILRE"
            + "SFGITQVKYVGKTLERG";

    /*
      Reference sequence that testReadSequence compares against the first sequence string
      returned by the DNA interpreter.

      http://www.ncbi.nlm.nih.gov/nuccore/AB000097
      ACCESSION AB000097
     */
    final String dna_seq = "AAAAAACCTAGCTAAACGAAGAAAATCATTCCAATACACATGGCTTCCGAAAGGCAAGCGTTAATGCTTATTCTCTTAACAAC"
            + "ATTCTTCTTCACCATAAAGCCTTCACAGGCCAGTACTACTGGTGGCATAACAATCTACTGGGGCCAAAACATTGACGACGGCACCTTG"
            + "ACCTCCACATGCGACACTGGAAACTTCGAGATTGTCAACCTAGCTTTCCTCAATGCGTTTGGTTGCGGCATAACTCCATCATGGAACT"
            + "TCGCTGGCCACTGTGGGGACTGGAACCCTTGTTCCATACTAGAACCCCAAATACAATACTGCCAGCAGAAAGGTGTCAAAGTCTTCCT"
            + "TTCCCTCGGTGGTGCTAAAGGAACCTACTCCCTCTGCTCACCCGAGGACGCAAAAGAAGTTGCCAATTACCTTTATCAAAACTTCCTC"
            + "AGTGGCAAACCCGGTCCACTTGGAAGTGTAACATTGGAAGGCATCGATTTCGACATTGAACTTGGTTCCAACCTCTATTGGGGCGACC"
            + "TTGCCAAGGAACTAGATGCTCTCAGGCACCAAAACGACCACTACTTCTACTTGTCCGCAGCCCCACAATGTTTTATGCCTGATTACCA"
            + "CCTCGACAATGCCATCAAAACTGGTCTTTTCGATCATGTAAACGTTCAGTTCTACAATAACCCTCCATGCCAATACTCACCTGGCAAT"
            + "ACTCAATTGCTTTTTAATTCATGGGATGATTGGACTTCAAATGTTCTTCCCAATAACTCTGTTTTCTTTGGACTACCAGCATCTCCCG"
            + "ACGCTGCTCCAAGTGGTGGTTATATACCACCACAGGTGCTCATTTCTGAGGTGCTTCCCTATGTAAAGCAAGCTTCCAACTATGGAGG"
            + "AGTTATGCTGTGGGACAGGTACCATGATGTTTTAAATTATCACAGCGATCAGATAAAGGATTATGTTCCAAAATATGCAATGCGGTTT"
            + "GTGACCGCAGTTTCCGACGCTATTTATGAGAGTGTCTCTGCACGTACGCACCGCATCTTACAGAAGAAACCATATTAGAAATATGGGG"
            + "AGCCGTACGTGCAAACTATTTATCAGCTATCTATGCATGTGTCCGTCTCTGTAACGTTTGTATGGAAAAATGGAAATAAGTAACAAAT"
            + "TGTTATTAGTTGTTACCTTTGTGGCATCTACTCCAGCTTTGATTTCCTAGCTAGTTGTTATGTAATGTAACCAATATAATCGAAGCAT"
            + "GTTGAGAATAAAATACTCCCTACTT";

    /**
     * Builds and initialises the protein and DNA interpreters from their GenBank fixture files
     * before every test.
     *
     * @throws Exception if a fixture is missing from the classpath; anything thrown while
     *                   constructing or initialising an interpreter is propagated as well
     */
    @Before
    public void setUp() throws Exception {
        giProtein = new GenbankInterpreter(fixtureFile("genbank_test_protein.gb"), "Protein");
        giProtein.init();

        giDna = new GenbankInterpreter(fixtureFile("genbank_test_dna.gb"), "DNA");
        giDna.init();
    }

    /**
     * Resolves a fixture, looked up relative to this class on the classpath, to a {@link File}.
     *
     * @param resourceName name of the fixture resource
     * @return the fixture as a file
     * @throws Exception if the resource is absent or its URL cannot be converted to a URI
     */
    private File fixtureFile(String resourceName) throws Exception {
        java.net.URL location = getClass().getResource(resourceName);
        if (location == null) {
            // Fail with the resource name instead of an anonymous NullPointerException.
            throw new IllegalStateException("Test resource not found on classpath: " + resourceName);
        }
        // URL.getFile() keeps percent-escapes (e.g. %20 for a space), so a checkout directory with
        // spaces or non-ASCII characters would yield a non-existent path; File(URI) decodes them.
        return new File(location.toURI());
    }

    /**
     * Verifies that the first sequence string exposed by each interpreter equals the
     * corresponding reference sequence held by this class.
     */
    @Test
    public void testReadSequence() {
        final String failureMessage = "test whether parser extracts sequence accurately";

        // Protein is checked before DNA; a protein mismatch therefore aborts the test first.
        assertEquals(failureMessage, protein_seq, giProtein.getSequenceStrings().get(0));
        assertEquals(failureMessage, dna_seq, giDna.getSequenceStrings().get(0));
    }

    /**
     * Verifies that the features reported for the first sequence of each interpreter contain
     * every expected feature type.
     */
    @Test
    public void testReadFeatures() {
        final String failureMessage = "test whether parser extracts feature types accurately";

        List<String> expectedProteinTypes =
                Arrays.asList("source", "Protein", "Region", "CDS", "restriction_site");
        for (String expectedType : expectedProteinTypes) {
            assertTrue(failureMessage, giProtein.getFeatures().get(0).contains(expectedType));
        }

        List<String> expectedDnaTypes =
                Arrays.asList("source", "5'UTR", "CDS", "sig_peptide", "mat_peptide", "regulatory");
        for (String expectedType : expectedDnaTypes) {
            assertTrue(failureMessage, giDna.getFeatures().get(0).contains(expectedType));
        }
    }

    /**
     * Verifies the qualifiers read by both interpreters against the expected qualifier tables
     * built by {@code constructProteinFeatureMap()} and {@code constructDNAFeatureMap()}.
     */
    @Test
    public void testReadQualifiers() {
        // The expected tables are plain in-memory maps, so each one is built right where it is used.
        validateFeatureMap(constructProteinFeatureMap(), giProtein);
        validateFeatureMap(constructDNAFeatureMap(), giDna);
    }

    /**
     * Verifies, for the protein interpreter and then the DNA interpreter, that a newly
     * constructed feature carrying one qualifier can be added and subsequently read back.
     */
    @Test
    public void testWriteFeatureAndQualifier() {
        verifyWrittenFeatureIsReadable(giProtein);
        verifyWrittenFeatureIsReadable(giDna);
    }

    /**
     * Adds a {@code test_type}/{@code test_source} feature with a single
     * {@code test_name=test_value} qualifier to sequence 0 of the given interpreter and asserts
     * that the feature type, qualifier key, qualifier name and qualifier value are all retrievable.
     *
     * @param interpreter the interpreter to write to and read back from
     */
    private void verifyWrittenFeatureIsReadable(GenbankInterpreter interpreter) {
        final String type = "test_type";
        final String source = "test_source";
        final String qualifierName = "test_name";
        final String qualifierValue = "test_value";

        AbstractFeature<AbstractSequence<Compound>, Compound> feature =
                interpreter.constructFeature(type, source);
        interpreter.addQualifier(feature, qualifierName, qualifierValue);
        interpreter.addFeature(1, 687, feature, 0);

        assertTrue("tests whether the feature was correctly written to the sequence object",
                interpreter.getFeatures().get(0).contains(type));
        assertTrue("tests whether the qualifier map identifier was correctly written to the sequence object",
                interpreter.getQualifiers(0, type, source).containsKey(qualifierName));
        assertEquals("tests whether the qualifier name was correctly written to the sequence object",
                qualifierName,
                interpreter.getQualifiers(0, type, source).get(qualifierName).get(0).getName());
        assertEquals("tests whether the qualifier value was correctly written to the sequence object",
                qualifierValue,
                interpreter.getQualifiers(0, type, source).get(qualifierName).get(0).getValue());
    }

    /**
     * Builds the expected qualifiers for the protein fixture.
     *
     * @return a map keyed by (feature type, feature source) pairs whose values map each expected
     *         qualifier name to its expected value
     */
    private Map<Pair<String, String>, Map<String, String>> constructProteinFeatureMap() {
        Map<Pair<String, String>, Map<String, String>> expected = new HashMap<>();

        Map<String, String> sourceQualifiers = new HashMap<>();
        sourceQualifiers.put("organism", "Bacillus cereus");
        sourceQualifiers.put("isolate", "JRS1");
        sourceQualifiers.put("dbxref", "taxon:1396");
        expected.put(Pair.of("source", "1..279"), sourceQualifiers);

        Map<String, String> proteinQualifiers = new HashMap<>();
        proteinQualifiers.put("product", "Arylamine N-acetyltransferase");
        proteinQualifiers.put("EC_number", "2.3.1.5");
        expected.put(Pair.of("Protein", "1..279"), proteinQualifiers);

        Map<String, String> regionQualifiers = new HashMap<>();
        regionQualifiers.put("region_name", "Acetyltransf_2");
        regionQualifiers.put("note", "N-acetyltransferase; cl00949");
        regionQualifiers.put("dbxref", "CDD:260716");
        expected.put(Pair.of("Region", "25..266"), regionQualifiers);

        Map<String, String> cdsQualifiers = new HashMap<>();
        cdsQualifiers.put("gene", "nat_1");
        cdsQualifiers.put("locus_tag", "BN2127_JRS1_04775");
        cdsQualifiers.put("coded_by", "complement(CYHI01000402.1:9425..10264)");
        cdsQualifiers.put("inference", "ab initio prediction:Prodigal:2.60");
        cdsQualifiers.put("note", "*protein_id: CGR:BN2127_JRS1_04775");
        expected.put(Pair.of("CDS", "1..279"), cdsQualifiers);

        Map<String, String> restrictionSiteQualifiers = new HashMap<>();
        restrictionSiteQualifiers.put("gene", "test_gene");
        restrictionSiteQualifiers.put("note", "test_case");
        expected.put(Pair.of("restriction_site", "1..279"), restrictionSiteQualifiers);

        return expected;
    }

    /**
     * Builds the expected qualifiers for the DNA fixture.
     *
     * @return a map keyed by (feature type, feature source) pairs whose values map each expected
     *         qualifier name to its expected value; the 5'UTR and sig_peptide entries are empty
     */
    private Map<Pair<String, String>, Map<String, String>> constructDNAFeatureMap() {
        Map<Pair<String, String>, Map<String, String>> expected = new HashMap<>();

        Map<String, String> sourceQualifiers = new HashMap<>();
        sourceQualifiers.put("organism", "Glycine max");
        sourceQualifiers.put("mol_type", "mRNA");
        sourceQualifiers.put("cultivar", "Bonminori");
        sourceQualifiers.put("dbxref", "taxon:3847");
        expected.put(Pair.of("source", "1..1252"), sourceQualifiers);

        // No qualifiers are listed for the 5'UTR feature.
        Map<String, String> fivePrimeUtrQualifiers = new HashMap<>();
        expected.put(Pair.of("5'UTR", "<1..39"), fivePrimeUtrQualifiers);

        Map<String, String> cdsQualifiers = new HashMap<>();
        cdsQualifiers.put("EC_number", "3.2.1.14");
        cdsQualifiers.put("product", "class III acidic endochitinase");
        cdsQualifiers.put("protein_id", "BAA25015.1");
        cdsQualifiers.put("dbxref", "GI:2934696");
        cdsQualifiers.put("translation",
                "MASERQALMLILLTTFFFTIKPSQASTTGGITIYWGQNIDDGTL"
                        + "TSTCDTGNFEIVNLAFLNAFGCGITPSWNFAGHCGDWNPCSILEPQIQYCQQKGVKVF"
                        + "LSLGGAKGTYSLCSPEDAKEVANYLYQNFLSGKPGPLGSVTLEGIDFDIELGSNLYWG"
                        + "DLAKELDALRHQNDHYFYLSAAPQCFMPDYHLDNAIKTGLFDHVNVQFYNNPPCQYSP"
                        + "GNTQLLFNSWDDWTSNVLPNNSVFFGLPASPDAAPSGGYIPPQVLISEVLPYVKQASN"
                        + "YGGVMLWDRYHDVLNYHSDQIKDYVPKYAMRFVTAVSDAIYESVSARTHRILQKKPY");
        expected.put(Pair.of("CDS", "40..1041"), cdsQualifiers);

        // No qualifiers are listed for the sig_peptide feature.
        Map<String, String> sigPeptideQualifiers = new HashMap<>();
        expected.put(Pair.of("sig_peptide", "40..114"), sigPeptideQualifiers);

        Map<String, String> matPeptideQualifiers = new HashMap<>();
        matPeptideQualifiers.put("product", "unnamed");
        expected.put(Pair.of("mat_peptide", "115..1038"), matPeptideQualifiers);

        Map<String, String> regulatoryQualifiers = new HashMap<>();
        regulatoryQualifiers.put("regulatory_class", "polyA_signal_sequence");
        expected.put(Pair.of("regulatory", "1234..1239"), regulatoryQualifiers);

        return expected;
    }

    /**
     * Asserts that every qualifier the interpreter returns for each listed feature of sequence 0
     * has an expected name and the expected value.
     *
     * <p>Only qualifiers actually returned by the interpreter are checked; an expected qualifier
     * that the interpreter never returns is not detected here.
     *
     * @param feature_to_qualifiers expected qualifier name/value pairs keyed by
     *                              (feature type, feature source)
     * @param gi                    the interpreter whose qualifiers are validated
     */
    private void validateFeatureMap(Map<Pair<String, String>, Map<String, String>> feature_to_qualifiers,
            GenbankInterpreter gi) {
        for (Map.Entry<Pair<String, String>, Map<String, String>> expectedFeature
                : feature_to_qualifiers.entrySet()) {
            Pair<String, String> typeAndSource = expectedFeature.getKey();
            Map<String, String> expectedQualifiers = expectedFeature.getValue();

            for (List<Qualifier> parsedQualifiers
                    : gi.getQualifiers(0, typeAndSource.getLeft(), typeAndSource.getRight()).values()) {
                for (Qualifier parsed : parsedQualifiers) {
                    String name = parsed.getName();
                    assertTrue("testing whether the qualifier name extracted is accurate",
                            expectedQualifiers.containsKey(name));

                    if (name.equals("dbxref")) {
                        // The expected dbxref value is written as "database:id", so it is rebuilt
                        // from the two parts of the reference rather than read via getValue().
                        DBReferenceInfo reference = (DBReferenceInfo) parsed;
                        assertEquals("testing whether the extracted value of the db_xref qualifier is accurate",
                                expectedQualifiers.get(name),
                                reference.getDatabase() + ":" + reference.getId());
                    } else {
                        assertEquals("testing whether the extracted value of the qualifier is accurate",
                                expectedQualifiers.get(name), parsed.getValue());
                    }
                }
            }
        }
    }
}