eu.eubrazilcc.lvl.core.xml.GbSeqXmlBinder.java Source code

Java tutorial

Introduction

Here is the source code for eu.eubrazilcc.lvl.core.xml.GbSeqXmlBinder.java

Source

/*
 * Copyright 2014 EUBrazilCC (EU-Brazil Cloud Connect)
 * 
 * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by 
 * the European Commission - subsequent versions of the EUPL (the "Licence");
 * You may not use this work except in compliance with the Licence.
 * You may obtain a copy of the Licence at:
 * 
 *   http://ec.europa.eu/idabc/eupl
 * 
 * Unless required by applicable law or agreed to in writing, software 
 * distributed under the Licence is distributed on an "AS IS" basis,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the Licence for the specific language governing permissions and 
 * limitations under the Licence.
 * 
 * This product combines work with different licenses. See the "NOTICE" text
 * file for details on the various modules and licenses.
 * The "NOTICE" text file is part of the distribution. Any derivative works
 * that you distribute must include a readable copy of the "NOTICE" text file.
 */

package eu.eubrazilcc.lvl.core.xml;

import static com.google.common.base.Optional.fromNullable;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Predicates.notNull;
import static com.google.common.collect.FluentIterable.from;
import static com.google.common.collect.Lists.newArrayList;
import static eu.eubrazilcc.lvl.core.DataSource.GENBANK;
import static eu.eubrazilcc.lvl.core.geocoding.GeocodingHelper.geocode;
import static eu.eubrazilcc.lvl.core.util.LocaleUtils.getLocale;
import static java.lang.Integer.parseInt;
import static java.util.regex.Pattern.compile;
import static org.apache.commons.lang.StringUtils.isNotBlank;
import static org.apache.commons.lang.StringUtils.isNumeric;
import static org.apache.commons.lang.StringUtils.trimToNull;
import static org.slf4j.LoggerFactory.getLogger;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.annotation.Nullable;
import javax.xml.bind.JAXBElement;

import org.slf4j.Logger;

import com.google.common.base.Function;
import com.google.common.collect.ImmutableMultimap;

import eu.eubrazilcc.lvl.core.Reference;
import eu.eubrazilcc.lvl.core.Sequence;
import eu.eubrazilcc.lvl.core.xml.ncbi.gb.GBFeature;
import eu.eubrazilcc.lvl.core.xml.ncbi.gb.GBQualifier;
import eu.eubrazilcc.lvl.core.xml.ncbi.gb.GBReference;
import eu.eubrazilcc.lvl.core.xml.ncbi.gb.GBSeq;
import eu.eubrazilcc.lvl.core.xml.ncbi.gb.GBSeqReferences;
import eu.eubrazilcc.lvl.core.xml.ncbi.gb.GBSeqid;
import eu.eubrazilcc.lvl.core.xml.ncbi.gb.GBSet;
import eu.eubrazilcc.lvl.core.xml.ncbi.gb.ObjectFactory;

/**
 * NCBI GenBank Sequence XML binding helper. Besides creating the JAXB elements for the supported
 * GenBank types ({@link GBSet} and {@link GBSeq}), this class provides static helpers that extract
 * identifiers, country, gene names and PubMed references from a {@link GBSeq}.
 * @author Erik Torres <ertorser@upv.es>
 */
public final class GbSeqXmlBinder extends XmlBinder {

    private static final Logger LOGGER = getLogger(GbSeqXmlBinder.class);

    public static final String SUBMITTER_BLOCK_TITLE = "Direct Submission";

    private static final Class<?>[] SUPPORTED_CLASSES = { GBSet.class, GBSeq.class };

    /** Matches a GenInfo identifier ({@code gi|<digits>}); group 1 captures the digits. Compiled once. */
    private static final Pattern GI_PATTERN = compile("gi\\|(\\d+)");

    /** Separator between the country name and the optional region in a GenBank country feature. */
    private static final char COUNTRY_REGION_SEPARATOR = ':';

    public static final ObjectFactory GBSEQ_XML_FACTORY = new ObjectFactory();

    public static final GbSeqXmlBinder GBSEQ_XMLB = new GbSeqXmlBinder();

    private GbSeqXmlBinder() {
        super(SUPPORTED_CLASSES);
    }

    /**
     * Creates the JAXB element that corresponds to the class of the specified object.
     * @param obj - instance of one of the supported classes ({@link GBSet} or {@link GBSeq})
     * @return the element created by {@link #GBSEQ_XML_FACTORY} for the class of the object.
     * @throws IllegalArgumentException if the class of the object is not supported.
     */
    @Override
    @SuppressWarnings("unchecked")
    protected <T> JAXBElement<T> createType(final T obj) {
        final Object element;
        final Class<? extends Object> clazz = obj.getClass();
        if (clazz.equals(GBSet.class)) {
            element = GBSEQ_XML_FACTORY.createGBSet();
        } else if (clazz.equals(GBSeq.class)) {
            element = GBSEQ_XML_FACTORY.createGBSeq();
        } else {
            throw new IllegalArgumentException("Unsupported type: " + clazz.getCanonicalName());
        }
        // the cast mirrors the original contract: the factory result is handed back as JAXBElement<T>
        return (JAXBElement<T>) element;
    }

    /**
     * Gets the GenInfo identifier (gi) from a sequence. The identifier is read from the first
     * sequence id that contains the {@code gi|<digits>} token.
     * @param gbSeq - sequence to be analyzed
     * @return if found, the GenInfo identifier (gi), otherwise {@code null}. {@code null} is also
     *         returned (and a warning is logged) when the identifier does not fit in an integer.
     * @throws IllegalArgumentException if the sequence is {@code null} or has no list of sequence ids.
     */
    public static final @Nullable Integer getGenInfoIdentifier(final GBSeq gbSeq) {
        checkArgument(gbSeq != null && gbSeq.getGBSeqOtherSeqids() != null
                && gbSeq.getGBSeqOtherSeqids().getGBSeqid() != null, "Uninitialized or invalid sequence");
        for (final GBSeqid seqid : gbSeq.getGBSeqOtherSeqids().getGBSeqid()) {
            // skip empty entries instead of failing with a NullPointerException in the matcher
            final String value = seqid != null ? seqid.getvalue() : null;
            if (value == null) {
                continue;
            }
            final Matcher matcher = GI_PATTERN.matcher(value);
            if (matcher.find()) {
                final String digits = matcher.group(1);
                try {
                    return parseInt(digits);
                } catch (NumberFormatException e) {
                    // only reachable when the digits overflow the int range
                    LOGGER.warn("Ignoring GenInfo identifier that does not fit in an integer: {}", digits);
                    return null;
                }
            }
        }
        return null;
    }

    /**
     * Gets the country feature from a sequence. All the features of the sequence are searched
     * (regardless of their key) and the first non-blank value of a qualifier named {@code country}
     * is returned.
     * @param sequence - sequence to be analyzed
     * @return the trimmed value of the country feature or {@code null}.
     * @throws IllegalArgumentException if the sequence is {@code null}.
     */
    public static final @Nullable String countryFeature(final GBSeq sequence) {
        checkArgument(sequence != null, "Uninitialized or invalid sequence");
        for (final GBFeature feature : getFeatures(sequence)) {
            if (feature == null || feature.getGBFeatureQuals() == null
                    || feature.getGBFeatureQuals().getGBQualifier() == null) {
                continue;
            }
            for (final GBQualifier qualifier : feature.getGBFeatureQuals().getGBQualifier()) {
                if (qualifier != null && "country".equals(qualifier.getGBQualifierName())
                        && isNotBlank(qualifier.getGBQualifierValue())) {
                    return trimToNull(qualifier.getGBQualifierValue());
                }
            }
        }
        return null;
    }

    /**
     * Infers the possible countries of the species from which the DNA sequence was obtained and 
     * returns a map of Java {@link Locale} where the key of the map is the GenBank field that was
     * used to infer the country. The country is inferred from the annotations of the GenBank file 
     * format, using the fields in the following order:
     * <ol>
     * <li>If a country entry exists in the features of the file, then this is returned to 
     * the caller and no other check is performed;</li>
     * <li>Definition field;</li>
     * <li>Title field; or</li>
     * <li>Check PubMed title and abstract fields.</li>
     * </ol>
     * Currently only the first step is implemented: when no locale can be obtained from the
     * features, the returned map is empty.
     * @param sequence - sequence to be analyzed
     * @return a map of Java {@link Locale} inferred from the input sequence, where the key of the map
     *         is the GenBank field used to infer the country.
     * @throws IllegalArgumentException if the sequence is {@code null}.
     */
    public static final ImmutableMultimap<String, Locale> inferCountry(final GBSeq sequence) {
        checkArgument(sequence != null, "Uninitialized or invalid sequence");
        final ImmutableMultimap.Builder<String, Locale> builder = new ImmutableMultimap.Builder<String, Locale>();
        // infer from features
        final String countryFeature = countryFeature(sequence);
        final Locale locale = isNotBlank(countryFeature) ? countryFeatureToLocale(countryFeature) : null;
        if (locale != null) {
            builder.put("features", locale);
        } else {
            // infer from definition
            // TODO

            // infer from title
            // TODO

            // infer from PubMed title and abstract fields
            // TODO
        }
        return builder.build();
    }

    /**
     * Converts country feature to Java {@link Locale}. Java {@link Locale} later allows exporting the 
     * country to several different formats, including a two-letter code compatible with ISO 3166-1 
     * alpha-2 standard. Only the part of the feature that precedes the first colon is used to look up
     * the locale (e.g. {@code "Brazil: Rio de Janeiro"} is looked up as {@code "Brazil"}).
     * @param countryFeature - value of country feature field
     * @return a Java {@link Locale} inferred from the input feature, or {@code null} when the feature
     *         has no text before the colon. NOTE(review): the lookup helper may presumably also
     *         return {@code null} for unknown countries ({@link #inferCountry(GBSeq)} checks for it).
     * @throws IllegalArgumentException if the country feature is blank.
     */
    public static final @Nullable Locale countryFeatureToLocale(final String countryFeature) {
        checkArgument(isNotBlank(countryFeature), "Uninitialized or invalid country feature");
        // String#replace works on literals, so the region must be cut off explicitly
        final int separator = countryFeature.indexOf(COUNTRY_REGION_SEPARATOR);
        final String country = trimToNull(separator >= 0 ? countryFeature.substring(0, separator) : countryFeature);
        return country != null ? getLocale(country) : null;
    }

    /**
     * Parses publication references from a GenBank entry. The submitter block as well as non-PubMed publications
     * are excluded from the list of references returned by this method. A message will be written to the log system
     * when a reference is excluded for any of the reasons mentioned before.
     * @param sequence - GenBank sequence entry
     * @return a list of {@link Reference} that contains the references to published work included in the 
     *         input GenBank sequence that are included in PubMed (have a PMID).
     * @throws IllegalArgumentException if the sequence is {@code null}.
     */
    public static final List<Reference> getPubMedReferences(final GBSeq sequence) {
        checkArgument(sequence != null, "Uninitialized or invalid sequence");
        final List<Reference> references = newArrayList();
        final GBSeqReferences gbRefs = sequence.getGBSeqReferences();
        if (gbRefs == null || gbRefs.getGBReference() == null) {
            return references;
        }
        for (final GBReference gbRef : gbRefs.getGBReference()) {
            if (gbRef == null) {
                continue;
            }
            if (gbRef.getGBReferencePubmed() != null) {
                references.add(Reference.builder().title(gbRef.getGBReferenceTitle())
                        .pubmedId(gbRef.getGBReferencePubmed()).build());
            } else if (SUBMITTER_BLOCK_TITLE.equals(gbRef.getGBReferenceTitle())) {
                // avoid building the identifier when trace is disabled
                if (LOGGER.isTraceEnabled()) {
                    LOGGER.trace("Ignoring submitter block in GenBank sequence: {}", sequenceId(sequence));
                }
            } else {
                LOGGER.info("Ignoring non-PubMed publication in GenBank sequence: {}", sequenceId(sequence));
            }
        }
        return references;
    }

    /**
     * Gets the PubMed identifiers (PMID) of the publications referenced in a GenBank entry. The identifiers
     * are taken from the references returned by {@link #getPubMedReferences(GBSeq)}, therefore the submitter
     * block and the non-PubMed publications are excluded. Blank identifiers are discarded and the remaining
     * ones are trimmed.
     * @param sequence - GenBank sequence entry
     * @return a deduplicated set with the PubMed identifiers found in the input GenBank sequence.
     * @throws IllegalArgumentException if the sequence is {@code null}.
     */
    public static final Set<String> getPubMedIds(final GBSeq sequence) {
        checkArgument(sequence != null, "Uninitialized or invalid sequence");
        return from(getPubMedReferences(sequence)).transform(new Function<Reference, String>() {
            @Override
            public String apply(final Reference reference) {
                if (reference == null || !isNotBlank(reference.getPubmedId())) {
                    return null;
                }
                return reference.getPubmedId().trim();
            }
        }).filter(notNull()).toSet();
    }

    /**
     * Gets the features of a sequence.
     * @param sequence - GenBank sequence entry
     * @return the list of features of the sequence, or an empty list when the sequence has no feature table.
     */
    private static final List<GBFeature> getFeatures(final GBSeq sequence) {
        checkArgument(sequence != null, "Uninitialized or invalid sequence");
        List<GBFeature> features = null;
        if (sequence.getGBSeqFeatureTable() != null) {
            features = sequence.getGBSeqFeatureTable().getGBFeature();
        }
        return fromNullable(features).or(new ArrayList<GBFeature>());
    }

    /**
     * Gets the gene name annotated in a feature: the first non-blank value of a qualifier named
     * {@code gene} within a feature whose key is {@code gene}.
     * @param feature - feature to be analyzed
     * @return the trimmed gene name or {@code null} when the feature does not annotate a gene name.
     */
    private static @Nullable String geneName(final @Nullable GBFeature feature) {
        if (feature == null || !"gene".equals(feature.getGBFeatureKey())
                || feature.getGBFeatureQuals() == null
                || feature.getGBFeatureQuals().getGBQualifier() == null) {
            return null;
        }
        for (final GBQualifier qualifier : feature.getGBFeatureQuals().getGBQualifier()) {
            if (qualifier != null && "gene".equals(qualifier.getGBQualifierName())
                    && isNotBlank(qualifier.getGBQualifierValue())) {
                return qualifier.getGBQualifierValue().trim();
            }
        }
        return null;
    }

    /**
     * Parses the gene names found in a GenBank entry. One name is taken from each {@code gene} feature
     * (the first non-blank {@code gene} qualifier of the feature).
     * @param sequence - GenBank sequence entry
     * @return a deduplicated set of gene names.
     * @throws IllegalArgumentException if the sequence is {@code null}.
     */
    public static final Set<String> getGeneNames(final GBSeq sequence) {
        checkArgument(sequence != null, "Uninitialized or invalid sequence");
        return from(getFeatures(sequence)).transform(new Function<GBFeature, String>() {
            @Override
            public String apply(final GBFeature feature) {
                return geneName(feature);
            }
        }).filter(notNull()).toSet();
    }

    /**
     * Parses DNA sequence from a GenBank entry.
     * @param gbSeq - GenBank sequence entry
     * @param builder - builder that receives the values parsed from the entry and creates the result
     * @return a {@link Sequence} built from the input GenBank sequence.
     * @throws IllegalArgumentException if the sequence is {@code null} or has no list of sequence ids.
     * @throws NumberFormatException if the length annotated in the entry is not a valid integer.
     */
    public static final <T extends Sequence> T parseSequence(final GBSeq gbSeq, final Sequence.Builder<T> builder) {
        checkArgument(gbSeq != null, "Uninitialized or invalid sequence");
        final String countryFeature = countryFeature(gbSeq);
        // location and locale are only computed when the entry annotates a country
        final boolean hasCountry = isNotBlank(countryFeature);
        return builder.dataSource(GENBANK).definition(gbSeq.getGBSeqDefinition())
                .accession(gbSeq.getGBSeqPrimaryAccession()).version(gbSeq.getGBSeqAccessionVersion())
                .gi(getGenInfoIdentifier(gbSeq)).organism(gbSeq.getGBSeqOrganism())
                .length(parseInt(gbSeq.getGBSeqLength())).gene(getGeneNames(gbSeq)).countryFeature(countryFeature)
                .location(hasCountry ? geocode(countryFeature).orNull() : null)
                .locale(hasCountry ? countryFeatureToLocale(countryFeature) : null)
                .pmids(getPubMedIds(gbSeq)).sequence(gbSeq).build();
    }

    /**
     * Builds a label that identifies a sequence in log messages, composed of the primary accession
     * and the GenInfo identifier. This method is lenient: missing values are printed as {@code null}
     * instead of raising an exception, so that writing a log message never aborts the caller.
     * @param gbSeq - GenBank sequence entry
     * @return a label with the format {@code AC:<accession>, GI:<gi>}.
     */
    public static String sequenceId(final GBSeq gbSeq) {
        if (gbSeq == null) {
            return "AC:null, GI:null";
        }
        Integer gi = null;
        // getGenInfoIdentifier rejects entries without sequence ids, check before calling it
        if (gbSeq.getGBSeqOtherSeqids() != null && gbSeq.getGBSeqOtherSeqids().getGBSeqid() != null) {
            gi = getGenInfoIdentifier(gbSeq);
        }
        return "AC:" + gbSeq.getGBSeqPrimaryAccession() + ", GI:" + gi;
    }

}