eu.eubrazilcc.lvl.core.entrez.GbFlatFileHelper.java Source code

Java tutorial

Introduction

Here is the source code for eu.eubrazilcc.lvl.core.entrez.GbFlatFileHelper.java

Source

/*
 * Copyright 2014 EUBrazilCC (EU-Brazil Cloud Connect)
 * 
 * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by 
 * the European Commission - subsequent versions of the EUPL (the "Licence");
 * You may not use this work except in compliance with the Licence.
 * You may obtain a copy of the Licence at:
 * 
 *   http://ec.europa.eu/idabc/eupl
 * 
 * Unless required by applicable law or agreed to in writing, software 
 * distributed under the Licence is distributed on an "AS IS" basis,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the Licence for the specific language governing permissions and 
 * limitations under the Licence.
 * 
 * This product combines work with different licenses. See the "NOTICE" text
 * file for details on the various modules and licenses.
 * The "NOTICE" text file is part of the distribution. Any derivative works
 * that you distribute must include a readable copy of the "NOTICE" text file.
 */

package eu.eubrazilcc.lvl.core.entrez;

import static com.google.common.base.Preconditions.checkArgument;
import static eu.eubrazilcc.lvl.core.entrez.EntrezHelper.countryFeature;
import static eu.eubrazilcc.lvl.core.util.LocaleUtils.getLocale;

import java.io.File;
import java.util.Locale;

import com.google.common.collect.ImmutableMultimap;

/**
 * Utility class to deal with sequences stored in GenBank flat file format. This class only
 * exposes static members and is not meant to be instantiated.
 * @author Erik Torres <ertorser@upv.es>
 */
public final class GbFlatFileHelper {

    /**
     * Prevents instantiation: this class is a holder of static helper methods.
     */
    private GbFlatFileHelper() {
    }

    /**
     * Infers the possible countries of the species from which the DNA sequence was obtained and 
     * returns a multimap of Java {@link Locale} where the key is the GenBank field that was used 
     * to infer the country. The country is meant to be inferred from the annotations of the GenBank 
     * file format, using the fields in the following order:
     * <ol>
     * <li>If a /country entry exists in the FEATURES of the file, then this is returned to 
     * the caller and no other check is performed;</li>
     * <li>DEFINITION;</li>
     * <li>TITLE; or</li>
     * <li>Check PUBMED title and abstract fields.</li>
     * </ol>
     * <strong>Note:</strong> only the first step (the /country feature) is currently implemented. 
     * When no locale can be obtained from that feature, the returned multimap is empty.
     * <p>
     * Java {@link Locale} allows the country to be exported later to several different formats, 
     * including a two-letter code compatible with the ISO 3166-1 alpha-2 standard.
     * @param file - sequence file. Must be non-null and readable.
     * @return an immutable multimap that associates each GenBank field used in the inference with 
     *         the {@link Locale} inferred from it. Never {@code null}, possibly empty.
     * @throws IllegalArgumentException if {@code file} is {@code null} or cannot be read.
     * @throws Exception if an error occurs while extracting the country from the file.
     */
    public static ImmutableMultimap<GenBankField, Locale> inferCountry(final File file) throws Exception {
        checkArgument(file != null && file.canRead(), "Uninitialized or invalid file");
        final ImmutableMultimap.Builder<GenBankField, Locale> countries = ImmutableMultimap.builder();

        // infer from features: a /country feature takes precedence, no other field is inspected
        final Locale featureLocale = getLocale(countryFeature(file));
        if (featureLocale != null) {
            return countries.put(GenBankField.COUNTRY_FEATURE, featureLocale).build();
        }

        // the remaining inference steps are pending, so the builder is still empty at this point

        // infer from definition
        // TODO

        // infer from title
        // TODO

        // infer from PubMed title and abstract fields
        // TODO

        return countries.build();
    }

    /**
     * Represents a GenBank field, for example, the field that was used to infer the country of a sequence.
     * @author Erik Torres <ertorser@upv.es>
     */
    public enum GenBankField {
        COUNTRY_FEATURE, DEFINITION, TITLE, PUBMED_TITLE, PUBMED_ABSTRACT
    }

}