eu.eubrazilcc.lvl.service.io.ImportSequencesTask.java Source code

Java tutorial

Introduction

Here is the source code for eu.eubrazilcc.lvl.service.io.ImportSequencesTask.java

Source

/*
 * Copyright 2014 EUBrazilCC (EU?Brazil Cloud Connect)
 * 
 * Licensed under the EUPL, Version 1.1 or - as soon they will be approved by 
 * the European Commission - subsequent versions of the EUPL (the "Licence");
 * You may not use this work except in compliance with the Licence.
 * You may obtain a copy of the Licence at:
 * 
 *   http://ec.europa.eu/idabc/eupl
 * 
 * Unless required by applicable law or agreed to in writing, software 
 * distributed under the Licence is distributed on an "AS IS" basis,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the Licence for the specific language governing permissions and 
 * limitations under the Licence.
 * 
 * This product combines work with different licenses. See the "NOTICE" text
 * file for details on the various modules and licenses.
 * The "NOTICE" text file is part of the distribution. Any derivative works
 * that you distribute must include a readable copy of the "NOTICE" text file.
 */

package eu.eubrazilcc.lvl.service.io;

import static com.google.common.base.Joiner.on;
import static com.google.common.base.MoreObjects.toStringHelper;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import static com.google.common.base.Predicates.notNull;
import static com.google.common.collect.FluentIterable.from;
import static com.google.common.collect.ImmutableList.of;
import static com.google.common.collect.Lists.newArrayList;
import static com.google.common.util.concurrent.Futures.successfulAsList;
import static com.google.common.util.concurrent.ListenableFutureTask.create;
import static eu.eubrazilcc.lvl.core.DataSource.GENBANK;
import static eu.eubrazilcc.lvl.core.concurrent.TaskRunner.TASK_RUNNER;
import static eu.eubrazilcc.lvl.core.concurrent.TaskStorage.TASK_STORAGE;
import static eu.eubrazilcc.lvl.core.entrez.EntrezHelper.MAX_RECORDS_FETCHED;
import static eu.eubrazilcc.lvl.core.entrez.EntrezHelper.MAX_RECORDS_LISTED;
import static eu.eubrazilcc.lvl.core.entrez.EntrezHelper.NUCLEOTIDE_DB;
import static eu.eubrazilcc.lvl.core.entrez.EntrezHelper.Format.GB_SEQ_XML;
import static eu.eubrazilcc.lvl.core.xml.ESearchXmlBinder.getCount;
import static eu.eubrazilcc.lvl.core.xml.ESearchXmlBinder.getIds;
import static eu.eubrazilcc.lvl.core.xml.GbSeqXmlBinder.GBSEQ_XMLB;
import static eu.eubrazilcc.lvl.core.xml.GbSeqXmlBinder.getPubMedIds;
import static eu.eubrazilcc.lvl.core.xml.GbSeqXmlBinder.parseSequence;
import static eu.eubrazilcc.lvl.storage.NotificationManager.NOTIFICATION_MANAGER;
import static eu.eubrazilcc.lvl.storage.security.PermissionHelper.DATA_CURATOR_ROLE;
import static java.nio.file.Files.createTempDirectory;
import static java.util.Collections.synchronizedList;
import static java.util.concurrent.TimeUnit.MINUTES;
import static org.apache.commons.io.FileUtils.deleteQuietly;
import static org.apache.commons.lang.StringUtils.isNotBlank;
import static org.slf4j.LoggerFactory.getLogger;

import java.io.File;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;
import java.util.concurrent.Callable;
import java.util.concurrent.atomic.AtomicInteger;

import javax.annotation.Nullable;

import org.slf4j.Logger;

import com.google.common.base.Function;
import com.google.common.collect.ImmutableList;
import com.google.common.util.concurrent.ListenableFuture;

import eu.eubrazilcc.lvl.core.Leishmania;
import eu.eubrazilcc.lvl.core.Notification;
import eu.eubrazilcc.lvl.core.Sandfly;
import eu.eubrazilcc.lvl.core.Sequence;
import eu.eubrazilcc.lvl.core.concurrent.CancellableTask;
import eu.eubrazilcc.lvl.core.entrez.EntrezHelper;
import eu.eubrazilcc.lvl.core.entrez.EntrezHelper.Format;
import eu.eubrazilcc.lvl.core.xml.ncbi.esearch.ESearchResult;
import eu.eubrazilcc.lvl.core.xml.ncbi.gb.GBSeq;
import eu.eubrazilcc.lvl.service.io.filter.NewReferenceFilter;
import eu.eubrazilcc.lvl.service.io.filter.RecordFilter;
import eu.eubrazilcc.lvl.storage.dao.SequenceDAO;

/**
 * Discovers and imports new sequences in the LVL collection. Sequences are discovered from public databases, such as GenBank. Sequence
 * references are also imported (when available). To this end, this class creates an instance of {@link ImportPublicationsTask}, which
 * uses a specialized form of the {@code insert} method that avoids failing when a duplicate record is found in the database, supporting
 * concurrent use from different threads without expensive synchronization.
 * @author Erik Torres <ertorser@upv.es>
 */
public class ImportSequencesTask<T extends Sequence> extends CancellableTask<Integer> {

    private static final Logger LOGGER = getLogger(ImportSequencesTask.class);

    public static final ImmutableList<String> DATABASES = of(GENBANK);

    public static final long TIMEOUT_MINUTES = 60l;

    private ImmutableList<RecordFilter> filters = of();

    private final AtomicInteger pending = new AtomicInteger(0);
    private final AtomicInteger fetched = new AtomicInteger(0);

    private final List<String> pmids = synchronizedList(new ArrayList<String>());

    private UUID importPublicationsTaskId = null;

    private final String query;
    private final Sequence.Builder<T> builder;
    private final SequenceDAO<T> dao;

    public ImportSequencesTask(final String query, final Sequence.Builder<T> builder, final SequenceDAO<T> dao) {
        this.query = query;
        this.builder = builder;
        this.dao = dao;
        this.task = create(importSequencesTask());
    }

    public ImmutableList<RecordFilter> getFilters() {
        return filters;
    }

    public void setFilters(final Iterable<RecordFilter> filters) {
        final ImmutableList.Builder<RecordFilter> builder = new ImmutableList.Builder<RecordFilter>();
        this.filters = (filters != null ? builder.addAll(filters).build() : builder.build());
    }

    public @Nullable UUID getImportPublicationsTaskId() {
        return importPublicationsTaskId;
    }

    /**
     * Imports sequences from external databases into the application's database.
     */
    private Callable<Integer> importSequencesTask() {
        return new Callable<Integer>() {
            @Override
            public Integer call() throws Exception {
                LOGGER.info("Importing new sequences from: " + DATABASES);
                int count = 0;
                final File tmpDir = createTmpDir();
                try (final EntrezHelper entrez = EntrezHelper.create()) {
                    final List<ListenableFuture<Integer>> subTasks = newArrayList();
                    for (final String db : DATABASES) {
                        if (GENBANK.equals(db)) {
                            subTasks.addAll(importGenBankSubTasks(entrez, tmpDir));
                        } else {
                            throw new IllegalArgumentException("Unsupported database: " + db);
                        }
                    }
                    final ListenableFuture<List<Integer>> globalTask = successfulAsList(subTasks);
                    final List<Integer> results = globalTask.get(TIMEOUT_MINUTES, MINUTES);
                    for (final Integer result : results) {
                        if (result != null) {
                            count += result;
                        } else {
                            setHasErrors(true);
                            setStatus("Error while importing sequences: not all sequences were imported");
                        }
                    }
                } catch (InterruptedException ie) {
                    // ignore and propagate
                    LOGGER.warn("Sequence import was interrupted, exiting");
                    throw ie;
                } catch (Exception e) {
                    setHasErrors(true);
                    setStatus("Uncaught error while importing sequences: not all sequences were imported");
                    LOGGER.error("Uncaught error while importing sequences", e);
                } finally {
                    deleteQuietly(tmpDir);
                }
                final String msg = count + " new sequences were imported from: " + on(", ").join(DATABASES);
                if (!hasErrors()) {
                    setStatus(msg);
                    LOGGER.info(msg);
                } else {
                    LOGGER.warn(msg + " - errors reported");
                }
                NOTIFICATION_MANAGER
                        .broadcast(Notification.builder().scope(DATA_CURATOR_ROLE).message(msg).build());
                // schedule publication import
                final ImportPublicationsTask importPublicationsTask = ImportPublicationsTask.builder()
                        .filter(NewReferenceFilter.builder().build()).pmids(pmids).build();
                importPublicationsTaskId = importPublicationsTask.getUuid();
                TASK_RUNNER.execute(importPublicationsTask);
                TASK_STORAGE.add(importPublicationsTask);
                // unregister this task before returning the result to the execution service
                TASK_STORAGE.remove(getUuid());
                return new Integer(count);
            }
        };
    }

    private List<ListenableFuture<Integer>> importGenBankSubTasks(final EntrezHelper entrez, final File tmpDir) {
        final List<ListenableFuture<Integer>> subTasks = newArrayList();
        int esearchCount = -1, retstart = 0, count = 0, retries = 0;
        final int retmax = MAX_RECORDS_LISTED;
        do {
            setStatus("Searching GenBank for sequence identifiers");
            try {
                final ESearchResult result = entrez.esearch(NUCLEOTIDE_DB, query, retstart, retmax);
                if (esearchCount < 0) {
                    esearchCount = getCount(result);
                }
                final List<String> ids = getIds(result);
                count = ids.size();
                subTasks.add(TASK_RUNNER.submit(importGenBankSubTask(ids, entrez, tmpDir, GB_SEQ_XML, "xml")));
                LOGGER.trace("Listing Ids (start=" + retstart + ", max=" + retmax + ") produced " + count
                        + " new records. Query: " + query);
                retstart += count;
            } catch (Exception e) {
                if (++retries > 3) {
                    throw new IllegalStateException("Failed to import GenBank sequences", e);
                }
            }
        } while (count > 0 && retstart < esearchCount);
        return subTasks;
    }

    private Callable<Integer> importGenBankSubTask(final List<String> ids, final EntrezHelper entrez,
            final File tmpDir, final Format format, final String extension) {
        return new Callable<Integer>() {
            private int efetchCount = 0;

            @Override
            public Integer call() throws Exception {
                setStatus("Finding missing sequences between GenBank and the local collection");
                // filter out the sequence that are already stored in the database, creating a new set
                // with the identifiers that are missing from the database. Using a set ensures that 
                // duplicate identifiers are also removed from the original list
                final List<String> ids2 = from(ids).transform(new Function<String, String>() {
                    @Override
                    public String apply(final String id) {
                        String result = id;
                        for (int i = 0; i < filters.size() && result != null; i++) {
                            final RecordFilter filter = filters.get(i);
                            if (filter.canBeApplied(GENBANK)) {
                                result = filters.get(i).filterById(id);
                            }
                        }
                        return result;
                    }
                }).filter(notNull()).toSet().asList();
                if (ids2.size() > 0) {
                    setStatus("Fetching sequences from GenBank");
                    // update progress
                    int pendingCount = pending.addAndGet(ids2.size());
                    setProgress(100.0d * fetched.get() / pendingCount);
                    // fetch sequence files
                    final Path tmpDir2 = createTempDirectory(tmpDir.toPath(), "fetch_seq_task_");
                    entrez.efetch(ids2, 0, MAX_RECORDS_FETCHED, tmpDir2.toFile(), format);
                    // import sequence files to the database
                    for (final String id : ids2) {
                        setStatus("Importing GenBank sequences into local collection");
                        final Path source = tmpDir2.resolve(id + "." + extension);
                        try {
                            // insert sequence in the database
                            final GBSeq gbSeq = GBSEQ_XMLB.typeFromFile(source.toFile());
                            final T sequence = parseSequence(gbSeq, builder);
                            dao.insert(sequence);
                            efetchCount++;
                            LOGGER.info("New GBSeqXML file stored: " + source.toString());
                            // update progress
                            int fetchedCount = fetched.incrementAndGet();
                            setProgress(100.0d * fetchedCount / pending.get());
                            // extract references from the sequence
                            pmids.addAll(getPubMedIds(gbSeq));
                        } catch (Exception e) {
                            LOGGER.warn("Failed to import sequence from file: " + source.getFileName(), e);
                        }
                    }
                }
                checkState(ids2.size() == efetchCount, "No all sequences were imported");
                return efetchCount;
            }
        };
    }

    private static File createTmpDir() {
        File tmpDir = null;
        try {
            tmpDir = createTempDirectory("tmp_seq_").toFile();
        } catch (Exception e) {
            throw new IllegalStateException("Failed to create temporary directory", e);
        }
        return tmpDir;
    }

    @Override
    public String toString() {
        return toStringHelper(this).addValue(super.toString()).toString();
    }

    /* Fluent API */

    public static Builder<Leishmania> leishmaniaBuilder() {
        return new Builder<Leishmania>();
    }

    public static Builder<Sandfly> sandflyBuilder() {
        return new Builder<Sandfly>();
    }

    public static class Builder<T extends Sequence> {

        private String query;
        private Sequence.Builder<T> builder;
        private SequenceDAO<T> dao;
        private ImmutableList<RecordFilter> filters;

        public Builder<T> query(final String query) {
            this.query = query;
            return this;
        }

        public Builder<T> dao(final SequenceDAO<T> dao) {
            this.dao = dao;
            return this;
        }

        public Builder<T> builder(final Sequence.Builder<T> builder) {
            this.builder = builder;
            return this;
        }

        public Builder<T> filter(final RecordFilter filter) {
            this.filters = of(filter);
            return this;
        }

        public Builder<T> filters(final Iterable<RecordFilter> filters) {
            final ImmutableList.Builder<RecordFilter> builder = new ImmutableList.Builder<RecordFilter>();
            this.filters = (filters != null ? builder.addAll(filters).build() : builder.build());
            return this;
        }

        public ImportSequencesTask<T> build() {
            checkArgument(isNotBlank(query), "Uninitialized or invalid database query");
            checkArgument(builder != null, "Uninitialized or invalid sequence builder");
            checkArgument(dao != null, "Uninitialized or invalid sequence DAO");
            final ImportSequencesTask<T> instance = new ImportSequencesTask<T>(query, builder, dao);
            instance.setFilters(filters);
            return instance;
        }

    }

}