ubic.gemma.loader.genome.goldenpath.GoldenPathBioSequenceLoader.java Source code

Java tutorial

Introduction

Here is the source code for ubic.gemma.loader.genome.goldenpath.GoldenPathBioSequenceLoader.java

Source

/*
 * The Gemma project
 * 
 * Copyright (c) 2006 University of British Columbia
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.gemma.loader.genome.goldenpath;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.TimeUnit;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.time.StopWatch;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.springframework.security.core.context.SecurityContext;
import org.springframework.security.core.context.SecurityContextHolder;

import ubic.gemma.externalDb.GoldenPathDumper;
import ubic.gemma.model.common.description.ExternalDatabase;
import ubic.gemma.model.common.description.ExternalDatabaseService;
import ubic.gemma.model.genome.Taxon;
import ubic.gemma.model.genome.biosequence.BioSequence;
import ubic.gemma.model.genome.biosequence.BioSequenceService;

/**
 * Load a dump of a Goldenpath table. The input is expected to have just two columns: sequence identifier (accession)
 * and sequence length. Note that this uses create, not findOrCreate, so it should only be used to 'prime' the system.
 * <p>
 * Loading runs as a producer/consumer pair: one thread parses (or dumps) sequences into a bounded queue while a second
 * thread drains the queue and persists the sequences in batches. The public <code>load</code> methods block until both
 * threads have finished. Progress is tracked in instance fields, so a single instance must not be used for several
 * loads at the same time.
 * 
 * @author pavlidis
 * @version $Id: GoldenPathBioSequenceLoader.java,v 1.13 2012/05/27 02:58:32 paul Exp $
 */
public class GoldenPathBioSequenceLoader {

    private static final int QUEUE_SIZE = 30000;
    private static final int BATCH_SIZE = 2000;

    /**
     * How long (in milliseconds) the consumer waits for a sequence before re-checking whether the producer is done.
     * Waiting with a timeout avoids spinning on an empty queue.
     */
    private static final long POLL_TIMEOUT_MS = 100L;

    static Log log = LogFactory.getLog(GoldenPathBioSequenceLoader.class.getName());
    ExternalDatabaseService externalDatabaseService;
    BioSequenceService bioSequenceService;
    ExternalDatabase genbank;

    int limit = -1;

    GoldenPathBioSequenceParser parser = new GoldenPathBioSequenceParser();
    private Taxon taxon;

    // These flags are written by one thread and read by others, so they must be volatile to guarantee visibility.
    volatile boolean producerDone = false;
    private volatile boolean consumerDone = false;

    /**
     * @param taxon the taxon assigned to every sequence that is loaded
     */
    public GoldenPathBioSequenceLoader(Taxon taxon) {
        this.taxon = taxon;
    }

    /**
     * Sets the service and immediately looks up the "Genbank" external database, which is later attached to every
     * loaded sequence.
     * 
     * @param externalDatabaseService
     */
    public void setExternalDatabaseService(ExternalDatabaseService externalDatabaseService) {
        this.externalDatabaseService = externalDatabaseService;
        genbank = externalDatabaseService.find("Genbank");
    }

    /**
     * @param bioSequenceService service used to persist the sequences
     */
    public void setBioSequenceService(BioSequenceService bioSequenceService) {
        this.bioSequenceService = bioSequenceService;
    }

    /**
     * Load sequences from a file. Blocks until loading has finished.
     * 
     * @param file the file to read; must exist and be readable
     * @throws IllegalArgumentException if file is null
     * @throws IOException if the file cannot be read or closed
     */
    public void load(File file) throws IOException {
        if (file == null) {
            throw new IllegalArgumentException("File cannot be null");
        }
        if (!file.exists() || !file.canRead()) {
            throw new IOException("Could not read from file " + file.getPath());
        }
        FileInputStream stream = new FileInputStream(file);
        try {
            load(stream);
        } finally {
            // Close even when loading fails, so the file handle is not leaked.
            stream.close();
        }
    }

    /**
     * Load sequences from the file with the given name. Blocks until loading has finished.
     * 
     * @param filename path of the file to read
     * @throws IllegalArgumentException if filename is blank
     * @throws IOException if the file cannot be read or closed
     */
    public void load(String filename) throws IOException {
        if (StringUtils.isBlank(filename)) {
            throw new IllegalArgumentException("No filename provided");
        }
        log.info("Parsing " + filename);
        File infile = new File(filename);
        load(infile);
    }

    /**
     * Load sequences parsed from a stream. Blocks until both the parsing thread and the loading thread have finished.
     * The stream is not closed by this method. A parse failure is logged, not rethrown.
     * 
     * @param inputStream
     */
    public void load(final InputStream inputStream) {

        final BlockingQueue<BioSequence> queue = new ArrayBlockingQueue<BioSequence>(QUEUE_SIZE);
        final SecurityContext context = SecurityContextHolder.getContext();

        resetProgressFlags();

        Thread loadThread = startLoaderThread(queue, context);

        Thread parseThread = new Thread(new Runnable() {
            @Override
            public void run() {
                try {
                    parser.parse(inputStream, queue);
                } catch (IOException e) {
                    log.error("Failed to parse sequences from the input stream", e);
                } finally {
                    // Always signal completion, otherwise the loader (and the caller) would wait forever.
                    log.info("Done parsing");
                    producerDone = true;
                }
            }
        }, "Parser");

        parseThread.start();

        awaitCompletion(parseThread, loadThread);
    }

    /**
     * Load from a database source. Blocks until both the dumping thread and the loading thread have finished.
     * 
     * @param dumper
     */
    public void load(final GoldenPathDumper dumper) {
        final BlockingQueue<BioSequence> queue = new ArrayBlockingQueue<BioSequence>(QUEUE_SIZE);

        final SecurityContext context = SecurityContextHolder.getContext();
        assert context != null;

        resetProgressFlags();

        Thread parseThread = new Thread(new Runnable() {
            @Override
            public void run() {
                try {
                    dumper.dumpTranscriptBioSequences(limit, queue);
                    log.info("Done dumping");
                } finally {
                    // Always signal completion, otherwise the loader (and the caller) would wait forever.
                    producerDone = true;
                }
            }
        }, "Parser");

        parseThread.start();

        Thread loadThread = startLoaderThread(queue, context);

        awaitCompletion(parseThread, loadThread);
    }

    /**
     * Clears the completion flags so that a loader instance can be used for more than one (sequential) load.
     */
    private void resetProgressFlags() {
        producerDone = false;
        consumerDone = false;
    }

    /**
     * Starts the consumer thread, which runs {@link #load(BlockingQueue)} under the given security context.
     * 
     * @param queue queue to drain
     * @param context security context of the calling thread, propagated to the new thread
     * @return the started thread
     */
    private Thread startLoaderThread(final BlockingQueue<BioSequence> queue, final SecurityContext context) {
        Thread loadThread = new Thread(new Runnable() {
            @Override
            public void run() {
                SecurityContextHolder.setContext(context);
                log.info("Starting loading");
                load(queue);
            }
        }, "Loader");
        loadThread.start();
        return loadThread;
    }

    /**
     * Blocks until both worker threads have terminated. If the calling thread is interrupted while waiting, waiting
     * continues and the interrupt status is restored before returning.
     * 
     * @param producer thread filling the queue
     * @param consumer thread draining the queue
     */
    private void awaitCompletion(Thread producer, Thread consumer) {
        boolean interrupted = joinUninterruptibly(consumer);

        // The consumer only stops normally once the producer is done. If it stopped earlier it failed, and the
        // producer may be blocked on a full queue that nobody drains any more; interrupt it so it can terminate.
        if (!producerDone && producer.isAlive()) {
            log.error("Loading stopped before all sequences were produced; interrupting the producer");
            producer.interrupt();
        }

        if (joinUninterruptibly(producer)) {
            interrupted = true;
        }

        if (interrupted) {
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Waits for a thread to terminate, ignoring interrupts while waiting.
     * 
     * @param thread thread to wait for
     * @return true if the calling thread was interrupted while waiting
     */
    private static boolean joinUninterruptibly(Thread thread) {
        boolean interrupted = false;
        while (true) {
            try {
                thread.join();
                return interrupted;
            } catch (InterruptedException e) {
                interrupted = true;
            }
        }
    }

    /**
     * Consumer side of the load: drains the queue until the producer is done and the queue is empty, assigning the
     * Genbank database and the taxon to each sequence and persisting them in batches of {@value #BATCH_SIZE}.
     * <code>consumerDone</code> is set when this method exits, whether normally or with an exception.
     * 
     * @param queue queue filled by the producer
     * @throws RuntimeException wrapping any failure that occurs while loading
     */
    void load(BlockingQueue<BioSequence> queue) {
        log.debug("Entering 'load' ");

        StopWatch timer = new StopWatch();
        timer.start();

        int count = 0;
        int cpt = 0;
        double secspt = 0.0;

        Collection<BioSequence> bioSequencesToPersist = new ArrayList<BioSequence>(BATCH_SIZE);
        try {
            while (!(producerDone && queue.isEmpty())) {
                // Timed poll: sleep briefly when the queue is empty, then re-check the exit condition.
                BioSequence sequence = queue.poll(POLL_TIMEOUT_MS, TimeUnit.MILLISECONDS);

                if (sequence == null) {
                    continue;
                }

                sequence.getSequenceDatabaseEntry().setExternalDatabase(genbank);
                sequence.setTaxon(taxon);
                bioSequencesToPersist.add(sequence);
                if (++count % BATCH_SIZE == 0) {
                    bioSequenceService.create(bioSequencesToPersist);
                    bioSequencesToPersist.clear();
                }

                // just some timing information.
                if (count % 1000 == 0) {
                    cpt++;
                    timer.stop();
                    double secsperthousand = timer.getTime() / 1000.0;
                    secspt += secsperthousand;
                    double meanspt = secspt / cpt;

                    String progString = "Processed and loaded " + count + " sequences, last one was "
                            + sequence.getName() + " (" + secsperthousand + "s for last 1000, mean per 1000 ="
                            + String.format("%.1f", meanspt) + "s)";
                    log.info(progString);
                    timer.reset();
                    timer.start();
                }

            }

            // finish up: persist whatever is left over from the last, partial batch.
            bioSequenceService.create(bioSequencesToPersist);

            log.info("Loaded total of " + count + " sequences");
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            log.error("Interrupted after loading " + count + " sequences", e);
            throw new RuntimeException(e);
        } catch (Exception e) {
            log.error("Loading failed after " + count + " sequences", e);
            throw new RuntimeException(e);
        } finally {
            // Signal completion on every exit path so that waiting threads are never left hanging.
            consumerDone = true;
        }

    }

    /**
     * @param limit passed to the dumper when loading from a database source; the default is -1
     */
    public void setLimit(int limit) {
        this.limit = limit;
    }
}