gov.nih.nci.caintegrator.application.download.carray23.CaArrayFileDownloader.java Source code

Java tutorial

Introduction

Here is the source code for gov.nih.nci.caintegrator.application.download.carray23.CaArrayFileDownloader.java

Source

/*L
 *  Copyright SAIC
 *
 *  Distributed under the OSI-approved BSD 3-Clause License.
 *  See http://ncip.github.com/stats-application-commons/LICENSE.txt for details.
 */

package gov.nih.nci.caintegrator.application.download.carray23;

import gov.nih.nci.caarray.domain.file.FileType;
import gov.nih.nci.caarray.external.v1_0.CaArrayEntityReference;
import gov.nih.nci.caarray.external.v1_0.data.FileCategory;
import gov.nih.nci.caarray.external.v1_0.experiment.Experiment;
import gov.nih.nci.caarray.external.v1_0.query.BiomaterialSearchCriteria;
import gov.nih.nci.caarray.external.v1_0.query.ExperimentSearchCriteria;
import gov.nih.nci.caarray.external.v1_0.query.FileSearchCriteria;
import gov.nih.nci.caarray.external.v1_0.sample.Biomaterial;
import gov.nih.nci.caarray.external.v1_0.sample.BiomaterialType;
import gov.nih.nci.caarray.services.ServerConnectionException;
import gov.nih.nci.caarray.services.external.v1_0.InvalidInputException;
import gov.nih.nci.caarray.services.external.v1_0.InvalidReferenceException;
import gov.nih.nci.caarray.services.external.v1_0.UnsupportedCategoryException;
import gov.nih.nci.caarray.services.external.v1_0.data.DataApiUtils;
import gov.nih.nci.caarray.services.external.v1_0.data.DataTransferException;
import gov.nih.nci.caarray.services.external.v1_0.search.SearchApiUtils;
import gov.nih.nci.caarray.services.external.v1_0.search.SearchService;
import gov.nih.nci.caintegrator.application.download.DownloadZipHelper;
import gov.nih.nci.caintegrator.application.zip.ZipItem;

import java.io.BufferedOutputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.rmi.RemoteException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.commons.io.FileUtils;
import org.apache.log4j.Logger;

import com.sun.org.apache.xerces.internal.util.URI.MalformedURIException;

/**
 * Imports caArray files from an experiment into a zip file for caIntegrator usage.
 * Based on CaArray2Importer created by GenePattern Team at Broad.
 * <p>
 * Files are first downloaded into {@code inputDirectory} (skipping files that
 * {@link DownloadZipHelper#checkIfFileExists} reports as already present) and are
 * then described as {@link ZipItem}s that can be written to a zip file in
 * {@code outputZipDirectory}.
 *
 * @author sahnih
 */
public class CaArrayFileDownloader {
    // Maximum size for a single zip file (2GB).
    // NOTE(review): not referenced by any live code in this class; kept for reference.
    private static final long MAX_ZIP_FILE_SIZE = FileUtils.ONE_GB * 2;

    /** Name reported by {@link FileType#getName()} for Affymetrix CEL (raw) files. */
    private static final String CEL_TYPE_NAME = "AFFYMETRIX_CEL";

    /** Name reported by {@link FileType#getName()} for Affymetrix CHP (derived) files. */
    private static final String CHP_TYPE_NAME = "AFFYMETRIX_CHP";

    public static Logger logger = Logger.getLogger(CaArrayFileDownloader.class);

    /** Directory the individual caArray files are downloaded into. */
    private final String inputDirectory;

    /** Directory the resulting zip file(s) are written to. */
    private final String outputZipDirectory;

    /** Directory name passed to {@link DownloadZipHelper#fileToZip} for entries inside the zip. */
    private final String directoryInZip;

    /**
     * Creates a downloader working on the given directories.
     *
     * @param inputDirectory     directory downloaded files are stored in
     * @param outputZipDirectory directory zip files are written to
     * @param directoryInZip     directory name used for the entries inside the zip
     */
    public CaArrayFileDownloader(String inputDirectory, String outputZipDirectory, String directoryInZip) {
        this.inputDirectory = inputDirectory;
        this.outputZipDirectory = outputZipDirectory;
        this.directoryInZip = directoryInZip;
    }

    /**
     * Writes ZipItems to a Zip file by delegating to {@link DownloadZipHelper#zipFile}.
     *
     * @param zipItems    items to write; must not be {@code null}
     * @param zipFileName name of the zip file to create in the output zip directory
     * @return the list returned by {@code DownloadZipHelper.zipFile}
     *         (presumably the names of the zip files written - confirm against the helper)
     * @throws IOException if the helper fails to write the zip file
     */
    public List<String> writeZipFile(Set<ZipItem> zipItems, String zipFileName) throws IOException {
        // NOTE(review): the trailing 'true' presumably enables splitting large archives
        // into several zip files - confirm against DownloadZipHelper.zipFile.
        return DownloadZipHelper.zipFile(new ArrayList<ZipItem>(zipItems), zipFileName, outputZipDirectory, true);
    }

    /**
     * Downloads the given caArray files into the input directory and builds one
     * {@link ZipItem} per file.
     * <p>
     * A file is only downloaded when {@link DownloadZipHelper#checkIfFileExists} reports
     * that no file with the same name and uncompressed size is present yet. A ZipItem is
     * created for every file, whether it was downloaded now or already existed.
     *
     * @param searchService     not used by this method; kept for interface compatibility
     * @param dataServiceHelper helper used to fetch the file contents
     * @param files             files to download; {@code null} yields an empty result
     * @param zipFileName       zip file name handed to {@link DownloadZipHelper#fileToZip}
     * @return the zip items describing the files; never {@code null}
     * @throws IOException               if downloading or writing a file fails
     * @throws DataTransferException     if the data service fails to transfer a file
     * @throws InvalidReferenceException if a file reference is rejected by the data service
     */
    public Set<ZipItem> downloadFiles(SearchService searchService, DataApiUtils dataServiceHelper,
            List<gov.nih.nci.caarray.external.v1_0.data.File> files, String zipFileName)
            throws IOException, InvalidReferenceException, DataTransferException {
        Set<ZipItem> zipItems = new HashSet<ZipItem>();
        if (files == null) {
            return zipItems;
        }

        logger.debug("downloading " + files.size() + " files");
        for (gov.nih.nci.caarray.external.v1_0.data.File file : files) {
            String fileName = file.getMetadata().getName();
            try {
                // if the file does not already exist then download and write it
                if (!DownloadZipHelper.checkIfFileExists(fileName, file.getMetadata().getUncompressedSize(),
                        inputDirectory)) {
                    downloadToInputDirectory(dataServiceHelper, file, fileName);
                }
            } catch (IOException e) {
                reportError("Error writing temporary file: " + fileName, e);
                throw e;
            }
            zipItems.add(DownloadZipHelper.fileToZip(fileName, zipFileName, directoryInZip, inputDirectory));
        }
        return zipItems;
    }

    /**
     * Fetches the uncompressed contents of one caArray file and writes them to a file
     * named {@code fileName} created through {@link DownloadZipHelper#createFile}.
     */
    private void downloadToInputDirectory(DataApiUtils dataServiceHelper,
            gov.nih.nci.caarray.external.v1_0.data.File file, String fileName)
            throws IOException, InvalidReferenceException, DataTransferException {
        File tempFile = DownloadZipHelper.createFile(fileName, inputDirectory);

        // The contents are buffered in memory first; the stream onto the local file is
        // only opened once the transfer has completed.
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        long startTime = System.currentTimeMillis();
        // 'false': request the contents uncompressed.
        dataServiceHelper.copyFileContentsToOutputStream(file.getReference(), false, buffer);
        long totalTime = System.currentTimeMillis() - startTime;
        byte[] contents = buffer.toByteArray();
        logger.debug("downloaded file: " + fileName);
        logger.debug("Retrieved " + contents.length + " bytes in " + totalTime + " ms.");

        logger.debug("writing file: " + tempFile.getName());
        writeToFile(contents, tempFile);
        logger.debug("wrote file: " + tempFile.getName());
    }

    /**
     * Writes the bytes to the target file and always closes the stream. A failure while
     * closing is only propagated when the write itself succeeded, so that it never hides
     * the original write failure.
     */
    private static void writeToFile(byte[] contents, File target) throws IOException {
        BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(target));
        boolean written = false;
        try {
            bos.write(contents);
            bos.flush();
            written = true;
        } finally {
            try {
                bos.close();
            } catch (IOException closeFailure) {
                if (written) {
                    throw closeFailure;
                }
                logger.warn("Could not close " + target.getName() + " after a failed write", closeFailure);
            }
        }
    }

    /**
     * Tries to find an experiment with a publicIdentifier equal to the given value.
     * If it can't find one, then it tries to find an experiment with a title equal
     * to that value. If it doesn't find one either, it throws an exception.
     * <p>
     * When several experiments match, the situation is reported through
     * {@link #reportError(String, Throwable)} and the first result is used.
     *
     * @param searchService              service used to search for experiments
     * @param experimentPublicIdentifier public identifier (or, as fallback, title) to look for
     * @return reference of the first experiment whose publicIdentifier or title matches
     * @throws IllegalArgumentException     if no experiment matches
     * @throws UnsupportedCategoryException if the search service rejects the criteria
     * @throws InvalidReferenceException    if the search service rejects a reference
     */
    public CaArrayEntityReference findExperiment(SearchService searchService, String experimentPublicIdentifier)
            throws InvalidReferenceException, UnsupportedCategoryException {
        // First attempt: search by public identifier.
        logger.debug("searching by public identifier for: [" + experimentPublicIdentifier + "]");
        ExperimentSearchCriteria criteria = new ExperimentSearchCriteria();
        criteria.setPublicIdentifier(experimentPublicIdentifier);
        List<Experiment> experimentList = searchService.searchForExperiments(criteria, null).getResults();

        if (isNullOrEmpty(experimentList)) {
            // Second attempt: treat the given value as the experiment title.
            logger.debug("searching by name for: [" + experimentPublicIdentifier + "]");
            criteria = new ExperimentSearchCriteria();
            criteria.setTitle(experimentPublicIdentifier);
            experimentList = searchService.searchForExperiments(criteria, null).getResults();
            if (isNullOrEmpty(experimentList)) {
                String message = "Experiment not found: " + experimentPublicIdentifier;
                reportError(message, null);
                throw new IllegalArgumentException(message);
            }
        }

        if (experimentList.size() > 1) {
            StringBuilder buf = new StringBuilder();
            buf.append("Found multiple experiments matching name: ");
            buf.append(experimentPublicIdentifier);
            buf.append("\n");
            for (Experiment candidate : experimentList) {
                buf.append("\t: ");
                buf.append(candidate.getTitle());
                buf.append("\n");
            }
            buf.append("downloading experiment: ");
            buf.append(experimentList.get(0).getTitle());
            reportError(buf.toString(), null);
        }

        // Pick the first result; a title search may legitimately return more than one.
        return experimentList.get(0).getReference();
    }

    /** Returns {@code true} when the list is {@code null} or has no elements. */
    private static boolean isNullOrEmpty(List<?> list) {
        return list == null || list.isEmpty();
    }

    /**
     * Command line entry point. Expects at least four positional arguments and builds a
     * downloader from {@code args[0]} (input directory), {@code args[1]} (output zip
     * directory) and {@code args[3]} (directory in zip). Exits with status 1 when too few
     * arguments are given.
     * <p>
     * NOTE(review): no download is started from here - the method only constructs the
     * downloader. {@code args[2]} is skipped; confirm whether index 3 is intended.
     *
     * @param args positional command line arguments as described above
     */
    public static void main(String[] args) {
        if (args == null || args.length < 4) {
            reportError("Expected at least 4 arguments: <inputDirectory> <outputZipDirectory> <unused> "
                    + "<directoryInZip>", null);
            System.exit(1);
            return;
        }
        new CaArrayFileDownloader(args[0], args[1], args[3]);
    }

    /**
     * Logs the message (and throwable, if any) at error level and echoes
     * {@code "<message>: <throwable message>"} to standard error.
     *
     * @param message text to report; {@code null} is treated as an empty string
     * @param e       cause to report; may be {@code null}
     */
    protected static void reportError(String message, Throwable e) {
        String text = (message == null) ? "" : message;
        logger.error(text, e);
        String causeText = (e == null) ? "" : e.getMessage();
        System.err.println(text + ": " + causeText);
    }

    /**
     * Searches the experiment's samples and keeps those whose name contains one of the
     * given specimen names (after trimming the specimen name).
     *
     * @param searchServiceHelper helper used to run the biomaterial search
     * @param experimentRef       experiment whose samples are searched
     * @param specimenList        specimen names to match; {@code null} or empty yields an
     *                            empty result without contacting the service
     * @return references of the matching samples; never {@code null}
     * @throws RemoteException       if the remote search fails
     * @throws InvalidInputException if the search criteria are rejected
     */
    private Set<CaArrayEntityReference> searchForSamples(SearchApiUtils searchServiceHelper,
            CaArrayEntityReference experimentRef, List<String> specimenList)
            throws RemoteException, InvalidInputException {
        Set<CaArrayEntityReference> sampleRefs = new HashSet<CaArrayEntityReference>();
        if (specimenList == null || specimenList.isEmpty()) {
            return sampleRefs;
        }

        BiomaterialSearchCriteria criteria = new BiomaterialSearchCriteria();
        criteria.setExperiment(experimentRef);
        criteria.getTypes().add(BiomaterialType.SAMPLE);

        List<Biomaterial> samples;
        try {
            samples = searchServiceHelper.biomaterialsByCriteria(criteria).list();
        } catch (RuntimeException e) {
            reportError(e.getMessage(), e);
            throw e;
        }
        if (samples == null) {
            return sampleRefs;
        }

        for (Biomaterial sample : samples) {
            if (sample == null || sample.getName() == null) {
                continue;
            }
            if (matchesAnySpecimen(sample.getName(), specimenList)) {
                logger.debug("matched sample: " + sample.getName());
                sampleRefs.add(sample.getReference());
            }
        }
        return sampleRefs;
    }

    /** Returns {@code true} if the sample name contains any non-null, trimmed specimen name. */
    private static boolean matchesAnySpecimen(String sampleName, List<String> specimenList) {
        for (String specimenName : specimenList) {
            if (specimenName != null && sampleName.contains(specimenName.trim())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Selects the CEL or CHP files that belong to the samples matching the given
     * specimen names.
     *
     * @param searchServiceHelper helper used to run the searches
     * @param experimentRef       experiment to search in
     * @param specimenList        specimen names used to select samples
     * @param type                file type; only types named {@code AFFYMETRIX_CEL} and
     *                            {@code AFFYMETRIX_CHP} are supported
     * @return the matching files, or {@code null} when no sample matched, the type is
     *         {@code null} or unsupported, or no file was found (each case is reported
     *         through {@link #reportError(String, Throwable)})
     * @throws RemoteException       if a remote search fails
     * @throws InvalidInputException if search criteria are rejected
     */
    public List<gov.nih.nci.caarray.external.v1_0.data.File> selectFilesFromSamples(
            SearchApiUtils searchServiceHelper, CaArrayEntityReference experimentRef, List<String> specimenList,
            FileType type) throws RemoteException, InvalidInputException {
        Set<CaArrayEntityReference> sampleRefs = searchForSamples(searchServiceHelper, experimentRef, specimenList);
        if (sampleRefs == null || sampleRefs.isEmpty()) {
            reportError("Could not find the requested samples.", null);
            return null;
        }
        logger.debug("searched for " + sampleRefs.size() + " samples");

        // The type is compared by name rather than by identity.
        String typeName = (type == null) ? null : type.getName();
        List<gov.nih.nci.caarray.external.v1_0.data.File> files;
        if (CEL_TYPE_NAME.equals(typeName)) {
            files = selectCelFilesFromSamples(searchServiceHelper, experimentRef, sampleRefs);
        } else if (CHP_TYPE_NAME.equals(typeName)) {
            files = selectChpFilesFromSamples(searchServiceHelper, experimentRef, sampleRefs);
        } else {
            reportError("Unsupported file type: " + typeName, null);
            return null;
        }

        // The select helpers return null for "nothing found", so check before using the list.
        if (files == null) {
            reportError("Could not find any raw files associated with the given samples.", null);
            return null;
        }
        logger.debug("Got back " + files.size() + " files");
        return files;
    }

    /**
     * Select all raw data files (extension CEL) associated with the given samples.
     *
     * @return the files found, or {@code null} when the search returned no files
     * @throws RemoteException       if the remote search fails
     * @throws InvalidInputException if the search criteria are rejected
     */
    private List<gov.nih.nci.caarray.external.v1_0.data.File> selectCelFilesFromSamples(
            SearchApiUtils searchServiceHelper, CaArrayEntityReference experimentRef,
            Set<CaArrayEntityReference> sampleRefs) throws RemoteException, InvalidInputException {
        return selectFilesByCategory(searchServiceHelper, experimentRef, sampleRefs, FileCategory.RAW_DATA, "CEL");
    }

    /**
     * Select all derived data files (extension CHP) associated with the given samples.
     *
     * @return the files found, or {@code null} when the search returned no files
     * @throws RemoteException       if the remote search fails
     * @throws InvalidInputException if the search criteria are rejected
     */
    private List<gov.nih.nci.caarray.external.v1_0.data.File> selectChpFilesFromSamples(
            SearchApiUtils searchServiceHelper, CaArrayEntityReference experimentRef,
            Set<CaArrayEntityReference> sampleRefs) throws RemoteException, InvalidInputException {
        return selectFilesByCategory(searchServiceHelper, experimentRef, sampleRefs, FileCategory.DERIVED_DATA,
                "CHP");
    }

    /**
     * Searches the experiment for files of the given category and extension that are
     * associated with the given samples. Runtime failures of the search are reported
     * and rethrown.
     *
     * @return the files found, or {@code null} when the search returned an empty or null list
     */
    private List<gov.nih.nci.caarray.external.v1_0.data.File> selectFilesByCategory(
            SearchApiUtils searchServiceHelper, CaArrayEntityReference experimentRef,
            Set<CaArrayEntityReference> sampleRefs, FileCategory category, String extension)
            throws RemoteException, InvalidInputException {
        FileSearchCriteria fileSearchCriteria = new FileSearchCriteria();
        fileSearchCriteria.setExperiment(experimentRef);
        fileSearchCriteria.setExperimentGraphNodes(sampleRefs);
        fileSearchCriteria.getCategories().add(category);
        fileSearchCriteria.setExtension(extension);

        List<gov.nih.nci.caarray.external.v1_0.data.File> files;
        try {
            files = searchServiceHelper.filesByCriteria(fileSearchCriteria).list();
        } catch (RuntimeException e) {
            reportError(e.getMessage(), e);
            throw e;
        }

        // Callers treat null as "nothing found", so an empty list is normalised to null.
        if (files != null && files.isEmpty()) {
            return null;
        }
        return files;
    }

}