org.icgc.dcc.portal.manifest.ManifestService.java Source code

Java tutorial

Introduction

Here is the source code for org.icgc.dcc.portal.manifest.ManifestService.java

Source

/*
 * Copyright (c) 2015 The Ontario Institute for Cancer Research. All rights reserved.
 *
 * This program and the accompanying materials are made available under the terms of the GNU Public License v3.0.
 * You should have received a copy of the GNU General Public License along with
 * this program. If not, see <http://www.gnu.org/licenses/>.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
 * SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
 * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package org.icgc.dcc.portal.manifest;

import static com.google.common.collect.Ordering.explicit;
import static com.google.common.collect.Ordering.natural;
import static com.sun.jersey.core.header.ContentDisposition.type;
import static java.util.Comparator.comparing;
import static java.util.stream.Collectors.groupingBy;
import static java.util.stream.Collectors.joining;
import static org.dcc.portal.pql.meta.Type.FILE;
import static org.icgc.dcc.common.core.json.Jackson.DEFAULT;
import static org.icgc.dcc.common.core.util.Joiners.COMMA;
import static org.icgc.dcc.common.core.util.Joiners.DOT;
import static org.icgc.dcc.common.core.util.function.Predicates.distinctByKey;
import static org.icgc.dcc.portal.model.EntitySetDefinition.SortOrder.DESCENDING;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Comparator;
import java.util.List;
import java.util.Set;
import java.util.SortedSet;
import java.util.UUID;
import java.util.regex.Pattern;
import java.util.stream.Stream;

import org.elasticsearch.action.search.SearchResponse;
import org.icgc.dcc.portal.config.PortalProperties;
import org.icgc.dcc.portal.manifest.model.Manifest;
import org.icgc.dcc.portal.manifest.model.ManifestField;
import org.icgc.dcc.portal.manifest.model.ManifestFile;
import org.icgc.dcc.portal.manifest.model.ManifestFormat;
import org.icgc.dcc.portal.manifest.writer.EGAManifestWriter;
import org.icgc.dcc.portal.manifest.writer.GDCManifestWriter;
import org.icgc.dcc.portal.manifest.writer.GNOSManifestWriter;
import org.icgc.dcc.portal.manifest.writer.GenericManifestWriter;
import org.icgc.dcc.portal.manifest.writer.ICGCManifestWriter;
import org.icgc.dcc.portal.manifest.writer.PDCManifestWriter;
import org.icgc.dcc.portal.model.BaseEntitySet.Type;
import org.icgc.dcc.portal.model.EntitySetDefinition;
import org.icgc.dcc.portal.model.Query;
import org.icgc.dcc.portal.model.Repository;
import org.icgc.dcc.portal.pql.convert.Jql2PqlConverter;
import org.icgc.dcc.portal.repository.FileRepository;
import org.icgc.dcc.portal.repository.ManifestRepository;
import org.icgc.dcc.portal.repository.RepositoryRepository;
import org.icgc.dcc.portal.service.EntitySetService;
import org.icgc.dcc.portal.service.NotFoundException;
import org.icgc.dcc.portal.util.MultiPartOutputStream;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import com.google.common.base.Stopwatch;
import com.google.common.collect.ImmutableSortedSet;
import com.google.common.collect.ListMultimap;
import com.google.common.collect.Multimaps;
import com.google.common.collect.Ordering;
import com.sun.jersey.multipart.file.DefaultMediaTypePredictor;

import lombok.Cleanup;
import lombok.NonNull;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.val;
import lombok.extern.slf4j.Slf4j;

@Slf4j
@Service
@RequiredArgsConstructor(onConstructor = @__({ @Autowired }))
public class ManifestService {

    /**
     * Constants.
     */
    private static final int BUFFER_SIZE = 1024 * 100;
    private static final String FILE_NAME_PREFIX = "manifest";
    private static final String MANIFEST_SET_NAME = "Saved manifest";

    /**
     * Configuration.
     */
    @NonNull
    private final PortalProperties properties;

    /**
     * Dependencies.
     */
    @NonNull
    private final RepositoryRepository repositories;
    @NonNull
    private final ManifestRepository manifestRepository;
    @NonNull
    private final FileRepository fileRepository;
    @NonNull
    private final EntitySetService entitySetService;

    public String getFileName(@NonNull Manifest manifest) {
        val repoCode = manifest.getRepos().size() == 1 ? manifest.getRepos().get(0) : null;

        val timestamp = manifest.getTimestamp();
        if (manifest.getFormat() == ManifestFormat.TARBALL) {
            // Archive
            return FILE_NAME_PREFIX + "." + timestamp + ".tar.gz";
        } else if (repoCode != null) {
            // Single repo
            val repo = repositories.findOne(repoCode);
            return formatFileName(repo, timestamp);
        } else {
            // Concatenated manifest
            return FILE_NAME_PREFIX + ".concatenated." + timestamp + ".txt";
        }
    }

    @NonNull
    public Manifest getManifest(@NonNull UUID manifestId) {
        val manifest = manifestRepository.find(manifestId);
        if (manifest == null) {
            throw new NotFoundException(manifestId.toString(), "manifest");
        }

        return manifest;
    }

    @SneakyThrows
    public void saveManifest(@NonNull Manifest manifest) {
        val dataVersion = properties.getRelease().getDataVersion();

        val entitySetDefinition = new EntitySetDefinition(DEFAULT.writeValueAsString(manifest.getFilters()), "id",
                DESCENDING, MANIFEST_SET_NAME, COMMA.join(manifest.getRepos()), Type.FILE, 200000, false);

        val entitySet = entitySetService.createFileEntitySet(entitySetDefinition);

        manifest.setId(entitySet.getId());
        manifest.setVersion(dataVersion);

        manifestRepository.save(manifest, dataVersion);
    }

    public void generateManifests(@NonNull ManifestContext context) throws IOException {
        // Get requested file copies
        val watch = Stopwatch.createStarted();

        log.info("Finding manifest files...");
        val searchResult = findFiles(context.getQuery());
        log.info("Read manifest files in {}", watch);

        try {
            switch (context.getManifest().getFormat()) {
            case TARBALL:
                generateManifestArchive(searchResult, context);
                break;
            case FILES:
                generateManifestFiles(searchResult, context);
                break;
            case JSON:
                generateManifestJSON(searchResult, context);
                break;
            }
        } catch (Exception e) {
            log.error("Error generating manifests: ", e);
            throw e;
        }

        log.info("Finsished creating manifest in {}", watch);
    }

    public void generateManifestArchive(SearchResponse searchResult, ManifestContext context) throws IOException {
        @Cleanup
        val archive = new ManifestArchive(context.getOutput());
        val timestamp = context.getManifest().getTimestamp();

        // Write a manifest for each repository in turn
        log.info("Writing manifest archive...");
        eachRepository(context, searchResult, (repo, bundles) -> {
            ByteArrayOutputStream fileContents = new ByteArrayOutputStream(BUFFER_SIZE);
            writeManifest(repo, timestamp, bundles, fileContents);

            String fileName = formatFileName(repo, timestamp);
            archive.addManifest(fileName, fileContents);
        });
    }

    private void generateManifestFiles(SearchResponse searchResult, ManifestContext context) throws IOException {
        val timestamp = context.getManifest().getTimestamp();
        if (context.getManifest().isMultipart()) {
            val boundary = "boundary_" + timestamp;
            val output = new MultiPartOutputStream(boundary, context.getOutput());

            eachRepository(context, searchResult, (repo, bundles) -> {
                String fileName = formatFileName(repo, timestamp);
                String fileType = DefaultMediaTypePredictor.getInstance().getMediaTypeFromFileName(fileName)
                        .toString();
                output.startPart(fileType, new String[] {
                        "ContentDisposition: " + type("attachment").fileName(fileName).build().toString() });
                writeManifest(repo, timestamp, bundles, output);
            });
        } else {
            val output = context.getOutput();
            eachRepository(context, searchResult, (repo, bundles) -> {
                writeManifest(repo, timestamp, bundles, output);
            });
        }
    }

    private void generateManifestJSON(SearchResponse searchResult, ManifestContext context) throws IOException {
        val output = context.getOutput();
        val manifest = context.getManifest();
        val generator = DEFAULT.getFactory().createGenerator(output);
        val fields = manifest.getFields();
        val files = fields.contains(ManifestField.ID) || fields.contains(ManifestField.MD5SUM)
                || fields.contains(ManifestField.SIZE);

        // This is too big to fit in a {@link Manifest} so we stream instead
        generator.writeStartObject();
        if (manifest.getId() != null) {
            generator.writeStringField("id", manifest.getId().toString());
        }
        generator.writeObjectField("repos", manifest.getRepos());
        generator.writeNumberField("timestamp", manifest.getTimestamp());
        generator.writeNumberField("version", manifest.getVersion());
        generator.writeObjectField("filters", manifest.getFilters());
        generator.writeObjectField("fields", manifest.getFields());
        generator.writeObjectField("format", manifest.getFormat());
        generator.writeBooleanField("unique", manifest.isUnique());
        generator.writeBooleanField("multipart", manifest.isMultipart());
        generator.writeFieldName("entries");

        generator.writeStartArray();
        eachRepository(context, searchResult, (repo, bundles) -> {
            generator.writeStartObject();
            generator.writeStringField("repo", repo.getCode());

            // Files
            if (files) {
                generator.writeArrayFieldStart("files");
                for (ManifestFile file : bundles.values()) {
                    generator.writeStartObject();
                    if (fields.contains(ManifestField.ID)) {
                        generator.writeStringField(ManifestField.ID.getKey(), file.getId());
                    }
                    if (fields.contains(ManifestField.MD5SUM)) {
                        generator.writeStringField(ManifestField.MD5SUM.getKey(), file.getMd5sum());
                    }
                    if (fields.contains(ManifestField.REPOFILEID)) {
                        generator.writeStringField(ManifestField.REPOFILEID.getKey(), file.getRepoFileId());
                    }
                    if (fields.contains(ManifestField.SIZE)) {
                        generator.writeNumberField(ManifestField.SIZE.getKey(), file.getSize());
                    }
                    generator.writeEndObject();
                }
                generator.writeEndArray();
            }

            // Contents
            if (fields.contains(ManifestField.CONTENT)) {
                ByteArrayOutputStream fileContents = new ByteArrayOutputStream(BUFFER_SIZE);
                writeManifest(repo, manifest.getTimestamp(), bundles, fileContents);

                generator.writeBinaryField(ManifestField.CONTENT.getKey(), fileContents.toByteArray());
            }
            generator.writeEndObject();
        });
        generator.writeEndArray();
        generator.writeEndObject();

        generator.flush();
    }

    private void eachRepository(ManifestContext context, SearchResponse searchResult, BundlesCallback callback)
            throws IOException {
        // Map and filter
        Stream<ManifestFile> files = new ManifestMapper(repositories.findAll()).map(searchResult)
                .filter(file -> context.isActive(file.getRepoCode()));

        if (context.getManifest().isUnique()) {
            // Remove duplicates by file id by choosing the one with the higest priority
            files = files
                    .sorted(fileIdOrder().thenComparing(priorityFileCopyOrder(context.getManifest().getRepos())))
                    .filter(distinctByKey(file -> file.getId()));
        }

        // Group files from each repo together
        val repoCodeFiles = files.collect(groupingBy(file -> file.getRepoCode()));

        // Iterate in order of priority
        for (val repoCode : prioritizeRepoCodes(context.getManifest().getRepos(), repoCodeFiles.keySet())) {
            val repo = repositories.findOne(repoCode);
            val repoFiles = repoCodeFiles.get(repoCode);

            // Index
            val bundles = Multimaps.index(repoFiles, file -> formatFileURL(repo, file));

            // Hand off
            callback.handle(repo, bundles);
        }
    }

    private SearchResponse findFiles(Query query) {
        val converter = Jql2PqlConverter.getInstance();
        val pql = converter.convert(query, FILE);
        log.debug("Received JQL: '{}'; converted to PQL: '{}'.", query.getFilters(), pql);

        return fileRepository.findFileInfoPQL(pql);
    }

    private static void writeManifest(Repository repo, long timestamp,
            ListMultimap<String, ManifestFile> downloadUrlGroups, OutputStream out) {
        if (repo.isGNOS()) {
            GNOSManifestWriter.write(out, downloadUrlGroups, timestamp);
        } else if (repo.isS3()) {
            ICGCManifestWriter.write(out, downloadUrlGroups);
        } else if (repo.isGDC()) {
            GDCManifestWriter.write(out, downloadUrlGroups);
        } else if (repo.isPDC()) {
            PDCManifestWriter.write(out, downloadUrlGroups);
        } else if (repo.isEGA()) {
            EGAManifestWriter.write(out, downloadUrlGroups);
        } else {
            // e.g TCGA
            GenericManifestWriter.write(out, downloadUrlGroups);
        }
    }

    private static Comparator<ManifestFile> fileIdOrder() {
        return comparing(file -> file.getId());
    }

    private static SortedSet<String> prioritizeRepoCodes(List<String> priorities, Set<String> repoCodes) {
        val order = priorityRepoOrder(priorities);
        return ImmutableSortedSet.orderedBy(order).addAll(repoCodes).build();
    }

    private static Comparator<ManifestFile> priorityFileCopyOrder(List<String> priorities) {
        val order = priorityRepoOrder(priorities);
        return comparing(file -> file.getRepoCode(), order);
    }

    private static Ordering<String> priorityRepoOrder(List<String> priorities) {
        val all = priorities.isEmpty();
        return all ? natural() : explicit(priorities);
    }

    private static String formatFileName(Repository repo, long timestamp) {
        val ext = repo.isGNOS() ? "xml" : repo.isEGA() || repo.isPDC() ? "sh" : "tsv";
        return DOT.join(FILE_NAME_PREFIX, repo.getCode(), timestamp, ext);
    }

    private static String formatFileURL(String... parts) {
        return Stream.of(parts).map(part -> part.replaceAll("^/+|/+$", "")).collect(joining("/"));
    }

    private static String formatFileURL(@NonNull Repository repo, @NonNull ManifestFile file) {
        if (repo.isGNOS()) {
            return formatFileURL(file.getRepoBaseUrl(), file.getRepoDataPath(), file.getDataBundleId());
        } else if (repo.isGDC()) {
            return file.getRepoFileId();
        } else if (repo.isPDC()) {
            return formatFileURL(file.getRepoBaseUrl(), file.getRepoDataPath());
        } else {
            return formatFileURL(file.getRepoBaseUrl(), file.getRepoDataPath(), file.getName());
        }
    }

    /**
     * Callback for processing a grouped set of files by url (a.k.a a bundle).
     */
    private interface BundlesCallback {

        void handle(Repository repo, ListMultimap<String, ManifestFile> bundles) throws IOException;

    }

}