org.icgc.dcc.portal.repository.FileRepository.java Source code

Java tutorial

Introduction

Here is the source code for org.icgc.dcc.portal.repository.FileRepository.java

Source

/*
 * Copyright (c) 2015 The Ontario Institute for Cancer Research. All rights reserved.                             
 *                                                                                                               
 * This program and the accompanying materials are made available under the terms of the GNU Public License v3.0.
 * You should have received a copy of the GNU General Public License along with                                  
 * this program. If not, see <http://www.gnu.org/licenses/>.                                                     
 *                                                                                                               
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY                           
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES                          
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT                           
 * SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                                
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED                          
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;                               
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER                              
 * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN                         
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package org.icgc.dcc.portal.repository;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.collect.Lists.newArrayList;
import static com.google.common.collect.Lists.transform;
import static com.google.common.collect.Sets.newHashSet;
import static com.google.common.math.LongMath.divide;
import static java.lang.String.format;
import static java.math.RoundingMode.CEILING;
import static java.util.stream.Collectors.toMap;
import static java.util.stream.IntStream.range;
import static org.apache.commons.collections.CollectionUtils.isEmpty;
import static org.dcc.portal.pql.ast.function.FunctionBuilders.limit;
import static org.dcc.portal.pql.ast.function.FunctionBuilders.select;
import static org.dcc.portal.pql.ast.function.FunctionBuilders.sortBuilder;
import static org.dcc.portal.pql.meta.FileTypeModel.AVAILABLE_FACETS;
import static org.dcc.portal.pql.meta.Type.FILE;
import static org.dcc.portal.pql.query.PqlParser.parse;
import static org.elasticsearch.action.search.SearchType.COUNT;
import static org.elasticsearch.action.search.SearchType.QUERY_THEN_FETCH;
import static org.elasticsearch.action.search.SearchType.SCAN;
import static org.elasticsearch.index.query.FilterBuilders.boolFilter;
import static org.elasticsearch.index.query.FilterBuilders.matchAllFilter;
import static org.elasticsearch.index.query.FilterBuilders.missingFilter;
import static org.elasticsearch.index.query.FilterBuilders.nestedFilter;
import static org.elasticsearch.index.query.FilterBuilders.termFilter;
import static org.elasticsearch.index.query.FilterBuilders.termsFilter;
import static org.elasticsearch.index.query.QueryBuilders.matchAllQuery;
import static org.elasticsearch.index.query.QueryBuilders.multiMatchQuery;
import static org.elasticsearch.search.aggregations.AggregationBuilders.avg;
import static org.elasticsearch.search.aggregations.AggregationBuilders.filter;
import static org.elasticsearch.search.aggregations.AggregationBuilders.missing;
import static org.elasticsearch.search.aggregations.AggregationBuilders.nested;
import static org.elasticsearch.search.aggregations.AggregationBuilders.reverseNested;
import static org.elasticsearch.search.aggregations.AggregationBuilders.sum;
import static org.elasticsearch.search.aggregations.AggregationBuilders.terms;
import static org.icgc.dcc.common.core.util.stream.Collectors.toImmutableList;
import static org.icgc.dcc.portal.model.IndexModel.FIELDS_MAPPING;
import static org.icgc.dcc.portal.model.IndexModel.IS;
import static org.icgc.dcc.portal.model.IndexModel.MAX_FACET_TERM_COUNT;
import static org.icgc.dcc.portal.model.IndexModel.MISSING;
import static org.icgc.dcc.portal.model.SearchFieldMapper.searchFieldMapper;
import static org.icgc.dcc.portal.model.TermFacet.repoTermFacet;
import static org.icgc.dcc.portal.pql.convert.FiltersConverter.ENTITY_SET_ID;
import static org.icgc.dcc.portal.pql.convert.FiltersConverter.ENTITY_SET_PREFIX;
import static org.icgc.dcc.portal.repository.TermsLookupRepository.createTermsLookupFilter;
import static org.icgc.dcc.portal.repository.TermsLookupRepository.TermLookupType.DONOR_IDS;
import static org.icgc.dcc.portal.repository.TermsLookupRepository.TermLookupType.FILE_IDS;
import static org.icgc.dcc.portal.util.ElasticsearchResponseUtils.checkResponseState;
import static org.icgc.dcc.portal.util.JsonUtils.merge;
import static org.icgc.dcc.portal.util.SearchResponses.getHitIds;
import static org.icgc.dcc.portal.util.SearchResponses.getTotalHitCount;

import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.function.BiFunction;
import java.util.function.Consumer;

import org.dcc.portal.pql.ast.StatementNode;
import org.dcc.portal.pql.ast.function.SelectNode;
import org.dcc.portal.pql.ast.function.SortNode;
import org.dcc.portal.pql.meta.IndexModel;
import org.dcc.portal.pql.meta.FileTypeModel.EsFields;
import org.dcc.portal.pql.meta.FileTypeModel.Fields;
import org.dcc.portal.pql.meta.TypeModel;
import org.dcc.portal.pql.query.QueryEngine;
import org.elasticsearch.action.get.GetResponse;
import org.elasticsearch.action.search.MultiSearchResponse.Item;
import org.elasticsearch.action.search.SearchRequestBuilder;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.search.SearchType;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.index.query.BoolFilterBuilder;
import org.elasticsearch.index.query.FilterBuilder;
import org.elasticsearch.index.query.FilteredQueryBuilder;
import org.elasticsearch.index.query.NestedFilterBuilder;
import org.elasticsearch.search.aggregations.AbstractAggregationBuilder;
import org.elasticsearch.search.aggregations.AggregationBuilder;
import org.elasticsearch.search.aggregations.Aggregations;
import org.elasticsearch.search.aggregations.bucket.SingleBucketAggregation;
import org.elasticsearch.search.aggregations.bucket.filter.Filter;
import org.elasticsearch.search.aggregations.bucket.filter.FilterAggregationBuilder;
import org.elasticsearch.search.aggregations.bucket.missing.Missing;
import org.elasticsearch.search.aggregations.bucket.missing.MissingBuilder;
import org.elasticsearch.search.aggregations.bucket.nested.Nested;
import org.elasticsearch.search.aggregations.bucket.nested.NestedBuilder;
import org.elasticsearch.search.aggregations.bucket.terms.Terms;
import org.elasticsearch.search.aggregations.bucket.terms.Terms.Bucket;
import org.elasticsearch.search.aggregations.bucket.terms.TermsBuilder;
import org.elasticsearch.search.aggregations.metrics.avg.Avg;
import org.elasticsearch.search.aggregations.metrics.sum.Sum;
import org.icgc.dcc.portal.model.IndexModel.Kind;
import org.icgc.dcc.portal.model.IndexModel.Type;
import org.icgc.dcc.portal.model.Query;
import org.icgc.dcc.portal.model.SearchFieldMapper;
import org.icgc.dcc.portal.model.TermFacet;
import org.icgc.dcc.portal.model.TermFacet.Term;
import org.icgc.dcc.portal.model.param.FiltersParam;
import org.icgc.dcc.portal.pql.convert.Jql2PqlConverter;
import org.icgc.dcc.portal.repository.TermsLookupRepository.TermLookupType;
import org.icgc.dcc.portal.service.BadRequestException;
import org.icgc.dcc.portal.service.IndexService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import com.fasterxml.jackson.databind.node.ObjectNode;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.common.primitives.Ints;

import lombok.NonNull;
import lombok.SneakyThrows;
import lombok.Value;
import lombok.val;
import lombok.extern.slf4j.Slf4j;

@Slf4j
@Component
public class FileRepository {

    /**
     * Constants
     */
    private static final Set<String> FILE_DONOR_FIELDS = newHashSet("specimen_id", "sample_id",
            "submitted_specimen_id", "submitted_sample_id", "id", "submitted_donor_id", "tcga_participant_barcode",
            "tcga_sample_barcode", "tcga_aliquot_barcode");
    private static final SearchFieldMapper FILE_DONOR_TEXT_FIELDS = searchFieldMapper()
            .partialMatchFields(FILE_DONOR_FIELDS).lowercaseMatchFields(FILE_DONOR_FIELDS).build();
    private static final SelectNode FILE_INFO_FIELDS = select(ImmutableList.of(Fields.FILE_UUID, Fields.FILE_ID,
            Fields.STUDY, Fields.DATA_BUNDLE_ID, Fields.FILE_COPIES, Fields.DONORS));
    private static final SortNode FILE_INFO_SORT = sortBuilder().sortAsc(Fields.REPO_TYPE).build();

    private static final Kind KIND = Kind.FILE;
    private static final TypeModel TYPE_MODEL = IndexModel.getFileTypeModel();
    private static final String PREFIX = TYPE_MODEL.prefix();
    private static final String FILE_INDEX_TYPE = FILE.getId();
    private static final String FILE_DONOR_TEXT_INDEX_TYPE = Type.FILE_DONOR_TEXT.getId();

    private static final String DONOR_ID_RAW_FIELD_NAME = toRawFieldName(Fields.DONOR_ID);
    private static final Jql2PqlConverter PQL_CONVERTER = Jql2PqlConverter.getInstance();
    private static final Map<String, String> JQL_FIELD_NAME_MAPPING = FIELDS_MAPPING.get(KIND);

    private static final TimeValue KEEP_ALIVE = new TimeValue(10000);

    /**
     * Dependencies.
     */
    private final Client client;
    private final String repoIndexName;
    private final QueryEngine queryEngine;
    private final IndexService indexService;

    @Autowired
    public FileRepository(Client client,
            @NonNull @org.springframework.beans.factory.annotation.Value("#{repoIndexName}") String repoIndexName,
            IndexService indexService) {
        this.client = client;
        this.repoIndexName = repoIndexName;
        this.queryEngine = new QueryEngine(client, repoIndexName);
        this.indexService = indexService;
    }

    public Map<String, String> findRepos() {
        // TODO: Return a list of {@link Repository}s instead?
        val repoName = toRawFieldName(Fields.REPO_NAME);
        val repoCode = toRawFieldName(Fields.REPO_CODE);
        val repoNameSubAgg = terms(repoName).field(repoName);
        val repoCodeSubAgg = nestedAgg(repoCode, EsFields.FILE_COPIES,
                terms(repoCode).size(100).field(repoCode).subAggregation(repoNameSubAgg));
        val response = searchFileCentric("findRepos",
                request -> request.setSearchType(COUNT).addAggregation(repoCodeSubAgg));
        val terms = (Terms) getSubAggResultFromNested(response.getAggregations(), repoCode).get(repoCode);

        return terms.getBuckets().stream().collect(toMap(bucket -> bucket.getKey(), bucket -> {
            final List<Bucket> repoNameBuckets = termsBuckets(bucket.getAggregations(), repoName);

            return isEmpty(repoNameBuckets) ? "" : repoNameBuckets.get(0).getKey();
        }));
    }

    /**
     * @param fields - A list of field names that form the search query.
     * @param queryString - User input - could be any value out of one of the fields.
     * @return
     */
    public SearchResponse findRepoDonor(Iterable<String> fields, String queryString) {
        val maxNumberOfDocs = 5;
        String[] fieldNames = Iterables.toArray(FILE_DONOR_TEXT_FIELDS.map(fields), String.class);
        val result = searchFileDonorText("findRepoDonor", request -> request.setSearchType(QUERY_THEN_FETCH)
                .setFrom(0).setSize(maxNumberOfDocs).setQuery(multiMatchQuery(queryString, fieldNames)));

        log.debug("findRepoDonor - ES search result is: '{}'.", result);
        return result;
    }

    public GetResponse findOne(@NonNull String id) {
        val search = client.prepareGet(repoIndexName, FILE_INDEX_TYPE, id);
        val response = search.execute().actionGet();
        // This check is important as it validates if there is any document at all in the GET response.
        checkResponseState(id, response, KIND);

        return response;
    }

    public SearchResponse findAll(@NonNull Query query) {
        val queryFilter = query.getFilters();
        val filters = repoFilters(queryFilter);

        val response = searchFileCentric("findAll()", request -> {
            request.setSearchType(QUERY_THEN_FETCH).setFrom(query.getFrom()).setSize(query.getSize())
                    .addSort(JQL_FIELD_NAME_MAPPING.get(query.getSort()), query.getOrder()).setPostFilter(filters);

            aggs(queryFilter).stream().forEach(agg -> request.addAggregation(agg));
        });

        log.debug("findAll() - ES response is: '{}'.", response);
        return response;
    }

    public SearchResponse findAll(Query query, final String[] fields) {
        val filters = repoFilters(query.getFilters());

        return searchFileCentric(request -> request.setPostFilter(filters).setQuery(matchAllQuery()), fields);
    }

    // FIXME: Support terms lookup on files as part of the filter builder so we don't need an extra method.
    public SearchResponse findAll(String setId, final String[] fields) {
        val query = fileSetIdQuery(setId);

        return searchFileCentric(request -> request.setQuery(query), fields);
    }

    public Set<String> findAllDonorIds(@NonNull Query query, final int setLimit) {
        val pqlAst = parse(PQL_CONVERTER.convert(query, FILE));
        val size = query.getSize();
        int pageNumber = 0;

        SearchResponse response = findDonorIdsPQL(pqlAst, pageNumber, size);

        val result = Sets.<String>newHashSet();
        val pageCount = divide(getTotalHitCount(response), size, CEILING);

        // Number of files > max limit, so we must page files in order to ensure we get all donors.
        while (pageNumber <= pageCount) {
            for (val hit : response.getHits()) {
                val donorIdField = hit.field(DONOR_ID_RAW_FIELD_NAME);

                if (null == donorIdField) {
                    // Skips when donorId doesn't appear in the fields.
                    log.warn("The Donors array in this document (id: {}) is empty, which is not valid.",
                            hit.getId());
                    continue;
                }

                val donorIds = donorIdField.getValues();
                result.addAll(transform(donorIds, id -> id.toString()));

                if (result.size() >= setLimit) {
                    return result;
                }
            }

            response = findDonorIdsPQL(pqlAst, ++pageNumber, size);
        }

        return result;
    }

    public List<String> findAllFileIds(Query query) {
        val queryFilter = query.getFilters();
        val filters = repoFilters(queryFilter);

        val response = searchFileCentric("Files Ids from Query", (request) -> request
                .setSearchType(QUERY_THEN_FETCH).setFrom(query.getFrom()).setSize(query.getSize())
                .addSort(JQL_FIELD_NAME_MAPPING.get(query.getSort()), query.getOrder()).setPostFilter(filters));

        return getHitIds(response);
    }

    public SearchResponse findFileInfo(String setId) {
        val pqlAst = parse("select(*)");
        pqlAst.setSelect(FILE_INFO_FIELDS);

        val response = searchFileCentricPQL("Donor Info From Set Id", pqlAst,
                request -> request.setFrom(0).setSize(20000).setQuery(fileSetIdQuery(setId)));

        log.debug("ES response is: {}", response);
        return response;
    }

    public SearchResponse findFileInfoPQL(@NonNull final String pql) {
        val pqlAst = parse(pql);
        pqlAst.setSelect(FILE_INFO_FIELDS);
        pqlAst.setSort(FILE_INFO_SORT);

        log.debug("PQL for download is: '{}'.", pqlAst.toString());

        // Get the total count first.
        val count = getTotalHitCount(findFileInfoPQL(pqlAst, COUNT, 0));
        log.debug("A total of {} files will be returned from this query.", count);

        return findFileInfoPQL(pqlAst, QUERY_THEN_FETCH, Ints.saturatedCast(count));
    }

    /**
     * Get total file size, total donor count and total number of files based on query
     */
    public Map<String, Long> getSummary(Query query) {
        val donorSubAggs = donorIdAgg(SummaryAggregationKeys.DONOR)
                .subAggregation(
                        terms(SummaryAggregationKeys.PROJECT).size(1000).field(toRawFieldName(Fields.PROJECT_CODE)))
                .subAggregation(terms(SummaryAggregationKeys.PRIMARY_SITE).size(1000)
                        .field(toRawFieldName(Fields.PRIMARY_SITE)));

        val fileSizeSubAgg = averageFileSizePerFileCopyAgg(SummaryAggregationKeys.FILE);
        val filters = repoFilters(query.getFilters());

        val response = searchFileCentric("Summary aggregation", request -> request.setSearchType(COUNT)
                .setQuery(filteredQuery(filters)).addAggregation(fileSizeSubAgg).addAggregation(donorSubAggs));

        log.debug("getSummary aggregation result is: '{}'.", response);

        val aggResult = response.getAggregations();
        val totalFileSize = sumFileCopySize(termsBuckets(aggResult, SummaryAggregationKeys.FILE),
                SummaryAggregationKeys.FILE);
        val donorAggResult = getSubAggResultFromNested(aggResult, SummaryAggregationKeys.DONOR);

        return ImmutableMap.<String, Long>of("fileCount", getTotalHitCount(response), "totalFileSize",
                (long) totalFileSize, "donorCount", (long) bucketSize(donorAggResult, SummaryAggregationKeys.DONOR),
                "projectCount", (long) bucketSize(donorAggResult, SummaryAggregationKeys.PROJECT),
                "primarySiteCount", (long) bucketSize(donorAggResult, SummaryAggregationKeys.PRIMARY_SITE));
    }

    /**
     * Returns the unique donor count across repositories Note we are counting the bucket size of a term aggregation. It
     * appears that using cardinality aggregation yields imprecise result.
     */
    public long getDonorCount(Query query) {
        val aggKey = "donorCount";
        val filters = repoFilters(query.getFilters());

        val response = searchFileCentric("Donor Count aggregation", request -> request.setSearchType(COUNT)
                .setQuery(filteredQuery(filters)).addAggregation(donorIdAgg(aggKey)));

        log.debug("getDonorCount aggregation result is: '{}'.", response);

        return bucketSize(getSubAggResultFromNested(response.getAggregations(), aggKey), aggKey);
    }

    public Map<String, Map<String, Map<String, Object>>> getRepoStats(String repoName) {
        val aggsFilter = nestedFilter(EsFields.FILE_COPIES, termFilter(toRawFieldName(Fields.REPO_CODE), repoName));
        val response = findFileStats(aggsFilter, repoName);

        return convertStats(response.getAggregations(), repoName);
    }

    public Map<String, Map<String, Map<String, Object>>> getStudyStats(String study) {
        val aggsFilter = termFilter(toRawFieldName(Fields.STUDY), study);
        val response = findFileStats(aggsFilter, study);

        return convertStats(response.getAggregations(), study);
    }

    public Map<String, TermFacet> getAggregationFacets(Query query, Aggregations aggs) {
        val result = Maps.<String, TermFacet>newHashMap();

        for (val agg : aggs) {
            val name = agg.getName();
            val aggregations = ((Filter) agg).getAggregations();

            if (name.equals(CustomAggregationKeys.REPO_SIZE)) {
                val nestedAgg = getSubAggResultFromNested(aggregations, name);
                val buckets = ((Terms) nestedAgg.get(name)).getBuckets();

                result.put(CustomAggregationKeys.REPO_SIZE, convertRepoSizeAggregation(buckets));
                result.put(CustomAggregationKeys.REPO_DONOR_COUNT, searchGroupByRepoNameDonorId(buckets, query));
            } else if (name.equals(CustomAggregationKeys.REPO_NAME)) {
                val nestedAgg = getSubAggResultFromNested(aggregations, name);

                result.put(name, convertNormalAggregation(nestedAgg, name));
            } else if (name.equals(Fields.FILE_FORMAT)) {
                result.put(name, convertFileFormatAggregation(aggregations, name));
            } else {
                result.put(name, convertNormalAggregation(aggregations, name));
            }
        }

        log.debug("Result of convertAggregationsToFacets is: '{}'.", result);
        return result;
    }

    @SneakyThrows
    public Map<String, String> getIndexMetaData() {
        // TODO: Should all of this really be returned? What is acutally required and by whom?
        return indexService.getIndexMetaData(client, repoIndexName);
    }

    public SearchResponse prepareSearchScroll(@NonNull String scrollId) {
        return client.prepareSearchScroll(scrollId).setScroll(KEEP_ALIVE).execute().actionGet();
    }

    private static String toRawFieldName(@NonNull String alias) {
        return TYPE_MODEL.getField(alias);
    }

    private SearchResponse findFileStats(FilterBuilder filter, String aggName) {
        val fileSizeAgg = averageFileSizePerFileCopyAgg(StatsAggregationKeys.SIZE);

        val fileFormatAgg = nestedAgg(StatsAggregationKeys.FORMAT, EsFields.FILE_COPIES,
                terms(StatsAggregationKeys.FORMAT).field(toRawFieldName(Fields.FILE_FORMAT)));

        val dataTypeAgg = terms(aggName).field(toRawFieldName(Fields.DATA_TYPE))
                .subAggregation(donorIdAgg(StatsAggregationKeys.DONOR)).subAggregation(fileSizeAgg)
                .subAggregation(fileFormatAgg);

        // Primary Site => Project Code => Donor ID
        val primarySiteAgg = primarySiteAgg(Fields.PRIMARY_SITE, 100).subAggregation(
                primarySiteAgg(Fields.PROJECT_CODE, 100).subAggregation(primarySiteAgg(Fields.DONOR_ID, 30000)));

        val statsAgg = filter(aggName).filter(filter).subAggregation(dataTypeAgg).subAggregation(
                nestedAgg(StatsAggregationKeys.DONOR_PRIMARY_SITE, EsFields.DONORS, primarySiteAgg));

        val response = searchFileCentric("findFileStats",
                request -> request.setSearchType(COUNT).addAggregation(statsAgg));

        log.debug("findFileStats - ES response is: {}", response);

        return response;
    }

    private SearchResponse findFileInfoPQL(StatementNode pqlAst, SearchType searchType, int size) {
        val response = searchFileCentricPQL("findFileInfoPQL", pqlAst,
                request -> request.setSearchType(searchType).setSize(size));

        log.debug("findFileInfoPQL - ES response is: {}", response);
        return response;
    }

    private SearchResponse findDonorIdsPQL(@NonNull StatementNode pqlAst, int pageNumber, int size) {
        pqlAst.setLimit(limit(pageNumber * size, size));

        val response = searchFileCentricPQL("findDonorIdsPQL", pqlAst, request -> {
        });

        log.debug("findDonorIdsPQL - ES response is: '{}'.", response);
        return response;
    }

    // Special aggregation to get unique donor count for each repository
    private TermFacet searchGroupByRepoNameDonorId(List<Bucket> buckets, Query query) {
        if (isEmpty(buckets)) {
            return repoTermFacet(0L, 0, ImmutableList.of());
        }

        val repoNames = transform(buckets, bucket -> bucket.getKey());

        val donorAggKey = CustomAggregationKeys.REPO_DONOR_COUNT;
        val repoFilterTemplate = "{file: {repoName: {is: [\"%s\"]}}}";
        val userFilter = query.getFilters();

        val multiSearch = client.prepareMultiSearch();

        for (val repoName : repoNames) {
            val repoFilter = new FiltersParam(format(repoFilterTemplate, repoName));
            val mergedFilter = merge(userFilter, repoFilter.get());
            val filters = repoFilters(mergedFilter);

            val oneSearch = client.prepareSearch(repoIndexName).setSearchType(COUNT)
                    .setQuery(filteredQuery(filters)).addAggregation(donorIdAgg(donorAggKey));

            multiSearch.add(oneSearch);
        }

        val response = multiSearch.execute().actionGet();
        Item[] responseItems = response.getResponses();
        val responseItemCount = responseItems.length;

        val donorResult = range(0, responseItemCount).boxed().map(i -> {
            final Aggregations aggResult = responseItems[i].getResponse().getAggregations();
            final int donorCount = bucketSize(getSubAggResultFromNested(aggResult, donorAggKey), donorAggKey);

            return new Term(repoNames.get(i), Long.valueOf(donorCount));
        }).collect(toImmutableList());

        // Total does not have any meaning in this context because a donor can cross multiple repositories.
        val total = -1L;
        return repoTermFacet(total, 0, donorResult);
    }

    private SearchResponse searchFileCentric(String logMessage, Consumer<SearchRequestBuilder> customizer) {
        return searchFiles(FILE_INDEX_TYPE, logMessage, customizer);
    }

    private SearchResponse searchFileCentricPQL(String logMessage, StatementNode pqlAst,
            Consumer<SearchRequestBuilder> customizer) {
        val request = queryEngine.execute(pqlAst, FILE).getRequestBuilder();
        customizer.accept(request);

        log.debug(logMessage + "; ES query is: '{}'", request);
        return request.execute().actionGet();
    }

    private SearchResponse searchFileCentric(Consumer<SearchRequestBuilder> queryCustomizer, String[] fields) {
        val size = 5000;

        return searchFileCentric("Preparing data table export", request -> {
            request.setSearchType(SCAN).setSize(size).setScroll(KEEP_ALIVE).addFields(fields);

            queryCustomizer.accept(request);
        });
    }

    private SearchResponse searchFileDonorText(String logMessage, Consumer<SearchRequestBuilder> customizer) {
        return searchFiles(FILE_DONOR_TEXT_INDEX_TYPE, logMessage, customizer);
    }

    private SearchResponse searchFiles(String indexType, String logMessage,
            Consumer<SearchRequestBuilder> customizer) {
        val request = client.prepareSearch(repoIndexName).setTypes(indexType);
        customizer.accept(request);

        log.debug(logMessage + "; ES query is: '{}'", request);
        return request.execute().actionGet();
    }

    private static boolean isNestedField(String fieldAlias) {
        return TYPE_MODEL.isAliasDefined(fieldAlias) && TYPE_MODEL.isNested(fieldAlias);
    }

    /**
     * Converters.
     */

    // Special aggregation to get file size for each repository
    private static TermFacet convertRepoSizeAggregation(List<Bucket> buckets) {
        val termsBuilder = ImmutableList.<Term>builder();
        long total = 0;

        for (val bucket : buckets) {
            val childCount = sumValue(bucket.getAggregations(), CustomAggregationKeys.FILE_SIZE);

            termsBuilder.add(new Term(bucket.getKey(), (long) childCount));
            total += childCount;
        }

        val result = repoTermFacet(total, 0, termsBuilder.build());

        log.debug("Result of convertRepoSizeAggregation is: {}", result);
        return result;
    }

    private static TermFacet convertNormalAggregation(Aggregations aggregations, String name) {
        return convertNormalAggregation(aggregations, name, (bucket, notUsed) -> bucket.getDocCount());
    }

    private static TermFacet convertFileFormatAggregation(Aggregations aggregations, String name) {
        return convertNormalAggregation(aggregations, name,
                (bucket, aggKey) -> ((SingleBucketAggregation) bucket.getAggregations().get(aggKey)).getDocCount());
    }

    private static TermFacet convertNormalAggregation(Aggregations aggregations, String name,
            BiFunction<Bucket, String, Long> docCountGetter) {
        val termsAgg = isNestedField(name) ? getSubAggResultFromNested(aggregations, name) : aggregations;
        val aggResult = (Terms) termsAgg.get(name);
        val termsBuilder = ImmutableList.<Term>builder();
        long total = 0;

        for (val bucket : aggResult.getBuckets()) {
            val bucketKey = bucket.getKey();
            val count = docCountGetter.apply(bucket, name);
            log.debug("convertNormalAggregation bucketKey: {}, count: {}", bucketKey, count);

            total += count;
            termsBuilder.add(new Term(bucketKey, count));
        }

        val missingAgg = (Missing) termsAgg.get(MISSING);
        val missingCount = missingAgg.getDocCount();
        log.debug("convertNormalAggregation Missing count is: {}", missingCount);

        // No need to return a term with a value of 0.
        if (missingCount > 0) {
            termsBuilder.add(new Term(MISSING, missingCount));
        }

        return repoTermFacet(total, missingCount, termsBuilder.build());
    }

    private static Map<String, Map<String, Map<String, Object>>> convertStats(Aggregations aggs, String aggName) {
        val stats = (Filter) aggs.get(aggName);
        val statsAggregations = stats.getAggregations();

        val result = Maps.<String, Map<String, Map<String, Object>>>newHashMap();

        // donorPrimarySite
        val donorPrimarySite = Maps.<String, Map<String, Object>>newHashMap();
        val primarySiteAggKey = StatsAggregationKeys.DONOR_PRIMARY_SITE;
        val donorFacets = (Terms) getSubAggResultFromNested(statsAggregations, primarySiteAggKey)
                .get(primarySiteAggKey);

        for (val bucket : donorFacets.getBuckets()) {
            val projectFacets = (Terms) bucket.getAggregations().get(primarySiteAggKey);
            val newEntries = projectFacets.getBuckets().stream().collect(toMap(project -> project.getKey(),
                    project -> bucketSize(project.getAggregations(), primarySiteAggKey)));

            val name = bucket.getKey();
            val map = donorPrimarySite.getOrDefault(name, Maps.<String, Object>newHashMap());
            map.putAll(newEntries);

            donorPrimarySite.putIfAbsent(name, map);
        }

        result.put(primarySiteAggKey, donorPrimarySite);

        // statistics
        val statistics = Maps.<String, Map<String, Object>>newHashMap();
        val datatypes = (Terms) statsAggregations.get(aggName);

        for (val bucket : datatypes.getBuckets()) {
            val bucketAggregations = bucket.getAggregations();
            val donorCount = bucketSize(getSubAggResultFromNested(bucketAggregations, StatsAggregationKeys.DONOR),
                    StatsAggregationKeys.DONOR);

            val fileSizeResult = (Terms) bucketAggregations.get(StatsAggregationKeys.SIZE);
            val totalFileSize = sumFileCopySize(fileSizeResult.getBuckets(), StatsAggregationKeys.SIZE);

            val dataFormat = (Terms) getSubAggResultFromNested(bucketAggregations, StatsAggregationKeys.FORMAT)
                    .get(StatsAggregationKeys.FORMAT);
            val formats = transform(dataFormat.getBuckets(), b -> b.getKey());

            // TODO: We should use StatsAggregationKeys for these keys too, though it requires changes in the client side.
            val map = ImmutableMap.<String, Object>of("fileCount", bucket.getDocCount(), "donorCount", donorCount,
                    "fileSize", totalFileSize, "dataFormat", formats);

            statistics.put(bucket.getKey(), map);
        }

        result.put("stats", statistics);

        log.debug("Result {}", result);
        return result;
    }

    private static FilteredQueryBuilder filteredQuery(FilterBuilder filters) {
        return new FilteredQueryBuilder(matchAllQuery(), filters);
    }

    private static FilteredQueryBuilder fileSetIdQuery(String setId) {
        val lookupFilter = createTermsLookupFilter(toRawFieldName(Fields.ID), FILE_IDS, UUID.fromString(setId));
        return filteredQuery(lookupFilter);
    }

    /**
     * FIXME: This is a temporary solution. We really should use the PQL infrastructure to build. <br>
     * Negation is not supported <br>
     * _missing is not supported for data_types.datatype and data_type.dataformat <br>
     */
    private static FilterBuilder repoFilters(final ObjectNode filters) {
        val fields = filters.path(PREFIX).fields();

        if (!fields.hasNext()) {
            // If there is no filter defined under "file", return a match-all filter.
            return matchAllFilter();
        }

        val result = boolFilter();

        // Used for creating the terms lookup filter when ENTITY_SET_ID and donorId are in the JQL.
        FilterBuilder entitySetIdFilter = null;
        BoolFilterBuilder donorIdFilter = null;

        while (fields.hasNext()) {
            val facetField = fields.next();
            val fieldAlias = facetField.getKey();

            checkArgument(JQL_FIELD_NAME_MAPPING.containsKey(fieldAlias),
                    "'%s' is not a valid field in this query.", fieldAlias);

            val facetValue = facetField.getValue().path(IS);
            if (facetValue.isMissingNode()) {
                throw new BadRequestException(format("Expected '%s' in filter for %s.", "is", facetField.getKey()));
            }
            val filterValues = transform(newArrayList(facetValue), item -> item.textValue());

            if (fieldAlias.equals(ENTITY_SET_ID)) {
                // The assumption here is there should be only one "entitySetId" filter in JQL.
                entitySetIdFilter = entitySetIdFilter(filterValues);
            } else if (fieldAlias.equals(Fields.ID)) {
                // Prepare for processing two types of file ids
                val uuids = Lists.<String>newArrayList();
                val ids = Lists.<String>newArrayList();

                // Partition and parse
                for (val id : filterValues) {
                    if (id.startsWith(ENTITY_SET_PREFIX)) {
                        val uuid = id.substring(ENTITY_SET_PREFIX.length());
                        uuids.add(uuid);
                    } else {
                        ids.add(id);
                    }
                }

                // Normal ids
                BoolFilterBuilder idFilter = null;
                if (!ids.isEmpty()) {
                    idFilter = missingInclusiveTermsFilter(fieldAlias, ids);
                }

                // Entity set ids
                BoolFilterBuilder uuidFilter = null;
                if (!uuids.isEmpty()) {
                    uuidFilter = boolFilter();
                    for (val uuid : uuids) {
                        uuidFilter.should(createTermsLookupFilter(toRawFieldName(fieldAlias),
                                TermLookupType.FILE_IDS, UUID.fromString(uuid)));
                    }
                }

                // Combine
                if (idFilter != null && uuidFilter != null) {
                    result.should(idFilter).should(uuidFilter);
                } else if (idFilter != null) {
                    result.must(idFilter);
                } else if (uuidFilter != null) {
                    result.must(uuidFilter);
                }
            } else {
                val filter = missingInclusiveTermsFilter(fieldAlias, filterValues);

                if (fieldAlias.equals(Fields.DONOR_ID)) {
                    // The assumption here is there should be only one "donorId" filter in JQL.
                    donorIdFilter = filter;
                } else {
                    result.must(filter);
                }
            }
        }

        // Creates the terms lookup filter when both ENTITY_SET_ID and donorId are in the JQL.
        if (null != donorIdFilter && null != entitySetIdFilter) {
            result.must(boolFilter().should(donorIdFilter).should(entitySetIdFilter));
        } else if (null != donorIdFilter) {
            result.must(donorIdFilter);
        } else if (null != entitySetIdFilter) {
            result.must(entitySetIdFilter);
        }

        return result;
    }

    private static FilterBuilder entitySetIdFilter(List<String> uuids) {
        if (isEmpty(uuids)) {
            return null;
        }

        if (1 == uuids.size()) {
            return nestedEntitySetIdFilter(uuids.get(0));
        }

        val result = boolFilter();

        for (val uuid : uuids) {
            result.should(nestedEntitySetIdFilter(uuid));
        }

        return result;
    }

    private static FilterBuilder selfRemovingFilter(final ObjectNode filters, String facetAlias) {
        if (!filters.fieldNames().hasNext()) {
            return matchAllFilter();
        }

        val facetFilters = filters.deepCopy();

        if (facetFilters.has(PREFIX)) {
            // Remove the facet itself from the "file" filter.
            facetFilters.with(PREFIX).remove(facetAlias);
        }

        return repoFilters(facetFilters);
    }

    private static NestedFilterBuilder nestedEntitySetIdFilter(String uuid) {
        return nestedFilter(EsFields.DONORS,
                createTermsLookupFilter(DONOR_ID_RAW_FIELD_NAME, DONOR_IDS, UUID.fromString(uuid)));
    }

    private static BoolFilterBuilder missingInclusiveTermsFilter(String fieldAlias, List<String> filterValues) {
        val rawFieldName = toRawFieldName(fieldAlias);
        val result = boolFilter();
        val terms = termsFilter(rawFieldName, filterValues);

        // Special processing for "no data" terms
        if (filterValues.remove(MISSING)) {
            val missing = missingFilter(rawFieldName).existence(true).nullValue(true);
            result.should(missing).should(terms);
        } else {
            result.must(terms);
        }

        return isNestedField(fieldAlias)
                ? boolFilter().must(nestedFilter(TYPE_MODEL.getNestedPath(fieldAlias), result))
                : result;
    }

    private static NestedBuilder nestedAgg(String aggName, String path, AbstractAggregationBuilder... subAggs) {
        val result = nested(aggName).path(path);

        for (val subAgg : subAggs) {
            result.subAggregation(subAgg);
        }

        return result;
    }

    private static NestedBuilder donorIdAgg(String aggKey) {
        return nestedAgg(aggKey, EsFields.DONORS, terms(aggKey).size(100000).field(DONOR_ID_RAW_FIELD_NAME));
    }

    private static List<AggregationBuilder<?>> aggs(final ObjectNode filters) {
        val regularUiFacets = transform(AVAILABLE_FACETS, facet -> {
            final String rawFieldName = toRawFieldName(facet);
            final FilterAggregationBuilder filterAgg = filter(facet).filter(selfRemovingFilter(filters, facet));

            if (facet.equals(Fields.FILE_FORMAT)) {
                return addReverseNestedTermsAgg(filterAgg, facet, rawFieldName, TYPE_MODEL.getNestedPath(facet));
            } else if (isNestedField(facet)) {
                return filterAgg
                        .subAggregation(nestedTermsAgg(facet, rawFieldName, TYPE_MODEL.getNestedPath(facet)));
            } else {
                return addSubTermsAgg(filterAgg, facet, rawFieldName);
            }
        });
        val result = ImmutableList.<AggregationBuilder<?>>builder().addAll(regularUiFacets);

        /*
         * Facets that aren't visible in the UI, mostly used by the Manifest Download modal dialog. These use special
         * filters, which do not exclude self.
         */
        val repoFilters = repoFilters(filters.deepCopy());
        val repoNameFieldName = toRawFieldName(Fields.REPO_NAME);

        // repositoryNamesFiltered - file count
        val repoNameAggKey = CustomAggregationKeys.REPO_NAME;
        val filterAgg = filter(repoNameAggKey).filter(repoFilters);
        val repoNameSubAgg = filterAgg
                .subAggregation(nestedTermsAgg(repoNameAggKey, repoNameFieldName, EsFields.FILE_COPIES));
        result.add(repoNameSubAgg);

        // repositorySize
        val repoSizeAggKey = CustomAggregationKeys.REPO_SIZE;
        val repoSizeTermsSubAgg = terms(repoSizeAggKey).size(MAX_FACET_TERM_COUNT).field(repoNameFieldName)
                .subAggregation(sum(CustomAggregationKeys.FILE_SIZE).field(toRawFieldName(Fields.FILE_SIZE)));
        val repoSizeSubAgg = filter(repoSizeAggKey).filter(repoFilters)
                .subAggregation(nestedAgg(repoSizeAggKey, EsFields.FILE_COPIES, repoSizeTermsSubAgg));

        result.add(repoSizeSubAgg);

        return result.build();
    }

    private static NestedBuilder nestedTermsAgg(String aggregationKey, String fieldName, String path) {
        val agg = TermsMissingAggPair.from(aggregationKey, fieldName);

        return nestedAgg(aggregationKey, path, agg.terms, agg.missing);
    }

    private static FilterAggregationBuilder addReverseNestedTermsAgg(FilterAggregationBuilder builder,
            String aggregationKey, String fieldName, String path) {
        val agg = TermsMissingAggPair.from(aggregationKey, fieldName);
        val reverseNestedAgg = agg.terms.subAggregation(reverseNested(aggregationKey));
        val nestedAgg = nestedAgg(aggregationKey, path, reverseNestedAgg, agg.missing);

        return builder.subAggregation(nestedAgg);
    }

    private static FilterAggregationBuilder addSubTermsAgg(FilterAggregationBuilder builder, String aggregationKey,
            String fieldName) {
        val agg = TermsMissingAggPair.from(aggregationKey, fieldName);

        return builder.subAggregation(agg.terms).subAggregation(agg.missing);
    }

    private static TermsBuilder primarySiteAgg(@NonNull String fieldAlias, int size) {
        return terms(StatsAggregationKeys.DONOR_PRIMARY_SITE).field(toRawFieldName(fieldAlias)).size(size);
    }

    private static TermsBuilder averageFileSizePerFileCopyAgg(@NonNull String aggName) {
        return terms(aggName).size(100000).field(toRawFieldName(Fields.FILE_ID)).subAggregation(
                nestedAgg(aggName, EsFields.FILE_COPIES, avg(aggName).field(toRawFieldName(Fields.FILE_SIZE))));
    }

    /**
     * Aggregation utilites
     */

    private static Aggregations getSubAggResultFromNested(Aggregations nestedAggs, String aggKey) {
        return ((Nested) nestedAggs.get(aggKey)).getAggregations();
    }

    private static List<Bucket> termsBuckets(Aggregations aggResult, String aggKey) {
        return ((Terms) aggResult.get(aggKey)).getBuckets();
    }

    private static int bucketSize(Aggregations aggResult, String aggKey) {
        return termsBuckets(aggResult, aggKey).size();
    }

    private static double sumValue(Aggregations aggResult, String name) {
        return ((Sum) aggResult.get(name)).getValue();
    }

    private static double averageValue(Aggregations aggResult, String name) {
        return ((Avg) aggResult.get(name)).getValue();
    }

    private static double sumFileCopySize(List<Bucket> buckets, String aggregationKey) {
        return buckets.stream().mapToDouble(bucket -> averageValue(
                getSubAggResultFromNested(bucket.getAggregations(), aggregationKey), aggregationKey)).sum();
    }

    private static class CustomAggregationKeys {

        public static final String FILE_SIZE = "fileSize";
        public static final String REPO_SIZE = "repositorySizes";
        public static final String REPO_NAME = "repositoryNamesFiltered";
        public static final String REPO_DONOR_COUNT = "repositoryDonors";

    }

    private static class SummaryAggregationKeys {

        public static final String FILE = "file";
        public static final String DONOR = "donor";
        public static final String PROJECT = "project";
        public static final String PRIMARY_SITE = "primarySite";

    }

    private static class StatsAggregationKeys {

        public static final String DONOR = "donor";
        public static final String SIZE = "size";
        public static final String FORMAT = "format";
        public static final String DONOR_PRIMARY_SITE = "donorPrimarySite";

    }

    @Value
    private static class TermsMissingAggPair {

        TermsBuilder terms;
        MissingBuilder missing;

        public static TermsMissingAggPair from(String aggregationKey, String fieldName) {
            val termsAgg = terms(aggregationKey).field(fieldName).size(MAX_FACET_TERM_COUNT);
            val missingAgg = missing(MISSING).field(fieldName);

            return new TermsMissingAggPair(termsAgg, missingAgg);
        }

    }

}