com.epam.catgenome.manager.vcf.VcfManager.java Source code

Java tutorial

Introduction

Here is the source code for com.epam.catgenome.manager.vcf.VcfManager.java

Source

/*
 * MIT License
 *
 * Copyright (c) 2016 EPAM Systems
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

package com.epam.catgenome.manager.vcf;

import static com.epam.catgenome.component.MessageHelper.getMessage;
import static com.epam.catgenome.constant.MessagesConstants.ERROR_REGISTER_FILE;
import static com.epam.catgenome.constant.MessagesConstants.ERROR_VCF_ID_INVALID;
import static com.epam.catgenome.constant.MessagesConstants.ERROR_VCF_INDEX;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

import com.epam.catgenome.dao.index.FeatureIndexDao;
import com.epam.catgenome.util.InfoFieldParser;
import htsjdk.variant.vcf.VCFCodec;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import htsjdk.variant.vcf.VCFSimpleHeaderLine;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.codehaus.jettison.json.JSONException;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Propagation;
import org.springframework.transaction.annotation.Transactional;
import org.springframework.util.Assert;

import com.epam.catgenome.constant.MessagesConstants;
import com.epam.catgenome.controller.vo.ga4gh.CallSet;
import com.epam.catgenome.controller.vo.ga4gh.CallSetSearch;
import com.epam.catgenome.controller.vo.registration.FeatureIndexedFileRegistrationRequest;
import com.epam.catgenome.controller.vo.registration.IndexedFileRegistrationRequest;
import com.epam.catgenome.entity.BaseEntity;
import com.epam.catgenome.entity.BiologicalDataItem;
import com.epam.catgenome.entity.BiologicalDataItemFormat;
import com.epam.catgenome.entity.BiologicalDataItemResourceType;
import com.epam.catgenome.entity.gene.GeneFile;
import com.epam.catgenome.entity.index.VcfIndexEntry;
import com.epam.catgenome.entity.reference.Chromosome;
import com.epam.catgenome.entity.reference.Reference;
import com.epam.catgenome.entity.track.Track;
import com.epam.catgenome.entity.track.TrackType;
import com.epam.catgenome.entity.vcf.InfoItem;
import com.epam.catgenome.entity.vcf.Variation;
import com.epam.catgenome.entity.vcf.VariationQuery;
import com.epam.catgenome.entity.vcf.VcfFile;
import com.epam.catgenome.entity.vcf.VcfFilterInfo;
import com.epam.catgenome.entity.vcf.VcfSample;
import com.epam.catgenome.exception.ExternalDbUnavailableException;
import com.epam.catgenome.exception.FeatureFileReadingException;
import com.epam.catgenome.exception.FeatureIndexException;
import com.epam.catgenome.exception.GeneReadingException;
import com.epam.catgenome.exception.RegistrationException;
import com.epam.catgenome.exception.VcfReadingException;
import com.epam.catgenome.manager.BiologicalDataItemManager;
import com.epam.catgenome.manager.DownloadFileManager;
import com.epam.catgenome.manager.FeatureIndexManager;
import com.epam.catgenome.manager.FileManager;
import com.epam.catgenome.manager.TrackHelper;
import com.epam.catgenome.manager.externaldb.HttpDataManager;
import com.epam.catgenome.manager.reference.ReferenceGenomeManager;
import com.epam.catgenome.manager.vcf.reader.AbstractVcfReader;
import com.epam.catgenome.manager.vcf.reader.VcfFileReader;
import com.epam.catgenome.manager.vcf.reader.VcfGa4ghReader;
import com.epam.catgenome.manager.vcf.reader.VcfReader;
import com.epam.catgenome.util.AuthUtils;
import com.epam.catgenome.util.IOHelper;
import com.epam.catgenome.util.Utils;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.tribble.AbstractFeatureReader;
import htsjdk.tribble.FeatureReader;
import htsjdk.tribble.TribbleException;
import htsjdk.variant.variantcontext.VariantContext;

/**
 * {@code VcfManager} represents a service class designed to encapsulate all business
 * logic operations required to manage {@code VcfFile} and corresponded tracks, e.g. to process
 * variants uploads, position-based and/or zoom queries etc.
 */
@Service
public class VcfManager {

    @Autowired
    private FileManager fileManager;

    @Autowired
    private VcfFileManager vcfFileManager;

    @Autowired
    private ReferenceGenomeManager referenceGenomeManager;

    @Autowired
    private BiologicalDataItemManager biologicalDataItemManager;

    @Autowired
    private TrackHelper trackHelper;

    @Autowired
    private HttpDataManager httpDataManager;

    @Autowired
    private DownloadFileManager downloadFileManager;

    @Autowired
    private FeatureIndexManager featureIndexManager;

    public static final double HTSJDK_WRONG_QUALITY = -10.0;

    @Value("#{catgenome['vcf.filter.whitelist']}")
    private String[] whiteArray;
    private List<String> whiteList;

    @Value("${vcf.extended.info.patterns}")
    private String extendedInfoTemplates;

    private InfoFieldParser infoFieldParser;

    private static final Logger LOGGER = LoggerFactory.getLogger(VcfManager.class);

    /**
     * Registers a VCF file in the system to make it available to browse. Creates Tribble/Tabix index if absent
     * and a feature index to allow fast search for variations
     *
     * @param request a request for file registration, containing path to file, reference ID and optional parameters:
     *                path to index file and vcf file name to save in the system
     * @return a {@code VcfFile} that was registered
     */
    public VcfFile registerVcfFile(FeatureIndexedFileRegistrationRequest request) {
        final String requestPath = request.getPath();
        Assert.isTrue(StringUtils.isNotBlank(requestPath), getMessage(MessagesConstants.ERROR_NULL_PARAM, "path"));
        Assert.notNull(request.getReferenceId(), getMessage(MessagesConstants.ERROR_NULL_PARAM, "referenceId"));

        VcfFile vcfFile;
        Reference reference = referenceGenomeManager.loadReferenceGenome(request.getReferenceId());
        Map<String, Chromosome> chromosomeMap = reference.getChromosomes().stream()
                .collect(Collectors.toMap(BaseEntity::getName, chromosome -> chromosome));
        if (request.getType() == null) {
            request.setType(BiologicalDataItemResourceType.FILE);
        }
        switch (request.getType()) {
        case GA4GH:
            vcfFile = getVcfFileFromGA4GH(request, requestPath);
            break;
        case FILE:
            vcfFile = createVcfFromFile(request, chromosomeMap, reference, request.isDoIndex());
            break;
        case DOWNLOAD:
            vcfFile = downloadVcfFile(request, requestPath, chromosomeMap, reference, request.isDoIndex());
            break;
        case URL:
            vcfFile = createVcfFromUrl(request, chromosomeMap, reference);
            break;
        default:
            throw new IllegalArgumentException(getMessage(MessagesConstants.ERROR_INVALID_PARAM));
        }
        return vcfFile;
    }

    /**
     * Delete vcf file metadata from database and feature file directory.
     *
     * @param vcfFileId id of file to remove
     * @return deleted file
     * @throws IOException if error occurred during deleting feature file directory
     */
    @Transactional(propagation = Propagation.REQUIRED)
    public VcfFile unregisterVcfFile(final Long vcfFileId) throws IOException {
        Assert.notNull(vcfFileId, MessagesConstants.ERROR_INVALID_PARAM);
        Assert.isTrue(vcfFileId > 0, MessagesConstants.ERROR_INVALID_PARAM);
        VcfFile vcfFile = vcfFileManager.loadVcfFile(vcfFileId);
        Assert.notNull(vcfFile, MessagesConstants.ERROR_NO_SUCH_FILE);
        vcfFileManager.deleteVcfFile(vcfFile);
        if (vcfFile.getType() == BiologicalDataItemResourceType.GA4GH) {
            return vcfFile;
        }
        fileManager.deleteFeatureFileDirectory(vcfFile);
        return vcfFile;
    }

    /**
     * Loads variations for a specified track, for a specified sample
     *
     * @param track    a {@code Track} to load variations for
     * @param sampleId specifies sample to load variations for
     * @param loadInfo specifies if extended info should be loaded
     * @param collapse flag determines if variations should be collapsed on small scale
     * @return a {@code Track} with variations
     */
    public Track<Variation> loadVariations(final Track<Variation> track, final Long sampleId, boolean loadInfo,
            final boolean collapse) throws VcfReadingException {
        Chromosome chromosome = trackHelper.validateTrack(track);

        final VcfFile vcfFile = vcfFileManager.loadVcfFile(track.getId());
        Assert.notNull(vcfFile, getMessage(ERROR_VCF_ID_INVALID, track.getId()));
        final Integer sampleIndex = getSampleIndex(sampleId, vcfFile);
        Assert.notNull(vcfFile.getIndex(), getMessage(ERROR_VCF_INDEX, track.getId()));
        if (track.getType() == null) {
            track.setType(TrackType.VCF);
        }

        AbstractVcfReader.createVcfReader(vcfFile.getType(), httpDataManager, fileManager, referenceGenomeManager)
                .readVariations(vcfFile, track, chromosome, sampleIndex, loadInfo, collapse);

        return track;
    }

    /**
     * Loads variations for a specified track, for a specified sample
     *
     * @param track    a {@code Track} to load variations for
     * @param sampleIndex specifies sample to load variations for
     * @param fileUrl URL of VCF file resource
     * @param indexUrl URL of VCF index resource
     * @param loadInfo specifies if extended info should be loaded
     * @param collapse flag determines if variations should be collapsed on small scale
     * @return a {@code Track} with variations
     */
    public Track<Variation> loadVariations(final Track<Variation> track, String fileUrl, String indexUrl,
            final Integer sampleIndex, final boolean loadInfo, final boolean collapse) throws VcfReadingException {
        Chromosome chromosome = trackHelper.validateUrlTrack(track, fileUrl, indexUrl);

        VcfFile notRegisteredFile = makeTemporaryVcfFileFromUrl(fileUrl, indexUrl, chromosome);

        if (track.getType() == null) {
            track.setType(TrackType.VCF);
        }

        AbstractVcfReader.createVcfReader(BiologicalDataItemResourceType.URL, httpDataManager, fileManager,
                referenceGenomeManager).readVariations(notRegisteredFile, track, chromosome,
                        sampleIndex != null ? sampleIndex : 0, loadInfo, collapse);
        return track;
    }

    /**
     * Loads a single variation with extended info
     *
     * @param query {@code VariationQuery}, defining variation to load
     * @return desired {@code Variation} from VCF file
     */
    public Variation loadVariation(final VariationQuery query) throws FeatureFileReadingException {
        // converts query to a simple track corresponded to a single nucleotide position, where a particular
        // variation should be presented

        final Track<Variation> track = new Track<>();
        track.setScaleFactor(1D);
        track.setId(query.getId());
        track.setEndIndex(query.getPosition());
        track.setStartIndex(query.getPosition());
        track.setChromosome(referenceGenomeManager.loadChromosome(query.getChromosomeId()));

        // tries to load variation
        loadVariations(track, query.getSampleId(), true, false);
        Assert.notEmpty(track.getBlocks(),
                getMessage(MessagesConstants.ERROR_NO_SUCH_VARIATION, query.getPosition()));
        Variation variation = track.getBlocks().get(0);
        extendInfoFields(variation);
        VcfFile vcfFile = vcfFileManager.loadVcfFile(query.getId());
        Reference reference = referenceGenomeManager.loadReferenceGenome(vcfFile.getReferenceId());
        if (reference.getGeneFile() != null) {
            Set<String> geneIds = featureIndexManager.fetchGeneIds(variation.getStartIndex(),
                    variation.getEndIndex(), Collections.singletonList(reference.getGeneFile()),
                    track.getChromosome());
            variation.setGeneNames(geneIds);
        }

        return variation;
    }

    /**
     * Loads a single variation with extended info
     *
     * @param query {@code VariationQuery}, defining variation to load
     * @param fileUrl URL of VCF file resource
     * @param indexUrl URL of VCF index resource
     * @return desired {@code Variation} from VCF file
     */
    public Variation loadVariation(final VariationQuery query, String fileUrl, String indexUrl)
            throws FeatureFileReadingException {
        // converts query to a simple track corresponded to a single nucleotide position, where a particular
        // variation should be presented

        final Track<Variation> track = new Track<>();
        track.setScaleFactor(1D);
        track.setEndIndex(query.getPosition());
        track.setStartIndex(query.getPosition());
        track.setChromosome(new Chromosome(query.getChromosomeId()));

        // tries to load variation
        loadVariations(track, fileUrl, indexUrl,
                query.getSampleId() != null ? query.getSampleId().intValue() : null, true, false);
        Assert.notEmpty(track.getBlocks(),
                getMessage(MessagesConstants.ERROR_NO_SUCH_VARIATION, query.getPosition()));
        Variation variation = track.getBlocks().get(0);
        extendInfoFields(variation);
        Reference reference = referenceGenomeManager.loadReferenceGenome(track.getChromosome().getReferenceId());
        if (reference.getGeneFile() != null) {
            Set<String> geneIds = featureIndexManager.fetchGeneIds(variation.getStartIndex(),
                    variation.getEndIndex(), Collections.singletonList(reference.getGeneFile()),
                    track.getChromosome());
            variation.setGeneNames(geneIds);
        }

        return variation;
    }

    /**
     * Returns next/previous variation of the specified chromosome in specified VCF file
     *
     * @param fromPosition {@code int} the position from which look for next/previous variation
     * @param vcfFileId    {@code int} ID of the VCF file
     * @param chromosomeId {@code int} ID of the chromosome
     * @param sampleId     {@code Integer} ID of the desired sample to search, can be null
     * @param forward      {@code boolean} flag that determines direction to look for feature
     * @return {@code Gene} next or previous feature
     */
    public Variation getNextOrPreviousVariation(final int fromPosition, final Long vcfFileId, final Long sampleId,
            final long chromosomeId, final boolean forward, String fileUrl, String indexUrl)
            throws VcfReadingException {
        final Chromosome chromosome = referenceGenomeManager.loadChromosome(chromosomeId);
        Assert.isTrue(vcfFileId != null || (StringUtils.isNotBlank(fileUrl) && StringUtils.isNotBlank(indexUrl)),
                getMessage(MessagesConstants.ERROR_NULL_PARAM));
        Assert.notNull(chromosome, getMessage(MessagesConstants.ERROR_CHROMOSOME_ID_NOT_FOUND));

        int end = forward ? chromosome.getSize() : 0;
        if ((forward && fromPosition + 1 >= end) || (!forward && fromPosition - 1 <= end)) { // no next features
            return null;
        }

        VcfFile vcfFile;
        if (vcfFileId != null) {
            vcfFile = vcfFileManager.loadVcfFile(vcfFileId);
            Assert.notNull(vcfFile, getMessage(ERROR_VCF_ID_INVALID, vcfFileId));
            Assert.notNull(vcfFile.getIndex(), getMessage(ERROR_VCF_INDEX, vcfFileId));
        } else {
            vcfFile = makeTemporaryVcfFileFromUrl(fileUrl, indexUrl, chromosome);
        }

        VcfReader vcfReader = AbstractVcfReader.createVcfReader(vcfFile.getType(), httpDataManager, fileManager,
                referenceGenomeManager);
        Integer sampleIndex = getSampleIndex(sampleId, vcfFile);
        return vcfReader.getNextOrPreviousVariation(fromPosition, vcfFile, sampleIndex, chromosome, forward);
    }

    /**
     * Loads VCF FILTER and INFO data for a {@code Collection} of VCF files
     * @param vcfFileIds {@code Collection} specifies VCF files of interest
     * @return  VCF FILTER and INFO data
     * @throws IOException if an error with file system occurred
     */
    public VcfFilterInfo getFiltersInfo(Collection<Long> vcfFileIds) throws IOException {
        VcfFilterInfo filterInfo = new VcfFilterInfo();
        Map<String, InfoItem> infoItems = new HashMap<>();
        Set<String> availableFilters = new HashSet<>();

        for (Long fileId : vcfFileIds) {
            VcfFile vcfFile = vcfFileManager.loadVcfFile(fileId);
            Assert.notNull(vcfFile, getMessage(ERROR_VCF_ID_INVALID, fileId));

            try (FeatureReader<VariantContext> reader = AbstractFeatureReader.getFeatureReader(vcfFile.getPath(),
                    new VCFCodec(), false)) {
                VCFHeader header = (VCFHeader) reader.getHeader();
                Collection<VCFInfoHeaderLine> headerLines = header.getInfoHeaderLines();
                infoItems.putAll(headerLines.stream().filter(l -> !isExtendedInfoLine(l.getDescription())) // Exclude ANN from fields,
                        .map(InfoItem::new) // we don't need it in the index
                        .collect(Collectors.toMap(InfoItem::getName, i -> i)));
                availableFilters.addAll(header.getFilterLines().stream().map(VCFSimpleHeaderLine::getID)
                        .collect(Collectors.toList()));
            }

        }

        List<String> filtersWhiteList = getFilterWhiteList();
        if (!filtersWhiteList.isEmpty()) {
            infoItems = scourFilterList(infoItems, filtersWhiteList);
        }

        infoItems.put(FeatureIndexDao.FeatureIndexFields.IS_EXON.getFieldName(),
                new InfoItem(FeatureIndexDao.FeatureIndexFields.IS_EXON.getFieldName(), VCFHeaderLineType.Flag,
                        "Defines if a variation is " + "located in exon region"));
        filterInfo.setInfoItemMap(infoItems);
        filterInfo.setAvailableFilters(availableFilters);

        return filterInfo;
    }

    /**
     * Creates a feature index for {@link VcfFile}. If an index already exists, it will be deleted and created
     * from scratch
     * @param vcfFileId an ID of VCF file to reindex.
     * @throws FeatureIndexException if an error occurred while writing index
     */
    public VcfFile reindexVcfFile(long vcfFileId) throws FeatureIndexException {
        VcfFile vcfFile = vcfFileManager.loadVcfFile(vcfFileId);
        Reference reference = referenceGenomeManager.loadReferenceGenome(vcfFile.getReferenceId());
        Map<String, Chromosome> chromosomeMap = reference.getChromosomes().stream()
                .collect(Collectors.toMap(BaseEntity::getName, chromosome -> chromosome));
        List<GeneFile> geneFiles = reference.getGeneFile() != null
                ? Collections.singletonList(reference.getGeneFile())
                : Collections.emptyList();

        try {
            fileManager.deleteFileFeatureIndex(vcfFile);

            try (FeatureReader<VariantContext> reader = AbstractFeatureReader.getFeatureReader(vcfFile.getPath(),
                    new VCFCodec(), false)) {
                VcfFilterInfo info = getFiltersInfo(reader);
                featureIndexManager.makeIndexForVcfReader(vcfFile, reader, geneFiles, chromosomeMap, info);
            }
        } catch (IOException e) {
            throw new FeatureIndexException(vcfFile, e);
        }

        return vcfFile;
    }

    @NotNull
    private VcfFile makeTemporaryVcfFileFromUrl(String fileUrl, String indexUrl, Chromosome chromosome) {
        VcfFile notRegisteredFile = new VcfFile();
        notRegisteredFile.setPath(fileUrl);
        notRegisteredFile.setCompressed(false);
        notRegisteredFile.setType(BiologicalDataItemResourceType.URL);
        notRegisteredFile.setReferenceId(chromosome.getReferenceId());

        BiologicalDataItem index = new BiologicalDataItem();
        index.setPath(indexUrl);
        notRegisteredFile.setIndex(index);
        return notRegisteredFile;
    }

    private VcfFilterInfo getFiltersInfo(FeatureReader<VariantContext> reader) throws IOException {
        VcfFilterInfo filterInfo = new VcfFilterInfo();

        VCFHeader header = (VCFHeader) reader.getHeader();
        Collection<VCFInfoHeaderLine> headerLines = header.getInfoHeaderLines();
        Map<String, InfoItem> infoItems = headerLines.stream().filter(l -> !isExtendedInfoLine(l.getDescription())) // Exclude ANN from fields,
                .map(InfoItem::new) // we don't need it in the index
                .collect(Collectors.toMap(InfoItem::getName, i -> i));
        filterInfo.setAvailableFilters(
                header.getFilterLines().stream().map(VCFSimpleHeaderLine::getID).collect(Collectors.toSet()));

        List<String> filtersWhiteList = getFilterWhiteList();
        if (!filtersWhiteList.isEmpty()) {
            infoItems = scourFilterList(infoItems, filtersWhiteList);
        }

        filterInfo.setInfoItemMap(infoItems);

        return filterInfo;
    }

    /**
     * Returns a white list of VCF Info fields, that are available for filtering
     *
     * @return a {@code List} of field names
     */
    private List<String> getFilterWhiteList() {
        if (whiteList == null) {
            if (whiteArray == null) {
                return Collections.emptyList();
            } else {
                whiteList = Arrays.asList(whiteArray);
            }
        }
        return whiteList;
    }

    private VcfFile createVcfFromFile(final IndexedFileRegistrationRequest request,
            final Map<String, Chromosome> chromosomeMap, Reference reference, boolean doIndex) {
        VcfFile vcfFile = null;
        try (FeatureReader<VariantContext> reader = AbstractFeatureReader.getFeatureReader(request.getPath(),
                request.getIndexPath(), new VCFCodec(), request.getIndexPath() != null)) {
            vcfFile = createVcfFile(request, reader);
            fileManager.makeVcfDir(vcfFile.getId(), AuthUtils.getCurrentUserId());
            if (StringUtils.isBlank(request.getIndexPath())) {
                fileManager.makeVcfIndex(vcfFile, AuthUtils.getCurrentUserId());
            }

            // In order to fix bugs with zipped VCF
            Map<String, Pair<Integer, Integer>> metaMap = readMetaMap(chromosomeMap, vcfFile, reader, reference,
                    doIndex);
            fileManager.makeIndexMetadata(vcfFile, metaMap);
            biologicalDataItemManager.createBiologicalDataItem(vcfFile.getIndex());
            vcfFileManager.createVcfFile(vcfFile);
        } catch (IOException | GeneReadingException e) {
            throw new RegistrationException(getMessage(ERROR_REGISTER_FILE, request.getName()), e);
        } finally {
            if (vcfFile != null && vcfFile.getId() != null && vcfFileManager.loadVcfFile(vcfFile.getId()) == null) {
                biologicalDataItemManager.deleteBiologicalDataItem(vcfFile.getBioDataItemId());
                try {
                    fileManager.deleteFeatureFileDirectory(vcfFile);
                } catch (IOException e) {
                    LOGGER.error("Unable to delete directory for " + vcfFile.getName(), e);
                }
            }
        }
        return vcfFile;
    }

    private VcfFile createVcfFromUrl(final IndexedFileRegistrationRequest request,
            final Map<String, Chromosome> chromosomeMap, Reference reference) {
        final VcfFile vcfFile;
        try (FeatureReader<VariantContext> reader = AbstractFeatureReader.getFeatureReader(request.getPath(),
                request.getIndexPath(), new VCFCodec(), true)) {
            vcfFile = createVcfFile(request, reader);
            boolean hasVariations = false;
            for (Map.Entry<String, Chromosome> chrEntry : chromosomeMap.entrySet()) {
                CloseableIterator<VariantContext> iterator = Utils.query(reader, chrEntry.getKey(), 1,
                        chrEntry.getValue().getSize());
                if (iterator.hasNext()) {
                    hasVariations = true;
                    break;
                }
            }

            Assert.isTrue(hasVariations, getMessage(MessagesConstants.ERROR_FILE_CORRUPTED_OR_EMPTY,
                    request.getPath(), reference.getName()));
        } catch (IOException e) {
            throw new RegistrationException(getMessage(ERROR_REGISTER_FILE, request.getName()), e);
        }
        biologicalDataItemManager.createBiologicalDataItem(vcfFile.getIndex());
        vcfFileManager.createVcfFile(vcfFile);
        return vcfFile;
    }

    @NotNull
    private Map<String, Pair<Integer, Integer>> readMetaMap(Map<String, Chromosome> chromosomeMap, VcfFile vcfFile,
            FeatureReader<VariantContext> reader, Reference reference, boolean doIndex)
            throws IOException, GeneReadingException {
        Map<String, Pair<Integer, Integer>> metaMap = new HashMap<>();
        CloseableIterator<VariantContext> iterator = reader.iterator();
        int startPosition = 1;
        int endPosition = 1;
        String currentKey = null;
        VariantContext variantContext = null;
        VariantContext lastFeature = null;

        VcfFilterInfo info = getFiltersInfo(reader);
        VcfFileReader vcfFileReader = new VcfFileReader(fileManager, referenceGenomeManager);
        VCFHeader vcfHeader = (VCFHeader) reader.getHeader();
        List<VcfIndexEntry> allEntries = new ArrayList<>();
        List<GeneFile> geneFiles = reference.getGeneFile() != null
                ? Collections.singletonList(reference.getGeneFile())
                : Collections.emptyList();

        while (iterator.hasNext()) {
            variantContext = iterator.next();

            if (!variantContext.getContig().equals(currentKey)) {
                if (checkMetaMapKey(chromosomeMap, currentKey)) {
                    metaMap.put(currentKey, new ImmutablePair<>(startPosition, endPosition));

                    writeEntriesForChromosome(allEntries, geneFiles,
                            Utils.getFromChromosomeMap(chromosomeMap, currentKey), vcfFile, vcfHeader,
                            vcfFileReader, doIndex);
                }

                startPosition = variantContext.getStart();
                currentKey = variantContext.getContig();
            }

            checkSorted(vcfFile, variantContext, lastFeature);

            indexVariation(allEntries, variantContext, chromosomeMap, info, vcfHeader, vcfFileReader, doIndex);

            lastFeature = variantContext;
            // Put the last one in metaMap
            endPosition = variantContext.getStart();

            if (checkMetaMapKey(chromosomeMap, currentKey)) {
                metaMap.put(currentKey, new ImmutablePair<>(startPosition, endPosition));
            }
        }

        // Put the last one
        if (variantContext != null && checkMetaMapKey(chromosomeMap, currentKey)) {
            writeEntriesForChromosome(allEntries, geneFiles, Utils.getFromChromosomeMap(chromosomeMap, currentKey),
                    vcfFile, vcfHeader, vcfFileReader, doIndex);
        }

        return metaMap;
    }

    private void indexVariation(List<VcfIndexEntry> allEntries, VariantContext variantContext,
            Map<String, Chromosome> chromosomeMap, VcfFilterInfo info, VCFHeader vcfHeader,
            VcfFileReader vcfFileReader, boolean doIndex) {
        if (doIndex) {
            featureIndexManager.addVariationToIndex(allEntries, variantContext, chromosomeMap, info, vcfHeader,
                    vcfFileReader);
        }
    }

    private void writeEntriesForChromosome(List<VcfIndexEntry> allEntries, List<GeneFile> geneFiles,
            Chromosome currentChromosome, VcfFile vcfFile, VCFHeader vcfHeader, VcfFileReader vcfFileReader,
            boolean doIndex) throws GeneReadingException, IOException {
        if (doIndex) {
            List<VcfIndexEntry> processedEntries = featureIndexManager.postProcessIndexEntries(allEntries,
                    geneFiles, currentChromosome, vcfHeader, vcfFileReader);
            featureIndexManager.writeLuceneIndexForFile(vcfFile, processedEntries);
            LOGGER.info(
                    getMessage(MessagesConstants.INFO_FEATURE_INDEX_CHROMOSOME_WROTE, currentChromosome.getName()));
            allEntries.clear();
        }
    }

    private boolean checkMetaMapKey(Map<String, Chromosome> chromosomeMap, String currentKey) {
        return currentKey != null && Utils.chromosomeMapContains(chromosomeMap, currentKey);
    }

    private VcfFile createVcfGA4GH(final IndexedFileRegistrationRequest request) {

        VcfFile vcfFile = new VcfFile();
        vcfFile.setId(vcfFileManager.createVcfFileId());
        vcfFile.setCompressed(true);
        vcfFile.setPath(request.getPath());
        vcfFile.setName(request.getName() != null ? request.getName() : request.getPath());
        vcfFile.setPrettyName(request.getPrettyName());
        vcfFile.setType(BiologicalDataItemResourceType.GA4GH); // For now we're working only with files
        vcfFile.setCreatedDate(new Date());
        vcfFile.setCreatedBy(AuthUtils.getCurrentUserId());
        vcfFile.setReferenceId(request.getReferenceId());
        VcfGa4ghReader reader = new VcfGa4ghReader(httpDataManager, referenceGenomeManager);
        CallSetSearch callSetSearch;
        try {
            callSetSearch = reader.callSetSearch(vcfFile.getPath());
        } catch (JSONException | InterruptedException | ExternalDbUnavailableException | IOException e) {
            throw new RegistrationException(vcfFile.getName(), e);
        }
        Map<String, Integer> sampleMap = getSampleNameToOffset(callSetSearch.getCallSets());
        if (sampleMap != null && !sampleMap.isEmpty()) {
            List<VcfSample> samples = sampleMap.entrySet().stream()
                    .map(e -> new VcfSample(e.getKey(), e.getValue())).collect(Collectors.toList());
            vcfFile.setSamples(samples);
        }
        if (StringUtils.isNotBlank(request.getIndexPath())) {
            BiologicalDataItem indexItem = new BiologicalDataItem();
            indexItem.setCreatedDate(new Date());
            indexItem.setPath(request.getIndexPath());
            indexItem.setFormat(BiologicalDataItemFormat.VCF_INDEX);
            indexItem.setType(BiologicalDataItemResourceType.GA4GH);
            indexItem.setName("");
            indexItem.setCreatedBy(AuthUtils.getCurrentUserId());

            vcfFile.setIndex(indexItem);
        }
        long vcfId = vcfFile.getId();
        biologicalDataItemManager.createBiologicalDataItem(vcfFile);
        vcfFile.setBioDataItemId(vcfFile.getId());
        vcfFile.setId(vcfId);
        LOGGER.info(getMessage(MessagesConstants.INFO_GENE_REGISTER, vcfFile.getId(), vcfFile.getPath()));
        return vcfFile;
    }

    private Map<String, Integer> getSampleNameToOffset(final List<CallSet> callSets) {
        HashMap<String, Integer> map = new HashMap<>();
        for (CallSet callSet : callSets) {
            String sample = callSet.getId();
            int index = sample.indexOf('-');
            if (index == -1) {
                Assert.isTrue(false, "SampleId error");
            }
            int sampleId = Integer.parseInt(sample.substring(index + 1, sample.length()));
            map.put(callSet.getSampleId(), sampleId);
        }
        return map;
    }

    private VcfFile createVcfFile(final IndexedFileRegistrationRequest request,
            final FeatureReader<VariantContext> reader) {
        VcfFile vcfFile;
        vcfFile = new VcfFile();

        VCFHeader header = (VCFHeader) reader.getHeader();
        Map<String, Integer> sampleMap = header.getSampleNameToOffset();

        if (sampleMap != null && !sampleMap.isEmpty()) {
            List<VcfSample> samples = sampleMap.entrySet().stream()
                    .map(e -> new VcfSample(e.getKey(), e.getValue())).collect(Collectors.toList());
            vcfFile.setSamples(samples);
        }

        BiologicalDataItemResourceType resourceType = BiologicalDataItemResourceType
                .translateRequestType(request.getType());
        String fileName = FilenameUtils.getName(request.getPath());

        vcfFile.setCompressed(resourceType == BiologicalDataItemResourceType.FILE && IOHelper.isGZIPFile(fileName));
        vcfFile.setName(request.getName() != null ? request.getName() : fileName);
        vcfFile.setPrettyName(request.getPrettyName());
        vcfFile.setId(vcfFileManager.createVcfFileId());
        vcfFile.setPath(request.getPath());
        vcfFile.setType(resourceType);
        vcfFile.setCreatedDate(new Date());
        vcfFile.setCreatedBy(AuthUtils.getCurrentUserId());
        vcfFile.setReferenceId(request.getReferenceId());

        if (StringUtils.isNotBlank(request.getIndexPath())) {
            BiologicalDataItem indexItem = new BiologicalDataItem();
            indexItem.setCreatedDate(new Date());
            indexItem.setPath(request.getIndexPath());
            indexItem.setFormat(BiologicalDataItemFormat.VCF_INDEX);
            indexItem.setType(BiologicalDataItemResourceType.translateRequestType(request.getIndexType()));
            indexItem.setName(vcfFile.getName() + "_index");
            indexItem.setCreatedBy(AuthUtils.getCurrentUserId());

            vcfFile.setIndex(indexItem);
        }
        long vcfId = vcfFile.getId();
        biologicalDataItemManager.createBiologicalDataItem(vcfFile);
        vcfFile.setBioDataItemId(vcfFile.getId());
        vcfFile.setId(vcfId);
        LOGGER.info(getMessage(MessagesConstants.INFO_GENE_REGISTER, vcfFile.getId(), vcfFile.getPath()));
        return vcfFile;
    }

    @NotNull
    private VcfFile getVcfFileFromGA4GH(IndexedFileRegistrationRequest request, String requestPath) {
        VcfFile vcfFile;
        vcfFile = createVcfGA4GH(request);
        BiologicalDataItem indexItem = new BiologicalDataItem();
        indexItem.setCreatedDate(new Date());
        indexItem.setPath(requestPath);
        indexItem.setFormat(BiologicalDataItemFormat.VCF_INDEX);
        indexItem.setType(BiologicalDataItemResourceType.GA4GH);
        indexItem.setName("");
        indexItem.setCreatedBy(AuthUtils.getCurrentUserId());
        vcfFile.setIndex(indexItem);
        biologicalDataItemManager.createBiologicalDataItem(vcfFile.getIndex());
        vcfFileManager.createVcfFile(vcfFile);
        return vcfFile;
    }

    private VcfFile downloadVcfFile(IndexedFileRegistrationRequest request, String requestPath,
            Map<String, Chromosome> chromosomeMap, Reference reference, boolean doIndex) {

        final File newFile;
        try {
            newFile = downloadFileManager.downloadFromURL(request.getPath());
        } catch (IOException e) {
            throw new RegistrationException(getMessage(ERROR_REGISTER_FILE, request.getName()), e);
        }
        request.setIndexPath(null);
        request.setName(request.getName() != null ? request.getName() : FilenameUtils.getBaseName(requestPath));
        request.setPath(newFile.getPath());
        return createVcfFromFile(request, chromosomeMap, reference, doIndex);
    }

    private void checkSorted(VcfFile vcfFile, VariantContext variantContext, VariantContext lastFeature) {
        if (lastFeature != null && variantContext.getStart() < lastFeature.getStart()
                && lastFeature.getContig().equals(variantContext.getContig())) {
            throw new TribbleException.MalformedFeatureFile("Input file is not sorted by start position. \n"
                    + "We saw a record with a start of " + variantContext.getContig() + ":"
                    + variantContext.getStart() + " after a record with a start of " + lastFeature.getContig() + ":"
                    + lastFeature.getStart(), vcfFile.getName());
        }
    }

    private Integer getSampleIndex(Long sampleId, VcfFile vcfFile) {
        if (vcfFile.getSamples() != null && !vcfFile.getSamples().isEmpty()) {
            Map<Long, VcfSample> sampleMap = new LinkedHashMap<>();
            vcfFile.getSamples().forEach(s -> sampleMap.put(s.getId(), s));

            if (sampleId != null) {
                return sampleMap.get(sampleId).getIndex();
            } else {
                return sampleMap.get(sampleMap.keySet().iterator().next()).getIndex();
            }
        }
        return null;
    }

    private Map<String, InfoItem> scourFilterList(Map<String, InfoItem> map, List<String> whiteList) {
        return whiteList.stream().filter(map::containsKey).map(map::get)
                .collect(Collectors.toMap(InfoItem::getName, i -> i));
    }

    private boolean isExtendedInfoLine(String description) {
        InfoFieldParser parser = getExtendedInfoParser();
        return parser.isExtendedInfoField(description);
    }

    protected void setExtendedInfoTemplates(String extendedInfoTemplates) {
        this.extendedInfoTemplates = extendedInfoTemplates;
    }

    private void extendInfoFields(Variation variation) {
        Map<String, Variation.InfoField> infoFieldMap = variation.getInfo();
        InfoFieldParser parser = getExtendedInfoParser();
        for (Map.Entry<String, Variation.InfoField> infoEntry : infoFieldMap.entrySet()) {
            Variation.InfoField infoField = infoEntry.getValue();
            if (parser.isExtendedInfoField(infoField.getDescription())) {
                extendInfoField(infoField, parser);
            }
        }
    }

    private void extendInfoField(Variation.InfoField infoField, InfoFieldParser parser) {
        List<String> values;
        if (infoField.getValue() instanceof List) {
            values = (List<String>) infoField.getValue();
        } else if (infoField.getValue() instanceof String) {
            values = Collections.singletonList((String) infoField.getValue());
        } else {
            return;
        }
        List<String> header = parser.extractHeaderFromLine(infoField.getDescription());
        List<List<String>> lines = new ArrayList<>();
        for (String line : values) {
            List<String> data = parser.extractDataFromLine(line);
            if (data.size() == header.size()) {
                lines.add(data);
            } else {
                LOGGER.error("Extended info field value doesn't match the format " + "defined in the file header.");
                return;
            }
        }
        infoField.setType(Variation.InfoFieldTypes.TABLE);
        infoField.setHeader(header);
        infoField.setValue(lines);
    }

    public InfoFieldParser getExtendedInfoParser() {
        if (infoFieldParser != null) {
            return infoFieldParser;
        }
        if (extendedInfoTemplates == null || extendedInfoTemplates.isEmpty()) {
            infoFieldParser = new InfoFieldParser("");
        } else {
            infoFieldParser = new InfoFieldParser(extendedInfoTemplates);
        }
        return infoFieldParser;
    }
}