gov.nih.nci.ncicb.tcga.dcc.qclive.common.action.MafFileProcessor.java Source code

Java tutorial

Introduction

Here is the source code for gov.nih.nci.ncicb.tcga.dcc.qclive.common.action.MafFileProcessor.java

Source

/*
 * Software License, Version 1.0 Copyright 2010 SRA International, Inc.
 * Copyright Notice.  The software subject to this notice and license includes both human
 * readable source code form and machine readable, binary, object code form (the "caBIG
 * Software").
 *
 * Please refer to the complete License text for full details at the root of the project.
 */

package gov.nih.nci.ncicb.tcga.dcc.qclive.common.action;

import gov.nih.nci.ncicb.tcga.dcc.common.bean.Archive;
import gov.nih.nci.ncicb.tcga.dcc.common.bean.FileInfo;
import gov.nih.nci.ncicb.tcga.dcc.common.dao.DiseaseContextHolder;
import gov.nih.nci.ncicb.tcga.dcc.common.dao.FileInfoQueries;
import gov.nih.nci.ncicb.tcga.dcc.common.exception.UUIDException;
import gov.nih.nci.ncicb.tcga.dcc.qclive.bean.BCRID;
import gov.nih.nci.ncicb.tcga.dcc.qclive.bean.MafInfo;
import gov.nih.nci.ncicb.tcga.dcc.qclive.common.QcContext;
import gov.nih.nci.ncicb.tcga.dcc.qclive.common.action.validation.util.MessageFormat;
import gov.nih.nci.ncicb.tcga.dcc.qclive.common.action.validation.util.MessagePropertyType;
import gov.nih.nci.ncicb.tcga.dcc.qclive.common.util.BiospecimenHelper;
import gov.nih.nci.ncicb.tcga.dcc.qclive.dao.BCRDataService;
import gov.nih.nci.ncicb.tcga.dcc.qclive.dao.MafInfoQueries;
import org.apache.commons.lang.StringUtils;
import org.springframework.dao.DataAccessException;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Processes all maf files in an archive.  Note that this does not perform validation -- see MafFileValidator for that.
 * <p/>
 * NOTE: saves info only to disease-specific schema.
 *
 * @author Jessica Chen
 *         Last updated by: $Author: sfeirr $
 * @version $Rev: 3419 $
 */
public class MafFileProcessor extends AbstractMafFileHandler<File> {

    private MafInfoQueries mafInfoQueries;
    private FileInfoQueries fileInfoQueries;
    private BCRDataService bcrDataService;

    private int batchSize = BATCH_SIZE;

    @Override
    protected File doWork(final File mafFile, final QcContext context) throws ProcessorException {
        // make sure the disease is set
        if (context.getArchive() != null) {
            DiseaseContextHolder.setDisease(context.getArchive().getTumorType());
        }

        FileReader fileReader = null;
        BufferedReader bufferedReader = null;

        try {
            // open file
            fileReader = new FileReader(mafFile);
            bufferedReader = new BufferedReader(fileReader);

            int lineNum = 0;

            // find first non-blank line not starting with #, this is the header
            String headerLine = bufferedReader.readLine();
            lineNum++;
            while (StringUtils.isEmpty(headerLine.trim())
                    || StringUtils.startsWith(headerLine, COMMENT_LINE_TOKEN)) {
                headerLine = bufferedReader.readLine();
                lineNum++;
            }

            final List<String> headers = Arrays.asList(headerLine.split("\\t"));

            context.setFile(mafFile);
            final Map<String, Integer> fieldOrder = mapFieldOrder(headers);
            // need to find out the file id for this maf file
            final Long mafFileId = fileInfoQueries.getFileId(mafFile.getName(), context.getArchive().getId());
            if (mafFileId == null || mafFileId == -1) {
                context.getArchive().setDeployStatus(Archive.STATUS_IN_REVIEW);
                throw new ProcessorException(new StringBuilder().append("File '").append(mafFile.getName())
                        .append("' was not found in the database").toString());
            }
            if (isAddMafInfo(mafFileId)) {
                HashMap<String, BCRID> biospecimens = new HashMap<String, BCRID>();
                String line;
                while ((line = bufferedReader.readLine()) != null) {
                    lineNum++;
                    if (!StringUtils.isBlank(line.trim()) && !StringUtils.startsWith(line, COMMENT_LINE_TOKEN)) {
                        final String[] row = line.split("\\t");

                        try {
                            processRow(row, fieldOrder, mafFileId, biospecimens, context, mafFile, lineNum);
                            //  If exceeds batch size store it in the database
                            if (biospecimens.size() >= getBatchSize()) {
                                try {
                                    insertBiospecimenToFileRelationships(biospecimens, context, mafFile);
                                } catch (UUIDException ue) {
                                    throw new ProcessorException(ue.getMessage(), ue);
                                }
                                biospecimens.clear();
                            }
                        } catch (DataAccessException e) {
                            // catch DB errors per line
                            context.getArchive().setDeployStatus(Archive.STATUS_IN_REVIEW);
                            context.addError(MessageFormat.format(MessagePropertyType.MAF_FILE_PROCESSING_ERROR,
                                    mafFile.getName(),
                                    new StringBuilder().append("Mutation information from file at line '")
                                            .append(lineNum).append("' was not successfully added. Root cause: ")
                                            .append(e.getMessage()).toString()));
                        }
                    }
                }
                // process remaining biospecimens
                if (biospecimens.size() > 0) {
                    try {
                        insertBiospecimenToFileRelationships(biospecimens, context, mafFile);
                    } catch (UUIDException ue) {
                        context.getArchive().setDeployStatus(Archive.STATUS_IN_REVIEW);
                        throw new ProcessorException(ue.getMessage(), ue);
                    } catch (DataAccessException e) {
                        context.getArchive().setDeployStatus(Archive.STATUS_IN_REVIEW);
                        throw new ProcessorException(e.getMessage(), e);
                    }
                    biospecimens.clear();
                }
            }
        } catch (IOException e) {
            context.getArchive().setDeployStatus(Archive.STATUS_IN_REVIEW);
            throw new ProcessorException(
                    new StringBuilder().append("Error reading maf file ").append(mafFile.getName()).toString());
        } finally {
            if (bufferedReader != null) {
                try {
                    bufferedReader.close();
                } catch (IOException e) {
                    // ignore
                }
            }

            if (fileReader != null) {
                try {
                    fileReader.close();
                } catch (IOException e) {
                    // ignore
                }
            }
        }

        return mafFile;
    }

    /**
     * Method determines whether the fileid has already been loaded in the MAF_INFO table. Checks
     * to see if the archive corresponding to the fileid is available and latest. If yes, returns false. Otherwise,
     * deletes the existing set of maf_info data corresponding to the fileid and returns true. This ensures that there
     * is only on set of maf_info data for a given fileid.
     * @param mafFileId
     * @return boolean true if ok to add the {@link MafInfo} corresponding to the maf file id. false otherwise.
     */
    protected boolean isAddMafInfo(final Long mafFileId) {
        boolean bRet = true;
        if (mafInfoQueries.fileIdExistsInMafInfo(mafFileId)) {
            final FileInfo fi = fileInfoQueries.getFileForFileId(mafFileId);
            if (fi != null) {
                final Archive archive = fileInfoQueries.getLatestArchiveContainingFile(fi);
                if (archive != null) {
                    bRet = false;
                }
            }
            if (bRet) {
                mafInfoQueries.deleteMafInfoForFileId(mafFileId);
            }
        }
        return bRet;
    }

    protected void setOtherMafInfoFields(final MafInfo mafInfo, final String[] row,
            final Map<String, Integer> fieldOrder, final long mafFileId) throws ProcessorException {
        // does nothing, all fields are in processRow.  But subclasses should override!
    }

    protected String getEndPositionName() {
        return FIELD_END_POSITION;
    }

    protected String getStartPositionName() {
        return FIELD_START_POSITION;
    }

    private void processRow(final String[] row, final Map<String, Integer> fieldOrder, final long mafFileId,
            final HashMap<String, BCRID> bcrIds, final QcContext context, final File mafFile, final int lineNum)
            throws ProcessorException {

        final MafInfo mafInfo = getMafInfo();
        mafInfo.setLineNumber(lineNum);
        mafInfo.setCenterID(context.getArchive().getTheCenter().getCenterId());
        mafInfo.setFileID(mafFileId);
        mafInfo.setChromosome(row[fieldOrder.get(FIELD_CHROMOSOME)]);
        mafInfo.setDbsnpRS(row[fieldOrder.get(FIELD_DBSNP_RS)]);
        mafInfo.setDbSNPValStatus(row[fieldOrder.get(FIELD_DBSNP_VAL_STATUS)]);
        mafInfo.setEndPosition(Integer.valueOf(row[fieldOrder.get(getEndPositionName())]));
        mafInfo.setEntrezGeneID(Integer.valueOf(row[fieldOrder.get(FIELD_ENTREZ_GENE_ID)]));
        mafInfo.setHugoSymbol(row[fieldOrder.get(FIELD_HUGO_SYMBOL)]);
        mafInfo.setMatchNormalSampleBarcode(row[fieldOrder.get(FIELD_MATCHED_NORM_SAMPLE_BARCODE)]);
        mafInfo.setMatchNormSeqAllele1(row[fieldOrder.get(FIELD_MATCH_NORM_SEQ_ALLELE1)]);
        mafInfo.setMatchNormSeqAllele2(row[fieldOrder.get(FIELD_MATCH_NORM_SEQ_ALLELE2)]);
        mafInfo.setMatchNormValidationAllele1(row[fieldOrder.get(FIELD_MATCH_NORM_VALIDATION_ALLELE1)]);
        mafInfo.setMatchNormValidationAllele2(row[fieldOrder.get(FIELD_MATCH_NORM_VALIDATION_ALLELE2)]);
        mafInfo.setMutationStatus(row[fieldOrder.get(FIELD_MUTATION_STATUS)]);
        mafInfo.setNcbiBuild(row[fieldOrder.get(FIELD_NCBI_BUILD)]);
        mafInfo.setReferenceAllele(row[fieldOrder.get(FIELD_REFERENCE_ALLELE)]);
        mafInfo.setStartPosition(Integer.valueOf(row[fieldOrder.get(getStartPositionName())]));
        mafInfo.setStrand(row[fieldOrder.get(FIELD_STRAND)]);
        mafInfo.setTumorSampleBarcode(row[fieldOrder.get(FIELD_TUMOR_SAMPLE_BARCODE)]);
        mafInfo.setTumorSeqAllele1(row[fieldOrder.get(FIELD_TUMOR_SEQ_ALLELE1)]);
        mafInfo.setTumorSeqAllele2(row[fieldOrder.get(FIELD_TUMOR_SEQ_ALLELE2)]);
        mafInfo.setTumorValidationAllele1(row[fieldOrder.get(FIELD_TUMOR_VALIDATION_ALLELE1)]);
        mafInfo.setTumorValidationAllele2(row[fieldOrder.get(FIELD_TUMOR_VALIDATION_ALLELE2)]);
        mafInfo.setValidationStatus(row[fieldOrder.get(FIELD_VALIDATION_STATUS)]);
        mafInfo.setVariantClassification(row[fieldOrder.get(FIELD_VARIANT_CLASSIFICATION)]);
        mafInfo.setVariantType(row[fieldOrder.get(FIELD_VARIANT_TYPE)]);
        mafInfo.setVerificationStatus(row[fieldOrder.get(FIELD_VERIFICATION_STATUS)]);

        // set uuids values in maf info bean
        if (fieldOrder.get(FIELD_TUMOR_SAMPLE_UUID) != null) {
            mafInfo.setTumorSampleUUID(row[fieldOrder.get(FIELD_TUMOR_SAMPLE_UUID)]);
        }
        if (fieldOrder.get(FIELD_MATCHED_NORM_SAMPLE_UUID) != null) {
            mafInfo.setMatchNormalSampleUUID(row[fieldOrder.get(FIELD_MATCHED_NORM_SAMPLE_UUID)]);
        }

        setOtherMafInfoFields(mafInfo, row, fieldOrder, mafFileId);
        final long mafId = mafInfoQueries.addMaf(mafInfo);
        mafInfo.setId(mafId);

        insertBiospecimenInMap(bcrIds, mafInfo.getTumorSampleBarcode(), mafInfo.getTumorSampleUUID(), context,
                mafFile);
        insertBiospecimenInMap(bcrIds, mafInfo.getMatchNormalSampleBarcode(), mafInfo.getMatchNormalSampleUUID(),
                context, mafFile);
    }

    // depending on the context isCenterConvertedToUUID will either put UUID or barcode in map
    private void insertBiospecimenInMap(final HashMap<String, BCRID> bcrIds, final String barcodeString,
            final String uuid, final QcContext context, final File mafFile) {

        try {
            final BCRID bcrId;

            final String key;
            if (context.isCenterConvertedToUUID()) {
                key = uuid;
            } else {
                key = barcodeString;
            }

            if (bcrIds.get(key) == null) {
                bcrId = bcrDataService.parseAliquotBarcode(barcodeString);
                bcrId.setUUID(uuid);
                bcrIds.put(key, bcrId);
            }

        } catch (ParseException e) {
            context.getArchive().setDeployStatus(Archive.STATUS_IN_REVIEW);
            context.addError(MessageFormat.format(MessagePropertyType.MAF_FILE_PROCESSING_ERROR, mafFile.getName(),
                    "Barcode format in " + mafFile.getName() + " is invalid: '"));
        }

    }

    private void insertBiospecimenToFileRelationships(final HashMap<String, BCRID> barcodesOrUuids,
            final QcContext context, final File mafFile) throws UUIDException {

        final List<Integer> biospecimenIds;
        final List<Long> shippedBiospecimenIds;

        // if center is converted to UUID use the uuids to look up the biospecimen ID
        if (context.isCenterConvertedToUUID()) {
            final List<String> uuids = new ArrayList<String>();
            for (final BCRID bcrId : barcodesOrUuids.values()) {
                uuids.add(bcrId.getUUID());
            }

            shippedBiospecimenIds = bcrDataService.getShippedBiospecimenIds(uuids);
            biospecimenIds = new ArrayList<Integer>();
            for (final Long shippedBiospecimenId : shippedBiospecimenIds) {
                biospecimenIds.add(shippedBiospecimenId.intValue());
            }

        } else {
            biospecimenIds = BiospecimenHelper.getBiospecimenIds(new ArrayList<BCRID>(barcodesOrUuids.values()),
                    bcrDataService);
            shippedBiospecimenIds = new ArrayList<Long>();
            for (final Integer id : biospecimenIds) {
                shippedBiospecimenIds.add(id.longValue());
            }

        }

        // add biospecimen to file relationship into common and disease database
        BiospecimenHelper.insertBiospecimenFileRelationship(biospecimenIds,
                context.getArchive().getFilenameToIdToMap().get(mafFile.getName()), bcrDataService,
                context.getArchive().getTheTumor());

        // now shipped biospecimen to file...
        bcrDataService.addShippedBiospecimensFileRelationship(shippedBiospecimenIds,
                context.getArchive().getFilenameToIdToMap().get(mafFile.getName()));
    }

    protected MafInfo getMafInfo() {
        return new MafInfo();
    }

    public String getName() {
        return "maf file processor";
    }

    public void setMafInfoQueries(final MafInfoQueries mafInfoQueries) {
        this.mafInfoQueries = mafInfoQueries;
    }

    public void setFileInfoQueries(final FileInfoQueries fileInfoQueries) {
        this.fileInfoQueries = fileInfoQueries;
    }

    public BCRDataService getBcrDataService() {
        return bcrDataService;
    }

    public void setBcrDataService(BCRDataService bcrDataService) {
        this.bcrDataService = bcrDataService;
    }

    public int getBatchSize() {
        return batchSize;
    }

    public void setBatchSize(final int batchSize) {
        this.batchSize = batchSize;
    }
}