Java tutorial
/* * Copyright 2015 OpenCB * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.opencb.cellbase.app.transform; import com.google.common.base.Stopwatch; import htsjdk.tribble.readers.TabixReader; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.opencb.biodata.models.variation.PopulationFrequency; import org.opencb.biodata.models.variation.TranscriptVariation; import org.opencb.biodata.models.variation.Variation; import org.opencb.biodata.models.variation.Xref; import org.opencb.cellbase.app.transform.utils.FileUtils; import org.opencb.cellbase.app.transform.utils.VariationUtils; import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileNotFoundException; import java.io.IOException; import java.nio.charset.Charset; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.StandardOpenOption; import java.sql.SQLException; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; public class VariationParser extends CellBaseParser { private static final String VARIATION_FILENAME = "variation.txt"; private static final String PREPROCESSED_VARIATION_FILENAME = "variation.sorted.txt"; private static final String VARIATION_FEATURE_FILENAME = "variation_feature.txt"; private static final String TRANSCRIPT_VARIATION_FILENAME = "transcript_variation.txt"; private static final String VARIATION_SYNONYM_FILENAME = "variation_synonym.txt"; private static final String VARIATION_FREQUENCIES_FILENAME = "eva_population_freqs.sorted.txt.gz"; private static final String PREPROCESSED_TRANSCRIPT_VARIATION_FILENAME = "transcript_variation.includingVariationId.txt"; private static final String PREPROCESSED_VARIATION_FEATURE_FILENAME = "variation_feature.sorted.txt"; private static final String PREPROCESSED_VARIATION_SYNONYM_FILENAME = "variation_synonym.sorted.txt"; private static final int VARIATION_FEATURE_FILE_ID = 0; private static final int TRANSCRIPT_VARIATION_FILE_ID = 1; private static final int VARIATION_SYNONYM_FILE_ID = 2; private static final String THOUSAND_GENOMES_PHASE_1_STUDY = "1000GENOMES_phase_1"; private static final String THOUSAND_GENOMES_PHASE_3_STUDY = "1000GENOMES_phase_3"; private static final String ESP_6500_STUDY = "ESP_6500"; private static final String THOUSAND_GENOMES_ALL_POPULATION = "ALL"; private static final String THOUSAND_GENOMES_AMERICAN_POPULATION = "AMR"; private static final String THOUSAND_GENOMES_ASIAN_POPULATION = "ASN"; private static final String THOUSAND_GENOMES_AFRICAN_POPULATION = "AFR"; private static final String THOUSAND_GENOMES_EUROPEAN_POPULATION = "EUR"; private static final String THOUSAND_GENOMES_EASTASIAN_POPULATION = "EAS"; private static final String THOUSAND_GENOMES_SOUTHASIAN_POPULATION = "SAS"; private static final String ESP_EUROPEAN_AMERICAN_POPULATION = "European_American"; private static final String ESP_AFRICAN_AMERICAN_POPULATION = "African_American"; private Path variationDirectoryPath; private BufferedReader variationSynonymsFileReader; private BufferedReader variationTranscriptsFileReader; private BufferedReader variationFeaturesFileReader; private static final int VARIATION_ID_COLUMN_INDEX_IN_VARIATION_FILE = 0; private static final int VARIATION_ID_COLUMN_INDEX_IN_VARIATION_SYNONYM_FILE = 1; private static final int VARIATION_ID_COLUMN_INDEX_IN_VARIATION_FEATURE_FILE = 5; private static final int VARIATION_FEATURE_ID_COLUMN_INDEX_IN_TRANSCRIPT_VARIATION_FILE = 1; private int variationIdColumnIndexInTranscriptVariationFile; private int[] lastVariationIdInVariationRelatedFiles; private boolean[] endOfFileOfVariationRelatedFiles; private String[][] lastLineInVariationRelatedFile; private int[] variationIdColumnIndexInVariationRelatedFile; private BufferedReader[] variationRelatedFileReader; private Pattern populationFrequnciesPattern; private static final String POPULATION_ID_GROUP = "popId"; private static final String REFERENCE_FREQUENCY_GROUP = "ref"; private static final String ALTERNATE_FREQUENCY_GROUP = "alt"; private TabixReader frequenciesTabixReader; private final Set<String> thousandGenomesPhase1MissedPopulations; private final Set<String> thousandGenomesPhase3MissedPopulations; private CellBaseFileSerializer fileSerializer; private Map<String, String> outputFileNames; public VariationParser(Path variationDirectoryPath, CellBaseFileSerializer serializer) { super(serializer); fileSerializer = serializer; this.variationDirectoryPath = variationDirectoryPath; populationFrequnciesPattern = Pattern.compile("(?<" + POPULATION_ID_GROUP + ">\\w+):(?<" + REFERENCE_FREQUENCY_GROUP + ">\\d+.\\d+),(?<" + ALTERNATE_FREQUENCY_GROUP + ">\\d+.\\d+)"); thousandGenomesPhase1MissedPopulations = new HashSet<>(); thousandGenomesPhase3MissedPopulations = new HashSet<>(); outputFileNames = new HashMap<>(); } @Override public void parse() throws IOException, InterruptedException, SQLException, ClassNotFoundException { if (!Files.exists(variationDirectoryPath) || !Files.isDirectory(variationDirectoryPath) || !Files.isReadable(variationDirectoryPath)) { throw new IOException( "Variation directory whether does not exist, is not a directory or cannot be read"); } if (!existsZippedOrUnzippedFile(VARIATION_FILENAME) || isEmpty(variationDirectoryPath.resolve(VARIATION_FILENAME).toString())) { throw new IOException("variation.txt.gz whether does not exist, is not a directory or cannot be read"); } Variation variation; // To speed up calculation a SQLite database is created with the IDs and file offsets, // file must be uncompressed for doing this. gunzipVariationInputFiles(); // add idVariation to transcript_variation file preprocessInputFiles(); // Open variation file, this file never gets uncompressed. It's read from gzip file BufferedReader bufferedReaderVariation = getBufferedReader(PREPROCESSED_VARIATION_FILENAME); // create buffered readers for all other input files createVariationFilesBufferedReaders(); Map<String, String> seqRegionMap = VariationUtils.parseSeqRegionToMap(variationDirectoryPath); Map<String, String> sourceMap = VariationUtils.parseSourceToMap(variationDirectoryPath); initializeVariationRelatedArrays(); Stopwatch globalStartwatch = Stopwatch.createStarted(); Stopwatch batchWatch = Stopwatch.createStarted(); logger.info("Parsing variation file " + variationDirectoryPath.resolve(PREPROCESSED_VARIATION_FILENAME) + " ..."); long countprocess = 0; String line; while ((line = bufferedReaderVariation.readLine()) != null) { String[] variationFields = line.split("\t"); int variationId = Integer.parseInt(variationFields[0]); List<String[]> resultVariationFeature = getVariationRelatedFields(VARIATION_FEATURE_FILE_ID, variationId); if (resultVariationFeature != null && resultVariationFeature.size() > 0) { String[] variationFeatureFields = resultVariationFeature.get(0); List<TranscriptVariation> transcriptVariation = getTranscriptVariations(variationId, variationFeatureFields[0]); List<Xref> xrefs = getXrefs(sourceMap, variationId); try { // Preparing the variation alleles String[] allelesArray = getAllelesArray(variationFeatureFields); // For code sanity save chromosome, start, end and id String chromosome = seqRegionMap.get(variationFeatureFields[1]); if (!chromosome.contains("PATCH") && !chromosome.contains("HSCHR") && !chromosome.contains("contig")) { int start = (variationFeatureFields != null) ? Integer.valueOf(variationFeatureFields[2]) : 0; int end = (variationFeatureFields != null) ? Integer.valueOf(variationFeatureFields[3]) : 0; String id = (variationFields[2] != null && !variationFields[2].equals("\\N")) ? variationFields[2] : ""; String reference = (allelesArray[0] != null && !allelesArray[0].equals("\\N")) ? allelesArray[0] : ""; String alternate = (allelesArray[1] != null && !allelesArray[1].equals("\\N")) ? allelesArray[1] : ""; // Preparing frequencies //List<PopulationFrequency> populationFrequencies = getPopulationFrequencies(variationId, allelesArray); List<PopulationFrequency> populationFrequencies = getPopulationFrequencies(chromosome, start, end, id, reference, alternate); // TODO: check that variationFeatureFields is always different to null and intergenic-variant is never used //List<String> consequenceTypes = (variationFeatureFields != null) ? Arrays.asList(variationFeatureFields[12].split(",")) : Arrays.asList("intergenic_variant"); List<String> consequenceTypes = Arrays.asList(variationFeatureFields[12].split(",")); String displayConsequenceType = getDisplayConsequenceType(consequenceTypes); // we have all the necessary to construct the 'variation' object variation = buildVariation(variationFields, variationFeatureFields, chromosome, start, end, id, reference, alternate, transcriptVariation, xrefs, populationFrequencies, allelesArray, consequenceTypes, displayConsequenceType); fileSerializer.serialize(variation, getOutputFileName(chromosome)); } if (++countprocess % 100000 == 0 && countprocess != 0) { logger.info("Processed variations: " + countprocess); logger.debug("Elapsed time processing batch: " + batchWatch); batchWatch.reset(); batchWatch.start(); } } catch (Exception e) { e.printStackTrace(); logger.error("Error parsing variation: " + e.getMessage()); logger.error("Last line processed: " + line); break; } } // TODO: just for testing, remove //if (countprocess % 100000 == 0) { // break; //} } logger.info("Variation parsing finished"); logger.info("Variants processed: " + countprocess); logger.debug("Elapsed time parsing: " + globalStartwatch); gzipVariationFiles(variationDirectoryPath); try { bufferedReaderVariation.close(); } catch (Exception e) { e.printStackTrace(); } } private void preprocessInputFiles() throws IOException, InterruptedException { preprocessVariationFile(); sortInputFile(VARIATION_FEATURE_FILENAME, PREPROCESSED_VARIATION_FEATURE_FILENAME, VARIATION_ID_COLUMN_INDEX_IN_VARIATION_FEATURE_FILE); sortInputFile(VARIATION_SYNONYM_FILENAME, PREPROCESSED_VARIATION_SYNONYM_FILENAME, VARIATION_ID_COLUMN_INDEX_IN_VARIATION_SYNONYM_FILE); preprocessTranscriptVariationFile(); } private void preprocessVariationFile() throws IOException, InterruptedException { if (!existsZippedOrUnzippedFile(PREPROCESSED_VARIATION_FILENAME)) { sortInputFile(VARIATION_FILENAME, PREPROCESSED_VARIATION_FILENAME, VARIATION_ID_COLUMN_INDEX_IN_VARIATION_FILE); } } private void sortInputFile(String unsortedFileName, String sortedFileName, int columnToSortByIndex) throws IOException, InterruptedException { if (!existsZippedOrUnzippedFile(sortedFileName)) { Path sortedFile = variationDirectoryPath.resolve(sortedFileName); Path unsortedFile = variationDirectoryPath.resolve(unsortedFileName); sortFileByNumericColumn(unsortedFile, sortedFile, columnToSortByIndex); } } private void sortFileByNumericColumn(Path inputFile, Path outputFile, int columnIndex) throws InterruptedException, IOException { this.logger.info("Sorting file " + inputFile + " into " + outputFile + " ..."); // increment column index by 1, beacause Java indexes are 0-based and 'sort' command uses 1-based indexes columnIndex++; ProcessBuilder pb = new ProcessBuilder("sort", "-t", "\t", "-k", Integer.toString(columnIndex), "-n", "--stable", inputFile.toAbsolutePath().toString(), "-T", variationDirectoryPath.toString(), "-o", outputFile.toAbsolutePath().toString()); // System.getProperty("java.io.tmpdir") this.logger.debug("Executing '" + StringUtils.join(pb.command(), " ") + "' ..."); Stopwatch stopwatch = Stopwatch.createStarted(); Process process = pb.start(); process.waitFor(); int returnedValue = process.exitValue(); if (returnedValue != 0) { String errorMessage = IOUtils.toString(process.getErrorStream()); logger.error("Error sorting " + inputFile); logger.error(errorMessage); throw new RuntimeException("Error sorting " + inputFile); } this.logger.info("Sorted"); this.logger.debug("Elapsed time sorting file: " + stopwatch); } private void preprocessTranscriptVariationFile() throws IOException, InterruptedException { Path preprocessedTranscriptVariationFile = variationDirectoryPath .resolve(PREPROCESSED_TRANSCRIPT_VARIATION_FILENAME); if (!existsZippedOrUnzippedFile(PREPROCESSED_TRANSCRIPT_VARIATION_FILENAME)) { this.logger.info("Preprocessing " + TRANSCRIPT_VARIATION_FILENAME + " file ..."); Stopwatch stopwatch = Stopwatch.createStarted(); // add variationId to transcript_variation file Map<Integer, Integer> variationFeatureToVariationId = createVariationFeatureIdToVariationIdMap(); Path transcriptVariationTempFile = addVariationIdToTranscriptVariationFile( variationFeatureToVariationId); // transcript_variation file columns number can vary, so we need to save the variationId column index because it is not constant variationIdColumnIndexInTranscriptVariationFile = getVariationIdColumnIndexInTranscriptVariationFile( transcriptVariationTempFile); sortFileByNumericColumn(transcriptVariationTempFile, preprocessedTranscriptVariationFile, variationIdColumnIndexInTranscriptVariationFile); this.logger.info("Removing temp file " + transcriptVariationTempFile); transcriptVariationTempFile.toFile().delete(); this.logger.info("Removed"); this.logger.info(TRANSCRIPT_VARIATION_FILENAME + " preprocessed. New file " + PREPROCESSED_TRANSCRIPT_VARIATION_FILENAME + " including (and sorted by) variation Id has been created"); this.logger.debug("Elapsed time preprocessing transcript variation file: " + stopwatch); } else { // transcript_variation file columns number can vary, so we need to save the variationId column index because it is not constant variationIdColumnIndexInTranscriptVariationFile = getVariationIdColumnIndexInTranscriptVariationFile( preprocessedTranscriptVariationFile); } } private int getVariationIdColumnIndexInTranscriptVariationFile(Path preprocessedTranscriptVariationFile) throws IOException { BufferedReader br = getBufferedReader(preprocessedTranscriptVariationFile); int variationIdColumIndex = br.readLine().split("\t").length - 1; br.close(); return variationIdColumIndex; } private Path addVariationIdToTranscriptVariationFile(Map<Integer, Integer> variationFeatureToVariationId) throws IOException { Path transcriptVariationTempFile = variationDirectoryPath.resolve(TRANSCRIPT_VARIATION_FILENAME + ".tmp"); this.logger.info("Adding variation Id to transcript variations and saving them into " + transcriptVariationTempFile + " ..."); Stopwatch stopwatch = Stopwatch.createStarted(); Path unpreprocessedTranscriptVariationFile = variationDirectoryPath.resolve(TRANSCRIPT_VARIATION_FILENAME); BufferedReader br = getBufferedReader(unpreprocessedTranscriptVariationFile); BufferedWriter bw = Files.newBufferedWriter(transcriptVariationTempFile, Charset.defaultCharset(), StandardOpenOption.CREATE, StandardOpenOption.WRITE); String line; while ((line = br.readLine()) != null) { // TODO: add limit parameter would do that run faster? // TODO: use a precompiled pattern would improve efficiency Integer variationFeatureId = Integer.valueOf(line.split("\t")[1]); Integer variationId = variationFeatureToVariationId.get(variationFeatureId); bw.write(line + "\t" + variationId + "\n"); } br.close(); bw.close(); this.logger.info("Added"); this.logger.debug("Elapsed time adding variation Id to transcript variation file: " + stopwatch); return transcriptVariationTempFile; } private Map<Integer, Integer> createVariationFeatureIdToVariationIdMap() throws IOException { this.logger.info("Creating variationFeatureId to variationId map ..."); Stopwatch stopwatch = Stopwatch.createStarted(); Map<Integer, Integer> variationFeatureToVariationId = new HashMap<>(); BufferedReader variationFeatFileReader = getBufferedReader( variationDirectoryPath.resolve(VARIATION_FEATURE_FILENAME)); String line; while ((line = variationFeatFileReader.readLine()) != null) { // TODO: add limit parameter would do that run faster? // TODO: use a precompiled pattern would improve efficiency String[] fields = line.split("\t"); Integer variationFeatureId = Integer.valueOf(fields[0]); Integer variationId = Integer.valueOf(fields[5]); variationFeatureToVariationId.put(variationFeatureId, variationId); } variationFeatFileReader.close(); this.logger.info("Done"); this.logger.debug("Elapsed time creating variationFeatureId to variationId map: " + stopwatch); return variationFeatureToVariationId; } private void createVariationFilesBufferedReaders() throws IOException { variationFeaturesFileReader = getBufferedReader(PREPROCESSED_VARIATION_FEATURE_FILENAME); variationSynonymsFileReader = getBufferedReader(PREPROCESSED_VARIATION_SYNONYM_FILENAME); variationTranscriptsFileReader = getBufferedReader(PREPROCESSED_TRANSCRIPT_VARIATION_FILENAME); if (Files.exists(variationDirectoryPath.resolve(VARIATION_FREQUENCIES_FILENAME))) { frequenciesTabixReader = new TabixReader( variationDirectoryPath.resolve(VARIATION_FREQUENCIES_FILENAME).toString()); } } private Variation buildVariation(String[] variationFields, String[] variationFeatureFields, String chromosome, int start, int end, String id, String reference, String alternate, List<TranscriptVariation> transcriptVariation, List<Xref> xrefs, List<PopulationFrequency> populationFrequencies, String[] allelesArray, List<String> consequenceTypes, String displayConsequenceType) { Variation variation; variation = new Variation(id, chromosome, "SNV", start, end, variationFeatureFields[4], reference, alternate, variationFeatureFields[6], (variationFields[4] != null && !variationFields[4].equals("\\N")) ? variationFields[4] : "", displayConsequenceType, // species, assembly, source, version, consequenceTypes, transcriptVariation, null, null, populationFrequencies, xrefs, /*"featureId",*/ (variationFeatureFields[16] != null && !variationFeatureFields[16].equals("\\N")) ? variationFeatureFields[16] : "", (variationFeatureFields[17] != null && !variationFeatureFields[17].equals("\\N")) ? variationFeatureFields[17] : "", (variationFeatureFields[11] != null && !variationFeatureFields[11].equals("\\N")) ? variationFeatureFields[11] : "", (variationFeatureFields[20] != null && !variationFeatureFields[20].equals("\\N")) ? variationFeatureFields[20] : ""); return variation; } private String getDisplayConsequenceType(List<String> consequenceTypes) { String displayConsequenceType = null; if (consequenceTypes.size() == 1) { displayConsequenceType = consequenceTypes.get(0); } else { for (String cons : consequenceTypes) { if (!cons.equals("intergenic_variant")) { displayConsequenceType = cons; break; } } } return displayConsequenceType; } private String[] getAllelesArray(String[] variationFeatureFields) { String[] allelesArray; if (variationFeatureFields != null && variationFeatureFields[6] != null) { allelesArray = variationFeatureFields[6].split("/"); if (allelesArray.length == 1) { // In some cases no '/' exists, ie. in 'HGMD_MUTATION' allelesArray = new String[] { "", "" }; } } else { allelesArray = new String[] { "", "" }; } return allelesArray; } private List<Xref> getXrefs(Map<String, String> sourceMap, int variationId) throws IOException, SQLException { List<String[]> variationSynonyms = getVariationRelatedFields(VARIATION_SYNONYM_FILE_ID, variationId); List<Xref> xrefs = new ArrayList<>(); if (variationSynonyms != null && variationSynonyms.size() > 0) { String arr[]; for (String[] variationSynonymFields : variationSynonyms) { // TODO: use constans to identify the fields if (sourceMap.get(variationSynonymFields[3]) != null) { arr = sourceMap.get(variationSynonymFields[3]).split(","); xrefs.add(new Xref(variationSynonymFields[4], arr[0], arr[1])); } } } return xrefs; } private void initializeVariationRelatedArrays() { lastLineInVariationRelatedFile = new String[5][]; lastVariationIdInVariationRelatedFiles = new int[5]; for (int i = 0; i < lastVariationIdInVariationRelatedFiles.length; i++) { lastVariationIdInVariationRelatedFiles[i] = -1; } endOfFileOfVariationRelatedFiles = new boolean[5]; variationRelatedFileReader = new BufferedReader[5]; variationRelatedFileReader[VARIATION_FEATURE_FILE_ID] = variationFeaturesFileReader; variationRelatedFileReader[TRANSCRIPT_VARIATION_FILE_ID] = variationTranscriptsFileReader; variationRelatedFileReader[VARIATION_SYNONYM_FILE_ID] = variationSynonymsFileReader; variationIdColumnIndexInVariationRelatedFile = new int[5]; variationIdColumnIndexInVariationRelatedFile[VARIATION_FEATURE_FILE_ID] = VARIATION_ID_COLUMN_INDEX_IN_VARIATION_FEATURE_FILE; variationIdColumnIndexInVariationRelatedFile[TRANSCRIPT_VARIATION_FILE_ID] = variationIdColumnIndexInTranscriptVariationFile; variationIdColumnIndexInVariationRelatedFile[VARIATION_SYNONYM_FILE_ID] = VARIATION_ID_COLUMN_INDEX_IN_VARIATION_SYNONYM_FILE; } private List<String[]> getVariationRelatedFields(int fileId, int variationId) throws IOException { List<String[]> variationRelatedLines; readFileLinesUntilReachVariation(fileId, variationId); if (endOfFile(fileId) || variationIdExceededInFile(fileId, variationId)) { variationRelatedLines = Collections.EMPTY_LIST; } else { variationRelatedLines = new ArrayList<>(); while (!endOfFile(fileId) && !variationIdExceededInFile(fileId, variationId)) { variationRelatedLines.add(lastLineInVariationRelatedFile[fileId]); readLineInVariationRelatedFile(fileId); } } return variationRelatedLines; } private boolean variationIdExceededInFile(int fileId, int variationId) { return lastVariationIdInVariationRelatedFiles[fileId] > variationId; } private void readFileLinesUntilReachVariation(int fileId, int variationId) throws IOException { while (!endOfFileOfVariationRelatedFiles[fileId] && !variationReachedInFile(fileId, variationId)) { readLineInVariationRelatedFile(fileId); } } private void readLineInVariationRelatedFile(int fileId) throws IOException { String line = variationRelatedFileReader[fileId].readLine(); if (line == null) { endOfFileOfVariationRelatedFiles[fileId] = true; } else { lastLineInVariationRelatedFile[fileId] = line.split("\t", -1); lastVariationIdInVariationRelatedFiles[fileId] = getVariationIdFromLastLineInVariationRelatedFile( fileId); } } private int getVariationIdFromLastLineInVariationRelatedFile(int fileId) { int variationId = Integer.parseInt( lastLineInVariationRelatedFile[fileId][variationIdColumnIndexInVariationRelatedFile[fileId]]); return variationId; } private boolean variationReachedInFile(int fileId, int variationId) { return lastVariationIdInVariationRelatedFiles[fileId] != -1 && lastVariationIdInVariationRelatedFiles[fileId] >= variationId; } private boolean endOfFile(int fileId) { return endOfFileOfVariationRelatedFiles[fileId]; } private List<PopulationFrequency> getPopulationFrequencies(String chromosome, int start, int end, String id, String referenceAllele, String alternativeAllele) throws IOException { List<PopulationFrequency> populationFrequencies; String variationFrequenciesString = getVariationFrequenciesString(chromosome, start, end, id); if (variationFrequenciesString != null) { populationFrequencies = parseVariationFrequenciesString(variationFrequenciesString, referenceAllele, alternativeAllele); } else { populationFrequencies = Collections.EMPTY_LIST; } return populationFrequencies; } private String getVariationFrequenciesString(String chromosome, int start, int end, String id) throws IOException { try { if (frequenciesTabixReader != null) { TabixReader.Iterator frequenciesFileIterator = frequenciesTabixReader .query(chromosome + ":" + start + "-" + end); if (frequenciesFileIterator != null) { String variationFrequenciesLine = frequenciesFileIterator.next(); while (variationFrequenciesLine != null) { String[] variationFrequenciesFields = variationFrequenciesLine.split("\t"); if (variationFrequenciesFields[3].equals(id)) { return variationFrequenciesFields[4]; } variationFrequenciesLine = frequenciesFileIterator.next(); } } } } catch (Exception e) { } return null; } private List<PopulationFrequency> parseVariationFrequenciesString(String variationFrequenciesString, String referenceAllele, String alternativeAllele) { List<PopulationFrequency> frequencies = new ArrayList<>(); for (String populationFrequency : variationFrequenciesString.split(";")) { frequencies.add(parsePopulationFrequency(populationFrequency, referenceAllele, alternativeAllele)); } thousandGenomesPhase1MissedPopulations.add(THOUSAND_GENOMES_AFRICAN_POPULATION); thousandGenomesPhase1MissedPopulations.add(THOUSAND_GENOMES_AMERICAN_POPULATION); thousandGenomesPhase1MissedPopulations.add(THOUSAND_GENOMES_EUROPEAN_POPULATION); thousandGenomesPhase1MissedPopulations.add(THOUSAND_GENOMES_ASIAN_POPULATION); frequencies = addMissedPopulations(frequencies, thousandGenomesPhase1MissedPopulations, THOUSAND_GENOMES_PHASE_1_STUDY, THOUSAND_GENOMES_ALL_POPULATION); thousandGenomesPhase3MissedPopulations.add(THOUSAND_GENOMES_AFRICAN_POPULATION); thousandGenomesPhase3MissedPopulations.add(THOUSAND_GENOMES_AMERICAN_POPULATION); thousandGenomesPhase3MissedPopulations.add(THOUSAND_GENOMES_EUROPEAN_POPULATION); thousandGenomesPhase3MissedPopulations.add(THOUSAND_GENOMES_EASTASIAN_POPULATION); thousandGenomesPhase3MissedPopulations.add(THOUSAND_GENOMES_SOUTHASIAN_POPULATION); thousandGenomesPhase3MissedPopulations.add(THOUSAND_GENOMES_ASIAN_POPULATION); frequencies = addMissedPopulations(frequencies, thousandGenomesPhase3MissedPopulations, THOUSAND_GENOMES_PHASE_3_STUDY, THOUSAND_GENOMES_ALL_POPULATION); return frequencies; } private PopulationFrequency parsePopulationFrequency(String frequencyString, String referenceAllele, String alternativeAllele) { PopulationFrequency populationFrequency = null; Matcher m = populationFrequnciesPattern.matcher(frequencyString); if (m.matches()) { String populationName; String study = ""; String population = m.group(POPULATION_ID_GROUP); switch (population) { case "1000G_PHASE_1_AF": study = THOUSAND_GENOMES_PHASE_1_STUDY; populationName = THOUSAND_GENOMES_ALL_POPULATION; break; case "1000G_PHASE_1_AMR_AF": study = THOUSAND_GENOMES_PHASE_1_STUDY; populationName = THOUSAND_GENOMES_AMERICAN_POPULATION; break; case "1000G_PHASE_1_ASN_AF": study = THOUSAND_GENOMES_PHASE_1_STUDY; populationName = THOUSAND_GENOMES_ASIAN_POPULATION; break; case "1000G_PHASE_1_AFR_AF": study = THOUSAND_GENOMES_PHASE_1_STUDY; populationName = THOUSAND_GENOMES_AFRICAN_POPULATION; break; case "1000G_PHASE_1_EUR_AF": study = THOUSAND_GENOMES_PHASE_1_STUDY; populationName = THOUSAND_GENOMES_EUROPEAN_POPULATION; break; case "1000G_PHASE_3_AF": study = THOUSAND_GENOMES_PHASE_3_STUDY; populationName = THOUSAND_GENOMES_ALL_POPULATION; break; case "1000G_PHASE_3_AMR_AF": study = THOUSAND_GENOMES_PHASE_3_STUDY; populationName = THOUSAND_GENOMES_AMERICAN_POPULATION; break; case "1000G_PHASE_3_ASN_AF": study = THOUSAND_GENOMES_PHASE_3_STUDY; populationName = THOUSAND_GENOMES_ASIAN_POPULATION; break; case "1000G_PHASE_3_AFR_AF": study = THOUSAND_GENOMES_PHASE_3_STUDY; populationName = THOUSAND_GENOMES_AFRICAN_POPULATION; break; case "1000G_PHASE_3_EUR_AF": study = THOUSAND_GENOMES_PHASE_3_STUDY; populationName = THOUSAND_GENOMES_EUROPEAN_POPULATION; break; case "1000G_PHASE_3_EAS_AF": study = THOUSAND_GENOMES_PHASE_3_STUDY; populationName = THOUSAND_GENOMES_EASTASIAN_POPULATION; break; case "1000G_PHASE_3_SAS_AF": study = THOUSAND_GENOMES_PHASE_3_STUDY; populationName = THOUSAND_GENOMES_SOUTHASIAN_POPULATION; break; case "ESP_EA_AF": study = ESP_6500_STUDY; populationName = ESP_EUROPEAN_AMERICAN_POPULATION; break; case "ESP_AA_AF": study = ESP_6500_STUDY; populationName = ESP_AFRICAN_AMERICAN_POPULATION; break; default: populationName = population; } Float referenceFrequency = Float.parseFloat(m.group(REFERENCE_FREQUENCY_GROUP)); Float alternativeFrequency = Float.parseFloat(m.group(ALTERNATE_FREQUENCY_GROUP)); populationFrequency = new PopulationFrequency(study, populationName, populationName, referenceAllele, alternativeAllele, referenceFrequency, alternativeFrequency); } return populationFrequency; } private List<PopulationFrequency> addMissedPopulations(List<PopulationFrequency> frequencies, Set<String> missedPopulations, String study, String allPopulation) { int thousandGenomesPopulationsNumber = missedPopulations.size(); String refAllele = null; String altAllele = null; for (PopulationFrequency frequency : frequencies) { if (frequency != null && frequency.getStudy() != null && frequency.getStudy().equals(study)) { if (frequency.getPop().equals(allPopulation)) { refAllele = frequency.getRefAllele(); altAllele = frequency.getAltAllele(); } missedPopulations.remove(frequency.getPop()); } } // if the variation has some superpopulation frequency, but not all, add the missed superpopulations with 1 as ref allele proportion if (!missedPopulations.isEmpty() && missedPopulations.size() != thousandGenomesPopulationsNumber) { for (String population : missedPopulations) { frequencies.add(new PopulationFrequency(study, population, population, refAllele, altAllele, 1, 0)); } } return frequencies; } private List<TranscriptVariation> getTranscriptVariations(int variationId, String variationFeatureId) throws IOException, SQLException { // Note the ID used, TranscriptVariation references to VariationFeature no Variation !!! List<TranscriptVariation> transcriptVariation = new ArrayList<>(); List<String[]> resultTranscriptVariations = getVariationRelatedFields(TRANSCRIPT_VARIATION_FILE_ID, variationId); //getVariationTranscripts(variationId, Integer.parseInt(variationFeatureId)); if (resultTranscriptVariations != null && resultTranscriptVariations.size() > 0) { for (String[] transcriptVariationFields : resultTranscriptVariations) { if (transcriptVariationFields[VARIATION_FEATURE_ID_COLUMN_INDEX_IN_TRANSCRIPT_VARIATION_FILE] .equals(variationFeatureId)) { TranscriptVariation tv = buildTranscriptVariation(transcriptVariationFields); transcriptVariation.add(tv); } } } return transcriptVariation; } private TranscriptVariation buildTranscriptVariation(String[] transcriptVariationFields) { return new TranscriptVariation( (transcriptVariationFields[2] != null && !transcriptVariationFields[2].equals("\\N")) ? transcriptVariationFields[2] : "", (transcriptVariationFields[3] != null && !transcriptVariationFields[3].equals("\\N")) ? transcriptVariationFields[3] : "", (transcriptVariationFields[4] != null && !transcriptVariationFields[4].equals("\\N")) ? transcriptVariationFields[4] : "", Arrays.asList(transcriptVariationFields[5].split(",")), (transcriptVariationFields[6] != null && !transcriptVariationFields[6].equals("\\N")) ? Integer.parseInt(transcriptVariationFields[6]) : 0, (transcriptVariationFields[7] != null && !transcriptVariationFields[7].equals("\\N")) ? Integer.parseInt(transcriptVariationFields[7]) : 0, (transcriptVariationFields[8] != null && !transcriptVariationFields[8].equals("\\N")) ? Integer.parseInt(transcriptVariationFields[8]) : 0, (transcriptVariationFields[9] != null && !transcriptVariationFields[9].equals("\\N")) ? Integer.parseInt(transcriptVariationFields[9]) : 0, (transcriptVariationFields[10] != null && !transcriptVariationFields[10].equals("\\N")) ? Integer.parseInt(transcriptVariationFields[10]) : 0, (transcriptVariationFields[11] != null && !transcriptVariationFields[11].equals("\\N")) ? Integer.parseInt(transcriptVariationFields[11]) : 0, (transcriptVariationFields[12] != null && !transcriptVariationFields[12].equals("\\N")) ? Integer.parseInt(transcriptVariationFields[12]) : 0, (transcriptVariationFields[13] != null && !transcriptVariationFields[13].equals("\\N")) ? transcriptVariationFields[13] : "", (transcriptVariationFields[14] != null && !transcriptVariationFields[14].equals("\\N")) ? transcriptVariationFields[14] : "", (transcriptVariationFields[15] != null && !transcriptVariationFields[15].equals("\\N")) ? transcriptVariationFields[15] : "", (transcriptVariationFields[16] != null && !transcriptVariationFields[16].equals("\\N")) ? transcriptVariationFields[16] : "", (transcriptVariationFields[17] != null && !transcriptVariationFields[17].equals("\\N")) ? transcriptVariationFields[17] : "", (transcriptVariationFields[18] != null && !transcriptVariationFields[18].equals("\\N")) ? transcriptVariationFields[18] : "", (transcriptVariationFields[19] != null && !transcriptVariationFields[19].equals("\\N")) ? Float.parseFloat(transcriptVariationFields[19]) : 0f, (transcriptVariationFields[20] != null && !transcriptVariationFields[20].equals("\\N")) ? transcriptVariationFields[20] : "", (transcriptVariationFields[21] != null && !transcriptVariationFields[21].equals("\\N")) ? Float.parseFloat(transcriptVariationFields[21]) : 0f); } private void gunzipVariationInputFiles() throws IOException, InterruptedException { logger.info("Unzipping variation files ..."); Stopwatch stopwatch = Stopwatch.createStarted(); if (!existsZippedOrUnzippedFile(PREPROCESSED_VARIATION_FILENAME)) { // unzip variation file name for preprocess it later gunzipFileIfNeeded(variationDirectoryPath, VARIATION_FILENAME); } if (!existsZippedOrUnzippedFile(PREPROCESSED_TRANSCRIPT_VARIATION_FILENAME)) { gunzipFileIfNeeded(variationDirectoryPath, TRANSCRIPT_VARIATION_FILENAME); } if (!existsZippedOrUnzippedFile(PREPROCESSED_VARIATION_FEATURE_FILENAME)) { gunzipFileIfNeeded(variationDirectoryPath, VARIATION_FEATURE_FILENAME); } if (!existsZippedOrUnzippedFile(PREPROCESSED_VARIATION_SYNONYM_FILENAME)) { gunzipFileIfNeeded(variationDirectoryPath, VARIATION_SYNONYM_FILENAME); } logger.info("Done"); logger.debug("Elapsed time unzipping files: " + stopwatch); } private boolean existsZippedOrUnzippedFile(String baseFilename) { return Files.exists(variationDirectoryPath.resolve(baseFilename)) || Files.exists(variationDirectoryPath.resolve(baseFilename + ".gz")); } private boolean isEmpty(String fileName) throws IOException { if (Files.exists(Paths.get(fileName))) { return Files.size(Paths.get(fileName)) == 0; } else { return Files.size(Paths.get(fileName + ".gz")) == 0; } } private void gunzipFileIfNeeded(Path directory, String fileName) throws IOException, InterruptedException { Path zippedFile = directory.resolve(fileName + ".gz"); if (Files.exists(zippedFile)) { logger.info("Unzipping " + zippedFile + "..."); Process process = Runtime.getRuntime().exec("gunzip " + zippedFile.toAbsolutePath()); process.waitFor(); } else { Path unzippedFile = directory.resolve(fileName); if (Files.exists(unzippedFile)) { logger.info( "File " + unzippedFile + " was previously unzipped: skipping 'gunzip' for this file ..."); } else { throw new FileNotFoundException("File " + zippedFile + " doesn't exist"); } } } private void gzipVariationFiles(Path variationDirectoryPath) throws IOException, InterruptedException { this.logger.info("Compressing variation files ..."); Stopwatch stopwatch = Stopwatch.createStarted(); gzipFile(variationDirectoryPath, VARIATION_FILENAME); gzipFile(variationDirectoryPath, PREPROCESSED_VARIATION_FILENAME); gzipFile(variationDirectoryPath, VARIATION_FEATURE_FILENAME); gzipFile(variationDirectoryPath, TRANSCRIPT_VARIATION_FILENAME); gzipFile(variationDirectoryPath, VARIATION_SYNONYM_FILENAME); gzipFile(variationDirectoryPath, PREPROCESSED_VARIATION_FEATURE_FILENAME); gzipFile(variationDirectoryPath, PREPROCESSED_TRANSCRIPT_VARIATION_FILENAME); gzipFile(variationDirectoryPath, PREPROCESSED_VARIATION_SYNONYM_FILENAME); this.logger.info("Files compressed"); this.logger.debug("Elapsed time compressing files: " + stopwatch); } private void gzipFile(Path directory, String fileName) throws IOException, InterruptedException { Path unzippedFile = directory.resolve(fileName); if (Files.exists(unzippedFile)) { this.logger.info("Compressing " + unzippedFile.toAbsolutePath()); Process process = Runtime.getRuntime().exec("gzip " + unzippedFile.toAbsolutePath()); process.waitFor(); } } private BufferedReader getBufferedReader(Path file) throws IOException { if (Files.exists(file)) { return FileUtils.newBufferedReader(file); } else { return FileUtils.newBufferedReader(Paths.get(file.toString() + ".gz")); } } private BufferedReader getBufferedReader(String fileName) throws IOException { Path inputFile; if (Files.exists(variationDirectoryPath.resolve(fileName))) { inputFile = variationDirectoryPath.resolve(fileName); } else { inputFile = variationDirectoryPath.resolve(fileName + ".gz"); } return FileUtils.newBufferedReader(inputFile); } private String getOutputFileName(String chromosome) { String outputFileName = outputFileNames.get(chromosome); if (outputFileName == null) { outputFileName = "variation_chr" + chromosome; outputFileNames.put(chromosome, outputFileName); } return outputFileName; } }