Java tutorial
/* * Copyright (C) 2009-2011 Institute for Computational Biomedicine, * Weill Medical College of Cornell University * * This file is part of the Goby IO API. * * The Goby IO API is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The Goby IO API is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with the Goby IO API. If not, see <http://www.gnu.org/licenses/>. */ package edu.cornell.med.icb.goby.readers.vcf; import edu.cornell.med.icb.goby.modes.TabToColumnInfoMode; import it.unimi.dsi.fastutil.ints.Int2ObjectMap; import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap; import it.unimi.dsi.fastutil.ints.IntArrayList; import it.unimi.dsi.fastutil.objects.ObjectArrayList; import it.unimi.dsi.io.FastBufferedReader; import it.unimi.dsi.io.LineIterator; import it.unimi.dsi.lang.MutableString; import net.sf.samtools.util.BlockCompressedInputStream; import org.apache.commons.io.IOUtils; import java.io.*; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; import java.util.Map; /** * Parser for files in the <a href="http://vcftools.sourceforge.net/specs.html">Variant Call Format</a>, or in plain TSV format. * This parser will read either a VCF file with meta-data information defined about columns, or will read plain TSV files. * Two version of TSV files are supported. That which starts with a header line without # or with # as first character. * This reader supports a Group attribute in field declarations. When the VCF 4 supports the following declaration line, * which declares ID, Number, Type and Description attributes: * <pre>##INFO=<ID=AF1,Number=1,Type=Float,Description="Max-likelihood ..."></pre> * This parser can additionally read a Group attribute, such as in: * <pre>##INFO=<ID=AF1,Number=1,Type=Float,Group=LIKELIHOODS,Description="Max-likelihood ..."></pre> * * @author Fabien Campagne * Date: Mar 26, 2011 * Time: 3:01:47 PM */ public class VCFParser implements Closeable { private Reader input; private Columns columns = new Columns(); private boolean hasNextDataLine; private int numberOfColumns; private int[] columnStarts; private int[] columnEnds; private MutableString line; private char columnSeparatorCharacter = '\t'; private int[] fieldStarts; private int[] fieldEnds; private char fieldSeparatorCharacter = ';'; private char formatFieldSeparatorCharacter = ':'; private int numberOfFields; private int globalFieldIndex; private int formatColumnIndex; private int lineLength; private int globalColumnIndex; /** * Variable TSV is true if we determined the file is tab delimited. */ private boolean TSV = true; /** * An array with dimensions numAllFields that stores the permutation from the global field index * to the observed field index (taking into account absence or presence of Flag attributes, and the fact that * fields that occur in any order on each line in a column). */ private int[] fieldPermutation; /** * Sorts columns in increasing columnIndex order. */ public static final Comparator<ColumnInfo> COLUMN_SORT = new Comparator<ColumnInfo>() { @Override public int compare(final ColumnInfo c1, final ColumnInfo c2) { return c1.columnIndex - c2.columnIndex; } }; /** * Sorts fields in increasing globalFieldIndex order. */ public static final Comparator<ColumnField> FIELD_SORT = new Comparator<ColumnField>() { @Override public int compare(final ColumnField c1, final ColumnField c2) { return c1.globalFieldIndex - c2.globalFieldIndex; } }; private final ObjectArrayList<ColumnInfo> columnList = new ObjectArrayList<ColumnInfo>(); private final ObjectArrayList<ColumnField> fieldList = new ObjectArrayList<ColumnField>(); private ColumnInfo formatColumn; private boolean headerLineNotParsed = true; private boolean headerParsed; private File inputFile; /** * Map of column name to column type for TSV files. When reading a TSV file this is populated by * readTsvColumnTypes() during parseHeaderLine() and used to set the correct types based on the * data found in the TSV file. */ private Map<String, ColumnType> tsvColumnNameToTypeMap; /** * If we should cache the TSV column types to a .colinfo file when parsing TSV files. This must be * set BEFORE calling readHeader() if you want it to be used. */ private boolean cacheTsvColumnTypes = true; /** * When scanning a TSV file to determine column types, this is the number of lines that will be checked. * Set to <= 0 to scan the entire file. This must be set before calling readHeader() for the value to be used. */ private int tsvLinesToScanForColumnType = -1; /** * Indicate that the field permutation has already been computed. When this field is true, the permutation * of the first line computed is reused for subsequent lines. */ private boolean computedFieldPermutation; /** * When this field is true, the permutation of the first line computed is reused for subsequent lines. */ private boolean cacheFieldPermutation; private String associationString; /** * Constructs a VCF parser. * * @param file Input to parse */ public VCFParser(final Reader file) { this.input = file; } /** * Constructs a VCF parser. When the filename ends in .gz, this method attempts to decompress * the file on the fly, using BlockCompressedInputStream (from samtools). BlockCompressedInputStream * is used preferentially to GZipInputStream to avoid truncating bgzip input files produced with bgzip. * http://biostar.stackexchange.com/questions/6112/how-to-decompress-1000genomes-bgzip-compressed-files-using-java * * @param filename Input to parse * @throws java.io.IOException when an error occurs. */ public VCFParser(final String filename) throws IOException { inputFile = new File(filename); input = filename.endsWith(".gz") ? new InputStreamReader(new BlockCompressedInputStream(new FileInputStream(filename))) : new FileReader(filename); } /** * If we should cache the TSV column types to a .colinfo file when parsing TSV files. This must be set * before calling readHeader() for the value to be used. * * @param cacheTsvColumnTypes new cacheTsvColumnTypes value */ public void setCacheTsvColumnTypes(final boolean cacheTsvColumnTypes) { this.cacheTsvColumnTypes = cacheTsvColumnTypes; } /** * If we should cache the TSV column types to a .colinfo file when parsing TSV files. * * @return cacheTsvColumnTypes value */ public boolean isCacheTsvColumnTypes() { return cacheTsvColumnTypes; } /** * When scanning a TSV file to determine column types, this is the number of lines that will be checked. * Set to <= 0 to scan the entire file. This must be set before calling readHeader() for the value to be used. * * @param tsvLinesToScanForColumnType the new value of tsvLinesToScanForColumnType */ public void setTsvLinesToScanForColumnType(final int tsvLinesToScanForColumnType) { this.tsvLinesToScanForColumnType = tsvLinesToScanForColumnType; } /** * When scanning a TSV file to determine column types, this is the number of lines that will be checked. * * @return the value of tsvLinesToScanForColumnType */ public int getTsvLinesToScanForColumnType() { return tsvLinesToScanForColumnType; } /** * If parsing a TSV file, this can be called to retrieve the column types map. The column types map * .colinfo cache file will be created the first time this is called and read on subsequent calls. * * @throws IOException if error reading / creating .colinfo cache file */ public void readTsvColumnTypes() throws IOException { // TODO: Currently this will not work right if the TSV file header line DOES start with "#" // TODO: which could be a problem, although current TSV files produced by DiffExp's do not do this. if (tsvColumnNameToTypeMap == null && inputFile != null && inputFile.exists()) { final TabToColumnInfoMode reader = new TabToColumnInfoMode(); reader.addInputFile(inputFile); reader.setCreateCache(cacheTsvColumnTypes); reader.setNumberOfLinesToProcess(tsvLinesToScanForColumnType); reader.setReadFromCache(true); reader.execute(); tsvColumnNameToTypeMap = reader.getDetailsAtIndex(0); } } /** * Return the number of columns in the file. This method can be called after the header has been read to obtain * the number of columns in the file. * * @return The number of columns in the file */ public int getNumberOfColumns() { return numberOfColumns; } /** * Return the columns in the file. This method can be called after the header has been read. * * @return The columns declared in the file */ public Columns getColumns() { return columns; } public ColumnType getColumnType(final int columnIndex) { for (final ColumnInfo col : columns) { if (col.columnIndex == columnIndex) { if (col.fields.size() == 1) { return ((ColumnField) col.fields.toArray()[0]).type; } else { break; } } } return ColumnType.String; } /** * Return the type of the specified field. * * @param globalFieldIndex global field index, from zero to countAllFields()-1 * @return type of the specified field. */ public ColumnType getFieldType(final int globalFieldIndex) { return fieldList.get(globalFieldIndex).type; } /** * Return the number of values in the specified field. * * @param globalFieldIndex global field index, from zero to countAllFields()-1 * @return the number of values contained in this field. */ public int getFieldNumValues(final int globalFieldIndex) { return fieldList.get(globalFieldIndex).numberOfValues; } public String getColumnName(final int columnIndex) { for (final ColumnInfo col : columns) { if (col.columnIndex == columnIndex) { return col.columnName; } } return null; } private LineIterator lineIterator; public boolean hasNextDataLine() { if (hasNextDataLine) { return true; } hasNextDataLine = lineIterator.hasNext(); if (hasNextDataLine) { line = lineIterator.next(); if (!TSV) { parseCurrentLine(); } else { parseTSVLine(); } } return hasNextDataLine; } public void next() { if (!hasNextDataLine) { throw new IllegalArgumentException("Next can be called only after hasNext has returned true."); } hasNextDataLine = false; } /** * Returns a column value as a CharSequence. Faster than returning a String. * * @param columnIndex index of the field on a line of input. * @return a column value as a CharSequence. */ public CharSequence getColumnValue(final int columnIndex) { if (hasNextDataLine) { return line.subSequence(columnStarts[columnIndex], columnEnds[columnIndex]); } else return null; } /** * Returns the total number of fields across all columns. * * @return the sum of the number of fields in each column. */ public int countAllFields() { int n = 0; for (final ColumnInfo column : columns) { n += column.fields.size(); } return n; } /** * Returns the value of the field. * The field is identified by a global index that runs from zero (inclusive) to countAllFields() (exclusive). * * @param globalFieldIndex a global index that runs from zero to countAllFields() * @return Value of this field. */ public CharSequence getFieldValue(final int globalFieldIndex) { if (hasNextDataLine) { final int lineFieldIndex = fieldPermutation[globalFieldIndex]; if (lineFieldIndex == -1) { // missing field in this row; return ""; } final int start = fieldStarts[lineFieldIndex]; final int end = fieldEnds[lineFieldIndex]; assert (start >= 0 && end <= lineLength) : String.format( "position indices must be within line boundaries start: %d end: %d length: %d", start, end, lineLength); return line.subSequence(start, end); } else return null; } /** * Returns the value of a field. * The field is identified by a global index that runs from zero (inclusive) to countAllFields() (exclusive). * * @param globalFieldIndex a global index that runs from zero to countAllFields() * @return Value of this field. */ public String getStringFieldValue(final int globalFieldIndex) { final CharSequence value = getFieldValue(globalFieldIndex); return value == null ? null : value.toString(); } /** * Returns a column value as a String. * * @param columnIndex index of the field on a line of input. * @return a column value as a String. */ public String getStringColumnValue(final int columnIndex) { return getColumnValue(columnIndex).toString(); } private Int2ObjectMap<String> fieldIndexToName; /** * Return the field name, in the format: * <LI>For columns with multiple fields: <column-name>[<field-id>] * <LI>For columns with a single field: <column-name>[<field-id>] * * @param globalFieldIndex index of the field across columns. * @return the field name */ public String getFieldName(final int globalFieldIndex) { return fieldIndexToName.get(globalFieldIndex); } private FastBufferedReader bufferedReader; /** * Read the header of this file. Headers in the VCF format are supported, as well as TSV single header lines (with or * without first character #. * * @throws SyntaxException When the syntax of the VCF file is incorrect. */ public void readHeader() throws SyntaxException { if (headerParsed) { return; } headerParsed = true; globalFieldIndex = 0; fieldIndexToName = new Int2ObjectOpenHashMap<String>(); bufferedReader = new FastBufferedReader(input); lineIterator = new LineIterator(bufferedReader); int lineNumber = 1; try { while (lineIterator.hasNext()) { line = lineIterator.next(); if (line.startsWith("##")) { TSV = false; } if (!line.startsWith("#")) { if (TSV && lineNumber == 1 && headerLineNotParsed) { // assume the file is TSV and starts directly with the header line. Parse lineIterator here. parseHeaderLine(new MutableString("#" + line)); } else { // We are seeing an actual line of data. Prepare for parsing: if (!TSV) { parseCurrentLine(); } else { parseTSVLine(); } hasNextDataLine = true; } break; } if (line.startsWith("##")) { TSV = false; processMetaInfoLine(line); } else if (line.startsWith("#")) { parseHeaderLine(line); } lineNumber++; } } catch (net.sf.samtools.FileTruncatedException e) { line = null; hasNextDataLine = false; // install a dummy line iteratory that always returns false when hasNext is called. lineIterator = new LineIterator(bufferedReader) { @Override public boolean hasNext() { return false; } @Override public MutableString next() { return null; } }; } } private void parseTSVLine() { Arrays.fill(columnStarts, 0); Arrays.fill(columnEnds, 0); Arrays.fill(fieldStarts, 0); Arrays.fill(fieldEnds, 0); int columnIndex = 0; lineLength = line.length(); for (int i = 0; i < lineLength; i++) { final char c = line.charAt(i); if (c == '\t') { final String columnName = columnList.get(columnIndex).columnName; columnEnds[columnIndex] = i; if (columnIndex + 1 < columnStarts.length) { columnStarts[columnIndex + 1] = i + 1; } fieldPermutation[columnIndex] = columnIndex; ++columnIndex; } } fieldPermutation[columnEnds.length - 1] = columnEnds.length - 1; columnEnds[columnEnds.length - 1] = lineLength; columnStarts[columnEnds.length - 1] = columnEnds[columnEnds.length - 2] + 1; System.arraycopy(columnEnds, 0, fieldEnds, 0, columnEnds.length); System.arraycopy(columnStarts, 0, fieldStarts, 0, columnStarts.length); } final IntArrayList previousColumnFieldIndices = new IntArrayList(); private void parseCurrentLine() { Arrays.fill(columnStarts, 0); Arrays.fill(columnEnds, 0); Arrays.fill(fieldStarts, 0); Arrays.fill(fieldEnds, 0); columnStarts[0] = 0; int columnIndex = 0; int fieldIndex = 0; lineLength = line.length(); final int[] lineFieldIndexToColumnIndex = new int[numberOfFields]; Arrays.fill(lineFieldIndexToColumnIndex, -1); previousColumnFieldIndices.clear(); // determine the position of column and field delimiters: final char[] chrs = line.toCharArray(); for (int i = 0; i < lineLength; i++) { final char c = chrs[i]; if (c == columnSeparatorCharacter) { fieldPermutation[columnIndex] = columnIndex; columnEnds[columnIndex] = i; if (columnIndex + 1 < numberOfColumns) { columnStarts[columnIndex + 1] = i + 1; } //lineFieldIndexToColumnIndex[columnIndex] = columnIndex; } if (c == columnSeparatorCharacter || c == fieldSeparatorCharacter || (columnIndex >= formatColumnIndex && c == formatFieldSeparatorCharacter)) { if (TSV) { // there are no fields, only columns, the field separators do not apply fieldEnds[columnIndex] = columnEnds[columnIndex]; fieldStarts[columnIndex] = columnStarts[columnIndex]; fieldIndex = columnIndex; lineFieldIndexToColumnIndex[fieldIndex] = columnIndex; } else { fieldEnds[fieldIndex] = i; if (fieldIndex + 1 < numberOfFields) { fieldStarts[fieldIndex + 1] = i + 1; } previousColumnFieldIndices.add(fieldIndex); fieldIndex++; fieldIndex = Math.min(fieldEnds.length - 1, fieldIndex); fieldIndex = Math.min(fieldStarts.length - 1, fieldIndex); } } if (c == columnSeparatorCharacter) { if (TSV) { lineFieldIndexToColumnIndex[fieldIndex] = columnIndex; } push(columnIndex, lineFieldIndexToColumnIndex, previousColumnFieldIndices); columnIndex++; } } int numberOfFieldsOnLine = Math.min(fieldIndex, fieldEnds.length - 1); int numberOfColumnsOnLine = Math.min(columnIndex, columnEnds.length - 1); columnStarts[0] = 0; columnEnds[numberOfColumnsOnLine - (TSV ? 1 : 0)] = line.length(); fieldStarts[0] = 0; fieldEnds[numberOfFieldsOnLine - (TSV ? 1 : 0)] = line.length(); previousColumnFieldIndices.add(fieldIndex); push(columnIndex, lineFieldIndexToColumnIndex, previousColumnFieldIndices); if (cacheFieldPermutation && computedFieldPermutation) return; Arrays.fill(fieldPermutation, -1); for (ColumnInfo c : columns) { c.formatIndex = 0; } // determine the fieldPermutation for each possible field: for (int lineFieldIndex = 0; lineFieldIndex <= numberOfFieldsOnLine; lineFieldIndex++) { final int start = fieldStarts[lineFieldIndex]; final int end = fieldEnds[lineFieldIndex]; final int cIndex = lineFieldIndexToColumnIndex[lineFieldIndex]; if (cIndex >= columnList.size()) { break; } final ColumnInfo column = columnList.get(cIndex); int colMinGlobalFieldIndex = Integer.MAX_VALUE; int colMaxGlobalFieldIndex = Integer.MIN_VALUE; final ColumnFields fields = column.fields; fields.rebuildList(); for (int fi = 0; fi < fields.size(); ++fi) { final ColumnField f = fields.get(fi); colMinGlobalFieldIndex = Math.min(colMinGlobalFieldIndex, f.globalFieldIndex); colMaxGlobalFieldIndex = Math.max(colMaxGlobalFieldIndex, f.globalFieldIndex); } final int formatColumnIndex = TSV ? -1 : formatColumn.columnIndex; final int startFormatColumn = TSV ? 0 : columnStarts[formatColumnIndex]; final int endFormatColumn = TSV ? 0 : columnEnds[formatColumnIndex]; final String[] formatTokens = split(line, formatFieldSeparatorCharacter, startFormatColumn, endFormatColumn); for (int fi = 0; fi < fields.size(); ++fi) { final ColumnField f = fields.get(fi); if (fieldPermutation[f.globalFieldIndex] != -1) { // already assigned. continue; } if (colMaxGlobalFieldIndex == colMinGlobalFieldIndex) { // This column has only one field. fieldPermutation[f.globalFieldIndex] = lineFieldIndex; break; } else { // find the column field f whose id matches the character span we are looking at : int j = start; final String id = f.id; int matchLength = 0; for (int i = 0; i < id.length(); i++) { if (j >= end) { // reached end of field, not this field. break; } final char linechar = line.charAt(j); if (id.charAt(i) != linechar) { // found mimatch with field id, not this field. matchLength = -1; break; } matchLength++; j++; } if (matchLength == id.length() && line.charAt(j) == '=' || (j == end && f.type == ColumnType.Flag)) { // found the correct field. /* System.out.printf("Assigning global %s %d -> %d for field %s%n", f.id, globalFieldIndex, lineFieldIndex, line.subSequence(start, end)); */ fieldPermutation[f.globalFieldIndex] = lineFieldIndex; if (f.type != ColumnType.Flag) { fieldStarts[lineFieldIndex] += f.id.length() + 1; // remove id= from value; //fieldStarts[lineFieldIndex]=Math.min(fieldStarts[lineFieldIndex],fieldEnds[lineFieldIndex]); } break; } else { if (column.useFormat && column.formatIndex < formatTokens.length) { if (f.id.equals(formatTokens[column.formatIndex])) { /* System.out.printf("Assigning FORMAT global %s %d -> %d for field %s%n", f.id, f.globalFieldIndex, lineFieldIndex, line.subSequence(start, end)); */ fieldPermutation[f.globalFieldIndex] = lineFieldIndex; column.formatIndex++; break; } } } } } } computedFieldPermutation = true; } String[] formatSplit = null; private String[] split(final MutableString line, final char formatFieldSeparatorCharacter, final int startFormatColumn, final int endFormatColumn) { if (formatSplit != null) { return formatSplit; } else { final MutableString formatSpan = line.substring(startFormatColumn, endFormatColumn); int fieldCount = 0; formatSpan.append(formatFieldSeparatorCharacter); final int length = formatSpan.length(); for (int i = 0; i < length; i++) { if (formatSpan.charAt(i) == formatFieldSeparatorCharacter) { ++fieldCount; } } final String[] result = new String[fieldCount]; final MutableString value = new MutableString(); int last = 0; int j = 0; for (int i = 0; i < length; i++) { if (formatSpan.charAt(i) == formatFieldSeparatorCharacter && i > last) { value.append(formatSpan.substring(last, i)); last = i + 1; result[j] = value.toString(); value.setLength(0); ++j; } } formatSplit = result; return result; } } // System.out.println("ned"); private void push(final int columnIndex, final int[] lineFieldIndexToColumnIndex, final IntArrayList previousColumnFieldIndices) { // System.out.println("---"); final int size = previousColumnFieldIndices.size(); for (int i = 0; i < size; ++i) { final int fIndex = previousColumnFieldIndices.getInt(i); /* System.out.printf("field %s gfi:%d belongs to column %d %s%n ", line.substring(fieldStarts[fIndex], fieldEnds[fIndex]), fIndex, columnIndex, columnList.get(columnIndex).columnName);*/ lineFieldIndexToColumnIndex[fIndex] = columnIndex; } previousColumnFieldIndices.clear(); } private void parseHeaderLine(MutableString line) { if (TSV) { // Attempt to determine the column types using TabToColumnInfoMode. try { readTsvColumnTypes(); } catch (IOException e) { System.err.println("Could not determine column info from tsv file " + e.getMessage()); } } headerLineNotParsed = false; // System.out.printf("header line:%s%n", line); // drop the # line = line.substring(1); final String[] columnNames = line.toString().split("[\\t]"); for (final String columnName : columnNames) { defineFixedColumn(columnName); if (!columns.hasColumnName(columnName)) { final ColumnInfo formatColumn = columns.find("FORMAT"); // copy the fields of the FORMAT column for each sample: final ColumnField[] fields; if (formatColumn != null) { fields = new ColumnField[formatColumn.fields.size()]; int i = 0; for (final ColumnField field : formatColumn.fields) { fields[i] = new ColumnField(field.id, field.numberOfValues, field.type, field.description); fields[i].globalFieldIndex = -1; i++; } } else { final ColumnType columnType; if (tsvColumnNameToTypeMap != null) { columnType = tsvColumnNameToTypeMap.get(columnName) == null ? ColumnType.String : tsvColumnNameToTypeMap.get(columnName); } else { columnType = ColumnType.String; } fields = new ColumnField[] { new ColumnField("VALUE", 1, columnType, "") }; fields[0].globalFieldIndex = -1; } final ColumnInfo newCol = new ColumnInfo(columnName, fields); newCol.useFormat = true; columns.add(newCol); } } formatColumn = columns.find("FORMAT"); // columns.remove(formatColumn); // columnList.remove(formatColumn); for (final ColumnInfo column : columns) { if (column.columnIndex == -1) { column.columnIndex = globalColumnIndex++; } for (final ColumnField field : column.fields) { if (field.globalFieldIndex == -1) { field.globalFieldIndex = globalFieldIndex++; } final String name; if (column.fields.size() == 1) { name = column.columnName; } else { name = String.format("%s[%s]", column.columnName, field.id); } fieldIndexToName.put(field.globalFieldIndex, name); } } formatColumnIndex = TSV ? -1 : formatColumn.columnIndex; numberOfColumns = globalColumnIndex; columnStarts = new int[numberOfColumns]; columnEnds = new int[numberOfColumns]; numberOfFields = globalFieldIndex; fieldStarts = new int[numberOfFields]; fieldEnds = new int[numberOfFields]; fieldPermutation = new int[numberOfFields]; columnList.addAll(columns); Collections.sort(columnList, COLUMN_SORT); for (final ColumnInfo column : columnList) { fieldList.addAll(column.fields); } } private void defineFixedColumn(final String columnName) { for (final ColumnInfo fixed : fixedColumns) { if (fixed.columnName.equals(columnName) && !columns.hasColumnName(columnName)) { fixed.columnIndex = globalColumnIndex++; for (final ColumnField field : fixed.fields) { field.globalFieldIndex = globalFieldIndex++; } columns.add(fixed); return; } } } public static ColumnInfo[] fixedColumn() { // returns a deep copy of fixed columns so that this class is not affected by possible changes to the returned // value. final ColumnInfo[] copy = new ColumnInfo[fixedColumns.length]; int i = 0; for (final ColumnInfo fixedColumn : fixedColumns) { copy[i++] = fixedColumn.copy(); } return copy; } final static private ColumnInfo fixedColumns[] = new ColumnInfo[] { new ColumnInfo("CHROM", true, new ColumnField("VALUE", 1, ColumnType.String, "The reference position, with the 1st base having position 1. " + "Positions are sorted numerically, in increasing order, within each reference sequence CHROM.", "genomic-coordinate", "cross-sample-field")), new ColumnInfo("POS", true, new ColumnField("VALUE", 1, ColumnType.Integer, "he reference position, with the 1st base having position 1. " + "Positions are sorted numerically, in increasing order, within each reference sequence CHROM.", "genomic-coordinate", "cross-sample-field")), new ColumnInfo("ID", true, new ColumnField("VALUE", 1, ColumnType.String, "ID semi-colon separated list of unique identifiers where available. " + "If this is a dbSNP variant it is encouraged to use the rs number(s). " + "No identifier should be present in more than one data record. " + "If there is no identifier available, then the missing value should be used.", "external-identifiers", "cross-sample-field")), new ColumnInfo("REF", true, new ColumnField("VALUE", 1, ColumnType.String, "Reference base(s): Each base must be one of A,C,G,T,N. " + "Bases should be in uppercase. Multiple bases are permitted. " + "The value in the POS field refers to the position of the first base in the String. " + "For InDels, the reference String must include the base before the event " + "(which must be reflected in the POS field).", "cross-sample-field")), new ColumnInfo("ALT", true, new ColumnField("VALUE", 1, ColumnType.String, "Comma separated list of alternate non-reference alleles called on at least one of the samples. " + "Options are base Strings made up of the bases A,C,G,T,N, or " + "an angle-bracketed ID String (\"<ID>\"). " + "If there are no alternative alleles, then the missing value should be used. " + "Bases should be in uppercase. (Alphanumeric String; no whitespace, commas, " + "or angle-brackets are permitted in the ID String itself).", "cross-sample-field")), new ColumnInfo("QUAL", true, new ColumnField("VALUE", 1, ColumnType.Float, "Phred-scaled quality score for the assertion made in ALT. i.e. give -10log_10 prob(call in ALT is wrong). " + "If ALT is \".\" (no variant) then this is -10log_10 p(variant), " + "and if ALT is not \".\" this is -10log_10 p(no variant). " + "High QUAL scores indicate high confidence calls. " + "Although traditionally people use integer phred scores, this field is permitted to be " + "a floating point to enable higher resolution for low confidence calls if desired.", "cross-sample-field")), new ColumnInfo("FILTER", true, new ColumnField("VALUE", 1, ColumnType.String, "Filter: PASS if this position has passed all filters, i.e. a call is made at this position. " + "Otherwise, if the site has not passed all filters, a semicolon-separated list of codes " + "for filters that fail. e.g. \"10;s50\" might indicate that at this site the quality is " + "below 10 and the number of samples with data is below 50%% of the total number of samples. " + "\"0\" is reserved and should not be used as a filter String. " + "If filters have not been applied, then this field should be set to the missing value.", "cross-sample-field")), new ColumnInfo("INFO", true, new ColumnField("VALUE", 1, ColumnType.String, "Additional information: INFO fields are encoded as a semicolon-separated series of short keys " + "with optional values in the format: <key>=<data>[,data]. Arbitrary keys are permitted, " + "although some sub-fields are reserved.", "cross-sample-field")), }; private void processMetaInfoLine(final MutableString line) throws SyntaxException { final int start = 2; final int end = line.indexOf('='); final String columnName = line.substring(start, end).toString(); final MutableString restOfLine = line.substring(end + 1); processMetaInfo(columnName, restOfLine); } private void processMetaInfo(final String columnName, final MutableString infoDefinition) throws SyntaxException { if (line.startsWith("##FieldGroupAssociations=")) { associationString = line.replace("##FieldGroupAssociations=", "").toString(); return; } if ("fileformat".equals(columnName) || "samtoolsVersion".equals(columnName)) { return; } if (!infoDefinition.startsWith("<") && !infoDefinition.endsWith(">")) { // is this a syntax error? test-data/vcf/tricky.vcf would trigger errors return; } final ColumnInfo info; if (columns.hasColumnName(columnName)) { info = columns.find(columnName); } else { info = new ColumnInfo(); info.columnName = columnName; columns.add(info); } final ColumnField field = new ColumnField(); final MutableString insideBrackets = infoDefinition.substring(1, infoDefinition.length() - 1); try { final String[] tokens = insideBrackets.toString().split("(,N)|(,T)|(,D)|(,G)"); for (final String token : tokens) { final String[] kv = new String[2]; final int firstEqualIndex = token.indexOf('='); kv[0] = token.substring(0, firstEqualIndex); kv[1] = token.substring(firstEqualIndex + 1); if ("ID".equals(kv[0])) { field.id = kv[1]; } else if ("umber".equals(kv[0])) { String num = kv[1]; if (".".equals(num)) { // . indicates any number of values. Use 2 num = "-1"; } field.numberOfValues = Integer.parseInt(num); } else if ("ype".equals(kv[0])) { field.type = ColumnType.valueOf(kv[1]); } else if ("roup".equals(kv[0])) { field.group = kv[1]; } else if ("escription".equals(kv[0])) { if (kv[1].startsWith("\"") && kv[1].endsWith("\"")) { kv[1] = kv[1].substring(1, kv[1].length() - 1); } field.description = kv[1]; } else { throw new SyntaxException(infoDefinition); } } } catch (NumberFormatException e) { throw new SyntaxException(infoDefinition); } // System.out.println("adding " + field); // do not set the global field index on a meta-info field yet. We will do this after fixed columns have been added. info.addField(field); } /** * Return a global field index, or -1 if the column or field id are not declared. * * @param columnName name of column. * @param fieldId Identifier for field in column. * @return a global field index, or -1 if the column or field id are not declared. */ public int getGlobalFieldIndex(final String columnName, final String fieldId) { final ColumnInfo column = columns.find(columnName); if (column == null) { return -1; } final ColumnField columnField = column.fields.find(fieldId); if (columnField == null) { return -1; } return columnField.globalFieldIndex; } /** * Releases the IO resources held by this parser. * * @throws IOException */ @Override public void close() throws IOException { if (bufferedReader != null) { IOUtils.closeQuietly(bufferedReader); } IOUtils.closeQuietly(input); } /** * Return the sample names, or more specifically, the names of column that use the FORMAT column. * * @return */ public String[] getColumnNamesUsingFormat() { final ObjectArrayList<String> columnNamesUsingFormat = new ObjectArrayList<String>(); for (final ColumnInfo info : columns) { if (info.useFormat) { columnNamesUsingFormat.add(info.columnName); } } return columnNamesUsingFormat.toArray(new String[columnNamesUsingFormat.size()]); } public void setCacheFieldPermutation(boolean cacheFieldPermutation) { this.cacheFieldPermutation = cacheFieldPermutation; } public GroupAssociations getGroupAssociations() { return new GroupAssociations(associationString, columns.find("FORMAT"), getColumnNamesUsingFormat()); } public class SyntaxException extends Exception { public SyntaxException(final MutableString line) { super(line.toString()); } } }