// Java tutorial
/* * The MIT License * * Copyright (c) 2018 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools.reference; import htsjdk.samtools.SAMSequenceDictionaryCodec; import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.util.SequenceUtil; import htsjdk.utils.ValidationUtils; import org.apache.commons.compress.utils.CountingOutputStream; import java.io.IOException; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.Writer; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.nio.file.Path; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.util.HashSet; import java.util.Set; /** * Writes a FASTA formatted reference file. * In addition it can also compose the index and dictionary files for the newly written reference file. * </p> * <p> * Example: * <pre> * String[] seqNames = ...; * byte[][] seqBases = ...; * ... 
* try (final FastaReferenceWriter writer = new FastaReferenceFileWriter(outputFile)) { * for (int i = 0; i < seqNames.length; i++) { * writer.startSequence(seqNames[i]).appendBases(seqBases[i]); * } * } * </pre> * </p> * <p> * The two main operations that one can invoke on a opened writer is {@link #startSequence} and {@link #appendBases}. * The former indicates that we are going to append a new sequence to the output and is invoked once per sequence. * The latter adds bases to the current sequence and can be called as many times as is needed. * </p> * <p> * The writer will make sure that the output adheres to the FASTA reference sequence file format restrictions: * <ul> * <li>Sequence names are valid (non-empty, without space/blank, control characters),</li> * <li>Sequence description are valid (without control characters),</li> * <li>Bases are valid nucleotides or IUPAC redundancy codes and X [ACGTNX...] (lower or uppercase are accepted),</li> * <li>Sequence cannot have 0 length,</li> * <li>And that each sequence can only appear once in the output</li> * </ul> * </p> */ public final class FastaReferenceWriter implements AutoCloseable { /** * Default number of bases per line. */ public static final int DEFAULT_BASES_PER_LINE = 60; /** * Sequence header start character. */ public static final char HEADER_START_CHAR = '>'; /** * Character used to separate the sequence name and the description if any. */ public static final char HEADER_NAME_AND_DESCRIPTION_SEPARATOR = ' '; /** * Charset used for all outputs; fixed to UTF-8. */ private static final Charset CHARSET = Charset.forName("UTF-8"); /** * The line separator string. */ private static final char LINE_SEPARATOR_CHR = '\n'; /** * Character used to separate the fields in a index file line. */ private static final char INDEX_FIELD_SEPARATOR_CHR = '\t'; /** * Convenient cached {@code byte[]} representation of the line separator. 
*/ private static final byte[] LINE_SEPARATOR = String.valueOf(LINE_SEPARATOR_CHR).getBytes(CHARSET); /** * Output stream to the main FASTA output. * <p> * We use it also to count the number of bytes so far outputted thus the offset included in * the index file entry. * </p> */ private final CountingOutputStream fastaStream; /** * Writer for the index file. */ private final Writer indexWriter; /** * Output writer to the output dictionary. */ private final Writer dictWriter; /** * the md5 digester (or null if not adding md5) */ private final MessageDigest md5Digester; /** * Output codec for the dictionary. */ private final SAMSequenceDictionaryCodec dictCodec; /** * Default number of bases per line to be applied unless one is */ private final int defaultBasePerLine; /** * Records the sequence names that have been already fully appended to this writer. */ private final Set<String> sequenceNames = new HashSet<>(); /** * Bases per line to be applied to the sequence that is been currently appended to the output. */ private int currentBasesPerLine; /** * Holds the number of bases in the current output line. */ private int currentLineBasesCount; /** * Holds the number of bases so far appended for the current sequence. */ private long currentBasesCount; /** * Holds the FASTA output file offset for the current sequence. */ private long currentSequenceOffset; /** * Holds the name of the sequence that is been appended currently. */ private String currentSequenceName; /** * Flag indicating whether this writer has been already closed. */ private boolean closed; /** * Creates a reference FASTA file writer (private...use the builder: {@link FastaReferenceWriterBuilder}. * <p> * You can specify a specific output stream to each file: the main fasta output, its index and its dictionary. * You can only provide a compressed stream to the fastaOutput, and only in the case that an index isn't written. * <p> * <p> * </p> * * @param fastaOutput the (uncompressed) output fasta file path. 
* @param indexOutput the (uncompressed) output stream to the index file, if requested, {@code null} if none should be generated. * @param dictOutput the (uncompressed) output stream to the dictFile, if requested, {@code null} if none should be generated. * @throws IllegalArgumentException if {@code fastaFile} is {@code null} or {@code basesPerLine} is 0 or negative. */ FastaReferenceWriter(final int basesPerLine, final boolean addMd5, final OutputStream fastaOutput, final OutputStream indexOutput, final OutputStream dictOutput) { try { this.md5Digester = addMd5 ? MessageDigest.getInstance("MD5") : null; } catch (NoSuchAlgorithmException e) { throw new RuntimeException("Couldn't get md5 algorithm!", e); } this.defaultBasePerLine = basesPerLine; this.fastaStream = new CountingOutputStream(fastaOutput); this.indexWriter = indexOutput == null ? NullWriter.NULL_WRITER : new OutputStreamWriter(indexOutput, CHARSET); this.dictWriter = dictOutput == null ? NullWriter.NULL_WRITER : new OutputStreamWriter(dictOutput, CHARSET); this.dictCodec = new SAMSequenceDictionaryCodec(dictWriter); this.dictCodec.encodeHeaderLine(false); } // checks that a sequence name is valid. 
private static void checkSequenceName(final String name) { ValidationUtils.nonEmpty(name, "Sequence name"); for (int i = 0; i < name.length(); i++) { final char ch = name.charAt(i); if (Character.isWhitespace(ch)) { throw new IllegalArgumentException("the input name contains blank characters: '" + name + "'"); } else if (Character.isISOControl(ch)) { throw new IllegalArgumentException("the input name contains control characters: '" + name + "'"); } } } private static void checkSequenceBases(final byte[] bases, final int offset, final int length) { ValidationUtils.nonNull(bases, "input bases"); ValidationUtils.validateArg(bases.length >= offset + length, "Cannot validate bases beyond end of array."); final int to = offset + length; for (int i = offset; i < to; i++) { final byte b = bases[i]; if (!SequenceUtil.isIUPAC(b)) { throw new IllegalArgumentException( "the input sequence contains invalid base calls like: " + (char) b); } } } private static String checkDescription(final String description) { if (description == null || description.isEmpty()) { return ""; } for (int i = 0; i < description.length(); i++) { final char c = description.charAt(i); if (Character.isISOControl(c) && c != '\t') { // tab is the only valid control char in the description. throw new IllegalArgumentException( "the input name contains non-tab control characters: '" + description + "'"); } } return description; } /** * Starts the input of the bases of a new sequence. * <p> * This operation automatically closes the previous sequence base input if any. * </p> * <p> * The sequence name cannot contain any blank characters (as determined by {@link Character#isWhitespace(char)}), * control characters (as determined by {@link Character#isISOControl(char)}) or the the FASTA header start character * {@value #HEADER_START_CHAR}. It cannot be the empty string either (""). * </p> * <p> * No description is included in the output. 
* </p> * <p> * The input bases-per-line is set to the default provided at construction or {@link #DEFAULT_BASES_PER_LINE} * if none was provided. * </p> * <p> * This method cannot be called after the writer has been closed. * </p> * <p> * It also will fail if no base was added to the previous sequence if any. * </p> * * @param sequenceName the name of the new sequence. * @return this instance. * @throws IllegalArgumentException if any argument does not comply with requirements listed above or if a sequence * with the same name has already been added to the writer. * @throws IllegalStateException if no base was added to the previous sequence or the writer is already closed. * @throws IOException if such exception is thrown when writing into the output resources. */ public FastaReferenceWriter startSequence(final String sequenceName) throws IOException { return startSequence(sequenceName, "", defaultBasePerLine); } /** * Starts the input of the bases of a new sequence. * <p> * This operation automatically closes the previous sequence base input if any. * </p> * <p> * The sequence name cannot contain any blank characters (as determined by {@link Character#isWhitespace(char)}), * control characters (as determined by {@link Character#isISOControl(char)}) or the the FASTA header start character * {@value #HEADER_START_CHAR}. It cannot be the empty string either (""). * </p> * <p> * The input bases-per-line must be 1 or greater. * </p> * <p> * This method cannot be called after the writer has been closed. * </p> * <p> * It also will fail if no base was added to the previous sequence if any. * </p> * * @param sequenceName the name of the new sequence. * @param basesPerLine number of bases per line for this sequence. * @return this instance. * @throws IllegalArgumentException if any argument does not comply with requirements listed above or if a sequence * with the same name has already been added to the writer. 
* @throws IllegalStateException if no base was added to the previous sequence or the writer is already closed. * @throws IOException if such exception is thrown when writing into the output resources. */ public FastaReferenceWriter startSequence(final String sequenceName, final int basesPerLine) throws IOException { return startSequence(sequenceName, "", FastaReferenceWriterBuilder.checkBasesPerLine(basesPerLine)); } /** * Starts the input of the bases of a new sequence. * <p> * This operation automatically closes the previous sequence base input if any. * </p> * <p> * The sequence name cannot contain any blank characters (as determined by {@link Character#isWhitespace(char)}), * control characters (as determined by {@link Character#isISOControl(char)}) or the the FASTA header start character * {@value #HEADER_START_CHAR}. It cannot be the empty string either (""). * </p> * <p> * The description cannot contain {@link Character#isISOControl(char)}. If set to {@code null} or the empty * string ("") no description will be outputted. * </p> * <p> * The input bases-per-line is set to the default provided at construction or {@link #DEFAULT_BASES_PER_LINE} * if none was provided. * </p> * <p> * This method cannot be called after the writer has been closed. * </p> * <p> * It also will fail if no base was added to the previous sequence if any. * </p> * * @param sequenceName the name of the new sequence. * @param description optional description for that sequence. * @return this instance. * @throws IllegalArgumentException if any argument does not comply with requirements listed above or if a sequence * with the same name has already been added to the writer. * @throws IllegalStateException if no base was added to the previous sequence or the writer is already closed. * @throws IOException if such exception is thrown when writing into the output resources. 
*/ public FastaReferenceWriter startSequence(final String sequenceName, final String description) throws IOException { return startSequence(sequenceName, description, defaultBasePerLine); } /** * Starts the input of the bases of a new sequence. * <p> * This operation automatically closes the previous sequence base input if any. * </p> * <p> * The sequence name cannot contain any blank characters (as determined by {@link Character#isWhitespace(char)}), * control characters (as determined by {@link Character#isISOControl(char)}) or the the FASTA header start character * {@value #HEADER_START_CHAR}. It cannot be the empty string either (""). * </p> * <p> * The description cannot contain {@link Character#isISOControl(char)}. If set to {@code null} or the empty * string ("") no description will be outputted. * </p> * <p> * The input bases-per-line must be 1 or greater. * </p> * <p> * This method cannot be called after the writer has been closed. * </p> * <p> * It also will fail if no base was added to the previous sequence if any. * </p> * * @param sequenceName the name of the new sequence. * @param description optional description for that sequence. * @param basesPerLine number of bases per line for this sequence. * @return this instance. * @throws IllegalArgumentException if any argument does not comply with requirements listed above. * @throws IllegalStateException if no base was added to the previous sequence or the writer is already closed of * the sequence has been already added. * @throws IOException if such exception is thrown when writing into the output resources. 
*/ public FastaReferenceWriter startSequence(final String sequenceName, final String description, final int basesPerLine) throws IOException { assertIsNotClosed(); checkSequenceName(sequenceName); final String nonNullDescription = checkDescription(description); FastaReferenceWriterBuilder.checkBasesPerLine(basesPerLine); closeSequence(); if (sequenceNames.contains(sequenceName)) { throw new IllegalStateException( "the input sequence name '" + sequenceName + "' has already been added"); } currentSequenceName = sequenceName; currentBasesPerLine = basesPerLine; final StringBuilder builder = new StringBuilder(sequenceName.length() + nonNullDescription.length() + 2); builder.append(HEADER_START_CHAR).append(sequenceName); if (!nonNullDescription.isEmpty()) { builder.append(HEADER_NAME_AND_DESCRIPTION_SEPARATOR).append(nonNullDescription); } fastaStream.write(builder.toString().getBytes(CHARSET)); fastaStream.write(LINE_SEPARATOR); currentSequenceOffset = fastaStream.getBytesWritten(); if (md5Digester != null) { md5Digester.reset(); } return this; } private void closeSequence() throws IOException { if (currentSequenceName != null) { if (currentBasesCount == 0) { throw new IllegalStateException("no base was added"); } sequenceNames.add(currentSequenceName); writeIndexEntry(); writeDictEntry(); fastaStream.write(LINE_SEPARATOR); currentBasesCount = 0; currentLineBasesCount = 0; currentSequenceName = null; } } private void writeIndexEntry() throws IOException { indexWriter.append(currentSequenceName).append(INDEX_FIELD_SEPARATOR_CHR) .append(String.valueOf(currentBasesCount)).append(INDEX_FIELD_SEPARATOR_CHR) .append(String.valueOf(currentSequenceOffset)).append(INDEX_FIELD_SEPARATOR_CHR) .append(String.valueOf(currentBasesPerLine)).append(INDEX_FIELD_SEPARATOR_CHR) .append(String.valueOf(currentBasesPerLine + LINE_SEPARATOR.length)).append(LINE_SEPARATOR_CHR); } private void writeDictEntry() { final SAMSequenceRecord samSequenceRecord = new 
SAMSequenceRecord(currentSequenceName, (int) currentBasesCount); if (md5Digester != null) { samSequenceRecord.setMd5(SequenceUtil.md5DigestToString(md5Digester.digest())); } dictCodec.encodeSequenceRecord(samSequenceRecord); } /** * Adds bases to current sequence from a {@code byte} array. * * @param basesBases String containing the bases to be added. * string will be interpreted using ascii and will throw if any character is >= 127. * @return this instance. * @throws IllegalArgumentException if {@code bases} is {@code null} or * the input array contains invalid bases (as assessed by: {@link SequenceUtil#isIUPAC(byte)}). * @throws IllegalStateException if no sequence was started or the writer is already closed. * @throws IOException if such exception is throw when writing in any of the outputs. */ public FastaReferenceWriter appendBases(final String basesBases) throws IOException { return appendBases(basesBases.getBytes(StandardCharsets.US_ASCII)); } /** * Adds bases to current sequence from a {@code byte} array. * Will throw if any character is >= 127. * * @param bases array containing the bases to be added. * @return this instance. * @throws IllegalArgumentException if {@code bases} is {@code null} or * the input array contains invalid bases (as assessed by: {@link SequenceUtil#isIUPAC(byte)}). * @throws IllegalStateException if no sequence was started or the writer is already closed. * @throws IOException if such exception is throw when writing in any of the outputs. */ public FastaReferenceWriter appendBases(final byte[] bases) throws IOException { return appendBases(bases, 0, bases.length); } /** * Adds bases to current sequence from a range in a {@code byte} array. * Will throw if any character is >= 127. * * @param bases array containing the bases to be added. * @param offset the position of the first base to add. * @param length how many bases to be added starting from position {@code offset}. * @return this instance. 
* @throws IllegalArgumentException if {@code bases} is {@code null} or * {@code offset} and {@code length} do not entail a valid range in {@code bases} or * that range in {@code base} contain invalid bases (as assessed by: {@link SequenceUtil#isIUPAC(byte)}). * @throws IllegalStateException if no sequence was started or the writer is already closed. * @throws IOException if such exception is throw when writing in any of the outputs. */ public FastaReferenceWriter appendBases(final byte[] bases, final int offset, final int length) throws IOException { assertIsNotClosed(); assertSequenceOpen(); checkSequenceBases(bases, offset, length); ValidationUtils.validateArg(offset >= 0, "the input offset cannot be negative"); ValidationUtils.validateArg(length >= 0, "the input length must not be negative"); final int to = offset + length; ValidationUtils.validateArg(to <= bases.length, "the length + offset goes beyond the end of " + "the input base array: '" + to + "' > '" + bases.length + "'"); int next = offset; while (next < to) { if (currentLineBasesCount == currentBasesPerLine) { fastaStream.write(LINE_SEPARATOR); currentLineBasesCount = 0; } final int nextLength = Math.min(to - next, currentBasesPerLine - currentLineBasesCount); fastaStream.write(bases, next, nextLength); if (md5Digester != null) { md5Digester.update(new String(bases, next, nextLength).toUpperCase().getBytes()); } currentLineBasesCount += nextLength; next += nextLength; } currentBasesCount += length; return this; } /** * Appends a new sequence to the output. * <p> * This is a convenient short handle for {@code startSequence(name).appendBases(bases)}. * </p> * <p> * The new sequence remains open meaning that additional bases for that sequence can be added with additional calls to {@link #appendBases}. * </p> * * @param sequence a {@link ReferenceSequence} to add. * @return a reference to this very same writer. 
* @throws IOException if such an exception is thrown when actually writing into the output streams/channels. * @throws IllegalArgumentException if either {@code name} or {@code bases} is {@code null} or contains an invalid value (e.g. unsupported bases or sequence names). * @throws IllegalStateException if the writer is already closed, a previous sequence (if any was opened) has no base appended to it or a sequence * with such name was already appended to this writer. */ public FastaReferenceWriter addSequence(ReferenceSequence sequence) throws IOException { return startSequence(sequence.getName()).appendBases(sequence.getBases()); } /** * Appends a new sequence to the output with or without a description. * <p> * This is a convenient short handle for {@code startSequence(name, description).appendBases(bases)}. * </p> * <p> * A {@code null} or empty ("") description will be ignored (no description will be output). * </p> * <p> * The new sequence remains open meaning that additional bases for that sequence can be added with additional calls to {@link #appendBases}. * </p> * * @param name the name of the new sequence. * @param bases the (first) bases of the sequence. * @param description the description for the new sequence. * @return a reference to this very same writer. * @throws IOException if such an exception is thrown when actually writing into the output streams/channels. * @throws IllegalArgumentException if either {@code name} or {@code bases} is {@code null} or contains an invalid value (e.g. unsupported bases or sequence names). Also when * the {@code description} contains unsupported characters. * @throws IllegalStateException if the writer is already closed, a previous sequence (if any was opened) has no base appended to it or a sequence * with such name was already appended to this writer. 
*/ public FastaReferenceWriter appendSequence(final String name, final String description, final byte[] bases) throws IOException { return startSequence(name, description).appendBases(bases); } /** * Appends a new sequence to the output with or without a description and an alternative number of bases-per-line. * <p> * This is a convenient short handle for {@code startSequence(name, description, bpl).appendBases(bases)}. * </p> * <p> * A {@code null} or empty ("") description will be ignored (no description will be output). * </p> * <p> * The new sequence remains open meaning that additional bases for that sequence can be added with additional calls to {@link #appendBases}. * </p> * * @param name the name of the new sequence. * @param bases the (first) bases of the sequence. * @param description the description for the sequence. * @param basesPerLine alternative number of bases per line to be used for the sequence. * @return a reference to this very same writer. * @throws IOException if such an exception is thrown when actually writing into the output streams/channels. * @throws IllegalArgumentException if either {@code name} or {@code bases} is {@code null} or contains an invalid value (e.g. unsupported bases or sequence names). Also when the * {@code description} contains unsupported characters or {@code basesPerLine} is 0 or negative. * @throws IllegalStateException if the writer is already closed, a previous sequence (if any was opened) has no base appended to it or a sequence * with such name was already appended to this writer. 
*/ public FastaReferenceWriter appendSequence(final String name, final String description, final int basesPerLine, final byte[] bases) throws IOException { return startSequence(name, description, basesPerLine).appendBases(bases); } private void assertSequenceOpen() { if (currentSequenceName == null) { throw new IllegalStateException("trying to add bases without starting a sequence"); } } private void assertIsNotClosed() { if (closed) { throw new IllegalStateException("already closed"); } } /** * Closes this writer flushing all remaining writing operation input the output resources. * <p> * Further calls to {@link #appendBases} or {@link #startSequence} will result in an exception. * </p> * * @throws IOException if such exception is thrown when closing output writers and output streams. * @throws IllegalStateException if closing without writing any sequences or closing when writing a sequence is in progress */ @Override public void close() throws IOException { if (!closed) { try { closeSequence(); if (sequenceNames.isEmpty()) { throw new IllegalStateException("no sequences were added to the reference"); } } finally { closed = true; fastaStream.close(); indexWriter.close(); dictWriter.close(); } } } /** * Convenient method to write a FASTA file with a single sequence. * * @param whereTo the path to. must not be null. * @param makeIndex whether the index file should be written at its standard location. * @param makeDict whether the dictionary file should be written at it standard location. * @param name the sequence name, cannot contain white space, or control chracter or the header start character. * @param description the sequence description, can be null or "" if no description. * @param bases the sequence bases, cannot be {@code null}. * @throws IOException if such exception is thrown when writing in the output resources. 
*/ public static void writeSingleSequenceReference(final Path whereTo, final boolean makeIndex, final boolean makeDict, final String name, final String description, final byte[] bases) throws IOException { try (final FastaReferenceWriter writer = new FastaReferenceWriterBuilder().setFastaFile(whereTo) .setMakeFaiOutput(makeIndex).setMakeDictOutput(makeDict).build()) { writer.startSequence(name, description); writer.appendBases(bases); } } /** * Convenient method to write a FASTA file with a single sequence. * * @param whereTo the path to. must not be null. * @param basesPerLine number of bases per line. must be 1 or greater. * @param makeIndex whether the index file should be written at its standard location. * @param makeDict whether the dictionary file should be written at it standard location. * @param name the sequence name, cannot contain white space, or control chracter or the header start character. * @param description the sequence description, can be null or "" if no description. * @param bases the sequence bases, cannot be {@code null}. * @throws IOException if such exception is thrown when writing in the output resources. 
*/ public static void writeSingleSequenceReference(final Path whereTo, final int basesPerLine, final boolean makeIndex, final boolean makeDict, final String name, final String description, final byte[] bases) throws IOException { try (final FastaReferenceWriter writer = new FastaReferenceWriterBuilder().setBasesPerLine(basesPerLine) .setFastaFile(whereTo).setMakeFaiOutput(makeIndex).setMakeDictOutput(makeDict).build()) { writer.startSequence(name, description); writer.appendBases(bases); } } private static class NullWriter extends Writer { @Override public void write(char[] cbuf, int off, int len) throws IOException { // no op } @Override public void flush() throws IOException { // no op } @Override public void close() throws IOException { // no op } private NullWriter() { } /** * The only singleton instance of this class (no need for more!) */ public final static NullWriter NULL_WRITER = new NullWriter(); } }