edu.cornell.med.icb.goby.readers.FastXReader.java Source code

Java tutorial

Introduction

Here is the source code for edu.cornell.med.icb.goby.readers.FastXReader.java

Source

/*
 * Copyright (C) 2009-2011 Institute for Computational Biomedicine,
 *                    Weill Medical College of Cornell University
 *
 *  This file is part of the Goby IO API.
 *
 *     The Goby IO API is free software: you can redistribute it and/or modify
 *     it under the terms of the GNU Lesser General Public License as published by
 *     the Free Software Foundation, either version 3 of the License, or
 *     (at your option) any later version.
 *
 *     The Goby IO API is distributed in the hope that it will be useful,
 *     but WITHOUT ANY WARRANTY; without even the implied warranty of
 *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *     GNU Lesser General Public License for more details.
 *
 *     You should have received a copy of the GNU Lesser General Public License
 *     along with the Goby IO API.  If not, see <http://www.gnu.org/licenses/>.
 */

package edu.cornell.med.icb.goby.readers;

import edu.cornell.med.icb.goby.exception.GobyRuntimeException;
import it.unimi.dsi.fastutil.io.FastBufferedInputStream;
import it.unimi.dsi.io.FastBufferedReader;
import org.apache.commons.io.IOUtils;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.zip.GZIPInputStream;

/**
 * A reader for <a href="http://en.wikipedia.org/wiki/FASTA_format">FASTA</a>
 * or <a href="http://en.wikipedia.org/wiki/FASTQ_format">FASTQ</a> files.
 * This reuses the same {@link edu.cornell.med.icb.goby.readers.FastXEntry} over and
 * over for reading the file, so don't directly store it. If you need to store the
 * resultant FastXEntry object, use the {@link edu.cornell.med.icb.goby.readers.FastXEntry#clone()}
 * method to duplicate the FastXEntry object.
 *
 * For FASTQ this parser assumes the # of quality symbols is the same (or more,
 * but should be the same) than the # of sequence characters. The position of linefeeds
 * makes NO DIFFERENCE to this parser in the sequence or quality characters.
 *
 * @author Kevin Dorff
 */
public class FastXReader implements Iterator<FastXEntry>, Iterable<FastXEntry>, Closeable {
    /** The start of line character the designates a new FASTA record. */
    private static final char FASTA_RECORD_START = '>';

    /** The start of line character the designates a new FASTA record. */
    private char fastXRecordStart = FASTA_RECORD_START;

    /** The reader for the FASTA file. */
    private BufferedReader reader;

    /** The "nextEntry" used by next/hasNext. */
    private FastXEntry nextEntry;

    /** The MutableString entry to continually reuse for reading the FASTA file. */
    private final FastXEntry mutableEntry;

    /** The detected file type ("fa" or "fq"). */
    private final String fileType;

    private boolean useCasavaQualityFilter;

    /**
     * Create the FASTX reader.
     * @param file the file that contains the FASTA / FASTQ data
     * @throws IOException error reading or the input stream doesn't support "mark"
     */
    public FastXReader(final String file) throws IOException {
        this(file.endsWith(".gz") ? new GZIPInputStream(new FastBufferedInputStream(new FileInputStream(file)))
                : new FileInputStream(file));
    }

    /**
     * Create the FASTX reader.
     * @param is the input stream that contains FASTA / FASTQ data.
     * @throws IOException error reading or the input stream doesn't support "mark"
     */
    public FastXReader(final InputStream is) throws IOException {
        reader = new BufferedReader(new FastBufferedReader(new InputStreamReader(new FastBufferedInputStream(is))));
        nextEntry = null;
        mutableEntry = new FastXEntry();
        if (!reader.markSupported()) {
            reader.close();
            reader = null;
            throw new IOException("FastaReader requires markSupported() on its input stream");
        }
        // We assume the first record will be within the first 32k of the file.
        reader.mark(32768);
        while (true) {
            final String nextLine = reader.readLine();
            if (nextLine == null) {
                // Odd, couldn't find a record. oh well.
                // Assume it is a FASTA file (the default assumption).
                break;
            }
            if (nextLine.length() == 0 || nextLine.charAt(0) == ';' || nextLine.charAt(0) == '#') {
                // Comment or blank line at TOP of file. After this all lines are assumed
                // to be data.
                continue;
            }
            fastXRecordStart = nextLine.charAt(0);
            break;
        }
        reader.reset();

        if (fastXRecordStart == '>') {
            fileType = "fa";
        } else if (fastXRecordStart == '@') {
            fileType = "fq";
        } else {
            fileType = "UNKNOWN";
        }
    }

    /**
     * Get the file type (extension for file). This is determined from the
     * content of the file, not from the file name when the file was opened.
     * @return the file type
     */
    public String getFileType() {
        return fileType;
    }

    /**
     * Check if there are more FASTA entries to read.
     *
     * @return true if there are more FASTA entries to be retrieved.
     */
    public boolean hasNext() {
        if (nextEntry != null) {
            return true;
        }
        try {
            readNextEntry();
            return nextEntry != null;
        } catch (IOException e) {
            return false;
        }
    }

    /**
     * Return the next FASTA entries. This returns next FastaEntry that is
     * re-used for every next! Do not directly store this value as it is reused.
     * @return the next FastaEntry entry
     */
    public FastXEntry next() {
        if (nextEntry == null) {
            try {
                readNextEntry();
                if (nextEntry == null) {
                    throw new NoSuchElementException();
                }
            } catch (IOException e) {
                throw new GobyRuntimeException(e);
            }
        }
        final FastXEntry toReturn = nextEntry;
        nextEntry = null;
        return toReturn;
    }

    /**
     * Read the next FASTA record.
     * @throws IOException error reading
     */
    private void readNextEntry() throws IOException {
        nextEntry = mutableEntry;
        nextEntry.reset();
        while (true) {
            reader.mark(32768); // 80 chars is recommended for FASTA, but...
            final String nextLine = reader.readLine();
            if (nextLine == null) {
                // No more lines to read
                if (nextEntry.getEntry().length() == 0) {
                    nextEntry = null;
                } else {
                    // End of file, it IS complete.
                    nextEntry.setEntryComplete(true);
                }
                return;
            }
            if (nextLine.length() == 0) {
                // blank line
                continue;
            }
            final char firstChar = nextLine.charAt(0);
            if ((firstChar == ';' || firstChar == '#') && (nextEntry.getEntry().length() == 0)) {
                // We are between entries and found a comment
                continue;
            }
            if (!nextEntry.addLine(nextLine)) {
                reader.reset();
                return;
            }
            if (nextEntry.isEntryComplete()) {
                if (useCasavaQualityFilter) {
                    if (!nextEntry.isCasavaFilteredOutEntry()) {
                        return;
                    } else {
                        // The entry we just read should be filtered out. Move to the next one.
                        nextEntry.reset();
                    }
                } else {
                    return;
                }
            }
        }
    }

    public boolean isUseCasavaQualityFilter() {
        return useCasavaQualityFilter;
    }

    public void setUseCasavaQualityFilter(final boolean useCasavaQualityFilter) {
        this.useCasavaQualityFilter = useCasavaQualityFilter;
    }

    /**
     * Does nothing. Unsupported.
     */
    public void remove() {
        throw new UnsupportedOperationException();
    }

    /**
     * Iterator for FASTA entries.
     *
     * @return the MutableString iterator (this class)
     */
    public Iterator<FastXEntry> iterator() {
        return this;
    }

    /**
     * Close the reader.
     *
     * @throws IOException error closing
     */
    public void close() throws IOException {
        IOUtils.closeQuietly(reader);
    }
}