edu.rit.flick.genetics.FastFileInflator.java Source code

Java tutorial

Introduction

Here is the source code for edu.rit.flick.genetics.FastFileInflator.java

Source

/**
 *  COPYRIGHT (C) 2015 Alex Aiezza. All Rights Reserved.
 *
 *  See the LICENSE for the specific language governing permissions and
 *  limitations under the License provided with this project.
 */
package edu.rit.flick.genetics;

import static edu.rit.flick.config.DefaultOptionSet.DELETE_FLAG;
import static edu.rit.flick.config.DefaultOptionSet.VERBOSE_FLAG;
import static org.apache.commons.io.FileUtils.getFile;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.channels.FileChannel.MapMode;
import java.util.Properties;
import java.util.Scanner;
import java.util.StringTokenizer;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.atomic.LongAdder;

import org.apache.commons.io.FileUtils;

import com.google.common.collect.BiMap;
import com.google.common.io.Files;

import edu.rit.flick.FileInflator;
import edu.rit.flick.config.Configuration;
import edu.rit.flick.genetics.util.ByteBufferOutputStream;
import it.unimi.dsi.io.ByteBufferInputStream;
import it.unimi.dsi.lang.MutableString;
import net.lingala.zip4j.core.ZipFile;
import net.lingala.zip4j.exception.ZipException;

/**
 * @author Alex Aiezza
 *
 */
public abstract class FastFileInflator implements FastFileArchiver, FileInflator {
    private boolean interrupted = false;

    // Input files
    protected ByteBufferInputStream datahcf;
    protected Scanner nfile;
    protected Scanner headerfile;
    protected Scanner iupacfile;
    protected Scanner tailfile;
    protected Properties metafile;

    // Output file
    protected ByteBufferOutputStream fastOut;

    // Tracking fields
    protected final MutableString header = new MutableString();
    protected long headerPosition;

    protected long nStart = -1, consecNs = -1;

    protected long iupacPosition = -1;
    protected char iupacBase = 0x0;

    protected final MutableString nucleotides = new MutableString();
    // @formatter:off
    protected final LongAdder dnaPosition = new LongAdder() {
        private static final long serialVersionUID = 1L;

        @Override
        public void increment() {
            super.increment();

            // Check for headerPosition index
            if (fastOut.position() == headerPosition)
                writeNextHeader();
            else
                afterWriteNucleotide();

            processSequence();
        }

    };
    // @formatter:on
    protected final AtomicLong seqDnaPosition = new AtomicLong();

    private long fastFileSize;
    private boolean containsCarriageReturns = false;
    private boolean isRNAData = false;

    protected final BiMap<String, Byte> byteConverter;

    public FastFileInflator() {
        byteConverter = new ByteConverterBiMapFactory().getByteConverter(4);
    }

    protected void afterWriteNucleotide() {
        seqDnaPosition.incrementAndGet();
    }

    protected void beforeSequence() throws IOException {
    }

    protected void close() throws IOException, InterruptedException {
        if (fastOut == null)
            return;

        fastOut.close();
        iupacfile.close();
        datahcf.close();
        headerfile.close();
        nfile.close();
        tailfile.close();

        datahcf = null;
        fastOut = null;

        // Give the last method a moment to garbage collect
        // System.gc();
        // Thread.sleep( 1000 );
    }

    @Override
    public boolean containsCarriageReturns() {
        return containsCarriageReturns;
    }

    @SuppressWarnings("resource")
    protected void createOutputFiles(final String tempOutputDirectory, final File fastFile) throws IOException {
        datahcf = ByteBufferInputStream
                .map(new FileInputStream(getFile(tempOutputDirectory, SEQUENCE_DATA_FILE)).getChannel());
        nfile = new Scanner(getFile(tempOutputDirectory, N_FILE)).useDelimiter("\\" + PIPE);
        headerfile = new Scanner(getFile(tempOutputDirectory, SEQUENCE_ID_FILE)).useDelimiter("" + NEWLINE);
        iupacfile = new Scanner(getFile(tempOutputDirectory, IUPAC_CODE_FILE)).useDelimiter("\\" + PIPE);
        tailfile = new Scanner(getFile(tempOutputDirectory, SEQUENCE_TAIL_FILE));
        metafile = getProperties(getFile(tempOutputDirectory, META_FILE));

        parseProperties();

        fastOut = ByteBufferOutputStream.map(fastFile, MapMode.READ_WRITE, fastFileSize);
    }

    @Override
    public BiMap<String, Byte> getByteConverter() {
        return byteConverter;
    }

    protected void getNextIupacBase() {
        final StringTokenizer iupacs;
        if (iupacfile.hasNext()) {
            iupacs = new StringTokenizer(iupacfile.next(), RANGE);
            iupacPosition = Long.parseLong(iupacs.nextToken(), 16);
            iupacBase = iupacs.nextToken().charAt(0);
        }
        // Check for IUPAC index
        if (fastOut.position() > 0 && dnaPosition.longValue() == iupacPosition)
            writeNextIupacBase();
    }

    protected void getNextNs() {
        final long nEnd;
        final StringTokenizer ns;
        if (nfile.hasNext()) {
            final String line = nfile.next().trim();
            if (!line.isEmpty()) {
                ns = new StringTokenizer(line, RANGE);
                nStart = Long.parseLong(ns.nextToken(), 16);
                nEnd = Long.parseLong(ns.nextToken(), 16);
                consecNs = nEnd - nStart;
            }
        }
        // Check for nStart index
        if (fastOut.position() > 0 && dnaPosition.longValue() == nStart)
            writeNextNs();
    }

    protected boolean getNextNucleotides() {
        if (datahcf.available() > 0) {
            nucleotides.replace(byteConverter.inverse().get((byte) datahcf.read()));
            return true;
        }
        return false;
    }

    private void getNextSequenceIdentifier() {
        if (headerfile.hasNext()) {
            final String headerInfo = headerfile.nextLine()
                    + (containsCarriageReturns() ? (char) CARRIAGE_RETURN + "" + (char) NEWLINE : (char) NEWLINE);
            final String[] headInd = headerInfo.split("\\" + PIPE, 2);
            headerPosition = Long.parseLong(headInd[0]);
            header.replace(
                    (containsCarriageReturns() ? (char) CARRIAGE_RETURN + "" + (char) NEWLINE : (char) NEWLINE) + ""
                            + (char) getSequenceIdentifierStart() + headInd[1]);
        } else
            header.replace("");
    }

    protected Properties getProperties(final File propertiesFile) throws IOException {
        final Properties props = new Properties();

        final InputStream in = new FileInputStream(propertiesFile);
        props.load(in);

        in.close();

        return props;
    }

    @Override
    public synchronized File inflate(final Configuration configuration, final File fileIn, final File fileOut) {
        assert fileIn.exists();

        try {
            // Inflate to Directory
            final String outputDirectoryPath = fileOut.getPath()
                    .replaceAll("." + Files.getFileExtension(fileOut.getPath()), FLICK_FAST_FILE_TMP_DIR_SUFFIX);

            final File tmpOutputDirectory = new File(outputDirectoryPath);
            if (tmpOutputDirectory.exists())
                FileUtils.deleteDirectory(tmpOutputDirectory);

            final AtomicReference<Thread> cleanHookAtomic = new AtomicReference<Thread>();

            final Thread inflateToDirectoryThread = new Thread(() -> {
                try {
                    // Inflate Fast file to a temporary directory
                    inflateFromFile(fileIn, tmpOutputDirectory);

                    // Inflate Directory to a zip file
                    inflateFromDirectory(tmpOutputDirectory, fileOut);

                    // Clean up IO
                    close();
                    System.gc();
                    Thread.sleep(100);

                    // Clean up temporary directory
                    FileUtils.deleteDirectory(tmpOutputDirectory);

                    Runtime.getRuntime().removeShutdownHook(cleanHookAtomic.get());
                } catch (final Exception e) {
                    if (!interrupted)
                        System.err.println(e.getMessage());
                }
            }, "Default_Inflation_Thread");

            // Make cleaning hook
            final Thread cleanHook = new Thread(() -> {
                interrupted = true;
                configuration.setFlag(VERBOSE_FLAG, false);
                configuration.setFlag(DELETE_FLAG, false);
                try {
                    if (inflateToDirectoryThread.isAlive())
                        inflateToDirectoryThread.interrupt();

                    // Clean up IO
                    close();
                    System.gc();
                    Thread.sleep(100);

                    synchronized (this) {
                        while (inflateToDirectoryThread.isAlive())
                            this.wait();
                    }

                } catch (final IOException | InterruptedException e) {
                    e.printStackTrace();
                } finally {
                    // Clean up temporary directory
                    FileUtils.deleteQuietly(tmpOutputDirectory);
                    // Clean up INCOMPLETE output file
                    FileUtils.deleteQuietly(fileOut);
                    System.out.println();
                }

            }, "Inflation_Cleaning_Thread");

            cleanHookAtomic.set(cleanHook);

            Runtime.getRuntime().addShutdownHook(cleanHook);

            inflateToDirectoryThread.start();
            inflateToDirectoryThread.join();

        } catch (final IOException | InterruptedException e) {
            e.printStackTrace();
        }

        return fileOut;
    }

    public File inflateFromDirectory(final File tmpOutputDirectory, final File fileOut) throws IOException {
        createOutputFiles(tmpOutputDirectory.getPath() + File.separator, fileOut);

        initializeInflator();

        while (getNextNucleotides())
            // Write sequence
            nucleotides.chars().mapToObj(base -> (byte) base).forEach(base -> {

                // Check for headerPosition index
                if (fastOut.position() == headerPosition)
                    writeNextHeader();

                processSequence();

                writeNucleotide(base);
            });

        // Write tail
        writeTail();

        return fileOut;
    }

    protected File inflateFromFile(final File fileIn, final File tmpOutputDirectory) {
        tmpOutputDirectory.mkdirs();
        ZipFile zipFile;
        try {
            zipFile = new ZipFile(fileIn);

            zipFile.extractAll(tmpOutputDirectory.getPath());
        } catch (final ZipException e) {
            if (!interrupted)
                System.err.println(e.getMessage());
        }

        return tmpOutputDirectory;
    }

    protected void initializeInflator() {
        header.replace("");

        nStart = -1;
        consecNs = -1;

        iupacPosition = -1;
        iupacBase = 0x0;

        nucleotides.replace("");
        dnaPosition.reset();
        seqDnaPosition.set(0);

        // Get first sequence identifier
        if (headerfile.hasNext()) {
            final String headerInfo = headerfile.nextLine()
                    + (containsCarriageReturns() ? (char) CARRIAGE_RETURN + "" + (char) NEWLINE : (char) NEWLINE);
            final String[] headInd = headerInfo.split("\\" + PIPE, 2);
            headerPosition = Long.parseLong(headInd[0]);
            header.replace((char) getSequenceIdentifierStart() + headInd[1]);
        }

        // Get first N
        getNextNs();

        // Get first IUPAC Base
        getNextIupacBase();
    }

    @Override
    public boolean isRNAData() {
        return isRNAData;
    }

    protected void parseProperties() {
        fastFileSize = Long.parseLong((String) metafile.get(META_FILE_SIZE));
        containsCarriageReturns = Boolean.parseBoolean((String) metafile.get(META_CARRIAGE_RETURN));
        isRNAData = Boolean.parseBoolean((String) metafile.get(META_RNA_DATA));
    }

    protected void processSequence() {
        // Check for nStart index
        if (dnaPosition.longValue() == nStart)
            writeNextNs();

        // Check for IUPAC index
        if (dnaPosition.longValue() == iupacPosition)
            writeNextIupacBase();
    }

    protected void writeNewline() throws IOException {
        if (containsCarriageReturns())
            fastOut.write(CARRIAGE_RETURN);
        fastOut.write(NEWLINE);
    }

    protected final void writeNextHeader() {
        try {
            // Write header
            beforeSequence();
            fastOut.put(header.toString().getBytes());
            getNextSequenceIdentifier();
            seqDnaPosition.set(0);
        } catch (final IOException e) {
            if (!interrupted)
                e.printStackTrace();
        }
    }

    protected void writeNextIupacBase() {
        writeNucleotide((byte) iupacBase);
        getNextIupacBase();
        // Check for nStart index
        if (dnaPosition.longValue() == nStart)
            writeNextNs();
    }

    protected void writeNextNs() {
        for (int n = 0; n < consecNs; writeNucleotide(N), n++)
            ;
        getNextNs();
        // Check for IUPAC index
        if (dnaPosition.longValue() == iupacPosition)
            writeNextIupacBase();
    }

    protected void writeNucleotide(final byte base) {
        try {
            final byte nucleotide = isRNAData() && base == T ? U : base;
            fastOut.write(nucleotide);
        } catch (final IOException e) {
            if (!interrupted)
                System.err.println(e.getMessage());
        }
        dnaPosition.increment();
    }

    protected void writeTail() throws IOException {
        if (tailfile.hasNext()) {
            final String tail = tailfile.next();
            tail.chars().mapToObj(base -> (byte) base).forEach(base -> {
                writeNucleotide(base);
            });
        } else
            processSequence();
        if (fastOut.available() > 0)
            writeNewline();
    }
}