it.crs4.seal.read_sort.FastaChecksummer.java Source code

Java tutorial

Introduction

Here is the source code for the class it.crs4.seal.read_sort.FastaChecksummer (file FastaChecksummer.java).

Source

// Copyright (C) 2011-2012 CRS4.
//
// This file is part of Seal.
//
// Seal is free software: you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// Seal is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
// or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with Seal.  If not, see <http://www.gnu.org/licenses/>.

package it.crs4.seal.read_sort;

import it.crs4.seal.common.FormatException;
import it.crs4.seal.common.UnknownItemException;

import java.io.BufferedReader;
import java.io.Reader;

import java.util.Iterator;
import java.util.Map;
import java.util.HashMap;
import java.util.regex.*;

import org.apache.commons.codec.binary.Hex;

/**
 * Computes one checksum per contig of a FASTA stream.
 *
 * <p>Usage: call {@link #setInput(Reader)}, then {@link #calculate()}, then query the
 * results through {@link #iterator()}, {@link #hasChecksum(String)} or
 * {@link #getChecksum(String)}. The query methods throw {@link IllegalStateException}
 * until a call to {@link #calculate()} has completed successfully.
 *
 * <p>The checksum of a contig is the lower-case hex encoding of the MD5 digest of its
 * sequence lines, each converted to US-ASCII bytes. Lines are obtained with
 * {@link BufferedReader#readLine()}, so line terminators are not part of the digest.
 */
public class FastaChecksummer implements Iterable<FastaChecksummer.ChecksumEntry> {
    /** Immutable pairing of a contig name with the hex-encoded checksum of its sequence. */
    public static class ChecksumEntry {
        private final String name;
        private final String checksum;

        /**
         * @param name     contig name
         * @param checksum hex-encoded checksum of the contig's sequence
         */
        public ChecksumEntry(String name, String checksum) {
            this.name = name;
            this.checksum = checksum;
        }

        /** @return the contig name */
        public String getName() {
            return name;
        }

        /** @return the hex-encoded checksum */
        public String getChecksum() {
            return checksum;
        }
    }

    private static final String CHECKSUM_ALGORITHM = "MD5";
    private static final String SEQUENCE_CHARSET = "US-ASCII";
    // A header is '>' followed by optional whitespace; the contig name is the first
    // run of non-whitespace characters and anything after it is ignored.
    private static final Pattern CONTIG_NAME_PATTERN = Pattern.compile(">\\s*(\\S+).*");

    private BufferedReader input;
    // null until calculate() has completed successfully for the current input
    private Map<String, ChecksumEntry> contigHashes;

    /**
     * Sets the FASTA source and discards any previously calculated checksums.
     * The stream is wrapped in a 4 MiB {@link BufferedReader}; it is not closed by this class.
     *
     * @param stream reader positioned at the start of the FASTA text
     */
    public void setInput(Reader stream) {
        input = new BufferedReader(stream, 4 * 1024 * 1024);
        contigHashes = null;
    }

    /**
     * Reads the whole input and computes the checksum of every contig.
     *
     * <p>If the same contig name appears in more than one header, the checksum of the
     * later record replaces the earlier one. Results become visible only if the whole
     * input is processed successfully: when this method throws, the object is left in
     * the "not calculated" state rather than exposing a partial result.
     *
     * @throws IllegalStateException if {@link #setInput(Reader)} has not been called
     * @throws FormatException       if the input is empty, a header line does not contain a
     *                               contig name, or sequence data precedes the first header
     * @throws java.io.IOException   if reading from the input fails
     */
    public void calculate() throws FormatException, java.io.IOException {
        if (input == null) {
            throw new IllegalStateException("FastaChecksummer input not set");
        }

        // Invalidate any earlier result first so a failure below cannot leave stale
        // or half-built data visible through the accessors.
        contigHashes = null;

        final java.security.MessageDigest hasher;
        try {
            hasher = java.security.MessageDigest.getInstance(CHECKSUM_ALGORITHM);
        } catch (java.security.NoSuchAlgorithmException e) {
            throw new RuntimeException(
                    "Unexpected NoSuchAlgorithmException when asking for " + CHECKSUM_ALGORITHM + " algorithm", e);
        }

        String line = input.readLine();
        if (line == null) {
            throw new FormatException("empty Fasta");
        }

        Map<String, ChecksumEntry> hashes = new HashMap<String, ChecksumEntry>();
        String currentContig = null;

        try {
            while (line != null) {
                if (line.startsWith(">")) { // start a new contig
                    if (currentContig != null) {
                        storeChecksum(hashes, currentContig, hasher);
                    }

                    Matcher m = CONTIG_NAME_PATTERN.matcher(line);
                    if (!m.matches()) {
                        throw new FormatException("Unexpected contig name format: " + line);
                    }
                    currentContig = m.group(1);
                    hasher.reset();
                } else {
                    if (currentContig == null) {
                        throw new FormatException(
                                "Sequence outside any fasta record (header is missing). Line: " + line);
                    }
                    hasher.update(line.getBytes(SEQUENCE_CHARSET));
                }

                line = input.readLine();
            }

            if (currentContig != null) { // store the last contig
                storeChecksum(hashes, currentContig, hasher);
            }
        } catch (java.io.UnsupportedEncodingException e) {
            throw new RuntimeException("Unexpected UnsupportedEncodingException! Line: " + line, e);
        }

        // Publish only after the whole input has been processed without error.
        contigHashes = hashes;
    }

    /** Finishes the digest accumulated in {@code hasher} and records it in {@code hashes} under {@code contig}. */
    private static void storeChecksum(Map<String, ChecksumEntry> hashes, String contig,
            java.security.MessageDigest hasher) {
        // Hadoop 0.20.2 ships with Apache commons version 1.3, which doesn't
        // have encodeHexString
        String cs = new String(Hex.encodeHex(hasher.digest()));
        hashes.put(contig, new ChecksumEntry(contig, cs));
    }

    /** Returns the calculated checksum map, or throws if no calculation has completed. */
    private Map<String, ChecksumEntry> requireCalculated() {
        if (contigHashes == null) {
            throw new IllegalStateException("Checksums not calculated");
        }
        return contigHashes;
    }

    /**
     * Iterates over the calculated entries. The entries come from a {@link HashMap},
     * so no particular order is guaranteed.
     *
     * @return an iterator over one entry per contig
     * @throws IllegalStateException if the checksums have not been calculated
     */
    @Override
    public Iterator<ChecksumEntry> iterator() {
        return requireCalculated().values().iterator();
    }

    /**
     * @param contigName contig name to look up
     * @return {@code true} if a checksum was calculated for {@code contigName}
     * @throws IllegalStateException if the checksums have not been calculated
     */
    public boolean hasChecksum(String contigName) {
        return requireCalculated().containsKey(contigName);
    }

    /**
     * @param contigName contig name to look up
     * @return the hex-encoded checksum calculated for {@code contigName}
     * @throws UnknownItemException  if no contig with that name was seen
     * @throws IllegalStateException if the checksums have not been calculated
     */
    public String getChecksum(String contigName) throws UnknownItemException {
        ChecksumEntry entry = requireCalculated().get(contigName);
        if (entry == null) {
            throw new UnknownItemException("Unknown contig name " + contigName);
        }
        return entry.getChecksum();
    }
}