juicebox.tools.utils.original.Preprocessor.java Source code

Introduction

Here is the source code for juicebox.tools.utils.original.Preprocessor.java, the Juicebox class that converts a read-pair file (grouped by chromosome pair) into a binary .hic contact matrix file.
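
A minimal driver sketch for this class is shown below. It assumes the chromosome list is obtained elsewhere (the loadChromosomes helper is hypothetical, not part of Juicebox) and uses only the constructor, setters, and preprocess method defined in the source that follows; file names are placeholders.

import java.io.File;
import java.util.List;

import org.broad.igv.feature.Chromosome;

import juicebox.tools.utils.original.Preprocessor;

public class PreprocessorDriver {

    public static void main(String[] args) throws Exception {
        // Hypothetical helper: supply the chromosome list, including the
        // whole-genome entry that this class expects at index 0.
        List<Chromosome> chromosomes = loadChromosomes("hg19");

        Preprocessor p = new Preprocessor(new File("output.hic"), "hg19", chromosomes);
        p.setMapqThreshold(30);              // skip pairs whose minimum MAPQ is below 30
        p.setCountThreshold(0);              // keep every occupied cell
        p.setFragmentFile("hg19_sites.txt"); // optional restriction-site map
        p.setTmpdir("/tmp");                 // spill directory for block files
        p.preprocess("merged_nodups.txt");   // ASCII pair file (a .bin file also works)
    }

    private static List<Chromosome> loadChromosomes(String genomeId) {
        // Placeholder: replace with your own chrom.sizes / genome reader.
        throw new UnsupportedOperationException("supply a chromosome loader");
    }
}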

Source

/*
 * The MIT License (MIT)
 *
 * Copyright (c) 2011-2015 Broad Institute, Aiden Lab
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 *  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 *  THE SOFTWARE.
 */

package juicebox.tools.utils.original;

//import juicebox.MainWindow;

import htsjdk.tribble.util.LittleEndianInputStream;
import htsjdk.tribble.util.LittleEndianOutputStream;
import juicebox.HiCGlobals;
import juicebox.data.ContactRecord;
import juicebox.windowui.NormalizationType;
import org.apache.commons.math.stat.StatUtils;
import org.broad.igv.feature.Chromosome;
import org.broad.igv.tdf.BufferedByteWriter;
import org.broad.igv.util.collections.DownsampledDoubleArrayList;

import java.awt.*;
import java.io.*;
import java.util.*;
import java.util.List;
import java.util.zip.Deflater;

/**
 * @author jrobinso
 * @since Aug 16, 2010
 */
public class Preprocessor {

    private static final int VERSION = 8;
    private static final int BLOCK_SIZE = 1000;

    private final List<Chromosome> chromosomes;

    // Map of name -> index
    private final Map<String, Integer> chromosomeIndexes;

    private final File outputFile;
    private final Map<String, IndexEntry> matrixPositions;
    private final String genomeId;
    private final Deflater compressor;
    private LittleEndianOutputStream los;
    private long masterIndexPosition;
    private int countThreshold = 0;
    private int mapqThreshold = 0;
    private boolean diagonalsOnly = false;
    private String fragmentFileName = null;
    private String statsFileName = null;
    private String graphFileName = null;
    private FragmentCalculation fragmentCalculation = null;
    private Set<String> includedChromosomes;
    /**
     * The position of the field containing the masterIndex position
     */
    private long masterIndexPositionPosition;
    private Map<String, ExpectedValueCalculation> expectedValueCalculations;
    private File tmpDir;

    public Preprocessor(File outputFile, String genomeId, List<Chromosome> chromosomes) {
        this.genomeId = genomeId;
        this.outputFile = outputFile;
        this.matrixPositions = new LinkedHashMap<String, IndexEntry>();

        this.chromosomes = chromosomes;
        chromosomeIndexes = new Hashtable<String, Integer>();
        for (int i = 0; i < chromosomes.size(); i++) {
            chromosomeIndexes.put(chromosomes.get(i).getName(), i);
        }

        compressor = new Deflater();
        compressor.setLevel(Deflater.DEFAULT_COMPRESSION);

        this.tmpDir = null;

    }

    public void setCountThreshold(int countThreshold) {
        this.countThreshold = countThreshold;
    }

    public void setMapqThreshold(int mapqThreshold) {
        this.mapqThreshold = mapqThreshold;
    }

    public void setDiagonalsOnly(boolean diagonalsOnly) {
        this.diagonalsOnly = diagonalsOnly;
    }

    public void setIncludedChromosomes(Set<String> includedChromosomes) {
        this.includedChromosomes = includedChromosomes;
    }

    public void setFragmentFile(String fragmentFileName) {
        this.fragmentFileName = fragmentFileName;
    }

    public void setGraphFile(String graphFileName) {
        this.graphFileName = graphFileName;
    }

    public void preprocess(final String inputFile) throws IOException {
        File file = new File(inputFile);

        if (!file.exists() || file.length() == 0) {
            System.err.println(inputFile + " does not exist or does not contain any reads.");
            System.exit(1);
        }

        try {
            String stats = null;
            String graphs = null;
            if (fragmentFileName != null) {
                fragmentCalculation = FragmentCalculation.readFragments(fragmentFileName);
            } else {
                System.out.println("WARNING: Not including fragment map");
            }
            if (statsFileName != null) {
                FileInputStream is = null;
                try {
                    is = new FileInputStream(statsFileName);
                    BufferedReader reader = new BufferedReader(new InputStreamReader(is), HiCGlobals.bufferSize);
                    stats = "";
                    String nextLine;
                    while ((nextLine = reader.readLine()) != null) {
                        stats += nextLine + "\n";
                    }
                } catch (IOException e) {
                    System.err.println("Error while reading stats file: " + e);
                    stats = null;
                } finally {
                    if (is != null) {
                        is.close();
                    }
                }

            }
            if (graphFileName != null) {
                FileInputStream is = null;
                try {
                    is = new FileInputStream(graphFileName);
                    BufferedReader reader = new BufferedReader(new InputStreamReader(is), HiCGlobals.bufferSize);
                    graphs = "";
                    String nextLine;
                    while ((nextLine = reader.readLine()) != null) {
                        graphs += nextLine + "\n";
                    }
                } catch (IOException e) {
                    System.err.println("Error while reading graphs file: " + e);
                    graphs = null;
                } finally {
                    if (is != null) {
                        is.close();
                    }
                }
            }

            expectedValueCalculations = new LinkedHashMap<String, ExpectedValueCalculation>();
            for (int bBinSize : HiCGlobals.bpBinSizes) {
                ExpectedValueCalculation calc = new ExpectedValueCalculation(chromosomes, bBinSize, null,
                        NormalizationType.NONE);
                String key = "BP_" + bBinSize;
                expectedValueCalculations.put(key, calc);
            }
            if (fragmentCalculation != null) {

                // Create map of chr name -> # of fragments
                Map<String, int[]> sitesMap = fragmentCalculation.getSitesMap();
                Map<String, Integer> fragmentCountMap = new HashMap<String, Integer>();
                for (Map.Entry<String, int[]> entry : sitesMap.entrySet()) {
                    int fragCount = entry.getValue().length + 1;
                    String chr = entry.getKey();
                    fragmentCountMap.put(chr, fragCount);
                }

                for (int fBinSize : HiCGlobals.fragBinSizes) {
                    ExpectedValueCalculation calc = new ExpectedValueCalculation(chromosomes, fBinSize,
                            fragmentCountMap, NormalizationType.NONE);
                    String key = "FRAG_" + fBinSize;
                    expectedValueCalculations.put(key, calc);
                }
            }

            System.out.println("Start preprocess");

            los = new LittleEndianOutputStream(
                    new BufferedOutputStream(new FileOutputStream(outputFile), HiCGlobals.bufferSize));

            System.out.println("Writing header");
            writeHeader(stats, graphs);

            System.out.println("Writing body");
            writeBody(inputFile);

            System.out.println();
            System.out.println("Writing footer");
            writeFooter();

        } finally {
            if (los != null)
                los.close();
        }

        updateMasterIndex();
        System.out.println("\nFinished preprocess");
    }

    private void writeHeader(String stats, String graphs) throws IOException {
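        // Header layout (file format version 8): magic bytes "HIC" + null byte,
        // format version, an 8-byte placeholder for the master index position
        // (patched later by updateMasterIndex()), genome ID, attribute dictionary
        // (statistics/graphs), chromosome dictionary, base-pair resolutions,
        // fragment resolutions, and, if a fragment map was supplied, the
        // restriction-site positions for each chromosome.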
        // Magic number
        byte[] magicBytes = "HIC".getBytes();
        los.write(magicBytes[0]);
        los.write(magicBytes[1]);
        los.write(magicBytes[2]);
        los.write(0);

        // VERSION
        los.writeInt(VERSION);

        // Placeholder for master index position, replaced with actual position after all contents are written
        masterIndexPositionPosition = los.getWrittenCount();
        los.writeLong(0L);

        // Genome ID
        los.writeString(genomeId);

        // Attribute dictionary
        int nAttributes;
        if (stats != null && graphs != null)
            nAttributes = 2;
        else if (stats != null)
            nAttributes = 1;
        else if (graphs != null)
            nAttributes = 1;
        else
            nAttributes = 0;

        los.writeInt(nAttributes);
        if (stats != null) {
            los.writeString("statistics");
            los.writeString(stats);
        }
        if (graphs != null) {
            los.writeString("graphs");
            los.writeString(graphs);
        }

        // Sequence dictionary
        int nChrs = chromosomes.size();
        los.writeInt(nChrs);
        for (Chromosome chromosome : chromosomes) {
            los.writeString(chromosome.getName());
            los.writeInt(chromosome.getLength());
        }

        //BP resolution levels
        int nBpRes = HiCGlobals.bpBinSizes.length;
        los.writeInt(nBpRes);
        for (int bpBinSize : HiCGlobals.bpBinSizes) {
            los.writeInt(bpBinSize);
        }

        //fragment resolutions
        int nFragRes = fragmentCalculation == null ? 0 : HiCGlobals.fragBinSizes.length;
        los.writeInt(nFragRes);
        for (int i = 0; i < nFragRes; i++) {
            los.writeInt(HiCGlobals.fragBinSizes[i]);
        }

        // fragment sites
        if (nFragRes > 0) {
            for (Chromosome chromosome : chromosomes) {
                int[] sites = fragmentCalculation.getSites(chromosome.getName());
                int nSites = sites == null ? 0 : sites.length;
                los.writeInt(nSites);
                for (int i = 0; i < nSites; i++) {
                    los.writeInt(sites[i]);
                }
            }
        }
    }

    private void writeBody(String inputFile) throws IOException {
        MatrixPP wholeGenomeMatrix = computeWholeGenomeMatrix(inputFile);

        writeMatrix(wholeGenomeMatrix);

        PairIterator iter = (inputFile.endsWith(".bin")) ? new BinPairIterator(inputFile, chromosomeIndexes)
                : new AsciiPairIterator(inputFile, chromosomeIndexes);

        int currentChr1 = -1;
        int currentChr2 = -1;
        MatrixPP currentMatrix = null;
        HashSet<String> writtenMatrices = new HashSet<String>();
        String currentMatrixKey = null;

        while (iter.hasNext()) {
            AlignmentPair pair = iter.next();
            // skip pairs that mapped to contigs
            if (!pair.isContigPair()) {
                // Flip pair if needed so chr1 < chr2
                int chr1, chr2, bp1, bp2, frag1, frag2, mapq;
                if (pair.getChr1() < pair.getChr2()) {
                    bp1 = pair.getPos1();
                    bp2 = pair.getPos2();
                    frag1 = pair.getFrag1();
                    frag2 = pair.getFrag2();
                    chr1 = pair.getChr1();
                    chr2 = pair.getChr2();
                } else {
                    bp1 = pair.getPos2();
                    bp2 = pair.getPos1();
                    frag1 = pair.getFrag2();
                    frag2 = pair.getFrag1();
                    chr1 = pair.getChr2();
                    chr2 = pair.getChr1();
                }
                mapq = Math.min(pair.getMapq1(), pair.getMapq2());
                // Filters
                if (diagonalsOnly && chr1 != chr2)
                    continue;
                if (includedChromosomes != null && chr1 != 0) {
                    String c1Name = chromosomes.get(chr1).getName();
                    String c2Name = chromosomes.get(chr2).getName();
                    if (!(includedChromosomes.contains(c1Name) || includedChromosomes.contains(c2Name))) {
                        continue;
                    }
                }
                // only increment if not intraFragment and passes the mapq threshold
                if (mapq < mapqThreshold || (chr1 == chr2 && frag1 == frag2))
                    continue;
                if (!(currentChr1 == chr1 && currentChr2 == chr2)) {
                    // Starting a new matrix
                    if (currentMatrix != null) {
                        currentMatrix.parsingComplete();
                        writeMatrix(currentMatrix);
                        writtenMatrices.add(currentMatrixKey);
                        currentMatrix = null;
                        System.gc();
                        //System.out.println("Available memory: " + RuntimeUtils.getAvailableMemory());
                    }

                    // Start the next matrix
                    currentChr1 = chr1;
                    currentChr2 = chr2;
                    currentMatrixKey = currentChr1 + "_" + currentChr2;

                    if (writtenMatrices.contains(currentMatrixKey)) {
                        System.err.println("Error: the chromosome combination " + currentMatrixKey
                                + " appears in multiple blocks");
                        if (outputFile != null)
                            outputFile.deleteOnExit();
                        System.exit(1);
                    }
                    currentMatrix = new MatrixPP(currentChr1, currentChr2);
                }
                currentMatrix.incrementCount(bp1, bp2, frag1, frag2, pair.getScore());

            }
        }

        if (currentMatrix != null) {
            currentMatrix.parsingComplete();
            writeMatrix(currentMatrix);
        }

        if (iter != null)
            iter.close();

        masterIndexPosition = los.getWrittenCount();
    }

    /**
     * @param file Path of the input pair file to read
     * @return Matrix with counts in each bin
     * @throws IOException
     */
    private MatrixPP computeWholeGenomeMatrix(String file) throws IOException {
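        // The whole-genome matrix is keyed to chromosome index 0, whose "length"
        // is the total genome size in kilobases; the bin size is chosen so the
        // genome spans roughly 500 bins along each axis.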

        MatrixPP matrix;
        // NOTE: always true that c1 <= c2

        int genomeLength = chromosomes.get(0).getLength(); // <= whole genome in KB
        int binSize = genomeLength / 500;
        if (binSize == 0)
            binSize = 1;
        int nBinsX = genomeLength / binSize + 1;
        int nBlockColumns = nBinsX / BLOCK_SIZE + 1;
        matrix = new MatrixPP(0, 0, binSize, nBlockColumns);

        PairIterator iter = null;

        int belowMapq = 0;
        int intraFrag = 0;
        int totalRead = 0;
        int contig = 0;
        int hicContact = 0;

        // First pass over the pair file: bin every pair that passes the filters into the whole-genome matrix
        try {
            iter = (file.endsWith(".bin")) ? new BinPairIterator(file, chromosomeIndexes)
                    : new AsciiPairIterator(file, chromosomeIndexes);

            while (iter.hasNext()) {
                totalRead++;
                AlignmentPair pair = iter.next();
                if (pair.isContigPair()) {
                    contig++;
                } else {
                    int bp1 = pair.getPos1();
                    int bp2 = pair.getPos2();
                    int chr1 = pair.getChr1();
                    int chr2 = pair.getChr2();
                    int frag1 = pair.getFrag1();
                    int frag2 = pair.getFrag2();
                    int mapq1 = pair.getMapq1();
                    int mapq2 = pair.getMapq2();

                    int pos1, pos2;

                    if (chr1 == chr2 && frag1 == frag2) {
                        intraFrag++;
                    } else if (mapq1 < mapqThreshold || mapq2 < mapqThreshold) {
                        belowMapq++;
                    } else {
                        pos1 = getGenomicPosition(chr1, bp1);
                        pos2 = getGenomicPosition(chr2, bp2);
                        matrix.incrementCount(pos1, pos2, pos1, pos2, pair.getScore());
                        hicContact++;
                    }
                }
            }
        } finally {
            if (iter != null)
                iter.close();
        }
        /*
        Intra-fragment Reads: 2,321 (0.19% / 0.79%)
        Below MAPQ Threshold: 44,134 (3.57% / 15.01%)
        Hi-C Contacts: 247,589 (20.02% / 84.20%)
         Ligation Motif Present: 99,245  (8.03% / 33.75%)
         3' Bias (Long Range): 73% - 27%
         Pair Type %(L-I-O-R): 25% - 25% - 25% - 25%
        Inter-chromosomal: 58,845  (4.76% / 20.01%)
        Intra-chromosomal: 188,744  (15.27% / 64.19%)
        Short Range (<20Kb): 48,394  (3.91% / 16.46%)
        Long Range (>20Kb): 140,350  (11.35% / 47.73%)
            
                System.err.println("contig: " + contig + " total: " + totalRead + " below mapq: " + belowMapq + " intra frag: " + intraFrag); */

        matrix.parsingComplete();
        return matrix;
    }

    private int getGenomicPosition(int chr, int pos) {
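        // Map (chromosome index, base-pair position) onto a single concatenated
        // whole-genome coordinate: sum the lengths of the preceding chromosomes
        // (index 0, the whole-genome entry, is skipped), add the position, and
        // scale to kilobases to match the whole-genome matrix above.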
        long len = 0;
        for (int i = 1; i < chr; i++) {
            len += chromosomes.get(i).getLength();
        }
        len += pos;

        return (int) (len / 1000);

    }

    private void updateMasterIndex() throws IOException {
        RandomAccessFile raf = null;
        try {
            raf = new RandomAccessFile(outputFile, "rw");

            // Master index
            raf.getChannel().position(masterIndexPositionPosition);
            BufferedByteWriter buffer = new BufferedByteWriter();
            buffer.putLong(masterIndexPosition);
            raf.write(buffer.getBytes());

        } finally {
            if (raf != null)
                raf.close();
        }
    }

    private void writeFooter() throws IOException {

        // Index
        BufferedByteWriter buffer = new BufferedByteWriter();
        buffer.putInt(matrixPositions.size());
        for (Map.Entry<String, IndexEntry> entry : matrixPositions.entrySet()) {
            buffer.putNullTerminatedString(entry.getKey());
            buffer.putLong(entry.getValue().position);
            buffer.putInt(entry.getValue().size);
        }

        // Vectors  (Expected values,  other).
        buffer.putInt(expectedValueCalculations.size());
        for (Map.Entry<String, ExpectedValueCalculation> entry : expectedValueCalculations.entrySet()) {
            ExpectedValueCalculation ev = entry.getValue();

            ev.computeDensity();

            int binSize = ev.getGridSize();
            String unit = ev.isFrag ? "FRAG" : "BP";

            buffer.putNullTerminatedString(unit);
            buffer.putInt(binSize);

            // The density values
            double[] expectedValues = ev.getDensityAvg();
            buffer.putInt(expectedValues.length);
            for (double expectedValue : expectedValues) {
                buffer.putDouble(expectedValue);
            }

            // Map of chromosome index -> normalization factor
            Map<Integer, Double> normalizationFactors = ev.getChrScaleFactors();
            buffer.putInt(normalizationFactors.size());
            for (Map.Entry<Integer, Double> normFactor : normalizationFactors.entrySet()) {
                buffer.putInt(normFactor.getKey());
                buffer.putDouble(normFactor.getValue());
                //System.out.println(normFactor.getKey() + "  " + normFactor.getValue());
            }
        }

        byte[] bytes = buffer.getBytes();
        los.writeInt(bytes.length);
        los.write(bytes);
    }

    private synchronized void writeMatrix(MatrixPP matrix) throws IOException {
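        // Matrix record layout: chr1 index, chr2 index, number of non-null
        // resolutions, then one zoom header per resolution (each with a placeholder
        // block index), followed by the merged, compressed blocks. The record's
        // start position and header size are stored in matrixPositions for the footer.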

        long position = los.getWrittenCount();

        los.writeInt(matrix.getChr1Idx());
        los.writeInt(matrix.getChr2Idx());
        int numResolutions = 0;

        for (MatrixZoomDataPP zd : matrix.getZoomData()) {
            if (zd != null) {
                numResolutions++;
            }
        }
        los.writeInt(numResolutions);

        //fos.writeInt(matrix.getZoomData().length);
        for (MatrixZoomDataPP zd : matrix.getZoomData()) {
            if (zd != null)
                writeZoomHeader(zd);
        }

        int size = (int) (los.getWrittenCount() - position);
        matrixPositions.put(matrix.getKey(), new IndexEntry(position, size));

        for (MatrixZoomDataPP zd : matrix.getZoomData()) {
            if (zd != null) {
                List<IndexEntry> blockIndex = zd.mergeAndWriteBlocks();
                zd.updateIndexPositions(blockIndex);
            }
        }

        System.out.print(".");
    }

    private void writeZoomHeader(MatrixZoomDataPP zd) throws IOException {

        int numberOfBlocks = zd.blockNumbers.size();
        los.writeString(zd.getUnit()); // Unit, either "BP" or "FRAG"
        los.writeInt(zd.getZoom()); // zoom index,  lowest res is zero
        los.writeFloat((float) zd.getSum()); // sum
        los.writeFloat((float) zd.getOccupiedCellCount());
        los.writeFloat((float) zd.getPercent5());
        los.writeFloat((float) zd.getPercent95());
        los.writeInt(zd.getBinSize());
        los.writeInt(zd.getBlockBinCount());
        los.writeInt(zd.getBlockColumnCount());
        los.writeInt(numberOfBlocks);

        zd.blockIndexPosition = los.getWrittenCount();

        // Placeholder for block index
        for (int i = 0; i < numberOfBlocks; i++) {
            los.writeInt(0);
            los.writeLong(0L);
            los.writeInt(0);
        }

    }

    /**
     * Note: the block payload is deflate-compressed before it is written.
     *
     * @param zd          Matrix zoom data
     * @param block       Block to write
     * @param sampledData Array to hold a sample of the data (to compute statistics)
     * @throws IOException
     */
    private void writeBlock(MatrixZoomDataPP zd, BlockPP block, DownsampledDoubleArrayList sampledData)
            throws IOException {
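        // Block payload layout (before deflate compression): record count,
        // binXOffset, binYOffset, a value-type byte (0 = short counts, 1 = float),
        // then either a "list of rows" section (representation byte 1) or a dense
        // section (representation byte 2), whichever would occupy fewer bytes.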

        final Map<Point, ContactCount> records = block.getContactRecordMap();//   getContactRecords();

        // System.out.println("Write contact records : records count = " + records.size());

        // Count records first
        int nRecords;
        if (countThreshold > 0) {
            nRecords = 0;
            for (ContactCount rec : records.values()) {
                if (rec.getCounts() >= countThreshold) {
                    nRecords++;
                }
            }
        } else {
            nRecords = records.size();
        }
        BufferedByteWriter buffer = new BufferedByteWriter(nRecords * 12);
        buffer.putInt(nRecords);
        zd.cellCount += nRecords;

        // Find extents of occupied cells
        int binXOffset = Integer.MAX_VALUE;
        int binYOffset = Integer.MAX_VALUE;
        int binXMax = 0;
        int binYMax = 0;
        for (Map.Entry<Point, ContactCount> entry : records.entrySet()) {
            Point point = entry.getKey();
            binXOffset = Math.min(binXOffset, point.x);
            binYOffset = Math.min(binYOffset, point.y);
            binXMax = Math.max(binXMax, point.x);
            binYMax = Math.max(binYMax, point.y);
        }

        buffer.putInt(binXOffset);
        buffer.putInt(binYOffset);

        // Sort keys in row-major order
        List<Point> keys = new ArrayList<Point>(records.keySet());
        Collections.sort(keys, new Comparator<Point>() {
            @Override
            public int compare(Point o1, Point o2) {
                if (o1.y != o2.y) {
                    return o1.y - o2.y;
                } else {
                    return o1.x - o2.x;
                }
            }
        });
        Point lastPoint = keys.get(keys.size() - 1);
        final short w = (short) (binXMax - binXOffset + 1);

        boolean isInteger = true;
        float maxCounts = 0;

        LinkedHashMap<Integer, List<ContactRecord>> rows = new LinkedHashMap<Integer, List<ContactRecord>>();
        for (Point point : keys) {
            final ContactCount contactCount = records.get(point);
            float counts = contactCount.getCounts();
            if (counts >= countThreshold) {

                isInteger = isInteger && (Math.floor(counts) == counts);
                maxCounts = Math.max(counts, maxCounts);

                final int px = point.x - binXOffset;
                final int py = point.y - binYOffset;
                List<ContactRecord> row = rows.get(py);
                if (row == null) {
                    row = new ArrayList<ContactRecord>(10);
                    rows.put(py, row);
                }
                row.add(new ContactRecord(px, py, counts));
            }
        }

        // Compute size for each representation and choose smallest
        boolean useShort = isInteger && (maxCounts < Short.MAX_VALUE);
        int valueSize = useShort ? 2 : 4;

        int lorSize = 0;
        int nDensePts = (lastPoint.y - binYOffset) * w + (lastPoint.x - binXOffset) + 1;

        int denseSize = nDensePts * valueSize;
        for (List<ContactRecord> row : rows.values()) {
            lorSize += 4 + row.size() * valueSize;
        }

        buffer.put((byte) (useShort ? 0 : 1));

        if (lorSize < denseSize) {

            buffer.put((byte) 1); // List of rows representation

            buffer.putShort((short) rows.size()); // # of rows

            for (Map.Entry<Integer, List<ContactRecord>> entry : rows.entrySet()) {

                int py = entry.getKey();
                List<ContactRecord> row = entry.getValue();
                buffer.putShort((short) py); // Row number
                buffer.putShort((short) row.size()); // size of row

                for (ContactRecord contactRecord : row) {
                    buffer.putShort((short) (contactRecord.getBinX()));
                    final float counts = contactRecord.getCounts();

                    if (useShort) {
                        buffer.putShort((short) counts);
                    } else {
                        buffer.putFloat(counts);
                    }

                    sampledData.add(counts);
                    zd.sum += counts;
                }
            }

        } else {
            buffer.put((byte) 2); // Dense matrix

            buffer.putInt(nDensePts);
            buffer.putShort(w);

            int lastIdx = 0;
            for (Point p : keys) {

                int idx = (p.y - binYOffset) * w + (p.x - binXOffset);
                for (int i = lastIdx; i < idx; i++) {
                    // Filler value
                    if (useShort) {
                        buffer.putShort(Short.MIN_VALUE);
                    } else {
                        buffer.putFloat(Float.NaN);
                    }
                }
                float counts = records.get(p).getCounts();
                if (useShort) {
                    buffer.putShort((short) counts);
                } else {
                    buffer.putFloat(counts);
                }
                lastIdx = idx + 1;

                sampledData.add(counts);
                zd.sum += counts;
            }
        }

        byte[] bytes = buffer.getBytes();
        byte[] compressedBytes = compress(bytes);
        los.write(compressedBytes);

    }

    public void setTmpdir(String tmpDirName) {

        if (tmpDirName != null) {
            this.tmpDir = new File(tmpDirName);

            if (!tmpDir.exists()) {
                System.err.println("Tmp directory does not exist: " + tmpDirName);
                if (outputFile != null)
                    outputFile.deleteOnExit();
                System.exit(1);
            }
        }
    }

    public void setStatisticsFile(String statsOption) {
        statsFileName = statsOption;
    }

    private synchronized byte[] compress(byte[] data) {

        // Give the compressor the data to compress
        compressor.reset();
        compressor.setInput(data);
        compressor.finish();

        // Create an expandable byte array to hold the compressed data.
        // You cannot use an array that's the same size as the original because
        // there is no guarantee that the compressed data will be smaller than
        // the uncompressed data.
        ByteArrayOutputStream bos = new ByteArrayOutputStream(data.length);

        // Compress the data
        byte[] buf = new byte[1024];
        while (!compressor.finished()) {
            int count = compressor.deflate(buf);
            bos.write(buf, 0, count);
        }
        try {
            bos.close();
        } catch (IOException e) {
            System.err.println("Error clossing ByteArrayOutputStream");
            e.printStackTrace();
        }

        return bos.toByteArray();
    }

    interface BlockQueue {

        void advance() throws IOException;

        BlockPP getBlock();

    }

    public static class IndexEntry {
        public final long position;
        public final int size;
        int id;

        IndexEntry(int id, long position, int size) {
            this.id = id;
            this.position = position;
            this.size = size;
        }

        public IndexEntry(long position, int size) {
            this.position = position;
            this.size = size;
        }
    }

    static class BlockQueueFB implements BlockQueue {

        final File file;
        BlockPP block;
        long filePosition;

        BlockQueueFB(File file) {
            this.file = file;
            try {
                advance();
            } catch (IOException e) {
                e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
            }
        }

        public void advance() throws IOException {
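            // Read the next serialized block from the temp file: block number,
            // record count, then nRecords fixed 12-byte records (int x, int y,
            // float count), as written by MatrixZoomDataPP.dumpBlocks().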

            if (filePosition >= file.length()) {
                block = null;
                return;
            }

            FileInputStream fis = null;

            try {
                fis = new FileInputStream(file);
                fis.getChannel().position(filePosition);

                LittleEndianInputStream lis = new LittleEndianInputStream(fis);
                int blockNumber = lis.readInt();
                int nRecords = lis.readInt();

                byte[] bytes = new byte[nRecords * 12];
                readFully(bytes, fis);

                ByteArrayInputStream bis = new ByteArrayInputStream(bytes);
                lis = new LittleEndianInputStream(bis);

                Map<Point, ContactCount> contactRecordMap = new HashMap<Point, ContactCount>(nRecords);
                for (int i = 0; i < nRecords; i++) {
                    int x = lis.readInt();
                    int y = lis.readInt();
                    float v = lis.readFloat();
                    ContactCount rec = new ContactCount(v);
                    contactRecordMap.put(new Point(x, y), rec);
                }
                block = new BlockPP(blockNumber, contactRecordMap);

                // Update file position based on # of bytes read, for next block
                filePosition = fis.getChannel().position();

            } finally {
                if (fis != null)
                    fis.close();
            }
        }

        public BlockPP getBlock() {
            return block;
        }

        /**
         * Read enough bytes to fill the input buffer
         */
        public void readFully(byte b[], InputStream is) throws IOException {
            int len = b.length;
            if (len < 0)
                throw new IndexOutOfBoundsException();
            int n = 0;
            while (n < len) {
                int count = is.read(b, n, len - n);
                if (count < 0)
                    throw new EOFException();
                n += count;
            }
        }
    }

    // class to support block merging

    static class BlockQueueMem implements BlockQueue {

        final List<BlockPP> blocks;
        int idx = 0;

        BlockQueueMem(Collection<BlockPP> blockCollection) {

            this.blocks = new ArrayList<BlockPP>(blockCollection);
            Collections.sort(blocks, new Comparator<BlockPP>() {
                @Override
                public int compare(BlockPP o1, BlockPP o2) {
                    return o1.getNumber() - o2.getNumber();
                }
            });
        }

        public void advance() {
            idx++;
        }

        public BlockPP getBlock() {
            if (idx >= blocks.size()) {
                return null;
            } else {
                return blocks.get(idx);
            }
        }
    }

    /**
     * Representation of a sparse matrix block used for preprocessing.
     */
    static class BlockPP {

        private final int number;

        // Key to the map is a Point representing the x,y coordinate for the cell.
        private final Map<Point, ContactCount> contactRecordMap;

        BlockPP(int number) {
            this.number = number;
            this.contactRecordMap = new HashMap<Point, ContactCount>();
        }

        public BlockPP(int number, Map<Point, ContactCount> contactRecordMap) {
            this.number = number;
            this.contactRecordMap = contactRecordMap;
        }

        public int getNumber() {
            return number;
        }

        public void incrementCount(int col, int row, float score) {
            Point p = new Point(col, row);
            ContactCount rec = contactRecordMap.get(p);
            if (rec == null) {
                rec = new ContactCount(1);
                contactRecordMap.put(p, rec);

            } else {
                rec.incrementCount(score);
            }
        }

        /*
         useless at present
        public void parsingComplete() {
            
        }
        */

        public Map<Point, ContactCount> getContactRecordMap() {
            return contactRecordMap;
        }

        public void merge(BlockPP other) {

            for (Map.Entry<Point, ContactCount> entry : other.getContactRecordMap().entrySet()) {

                Point point = entry.getKey();
                ContactCount otherValue = entry.getValue();

                ContactCount value = contactRecordMap.get(point);
                if (value == null) {
                    contactRecordMap.put(point, otherValue);
                } else {
                    value.incrementCount(otherValue.getCounts());
                }

            }
        }
    }

    public static class ContactCount {
        float value;

        ContactCount(float value) {
            this.value = value;
        }

        void incrementCount(float increment) {
            value += increment;
        }

        public float getCounts() {
            return value;
        }
    }

    /**
     * @author jrobinso
     * @since Aug 12, 2010
     */
    class MatrixPP {

        private final int chr1Idx;
        private final int chr2Idx;
        private final MatrixZoomDataPP[] zoomData;

        /**
         * Constructor for creating a matrix and initializing zoomed data at predefined resolution scales.  This
         * constructor is used when parsing alignment files.
         *
         * @param chr1Idx Chromosome 1
         * @param chr2Idx Chromosome 2
         */
        MatrixPP(int chr1Idx, int chr2Idx) {
            this.chr1Idx = chr1Idx;
            this.chr2Idx = chr2Idx;

            int nResolutions = HiCGlobals.bpBinSizes.length;
            if (fragmentCalculation != null) {
                nResolutions += HiCGlobals.fragBinSizes.length;
            }

            zoomData = new MatrixZoomDataPP[nResolutions];

            int zoom = 0; //
            for (int idx = 0; idx < HiCGlobals.bpBinSizes.length; idx++) {
                int binSize = HiCGlobals.bpBinSizes[zoom];
                Chromosome chrom1 = chromosomes.get(chr1Idx);
                Chromosome chrom2 = chromosomes.get(chr2Idx);

                // Size block (submatrices) to be ~500 bins wide.
                int len = Math.max(chrom1.getLength(), chrom2.getLength());
                int nBins = len / binSize + 1; // Size of chrom in bins
                int nColumns = nBins / BLOCK_SIZE + 1;
                zoomData[idx] = new MatrixZoomDataPP(chrom1, chrom2, binSize, nColumns, zoom, false);
                zoom++;

            }

            if (fragmentCalculation != null) {
                Chromosome chrom1 = chromosomes.get(chr1Idx);
                Chromosome chrom2 = chromosomes.get(chr2Idx);
                int nFragBins1 = Math.max(fragmentCalculation.getNumberFragments(chrom1.getName()),
                        fragmentCalculation.getNumberFragments(chrom2.getName()));

                zoom = 0;
                for (int idx = HiCGlobals.bpBinSizes.length; idx < nResolutions; idx++) {
                    int binSize = HiCGlobals.fragBinSizes[zoom];
                    int nBins = nFragBins1 / binSize + 1;
                    int nColumns = nBins / BLOCK_SIZE + 1;
                    zoomData[idx] = new MatrixZoomDataPP(chrom1, chrom2, binSize, nColumns, zoom, true);
                    zoom++;
                }
            }
        }

        /**
         * Constructor for creating a matrix with a single zoom level at a specified bin size.  This is provided
         * primarily for constructing a whole-genome view.
         *
         * @param chr1Idx Chromosome 1
         * @param chr2Idx Chromosome 2
         * @param binSize          Bin size
         * @param blockColumnCount Number of block (sub-matrix) columns
         */
        MatrixPP(int chr1Idx, int chr2Idx, int binSize, int blockColumnCount) {
            this.chr1Idx = chr1Idx;
            this.chr2Idx = chr2Idx;
            zoomData = new MatrixZoomDataPP[1];
            zoomData[0] = new MatrixZoomDataPP(chromosomes.get(chr1Idx), chromosomes.get(chr2Idx), binSize,
                    blockColumnCount, 0, false);

        }

        String getKey() {
            return "" + chr1Idx + "_" + chr2Idx;
        }

        void incrementCount(int pos1, int pos2, int frag1, int frag2, float score) throws IOException {

            for (MatrixZoomDataPP aZoomData : zoomData) {
                if (aZoomData.isFrag) {
                    aZoomData.incrementCount(frag1, frag2, score);
                } else {
                    aZoomData.incrementCount(pos1, pos2, score);
                }
            }
        }

        void parsingComplete() {
            for (MatrixZoomDataPP zd : zoomData) {
                if (zd != null) // fragment level could be null
                    zd.parsingComplete();
            }
        }

        int getChr1Idx() {
            return chr1Idx;
        }

        int getChr2Idx() {
            return chr2Idx;
        }

        MatrixZoomDataPP[] getZoomData() {
            return zoomData;
        }

    }

    /**
     * @author jrobinso
     * @since Aug 10, 2010
     */
    class MatrixZoomDataPP {

        final boolean isFrag;
        final Set<Integer> blockNumbers; // The only reason for this is to get a count
        final List<File> tmpFiles;
        private final Chromosome chr1; // Redundant, but convenient
        private final Chromosome chr2; // Redundant, but convenient
        private final int zoom;
        private final int binSize; // bin size in bp
        private final int blockBinCount; // block size in bins
        private final int blockColumnCount; // number of block columns
        private final LinkedHashMap<Integer, BlockPP> blocks;
        public long blockIndexPosition;
        private double sum = 0;
        private double cellCount = 0;
        private double percent5;
        private double percent95;

        /**
         * Representation of MatrixZoomData used for preprocessing
         *
         * @param chr1             first chromosome (x-axis)
         * @param chr2             second chromosome (y-axis)
         * @param binSize          size of each grid bin, in base pairs or fragments
         * @param blockColumnCount number of block columns
         * @param zoom             integer zoom (resolution) level index
         * @param isFrag           true if the bin unit is restriction fragments rather than base pairs
         */
        MatrixZoomDataPP(Chromosome chr1, Chromosome chr2, int binSize, int blockColumnCount, int zoom,
                boolean isFrag) {

            this.tmpFiles = new ArrayList<File>();
            this.blockNumbers = new HashSet<Integer>(1000);

            this.sum = 0;
            this.chr1 = chr1;
            this.chr2 = chr2;
            this.binSize = binSize;
            this.blockColumnCount = blockColumnCount;
            this.zoom = zoom;
            this.isFrag = isFrag;

            // Get length in proper units
            Chromosome longChr = chr1.getLength() > chr2.getLength() ? chr1 : chr2;
            int len = isFrag ? fragmentCalculation.getNumberFragments(longChr.getName()) : longChr.getLength();

            int nBinsX = len / binSize + 1;

            blockBinCount = nBinsX / blockColumnCount + 1;
            blocks = new LinkedHashMap<Integer, BlockPP>(blockColumnCount * blockColumnCount);
        }

        String getUnit() {
            return isFrag ? "FRAG" : "BP";
        }

        double getSum() {
            return sum;
        }

        public double getOccupiedCellCount() {
            return cellCount;
        }

        public double getPercent95() {
            return percent95;
        }

        public double getPercent5() {
            return percent5;
        }

        int getBinSize() {
            return binSize;
        }

        Chromosome getChr1() {
            return chr1;
        }

        Chromosome getChr2() {
            return chr2;
        }

        int getZoom() {
            return zoom;
        }

        int getBlockBinCount() {
            return blockBinCount;
        }

        int getBlockColumnCount() {
            return blockColumnCount;
        }

        Map<Integer, BlockPP> getBlocks() {
            return blocks;
        }

        /**
         * Increment the count for the bin represented by the GENOMIC position (pos1, pos2)
         */
        public void incrementCount(int pos1, int pos2, float score) throws IOException {

            sum += score;
            // Convert to proper units,  fragments or base-pairs

            if (pos1 < 0 || pos2 < 0)
                return;

            int xBin = pos1 / binSize;
            int yBin = pos2 / binSize;

            // Intra chromosome -- we'll store lower diagonal only
            if (chr1.equals(chr2)) {
                int b1 = Math.min(xBin, yBin);
                int b2 = Math.max(xBin, yBin);
                xBin = b1;
                yBin = b2;

                if (b1 != b2) {
                    sum += score; // <= count for mirror cell.
                }

                String evKey = (isFrag ? "FRAG_" : "BP_") + binSize;
                ExpectedValueCalculation ev = expectedValueCalculations.get(evKey);
                if (ev != null) {
                    ev.addDistance(chr1.getIndex(), xBin, yBin, score);
                }
            }

            // compute block number (first block is zero)
            int blockCol = xBin / blockBinCount;
            int blockRow = yBin / blockBinCount;
            int blockNumber = blockColumnCount * blockRow + blockCol;

            BlockPP block = blocks.get(blockNumber);
            if (block == null) {

                block = new BlockPP(blockNumber);
                blocks.put(blockNumber, block);
            }
            block.incrementCount(xBin, yBin, score);

            // If too many blocks write to tmp directory
            if (blocks.size() > 1000) {
                File tmpfile = tmpDir == null ? File.createTempFile("blocks", "bin")
                        : File.createTempFile("blocks", "bin", tmpDir);
                //System.out.println(chr1.getName() + "-" + chr2.getName() + " Dumping blocks to " + tmpfile.getAbsolutePath());
                dumpBlocks(tmpfile);
                tmpFiles.add(tmpfile);
                tmpfile.deleteOnExit();
            }
        }

        /**
         * Dump the blocks calculated so far to a temporary file
         *
         * @param file File to write to
         * @throws IOException
         */
        private void dumpBlocks(File file) throws IOException {
            LittleEndianOutputStream los = null;
            try {
                los = new LittleEndianOutputStream(new BufferedOutputStream(new FileOutputStream(file), 4194304));

                List<BlockPP> blockList = new ArrayList<BlockPP>(blocks.values());
                Collections.sort(blockList, new Comparator<BlockPP>() {
                    @Override
                    public int compare(BlockPP o1, BlockPP o2) {
                        return o1.getNumber() - o2.getNumber();
                    }
                });

                for (BlockPP b : blockList) {

                    // Remove from map
                    blocks.remove(b.getNumber());

                    int number = b.getNumber();
                    blockNumbers.add(number);

                    los.writeInt(number);
                    Map<Point, ContactCount> records = b.getContactRecordMap();

                    los.writeInt(records.size());
                    for (Map.Entry<Point, ContactCount> entry : records.entrySet()) {

                        Point point = entry.getKey();
                        ContactCount count = entry.getValue();

                        los.writeInt(point.x);
                        los.writeInt(point.y);
                        los.writeFloat(count.getCounts());
                    }
                }

                blocks.clear();

            } finally {
                if (los != null)
                    los.close();

            }
        }

        // Merge and write out blocks one at a time.
        private List<IndexEntry> mergeAndWriteBlocks() throws IOException {
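            // K-way merge: blocks still in memory plus blocks spilled to temp files
            // are consumed in ascending block-number order; blocks sharing a number
            // are merged, and each merged block is then compressed, written, and
            // added to the index.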
            DownsampledDoubleArrayList sampledData = new DownsampledDoubleArrayList(10000, 10000);

            List<BlockQueue> activeList = new ArrayList<BlockQueue>();

            // Initialize queues -- first whatever is left over in memory
            if (blocks.size() > 0) {
                BlockQueue bqInMem = new BlockQueueMem(blocks.values());
                activeList.add(bqInMem);
            }
            // Now from files
            for (File file : tmpFiles) {
                BlockQueue bq = new BlockQueueFB(file);
                if (bq.getBlock() != null) {
                    activeList.add(bq);
                }
            }

            List<IndexEntry> indexEntries = new ArrayList<IndexEntry>();

            if (activeList.size() == 0) {
                throw new RuntimeException(
                        "No reads in Hi-C contact matrices. This could be because the MAPQ filter is set too high (-q) or because all reads map to the same fragment.");
            }

            do {
                Collections.sort(activeList, new Comparator<BlockQueue>() {
                    @Override
                    public int compare(BlockQueue o1, BlockQueue o2) {
                        return o1.getBlock().getNumber() - o2.getBlock().getNumber();
                    }
                });

                BlockQueue topQueue = activeList.get(0);
                BlockPP currentBlock = topQueue.getBlock();
                topQueue.advance();
                int num = currentBlock.getNumber();

                for (int i = 1; i < activeList.size(); i++) {
                    BlockQueue blockQueue = activeList.get(i);
                    BlockPP block = blockQueue.getBlock();
                    if (block.getNumber() == num) {
                        currentBlock.merge(block);
                        blockQueue.advance();
                    }
                }

                Iterator<BlockQueue> iterator = activeList.iterator();
                while (iterator.hasNext()) {
                    if (iterator.next().getBlock() == null) {
                        iterator.remove();
                    }
                }

                // Output block
                long position = los.getWrittenCount();
                writeBlock(this, currentBlock, sampledData);
                int size = (int) (los.getWrittenCount() - position);

                indexEntries.add(new IndexEntry(num, position, size));

            } while (activeList.size() > 0);

            for (File f : tmpFiles) {
                boolean result = f.delete();
                if (!result) {
                    System.out.println("Error while deleting file");
                }
            }

            computeStats(sampledData);

            return indexEntries;
        }

        private void computeStats(DownsampledDoubleArrayList sampledData) {

            double[] data = sampledData.toArray();
            this.percent5 = StatUtils.percentile(data, 5);
            this.percent95 = StatUtils.percentile(data, 95);

        }

        public void parsingComplete() {
            // Add the block numbers still in memory
            for (BlockPP block : blocks.values()) {
                blockNumbers.add(block.getNumber());
            }
        }

        public void updateIndexPositions(List<IndexEntry> blockIndex) throws IOException {

            // Temporarily close output stream.  Remember position
            long losPos = los.getWrittenCount();
            los.close();

            RandomAccessFile raf = null;
            try {
                raf = new RandomAccessFile(outputFile, "rw");

                // Block indices
                long pos = blockIndexPosition;
                raf.getChannel().position(pos);

                // Write as little endian
                BufferedByteWriter buffer = new BufferedByteWriter();
                for (IndexEntry aBlockIndex : blockIndex) {
                    buffer.putInt(aBlockIndex.id);
                    buffer.putLong(aBlockIndex.position);
                    buffer.putInt(aBlockIndex.size);
                }
                raf.write(buffer.getBytes());

            } finally {

                if (raf != null)
                    raf.close();

                // Restore
                FileOutputStream fos = new FileOutputStream(outputFile, true);
                fos.getChannel().position(losPos);
                los = new LittleEndianOutputStream(new BufferedOutputStream(fos, HiCGlobals.bufferSize));
                los.setWrittenCount(losPos);

            }
        }
    }

}