it.crs4.seal.recab.HashSetVariantTable.java Source code

Java tutorial

Introduction

Here is the source code for it.crs4.seal.recab.HashSetVariantTable.java

Source

// Copyright (C) 2011-2012 CRS4.
//
// This file is part of Seal.
//
// Seal is free software: you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// Seal is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
// or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with Seal.  If not, see <http://www.gnu.org/licenses/>.

package it.crs4.seal.recab;

import it.crs4.seal.common.FormatException;
import it.crs4.seal.common.CutString;

import java.io.IOException;
import java.util.Map;
import java.util.Set;
import java.util.HashMap;
import java.util.HashSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

public class HashSetVariantTable implements VariantTable {
    private static final Log LOG = LogFactory.getLog(HashSetVariantTable.class);

    private static final int InitialCapacityPerChr = 400000;
    private static final float LoadFactor = 0.80f;

    /**
     * Main data structure.
     * We use a Map with one entry per contig/chromosome.
     * For each contig, we have a Set which stores all its SNP positions.
     */
    // XXX: save some memory with Integer as opposed to Long.  We'll be fine with
    // the human genome, but large genomes would be a problem.
    //
    // TODO:  Can we be more clever in the way we use store these things to save some memory?
    protected Map<String, Set<Integer>> data;

    public boolean isVariantLocation(String chr, long pos) {
        if (pos > Integer.MAX_VALUE)
            throw new RuntimeException("pos bigger than expected!  File a bug!!");

        Set<Integer> s = data.get(chr);
        if (s != null)
            return s.contains((int) pos);
        return false;
    }

    public void load(VariantReader reader) throws IOException, FormatException {
        data = new HashMap<String, Set<Integer>>(30); // initial capacity for ok for human genome plus a few extra contigs
        VariantRegion snp = new VariantRegion();
        long count = 0;

        while (reader.nextEntry(snp)) // snp is re-used
        {
            // col 1
            String chr = snp.getContigName();
            Set<Integer> s = data.get(chr);
            if (s == null) {
                s = new HashSet<Integer>(InitialCapacityPerChr, LoadFactor);
                data.put(chr, s);
            }

            int end = snp.getPosition() + snp.getLength();
            for (int pos = snp.getPosition(); pos < end; ++pos)
                s.add(pos);

            count += 1;
            if (LOG.isInfoEnabled()) {
                if (count % 1000000 == 0)
                    LOG.info("Loaded " + count);
            }
        }
        LOG.info("Loaded a total of " + count + " known variations");
    }

    public int size() {
        int sum = 0;
        if (data != null) {
            for (Set<Integer> s : data.values())
                sum += s.size();
        }

        return sum;
    }

    public Set<String> getContigs() {
        return data.keySet();
    }
}