Java tutorial
/* * Copyright (C) 2009-2010 Institute for Computational Biomedicine, * Weill Medical College of Cornell University * * This file is part of the Goby IO API. * * The Goby IO API is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The Goby IO API is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with the Goby IO API. If not, see <http://www.gnu.org/licenses/>. */ package edu.cornell.med.icb.goby.alignments; import com.google.protobuf.CodedInputStream; import it.unimi.dsi.fastutil.ints.*; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.util.Iterator; import java.util.List; import java.util.zip.GZIPInputStream; /** * Reads alignments too many hits data structure written with * {@link edu.cornell.med.icb.goby.alignments.AlignmentTooManyHitsWriter}. * * @author Fabien Campagne * Date: Apr 30, 2009 * Time: 6:36:04 PM */ public class AlignmentTooManyHitsReader { /** * Used to log debug and informational messages. */ private static final Log LOG = LogFactory.getLog(AlignmentTooManyHitsReader.class); /** * A map from query index to the number of hits at that index. */ private Int2IntMap queryIndex2NumHits = new Int2IntAVLTreeMap(); /** * A map from query index to depth/length of match. */ private Int2IntMap queryIndex2Depth = new Int2IntAVLTreeMap(); /** * The threshold used by the aligner to determine that a query is ambiguous and * should be dropped. */ private int alignerThreshold; private boolean closed; public AlignmentTooManyHitsReader(final String basename) throws IOException { final String filename = basename + ".tmh"; final File optionalFile = new File(filename); InputStream tmhStream = null; try { if (optionalFile.exists()) { try { tmhStream = new GZIPInputStream(new FileInputStream(optionalFile)); } catch (IOException e) { // try not compressed for compatibility with 1.6-: LOG.trace("falling back to legacy 1.6- uncompressed TMH."); tmhStream = new FileInputStream(optionalFile); } // accept very large too many hits messages, since these may describe more than 60 million reads: final CodedInputStream codedInput = CodedInputStream.newInstance(tmhStream); codedInput.setSizeLimit(Integer.MAX_VALUE); final Alignments.AlignmentTooManyHits tmh = Alignments.AlignmentTooManyHits.parseFrom(codedInput); queryIndex2NumHits.defaultReturnValue(-1); queryIndex2Depth.defaultReturnValue(-1); final List<Alignments.AmbiguousLocation> hitsList = tmh.getHitsList(); final Iterator<Alignments.AmbiguousLocation> iterator = hitsList.iterator(); while (iterator.hasNext()) { Alignments.AmbiguousLocation hit = iterator.next(); queryIndex2NumHits.put(hit.getQueryIndex(), hit.getAtLeastNumberOfHits()); if (hit.hasLengthOfMatch()) { queryIndex2Depth.put(hit.getQueryIndex(), hit.getLengthOfMatch()); } } this.alignerThreshold = tmh.getAlignerThreshold(); } else { // the file does not exist. Log this fact, and act as if no query had too many hits. LOG.info("basename " + optionalFile + " has no 'too many hits' information (" + basename + ".tmh does not exist)." + " Assuming no queries have too many hits."); } } finally { if (tmhStream != null) { tmhStream.close(); } } } /** * The number of hits against the reference that the aligner considered was too many to report. * * @return alignerThreshold. */ public int getAlignerThreshold() { return alignerThreshold; } /** * Returns the number 'at least number of hits' reported by the alignment tool against * the reference. * * @param queryIndex The index of the query sequence. * @return The number of hits that triggered membership in the too many hits list. * The query may hit more locations than reported here, since some alignment * tools will just drop queries that match above a threshold and stop counting. * This number can be >=k. */ public final int getNumberOfHits(final int queryIndex) { assert !closed : "TMH reader was already closed."; return queryIndex2NumHits.get(queryIndex); } /** * Returns the length of match that resulted in the number of occurence againt the specific * reference sequence(s) this alignment was produced against. * * @param queryIndex The index of the query sequence. * @return The length of the longest match between the query and the reference sequence(s) * that yielded the number of hits. */ public final int getLengthOfMatch(final int queryIndex) { assert !closed : "TMH reader was already closed."; return queryIndex2Depth.get(queryIndex); } public final IntSet getQueryIndices() { assert !closed : "TMH reader was already closed."; return queryIndex2NumHits.keySet(); } /** * Returns true if the query was considered ambiguous by the alignment tool. * * @param queryIndex The index of the query sequence. * @return True or false. */ public boolean isQueryAmbiguous(final int queryIndex) { assert !closed : "TMH reader was already closed."; return queryIndex2NumHits.containsKey(queryIndex); } /** * Returns true if the query matched at least k number of locations in the reference. * However, if k >= alignerThreshold, then true is returned regardless. * <p/> * TODO : provide rationale for logic of k >= alignerThreshold behaviour * <p/> * TODO : discuss removal of this method as result is logically equivalent to isQueryAmbiguous(queryIndex) * TODO : given that tmhWriter.append() guarantees numHits > threshold * * @param queryIndex The index of the query sequence. * @param k The parameter k. * @return True or false. */ public final boolean isQueryAmbiguous(final int queryIndex, final int k) { assert !closed : "TMH reader was already closed."; final int atLeastNumberOfHits = queryIndex2NumHits.get(queryIndex); if (atLeastNumberOfHits == -1) { return false; } if (k >= alignerThreshold) { // since k is larger than the aligner threshold, we have to assume the query is // ambiguous at k, this is the safe choice. return true; } else { return (atLeastNumberOfHits >= k); } } /** * Returns true if the query matched at least k number of locations in the reference, at the * specified match length or less. * <p/> * TODO : discuss removal of 2nd argument from this method as result does not depend on its value * TODO : see comments at isQueryAmbiguous above * * @param queryIndex The index of the query sequence. * @param k The parameter k. * @param matchLength The match length. * @return True or false. */ public final boolean isQueryAmbiguous(final int queryIndex, final int k, final int matchLength) { if (matchLength < getLengthOfMatch(queryIndex)) { return true; } else { return isQueryAmbiguous(queryIndex, k); } } public void close() { this.queryIndex2Depth.clear(); this.queryIndex2NumHits.clear(); this.queryIndex2Depth = null; this.queryIndex2NumHits = null; closed = true; } }