savant.diff.DiffDataSource.java Source code

Java tutorial

Introduction

Here is the source code for savant.diff.DiffDataSource.java

Source

/**
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * This is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * This software is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this software; if not, write to the Free
 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
 */
package savant.diff;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import savant.api.adapter.BookmarkAdapter;
import savant.api.adapter.DataSourceAdapter;
import savant.api.adapter.RangeAdapter;
import savant.api.adapter.RecordFilterAdapter;
import savant.api.adapter.TrackAdapter;
import savant.api.data.ContinuousRecord;
import savant.api.data.DataFormat;
import savant.api.util.TrackUtils;
import savant.data.types.GenericContinuousRecord;
import savant.util.MiscUtils;
import savant.api.util.Resolution;

/**
 * The actual data-source implemented by the Diff plugin.  Takes two existing continuous data-sources
 * and creates a third data-source based on the difference between them.
 *
 * @author tarkvara
 */
public class DiffDataSource implements DataSourceAdapter<ContinuousRecord> {
    private static final Log LOG = LogFactory.getLog(DiffDataSource.class);

    /** Our two input data-sources. */
    DataSourceAdapter<? extends ContinuousRecord> inputA, inputB;

    private URI uri;

    /**
     * Construct a data-source for the given diff:// URI.
     */
    public DiffDataSource(URI uri) throws URISyntaxException {
        this.uri = uri;
    }

    /**
     * Set of references is whatever's common to both input tracks.
     */
    @Override
    public Set<String> getReferenceNames() {
        Set<String> result = new HashSet<String>();
        if (inputsAttached()) {
            // In a perfect world, we could just intersect the two sets and be done.
            // In practice, some tracks will store chromosome names like "chr1", "chr2", etc.,
            // while others will use "1", "2".  We reconcile these as best we can.
            Set<String> aRefs = inputA.getReferenceNames();
            Set<String> bRefs = inputB.getReferenceNames();
            for (String a : aRefs) {
                String goodA = MiscUtils.homogenizeSequence(a); // Reduce "chr1" to "1".
                String match = null;
                for (String b : bRefs) {
                    String goodB = MiscUtils.homogenizeSequence(b); // Reduce to "1" before comparing.
                    if (goodA.equals(goodB)) {
                        match = b;
                        break;
                    }
                }
                if (match != null) {
                    result.add(goodA);
                    bRefs.remove(match);
                }
            }
        }
        return result;
    }

    /**
     * The number of records we return will be based on the first input.  In some cases,
     * the second input may not happen to have a point at the expected location, so
     * we may have to interpolate.
     * 
     * The data-sources are expected to return records in order, so we can count on that
     * to simplify our task.
     */
    @Override
    public List<ContinuousRecord> getRecords(String ref, RangeAdapter range, Resolution res,
            RecordFilterAdapter filter) throws InterruptedException, IOException {
        List<ContinuousRecord> result = null;
        if (inputsAttached()) {
            List<? extends ContinuousRecord> aRecords = inputA.getRecords(ref, range, res, filter);
            List<? extends ContinuousRecord> bRecords = inputB.getRecords(ref, range, res, filter);

            result = new ArrayList<ContinuousRecord>(aRecords.size());
            int j = 0;
            ContinuousRecord recB = bRecords.get(j);
            for (int i = 0; i < aRecords.size(); i++) {
                ContinuousRecord recA = aRecords.get(i);
                int pos = recA.getPosition();

                // Figure out which record in B corresponds to A.
                while (recB.getPosition() < pos && j + 1 < bRecords.size()) {
                    j++;
                    recB = bRecords.get(j);
                }

                // For the purposes of this demonstration, we'll treat NaNs as zero.
                float value = recA.getValue();
                if (Float.isNaN(value)) {
                    value = 0.0f;
                }
                value -= interpolate(bRecords, j, pos);
                result.add(GenericContinuousRecord.valueOf(ref, pos, value));
            }
        }
        return result;
    }

    /**
     * Because the two input data-sources may be returning differing numbers of records,
     * we may have to interpolate.  Arbitrarily, we decide that inputA provides the
     * bench-mark and inputB gets interpolated.
     */
    private float interpolate(List<? extends ContinuousRecord> bRecords, int j, int pos) {
        float result = 0.0f;
        if (j < bRecords.size()) {
            ContinuousRecord recB = bRecords.get(j);

            if (recB.getPosition() == pos || j == 0) {
                // Simple case.  We have a data-point at the exact position.
                result = recB.getValue();
            } else if (recB.getPosition() > pos) {
                // If we got here, recB is further on in the chromosome than pos, so we need to interpolate with the preceding data-point.
                ContinuousRecord prevRecB = bRecords.get(j - 1);
                float weight = (float) (pos - prevRecB.getPosition())
                        / (recB.getPosition() - prevRecB.getPosition());
                result = prevRecB.getValue() * weight + recB.getValue() * (1.0f - weight);
            }
        }
        return Float.isNaN(result) ? 0.0f : result;
    }

    /**
     * Get the URI corresponding to the difference between our input tracks.
     * @return
     */
    @Override
    public URI getURI() {
        return uri;
    }

    /**
     * @return
     */
    @Override
    public String getName() {
        return uri.toString();
    }

    /**
     * No extra clean-up to be done.
     */
    @Override
    public void close() {
    }

    /**
     * @return <code>CONTINUOUS</code>
     */
    @Override
    public DataFormat getDataFormat() {
        return DataFormat.CONTINUOUS;
    }

    /**
     * @return "Reference", "Position", "Value"
     */
    @Override
    public String[] getColumnNames() {
        return GenericContinuousRecord.COLUMN_NAMES;
    }

    /**
     * A do-nothing stub because continuous tracks don't have any strings to look up.
     */
    @Override
    public void loadDictionary() {
    }

    /**
     * A do-nothing stub because continuous tracks don't have any strings to look up.
     */
    @Override
    public List<BookmarkAdapter> lookup(String string) {
        return null;
    }

    /**
     * Utility method to calculate the URI which will be used to specify the difference between the two given tracks.
     */
    public static URI getDiffURI(TrackAdapter trackA, TrackAdapter trackB) throws URISyntaxException {
        return new URI("diff://(" + trackA.getDataSource().getURI() + ";" + trackB.getDataSource().getURI() + ")");
    }

    /**
     * Search through our existing tracks to find the ones which match our input URIs.
     * Because we can't rely on Savant loading the tracks in any known order, we may have
     * to call this method repeatedly before the inputs get hooked up.
     */
    private boolean inputsAttached() {
        if (inputA != null && inputB != null) {
            return true;
        }
        try {
            String uriString = uri.getRawSchemeSpecificPart().substring(3); // Trim off the initial "diff://("
            uriString = uriString.substring(0, uriString.length() - 1); // Trim off the final ")"
            int delimiterPos = uriString.indexOf(';');
            if (delimiterPos > 0) {
                URI uriA = new URI(uriString.substring(0, delimiterPos));
                URI uriB = new URI(uriString.substring(delimiterPos + 1));

                TrackAdapter[] availableTracks = TrackUtils.getTracks(DataFormat.CONTINUOUS);
                for (TrackAdapter t : availableTracks) {
                    URI u = t.getDataSource().getURI();
                    if (u.equals(uriA)) {
                        inputA = t.getDataSource();
                    } else if (u.equals(uriB)) {
                        inputB = t.getDataSource();
                    }
                }
            }
        } catch (URISyntaxException x) {
            LOG.error(String.format("Unable to parse %s as a valid URI.", x));
        }
        return inputA != null && inputB != null;
    }
}