org.mskcc.shenkers.control.track.gene.GTFGeneModelProvider.java Source code

Java tutorial

Introduction

Here is the source code for org.mskcc.shenkers.control.track.gene.GTFGeneModelProvider.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package org.mskcc.shenkers.control.track.gene;

import org.mskcc.shenkers.data.interval.CoordinateOrderComparator;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableListMultimap;
import com.google.common.collect.Multimaps;
import com.google.common.collect.Range;
import com.google.common.collect.RangeSet;
import htsjdk.samtools.tabix.BgzfUtil;
import htsjdk.samtools.tabix.ChainContext;
import htsjdk.samtools.util.BlockCompressedOutputStream;
import htsjdk.tribble.AbstractFeatureReader;
import htsjdk.tribble.CloseableTribbleIterator;
import htsjdk.tribble.TabixFeatureReader;
import htsjdk.tribble.index.tabix.TabixFormat;
import htsjdk.tribble.readers.LineIterator;
import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.logging.Level;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import javafx.scene.layout.Pane;
import javafx.util.Pair;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.mskcc.shenkers.control.track.DomainFlippable;
import org.mskcc.shenkers.view.GeneViewBuilder;

/**
 *
 * @author sol
 */
public class GTFGeneModelProvider<T extends Pane & DomainFlippable> implements GeneModelProvider {

    private static final Logger logger = LogManager.getLogger();

    GTFCodec codec = new GTFCodec();

    TabixFeatureReader<GTFContext, LineIterator> features;

    public GTFGeneModelProvider(String uri, boolean force_convert, boolean force_index) throws IOException {

        File chain_gff_bgz = new File(uri.concat(".bgz"));
        File chain_gff_bgz_tbi = new File(uri.concat(".bgz.tbi"));

        logger.info("converting {} to {}", uri, chain_gff_bgz.getAbsoluteFile());
        logger.info("{} already exists? {}", chain_gff_bgz.getAbsoluteFile(), chain_gff_bgz.exists());
        if (force_convert || !chain_gff_bgz.exists()) {
            logger.info("bgzipping...");
            createGtfBgz(new File(uri), chain_gff_bgz);
        }
        logger.info("{} already exists? {}", chain_gff_bgz_tbi.getAbsoluteFile(), chain_gff_bgz_tbi.exists());
        if (force_index || !chain_gff_bgz_tbi.exists()) {
            logger.info("creating tabix index");
            BgzfUtil.createTabixIndex(chain_gff_bgz, codec::decode, TabixFormat.GFF);
        }
        features = new TabixFeatureReader(chain_gff_bgz.getAbsolutePath(), codec);
    }

    @Override
    public Iterable<GeneModel> query(String chr, int start, int end) {
        CloseableTribbleIterator<GTFContext> query = null;
        try {
            query = features.query(chr, start, end);

            Stream<GTFContext> filter = StreamSupport
                    .stream(Spliterators.spliteratorUnknownSize(query, Spliterator.CONCURRENT), true)
                    .filter(c -> c.getTranscriptId() != null);

            ImmutableListMultimap<String, GTFContext> transcript_id_multimap = Multimaps.index(filter.iterator(),
                    GTFContext::getTranscriptId);

            Stream<GeneModel> map = transcript_id_multimap.keySet().stream().map(key -> {
                System.out.println(key);
                ImmutableListMultimap<String, GTFContext> transcript_features = Multimaps
                        .index(transcript_id_multimap.get(key), GTFContext::getFeature);
                Map<String, RangeSet<Integer>> collect = transcript_features.keySet().stream()
                        .collect(Collectors.toMap(s -> s, feature -> {
                            return transcript_features.get(feature).stream()
                                    .map(c -> Range.closed(c.getStart(), c.getEnd()))
                                    .collect(new RangeSetCollector());
                        }));

                GeneModel model = new GeneModel(collect.get("transcript").span(),
                        Optional.ofNullable(collect.get("exon")),
                        Optional.ofNullable(collect.get("CDS")).map(c -> c.span()));
                return model;
            });

            return map::iterator;
        } catch (IOException ex) {
            logger.error("exception reading gene models ", ex);
            throw new RuntimeException(ex);
        } finally {
            query.close();
        }
    }

    private void createGtfBgz(File gtf_file, File gtf_bgz_file) throws IOException {
        logger.info("reading {}", gtf_file.getAbsolutePath());
        AbstractFeatureReader<GTFContext, LineIterator> afr = AbstractFeatureReader
                .getFeatureReader(gtf_file.getAbsolutePath(), codec, false);
        CloseableTribbleIterator<GTFContext> iterator = afr.iterator();
        List<GTFContext> gtf = new ArrayList<>();
        while (iterator.hasNext()) {
            GTFContext next = iterator.next();
            gtf.add(next);
        }
        ImmutableListMultimap<String, GTFContext> transcript_id_multimap = Multimaps.index(gtf.iterator(),
                GTFContext::getTranscriptId);

        logger.info("adding transcript ranges");
        gtf.addAll(transcript_id_multimap.keySet().stream().map(key -> {
            System.out.println(key);
            ImmutableList<GTFContext> contexts = transcript_id_multimap.get(key);
            Range<Integer> span = contexts.stream().map(c -> Range.closed(c.getStart(), c.getEnd()))
                    .collect(new RangeSetCollector()).span();

            GTFContext context = new GTFContext(contexts.get(0).getChr(), span.lowerEndpoint(),
                    span.upperEndpoint());
            context.setFeature("transcript");
            context.setFrame(".");
            context.setName(".");
            context.setScore(".");
            context.setSource(".");
            context.setStrand('.');
            context.setAttributes(String.format("transcript_id \"%s\";", key));
            return context;
        }).collect(Collectors.toList()));

        logger.info("sorting");
        Collections.sort(gtf, new CoordinateOrderComparator());
        logger.info("writing to compressed output stream");
        BlockCompressedOutputStream os = new BlockCompressedOutputStream(gtf_bgz_file);
        Writer w = new OutputStreamWriter(os);
        for (GTFContext feature : gtf) {
            w.write(codec.encodeToString(feature));
        }
        w.close();
    }

    /*
     GTFCodec codec = new GTFCodec();
     AbstractFeatureReader<GTFContext, LineIterator> afr = AbstractFeatureReader.getFeatureReader("/home/sol/data/gene_models/Drosophila_melanogaster.BDGP5.69.gtf", codec, false);
     CloseableTribbleIterator<GTFContext> iterator = afr.iterator();
     List<GTFContext> gtf = new ArrayList<>();
     while (iterator.hasNext()) {
     GTFContext next = iterator.next();
     gtf.add(next);
     //            System.out.println(
     //            codec.encodeToString(next));
     }
     System.out.println("sorting");
     Collections.sort(gtf, new CoordinateOrderComparator());
     System.out.println("done, bgzipping");
     File file_bgz = new File("test.gtf.gz");
     BlockCompressedOutputStream os = new BlockCompressedOutputStream(file_bgz);
     Writer w = new OutputStreamWriter(os);
     for (GTFContext feature : gtf) {
     w.write(codec.encodeToString(feature));
     }
     w.close();
     System.out.println("done, indexing");
     BgzfUtil.createTabixIndex(file_bgz, codec::decode, TabixFormat.GFF);
     System.out.println("done, querying");
     AbstractFeatureReader<GTFContext, LineIterator> featureReader = AbstractFeatureReader.getFeatureReader(file_bgz.getAbsolutePath(), codec, true);
     //        3R:1050932-1065576
     //        chr3R:1058118-1063042
     int start = 1058118;
     int end = 1063042;
     CloseableTribbleIterator<GTFContext> query = featureReader.query("chr3R", start, end);
     Pattern p = Pattern.compile("([^ ]+) \"([^\"]*)\"");
        
     Stream<GTFContext> filter = StreamSupport.stream(Spliterators.spliteratorUnknownSize(query, Spliterator.CONCURRENT), true)
     .filter((c) -> {
     System.out.println(c);
     System.out.println(codec.encodeToString(c));
     String attributes = c.getAttributes();
     String[] split = attributes.split(";");
     Set<String> attributeKeys = Stream.of(split).map(str -> {
     Matcher matcher = p.matcher(str);
     System.out.println("matching: " + str);
     matcher.find();
     return matcher.group(1);
     }
     ).collect(Collectors.toSet());
     return attributeKeys.contains("transcript_id");
     });
        
     ImmutableListMultimap<String, GTFContext> transcript_id_multimap = Multimaps.index(filter.iterator(), c -> {
     String attributes = c.getAttributes();
     String[] split = attributes.split(";");
     Map<String, String> attributeMap = Stream.of(split).map(str -> {
     System.out.println("mapping: " + str);
     Matcher matcher = p.matcher(str);
     matcher.find();
     return new String[]{matcher.group(1), matcher.group(2)};
     }
     ).collect(Collectors.toMap(s -> s[0], s -> s[1]));
     return attributeMap.get("transcript_id");
     });
        
     // write to bgzipped file
     // index file
     // query file
     // aggregate by transcript id
     // create separate gene model for each transcript
     // stack into stacked interval view
     class RangeSetCollector implements Collector<Range<Integer>, RangeSet<Integer>, RangeSet<Integer>> {
        
     @Override
     public Supplier<RangeSet<Integer>> supplier() {
     return () -> TreeRangeSet.create();
     }
        
     @Override
     public BiConsumer<RangeSet<Integer>, Range<Integer>> accumulator() {
     return (set, range) -> set.add(range);
     }
        
     @Override
     public BinaryOperator<RangeSet<Integer>> combiner() {
     return (set1, set2) -> {
     set1.addAll(set2);
     return set1;
     };
     }
        
     @Override
     public Function<RangeSet<Integer>, RangeSet<Integer>> finisher() {
     return set -> set;
     }
        
     @Override
     public Set<Characteristics> characteristics() {
     return EnumSet.of(Characteristics.IDENTITY_FINISH, Characteristics.UNORDERED);
     }
        
     }
        
     List<Pair<Integer, Integer>> range = new ArrayList<>();
     List<T> nodes = new ArrayList<>();
     for (String key : transcript_id_multimap.keySet()) {
     System.out.println(key);
     ImmutableListMultimap<String, GTFContext> transcript_features = Multimaps.index(transcript_id_multimap.get(key), GTFContext::getFeature);
     Map<String, RangeSet<Integer>> collect = transcript_features.keySet().stream().collect(Collectors.toMap(s -> s, feature -> {
     return transcript_features.get(feature)
     .stream()
     .map(c -> Range.closed(c.getStart(), c.getEnd()))
     .collect(new RangeSetCollector());
     }));
        
     Range<Integer> span = collect.get("exon").span();
     range.add(new Pair(span.lowerEndpoint(), span.upperEndpoint()));
     System.out.println("span: " + span);
     T view = (T) new GeneViewBuilder(collect.get("exon"), Optional.of(collect.get("CDS")).map(c -> c.span())).getView(start, end);
     nodes.add(view);
        
     }
     */
}