ubic.gemma.loader.genome.ProbeSequenceParser.java Source code

Introduction

Here is the source code for ubic.gemma.loader.genome.ProbeSequenceParser.java
Source

/*
 * The Gemma project
 * 
 * Copyright (c) 2008 University of British Columbia
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package ubic.gemma.loader.genome;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.time.StopWatch;
import ubic.gemma.loader.util.parser.BasicLineMapParser;
import ubic.gemma.model.genome.biosequence.BioSequence;

/**
 * Parse probes from a tabular file. First column = probe id; Second column = sequence name; Third column = sequence.
 * <p>
 * This is designed primarily to deal with oligonucleotide arrays that have sequence names different from the probe
 * names.
 * 
 * @author paul
 * @version $Id: ProbeSequenceParser.java,v 1.13 2012/04/28 23:48:53 paul Exp $
 */
public class ProbeSequenceParser extends BasicLineMapParser<String, BioSequence> {

    /**
     * Matches every character that is not a letter in the range a-y / A-Y (digits, punctuation, whitespace, 'z'/'Z',
     * ...). Compiled once so that the expression is not recompiled for every input line.
     * <p>
     * Rarely there are extra junk characters in the sequence column. See bug 2719.
     */
    private static final Pattern NON_SEQUENCE_CHARACTERS = Pattern.compile("[^a-yA-Y]");

    /**
     * Parsed sequences, keyed by probe id (first column of the input).
     */
    private final Map<String, BioSequence> results = new HashMap<String, BioSequence>();

    /**
     * Parse a single whitespace-delimited line of the form <code>probeId sequenceName sequence</code> and store the
     * resulting sequence under the probe id.
     * <p>
     * Fields are split with {@link StringUtils#splitPreserveAllTokens(String)}, so adjacent separators produce empty
     * fields and therefore change the field count.
     * 
     * @param line one line of input
     * @return the new BioSequence, or null if the line is null/blank, the probe id is blank, or nothing is left of the
     *         sequence after junk characters are stripped.
     * @throws RuntimeException if the line starts with '&gt;' (FASTA header)
     * @throws IllegalArgumentException if the line does not have exactly three fields
     */
    @Override
    public BioSequence parseOneLine(String line) {

        // Checked first so that a null line is skipped instead of failing on startsWith below.
        if (StringUtils.isBlank(line)) {
            return null;
        }

        if (line.startsWith(">")) {
            throw new RuntimeException(
                    "FASTA format not supported - please use the tabular format for oligonucleotides");
        }

        // A non-blank line always yields at least one token, so no empty-array check is needed.
        String[] fields = StringUtils.splitPreserveAllTokens(line);

        if (fields.length != 3) {
            throw new IllegalArgumentException(
                    "Expected 3 fields: probe name, sequence name, sequence; line=" + line);
        }

        String probeId = fields[0].trim();

        if (StringUtils.isBlank(probeId)) {
            return null;
        }

        String sequenceName = fields[1].trim();

        // Nucleotide codes that may legitimately appear:
        // A Adenine
        // C Cytosine
        // G Guanine
        // T Thymine
        // U Uracil
        // R Purine (A or G)
        // Y Pyrimidine (C, T, or U)
        // M C or A
        // K T, U, or G
        // W T, U, or A
        // S C or G
        // B C, T, U, or G (not A)
        // D A, T, U, or G (not C)
        // H A, T, U, or C (not G)
        // V A, C, or G (not T, not U)
        // N Any base (A, C, G, T, or U)
        // Note that the filter is looser than this list: it keeps any letter from A through Y, in either case.
        String sequence = NON_SEQUENCE_CHARACTERS.matcher(fields[2].trim()).replaceAll("");

        if (StringUtils.isBlank(sequence)) {
            return null;
        }

        BioSequence seq = BioSequence.Factory.newInstance();
        seq.setSequence(sequence);
        seq.setLength((long) sequence.length());
        seq.setIsCircular(false);
        seq.setIsApproximateLength(false);
        seq.setName(sequenceName);

        // A repeated probe id is only reported; the later sequence replaces the earlier one in the map.
        if (this.results.containsKey(probeId)) {
            log.warn("Duplicated probe id: " + probeId);
        }
        put(probeId, seq);

        return seq;
    }

    /**
     * Read the stream line by line, passing each line to {@link #parseOneLine(String)}. Progress is logged
     * periodically, and a summary is logged at the end. The reader wrapping the stream is closed when this method
     * exits, whether parsing completed or an exception was thrown.
     * <p>
     * NOTE(review): the stream is decoded with the platform default charset - confirm this is intended.
     * 
     * @param is stream to read from; must not be null
     * @throws IllegalArgumentException if the stream is null
     * @throws IOException if reading from or closing the stream fails
     */
    @Override
    public void parse(InputStream is) throws IOException {

        if (is == null) {
            throw new IllegalArgumentException("InputStream was null");
        }

        BufferedReader br = new BufferedReader(new InputStreamReader(is));
        try {
            StopWatch timer = new StopWatch();
            timer.start();
            int nullLines = 0;
            int linesParsed = 0;
            String line;
            while ((line = br.readLine()) != null) {

                BioSequence newItem = parseOneLine(line);

                // The two thresholds are not declared in this class (presumably inherited from the superclass).
                if (++linesParsed % PARSE_ALERT_FREQUENCY == 0 && timer.getTime() > PARSE_ALERT_TIME_FREQUENCY_MS) {
                    String message = "Parsed " + linesParsed + " lines ";
                    log.info(message);
                    timer.reset();
                    timer.start();
                }

                if (newItem == null) {
                    nullLines++;
                }
            }
            log.info("Parsed " + linesParsed + " lines. "
                    + (nullLines > 0 ? nullLines + " yielded no parse result (they may have been filtered)." : ""));
        } finally {
            // Release the reader even when a malformed line aborts parsing.
            br.close();
        }
    }

    /**
     * Not supported: the key (probe id) is taken from the input line in {@link #parseOneLine(String)}, not derived
     * from the parsed BioSequence.
     * 
     * @throws UnsupportedOperationException always
     */
    @Override
    protected String getKey(BioSequence newItem) {
        throw new UnsupportedOperationException();
    }

    /**
     * @param key probe id
     * @return the sequence stored for the probe id, or null if there is none.
     */
    @Override
    public BioSequence get(String key) {
        return results.get(key);
    }

    /**
     * @return the parsed sequences; this is a live view of the internal map's values, not a copy.
     */
    @Override
    public Collection<BioSequence> getResults() {
        return results.values();
    }

    /**
     * Store a sequence under the given probe id, replacing any existing entry for that id.
     */
    @Override
    protected void put(String key, BioSequence value) {
        results.put(key, value);
    }

    /**
     * @param key probe id
     * @return true if a sequence has been stored for the probe id.
     */
    @Override
    public boolean containsKey(String key) {
        return results.containsKey(key);
    }

    /**
     * @return the probe ids seen so far; this is a live view of the internal map's keys, not a copy.
     */
    @Override
    public Collection<String> getKeySet() {
        return results.keySet();
    }
}