contrail.stages.GraphToFasta.java Source code

Java tutorial

Introduction

Here is the source code for contrail.stages.GraphToFasta.java

Source

/**
 * Copyright 2012 Google Inc. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
// Author: Jeremy Lewi (jeremy@lewi.us)
package contrail.stages;

import contrail.graph.GraphNode;
import contrail.graph.GraphNodeData;

import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

import org.apache.avro.mapred.AvroInputFormat;
import org.apache.avro.mapred.AvroJob;
import org.apache.avro.mapred.AvroWrapper;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

/**
 * Convert the graph to a FASTA file.
 *
 * <p>Runs a map-only job which reads AVRO {@code GraphNodeData} records and
 * writes, for every node, a two line FASTA record as plain text: a header line
 * made of {@code >} followed by the node id, and a line with the node sequence.
 */
public class GraphToFasta extends Stage {
    private static final Logger sLogger = Logger.getLogger(GraphToFasta.class);

    /**
     * Get the parameters used by this stage.
     *
     * @return an unmodifiable map, keyed by parameter name, holding the
     *     definitions of the parent class plus the input/output path options.
     */
    @Override
    protected Map<String, ParameterDefinition> createParameterDefinitions() {
        Map<String, ParameterDefinition> defs = new HashMap<String, ParameterDefinition>();

        defs.putAll(super.createParameterDefinitions());

        for (ParameterDefinition def : ContrailParameters.getInputOutputPathOptions()) {
            defs.put(def.getName(), def);
        }
        return Collections.unmodifiableMap(defs);
    }

    /**
     * Mapper for converting the AVRO records into FASTA format.
     *
     * We use a regular mapper not an AVRO mapper because the output is not avro.
     * The class keeps its historical "Fastq" name so existing references keep
     * working; the records it emits are FASTA.
     */
    public static class GraphToFastqMapper extends MapReduceBase
            implements Mapper<AvroWrapper<GraphNodeData>, NullWritable, Text, NullWritable> {
        /** Character which introduces the header line of a FASTA record. */
        private static final String FASTA_HEADER_PREFIX = ">";

        // Both objects are allocated once in configure() and reused for every
        // record passed to map().
        private Text textOutput;
        private GraphNode node;

        /**
         * Allocate the objects reused across calls to map.
         *
         * @param job the job configuration; not read by this mapper.
         */
        @Override
        public void configure(JobConf job) {
            textOutput = new Text();
            node = new GraphNode();
        }

        /**
         * Emit one FASTA record for the given node.
         *
         * Each entry in the FASTA file is 2 lines of text: the header line
         * (">" followed by the node id) and the sequence.
         *
         * @param nodeData wrapper around the AVRO record for the node.
         * @param inputValue unused.
         * @param output collector receiving the record text as the key and a
         *     NullWritable as the value.
         * @param reporter unused.
         * @throws IOException if the collector fails to accept the record.
         */
        @Override
        public void map(AvroWrapper<GraphNodeData> nodeData, NullWritable inputValue,
                OutputCollector<Text, NullWritable> output, Reporter reporter) throws IOException {
            node.setData(nodeData.datum());
            String header = FASTA_HEADER_PREFIX + node.getNodeId();
            String sequence = node.getSequence().toString();
            textOutput.set(header + "\n" + sequence);
            output.collect(textOutput, NullWritable.get());
        }
    }

    /**
     * Configure and run the conversion job.
     *
     * <p>The paths are read from the "inputpath" and "outputpath" stage options.
     * If the "writeconfig" option is present the job configuration is handed to
     * {@code writeJobConfig} and no job is run. Otherwise any existing output
     * path is deleted recursively, the job is run to completion and the elapsed
     * time is printed to standard output.
     *
     * @return the completed job, or null when only the configuration was written.
     * @throws Exception if the file system cannot be accessed or the job fails.
     */
    @Override
    public RunningJob runJob() throws Exception {
        String inputPath = (String) stage_options.get("inputpath");
        String outputPath = (String) stage_options.get("outputpath");

        sLogger.info(" - inputpath: " + inputPath);
        sLogger.info(" - outputpath: " + outputPath);

        JobConf conf = new JobConf(GraphToFasta.class);

        AvroJob.setInputSchema(conf, GraphNodeData.SCHEMA$);

        initializeJobConfiguration(conf);

        Path out_path = new Path(outputPath);
        FileInputFormat.addInputPath(conf, new Path(inputPath));
        FileOutputFormat.setOutputPath(conf, out_path);
        conf.setInputFormat(AvroInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        // The mapper emits (Text, NullWritable). Declare exactly those types,
        // both for the map output and for the job output, since with zero
        // reducers the map output is the job output.
        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(NullWritable.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(NullWritable.class);

        // Make it mapper only.
        conf.setNumReduceTasks(0);
        conf.setMapperClass(GraphToFastqMapper.class);

        if (stage_options.containsKey("writeconfig")) {
            writeJobConfig(conf);
            return null;
        }

        // Delete the output directory if it exists already
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(out_path)) {
            // TODO(jlewi): We should only delete an existing directory
            // if explicitly told to do so.
            sLogger.info("Deleting output path: " + out_path.toString() + " " + "because it already exists.");
            fs.delete(out_path, true);
        }

        long starttime = System.currentTimeMillis();
        RunningJob result = JobClient.runJob(conf);
        long endtime = System.currentTimeMillis();

        float diff = (float) ((endtime - starttime) / 1000.0);

        System.out.println("Runtime: " + diff + " s");
        return result;
    }

    /**
     * Command line entry point; runs the stage through ToolRunner and exits
     * with the status it returns.
     *
     * @param args command line arguments forwarded to ToolRunner.
     * @throws Exception if the stage fails.
     */
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new GraphToFasta(), args);
        System.exit(res);
    }
}