org.mitre.ccv.mapred.GenerateFeatureVectors.java Source code

Introduction

Here is the source code for org.mitre.ccv.mapred.GenerateFeatureVectors.java

Source

/**
 * Created on April 3, 2009.
 *
 * Copyright 2010- The MITRE Corporation. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not 
 * use this file except in compliance with the License. You may obtain a copy of 
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software 
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 
 * License for the specific language governing permissions and limitations under
 * the License.
 *
 * $Id$
 */
package org.mitre.ccv.mapred;

import java.io.FileInputStream;
import java.io.IOException;
import java.net.URI;
import java.net.URL;
import java.net.URLClassLoader;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.Iterator;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;

import org.apache.hadoop.util.ToolRunner;
import org.mitre.ccv.mapred.io.CompositionVectorKey;
import org.mitre.ccv.mapred.io.CompositionVectorWritable;
import org.mitre.ccv.mapred.io.KmerEntropyPairWritable;

import org.mitre.la.SparseVector;
import org.mitre.la.VectorLengthException;
import org.mitre.la.mapred.io.SparseVectorWritable;
import org.mitre.mapred.fs.FileUtils;

/**
 * Map-Reduce class that generates feature vectors as {@link SparseVectorWritable}s.
 *
 * <P>This can use a given (or serialized) {@link ArrayList} of k-mers to limit what is added to the vector.
 * If <B>NO</B> sample contains a k-mer in the list, that k-mer <B>WILL BE</B> included in the final vector anyway.
 *
 * @author Marc Colosimo
 */
public class GenerateFeatureVectors extends Configured implements Tool {

    private static final Log LOG = LogFactory.getLog(GenerateFeatureVectors.class);
    private static final String KMER_LIST = "ccv.mapred.feature.vector.kmer.listfile";
    private static final String VECTOR_CARDINALITY = "ccv.mapred.feature.vector.cardinality";

    /**
     * Returns a read-only {@link MappedByteBuffer}.
     *
     * @param input full <b>local</b> path of the file to map
     * @return a read-only buffer over the entire file contents
     * @throws java.io.IOException
     */
    public static MappedByteBuffer getMappedByteBuffer(String input) throws IOException {
        FileInputStream fins = new FileInputStream(input);
        FileChannel channel = fins.getChannel();
        return channel.map(FileChannel.MapMode.READ_ONLY, 0, (int) channel.size());
    }
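
    /*
     * A minimal usage sketch for getMappedByteBuffer() (this mirrors how
     * configure() and map() below consume the buffer; the local path here
     * is hypothetical):
     *
     *   MappedByteBuffer buf = getMappedByteBuffer("/tmp/kmer_list_local");
     *   buf.rewind();                          // start at the beginning
     *   KmerEntropyPairWritable w = new KmerEntropyPairWritable();
     *   while (buf.hasRemaining()) {
     *       w.readFields(buf);                 // next k-mer/entropy pair
     *   }
     */

    /**
     * Mapper that turns each (sample, window size) composition vector into a
     * partial {@link SparseVectorWritable} keyed by sample name. The cached
     * k-mer list defines the vector positions: the k-mer at position <code>i</code>
     * in the list maps to index <code>i</code> of the output vector. For example
     * (hypothetical values), if AAC is the second k-mer in the list and a sample's
     * window-3 composition vector contains AAC=0.12, index 1 is set to 0.12;
     * k-mers of other window sizes are handled by the map calls for those sizes.
     */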

    public static class CompositionVectorMap extends MapReduceBase
            implements Mapper<CompositionVectorKey, CompositionVectorWritable, Text, SparseVectorWritable> {

        private FileSystem localFs; // local (node) filesystem
        private Text name = new Text();
        private SparseVectorWritable vector = new SparseVectorWritable();
        private MappedByteBuffer listBuffer = null;
        private int cardinality = -1;

        @Override
        public void configure(JobConf conf) {
            this.cardinality = conf.getInt(VECTOR_CARDINALITY, -1);
            try {
                this.localFs = FileSystem.getLocal(new Configuration());
                //localArchives = DistributedCache.getLocalCacheArchives(job);
                String listInput = conf.get(KMER_LIST, null);
                Path[] localFiles = DistributedCache.getLocalCacheFiles(conf);

                if (listInput != null && localFiles != null && localFiles.length != 0) {
                    //System.err.println("Got list of local cache files!");
                    // only expecting one but we should check
                    //System.err.printf("Looking for %s\n", listInput);
                    for (int cv = 0; cv < localFiles.length; cv++) {
                        // getName()=kmer_1208b0a341b_tmp
                        //System.err.printf("Got cache file %s(%s)\n", localFiles[cv].toString(), localFiles[cv].getName());
                        if (!localFiles[cv].getName().equals(listInput)) {
                            continue;
                        }
                        // /state/partition1/hadoop/var/hadoop-root/mapred/local/taskTracker/archive/rocks5.local/user/mcolosimo/kmer_1208b0a341b_tmp/kmer_1208b0a341b_tmp
                        //System.err.println("Got matching file: " + localFiles[cv].toString());
                        // Full path to cached file to make a read only MappedByteBuffer
                        this.listBuffer = getMappedByteBuffer(localFiles[cv].toString());
                        if (this.listBuffer == null) {
                            LOG.warn("Buffer returned is null!");
                        }
                        // capacity and remaining are the same here, but hasRemaining returns false until we rewind!
                        //System.err.printf("Buffer has %d capacity with %d remaining.\n", this.listBuffer.capacity(), this.listBuffer.remaining());
                        break;
                    }
                }
                // map() will throw if the list buffer is missing; just log the failure here
            } catch (IOException ex) {
                LOG.fatal("Unable to get cached files", ex);
            }
        }

        @Override
        public void close() throws IOException {
            this.localFs.close();
        }

        @Override
        public void map(CompositionVectorKey key, CompositionVectorWritable value,
                OutputCollector<Text, SparseVectorWritable> output, Reporter reporter) throws IOException {
            if (this.cardinality <= 0) {
                throw new IOException("Cardinality less than or equal to zero!");
            }
            // 
            if (this.listBuffer == null) {
                throw new IOException("Local list of k-mers is missing!");
            } else {
                // Reset/rewind/clear if not first time here
                //this.listBuffer.clear();
                this.listBuffer.rewind(); // might use clear()
            }
            if (this.listBuffer.remaining() <= 0) {
                throw new IOException("Local list of k-mers is empty!");
            }
            reporter.setStatus(String.format("%s:%d", key.getName(), key.getWindowSize()));
            int windowSize = key.getWindowSize();
            name.set(key.getName());
            KmerEntropyPairWritable w = new KmerEntropyPairWritable();
            SparseVector sv = new SparseVector(key.getName(), this.cardinality);

            int cv = 0;
            while (this.listBuffer.hasRemaining()) {
                w.readFields(this.listBuffer);
                String kmer = w.getKey();
                //System.err.printf("Got '%s' k-mer from list.\n", kmer);
                if (kmer.length() == windowSize) {
                    if (value.containsKey(kmer)) {
                        /**
                        if (value.getValue(kmer).isInfinite()) {
                        //System.err.printf("INFINITE: %s %s(%d):%f\n", key.getName(), kmer, cv, value.getValue(kmer));
                        LOG.warn(String.format("INFINITE: %s %s(%d):%f\n", key.getName(), kmer, cv, value.getValue(kmer)));
                        throw new IOException(String.format("INFINITE: %s %s(%d):%f\n", key.getName(), kmer, cv, value.getValue(kmer)));
                        }
                        */
                        //System.err.printf("%s(%d):%f\n", kmer, cv, value.getValue(kmer));
                        sv.set(cv, value.getValue(kmer));
                    }
                }
                cv++;
            }
            vector.set(sv);
            output.collect(name, vector);
        }
    }
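
    /**
     * Reducer that sums the partial {@link SparseVectorWritable}s emitted by
     * {@link CompositionVectorMap} (one per sample and window size) into a
     * single feature vector per sample name, using {@link SparseVector#plusEquals}.
     */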

    public static class Features2VectorReducer extends MapReduceBase
            implements Reducer<Text, SparseVectorWritable, Text, SparseVectorWritable> {

        private int cardinality;
        private SparseVectorWritable svw = new SparseVectorWritable();

        @Override
        public void configure(JobConf conf) {
            this.cardinality = conf.getInt(VECTOR_CARDINALITY, -1);
        }

        @Override
        public void reduce(Text key, Iterator<SparseVectorWritable> values,
                OutputCollector<Text, SparseVectorWritable> output, Reporter reporter) throws IOException {

            // We should be able to get the cardinality from the values
            if (this.cardinality <= 0) {
                throw new IOException("Cardinality less than or equal to zero!");
            }
            SparseVector sv = new SparseVector(key.toString(), this.cardinality);
            SparseVectorWritable w;
            while (values.hasNext()) {
                w = values.next();
                try {
                    sv.plusEquals(w.get());
                } catch (VectorLengthException ex) {
                    LOG.warn(ex);
                }
            }
            svw.set(sv);

            /** DEBUG 
            System.out.printf("%s=", key.toString());
            for(Map.Entry<Integer, Double> entry : sv.getSparseMap().entrySet()) {
            System.out.printf(" %d:%f", entry.getKey(), entry.getValue());
            }
            System.out.printf("\n");
             /* */
            output.collect(key, svw);
        }
    }

    /**
     * Start a new job with the given configuration and parameters.
     *
     * @param jobConf
     * @param listInput         file path containing list of k-mers to use
     * @param cardinality       number of k-mers to use (if the list contains fewer, that number will be used instead).
     * @param input             composition vector {@link SequenceFile} such as generated by {@link CalculateCompositionVectors}
     * @param output
     * @param cleanLogs
     * @return zero if no errors
     * @throws java.lang.Exception
     */
    public int initJob(JobConf jobConf, String listInput, Integer cardinality, String input, String output,
            boolean cleanLogs) throws Exception {
        JobConf conf = new JobConf(jobConf, GenerateFeatureVectors.class);
        conf.setJobName("GenerateFeatureVectors");

        if (listInput == null) {
            throw new Exception("GenerateFeatureVectors requires a list of k-mers!");
        }
        Path listPath = new Path(listInput); // i.e., listInput = win32_200902260829/kmer_120811a7fa1_tmp
        FileSystem fs = listPath.getFileSystem(conf);
        // @todo: should check to see if it is there!

        // The docs don't say it, but we need the qualified path with the host name,
        // otherwise the URI sticks the host onto it not so nicely.
        Path qPath = fs.makeQualified(listPath);
        // listPath = hdfs://rocks5.local:54310/user/mcolosimo/win32_200902260829/kmer_120811a7fa1_tmp
        LOG.info(String.format("Caching k-mer file %s", qPath.toString()));
        // URI: hdfs://rocks5.local:54310/user/mcolosimo/win32_200902260829/kmer_120811a7fa1_tmp
        URI listURI = new URI(qPath.toString());
        DistributedCache.addCacheFile(listURI, conf);
        conf.set(KMER_LIST, listPath.getName());
        //LOG.info("k-mer URI:" + listURI.toString());

        /** We need this. It is okay if the cardinality is larger than the number of k-mers. */
        if (cardinality == null) {
            LOG.info("Scanning k-mer file to determine cardinality");
            FSDataInputStream ins = fs.open(listPath);

            KmerEntropyPairWritable w = new KmerEntropyPairWritable();
            int c = 0;
            while (ins.available() > 0) {
                w.readFields(ins);
                c++;
            }
            ins.close();
            fs.close();
            LOG.info(String.format("Found %d k-mers in the file", c));
            cardinality = c;
        }
        conf.setInt(VECTOR_CARDINALITY, cardinality);

        // Set up mapper
        SequenceFileInputFormat.setInputPaths(conf, new Path(input));
        conf.setInputFormat(SequenceFileInputFormat.class);
        conf.setMapperClass(CompositionVectorMap.class);
        conf.setOutputKeyClass(Text.class); // final output key class - sample name
        conf.setOutputValueClass(SparseVectorWritable.class); // final output value class

        // Set up combiner/reducer
        conf.setReducerClass(Features2VectorReducer.class);
        conf.setOutputFormat(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setOutputPath(conf, new Path(output));

        JobClient.runJob(conf);

        return 0;
    }
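
    /*
     * A minimal sketch of calling initJob() directly (the paths and cardinality
     * below are hypothetical; run() normally derives them from the command line):
     *
     *   JobConf conf = new JobConf(new Configuration());
     *   new GenerateFeatureVectors().initJob(conf,
     *           "win32_200902260829/kmer_120811a7fa1_tmp", // flattened k-mer list
     *           500,                                        // cardinality, or null to scan the list
     *           "win32_200902260829/composition_vectors",   // input composition vector SequenceFile
     *           "win32_200902260829/feature_vectors",       // output path
     *           false);                                     // cleanLogs
     */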

    @Override
    public int run(String[] args) throws Exception {
        JobConf conf = new JobConf(getConf());
        int cardinality = Integer.MAX_VALUE;
        boolean cleanLogs = false;
        String listInput = null;

        // @TODO: use commons-cli option parsing, as org.apache.hadoop.util.GenericOptionsParser does
        ArrayList<String> other_args = new ArrayList<String>();
        for (int i = 0; i < args.length; ++i) {
            try {
                if ("-m".equals(args[i])) {
                    conf.setNumMapTasks(Integer.parseInt(args[++i]));
                } else if ("-r".equals(args[i])) {
                    conf.setNumReduceTasks(Integer.parseInt(args[++i]));
                } else if ("-c".equals(args[i])) {
                    cleanLogs = true;
                } else if ("-l".equals(args[i])) {
                    listInput = args[++i];
                } else if ("-t".equals(args[i])) {
                    cardinality = Integer.parseInt(args[++i]);
                } else if ("-libjars".equals(args[i])) {
                    conf.set("tmpjars", FileUtils.validateFiles(args[++i], conf));

                    URL[] libjars = FileUtils.getLibJars(conf);
                    if (libjars != null && libjars.length > 0) {
                        // Add libjars to client/tasks classpath
                        conf.setClassLoader(new URLClassLoader(libjars, conf.getClassLoader()));
                        // Adds libjars to our classpath
                        Thread.currentThread().setContextClassLoader(
                                new URLClassLoader(libjars, Thread.currentThread().getContextClassLoader()));
                    }
                } else {
                    other_args.add(args[i]);
                }
            } catch (NumberFormatException except) {
                System.out.println("ERROR: Integer expected instead of " + args[i]);
                return printUsage();
            } catch (ArrayIndexOutOfBoundsException except) {
                System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
                return printUsage();
            }
        }
        // Make sure there are exactly 2 parameters left.
        if (other_args.size() != 2) {
            System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
            return printUsage();
        }

        if (listInput == null || listInput.length() == 0) {
            System.out.println("Need kmer sequence file path!");
            return printUsage();
        }

        long now = System.currentTimeMillis();
        Path listInputPath = new Path(listInput);
        Path listOutputPath = new Path(listInputPath.getParent(), "kmer_" + Long.toHexString(now) + "_tmp");
        LOG.info(String.format("Loading %d sorted k-mers from %s to %s", cardinality, listInputPath.toString(),
                listOutputPath.toString()));
        int num = CompleteCompositionVectorUtils.flattenKmerEntropySequenceFile(conf, cardinality,
                listInputPath.toString(), listOutputPath.toString(), cleanLogs);

        initJob(conf, listOutputPath.toString(), num, other_args.get(0), other_args.get(1), cleanLogs);
        return 0;
    }

    static int printUsage() {
        System.out.println(
                "GenerateFeatureVectors [-libjars <classpath,...>] [-m <maps>] [-r <reduces>] [-t <num kmers>] -l <kmer list path>"
                        + " <input> <output>");
        System.out.println("Where -l give the path to the SequenceFiles containing the k-mers");
        return -1;
    }

    static public void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new GenerateFeatureVectors(), args);
        System.exit(res);
    }
}
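
Example

The class above is normally driven from the command line through ToolRunner (see printUsage and main). Below is a minimal sketch of invoking the same Tool programmatically; the number of reduce tasks, the k-mer list path, and the input/output paths are hypothetical and only illustrate the expected arguments.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;
import org.mitre.ccv.mapred.GenerateFeatureVectors;

public class GenerateFeatureVectorsDriver {

    public static void main(String[] args) throws Exception {
        // Hypothetical paths; adjust to the cluster layout.
        String[] jobArgs = new String[] {
                "-r", "4",                                  // number of reduce tasks
                "-t", "500",                                // number of k-mers to use
                "-l", "win32_200902260829/kmer_entropies",  // sorted k-mer SequenceFile
                "win32_200902260829/composition_vectors",   // input: composition vectors
                "win32_200902260829/feature_vectors"        // output: feature vectors
        };
        int res = ToolRunner.run(new Configuration(), new GenerateFeatureVectors(), jobArgs);
        System.exit(res);
    }
}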