org.mitre.ccv.mapred.CompleteCompositionVectorUtils.java Source code

Java tutorial

Introduction

Here is the source code for org.mitre.ccv.mapred.CompleteCompositionVectorUtils.java

Source

/**
 * Created on April 9, 2009.
 *
 * Copyright 2010- The MITRE Corporation. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you  may not 
 * use this file except in compliance with the License. You may obtain a copy of 
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software 
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 
 * License for the specific language governing permissions and limitations under
 * the License.
 *
 * $Id$
 */
package org.mitre.ccv.mapred;

import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.URL;
import java.net.URLClassLoader;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map.Entry;
import java.util.TreeSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;

import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import org.codehaus.jackson.JsonFactory;
import org.codehaus.jackson.JsonGenerationException;
import org.codehaus.jackson.JsonGenerator;

import org.mitre.ccv.mapred.io.KmerEntropyPairWritable;

import org.mitre.la.SparseVector;
import org.mitre.la.mapred.io.SparseVectorWritable;
import org.mitre.mapred.fs.FileUtils;

/**
 * A class of Complete Composition Vector utility methods.
 *
 * @author Marc Colosimo
 */
public class CompleteCompositionVectorUtils extends Configured implements Tool {

    /** Logger shared by all static and instance methods of this class. */
    private static final Log LOG = LogFactory.getLog(CompleteCompositionVectorUtils.class);

    /**
     * Private constructor: instances can only be created from inside this class.
     * The class does instantiate itself in {@code main} (to obtain the {@link Tool} handed to
     * {@link ToolRunner}) and in the static JSON export methods (to reach the private instance helpers).
     */
    private CompleteCompositionVectorUtils() {
    } // not instantiable from outside this class

    /**
     * Returns up to the given number of k-mers read from {@link SequenceFile}s that use
     * {@link KmerEntropyPairWritable} as their keys.
     *
     * <p>Every <tt>part-*</tt> file listed under <code>input</code> is read in listing order and reading
     * stops as soon as <code>length</code> records have been consumed. Because the result is a set,
     * duplicate k-mers collapse into one element.
     *
     * @param conf      the job configuration used to resolve the file system and open the readers
     * @param input     path to the directory holding the SequenceFile <tt>part-*</tt> files
     * @param length    the maximum number of records to read (if <code>null</code>, 0 or negative,
     *                  all records are read)
     * @return          {@link TreeSet} holding the k-mer strings in their natural (String) order
     * @throws java.io.IOException if the part files cannot be listed, opened or read
     */
    public static TreeSet<String> getKmerEntropiesFromSequenceFile(JobConf conf, String input, Integer length)
            throws IOException {
        final TreeSet<String> nmers = new TreeSet<String>();
        final Path inputPath = new Path(input);
        final FileSystem fs = inputPath.getFileSystem(conf);
        final Path[] paths = FileUtils.ls(conf, inputPath.toString() + Path.SEPARATOR + "part-*");
        final int limit = (length == null || length <= 0) ? Integer.MAX_VALUE : length;
        final KmerEntropyPairWritable key = new KmerEntropyPairWritable();
        int cnt = 0;
        for (int idx = 0; idx < paths.length && cnt < limit; idx++) {
            final SequenceFile.Reader reader = new SequenceFile.Reader(fs, paths[idx], conf);
            try {
                // Check the limit first so no record is consumed once it is reached, and only use
                // the key when next() reports that a record was actually read (it returns false at
                // end of file, leaving the key stale).
                while (cnt < limit && reader.next(key)) {
                    nmers.add(key.getKey());
                    cnt++;
                }
            } finally {
                try {
                    reader.close();
                } catch (IOException ioe) {
                    // best effort: everything needed has already been read
                    LOG.debug("Probably caused by closing the SequenceFile.Reader", ioe);
                }
            }
        }
        return nmers;
    }

    /**
     * Flattens {@link SequenceFile}s containing {@link KmerEntropyPairWritable}s as keys into a single
     * file containing only those keys, in the order they are read.
     *
     * <p>All <tt>part-*</tt> files listed under <code>input</code> are appended to the same output file.
     * The output file is created (overwriting any existing file) even when no record is written.
     * Failures while closing a reader or the output stream are logged, not thrown.
     *
     * @param conf      the job configuration used to resolve the file system and open the readers
     * @param numKmers  the maximum number of keys to write (if 0 or less, all keys are written)
     * @param input     the input path containing the kmers.
     * @param output    the output file path to write the keys to.
     * @param asText    if <code>true</code>, each key is written with <code>writeUTF(key.toString())</code>.
     *                  Otherwise, each key serializes itself through its <code>write</code> method.
     * @return          the actual number of keys written out.
     * @throws java.io.IOException if the input cannot be listed or read, or the output cannot be written
     */
    public static synchronized int flattenKmerEntropySequenceFile(JobConf conf, int numKmers, String input,
            String output, boolean asText) throws IOException {
        if (LOG.isDebugEnabled()) {
            LOG.debug(String.format("Flattening %d k-mers entropies from %s to %s", numKmers, input, output));
        }
        final Path outPath = new Path(output);
        final FileSystem fs = outPath.getFileSystem(conf);
        final int limit = numKmers <= 0 ? Integer.MAX_VALUE : numKmers;
        final KmerEntropyPairWritable key = new KmerEntropyPairWritable();
        int cnt = 0;

        final FSDataOutputStream fos = fs.create(outPath, true);
        try {
            final Path inputPath = new Path(input);
            final Path[] paths = FileUtils.ls(conf, inputPath.toString() + Path.SEPARATOR + "part-*");
            for (int idx = 0; idx < paths.length && cnt < limit; idx++) {
                final SequenceFile.Reader reader = new SequenceFile.Reader(fs, paths[idx], conf);
                try {
                    // Only write (and count) a key when next() actually delivered a record.
                    while (cnt < limit && reader.next(key)) {
                        if (asText) {
                            fos.writeUTF(key.toString());
                        } else {
                            key.write(fos);
                        }
                        cnt++;
                    }
                } finally {
                    try {
                        reader.close();
                    } catch (IOException ioe) {
                        // closing the SequenceFile.Reader will throw an exception if the file is over some unknown size
                        LOG.debug("Probably caused by closing the SequenceFile.Reader", ioe);
                    }
                }
            }
        } finally {
            // The output stream is shared by all part files, so it is closed exactly once, after the last one.
            try {
                fos.close();
            } catch (IOException ioe) {
                LOG.warn("Unable to cleanly close the output stream for " + output, ioe);
            }
        }
        return cnt;
    }

    /**
     * Writes out the {@link SequenceFile} feature vectors in row major (packed) order. No labels
     * (the {@link Text} keys) are written.
     *
     * <p>Every component of every vector is formatted with US decimal symbols and no grouping, and
     * is followed by a single space. Nothing else separates one vector from the next; no line
     * breaks are emitted.
     *
     * @param jobConf   the base job configuration
     * @param input     top level SequenceFile directory path (its <tt>part-*</tt> files are read)
     * @param output    path to output the matrix (overwritten if it exists)
     * @param digits    the maximum number of fraction digits
     * @throws IOException if the input cannot be listed or read, or the output cannot be written
     */
    public static void featureVectors2RowMajorMatrix(JobConf jobConf, String input, String output, int digits)
            throws IOException {
        final JobConf conf = new JobConf(jobConf, CalculateCosineDistanceMatrix.class);

        final DecimalFormat format = new DecimalFormat();
        format.setDecimalFormatSymbols(new DecimalFormatSymbols(Locale.US));
        format.setMinimumIntegerDigits(1);
        format.setMaximumFractionDigits(digits);
        format.setGroupingUsed(false);

        final Path inputPath = new Path(input);
        final FileSystem fs = inputPath.getFileSystem(conf);
        final Path qInputPath = fs.makeQualified(inputPath);
        final Path outputPath = new Path(output);
        final Path[] paths = FileUtils.ls(conf, qInputPath.toString() + Path.SEPARATOR + "part-*");

        final Text key = new Text();
        final SparseVectorWritable value = new SparseVectorWritable();
        final FSDataOutputStream fos = fs.create(outputPath, true);
        final Writer writer = new OutputStreamWriter(fos);
        try {
            for (int idx = 0; idx < paths.length; idx++) {
                final SequenceFile.Reader reader = new SequenceFile.Reader(fs, paths[idx], conf);
                try {
                    while (reader.next(key, value)) {
                        final SparseVector vector = value.get();
                        final StringBuilder sb = new StringBuilder();
                        for (int i = 0; i < vector.getCardinality(); i++) {
                            sb.append(format.format(vector.get(i))); // format the number
                            sb.append(' ');
                        }
                        writer.write(sb.toString());
                    }
                } finally {
                    try {
                        reader.close();
                    } catch (IOException ioe) {
                        // closing the SequenceFile.Reader will throw an exception if the file is over some unknown size
                        LOG.debug("Probably caused by closing the SequenceFile.Reader. All is well", ioe);
                    }
                }
            }
        } finally {
            // Closing the writer flushes it and closes the wrapped output stream as well, so no
            // separate flush()/close() on the stream is needed (or valid) afterwards.
            try {
                writer.close();
            } catch (IOException ioe) {
                LOG.debug("Caused by distributed cache output stream.", ioe);
            }
        }
    }

    /**
     * Flattens a {@link SequenceFile} containing {@link KmerEntropyPairWritable}s as keys to a json file
     * containing the k-mers (<tt>features</tt>) in the same order, along with the start and end window sizes
     * (<tt>properties</tt>).
     *
     * <p>A {@link JsonGenerationException} is logged and not rethrown; in that case the output file may
     * be incomplete. The output stream is closed in every case.
     *
     * @param conf          the job configuration
     * @param start         beginning window size, written as <tt>properties.begin</tt>
     * @param end           ending window size, written as <tt>properties.end</tt>
     * @param numKmers      the number of k-mers to return (if 0 or less, all will be returned).
     * @param input         the input path containing the kmers.
     * @param output        the output file path to write the json file to.
     * @return the actual number of kmers written out
     * @throws              java.io.IOException if the input cannot be read or the output cannot be written
     */
    public static int kmerSequenceFile2Json(JobConf conf, int start, int end, int numKmers, String input,
            String output) throws IOException {
        final Path outPath = new Path(output);
        final FileSystem fs = outPath.getFileSystem(conf);
        final int limit = numKmers <= 0 ? Integer.MAX_VALUE : numKmers;
        final CompleteCompositionVectorUtils util = new CompleteCompositionVectorUtils();
        int cnt = 0;

        final FSDataOutputStream fos = fs.create(outPath, true);
        // JSON is exchanged as UTF-8; do not depend on the platform default charset.
        final Writer writer = new OutputStreamWriter(fos, "UTF-8");
        try {
            final Path inputPath = new Path(input);
            final Path[] paths = FileUtils.ls(conf, inputPath.toString() + Path.SEPARATOR + "part-*");
            final JsonGenerator jg = new JsonFactory().createJsonGenerator(writer);
            jg.writeStartObject();
            util.writeJsonCcvProperties(jg, start, end);
            cnt = util.writeJsonKmers(conf, fs, paths, jg, limit);
            jg.writeEndObject();
            jg.close();
            writer.close();
        } catch (JsonGenerationException ex) {
            LOG.error("Unable to write the nmers to a json object", ex);
        } finally {
            // Safety net for the failure paths; closing an already closed writer has no effect.
            try {
                writer.close();
            } catch (IOException ioe) {
                LOG.debug("Unable to cleanly close the json output stream", ioe);
            }
        }
        return cnt;
    }

    /**
     * Write out feature vectors, features (k-mers), and properties (start, end) to a JSON file.
     * <P>JSON format
     * <blockquote><pre>
     * {
     *      "properties" :
     *      {
     *          "begin" : 3,
     *          "end"   : 9
     *      },
     *      "features" : [..],
     *      "samples" :
     *      [
     *          {
     *              "name" : "sample name",
     *              "data" : { nmer_index: non-zero pi-values }
     *          }, ....
     *      ]
     * }
     * </pre></blockquote>
     *
     * The data will be the same as {@link org.mitre.ccv.CompleteMatrix#jsonCompleteMatrix}, but the features
     * will be in a different order. The mapred version, by default, sorts only by entropy value, whereas the
     * ccv in-memory version sorts by the k-mer natural order (lexicographic).
     *
     * <p>A {@link JsonGenerationException} is logged and not rethrown; in that case the output file may
     * be incomplete. The output stream is closed in every case.
     *
     * @see org.mitre.ccv.CompleteMatrix#jsonCompleteMatrix
     *
     * @param conf          the job configuration
     * @param start         beginning window size
     * @param end           ending window size
     * @param numKmers      the number of k-mers to return (if 0 or less, all will be returned).
     * @param listInput     {@link SequenceFile} path containing k-mers used to generate the feature vectors.
     * @param featureInput  {@link SequenceFile} path contains feature vectors {@link SparseVectorWritable}.
     * @param output        the output file path to write the json file to.
     * @return the actual number of kmers written out (not samples/feature vectors)
     * @throws java.io.IOException if an input cannot be read or the output cannot be written
     */
    public static int featureVectors2Json(JobConf conf, int start, int end, int numKmers, String listInput,
            String featureInput, String output) throws IOException {
        final Path outPath = new Path(output);
        final FileSystem fs = outPath.getFileSystem(conf);
        final int limit = numKmers <= 0 ? Integer.MAX_VALUE : numKmers;
        final CompleteCompositionVectorUtils util = new CompleteCompositionVectorUtils();
        int cnt = 0;

        final FSDataOutputStream fos = fs.create(outPath, true);
        // JSON is exchanged as UTF-8; do not depend on the platform default charset.
        final Writer writer = new OutputStreamWriter(fos, "UTF-8");
        try {
            final JsonGenerator jg = new JsonFactory().createJsonGenerator(writer);
            jg.writeStartObject();
            util.writeJsonCcvProperties(jg, start, end);

            // k-mers (features)
            final Path kmerPath = new Path(listInput);
            final Path[] kmerParts = FileUtils.ls(conf, kmerPath.toString() + Path.SEPARATOR + "part-*");
            cnt = util.writeJsonKmers(conf, fs, kmerParts, jg, limit);

            // samples (feature vectors)
            final Path vectorPath = new Path(featureInput);
            final Path[] vectorParts = FileUtils.ls(conf, vectorPath.toString() + Path.SEPARATOR + "part-*");
            util.jsonCcvVectors(conf, fs, vectorParts, jg);

            jg.writeEndObject();
            jg.close();
            writer.close();
        } catch (JsonGenerationException ex) {
            LOG.error("Unable to write the nmers to a json object", ex);
        } finally {
            // Safety net for the failure paths; closing an already closed writer has no effect.
            try {
                writer.close();
            } catch (IOException ioe) {
                LOG.debug("Unable to cleanly close the json output stream", ioe);
            }
        }
        return cnt;
    }

    /**
     * Writes a JSON array field named <tt>features</tt> holding the k-mers, in the order they are read
     * from the given part files.
     *
     * "features" : [...]
     *
     * @param conf      the job configuration used to open the readers
     * @param fs        the file system the part files are opened from
     * @param paths     the part files to read, processed in array order
     * @param jg        the generator to write to; an object must currently be open on it
     * @param numKmers  the maximum number of k-mers to write
     * @return the actual number of kmers written out.
     * @throws JsonGenerationException if the generator rejects the output
     * @throws IOException if a part file cannot be opened or read
     */
    private int writeJsonKmers(JobConf conf, FileSystem fs, Path[] paths, JsonGenerator jg, int numKmers)
            throws JsonGenerationException, IOException {
        jg.writeArrayFieldStart("features");
        int cnt = 0;
        final KmerEntropyPairWritable key = new KmerEntropyPairWritable();
        // Stop opening further part files once the requested number of k-mers has been written.
        for (int idx = 0; idx < paths.length && cnt < numKmers; idx++) {
            final SequenceFile.Reader reader = new SequenceFile.Reader(fs, paths[idx], conf);
            try {
                while (cnt < numKmers && reader.next(key)) {
                    jg.writeString(key.getKey());
                    cnt++;
                }
            } finally {
                try {
                    reader.close();
                } catch (IOException ioe) {
                    // best effort: everything needed has already been read
                    LOG.debug("Probably caused by closing the SequenceFile.Reader", ioe);
                }
            }
        }
        jg.writeEndArray();
        return cnt;
    }

    /**
     * Emits the window sizes as a nested JSON object field:
     *
     * "properties" : { "begin":start, "end":end }
     *
     * @param jg     the generator to write to
     * @param start  value written under <tt>begin</tt>
     * @param end    value written under <tt>end</tt>
     */
    private void writeJsonCcvProperties(JsonGenerator jg, int start, int end)
            throws JsonGenerationException, IOException {
        // field name followed by the start of its object value
        jg.writeFieldName("properties");
        jg.writeStartObject();

        jg.writeFieldName("begin");
        jg.writeNumber(start);

        jg.writeFieldName("end");
        jg.writeNumber(end);

        jg.writeEndObject();
    }

    /**
     * Writes out our vectors as sparse arrays (only the entries stored in the sparse map) in JSON objects
     * inside a JSON array field.
     * Format:
     * "samples": [
     *      {
     *          "name" : "sample name",
     *          "data" : { nmer_index: non-zero pi-values }
     *      }, ....
     * ]
     *
     * nmer_index starts at 0 (zero)
     *
     * data is stored as SparseVectors in SequenceFiles with the key (Text) as the name. Entries whose
     * value is infinite are skipped; each skipped entry is reported through the logger and on
     * <code>System.err</code>.
     *
     * @param conf      the job configuration used to open the readers
     * @param fs        the file system the part files are opened from
     * @param paths     listing of paths (parts-) files
     * @param jg        the generator to write to; an object must currently be open on it
     * @throws JsonGenerationException if the generator rejects the output
     * @throws IOException if a part file cannot be opened or read
     */
    private void jsonCcvVectors(JobConf conf, FileSystem fs, Path[] paths, JsonGenerator jg)
            throws JsonGenerationException, IOException {
        jg.writeArrayFieldStart("samples");
        final Text key = new Text();
        final SparseVectorWritable values = new SparseVectorWritable();
        for (int idx = 0; idx < paths.length; idx++) {
            final SequenceFile.Reader reader = new SequenceFile.Reader(fs, paths[idx], conf);
            try {
                while (reader.next(key, values)) {
                    final String name = key.toString();
                    jg.writeStartObject();
                    jg.writeStringField("name", name);
                    jg.writeObjectFieldStart("data");
                    final SparseVector vector = values.get();
                    for (Iterator<Entry<Integer, Double>> iter = vector.getSparseMap().entrySet().iterator(); iter
                            .hasNext();) {
                        final Entry<Integer, Double> entry = iter.next();
                        if (entry.getValue().isInfinite()) {
                            final String skipped = String.format("Skipping %s:%d\t%f\n", name, entry.getKey(),
                                    entry.getValue());
                            LOG.warn(skipped);
                            System.err.print(skipped);
                        } else {
                            jg.writeNumberField(Integer.toString(entry.getKey()), entry.getValue());
                        }
                    }
                    jg.writeEndObject(); // data object
                    jg.writeEndObject(); // sample object
                    // Push each finished sample to disk/io; flushing after every single entry is
                    // needlessly expensive and produces the same output.
                    jg.flush();
                }
            } finally {
                try {
                    reader.close();
                } catch (IOException ioe) {
                    // best effort: everything needed has already been read
                    LOG.debug("Probably caused by closing the SequenceFile.Reader", ioe);
                }
            }
        }
        jg.writeEndArray(); // samples array
    }

    /**
     * Parses the command line and runs the requested command.
     *
     * <p>Recognized options are <tt>-m &lt;maps&gt;</tt>, <tt>-r &lt;reduces&gt;</tt> and
     * <tt>-libjars &lt;classpath,...&gt;</tt>. The first remaining argument is the command name
     * (matched case-insensitively) and the rest are its arguments:
     * <ul>
     * <li><tt>featureVectors2Json start end num-kmers kmer-list feature-vectors output</tt></li>
     * <li><tt>featureVectors2rows [digits] feature-vectors output</tt> (digits defaults to 6)</li>
     * </ul>
     *
     * @param args the command line arguments
     * @return 0 on success, -1 on a usage error
     * @throws Exception if the selected command fails
     */
    @Override
    public int run(String[] args) throws Exception {
        JobConf conf = new JobConf(getConf());

        ArrayList<String> other_args = new ArrayList<String>();
        for (int i = 0; i < args.length; ++i) {
            try {
                if ("-m".equals(args[i])) {
                    conf.setNumMapTasks(Integer.parseInt(args[++i]));
                } else if ("-r".equals(args[i])) {
                    conf.setNumReduceTasks(Integer.parseInt(args[++i]));
                } else if ("-libjars".equals(args[i])) {
                    conf.set("tmpjars", FileUtils.validateFiles(args[++i], conf));

                    URL[] libjars = FileUtils.getLibJars(conf);
                    if (libjars != null && libjars.length > 0) {
                        // Add libjars to client/tasks classpath
                        conf.setClassLoader(new URLClassLoader(libjars, conf.getClassLoader()));
                        // Adds libjars to our classpath
                        Thread.currentThread().setContextClassLoader(
                                new URLClassLoader(libjars, Thread.currentThread().getContextClassLoader()));
                    }
                } else {
                    other_args.add(args[i]);
                }
            } catch (NumberFormatException except) {
                // i was already advanced onto the offending value
                System.out.println("ERROR: Integer expected instead of " + args[i]);
                return printUsage();
            } catch (ArrayIndexOutOfBoundsException except) {
                // i was advanced past the end, so the option itself is at i - 1
                System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
                return printUsage();
            }
        }
        // At least the command name must be left.
        if (other_args.size() < 1) {
            System.out.println("ERROR: Require ONE argument!");
            return printUsage();
        }

        // Matched case-insensitively so that every spelling shown by the usage text is accepted.
        String cmd = other_args.get(0);
        if (cmd.equalsIgnoreCase("featureVectors2Json")) {
            if (other_args.size() < 7) {
                System.err.println("We need more arguments!");
                return printUsage();
            }
            try {
                int start = Integer.parseInt(other_args.get(1));
                int end = Integer.parseInt(other_args.get(2));
                int kmers = Integer.parseInt(other_args.get(3));
                featureVectors2Json(conf, start, end, kmers, other_args.get(4), other_args.get(5),
                        other_args.get(6));
            } catch (NumberFormatException except) {
                System.err.println("Woops. Error converting number!");
                return -1;
            }
        } else if (cmd.equalsIgnoreCase("featureVectors2rows")) {
            if (other_args.size() < 3) {
                // input and output are mandatory; without this check get(1)/get(2) would throw
                System.err.println("We need more arguments!");
                return printUsage();
            }
            int digits = 6;
            int inputIdx = 1;
            if (other_args.size() > 3) {
                // optional leading [digits] argument
                try {
                    digits = Integer.parseInt(other_args.get(1));
                } catch (NumberFormatException except) {
                    System.err.println("Woops. Error converting number!");
                    return -1;
                }
                inputIdx = 2;
            }
            featureVectors2RowMajorMatrix(conf, other_args.get(inputIdx), other_args.get(inputIdx + 1), digits);
        } else {
            System.out.println("Unknown command:" + cmd);
            return -1;
        }
        return 0;
    }

    /**
     * Prints the command line usage to <code>System.out</code>.
     *
     * @return always -1, so callers can <code>return printUsage();</code> as their error exit code
     */
    static int printUsage() {
        System.out.println(
                "CompleteCompositionVectorUtils [-libjars <classpath,...>] [-m <maps>] [-r <reduces>] command [arguments]");
        System.out.println("Current commands:");
        // Command names are spelled exactly as they are matched when dispatching.
        System.out.println("\tfeatureVectors2Json\tstart end num-kmers kmer-list feature-vectors output");
        System.out.println("\tfeatureVectors2rows\t[digits] feature-vectors output");
        return -1;
    }

    /**
     * Command line entry point: runs this tool through {@link ToolRunner} and exits the JVM with
     * the status code the tool returned.
     *
     * @param args the command line arguments
     * @throws Exception if the tool fails
     */
    public static void main(String[] args) throws Exception {
        final Configuration configuration = new Configuration();
        final Tool tool = new CompleteCompositionVectorUtils();
        final int exitCode = ToolRunner.run(configuration, tool, args);
        System.exit(exitCode);
    }
}