gov.jgi.meta.MetaUtils.java Source code

Java tutorial

Introduction

Here is the source code for gov.jgi.meta.MetaUtils.java

Source

/*
 * Copyright (c) 2010, The Regents of the University of California, through Lawrence Berkeley
 * National Laboratory (subject to receipt of any required approvals from the U.S. Dept. of Energy).
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided
 * that the following conditions are met:
 *
 * (1) Redistributions of source code must retain the above copyright notice, this list of conditions and the
 * following disclaimer.
 *
 * (2) Redistributions in binary form must reproduce the above copyright notice, this list of conditions
 * and the following disclaimer in the documentation and/or other materials provided with the distribution.
 *
 * (3) Neither the name of the University of California, Lawrence Berkeley National Laboratory, U.S. Dept.
 * of Energy, nor the names of its contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * You are under no obligation whatsoever to provide any bug fixes, patches, or upgrades to the
 * features, functionality or performance of the source code ("Enhancements") to anyone; however,
 * if you choose to make your Enhancements available either publicly, or directly to Lawrence Berkeley
 * National Laboratory, without imposing a separate written license agreement for such Enhancements,
 * then you hereby grant the following license: a  non-exclusive, royalty-free perpetual license to install,
 * use, modify, prepare derivative works, incorporate into other computer software, distribute, and
 * sublicense such enhancements or derivative works thereof, in binary and source code form.
 */

package gov.jgi.meta;

import gov.jgi.meta.hadoop.input.FastaBlockLineReader;

import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.net.URL;
import java.util.*;

/**
 * utility class for common functionality across various applications
 *
 * @author karan bhatia
 */
public class MetaUtils {

    /**
     * Convenience overload of
     * {@link #loadConfiguration(Configuration, String, String[]) loadConfiguration(conf, null, args)}.
     * No configuration file name is supplied, so that method derives the name of the application
     * configuration file from the {@code application.name} and {@code application.version} system
     * properties.
     *
     * @param conf the job configuration to add the defaults to
     * @param args the commandline args
     * @return whatever the three-argument overload returns for these arguments
     * @throws IOException if the three-argument overload fails
     */
    public static String[] loadConfiguration(Configuration conf, String[] args) throws IOException {
        // a null file name asks the three-argument overload to build the default name
        final String defaultConfigurationFile = null;
        return loadConfiguration(conf, defaultConfigurationFile, args);
    }

    /**
     * Populates a hadoop job configuration from several layered sources. The sources are applied in the
     * order listed, so a later source overrides values set by an earlier one:
     * <ol>
     * <li>the {@code /build.properties} classpath resource; every entry is also copied into the JVM
     *     system properties</li>
     * <li>the application configuration file, looked up on the system classpath</li>
     * <li>the user's preferences file {@code ~/.meta-prefs}</li>
     * <li>the commandline args (for example {@code -D<parameter>=<value>})</li>
     * </ol>
     * The first three sources are optional: when one cannot be found or read, a message is printed and
     * loading continues with the next source.
     *
     * @param conf the job configuration to add the defaults to
     * @param configurationFileName name of the application configuration resource; when {@code null} the
     *        name {@code <application.name>-<application.version>-conf.xml} is built from the system
     *        properties of the same names
     * @param args the commandline args
     * @return the commandline args that remain after the generic hadoop options have been consumed
     * @throws IOException if the generic options parser fails
     */
    public static String[] loadConfiguration(Configuration conf, String configurationFileName, String[] args)
            throws IOException {
        /*
         * first load the configuration from the build properties (typically packaged in the jar)
         */
        System.out.println("loading build.properties ...");
        java.io.InputStream buildStream = MetaUtils.class.getResourceAsStream("/build.properties");
        if (buildStream == null) {
            System.out.println("unable to find build.properties ... skipping");
        } else {
            try {
                Properties buildProperties = new Properties();
                buildProperties.load(buildStream);
                for (String k : buildProperties.stringPropertyNames()) {
                    String value = buildProperties.getProperty(k);
                    System.out.println("setting " + k + " to " + value);
                    System.setProperty(k, value);
                    conf.set(k, value);
                }
            } catch (Exception e) {
                // best effort: a broken build.properties must not prevent the job from starting
                System.out.println("unable to load build.properties (" + e + ") ... skipping");
            } finally {
                try {
                    buildStream.close();
                } catch (IOException ignored) {
                    // the stream was only read from; a failed close loses nothing
                }
            }
        }

        /*
         * override properties with the deployment descriptor
         */
        if (configurationFileName == null) {
            String appName = System.getProperty("application.name");
            String appVersion = System.getProperty("application.version");
            configurationFileName = appName + "-" + appVersion + "-conf.xml";
        }
        System.out.println("loading application configuration from " + configurationFileName);
        try {
            URL u = ClassLoader.getSystemResource(configurationFileName);
            if (u == null) {
                System.err.println("unable to find " + configurationFileName + " ... skipping");
            } else {
                conf.addResource(configurationFileName);
            }
        } catch (Exception e) {
            System.err.println("unable to find " + configurationFileName + " ... skipping");
        }

        /*
         * override properties from user's preferences defined in ~/.meta-prefs
         */
        String home = System.getenv("HOME");
        if (home == null) {
            // HOME is not defined on every platform; fall back to the JVM's notion of the home directory
            home = System.getProperty("user.home");
        }
        java.io.FileInputStream fis = null;
        try {
            fis = new java.io.FileInputStream(new File(home + "/.meta-prefs"));
            Properties props = new Properties();
            props.load(fis);
            System.out.println("loading preferences from ~/.meta-prefs");
            for (String k : props.stringPropertyNames()) {
                System.out.println("overriding property: " + k);
                conf.set(k, props.getProperty(k));
            }
        } catch (java.io.FileNotFoundException e) {
            System.out.println("unable to find ~/.meta-prefs ... skipping");
        } catch (Exception e) {
            // best effort: unreadable preferences are reported and ignored
            System.out.println("unable to load ~/.meta-prefs (" + e + ") ... skipping");
        } finally {
            if (fis != null) {
                try {
                    fis.close();
                } catch (IOException ignored) {
                    // the stream was only read from; a failed close loses nothing
                }
            }
        }

        /*
         * finally, allow user to override from commandline
         */
        return new GenericOptionsParser(conf, args).getRemainingArgs();
    }

    /**
     * Logs selected configuration values at info level.
     * <p>
     * Entries of {@code allProperties} that start with {@code ---} are logged verbatim, so they can be
     * used as headings or separators. Every other entry is treated as a property name: it is looked up
     * in {@code conf} and logged together with its value, or skipped when the property is not set.
     *
     * @param conf the job configuration holding parameter values
     * @param log the logger that receives the output
     * @param allProperties the property names (and separator lines) to print, in order
     */
    public static void printConfiguration(Configuration conf, Logger log, String[] allProperties) {
        for (int idx = 0; idx < allProperties.length; idx++) {
            final String entry = allProperties[idx];
            if (entry.startsWith("---")) {
                log.info(entry);
            } else {
                final String value = conf.get(entry);
                if (value != null) {
                    log.info("\toption " + entry + ":\t" + value);
                }
            }
        }
    }

    /**
     * Prints selected configuration values to stdout.
     * <p>
     * Entries of {@code allProperties} that start with {@code ---} are printed verbatim, so they can be
     * used as headings or separators. Every other entry is treated as a property name: it is looked up
     * in {@code conf} and printed together with its value, or skipped when the property is not set.
     *
     * @param conf the job configuration holding the values
     * @param allProperties the property names (and separator lines) to print, in order
     */
    public static void printConfiguration(Configuration conf, String[] allProperties) {
        for (int idx = 0; idx < allProperties.length; idx++) {
            final String entry = allProperties[idx];
            if (entry.startsWith("---")) {
                System.out.println(entry);
            } else {
                final String value = conf.get(entry);
                if (value != null) {
                    System.out.println("\toption " + entry + ":\t" + value);
                }
            }
        }
    }

    /**
     * Lists the files found at a path. When {@code p} is a directory the result holds the files that
     * are direct children of it; sub-directories are skipped and are NOT descended into. When {@code p}
     * is not a directory the result holds just {@code p}.
     *
     * @param p the file or directory to inspect
     * @return a set of file paths
     * @throws IOException if filesystem can't find file or has other io problems
     */
    public static Set<Path> findAllPaths(Path p) throws IOException {
        Configuration conf = new Configuration();
        // resolve the filesystem from the path itself so that a fully qualified path is served by
        // its own filesystem; a path without scheme still resolves to the default filesystem
        FileSystem fs = p.getFileSystem(conf);

        Set<Path> s = new HashSet<Path>();

        if (fs.getFileStatus(p).isDir()) {
            FileStatus[] children = fs.listStatus(p);
            // defensive: treat a missing listing like an empty directory instead of failing with an NPE
            if (children != null) {
                for (FileStatus f : children) {
                    if (!f.isDir()) {
                        s.add(f.getPath());
                    }
                }
            }
        } else {
            s.add(p);
        }

        return s;
    }

    /**
     * Counts the fasta sequences found in a file, or in the files of a directory, using a freshly
     * created default hadoop configuration.
     *
     * @param contigFileName the file or directory name
     * @return an integer count of the number of sequences found
     * @throws IOException if filesystem has error
     */
    public static int countSequences(String contigFileName) throws IOException {
        final Configuration defaults = new Configuration();
        return countSequences(contigFileName, defaults);
    }

    /**
     * Counts the fasta sequences found in a file, or in the files of a directory. See
     * {@link #countSequences(String) countSequences(String)}.
     * <p>
     * Each file is read with a single {@code FastaBlockLineReader.readLine} call and the number of
     * entries that call stores in its result map is added to the count.
     *
     * @param contigFileName the file or directory name
     * @param conf the hadoop configuration object specifying the filesystem
     * @return an integer count of number of sequences found
     * @throws IOException if the path does not exist or the filesystem has an error
     */
    public static int countSequences(String contigFileName, Configuration conf) throws IOException {
        FileSystem fs = FileSystem.get(conf);
        Path filenamePath = new Path(contigFileName);
        int count = 0;

        if (!fs.exists(filenamePath)) {
            throw new IOException("file not found: " + contigFileName);
        }

        for (Path f : findAllPaths(filenamePath)) {
            FSDataInputStream in = fs.open(f);
            try {
                FastaBlockLineReader fblr = new FastaBlockLineReader(in);

                Text key = new Text();
                long length = fs.getFileStatus(f).getLen();
                // a plain (int) cast would wrap around for files larger than 2GB
                int maxBytes = (int) Math.min(length, (long) Integer.MAX_VALUE);
                HashMap<String, String> tmpcontigs = new HashMap<String, String>();
                // NOTE(review): presumably one call consumes the whole file because both limits are
                // set to their maximum -- confirm against FastaBlockLineReader
                fblr.readLine(key, tmpcontigs, Integer.MAX_VALUE, maxBytes);
                count += tmpcontigs.size();
            } finally {
                // release the stream even when reading fails
                in.close();
            }
        }

        return count;
    }

    /**
     * Reads a fasta file, or the fasta files of a directory, from the default filesystem and returns
     * a map with id and sequence contents. When the same id is produced for more than one file, the
     * entry read last replaces the earlier one.
     *
     * @param contigFileName the file or directory path of the fasta data
     * @return map containing sequences keyed by sequence id
     * @throws IOException if the path does not exist or the filesystem has an error
     */
    public static Map<String, String> readSequences(String contigFileName) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path filenamePath = new Path(contigFileName);

        Map<String, String> results = new HashMap<String, String>();

        if (!fs.exists(filenamePath)) {
            throw new IOException("file not found: " + contigFileName);
        }

        for (Path f : findAllPaths(filenamePath)) {
            FSDataInputStream in = fs.open(f);
            FastaBlockLineReader fblr = null;
            try {
                fblr = new FastaBlockLineReader(in);

                Text key = new Text();
                long length = fs.getFileStatus(f).getLen();
                // a plain (int) cast would wrap around for files larger than 2GB
                int maxBytes = (int) Math.min(length, (long) Integer.MAX_VALUE);
                HashMap<String, String> tmpcontigs = new HashMap<String, String>();
                // NOTE(review): presumably one call consumes the whole file because both limits are
                // set to their maximum -- confirm against FastaBlockLineReader
                fblr.readLine(key, tmpcontigs, Integer.MAX_VALUE, maxBytes);
                results.putAll(tmpcontigs);
            } finally {
                // release stream and reader even when reading fails; same close order as before
                try {
                    in.close();
                } finally {
                    if (fblr != null) {
                        fblr.close();
                    }
                }
            }
        }

        return results;
    }

    /**
     * Writes sequences from a map to a new fasta file on the default hadoop filesystem. Every entry
     * is written as a header line {@code >id length=N} followed by one line with the sequence.
     *
     * @param seqList is the map of sequences indexed by sequence id; values must not be null
     * @param filename the name/path of the file to create
     * @return the path of the fasta file, as given by {@code Path.toString()}
     * @throws IOException if file already exists or cannot be written
     */
    public static String sequenceToFile(Map<String, String> seqList, String filename) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path fp = new Path(filename);

        if (fs.exists(fp)) {
            throw new IOException("file " + filename + " already exists");
        }

        FSDataOutputStream out = fs.create(fp);
        try {
            /*
             * write out the sequences to file
             */
            for (Map.Entry<String, String> entry : seqList.entrySet()) {
                String sequence = entry.getValue();
                assert (sequence != null);
                out.writeBytes(">" + entry.getKey() + " length=" + sequence.length() + "\n");
                out.writeBytes(sequence + "\n");
            }
        } finally {
            // close the file even when a write fails so the stream is not leaked
            out.close();
        }

        return fp.toString();
    }

    /**
     * Writes a map of sequences, indexed by sequence id, as a fasta file into a directory of the local
     * file system. Directory and file name are joined into one path, which is then handed to
     * {@link #sequenceToLocalFile(Map, String) sequenceToLocalFile(Map, String)}.
     *
     * @param seqList is the list of sequences to write
     * @param dirName the directory path in which to create the file
     * @param tmpFileName the file name to create
     * @return the path of the written file
     * @throws IOException if an error occurs in file creation
     */
    public static String sequenceToLocalFile(Map<String, String> seqList, String dirName, String tmpFileName)
            throws IOException {
        final Path target = new Path(dirName, tmpFileName);
        final String targetName = target.toString();
        return sequenceToLocalFile(seqList, targetName);
    }

    /**
     * Writes a map of sequences, indexed by sequence id, as a fasta file to the local file system.
     * Every entry is written as a header line {@code >id} followed by one line with the sequence.
     * After writing, the file is made readable, writable and executable for everyone (rwx bits 777).
     * An existing file of the same name is overwritten.
     *
     * @param seqList is the list of sequences to write
     * @param tmpFileName the file name/path to create
     * @return the path of the written file
     * @throws IOException if the file cannot be written or its permissions cannot be set
     */
    public static String sequenceToLocalFile(Map<String, String> seqList, String tmpFileName) throws IOException {
        /*
         * open temp file
         */
        File seqFile = new File(tmpFileName);
        BufferedWriter out = new BufferedWriter(new FileWriter(seqFile.getPath()));

        try {
            /*
             * write out the sequences to file
             */
            for (Map.Entry<String, String> entry : seqList.entrySet()) {
                String sequence = entry.getValue();
                assert (sequence != null);
                out.write(">" + entry.getKey() + "\n");
                out.write(sequence + "\n");
            }
        } finally {
            /*
             * close temp file, also when a write failed, so the file handle is not leaked
             */
            out.close();
        }

        if (!(seqFile.setExecutable(true, false) && seqFile.setReadable(true, false)
                && seqFile.setWritable(true, false))) {
            throw new IOException("unable to set RWX bits to 777");
        }

        return seqFile.getPath();
    }

    /**
     * Creates a new, uniquely named directory below {@code tmpDir} and makes it readable, writable
     * and executable for everyone. The directory name is {@code prefix} followed by a random UUID;
     * a name that already exists is discarded and a new one is drawn, for at most nine attempts.
     * The directory is not deleted automatically.
     *
     * @param prefix text placed in front of the random part of the directory name
     * @param tmpDir the directory under which the new directory is created
     * @return the new directory
     * @throws IOException if no unused name is found, the directory cannot be created, or its
     *         permissions cannot be set
     */
    public static File createTempDir(String prefix, String tmpDir) throws IOException {
        final int maxAttempts = 9;
        final File parent = new File(tmpDir);

        File candidate = null;
        for (int attempt = 1;; attempt++) {
            if (attempt > maxAttempts) {
                throw new IOException(
                        "The highly improbable has occurred! Failed to create a unique temporary directory after "
                                + maxAttempts + " attempts.");
            }
            candidate = new File(parent, prefix + UUID.randomUUID().toString());
            if (!candidate.exists()) {
                break;
            }
        }

        try {
            FileUtils.forceMkdir(candidate);
        } catch (Exception e) {
            final String reason = e.getMessage();
            throw new IOException("Failed to create temp dir named " + candidate.getAbsolutePath() + ". Reason: "
                    + reason, e);
        }

        // open the directory up for everyone; any single failure aborts
        if (!candidate.setExecutable(true, false) || !candidate.setReadable(true, false)
                || !candidate.setWritable(true, false)) {
            throw new IOException("unable to set RWX bits to 777");
        }

        return candidate;
    }

    /**
     * Creates a new temporary directory whose name has no prefix. The directory is not deleted
     * automatically; use something like {@link #recursiveDelete(java.io.File)} to clean it up.
     *
     * @param tmpDir is the directory under which to create the tmp dir
     * @return the new directory
     * @throws java.io.IOException if there is an error creating the temporary directory
     */
    public static File createTempDir(String tmpDir) throws IOException {
        final String noPrefix = "";
        return createTempDir(noPrefix, tmpDir);
    }

    /**
     * Recursively deletes a file or directory. Deletion stops at the first entry that cannot be
     * deleted; that entry and each of its parents up to {@code fileOrDir} are registered for deletion
     * at JVM exit instead.
     *
     * @param fileOrDir the file or dir to delete
     * @return true iff all files are successfully deleted
     */
    public static boolean recursiveDelete(File fileOrDir) {
        if (fileOrDir.isDirectory()) {
            // listFiles() returns null when the directory cannot be read; in that case go straight
            // to delete(), which then reports the failure, instead of failing with an NPE
            File[] children = fileOrDir.listFiles();
            if (children != null) {
                for (File innerFile : children) {
                    if (!recursiveDelete(innerFile)) {
                        fileOrDir.deleteOnExit();
                        return false;
                    }
                }
            }
        }
        if (!fileOrDir.delete()) {
            fileOrDir.deleteOnExit();
            return false;
        }
        return true;
    }

    /**
     * Given a sequence, returns its reverse complement: the sequence is read back to front and every
     * base is replaced by its complement ({@code a<->t}, {@code g<->c}, {@code n} stays {@code n}).
     * Upper and lower case bases are both supported and keep their case. Any other character is
     * copied unchanged, so the result always has the same length as the input.
     *
     * @param s sequence
     * @return its reverse complement
     */
    public static String reverseComplement(String s) {
        StringBuilder sb = new StringBuilder(s.length());

        // walking backwards produces the reversed order directly
        for (int i = s.length() - 1; i >= 0; i--) {
            sb.append(complementBase(s.charAt(i)));
        }
        return sb.toString();
    }

    /** Returns the complement of a single base, or the character itself when it is not a known base. */
    private static char complementBase(char base) {
        switch (base) {
        case 'a':
            return 't';
        case 't':
            return 'a';
        case 'g':
            return 'c';
        case 'c':
            return 'g';
        case 'A':
            return 'T';
        case 'T':
            return 'A';
        case 'G':
            return 'C';
        case 'C':
            return 'G';
        default:
            // 'n'/'N' and anything unknown have no distinct complement
            return base;
        }
    }

    /**
     * Finds the sequences around {@code start} up to the given hamming distance. The string is copied
     * into a {@link StringBuffer} and the work is delegated to
     * {@link #generateAllNeighbors(StringBuffer, int) generateAllNeighbors(StringBuffer, int)}.
     *
     * @param start sequence to start with
     * @param distance -- hamming distance
     * @return the set produced by the {@code StringBuffer} overload
     */
    public static Set<String> generateAllNeighbors(String start, int distance) {
        final StringBuffer mutableCopy = new StringBuffer(start);
        return generateAllNeighbors(mutableCopy, distance);
    }

    /**
     * Alternative neighbor generation that supports the distances 0, 1 and 2 only.
     * <ul>
     * <li>distance 0: a set holding just {@code start}</li>
     * <li>distance 1: the result of {@code generateHammingDistanceOne(start)}</li>
     * <li>distance 2: the result of {@code generateHammingDistanceTwo(start)}</li>
     * <li>any other distance: an empty set</li>
     * </ul>
     * NOTE(review): the empty result for unsupported distances looks like a placeholder -- the
     * original code carried a "throw exception" reminder at that spot.
     *
     * @param start sequence to start with
     * @param distance the hamming distance
     * @return the set of sequences as described above
     */
    public static Set<String> generateAllNeighbors2(String start, int distance) {
        switch (distance) {
        case 0: {
            final Set<String> onlyStart = new HashSet<String>();
            onlyStart.add(start);
            return onlyStart;
        }
        case 1:
            return generateHammingDistanceOne(start);
        case 2:
            return generateHammingDistanceTwo(start);
        default:
            // unsupported distance: the caller receives an empty set
            return new HashSet<String>();
        }
    }

    /**
     * Generates every sequence that differs from {@code start} in exactly one position, using the
     * alphabet {@code a, t, g, c, n}. The start sequence itself is not part of the result.
     *
     * @param start the sequence to vary
     * @return the set of sequences at hamming distance one
     */
    private static Set<String> generateHammingDistanceOne(String start) {
        final char[] alphabet = { 'a', 't', 'g', 'c', 'n' };
        final Set<String> neighbors = new HashSet<String>();

        for (int position = 0; position < start.length(); position++) {
            final char current = start.charAt(position);
            for (int b = 0; b < alphabet.length; b++) {
                // replacing a base by itself would reproduce the start sequence
                if (alphabet[b] != current) {
                    // the set ignores values it already holds
                    neighbors.add(stringReplaceIth(start, position, alphabet[b]));
                }
            }
        }
        return neighbors;
    }

    /**
     * Generates the sequences obtained by rewriting two positions of {@code start} with every
     * combination of the alphabet {@code a, t, g, c, n}, for every pair of positions.
     * <p>
     * Because a position may be rewritten with the base it already holds, the result also contains
     * {@code start} itself and the sequences that differ in one position only. A sequence shorter than
     * two characters has no pair of positions and yields an empty set.
     * <p>
     * NOTE(review): {@code generateHammingDistanceOne} excludes the start sequence while this method
     * includes it -- confirm which of the two behaviors callers expect.
     *
     * @param start the sequence to vary
     * @return the set of generated sequences
     */
    private static Set<String> generateHammingDistanceTwo(String start) {
        final char[] bases = { 'a', 't', 'g', 'c', 'n' };
        // work on characters, not platform-encoded bytes, so indexes always match the string
        char[] work = start.toCharArray();
        Set<String> r = new HashSet<String>();

        for (int i = 0; i < work.length - 1; i++) {
            for (int j = i + 1; j < work.length; j++) {
                char savedI = work[i];
                char savedJ = work[j];
                for (char first : bases) {
                    for (char second : bases) {
                        work[i] = first;
                        work[j] = second;
                        // new String(...) copies the content; toString() on an array would only
                        // yield the array's identity string
                        r.add(new String(work));
                    }
                }
                // restore both positions before moving on to the next pair
                work[i] = savedI;
                work[j] = savedJ;
            }
        }
        return r;
    }

    /**
     * Tests whether every element of a vector is zero.
     *
     * @param vv the vector to inspect
     * @return true iff {@code vv} holds no non-zero element (also true for an empty vector)
     */
    public static boolean testvvEnd(int[] vv) {
        for (int value : vv) {
            if (value != 0) {
                return false;
            }
        }
        return true;
    }

    /**
     * Tests whether a change would leave at least one targeted position untouched. Slot {@code i} of
     * the change targets position {@code v[i]} of {@code start} and proposes the base with index
     * {@code vv[i]} in the alphabet {@code a, t, g, c}.
     *
     * @param start the sequence the change refers to
     * @param v the positions to change
     * @param vv for every position the index (0..3) of the proposed base
     * @return true iff some position already holds the base proposed for it
     */
    public static boolean vvcaniskip(String start, int[] v, int[] vv) {
        final char[] alphabet = { 'a', 't', 'g', 'c' };
        for (int slot = 0; slot < v.length; slot++) {
            final char current = start.charAt(v[slot]);
            final char proposed = alphabet[vv[slot]];
            if (current == proposed) {
                return true;
            }
        }
        return false;
    }

    /**
     * Applies a change to a sequence: for every slot {@code i}, position {@code v[i]} of the sequence
     * is overwritten with the base that has index {@code vv[i]} in the alphabet {@code a, t, g, c}.
     *
     * @param start the sequence to change; it is not modified
     * @param v the positions to overwrite
     * @param vv for every position the index (0..3) of the base to write
     * @return the changed copy of {@code start}
     */
    public static String applychangevectortovalues(String start, int[] v, int[] vv) {
        final char[] alphabet = { 'a', 't', 'g', 'c' };
        final StringBuilder edited = new StringBuilder(start);
        int slot = 0;
        while (slot < v.length) {
            edited.setCharAt(v[slot], alphabet[vv[slot]]);
            slot++;
        }
        return edited.toString();
    }

    /**
     * Generates every sequence that differs from {@code start} at exactly the positions listed in
     * {@code v}. All combinations of the bases {@code a, t, g, c} are tried for those positions; a
     * combination is skipped when it would write to some position the base that is already there.
     * <p>
     * An empty {@code v} changes nothing, so the result then holds just {@code start}.
     *
     * @param start the sequence to vary
     * @param v the positions to change
     * @return the set of generated sequences
     */
    public static Set<String> changeVectorApply(String start, int[] v) {
        Set<String> s = new HashSet<String>();

        if (v.length == 0) {
            // without this guard the counter below could never overflow and the loop would not end
            s.add(start);
            return s;
        }

        // vv works like an odometer with one base-4 digit per position; new int[] starts at all zeros
        int[] vv = new int[v.length];

        boolean overflow = false;
        while (!overflow) {
            if (!vvcaniskip(start, v, vv)) {
                s.add(applychangevectortovalues(start, v, vv));
            }
            // advance the odometer: reset trailing digits that are at their maximum ...
            int j = vv.length - 1;
            while (j >= 0 && vv[j] >= 3) {
                vv[j] = 0;
                j--;
            }
            if (j < 0) {
                // ... every digit was at its maximum: all combinations have been visited
                overflow = true;
            } else {
                vv[j] += 1;
            }
        }

        return s;
    }

    /**
     * Generates every sequence obtained by writing each combination of the bases {@code a, t, g, c}
     * to the positions listed in {@code v}. Unlike {@link #changeVectorApply(String, int[])},
     * combinations that leave a position unchanged are kept, so the result can include {@code start}
     * itself.
     * <p>
     * An empty {@code v} changes nothing, so the result then holds just {@code start}.
     *
     * @param start the sequence to vary
     * @param v the positions to overwrite
     * @return the set of generated sequences
     */
    public static Set<String> changeVectorApply2(String start, int[] v) {
        Set<String> s = new HashSet<String>();

        if (v.length == 0) {
            // without this guard the counter below could never overflow and the loop would not end
            s.add(start);
            return s;
        }

        // vv works like an odometer with one base-4 digit per position; new int[] starts at all zeros
        int[] vv = new int[v.length];

        boolean overflow = false;
        while (!overflow) {
            s.add(applychangevectortovalues(start, v, vv));
            // advance the odometer: reset trailing digits that are at their maximum ...
            int j = vv.length - 1;
            while (j >= 0 && vv[j] >= 3) {
                vv[j] = 0;
                j--;
            }
            if (j < 0) {
                // ... every digit was at its maximum: all combinations have been visited
                overflow = true;
            } else {
                vv[j] += 1;
            }
        }

        return s;
    }

    /**
     * Generates all sequences within a hamming distance of {@code start}: every choice of
     * {@code distance} positions is combined with every assignment of the bases {@code a, t, g, c} to
     * those positions (see {@link #changeVectorApply2(String, int[])}). The start sequence is part of
     * the result whenever it can be reproduced that way, and always for distance 0.
     * <p>
     * A distance larger than the sequence length is treated like the sequence length, because no more
     * positions than that can change.
     *
     * @param start the start sequence
     * @param distance the maximum number of positions that may differ; must not be negative
     * @return the set of generated sequences
     * @throws IllegalArgumentException if {@code distance} is negative
     */
    public static Set<String> generateAllNeighborsWithinDistance(String start, int distance) {
        if (distance < 0) {
            throw new IllegalArgumentException("distance must not be negative: " + distance);
        }

        Set<String> s = new HashSet<String>();

        // choosing more positions than the sequence has would index past its end
        final int changes = Math.min(distance, start.length());
        if (changes == 0) {
            s.add(start);
            return s;
        }

        // v is the change vector: the positions to modify, kept in strictly increasing order
        int[] v = new int[changes];
        for (int i = 0; i < changes; i++) {
            v[i] = i;
        }

        boolean overflow = false;
        while (!overflow) {
            s.addAll(changeVectorApply2(start, v));

            // advance to the next combination of positions: find the right-most position that can
            // still move right ...
            int j = v.length - 1;
            while (j >= 0 && v[j] >= start.length() - changes + j) {
                j--;
            }
            if (j < 0) {
                // ... none can: every combination has been visited
                overflow = true;
            } else {
                // ... move it and line up all later positions directly behind it
                v[j] += 1;
                for (int k = j + 1; k < v.length; k++) {
                    v[k] = v[k - 1] + 1;
                }
            }
        }
        return s;
    }

    /**
     * Generates the sequences that differ from {@code start} in exactly {@code distance} positions:
     * every choice of {@code distance} positions is combined with every assignment of the bases
     * {@code a, t, g, c} that changes all of them (see {@link #changeVectorApply(String, int[])}).
     * <p>
     * Distance 0 yields just {@code start}. A distance larger than the sequence length yields an
     * empty set, because a sequence cannot differ in more positions than it has.
     *
     * @param start the start sequence
     * @param distance the number of positions that must differ; must not be negative
     * @return the set of generated sequences
     * @throws IllegalArgumentException if {@code distance} is negative
     */
    public static Set<String> generateAllNeighborsAtDistance(String start, int distance) {
        if (distance < 0) {
            throw new IllegalArgumentException("distance must not be negative: " + distance);
        }

        Set<String> s = new HashSet<String>();

        if (distance == 0) {
            // without this guard the combination loop below would never terminate
            s.add(start);
            return s;
        }
        if (distance > start.length()) {
            // choosing more positions than the sequence has would index past its end
            return s;
        }

        // v is the change vector: the positions to modify, kept in strictly increasing order
        int[] v = new int[distance];
        for (int i = 0; i < distance; i++) {
            v[i] = i;
        }

        boolean overflow = false;
        while (!overflow) {
            s.addAll(changeVectorApply(start, v));

            // advance to the next combination of positions: find the right-most position that can
            // still move right ...
            int j = v.length - 1;
            while (j >= 0 && v[j] >= start.length() - distance + j) {
                j--;
            }
            if (j < 0) {
                // ... none can: every combination has been visited
                overflow = true;
            } else {
                // ... move it and line up all later positions directly behind it
                v[j] += 1;
                for (int k = j + 1; k < v.length; k++) {
                    v[k] = v[k - 1] + 1;
                }
            }
        }
        return s;
    }

    /**
     * Generates the set (unique) of all sequences that can be reached from the start sequence by
     * replacing at most {@code distance} positions with one of the bases {@code a, t, g, c}
     * (hamming distance). The start sequence itself is always part of the result.
     * <p>
     * The buffer is modified while the neighbors are generated, but holds its original content again
     * when the method returns.
     *
     * @param start the start sequence
     * @param distance the number of steps to travel; must not be negative
     * @return s set containing all neighbors including itself.
     * @throws IllegalArgumentException if {@code distance} is negative
     */
    public static Set<String> generateAllNeighbors(StringBuffer start, int distance) {
        if (distance < 0) {
            // a negative distance would otherwise recurse without ever reaching the base case
            throw new IllegalArgumentException("distance must not be negative: " + distance);
        }

        Set<String> s = new HashSet<String>();
        collectNeighbors(start, 0, distance, s);
        return s;
    }

    /**
     * Adds the current content of seq to out, then varies up to remaining further positions at or
     * after index from. Only positions to the right of the last change are touched, so every
     * neighbor is produced once.
     */
    private static void collectNeighbors(StringBuffer seq, int from, int remaining, Set<String> out) {
        out.add(seq.toString());
        if (remaining == 0) {
            return;
        }

        final char[] bases = { 'a', 't', 'g', 'c' };
        for (int i = from; i < seq.length(); i++) {
            char old = seq.charAt(i);
            for (char basePair : bases) {
                if (basePair == old) {
                    // writing the same base changes nothing; that sequence is already collected
                    continue;
                }
                seq.setCharAt(i, basePair);
                collectNeighbors(seq, i + 1, remaining - 1, out);
            }
            // undo the change before moving on to the next position
            seq.setCharAt(i, old);
        }
    }

    /**
     * Returns a copy of string s in which the i'th character (counting from 0) is replaced by
     * character c.
     *
     * @param s the string
     * @param i the index of the character to replace
     * @param c the replacement character
     * @return the new string
     * @throws IndexOutOfBoundsException if i does not address a character of s
     */
    public static String stringReplaceIth(String s, int i, char c) throws IndexOutOfBoundsException {
        final int length = s.length();
        if (i < length) {
            // everything before the i'th character, the replacement, everything after it
            final String head = s.substring(0, i);
            final String tail = s.substring(i + 1);
            return head + c + tail;
        }
        throw new IndexOutOfBoundsException("index " + i + " greater than length of " + s);
    }

    /**
     * Configures the log4j system for hadoop map jobs. The system property {@code log4j.properties}
     * names a resource on the system classpath; when that resource exists, log4j is configured from
     * it. When the property is not set or the resource cannot be found, a message is printed to
     * stderr and the log4j configuration is left untouched.
     */
    public static void configureLog4j() {
        String log4jConfigurationFile = System.getProperty("log4j.properties");

        System.out.println("configuring log4j using: " + log4jConfigurationFile);

        if (log4jConfigurationFile == null) {
            // looking up a null resource name would fail with a NullPointerException
            System.err.println("system property log4j.properties is not set ... skipping");
            return;
        }

        URL u = ClassLoader.getSystemResource(log4jConfigurationFile);
        if (u == null) {
            System.err.println("unable to find " + log4jConfigurationFile + " ... skipping");
        } else {
            // configure from the URL that was just located; passing the bare name would make log4j
            // look for a file relative to the working directory instead of the classpath resource
            PropertyConfigurator.configure(u);
        }
    }
}