bioLockJ.module.classifier.r16s.qiime.QiimePreprocessor.java Source code

Java tutorial

Introduction

Here is the source code for bioLockJ.module.classifier.r16s.qiime.QiimePreprocessor.java

Source

/**
 * @UNCC Fodor Lab
 * @author Michael Sioda
 * @email msioda@uncc.edu
 * @date June 19, 2017
 * @disclaimer    This code is free software; you can redistribute it and/or
 *             modify it under the terms of the GNU General Public License
 *             as published by the Free Software Foundation; either version 2
 *             of the License, or (at your option) any later version,
 *             provided that any use properly credits the author.
 *             This program is distributed in the hope that it will be useful,
 *             but WITHOUT ANY WARRANTY; without even the implied warranty of
 *             MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *             GNU General Public License for more details at http://www.gnu.org *
 */
package bioLockJ.module.classifier.r16s.qiime;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
import org.apache.commons.io.filefilter.TrueFileFilter;
import bioLockJ.Config;
import bioLockJ.Constants;
import bioLockJ.Log;
import bioLockJ.module.classifier.r16s.QiimeClassifier;
import bioLockJ.util.BashScriptUtil;
import bioLockJ.util.MetadataUtil;

/**
 * This class prepares QIIME input files:
 * 1. Reorder any metadata columns if required for QIIME mapping.
 * 2. Add columns to metadata if required for QIIME mapping.
 * 3. Decompress gzipped fasta/fastq files, if any.
 * 4. Convert FastQ files to FastA format, if any.
 */
public class QiimePreprocessor extends QiimeClassifier {

    /**
     * Verify module dependencies: run the parent class checks first, then require the
     * {@code EXE_AWK} property, which this class reads when building its awk commands.
     *
     * @throws Exception propagated from the parent check or from {@code Config.requireString}
     *         (presumably raised when the property is missing -- confirm in Config)
     */
    @Override
    public void checkDependencies() throws Exception {
        super.checkDependencies();
        Config.requireString(EXE_AWK);
    }

    /**
     * Register the number of reads per sample, then build the bash scripts for this module.
     * The script content comes from {@link #buildScript(List)}; the batch size is read from
     * the {@code Config.SCRIPT_BATCH_SIZE} property.
     *
     * @throws Exception propagated from any of the called helpers
     */
    @Override
    public void executeProjectFile() throws Exception {
        registerNumReadsPerSample(getInputFiles(), getTempDir());

        // Build the script lines before reading the batch size, matching the evaluation order
        // of the arguments in a single nested call.
        final List<List<String>> scriptLines = buildScript(getInputFiles());
        BashScriptUtil.buildScripts(this, scriptLines, failFiles,
                Config.requirePositiveInteger(Config.SCRIPT_BATCH_SIZE));
    }

    /**
     * Build the bash script content: one list of lines per input file, followed by one final
     * list that creates the QIIME corrected mapping file.
     * <p>
     * Per input file:
     * <ul>
     * <li>gzipped files are decompressed (to the temp dir for FastQ input, since a conversion
     * follows; directly to the output dir otherwise);</li>
     * <li>FastQ files are converted to FastA in the output dir;</li>
     * <li>uncompressed FastA files are copied to the output dir.</li>
     * </ul>
     * Every input file, and finally the metadata file, is also added to {@code failFiles}.
     *
     * @param files input sequence files
     * @return script lines, grouped per input file, with the mapping commands as the last group
     * @throws Exception propagated from configuration lookups or helper methods
     */
    @Override
    protected List<List<String>> buildScript(final List<File> files) throws Exception {
        final List<List<String>> data = new ArrayList<>();
        final String tempDir = getTempDir().getAbsolutePath() + File.separator;
        final String outDir = getOutputDir().getAbsolutePath() + File.separator;
        final String baseExt = "." + getInputSequenceType();

        for (final File f : files) {
            final List<String> lines = new ArrayList<>();
            final String fileId = getSampleId(f.getName());
            final String zipExe = getZipExe(f);

            // Derive the extension from the immutable base extension for EVERY file.
            // Prepending the read suffix to a variable shared across iterations would
            // accumulate suffixes (e.g. "_R2_R1.fastq") from the second file onwards.
            String ext = baseExt;
            if (Config.getBoolean(Config.INPUT_PAIRED_READS)) {
                if (isForwardRead(f.getName())) {
                    ext = Config.requireString(Config.INPUT_FORWARD_READ_SUFFIX) + baseExt;
                } else {
                    ext = Config.requireString(Config.INPUT_REVERSE_READ_SUFFIX) + baseExt;
                }
            }

            String filePath = f.getAbsolutePath();

            if (zipExe != null) {
                // FastQ still needs converting, so unzip to temp; FastA is final, so unzip to output.
                filePath = (isFastQ() ? tempDir : outDir) + fileId + ext;
                lines.add(unzip(zipExe, f, filePath));
            }

            if (isFastQ()) {
                lines.add(convert2fastA(filePath, fileId, outDir));
            }

            if ((zipExe == null) && isFastA()) {
                lines.add(copyToOutputDir(filePath, fileId + ext));
            }

            failFiles.add(f);
            data.add(lines);
        }

        data.add(createQiimeCorrectedMapping());
        failFiles.add(new File(MetadataUtil.getMetadata().getAbsolutePath()));

        return data;
    }

    /**
     * Initialize the module input files. When no directory is supplied, the first directory
     * returned by {@code getInputDirs} is used instead. All files in the chosen directory are
     * accepted (no name filtering).
     *
     * @param dir input directory, or {@code null} to fall back to {@code getInputDirs}
     * @throws Exception propagated from {@code getInputDirs} or {@code setModuleInput}
     */
    @Override
    protected void initInputFiles(File dir) throws Exception {
        // Fall back to the first directory from getInputDirs when the caller gave none.
        final File inputDir = (dir != null) ? dir : getInputDirs(dir).get(0);
        setModuleInput(inputDir, TrueFileFilter.INSTANCE, null);
    }

    /**
     * Build the bash line that converts a FastQ file to FastA with awk. Of every 4-line
     * record, line 1 is printed as a {@code >} header (its first character dropped) and
     * line 2 is printed unchanged; lines 3 and 4 are discarded.
     *
     * @param filePath path of the FastQ file to read
     * @param fileId sample ID used as the output file name (without extension)
     * @param outDir output directory path, expected to end with a file separator
     * @return bash command line writing {@code outDir + fileId + "." + Constants.FASTA}
     * @throws Exception propagated from {@code Config.requireString}
     */
    private String convert2fastA(final String filePath, final String fileId, final String outDir) throws Exception {
        final String awkExe = Config.requireString(EXE_AWK);

        // Constants.RETURN is spliced in right after a literal backslash inside the printf format.
        final String awkProgram = "'{if(NR%4==1) {printf(\">%s\\" + Constants.RETURN
                + "\",substr($0,2));} else if(NR%4==2) print;}'";
        final String fastaPath = outDir + fileId + "." + Constants.FASTA;

        return "cat " + filePath + " | " + awkExe + " " + awkProgram + " > " + fastaPath;
    }

    /**
     * Build the bash line that copies a file into the module output directory.
     *
     * @param source path of the file to copy
     * @param target file name to use inside the output directory
     * @return {@code cp} command line
     * @throws Exception propagated from {@code getOutputDir}
     */
    private String copyToOutputDir(final String source, final String target) throws Exception {
        final String destination = getOutputDir().getAbsolutePath() + File.separator + target;
        return "cp " + source + " " + destination;
    }

    /**
     * Assemble the bash lines that produce the QIIME corrected mapping file, in this order:
     * the QIIME config print command, an optional awk line that reorders metadata columns,
     * the line that sorts the mapping, and the validate_mapping_file.py call.
     * <p>
     * As a side effect the QIIME mapping file is written immediately (not by the script)
     * via {@link #createQiimeMapping()}.
     *
     * @return bash lines for the final script
     * @throws Exception propagated from the helper methods
     */
    private List<String> createQiimeCorrectedMapping() throws Exception {
        Log.out.info("Create QIIME Specific Mapping File");

        final List<String> scriptLines = new ArrayList<>();
        scriptLines.add(SCRIPT_PRINT_CONFIG);

        // Order matters: the mapping must exist before the column check, and the column check
        // may repoint the metadata file that the sort line reads.
        createQiimeMapping();
        final String reorderCommand = getAlignedMetadataColumns();
        if (reorderCommand != null) {
            Log.out.info("Add line to BASH script to arrange QIIME columns in metadata.");
            scriptLines.add(reorderCommand);
        }

        scriptLines.add(sortMetadata());
        scriptLines.add(validateMapping());
        return scriptLines;
    }

    /**
     * Create the QIIME mapping from the current metadata file, writing it to
     * temp/QIIME_MAPPING and making that file the current metadata file.
     * <p>
     * The first header cell is replaced by {@code QIIME_ID}. Required QIIME columns that are
     * missing are added: BarcodeSequence and LinkerPrimerSequence (empty values) right after
     * the ID column, then the demultiplex column ({@code <id>.fasta}) and Description
     * ({@code QIIME_COMMENT}) at the end of each row.
     * <p>
     * Both streams are closed via try-with-resources, so neither leaks if opening or closing
     * the other fails. A failure is logged and then rethrown rather than swallowed, so callers
     * never continue with a partially written mapping file.
     * <p>
     * NOTE(review): rows are split with {@link StringTokenizer}, which collapses consecutive
     * delimiters, so empty cells in the source metadata are dropped -- confirm the metadata
     * never contains empty cells at this point.
     *
     * @throws Exception if the metadata cannot be read or the mapping cannot be written
     */
    private void createQiimeMapping() throws Exception {
        // Capture the source path BEFORE the metadata pointer is switched to the new mapping file.
        final String sourcePath = MetadataUtil.getMetadata().getAbsolutePath();

        try (BufferedReader reader = new BufferedReader(new FileReader(sourcePath))) {
            MetadataUtil.setMetadata(new File(getTempDir().getAbsolutePath() + File.separator + QIIME_MAPPING));

            try (BufferedWriter writer = new BufferedWriter(new FileWriter(MetadataUtil.getMetadata()))) {
                final boolean hasQm1 = MetadataUtil.getAttributeNames().contains(BARCODE_SEQUENCE);
                final boolean hasQm2 = MetadataUtil.getAttributeNames().contains(LINKER_PRIMER_SEQUENCE);
                final boolean hasQm3 = MetadataUtil.getAttributeNames().contains(DEMUX_COLUMN);
                final boolean hasQm4 = MetadataUtil.getAttributeNames().contains(DESCRIPTION);

                boolean isHeaderRow = true;
                for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                    final StringTokenizer st = new StringTokenizer(line, Constants.TAB_DELIM);
                    boolean firstColumn = true;
                    String id = null;

                    while (st.hasMoreTokens()) {
                        final String next = st.nextToken();
                        if (firstColumn) {
                            firstColumn = false;
                            if (isHeaderRow) {
                                // The original ID header is replaced by the QIIME ID header.
                                writer.write(QIIME_ID + Constants.TAB_DELIM);

                                if (!hasQm1) {
                                    writer.write(BARCODE_SEQUENCE + Constants.TAB_DELIM);
                                }

                                if (!hasQm2) {
                                    writer.write(LINKER_PRIMER_SEQUENCE + Constants.TAB_DELIM);
                                }
                            } else {
                                id = next;
                                writer.write(id + Constants.TAB_DELIM);

                                // Added leading columns get empty values.
                                if (!hasQm1) {
                                    writer.write(Constants.TAB_DELIM);
                                }

                                if (!hasQm2) {
                                    writer.write(Constants.TAB_DELIM);
                                }
                            }
                        } else {
                            writer.write(next + Constants.TAB_DELIM);
                        }
                    }

                    if (isHeaderRow) {
                        if (!hasQm3) {
                            writer.write(DEMUX_COLUMN + Constants.TAB_DELIM);
                        }

                        if (!hasQm4) {
                            writer.write(DESCRIPTION);
                        }

                        isHeaderRow = false;
                    } else {
                        if (!hasQm3) {
                            writer.write(id + "." + Constants.FASTA + Constants.TAB_DELIM);
                        }

                        if (!hasQm4) {
                            writer.write(QIIME_COMMENT);
                        }
                    }

                    writer.write(Constants.RETURN);
                }
            }
        } catch (final Exception ex) {
            Log.out.error("Error occcurred creating QIIME mapping file: ", ex);
            // Rethrow: continuing would run the pipeline against a truncated mapping file.
            throw ex;
        }

        MetadataUtil.refresh();
    }

    /**
     * If QIIME required fields exist in the metadata but are not in the proper position,
     * build the awk line for the bash script that rewrites the mapping with the columns in
     * the order QIIME requires:
     * --> BarcodeSequence = col 2
     * --> LinkerPrimerSequence = col 3
     * --> InputFileName = 2nd to last col
     * --> Description = last col
     * All other columns keep their relative order between the leading and trailing columns.
     * <p>
     * When a reorder is needed, the current metadata file is switched to the awk output file
     * (temp/ORDERED_MAPPING) as a side effect.
     * <p>
     * The awk output field separator is passed as {@code -v OFS=...}; awk rejects any other
     * spelling of that variable name. Trailing columns are appended only after the middle
     * columns, so each column is printed exactly once.
     *
     * @return String - line for bash script, or {@code null} if the columns are already ordered
     * @throws Exception propagated from configuration or metadata lookups
     */
    private String getAlignedMetadataColumns() throws Exception {
        final StringBuffer sb = new StringBuffer();
        sb.append(Config.requireString(EXE_AWK) + " -F'\\" + Constants.TAB_DELIM + "' -v OFS=\"\\"
                + Constants.TAB_DELIM + "\" '{ print $1");
        final List<String> cols = MetadataUtil.getAttributes(QIIME_ID);

        final boolean hasQm1 = MetadataUtil.getAttributeNames().contains(BARCODE_SEQUENCE);
        final boolean hasQm2 = MetadataUtil.getAttributeNames().contains(LINKER_PRIMER_SEQUENCE);
        final boolean hasQm3 = MetadataUtil.getAttributeNames().contains(DEMUX_COLUMN);
        final boolean hasQm4 = MetadataUtil.getAttributeNames().contains(DESCRIPTION);

        final int numCols = cols.size();

        // awk field number of each QIIME column: $1 is the sample ID, so attribute i is $(i + 2).
        // Any value below 2 means the column is absent.
        final int barcodeCol = hasQm1 ? cols.indexOf(BARCODE_SEQUENCE) + 2 : 0;
        final int linkerCol = hasQm2 ? cols.indexOf(LINKER_PRIMER_SEQUENCE) + 2 : 0;
        final int demuxCol = hasQm3 ? cols.indexOf(DEMUX_COLUMN) + 2 : 0;
        final int descCol = hasQm4 ? cols.indexOf(DESCRIPTION) + 2 : 0;

        final List<Integer> colsToSkip = new ArrayList<>();
        boolean reordered = false;

        // Leading columns: emitted right after $1, barcode first, then linker primer.
        int target = 2;
        if (barcodeCol >= 2) {
            if (barcodeCol == target) {
                sb.append(", $" + barcodeCol);
                colsToSkip.add(barcodeCol);
            } else {
                skipIndex(BARCODE_SEQUENCE, cols, sb, colsToSkip, "column #" + target);
                reordered = true;
            }
            target++;
        }
        if (linkerCol >= 2) {
            if (linkerCol == target) {
                sb.append(", $" + linkerCol);
                colsToSkip.add(linkerCol);
            } else {
                skipIndex(LINKER_PRIMER_SEQUENCE, cols, sb, colsToSkip, "column #" + target);
                reordered = true;
            }
        }

        // Middle columns: everything that is neither a leading nor a trailing QIIME column.
        for (int i = 2; i < (numCols + 2); i++) {
            if (!colsToSkip.contains(i) && (i != demuxCol) && (i != descCol)) {
                sb.append(", $" + i);
            }
        }

        // Trailing columns: emitted last. skipIndex appends to sb immediately, so it must only
        // be called here, after the middle columns, or the column would land in the wrong spot.
        final int descTarget = numCols + 1;
        final int demuxTarget = (descCol >= 2) ? numCols : (numCols + 1);
        if (demuxCol >= 2) {
            if (demuxCol == demuxTarget) {
                sb.append(", $" + demuxCol);
            } else {
                skipIndex(DEMUX_COLUMN, cols, sb, colsToSkip, " 2nd to last column");
                reordered = true;
            }
        }
        if (descCol >= 2) {
            if (descCol == descTarget) {
                sb.append(", $" + descCol);
            } else {
                skipIndex(DESCRIPTION, cols, sb, colsToSkip, " last column");
                reordered = true;
            }
        }

        if (!reordered) {
            Log.out.info("Metadata does not contain QIIME specific fields to reorder.");
            return null;
        }

        Log.out.debug("colsToSkip=" + colsToSkip);

        final String path = getTempDir().getAbsolutePath() + File.separator + ORDERED_MAPPING;
        sb.append(" }' " + MetadataUtil.getMetadata().getAbsolutePath() + " > " + path);
        MetadataUtil.setMetadata(new File(path));

        return sb.toString();
    }

    /**
     * Return the path of the "mapping" directory under the output dir, creating the directory
     * if it does not exist yet. It is passed as the {@code -o} argument of
     * validate_mapping_file.py.
     *
     * @return absolute path of the mapping directory, ending with a file separator
     * @throws Exception propagated from {@code getOutputDir}
     */
    private String getMappingDir() throws Exception {
        final String mappingPath = getOutputDir().getAbsolutePath() + File.separator + "mapping";
        final File mappingDir = new File(mappingPath);

        if (!mappingDir.exists()) {
            mappingDir.mkdirs();
        }

        return mappingDir.getAbsolutePath() + File.separator;
    }

    /**
     * Return the path of the sorted mapping file ({@code SORTED_MAP}) inside the temp dir.
     *
     * @return absolute path of the sorted mapping file
     * @throws Exception propagated from {@code getTempDir}
     */
    private String getSortedMap() throws Exception {
        final String tempPath = getTempDir().getAbsolutePath();
        return tempPath + File.separator + SORTED_MAP;
    }

    /**
     * Return the gzip executable from the prop file when the given file name ends with
     * ".gz" (case-insensitive); otherwise return {@code null} to signal that no
     * decompression is needed.
     *
     * @param f input file to inspect
     * @return value of the {@code EXE_GZIP} property, or {@code null} for non-gzipped files
     * @throws Exception propagated from {@code Config.requireString}
     */
    private String getZipExe(final File f) throws Exception {
        final boolean gzipped = f.getName().toLowerCase().endsWith(".gz");
        // The property is only looked up for gzipped input.
        return gzipped ? Config.requireString(EXE_GZIP) : null;
    }

    /**
     * Register a metadata column that must be moved: append its awk field reference to the
     * print list being built, record the field number so it is skipped when the remaining
     * columns are added, and log where the column was found.
     *
     * @param field name of the metadata column to move
     * @param cols metadata column names; position 0 corresponds to awk field $2
     * @param sb awk command under construction; {@code ", $<n>"} is appended
     * @param colsToSkip collects the awk field numbers of moved columns
     * @param colMsg description of the required position, used only in the log message
     * @return awk field number of the column ({@code cols.indexOf(field) + 2})
     */
    private int skipIndex(final String field, final List<String> cols, final StringBuffer sb,
            final List<Integer> colsToSkip, final String colMsg) {
        // +2: awk fields are 1-based and field $1 precedes the entries of cols.
        final int awkColumn = cols.indexOf(field) + 2;
        colsToSkip.add(awkColumn);
        sb.append(", $").append(awkColumn);
        Log.out.info(field + " found in column #" + awkColumn + " but QIIME requires " + colMsg);
        return awkColumn;
    }

    /**
     * Build the bash line that sorts the current metadata file: the header line is kept
     * first, the remaining lines are piped through {@code sort -n}, and the result is
     * written to the sorted mapping file.
     *
     * @return bash command line
     * @throws Exception propagated from the metadata or temp dir lookups
     */
    private String sortMetadata() throws Exception {
        final String map = MetadataUtil.getMetadata().getAbsolutePath();

        final StringBuilder command = new StringBuilder("(head -n 1 ");
        command.append(map).append(" && tail -n +2 ").append(map);
        command.append(" | sort -n) > ").append(getSortedMap());
        return command.toString();
    }

    /**
     * Build the bash line that decompresses a file to stdout ({@code -cd}) and redirects
     * the output to the given path, leaving the source file in place.
     *
     * @param zipExe decompression executable
     * @param f compressed source file
     * @param filePath path the decompressed content is written to
     * @return bash command line
     */
    private String unzip(final String zipExe, final File f, final String filePath) {
        final StringBuilder command = new StringBuilder();
        command.append(zipExe).append(" -cd ").append(f.getAbsolutePath());
        command.append(" > ").append(filePath);
        return command.toString();
    }

    /**
     * Build the bash line that calls validate_mapping_file.py on the sorted mapping file,
     * writing its output to the mapping directory and passing the demultiplex column
     * via {@code -j}.
     *
     * @return bash command line
     * @throws Exception propagated from the path helper methods
     */
    private String validateMapping() throws Exception {
        // Resolve the sorted map first, then the mapping dir (which may create the directory).
        final String sortedMap = getSortedMap();
        final String mappingDir = getMappingDir();
        return SCRIPT_VALIDATE_MAPPING + sortedMap + " -o " + mappingDir + " -j " + DEMUX_COLUMN;
    }

    // Files collected by buildScript (each input file plus the metadata file) and handed to
    // BashScriptUtil.buildScripts. NOTE(review): presumably used for failure reporting -- confirm.
    private final List<File> failFiles = new ArrayList<>();
    // Property name of the gzip executable, read in getZipExe.
    private static final String EXE_GZIP = "exe.gzip";
    // File name (in the temp dir) of the mapping written by the awk column-reorder command.
    private static final String ORDERED_MAPPING = "orderedMapping.tsv";
    // Value written to the Description column when the metadata has no such column.
    private static final String QIIME_COMMENT = "BioLockJ Generated Mapping";
    // Header written for the first (sample ID) column of the QIIME mapping.
    private static final String QIIME_ID = "#SampleID";
    // First line of the mapping script.
    private static final String SCRIPT_PRINT_CONFIG = "print_qiime_config.py -t";
    // Command prefix for mapping validation; the sorted mapping path is appended directly after it.
    private static final String SCRIPT_VALIDATE_MAPPING = "validate_mapping_file.py -p -b -m ";
    // File name (in the temp dir) of the sorted mapping produced by sortMetadata.
    private static final String SORTED_MAP = "sortedMapping.txt";
}