bioLockJ.util.MetadataUtil.java Source code

Java tutorial

Introduction

Here is the source code for bioLockJ.util.MetadataUtil.java

Source

/**
 * @UNCC Fodor Lab
 * @author Michael Sioda
 * @email msioda@uncc.edu
 * @date Feb 9, 2017
 * @disclaimer    This code is free software; you can redistribute it and/or
 *             modify it under the terms of the GNU General Public License
 *             as published by the Free Software Foundation; either version 2
 *             of the License, or (at your option) any later version,
 *             provided that any use properly credits the author.
 *             This program is distributed in the hope that it will be useful,
 *             but WITHOUT ANY WARRANTY; without even the implied warranty of
 *             MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *             GNU General Public License for more details at http://www.gnu.org *
 */
package bioLockJ.util;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.TreeSet;
import org.apache.commons.lang.math.NumberUtils;
import bioLockJ.Config;
import bioLockJ.Constants;
import bioLockJ.Log;
import bioLockJ.Module;
import bioLockJ.util.r.RScript;

/**
 * The metadataUtil helps access and modify data in the metadata file.
 */
public class MetadataUtil {
    /**
     * When a new column is added to metadata, this method will add the column, with all row values.
     * The updated file is output to the "outputDir" to be picked up by the next executor.
     * @param name
     * @param map
     * @param fileDir
     * @throws Exception
     */
    public static void addNumericColumn(final String name, final Map<String, Integer> map, final File fileDir)
            throws Exception {
        if (getAttributeNames().contains(name)) {
            Log.out.warn("Metadata already contains column [" + name + "] so this data will not be added to "
                    + metadataFile.getName());
            return;
        }

        numericFields.add(name);

        final BufferedReader reader = new BufferedReader(new FileReader(metadataFile));
        metadataFile = new File(fileDir.getAbsolutePath() + File.separator + metadataFile.getName());
        final BufferedWriter writer = new BufferedWriter(new FileWriter(metadataFile));

        Log.out.info("Adding new attribute [" + name + "] to metadata");
        boolean isHeaderRow = true;
        try {
            final Set<String> keys = map.keySet();
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                final StringTokenizer st = new StringTokenizer(line, Constants.TAB_DELIM);
                if (isHeaderRow) {
                    isHeaderRow = false;
                    line += Constants.TAB_DELIM + name;
                } else {
                    final String id = st.nextToken();
                    if (keys.contains(id)) {
                        line += Constants.TAB_DELIM + map.get(id);
                    } else {
                        line += Constants.TAB_DELIM + Config.requireString(Config.INPUT_NULL_VALUE);
                    }
                }

                writer.write(line + Constants.RETURN);
            }
        } catch (final Exception ex) {
            Log.out.error("Error occurred updating metadata with new attribute [" + name + "]", ex);
        } finally {
            reader.close();
            writer.close();
            refresh();
        }
    }

    public static void addNumericField(final String field) {
        numericFields.add(field);
    }

    /**
     * Many users modify metadata spreadsheets in Excel.  If the first cell value starts with # symbol,
     * Excel adds a ZERO WIDTH NO-BREAK space as an invisible character.  Here we strip this value;
     * See http://www.fileformat.info/info/unicode/char/feff/index.htm
     * @param id
     * @return
     */
    public static String formatMetaId(String id) {
        final char c = id.trim().toCharArray()[0];
        if (c == 65279) {

            Log.out.warn(
                    "MetadataUtil found row ID starting with ASCII 65279 - this invalid invisble character has been removed!");

            final char[] chars = id.trim().toCharArray();
            for (int i = 0; i < chars.length; i++) {
                Log.out.debug("ID[" + i + "] = " + chars[i]);
            }

            id = id.substring(1);
            Log.out.info("Updated ID = " + id);
        }
        return id;
    }

    /**
     * Get a list of all attribute names from the metadata file column names.
     * @return
     */
    public static List<String> getAttributeNames() {
        return metadataMap.get(metaId);
    }

    /**
     * Get attribute values from metadata (get row for a given ID).
     * @param id
     * @return
     * @throws Exception
     */
    public static List<String> getAttributes(final String id) throws Exception {
        try {
            return metadataMap.get(id);
        } catch (final Exception ex) {
            throw new Exception("Invalid ID: " + id);
        }

    }

    public static Set<String> getBinaryFields() {
        return binaryFields;
    }

    public static List<String> getCleanVals(final Collection<String> vals) throws Exception {
        final List<String> formattedValues = new ArrayList<>();
        for (final String val : vals) {
            formattedValues.add(rScriptFormat(val));
        }
        return formattedValues;
    }

    public static String getID() {
        return metaId;
    }

    public static File getMetadata() {
        return metadataFile;
    }

    /**
     * Get the first column from the metadata file.
     * @return
     */
    public static Set<String> getMetaFileFirstColValues() {
        return metadataMap.keySet();
    }

    public static Set<String> getNominalFields() {
        return nominalFields;
    }

    public static Set<String> getNumericFields() {
        return numericFields;
    }

    /**
     * Loading new metadata will set the static field values and populate the attributeMap.
     * @param metadata
     * @throws Exception
     */
    public static void refresh() throws Exception {
        metadataMap.clear();
        attributeMap.clear();
        processMetadata(processFile());
        setRscriptFields();
        populateAttributeMap();
        validateConfig();

        Module.ignoreFile(MetadataUtil.getMetadata().getName());

        Log.out.info("Metadata Attributes: " + getAttributeNames());
        Log.out.info("Metadata 1st Column (Header ID name & Sample IDs): " + getMetaFileFirstColValues());
        Log.out.info(Constants.LOG_SPACER);
        Log.out.info("New Metadata file loaded: " + metadataFile.getAbsolutePath());

    }

    public static void setMetadata(final File f) {
        if (f != null) {
            Log.out.info("===> Found new metadata = " + f.getAbsolutePath());
        }
        metadataFile = f;
    }

    /**
     * The attributeMap maps attributes to their set of values.  Only done for metadata that will
     * be used in the R-script.
     *
     * @throws Exception
     */
    private static void populateAttributeMap() throws Exception {
        //Log.out.warn( "===> calling populateAttributeMap() for metaId = " + metaId );
        //Log.out.warn( "===> metadataMap.get( metaId ) = " + metadataMap.get( metaId ) );
        Log.out.info("===> All rScriptFields = " + rScriptFields);

        final Map<String, Integer> colIndexMap = new HashMap<>();
        int j = 0;
        for (final String att : metadataMap.get(metaId)) {
            if (rScriptFields.contains(att)) {
                Log.out.info("Initialize Attribute Map | attribute(" + att + ") = index(" + j + ")");
                colIndexMap.put(att, j);
                attributeMap.put(att, new HashSet<>());
            }
            j++;
        }

        for (final String att : colIndexMap.keySet()) {
            final int target = colIndexMap.get(att);
            for (final String key : metadataMap.keySet()) {

                if (!key.equals(metaId)) {
                    //               Log.out.warn( "===> metadataMap KEY DOES NOT MATCH metaId" );
                    //               Log.out.warn( "===> metadataMap CHECK IF --> key( " + key + " ) = metaId( " + metaId + " )" );
                    //               Log.out.warn(
                    //                     "===> key LENGTH( " + key.length() + " ) = metaId LENGTH( " + metaId.length() + " )" );
                    int i = 0;
                    final List<String> row = metadataMap.get(key);
                    for (final String value : row) {
                        if ((i++ == target) && !value.equals(Config.requireString(Config.INPUT_NULL_VALUE))) {
                            Log.out.debug(
                                    "===> Add (" + value + ") to existing attributeMap " + attributeMap.get(att));
                            attributeMap.get(att).add(value);
                        }
                    }
                }
            }
        }

        for (final String key : attributeMap.keySet()) {
            final Set<String> vals = attributeMap.get(key);
            Log.out.info("Attribute Map (" + key + ") = " + vals);
            if (nominalFields.contains(key) && (vals.size() < 2)) {
                throw new Exception("Property " + RScript.R_NOMINAL_DATA + " contains attribute [" + key
                        + "] with only " + vals.size()
                        + " values in the metadata file.  Statistical tests require at least 2 unique options.");
            } else if (nominalFields.contains(key) && (vals.size() == 2)) {

                binaryFields.add(key);
                nominalFields.remove(key);
            } else if (numericFields.contains(key)) {
                for (final String val : vals) {
                    if (!NumberUtils.isNumber(val)) {
                        throw new Exception("Property " + RScript.R_NUMERIC_DATA + " contains attribute [" + key
                                + "] with non-numeric data [" + val + "]");
                    }
                }
            }
        }
    }

    /**
     * Process a file by getting clean values for each cell in the spreadsheet.
     * @param file
     * @return
     */
    private static List<List<String>> processFile() throws Exception {
        final List<List<String>> data = new ArrayList<>();
        final BufferedReader reader = new BufferedReader(new FileReader(metadataFile));
        for (String line = reader.readLine(); line != null; line = reader.readLine()) {
            final ArrayList<String> record = new ArrayList<>();
            final String[] cells = line.split(Constants.TAB_DELIM, -1);
            for (final String cell : cells) {
                record.add(rScriptFormat(cell));
            }
            data.add(record);
        }
        reader.close();
        return data;
    }

    /**
     * Process metadata & output some values to output file for verification.
     * @param data
     */
    private static void processMetadata(final List<List<String>> data) {
        final int digits = new Integer(data.size()).toString().length();
        int rowNum = 0;
        final Iterator<List<String>> rows = data.iterator();
        while (rows.hasNext()) {
            final List<String> row = trim(rows.next());
            String id = row.get(0);
            row.remove(0);

            if (rowNum == 0) {
                id = formatMetaId(id);
                metaId = id;
                Log.out.info("Loading METADATA [ID = " + metaId + "] with " + row.size() + " attribute columns");
            }

            if (rowNum < 2) {
                Log.out.info("Example Metadata Row[" + String.format("%0" + digits + "d", rowNum) + "]: Key(" + id
                        + "): " + row);
            }

            metadataMap.put(id, row);
            rowNum++;
        }
    }

    /**
     * Clean values avoid commas, and replace spaces with underscores.
     * @param val
     * @return
     */
    private static String rScriptFormat(String val) throws Exception {
        if ((val == null) || val.trim().isEmpty()) {
            return Config.requireString(Config.INPUT_NULL_VALUE);
        }

        final int index = val.indexOf(Config.requireString(Config.INPUT_COMMENT));
        if (index > -1) {
            val = val.substring(0, val.indexOf(Config.requireString(Config.INPUT_COMMENT)));
        }

        return val.trim();

    }

    /**
     * Set rScriptFields variable =  all metadata attributes referenced in R Script:
     * Uses 4 config file props: nominalFields, numericFields, filterNaAttributes, & filterAttributes
     */
    private static void setRscriptFields() throws Exception {
        nominalFields.addAll(Config.getSet(RScript.R_NOMINAL_DATA));
        numericFields.addAll(Config.getSet(RScript.R_NUMERIC_DATA));

        rScriptFields.addAll(nominalFields);
        rScriptFields.addAll(numericFields);
        rScriptFields.addAll(Config.getSet(RScript.R_FILTER_NA_ATTRIBUTES));
        rScriptFields.addAll(Config.getSet(RScript.R_FILTER_ATTRIBUTES));

        for (final String att : metadataMap.get(metaId)) {
            if (att.equals(Constants.NUM_READS) && Config.getBoolean(Config.REPORT_NUM_READS)) {
                rScriptFields.add(Constants.NUM_READS);
            } else if (att.equals(Constants.NUM_HITS) && Config.getBoolean(Config.REPORT_NUM_HITS)) {
                rScriptFields.add(Constants.NUM_HITS);
            }
        }
    }

    /**
     * Trim all values in row.
     * @param row
     * @return
     */
    private static List<String> trim(final List<String> row) {
        final List<String> formattedRow = new ArrayList<>();
        final Iterator<String> it = row.iterator();
        while (it.hasNext()) {
            formattedRow.add(it.next().trim());
        }

        return formattedRow;
    }

    /**
     * Verify any fields to be used in R scripts.
     * @throws Exception
     */
    private static void validateConfig() throws Exception {
        if (Config.requireString(Config.INPUT_COMMENT).length() > 1) {
            throw new Exception(Config.INPUT_COMMENT + " must be a single character with length = 1");
        }

        if (Config.requireString(Config.INPUT_NULL_VALUE).equals(Config.requireString(Config.INPUT_COMMENT))) {
            throw new Exception("BioLockJ requires unique values for config properties: " + Config.INPUT_NULL_VALUE
                    + " & " + Config.INPUT_COMMENT);
        }

        for (final String field : rScriptFields) {
            if (!MetadataUtil.getAttributeNames().contains(field)) {
                throw new Exception(field + " is not found in metadata: " + metadataFile.getAbsolutePath());
            }
        }
    }

    private static final Map<String, Set<String>> attributeMap = new HashMap<>();
    private static final Set<String> binaryFields = new TreeSet<>();
    private static File metadataFile = null;
    private static final Map<String, List<String>> metadataMap = new HashMap<>();
    private static String metaId = null;
    private static final Set<String> nominalFields = new TreeSet<>();
    private static final Set<String> numericFields = new TreeSet<>();
    private static final Set<String> rScriptFields = new HashSet<>();
}