edu.cornell.med.icb.io.TsvToFromMap.java Source code

Java tutorial

Introduction

Here is the source code for edu.cornell.med.icb.io.TsvToFromMap.java

Source

/*
 * Copyright (C) 2008-2010 Institute for Computational Biomedicine,
 *                         Weill Medical College of Cornell University
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package edu.cornell.med.icb.io;

import edu.cornell.med.icb.iterators.TextFileLineIterator;
import edu.cornell.med.icb.maps.LinkedHashToMultiTypeMap;
import org.apache.commons.lang.StringUtils;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

/**
 * Generic way to read/write TSV files given the column headers. This class uses
 * the headers and reads the data into a Map or given a map can write the data to
 * a TSV file.
 * TODO: Make it less case sensitive about case, etc. when reading/writing data?
 * @author Kevin Dorff
 */
public class TsvToFromMap {
    /** The column headers for this TSV file. */
    private final List<String> columnHeaders;

    /** The number of column headers. */
    private int numColumnHeaders;

    /**
     * If the header has been written, so it doesn't get written multiple times unless
     * you really want it to be written multiple times.
     */
    private boolean headerWritten;

    /**
     * If true, readDataToMap() will be lenient with respect to column count.
     */
    private boolean lenientColumnCount;

    /**
     * Create a new TsvToFromMap object with no columns, it is expected that the columns
     * will be added later with addColumn(...).
     */
    public TsvToFromMap() {
        this.columnHeaders = new LinkedList<String>();
        this.numColumnHeaders = columnHeaders.size();
        headerWritten = false;
        lenientColumnCount = false;
    }

    /**
     * Create a new TsvToFromMap object given an array of columnHeader labels.
     * @param columnHeaderLabels the columns of the TSV file
     */
    public TsvToFromMap(final String... columnHeaderLabels) {
        this.columnHeaders = new LinkedList<String>();
        if (columnHeaderLabels != null) {
            this.columnHeaders.addAll(Arrays.asList(columnHeaderLabels));
        }
        this.numColumnHeaders = columnHeaders.size();
        headerWritten = false;
    }

    /**
     * Add a column. If that column already exists this will do nothing.
     * @param newColumnHeader the new column header to add.
     */
    public void addColumn(final String newColumnHeader) {
        if (!columnHeaders.contains(newColumnHeader)) {
            columnHeaders.add(newColumnHeader);
            numColumnHeaders++;
        }
    }

    /**
     * Obtain a copy of the column headers. You can safely modify the list that is returned
     * from this method as you are only receiving a copy of the list of column headers.
     * @return the list of column headers
     */
    public List<String> getColumnHeaders() {
        final List<String> copy = new ArrayList<String>(columnHeaders.size());
        copy.addAll(columnHeaders);
        return copy;
    }

    /**
     * Get if the header has already been written for this object.
     * @return true if the header has already been written for this object
     */
    public boolean isHeaderWritten() {
        return this.headerWritten;
    }

    /**
     * Set if the header has already been written for this object.
     * @param headerWritten if the header has already been written for this object
     */
    public void setHeaderWritten(final boolean headerWritten) {
        this.headerWritten = headerWritten;
    }

    /**
     * Write the header. Note: If the header has already been written once for this
     * object it will not be written again, if you need to write it more than once
     * (ie, you have varied the PrintWriter for out) you need to call setHeaderWritten(...).
     * @param out the stream to write the header to
     */
    public void writeHeader(final PrintWriter out) {
        if (headerWritten) {
            return;
        }
        headerWritten = true;
        int pos = 0;
        for (final String columnHeader : columnHeaders) {
            if (pos++ > 0) {
                out.print("\t");
            }
            out.print(columnHeader);
        }
        out.println();
    }

    /**
     * Given a line of text, convert it to a Map[columnName, value] of data.
     * @param line the line of data to convert to the the Map
     * @return the Map of data
     * @throws IOException error converting the data, the number of columns found
     * in the data is incorrect given the number of columns expected in the TSV file
     */
    public LinkedHashToMultiTypeMap<String> readDataToMap(final String line) throws IOException {
        if (line.startsWith("#")) {
            return null;
        }
        final String[] parts = StringUtils.splitPreserveAllTokens(line, '\t');
        if ((!lenientColumnCount) && (parts.length != numColumnHeaders)) {
            throw new IOException(
                    String.format("Line should have %d columns but has %d", numColumnHeaders, parts.length));
        }
        final LinkedHashToMultiTypeMap<String> result = new LinkedHashToMultiTypeMap<String>();
        int i = 0;
        final int numActualParts = parts.length;
        for (final String columnHeader : columnHeaders) {
            if (lenientColumnCount && (i >= numActualParts)) {
                result.put(columnHeader, "");
            } else {
                result.put(columnHeader, parts[i]);
            }
            i++;
        }
        return result;
    }

    /**
     * Write the Map[columnHeader, value] of data to the PrintWriter out.
     * If the Map value for the given column is missing (null) an empty
     * value (nothing) will be written for that column. Extra entries in the
     * map will just be ignored.
     * @param out where to write the data to
     * @param data the Map of data to write
     */
    public void writeDataFromMap(final PrintWriter out, final Map<String, String> data) {
        int pos = 0;
        for (final String columnHeader : columnHeaders) {
            if (pos++ > 0) {
                out.print('\t');
            }
            final String dataItem = data.get(columnHeader);
            if (dataItem != null) {
                out.print(dataItem);
            }
        }
        out.println();
    }

    /**
     * Construct a TsvToFromMap based on the columns of a given
     * TSV file. This assumes that the first non-commented line contains the
     * column names.
     * @param file the type of file to make the object CsvToFromMap for
     * @return the TsvToFromMap for the given type
     * @throws IOException error reading file that was supposed to be a TSV file
     */
    public static TsvToFromMap createFromTsvFile(final File file) throws IOException {
        TextFileLineIterator in = null;
        try {
            in = new TextFileLineIterator(file);
            for (final String line : in) {
                if (line.startsWith("#")) {
                    continue;
                }
                final String[] parts = StringUtils.split(line, '\t');
                return new TsvToFromMap(parts);
            }
        } finally {
            if (in != null) {
                in.close();
            }
        }
        // Empty file or all lines were comments (start with "#")
        return null;
    }

    /**
     * Retrieve the number of column headers.
     * @return the number of column headers.
     */
    public int getNumColumnHeaders() {
        return numColumnHeaders;
    }

    /**
     * When executing readDataToMap() if the input string has too many or too few
     * columns it will normally throw an exception. If lenientColumnCount is set
     * to true, it will not. You should be careful if this is on!
     * @return the value of exceptionOnTooManyFields
     */
    public boolean isLenientColumnCount() {
        return lenientColumnCount;
    }

    /**
     * When executing readDataToMap() if the input string has too many or too few
     * columns it will normally throw an exception. If lenientColumnCount is set
     * to true, it will not. You should be careful if this is on!
     * @param lenientColumnCount the new value of exceptionOnTooManyFields
     */
    public void setLenientColumnCount(final boolean lenientColumnCount) {
        this.lenientColumnCount = lenientColumnCount;
    }
}