joinery.impl.Serialization.java Source code

Introduction

Here is the source code for joinery.impl.Serialization.java
Source

/*
 * Joinery -- Data frames for Java
 * Copyright (c) 2014, 2015 IBM Corp.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package joinery.impl;

import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.math.BigInteger;
import java.net.URL;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import joinery.DataFrame;
import joinery.DataFrame.NumberDefault;

import org.apache.poi.hssf.usermodel.HSSFDataFormat;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellStyle;
import org.apache.poi.ss.usermodel.DateUtil;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.supercsv.cellprocessor.ConvertNullTo;
import org.supercsv.cellprocessor.FmtDate;
import org.supercsv.cellprocessor.ift.CellProcessor;
import org.supercsv.io.CsvListReader;
import org.supercsv.io.CsvListWriter;
import org.supercsv.prefs.CsvPreference;

public class Serialization {

    private static final String EMPTY_DF_STRING = "[empty data frame]";
    private static final String ELLIPSES = "...";
    private static final String NEWLINE = "\n";
    private static final String DELIMITER = "\t";
    private static final Object INDEX_KEY = new Object();
    private static final int MAX_COLUMN_WIDTH = 20;

    public static String toString(final DataFrame<?> df, final int limit) {
        final int len = df.length();

        if (len == 0) {
            return EMPTY_DF_STRING;
        }

        final StringBuilder sb = new StringBuilder();
        final Map<Object, Integer> width = new HashMap<>();
        final List<Class<?>> types = df.types();
        final List<Object> columns = new ArrayList<>(df.columns());

        // determine index width
        width.put(INDEX_KEY, 0);
        for (final Object row : df.index()) {
            Class<? extends Object> rowClass = row == null ? null : row.getClass();
            width.put(INDEX_KEY, clamp(width.get(INDEX_KEY), MAX_COLUMN_WIDTH, fmt(rowClass, row).length()));
        }

        // determine column widths
        for (int c = 0; c < columns.size(); c++) {
            final Object column = columns.get(c);
            width.put(column, String.valueOf(column).length());
            for (int r = 0; r < df.length(); r++) {
                width.put(column,
                        clamp(width.get(column), MAX_COLUMN_WIDTH, fmt(types.get(c), df.get(r, c)).length()));
            }
        }

        // output column names
        sb.append(lpad("", width.get(INDEX_KEY)));
        for (int c = 0; c < columns.size(); c++) {
            sb.append(DELIMITER);
            final Object column = columns.get(c);
            sb.append(lpad(column, width.get(column)));
        }
        sb.append(NEWLINE);

        // output rows
        final Iterator<Object> names = df.index().iterator();
        for (int r = 0; r < len; r++) {
            // output row name
            int w = width.get(INDEX_KEY);
            final Object row = names.hasNext() ? names.next() : r;
            Class<? extends Object> rowClass = row == null ? null : row.getClass();
            sb.append(truncate(lpad(fmt(rowClass, row), w), w));

            // output rows
            for (int c = 0; c < df.size(); c++) {
                sb.append(DELIMITER);
                final Class<?> cls = types.get(c);
                w = width.get(columns.get(c));
                if (Number.class.isAssignableFrom(cls)) {
                    sb.append(lpad(fmt(cls, df.get(r, c)), w));
                } else {
                    sb.append(truncate(rpad(fmt(cls, df.get(r, c)), w), w));
                }
            }
            sb.append(NEWLINE);

            // skip rows if necessary to limit output
            if (limit - 3 < r && r < (limit << 1) && r < len - 4) {
                sb.append(NEWLINE).append(ELLIPSES).append(" ").append(len - limit).append(" rows skipped ")
                        .append(ELLIPSES).append(NEWLINE).append(NEWLINE);
                while (r < len - 2) {
                    if (names.hasNext()) {
                        names.next();
                    }
                    r++;
                }
            }
        }

        return sb.toString();
    }

    private static final int clamp(final int lower, final int upper, final int value) {
        return Math.max(lower, Math.min(upper, value));
    }

    private static final String lpad(final Object o, final int w) {
        final StringBuilder sb = new StringBuilder();
        final String value = String.valueOf(o);
        for (int i = value.length(); i < w; i++) {
            sb.append(' ');
        }
        sb.append(value);
        return sb.toString();
    }

    private static final String rpad(final Object o, final int w) {
        final StringBuilder sb = new StringBuilder();
        final String value = String.valueOf(o);
        sb.append(value);
        for (int i = value.length(); i < w; i++) {
            sb.append(' ');
        }
        return sb.toString();
    }

    private static final String truncate(final Object o, final int w) {
        final String value = String.valueOf(o);
        return value.length() - ELLIPSES.length() > w ? value.substring(0, w - ELLIPSES.length()) + ELLIPSES
                : value;
    }

    private static final String fmt(final Class<?> cls, final Object o) {
        if (cls == null)
            return "null";
        String s;
        if (o instanceof Number) {
            if (Short.class.equals(cls) || Integer.class.equals(cls) || Long.class.equals(cls)
                    || BigInteger.class.equals(cls)) {
                s = String.format("% d", Number.class.cast(o).longValue());
            } else {
                s = String.format("% .8f", Number.class.cast(o).doubleValue());
            }
        } else if (o instanceof Date) {
            final Date dt = Date.class.cast(o);
            final Calendar cal = Calendar.getInstance();
            cal.setTime(dt);
            final DateFormat fmt = new SimpleDateFormat(
                    cal.get(Calendar.HOUR_OF_DAY) == 0 && cal.get(Calendar.MINUTE) == 0
                            && cal.get(Calendar.SECOND) == 0 ? "yyyy-MM-dd" : "yyyy-MM-dd'T'HH:mm:ssXXX");
            s = fmt.format(dt);
        } else {
            s = o != null ? String.valueOf(o) : "";
        }
        return s;
    }

    public static DataFrame<Object> readCsv(final String file) throws IOException {
        return readCsv(file.contains("://") ? new URL(file).openStream() : new FileInputStream(file), ",",
                NumberDefault.LONG_DEFAULT, null);
    }

    public static DataFrame<Object> readCsv(final String file, final String separator, NumberDefault numDefault)
            throws IOException {
        return readCsv(file.contains("://") ? new URL(file).openStream() : new FileInputStream(file), separator,
                numDefault, null);
    }

    public static DataFrame<Object> readCsv(final String file, final String separator, NumberDefault numDefault,
            final String naString) throws IOException {
        return readCsv(file.contains("://") ? new URL(file).openStream() : new FileInputStream(file), separator,
                numDefault, naString);
    }

    public static DataFrame<Object> readCsv(final String file, final String separator, NumberDefault numDefault,
            final String naString, boolean hasHeader) throws IOException {
        return readCsv(file.contains("://") ? new URL(file).openStream() : new FileInputStream(file), separator,
                numDefault, naString, hasHeader);
    }

    public static DataFrame<Object> readCsv(final InputStream input) throws IOException {
        return readCsv(input, ",", NumberDefault.LONG_DEFAULT, null);
    }

    public static DataFrame<Object> readCsv(final InputStream input, String separator, NumberDefault numDefault,
            String naString) throws IOException {
        return readCsv(input, separator, numDefault, naString, true);
    }

    public static DataFrame<Object> readCsv(final InputStream input, String separator, NumberDefault numDefault,
            String naString, boolean hasHeader) throws IOException {
        CsvPreference csvPreference;
        switch (separator) {
        case "\\t":
            csvPreference = CsvPreference.TAB_PREFERENCE;
            break;
        case ",":
            csvPreference = CsvPreference.STANDARD_PREFERENCE;
            break;
        case ";":
            csvPreference = CsvPreference.EXCEL_NORTH_EUROPE_PREFERENCE;
            break;
        case "|":
            csvPreference = new CsvPreference.Builder('"', '|', "\n").build();
            break;
        default:
            throw new IllegalArgumentException("Separator: " + separator + " is not currently supported");
        }
        try (CsvListReader reader = new CsvListReader(new InputStreamReader(input), csvPreference)) {
            final List<String> header;
            final DataFrame<Object> df;
            final CellProcessor[] procs;
            if (hasHeader) {
                header = Arrays.asList(reader.getHeader(true));
                procs = new CellProcessor[header.size()];
                df = new DataFrame<>(header);
            } else {
                // Read the first row to figure out how many columns we have              
                reader.read();
                header = new ArrayList<String>();
                for (int i = 0; i < reader.length(); i++) {
                    header.add("V" + i);
                }
                procs = new CellProcessor[header.size()];
                df = new DataFrame<>(header);
                // The following line executes the procs on the previously read row again
                df.append(reader.executeProcessors(procs));
            }
            for (List<Object> row = reader.read(procs); row != null; row = reader.read(procs)) {
                df.append(new ArrayList<>(row));
            }
            return df.convert(numDefault, naString);
        }
    }

    public static <V> void writeCsv(final DataFrame<V> df, final String output) throws IOException {
        writeCsv(df, new FileOutputStream(output));
    }

    public static <V> void writeCsv(final DataFrame<V> df, final OutputStream output) throws IOException {
        try (CsvListWriter writer = new CsvListWriter(new OutputStreamWriter(output),
                CsvPreference.STANDARD_PREFERENCE)) {
            final String[] header = new String[df.size()];
            final Iterator<Object> it = df.columns().iterator();
            for (int c = 0; c < df.size(); c++) {
                header[c] = String.valueOf(it.hasNext() ? it.next() : c);
            }
            writer.writeHeader(header);
            final CellProcessor[] procs = new CellProcessor[df.size()];
            final List<Class<?>> types = df.types();
            for (int c = 0; c < df.size(); c++) {
                final Class<?> cls = types.get(c);
                if (Date.class.isAssignableFrom(cls)) {
                    procs[c] = new ConvertNullTo("", new FmtDate("yyyy-MM-dd'T'HH:mm:ssXXX"));
                } else {
                    procs[c] = new ConvertNullTo("");
                }
            }
            for (final List<V> row : df) {
                writer.write(row, procs);
            }
        }
    }

    public static DataFrame<Object> readXls(final String file) throws IOException {
        return readXls(file.contains("://") ? new URL(file).openStream() : new FileInputStream(file));
    }

    public static DataFrame<Object> readXls(final InputStream input) throws IOException {
        final Workbook wb = new HSSFWorkbook(input);
        final Sheet sheet = wb.getSheetAt(0);
        final List<Object> columns = new ArrayList<>();
        final List<List<Object>> data = new ArrayList<>();

        for (final Row row : sheet) {
            if (row.getRowNum() == 0) {
                // read header
                for (final Cell cell : row) {
                    columns.add(readCell(cell));
                }
            } else {
                // read data values
                final List<Object> values = new ArrayList<>();
                for (final Cell cell : row) {
                    values.add(readCell(cell));
                }
                data.add(values);
            }
        }

        // create data frame
        final DataFrame<Object> df = new DataFrame<>(columns);
        for (final List<Object> row : data) {
            df.append(row);
        }

        return df.convert();
    }

    public static <V> void writeXls(final DataFrame<V> df, final String output) throws IOException {
        writeXls(df, new FileOutputStream(output));
    }

    public static <V> void writeXls(final DataFrame<V> df, final OutputStream output) throws IOException {
        final Workbook wb = new HSSFWorkbook();
        final Sheet sheet = wb.createSheet();

        // add header
        Row row = sheet.createRow(0);
        final Iterator<Object> it = df.columns().iterator();
        for (int c = 0; c < df.size(); c++) {
            final Cell cell = row.createCell(c);
            writeCell(cell, it.hasNext() ? it.next() : c);
        }

        // add data values
        for (int r = 0; r < df.length(); r++) {
            row = sheet.createRow(r + 1);
            for (int c = 0; c < df.size(); c++) {
                final Cell cell = row.createCell(c);
                writeCell(cell, df.get(r, c));
            }
        }

        //  write to stream
        wb.write(output);
        output.close();
    }

    private static final Object readCell(final Cell cell) {
        switch (cell.getCellType()) {
        case Cell.CELL_TYPE_NUMERIC:
            if (DateUtil.isCellDateFormatted(cell)) {
                return DateUtil.getJavaDate(cell.getNumericCellValue());
            }
            return cell.getNumericCellValue();
        case Cell.CELL_TYPE_BOOLEAN:
            return cell.getBooleanCellValue();
        default:
            return cell.getStringCellValue();
        }
    }

    private static final void writeCell(final Cell cell, final Object value) {
        if (value instanceof Number) {
            cell.setCellType(Cell.CELL_TYPE_NUMERIC);
            cell.setCellValue(Number.class.cast(value).doubleValue());
        } else if (value instanceof Date) {
            final CellStyle style = cell.getSheet().getWorkbook().createCellStyle();
            style.setDataFormat(HSSFDataFormat.getBuiltinFormat("m/d/yy h:mm"));
            cell.setCellStyle(style);
            cell.setCellType(Cell.CELL_TYPE_NUMERIC);
            cell.setCellValue(Date.class.cast(value));
        } else if (value instanceof Boolean) {
            cell.setCellType(Cell.CELL_TYPE_BOOLEAN);
        } else {
            cell.setCellType(Cell.CELL_TYPE_STRING);
            cell.setCellValue(value != null ? String.valueOf(value) : "");
        }
    }
}