uk.ac.ebi.arrayexpress.utils.saxon.FlatFileXMLReader.java Source code

Java tutorial

Introduction

Here is the source code for uk.ac.ebi.arrayexpress.utils.saxon.FlatFileXMLReader.java

Source

package uk.ac.ebi.arrayexpress.utils.saxon;

/*
 * Copyright 2009-2014 European Molecular Biology Laboratory
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

import au.com.bytecode.opencsv.CSVReader;
import joptsimple.OptionParser;
import joptsimple.OptionSet;
import org.apache.commons.lang.StringUtils;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;

/**
 * Reads a delimited flat file (tab-separated by default) and reports it to the
 * registered {@link ContentHandler} as a small XML document:
 *
 * <pre>
 * &lt;table rows="N"&gt;
 *   &lt;header cols="C"&gt;&lt;col&gt;...&lt;/col&gt;...&lt;/header&gt;     (one per header row)
 *   &lt;row seq="S" cols="C"&gt;&lt;col&gt;...&lt;/col&gt;...&lt;/row&gt;  (one per data row on the requested page)
 * &lt;/table&gt;
 * </pre>
 *
 * {@code rows} is the total number of data rows (all pages), {@code seq} is the
 * 1-based position of the row after optional sorting.
 *
 * <p>Rows whose column count differs from that of the first row, and rows that are
 * entirely blank, are dropped before anything is emitted.</p>
 */
public class FlatFileXMLReader extends AbstractCustomXMLReader {
    private static final Attributes EMPTY_ATTR = new AttributesImpl();

    private static final String EMPTY_NAMESPACE = "";
    private static final String CDATA_TYPE = "CDATA";

    private static final char DEFAULT_COL_DELIMITER = 0x9;
    private static final char DEFAULT_COL_QUOTE_CHAR = '"';

    private static final String OPTION_HEADER_ROWS = "header";
    private static final String OPTION_PAGE = "page";
    private static final String OPTION_PAGE_SIZE = "pagesize";
    private static final String OPTION_SORT_BY = "sortby";
    private static final String OPTION_SORT_ORDER = "sortorder";

    // Longest trimmed integer literal (sign included) that is guaranteed to fit in a long;
    // anything longer is compared as a decimal to avoid NumberFormatException on overflow.
    private static final int MAX_SAFE_LONG_LENGTH = 18;

    /** Data type used to compare values of the sort column, from most to least specific. */
    private enum ColDataType {
        STRING, INTEGER, DECIMAL
    }

    private char columnDelimiter;
    private char columnQuoteChar;

    private OptionSet options = null;

    /** Creates a reader using a tab delimiter and a double-quote quote character, with no options. */
    public FlatFileXMLReader() {
        this.columnDelimiter = DEFAULT_COL_DELIMITER;
        this.columnQuoteChar = DEFAULT_COL_QUOTE_CHAR;
    }

    /**
     * Creates a reader with default delimiter/quote characters and the given options.
     *
     * @param options option string, split on spaces and semicolons and handed to the option
     *                parser; recognised options are {@code header}, {@code page},
     *                {@code pagesize}, {@code sortby} (integer arguments) and
     *                {@code sortorder} (string argument); may be {@code null}
     */
    public FlatFileXMLReader(String options) {
        this();

        OptionParser parser = new OptionParser();
        parser.accepts(OPTION_HEADER_ROWS).withRequiredArg().ofType(Integer.class);
        parser.accepts(OPTION_PAGE).withRequiredArg().ofType(Integer.class);
        parser.accepts(OPTION_PAGE_SIZE).withRequiredArg().ofType(Integer.class);
        parser.accepts(OPTION_SORT_BY).withRequiredArg().ofType(Integer.class);
        parser.accepts(OPTION_SORT_ORDER).withRequiredArg().ofType(String.class);

        this.options = parser.parse(null != options ? options.split("[ ;]") : new String[] { "" });
    }

    /**
     * Creates a reader with explicit delimiter and quote characters and no options.
     *
     * @param columnDelimiter character separating columns
     * @param columnQuoteChar character used to quote column values
     */
    public FlatFileXMLReader(final char columnDelimiter, final char columnQuoteChar) {
        this.columnDelimiter = columnDelimiter;
        this.columnQuoteChar = columnQuoteChar;
    }

    /**
     * Reads the whole input, then emits it as SAX events to the current content handler.
     * Does nothing (and does not touch the input) when no content handler is set.
     *
     * <p>Option handling:</p>
     * <ul>
     *   <li>{@code header} - number of leading rows emitted as {@code header} elements
     *       (default 0; negative values are treated as 0);</li>
     *   <li>{@code sortby} - 1-based column to sort data rows by; ignored when out of range;</li>
     *   <li>{@code sortorder} - {@code "a"} (case-insensitive, default) sorts ascending,
     *       anything else descending;</li>
     *   <li>{@code pagesize} / {@code page} - only data rows with
     *       {@code pagesize * (page - 1) < seq <= pagesize * page} are emitted; a page size
     *       of -1 (default) emits all rows. Note that {@code page} defaults to 0, so
     *       supplying {@code pagesize} without {@code page} emits no data rows.</li>
     * </ul>
     *
     * <p>The underlying stream is closed once it has been read, in line with the SAX
     * convention for streams passed to a parser.</p>
     *
     * @param input source providing a character stream, a byte stream or a system id (URL)
     * @throws IOException  if the input cannot be opened or read
     * @throws SAXException if the input source provides none of the above, or the handler fails
     */
    public void parse(InputSource input) throws IOException, SAXException {
        // a negative header count would disable sort-type detection and inflate the row count
        int headerRows = Math.max(0, getIntOptionValue(OPTION_HEADER_ROWS, 0));
        int page = getIntOptionValue(OPTION_PAGE, 0);
        int pageSize = getIntOptionValue(OPTION_PAGE_SIZE, -1);
        Integer sortBy = getIntOptionValue(OPTION_SORT_BY, null);
        String sortOrder = getStringOptionValue(OPTION_SORT_ORDER, "a");

        ContentHandler ch = getContentHandler();
        if (null == ch) {
            return;
        }

        List<String[]> ff = readAllRows(input);
        int cols = ff.size() > 0 ? ff.get(0).length : 0;

        // verify that the sort column is within the range of columns;
        // if not, no sort is performed, else switch from 1-based to 0-based index
        if (null != sortBy) {
            if (sortBy < 1 || sortBy > cols) {
                sortBy = null;
            } else {
                sortBy = sortBy - 1;
            }
        }

        ColDataType sortColDataType = removeInvalidRowsAndDetectType(ff, cols, headerRows, sortBy);

        // header may be configured larger than the file; never report a negative count
        int rows = Math.max(0, ff.size() - headerRows);

        if (-1 == pageSize) {
            page = 1;
            pageSize = rows;
        }

        ch.startDocument();

        AttributesImpl tableAttrs = new AttributesImpl();
        tableAttrs.addAttribute(EMPTY_NAMESPACE, "rows", "rows", CDATA_TYPE, String.valueOf(rows));
        ch.startElement(EMPTY_NAMESPACE, "table", "table", tableAttrs);

        // emit header rows and take them out of the list so they are neither sorted nor paged
        for (Iterator<String[]> iterator = ff.iterator(); iterator.hasNext() && headerRows > 0; headerRows--) {
            String[] row = iterator.next();
            outputRow(ch, true, null, row);
            iterator.remove();
        }

        if (null != sortBy) {
            Collections.sort(ff, new SortColumnComparator(sortBy, sortOrder, sortColDataType));
        }

        // page window computed in long so large page/pagesize values cannot overflow
        long afterSeq = (long) pageSize * (page - 1);
        long upToSeq = (long) pageSize * page;
        int rowSeq = 1;
        for (String[] row : ff) {
            if (rowSeq > afterSeq && rowSeq <= upToSeq) {
                outputRow(ch, false, String.valueOf(rowSeq), row);
            }
            ++rowSeq;
        }

        ch.endElement(EMPTY_NAMESPACE, "table", "table");
        ch.endDocument();
    }

    /**
     * Reads every row of the input with the configured delimiter and quote character,
     * closing the underlying stream afterwards (also when reading fails).
     */
    private List<String[]> readAllRows(InputSource input) throws IOException, SAXException {
        try (Reader buffered = new BufferedReader(openReader(input))) {
            CSVReader ffReader = new CSVReader(buffered, this.columnDelimiter, this.columnQuoteChar);
            return ffReader.readAll();
        }
    }

    /**
     * Picks the character stream, byte stream or system id of the source, in that order.
     * Byte input is decoded with the encoding declared on the source, if any; otherwise the
     * platform default charset is used.
     */
    private static Reader openReader(InputSource input) throws IOException, SAXException {
        if (null != input.getCharacterStream()) {
            return input.getCharacterStream();
        }

        String encoding = input.getEncoding();
        if (null != input.getByteStream()) {
            return null != encoding
                    ? new InputStreamReader(input.getByteStream(), encoding)
                    : new InputStreamReader(input.getByteStream());
        }
        if (null != input.getSystemId()) {
            URL url = new URL(input.getSystemId());
            return null != encoding
                    ? new InputStreamReader(url.openStream(), encoding)
                    : new InputStreamReader(url.openStream());
        }
        throw new SAXException("Invalid InputSource object");
    }

    /**
     * Removes rows whose column count differs from {@code cols} and rows that are blank, and
     * determines the most specific data type that fits every data (non-header) value of the
     * sort column.
     *
     * @return the detected type; {@link ColDataType#INTEGER} when there is no sort column or
     *         no data row was inspected
     */
    private ColDataType removeInvalidRowsAndDetectType(List<String[]> ff, int cols, int headerRows,
            Integer sortBy) {
        ColDataType sortColDataType = ColDataType.INTEGER;
        int colTypeSkipRows = headerRows;
        for (Iterator<String[]> iterator = ff.iterator(); iterator.hasNext();) {
            String[] row = iterator.next();
            if (row.length != cols || isRowBlank(row)) {
                iterator.remove();
                continue;
            }

            // once the column is known to be a string column there is nothing left to learn
            if (null != sortBy && 0 == colTypeSkipRows && ColDataType.STRING != sortColDataType) {
                ColDataType dataType = getColDataType(row[sortBy]);

                // downgrade from int to decimal or string
                if (ColDataType.INTEGER == sortColDataType && ColDataType.INTEGER != dataType) {
                    sortColDataType = dataType;
                }
                // downgrade from decimal to string only
                if (ColDataType.DECIMAL == sortColDataType && ColDataType.STRING == dataType) {
                    sortColDataType = dataType;
                }
            }
            // only valid rows count towards the header rows to skip
            if (colTypeSkipRows > 0) {
                colTypeSkipRows--;
            }
        }
        return sortColDataType;
    }

    /**
     * Emits one {@code header} or {@code row} element with a {@code col} child per value.
     *
     * @param ch       handler receiving the events
     * @param isHeader {@code true} to emit a {@code header} element (values are trimmed),
     *                 {@code false} to emit a {@code row} element (values are emitted as is)
     * @param seqValue value of the {@code seq} attribute, or {@code null} to omit it
     * @param rowData  column values of the row
     */
    private void outputRow(ContentHandler ch, boolean isHeader, String seqValue, String[] rowData)
            throws SAXException {
        String rowElement = isHeader ? "header" : "row";

        AttributesImpl rowAttrs = new AttributesImpl();
        if (null != seqValue) {
            rowAttrs.addAttribute(EMPTY_NAMESPACE, "seq", "seq", CDATA_TYPE, seqValue);
        }
        rowAttrs.addAttribute(EMPTY_NAMESPACE, "cols", "cols", CDATA_TYPE, String.valueOf(rowData.length));
        ch.startElement(EMPTY_NAMESPACE, rowElement, rowElement, rowAttrs);

        for (String col : rowData) {
            if (isHeader) {
                col = StringUtils.trimToEmpty(col);
            }
            ch.startElement(EMPTY_NAMESPACE, "col", "col", EMPTY_ATTR);
            ch.characters(col.toCharArray(), 0, col.length());
            ch.endElement(EMPTY_NAMESPACE, "col", "col");
        }
        ch.endElement(EMPTY_NAMESPACE, rowElement, rowElement);
    }

    /** Returns the integer value of the option, or {@code defaultValue} when it was not supplied. */
    private Integer getIntOptionValue(String option, Integer defaultValue) {
        if (null != options && options.has(option)) {
            return (Integer) this.options.valueOf(option);
        } else {
            return defaultValue;
        }
    }

    /** Returns the string value of the option, or {@code defaultValue} when it was not supplied. */
    private String getStringOptionValue(String option, String defaultValue) {
        if (null != options && options.has(option)) {
            return (String) this.options.valueOf(option);
        } else {
            return defaultValue;
        }
    }

    /** Returns {@code true} when the row is {@code null} or every column in it is blank. */
    private boolean isRowBlank(String[] row) {
        if (null != row) {
            for (String col : row) {
                if (StringUtils.isNotBlank(col)) {
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Classifies a single value of the sort column. Surrounding whitespace and an optional
     * leading sign are allowed for numbers. Integers too long to be safely parsed as a
     * {@code long} are classified as decimals so that sorting cannot fail on overflow.
     */
    private ColDataType getColDataType(String string) {
        if (null != string) {
            if (string.matches("^\\s*[-+]?\\d+\\s*$")) {
                return string.trim().length() <= MAX_SAFE_LONG_LENGTH
                        ? ColDataType.INTEGER
                        : ColDataType.DECIMAL;
            } else if (string.matches("^\\s*[-+]?\\d*[.]\\d+\\s*$")) {
                return ColDataType.DECIMAL;
            }
        }
        return ColDataType.STRING;
    }

    /**
     * Orders rows by one column, comparing its values as integers, decimals or
     * case-insensitive strings. Sort order {@code "a"} (case-insensitive) is ascending,
     * any other value descending.
     */
    private static class SortColumnComparator implements Comparator<String[]> {
        private final int sortBy;
        private final String sortOrder;
        private final ColDataType sortColDataType;

        public SortColumnComparator(int sortBy, String sortOrder, ColDataType sortColDataType) {
            this.sortBy = sortBy;
            this.sortOrder = sortOrder;
            this.sortColDataType = sortColDataType;
        }

        @Override
        public int compare(String[] o1, String[] o2) {
            int result;
            switch (sortColDataType) {
            case INTEGER:
                // type detection tolerates surrounding whitespace, Long.parseLong does not
                long int1 = Long.parseLong(o1[sortBy].trim());
                long int2 = Long.parseLong(o2[sortBy].trim());

                result = Long.compare(int1, int2);
                break;
            case DECIMAL:
                double dec1 = Double.parseDouble(o1[sortBy].trim());
                double dec2 = Double.parseDouble(o2[sortBy].trim());

                result = Double.compare(dec1, dec2);
                break;
            case STRING:
                result = o1[sortBy].compareToIgnoreCase(o2[sortBy]);
                break;
            default:
                throw new IllegalArgumentException("Sort column data type is not defined");
            }
            return ("a".equalsIgnoreCase(sortOrder)) ? result : -result;
        }
    }
}