org.openiot.gsn.wrappers.general.CSVHandler.java Source code

Java tutorial

Introduction

Here is the source code for org.openiot.gsn.wrappers.general.CSVHandler.java

Source

/**
 *    Copyright (c) 2011-2014, OpenIoT
 *
 *    This file is part of OpenIoT.
 *
 *    OpenIoT is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU Lesser General Public License as published by
 *    the Free Software Foundation, version 3 of the License.
 *
 *    OpenIoT is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU Lesser General Public License for more details.
 *
 *    You should have received a copy of the GNU Lesser General Public License
 *    along with OpenIoT.  If not, see <http://www.gnu.org/licenses/>.
 *
 *     Contact: OpenIoT mailto: info@openiot.eu
 * @author Ali Salehi
 * @author Mehdi Riahi
 * @author Sofiane Sarni
 * @author Hylke van der Schaaf
 */

package org.openiot.gsn.wrappers.general;

import org.openiot.gsn.beans.DataField;
import org.openiot.gsn.utils.CaseInsensitiveComparator;

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.Serializable;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;

import au.com.bytecode.opencsv.CSVReader;

/**
 * The possible formats for the timestamp fields are documented at http://joda-time.sourceforge.net/api-release/org/joda/time/format/DateTimeFormat.html
 * The possible time zone IDs are listed at http://joda-time.sourceforge.net/timezones.html
 */
public class CSVHandler {

    /** ID of the JVM's default time zone; the seven-argument {@code initialize} overload passes it on as the time zone. */
    public static final String LOCAL_TIMEZONE_ID = DateTimeZone.getDefault().getID();

    /** log4j logger shared by all instances of this class. */
    private static Logger logger = Logger.getLogger(CSVHandler.class);

    /** Field name under which {@code parseValues} looks up a row's timestamp when comparing against the checkpoint. */
    private static final String TIMESTAMP = "timed";

    /**
     * Parses a textual timestamp with a Joda-Time pattern.
     *
     * @param format Joda-Time pattern describing the layout of {@code value}
     * @param value  the text to parse
     * @return the parsed date-time
     * @throws IllegalArgumentException if the pattern is invalid or the text does not match it
     */
    public static DateTime parseTimeStamp(String format, String value) throws IllegalArgumentException {
        return DateTimeFormat.forPattern(format).parseDateTime(value);
    }

    // Quote character and column separator handed to the CSVReader in parseValues.
    private char stringSeparator, separator;
    // Path of the CSV data file as passed to initialize.
    private String dataFile;
    // Zone applied when timestamp columns are parsed in convertTo.
    private DateTimeZone timeZone;
    // Number of leading lines the CSVReader is told to skip in parseValues.
    private int skipFirstXLines;
    // Per-column field names and formats, plus the tokens that are treated as null values.
    private String[] fields, formats, nulls;

    // Path of the file that stores the most recent checkpoint value.
    private String checkPointFile;

    /**
     * Convenience overload of the nine-argument {@code initialize}: uses the
     * JVM's default time zone and derives the checkpoint file name from the
     * name of the data file.
     *
     * @param dataFile        path of the CSV data file
     * @param inFields        comma-separated field names
     * @param inFormats       comma-separated field formats
     * @param separator       column separator
     * @param stringSeparator quote character
     * @param skipFirstXLines number of leading lines to skip
     * @param nullValues      comma-separated tokens that are treated as null
     * @return the result of the nine-argument overload
     */
    public boolean initialize(String dataFile, String inFields, String inFormats, char separator,
            char stringSeparator, int skipFirstXLines, String nullValues) {
        String dataFileName = new File(dataFile).getName();
        String defaultCheckPointFile = "check-poin/" + (dataFileName + ".chk-point");
        return initialize(dataFile, inFields, inFormats, separator, stringSeparator, skipFirstXLines, nullValues,
                LOCAL_TIMEZONE_ID, defaultCheckPointFile);
    }

    /**
     * Configures this handler and validates the configuration.
     *
     * <p>Returns {@code false} (after logging the reason) when the time zone
     * ID is not recognised, the data file is not an existing regular file,
     * the checkpoint file cannot be set up, the configuration strings cannot
     * be read, a format is not supported, or the number of fields differs
     * from the number of formats.
     *
     * @param dataFile        path of the CSV data file
     * @param inFields        comma-separated field names (stored lower-cased)
     * @param inFormats       comma-separated field formats (case preserved)
     * @param separator       column separator
     * @param stringSeparator quote character
     * @param skipFirstXLines number of leading lines to skip
     * @param nullValues      comma-separated tokens that are treated as null (stored lower-cased)
     * @param timeZone        Joda-Time time zone ID used when parsing timestamps
     * @param checkpointFile  path of the checkpoint file
     * @return {@code true} if the configuration is usable, {@code false} otherwise
     */
    public boolean initialize(String dataFile, String inFields, String inFormats, char separator,
            char stringSeparator, int skipFirstXLines, String nullValues, String timeZone, String checkpointFile) {

        this.stringSeparator = stringSeparator;
        this.skipFirstXLines = skipFirstXLines;
        this.dataFile = dataFile;
        this.separator = separator;
        this.checkPointFile = checkpointFile;
        try {
            this.timeZone = DateTimeZone.forID(timeZone);
        } catch (IllegalArgumentException e) {
            // An unknown zone ID is a configuration error like any other: report it through the return value.
            logger.error("The specified time zone: " + timeZone + " is not recognised.", e);
            return false;
        }

        File file = new File(dataFile);
        if (!file.isFile()) {
            logger.error("The specified CSV data file: " + dataFile + " doesn't exists.");
            return false;
        }

        try {
            setupCheckPointFileIfNeeded();
            this.fields = generateFieldIdx(inFields, true);
            this.formats = generateFieldIdx(inFormats, false);
            this.nulls = generateFieldIdx(nullValues, true);
        } catch (IOException e) {
            logger.error(e.getMessage(), e);
            return false;
        }
        if (!validateFormats(this.formats)) {
            return false;
        }
        // Every field needs exactly one format.
        if (fields.length != formats.length) {
            logger.error("loading the csv-wrapper failed as the length of fields(" + fields.length
                    + ") doesn't match the length of formats(" + formats.length + ")");
            return false;
        }
        return true;

    }

    /**
     * Makes sure the checkpoint file exists, creating its parent directories
     * and an empty file when they are missing.
     *
     * @throws IOException if the checkpoint file cannot be created
     */
    public void setupCheckPointFileIfNeeded() throws IOException {
        File checkPoint = new File(getCheckPointFile());
        // getParent() is null for a bare file name; resolving to an absolute
        // path first yields the working directory instead of a NullPointerException.
        File parentDir = checkPoint.getAbsoluteFile().getParentFile();
        if (parentDir != null) {
            parentDir.mkdirs();
        }
        checkPoint.createNewFile();
    }

    /**
     * Checks that every format is either {@code numeric}, {@code string}
     * (both case-insensitive) or a timestamp format whose pattern Joda-Time
     * accepts. The first offending format is logged.
     *
     * @param formats the formats to validate
     * @return {@code true} if all formats are supported, {@code false} otherwise
     */
    public static boolean validateFormats(String[] formats) {
        for (String format : formats) {
            boolean plainType = format.equalsIgnoreCase("numeric") || format.equalsIgnoreCase("string");
            if (plainType) {
                continue;
            }
            if (!isTimeStampFormat(format)) {
                logger.error("The format (" + format + ") used by the CSV-Wrapper doesn't exist.");
                return false;
            }
            try {
                // Formatting the current time exercises the pattern; the result itself is not needed.
                DateTimeFormat.forPattern(getTimeStampFormat(format)).print(System.currentTimeMillis());
            } catch (IllegalArgumentException e) {
                logger.error("Validating the time-format(" + format + ") used by the CSV-wrapper is failed. ");
                return false;
            }
        }
        return true;

    }

    /**
     * Splits a comma-separated configuration string into its entries and
     * trims surrounding whitespace from each entry. Only the first line of
     * {@code rawFields} is read.
     *
     * @param rawFields   the comma-separated text; {@code null} is treated like an empty string
     * @param toLowerCase if false, the case is preserved; if true, the entries are returned in
     *                    lower case (converted independently of the default locale)
     * @return the entries, or an empty array when there is nothing to read
     * @throws IOException if the text cannot be read
     */
    public static String[] generateFieldIdx(String rawFields, boolean toLowerCase) throws IOException {
        if (rawFields == null) {
            return new String[0];
        }
        CSVReader reader = new CSVReader(new StringReader(rawFields));
        String[] toReturn;
        try {
            toReturn = reader.readNext();
        } finally {
            reader.close();
        }
        if (toReturn == null) {
            return new String[0];
        }
        for (int i = 0; i < toReturn.length; i++) {
            String entry = toReturn[i].trim();
            // Locale.ROOT keeps names stable under locales with special casing rules (e.g. Turkish "I").
            toReturn[i] = toLowerCase ? entry.toLowerCase(java.util.Locale.ROOT) : entry;
        }
        return toReturn;
    }

    /**
     * Reads the stored checkpoint and parses the rows of {@code dataFile}
     * that lie beyond it.
     *
     * @param dataFile               reader over the CSV content
     * @param checkpointDir          not used by this method
     * @param samplingCountPerPeriod number of rows after which parsing stops
     * @return the parsed rows, as produced by {@code parseValues}
     * @throws IOException if the checkpoint file or the CSV content cannot be read
     */
    public ArrayList<TreeMap<String, Serializable>> work(Reader dataFile, String checkpointDir,
            int samplingCountPerPeriod) throws IOException {
        setupCheckPointFileIfNeeded();
        String stored = FileUtils.readFileToString(new File(checkPointFile), "UTF-8");
        String trimmed = (stored == null) ? "" : stored.trim();
        // An empty checkpoint file means nothing has been processed yet.
        long lastItem = (trimmed.length() > 0) ? Long.parseLong(trimmed) : 0;
        return parseValues(dataFile, lastItem, samplingCountPerPeriod);
    }

    /**
     * Overwrites the checkpoint file with the decimal representation of
     * {@code timestamp}, encoded as UTF-8.
     *
     * @param timestamp the checkpoint value to store
     * @throws IOException if the file cannot be written
     */
    public void updateCheckPointFile(long timestamp) throws IOException {
        File target = new File(checkPointFile);
        String content = String.valueOf(timestamp);
        FileUtils.writeStringToFile(target, content, "UTF-8");
    }

    private boolean loggedNoChange = false; // to avoid duplicate logging messages when there is no change

    /**
     * Parses rows from {@code datainput} that lie beyond the checkpoint.
     *
     * <p>Rows whose values are all null are ignored. If a row contains the
     * {@code timed} field, rows with a timestamp not greater than
     * {@code previousCheckPoint} are skipped; otherwise the checkpoint is
     * interpreted as a count of already processed rows. Parsing stops once
     * {@code samplingCountPerPeriod} rows have been collected; in timestamp
     * mode the following rows are still included while they carry the same
     * timestamp as the last collected row, so that no data is lost.
     *
     * <p>The CSV reader wrapping {@code datainput} is closed before this
     * method returns, also when a row cannot be converted.
     *
     * @param datainput              reader over the CSV content
     * @param previousCheckPoint     last processed timestamp, or number of processed rows
     * @param samplingCountPerPeriod number of rows after which parsing stops
     * @return the parsed rows, possibly empty
     * @throws IOException              if reading or closing the input fails
     * @throws IllegalArgumentException if {@code convertTo} cannot parse a value
     */
    public ArrayList<TreeMap<String, Serializable>> parseValues(Reader datainput, long previousCheckPoint,
            int samplingCountPerPeriod) throws IOException {
        ArrayList<TreeMap<String, Serializable>> toReturn = new ArrayList<TreeMap<String, Serializable>>();
        CSVReader reader = new CSVReader(datainput, getSeparator(), getStringSeparator(), getSkipFirstXLines());
        try {
            String[] values;
            long currentLine = 0;
            Serializable currentTimeStamp = null;
            boolean quit = false;
            while ((values = reader.readNext()) != null) {
                TreeMap<String, Serializable> se = convertTo(formats, fields, getNulls(), values, getSeparator());
                if (isEmpty(se)) {
                    continue;
                }
                if (se.containsKey(TIMESTAMP)) {
                    if (((Long) se.get(TIMESTAMP)) <= previousCheckPoint) {
                        continue;
                    }
                } else {// assuming useCounterForCheckPoint = true

                    if (logger.isDebugEnabled()) {
                        String symbol = (currentLine < previousCheckPoint) ? " < " : " >= ";
                        logger.debug("currentLine=" + currentLine + symbol + "checkpoint=" + previousCheckPoint);
                    }

                    if (currentLine < previousCheckPoint) {// skipping already read lines, based on line count
                        logger.debug("skipping");
                        currentLine++;
                        continue;
                    }
                }
                if (quit) {
                    if (se.containsKey(TIMESTAMP)) {
                        if (currentTimeStamp == null || !currentTimeStamp.equals(se.get(TIMESTAMP))) {
                            break;
                        }
                    } else {
                        break;
                    }
                }
                toReturn.add(se);
                currentLine++;
                loggedNoChange = false;
                if (toReturn.size() >= samplingCountPerPeriod) {
                    // Move outside the loop as in each call we only read x values;
                    // But if we use timeStampMode, still check the next value, since
                    // if the timestamp is the same we have to return it, or data
                    // would be lost.
                    logger.trace("Time to quit.");
                    quit = true;
                    if (se.containsKey(TIMESTAMP)) {
                        currentTimeStamp = se.get(TIMESTAMP);
                    } else {
                        break;
                    }
                }
            }
            if (logger.isDebugEnabled() && toReturn.isEmpty() && !loggedNoChange) {
                logger.debug("There is no new item after most recent checkpoint(previousCheckPoint:"
                        + new DateTime(previousCheckPoint) + ").");
                loggedNoChange = true;
            }
        } finally {
            // Release the reader even when convertTo rejects a row with an exception.
            reader.close();
        }
        return toReturn;
    }

    /**
     * Tells whether a parsed row carries no data at all.
     *
     * @param se the parsed row
     * @return {@code true} if the map has no entries or all of its values are null
     */
    private boolean isEmpty(Map<String, Serializable> se) {
        boolean allNull = true;
        for (Serializable value : se.values()) {
            if (value != null) {
                allNull = false;
                break;
            }
        }
        return allNull;
    }

    /**
     * Converts one CSV row into a map from field name to typed value.
     *
     * <p>Every field starts out mapped to null. Values recognised as null
     * tokens are left that way; {@code numeric} columns become
     * {@code Double}, {@code string} columns stay {@code String}. Timestamp
     * columns that share a field name are joined (values and patterns alike)
     * with {@code separator} and then parsed as a whole into epoch
     * milliseconds, using this handler's time zone. Left-padded timestamp
     * columns are zero-padded in place inside {@code values}.
     *
     * @param formats    per-column formats
     * @param fields     per-column field names
     * @param nullValues tokens that are treated as null
     * @param values     the raw column values of one row
     * @param separator  character used to join timestamp parts
     * @return the converted row, keyed case-insensitively
     * @throws NumberFormatException    if a numeric column cannot be parsed
     * @throws IllegalArgumentException if a timestamp cannot be parsed
     */
    public TreeMap<String, Serializable> convertTo(String[] formats, String[] fields, String nullValues[],
            String[] values, char separator) {
        TreeMap<String, Serializable> streamElement = new TreeMap<String, Serializable>(
                new CaseInsensitiveComparator());
        for (String field : fields) {
            streamElement.put(field, null);
        }

        // Accumulated Joda-Time pattern per timestamp field.
        HashMap<String, String> timeStampFormats = new HashMap<String, String>();
        int columnCount = Math.min(fields.length, values.length);
        for (int col = 0; col < columnCount; col++) {
            if (isNull(nullValues, values[col])) {
                continue;
            }
            String name = fields[col];
            String type = formats[col];
            if (type.equalsIgnoreCase("numeric")) {
                try {
                    streamElement.put(name, Double.parseDouble(values[col]));
                } catch (NumberFormatException e) {
                    logger.error("Parsing to Numeric fails: Value to parse=" + values[col]);
                    throw e;
                }
            } else if (type.equalsIgnoreCase("string")) {
                streamElement.put(name, values[col]);
            } else if (isTimeStampFormat(type)) {
                String combinedValue = "";
                String combinedPattern = "";
                Serializable earlier = streamElement.get(name);
                if (earlier != null) {
                    // A previous column already contributed to this field: append to it.
                    combinedValue = (String) earlier + separator;
                    combinedPattern = timeStampFormats.get(name) + separator;
                }
                String pattern = getTimeStampFormat(type);
                if (isTimeStampLeftPaddedFormat(type)) {
                    values[col] = StringUtils.leftPad(values[col], pattern.length(), '0');
                }
                combinedValue += values[col];
                combinedPattern += pattern;
                streamElement.put(name, combinedValue);
                timeStampFormats.put(name, combinedPattern);
            }
        }

        // Replace each assembled timestamp text by its epoch-millisecond value.
        for (Map.Entry<String, String> entry : timeStampFormats.entrySet()) {
            String timeField = entry.getKey();
            String timeFormat = entry.getValue();
            String timeValue = (String) streamElement.get(timeField);
            try {
                DateTime parsed = DateTimeFormat.forPattern(timeFormat).withZone(getTimeZone())
                        .parseDateTime(timeValue);
                streamElement.put(timeField, parsed.getMillis());
            } catch (IllegalArgumentException e) {
                logger.error("Parsing error: TimeFormat=" + timeFormat + " , TimeValue=" + timeValue);
                logger.error(e.getMessage(), e);
                throw e;
            }
        }

        return streamElement;
    }

    /**
     * Extracts the pattern from a timestamp format such as
     * {@code timestamp(d.M.y)} or {@code timestampl(HHmm)}: the text between
     * the opening prefix and the first {@code ')'}, trimmed. The prefix is
     * recognised case-insensitively, matching {@code isTimeStampFormat}.
     *
     * @param input a format for which {@code isTimeStampFormat} returns true
     * @return the pattern inside the parentheses
     */
    public static String getTimeStampFormat(String input) {
        String leftPaddedPrefix = "timestampl(";
        String plainPrefix = "timestamp(";
        // regionMatches(ignoreCase = true) compares per character and does not depend on the default locale.
        boolean leftPadded = input.regionMatches(true, 0, leftPaddedPrefix, 0, leftPaddedPrefix.length());
        int start = leftPadded ? leftPaddedPrefix.length() : plainPrefix.length();
        return input.substring(start, input.indexOf(")")).trim();
    }

    /**
     * Tells whether {@code input} is a timestamp format, i.e. starts with
     * {@code timestamp(} or {@code timestampl(} (case-insensitive) and ends
     * with {@code )}.
     *
     * @param input the format to test
     * @return {@code true} if the format denotes a timestamp column
     */
    public static boolean isTimeStampFormat(String input) {
        if (!input.endsWith(")")) {
            return false;
        }
        // regionMatches(ignoreCase = true) is independent of the default locale,
        // unlike toLowerCase(), which maps "I" to a dotless i under e.g. Turkish.
        return input.regionMatches(true, 0, "timestamp(", 0, "timestamp(".length())
                || input.regionMatches(true, 0, "timestampl(", 0, "timestampl(".length());
    }

    /**
     * Tells whether {@code input} is a left-padded timestamp format, i.e.
     * starts with {@code timestampl(} (case-insensitive) and ends with
     * {@code )}.
     *
     * @param input the format to test
     * @return {@code true} if values of this column are to be zero-padded before parsing
     */
    public static boolean isTimeStampLeftPaddedFormat(String input) {
        String prefix = "timestampl(";
        // Locale-independent case-insensitive prefix test (toLowerCase() depends on the default locale).
        return input.regionMatches(true, 0, prefix, 0, prefix.length()) && input.endsWith(")");
    }

    /**
     * @return the column separator set by {@code initialize}
     */
    public char getSeparator() {
        return separator;
    }

    /**
     * @return the quote character set by {@code initialize}
     */
    public char getStringSeparator() {
        return stringSeparator;
    }

    /**
     * @return the number of leading lines that {@code parseValues} tells the CSV reader to skip
     */
    public int getSkipFirstXLines() {
        return skipFirstXLines;
    }

    /**
     * Decides whether a raw CSV value stands for "no value".
     *
     * @param possibleNullValues tokens that denote null; compared case-insensitively
     * @param value              the raw value
     * @return {@code true} if {@code value} is null, empty, or (after trimming) equal to one of the tokens
     */
    public static boolean isNull(String[] possibleNullValues, String value) {
        if (value == null || value.length() == 0) {
            return true;
        }
        String candidate = value.trim();
        for (String nullToken : possibleNullValues) {
            if (nullToken.equalsIgnoreCase(candidate)) {
                return true;
            }
        }
        return false;
    }

    /**
     * @return the field names parsed by {@code initialize}; this is the internal array, not a copy
     */
    public String[] getFields() {
        return fields;
    }

    /**
     * Builds the output structure from the configured fields and formats.
     * Timestamp columns are reported as {@code bigint}, {@code numeric}
     * columns as {@code numeric} and everything else as {@code string}.
     * Columns sharing a field name yield a single entry (the last format
     * wins); the order of the result follows the iteration order of the
     * intermediate {@code HashMap}, not the column order.
     *
     * @return one {@code DataField} per distinct field name
     */
    public DataField[] getDataFields() {
        HashMap<String, String> typeByField = new HashMap<String, String>();
        for (int idx = 0; idx < getFields().length; idx++) {
            String name = getFields()[idx];
            String format = getFormats()[idx];
            String gsnType;
            if (isTimeStampFormat(format)) {
                // GSN has no timestamp data type; timestamps are carried as bigint.
                gsnType = "bigint";
            } else if (format.equalsIgnoreCase("numeric")) {
                gsnType = "numeric";
            } else {
                gsnType = "string";
            }
            typeByField.put(name, gsnType);
        }

        DataField[] toReturn = new DataField[typeByField.size()];
        int next = 0;
        for (Map.Entry<String, String> entry : typeByField.entrySet()) {
            toReturn[next++] = new DataField(entry.getKey(), entry.getValue());
        }
        return toReturn;
    }

    /**
     * @return the per-column formats parsed by {@code initialize}; this is the internal array, not a copy
     */
    public String[] getFormats() {
        return formats;
    }

    /**
     * @return the data file path passed to {@code initialize}
     */
    public String getDataFile() {
        return dataFile;
    }

    /**
     * @return the tokens treated as null values, as parsed by {@code initialize}; this is the internal array, not a copy
     */
    public String[] getNulls() {
        return nulls;
    }

    /**
     * Replaces the number of leading lines to skip; {@code parseValues} reads this value on every call.
     *
     * @param skipFirstXLines the new number of lines to skip
     */
    public void setSkipFirstXLines(int skipFirstXLines) {
        this.skipFirstXLines = skipFirstXLines;
    }

    /**
     * @return the time zone set by {@code initialize}; {@code convertTo} applies it when parsing timestamps
     */
    public DateTimeZone getTimeZone() {
        return timeZone;
    }

    /**
     * @return the checkpoint file path set by {@code initialize}
     */
    public String getCheckPointFile() {
        return checkPointFile;
    }

}