com.justgiving.raven.kissmetrics.utils.KissmetricsRowParser.java Source code

Java tutorial

Introduction

Here is the source code for com.justgiving.raven.kissmetrics.utils.KissmetricsRowParser.java

Source

/*
 * Copyright (c) 2014-2015 Giving.com, trading as JustGiving or its affiliates. All Rights Reserved. 
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"). 
 * You may not use this file except in compliance with the License. 
 * A copy of the License is located in the "license" file accompanying this file.
 * 
 * This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 * See the License for 
 * the specific language governing permissions and limitations under the License.
 * 
 * @author Richard Freeman
 * 
 */

package com.justgiving.raven.kissmetrics.utils;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.lang.reflect.Field;
import java.net.URLDecoder;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.Locale;

import com.justgiving.raven.kissmetrics.KissmetricsConstants.TRACKING_COUNTER;
import org.apache.commons.codec.binary.StringUtils;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.hadoop.io.Text;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;

/**
 * Parses raw Kissmetrics JSON rows, enriches them with derived fields (user
 * e-mail / id, event name, formatted timestamps, source file name and bucket
 * path) and returns them as JSON strings.
 *
 * <p>The class can be used row-by-row through
 * {@link #parseJsonRowToValidJson(Text, String, String)} or as a small command
 * line tool through {@link #main(String[])}, which converts a single file or
 * every file of a folder.
 *
 * <p>NOTE(review): {@link #dateFormatter} is a shared {@link SimpleDateFormat},
 * which the JDK documents as not safe for concurrent use; callers that parse
 * rows from several threads need external synchronisation.
 */
public class KissmetricsRowParser {

    static final Logger logger = Logger.getLogger(KissmetricsRowParser.class);

    /** Formats event times using the JVM default time zone. */
    static DateFormat dateFormatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); // %Y-%m-%d %H:%M:%S

    private static final JSONParser jsonParser = new JSONParser();

    /** Value used when a row carries no {@code _server_timestamp}. */
    private static final String NO_SERVER_TIMESTAMP = "0";

    /** Every entry of {@link #OCTAL_ESCAPES} starts with this prefix. */
    private static final String OCTAL_ESCAPE_PREFIX = "\\30";

    /**
     * Textual octal escapes ({@code \302\241} .. {@code \302\277} followed by
     * {@code \303\200} .. {@code \303\277}) that are stripped from rows.
     */
    private static final String[] OCTAL_ESCAPES = buildOctalEscapes();

    /** Builds the ordered table of octal escape sequences to strip. */
    private static String[] buildOctalEscapes() {
        String[] escapes = new String[(0277 - 0241 + 1) + (0277 - 0200 + 1)];
        int next = 0;
        for (int trail = 0241; trail <= 0277; trail++) {
            escapes[next++] = "\\302\\" + Integer.toOctalString(trail);
        }
        for (int trail = 0200; trail <= 0277; trail++) {
            escapes[next++] = "\\303\\" + Integer.toOctalString(trail);
        }
        return escapes;
    }

    /**
     * Returns the name of the character encoding an {@link InputStreamReader}
     * picks when none is specified, i.e. the platform default.
     *
     * @return the encoding name as reported by {@link InputStreamReader#getEncoding()}
     */
    public static String getDefaultCharEncoding() {
        // An in-memory stream holds no OS resources, so the reader is not closed.
        InputStreamReader probe = new InputStreamReader(new ByteArrayInputStream(new byte[] { 'w' }));
        return probe.getEncoding();
    }

    /***
     * Removes textual octal escape pairs such as {@code \303\251} from the
     * input. The pairs are deleted, not decoded.
     * Source: http://www.utf8-chartable.de/unicode-utf8-table.pl?utf8=oct&unicodeinhtml=dec
     *
     * Covered pairs: {@code \302\241}..{@code \302\277} and
     * {@code \303\200}..{@code \303\277}.
     *
     * @param input row text, must not be {@code null}
     * @return the input with every covered escape pair removed
     */
    public static String replaceOctalUft8Char(String input) {
        // Fast path: no covered escape can occur without this prefix.
        if (input.indexOf(OCTAL_ESCAPE_PREFIX) < 0) {
            return input;
        }
        String output = input;
        for (String escape : OCTAL_ESCAPES) {
            output = output.replace(escape, "");
        }
        return output;
    }

    /***
     * Parses one Kissmetrics JSON row and enriches it.
     *
     * <p>Fields added to the JSON object when the source data allows it:
     * {@code user_email}, {@code user_email_back}, {@code user_km_id} (from
     * {@code _p2}, then {@code _p}, so {@code _p} wins), {@code event} (from
     * {@code _n}), {@code km_timestamp}, {@code km_timestamp_mobile},
     * {@code event_timedate}, {@code event_timedate_mobile}, {@code filename}
     * and {@code bucket}.
     *
     * <p>Timestamp selection: when the row has {@code _c} and a
     * {@code _server_timestamp} other than {@code "0"}, the server timestamp is
     * the event time and {@code _t} is kept as the mobile time; otherwise
     * {@code _t} is the event time.
     *
     * @param rawJsonRow            the raw row
     * @param fileNameInputToMapper stored in the {@code filename} field
     * @param filePath              stored in the {@code bucket} field
     * @return a wrapper flagged {@code VALID_JSON_ROW} on success; a wrapper
     *         flagged {@code INVALID_JSON_ROW} / {@code INVALID_DATE} with an
     *         empty JSON string when {@code _t} is missing or a timestamp is
     *         not an integer; {@code null} when the row is empty or cannot be
     *         parsed (the failure is logged)
     */
    @SuppressWarnings("unchecked") // json-simple's JSONObject exposes a raw Map API
    public static KeyRowWrapper parseJsonRowToValidJson(Text rawJsonRow, String fileNameInputToMapper,
            String filePath) {
        try {
            // NOTE(review): kept for backward compatibility; setting this property
            // on every row is presumably redundant -- confirm before removing.
            System.setProperty("file.encoding", "UTF-8");

            // NOTE(review): the encode/decode round trip looks like a no-op for
            // well-formed text; kept so that output stays unchanged.
            String row = rawJsonRow.toString();
            String decodedRaw = new String(row.getBytes(StandardCharsets.UTF_8), StandardCharsets.UTF_8);

            // Strip any remaining octal encoded characters.
            String decoded = replaceOctalUft8Char(decodedRaw);
            boolean wasOctalParsingNeeded = !decoded.equals(decodedRaw);

            if (decoded.isEmpty()) {
                return null;
            }
            JSONObject jsonObject = (JSONObject) jsonParser.parse(decoded);

            // _p2 first, then _p, so that _p overrides values derived from _p2.
            addIdentity(jsonObject, "_p2");
            addIdentity(jsonObject, "_p");

            Object eventName = jsonObject.get("_n");
            if (eventName != null) {
                jsonObject.put("event", eventName.toString());
            }

            Object rawEventTimestamp = jsonObject.get("_t");
            if (rawEventTimestamp == null) {
                return new KeyRowWrapper("", null, TRACKING_COUNTER.INVALID_JSON_ROW,
                        TRACKING_COUNTER.INVALID_DATE);
            }
            String eventTimestamp = rawEventTimestamp.toString();

            Object rawServerTimestamp = jsonObject.get("_server_timestamp");
            String serverTimestamp = rawServerTimestamp == null ? NO_SERVER_TIMESTAMP
                    : rawServerTimestamp.toString();

            // TODO Need a way to resolve mobile identify events
            boolean useServerTime = jsonObject.get("_c") != null
                    && !NO_SERVER_TIMESTAMP.equals(serverTimestamp);
            String timestampValueOutput = useServerTime ? serverTimestamp : eventTimestamp;

            long kmTimeDateMilliSeconds;
            long kmTimeDateMilliSecondsMobile = 0;
            try {
                if (useServerTime) {
                    jsonObject.put("km_timestamp_mobile", eventTimestamp);
                    kmTimeDateMilliSecondsMobile = Long.parseLong(eventTimestamp) * 1000;
                }
                jsonObject.put("km_timestamp", timestampValueOutput);
                kmTimeDateMilliSeconds = Long.parseLong(timestampValueOutput) * 1000;
            } catch (NumberFormatException e) {
                return new KeyRowWrapper("", timestampValueOutput, TRACKING_COUNTER.INVALID_JSON_ROW,
                        TRACKING_COUNTER.INVALID_DATE);
            }

            jsonObject.put("event_timedate", dateFormatter.format(new Date(kmTimeDateMilliSeconds)));
            if (kmTimeDateMilliSecondsMobile > 0) {
                jsonObject.put("event_timedate_mobile",
                        dateFormatter.format(new Date(kmTimeDateMilliSecondsMobile)));
            }

            // Map Reduce input file name and bucket path.
            jsonObject.put("filename", fileNameInputToMapper);
            jsonObject.put("bucket", filePath);

            // TODO add the time the record was processed by Mapper:
            //jsonObject.put("capturedDate", capturedDate);

            // Report the octal counter only for rows that actually needed stripping.
            return new KeyRowWrapper(jsonObject.toString(), timestampValueOutput,
                    TRACKING_COUNTER.VALID_JSON_ROW,
                    wasOctalParsingNeeded ? TRACKING_COUNTER.OCTAL_PARSING_NEEDED : null);

        } catch (Exception e) {
            // Boundary catch: a malformed row must not stop the whole job.
            logRowFailure(e, rawJsonRow, fileNameInputToMapper);
        }
        return null;
    }

    /**
     * Copies the identity stored under {@code sourceKey} into
     * {@code user_email} / {@code user_email_back} when it contains an
     * {@code @}, otherwise into {@code user_km_id} when it is not empty.
     */
    @SuppressWarnings("unchecked") // json-simple's JSONObject exposes a raw Map API
    private static void addIdentity(JSONObject jsonObject, String sourceKey) {
        Object raw = jsonObject.get(sourceKey);
        if (raw == null) {
            return;
        }
        // Locale.ROOT keeps lower-casing independent of the JVM default locale.
        String identity = raw.toString().toLowerCase(Locale.ROOT);
        if (identity.contains("@")) {
            jsonObject.put("user_email", identity);
            jsonObject.put("user_email_back", identity);
        } else if (!identity.isEmpty()) {
            jsonObject.put("user_km_id", identity);
        }
    }

    /** Logs a row that could not be parsed, together with its source file. */
    private static void logRowFailure(Exception e, Text rawJsonRow, String fileNameInputToMapper) {
        StringWriter errors = new StringWriter();
        e.printStackTrace(new PrintWriter(errors));
        logger.error(errors.toString());

        logger.error("log - file " + fileNameInputToMapper);
        System.out.println("file " + fileNameInputToMapper);

        String rowContent = String.valueOf(rawJsonRow).replace("\t", "");
        logger.error("log - row content: " + rowContent);
        System.err.println("row content: " + rowContent);

        System.err.println("Error skipping row");
        logger.error("Log - Error skipping row");
    }

    /**
     * Parses a single row with a placeholder input file name.
     *
     * @param rawJsonRow      the raw row
     * @param output_filename stored in the {@code bucket} field of the result
     * @return the enriched JSON string, an empty string for rows flagged
     *         invalid, or {@code null} when the row is empty or unparseable
     * @throws FileNotFoundException never thrown by this implementation; kept
     *                               for signature compatibility
     */
    public static String runOnStringJson(Text rawJsonRow, String output_filename) throws FileNotFoundException {
        KeyRowWrapper parsed = parseJsonRowToValidJson(rawJsonRow, "pathtocurrentfile", output_filename);
        return parsed == null ? null : parsed.jsonrow;
    }

    /**
     * Returns the current time formatted with {@link #dateFormatter}.
     *
     * @return the current date and time as {@code yyyy-MM-dd HH:mm:ss}
     */
    public static String getCurrentDate() {
        return dateFormatter.format(new Date());
    }

    /**
     * Converts every line of {@code input_filename} and writes the results,
     * one per line, to {@code output_filename} (overwriting it). Both files
     * are handled as UTF-8. Rows the parser rejects with {@code null} are
     * skipped; I/O failures are logged, not thrown.
     *
     * @param input_filename  file holding one raw JSON row per line
     * @param output_filename destination file, created when missing
     * @throws IOException never thrown by this implementation; kept for
     *                     signature compatibility
     */
    public static void runonfileValidJson(String input_filename, String output_filename) throws IOException {
        try {
            if (new File(output_filename).createNewFile()) {
                logger.warn("File has been created");
            }

            // try-with-resources flushes and closes both files even when a row fails.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(input_filename), StandardCharsets.UTF_8));
                    BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
                            new FileOutputStream(output_filename, false), StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    String parsedLine = runOnStringJson(new Text(line), output_filename);
                    if (parsedLine == null) {
                        continue; // empty or unparseable row
                    }
                    writer.write(parsedLine);
                    writer.write("\n");
                }
            }
            logger.info("Ouput written to " + output_filename);
        } catch (IOException e) {
            logger.error("Error writing to file '" + output_filename + "'", e);
        } catch (RuntimeException e) {
            // Best effort per file, as before: report and let the caller continue.
            logger.error("Unexpected error while processing '" + input_filename + "'", e);
        }
    }

    /**
     * Runs {@link #runonfileValidJson(String, String)} for every regular file
     * directly inside {@code inputFolder}, writing a file of the same name to
     * {@code outputFolder}. Sub-directories are ignored.
     */
    private static void processFolder(String inputFolder, String outputFolder) throws IOException {
        File outputDir = new File(outputFolder);
        if (!outputDir.exists()) {
            if (outputDir.mkdirs()) {
                logger.info("Directory successfully created");
            } else {
                logger.error("Failed to create directory");
            }
        }

        // listFiles() yields null when the path is not a readable directory.
        File[] listOfFiles = new File(inputFolder).listFiles();
        if (listOfFiles == null) {
            logger.error("Unable to list files of folder '" + inputFolder + "'");
            return;
        }
        for (File currentFile : listOfFiles) {
            if (!currentFile.isFile()) {
                continue;
            }
            logger.info("File " + currentFile.getName());
            runonfileValidJson(Paths.get(inputFolder, currentFile.getName()).toString(),
                    Paths.get(outputFolder, currentFile.getName()).toString());
        }
    }

    /**
     * Command line entry point.
     *
     * <p>Usage: {@code inputFilePath outputFilePath}. Without arguments the
     * built-in default paths are used. An input path that ends with a path
     * separator or names an existing directory is processed as a folder.
     *
     * @param args optional input and output path
     */
    public static void main(String[] args) throws FileNotFoundException, IOException {
        for (String arg : args) {
            System.out.println(arg);
        }

        // Configure log4j first so that the messages below are not lost.
        String logConfigPath = Paths.get(System.getProperty("user.dir"), "log4j.properties").toString();
        File logConfig = new File(logConfigPath);
        if (logConfig.exists() && !logConfig.isDirectory()) {
            System.out.println("log config file used: " + logConfigPath);
            PropertyConfigurator.configure(logConfigPath);
            logger.info("log config file used: " + logConfigPath);
        } else {
            System.out.println(
                    "no log file detected, please copy the log4j.properties to the same folder as the JAR");
        }

        String inputFile = "D:\\datasets\\kissmetrics\\input\\2250.json";
        String outputFile = "D:\\datasets\\kissmetrics\\output\\2250.json";
        if (args.length == 2) {
            inputFile = args[0];
            outputFile = args[1];
        } else if (args.length == 0) {
            logger.info("using defaul values for inputFile=" + inputFile + " outputFile=" + outputFile);
        } else {
            logger.warn("Expected arguments: inputFilePath outputFilePath, but got " + args.length
                    + " argument(s); using default values for inputFile=" + inputFile + " outputFile="
                    + outputFile);
        }

        boolean isFolder = inputFile.endsWith("\\") || inputFile.endsWith("/")
                || new File(inputFile).isDirectory();
        if (isFolder) {
            logger.info("Detected folder");
            processFolder(inputFile, outputFile);
        } else {
            logger.info("Detected file");
            runonfileValidJson(inputFile, outputFile);
        }
    }
}