com.nistfortunetellers.cleaning.NISTClean.java Source code

Introduction

Here is the source code for com.nistfortunetellers.cleaning.NISTClean.java.
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.nistfortunetellers.cleaning;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

/**
 * Driver for the "Sergio Cleaning" MapReduce pass.
 *
 * <p>For every detector file found under {@code <inputDir>/test} whose name contains
 * {@code .csv}, this class runs one Text-in/Text-out job (using {@code SergioMapper} and
 * {@code SergioReducer}) into {@code <tempDir>/<file>} and then merges the job's output
 * directory into the single file {@code <unsortedDir>/unsorted_<file>}.
 *
 * <p>Before any job is started, the lane-to-zone mapping in
 * {@code <inputDir>/detector_lane_inventory.csv} is loaded into the job
 * {@link Configuration} as integer properties keyed by lane ID.
 */
public class NISTClean {

    /**
     * When set, the name-sorted detector files are processed in descending order.
     * The original author's note says this "does the largest files first"; the code
     * itself only sorts by pathname, so that holds only if larger files sort later
     * by name (NOTE(review): confirm against the data set).
     */
    public static final boolean RUN_IN_REVERSE = true;

    // The four constants below are not read anywhere in this class; presumably they are
    // shared with the mapper/reducer classes of this package (verify against those classes).
    public static final String DATE_FORMAT = "yyyy-MM-dd HH:mm";
    public static final String KEY_SEP = "\t";
    public static final String KEY_SPECIAL = "#";
    public static final String SERGIO_REASON = "2";

    /** Path of the lane inventory CSV, derived from the input directory in {@link #main}. */
    private static String LANE_ZONE_MAPPINGS;
    /** Root input directory (first command-line argument). */
    private static String DIR_INPUT;
    /** Directory receiving the merged {@code unsorted_<file>} outputs (third argument). */
    private static String DIR_UNSORTED;
    /** Directory receiving each job's raw output directory (second argument). */
    private static String DIR_TEMP;
    /** Directory that is scanned for detector CSV files ({@code <inputDir>/test}). */
    private static String DIR_DETECTOR_FILES;

    /**
     * Entry point.
     *
     * @param args {@code args[0]} input directory, {@code args[1]} temp directory for raw
     *             job output, {@code args[2]} directory for the merged unsorted output
     * @throws IllegalArgumentException if fewer than three arguments are supplied
     * @throws FileNotFoundException    if the detector directory cannot be listed
     * @throws Exception                declared for compatibility with the original signature
     */
    public static void main(String[] args) throws Exception {
        // Fail with a usage hint instead of an ArrayIndexOutOfBoundsException.
        if (args == null || args.length < 3) {
            throw new IllegalArgumentException(
                    "Usage: NISTClean <inputDir> <tempDir> <unsortedOutputDir>");
        }

        //I/O Vars Setup
        DIR_INPUT = args[0];
        DIR_TEMP = args[1];
        DIR_UNSORTED = args[2];
        LANE_ZONE_MAPPINGS = DIR_INPUT + "/detector_lane_inventory.csv";
        DIR_DETECTOR_FILES = DIR_INPUT + "/test";

        //Job Setup: the lane/zone pairs travel to the tasks inside the configuration.
        Configuration sergioCleanConfig = new Configuration();
        addLaneIDsToConfig(sergioCleanConfig);

        File folder = new File(DIR_DETECTOR_FILES);
        File[] files = folder.listFiles();
        // listFiles() returns null when the path is not a directory or cannot be read.
        if (files == null) {
            throw new FileNotFoundException(
                    "Detector file directory does not exist or cannot be read: " + DIR_DETECTOR_FILES);
        }
        Arrays.sort(files);

        //Swap the file order if necessary
        if (RUN_IN_REVERSE) {
            reverseInPlace(files);
        }

        System.out.println("Detected Files:");
        for (File file : files) {
            if (isCsvFile(file)) {
                System.out.println(" - " + file.getName());
            }
        }

        for (File file : files) {
            if (!isCsvFile(file)) {
                continue;
            }
            String filename = file.getName();
            System.out.println("Cleaning File: " + filename);
            cleanFile(sergioCleanConfig, filename);
        }
    }

    /** Returns whether the entry's name contains {@code .csv} (the filter used for detector files). */
    private static boolean isCsvFile(File file) {
        return file.getName().contains(".csv");
    }

    /** Reverses the array in place by swapping mirrored positions. */
    private static void reverseInPlace(File[] files) {
        for (int low = 0, high = files.length - 1; low < high; ++low, --high) {
            File swapped = files[low];
            files[low] = files[high];
            files[high] = swapped;
        }
    }

    /**
     * Cleans one detector file: runs the job into the temp directory, then merges the
     * job output into {@code unsorted_<file>}. The merge is skipped when the job did not
     * complete successfully, because the merge deletes its source directory and would
     * otherwise turn partial output into a seemingly valid result.
     *
     * @param config configuration carrying the lane/zone mappings
     * @param file   bare file name (no directory) of the detector file
     */
    private static void cleanFile(Configuration config, String file) {
        String input = DIR_DETECTOR_FILES + '/' + file;
        String tempOutput = DIR_TEMP + '/' + file;
        String mergedOutput = DIR_UNSORTED + "/unsorted_" + file;
        boolean succeeded = runTextJobChecked("Sergio Cleaning", config, input, tempOutput,
                SergioMapper.class, SergioReducer.class);
        if (!succeeded) {
            System.err.println("Job failed for " + file + "; skipping merge of " + tempOutput);
            return;
        }
        mergeOutput(config, tempOutput, mergedOutput);
    }

    /**
     * Runs a job whose input and output formats are text and whose output key and value
     * classes are both {@link Text}. Blocks until the job finishes. Failures are reported
     * on standard error and are not propagated to the caller.
     *
     * @param jobName    display name of the job
     * @param jobConfig  configuration the job is created from
     * @param inputPath  path added as the job's input
     * @param outputPath path set as the job's output directory
     * @param mapper     mapper class
     * @param reducer    reducer class
     */
    @SuppressWarnings({ "deprecation", "rawtypes" })
    static void runTextJob(String jobName, Configuration jobConfig, String inputPath, String outputPath,
            Class<? extends Mapper> mapper, Class<? extends Reducer> reducer) {
        runTextJobChecked(jobName, jobConfig, inputPath, outputPath, mapper, reducer);
    }

    /**
     * Implementation of {@link #runTextJob} that also reports the outcome.
     *
     * @return {@code true} only if the job was submitted and {@code waitForCompletion}
     *         reported success; {@code false} on job failure or on any caught exception
     */
    @SuppressWarnings({ "deprecation", "rawtypes" })
    private static boolean runTextJobChecked(String jobName, Configuration jobConfig, String inputPath,
            String outputPath, Class<? extends Mapper> mapper, Class<? extends Reducer> reducer) {
        try {
            Job genericJob = new Job(jobConfig, jobName);
            genericJob.setJarByClass(NISTClean.class);
            genericJob.setOutputKeyClass(Text.class);
            genericJob.setOutputValueClass(Text.class);
            genericJob.setMapperClass(mapper);
            genericJob.setReducerClass(reducer);
            genericJob.setInputFormatClass(TextInputFormat.class);
            genericJob.setOutputFormatClass(TextOutputFormat.class);
            FileInputFormat.addInputPath(genericJob, new Path(inputPath));
            FileOutputFormat.setOutputPath(genericJob, new Path(outputPath));
            return genericJob.waitForCompletion(true);
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            // Restore the interrupt flag so code further up the stack can still observe it.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
        return false;
    }

    /**
     * Merges the files of a job output directory into one file, deleting the source
     * directory afterwards ({@code deleteSource = true}).
     *
     * @param jobConf configuration used to resolve both file systems
     * @param input   job output directory to merge
     * @param output  destination file
     * @return the file system resolved for {@code output}, or {@code null} if an
     *         {@link IOException} occurred before it could be resolved
     */
    static FileSystem mergeOutput(Configuration jobConf, String input, String output) {
        FileSystem jobMergeFS = null;
        try {
            Path jobRaw = new Path(input);
            FileSystem jobFS = jobRaw.getFileSystem(jobConf);
            Path jobMerge = new Path(output);
            // Resolve the destination file system from the destination path; using the
            // source path here breaks merges whose output lives on a different file system.
            jobMergeFS = jobMerge.getFileSystem(jobConf);
            boolean merged = org.apache.hadoop.fs.FileUtil.copyMerge(
                    jobFS, jobRaw, jobMergeFS, jobMerge, true, jobConf, "");
            if (!merged) {
                System.err.println("Merge reported no result for " + input + " -> " + output);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }

        return jobMergeFS;
    }

    /**
     * Loads the lane inventory CSV and stores each {@code laneID -> zoneID} pair in the
     * configuration via {@code setInt}. The first line is treated as a header and skipped.
     * Lines with fewer than two comma-separated fields, or whose second field is not an
     * integer, are skipped. I/O problems are reported on standard error and leave the
     * configuration with whatever mappings were read up to that point.
     *
     * @param config configuration that receives the mappings
     */
    static void addLaneIDsToConfig(Configuration config) {
        final String splitChar = ",";
        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader(LANE_ZONE_MAPPINGS));
            //Skip First Line, since it is the keys.
            br.readLine();

            String line;
            while ((line = br.readLine()) != null) {
                String[] splits = line.split(splitChar);
                //make sure we aren't just on a blank line
                if (splits.length < 2) {
                    continue;
                }

                String laneID = splits[0].trim();
                int zoneID;
                try {
                    // Trim like the lane ID so "lane, 5" is parsed instead of silently dropped.
                    zoneID = Integer.parseInt(splits[1].trim());
                } catch (NumberFormatException e) {
                    //If format doesn't work out so well, no worries. Just skip.
                    continue;
                }

                //DEBUG: Print Output
                System.out.println("LaneID: " + laneID + ", zoneID: " + zoneID);
                //Now actually set the config!
                config.setInt(laneID, zoneID);
            }
        } catch (FileNotFoundException e) {
            System.err.println("Lane/zone mapping file not found: " + LANE_ZONE_MAPPINGS);
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}