Java tutorial
/* Copyright (C) 2016 New York University This file is part of Data Polygamy which is released under the Revised BSD License See file LICENSE for full license details. */ package edu.nyu.vida.data_polygamy.pre_processing; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.Iterator; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Mapper; import edu.nyu.vida.data_polygamy.scalar_function.Aggregation; import edu.nyu.vida.data_polygamy.scalar_function.Count; import edu.nyu.vida.data_polygamy.resolution.SpatialResolution; import edu.nyu.vida.data_polygamy.resolution.SpatialResolutionUtils; import edu.nyu.vida.data_polygamy.resolution.ToCity; import edu.nyu.vida.data_polygamy.utils.FrameworkUtils; import edu.nyu.vida.data_polygamy.utils.FrameworkUtils.AggregationArrayWritable; import edu.nyu.vida.data_polygamy.utils.FrameworkUtils.Function; import edu.nyu.vida.data_polygamy.utils.FrameworkUtils.MultipleSpatioTemporalWritable; /** * * @author fchirigati * */ public class PreProcessingMapper extends Mapper<LongWritable, Text, MultipleSpatioTemporalWritable, AggregationArrayWritable> { public static FrameworkUtils utils = new FrameworkUtils(); Configuration conf = null; long records = 0L; boolean s3 = true; // output MultipleSpatioTemporalWritable keyWritable = new MultipleSpatioTemporalWritable(); AggregationArrayWritable valueWritable = new AggregationArrayWritable(); // input parameters int temporalResolution, spatialResolution, gridResolution, currentSpatialResolution; int[] xPositions, yPositions, spatialPos, temporalPos; // parameter names String[] parameterNames, keyNames, paramDefaults; // aggregation functions for each parameter boolean aggregatesInit = false; ArrayList<Integer> aggregatesIndex = new ArrayList<Integer>(); HashMap<Integer, String> aggregates = new HashMap<Integer, String>(); HashMap<Integer, Function> aggregateFunctions = new HashMap<Integer, Function>(); SpatialResolution spatialTranslation = null; int sizeSpatioTemp = 0; int nbParameters = 1; private void identifyAggregates(String[] input) { String[] inputTest = Arrays.copyOf(input, input.length); // skip spatio-temporal attributes // assuming reading first spatial than temporal for the headers keyNames = new String[xPositions.length + spatialPos.length + temporalPos.length]; int keyNamesIndex = 0; if (xPositions.length == 0) for (int i = 0; i < spatialPos.length; i++) { inputTest[spatialPos[i]] = null; keyNames[keyNamesIndex++] = parameterNames[spatialPos[i]].trim(); } else { for (int i = 0; i < xPositions.length; i++) { inputTest[xPositions[i]] = null; inputTest[yPositions[i]] = null; keyNames[keyNamesIndex++] = parameterNames[xPositions[i]].trim(); } } for (int i = 0; i < temporalPos.length; i++) { inputTest[temporalPos[i]] = null; keyNames[keyNamesIndex++] = parameterNames[temporalPos[i]].trim(); } /*System.out.println("Identifying attributes..."); for (int i = 0; i < keyNames.length; i++) System.out.print(keyNames[i] + ","); System.out.println();*/ // count aggregates.put(-1, (nbParameters - 1) + "-" + FrameworkUtils.functionToString(Function.COUNT) + "-" + parameterNames[0].trim()); aggregateFunctions.put(-1, Function.COUNT); for (int i = 0; i < inputTest.length; i++) { if (inputTest[i] == null) continue; if (inputTest[i].startsWith("$") && inputTest[i].endsWith("$")) continue; if (inputTest[i].startsWith("\"") && inputTest[i].endsWith("\"")) continue; /*if (!FrameworkUtils.isNumeric(inputTest[i])) continue;*/ nbParameters++; // id fields -- only use unique aggregate String parameterNameLowerCase = parameterNames[i].toLowerCase(); if (parameterNameLowerCase.contains("id") || parameterNameLowerCase.contains("key") || parameterNameLowerCase.contains("name")) { aggregates.put(i, (nbParameters - 1) + "-" + FrameworkUtils.functionToString(Function.UNIQUE) + "-" + parameterNames[i].trim()); aggregateFunctions.put(i, Function.UNIQUE); continue; } aggregates.put(i, (nbParameters - 1) + "-" + FrameworkUtils.functionToString(Function.AVERAGE) + "-" + parameterNames[i].trim()); aggregateFunctions.put(i, Function.AVERAGE); } Iterator<Integer> it = aggregates.keySet().iterator(); while (it.hasNext()) aggregatesIndex.add(it.next()); } @Override public void setup(Context context) throws IOException, InterruptedException { conf = context.getConfiguration(); String bucket = conf.get("bucket", ""); if (bucket.equals("")) s3 = false; // defaults Path defaults = new Path(conf.get("defaults", "")); FileSystem fs = null; BufferedReader br = null; if (!s3) fs = FileSystem.get(new Configuration()); else fs = FileSystem.get(defaults.toUri(), conf); br = new BufferedReader(new InputStreamReader(fs.open(defaults))); paramDefaults = br.readLine().split(","); br.close(); if (s3) fs.close(); temporalResolution = utils.temporalResolution(conf.get("temporal-resolution")); spatialResolution = utils.spatialResolution(conf.get("spatial-resolution")); currentSpatialResolution = utils.spatialResolution(conf.get("current-spatial-resolution")); gridResolution = (conf.get("grid-resolution", "").equals("")) ? 0 : Integer.parseInt(conf.get("grid-resolution", "")); sizeSpatioTemp = Integer.parseInt(conf.get("size-spatio-temporal", "0")); // positions String[] temporalArray = (conf.get("temporal-pos", "").equals("")) ? new String[0] : conf.get("temporal-pos", "").split(","); String[] spatialPosArray = (conf.get("spatial-pos", "").equals("")) ? new String[0] : conf.get("spatial-pos", "").split(","); String[] xPositionsArray = (conf.get("xPositions", "").equals("")) ? new String[0] : conf.get("xPositions", "").split(","); String[] yPositionsArray = (conf.get("yPositions", "").equals("")) ? new String[0] : conf.get("yPositions", "").split(","); temporalPos = FrameworkUtils.getIntArray(temporalArray); spatialPos = FrameworkUtils.getIntArray(spatialPosArray); xPositions = FrameworkUtils.getIntArray(xPositionsArray); yPositions = FrameworkUtils.getIntArray(yPositionsArray); // reading header Path header = new Path(conf.get("header", "")); if (s3) fs = FileSystem.get(header.toUri(), conf); br = new BufferedReader(new InputStreamReader(fs.open(header))); parameterNames = br.readLine().split(",", -1); br.close(); if (s3) fs.close(); /** * Spatial Resolution */ switch (currentSpatialResolution) { case FrameworkUtils.POINTS: spatialTranslation = SpatialResolutionUtils.pointsResolution(spatialResolution, gridResolution, xPositions, yPositions, conf); break; case FrameworkUtils.NBHD: spatialTranslation = SpatialResolutionUtils.nbhdResolution(spatialResolution, spatialPos); break; case FrameworkUtils.ZIP: spatialTranslation = SpatialResolutionUtils.zipResolution(spatialResolution, spatialPos, true, conf); break; case FrameworkUtils.GRID: spatialTranslation = SpatialResolutionUtils.gridResolution(spatialResolution, spatialPos); break; case FrameworkUtils.CITY: spatialTranslation = new ToCity(spatialPos); break; default: System.out.println("Something is wrong..."); System.exit(-1); break; } } @Override public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { String[] input; try { input = FrameworkUtils.splitStr(value.toString(), parameterNames.length); } catch (IOException e) { System.out.println("Error while parsing line: " + e.getLocalizedMessage()); return; } /** * Spatial Resolution */ ArrayList<Integer> spatial = spatialTranslation.translate(input); /** * Temporal Resolution */ ArrayList<Integer> temporal = new ArrayList<Integer>(); for (int tempPos : temporalPos) { int temp = FrameworkUtils.getTime(temporalResolution, input, tempPos); if (temp != -1) temporal.add(temp); } if ((spatial.size() <= 0) || (temporal.size() <= 0)) { System.out.println("Spatial size: " + spatial.size() + " | Temporal size: " + temporal.size()); return; } if ((spatial.size() != sizeSpatioTemp) || (temporal.size() != sizeSpatioTemp)) { System.out.println("Spatial size: " + spatial.size() + " | Temporal size: " + temporal.size()); return; } records++; // identifying all the aggregates for each parameter // done only once if (!aggregatesInit) { identifyAggregates(input); aggregatesInit = true; } /* * getting the parameters * null and default values are ignored */ ArrayList<Aggregation> output = new ArrayList<Aggregation>(); Iterator<Integer> it = aggregatesIndex.iterator(); Float defaultVal; while (it.hasNext()) { int index = it.next(); Float floatVal = 0f; // count if (index == -1) { Count agg = new Count(); agg.addValue(floatVal); output.add(agg); continue; } // others try { floatVal = Float.parseFloat(input[index]); if (floatVal == null) floatVal = Float.NaN; } catch (NumberFormatException e) { floatVal = Float.NaN; } if (!paramDefaults[index].equals("NONE")) { try { defaultVal = Float.parseFloat(paramDefaults[index]); if (floatVal.equals(defaultVal)) floatVal = Float.NaN; } catch (NumberFormatException e) { } } Aggregation agg = FrameworkUtils.getAggregation(aggregateFunctions.get(index)); agg.addValue(floatVal); output.add(agg); } keyWritable = new MultipleSpatioTemporalWritable(spatial, temporal); valueWritable = new AggregationArrayWritable(output); context.write(keyWritable, valueWritable); } @Override public void cleanup(Context context) throws IOException { if (records > 0) { conf = context.getConfiguration(); Path headerFile = null; FileSystem fs = null; if (s3) { headerFile = new Path(conf.get("aggregates", "")); fs = FileSystem.get(headerFile.toUri(), conf); } else { fs = FileSystem.get(new Configuration()); headerFile = new Path( fs.getHomeDirectory() + "/" + context.getConfiguration().get("aggregates", "")); } // we cannot have multiple mappers writing the same file if (fs.exists(headerFile)) { if (s3) fs.close(); return; } FSDataOutputStream fsDataOutputStream = fs.create(headerFile); String output = ""; for (int i = 0; i < keyNames.length; i++) { if (keyNames[i] != null) output += keyNames[i] + ","; } output = output.substring(0, output.length() - 1) + "\t"; Iterator<Integer> it = aggregatesIndex.iterator(); while (it.hasNext()) { Integer index = it.next(); output += aggregates.get(index) + ","; } BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream)); bw.write(output.substring(0, output.length() - 1) + "\n"); bw.write(String.valueOf(nbParameters)); bw.close(); if (s3) fs.close(); } } }