Java tutorial
/*********************************************************************** * Copyright (c) 2015 by Regents of the University of Minnesota. * All rights reserved. This program and the accompanying materials * are made available under the terms of the Apache License, Version 2.0 which * accompanies this distribution and is available at * http://www.opensource.org/licenses/apache2.0.php. * *************************************************************************/ package edu.umn.cs.sthadoop.trajectory; import java.io.IOException; import java.text.ParseException; import java.util.List; import java.util.Vector; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.LocalJobRunner; import org.apache.hadoop.mapred.Task; import org.apache.hadoop.mapreduce.Counter; import org.apache.hadoop.mapreduce.Counters; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.lib.input.FileSplit; import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat; import org.apache.hadoop.util.GenericOptionsParser; import edu.umn.cs.spatialHadoop.OperationsParams; import edu.umn.cs.spatialHadoop.core.Rectangle; import edu.umn.cs.spatialHadoop.core.ResultCollector; import edu.umn.cs.spatialHadoop.core.Shape; import edu.umn.cs.spatialHadoop.io.Text2; import edu.umn.cs.spatialHadoop.io.TextSerializable; import edu.umn.cs.spatialHadoop.mapred.TextOutputFormat3; import edu.umn.cs.spatialHadoop.mapreduce.RTreeRecordReader3; import edu.umn.cs.spatialHadoop.mapreduce.SpatialInputFormat3; import edu.umn.cs.spatialHadoop.mapreduce.SpatialRecordReader3; import edu.umn.cs.spatialHadoop.nasa.HDFRecordReader; import edu.umn.cs.spatialHadoop.operations.RangeQuery; import edu.umn.cs.spatialHadoop.util.Parallel; import edu.umn.cs.spatialHadoop.util.Parallel.RunnableRange; import edu.umn.cs.sthadoop.core.QueryPlanner; import edu.umn.cs.sthadoop.core.STPoint; import edu.umn.cs.spatialHadoop.util.ResultCollectorSynchronizer; /** * Performs a trajectory range query over a spatio-temporal file. * * @author Louai Alarabi * */ public class TrajectoryOverlap { /** Logger for RangeQuery */ static final Log LOG = LogFactory.getLog(TrajectoryOverlap.class); /** * The map function used for range query * * @author LouaiAlarabi */ public static class RangeQueryMap extends Mapper<Rectangle, Iterable<Shape>, NullWritable, Shape> { @Override protected void map(final Rectangle cellMBR, Iterable<Shape> value, final Context context) throws IOException, InterruptedException { NullWritable dummyKey = NullWritable.get(); for (Shape s : value) { context.write(dummyKey, s); } } } public static Job rangeQueryMapReduce(Path inFile, Path outFile, OperationsParams params) throws IOException, ClassNotFoundException, InterruptedException { // Use the built-in range filter of the input format params.set(SpatialInputFormat3.InputQueryRange, params.get("rect")); // Use multithreading in case it is running locally params.setInt(LocalJobRunner.LOCAL_MAX_MAPS, Runtime.getRuntime().availableProcessors()); Job job = new Job(params, "Traj-KNN-distance"); job.setJarByClass(RangeQuery.class); job.setNumReduceTasks(0); job.setInputFormatClass(SpatialInputFormat3.class); SpatialInputFormat3.setInputPaths(job, inFile); job.setMapperClass(RangeQueryMap.class); if (params.getBoolean("output", true) && outFile != null) { job.setOutputFormatClass(TextOutputFormat3.class); TextOutputFormat3.setOutputPath(job, outFile); } else { // Skip writing the output for the sake of debugging job.setOutputFormatClass(NullOutputFormat.class); } // Submit the job if (!params.getBoolean("background", false)) { job.waitForCompletion(false); } else { job.submit(); } return job; } public static List<Path> getIndexedSlices(OperationsParams params) throws Exception { List<Path> slices = null; Path indexPath = params.getInputPath(); Path outputPath = params.getOutputPath(); String fromto = params.get("interval"); String level = params.get("time"); if (fromto.contains(",")) { // query temporal range different date String[] time = fromto.split(","); String fromTime = time[0]; String toTime = time[1]; QueryPlanner plan = new QueryPlanner(params); if (level != null) { slices = plan.getQueryPlanFromResolution(fromTime, toTime, level); } else { slices = plan.getQueryPlan(fromTime, toTime); } return slices; } return slices; } /** * Runs a range query on the local machine (no MapReduce) and the output is * streamed to the provided result collector. The query might run in * parallel which makes it necessary to design the result collector to * accept parallel calls to the method * {@link ResultCollector#collect(Object)}. You can use * {@link ResultCollectorSynchronizer} to synchronize calls to your * ResultCollector if you cannot design yours to be thread safe. * * @param inPath * @param queryRange * @param shape * @param params * @param output * @return * @throws IOException * @throws InterruptedException */ public static <S extends Shape> long rangeQueryLocal(Path inPath, final Shape queryRange, final S shape, final OperationsParams params, final ResultCollector<S> output) throws IOException, InterruptedException { // Set MBR of query shape in job configuration to work with the spatial // filter OperationsParams.setShape(params, SpatialInputFormat3.InputQueryRange, queryRange.getMBR()); // 1- Split the input path/file to get splits that can be processed // independently final SpatialInputFormat3<Rectangle, S> inputFormat = new SpatialInputFormat3<Rectangle, S>(); Job job = Job.getInstance(params); SpatialInputFormat3.setInputPaths(job, inPath); final List<InputSplit> splits = inputFormat.getSplits(job); // 2- Process splits in parallel List<Long> results = Parallel.forEach(splits.size(), new RunnableRange<Long>() { @Override public Long run(int i1, int i2) { long results = 0; for (int i = i1; i < i2; i++) { try { FileSplit fsplit = (FileSplit) splits.get(i); final RecordReader<Rectangle, Iterable<S>> reader = inputFormat.createRecordReader(fsplit, null); if (reader instanceof SpatialRecordReader3) { ((SpatialRecordReader3) reader).initialize(fsplit, params); } else if (reader instanceof RTreeRecordReader3) { ((RTreeRecordReader3) reader).initialize(fsplit, params); } else if (reader instanceof HDFRecordReader) { ((HDFRecordReader) reader).initialize(fsplit, params); } else { throw new RuntimeException("Unknown record reader"); } while (reader.nextKeyValue()) { Iterable<S> shapes = reader.getCurrentValue(); for (Shape s : shapes) { results++; if (output != null) output.collect((S) s); } } reader.close(); } catch (IOException e) { LOG.error("Error processing split " + splits.get(i), e); } catch (InterruptedException e) { LOG.error("Error processing split " + splits.get(i), e); } } return results; } }); long totalResultSize = 0; for (long result : results) totalResultSize += result; return totalResultSize; } private static String getTrajectoryRectangle(String trajectory) { String[] trajectoryPoints = trajectory.split(";"); double x1 = Double.parseDouble(trajectoryPoints[0].split(",")[0]); double y1 = Double.parseDouble(trajectoryPoints[0].split(",")[1]); double x2 = x1; double y2 = y1; double tempDouble; for (String p : trajectoryPoints) { String[] point = p.split(","); // get the minMax of x point tempDouble = Double.parseDouble(point[0]); if (tempDouble <= x1) x1 = tempDouble; if (tempDouble >= x2) x2 = tempDouble; // get the minMax of y. tempDouble = Double.parseDouble(point[1]); if (tempDouble <= y1) y1 = tempDouble; if (tempDouble >= y2) y2 = tempDouble; } return Double.toString(x1) + "," + Double.toString(y1) + "," + Double.toString(x2) + "," + Double.toString(y2); } private static void printUsage() { System.out.println("Runs a spatio-temporal range query on indexed data"); System.out.println("Parameters: (* marks required parameters)"); System.out.println("<input file> - (*) Path to input file"); System.out.println("<output file> - Path to input file"); System.out.println( "shape:<shape:edu.umn.cs.sthadoop.trajectory.GeolifeTrajectory> - (*) Type of shapes stored in input file"); System.out.println("interval:<date1,date2> - Temporal query range. " + "Format of each date is yyyy-mm-dd"); System.out.println("time:[day,week,month,year] - Time Format"); System.out.println("traj:<x1,y1;....;xn,yn> - the Full trajectory"); System.out.println("-overwrite - Overwrite output file without notice"); GenericOptionsParser.printGenericCommandUsage(System.out); } public static void main(String[] args) throws Exception { // args = new String[8]; // args[0] = "/export/scratch/mntgData/geolifeGPS/geolife_Trajectories_1.3/HDFS/index_geolife"; // args[1] = "/export/scratch/mntgData/geolifeGPS/geolife_Trajectories_1.3/HDFS/knn-dis-result"; // args[2] = "shape:edu.umn.cs.sthadoop.trajectory.GeolifeTrajectory"; // args[3] = "interval:2008-05-01,2008-05-30"; // args[4] = "time:month"; // args[5] = "traj:39.9119983,116.606835;39.9119783,116.6065483;39.9119599,116.6062649;39.9119416,116.6059899;39.9119233,116.6057282;39.9118999,116.6054783;39.9118849,116.6052366;39.9118666,116.6050099;39.91185,116.604775;39.9118299,116.604525;39.9118049,116.6042649;39.91177,116.6040166;39.9117516,116.6037583;39.9117349,116.6035066;39.9117199,116.6032666;39.9117083,116.6030232;39.9117,116.6027566;39.91128,116.5969383;39.9112583,116.5966766;39.9112383,116.5964232;39.9112149,116.5961699;39.9111933,116.5959249;39.9111716,116.5956883"; // args[6] = "-overwrite"; // args[7] = "-local";//"-no-local"; final OperationsParams params = new OperationsParams(new GenericOptionsParser(args)); final Path[] paths = params.getPaths(); if (paths.length <= 1 && !params.checkInput()) { printUsage(); System.exit(1); } if (paths.length >= 2 && !params.checkInputOutput()) { printUsage(); System.exit(1); } if (params.get("traj") == null) { System.err.println("Trajectory query is missing"); printUsage(); System.exit(1); } // Invoke method to compute the trajectory MBR. String rectangle = getTrajectoryRectangle(params.get("traj")); params.set("rect", rectangle); if (params.get("rect") == null) { System.err.println("You must provide a Trajectory Query"); printUsage(); System.exit(1); } if (params.get("interval") == null) { System.err.println("Temporal range missing"); printUsage(); System.exit(1); } TextSerializable inObj = params.getShape("shape"); if (!(inObj instanceof STPoint)) { LOG.error("Shape is not instance of STPoint"); printUsage(); System.exit(1); } // Get spatio-temporal slices. List<Path> STPaths = getIndexedSlices(params); final Path outPath = params.getOutputPath(); final Rectangle[] queryRanges = params.getShapes("rect", new Rectangle()); // All running jobs final Vector<Long> resultsCounts = new Vector<Long>(); Vector<Job> jobs = new Vector<Job>(); Vector<Thread> threads = new Vector<Thread>(); long t1 = System.currentTimeMillis(); for (Path stPath : STPaths) { final Path inPath = stPath; for (int i = 0; i < queryRanges.length; i++) { final OperationsParams queryParams = new OperationsParams(params); OperationsParams.setShape(queryParams, "rect", queryRanges[i]); if (OperationsParams.isLocal(new JobConf(queryParams), inPath)) { // Run in local mode final Rectangle queryRange = queryRanges[i]; final Shape shape = queryParams.getShape("shape"); final Path output = outPath == null ? null : (queryRanges.length == 1 ? outPath : new Path(outPath, String.format("%05d", i))); Thread thread = new Thread() { @Override public void run() { FSDataOutputStream outFile = null; final byte[] newLine = System.getProperty("line.separator", "\n").getBytes(); try { ResultCollector<Shape> collector = null; if (output != null) { FileSystem outFS = output.getFileSystem(queryParams); final FSDataOutputStream foutFile = outFile = outFS.create(output); collector = new ResultCollector<Shape>() { final Text tempText = new Text2(); @Override public synchronized void collect(Shape r) { try { tempText.clear(); r.toText(tempText); foutFile.write(tempText.getBytes(), 0, tempText.getLength()); foutFile.write(newLine); } catch (IOException e) { e.printStackTrace(); } } }; } else { outFile = null; } long resultCount = rangeQueryLocal(inPath, queryRange, shape, queryParams, collector); resultsCounts.add(resultCount); } catch (IOException e) { e.printStackTrace(); } catch (InterruptedException e) { e.printStackTrace(); } finally { try { if (outFile != null) outFile.close(); } catch (IOException e) { e.printStackTrace(); } } } }; thread.start(); threads.add(thread); } else { // Run in MapReduce mode Path outTempPath = outPath == null ? null : new Path(outPath, String.format("%05d", i) + "-" + inPath.getName()); queryParams.setBoolean("background", true); Job job = rangeQueryMapReduce(inPath, outTempPath, queryParams); jobs.add(job); } } } while (!jobs.isEmpty()) { Job firstJob = jobs.firstElement(); firstJob.waitForCompletion(false); if (!firstJob.isSuccessful()) { System.err.println("Error running job " + firstJob); System.err.println("Killing all remaining jobs"); for (int j = 1; j < jobs.size(); j++) jobs.get(j).killJob(); System.exit(1); } Counters counters = firstJob.getCounters(); Counter outputRecordCounter = counters.findCounter(Task.Counter.MAP_OUTPUT_RECORDS); resultsCounts.add(outputRecordCounter.getValue()); jobs.remove(0); } while (!threads.isEmpty()) { try { Thread thread = threads.firstElement(); thread.join(); threads.remove(0); } catch (InterruptedException e) { e.printStackTrace(); } } long t2 = System.currentTimeMillis(); System.out.println("QueryPlan:"); for (Path stPath : STPaths) { System.out.println(stPath.getName()); } System.out.println("Time for " + queryRanges.length + " jobs is " + (t2 - t1) + " millis"); System.out.println("Results counts: " + resultsCounts); } }