List of usage examples for org.apache.hadoop.mapred FileInputFormat setInputPaths
public static void setInputPaths(JobConf conf, Path... inputPaths)
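Before the project examples below, a minimal sketch of the call itself. The job name, input/output paths, and the identity mapper/reducer here are hypothetical placeholders, not taken from any of the examples. setInputPaths replaces whatever input paths are already configured on the JobConf; because the parameter is a varargs Path..., one or several paths can be passed in a single call, while FileInputFormat.addInputPath appends a path to the existing list.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public class SetInputPathsSketch {
  public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(SetInputPathsSketch.class);
    conf.setJobName("setInputPaths-sketch"); // hypothetical job name

    // setInputPaths overwrites the input path list; the varargs signature
    // accepts one or more Path arguments in a single call.
    FileInputFormat.setInputPaths(conf, new Path("/data/in1"), new Path("/data/in2"));

    // addInputPath appends to whatever is already configured.
    FileInputFormat.addInputPath(conf, new Path("/data/in3"));

    FileOutputFormat.setOutputPath(conf, new Path("/data/out"));

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);

    // Identity mapper/reducer just to make the sketch a complete job.
    conf.setMapperClass(IdentityMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    JobClient.runJob(conf);
  }
}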
From source file:edu.umd.cloud9.pagerank.RunPageRankSchimmy.java
License:Apache License
private float phase1(String path, int i, int j, int n, boolean useCombiner, boolean useInmapCombiner,
    boolean useRange) throws IOException {
  JobConf conf = new JobConf(RunPageRankBasic.class);

  String in = path + "/iter" + sFormat.format(i);
  String out = path + "/iter" + sFormat.format(j) + "t";
  String outm = out + "-mass";

  FileSystem fs = FileSystem.get(conf);

  // we need to actually count the number of part files to get the number
  // of partitions (because the directory might contain _log)
  int numPartitions = 0;
  for (FileStatus s : FileSystem.get(conf).listStatus(new Path(in))) {
    if (s.getPath().getName().contains("part-"))
      numPartitions++;
  }

  conf.setInt("NodeCount", n);

  Partitioner p = null;

  if (useRange) {
    p = new RangePartitioner<IntWritable, Writable>();
    p.configure(conf);
  } else {
    p = new HashPartitioner<WritableComparable, Writable>();
  }

  // this is really annoying: the mapping between the partition numbers on
  // disk (i.e., part-XXXX) and what partition the file contains (i.e.,
  // key.hash % #reducer) is arbitrary... so this means that we need to
  // open up each partition, peek inside to find out.
  IntWritable key = new IntWritable();
  PageRankNode value = new PageRankNode();
  FileStatus[] status = fs.listStatus(new Path(in));

  StringBuilder sb = new StringBuilder();

  for (FileStatus f : status) {
    if (f.getPath().getName().contains("_logs"))
      continue;

    SequenceFile.Reader reader = new SequenceFile.Reader(fs, f.getPath(), conf);

    reader.next(key, value);
    int np = p.getPartition(key, value, numPartitions);
    reader.close();

    sLogger.info(f.getPath() + "\t" + np);
    sb.append(np + "=" + f.getPath() + "\t");
  }

  sLogger.info(sb.toString().trim());

  sLogger.info("PageRankSchimmy: iteration " + j + ": Phase1");
  sLogger.info(" - input: " + in);
  sLogger.info(" - output: " + out);
  sLogger.info(" - nodeCnt: " + n);
  sLogger.info(" - useCombiner: " + useCombiner);
  sLogger.info(" - useInmapCombiner: " + useInmapCombiner);
  sLogger.info(" - numPartitions: " + numPartitions);
  sLogger.info(" - useRange: " + useRange);
  sLogger.info("computed number of partitions: " + numPartitions);

  int numMapTasks = numPartitions;
  int numReduceTasks = numPartitions;

  conf.setJobName("PageRankSchimmy:iteration" + j + ":Phase1");
  conf.setNumMapTasks(numMapTasks);
  conf.setNumReduceTasks(numReduceTasks);

  conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
  conf.set("mapred.child.java.opts", "-Xmx2048m");

  conf.set("PageRankMassPath", outm);
  conf.set("BasePath", in);
  conf.set("PartitionMapping", sb.toString().trim());

  FileInputFormat.setInputPaths(conf, new Path(in));
  FileOutputFormat.setOutputPath(conf, new Path(out));

  conf.setInputFormat(SequenceFileInputFormat.class);
  conf.setOutputFormat(SequenceFileOutputFormat.class);

  conf.setMapOutputKeyClass(IntWritable.class);
  conf.setMapOutputValueClass(FloatWritable.class);

  conf.setOutputKeyClass(IntWritable.class);
  conf.setOutputValueClass(PageRankNode.class);

  if (useInmapCombiner) {
    conf.setMapperClass(MapWithInMapperCombiningClass.class);
  } else {
    conf.setMapperClass(MapClass.class);
  }

  if (useCombiner) {
    conf.setCombinerClass(CombineClass.class);
  }

  if (useRange) {
    conf.setPartitionerClass(RangePartitioner.class);
  }

  conf.setReducerClass(ReduceClass.class);

  conf.setSpeculativeExecution(false);

  FileSystem.get(conf).delete(new Path(out), true);
  FileSystem.get(conf).delete(new Path(outm), true);

  JobClient.runJob(conf);

  float mass = Float.NEGATIVE_INFINITY;
  for (FileStatus f : fs.listStatus(new Path(outm))) {
    FSDataInputStream fin = fs.open(f.getPath());
    mass = sumLogProbs(mass, fin.readFloat());
    fin.close();
  }

  return mass;
}
From source file:edu.umd.cloud9.pagerank.RunPageRankSchimmy.java
License:Apache License
private void phase2(String path, int i, int j, int n, float missing) throws IOException {
  JobConf conf = new JobConf(RunPageRankBasic.class);

  sLogger.info("missing PageRank mass: " + missing);
  sLogger.info("number of nodes: " + n);

  String in = path + "/iter" + sFormat.format(j) + "t";
  String out = path + "/iter" + sFormat.format(j);

  sLogger.info("PageRankSchimmy: iteration " + j + ": Phase2");
  sLogger.info(" - input: " + in);
  sLogger.info(" - output: " + out);

  int numMapTasks = FileSystem.get(conf).listStatus(new Path(in)).length;
  int numReduceTasks = 0;

  conf.setJobName("PageRankSchimmy:iteration" + j + ":Phase2");
  conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
  conf.setNumMapTasks(numMapTasks);
  conf.setNumReduceTasks(numReduceTasks);

  FileInputFormat.setInputPaths(conf, new Path(in));
  FileOutputFormat.setOutputPath(conf, new Path(out));

  conf.setInputFormat(SequenceFileInputFormat.class);
  conf.setOutputFormat(SequenceFileOutputFormat.class);

  conf.setMapOutputKeyClass(IntWritable.class);
  conf.setMapOutputValueClass(PageRankNode.class);

  conf.setOutputKeyClass(IntWritable.class);
  conf.setOutputValueClass(PageRankNode.class);

  conf.setMapperClass(MapPageRankMassDistributionClass.class);
  conf.setCombinerClass(IdentityReducer.class);
  conf.setReducerClass(IdentityReducer.class);

  conf.setFloat("MissingMass", (float) missing);
  conf.setInt("NodeCount", n);

  FileSystem.get(conf).delete(new Path(out), true);

  JobClient.runJob(conf);
}
From source file:edu.umd.cloud9.webgraph.driver.BuildAnchorTextForwardIndex.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
  if (args.length != 3) {
    printUsage();
    return -1;
  }

  JobConf conf = new JobConf(getConf());
  FileSystem fs = FileSystem.get(conf);

  String collectionPath = args[0];
  String outputPath = args[1];
  String indexFile = args[2];

  LOG.info("Tool name: BuildAnchorTextForwardIndex");
  LOG.info(" - collection path: " + collectionPath);
  LOG.info(" - output path: " + outputPath);
  LOG.info(" - index file: " + indexFile);
  LOG.info("Note: This tool only works on block-compressed SequenceFiles!");

  conf.set("mapred.child.java.opts", "-Xmx2048m");

  conf.setJobName("BuildAnchorTextForwardIndex");

  conf.setNumMapTasks(100);
  conf.setNumReduceTasks(1);

  FileInputFormat.setInputPaths(conf, new Path(collectionPath));
  FileOutputFormat.setOutputPath(conf, new Path(outputPath));
  FileOutputFormat.setCompressOutput(conf, false);

  conf.setInputFormat(NoSplitSequenceFileInputFormat.class);
  conf.setOutputKeyClass(IntWritable.class);
  conf.setOutputValueClass(Text.class);
  conf.setMapRunnerClass(MyMapRunner.class);
  conf.setReducerClass(IdentityReducer.class);

  // delete the output directory if it exists already
  fs.delete(new Path(outputPath), true);

  RunningJob job = JobClient.runJob(conf);

  Counters counters = job.getCounters();
  int blocks = (int) counters.findCounter(Blocks.Total).getCounter();

  LOG.info("number of blocks: " + blocks);

  LOG.info("Writing index file...");
  LineReader reader = new LineReader(fs.open(new Path(outputPath + "/part-00000")));
  FSDataOutputStream out = fs.create(new Path(indexFile), true);

  out.writeUTF(IndexableAnchorTextForwardIndex.class.getName());
  out.writeUTF(collectionPath);
  out.writeInt(blocks);

  int cnt = 0;
  Text line = new Text();
  while (reader.readLine(line) > 0) {
    String[] arr = line.toString().split("\\s+");

    int docno = Integer.parseInt(arr[0]);
    int offset = Integer.parseInt(arr[1]);
    short fileno = Short.parseShort(arr[2]);

    out.writeInt(docno);
    out.writeInt(offset);
    out.writeShort(fileno);

    cnt++;

    if (cnt % 1000 == 0) {
      LOG.info(cnt + " blocks written");
    }
  }

  reader.close();
  out.close();

  if (cnt != blocks) {
    throw new RuntimeException("Error: mismatch in block count!");
  }

  return 0;
}
From source file:edu.umd.cloud9.webgraph.driver.GenerateTabDelimitedWebGraph.java
License:Apache License
public int run(String[] args) throws Exception {
  if (args.length < 4) {
    printUsage();
    return -1;
  }

  JobConf conf = new JobConf(getConf(), GenerateTabDelimitedWebGraph.class);
  FileSystem fs = FileSystem.get(conf);

  String inPath = DriverUtil.argValue(args, "-webgraph") + "/" + DriverUtil.OUTPUT_WEBGRAPH;
  String outPath = DriverUtil.argValue(args, "-output");

  Path inputPath = new Path(inPath);
  Path outputPath = new Path(outPath);

  if (fs.exists(outputPath)) {
    fs.delete(outputPath);
  }

  conf.setJobName("TabDelimWebGraph");
  conf.set("mapred.child.java.opts", "-Xmx2048m");
  conf.set("mapreduce.map.memory.mb", "2048");
  conf.set("mapreduce.map.java.opts", "-Xmx2048m");
  conf.set("mapreduce.reduce.memory.mb", "2048");
  conf.set("mapreduce.reduce.java.opts", "-Xmx2048m");
  conf.set("mapreduce.task.timeout", "60000000");

  conf.setNumMapTasks(1);
  conf.setNumReduceTasks(0);

  FileInputFormat.setInputPaths(conf, inputPath);
  FileOutputFormat.setOutputPath(conf, outputPath);

  conf.setInputFormat(SequenceFileInputFormat.class);
  conf.setOutputFormat(TextOutputFormat.class);

  conf.setOutputKeyClass(IntWritable.class);
  conf.setOutputValueClass(Text.class);

  conf.setMapperClass(MyMapper.class);

  JobClient.runJob(conf);
  return 0;
}
From source file:edu.umn.cs.spatialHadoop.operations.Aggregate.java
License:Open Source License
/**
 * Computes the minimum and maximum values of readings in input. Useful as
 * a preparatory step before drawing.
 * @param inFiles A list of input files.
 * @param params
 * @return
 * @throws IOException
 */
public static MinMax aggregate(Path[] inFiles, OperationsParams params) throws IOException {
  boolean forceCompute = params.getBoolean("force", false);

  // Check if we have hard-coded cached values for the given dataset
  String inPathStr = inFiles[0].toString();
  if (!forceCompute && (inPathStr.contains("MOD11A1") || inPathStr.contains("MYD11A1"))) {
    MinMax min_max = new MinMax();
    // Land temperature
    min_max = new MinMax();
    //min_max.minValue = 10000; // 200 K, -73 C, -100 F
    //min_max.maxValue = 18000; // 360 K, 87 C, 188 F
    min_max.minValue = 13650; // 273 K, 0 C
    min_max.maxValue = 17650; // 353 K, 80 C
    return min_max;
  }

  // Need to process input files to get stats from it and calculate its size
  JobConf job = new JobConf(params, FileMBR.class);
  FileInputFormat.setInputPaths(job, inFiles);
  ShapeInputFormat<Shape> inputFormat = new ShapeInputFormat<Shape>();

  return aggregateMapReduce(inFiles, params);
}
From source file:edu.umn.cs.spatialHadoop.operations.DistributedJoin.java
License:Open Source License
/**
 * Performs a redistribute join between the given files using the
 * redistribute join algorithm. Currently, we only support a pair of files.
 * @param inFiles
 * @param userOutputPath
 * @param params
 * @return
 * @throws IOException
 */
public static <S extends Shape> long joinStep(Path[] inFiles, Path userOutputPath, OperationsParams params)
    throws IOException {
  long t1 = System.currentTimeMillis();

  JobConf job = new JobConf(params, DistributedJoin.class);

  FileSystem fs[] = new FileSystem[inFiles.length];
  for (int i_file = 0; i_file < inFiles.length; i_file++)
    fs[i_file] = inFiles[i_file].getFileSystem(job);

  Path outputPath = userOutputPath;
  if (outputPath == null) {
    do {
      outputPath = new Path(inFiles[0].getName() + ".dj_" + (int) (Math.random() * 1000000));
    } while (fs[0].exists(outputPath));
  }

  job.setJobName("DistributedJoin");
  ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
  GlobalIndex<Partition> gindex1 = SpatialSite.getGlobalIndex(fs[0], inFiles[0]);
  GlobalIndex<Partition> gindex2 = SpatialSite.getGlobalIndex(fs[1], inFiles[1]);

  OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

  LOG.info("Joining " + inFiles[0] + " X " + inFiles[1]);

  if (SpatialSite.isRTree(fs[0], inFiles[0]) && SpatialSite.isRTree(fs[1], inFiles[1])) {
    job.setInputFormat(DJInputFormatRTree.class);
  } else {
    if (isOneShotReadMode) {
      // Ensure all objects are read in one shot
      job.setInt(SpatialSite.MaxBytesInOneRead, -1);
      job.setInt(SpatialSite.MaxShapesInOneRead, -1);
    } else {
      job.setInt(SpatialSite.MaxBytesInOneRead, maxBytesInOneRead);
      job.setInt(SpatialSite.MaxShapesInOneRead, maxShapesInOneRead);
    }
    job.setInputFormat(DJInputFormatArray.class);
  }

  // Set input paths and map function
  if (inFiles[0].equals(inFiles[1])) {
    // Self join
    job.setInputFormat(ShapeArrayInputFormat.class);
    // Remove the spatial filter to ensure all partitions are loaded
    FileInputFormat.setInputPaths(job, inFiles[0]);
    if (gindex1 != null && gindex1.isReplicated())
      job.setMapperClass(RedistributeJoinMap.class);
    else
      job.setMapperClass(RedistributeJoinMapNoDupAvoidance.class);
  } else {
    // Binary version of spatial join (two different input files)
    job.setClass(SpatialSite.FilterClass, SpatialJoinFilter.class, BlockFilter.class);
    FileInputFormat.setInputPaths(job, inFiles);
    if ((gindex1 != null && gindex1.isReplicated()) || (gindex2 != null && gindex2.isReplicated())) {
      // Need the map function with duplicate avoidance step.
      job.setMapperClass(RedistributeJoinMap.class);
    } else {
      // No replication in both indexes, use map function with no dup
      // avoidance
      job.setMapperClass(RedistributeJoinMapNoDupAvoidance.class);
    }
  }

  Shape shape = params.getShape("shape");
  job.setMapOutputKeyClass(shape.getClass());
  job.setMapOutputValueClass(shape.getClass());

  job.setNumMapTasks(10 * Math.max(1, clusterStatus.getMaxMapTasks()));
  job.setNumReduceTasks(0); // No reduce needed for this task

  if (job.getBoolean("output", true))
    job.setOutputFormat(TextOutputFormat.class);
  else
    job.setOutputFormat(NullOutputFormat.class);

  TextOutputFormat.setOutputPath(job, outputPath);

  if (!params.getBoolean("background", false)) {
    LOG.info("Submit job in sync mode");
    RunningJob runningJob = JobClient.runJob(job);
    Counters counters = runningJob.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.MAP_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();

    // Output number of running map tasks
    Counter mapTaskCountCounter = counters.findCounter(JobInProgress.Counter.TOTAL_LAUNCHED_MAPS);
    System.out.println("Number of map tasks " + mapTaskCountCounter.getValue());

    // Delete output directory if not explicitly set by user
    if (userOutputPath == null)
      fs[0].delete(outputPath, true);

    long t2 = System.currentTimeMillis();
    System.out.println("Join time " + (t2 - t1) + " millis");

    return resultCount;
  } else {
    JobClient jc = new JobClient(job);
    LOG.info("Submit job in async mode");
    lastRunningJob = jc.submitJob(job);
    LOG.info("Job " + lastRunningJob + " submitted successfully");
    return -1;
  }
}
From source file:edu.umn.cs.sthadoop.operations.STJoin.java
License:Open Source License
/**
 * @param inputPath
 * @param outputPath
 * @param params
 * @return
 * @throws IOException
 * @throws Exception
 * @throws InterruptedException
 */
private static long stJoin(Path inputPath, Path outputPath, OperationsParams params)
    throws IOException, Exception, InterruptedException {
  JobConf conf = new JobConf(new Configuration(), STJoin.class);
  FileSystem outfs = outputPath.getFileSystem(conf);
  outfs.delete(outputPath, true);

  conf.setJobName("STJoin");
  // pass params to the join map-reduce
  conf.set("timedistance", params.get("timedistance"));
  conf.set("spacedistance", params.get("spacedistance"));
  // conf.setMapOutputKeyClass(LongWritable.class);
  // conf.setMapOutputValueClass(Text.class);
  conf.setOutputKeyClass(LongWritable.class);
  conf.setOutputValueClass(Text.class);
  // Mapper settings
  conf.setMapperClass(STJoinMap.class);
  // conf.setReducerClass(STJoinReduce.class);
  // conf.setCombinerClass(STJoinReduce.class);
  conf.setBoolean("mapreduce.input.fileinputformat.input.dir.recursive", true);
  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputFormat(TextOutputFormat.class);
  FileInputFormat.setInputPaths(conf, inputPath);
  FileOutputFormat.setOutputPath(conf, outputPath);
  conf.setNumReduceTasks(0);

  JobClient.runJob(conf).waitForCompletion();

  outfs = inputPath.getFileSystem(conf);
  outfs.delete(inputPath);

  return 0;
}
From source file:edu.yale.cs.hadoopdb.benchmark.AggTaskLargeHDFS.java
License:Apache License
@Override protected JobConf configureJob(String... args) throws IOException { JobConf conf = new JobConf(getConf(), this.getClass()); conf.setJobName("aggregation_hdfs_large"); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(DoubleWritable.class); conf.setMapperClass(AggTaskLargeHDFS.Map.class); conf.setCombinerClass(AggTaskLargeHDFS.Reduce.class); conf.setReducerClass(AggTaskLargeHDFS.Reduce.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); if (args.length < 2) { throw new RuntimeException("Incorrect arguments provided for " + this.getClass()); }//from ww w. jav a 2 s. c o m FileInputFormat.setInputPaths(conf, new Path(args[0])); // OUTPUT properties Path outputPath = new Path(args[1]); HDFSUtil.deletePath(outputPath); FileOutputFormat.setOutputPath(conf, outputPath); return conf; }
From source file:edu.yale.cs.hadoopdb.benchmark.AggTaskSmallHDFS.java
License:Apache License
@Override protected JobConf configureJob(String... args) throws IOException { JobConf conf = new JobConf(this.getClass()); conf.setJobName("aggregation_hdfs_small"); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(DoubleWritable.class); conf.setMapperClass(AggTaskSmallHDFS.Map.class); conf.setCombinerClass(AggTaskSmallHDFS.Reduce.class); conf.setReducerClass(AggTaskSmallHDFS.Reduce.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); if (args.length < 2) { throw new RuntimeException("Incorrect arguments provided for " + this.getClass()); }//from w ww . ja v a 2 s . c om FileInputFormat.setInputPaths(conf, new Path(args[0])); // OUTPUT properties Path outputPath = new Path(args[1]); HDFSUtil.deletePath(outputPath); FileOutputFormat.setOutputPath(conf, outputPath); return conf; }
From source file:edu.yale.cs.hadoopdb.benchmark.GrepTaskHDFS.java
License:Apache License
@Override protected JobConf configureJob(String... args) throws IOException { JobConf conf = new JobConf(getConf(), this.getClass()); conf.setJobName("grep_hdfs"); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setMapperClass(Map.class); conf.setNumReduceTasks(0);/*from w ww . ja v a 2s . c om*/ conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); if (args.length < 3) { throw new RuntimeException("Incorrect arguments provided for " + this.getClass()); } conf.set(GREP_PATTERN_PARAM, args[0]); FileInputFormat.setInputPaths(conf, new Path(args[1])); Path outputPath = new Path(args[2]); HDFSUtil.deletePath(outputPath); FileOutputFormat.setOutputPath(conf, outputPath); return conf; }