List of usage examples for org.apache.hadoop.mapred FileInputFormat setInputPaths
public static void setInputPaths(JobConf conf, Path... inputPaths)
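Before the full examples, a minimal sketch of the call (paths and driver class are hypothetical): setInputPaths replaces any input directories previously configured on the JobConf, addInputPath appends instead, and a comma-separated String overload exists alongside the Path varargs form shown above.

JobConf conf = new JobConf(MyDriver.class);                                          // hypothetical driver class
FileInputFormat.setInputPaths(conf, new Path("/data/in1"), new Path("/data/in2"));   // varargs Path form
FileInputFormat.setInputPaths(conf, "/data/in1,/data/in2");                          // String overload, comma-separated
FileInputFormat.addInputPath(conf, new Path("/data/extra"));                         // appends rather than replacing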
From source file:com.jyz.study.hadoop.mapreduce.datajoin.DataJoinJob.java
License:Apache License
public static JobConf createDataJoinJob(String args[]) throws IOException {
    String inputDir = args[0];
    String outputDir = args[1];
    Class inputFormat = SequenceFileInputFormat.class;
    if (args[2].compareToIgnoreCase("text") != 0) {
        System.out.println("Using SequenceFileInputFormat: " + args[2]);
    } else {
        System.out.println("Using TextInputFormat: " + args[2]);
        inputFormat = TextInputFormat.class;
    }
    int numOfReducers = Integer.parseInt(args[3]);
    Class mapper = getClassByName(args[4]);
    Class reducer = getClassByName(args[5]);
    Class mapoutputValueClass = getClassByName(args[6]);
    Class outputFormat = TextOutputFormat.class;
    Class outputValueClass = Text.class;
    if (args[7].compareToIgnoreCase("text") != 0) {
        System.out.println("Using SequenceFileOutputFormat: " + args[7]);
        outputFormat = SequenceFileOutputFormat.class;
        outputValueClass = getClassByName(args[7]);
    } else {
        System.out.println("Using TextOutputFormat: " + args[7]);
    }
    long maxNumOfValuesPerGroup = 100;
    String jobName = "";
    if (args.length > 8) {
        maxNumOfValuesPerGroup = Long.parseLong(args[8]);
    }
    if (args.length > 9) {
        jobName = args[9];
    }
    Configuration defaults = new Configuration();
    JobConf job = new JobConf(defaults, DataJoinJob.class);
    job.setJobName("DataJoinJob: " + jobName);

    FileSystem fs = FileSystem.get(defaults);
    fs.delete(new Path(outputDir), true);

    FileInputFormat.setInputPaths(job, inputDir);
    job.setInputFormat(inputFormat);
    job.setMapperClass(mapper);

    FileOutputFormat.setOutputPath(job, new Path(outputDir));
    job.setOutputFormat(outputFormat);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(mapoutputValueClass);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(outputValueClass);
    job.setReducerClass(reducer);

    job.setNumMapTasks(1);
    job.setNumReduceTasks(numOfReducers);
    job.setLong("datajoin.maxNumOfValuesPerGroup", maxNumOfValuesPerGroup);
    return job;
}
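Worth noting in this example: the call passes the String inputDir rather than a Path, so it resolves to the setInputPaths(JobConf, String) overload, which accepts a comma-separated list of directories. A minimal sketch with hypothetical paths:

// Hypothetical paths -- the String overload can carry several inputs at once
FileInputFormat.setInputPaths(job, "/join/left,/join/right");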
From source file:com.liferay.hadoop.action.HadoopJob.java
License:Open Source License
public String doExecute(HttpServletRequest request, HttpServletResponse response) throws Exception {
    response.setContentType(ContentTypes.TEXT_PLAIN_UTF8);
    PrintWriter writer = response.getWriter();

    FileSystem fileSystem = HadoopManager.getFileSystem();
    JobClient jobClient = HadoopManager.getJobClient();

    writer.println("-- Job Status --");

    Path inputPath = new Path("/index/*/*");
    Path outputPath = new Path("/wordcount/results");

    try {
        if (_runningJob == null) {
            writer.println("Creating job");

            if (fileSystem.exists(_jobPath)) {
                fileSystem.delete(_jobPath, false);
            }

            if (!fileSystem.exists(_jobPath)) {
                writer.println("Deploying the job code to cluster");

                FSDataOutputStream outputStream = null;

                try {
                    outputStream = fileSystem.create(_jobPath);

                    ServletContext servletContext = HadoopManager.getServletContext();

                    InputStream inputStream = servletContext.getResourceAsStream("/WEB-INF/lib/hadoop-job.jar");

                    StreamUtil.transfer(inputStream, outputStream, false);
                } finally {
                    StreamUtil.cleanUp(outputStream);
                }

                writer.println("Job code deployed to cluster");
            }

            if (fileSystem.exists(outputPath)) {
                writer.println("A previous job output was found, backing it up");

                fileSystem.rename(outputPath,
                    outputPath.getParent().suffix("/.results-" + System.currentTimeMillis()));
            }

            _jobConf = HadoopManager.createNewJobConf();

            _jobConf.setJobName("Word Count");

            writer.println("Job '" + _jobConf.getJobName() + "' is being configured");

            _jobConf.setJarByClass(Map.class);
            _jobConf.setOutputKeyClass(Text.class);
            _jobConf.setOutputValueClass(IntWritable.class);
            _jobConf.setMapperClass(Map.class);
            _jobConf.setCombinerClass(Reduce.class);
            _jobConf.setReducerClass(Reduce.class);
            _jobConf.setInputFormat(TextInputFormat.class);
            _jobConf.setOutputFormat(TextOutputFormat.class);

            writer.println("Job code deployed to distributed cache's classpath");

            DistributedCache.addArchiveToClassPath(_jobPath, _jobConf, fileSystem);

            FileInputFormat.setInputPaths(_jobConf, inputPath);
            FileOutputFormat.setOutputPath(_jobConf, outputPath);

            writer.println("Submitting job the first time");

            _runningJob = jobClient.submitJob(_jobConf);

            writer.println("Job submitted");
        }

        int jobState = _runningJob.getJobState();

        writer.println(
            "Job status: " + jobState + " (RUNNING = 1, SUCCEEDED = 2, FAILED = 3, PREP = 4, KILLED = 5)");

        if ((jobState != JobStatus.RUNNING) && (jobState != JobStatus.PREP)) {
            writer.println("Re-issuing the job");

            if (fileSystem.exists(outputPath)) {
                writer.println("A previous job output was found, backing it up");

                fileSystem.rename(outputPath,
                    outputPath.getParent().suffix("/.results-" + System.currentTimeMillis()));
            }

            writer.println("Submitting job the first time");

            _runningJob = jobClient.submitJob(_jobConf);

            writer.println("Job submitted");
        }
    } catch (Exception ioe) {
        writer.println("Job error: ");

        ioe.printStackTrace(writer);
    }

    writer.flush();
    writer.close();

    return null;
}
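The input path in this example is a glob ("/index/*/*"); FileInputFormat expands glob patterns when it lists files to compute splits, so a pattern is as valid an argument as a concrete directory. A minimal sketch with a hypothetical layout:

// Hypothetical layout -- the glob is expanded when the input format lists files for splits
FileInputFormat.setInputPaths(_jobConf, new Path("/wordcount/input/*/part-*"));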
From source file:com.liferay.hadoop.util.HadoopManager.java
License:Open Source License
public static void runJob(StoreEvent storeEvent) throws IOException {
    FileSystem fileSystem = getFileSystem();

    if (_servletContext == null) {
        return;
    }

    JobClient jobClient = getJobClient();

    Path inputPath = new Path("/index".concat(storeEvent.getRootPath().toString()).concat("/*"));
    Path outputPath = new Path("/wordcount".concat(storeEvent.getRootPath().toString()).concat("/results"));

    try {
        if (_runningJob == null) {
            if (!fileSystem.exists(_jobPath)) {
                FSDataOutputStream outputStream = null;

                try {
                    outputStream = fileSystem.create(_jobPath);

                    InputStream inputStream = _servletContext
                        .getResourceAsStream("/WEB-INF/lib/hadoop-job.jar");

                    StreamUtil.transfer(inputStream, outputStream, false);
                } finally {
                    StreamUtil.cleanUp(outputStream);
                }
            }

            if (fileSystem.exists(outputPath)) {
                fileSystem.rename(outputPath,
                    outputPath.getParent().suffix("/.results-" + System.currentTimeMillis()));
            }

            _jobConf = new JobConf(_sharedJobConf);

            _jobConf.setJobName("Word Count");

            _jobConf.setJarByClass(Map.class);
            _jobConf.setOutputKeyClass(Text.class);
            _jobConf.setOutputValueClass(IntWritable.class);
            _jobConf.setMapperClass(Map.class);
            _jobConf.setCombinerClass(Reduce.class);
            _jobConf.setReducerClass(Reduce.class);
            _jobConf.setInputFormat(TextInputFormat.class);
            _jobConf.setOutputFormat(TextOutputFormat.class);

            DistributedCache.addArchiveToClassPath(_jobPath, _jobConf, fileSystem);

            FileInputFormat.setInputPaths(_jobConf, inputPath);
            FileOutputFormat.setOutputPath(_jobConf, outputPath);

            _runningJob = jobClient.submitJob(_jobConf);
        }

        int jobState = _runningJob.getJobState();

        if ((jobState != JobStatus.RUNNING) && (jobState != JobStatus.PREP)) {
            System.out.println("Re-issuing the word count job.");

            if (fileSystem.exists(outputPath)) {
                fileSystem.rename(outputPath,
                    outputPath.getParent().suffix("/.results-" + System.currentTimeMillis()));
            }

            _runningJob = jobClient.submitJob(_jobConf);
        }
    } catch (Exception ioe) {
        ioe.printStackTrace();
    }
}
From source file:com.liveramp.hank.hadoop.HadoopDomainBuilder.java
License:Apache License
@Override
protected void configureJob(JobConf conf) {
    // Input specification
    conf.setInputFormat(inputFormatClass);
    FileInputFormat.setInputPaths(conf, inputPath);
    // Mapper class and key/value classes
    conf.setMapperClass(mapperClass);
    conf.setMapOutputKeyClass(KeyAndPartitionWritableComparable.class);
    conf.setMapOutputValueClass(ValueWritable.class);
    // Reducer class and key/value classes
    conf.setReducerClass(DomainBuilderReducer.class);
    conf.setOutputKeyClass(KeyAndPartitionWritable.class);
    conf.setOutputValueClass(ValueWritable.class);
    // Partitioner
    conf.setPartitionerClass(DomainBuilderPartitioner.class);
}
From source file:com.m6d.filecrush.crush.CrushReducer.java
License:Apache License
@SuppressWarnings("unchecked")
private RecordReader<Object, Object> createRecordReader(int idx, Path inputPath, Reporter reporter)
        throws IOException {

    LOG.info(format("Opening '%s'", inputPath));

    Class<? extends FileInputFormat<?, ?>> cls = getInputFormatClass(idx);

    try {
        FileInputFormat.setInputPaths(job, inputPath);

        FileInputFormat<?, ?> instance = cls.newInstance();

        if (instance instanceof JobConfigurable) {
            ((JobConfigurable) instance).configure(job);
        }

        InputSplit[] splits = instance.getSplits(job, 1);

        if (1 != splits.length) {
            throw new IllegalArgumentException("Could not get input splits: " + inputPath);
        }

        return (RecordReader<Object, Object>) instance.getRecordReader(splits[0], job, reporter);
    } catch (RuntimeException e) {
        throw e;
    } catch (IOException e) {
        throw e;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
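This example relies on setInputPaths replacing whatever input directories were previously configured on the shared JobConf, which lets the reducer point the job at one file at a time before asking the input format for its splits. A minimal sketch of the same pattern, with a hypothetical path and input format:

// Hypothetical file and format: repoint the conf at a single input, then request its splits
Path someFile = new Path("/crush/in/part-00000");
FileInputFormat.setInputPaths(job, someFile);
InputSplit[] splits = new SequenceFileInputFormat<Text, Text>().getSplits(job, 1);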
From source file:com.manning.hip.ch4.joins.improved.impl.OptimizedDataJoinJob.java
License:Apache License
public static JobConf createDataJoinJob(String args[]) throws IOException {
    String inputDir = args[0];
    String outputDir = args[1];
    Class inputFormat = SequenceFileInputFormat.class;
    if (args[2].compareToIgnoreCase("text") != 0) {
        System.out.println("Using SequenceFileInputFormat: " + args[2]);
    } else {
        System.out.println("Using TextInputFormat: " + args[2]);
        inputFormat = TextInputFormat.class;
    }
    int numOfReducers = Integer.parseInt(args[3]);
    Class mapper = getClassByName(args[4]);
    Class reducer = getClassByName(args[5]);
    Class mapoutputValueClass = getClassByName(args[6]);
    Class outputFormat = TextOutputFormat.class;
    Class outputValueClass = Text.class;
    if (args[7].compareToIgnoreCase("text") != 0) {
        System.out.println("Using SequenceFileOutputFormat: " + args[7]);
        outputFormat = SequenceFileOutputFormat.class;
        outputValueClass = getClassByName(args[7]);
    } else {
        System.out.println("Using TextOutputFormat: " + args[7]);
    }
    long maxNumOfValuesPerGroup = 100;
    String jobName = "";
    if (args.length > 8) {
        maxNumOfValuesPerGroup = Long.parseLong(args[8]);
    }
    if (args.length > 9) {
        jobName = args[9];
    }
    Configuration defaults = new Configuration();
    JobConf job = new JobConf(defaults, OptimizedDataJoinJob.class);
    job.setJobName("DataJoinJob: " + jobName);

    FileSystem fs = FileSystem.get(defaults);
    fs.delete(new Path(outputDir));

    FileInputFormat.setInputPaths(job, inputDir);
    job.setInputFormat(inputFormat);
    job.setMapperClass(mapper);

    FileOutputFormat.setOutputPath(job, new Path(outputDir));
    job.setOutputFormat(outputFormat);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
    job.setMapOutputKeyClass(CompositeKey.class);
    job.setMapOutputValueClass(mapoutputValueClass);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(outputValueClass);
    job.setReducerClass(reducer);

    job.setPartitionerClass(CompositeKeyPartitioner.class);
    job.setOutputKeyComparatorClass(CompositeKeyComparator.class);
    job.setOutputValueGroupingComparator(CompositeKeyOnlyComparator.class);

    job.setNumMapTasks(1);
    job.setNumReduceTasks(numOfReducers);
    job.setLong("datajoin.maxNumOfValuesPerGroup", maxNumOfValuesPerGroup);
    return job;
}
From source file:com.me.neu.Popular_question.Runner.java
public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(Runner.class);
    conf.setJobName("pop-ques");

    conf.setMapperClass(Mapper1.class);
    // conf.setOutputKeyComparatorClass(DescendingIntComparable.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setReducerClass(Reducer1.class);

    // take the input and output from the command line
    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
}
From source file:com.me.neu.popular_tag_year.Runner.java
public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(Runner.class);
    conf.setJobName("tag-year");

    conf.setMapperClass(Mapper1.class);
    // conf.setOutputKeyComparatorClass(DescendingIntComparable.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setReducerClass(Reducer1.class);

    // take the input and output from the command line
    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
}
From source file:com.me.neu.stackoverflow.Runner.java
public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(Runner.class);
    conf.setJobName("tag-reco");

    conf.setMapperClass(Mapper1.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setReducerClass(Reducer1.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
}
From source file:com.mycompany.mavenproject1.App.java
public static void main(String[] args) throws IOException {
    // give time to attach debugger
    try {
        Thread.sleep(8000);
    } catch (InterruptedException ex) {
        Logger.getLogger(App.class.getName()).log(Level.SEVERE, null, ex);
    }

    JobConf conf = new JobConf(App.class);

    // purge existing output file
    FileSystem fs = FileSystem.get(conf);
    fs.delete(new Path(args[1]), true); // delete file, true for recursive

    conf.setJobName("wordcount");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(WholeFileInputFormat.class);
    // conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
}