Usage examples for org.apache.hadoop.mapred.FileInputFormat#setInputPaths
public static void setInputPaths(JobConf conf, Path... inputPaths)
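Before the full examples, a minimal self-contained sketch of the call itself (the input directories /data/in1 and /data/in2 and the driver class name are hypothetical placeholders). Note that a comma-separated String overload, setInputPaths(JobConf, String), also exists in the old mapred API; the NameFinder example below uses it.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;

public class SetInputPathsSketch {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(SetInputPathsSketch.class);
        // The varargs overload replaces any input paths set earlier on this
        // JobConf; use FileInputFormat.addInputPath(conf, path) to append instead.
        // "/data/in1" and "/data/in2" are hypothetical example paths.
        FileInputFormat.setInputPaths(conf, new Path("/data/in1"), new Path("/data/in2"));
    }
}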
From source file:mr.WordCount.java
License:Open Source License
public static void main(String[] args) throws Exception {
    Properties properties = new Properties();
    AppProps.addApplicationTag(properties, "tutorials");
    AppProps.addApplicationTag(properties, "cluster:development");
    AppProps.setApplicationName(properties, "cascading-mapreduce-flow");

    JobConf conf = new JobConf(WordCount.class);
    conf.setJobName("cascading-mapreduce-flow");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    MapReduceFlow flow = new MapReduceFlow("wordcount", conf, true);
    // JobClient.runJob(conf);
    flow.complete();
}
From source file:name.abhijitsarkar.hadoop.citation.CitationCombiner.java
License:Open Source License
@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), getClass());
    conf.setJobName("citation-combiner");
    /* This is to set the separator byte for KeyValueTextInputFormat */
    conf.set("key.value.separator.in.input.line", ",");

    conf.setMapperClass(CitationMapper.class);
    conf.setReducerClass(CitationReducer.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
    return 0;
}
From source file:name.abhijitsarkar.hadoop.citation.CitationCombinerWithChaining.java
License:Open Source License
@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), getClass());
    conf.setJobName("citation-combiner-with-chaining");

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobConf map1Conf = new JobConf(false);
    ChainMapper.addMapper(conf, CitationInputSplitMapper.class, LongWritable.class, Text.class,
            Text.class, Text.class, true, map1Conf);

    JobConf map2Conf = new JobConf(false);
    ChainMapper.addMapper(conf, CitationHeaderStripMapper.class, Text.class, Text.class,
            Text.class, Text.class, true, map2Conf);

    JobConf red1Conf = new JobConf(false);
    ChainReducer.setReducer(conf, CitationReducer.class, Text.class, Text.class,
            Text.class, Text.class, true, red1Conf);

    JobClient.runJob(conf);
    return 0;
}
From source file:NCDSearch.DistributedNCDSearch.java
public int run(String args[]) throws Exception {
    String inputpath = args[1];
    String outputpath = args[2];

    JobConf conf = new JobConf(getConf(), ChunkyFileInputFormat.class);

    // Add the target file to a cache so all nodes can have a copy.
    DistributedCache.addCacheFile(new URI(args[0]), conf);

    FileOutputFormat.setOutputPath(conf, new Path(outputpath));
    FileInputFormat.setInputPaths(conf, new Path(inputpath));

    conf.setJobName("NCDSearch");
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(FloatWritable.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setInputFormat(ChunkyFileInputFormat.class);
    conf.setMapperClass(Map.class);
    conf.setReducerClass(Reduce.class);

    JobClient.runJob(conf);
    return 0;
}
From source file:net.sf.katta.indexing.IndexerJob.java
License:Apache License
public void startIndexer(String path, String finalDestination, int numOfShards) throws IOException {
    // Create a job conf with a class pointing into the job jar.
    JobConf jobConf = new JobConf(IndexerJob.class);
    jobConf.setJobName("indexer");
    jobConf.setMapRunnerClass(Indexer.class);
    // Alternatively, use a text file and a TextInputFormat.
    jobConf.setInputFormat(SequenceFileInputFormat.class);

    Path input = new Path(path);
    FileInputFormat.setInputPaths(jobConf, input);
    // We just set the output path to make Hadoop happy.
    FileOutputFormat.setOutputPath(jobConf, new Path(finalDestination));
    // Set the folder where the Lucene indexes will be copied when finished.
    jobConf.set("finalDestination", finalDestination);
    // Important to switch speculative execution off;
    // we don't want anything duplicated.
    jobConf.setSpeculativeExecution(false);

    // The number of map tasks is equal to the number of input splits.
    // The number of input splits by default is equal to the number of HDFS
    // blocks for the input file(s). To get the right number of shards we
    // need to calculate the best input split size.
    FileSystem fs = FileSystem.get(input.toUri(), jobConf);
    FileStatus[] status = fs.globStatus(input);
    long size = 0;
    for (FileStatus fileStatus : status) {
        size += fileStatus.getLen();
    }
    long optimalSplitSize = size / numOfShards;
    jobConf.set("mapred.min.split.size", "" + optimalSplitSize);

    // Give more memory to the Lucene tasks.
    jobConf.set("mapred.child.java.opts", "-Xmx2G");
    jobConf.setNumMapTasks(1);
    jobConf.setNumReduceTasks(0);
    JobClient.runJob(jobConf);
}
From source file:nlp.com.knowledgebooks.mapreduce.NameFinder.java
License:Open Source License
/**
 * The main driver for the name finder map/reduce program.
 * <p/>
 * NOTE: copied with modifications from the Hadoop Java example programs.
 * <p/>
 * Invoke this method to submit the map/reduce job.
 *
 * @throws IOException When there are communication problems with the
 *                     job tracker.
 */
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), NameFinder.class);
    conf.setJobName("namefinder");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(MapClass.class);
    //conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                conf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                conf.setNumReduceTasks(Integer.parseInt(args[++i]));
            } else {
                other_args.add(args[i]);
            }
        } catch (Exception ex) {
            System.err.println("ERROR: " + ex);
        }
    }

    // Note: this call uses the String overload, setInputPaths(JobConf, String).
    FileInputFormat.setInputPaths(conf, other_args.get(0));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    JobClient.runJob(conf);
    return 0;
}
From source file:nthu.scopelab.tsqr.ssvd.ABtDenseOutJob.java
License:Apache License
public static void run(Configuration conf, Path[] inputPath, Path inputBt, Path outputPath,
        int k, int p, int reduceTasks, int mis) throws Exception {
    JobConf job = new JobConf(conf, ABtDenseOutJob.class);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setInt(QJob.PROP_K, k);
    job.setInt(QJob.PROP_P, p);
    job.set(PROP_BT_PATH, inputBt.toString());

    FileOutputFormat.setOutputPath(job, outputPath);

    job.setJobName("ABtDenseOutJob");
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(LMatrixWritable.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(LMatrixWritable.class);

    job.setMapperClass(ABtMapper.class);

    fileGather fgather = new fileGather(inputPath, "", FileSystem.get(job));
    mis = Checker.checkMis(mis, fgather.getInputSize(), FileSystem.get(job));
    job.setNumMapTasks(fgather.recNumMapTasks(mis));
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, inputPath);

    RunningJob rj = JobClient.runJob(job);
}
From source file:nthu.scopelab.tsqr.ssvd.BtJob.java
License:Apache License
public static void run(Configuration conf, Path[] inputPath, Path btPath, String qrfPath,
        int k, int p, int outerBlockHeight, int reduceTasks, boolean outputBBtProducts,
        String reduceSchedule, int mis) throws Exception {
    boolean outputQ = true;

    String stages[] = reduceSchedule.split(",");

    JobConf job = new JobConf(conf, BtJob.class);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setInt(SCHEDULE_NUM, stages.length);
    job.setInt(PROP_OUTER_PROD_BLOCK_HEIGHT, outerBlockHeight);
    job.setInt(QJob.PROP_K, k);
    job.setInt(QJob.PROP_P, p);
    job.setBoolean(QmultiplyJob.OUTPUT_Q, outputQ);
    job.setBoolean(PROP_OUPTUT_BBT_PRODUCTS, outputBBtProducts);
    job.set(QmultiplyJob.QRF_DIR, qrfPath);

    FileSystem.get(job).delete(btPath, true);

    FileOutputFormat.setOutputPath(job, btPath);
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setJobName("BtJob");

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(SparseRowBlockWritable.class);
    job.setOutputKeyClass(IntWritable.class);
    //job.setOutputValueClass(SparseRowBlockWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(BtMapper.class);
    job.setCombinerClass(OuterProductCombiner.class);
    job.setReducerClass(OuterProductReducer.class);

    fileGather fgather = new fileGather(inputPath, "", FileSystem.get(job));
    mis = Checker.checkMis(mis, fgather.getInputSize(), FileSystem.get(job));
    job.setNumMapTasks(fgather.recNumMapTasks(mis));

    //job.setNumReduceTasks(0);
    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, inputPath);

    if (outputQ) {
        MultipleOutputs.addNamedOutput(job, QmultiplyJob.Q_MAT, SequenceFileOutputFormat.class,
                IntWritable.class, LMatrixWritable.class);
    }
    if (outputBBtProducts) {
        MultipleOutputs.addNamedOutput(job, OUTPUT_BBT, SequenceFileOutputFormat.class,
                IntWritable.class, VectorWritable.class);
    }

    RunningJob rj = JobClient.runJob(job);
    System.out.println("BtJob Job ID: " + rj.getJobID().toString());
}
From source file:nthu.scopelab.tsqr.ssvd.itBtJob.java
License:Apache License
public static void run(Configuration conf, Path[] inputPath, Path btPath, String qrfPath,
        int k, int p, int outerBlockHeight, int reduceTasks, boolean outputBBtProducts,
        String reduceSchedule, int mis) throws Exception {
    boolean outputQ = true;

    String stages[] = reduceSchedule.split(",");

    JobConf job = new JobConf(conf, itBtJob.class);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setInt(SCHEDULE_NUM, stages.length);
    job.setInt(PROP_OUTER_PROD_BLOCK_HEIGHT, outerBlockHeight);
    job.setInt(QJob.PROP_K, k);
    job.setInt(QJob.PROP_P, p);
    job.setBoolean(QmultiplyJob.OUTPUT_Q, outputQ);
    job.setBoolean(PROP_OUPTUT_BBT_PRODUCTS, outputBBtProducts);
    job.set(QmultiplyJob.QRF_DIR, qrfPath);

    FileSystem.get(job).delete(btPath, true);

    FileOutputFormat.setOutputPath(job, btPath);
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setJobName("itBtJob");

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(SparseRowBlockWritable.class);
    job.setOutputKeyClass(IntWritable.class);
    //job.setOutputValueClass(SparseRowBlockWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(BtMapper.class);
    job.setCombinerClass(OuterProductCombiner.class);
    job.setReducerClass(OuterProductReducer.class);

    fileGather fgather = new fileGather(inputPath, "", FileSystem.get(job));
    mis = Checker.checkMis(mis, fgather.getInputSize(), FileSystem.get(job));
    job.setNumMapTasks(fgather.recNumMapTasks(mis));

    //job.setNumReduceTasks(0);
    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, inputPath);

    if (outputQ) {
        MultipleOutputs.addNamedOutput(job, QmultiplyJob.Q_MAT, SequenceFileOutputFormat.class,
                IntWritable.class, LMatrixWritable.class);
    }
    if (outputBBtProducts) {
        MultipleOutputs.addNamedOutput(job, OUTPUT_BBT, SequenceFileOutputFormat.class,
                IntWritable.class, VectorWritable.class);
    }

    RunningJob rj = JobClient.runJob(job);
    System.out.println("itBtJob Job ID: " + rj.getJobID().toString());
}
From source file:nthu.scopelab.tsqr.ssvd.itQJob.java
License:Apache License
public static void run(Configuration conf, Path[] inputPaths, String outputPath,
        String reduceSchedule, int k, int p, long seed, int mis)
        throws ClassNotFoundException, InterruptedException, IOException {
    String stages[] = reduceSchedule.split(",");
    String rinput = "";
    String routput = outputPath + "/iter-r-";

    for (int i = 0; i < stages.length; i++) {
        String thenumber = Integer.toString(i + 1);

        JobConf job = new JobConf(conf, itQJob.class);
        job.setJobName("itQ-job-" + thenumber);
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);

        if (i == 0)
            job.setMapperClass(QMapper.class);
        else
            job.setMapperClass(IdentityMapper.class);

        job.setReducerClass(QReducer.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(LMatrixWritable.class);

        FileSystem fs = FileSystem.get(job);
        Path Paths[];
        fileGather fgather = null;
        if (i == 0)
            fgather = new fileGather(inputPaths, "part", fs);
        else
            fgather = new fileGather(new Path(rinput), "part", fs);
        Paths = fgather.getPaths();

        mis = Checker.checkMis(mis, fgather.getInputSize(), fs);
        job.setNumMapTasks(fgather.recNumMapTasks(mis));

        job.setNumReduceTasks(Integer.parseInt(stages[i]));

        job.setInt(QRFirstJob.COLUMN_SIZE, k + p);
        job.setLong(PROP_OMEGA_SEED, seed);
        job.setInt(PROP_K, k);
        job.setInt(PROP_P, p);

        fs.delete(new Path(routput + thenumber), true);

        FileInputFormat.setInputPaths(job, Paths);
        FileOutputFormat.setOutputPath(job, new Path(routput + thenumber));

        //FileOutputFormat.setCompressOutput(job, true);
        //FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
        //SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

        // Output first-level Q.
        MultipleOutputs.addNamedOutput(job, QF_MAT, SequenceFileOutputFormat.class,
                IntWritable.class, LMatrixWritable.class);

        RunningJob rj = JobClient.runJob(job);
        System.out.println("itQJob Job ID: " + rj.getJobID().toString());

        rinput = routput + thenumber;
    }
}