List of usage examples for org.apache.hadoop.mapred FileInputFormat setInputPaths
public static void setInputPaths(JobConf conf, Path... inputPaths)
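Before the source-file examples, a minimal self-contained sketch of the call in isolation. The class name SetInputPathsSketch, the identity map-only job, and the argument positions are illustrative assumptions, not taken from any of the examples below.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;

public class SetInputPathsSketch {
  public static void main(String[] args) throws Exception {
    // Illustrative map-only job using the old mapred API.
    JobConf conf = new JobConf(SetInputPathsSketch.class);
    conf.setJobName("set-input-paths-sketch");
    conf.setMapperClass(IdentityMapper.class);
    conf.setNumReduceTasks(0);
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);

    // The varargs overload accepts one or more input paths; each call
    // replaces any previously configured input paths (use
    // FileInputFormat.addInputPath to append instead).
    FileInputFormat.setInputPaths(conf, new Path(args[0]), new Path(args[1]));
    FileOutputFormat.setOutputPath(conf, new Path(args[2]));

    JobClient.runJob(conf);
  }
}

The examples below show the same call configured from a JobConf in several real jobs: with a Path[] array (QJob), with a single Path (UJob, LineCount, NotInFinder, and the partitioner jobs), and with paths taken from command-line arguments (DoubleConversionMapper).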
From source file:nthu.scopelab.tsqr.ssvd.QJob.java
License:Apache License
public static void run(Configuration conf, Path[] inputPaths, String outputPath, String reduceSchedule, int k,
    int p, long seed, int mis) throws ClassNotFoundException, InterruptedException, IOException {
  String stages[] = reduceSchedule.split(",");
  String rinput = "";
  String routput = outputPath + "/iter-r-";
  for (int i = 0; i < stages.length; i++) {
    String thenumber = Integer.toString(i + 1);
    JobConf job = new JobConf(conf, QJob.class);
    job.setJobName("Q-job-" + thenumber);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    if (i == 0)
      job.setMapperClass(QMapper.class);
    else
      job.setMapperClass(IdentityMapper.class);
    job.setReducerClass(QReducer.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(LMatrixWritable.class);

    FileSystem fs = FileSystem.get(job);
    Path Paths[];
    fileGather fgather = null;
    if (i == 0)
      fgather = new fileGather(inputPaths, "part", fs);
    else
      fgather = new fileGather(new Path(rinput), "part", fs);
    Paths = fgather.getPaths();
    mis = Checker.checkMis(mis, fgather.getInputSize(), fs);
    job.setNumMapTasks(fgather.recNumMapTasks(mis));

    job.setNumReduceTasks(Integer.parseInt(stages[i]));

    job.setInt(QRFirstJob.COLUMN_SIZE, k + p);
    job.setLong(PROP_OMEGA_SEED, seed);
    job.setInt(PROP_K, k);
    job.setInt(PROP_P, p);

    fs.delete(new Path(routput + thenumber), true);

    FileInputFormat.setInputPaths(job, Paths);
    FileOutputFormat.setOutputPath(job, new Path(routput + thenumber));
    //FileOutputFormat.setCompressOutput(job, true);
    //FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    //SequenceFileOutputFormat.setOutputCompressionType(job,CompressionType.BLOCK);

    //output first level Q
    MultipleOutputs.addNamedOutput(job, QF_MAT, SequenceFileOutputFormat.class, IntWritable.class,
        LMatrixWritable.class);

    RunningJob rj = JobClient.runJob(job);
    System.out.println("QJob Job ID: " + rj.getJobID().toString());
    rinput = routput + thenumber;
  }
}
From source file:nthu.scopelab.tsqr.ssvd.UJob.java
License:Apache License
public void start(Configuration conf, Path inputPathQ, Path inputUHatPath, Path sigmaPath, Path outputPath,
    int k, boolean uHalfSigma, int mis) throws ClassNotFoundException, InterruptedException, IOException {
  String input = "";
  JobConf job = new JobConf(conf, UJob.class);
  jobclient = new JobClient(job);
  job.setJobName("UJob");
  job.setInputFormat(SequenceFileInputFormat.class);
  job.setOutputFormat(SequenceFileOutputFormat.class);
  job.setMapperClass(MultiplyMapper.class);
  job.setOutputKeyClass(LongWritable.class);
  job.setOutputValueClass(LMatrixWritable.class);
  job.setMapOutputKeyClass(LongWritable.class);
  job.setMapOutputValueClass(LMatrixWritable.class);

  FileSystem fs = FileSystem.get(job);
  fileGather fgather = new fileGather(
      new Path(inputPathQ.toString().substring(0, inputPathQ.toString().lastIndexOf("/") - 1)), "Q-", fs);
  mis = Checker.checkMis(mis, fgather.getInputSize(), fs);
  job.setNumMapTasks(fgather.recNumMapTasks(mis));

  job.setNumReduceTasks(0);
  job.set("mapreduce.output.basename", OUTPUT_U);
  job.set(PROP_UHAT_PATH, inputUHatPath.toString());
  job.set(PROP_SIGMA_PATH, sigmaPath.toString());
  if (uHalfSigma) {
    job.set(PROP_U_HALFSIGMA, "y");
  }
  job.setInt(QJob.PROP_K, k);
  FileSystem.get(job).delete(outputPath, true);
  FileOutputFormat.setOutputPath(job, outputPath);
  FileOutputFormat.setCompressOutput(job, true);
  FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
  SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
  FileInputFormat.setInputPaths(job, inputPathQ);

  //JobClient.runJob(job);
  jobid = jobclient.submitJob(job).getID();
}
From source file:org.acacia.csr.java.LineCount.java
License:Apache License
public static void main(String[] args) throws Exception {
  /*
  String dir1 = "/user/miyuru/wcout";
  String dir2 = "/user/miyuru/lcout";
  //We first delete the temporary directories if they exist on the HDFS
  FileSystem fs1 = FileSystem.get(new JobConf());

  if(fs1.exists(new Path(dir2))){
    fs1.delete(new Path(dir2), true);
  }

  JobConf conf = new JobConf(LineCount.class);
  conf.setJobName("LineCount");
  conf.setOutputKeyClass(IntWritable.class);
  conf.setOutputValueClass(IntWritable.class);
  conf.setMapperClass(Map.class);
  conf.setCombinerClass(Reduce.class);
  conf.setReducerClass(Reduce.class);
  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputFormat(TextOutputFormat.class);
  FileInputFormat.setInputPaths(conf, new Path(dir1));
  FileOutputFormat.setOutputPath(conf, new Path(dir2));
  Job job = new Job(conf, "line count");
  job.waitForCompletion(true);
  org.apache.hadoop.mapreduce.Counters cntr = job.getCounters();
  System.out.println("Number of lines in the file"
      + cntr.findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS").getValue());
  */

  long edgeCount = 0;
  //String dir3 = "/user/miyuru/wcout";
  String dir4 = "/user/miyuru/lcout";
  String dir5 = "/user/miyuru/input";

  //We first delete the temporary directories if they exist on the HDFS
  FileSystem fs2 = FileSystem.get(new JobConf());

  if (fs2.exists(new Path(dir4))) {
    fs2.delete(new Path(dir4), true);
  }

  JobConf conf1 = new JobConf(LineCount.class);
  conf1.setJobName("LineCount");
  conf1.setOutputKeyClass(Text.class);
  conf1.setOutputValueClass(IntWritable.class);
  conf1.setMapperClass(Map.class);
  conf1.setCombinerClass(Reduce.class);
  conf1.setReducerClass(Reduce.class);
  conf1.setInputFormat(TextInputFormat.class);
  conf1.setOutputFormat(TextOutputFormat.class);
  FileInputFormat.setInputPaths(conf1, new Path(dir5));
  FileOutputFormat.setOutputPath(conf1, new Path(dir4));
  Job job1 = new Job(conf1, "line count");
  job1.setNumReduceTasks(0);
  job1.waitForCompletion(true);
  org.apache.hadoop.mapreduce.Counters cntr = job1.getCounters();
  edgeCount = cntr.findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS").getValue();

  File efile = new File("/tmp/efile");
  if (efile.exists()) {
    efile.delete();
  }

  PrintWriter writer = new PrintWriter("/tmp/efile", "UTF-8");
  writer.println(edgeCount);
  writer.flush();
  writer.close();

  //edgeCount = edgeCount - 1;//This is to remove the line number additionally added to each edgelist file by HDFS. This is strange, but it happens.
  System.out.println("======>Edge count is : " + edgeCount);
  System.out.println("------Done Line Count---------------");
}
From source file:org.acacia.csr.java.NotInFinder.java
License:Apache License
public static void main(String[] args) throws Exception {
  String dir1 = "/user/miyuru/wcout";
  String dir2 = "/user/miyuru/notinverts";

  //We first delete the temporary directories if they exist on the HDFS
  FileSystem fs1 = FileSystem.get(new JobConf());

  if (fs1.exists(new Path(dir2))) {
    fs1.delete(new Path(dir2), true);
  }

  JobConf conf = new JobConf();
  conf.setNumMapTasks(96);
  conf.setOutputKeyClass(LongWritable.class);
  conf.setOutputValueClass(LongWritable.class);
  conf.setMapperClass(TokenizerMapper.class);
  conf.setReducerClass(IntSumReducer.class);
  conf.setCombinerClass(IntSumReducer.class);
  conf.setInputFormat(NLinesInputFormat.class);
  conf.setOutputFormat(TextOutputFormat.class);
  FileInputFormat.setInputPaths(conf, new Path(dir1));
  FileOutputFormat.setOutputPath(conf, new Path(dir2));

  Job job = new Job(conf, "NotInFinder");
  job.setJarByClass(WordCount.class);
  // job.setMapperClass(TokenizerMapper.class);
  // job.setCombinerClass(IntSumReducer.class);
  // job.setReducerClass(IntSumReducer.class);
  // job.setOutputKeyClass(LongWritable.class);
  // job.setOutputValueClass(LongWritable.class);
  job.setSortComparatorClass(SortComparator.class);

  job.waitForCompletion(true);
}
From source file:org.acacia.partitioner.java.EdgeDistributor.java
License:Apache License
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
  String dir1 = "/user/miyuru/input";
  String dir2 = "/user/miyuru/edgedistributed-out";

  //We first delete the temporary directories if they exist on the HDFS
  FileSystem fs1 = FileSystem.get(new JobConf());

  if (fs1.exists(new Path(dir2))) {
    fs1.delete(new Path(dir2), true);
  }

  //First job scans through the edge list and splits the edges into separate files based on the partitioned vertex files.
  JobConf conf = new JobConf(EdgeDistributor.class);
  conf.set("org.acacia.partitioner.hbase.zookeeper.quorum", args[0]);
  conf.set("org.acacia.partitioner.hbase.table", args[1]);
  conf.set("org.acacia.partitioner.index.contacthost", args[2]);
  conf.set("vert-count", args[3]);
  conf.set("initpartition-id", args[4]);
  conf.set("zero-flag", args[5]);
  conf.setOutputKeyClass(LongWritable.class);
  conf.setOutputValueClass(Text.class);
  conf.setMapperClass(FileMapper.class);
  conf.setReducerClass(FileReducer.class);
  //conf.setInputFormat(TextInputFormat.class);
  conf.setInputFormat(NLinesInputFormat.class);
  conf.setOutputFormat(TextOutputFormat.class);
  conf.setNumReduceTasks(96); //Need to specify the number of reduce tasks explicitly. Otherwise it creates only one reduce task.

  FileInputFormat.setInputPaths(conf, new Path(dir1));
  FileOutputFormat.setOutputPath(conf, new Path(dir2));

  MultipleOutputs.addMultiNamedOutput(conf, "partition", TextOutputFormat.class, NullWritable.class,
      Text.class);

  Job job = new Job(conf, "EdgeDistributor");
  job.waitForCompletion(true);

  System.out.println("Done job EdgeDistribution");
}
From source file:org.acacia.partitioner.java.FileMerger.java
License:Apache License
public static void main(String[] args) throws Exception {
  JobConf conf = new JobConf(FileMerger.class);
  conf.setMapperClass(FileMergerMapper.class);
  String dir1 = "/user/miyuru/partlist";

  FileSystem fs1 = FileSystem.get(new JobConf());

  if (fs1.exists(new Path("/user/miyuru/filemerger-out"))) {
    fs1.delete(new Path("/user/miyuru/filemerger-out"), true);
  }

  if (fs1.exists(new Path("/user/miyuru/edgedistributed-out-filtered"))) {
    fs1.delete(new Path("/user/miyuru/edgedistributed-out-filtered"), true);
  }

  FileInputFormat.setInputPaths(conf, new Path(dir1));
  FileOutputFormat.setOutputPath(conf, new Path("/user/miyuru/filemerger-out"));

  Job job = new Job(conf, "FileMerger");
  job.waitForCompletion(true);
}
From source file:org.acacia.partitioner.java.FileMover.java
License:Apache License
public static void main(String[] args) throws Exception {
  JobConf conf = new JobConf(FileMover.class);
  conf.setMapperClass(FileMoverMapper.class);
  String dir1 = "/user/miyuru/partfilelist";

  FileSystem fs1 = FileSystem.get(new JobConf());

  if (fs1.exists(new Path("/user/miyuru/filemover-out"))) {
    fs1.delete(new Path("/user/miyuru/filemover-out"), true);
  }

  FileInputFormat.setInputPaths(conf, new Path(dir1));
  FileOutputFormat.setOutputPath(conf, new Path("/user/miyuru/filemover-out"));

  Job job = new Job(conf, "FileMover");
  job.waitForCompletion(true);
}
From source file:org.acacia.partitioner.java.NoptSplitter.java
License:Apache License
/**
 * @param args
 */
public static void main(String[] args) {
  if (!validArgs(args)) {
    printUsage();
    return;
  }

  //These are the temp paths that are created on HDFS
  String dir1 = "/user/miyuru/edgedistributed-out/nopt";
  String dir2 = "/user/miyuru/nopt-distributed";

  //We first delete the temporary directories if they exist on the HDFS
  FileSystem fs1;
  try {
    fs1 = FileSystem.get(new JobConf());

    System.out.println("Deleting the dir : " + dir2);

    if (fs1.exists(new Path(dir2))) {
      fs1.delete(new Path(dir2), true);
    }

    // Path notinPath = new Path(dir2);
    //
    // if(!fs1.exists(notinPath)){
    //   fs1.create(notinPath);
    // }

    JobConf conf = new JobConf(NoptSplitter.class);
    // conf.setOutputKeyClass(Text.class);
    // conf.setOutputValueClass(Text.class);
    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);
    // conf.setInputFormat(TextInputFormat.class);
    // conf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(conf, new Path(dir1));
    FileOutputFormat.setOutputPath(conf, new Path(dir2));

    Job job1 = new Job(conf, "nopt_splitter");
    job1.setNumReduceTasks(Integer.parseInt(args[0])); //The most important point in this job
    job1.waitForCompletion(true);
  } catch (IOException e) {
    e.printStackTrace();
  } catch (InterruptedException e) {
    e.printStackTrace();
  } catch (ClassNotFoundException e) {
    e.printStackTrace();
  }
}
From source file:org.ahanna.DoubleConversionMapper.java
License:Apache License
public static void main(String[] args) {
  JobConf conf = new JobConf(DoubleConversion.class);
  conf.setJobName("DoubleConversation");

  conf.setMapOutputKeyClass(Text.class);
  conf.setMapOutputValueClass(Text.class);
  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(Text.class);

  conf.setMapperClass(DoubleConversionMapper.class);
  conf.setReducerClass(org.apache.hadoop.mapred.lib.IdentityReducer.class);

  // KeyValueTextInputFormat treats each line as an input record,
  // and splits the line by the tab character to separate it into key and value
  conf.setInputFormat(KeyValueTextInputFormat.class);
  conf.setOutputFormat(TextOutputFormat.class);

  FileInputFormat.setInputPaths(conf, new Path(args[0]));
  FileOutputFormat.setOutputPath(conf, new Path(args[1]));

  try {
    JobClient.runJob(conf);
  } catch (IOException e) {
    // do nothing
  }
}
From source file:org.apache.apex.examples.mroperator.MapOperatorTest.java
License:Apache License
public void testNodeProcessingSchema(MapOperator<LongWritable, Text, Text, IntWritable> oper) throws IOException {

  CollectorTestSink sortSink = new CollectorTestSink();
  oper.output.setSink(sortSink);

  oper.setMapClass(WordCount.Map.class);
  oper.setCombineClass(WordCount.Reduce.class);
  oper.setDirName(testMeta.testDir);
  oper.setConfigFile(null);
  oper.setInputFormatClass(TextInputFormat.class);

  Configuration conf = new Configuration();
  JobConf jobConf = new JobConf(conf);
  FileInputFormat.setInputPaths(jobConf, new Path(testMeta.testDir));
  TextInputFormat inputFormat = new TextInputFormat();
  inputFormat.configure(jobConf);
  InputSplit[] splits = inputFormat.getSplits(jobConf, 1);

  SerializationFactory serializationFactory = new SerializationFactory(conf);
  Serializer keySerializer = serializationFactory.getSerializer(splits[0].getClass());
  keySerializer.open(oper.getOutstream());
  keySerializer.serialize(splits[0]);
  oper.setInputSplitClass(splits[0].getClass());
  keySerializer.close();

  oper.setup(null);
  oper.beginWindow(0);
  oper.emitTuples();
  oper.emitTuples();
  oper.endWindow();
  oper.beginWindow(1);
  oper.emitTuples();
  oper.endWindow();

  Assert.assertEquals("number emitted tuples", 3, sortSink.collectedTuples.size());
  for (Object o : sortSink.collectedTuples) {
    LOG.debug(o.toString());
  }

  LOG.debug("Done testing round\n");
  oper.teardown();
}