List of usage examples for org.apache.hadoop.mapred.FileInputFormat.setInputPaths
public static void setInputPaths(JobConf conf, Path... inputPaths)
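Before the per-project examples, here is a minimal illustrative sketch of the usual call pattern. It is not taken from any of the source files below; the driver class MyJob and the /data/* paths are placeholders. Note that each call to setInputPaths replaces any previously configured input paths, whereas FileInputFormat.addInputPath appends another one.

public static void main(String[] args) throws Exception {
    // "MyJob" is a placeholder driver class for this sketch.
    JobConf conf = new JobConf(MyJob.class);
    conf.setJobName("setInputPaths-demo");

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    // TextInputFormat produces LongWritable offsets and Text lines, which the
    // default identity mapper/reducer pass straight through to the output.
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);

    // A single input directory ...
    FileInputFormat.setInputPaths(conf, new Path("/data/in"));
    // ... or several at once, since the method takes Path varargs.
    // Each call replaces previously set paths; use
    // FileInputFormat.addInputPath(conf, ...) to append instead.
    FileInputFormat.setInputPaths(conf, new Path("/data/in1"), new Path("/data/in2"));

    FileOutputFormat.setOutputPath(conf, new Path("/data/out"));
    JobClient.runJob(conf);
}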
From source file:SBP.java
License:Apache License
protected JobConf configInitMessage() throws Exception {
    final JobConf conf = new JobConf(getConf(), SBP.class);
    conf.set("nstate", "" + nstate);
    conf.set("compat_matrix_str", "" + edge_potential_str);
    conf.setJobName("BP_Init_Belief");

    conf.setMapperClass(MapInitializeBelief.class);

    fs.delete(message_cur_path, true);

    FileInputFormat.setInputPaths(conf, edge_path);
    FileOutputFormat.setOutputPath(conf, message_cur_path);

    conf.setNumReduceTasks(0);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    return conf;
}
From source file:SBP.java
License:Apache License
protected JobConf configSumErr() throws Exception {
    final JobConf conf = new JobConf(getConf(), SBP.class);
    conf.set("nstate", "" + nstate);
    conf.setJobName("BP_Sum Err");

    fs.delete(sum_error_path, true);

    conf.setMapperClass(MapSumErr.class);
    conf.setReducerClass(RedSumErr.class);

    FileInputFormat.setInputPaths(conf, check_error_path);
    FileOutputFormat.setOutputPath(conf, sum_error_path);

    conf.setNumReduceTasks(1);

    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);

    return conf;
}
From source file:gen_rank.java
License:LGPL
public static void runjob(String input, String output) throws Exception {
    JobConf conf = new JobConf(gen_rank.class);
    conf.setJobName("Preparing_data");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(input));
    FileOutputFormat.setOutputPath(conf, new Path(output));

    JobClient.runJob(conf);
}
From source file:WikipediaForwardIndexBuilder.java
License:Apache License
@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input").create(INPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("index file").create(INDEX_FILE_OPTION));
    options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr").hasArg()
            .withDescription("two-letter language code").create(LANGUAGE_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_FILE_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    Path inputPath = new Path(cmdline.getOptionValue(INPUT_OPTION));
    String indexFile = cmdline.getOptionValue(INDEX_FILE_OPTION);

    String tmpPath = "tmp-" + WikipediaForwardIndexBuilder.class.getSimpleName() + "-" + RANDOM.nextInt(10000);

    if (!inputPath.isAbsolute()) {
        System.err.println("Error: " + INPUT_OPTION + " must be an absolute path!");
        return -1;
    }

    String language = null;
    if (cmdline.hasOption(LANGUAGE_OPTION)) {
        language = cmdline.getOptionValue(LANGUAGE_OPTION);
        if (language.length() != 2) {
            System.err.println("Error: \"" + language + "\" unknown language!");
            return -1;
        }
    }

    JobConf conf = new JobConf(getConf(), WikipediaForwardIndexBuilder.class);
    FileSystem fs = FileSystem.get(conf);

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - index file: " + indexFile);
    LOG.info(" - language: " + language);
    LOG.info("Note: This tool only works on block-compressed SequenceFiles!");

    conf.setJobName(String.format("BuildWikipediaForwardIndex[%s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath,
            INDEX_FILE_OPTION, indexFile, LANGUAGE_OPTION, language));

    conf.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, new Path(tmpPath));
    FileOutputFormat.setCompressOutput(conf, false);

    if (language != null) {
        conf.set("wiki.language", language);
    }

    conf.setInputFormat(NoSplitSequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(IdentityReducer.class);

    // Delete the output directory if it exists already.
    fs.delete(new Path(tmpPath), true);

    RunningJob job = JobClient.runJob(conf);

    Counters counters = job.getCounters();
    int blocks = (int) counters.getCounter(Blocks.Total);

    LOG.info("number of blocks: " + blocks);

    LOG.info("Writing index file...");
    LineReader reader = new LineReader(fs.open(new Path(tmpPath + "/part-00000")));
    FSDataOutputStream out = fs.create(new Path(indexFile), true);

    out.writeUTF(edu.umd.cloud9.collection.wikipedia.WikipediaForwardIndex.class.getCanonicalName());
    out.writeUTF(inputPath.toString());
    out.writeInt(blocks);

    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\s+");

        int docno = Integer.parseInt(arr[0]);
        int offset = Integer.parseInt(arr[1]);
        short fileno = Short.parseShort(arr[2]);

        out.writeInt(docno);
        out.writeInt(offset);
        out.writeShort(fileno);

        cnt++;
        if (cnt % 100000 == 0) {
            LOG.info(cnt + " blocks written");
        }
    }

    reader.close();
    out.close();

    if (cnt != blocks) {
        throw new RuntimeException("Error: mismatch in block count!");
    }

    // Clean up.
    fs.delete(new Path(tmpPath), true);

    return 0;
}
From source file:GapDeduceRunner.java
License:Apache License
public static void main(String[] args) throws IOException {
    JobConf conf = new JobConf(GapDeduceRunner.class);
    conf.setJobName("gapdeduce");

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(Gapper.class);
    conf.setReducerClass(Deducer.class);

    // KeyValueTextInputFormat treats each line as an input record,
    // and splits the line by the tab character to separate it into key and value
    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
}
From source file:LicenseStatewiseCount.java
public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(ParkingTicketStatewiseCount.class);
    conf.setJobName("Statecounts");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
}
From source file:TestFormatStorageRecordReader.java
License:Open Source License
static RecordReader<WritableComparable, Writable> getRecReader(JobConf conf, String path) throws IOException {
    conf.set("mapred.input.dir", path);
    FileInputFormat.setInputPaths(conf, path);

    InputFormat inputFormat = new FormatStorageInputFormat();

    InputSplit[] inputSplits = inputFormat.getSplits(conf, 1);
    System.out.println("get Splits:" + inputSplits.length);

    return inputFormat.getRecordReader(inputSplits[0], conf, Reporter.NULL);
}
From source file:DijikstraAlgo.java
License:GNU General Public License
public static void run(String[] args) throws Exception {
    IN = "hdfs://10.8.3.161:9000/user/sagar/input/";
    OUT = "hdfs://10.8.3.161:9000/user/sagar/output/";
    String input = IN;
    String output = OUT + System.nanoTime();
    String MAX_SPLIT_SIZE = args[0];
    boolean isdone = false;

    // Iterate repeatedly until convergence.
    while (isdone == false) {
        JobConf conf = new JobConf(DijikstraAlgo.class);
        conf.setJobName("Dijikstra");
        // conf.set("mapred.max.split.size", MAX_SPLIT_SIZE);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);
        conf.setMapperClass(Map.class);
        conf.setReducerClass(Reduce.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(input));
        FileOutputFormat.setOutputPath(conf, new Path(output));

        JobClient.runJob(conf);

        input = output + "/part-00000";
        isdone = true; // set the job to NOT run again!

        Path ofile = new Path(input);
        FileSystem fs = FileSystem.get(new URI("hdfs://10.8.3.165:9000"), conf);
        // FileSystem fs = FileSystem.get(new Configuration());
        BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(ofile)));

        HashMap<Integer, Integer> imap = new HashMap<Integer, Integer>();
        String line = br.readLine();
        // Read the current output file and load it into the HashMap.
        while (line != null) {
            String[] sp = line.split("\t| ");

            int node = Integer.parseInt(sp[0]);
            int distance = Integer.parseInt(sp[1]);

            imap.put(node, distance);
            line = br.readLine();
        }
        br.close();

        // Check the convergence condition: if any node is still unsettled,
        // continue iterating; otherwise stop.
        Iterator<Integer> itr = imap.keySet().iterator();
        while (itr.hasNext()) {
            int key = itr.next();
            int value = imap.get(key);
            if (value >= 125) {
                isdone = false;
            }
        }

        input = output;
        output = OUT + System.nanoTime();
    }
}
From source file:PiEstimator.java
License:Apache License
/**
 * Run a map/reduce job for estimating Pi.
 *
 * @return the estimated value of Pi
 */
public static BigDecimal estimate(int numMaps, long numPoints, JobConf jobConf) throws IOException {
    // setup job conf
    jobConf.setJobName(PiEstimator.class.getSimpleName());

    jobConf.setInputFormat(SequenceFileInputFormat.class);

    jobConf.setOutputKeyClass(BooleanWritable.class);
    jobConf.setOutputValueClass(LongWritable.class);
    jobConf.setOutputFormat(SequenceFileOutputFormat.class);

    jobConf.setMapperClass(PiMapper.class);
    jobConf.setNumMapTasks(numMaps);

    jobConf.setReducerClass(PiReducer.class);
    jobConf.setNumReduceTasks(1);

    // turn off speculative execution, because DFS doesn't handle
    // multiple writers to the same file.
    jobConf.setSpeculativeExecution(false);

    // setup input/output directories
    final Path inDir = new Path(TMP_DIR, "in");
    final Path outDir = new Path(TMP_DIR, "out");
    FileInputFormat.setInputPaths(jobConf, inDir);
    FileOutputFormat.setOutputPath(jobConf, outDir);

    final FileSystem fs = FileSystem.get(jobConf);
    if (fs.exists(TMP_DIR)) {
        throw new IOException(
                "Tmp directory " + fs.makeQualified(TMP_DIR) + " already exists. Please remove it first.");
    }
    if (!fs.mkdirs(inDir)) {
        throw new IOException("Cannot create input directory " + inDir);
    }

    try {
        // generate an input file for each map task
        for (int i = 0; i < numMaps; ++i) {
            final Path file = new Path(inDir, "part" + i);
            final LongWritable offset = new LongWritable(i * numPoints);
            final LongWritable size = new LongWritable(numPoints);
            final SequenceFile.Writer writer = SequenceFile.createWriter(fs, jobConf, file, LongWritable.class,
                    LongWritable.class, CompressionType.NONE);
            try {
                writer.append(offset, size);
            } finally {
                writer.close();
            }
            System.out.println("Wrote input for Map #" + i);
        }

        // start a map/reduce job
        System.out.println("Starting Job");
        final long startTime = System.currentTimeMillis();
        JobClient.runJob(jobConf);
        final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
        System.out.println("Job Finished in " + duration + " seconds");

        // read outputs
        Path inFile = new Path(outDir, "reduce-out");
        LongWritable numInside = new LongWritable();
        LongWritable numOutside = new LongWritable();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, inFile, jobConf);
        try {
            reader.next(numInside, numOutside);
        } finally {
            reader.close();
        }

        // compute estimated value
        return BigDecimal.valueOf(4).setScale(20).multiply(BigDecimal.valueOf(numInside.get()))
                .divide(BigDecimal.valueOf(numMaps)).divide(BigDecimal.valueOf(numPoints));
    } finally {
        fs.delete(TMP_DIR, true);
    }
}
From source file:DataJoinJob.java
License:Apache License
public static JobConf createDataJoinJob(String args[]) throws IOException {
    String inputDir = args[0];
    String outputDir = args[1];
    Class inputFormat = SequenceFileInputFormat.class;
    if (args[2].compareToIgnoreCase("text") != 0) {
        System.out.println("Using SequenceFileInputFormat: " + args[2]);
    } else {
        System.out.println("Using TextInputFormat: " + args[2]);
        inputFormat = TextInputFormat.class;
    }
    int numOfReducers = Integer.parseInt(args[3]);
    Class mapper = getClassByName(args[4]);
    Class reducer = getClassByName(args[5]);
    Class mapoutputValueClass = getClassByName(args[6]);
    Class outputFormat = TextOutputFormat.class;
    Class outputValueClass = Text.class;
    if (args[7].compareToIgnoreCase("text") != 0) {
        System.out.println("Using SequenceFileOutputFormat: " + args[7]);
        outputFormat = SequenceFileOutputFormat.class;
        outputValueClass = getClassByName(args[7]);
    } else {
        System.out.println("Using TextOutputFormat: " + args[7]);
    }
    long maxNumOfValuesPerGroup = 100;
    String jobName = "";
    if (args.length > 8) {
        maxNumOfValuesPerGroup = Long.parseLong(args[8]);
    }
    if (args.length > 9) {
        jobName = args[9];
    }

    Configuration defaults = new Configuration();
    JobConf job = new JobConf(defaults, DataJoinJob.class);
    job.setJobName("DataJoinJob: " + jobName);

    FileSystem fs = FileSystem.get(defaults);
    fs.delete(new Path(outputDir));

    FileInputFormat.setInputPaths(job, inputDir);

    job.setInputFormat(inputFormat);

    job.setMapperClass(mapper);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));
    job.setOutputFormat(outputFormat);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(mapoutputValueClass);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(outputValueClass);
    job.setReducerClass(reducer);

    job.setNumMapTasks(1);
    job.setNumReduceTasks(numOfReducers);
    job.setLong("datajoin.maxNumOfValuesPerGroup", maxNumOfValuesPerGroup);
    return job;
}