List of usage examples for org.apache.hadoop.mapred FileInputFormat addInputPath
public static void addInputPath(JobConf conf, Path path)
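Before the collected examples, here is a minimal, self-contained sketch of how addInputPath is typically wired into an old-API (org.apache.hadoop.mapred) job. The paths, job name, and the use of the identity mapper and reducer are placeholders for illustration only; they are not taken from any of the source files listed below.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public class AddInputPathSketch {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(AddInputPathSketch.class);
        conf.setJobName("addInputPath sketch");

        // addInputPath can be called repeatedly; each call appends another
        // input file or directory to the job's list of inputs.
        FileInputFormat.addInputPath(conf, new Path("/data/in/part1"));
        FileInputFormat.addInputPath(conf, new Path("/data/in/part2"));
        conf.setInputFormat(TextInputFormat.class);

        // Identity map/reduce just passes the (offset, line) pairs through.
        conf.setMapperClass(IdentityMapper.class);
        conf.setReducerClass(IdentityReducer.class);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        FileOutputFormat.setOutputPath(conf, new Path("/data/out"));
        JobClient.runJob(conf);
    }
}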
From source file:org.cloudata.examples.web.TermUploadJob.java
License:Apache License
public void exec(String[] options) throws Exception {
    if (options.length < 1) {
        System.out.println("Usage: java TermUploadJob <num of repeats> termUpload <inputPath> [#reduce]");
        System.exit(0);
    }
    JobConf jobConf = new JobConf(TermUploadJob.class);
    JobClient jobClient = new JobClient(jobConf);
    int maxReduce = jobClient.getClusterStatus().getMaxReduceTasks() * 2;
    if (options.length > 1) {
        maxReduce = Integer.parseInt(options[1]);
    }

    jobConf.setInt("mapred.task.timeout", 60 * 60 * 1000);

    FileSystem fs = FileSystem.get(jobConf);

    CloudataConf nconf = new CloudataConf();
    if (!CTable.existsTable(nconf, TERM_TABLE)) {
        // Create the term table; partition boundaries come from the term weight data.
        Path path = new Path("blogdata/tmp/weight");
        FileStatus[] paths = fs.listStatus(path);
        if (paths == null || paths.length == 0) {
            LOG.error("No Partition info:" + path);
            return;
        }
        SortedSet<Text> terms = new TreeSet<Text>();
        Text text = new Text();
        for (FileStatus eachPath : paths) {
            CloudataLineReader reader = new CloudataLineReader(fs.open(eachPath.getPath()));
            while (true) {
                int length = reader.readLine(text);
                if (length <= 0) {
                    break;
                }
                terms.add(new Text(text));
            }
        }
        int termsPerTablet = terms.size() / (maxReduce - 1);
        int count = 0;
        List<Row.Key> rowKeys = new ArrayList<Row.Key>();
        for (Text term : terms) {
            count++;
            if (count == termsPerTablet) {
                rowKeys.add(new Row.Key(term.getBytes()));
                count = 0;
            }
        }
        rowKeys.add(Row.Key.MAX_KEY);

        TableSchema termTableInfo = new TableSchema(TERM_TABLE, "Test", TERM_TABLE_COLUMNS);
        CTable.createTable(nconf, termTableInfo, rowKeys.toArray(new Row.Key[] {}));
    }
    CTable termTable = CTable.openTable(nconf, TERM_TABLE);
    TabletInfo[] tabletInfos = termTable.listTabletInfos();

    Path tempOutputPath = new Path("WebTableJob_" + System.currentTimeMillis());

    jobConf.setJobName("TermUploadJob" + "(" + new Date() + ")");
    FileInputFormat.addInputPath(jobConf, new Path(options[0]));

    // <MAP>
    jobConf.setMapperClass(TermUploadMap.class);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, TERM_TABLE);
    jobConf.setPartitionerClass(WebKeyRangePartitioner.class);
    jobConf.setMaxMapAttempts(0);
    // </MAP>

    // <REDUCE>
    jobConf.setReducerClass(TermUploadReduce.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);
    jobConf.setNumReduceTasks(tabletInfos.length);
    FileOutputFormat.setOutputPath(jobConf, tempOutputPath);
    jobConf.setNumReduceTasks(maxReduce);
    jobConf.setMaxReduceAttempts(0);
    // </REDUCE>

    // Run Job
    JobClient.runJob(jobConf);

    fs.delete(tempOutputPath, true);
}
From source file:org.cloudata.examples.web.WebTableJob.java
License:Apache License
public void exec(String[] options) throws Exception { if (options.length < 1) { System.out.println("Usage: java TestWebPage <num of repeats> webtable <inputPath>"); System.exit(0);// ww w. j ava 2s . c o m } //WebTable ? CloudataConf nconf = new CloudataConf(); if (!CTable.existsTable(nconf, WEB_TABLE)) { TableSchema webTableInfo = new TableSchema(WEB_TABLE, "Test", WEB_TABLE_COLUMNS); webTableInfo.setNumOfVersion(2); CTable.createTable(nconf, webTableInfo); } Path tempOutputPath = new Path("WebTableJob_" + System.currentTimeMillis()); JobConf jobConf = new JobConf(WebTableJob.class); jobConf.setJobName("WebTableJob" + "(" + new Date() + ")"); FileInputFormat.addInputPath(jobConf, new Path(options[0])); //<MAP> jobConf.setMapperClass(WebTableMap.class); jobConf.setInputFormat(TextInputFormat.class); jobConf.setMaxMapAttempts(0); //</MAP> //Map Only jobConf.setNumReduceTasks(0); FileOutputFormat.setOutputPath(jobConf, tempOutputPath); //Run Job JobClient.runJob(jobConf); //delete temp output path FileSystem fs = FileSystem.get(jobConf); fs.delete(tempOutputPath, true); }
From source file:org.cloudata.examples.weblink.UploadJob.java
License:Apache License
public void run(String[] args) throws IOException {
    if (args.length < 3) {
        System.out.println("Usage: java UploadJob <input path> <table name> <distributed cache dir>");
        System.exit(0);
    }
    Path inputPath = new Path(args[0]);
    String tableName = args[1];

    CloudataConf nconf = new CloudataConf();
    if (!CTable.existsTable(nconf, tableName)) {
        TableSchema tableSchema = new TableSchema(tableName);
        tableSchema.addColumn("url");
        tableSchema.addColumn("page");
        tableSchema.addColumn("title");
        tableSchema.addColumn("outlink");
        CTable.createTable(nconf, tableSchema);
    }

    JobConf jobConf = new JobConf(UploadJob.class);
    jobConf.set("mapred.child.java.opts", "-Xss4096K");
    jobConf.setJobName("CloudataExamles.weblink.UploadJob_" + new Date());
    String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);
    DistributedCache.addArchiveToClassPath(new Path(args[2] + "/htmllexer.jar"), jobConf);
    DistributedCache.addArchiveToClassPath(new Path(args[2] + "/htmlparser.jar"), jobConf);
    DistributedCache.addArchiveToClassPath(new Path(args[2] + "/jdom.jar"), jobConf);

    // <MAP>
    FileInputFormat.addInputPath(jobConf, inputPath);
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.setMapperClass(UploadJobMapper.class);
    jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, tableName);
    // </MAP>

    // <REDUCE>
    // Map only
    FileOutputFormat.setOutputPath(jobConf,
            new Path("CloudataExamles_WebUploadJob_" + System.currentTimeMillis()));
    jobConf.setNumReduceTasks(0);
    // </REDUCE>

    try {
        JobClient.runJob(jobConf);
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        FileSystem fs = FileSystem.get(jobConf);
        fs.delete(FileOutputFormat.getOutputPath(jobConf), true);
        CloudataMapReduceUtil.clearMapReduce(libDir);
    }
}
From source file:org.cloudata.util.upload.UploadUtil.java
License:Apache License
private void doHadoopUpload(CloudataConf conf) throws IOException {
    if (!CTable.existsTable(conf, tableName)) {
        throw new IOException("No table:" + tableName);
    }

    JobConf jobConf = new JobConf(UploadUtil.class);
    String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);
    jobConf.setJobName("UploadJob_" + tableName + "(" + new Date() + ")");

    // KeyRangePartitioner: the target table name is passed via
    // AbstractTabletInputFormat.OUTPUT_TABLE.
    jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, tableName);

    // <Map>
    FileInputFormat.addInputPath(jobConf, new Path(inputPath));
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.set("uploadJob.delim", delim);

    String columnStr = "";
    for (String eachColumn : columns) {
        columnStr += eachColumn + ",";
    }
    jobConf.set("uploadJob.columns", columnStr);

    String fieldNumStr = "";
    for (int eachField : fieldNums) {
        fieldNumStr += eachField + ",";
    }
    jobConf.set("uploadJob.fieldNums", fieldNumStr);
    jobConf.setBoolean("uploadJob.keyValuePair", keyValuePair);

    jobConf.setMapperClass(UploadMap.class);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.setMapSpeculativeExecution(false);
    jobConf.setMaxMapAttempts(0);
    // </Map>

    // <Reduce>
    Path tempOutputPath = new Path("temp/uploadJob/" + tableName + "/reducer");
    FileOutputFormat.setOutputPath(jobConf, tempOutputPath);
    jobConf.setNumReduceTasks(0);
    // </Reduce>

    try {
        JobClient.runJob(jobConf);
    } finally {
        FileSystem fs = FileSystem.get(jobConf);
        FileUtil.delete(fs, tempOutputPath, true);
        CloudataMapReduceUtil.clearMapReduce(libDir);
    }
}
From source file:org.commoncrawl.util.JobBuilder.java
License:Open Source License
/**
 * Add inputs to the job config.
 *
 * @param inputs the input paths to add
 * @return this builder, for chaining
 * @throws IOException
 */
public JobBuilder inputs(List<Path> inputs) throws IOException {
    for (Path input : inputs) {
        FileInputFormat.addInputPath(_jobConf, input);
    }
    return this;
}
From source file:org.commoncrawl.util.JobBuilder.java
License:Open Source License
/**
 * Add a single input file to the job config.
 *
 * @param input the input path to add
 * @return this builder, for chaining
 * @throws IOException
 */
public JobBuilder input(Path input) throws IOException {
    FileInputFormat.addInputPath(_jobConf, input);
    return this;
}
From source file:org.dkpro.bigdata.hadoop.DkproHadoopDriver.java
License:Apache License
/**
 * Runs the UIMA pipeline.
 *
 * @return 0 if Hadoop job succeeded, 1 if job failed, 2 if it was killed, otherwise 3
 *
 * @see org.apache.hadoop.util.Tool#run(java.lang.String[])
 */
@Override
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.out.println(
                "Usage: " + this.getClass().getSimpleName() + " [hadoop-params] input output [job-params]");
        System.exit(1);
    }
    this.job = new JobConf(getConf(), DkproHadoopDriver.class);
    final FileSystem fs = FileSystem.get(this.job);
    // set the factory class name
    this.job.set("dkpro.uima.factory", this.getClass().getName());

    Path inputPath;
    if (args[0].contains(",")) {
        String[] inputPaths = args[0].split(",");
        inputPath = new Path(inputPaths[0]);
        for (String path : inputPaths) {
            FileInputFormat.addInputPath(job, new Path(path));
        }
    }
    else {
        inputPath = new Path(args[0]); // input
        FileInputFormat.setInputPaths(this.job, inputPath);
    }

    String outDir = args[1];
    if (!getConf().getBoolean("dkpro.output.overwrite", true)) {
        outDir = getUniqueDirectoryName(outDir, fs);
    }
    final Path outputPath = new Path(outDir); // output

    final CollectionReader reader = buildCollectionReader();
    // if a collection reader was defined, import data into hdfs
    // try {
    //     final Class<?> c = Class.forName("org.apache.hadoop.io.compress.SnappyCodec");
    //     FileOutputFormat.setOutputCompressorClass(this.job,
    //             (Class<? extends CompressionCodec>) c);
    // }
    // catch (final Exception e) {
    // }
    if (reader != null) {
        final AnalysisEngine xcasWriter = AnalysisEngineFactory.createEngine(
                CASWritableSequenceFileWriter.class, // createTypeSystemDescription(),
                CASWritableSequenceFileWriter.PARAM_PATH, inputPath.toString(),
                CASWritableSequenceFileWriter.PARAM_COMPRESS, true,
                CASWritableSequenceFileWriter.PARAM_FS, job.get(("fs.default.name"), "file:/"));
        runPipeline(reader, xcasWriter);
    }
    // cleanup previous output
    fs.delete(outputPath, true);

    // this is a sensible default for the UKP cluster
    // int numMappers = 256;
    // if (args.length > 2) {
    //     numMappers = Integer.parseInt(args[2]);
    // }

    FileOutputFormat.setOutputPath(this.job, outputPath);
    // SequenceFileOutputFormat.setCompressOutput(this.job, true);
    if (this.job.get("mapred.output.compress") == null) {
        this.job.setBoolean("mapred.output.compress", true);
    }
    // Just in case compression is on
    this.job.set("mapred.output.compression.type", "BLOCK");

    if (this.job.getBoolean("dkpro.output.writecas", true)) {
        if (this.job.getBoolean("dkpro.output.plaintext", false)) {
            this.job.setOutputFormat(TextOutputFormat.class);
        }
        else {
            this.job.setOutputFormat(SequenceFileOutputFormat.class);
        }
    }
    else {
        job.setOutputFormat(NullOutputFormat.class);
    }
    // this.job.set("mapred.output.compression.codec",
    //         "org.apache.hadoop.io.compress.GzipCodec"); // use compression

    // setup some sensible defaults
    this.job.setMapperClass(this.mapperClass);
    this.job.setReducerClass(this.reducerClass);
    if (getInputFormatClass() != null) {
        this.job.setInputFormat(getInputFormatClass());
    }
    else {
        this.job.setInputFormat(SequenceFileInputFormat.class);
    }
    // this.job.setOutputFormat(TextOutputFormat.class);
    this.job.setMapOutputKeyClass(Text.class);
    this.job.setMapOutputValueClass(BinCasWithTypeSystemWritable.class);
    this.job.setOutputKeyClass(Text.class);
    this.job.setOutputValueClass(BinCasWithTypeSystemWritable.class);
    this.job.setJobName(this.getClass().getSimpleName());
    // this.job.set("mapred.child.java.opts", "-Xmx1g");
    // this.job.setInt("mapred.job.map.memory.mb", 1280);
    this.job.setInt("mapred.job.reduce.memory.mb", 1280);
    // this.job.setNumMapTasks(numMappers);
    this.job.setNumReduceTasks(0);
    configure(this.job);

    // create symlinks for distributed resources
    DistributedCache.createSymlink(this.job);
    // sLogger.info("Running job " + job.getJobName());
    RunningJob runningJob = JobClient.runJob(this.job);
    runningJob.waitForCompletion();
    int status = runningJob.getJobState();

    if (status == JobStatus.SUCCEEDED) {
        return 0;
    }
    else if (status == JobStatus.FAILED) {
        return 1;
    }
    else if (status == JobStatus.KILLED) {
        return 2;
    }
    else {
        return 3;
    }
}
From source file:org.gbif.ocurrence.index.solr.ConfTester.java
License:Apache License
public JobConf setupJobConf(int numMapper, int numReducer, long mapSleepTime, int mapSleepCount,
        long reduceSleepTime, int reduceSleepCount) {
    JobConf job = new JobConf(getConf(), ConfTester.class);
    job.setNumMapTasks(numMapper);
    job.setNumReduceTasks(numReducer);
    job.setMapperClass(ConfTester.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setReducerClass(ConfTester.class);
    job.setOutputFormat(NullOutputFormat.class);
    job.setInputFormat(SleepInputFormat.class);
    job.setPartitionerClass(ConfTester.class);
    job.setSpeculativeExecution(false);
    job.setJobName("Sleep job");
    // SleepInputFormat generates its own splits, so the input path is never read.
    FileInputFormat.addInputPath(job, new Path("ignored"));
    job.setLong("sleep.job.map.sleep.time", mapSleepTime);
    job.setLong("sleep.job.reduce.sleep.time", reduceSleepTime);
    job.setInt("sleep.job.map.sleep.count", mapSleepCount);
    job.setInt("sleep.job.reduce.sleep.count", reduceSleepCount);
    return job;
}
From source file:org.hadoop.tdg.MaxTemperatureDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), getClass());
    conf.setJobName("Max temperature");

    // Input from the first argument, output to the second.
    FileInputFormat.addInputPath(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MaxTemperatureMapper.class);
    conf.setCombinerClass(MaxTemperatureReducer.class);
    conf.setReducerClass(MaxTemperatureReducer.class);

    JobClient.runJob(conf);
    return 0;
}
From source file:org.lamapacos.preprocessor.filter.SegmentFilter.java
License:Apache License
public void dump(Path segmentHome, Path output) throws IOException {
    if (LOG.isInfoEnabled()) {
        LOG.info("SegmentFilter: filtering segment: " + segmentHome);
    }

    JobConf job = createJobConf();
    job.setJobName("filter " + segmentHome);

    FileStatus[] paths = fs.listStatus(segmentHome);
    for (FileStatus path : paths) {
        if (path.isDir()) {
            FileInputFormat.addInputPath(job,
                    new Path(segmentHome, new Path(path.getPath(), Content.DIR_NAME)));
        }
    }

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(InputCompatMapper.class);
    job.setReducerClass(SegmentFilter.class);

    // Path tempDir = new Path(job.get("hadoop.tmp.dir", "/tmp") + "/segfilter-"
    //         + new java.util.Random().nextInt());
    // fs.delete(tempDir, true);

    FileOutputFormat.setOutputPath(job, output);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);

    JobClient.runJob(job);

    if (LOG.isInfoEnabled()) {
        LOG.info("SegmentFilter: done");
    }
}