List of usage examples for org.apache.hadoop.mapred FileInputFormat addInputPath
public static void addInputPath(JobConf conf, Path path)
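Before the real-world examples below, a minimal, self-contained sketch of the basic pattern: addInputPath appends a path to the job's input list, so it can be called once per input location before the job is submitted. The class name and argument layout here are illustrative, not taken from any of the examples.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class AddInputPathSketch {
    public static void main(String[] args) throws IOException {
        JobConf conf = new JobConf(AddInputPathSketch.class);
        conf.setJobName("add-input-path-sketch");

        // addInputPath appends to the job's input path list, so calling it
        // repeatedly reads from several locations in one job:
        FileInputFormat.addInputPath(conf, new Path(args[0]));
        FileInputFormat.addInputPath(conf, new Path(args[1]));
        FileOutputFormat.setOutputPath(conf, new Path(args[2]));

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        // With no mapper/reducer classes set, the old mapred API runs the
        // identity mapper and identity reducer, so this just copies records.
        JobClient.runJob(conf);
    }
}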
From source file:uk.bl.wa.hadoop.mapreduce.warcstats.WARCRawStatsMDXGenerator.java
License:Open Source License
/**
 * @param conf
 * @param args
 * @throws IOException
 * @throws ParseException
 * @throws InterruptedException
 * @throws KeeperException
 */
protected void createJobConf(JobConf conf, String[] args)
        throws IOException, ParseException, KeeperException, InterruptedException {
    // Parse the command-line parameters.
    this.setup(args, conf);

    // Add input paths, one per line of the input file:
    LOG.info("Reading input files...");
    String line = null;
    BufferedReader br = new BufferedReader(new FileReader(this.inputPath));
    while ((line = br.readLine()) != null) {
        FileInputFormat.addInputPath(conf, new Path(line));
    }
    br.close();
    LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files.");
    FileOutputFormat.setOutputPath(conf, new Path(this.outputPath));

    conf.setJobName(this.inputPath + "_" + System.currentTimeMillis());
    conf.setInputFormat(ArchiveFileInputFormat.class);
    conf.setMapperClass(WARCRawStatsMapper.class);
    conf.setReducerClass(MDXReduplicatingReducer.class);
    conf.setOutputFormat(TextOutputFormat.class);
    // conf.set("map.output.key.field.separator", "");

    // Compress the output from the maps, to cut down temp space requirements
    // between map and reduce. The "mapreduce.*" name is for newer APIs; the
    // 0.20.x property name is set below.
    conf.setBoolean("mapreduce.map.output.compress", true);
    conf.set("mapred.compress.map.output", "true");
    // conf.set("mapred.map.output.compression.codec",
    //         "org.apache.hadoop.io.compress.GzipCodec");

    // Ensure the JARs we provide take precedence over ones from Hadoop:
    conf.setBoolean("mapreduce.task.classpath.user.precedence", true);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setNumReduceTasks(numReducers);
}
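The compression settings in the example above mix property names from different Hadoop generations, which the original comments flag. As a hedged aside (not part of the WARCRawStatsMDXGenerator source), the old mapred JobConf also offers typed setters that sidestep the string property names entirely; GzipCodec is just one codec choice:

import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.JobConf;

public class MapOutputCompressionSketch {
    public static void configure(JobConf conf) {
        // Equivalent to setting "mapred.compress.map.output" to "true":
        conf.setCompressMapOutput(true);
        // Equivalent to setting "mapred.map.output.compression.codec":
        conf.setMapOutputCompressorClass(GzipCodec.class);
    }
}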
From source file:uk.bl.wa.hadoop.mapreduce.warcstats.WARCStatsTool.java
License:Open Source License
protected void createJobConf(JobConf conf, String[] args) throws IOException {
    // Store application properties where the mappers/reducers can access them:
    Config index_conf = ConfigFactory.load();
    log.info("Loaded warc config.");

    // Turn speculative execution off for the reducers:
    conf.set("mapred.reduce.tasks.speculative.execution", "false");

    // Reducer count depends on the number of concurrent HTTP connections the
    // Solr server can handle:
    int numReducers = 1;
    try {
        numReducers = index_conf.getInt("warc.hadoop.num_reducers");
    } catch (ConfigException e) {
        // Typesafe Config throws ConfigException (not NumberFormatException)
        // when the key is missing or has the wrong type.
        numReducers = 10;
    }

    // Add input paths, one per line of the input file:
    log.info("Reading input files...");
    String line = null;
    BufferedReader br = new BufferedReader(new FileReader(args[0]));
    while ((line = br.readLine()) != null) {
        FileInputFormat.addInputPath(conf, new Path(line));
    }
    br.close();
    log.info("Read input files.");
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    conf.setJobName(args[0] + "_" + System.currentTimeMillis());
    conf.setInputFormat(ArchiveFileInputFormat.class);
    conf.setMapperClass(WARCStatsMapper.class);
    conf.setReducerClass(FrequencyCountingReducer.class);
    conf.setOutputFormat(KeylessTextOutputFormat.class);
    conf.set("map.output.key.field.separator", "");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setNumReduceTasks(numReducers);
}
From source file:uk.bl.wa.hadoop.outlinks.OutlinkExtractor.java
License:Open Source License
public int run(String[] args) throws IOException {
    JobConf conf = new JobConf(getConf(), OutlinkExtractor.class);

    // Add input paths, one per line of the input file:
    String line = null;
    BufferedReader br = new BufferedReader(new FileReader(args[0]));
    while ((line = br.readLine()) != null) {
        FileInputFormat.addInputPath(conf, new Path(line));
    }
    br.close();
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    conf.setJobName(args[0] + "_" + System.currentTimeMillis());
    conf.setInputFormat(ArchiveFileInputFormat.class);
    conf.setMapperClass(OutlinkExtractorMapper.class);
    conf.setReducerClass(FrequencyCountingReducer.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    // Submit asynchronously; JobClient.runJob(conf) would block until the
    // job completes.
    new JobClient(conf).submitJob(conf);
    return 0;
}
From source file:voldemort.store.readonly.mr.utils.HadoopUtils.java
License:Apache License
public static JobConf addAllSubPaths(JobConf conf, Path path) throws IOException {
    if (shouldPathBeIgnored(path)) {
        throw new IllegalArgumentException(String.format("Path[%s] should be ignored.", path));
    }

    final FileSystem fs = path.getFileSystem(conf);
    if (fs.exists(path)) {
        for (FileStatus status : fs.listStatus(path)) {
            if (!shouldPathBeIgnored(status.getPath())) {
                if (status.isDir()) {
                    // Recurse into sub-directories...
                    addAllSubPaths(conf, status.getPath());
                } else {
                    // ...and register every regular file as an input path.
                    FileInputFormat.addInputPath(conf, status.getPath());
                }
            }
        }
    }
    return conf;
}
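For completeness, a hedged sketch of how a driver might call this helper; the HadoopUtils method is the one shown above, but the surrounding class and the directory arguments are hypothetical:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

import voldemort.store.readonly.mr.utils.HadoopUtils;

public class AddAllSubPathsUsage {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf();
        // Register every non-ignored file found under each argument
        // directory; addAllSubPaths walks the directories recursively.
        for (String dir : args) {
            HadoopUtils.addAllSubPaths(conf, new Path(dir));
        }
        // ... then set formats, mapper/reducer, output path, and submit.
    }
}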