Usage examples for org.apache.hadoop.mapred.FileInputFormat.getInputPaths
public static Path[] getInputPaths(JobConf conf)
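Every example below follows the same pattern: read a local text file listing one HDFS path per line, register each path with FileInputFormat.addInputPath(), then call getInputPaths() to report how many paths were registered. A minimal, self-contained sketch of that pattern (the list-file name and class name are illustrative, not taken from any of the sources below):

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;

public class InputPathCountExample {
    public static void main(String[] args) throws IOException {
        JobConf conf = new JobConf();

        // Read one HDFS path per line from a local list file (file name is illustrative).
        BufferedReader br = new BufferedReader(new FileReader("input-paths.txt"));
        String line;
        while ((line = br.readLine()) != null) {
            FileInputFormat.addInputPath(conf, new Path(line));
        }
        br.close();

        // getInputPaths() returns every path registered on the JobConf so far.
        Path[] inputs = FileInputFormat.getInputPaths(conf);
        System.out.println("Registered " + inputs.length + " input paths.");
    }
}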
From source file:uk.bl.wa.hadoop.datasets.WARCDatasetGenerator.java
License:Open Source License
/**
 * @param args
 * @return
 * @throws IOException
 * @throws ParseException
 * @throws InterruptedException
 * @throws KeeperException
 */
protected void createJobConf(JobConf conf, String[] args)
        throws IOException, ParseException, KeeperException, InterruptedException {
    // Parse the command-line parameters.
    this.setup(args, conf);

    // Store application properties where the mappers/reducers can access them
    Config index_conf;
    if (this.configPath != null) {
        index_conf = ConfigFactory.parseFile(new File(this.configPath));
    } else {
        index_conf = ConfigFactory.load();
    }
    if (this.dumpConfig) {
        ConfigPrinter.print(index_conf);
        System.exit(0);
    }

    // Decide whether to apply annotations:
    // Store the properties:
    conf.set(CONFIG_PROPERTIES,
            index_conf.withOnlyPath("warc").root().render(ConfigRenderOptions.concise()));
    LOG.info("Loaded warc config.");
    LOG.info(index_conf.getString("warc.title"));

    // Reducer count
    int numReducers = 1;
    try {
        numReducers = index_conf.getInt("warc.hadoop.num_reducers");
    } catch (NumberFormatException n) {
        numReducers = 10;
    }

    // Add input paths:
    LOG.info("Reading input files...");
    String line = null;
    BufferedReader br = new BufferedReader(new FileReader(this.inputPath));
    while ((line = br.readLine()) != null) {
        FileInputFormat.addInputPath(conf, new Path(line));
    }
    br.close();
    LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files.");
    FileOutputFormat.setOutputPath(conf, new Path(this.outputPath));

    conf.setJobName(this.inputPath + "_" + System.currentTimeMillis());
    conf.setInputFormat(ArchiveFileInputFormat.class);
    conf.setMapperClass(WARCDatasetMapper.class);
    conf.setReducerClass(FrequencyCountingReducer.class);

    // This can optionally be used to suppress keys:
    // conf.setOutputFormat(KeylessTextOutputFormat.class);
    // conf.set("map.output.key.field.separator", "");

    // Compress the output from the maps, to cut down temp space
    // requirements between map and reduce.
    conf.setBoolean("mapreduce.map.output.compress", true); // Wrong syntax for 0.20.x?
    conf.set("mapred.compress.map.output", "true");
    // conf.set("mapred.map.output.compression.codec",
    //         "org.apache.hadoop.io.compress.GzipCodec");

    // Ensure the JARs we provide take precedence over ones from Hadoop:
    conf.setBoolean("mapreduce.task.classpath.user.precedence", true);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setNumReduceTasks(numReducers);

    MultipleOutputs.addMultiNamedOutput(conf, FORMATS_SUMMARY_NAME, TextOutputFormat.class, Text.class, Text.class);
    MultipleOutputs.addMultiNamedOutput(conf, FORMATS_FFB_NAME, TextOutputFormat.class, Text.class, Text.class);
    MultipleOutputs.addMultiNamedOutput(conf, HOSTS_NAME, TextOutputFormat.class, Text.class, Text.class);
    MultipleOutputs.addMultiNamedOutput(conf, HOST_LINKS_NAME, TextOutputFormat.class, Text.class, Text.class);
    MultipleOutputs.addMultiNamedOutput(conf, GEO_SUMMARY_NAME, TextOutputFormat.class, Text.class, Text.class);
    MultipleOutputs.addMultiNamedOutput(conf, FACES_NAME, TextOutputFormat.class, Text.class, Text.class);
}
From source file:uk.bl.wa.hadoop.indexer.mdx.MDXSeqSampleGenerator.java
License:Open Source License
/**
 * @param args
 * @return
 * @throws IOException
 * @throws ParseException
 * @throws InterruptedException
 * @throws KeeperException
 */
protected void createJobConf(JobConf conf, String[] args)
        throws IOException, ParseException, KeeperException, InterruptedException {
    // Parse the command-line parameters.
    this.setup(args, conf);

    // Add input paths:
    LOG.info("Reading input files...");
    String line = null;
    BufferedReader br = new BufferedReader(new FileReader(this.inputPath));
    while ((line = br.readLine()) != null) {
        FileInputFormat.addInputPath(conf, new Path(line));
    }
    br.close();
    LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files.");
    FileOutputFormat.setOutputPath(conf, new Path(this.outputPath));

    conf.setJobName(this.inputPath + "_" + System.currentTimeMillis());
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapperClass(MDXSeqSampleMapper.class);
    conf.setReducerClass(ReservoirSamplingReducer.class);
    conf.setOutputFormat(KeylessTextOutputFormat.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setNumReduceTasks(numReducers);

    MultipleOutputs.addMultiNamedOutput(conf, GEO_NAME, KeylessTextOutputFormat.class, Text.class, Text.class);
    MultipleOutputs.addMultiNamedOutput(conf, FORMATS_FFB_SAMPLE_NAME, KeylessTextOutputFormat.class, Text.class, Text.class);

    KeylessTextOutputFormat.setCompressOutput(conf, true);
    KeylessTextOutputFormat.setOutputCompressorClass(conf, GzipCodec.class);
}
From source file:uk.bl.wa.hadoop.indexer.mdx.MDXSeqStatsGenerator.java
License:Open Source License
/**
 * @param args
 * @return
 * @throws IOException
 * @throws ParseException
 * @throws InterruptedException
 * @throws KeeperException
 */
protected void createJobConf(JobConf conf, String[] args)
        throws IOException, ParseException, KeeperException, InterruptedException {
    // Parse the command-line parameters.
    this.setup(args, conf);

    // Add input paths:
    LOG.info("Reading input files...");
    String line = null;
    BufferedReader br = new BufferedReader(new FileReader(this.inputPath));
    while ((line = br.readLine()) != null) {
        FileInputFormat.addInputPath(conf, new Path(line));
    }
    br.close();
    LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files.");
    FileOutputFormat.setOutputPath(conf, new Path(this.outputPath));

    conf.setJobName(this.inputPath + "_" + System.currentTimeMillis());
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapperClass(MDXSeqStatsMapper.class);
    conf.setReducerClass(FrequencyCountingReducer.class);
    conf.setOutputFormat(KeylessTextOutputFormat.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setNumReduceTasks(numReducers);

    MultipleOutputs.addMultiNamedOutput(conf, FORMATS_SUMMARY_NAME, KeylessTextOutputFormat.class, Text.class, Text.class);
    MultipleOutputs.addMultiNamedOutput(conf, FORMATS_FFB_NAME, KeylessTextOutputFormat.class, Text.class, Text.class);
    MultipleOutputs.addMultiNamedOutput(conf, HOST_LINKS_NAME, KeylessTextOutputFormat.class, Text.class, Text.class);
    MultipleOutputs.addMultiNamedOutput(conf, GEO_SUMMARY_NAME, KeylessTextOutputFormat.class, Text.class, Text.class);

    KeylessTextOutputFormat.setCompressOutput(conf, true);
    KeylessTextOutputFormat.setOutputCompressorClass(conf, GzipCodec.class);
}
From source file:uk.bl.wa.hadoop.indexer.mdx.WARCMDXGenerator.java
License:Open Source License
/**
 * @param args
 * @return
 * @throws IOException
 * @throws ParseException
 * @throws InterruptedException
 * @throws KeeperException
 */
protected void createJobConf(JobConf conf, String[] args)
        throws IOException, ParseException, KeeperException, InterruptedException {
    // Parse the command-line parameters.
    this.setup(args, conf);

    // Store application properties where the mappers/reducers can access them
    Config index_conf;
    if (this.configPath != null) {
        LOG.info("Loading config from: " + configPath);
        index_conf = ConfigFactory.parseFile(new File(this.configPath));
    } else {
        LOG.info("Using default config: mdx");
        index_conf = ConfigFactory.load("mdx");
    }
    if (this.dumpConfig) {
        ConfigPrinter.print(index_conf);
        System.exit(0);
    }
    conf.set(CONFIG_PROPERTIES,
            index_conf.withOnlyPath("warc").root().render(ConfigRenderOptions.concise()));
    LOG.info("Loaded warc config: " + index_conf.getString("warc.title"));

    // Reducer count:
    int numReducers = 10;
    if (index_conf.hasPath(WARC_HADOOP_NUM_REDUCERS)) {
        numReducers = index_conf.getInt(WARC_HADOOP_NUM_REDUCERS);
    }
    if (conf.getInt(WARC_HADOOP_NUM_REDUCERS, -1) != -1) {
        LOG.info("Overriding num_reducers using Hadoop config.");
        numReducers = conf.getInt(WARC_HADOOP_NUM_REDUCERS, numReducers);
    }

    // Add input paths:
    LOG.info("Reading input files...");
    String line = null;
    BufferedReader br = new BufferedReader(new FileReader(this.inputPath));
    while ((line = br.readLine()) != null) {
        FileInputFormat.addInputPath(conf, new Path(line));
    }
    br.close();
    LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files.");
    FileOutputFormat.setOutputPath(conf, new Path(this.outputPath));

    conf.setJobName(this.inputPath + "_" + System.currentTimeMillis());
    conf.setInputFormat(ArchiveFileInputFormat.class);
    conf.setMapperClass(WARCMDXMapper.class);
    conf.setReducerClass(MDXReduplicatingReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    // conf.setOutputFormat(TextOutputFormat.class);
    // SequenceFileOutputFormat.setOutputCompressionType(conf,
    //         CompressionType.BLOCK);
    // OR TextOutputFormat?
    // conf.set("map.output.key.field.separator", "");

    // Compress the output from the maps, to cut down temp space
    // requirements between map and reduce.
    conf.setBoolean("mapreduce.map.output.compress", true); // Wrong syntax for 0.20.x?
    conf.set("mapred.compress.map.output", "true");
    // conf.set("mapred.map.output.compression.codec",
    //         "org.apache.hadoop.io.compress.GzipCodec");

    // Ensure the JARs we provide take precedence over ones from Hadoop:
    conf.setBoolean("mapreduce.task.classpath.user.precedence", true);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setNumReduceTasks(numReducers);
}
From source file:uk.bl.wa.hadoop.indexer.WARCIndexerRunner.java
License:Open Source License
/**
 * @param args
 * @return
 * @throws IOException
 * @throws ParseException
 * @throws InterruptedException
 * @throws KeeperException
 */
protected void createJobConf(JobConf conf, String[] args)
        throws IOException, ParseException, KeeperException, InterruptedException {
    // Parse the command-line parameters.
    this.setup(args, conf);

    // Store application properties where the mappers/reducers can access them
    Config index_conf;
    if (this.configPath != null) {
        index_conf = ConfigFactory.parseFile(new File(this.configPath));
    } else {
        index_conf = ConfigFactory.load();
    }
    if (this.dumpConfig) {
        ConfigPrinter.print(index_conf);
        System.exit(0);
    }

    // Decide whether to apply annotations:
    index_conf = index_conf.withValue(CONFIG_APPLY_ANNOTATIONS,
            ConfigValueFactory.fromAnyRef(applyAnnotations));

    // Store the properties:
    conf.set(CONFIG_PROPERTIES,
            index_conf.withOnlyPath("warc").root().render(ConfigRenderOptions.concise()));
    LOG.info("Loaded warc config.");
    LOG.info(index_conf.getString("warc.title"));
    if (index_conf.getBoolean("warc.solr.use_hash_url_id")) {
        LOG.info("Using hash-based ID.");
    }
    if (index_conf.hasPath("warc.solr.zookeepers")) {
        LOG.info("Using Zookeepers.");
    } else {
        LOG.info("Using SolrServers.");
    }

    // Also set reduce speculative execution off, avoiding duplicate
    // submissions to Solr.
    conf.set("mapred.reduce.tasks.speculative.execution", "false");

    // Reducer count dependent on concurrent HTTP connections to Solr server.
    int numReducers = 1;
    try {
        numReducers = index_conf.getInt("warc.hadoop.num_reducers");
    } catch (NumberFormatException n) {
        numReducers = 10;
    }

    // Add input paths:
    LOG.info("Reading input files...");
    String line = null;
    BufferedReader br = new BufferedReader(new FileReader(this.inputPath));
    while ((line = br.readLine()) != null) {
        FileInputFormat.addInputPath(conf, new Path(line));
    }
    br.close();
    LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files.");
    FileOutputFormat.setOutputPath(conf, new Path(this.outputPath));

    conf.setJobName(this.inputPath + "_" + System.currentTimeMillis());
    conf.setInputFormat(ArchiveFileInputFormat.class);
    conf.setMapperClass(WARCIndexerMapper.class);
    conf.setReducerClass(WARCIndexerReducer.class);
    conf.setOutputFormat(KeylessTextOutputFormat.class);
    conf.set("map.output.key.field.separator", "");

    // Compress the output from the maps, to cut down temp space
    // requirements between map and reduce.
    conf.setBoolean("mapreduce.map.output.compress", true); // Wrong syntax for 0.20.x?
    conf.set("mapred.compress.map.output", "true");
    // conf.set("mapred.map.output.compression.codec",
    //         "org.apache.hadoop.io.compress.GzipCodec");

    // Ensure the JARs we provide take precedence over ones from Hadoop:
    conf.setBoolean("mapreduce.task.classpath.user.precedence", true);
    conf.setBoolean("mapred.output.oai-pmh", this.exportXml);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(WritableSolrRecord.class);
    conf.setNumReduceTasks(numReducers);
}
From source file:uk.bl.wa.hadoop.mapreduce.mdx.MDXMerger.java
License:Open Source License
/**
 * @param args
 * @return
 * @throws IOException
 * @throws ParseException
 * @throws InterruptedException
 * @throws KeeperException
 */
public void createJobConf(JobConf conf, String[] args)
        throws IOException, ParseException, KeeperException, InterruptedException {
    // Parse the command-line parameters.
    this.setup(args, conf);

    // Add input paths:
    LOG.info("Reading input files...");
    String line = null;
    BufferedReader br = new BufferedReader(new FileReader(this.inputPath));
    while ((line = br.readLine()) != null) {
        FileInputFormat.addInputPath(conf, new Path(line));
    }
    br.close();
    LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files.");
    FileOutputFormat.setOutputPath(conf, new Path(this.outputPath));

    conf.setJobName(this.inputPath + "_" + System.currentTimeMillis());

    // Input
    conf.setInputFormat(TextInputFormat.class);

    // M-R
    conf.setMapperClass(IdentityMapper.class);
    conf.setReducerClass(MDXReduplicatingReducer.class);

    // Map outputs
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    // Job outputs
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setOutputFormat(TextOutputFormat.class);

    LOG.info("Used " + numReducers + " reducers.");
    conf.setNumReduceTasks(numReducers);

    // Compress the output from the maps, to cut down temp space
    // requirements between map and reduce.
    conf.setBoolean("mapreduce.map.output.compress", true); // Wrong syntax for 0.20.x?
    conf.set("mapred.compress.map.output", "true");
    // conf.set("mapred.map.output.compression.codec",
    //         "org.apache.hadoop.io.compress.GzipCodec");

    // Ensure the JARs we provide take precedence over ones from Hadoop:
    conf.setBoolean("mapreduce.task.classpath.user.precedence", true);
}
From source file:uk.bl.wa.hadoop.mapreduce.mdx.MDXSeqMerger.java
License:Open Source License
/**
 * @param args
 * @return
 * @throws IOException
 * @throws ParseException
 * @throws InterruptedException
 * @throws KeeperException
 */
public void createJobConf(JobConf conf, String[] args)
        throws IOException, ParseException, KeeperException, InterruptedException {
    // Parse the command-line parameters.
    this.setup(args, conf);

    // Add input paths:
    LOG.info("Reading input files...");
    String line = null;
    BufferedReader br = new BufferedReader(new FileReader(this.inputPath));
    while ((line = br.readLine()) != null) {
        FileInputFormat.addInputPath(conf, new Path(line));
    }
    br.close();
    LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files.");
    FileOutputFormat.setOutputPath(conf, new Path(this.outputPath));

    conf.setJobName(this.inputPath + "_" + System.currentTimeMillis());

    // Input
    conf.setInputFormat(SequenceFileInputFormat.class);

    // M-R
    conf.setMapperClass(IdentityMapper.class);
    conf.setReducerClass(MDXReduplicatingReducer.class);

    // Map outputs
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    // Job outputs
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK);

    LOG.info("Used " + numReducers + " reducers.");
    conf.setNumReduceTasks(numReducers);

    // Compress the output from the maps, to cut down temp space
    // requirements between map and reduce.
    conf.setBoolean("mapreduce.map.output.compress", true); // Wrong syntax for 0.20.x?
    conf.set("mapred.compress.map.output", "true");
    // conf.set("mapred.map.output.compression.codec",
    //         "org.apache.hadoop.io.compress.GzipCodec");

    // Ensure the JARs we provide take precedence over ones from Hadoop:
    conf.setBoolean("mapreduce.task.classpath.user.precedence", true);
}
From source file:uk.bl.wa.hadoop.mapreduce.warcstats.WARCRawStatsMDXGenerator.java
License:Open Source License
/**
 * @param args
 * @return
 * @throws IOException
 * @throws ParseException
 * @throws InterruptedException
 * @throws KeeperException
 */
protected void createJobConf(JobConf conf, String[] args)
        throws IOException, ParseException, KeeperException, InterruptedException {
    // Parse the command-line parameters.
    this.setup(args, conf);

    // Add input paths:
    LOG.info("Reading input files...");
    String line = null;
    BufferedReader br = new BufferedReader(new FileReader(this.inputPath));
    while ((line = br.readLine()) != null) {
        FileInputFormat.addInputPath(conf, new Path(line));
    }
    br.close();
    LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files.");
    FileOutputFormat.setOutputPath(conf, new Path(this.outputPath));

    conf.setJobName(this.inputPath + "_" + System.currentTimeMillis());
    conf.setInputFormat(ArchiveFileInputFormat.class);
    conf.setMapperClass(WARCRawStatsMapper.class);
    conf.setReducerClass(MDXReduplicatingReducer.class);
    conf.setOutputFormat(TextOutputFormat.class);
    // OR TextOutputFormat?
    // conf.set("map.output.key.field.separator", "");

    // Compress the output from the maps, to cut down temp space
    // requirements between map and reduce.
    conf.setBoolean("mapreduce.map.output.compress", true); // Wrong syntax for 0.20.x?
    conf.set("mapred.compress.map.output", "true");
    // conf.set("mapred.map.output.compression.codec",
    //         "org.apache.hadoop.io.compress.GzipCodec");

    // Ensure the JARs we provide take precedence over ones from Hadoop:
    conf.setBoolean("mapreduce.task.classpath.user.precedence", true);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setNumReduceTasks(numReducers);
}