Example usage for org.apache.hadoop.mapred FileInputFormat getInputPaths

Introduction

On this page you can find example usages of org.apache.hadoop.mapred.FileInputFormat.getInputPaths.

Prototype

public static Path[] getInputPaths(JobConf conf) 

Document

Get the list of input Paths for the map-reduce job.
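
A minimal, self-contained sketch of typical use (the class name and paths below are illustrative assumptions, not taken from the examples):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;

public class InputPathsExample {
    public static void main(String[] args) {
        JobConf conf = new JobConf();
        // Register input paths one at a time (hypothetical HDFS locations):
        FileInputFormat.addInputPath(conf, new Path("/data/warcs/part-1"));
        FileInputFormat.addInputPath(conf, new Path("/data/warcs/part-2"));

        // getInputPaths returns every path registered so far.
        Path[] inputs = FileInputFormat.getInputPaths(conf);
        System.out.println("Configured " + inputs.length + " input paths:");
        for (Path p : inputs) {
            System.out.println(p);
        }
    }
}

As the usage examples below show, getInputPaths is typically used to log or validate how many inputs were registered before submitting the job.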

Usage

From source file: uk.bl.wa.hadoop.datasets.WARCDatasetGenerator.java

License: Open Source License

/**
 * @param conf the job configuration to populate
 * @param args command-line arguments
 * @throws IOException
 * @throws ParseException
 * @throws InterruptedException
 * @throws KeeperException
 */
protected void createJobConf(JobConf conf, String[] args)
        throws IOException, ParseException, KeeperException, InterruptedException {
    // Parse the command-line parameters.
    this.setup(args, conf);

    // Store application properties where the mappers/reducers can access
    // them
    Config index_conf;
    if (this.configPath != null) {
        index_conf = ConfigFactory.parseFile(new File(this.configPath));
    } else {
        index_conf = ConfigFactory.load();
    }
    if (this.dumpConfig) {
        ConfigPrinter.print(index_conf);
        System.exit(0);
    }
    // Store the properties:
    conf.set(CONFIG_PROPERTIES, index_conf.withOnlyPath("warc").root().render(ConfigRenderOptions.concise()));
    LOG.info("Loaded warc config.");
    LOG.info(index_conf.getString("warc.title"));

    // Reducer count
    int numReducers = 1;
    try {
        numReducers = index_conf.getInt("warc.hadoop.num_reducers");
    } catch (ConfigException e) {
        // Config.getInt() throws ConfigException (not NumberFormatException)
        // when the key is missing or not an integer.
        numReducers = 10;
    }

    // Add input paths:
    LOG.info("Reading input files...");
    // Read one input path per line; try-with-resources ensures the
    // reader is closed even if reading fails.
    try (BufferedReader br = new BufferedReader(new FileReader(this.inputPath))) {
        String line;
        while ((line = br.readLine()) != null) {
            FileInputFormat.addInputPath(conf, new Path(line));
        }
    }
    LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files.");

    FileOutputFormat.setOutputPath(conf, new Path(this.outputPath));

    conf.setJobName(this.inputPath + "_" + System.currentTimeMillis());
    conf.setInputFormat(ArchiveFileInputFormat.class);
    conf.setMapperClass(WARCDatasetMapper.class);
    conf.setReducerClass(FrequencyCountingReducer.class);
    // This can optionally be used to suppress keys:
    // conf.setOutputFormat(KeylessTextOutputFormat.class);
    // conf.set( "map.output.key.field.separator", "" );

    // Compress the output from the maps, to cut down temp space
    // requirements between map and reduce.
    conf.setBoolean("mapreduce.map.output.compress", true); // Wrong syntax
    // for 0.20.x ?
    conf.set("mapred.compress.map.output", "true");
    // conf.set("mapred.map.output.compression.codec",
    // "org.apache.hadoop.io.compress.GzipCodec");
    // Ensure the JARs we provide take precedence over ones from Hadoop:
    conf.setBoolean("mapreduce.task.classpath.user.precedence", true);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setNumReduceTasks(numReducers);

    MultipleOutputs.addMultiNamedOutput(conf, FORMATS_SUMMARY_NAME, TextOutputFormat.class, Text.class,
            Text.class);
    MultipleOutputs.addMultiNamedOutput(conf, FORMATS_FFB_NAME, TextOutputFormat.class, Text.class, Text.class);
    MultipleOutputs.addMultiNamedOutput(conf, HOSTS_NAME, TextOutputFormat.class, Text.class, Text.class);
    MultipleOutputs.addMultiNamedOutput(conf, HOST_LINKS_NAME, TextOutputFormat.class, Text.class, Text.class);
    MultipleOutputs.addMultiNamedOutput(conf, GEO_SUMMARY_NAME, TextOutputFormat.class, Text.class, Text.class);
    MultipleOutputs.addMultiNamedOutput(conf, FACES_NAME, TextOutputFormat.class, Text.class, Text.class);

}

From source file: uk.bl.wa.hadoop.indexer.mdx.MDXSeqSampleGenerator.java

License: Open Source License

/**
 * @param conf the job configuration to populate
 * @param args command-line arguments
 * @throws IOException
 * @throws ParseException
 * @throws InterruptedException
 * @throws KeeperException
 */
protected void createJobConf(JobConf conf, String[] args)
        throws IOException, ParseException, KeeperException, InterruptedException {
    // Parse the command-line parameters.
    this.setup(args, conf);

    // Add input paths:
    LOG.info("Reading input files...");
    // Read one input path per line; try-with-resources ensures the
    // reader is closed even if reading fails.
    try (BufferedReader br = new BufferedReader(new FileReader(this.inputPath))) {
        String line;
        while ((line = br.readLine()) != null) {
            FileInputFormat.addInputPath(conf, new Path(line));
        }
    }
    LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files.");

    FileOutputFormat.setOutputPath(conf, new Path(this.outputPath));

    conf.setJobName(this.inputPath + "_" + System.currentTimeMillis());
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapperClass(MDXSeqSampleMapper.class);
    conf.setReducerClass(ReservoirSamplingReducer.class);
    conf.setOutputFormat(KeylessTextOutputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setNumReduceTasks(numReducers);

    MultipleOutputs.addMultiNamedOutput(conf, GEO_NAME, KeylessTextOutputFormat.class, Text.class, Text.class);

    MultipleOutputs.addMultiNamedOutput(conf, FORMATS_FFB_SAMPLE_NAME, KeylessTextOutputFormat.class,
            Text.class, Text.class);

    KeylessTextOutputFormat.setCompressOutput(conf, true);
    KeylessTextOutputFormat.setOutputCompressorClass(conf, GzipCodec.class);
}

From source file: uk.bl.wa.hadoop.indexer.mdx.MDXSeqStatsGenerator.java

License: Open Source License

/**
 * @param conf the job configuration to populate
 * @param args command-line arguments
 * @throws IOException
 * @throws ParseException
 * @throws InterruptedException
 * @throws KeeperException
 */
protected void createJobConf(JobConf conf, String[] args)
        throws IOException, ParseException, KeeperException, InterruptedException {
    // Parse the command-line parameters.
    this.setup(args, conf);

    // Add input paths:
    LOG.info("Reading input files...");
    // Read one input path per line; try-with-resources ensures the
    // reader is closed even if reading fails.
    try (BufferedReader br = new BufferedReader(new FileReader(this.inputPath))) {
        String line;
        while ((line = br.readLine()) != null) {
            FileInputFormat.addInputPath(conf, new Path(line));
        }
    }
    LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files.");

    FileOutputFormat.setOutputPath(conf, new Path(this.outputPath));

    conf.setJobName(this.inputPath + "_" + System.currentTimeMillis());
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapperClass(MDXSeqStatsMapper.class);
    conf.setReducerClass(FrequencyCountingReducer.class);
    conf.setOutputFormat(KeylessTextOutputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setNumReduceTasks(numReducers);

    MultipleOutputs.addMultiNamedOutput(conf, FORMATS_SUMMARY_NAME, KeylessTextOutputFormat.class, Text.class,
            Text.class);
    MultipleOutputs.addMultiNamedOutput(conf, FORMATS_FFB_NAME, KeylessTextOutputFormat.class, Text.class,
            Text.class);
    MultipleOutputs.addMultiNamedOutput(conf, HOST_LINKS_NAME, KeylessTextOutputFormat.class, Text.class,
            Text.class);
    MultipleOutputs.addMultiNamedOutput(conf, GEO_SUMMARY_NAME, KeylessTextOutputFormat.class, Text.class,
            Text.class);

    KeylessTextOutputFormat.setCompressOutput(conf, true);
    KeylessTextOutputFormat.setOutputCompressorClass(conf, GzipCodec.class);
}

From source file: uk.bl.wa.hadoop.indexer.mdx.WARCMDXGenerator.java

License: Open Source License

/**
 * @param conf the job configuration to populate
 * @param args command-line arguments
 * @throws IOException
 * @throws ParseException
 * @throws InterruptedException
 * @throws KeeperException
 */
protected void createJobConf(JobConf conf, String[] args)
        throws IOException, ParseException, KeeperException, InterruptedException {
    // Parse the command-line parameters.
    this.setup(args, conf);

    // Store application properties where the mappers/reducers can access
    // them
    Config index_conf;
    if (this.configPath != null) {
        LOG.info("Loading config from: " + configPath);
        index_conf = ConfigFactory.parseFile(new File(this.configPath));
    } else {
        LOG.info("Using default config: mdx");
        index_conf = ConfigFactory.load("mdx");
    }
    if (this.dumpConfig) {
        ConfigPrinter.print(index_conf);
        System.exit(0);
    }
    conf.set(CONFIG_PROPERTIES, index_conf.withOnlyPath("warc").root().render(ConfigRenderOptions.concise()));
    LOG.info("Loaded warc config: " + index_conf.getString("warc.title"));

    // Reducer count:
    int numReducers = 10;
    if (index_conf.hasPath(WARC_HADOOP_NUM_REDUCERS)) {
        numReducers = index_conf.getInt(WARC_HADOOP_NUM_REDUCERS);
    }
    if (conf.getInt(WARC_HADOOP_NUM_REDUCERS, -1) != -1) {
        LOG.info("Overriding num_reducers using Hadoop config.");
        numReducers = conf.getInt(WARC_HADOOP_NUM_REDUCERS, numReducers);
    }

    // Add input paths:
    LOG.info("Reading input files...");
    // Read one input path per line; try-with-resources ensures the
    // reader is closed even if reading fails.
    try (BufferedReader br = new BufferedReader(new FileReader(this.inputPath))) {
        String line;
        while ((line = br.readLine()) != null) {
            FileInputFormat.addInputPath(conf, new Path(line));
        }
    }
    LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files.");

    FileOutputFormat.setOutputPath(conf, new Path(this.outputPath));

    conf.setJobName(this.inputPath + "_" + System.currentTimeMillis());
    conf.setInputFormat(ArchiveFileInputFormat.class);
    conf.setMapperClass(WARCMDXMapper.class);
    conf.setReducerClass(MDXReduplicatingReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    // Alternatively, use TextOutputFormat, or enable block compression:
    // conf.setOutputFormat(TextOutputFormat.class);
    // SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK);
    // conf.set("map.output.key.field.separator", "");
    // Compress the output from the maps, to cut down temp space
    // requirements between map and reduce.
    conf.setBoolean("mapreduce.map.output.compress", true); // Wrong syntax
    // for 0.20.x ?
    conf.set("mapred.compress.map.output", "true");
    // conf.set("mapred.map.output.compression.codec",
    // "org.apache.hadoop.io.compress.GzipCodec");
    // Ensure the JARs we provide take precedence over ones from Hadoop:
    conf.setBoolean("mapreduce.task.classpath.user.precedence", true);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setNumReduceTasks(numReducers);
}

From source file: uk.bl.wa.hadoop.indexer.WARCIndexerRunner.java

License: Open Source License

/**
 * @param conf the job configuration to populate
 * @param args command-line arguments
 * @throws IOException
 * @throws ParseException
 * @throws InterruptedException
 * @throws KeeperException
 */
protected void createJobConf(JobConf conf, String[] args)
        throws IOException, ParseException, KeeperException, InterruptedException {
    // Parse the command-line parameters.
    this.setup(args, conf);

    // Store application properties where the mappers/reducers can access
    // them
    Config index_conf;
    if (this.configPath != null) {
        index_conf = ConfigFactory.parseFile(new File(this.configPath));
    } else {
        index_conf = ConfigFactory.load();
    }
    if (this.dumpConfig) {
        ConfigPrinter.print(index_conf);
        System.exit(0);
    }
    // Decide whether to apply annotations:
    index_conf = index_conf.withValue(CONFIG_APPLY_ANNOTATIONS,
            ConfigValueFactory.fromAnyRef(applyAnnotations));
    // Store the properties:
    conf.set(CONFIG_PROPERTIES, index_conf.withOnlyPath("warc").root().render(ConfigRenderOptions.concise()));
    LOG.info("Loaded warc config.");
    LOG.info(index_conf.getString("warc.title"));
    if (index_conf.getBoolean("warc.solr.use_hash_url_id")) {
        LOG.info("Using hash-based ID.");
    }
    if (index_conf.hasPath("warc.solr.zookeepers")) {
        LOG.info("Using Zookeepers.");
    } else {
        LOG.info("Using SolrServers.");
    }

    // Also set reduce speculative execution off, avoiding duplicate
    // submissions to Solr.
    conf.set("mapred.reduce.tasks.speculative.execution", "false");

    // Reducer count dependent on concurrent HTTP connections to Solr
    // server.
    int numReducers = 1;
    try {
        numReducers = index_conf.getInt("warc.hadoop.num_reducers");
    } catch (ConfigException e) {
        // Config.getInt() throws ConfigException (not NumberFormatException)
        // when the key is missing or not an integer.
        numReducers = 10;
    }

    // Add input paths:
    LOG.info("Reading input files...");
    // Read one input path per line; try-with-resources ensures the
    // reader is closed even if reading fails.
    try (BufferedReader br = new BufferedReader(new FileReader(this.inputPath))) {
        String line;
        while ((line = br.readLine()) != null) {
            FileInputFormat.addInputPath(conf, new Path(line));
        }
    }
    LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files.");

    FileOutputFormat.setOutputPath(conf, new Path(this.outputPath));

    conf.setJobName(this.inputPath + "_" + System.currentTimeMillis());
    conf.setInputFormat(ArchiveFileInputFormat.class);
    conf.setMapperClass(WARCIndexerMapper.class);
    conf.setReducerClass(WARCIndexerReducer.class);
    conf.setOutputFormat(KeylessTextOutputFormat.class);
    conf.set("map.output.key.field.separator", "");
    // Compress the output from the maps, to cut down temp space
    // requirements between map and reduce.
    conf.setBoolean("mapreduce.map.output.compress", true); // Wrong syntax
    // for 0.20.x ?
    conf.set("mapred.compress.map.output", "true");
    // conf.set("mapred.map.output.compression.codec",
    // "org.apache.hadoop.io.compress.GzipCodec");
    // Ensure the JARs we provide take precedence over ones from Hadoop:
    conf.setBoolean("mapreduce.task.classpath.user.precedence", true);

    conf.setBoolean("mapred.output.oai-pmh", this.exportXml);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(WritableSolrRecord.class);
    conf.setNumReduceTasks(numReducers);
}

From source file: uk.bl.wa.hadoop.mapreduce.mdx.MDXMerger.java

License: Open Source License

/**
 * @param conf the job configuration to populate
 * @param args command-line arguments
 * @throws IOException
 * @throws ParseException
 * @throws InterruptedException
 * @throws KeeperException
 */
public void createJobConf(JobConf conf, String[] args)
        throws IOException, ParseException, KeeperException, InterruptedException {
    // Parse the command-line parameters.
    this.setup(args, conf);

    // Add input paths:
    LOG.info("Reading input files...");
    // Read one input path per line; try-with-resources ensures the
    // reader is closed even if reading fails.
    try (BufferedReader br = new BufferedReader(new FileReader(this.inputPath))) {
        String line;
        while ((line = br.readLine()) != null) {
            FileInputFormat.addInputPath(conf, new Path(line));
        }
    }
    LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files.");

    FileOutputFormat.setOutputPath(conf, new Path(this.outputPath));

    conf.setJobName(this.inputPath + "_" + System.currentTimeMillis());
    // Input
    conf.setInputFormat(TextInputFormat.class);
    // M-R
    conf.setMapperClass(IdentityMapper.class);
    conf.setReducerClass(MDXReduplicatingReducer.class);
    // Map outputs
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);
    // Job outputs
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setOutputFormat(TextOutputFormat.class);
    LOG.info("Used " + numReducers + " reducers.");
    conf.setNumReduceTasks(numReducers);

    // Compress the output from the maps, to cut down temp space
    // requirements between map and reduce.
    conf.setBoolean("mapreduce.map.output.compress", true); // Wrong syntax
                                                            // for 0.20.x ?
    conf.set("mapred.compress.map.output", "true");
    // conf.set("mapred.map.output.compression.codec",
    // "org.apache.hadoop.io.compress.GzipCodec");
    // Ensure the JARs we provide take precedence over ones from Hadoop:
    conf.setBoolean("mapreduce.task.classpath.user.precedence", true);

}

From source file: uk.bl.wa.hadoop.mapreduce.mdx.MDXSeqMerger.java

License: Open Source License

/**
 * @param conf the job configuration to populate
 * @param args command-line arguments
 * @throws IOException
 * @throws ParseException
 * @throws InterruptedException
 * @throws KeeperException
 */
public void createJobConf(JobConf conf, String[] args)
        throws IOException, ParseException, KeeperException, InterruptedException {
    // Parse the command-line parameters.
    this.setup(args, conf);

    // Add input paths:
    LOG.info("Reading input files...");
    // Read one input path per line; try-with-resources ensures the
    // reader is closed even if reading fails.
    try (BufferedReader br = new BufferedReader(new FileReader(this.inputPath))) {
        String line;
        while ((line = br.readLine()) != null) {
            FileInputFormat.addInputPath(conf, new Path(line));
        }
    }
    LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files.");

    FileOutputFormat.setOutputPath(conf, new Path(this.outputPath));

    conf.setJobName(this.inputPath + "_" + System.currentTimeMillis());
    // Input
    conf.setInputFormat(SequenceFileInputFormat.class);
    // M-R
    conf.setMapperClass(IdentityMapper.class);
    conf.setReducerClass(MDXReduplicatingReducer.class);
    // Map outputs
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);
    // Job outputs
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK);
    LOG.info("Used " + numReducers + " reducers.");
    conf.setNumReduceTasks(numReducers);

    // Compress the output from the maps, to cut down temp space
    // requirements between map and reduce.
    conf.setBoolean("mapreduce.map.output.compress", true); // Wrong syntax
                                                            // for 0.20.x ?
    conf.set("mapred.compress.map.output", "true");
    // conf.set("mapred.map.output.compression.codec",
    // "org.apache.hadoop.io.compress.GzipCodec");
    // Ensure the JARs we provide take precedence over ones from Hadoop:
    conf.setBoolean("mapreduce.task.classpath.user.precedence", true);

}

From source file: uk.bl.wa.hadoop.mapreduce.warcstats.WARCRawStatsMDXGenerator.java

License: Open Source License

/**
 * @param conf the job configuration to populate
 * @param args command-line arguments
 * @throws IOException
 * @throws ParseException
 * @throws InterruptedException
 * @throws KeeperException
 */
protected void createJobConf(JobConf conf, String[] args)
        throws IOException, ParseException, KeeperException, InterruptedException {
    // Parse the command-line parameters.
    this.setup(args, conf);

    // Add input paths:
    LOG.info("Reading input files...");
    // Read one input path per line; try-with-resources ensures the
    // reader is closed even if reading fails.
    try (BufferedReader br = new BufferedReader(new FileReader(this.inputPath))) {
        String line;
        while ((line = br.readLine()) != null) {
            FileInputFormat.addInputPath(conf, new Path(line));
        }
    }
    LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files.");

    FileOutputFormat.setOutputPath(conf, new Path(this.outputPath));

    conf.setJobName(this.inputPath + "_" + System.currentTimeMillis());
    conf.setInputFormat(ArchiveFileInputFormat.class);
    conf.setMapperClass(WARCRawStatsMapper.class);
    conf.setReducerClass(MDXReduplicatingReducer.class);
    conf.setOutputFormat(TextOutputFormat.class);
    // conf.set("map.output.key.field.separator", "");
    // Compress the output from the maps, to cut down temp space
    // requirements between map and reduce.
    conf.setBoolean("mapreduce.map.output.compress", true); // Wrong syntax
    // for 0.20.x ?
    conf.set("mapred.compress.map.output", "true");
    // conf.set("mapred.map.output.compression.codec",
    // "org.apache.hadoop.io.compress.GzipCodec");
    // Ensure the JARs we provide take precedence over ones from Hadoop:
    conf.setBoolean("mapreduce.task.classpath.user.precedence", true);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setNumReduceTasks(numReducers);
}