Example usage for org.apache.hadoop.mapred FileInputFormat addInputPath

Introduction

On this page you can find example usages of org.apache.hadoop.mapred.FileInputFormat.addInputPath, collected from open-source projects.

Prototype

public static void addInputPath(JobConf conf, Path path) 

Document

Add a Path to the list of inputs for the map-reduce job.
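For orientation, here is a minimal, self-contained sketch of a job that calls addInputPath more than once. The paths, job name, and class name are hypothetical placeholders; with no mapper or reducer configured, Hadoop's identity classes pass each (offset, line) record straight through to the output.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

public class AddInputPathDemo {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(AddInputPathDemo.class);
        conf.setJobName("add-input-path-demo");

        // Each call appends one more path to the job's input list,
        // so a single job can read several directories or files.
        FileInputFormat.addInputPath(conf, new Path("/data/logs/2013")); // hypothetical path
        FileInputFormat.addInputPath(conf, new Path("/data/logs/2014")); // hypothetical path

        FileOutputFormat.setOutputPath(conf, new Path("/data/logs-merged"));

        JobClient.runJob(conf); // blocks until the job completes
    }
}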

Usage

From source file: uk.bl.wa.hadoop.mapreduce.warcstats.WARCRawStatsMDXGenerator.java

License: Open Source License

/**
 * Set up the JobConf for this WARC stats job.
 *
 * @param conf the job configuration to populate
 * @param args the command-line arguments
 * @throws IOException
 * @throws ParseException
 * @throws InterruptedException
 * @throws KeeperException
 */
protected void createJobConf(JobConf conf, String[] args)
        throws IOException, ParseException, KeeperException, InterruptedException {
    // Parse the command-line parameters.
    this.setup(args, conf);

    // Add input paths:
    LOG.info("Reading input files...");
    String line;
    try (BufferedReader br = new BufferedReader(new FileReader(this.inputPath))) {
        while ((line = br.readLine()) != null) {
            FileInputFormat.addInputPath(conf, new Path(line));
        }
    }
    LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files.");

    FileOutputFormat.setOutputPath(conf, new Path(this.outputPath));

    conf.setJobName(this.inputPath + "_" + System.currentTimeMillis());
    conf.setInputFormat(ArchiveFileInputFormat.class);
    conf.setMapperClass(WARCRawStatsMapper.class);
    conf.setReducerClass(MDXReduplicatingReducer.class);
    conf.setOutputFormat(TextOutputFormat.class);
    // conf.set("map.output.key.field.separator", "");
    // Compress the output from the maps, to cut down temp space
    // requirements between map and reduce.
    conf.setBoolean("mapreduce.map.output.compress", true); // Wrong syntax
    // for 0.20.x ?
    conf.set("mapred.compress.map.output", "true");
    // conf.set("mapred.map.output.compression.codec",
    // "org.apache.hadoop.io.compress.GzipCodec");
    // Ensure the JARs we provide take precedence over ones from Hadoop:
    conf.setBoolean("mapreduce.task.classpath.user.precedence", true);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setNumReduceTasks(numReducers);
}

From source file: uk.bl.wa.hadoop.mapreduce.warcstats.WARCStatsTool.java

License: Open Source License

protected void createJobConf(JobConf conf, String[] args) throws IOException {

    // Store application properties where the mappers/reducers can access them
    Config index_conf = ConfigFactory.load();
    log.info("Loaded warc config.");

    // Also set mapred speculative execution off:
    conf.set("mapred.reduce.tasks.speculative.execution", "false");

    // The reducer count is limited by the number of concurrent HTTP
    // connections the Solr server will accept.
    int numReducers = 1;
    try {
        numReducers = index_conf.getInt("warc.hadoop.num_reducers");
    } catch (ConfigException e) {
        // Typesafe Config throws ConfigException (not NumberFormatException)
        // when the key is missing or malformed; fall back to a default.
        numReducers = 10;
    }

    // Add input paths:
    log.info("Reading input files...");
    String line;
    try (BufferedReader br = new BufferedReader(new FileReader(args[0]))) {
        while ((line = br.readLine()) != null) {
            FileInputFormat.addInputPath(conf, new Path(line));
        }
    }
    log.info("Read input files.");

    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    conf.setJobName(args[0] + "_" + System.currentTimeMillis());
    conf.setInputFormat(ArchiveFileInputFormat.class);
    conf.setMapperClass(WARCStatsMapper.class);
    conf.setReducerClass(FrequencyCountingReducer.class);
    conf.setOutputFormat(KeylessTextOutputFormat.class);
    conf.set("map.output.key.field.separator", "");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setNumReduceTasks(numReducers);
}

From source file: uk.bl.wa.hadoop.outlinks.OutlinkExtractor.java

License: Open Source License

public int run(String[] args) throws IOException {
    JobConf conf = new JobConf(getConf(), OutlinkExtractor.class);

    String line;
    try (BufferedReader br = new BufferedReader(new FileReader(args[0]))) {
        while ((line = br.readLine()) != null) {
            FileInputFormat.addInputPath(conf, new Path(line));
        }
    }
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    conf.setJobName(args[0] + "_" + System.currentTimeMillis());
    conf.setInputFormat(ArchiveFileInputFormat.class);
    conf.setMapperClass(OutlinkExtractorMapper.class);
    conf.setReducerClass(FrequencyCountingReducer.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    // Submit asynchronously and return; JobClient.runJob(conf) would
    // instead block until the job completes.
    new JobClient(conf).submitJob(conf);
    return 0;
}

From source file: voldemort.store.readonly.mr.utils.HadoopUtils.java

License: Apache License

public static JobConf addAllSubPaths(JobConf conf, Path path) throws IOException {
    if (shouldPathBeIgnored(path)) {
        throw new IllegalArgumentException(String.format("Path[%s] should be ignored.", path));
    }

    final FileSystem fs = path.getFileSystem(conf);

    if (fs.exists(path)) {
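        // Walk the listing: recurse into subdirectories and register
        // plain files as job inputs.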
        for (FileStatus status : fs.listStatus(path)) {
            if (!shouldPathBeIgnored(status.getPath())) {
                if (status.isDir()) {
                    addAllSubPaths(conf, status.getPath());
                } else {
                    FileInputFormat.addInputPath(conf, status.getPath());
                }
            }
        }
    }

    return conf;
}
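A hypothetical call site for this helper might look like the following; the JobConf setup and HDFS path are illustrative assumptions, not taken from the Voldemort source.

JobConf conf = new JobConf(HadoopUtils.class);
// Recursively registers every non-ignored file under the root
// directory as an input to the job.
HadoopUtils.addAllSubPaths(conf, new Path("/voldemort/read-only-data")); // hypothetical path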