Example usage for org.apache.hadoop.mapred FileInputFormat addInputPath

Introduction

This page collects usage examples for org.apache.hadoop.mapred.FileInputFormat.addInputPath.

Prototype

public static void addInputPath(JobConf conf, Path path) 

Document

Add a Path to the list of inputs for the map-reduce job.

Usage
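
Before the project examples, here is a minimal, self-contained sketch. It is not taken from any of the sources below; the class name and command-line arguments are placeholders. It shows that addInputPath can be called repeatedly to add several inputs to the same JobConf (unlike FileInputFormat.setInputPaths, which replaces the whole list):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;

public class AddInputPathExample {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(AddInputPathExample.class);
        conf.setJobName("addInputPath example");

        // Each call appends one more path to the job's input list.
        FileInputFormat.addInputPath(conf, new Path(args[0]));
        FileInputFormat.addInputPath(conf, new Path(args[1]));
        FileOutputFormat.setOutputPath(conf, new Path(args[2]));

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        // TextInputFormat produces (byte offset, line) records.
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        // Identity, map-only job: copies the input records to the output.
        conf.setMapperClass(IdentityMapper.class);
        conf.setNumReduceTasks(0);

        JobClient.runJob(conf);
    }
}

The examples below follow the same pattern with other input formats (SequenceFileInputFormat, Avro jobs) and with glob paths.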

From source file: com.yahoo.semsearch.fastlinking.io.ExtractWikipediaAnchorText.java

License: Apache License

/**
 * Extracts redirects and the target for each.
 *
 * @param inputPath
 * @param outputPath
 * @throws IOException
 */
private void task0(String inputPath, String outputPath) throws IOException {
    LOG.info("Extracting redirects (phase 0)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
    conf.setJobName(
            String.format("ExtractWikipediaAnchorText:phase0[input: %s, output: %s]", inputPath, outputPath));

    conf.setNumReduceTasks(1);

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(MyMapper0.class);
    conf.setReducerClass(IdentityReducer.class);

    JobClient.runJob(conf);
}

From source file: com.yahoo.semsearch.fastlinking.io.ExtractWikipediaAnchorText.java

License: Apache License

/**
 * Maps from Wikipedia article to (srcID, (targetID, anchor)).
 *
 * @param inputPath
 * @param outputPath
 * @throws IOException
 */
private void task1(String inputPath, String outputPath) throws IOException {
    LOG.info("Extracting anchor text (phase 1)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
    conf.setJobName(
            String.format("ExtractWikipediaAnchorText:phase1[input: %s, output: %s]", inputPath, outputPath));

    // 10 reducers is reasonable.
    conf.setNumReduceTasks(10);

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(PairOfStringInt.class);
    conf.setMapOutputValueClass(PairOfStrings.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(PairOfStrings.class);

    conf.setMapperClass(MyMapper1.class);
    conf.setReducerClass(MyReducer1.class);
    conf.setPartitionerClass(MyPartitioner1.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);
}

From source file: com.yahoo.semsearch.fastlinking.io.ExtractWikipediaAnchorText.java

License: Apache License

/**
 *
 * Maps from (srcID, (targetID, anchor)) to (targetID, (anchor, count)).
 *
 * @param inputPath
 * @param outputPath
 * @throws IOException
 */
private void task2(String inputPath, String outputPath, String redirPath) throws IOException {
    LOG.info("Extracting anchor text (phase 2)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);
    Random r = new Random();
    //String tmpOutput = "tmp-" + this.getClass().getCanonicalName() + "-" + r.nextInt(10000);
    //LOG.info( "intermediate folder for merge " + tmpOutput );

    JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
    conf.setJobName(
            String.format("ExtractWikipediaAnchorText:phase2[input: %s, output: %s]", inputPath, outputPath));

    // Gathers everything together for convenience; feasible for Wikipedia.
    conf.setNumReduceTasks(1);

    try {
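        // The "#redirs.dat" fragment names the symlink that createSymlink
        // makes available in each task's working directory.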
        DistributedCache.addCacheFile(new URI(redirPath + "/part-00000" + "#" + "redirs.dat"), conf);
        DistributedCache.createSymlink(conf);
    } catch (URISyntaxException e) {
        e.printStackTrace();
    }

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    //FileOutputFormat.setOutputPath(conf, new Path(tmpOutput));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(MapFileOutputFormat.class);
    // conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(HMapSIW.class);

    conf.setMapperClass(MyMapper2.class);
    conf.setReducerClass(MyReducer2.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);
    // Clean up intermediate data.
    FileSystem.get(conf).delete(new Path(inputPath), true);

    /*
    //merge
    String finalO = outputPath+"/part-00000/data";
    FileSystem.get(conf).mkdirs( new Path( outputPath + "part-00000") );
    getMergeInHdfs( tmpOutput, finalO, conf );
    FileSystem.get(conf).delete(new Path(tmpOutput), true);
    */
}

From source file: com.yahoo.semsearch.fastlinking.io.ExtractWikipediaAnchorText.java

License: Apache License

/**
 * Extracts CF for each found anchor.
 *
 * @param inputPath
 * @param mapPath
 * @param outputPath
 * @throws IOException
 */
private void task3(String inputPath, String mapPath, String outputPath) throws IOException {
    LOG.info("Extracting anchor text (phase 3)...");
    LOG.info(" - input:   " + inputPath);
    LOG.info(" - output:  " + outputPath);
    LOG.info(" - mapping: " + mapPath);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
    conf.setJobName(
            String.format("ExtractWikipediaAnchorText:phase3[input: %s, output: %s]", inputPath, outputPath));

    conf.setNumReduceTasks(1);
    String location = "map.dat";

    try {
        DistributedCache.addCacheFile(new URI(mapPath + "/part-00000/data" + "#" + location), conf);
        //DistributedCache.addCacheFile(new URI(mapPath + "/singleentitymap.data" + "#" + location), conf);
        DistributedCache.createSymlink(conf);
    } catch (URISyntaxException e) {
        e.printStackTrace();
    }

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(MapFileOutputFormat.class);
    // conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(IntWritable.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MyMapper3.class);
    conf.setCombinerClass(MyReducer3.class);
    conf.setReducerClass(MyReducer3.class);

    JobClient.runJob(conf);
}

From source file: com.yahoo.semsearch.fastlinking.io.ExtractWikipediaAnchorText.java

License: Apache License

/**
 * Maps from (targetID, (anchor, count)) to (anchor, (targetID, count)).
 *
 * @param inputPath
 * @param outputPath
 * @throws IOException
 */
private void task4(String inputPath, String outputPath) throws IOException {
    LOG.info("Extracting anchor text (phase 4)...");
    LOG.info(" - input:   " + inputPath);
    LOG.info(" - output:  " + outputPath);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
    conf.setJobName(
            String.format("ExtractWikipediaAnchorText:phase4[input: %s, output: %s]", inputPath, outputPath));

    conf.setNumReduceTasks(1);

    //FileInputFormat.addInputPath(conf, new Path(inputPath + "/part-00000/data"));
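    // Each MapFile part directory holds a "data" file (a SequenceFile) plus
    // an "index" file; the glob reads the data files from all parts.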
    FileInputFormat.addInputPath(conf, new Path(inputPath + "/part-*/data"));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(MapFileOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(HMapSIW.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(HMapSIW.class);

    conf.setMapperClass(MyMapper4.class);
    conf.setReducerClass(MyReducer4.class);

    JobClient.runJob(conf);
}

From source file: contrail.correct.InvokeFlash.java

License: Apache License

public RunningJob runJob() throws Exception {
    JobConf conf = new JobConf(InvokeFlash.class);
    conf.setJobName("Flash invocation");
    String inputPath = (String) stage_options.get("inputpath");
    String outputPath = (String) stage_options.get("outputpath");
    String flashPath = (String) stage_options.get("flash_binary");
    if (flashPath.length() == 0) {
        throw new Exception("Flash binary location required");
    }
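    // Ship the flash binary to every task node via the distributed cache.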
    DistributedCache.addCacheFile(new Path(flashPath).toUri(), conf);
    //Sets the parameters in JobConf
    initializeJobConfiguration(conf);
    AvroJob.setMapperClass(conf, RunFlashMapper.class);
    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    // Input
    MatePair read = new MatePair();
    AvroJob.setInputSchema(conf, read.getSchema());
    AvroJob.setOutputSchema(conf, new fastqrecord().getSchema());

    //Map Only Job
    conf.setNumReduceTasks(0);

    // Delete the output directory if it exists already
    Path out_path = new Path(outputPath);
    if (FileSystem.get(conf).exists(out_path)) {
        FileSystem.get(conf).delete(out_path, true);
    }

    long starttime = System.currentTimeMillis();
    RunningJob result = JobClient.runJob(conf);
    long endtime = System.currentTimeMillis();
    float diff = (float) (((float) (endtime - starttime)) / 1000.0);
    System.out.println("Runtime: " + diff + " s");
    return result;
}

From source file: contrail.correct.InvokeQuakeForMatePairs.java

License: Apache License

public RunningJob runJob() throws Exception {
    JobConf conf = new JobConf(InvokeQuakeForMatePairs.class);
    conf.setJobName("Quake Paired invocation");
    String inputPath = (String) stage_options.get("inputpath");
    String outputPath = (String) stage_options.get("outputpath");
    String quakePath = (String) stage_options.get("quake_binary");
    String bithashPath = (String) stage_options.get("bithashpath");
    // The Quake binary is required; ship it to the task nodes via the distributed cache.
    if (quakePath.length() != 0) {
        DistributedCache.addCacheFile(new Path(quakePath).toUri(), conf);
    } else {
        throw new Exception("Please specify Quake binary path");
    }
    if (bithashPath.length() != 0) {
        DistributedCache.addCacheFile(new Path(bithashPath).toUri(), conf);
    } else {
        throw new Exception("Please specify bithash path");
    }
    //Sets the parameters in JobConf
    initializeJobConfiguration(conf);
    AvroJob.setMapperClass(conf, RunQuakeMapper.class);
    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    // Input
    MatePair read = new MatePair();
    AvroJob.setInputSchema(conf, read.getSchema());
    AvroJob.setOutputSchema(conf, new fastqrecord().getSchema());
    //Map Only Job
    conf.setNumReduceTasks(0);
    // Delete the output directory if it exists already
    Path out_path = new Path(outputPath);
    if (FileSystem.get(conf).exists(out_path)) {
        FileSystem.get(conf).delete(out_path, true);
    }
    long starttime = System.currentTimeMillis();
    RunningJob result = JobClient.runJob(conf);
    long endtime = System.currentTimeMillis();
    float diff = (float) (((float) (endtime - starttime)) / 1000.0);
    System.out.println("Runtime: " + diff + " s");
    return result;
}

From source file: contrail.correct.InvokeQuakeForSingles.java

License: Apache License

public RunningJob runJob() throws Exception {
    JobConf conf = new JobConf(InvokeQuakeForSingles.class);
    conf.setJobName("Quake Singles invocation");
    String inputPath = (String) stage_options.get("inputpath");
    String outputPath = (String) stage_options.get("outputpath");
    String quakePath = (String) stage_options.get("quake_binary");
    String bithashPath = (String) stage_options.get("bithashpath");
    // The Quake binary is required; ship it to the task nodes via the distributed cache.
    if (quakePath.length() != 0) {
        DistributedCache.addCacheFile(new Path(quakePath).toUri(), conf);
    } else {
        throw new Exception("Please specify Quake path");
    }
    if (bithashPath.length() != 0) {
        DistributedCache.addCacheFile(new Path(bithashPath).toUri(), conf);
    } else {
        throw new Exception("Please specify bithash path");
    }
    //Sets the parameters in JobConf
    initializeJobConfiguration(conf);
    AvroJob.setMapperClass(conf, RunQuakeMapper.class);
    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    // Input
    fastqrecord read = new fastqrecord();
    AvroJob.setInputSchema(conf, read.getSchema());
    AvroJob.setOutputSchema(conf, new fastqrecord().getSchema());
    //Map Only Job
    conf.setNumReduceTasks(0);
    // Delete the output directory if it exists already
    Path out_path = new Path(outputPath);
    if (FileSystem.get(conf).exists(out_path)) {
        FileSystem.get(conf).delete(out_path, true);
    }
    long starttime = System.currentTimeMillis();
    RunningJob result = JobClient.runJob(conf);
    long endtime = System.currentTimeMillis();
    float diff = (float) (((float) (endtime - starttime)) / 1000.0);
    System.out.println("Runtime: " + diff + " s");
    return result;
}

From source file: contrail.correct.KmerCounter.java

License: Open Source License

public RunningJob runJob() throws Exception {
    String inputPath = (String) stage_options.get("inputpath");
    String outputPath = (String) stage_options.get("outputpath");
    JobConf conf = new JobConf(KmerCounter.class);
    conf.setJobName("Kmer Counter ");
    initializeJobConfiguration(conf);
    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    fastqrecord read = new fastqrecord();
    AvroJob.setInputSchema(conf, read.getSchema());
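    // The output schema is an Avro Pair<Utf8, Long>: a kmer and its count.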
    AvroJob.setOutputSchema(conf, new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema());
    AvroJob.setMapperClass(conf, KmerCounterMapper.class);
    AvroJob.setReducerClass(conf, KmerCounterReducer.class);
    // Delete the output directory if it exists already
    Path out_path = new Path(outputPath);
    if (FileSystem.get(conf).exists(out_path)) {
        FileSystem.get(conf).delete(out_path, true);
    }
    long starttime = System.currentTimeMillis();
    RunningJob run = JobClient.runJob(conf);
    long endtime = System.currentTimeMillis();
    float diff = (float) (((float) (endtime - starttime)) / 1000.0);
    System.out.println("Runtime: " + diff + " s");
    return run;
}

From source file: contrail.stages.GraphStats.java

License: Open Source License

@Override
public RunningJob runJob() throws Exception {
    String[] required_args = { "inputpath", "outputpath" };
    checkHasParametersOrDie(required_args);

    String inputPath = (String) stage_options.get("inputpath");
    String outputPath = (String) stage_options.get("outputpath");

    sLogger.info(" - input: " + inputPath);
    sLogger.info(" - output: " + outputPath);

    Configuration base_conf = getConf();
    JobConf conf = null;
    if (base_conf != null) {
        conf = new JobConf(getConf(), PairMergeAvro.class);
    } else {
        conf = new JobConf(PairMergeAvro.class);
    }

    conf.setJobName("GraphStats " + inputPath);

    initializeJobConfiguration(conf);

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    Pair<Integer, GraphStatsData> mapOutput = new Pair<Integer, GraphStatsData>(0, new GraphStatsData());
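    // Mappers emit (bin, GraphStatsData) pairs; the job's final output schema
    // is just the GraphStatsData value.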

    GraphNodeData nodeData = new GraphNodeData();
    AvroJob.setInputSchema(conf, nodeData.getSchema());
    AvroJob.setMapOutputSchema(conf, mapOutput.getSchema());
    AvroJob.setOutputSchema(conf, mapOutput.value().getSchema());

    AvroJob.setMapperClass(conf, GraphStatsMapper.class);
    AvroJob.setCombinerClass(conf, GraphStatsCombiner.class);
    AvroJob.setReducerClass(conf, GraphStatsReducer.class);

    // Use a single reducer task so that all the stats are accumulated in
    // one reducer.
    conf.setNumReduceTasks(1);

    if (stage_options.containsKey("writeconfig")) {
        writeJobConfig(conf);
    } else {
        // Delete the output directory if it exists already
        Path out_path = new Path(outputPath);
        if (FileSystem.get(conf).exists(out_path)) {
            // TODO(jlewi): We should only delete an existing directory
            // if explicitly told to do so.
            sLogger.info("Deleting output path: " + out_path.toString() + " " + "because it already exists.");
            FileSystem.get(conf).delete(out_path, true);
        }

        long starttime = System.currentTimeMillis();
        RunningJob job = JobClient.runJob(conf);
        long endtime = System.currentTimeMillis();

        float diff = (float) ((endtime - starttime) / 1000.0);
        System.out.println("Runtime: " + diff + " s");

        // Create iterators to read the output
        Iterator<GraphStatsData> binsIterator = createOutputIterator();

        // Compute the N50 stats for each bin.
        ArrayList<GraphN50StatsData> N50Stats = computeN50Stats(binsIterator);

        // Write the N50 stats to a file.
        writeN50StatsToFile(N50Stats);

        Integer topn_contigs = (Integer) stage_options.get("topn_contigs");
        if (topn_contigs > 0) {
            // Get the lengths of the n contigs.
            binsIterator = createOutputIterator();
            List<Integer> topN = topNContigs(binsIterator, topn_contigs);
            writeTopNContigs(topN);
        }
        return job;
    }
    return null;
}