Example usage for org.apache.hadoop.mapred FileInputFormat addInputPath

Introduction

On this page you can find example usages of org.apache.hadoop.mapred FileInputFormat.addInputPath.

Prototype

public static void addInputPath(JobConf conf, Path path) 

Document

Add a Path to the list of inputs for the map-reduce job.
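
Before the project examples below, here is a minimal, self-contained sketch of the call; the class name, argument layout, and map-only setup are illustrative assumptions rather than code from any of the projects listed.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;

public class AddInputPathExample {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(AddInputPathExample.class);
        conf.setJobName("addInputPath example");
        conf.setInputFormat(TextInputFormat.class);

        // Each call appends one more path to the job's input list;
        // paths added earlier are kept, unlike setInputPaths, which replaces the list.
        FileInputFormat.addInputPath(conf, new Path(args[0]));
        FileInputFormat.addInputPath(conf, new Path(args[1]));

        // Map-only identity job: input records are written unchanged to the output path.
        conf.setNumReduceTasks(0);
        FileOutputFormat.setOutputPath(conf, new Path(args[2]));

        JobClient.runJob(conf);
    }
}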

Usage

From source file:org.cloudata.examples.web.TermUploadJob.java

License:Apache License

public void exec(String[] options) throws Exception {
    if (options.length < 1) {
        System.out.println("Usage: java TermUploadJob <num of repeats> termUpload <inputPath> [#redcue]");
        System.exit(0);//from w w w .jav a2 s .c  om
    }
    JobConf jobConf = new JobConf(TermUploadJob.class);
    JobClient jobClient = new JobClient(jobConf);
    int maxReduce = jobClient.getClusterStatus().getMaxReduceTasks() * 2;
    if (options.length > 1) {
        maxReduce = Integer.parseInt(options[1]);
    }

    jobConf.setInt("mapred.task.timeout", 60 * 60 * 1000);

    FileSystem fs = FileSystem.get(jobConf);

    CloudataConf nconf = new CloudataConf();
    if (!CTable.existsTable(nconf, TERM_TABLE)) {
        // Create the term table, pre-split by term range, if it does not exist
        Path path = new Path("blogdata/tmp/weight");
        FileStatus[] paths = fs.listStatus(path);
        if (paths == null || paths.length == 0) {
            LOG.error("No Partition info:" + path);
            return;
        }
        SortedSet<Text> terms = new TreeSet<Text>();
        Text text = new Text();
        for (FileStatus eachPath : paths) {
            CloudataLineReader reader = new CloudataLineReader(fs.open(eachPath.getPath()));
            while (true) {
                int length = reader.readLine(text);
                if (length <= 0) {
                    break;
                }
                terms.add(new Text(text));
            }
        }

        int termsPerTablet = terms.size() / (maxReduce - 1);
        int count = 0;
        List<Row.Key> rowKeys = new ArrayList<Row.Key>();
        for (Text term : terms) {
            count++;
            if (count == termsPerTablet) {
                rowKeys.add(new Row.Key(term.getBytes()));
                count = 0;
            }
        }
        rowKeys.add(Row.Key.MAX_KEY);

        TableSchema termTableInfo = new TableSchema(TERM_TABLE, "Test", TERM_TABLE_COLUMNS);
        CTable.createTable(nconf, termTableInfo, rowKeys.toArray(new Row.Key[] {}));
    }
    CTable termTable = CTable.openTable(nconf, TERM_TABLE);
    TabletInfo[] tabletInfos = termTable.listTabletInfos();

    Path tempOutputPath = new Path("WebTableJob_" + System.currentTimeMillis());

    jobConf.setJobName("TermUploadJob" + "(" + new Date() + ")");
    FileInputFormat.addInputPath(jobConf, new Path(options[0]));

    //<MAP>
    jobConf.setMapperClass(TermUploadMap.class);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, TERM_TABLE);
    jobConf.setPartitionerClass(WebKeyRangePartitioner.class);
    jobConf.setMaxMapAttempts(0);
    //</MAP>

    //<REDUCE>
    jobConf.setReducerClass(TermUploadReduce.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);
    jobConf.setNumReduceTasks(tabletInfos.length);
    FileOutputFormat.setOutputPath(jobConf, tempOutputPath);
    jobConf.setNumReduceTasks(maxReduce);
    jobConf.setMaxReduceAttempts(0);
    //<REDUCE>

    //Run Job
    JobClient.runJob(jobConf);

    fs.delete(tempOutputPath);
}

From source file:org.cloudata.examples.web.WebTableJob.java

License:Apache License

public void exec(String[] options) throws Exception {
    if (options.length < 1) {
        System.out.println("Usage: java TestWebPage <num of repeats> webtable <inputPath>");
        System.exit(0);
    }
    // Create the web table if it does not exist
    CloudataConf nconf = new CloudataConf();
    if (!CTable.existsTable(nconf, WEB_TABLE)) {
        TableSchema webTableInfo = new TableSchema(WEB_TABLE, "Test", WEB_TABLE_COLUMNS);
        webTableInfo.setNumOfVersion(2);
        CTable.createTable(nconf, webTableInfo);
    }

    Path tempOutputPath = new Path("WebTableJob_" + System.currentTimeMillis());

    JobConf jobConf = new JobConf(WebTableJob.class);
    jobConf.setJobName("WebTableJob" + "(" + new Date() + ")");
    FileInputFormat.addInputPath(jobConf, new Path(options[0]));

    //<MAP>
    jobConf.setMapperClass(WebTableMap.class);
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.setMaxMapAttempts(0);
    //</MAP>

    //Map Only
    jobConf.setNumReduceTasks(0);
    FileOutputFormat.setOutputPath(jobConf, tempOutputPath);

    //Run Job
    JobClient.runJob(jobConf);

    //delete temp output path
    FileSystem fs = FileSystem.get(jobConf);
    fs.delete(tempOutputPath, true);
}

From source file:org.cloudata.examples.weblink.UploadJob.java

License:Apache License

public void run(String[] args) throws IOException {
    if (args.length < 3) {
        System.out.println("Usage: java UploadJob <input path> <table name> <distributed cache dir>");
        System.exit(0);
    }

    Path inputPath = new Path(args[0]);
    String tableName = args[1];

    CloudataConf nconf = new CloudataConf();
    if (!CTable.existsTable(nconf, tableName)) {
        TableSchema tableSchema = new TableSchema(tableName);
        tableSchema.addColumn("url");
        tableSchema.addColumn("page");
        tableSchema.addColumn("title");
        tableSchema.addColumn("outlink");

        CTable.createTable(nconf, tableSchema);
    }

    JobConf jobConf = new JobConf(UploadJob.class);
    jobConf.set("mapred.child.java.opts", "-Xss4096K");
    jobConf.setJobName("CloudataExamles.weblink.UploadJob_" + new Date());
    String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);

    DistributedCache.addArchiveToClassPath(new Path(args[2] + "/htmllexer.jar"), jobConf);
    DistributedCache.addArchiveToClassPath(new Path(args[2] + "/htmlparser.jar"), jobConf);
    DistributedCache.addArchiveToClassPath(new Path(args[2] + "/jdom.jar"), jobConf);

    // <MAP>
    FileInputFormat.addInputPath(jobConf, inputPath);
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.setMapperClass(UploadJobMapper.class);
    jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, tableName);
    // </MAP>

    // <REDUCE>
    // Map Only
    FileOutputFormat.setOutputPath(jobConf,
            new Path("CloudataExamles_WebUploadJob_" + System.currentTimeMillis()));
    jobConf.setNumReduceTasks(0);
    // </REDUCE>

    try {
        JobClient.runJob(jobConf);
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        FileSystem fs = FileSystem.get(jobConf);
        fs.delete(FileOutputFormat.getOutputPath(jobConf), true);
        CloudataMapReduceUtil.clearMapReduce(libDir);
    }
}

From source file:org.cloudata.util.upload.UploadUtil.java

License:Apache License

private void doHadoopUpload(CloudataConf conf) throws IOException {
    if (!CTable.existsTable(conf, tableName)) {
        throw new IOException("No table:" + tableName);
    }

    JobConf jobConf = new JobConf(UploadUtil.class);
    String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);

    jobConf.setJobName("UploadJob_" + tableName + "(" + new Date() + ")");

    // Set the target table name in AbstractTabletInputFormat.OUTPUT_TABLE
    // (used by the KeyRangePartitioner to look up the table's key ranges).
    jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, tableName);

    //<Map>
    FileInputFormat.addInputPath(jobConf, new Path(inputPath));
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.set("uploadJob.delim", delim);
    String columnStr = "";
    for (String eachColumn : columns) {
        columnStr += eachColumn + ",";
    }
    jobConf.set("uploadJob.columns", columnStr);

    String fieldNumStr = "";
    for (int eachField : fieldNums) {
        fieldNumStr += eachField + ",";
    }
    jobConf.set("uploadJob.fieldNums", fieldNumStr);
    jobConf.setBoolean("uploadJob.keyValuePair", keyValuePair);
    jobConf.setMapperClass(UploadMap.class);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.setMapSpeculativeExecution(false);
    jobConf.setMaxMapAttempts(0);
    //</Map>

    //<Reduce>
    Path tempOutputPath = new Path("temp/uploadJob/" + tableName + "/reducer");
    FileOutputFormat.setOutputPath(jobConf, tempOutputPath);
    jobConf.setNumReduceTasks(0);
    //</Reduce>

    try {
        JobClient.runJob(jobConf);
    } finally {
        FileSystem fs = FileSystem.get(jobConf);
        FileUtil.delete(fs, tempOutputPath, true);
        CloudataMapReduceUtil.clearMapReduce(libDir);
    }
}

From source file:org.commoncrawl.util.JobBuilder.java

License:Open Source License

/**
 * Add multiple input paths to the job configuration.
 * 
 * @param inputs the paths to add
 * @return this builder, for chaining
 * @throws IOException
 */
public JobBuilder inputs(List<Path> inputs) throws IOException {
    for (Path input : inputs) {
        FileInputFormat.addInputPath(_jobConf, input);
    }
    return this;
}

From source file:org.commoncrawl.util.JobBuilder.java

License:Open Source License

/**
 * Add a single input path to the job configuration.
 * @param input the path to add
 * @return this builder, for chaining
 * @throws IOException
 */
public JobBuilder input(Path input) throws IOException {
    FileInputFormat.addInputPath(_jobConf, input);
    return this;
}
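
Because both methods return the builder itself, calls can be chained. A hypothetical usage sketch follows; the builder variable and the path values are assumptions for illustration, not taken from the JobBuilder source.

// Hypothetical sketch: assumes an already constructed JobBuilder named `builder`
// and an existing List<Path> named `crawlParts`; only inputs(...) and input(...) come from the source.
builder.inputs(crawlParts)               // append every path in the list
       .input(new Path("/crawl/extra")); // then append one more path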

From source file:org.dkpro.bigdata.hadoop.DkproHadoopDriver.java

License:Apache License

/**
 * Runs the UIMA pipeline.
 * 
 * @return 0 if Hadoop job succeeded, 1 if job failed, 2 if it was killed, otherwise 3
 * 
 * @see org.apache.hadoop.util.Tool#run(java.lang.String[])
 */
@Override
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.out.println(
                "Usage: " + this.getClass().getSimpleName() + " [hadoop-params] input output [job-params]");
        System.exit(1);
    }
    this.job = new JobConf(getConf(), DkproHadoopDriver.class);
    final FileSystem fs = FileSystem.get(this.job);
    // set the factory class name
    this.job.set("dkpro.uima.factory", this.getClass().getName());
    Path inputPath;
    if (args[0].contains(",")) {
        String[] inputPaths = args[0].split(",");
        inputPath = new Path(inputPaths[0]);
        for (String path : inputPaths) {
            FileInputFormat.addInputPath(job, new Path(path));
        }
    } else {
        inputPath = new Path(args[0]); // input
        FileInputFormat.setInputPaths(this.job, inputPath);

    }
    String outDir = args[1];
    if (!getConf().getBoolean("dkpro.output.overwrite", true)) {
        outDir = getUniqueDirectoryName(outDir, fs);
    }
    final Path outputPath = new Path(outDir);// output
    final CollectionReader reader = buildCollectionReader();
    // if a collection reader was defined, import data into hdfs
    // try {
    // final Class<?> c = Class.forName("org.apache.hadoop.io.compress.SnappyCodec");
    // FileOutputFormat.setOutputCompressorClass(this.job,
    // (Class<? extends CompressionCodec>) c);
    // }
    // catch (final Exception e) {
    //
    // }
    if (reader != null) {
        final AnalysisEngine xcasWriter = AnalysisEngineFactory.createEngine(
                CASWritableSequenceFileWriter.class, // createTypeSystemDescription(),
                CASWritableSequenceFileWriter.PARAM_PATH, inputPath.toString(),
                CASWritableSequenceFileWriter.PARAM_COMPRESS, true, CASWritableSequenceFileWriter.PARAM_FS,
                job.get(("fs.default.name"), "file:/"));
        runPipeline(reader, xcasWriter);
    }
    // cleanup previous output
    fs.delete(outputPath, true);
    // this is a sensible default for the UKP cluster
    //        int numMappers = 256;
    // if (args.length > 2) {
    // numMappers = Integer.parseInt(args[2]);
    // }

    FileOutputFormat.setOutputPath(this.job, outputPath);
    // SequenceFileOutputFormat.setCompressOutput(this.job, true);

    if (this.job.get("mapred.output.compress") == null) {
        this.job.setBoolean("mapred.output.compress", true);
    }
    // Just in case compression is on
    this.job.set("mapred.output.compression.type", "BLOCK");

    if (this.job.getBoolean("dkpro.output.writecas", true)) {
        if (this.job.getBoolean("dkpro.output.plaintext", false)) {
            this.job.setOutputFormat(TextOutputFormat.class);
        } else {
            this.job.setOutputFormat(SequenceFileOutputFormat.class);
        }
    } else {
        job.setOutputFormat(NullOutputFormat.class);
    }
    // this.job.set("mapred.output.compression.codec",
    // "org.apache.hadoop.io.compress.GzipCodec");
    // use compression
    // setup some sensible defaults
    this.job.setMapperClass(this.mapperClass);
    this.job.setReducerClass(this.reducerClass);
    if (getInputFormatClass() != null) {
        this.job.setInputFormat(getInputFormatClass());
    } else {
        this.job.setInputFormat(SequenceFileInputFormat.class);
    }
    // this.job.setOutputFormat(TextOutputFormat.class);
    this.job.setMapOutputKeyClass(Text.class);
    this.job.setMapOutputValueClass(BinCasWithTypeSystemWritable.class);
    this.job.setOutputKeyClass(Text.class);
    this.job.setOutputValueClass(BinCasWithTypeSystemWritable.class);
    this.job.setJobName(this.getClass().getSimpleName());
    // this.job.set("mapred.child.java.opts", "-Xmx1g");
    //        this.job.setInt("mapred.job.map.memory.mb", 1280);
    //        this.job.setInt("mapred.job.reduce.memory.mb", 1280);
    //        this.job.setNumMapTasks(numMappers);
    this.job.setNumReduceTasks(0);
    configure(this.job);

    // create symlinks for distributed resources
    DistributedCache.createSymlink(this.job);
    // sLogger.info("Running job "+job.getJobName());

    RunningJob runningJob = JobClient.runJob(this.job);
    runningJob.waitForCompletion();
    int status = runningJob.getJobState();
    if (status == JobStatus.SUCCEEDED) {
        return 0;
    } else if (status == JobStatus.FAILED) {
        return 1;
    } else if (status == JobStatus.KILLED) {
        return 2;
    } else {
        return 3;
    }

}

From source file:org.gbif.ocurrence.index.solr.ConfTester.java

License:Apache License

public JobConf setupJobConf(int numMapper, int numReducer, long mapSleepTime, int mapSleepCount,
        long reduceSleepTime, int reduceSleepCount) {
    JobConf job = new JobConf(getConf(), ConfTester.class);
    job.setNumMapTasks(numMapper);
    job.setNumReduceTasks(numReducer);
    job.setMapperClass(ConfTester.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setReducerClass(ConfTester.class);
    job.setOutputFormat(NullOutputFormat.class);
    job.setInputFormat(SleepInputFormat.class);
    job.setPartitionerClass(ConfTester.class);
    job.setSpeculativeExecution(false);
    job.setJobName("Sleep job");
    FileInputFormat.addInputPath(job, new Path("ignored"));
    job.setLong("sleep.job.map.sleep.time", mapSleepTime);
    job.setLong("sleep.job.reduce.sleep.time", reduceSleepTime);
    job.setInt("sleep.job.map.sleep.count", mapSleepCount);
    job.setInt("sleep.job.reduce.sleep.count", reduceSleepCount);
    return job;
}

From source file:org.hadoop.tdg.MaxTemperatureDriver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), getClass());
    conf.setJobName("Max temperature");

    FileInputFormat.addInputPath(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);
    conf.setMapperClass(MaxTemperatureMapper.class);
    conf.setCombinerClass(MaxTemperatureReducer.class);
    conf.setReducerClass(MaxTemperatureReducer.class);

    JobClient.runJob(conf);
    return 0;
}

From source file:org.lamapacos.preprocessor.filter.SegmentFilter.java

License:Apache License

public void dump(Path segmentHome, Path output) throws IOException {

    if (LOG.isInfoEnabled()) {
        LOG.info("SegmentFilter: filtering segment: " + segmentHome);
    }

    JobConf job = createJobConf();
    job.setJobName("filter " + segmentHome);

    FileStatus[] paths = fs.listStatus(segmentHome);
    for (FileStatus path : paths) {
        if (path.isDir()) {
            FileInputFormat.addInputPath(job,
                    new Path(segmentHome, new Path(path.getPath(), Content.DIR_NAME)));
        }
    }

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(InputCompatMapper.class);
    job.setReducerClass(SegmentFilter.class);

    //      Path tempDir = new Path(job.get("hadoop.tmp.dir", "/tmp") + "/segfilter-" + new java.util.Random().nextInt());
    //      fs.delete(tempDir, true);

    FileOutputFormat.setOutputPath(job, output);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);

    JobClient.runJob(job);

    if (LOG.isInfoEnabled()) {
        LOG.info("SegmentFilter: done");
    }
}