List of usage examples for org.apache.hadoop.mapred.FileInputFormat.addInputPath
public static void addInputPath(JobConf conf, Path path)
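addInputPath appends the given path to the job's list of input directories (the comma-separated mapred.input.dir property), so it can be called repeatedly to read from several locations in one job, as the examples below do for Nutch segments and backup directories. The following is a minimal sketch of a stand-alone driver using the old org.apache.hadoop.mapred API; the input and output paths are hypothetical placeholders, and IdentityMapper is used only to keep the example self-contained.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;

public class AddInputPathExample {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(AddInputPathExample.class);
        job.setJobName("addInputPath example");

        // Each call appends one more directory to mapred.input.dir;
        // the job reads the union of all added paths.
        FileInputFormat.addInputPath(job, new Path("/data/in1")); // hypothetical path
        FileInputFormat.addInputPath(job, new Path("/data/in2")); // hypothetical path

        job.setInputFormat(TextInputFormat.class);
        job.setMapperClass(IdentityMapper.class); // pass-through map, no reduce phase
        job.setNumReduceTasks(0);

        // With TextInputFormat the map input (and here the output) types are
        // LongWritable offsets and Text lines.
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        job.setOutputFormat(TextOutputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path("/data/out")); // hypothetical path

        JobClient.runJob(job);
    }
}

Because addInputPath accumulates paths rather than replacing them, use FileInputFormat.setInputPaths when you want to overwrite the current list instead of extending it.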
From source file: org.archive.nutchwax.IndexerMapReduce.java
License: Apache License

public static void initMRJob(Collection<Path> segments, JobConf job) {
    for (final Path segment : segments) {
        LOG.info("IndexerMapReduces: adding segment: " + segment);
        FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
        FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME));
    }

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(IndexerMapReduce.class);
    job.setReducerClass(IndexerMapReduce.class);

    job.setOutputFormat(IndexerOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setMapOutputValueClass(NutchWritable.class);
    job.setOutputValueClass(NutchWritable.class);
}
From source file: org.archive.nutchwax.PageRankDb.java
License: Apache License

public void invert(Path pageRankDb, Path[] segments, boolean normalize, boolean filter, boolean force)
        throws IOException {
    Path lock = new Path(pageRankDb, LOCK_NAME);
    FileSystem fs = FileSystem.get(getConf());
    LockUtil.createLockFile(fs, lock, force);
    Path currentPageRankDb = new Path(pageRankDb, CURRENT_NAME);

    if (LOG.isInfoEnabled()) {
        LOG.info("PageRankDb: starting");
        LOG.info("PageRankDb: pageRankDb: " + pageRankDb);
        LOG.info("PageRankDb: URL normalize: " + normalize);
        LOG.info("PageRankDb: URL filter: " + filter);
    }

    JobConf job = PageRankDb.createJob(getConf(), pageRankDb, normalize, filter);
    for (int i = 0; i < segments.length; i++) {
        if (LOG.isInfoEnabled()) {
            LOG.info("PageRankDb: adding segment: " + segments[i]);
        }
        FileInputFormat.addInputPath(job, new Path(segments[i], ParseData.DIR_NAME));
    }

    try {
        JobClient.runJob(job);
    } catch (IOException e) {
        LockUtil.removeLockFile(fs, lock);
        throw e;
    }

    if (fs.exists(currentPageRankDb)) {
        if (LOG.isInfoEnabled()) {
            LOG.info("PageRankDb: merging with existing pageRankDb: " + pageRankDb);
        }
        // try to merge
        Path newPageRankDb = FileOutputFormat.getOutputPath(job);
        job = PageRankDbMerger.createMergeJob(getConf(), pageRankDb, normalize, filter);
        FileInputFormat.addInputPath(job, currentPageRankDb);
        FileInputFormat.addInputPath(job, newPageRankDb);
        try {
            JobClient.runJob(job);
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            fs.delete(newPageRankDb, true);
            throw e;
        }
        fs.delete(newPageRankDb, true);
    }

    PageRankDb.install(job, pageRankDb);

    if (LOG.isInfoEnabled()) {
        LOG.info("PageRankDb: done");
    }
}
From source file: org.archive.nutchwax.PageRankDbMerger.java
License: Apache License

public void merge(Path output, Path[] dbs, boolean normalize, boolean filter) throws Exception {
    JobConf job = createMergeJob(getConf(), output, normalize, filter);
    for (int i = 0; i < dbs.length; i++) {
        FileInputFormat.addInputPath(job, new Path(dbs[i], PageRankDb.CURRENT_NAME));
    }
    JobClient.runJob(job);

    FileSystem fs = FileSystem.get(getConf());
    fs.mkdirs(output);
    fs.rename(FileOutputFormat.getOutputPath(job), new Path(output, PageRankDb.CURRENT_NAME));
}
From source file: org.cloudata.core.PerformanceTest.java
License: Apache License

private void runNIsMoreThanOne(final String cmd) throws IOException {
    checkTable();

    // Run a mapreduce job. Run as many maps as asked-for clients.
    // Before we start up the job, write out an input file with instructions
    // per client regarding which row they are to start on.
    Path inputDir = writeInputFile(this.conf);
    this.conf.set(EvaluationMapTask.CMD_KEY, cmd);
    JobConf job = new JobConf(this.conf, this.getClass());
    FileInputFormat.addInputPath(job, inputDir);
    job.setInputFormat(TextInputFormat.class);
    job.setJobName("Cloudata Performance Evaluation");
    job.setMapperClass(EvaluationMapTask.class);
    job.setMaxMapAttempts(1);
    job.setMaxReduceAttempts(1);
    job.setNumMapTasks(this.N * 10); // Ten maps per client.
    job.setNumReduceTasks(1);
    job.setOutputFormat(TextOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(inputDir, "outputs"));
    JobClient.runJob(job);
}
From source file: org.cloudata.core.tablet.backup.RestoreBinaryJob.java
License: Apache License

/**
 * @param tableName
 * @param columnNames
 * @param numOfVersion
 * @param inputPath
 */
public void runRestore(String tableName, String[] columnNames, int numOfVersion, String inputPath)
        throws IOException {
    CloudataConf nconf = new CloudataConf();

    JobConf partitionJob = new JobConf(BackupJob.class);
    FileSystem fs = FileSystem.get(partitionJob);
    if (!fs.exists(new Path(inputPath))) {
        throw new IOException("input path does not exist: " + inputPath);
    }
    if (CTable.existsTable(nconf, tableName)) {
        throw new IOException("table already exists: " + tableName);
    }

    TableSchema tableSchema = new TableSchema(tableName, "", columnNames);
    tableSchema.setNumOfVersion(numOfVersion);
    CTable.createTable(nconf, tableSchema);

    String columns = "";
    for (String eachColumn : columnNames) {
        columns += eachColumn.trim() + ",";
    }
    columns = columns.substring(0, columns.length() - 1);

    String jobName = tableName + " restore";
    String tempDir = jobName + "_" + System.currentTimeMillis();

    partitionJob.setJobName(tableName + " restore");
    partitionJob.setMapperClass(RestoreBinaryPartitionMap.class);
    FileInputFormat.addInputPath(partitionJob, new Path(inputPath));
    partitionJob.setInputFormat(RestoreSequenceFileAsBinaryInputFormat.class);
    partitionJob.set(DefaultTabletInputFormat.OUTPUT_TABLE, tableName);
    FileOutputFormat.setOutputPath(partitionJob, new Path(tempDir));
    // map only
    partitionJob.setNumReduceTasks(0);

    JobClient.runJob(partitionJob);

    // delete temp output dir
    fs.delete(new Path(tempDir), true);

    ////////////////////////////////////////////////////////////////

    JobConf jobConf = new JobConf(BackupJob.class);
    jobConf.setJobName(tableName + " restore");
    jobConf.setMapperClass(RestoreBinaryMap.class);
    FileInputFormat.addInputPath(jobConf, new Path(inputPath));
    jobConf.setInputFormat(RestoreSequenceFileAsBinaryInputFormat.class);
    jobConf.set(DefaultTabletInputFormat.OUTPUT_TABLE, tableName);
    jobConf.set(DefaultTabletInputFormat.INPUT_COLUMN_LIST, columns);
    FileOutputFormat.setOutputPath(jobConf, new Path(tempDir));
    jobConf.setMaxMapAttempts(0);
    // map only
    jobConf.setNumReduceTasks(0);

    JobClient.runJob(jobConf);

    // delete temp output dir
    fs.delete(new Path(tempDir), true);
}
From source file: org.cloudata.core.tablet.backup.RestoreJob.java
License: Apache License

/**
 * @param tableName
 * @param columnNames
 * @param numOfVersion
 * @param inputPath
 */
public void runRestore(String tableName, String[] columnNames, int numOfVersion, String inputPath)
        throws IOException {
    CloudataConf nconf = new CloudataConf();

    JobConf partitionJob = new JobConf(RestoreJob.class);
    FileSystem fs = FileSystem.get(partitionJob);
    if (!fs.exists(new Path(inputPath))) {
        throw new IOException("input path does not exist: " + inputPath);
    }
    if (CTable.existsTable(nconf, tableName)) {
        throw new IOException("table already exists: " + tableName);
    }

    TableSchema tableSchema = new TableSchema(tableName, "", columnNames);
    tableSchema.setNumOfVersion(numOfVersion);
    CTable.createTable(nconf, tableSchema);

    String columns = "";
    for (String eachColumn : columnNames) {
        columns += eachColumn.trim() + ",";
    }
    columns = columns.substring(0, columns.length() - 1);

    String jobName = tableName + " restore";
    String tempDir = jobName + "_" + System.currentTimeMillis();

    partitionJob.setJobName(jobName + "_partition");
    partitionJob.setMapperClass(RestorePartitionMap.class);
    FileInputFormat.addInputPath(partitionJob, new Path(inputPath));
    partitionJob.setInputFormat(RestoreTextInputFormat.class);
    partitionJob.set(DefaultTabletInputFormat.OUTPUT_TABLE, tableName);
    FileOutputFormat.setOutputPath(partitionJob, new Path(tempDir));
    // map only
    partitionJob.setNumReduceTasks(0);

    JobClient.runJob(partitionJob);

    fs.delete(new Path(tempDir), true);

    ////////////////////////////////////////////////////////////

    JobConf jobConf = new JobConf(BackupJob.class);
    jobConf.setJobName(jobName);
    jobConf.setMapperClass(RestoreMap.class);
    FileInputFormat.addInputPath(jobConf, new Path(inputPath));
    jobConf.setInputFormat(RestoreTextInputFormat.class);
    jobConf.set(DefaultTabletInputFormat.OUTPUT_TABLE, tableName);
    jobConf.set(DefaultTabletInputFormat.INPUT_COLUMN_LIST, columns);
    FileOutputFormat.setOutputPath(jobConf, new Path(tempDir));
    jobConf.setMaxMapAttempts(0);
    // map only
    jobConf.setNumReduceTasks(0);

    JobClient.runJob(jobConf);

    // delete temp output dir
    fs.delete(new Path(tempDir), true);
}
From source file: org.cloudata.examples.first.HdfsToCloudataMapReduce.java
License: Apache License

public void run(String[] args) throws IOException {
    if (args.length < 2) {
        System.out.println("Usage: java HdfsToCloudataMapReduce <input path> <table name>");
        System.exit(0);
    }

    Path inputPath = new Path(args[0]);
    String tableName = args[1];

    CloudataConf nconf = new CloudataConf();
    if (!CTable.existsTable(nconf, tableName)) {
        TableSchema tableSchema = new TableSchema(tableName);
        tableSchema.addColumn("col1");
        CTable.createTable(nconf, tableSchema);
    }

    JobConf jobConf = new JobConf(HdfsToCloudataMapReduce.class);
    String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);

    // <MAP>
    FileInputFormat.addInputPath(jobConf, inputPath);
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.setMapperClass(HdfsToCloudataMappper.class);
    jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, tableName);
    // </MAP>

    // <REDUCE>
    // Map only
    FileOutputFormat.setOutputPath(jobConf, new Path("HdfsToCloudataMapReduce_" + System.currentTimeMillis()));
    jobConf.setNumReduceTasks(0);
    // </REDUCE>

    try {
        JobClient.runJob(jobConf);
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        FileSystem fs = FileSystem.get(jobConf);
        fs.delete(FileOutputFormat.getOutputPath(jobConf), true);
        CloudataMapReduceUtil.clearMapReduce(libDir);
    }
}
From source file: org.cloudata.examples.upload.partitionjob.PartitionJob.java
License: Apache License

public boolean runJob(String inputPath, String tableName, int numOfTablets) throws IOException {
    JobConf jobConf = new JobConf(PartitionJob.class);
    String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);

    FileSystem fs = FileSystem.get(jobConf);
    // remove any stale log-count file before running
    FileUtil.delete(fs, new Path(getLogCountFilepath(tableName)), true);

    jobConf.setJobName("PartitionJob_" + tableName + "(" + new Date() + ")");
    jobConf.set("cloudata.numOfTablets", String.valueOf(numOfTablets));
    jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, tableName);

    String clientOpt = jobConf.get("mapred.child.java.opts");
    if (clientOpt == null) {
        clientOpt = "";
    }
    jobConf.set("mapred.child.java.opts", clientOpt + " -Duser.name=" + System.getProperty("user.name"));

    // <Map>
    FileInputFormat.addInputPath(jobConf, new Path(inputPath));
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.setMapperClass(PartitionMap.class);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    // </Map>

    // <Reduce>
    Path tempOutputPath = new Path("temp/partitionJob/" + tableName + "/reducer");
    FileOutputFormat.setOutputPath(jobConf, tempOutputPath);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);
    jobConf.setReducerClass(PartitionReducer.class);
    // single reduce task
    jobConf.setNumReduceTasks(1);
    // </Reduce>

    try {
        RunningJob job = JobClient.runJob(jobConf);
        return job.isSuccessful();
    } finally {
        FileUtil.delete(fs, new Path(getLogCountFilepath(tableName)), true);
        FileUtil.delete(fs, tempOutputPath, true);
        CloudataMapReduceUtil.clearMapReduce(libDir);
    }
}
From source file: org.cloudata.examples.upload.partitionjob.UploadJob.java
License: Apache License

public void runJob(String inputPath, String tableName) throws IOException {
    JobConf jobConf = new JobConf(UploadJob.class);
    String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);
    jobConf.setJobName("UploadJob_" + tableName + "(" + new Date() + ")");

    // KeyRangePartitioner reads the target table name from
    // AbstractTabletInputFormat.OUTPUT_TABLE
    jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, tableName);

    CloudataConf conf = new CloudataConf();
    CTable ctable = CTable.openTable(conf, tableName);
    TabletInfo[] tabletInfos = ctable.listTabletInfos();

    // <Map>
    FileInputFormat.addInputPath(jobConf, new Path(inputPath));
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.setMapperClass(UploadMap.class);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.setMapSpeculativeExecution(false);
    jobConf.setMaxMapAttempts(0);
    jobConf.setPartitionerClass(KeyRangePartitioner.class);
    // </Map>

    // <Reduce>
    Path tempOutputPath = new Path("temp/uploadJob/" + tableName + "/reducer");
    FileOutputFormat.setOutputPath(jobConf, tempOutputPath);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);
    jobConf.setReducerClass(UploadReducer.class);
    jobConf.setReduceSpeculativeExecution(false);
    jobConf.setMaxReduceAttempts(0);
    // one reduce task per tablet
    jobConf.setNumReduceTasks(tabletInfos.length);
    // </Reduce>

    try {
        JobClient.runJob(jobConf);
    } finally {
        FileSystem fs = FileSystem.get(jobConf);
        FileUtil.delete(fs, tempOutputPath, true);
        CloudataMapReduceUtil.clearMapReduce(libDir);
    }
}
From source file: org.cloudata.examples.upload.SimpleUploaderMapReduce.java
License: Apache License

public void run(String[] args) throws IOException {
    if (args.length < 3) {
        System.out.println("Usage: java SimpleUploaderMapReduce <input path> <table name> <# reduce>");
        System.exit(0);
    }

    Path inputPath = new Path(args[0]);
    String tableName = args[1];

    CloudataConf nconf = new CloudataConf();
    if (!CTable.existsTable(nconf, tableName)) {
        TableSchema tableSchema = new TableSchema(tableName);
        tableSchema.addColumn("Col1");

        Row.Key[] rowKeys = new Row.Key[20];
        for (int i = 0; i < 10; i++) {
            rowKeys[i] = new Row.Key("-0" + i);
        }
        for (int i = 1; i < 10; i++) {
            rowKeys[9 + i] = new Row.Key("0" + i);
        }
        rowKeys[19] = Row.Key.MAX_KEY;

        CTable.createTable(nconf, tableSchema, rowKeys);
    }

    JobConf jobConf = new JobConf(HdfsToCloudataMapReduce.class);
    String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);

    // <MAP>
    FileInputFormat.addInputPath(jobConf, inputPath);
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.setMapperClass(SimpleUploaderMapper.class);
    jobConf.setPartitionerClass(KeyRangePartitioner.class);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, tableName);
    // </MAP>

    // <REDUCE>
    FileOutputFormat.setOutputPath(jobConf, new Path("SimpleUploaderMapReduce_" + System.currentTimeMillis()));
    jobConf.setReducerClass(SimpleUploaderReducer.class);
    jobConf.setNumReduceTasks(Integer.parseInt(args[2]));
    jobConf.setMaxReduceAttempts(0);
    // </REDUCE>

    try {
        JobClient.runJob(jobConf);
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        FileSystem fs = FileSystem.get(jobConf);
        fs.delete(FileOutputFormat.getOutputPath(jobConf), true);
        CloudataMapReduceUtil.clearMapReduce(libDir);
    }
}