List of usage examples for org.apache.hadoop.mapreduce Job setMapOutputValueClass
public void setMapOutputValueClass(Class<?> theClass) throws IllegalStateException
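A minimal, self-contained driver sketch of the pattern shared by the examples below (the WordCountDriver class and its nested mapper and reducer are illustrative placeholders, not taken from any of the listed projects): setMapOutputValueClass is needed whenever the mapper's output value type differs from the job's final output value type, and it must be called before the job is submitted, since the method throws IllegalStateException once the job has left its setup state.

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountDriver {

    // Mapper emits <Text, IntWritable> pairs.
    public static class TokenMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, ONE);
            }
        }
    }

    // Reducer emits <Text, Text>, i.e. a different value type than the mapper.
    public static class SumReducer extends Reducer<Text, IntWritable, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            long sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }
            context.write(key, new Text(Long.toString(sum)));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCountDriver.class);
        job.setMapperClass(TokenMapper.class);
        job.setReducerClass(SumReducer.class);

        // The mapper's value type (IntWritable) differs from the job's final
        // output value type (Text), so it must be declared explicitly.
        // Both calls have to happen before submission; once the job is
        // running, setMapOutputValueClass throws IllegalStateException.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

If the map output types match the final output types, the call can be omitted; Hadoop then falls back to the types set with setOutputKeyClass and setOutputValueClass.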
From source file: diamondmapreduce.DiamondMapReduce.java
License: Apache License
int launchHamondAWS(String[] arguments) throws Exception {
    //extract diamond, query, reference and output from array
    String diamond = arguments[0];
    String query = arguments[1];
    String dataBase = arguments[2];
    String outPut = arguments[3];

    //set Hadoop configuration
    Job job = Job.getInstance(getConf(), "DIAMOND");
    Configuration conf = job.getConfiguration();
    SetConf.setHadoopConf(conf);

    //get user name
    userName = HadoopUser.getHadoopUser();

    //delete all existing DIAMOND files under current Hadoop user
    DeleteHDFSFiles.deleteAllFiles(userName);

    //make local Hamond dir
    awshamondsidefunctions.MakeHamondDir.make();

    //copy DIAMOND, query, reference from S3 to master local
    awshamondsidefunctions.CopyFromS3.copyFromS3(diamond, query, dataBase);

    //make Hamond directory on HDFS
    MakeHamondHDFSdir.makedir(conf, userName);

    //make DIAMOND database on local then copy to HDFS with query and delete local database
    MakeDB.makeDB("/mnt/Hamond/diamond", "/mnt/Hamond/" + new Path(dataBase).getName());

    //copy DIAMOND bin, query and local database file to HDFS
    CopyFromLocal.copyFromLocal(conf, "/mnt/Hamond/diamond", "/mnt/Hamond/" + new Path(query).getName(),
            "/mnt/Hamond/" + new Path(dataBase).getName(), userName);

    //pass query name and database name to mappers
    conf.set(QUERY, query);
    conf.set(DATABASE, dataBase);
    conf.set(OUTPUT, outPut);
    String[] subArgs = Arrays.copyOfRange(arguments, 4, arguments.length);
    conf.setStrings("DIAMOND-arguments", subArgs);
    conf.setStrings(OUTPUT, outPut);

    //add DIAMOND bin and database into distributed cache
    job.addCacheFile(new URI("/user/" + userName + "/Hamond/diamond"));
    job.addCacheFile(new URI("/user/" + userName + "/Hamond/" + new Path(dataBase).getName() + ".dmnd"));

    //set job input and output paths
    FileInputFormat.addInputPath(job, new Path("/user/" + userName + "/Hamond/" + new Path(query).getName()));
    FileOutputFormat.setOutputPath(job, new Path("/user/" + userName + "/Hamond/out"));

    //set job driver and mapper
    job.setJarByClass(DiamondMapReduce.class);
    job.setMapperClass(DiamondMapper.class);
    job.setReducerClass(AWSDiamondReducer.class);

    //set job input format into customized multilines format
    job.setInputFormatClass(CustomNLineFileInputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setNumReduceTasks(1);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file: distributed.hadoop.MapReduceJobConfig.java
License: Open Source License
/**
 * Apply the settings encapsulated in this config and return a Job object
 * ready for execution.
 *
 * @param jobName the name of the job
 * @param conf the Configuration object that will be wrapped in the Job
 * @param env environment variables
 * @return a configured Job object
 * @throws IOException if a problem occurs
 * @throws ClassNotFoundException if various classes are not found
 */
public Job configureForHadoop(String jobName, Configuration conf, Environment env)
        throws IOException, ClassNotFoundException {

    String jobTrackerPort = getJobTrackerPort();
    if (DistributedJobConfig.isEmpty(jobTrackerPort)) {
        jobTrackerPort = AbstractHadoopJobConfig.isHadoop2() ? AbstractHadoopJobConfig.DEFAULT_PORT_YARN
                : AbstractHadoopJobConfig.DEFAULT_PORT;
    }
    String jobTracker = getJobTrackerHost() + ":" + jobTrackerPort;
    if (DistributedJobConfig.isEmpty(jobTracker)) {
        System.err.println("No " + (AbstractHadoopJobConfig.isHadoop2() ? "resource manager " : "JobTracker ")
                + "set - running locally...");
    } else {
        jobTracker = environmentSubstitute(jobTracker, env);
        if (AbstractHadoopJobConfig.isHadoop2()) {
            conf.set(YARN_RESOURCE_MANAGER_ADDRESS, jobTracker);
            conf.set(YARN_RESOURCE_MANAGER_SCHEDULER_ADDRESS,
                    environmentSubstitute(getJobTrackerHost(), env) + ":8030");
        } else {
            conf.set(HADOOP_JOB_TRACKER_HOST, jobTracker);
        }
    }
    System.err.println("Using " + (AbstractHadoopJobConfig.isHadoop2() ? "resource manager: " : "jobtracker: ")
            + jobTracker);

    if (AbstractHadoopJobConfig.isHadoop2()) {
        // a few other properties needed to run against Yarn
        conf.set("yarn.nodemanager.aux-services", "mapreduce_shuffle");
        conf.set("mapreduce.framework.name", "yarn");
    }

    if (!DistributedJobConfig.isEmpty(getMapredMaxSplitSize())) {
        conf.set(AbstractHadoopJobConfig.isHadoop2() ? HADOOP2_MAPRED_MAX_SPLIT_SIZE
                : HADOOP_MAPRED_MAX_SPLIT_SIZE, getMapredMaxSplitSize());
    }

    // Do any user supplied properties here before creating the Job
    for (Map.Entry<String, String> e : m_additionalUserSuppliedProperties.entrySet()) {
        conf.set(e.getKey(), e.getValue());
    }

    m_hdfsConfig.configureForHadoop(conf, env);

    Job job = new Job(conf, jobName);

    String numMappers = getNumberOfMaps();
    if (!DistributedJobConfig.isEmpty(numMappers)) {
        numMappers = environmentSubstitute(numMappers, env);
        ((JobConf) job.getConfiguration()).setNumMapTasks(Integer.parseInt(numMappers));
    }

    // The number of map tasks that will be run simultaneously by a task tracker
    String maxConcurrentMapTasks = getTaskTrackerMapTasksMaximum();
    if (!DistributedJobConfig.isEmpty(maxConcurrentMapTasks)) {
        ((JobConf) job.getConfiguration()).set("mapred.tasktracker.map.tasks.maximum", maxConcurrentMapTasks);
    }

    String numReducers = getNumberOfReducers();
    if (!DistributedJobConfig.isEmpty(numReducers)) {
        numReducers = environmentSubstitute(numReducers, env);
        job.setNumReduceTasks(Integer.parseInt(numReducers));

        if (Integer.parseInt(numReducers) == 0) {
            System.err.println("Warning - no reducer class set. Configuring for a map only job");
        }
    } else {
        job.setNumReduceTasks(1);
    }

    String mapperClass = getMapperClass();
    if (DistributedJobConfig.isEmpty(mapperClass)) {
        throw new IOException("No mapper class specified!");
    }
    mapperClass = environmentSubstitute(mapperClass, env);

    @SuppressWarnings("unchecked")
    Class<? extends Mapper> mc = (Class<? extends Mapper>) Class.forName(mapperClass);
    job.setMapperClass(mc);

    String reducerClass = getReducerClass();
    if (DistributedJobConfig.isEmpty(reducerClass) && Integer.parseInt(numReducers) > 0) {
        throw new IOException("No reducer class specified!");
    } else if (job.getNumReduceTasks() > 0) {
        reducerClass = environmentSubstitute(reducerClass, env);

        @SuppressWarnings("unchecked")
        Class<? extends Reducer> rc = (Class<? extends Reducer>) Class.forName(reducerClass);
        job.setReducerClass(rc);
    }

    String combinerClass = getCombinerClass();
    if (!DistributedJobConfig.isEmpty(combinerClass)) {
        combinerClass = environmentSubstitute(combinerClass, env);

        @SuppressWarnings("unchecked")
        Class<? extends Reducer> cc = (Class<? extends Reducer>) Class.forName(combinerClass);
        job.setCombinerClass(cc);
    }

    String inputFormatClass = getInputFormatClass();
    if (DistributedJobConfig.isEmpty(inputFormatClass)) {
        throw new IOException("No input format class specified");
    }
    inputFormatClass = environmentSubstitute(inputFormatClass, env);

    @SuppressWarnings("unchecked")
    Class<? extends InputFormat> ifc = (Class<? extends InputFormat>) Class.forName(inputFormatClass);
    job.setInputFormatClass(ifc);

    String outputFormatClass = getOutputFormatClass();
    if (DistributedJobConfig.isEmpty(outputFormatClass)) {
        throw new IOException("No output format class specified");
    }
    outputFormatClass = environmentSubstitute(outputFormatClass, env);

    @SuppressWarnings("unchecked")
    Class<? extends OutputFormat> ofc = (Class<? extends OutputFormat>) Class.forName(outputFormatClass);
    job.setOutputFormatClass(ofc);

    String mapOutputKeyClass = getMapOutputKeyClass();
    if (DistributedJobConfig.isEmpty(mapOutputKeyClass)) {
        throw new IOException("No map output key class defined");
    }
    mapOutputKeyClass = environmentSubstitute(mapOutputKeyClass, env);
    Class mokc = Class.forName(mapOutputKeyClass);
    job.setMapOutputKeyClass(mokc);

    String mapOutputValueClass = getMapOutputValueClass();
    if (DistributedJobConfig.isEmpty(mapOutputValueClass)) {
        throw new IOException("No map output value class defined");
    }
    mapOutputValueClass = environmentSubstitute(mapOutputValueClass, env);
    Class movc = Class.forName(mapOutputValueClass);
    job.setMapOutputValueClass(movc);

    String outputKeyClass = getOutputKeyClass();
    if (DistributedJobConfig.isEmpty(outputKeyClass)) {
        throw new IOException("No output key class defined");
    }
    outputKeyClass = environmentSubstitute(outputKeyClass, env);
    Class okc = Class.forName(outputKeyClass);
    job.setOutputKeyClass(okc);

    String outputValueClass = getOutputValueClass();
    if (DistributedJobConfig.isEmpty(outputValueClass)) {
        throw new IOException("No output value class defined");
    }
    outputValueClass = environmentSubstitute(outputValueClass, env);
    Class ovc = Class.forName(outputValueClass);
    job.setOutputValueClass(ovc);

    String inputPaths = getInputPaths();
    // don't complain if there aren't any as inputs such as HBASE
    // require other properties to be set
    if (!DistributedJobConfig.isEmpty(inputPaths)) {
        inputPaths = environmentSubstitute(inputPaths, env);
        FileInputFormat.setInputPaths(job, inputPaths);
    }

    String outputPath = getOutputPath();
    if (DistributedJobConfig.isEmpty(outputPath)) {
        throw new IOException("No output path specified");
    }
    outputPath = environmentSubstitute(outputPath, env);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job;
}
From source file: dk.statsbiblioteket.hadoop.archeaderextractor.ARCHeaderExtractorMR.java
License: Apache License
public int run(String[] args) throws Exception {
    Configuration configuration = getConf();

    Job job = new Job(configuration, "ARC Header Extractor");
    job.setJarByClass(ARCHeaderExtractorMR.class);

    job.setMapperClass(ARCHeaderExtractorMapper.class);
    job.setCombinerClass(ARCHeaderExtractorReducer.class);
    job.setReducerClass(ARCHeaderExtractorReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    int n = args.length;
    if (n == 0 || n > 2) {
        System.err.println(
                "Not enough arguments. input dir and output dir mandatory. Only " + n + " were supplied.");
        System.exit(0);
    }

    SequenceFileInputFormat.addInputPath(job, new Path(args[0]));
    SequenceFileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job.waitForCompletion(true) ? 0 : -1;
}
From source file: edu.berkeley.chukwa_xtrace.XtrExtract.java
License: Apache License
@Override
public int run(String[] arg) throws Exception {
    Job extractor = new Job(getConf());
    extractor.setMapperClass(MapClass.class);
    extractor.setReducerClass(Reduce.class);
    extractor.setJobName("x-trace reconstructor");
    extractor.setJarByClass(this.getClass());

    extractor.setMapOutputKeyClass(BytesWritable.class);
    extractor.setMapOutputValueClass(Text.class);

    extractor.setOutputKeyClass(BytesWritable.class);
    extractor.setOutputValueClass(TextArrayWritable.class);
    extractor.setInputFormatClass(SequenceFileInputFormat.class);
    extractor.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileInputFormat.setInputPaths(extractor, new Path(arg[0]));
    FileOutputFormat.setOutputPath(extractor, new Path(arg[1]));

    System.out.println("looks OK. Submitting.");
    extractor.submit();
    // extractor.waitForCompletion(false);
    return 0;
}
From source file: edu.berkeley.chukwa_xtrace.XtrIndex.java
License: Apache License
@Override
public int run(String[] arg) throws Exception {
    Job extractor = new Job(getConf());
    extractor.setMapperClass(MapClass.class);
    //no reduce, just identity
    extractor.setJobName("x-trace indexer");
    extractor.setJarByClass(this.getClass());

    extractor.setMapOutputKeyClass(BytesWritable.class);
    extractor.setMapOutputValueClass(TextArrayWritable.class);

    extractor.setOutputKeyClass(BytesWritable.class);
    extractor.setOutputValueClass(TextArrayWritable.class);
    extractor.setInputFormatClass(SequenceFileInputFormat.class);
    extractor.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileInputFormat.setInputPaths(extractor, new Path(arg[0]));
    FileOutputFormat.setOutputPath(extractor, new Path(arg[1]));

    System.out.println("looks OK. Submitting.");
    extractor.submit();
    // extractor.waitForCompletion(false);
    return 0;
}
From source file: edu.bigdata.training.serialization.UserHistory.java
@Override
public int run(String[] strings) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, "postHistory");
    job.setJarByClass(UserHistory.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setNumReduceTasks(1);

    MultipleInputs.addInputPath(job, new Path("input/posts/user_info.txt"), TextInputFormat.class,
            UserCityMapper.class);
    MultipleInputs.addInputPath(job, new Path("input/posts/user_posts.txt"), TextInputFormat.class,
            UserPostsMapper.class);

    AvroJob.setOutputKeySchema(job, Schema.create(Schema.Type.STRING));
    AvroJob.setOutputValueSchema(job, UserPostSummary.getClassSchema());
    job.setOutputFormatClass(AvroKeyValueOutputFormat.class);

    Path outPath = new Path("output/user/posts");
    FileOutputFormat.setOutputPath(job, outPath);
    job.setReducerClass(UserPostHistory.class);
    //outPath.getFileSystem(job.getConfiguration()).delete(outPath, true);

    return (job.waitForCompletion(true) ? 0 : 1);
}
From source file: edu.columbia.hs2807.Sentiment.java
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "sentiment");

    job.setJarByClass(Sentiment.class);
    job.setMapperClass(Map.class);
    job.setCombinerClass(Combine.class);
    job.setReducerClass(Reduce.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(LongArrayWritable.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(DoubleWritable.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file: edu.gslis.ts.hadoop.ThriftBulkLoader.java
License: Apache License
public int run(String[] args) throws Exception {
    String tableName = args[0];
    String inputPath = args[1];
    String outputPath = args[2];
    Path topicsFile = new Path(args[3]);
    Path vocabFile = new Path(args[4]);
    Path dateBinFile = new Path(args[5]);

    Configuration config = getConf();
    config.set("hbase.table.name", tableName);
    HBaseConfiguration.addHbaseResources(config);

    Job job = Job.getInstance(config);
    job.setJarByClass(ThriftBulkLoader.class);
    job.setJobName("Bulk Loading HBase Table::" + tableName);
    job.setInputFormatClass(ThriftFileInputFormat.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapperClass(ThriftFilterMapper.class);

    Path output = new Path(outputPath);
    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileInputFormat.setInputDirRecursive(job, true);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapOutputValueClass(Put.class);

    job.addCacheFile(topicsFile.toUri());
    job.addCacheFile(vocabFile.toUri());
    job.addCacheFile(dateBinFile.toUri());

    job.getConfiguration().setBoolean("mapreduce.map.output.compress", true);
    job.getConfiguration().setClass("mapred.map.output.compression.codec",
            org.apache.hadoop.io.compress.SnappyCodec.class,
            org.apache.hadoop.io.compress.CompressionCodec.class);
    job.getConfiguration().set("hfile.compression", Compression.Algorithm.SNAPPY.getName());

    //RegionLocator regionLocator = conn.getRegionLocator(tableName);
    //HFileOutputFormat2.configureIncrementalLoad(job, new HTable(config,tableName));

    Connection con = ConnectionFactory.createConnection(config);
    TableName htableName = TableName.valueOf(tableName);
    HFileOutputFormat2.configureIncrementalLoad(job, con.getTable(htableName), con.getRegionLocator(htableName));

    job.waitForCompletion(true);
    if (job.isSuccessful()) {
        // Couldn't find a better way to do this. The LoadIncrementalHFiles
        // seems to want 777 permissions on the output directory.
        try {
            Runtime rt = Runtime.getRuntime();
            rt.exec("hadoop fs -chmod -R 777 " + output);
        } catch (Exception e) {
            e.printStackTrace();
        }
        /*
        LoadIncrementalHFiles loader = new LoadIncrementalHFiles(config);
        HTable htable = new HTable(config, tableName);
        loader.doBulkLoad(new Path(outputPath), htable);
        */
    } else {
        throw new IOException("error with job");
    }

    return 0;

    // -
    /*
    Job job = Job.getInstance(config);
    job.setJarByClass(ThriftBulkLoader.class);

    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(Put.class);
    job.setInputFormatClass(ThriftFileInputFormat.class);

    //HFileOutputFormat2.configureIncrementalLoad(job, htable);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileInputFormat.setInputDirRecursive(job, true);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.addCacheFile(topicsFile.toUri());
    job.addCacheFile(vocabFile.toUri());

    job.setMapperClass(ThriftFilterMapper.class);

    boolean b = job.waitForCompletion(true);
    if (!b) {
        throw new IOException("error with job");
    }

    LoadIncrementalHFiles loader = new LoadIncrementalHFiles(config);
    loader.doBulkLoad(new Path(outputPath), htable);

    return 0;
    */
}
From source file: edu.indiana.d2i.htrc.exp.PartialVectorsFromTokenizedDoc.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
    }

    // all directories are in HDFS
    tokenizedDocDir = args[0];
    dictDir = args[1];
    outputDir = args[2];
    numReducers = Integer.valueOf(args[3]);

    logger.info("PartialVectorsFromTokenizedDoc ");
    logger.info(" - tokenizedDocDir: " + tokenizedDocDir);
    logger.info(" - dictDir: " + dictDir);
    logger.info(" - outputDir: " + outputDir);
    logger.info(" - numReducers: " + numReducers);

    Path tokenizedDocPath = new Path(tokenizedDocDir);
    Path dictPath = new Path(dictDir);
    Path outputPath = new Path(outputDir);

    // get dimension
    Configuration conf = getConf();

    int dimension = 0;
    for (Pair<Writable, IntWritable> record : new SequenceFileIterable<Writable, IntWritable>(dictPath, true,
            conf)) {
        dimension++;
    }
    logger.info("dimension of a vector: " + dimension);

    // submit job
    long t0 = System.currentTimeMillis();

    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt(PartialVectorMerger.DIMENSION, dimension);
    DistributedCache.setCacheFiles(new URI[] { dictPath.toUri() }, conf);

    Job job = new Job(conf);
    job.setJobName("PartialVectorsFromTokenizedDoc::MakePartialVectors: input-folder: " + tokenizedDocDir
            + ", dictionary-file: " + dictDir);
    job.setJarByClass(PartialVectorsFromTokenizedDoc.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(StringTuple.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    FileInputFormat.setInputPaths(job, tokenizedDocPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    HadoopUtil.delete(conf, outputPath);

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setReducerClass(TFPartialVectorReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(numReducers);

    job.waitForCompletion(true);

    long t1 = System.currentTimeMillis();
    logger.info("PartialVectorsFromTokenizedDoc takes " + (double) (t1 - t0) / 1000 + " seconds.");

    return 0;
}
From source file: edu.indiana.d2i.htrc.io.DataCopyTokenizerJob.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 6) {
        printUsage();
    }

    String inputPath = args[0];
    String outputPath = args[1];
    int maxIdsPerSplit = Integer.valueOf(args[2]);
    String dataAPIConfClassName = args[3];
    String analyzerClassName = args[4];
    int maxIdsPerReq = Integer.valueOf(args[5]);

    logger.info("DataCopyTokenizerJob ");
    logger.info(" - input: " + inputPath);
    logger.info(" - output: " + outputPath);
    logger.info(" - maxIdsPerSplit: " + maxIdsPerSplit);
    logger.info(" - dataAPIConfClassName: " + dataAPIConfClassName);
    logger.info(" - analyzerName: " + analyzerClassName);
    logger.info(" - maxIdsPerReq: " + maxIdsPerReq);

    // upload dictionary file to HDFS
    // FileSystem fs = FileSystem.get(getConf());
    // Path dictionaryPath = new Path(outputPath, Utilities.path2FileName(dictionaryFile));
    // BufferedWriter writer = new BufferedWriter(
    //     new OutputStreamWriter(fs.create(dictionaryPath, true)));
    // BufferedReader reader = new BufferedReader(new FileReader(dictionaryFile));
    // String line = null;
    // while ((line = reader.readLine()) != null) {
    //     writer.write(line + "\n");
    // }
    // writer.close();
    //
    Job job = new Job(getConf(), "Copy and tokenize data from HTRC data storage parallely.");
    job.setJarByClass(DataCopyTokenizerJob.class);

    // set analyzer
    job.getConfiguration().set(DocumentProcessor.ANALYZER_CLASS, analyzerClassName);

    // set distributed cache
    // Path dictionaryPath = new Path(dictionaryFile);
    // DistributedCache.setCacheFiles(new URI[] {dictionaryPath.toUri()}, job.getConfiguration());

    // set data api conf
    job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);
    Utilities.setDataAPIConf(job.getConfiguration(), dataAPIConfClassName, maxIdsPerReq);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(IDInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StringTuple.class);

    job.setMapperClass(DataCopyTokenizerMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    long start = System.nanoTime();
    job.waitForCompletion(true);
    logger.info("DataCopyTokenizerJob took " + (System.nanoTime() - start) / 1e9 + " seconds.");

    return 0;
}