List of usage examples for org.apache.hadoop.mapred.FileInputFormat.setInputPaths
public static void setInputPaths(JobConf conf, Path... inputPaths)
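Before the project examples below, here is a minimal, self-contained sketch of the call itself. It is not taken from any of the source files on this page; the class name and the /data/* paths are hypothetical placeholders. Note that setInputPaths is a varargs method that replaces any previously registered input paths, the old mapred API also offers an overload taking a comma-separated String of paths, and addInputPath appends instead of replacing.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class SetInputPathsSketch {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(SetInputPathsSketch.class);
        conf.setJobName("SetInputPathsSketch");

        // Varargs form: replaces any previously set input paths with this list.
        // The paths are hypothetical placeholders.
        FileInputFormat.setInputPaths(conf, new Path("/data/in1"), new Path("/data/in2"));

        // Equivalent String overload: a comma-separated list of paths.
        // FileInputFormat.setInputPaths(conf, "/data/in1,/data/in2");

        // To append rather than replace, use addInputPath:
        // FileInputFormat.addInputPath(conf, new Path("/data/in3"));

        // With TextInputFormat and the default identity mapper/reducer,
        // keys are byte offsets (LongWritable) and values are lines (Text).
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);
        FileOutputFormat.setOutputPath(conf, new Path("/data/out"));

        JobClient.runJob(conf);
    }
}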
From source file:ivory.preprocess.GetTermCount.java
License:Apache License
public int runTool() throws Exception {
    // Create a new JobConf, inheriting from the configuration of this PowerTool.
    JobConf conf = new JobConf(getConf(), GetTermCount.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get(Constants.IndexPath);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    int mapTasks = conf.getInt(Constants.NumMapTasks, 0);
    int reduceTasks = conf.getInt(Constants.NumReduceTasks, 0);

    String collectionName = env.readCollectionName();
    String termDocVectorsPath = env.getTermDocVectorsDirectory();
    String termDfCfPath = env.getTermDfCfDirectory();

    if (!fs.exists(new Path(indexPath))) {
        sLogger.info("index path doesn't exist: skipping!");
        return 0;
    }

    sLogger.info("PowerTool: GetTermCount");
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info(" - NumMapTasks: " + mapTasks);
    sLogger.info(" - NumReduceTasks: " + reduceTasks);
    sLogger.info(" - MinDf: " + conf.getInt(Constants.MinDf, 0));
    sLogger.info(" - MaxDf: " + conf.getInt(Constants.MaxDf, Integer.MAX_VALUE));

    Path outputPath = new Path(termDfCfPath);
    if (fs.exists(outputPath)) {
        sLogger.error("TermDfCf directory exists: skipping!");
        return 0;
    }

    conf.setJobName("GetTermCount:" + collectionName);
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    FileInputFormat.setInputPaths(conf, new Path(termDocVectorsPath));
    FileOutputFormat.setOutputPath(conf, outputPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(PairOfIntLong.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(PairOfIntLong.class);
    conf.setMapperClass(MyMapper.class);
    conf.setCombinerClass(MyCombiner.class);
    conf.setReducerClass(MyReducer.class);

    long startTime = System.currentTimeMillis();
    RunningJob job = JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    Counters counters = job.getCounters();

    // Write out the number of postings.
    int collectionTermCount = (int) counters.findCounter(Statistics.Terms).getCounter();
    env.writeCollectionTermCount(collectionTermCount);

    // NOTE: this value is not the same as the number of postings, because
    // postings for non-English terms are discarded, or as a result of the df cut.
    long collectionLength = counters.findCounter(Statistics.SumOfDocLengths).getCounter();
    env.writeCollectionLength(collectionLength);
    return 0;
}
From source file:ivory.ptc.AnchorTextInvertedIndex.java
License:Apache License
@Override
public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), AnchorTextInvertedIndex.class);
    FileSystem fs = FileSystem.get(conf);

    String inPath = conf.get("Ivory.InputPath");
    String outPath = conf.get("Ivory.OutputPath");
    Path inputPath = new Path(inPath);
    Path outputPath = new Path(outPath);
    int mapTasks = conf.getInt("Ivory.NumMapTasks", 1);
    int reduceTasks = conf.getInt("Ivory.NumReduceTasks", 100);
    String weightingSchemeParameters = conf.get("Ivory.WeightingSchemeParameters");

    LOG.info("BuildAnchorTextInvertedIndex");
    LOG.info(" - input path: " + inPath);
    LOG.info(" - output path: " + outPath);
    LOG.info(" - number of reducers: " + reduceTasks);
    LOG.info(" - weighting scheme: " + conf.get("Ivory.WeightingScheme"));
    LOG.info(" - weighting scheme parameters: " + weightingSchemeParameters);

    String[] params = weightingSchemeParameters.split(PARAMETER_SEPARATER);
    for (String param : params) {
        DistributedCache.addCacheFile(new URI(param), conf);
    }

    conf.setJobName("BuildAnchorTextInvertedIndex");
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);
    conf.set("mapred.child.java.opts", "-Xmx4096m");
    conf.setInt("mapred.task.timeout", 60000000);

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(AnchorTextTarget.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(ArrayListWritable.class);
    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);

    fs.delete(outputPath);
    JobClient.runJob(conf);
    return 0;
}
From source file:ivory.ptc.driver.XMLFormatJudgments.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
        return -1;
    }
    JobConf conf = new JobConf(getConf(), XMLFormatJudgments.class);

    // Command-line arguments
    String inPath = args[0];
    String outPath = args[1];
    String docnoMapping = args[2];

    Path inputPath = new Path(inPath);
    Path outputPath = new Path(outPath);
    int mapTasks = 1;
    int reduceTasks = 1;

    conf.setJobName("FormatPseudoJudgments");
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    DistributedCache.addCacheFile(new URI(docnoMapping), conf);

    FileSystem.get(conf).delete(outputPath);

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setMapOutputKeyClass(PseudoQuery.class);
    conf.setMapOutputValueClass(PseudoJudgments.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(IdentityMapper.class);
    conf.setReducerClass(MyReducer.class);

    JobClient.runJob(conf);
    return 0;
}
From source file:ivory.ptc.driver.XMLFormatQueries.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        printUsage();
        return -1;
    }
    JobConf conf = new JobConf(getConf(), XMLFormatQueries.class);

    // Command-line arguments
    String inPath = args[0];
    String outPath = args[1];

    Path inputPath = new Path(inPath);
    Path outputPath = new Path(outPath);
    int mapTasks = 1;
    int reduceTasks = 1;

    conf.setJobName("FormatPseudoQueries");
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    FileSystem.get(conf).delete(outputPath);

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setMapOutputKeyClass(PseudoQuery.class);
    conf.setMapOutputValueClass(PseudoJudgments.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(IdentityMapper.class);
    conf.setReducerClass(MyReducer.class);

    JobClient.runJob(conf);
    return 0;
}
From source file:ivory.ptc.SortedPseudoTestCollection.java
License:Apache License
public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), SortedPseudoTestCollection.class);
    FileSystem fs = FileSystem.get(conf);

    String inPath = conf.get("Ivory.InputPath");
    String outPath = conf.get("Ivory.OutputPath");
    Path inputPath = new Path(inPath);
    Path outputPath = new Path(outPath);
    int mapTasks = 1;
    int reduceTasks = 1;

    LOG.info("SortedPseudoTestCollection");
    LOG.info(" - Input path: " + conf.get("Ivory.InputPath"));
    LOG.info(" - Output path: " + conf.get("Ivory.OutputPath"));
    LOG.info(" - JudgmentExtractor: " + conf.get("Ivory.JudgmentExtractor"));
    LOG.info(" - JudgmentExtractorParameters: " + conf.get("Ivory.JudgmentExtractorParameters"));
    LOG.info(" - SamplingCriterion: " + conf.get("Ivory.SamplingCriterion"));
    LOG.info(" - SamplingCriterionParameters: " + conf.get("Ivory.SamplingCriterionParameters"));
    LOG.info(" - QueryScorer: " + conf.get("Ivory.QueryScorer"));

    conf.setJobName("SortedPTC");
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);
    conf.set("mapred.child.java.opts", "-Xmx4096m");

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setMapOutputKeyClass(PseudoQuery.class);
    conf.setMapOutputValueClass(PseudoJudgments.class);
    conf.setOutputKeyClass(PseudoQuery.class);
    conf.setOutputValueClass(PseudoJudgments.class);
    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);

    fs.delete(outputPath);
    JobClient.runJob(conf);
    return 0;
}
From source file:ivory.server.RunDistributedRetrievalServers.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        printUsage();
        return -1;
    }

    String configFile = args[0];
    FileSystem fs = FileSystem.get(getConf());
    Document d = DocumentBuilderFactory.newInstance().newDocumentBuilder()
            .parse(fs.open(new Path(configFile)));

    sLogger.info("Reading configuration to determine number of servers to launch:");
    List<String> sids = new ArrayList<String>();
    NodeList servers = d.getElementsByTagName("server");
    for (int i = 0; i < servers.getLength(); i++) {
        Node node = servers.item(i);

        // Get the server id.
        String sid = XMLTools.getAttributeValue(node, "id", null);
        if (sid == null) {
            throw new Exception("Must specify a query id attribute for every server!");
        }
        sLogger.info(" - sid: " + sid);
        sids.add(sid);
    }

    int port = 7000;
    int numServers = sids.size();
    String configPath = args[1];
    if (fs.exists(new Path(configPath))) {
        fs.delete(new Path(configPath), true);
    }

    String fname = appendPath(configPath, "config-" + numServers + ".txt");
    sLogger.info("Writing configuration to: " + fname);
    StringBuffer sb = new StringBuffer();
    for (int n = 0; n < numServers; n++) {
        port++;
        sb.append(sids.get(n) + " " + port + "\n");
    }
    FSDataOutputStream out = fs.create(new Path(fname), true);
    out.writeBytes(sb.toString());
    out.close();

    JobConf conf = new JobConf(RetrievalServer.class);
    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(0);
    conf.setInputFormat(NLineInputFormat.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setMapperClass(ServerMapper.class);
    FileInputFormat.setInputPaths(conf, new Path(fname));

    conf.set("Ivory.ConfigFile", configFile);
    conf.set("Ivory.ConfigPath", configPath);
    conf.setJobName("RetrievalServers");
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    // conf.set("mapred.job.queue.name", "search");

    JobClient client = new JobClient(conf);
    client.submitJob(conf);

    sLogger.info("Waiting for servers to start up...");

    // Poll HDFS for hostnames and ports.
    boolean allStarted = true;
    do {
        allStarted = true;
        for (int n = 0; n < numServers; n++) {
            String f = appendPath(configPath, sids.get(n) + ".host");
            if (!fs.exists(new Path(f))) {
                allStarted = false;
            }
        }
        Thread.sleep(10000);
        sLogger.info(" ...");
    } while (!allStarted);

    // Poll HDFS for the signal that the index is ready.
    boolean allReady = true;
    do {
        allReady = true;
        for (int n = 0; n < numServers; n++) {
            String f = appendPath(configPath, sids.get(n) + ".ready");
            if (!fs.exists(new Path(f))) {
                allReady = false;
            }
        }
        Thread.sleep(10000);
        sLogger.info(" ...");
    } while (!allReady);

    sLogger.info("All servers ready!");
    sLogger.info("Host information:");
    for (int n = 0; n < numServers; n++) {
        String f = appendPath(configPath, sids.get(n) + ".host");
        sLogger.info(" sid=" + sids.get(n) + ", " + FSProperty.readString(fs, f));
    }

    return 0;
}
From source file:ivory.smrf.retrieval.distributed.RunDistributedRetrievalServers.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        printUsage();
        return -1;
    }

    String configFile = args[0];
    FileSystem fs = FileSystem.get(getConf());
    Document d = DocumentBuilderFactory.newInstance().newDocumentBuilder()
            .parse(fs.open(new Path(configFile)));

    sLogger.info("Reading configuration to determine number of servers to launch:");
    List<String> sids = new ArrayList<String>();
    NodeList servers = d.getElementsByTagName("server");
    for (int i = 0; i < servers.getLength(); i++) {
        Node node = servers.item(i);

        // Get the server id.
        String sid = XMLTools.getAttributeValue(node, "id", null);
        if (sid == null) {
            throw new Exception("Must specify a query id attribute for every server!");
        }
        sLogger.info(" - sid: " + sid);
        sids.add(sid);
    }

    int port = 7000;
    int numServers = sids.size();
    String configPath = args[1];
    if (fs.exists(new Path(configPath))) {
        fs.delete(new Path(configPath), true);
    }

    String fname = appendPath(configPath, "config-" + numServers + ".txt");
    sLogger.info("Writing configuration to: " + fname);
    StringBuffer sb = new StringBuffer();
    for (int n = 0; n < numServers; n++) {
        port++;
        sb.append(sids.get(n) + " " + port + "\n");
    }
    FSDataOutputStream out = fs.create(new Path(fname), true);
    out.writeBytes(sb.toString());
    out.close();

    JobConf conf = new JobConf(getConf(), RetrievalServer.class);
    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(0);
    conf.setInputFormat(NLineInputFormat.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setMapperClass(ServerMapper.class);
    FileInputFormat.setInputPaths(conf, new Path(fname));

    conf.set("Ivory.ConfigFile", configFile);
    conf.set("Ivory.ConfigPath", configPath);
    conf.setJobName("RetrievalServers");
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    // conf.set("mapred.job.queue.name", "search");

    JobClient client = new JobClient(conf);
    client.submitJob(conf);

    sLogger.info("Waiting for servers to start up...");

    // Poll HDFS for hostnames and ports.
    boolean allStarted = true;
    do {
        allStarted = true;
        for (int n = 0; n < numServers; n++) {
            String f = appendPath(configPath, sids.get(n) + ".host");
            if (!fs.exists(new Path(f))) {
                allStarted = false;
            }
        }
        Thread.sleep(10000);
        sLogger.info(" ...");
    } while (!allStarted);

    // Poll HDFS for the signal that the index is ready.
    boolean allReady = true;
    do {
        allReady = true;
        for (int n = 0; n < numServers; n++) {
            String f = appendPath(configPath, sids.get(n) + ".ready");
            if (!fs.exists(new Path(f))) {
                allReady = false;
            }
        }
        Thread.sleep(10000);
        sLogger.info(" ...");
    } while (!allReady);

    sLogger.info("All servers ready!");
    sLogger.info("Host information:");
    for (int n = 0; n < numServers; n++) {
        String f = appendPath(configPath, sids.get(n) + ".host");
        sLogger.info(" sid=" + sids.get(n) + ", " + FSProperty.readString(fs, f));
    }

    return 0;
}
From source file:job.uncombine.compressed.BigBuildInvertedIndex.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    //long GB = 1024 * 1024 * 1024;
    //long totalDataSize = 1 * GB;
    int reduceNumArray[] = { 9, 18 };
    int splitSizeMBArray[] = { 64, 128, 256 };
    int xmxArray[] = { 1000, 2000, 3000, 4000 };
    int xmsArray[] = { 0, 1 };
    int ismbArray[] = { 200, 400, 600, 800 };

    for (int splitIndex = 0; splitIndex < splitSizeMBArray.length; splitIndex++) {
        for (int reduceNumIndex = 0; reduceNumIndex < reduceNumArray.length; reduceNumIndex++) {
            for (int xmxIndex = 0; xmxIndex < xmxArray.length; xmxIndex++) {
                for (int xmsIndex = 0; xmsIndex < xmsArray.length; xmsIndex++) {
                    for (int ismbIndex = 0; ismbIndex < ismbArray.length; ismbIndex++) {

                        int reduceNum = reduceNumArray[reduceNumIndex];
                        int splitMB = splitSizeMBArray[splitIndex];
                        int xmx = xmxArray[xmxIndex];
                        int xms = xmsArray[xmsIndex] * xmx;
                        int ismb = ismbArray[ismbIndex];

                        JobConf conf = new JobConf(getConf(), BigBuildInvertedIndex.class);
                        conf.setLong("mapred.min.split.size", SplitTable.getMapred_min_split_size(splitMB));
                        conf.setLong("mapred.max.split.size", SplitTable.getMapred_max_split_size(splitMB));
                        //conf.setInt("my.sample.split.num", (int) (totalDataSize / (splitMB * 1024 * 1024)));
                        conf.setInt("mapred.reduce.tasks", reduceNum);
                        conf.setInt("io.sort.mb", ismb);

                        if (xms == 0)
                            conf.set("mapred.child.java.opts", "-Xmx" + xmx + "m");
                        else
                            conf.set("mapred.child.java.opts", "-Xmx" + xmx + "m -Xms" + xms + "m");

                        conf.setInt("child.monitor.metrics.seconds", 2);
                        conf.setInt("child.monitor.jvm.seconds", 2);
                        conf.setInt("child.monitor.jstat.seconds", 2);

                        conf.setJobName("BigBuildInvertedIndex " + splitMB + "MB "
                                + conf.get("mapred.child.java.opts") + " ismb=" + ismb + " RN=" + reduceNum);

                        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
                        if (otherArgs.length != 2) {
                            System.err.println("Usage: BigBuildInvertedIndex <in> <out>");
                            System.exit(2);
                        }

                        conf.setMapOutputKeyClass(Text.class);
                        conf.setMapOutputValueClass(PairOfInts.class);
                        conf.setOutputKeyClass(Text.class);
                        conf.setOutputValueClass(PairOfWritables.class);
                        SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK);
                        conf.setOutputFormat(MapFileOutputFormat.class);

                        conf.setMapperClass(MyMapper.class);
                        // conf.setCombinerClass(IdentityReducer.class);
                        conf.setReducerClass(MyReducer.class);

                        FileInputFormat.setInputPaths(conf, new Path(otherArgs[0]));
                        FileOutputFormat.setOutputPath(conf, new Path(otherArgs[1]));
                        FileSystem.get(conf).delete(new Path(otherArgs[1]), true);

                        try {
                            JobClient.runJob(conf);
                        } catch (IOException e) {
                            e.printStackTrace();
                        }

                        Thread.sleep(15000);
                    }
                }
            }
        }
    }
    return 0;
}
From source file:jobimtext.thesaurus.distributional.hadoop.mapreduce.AggrPerFt.java
License:Apache License
/**
 * Set the job configuration and classes, and run the job.
 */
@SuppressWarnings("deprecation")
public static void main(String[] args) throws Exception {
    JobConf conf = HadoopUtil.generateJobConf(args);
    // JobConf conf = new JobConf(AggrPerFt.class);
    // conf.setJobName("AggrPerFt");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    /*
     * Use compression.
     */
    // conf.set("mapred.output.compress", "true");
    // conf.set("mapred.map.output.compress", "true");
    // conf.set("mapred.map.output.compression.codec",
    //     "org.apache.hadoop.io.compress.SnappyCodec");
    // conf.set("mapred.output.compression.codec",
    //     "org.apache.hadoop.io.compress.SnappyCodec");

    /* Set the maximum number of tasks per node. */
    int maptasks = 120;
    conf.set("mapred.tasktracker.map.tasks.maximum", "" + maptasks);
    conf.set("mapred.map.tasks", "" + maptasks);
    conf.set("mapred.tasktracker.map", "" + maptasks);

    int reducetasks = 120;
    conf.set("mapred.tasktracker.reduce.tasks.maximum", "" + reducetasks);
    conf.set("mapred.reduce.tasks", "" + reducetasks);
    conf.set("mapred.tasktracker.reduce", "" + reducetasks);

    /*
     * Heap size for the job.
     */
    conf.set("mapred.child.java.opts", "-Xmx1500m");

    /*
     * How much virtual memory the entire process tree of each map/reduce
     * task will use.
     */
    conf.set("mapred.job.map.memory.mb", "2048");
    conf.set("mapred.job.reduce.memory.mb", "2048");

    JobClient.runJob(conf);
}
From source file:jobimtext.thesaurus.distributional.hadoop.mapreduce.AggrPerFtUniquePositions.java
License:Apache License
/**
 * Set the job configuration and classes, and run the job.
 */
@SuppressWarnings("deprecation")
public static void main(String[] args) throws Exception {
    JobConf conf = HadoopUtil.generateJobConf(args);
    // JobConf conf = new JobConf(AggrPerFtUniquePositions.class);
    conf.setJobName("AggrPerFtUniquePositions " + args[0] + " " + args[1]);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    /*
     * Use compression.
     */
    conf.set("mapred.output.compress", "true");
    conf.set("mapred.map.output.compress", "true");
    conf.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.SnappyCodec");
    conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.SnappyCodec");

    /* Set the maximum number of tasks per node. */
    int maptasks = 120;
    conf.set("mapred.tasktracker.map.tasks.maximum", "" + maptasks);
    conf.set("mapred.map.tasks", "" + maptasks);
    conf.set("mapred.tasktracker.map", "" + maptasks);

    int reducetasks = 60;
    conf.set("mapred.tasktracker.reduce.tasks.maximum", "" + reducetasks);
    conf.set("mapred.reduce.tasks", "" + reducetasks);
    conf.set("mapred.tasktracker.reduce", "" + reducetasks);

    /*
     * Heap size for the job.
     */
    conf.set("mapred.child.java.opts", "-Xmx1500m");

    /*
     * How much virtual memory the entire process tree of each map/reduce
     * task will use.
     */
    conf.set("mapred.job.map.memory.mb", "2048");
    conf.set("mapred.job.reduce.memory.mb", "2048");

    JobClient.runJob(conf);
}