Example usage for org.apache.hadoop.mapred FileInputFormat setInputPaths

Introduction

This page collects example usages of org.apache.hadoop.mapred.FileInputFormat.setInputPaths, drawn from open-source projects.

Prototype

public static void setInputPaths(JobConf conf, Path... inputPaths) 

Document

Set the array of Paths as the list of inputs for the map-reduce job.
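
Because the last parameter is a varargs Path..., a single call can register one or many input directories. A minimal usage sketch, with hypothetical HDFS directories and a placeholder driver class:

    JobConf conf = new JobConf(MyJob.class); // MyJob is a placeholder driver class
    // Register two input directories in one call; any number of Paths is accepted.
    FileInputFormat.setInputPaths(conf, new Path("/data/in1"), new Path("/data/in2"));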

Usage

From source file:jobimtext.thesaurus.distributional.hadoop.mapreduce.AggrPerFtWithParams.java

License: Apache License

/**
 * Set the job configuration and classes, then run the job.
 */
@SuppressWarnings("deprecation")
public static void main(String[] args) throws Exception {

    JobConf conf = HadoopUtil.generateJobConf(args);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    /* set the maximum number of tasks per node */
    int maptasks = 120;

    conf.set("mapred.tasktracker.map.tasks.maximum", "" + maptasks);
    conf.set("mapred.map.tasks", "" + maptasks);
    conf.set("mapred.tasktracker.map", "" + maptasks);

    int reducetasks = 120;

    conf.set("mapred.tasktracker.reduce.tasks.maximum", "" + reducetasks);
    conf.set("mapred.reduce.tasks", "" + reducetasks);
    conf.set("mapred.tasktracker.reduce", "" + reducetasks);

    /*
     * heap size for the job
     */
    conf.set("mapred.child.java.opts", "-Xmx1500m");

    /*
     * how much virtual memory the entire process tree of each map/reduce
     * task will use
     */
    conf.set("mapred.job.map.memory.mb", "2048");
    conf.set("mapred.job.reduce.memory.mb", "2048");

    JobClient.runJob(conf);

}
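
This example (like most below) uses the older org.apache.hadoop.mapred API, hence the @SuppressWarnings("deprecation"). For comparison, a minimal sketch of the corresponding calls in the newer org.apache.hadoop.mapreduce API (Hadoop 2+; the job name here is illustrative):

    // Newer-API equivalents: FileInputFormat/FileOutputFormat here are the
    // org.apache.hadoop.mapreduce.lib.input / .lib.output variants.
    Job job = Job.getInstance(new Configuration(), "aggr-per-ft");
    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);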

From source file:jobimtext.thesaurus.distributional.hadoop.mapreduce.AggrPerWord.java

License: Apache License

/**
 * Set the job configuration and classes, then run the job.
 */
@SuppressWarnings("deprecation")
public static void main(String[] args) throws Exception {

    JobConf conf = HadoopUtil.generateJobConf(args);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    /*
     * use compression
     */
    conf.set("mapred.output.compress", "true");
    conf.set("mapred.map.output.compress", "true");
    conf.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.SnappyCodec");
    conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.SnappyCodec");

    /* set the maximum number of tasks per node */
    int maptasks = 120;

    conf.set("mapred.tasktracker.map.tasks.maximum", "" + maptasks);
    conf.set("mapred.map.tasks", "" + maptasks);
    conf.set("mapred.tasktracker.map", "" + maptasks);

    int reducetasks = 120;

    conf.set("mapred.tasktracker.reduce.tasks.maximum", "" + reducetasks);
    conf.set("mapred.reduce.tasks", "" + reducetasks);
    conf.set("mapred.tasktracker.reduce", "" + reducetasks);

    /*
     * heap size for the job
     */
    conf.set("mapred.child.java.opts", "-Xmx1500m");

    /*
     * how much virtual memory the entire process tree of each map/reduce
     * task will use
     */
    conf.set("mapred.job.map.memory.mb", "2048");
    conf.set("mapred.job.reduce.memory.mb", "2048");

    JobClient.runJob(conf);

}
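
The four compression properties set above also have typed equivalents; a minimal sketch using the helpers on JobConf and the old-API FileOutputFormat (this assumes the Snappy native libraries are installed on the cluster):

    // Typed equivalents of the string-keyed compression settings
    conf.setCompressMapOutput(true);
    conf.setMapOutputCompressorClass(org.apache.hadoop.io.compress.SnappyCodec.class);
    FileOutputFormat.setCompressOutput(conf, true);
    FileOutputFormat.setOutputCompressorClass(conf, org.apache.hadoop.io.compress.SnappyCodec.class);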

From source file:jobimtext.thesaurus.distributional.hadoop.mapreduce.CleanContext.java

License: Apache License

@SuppressWarnings("deprecation")
public static void main(String[] args) throws Exception {
    JobConf conf = HadoopUtil.generateJobConf(args);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);

    conf.setCombinerClass(IntSumReducer.class);
    conf.setReducerClass(IntSumReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    /* number of milliseconds before killing a non-responding task */
    conf.set("mapred.task.timeout", "600000");

    /* change the DFS block size to 128 MB */
    conf.set("dfs.block.size", "134217728");

    /* set the maximum number of tasks per node */
    int maptasks = 100;

    /*
     * Number of map tasks to deploy on each machine. 0.5 to 2 *
     * (cores/node)
     */
    conf.set("mapred.tasktracker.map.tasks.maximum", "" + maptasks);
    conf.set("mapred.tasktracker.map", "" + maptasks);
    /*
     * The default number of map tasks per job. Typically set to a prime
     * several times greater than number of available hosts.
     */
    conf.set("mapred.map.tasks", "" + maptasks);

    int reducetasks = 120;

    conf.set("mapred.tasktracker.reduce.tasks.maximum", "" + reducetasks);
    conf.set("mapred.tasktracker.reduce", "" + reducetasks);
    conf.set("mapred.reduce.tasks", "" + reducetasks);
    conf.set("mapred.job.map.memory.mb", "3000");
    conf.set("mapred.job.reduce.memory.mb", "3000");
    JobClient.runJob(conf);

}
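
The task-count properties that these examples set as raw strings also have typed setters on JobConf; a minimal sketch (the counts are illustrative, and the map-task count is only a hint to the framework):

    conf.setNumMapTasks(100);    // equivalent to "mapred.map.tasks" (a hint)
    conf.setNumReduceTasks(120); // equivalent to "mapred.reduce.tasks"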

From source file:jobimtext.thesaurus.distributional.hadoop.mapreduce.FeatureCount.java

License: Apache License

@SuppressWarnings("deprecation")
public static void main(String[] args) throws Exception {

    JobConf conf = HadoopUtil.generateJobConf(args);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(IntSumReducer.class);
    conf.setReducerClass(IntSumReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    int maptasks = 120;

    /* set the maximum number of tasks per node */
    conf.set("mapred.tasktracker.map.tasks.maximum", "" + maptasks);
    conf.set("mapred.map.tasks", "" + maptasks);
    conf.set("mapred.tasktracker.map", "" + maptasks);

    int reducetasks = 100;

    conf.set("mapred.tasktracker.reduce.tasks.maximum", "" + reducetasks);
    conf.set("mapred.reduce.tasks", "" + reducetasks);
    conf.set("mapred.tasktracker.reduce", "" + reducetasks);

    /*
     * how much virtual memory the entire process tree of each map/reduce
     * task will use
     */
    conf.set("mapred.job.map.memory.mb", "2048");
    conf.set("mapred.job.reduce.memory.mb", "2048");

    conf.set("dfs.replication", "1");

    JobClient.runJob(conf);

}

From source file:jobimtext.thesaurus.distributional.hadoop.mapreduce.SimCounts.java

License: Apache License

@SuppressWarnings("deprecation")
public static void main(String[] args) throws Exception {

    JobConf conf = HadoopUtil.generateJobConf(args);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(DoubleWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(DoubleSumReducer.class);
    conf.setReducerClass(DoubleSumReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    /* number of milliseconds before killing a non-responding task */
    conf.set("mapred.task.timeout", "600000");

    /* change the DFS block size to 128 MB */
    conf.set("dfs.block.size", "134217728");

    /* set the maximum number of tasks per node */
    int maptasks = 100;

    /* Number of map tasks to deploy on each machine. 0.5 to 2 * (cores/node) */
    conf.set("mapred.tasktracker.map.tasks.maximum", "" + maptasks);
    conf.set("mapred.tasktracker.map", "" + maptasks);
    /* The default number of map tasks per job. Typically set to a prime several
       times greater than number of available hosts. */
    conf.set("mapred.map.tasks", "" + maptasks);

    int reducetasks = 100;

    conf.set("mapred.tasktracker.reduce.tasks.maximum", "" + reducetasks);
    conf.set("mapred.tasktracker.reduce", "" + reducetasks);
    conf.set("mapred.reduce.tasks", "" + reducetasks);

    JobClient.runJob(conf);

}

From source file:jobimtext.thesaurus.distributional.hadoop.mapreduce.SimCounts1.java

License: Apache License

@SuppressWarnings("deprecation")
public static void main(String[] args) throws Exception {

    JobConf conf = HadoopUtil.generateJobConf(args);

    conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(DoubleWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(DoubleSumReducer.class);
    conf.setReducerClass(DoubleSumReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    /* number of milliseconds before killing a non-responding task */
    //conf.set("mapred.task.timeout", "600000");

    //conf.set("mapred.map.tasks.speculative.execution", "false");      

    /* change the DFS block size to 128 MB */
    //conf.set("dfs.block.size", "134217728");

    /*
     * use compression
     */
    /*
    conf.set("mapred.output.compress", "true");
    conf.set("mapred.map.output.compress", "true");
    conf.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.SnappyCodec");
    conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.SnappyCodec");
    */

    /* set the maximum number of tasks per node */
    int maptasks = 100;

    /* Number of map tasks to deploy on each machine. 0.5 to 2 * (cores/node) */
    conf.set("mapred.tasktracker.map.tasks.maximum", "" + maptasks);
    conf.set("mapred.tasktracker.map", "" + maptasks);
    /* The default number of map tasks per job. Typically set to a prime several
       times greater than number of available hosts. */
    conf.set("mapred.map.tasks", "" + maptasks);

    int reducetasks = 80;

    conf.set("mapred.tasktracker.reduce.tasks.maximum", "" + reducetasks);
    conf.set("mapred.tasktracker.reduce", "" + reducetasks);
    conf.set("mapred.reduce.tasks", "" + reducetasks);

    /*
     * how much virtual memory the entire process tree of each map/reduce
     * task will use
     */
    conf.set("mapred.job.map.memory.mb", "4000");
    conf.set("mapred.job.reduce.memory.mb", "4000");

    conf.set("dfs.replication", "1");

    /*
     * reduce I/O load
     */
    conf.set("mapred.child.java.opts", "-Xmx1400M");

    conf.set("io.sort.mb", "300");
    conf.set("io.sort.factor", "30");

    JobClient.runJob(conf);

}

From source file:jobimtext.thesaurus.distributional.hadoop.mapreduce.SimCounts1WithFeatures.java

License: Apache License

@SuppressWarnings("deprecation")
public static void main(String[] args) throws Exception {

    JobConf conf = HadoopUtil.generateJobConf(args);

    /* set the new defined type to be used */
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    if (args.length > 3) {
        conf.setInt("threshold", Integer.parseInt(args[3]));
    }
    /* number of milliseconds before killing a non-responding task */
    conf.set("mapred.task.timeout", "600000");

    /* change the DFS block size to 128 MB */
    conf.set("dfs.block.size", "134217728");

    /* set the maximum number of tasks per node */
    int maptasks = 200;

    /*
     * Number of map tasks to deploy on each machine. 0.5 to 2 *
     * (cores/node)
     */
    conf.set("mapred.tasktracker.map.tasks.maximum", "" + maptasks);
    conf.set("mapred.tasktracker.map", "" + maptasks);
    /*
     * The default number of map tasks per job. Typically set to a prime
     * several times greater than number of available hosts.
     */
    conf.set("mapred.map.tasks", "" + maptasks);

    int reducetasks = 20;

    conf.set("mapred.tasktracker.reduce.tasks.maximum", "" + reducetasks);
    conf.set("mapred.tasktracker.reduce", "" + reducetasks);
    conf.set("mapred.reduce.tasks", "" + reducetasks);

    /*
     * how much virtual memory the entire process tree of each map/reduce
     * task will use
     */
    conf.set("mapred.job.map.memory.mb", "4000");
    conf.set("mapred.job.reduce.memory.mb", "4000");

    conf.set("dfs.replication", "1");

    /*
     * reduce I/O load
     */
    conf.set("mapred.child.java.opts", "-Xmx1400M");

    conf.set("io.sort.mb", "300");
    conf.set("io.sort.factor", "30");

    JobClient.runJob(conf);

}

From source file:jobimtext.thesaurus.distributional.hadoop.mapreduce.SimCountsLog.java

License: Apache License

/**
 * The reducer step will sum all float values, i.e. the
 * weight for any (word1,word2) pair sharing a feature.
 */

public static void main(String[] args) throws Exception {

    JobConf conf = HadoopUtil.generateJobConf(args);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(FloatWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(DoubleSumReducer.class);
    conf.setReducerClass(DoubleSumReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    /* number of milliseconds before killing a non-responding task */
    conf.set("mapred.task.timeout", "600000");

    /* change the DFS block size to 128 MB */
    conf.set("dfs.block.size", "134217728");

    /* set the maximum number of tasks per node */
    int maptasks = 100;

    /* Number of map tasks to deploy on each machine. 0.5 to 2 * (cores/node) */
    conf.set("mapred.tasktracker.map.tasks.maximum", "" + maptasks);
    conf.set("mapred.tasktracker.map", "" + maptasks);
    /* The default number of map tasks per job. Typically set to a prime several
       times greater than number of available hosts. */
    conf.set("mapred.map.tasks", "" + maptasks);

    int reducetasks = 100;

    conf.set("mapred.tasktracker.reduce.tasks.maximum", "" + reducetasks);
    conf.set("mapred.tasktracker.reduce", "" + reducetasks);
    conf.set("mapred.reduce.tasks", "" + reducetasks);

    JobClient.runJob(conf);

}

From source file:jobimtext.thesaurus.distributional.hadoop.mapreduce.TotalWords.java

License: Apache License

@SuppressWarnings("deprecation")
public static void main(String[] args) throws Exception {

    JobConf conf = HadoopUtil.generateJobConf(args);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);

}

From source file:junto.algorithm.parallel.AdsorptionHadoop.java

License: Apache License

public static void main(String[] args) throws Exception {
    Hashtable config = ConfigReader.read_config(args);

    String baseInputFilePat = Defaults.GetValueOrDie(config, "hdfs_input_pattern");
    String baseOutputFilePat = Defaults.GetValueOrDie(config, "hdfs_output_base");
    int numIterations = Integer.parseInt(Defaults.GetValueOrDie(config, "iters"));

    String currInputFilePat = baseInputFilePat;
    String currOutputFilePat = "";
    for (int iter = 1; iter <= numIterations; ++iter) {
        JobConf conf = new JobConf(AdsorptionHadoop.class);
        conf.setJobName("adsorption_hadoop");

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);

        conf.setMapperClass(Map.class);
        // conf.setCombinerClass(Reduce.class);
        conf.setReducerClass(Reduce.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        // hyperparameters
        conf.set("mu1", Defaults.GetValueOrDie(config, "mu1"));
        conf.set("mu2", Defaults.GetValueOrDie(config, "mu2"));
        conf.set("mu3", Defaults.GetValueOrDie(config, "mu3"));
        conf.set("keepTopKLabels", Defaults.GetValueOrDefault((String) config.get("keep_top_k_labels"),
                Integer.toString(Integer.MAX_VALUE)));

        if (iter > 1) {
            // output from last iteration is the input for current iteration
            currInputFilePat = currOutputFilePat + "/*";
        }
        FileInputFormat.setInputPaths(conf, new Path(currInputFilePat));

        currOutputFilePat = baseOutputFilePat + "_" + iter;
        FileOutputFormat.setOutputPath(conf, new Path(currOutputFilePat));

        JobClient.runJob(conf);
    }
}
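
As this last example shows, the argument to setInputPaths may contain a glob such as a trailing /*. For multiple inputs there is also an overload that takes a comma-separated string of paths; a minimal sketch with illustrative directory names:

    // Overload: setInputPaths(JobConf conf, String commaSeparatedPaths)
    FileInputFormat.setInputPaths(conf, "/data/part1,/data/part2");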