List of usage examples for org.apache.hadoop.mapred.FileInputFormat.setInputPaths
public static void setInputPaths(JobConf conf, Path... inputPaths)
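All of the examples below pass a single Path, but the varargs signature also accepts several inputs at once, and it replaces (rather than appends to) any previously configured input list; FileInputFormat.addInputPath is the additive variant, and a String overload accepts comma-separated paths. A minimal sketch with two input directories (the class name, job name, and paths here are illustrative, not taken from the sources below):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

public class MultiInputSketch {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(MultiInputSketch.class);
        conf.setJobName("multi_input_sketch"); // hypothetical job name

        // Replaces any previously set inputs with these two directories.
        FileInputFormat.setInputPaths(conf, new Path("/data/corpusA"), new Path("/data/corpusB"));

        // Equivalent comma-separated String overload:
        // FileInputFormat.setInputPaths(conf, "/data/corpusA,/data/corpusB");

        // addInputPath appends instead of replacing:
        // FileInputFormat.addInputPath(conf, new Path("/data/corpusC"));

        FileOutputFormat.setOutputPath(conf, new Path("/out/merged"));

        // With no mapper/reducer configured, the old mapred API falls back
        // to identity map/reduce over TextInputFormat records.
        JobClient.runJob(conf);
    }
}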
From source file: jobimtext.thesaurus.distributional.hadoop.mapreduce.AggrPerFtWithParams.java
License: Apache License

/**
 * Set the job configuration and classes, then run the job.
 */
@SuppressWarnings("deprecation")
public static void main(String[] args) throws Exception {
    JobConf conf = HadoopUtil.generateJobConf(args);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    /* set the maximum number of tasks per node */
    int maptasks = 120;
    conf.set("mapred.tasktracker.map.tasks.maximum", "" + maptasks);
    conf.set("mapred.map.tasks", "" + maptasks);
    conf.set("mapred.tasktracker.map", "" + maptasks);

    int reducetasks = 120;
    conf.set("mapred.tasktracker.reduce.tasks.maximum", "" + reducetasks);
    conf.set("mapred.reduce.tasks", "" + reducetasks);
    conf.set("mapred.tasktracker.reduce", "" + reducetasks);

    /* heap size for the job */
    conf.set("mapred.child.java.opts", "-Xmx1500m");

    /*
     * how much virtual memory the entire process tree of each map/reduce
     * task will use
     */
    conf.set("mapred.job.map.memory.mb", "2048");
    conf.set("mapred.job.reduce.memory.mb", "2048");

    JobClient.runJob(conf);
}
From source file: jobimtext.thesaurus.distributional.hadoop.mapreduce.AggrPerWord.java
License: Apache License

/**
 * Set the job configuration and classes, then run the job.
 */
@SuppressWarnings("deprecation")
public static void main(String[] args) throws Exception {
    JobConf conf = HadoopUtil.generateJobConf(args);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    /* use compression */
    conf.set("mapred.output.compress", "true");
    conf.set("mapred.map.output.compress", "true");
    conf.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.SnappyCodec");
    conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.SnappyCodec");

    /* set the maximum number of tasks per node */
    int maptasks = 120;
    conf.set("mapred.tasktracker.map.tasks.maximum", "" + maptasks);
    conf.set("mapred.map.tasks", "" + maptasks);
    conf.set("mapred.tasktracker.map", "" + maptasks);

    int reducetasks = 120;
    conf.set("mapred.tasktracker.reduce.tasks.maximum", "" + reducetasks);
    conf.set("mapred.reduce.tasks", "" + reducetasks);
    conf.set("mapred.tasktracker.reduce", "" + reducetasks);

    /* heap size for the job */
    conf.set("mapred.child.java.opts", "-Xmx1500m");

    /*
     * how much virtual memory the entire process tree of each map/reduce
     * task will use
     */
    conf.set("mapred.job.map.memory.mb", "2048");
    conf.set("mapred.job.reduce.memory.mb", "2048");

    JobClient.runJob(conf);
}
From source file: jobimtext.thesaurus.distributional.hadoop.mapreduce.CleanContext.java
License: Apache License

@SuppressWarnings("deprecation")
public static void main(String[] args) throws Exception {
    JobConf conf = HadoopUtil.generateJobConf(args);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(IntSumReducer.class);
    conf.setReducerClass(IntSumReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    /* number of milliseconds before killing an unresponsive task */
    conf.set("mapred.task.timeout", "600000");

    /* change the block size to 128 MB */
    conf.set("dfs.block.size", "134217728");

    /* set the maximum number of tasks per node */
    int maptasks = 100;

    /*
     * Number of map tasks to deploy on each machine: 0.5 to 2 *
     * (cores/node).
     */
    conf.set("mapred.tasktracker.map.tasks.maximum", "" + maptasks);
    conf.set("mapred.tasktracker.map", "" + maptasks);

    /*
     * The default number of map tasks per job. Typically set to a prime
     * several times greater than the number of available hosts.
     */
    conf.set("mapred.map.tasks", "" + maptasks);

    int reducetasks = 120;
    conf.set("mapred.tasktracker.reduce.tasks.maximum", "" + reducetasks);
    conf.set("mapred.tasktracker.reduce", "" + reducetasks);
    conf.set("mapred.reduce.tasks", "" + reducetasks);

    conf.set("mapred.job.map.memory.mb", "3000");
    conf.set("mapred.job.reduce.memory.mb", "3000");

    JobClient.runJob(conf);
}
From source file: jobimtext.thesaurus.distributional.hadoop.mapreduce.FeatureCount.java
License: Apache License

@SuppressWarnings("deprecation")
public static void main(String[] args) throws Exception {
    JobConf conf = HadoopUtil.generateJobConf(args);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(IntSumReducer.class);
    conf.setReducerClass(IntSumReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    /* set the maximum number of tasks per node */
    int maptasks = 120;
    conf.set("mapred.tasktracker.map.tasks.maximum", "" + maptasks);
    conf.set("mapred.map.tasks", "" + maptasks);
    conf.set("mapred.tasktracker.map", "" + maptasks);

    int reducetasks = 100;
    conf.set("mapred.tasktracker.reduce.tasks.maximum", "" + reducetasks);
    conf.set("mapred.reduce.tasks", "" + reducetasks);
    conf.set("mapred.tasktracker.reduce", "" + reducetasks);

    /*
     * how much virtual memory the entire process tree of each map/reduce
     * task will use
     */
    conf.set("mapred.job.map.memory.mb", "2048");
    conf.set("mapred.job.reduce.memory.mb", "2048");

    conf.set("dfs.replication", "1");

    JobClient.runJob(conf);
}
From source file: jobimtext.thesaurus.distributional.hadoop.mapreduce.SimCounts.java
License: Apache License

@SuppressWarnings("deprecation")
public static void main(String[] args) throws Exception {
    JobConf conf = HadoopUtil.generateJobConf(args);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(DoubleWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(DoubleSumReducer.class);
    conf.setReducerClass(DoubleSumReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    /* number of milliseconds before killing an unresponsive task */
    conf.set("mapred.task.timeout", "600000");

    /* change the block size to 128 MB */
    conf.set("dfs.block.size", "134217728");

    /* set the maximum number of tasks per node */
    int maptasks = 100;

    /*
     * Number of map tasks to deploy on each machine: 0.5 to 2 *
     * (cores/node).
     */
    conf.set("mapred.tasktracker.map.tasks.maximum", "" + maptasks);
    conf.set("mapred.tasktracker.map", "" + maptasks);

    /*
     * The default number of map tasks per job. Typically set to a prime
     * several times greater than the number of available hosts.
     */
    conf.set("mapred.map.tasks", "" + maptasks);

    int reducetasks = 100;
    conf.set("mapred.tasktracker.reduce.tasks.maximum", "" + reducetasks);
    conf.set("mapred.tasktracker.reduce", "" + reducetasks);
    conf.set("mapred.reduce.tasks", "" + reducetasks);

    JobClient.runJob(conf);
}
From source file: jobimtext.thesaurus.distributional.hadoop.mapreduce.SimCounts1.java
License: Apache License

@SuppressWarnings("deprecation")
public static void main(String[] args) throws Exception {
    JobConf conf = HadoopUtil.generateJobConf(args);
    conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(DoubleWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(DoubleSumReducer.class);
    conf.setReducerClass(DoubleSumReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    /* number of milliseconds before killing an unresponsive task */
    // conf.set("mapred.task.timeout", "600000");
    // conf.set("mapred.map.tasks.speculative.execution", "false");

    /* change the block size to 128 MB */
    // conf.set("dfs.block.size", "134217728");

    /* use compression */
    // conf.set("mapred.output.compress", "true");
    // conf.set("mapred.map.output.compress", "true");
    // conf.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.SnappyCodec");
    // conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.SnappyCodec");

    /* set the maximum number of tasks per node */
    int maptasks = 100;

    /*
     * Number of map tasks to deploy on each machine: 0.5 to 2 *
     * (cores/node).
     */
    conf.set("mapred.tasktracker.map.tasks.maximum", "" + maptasks);
    conf.set("mapred.tasktracker.map", "" + maptasks);

    /*
     * The default number of map tasks per job. Typically set to a prime
     * several times greater than the number of available hosts.
     */
    conf.set("mapred.map.tasks", "" + maptasks);

    int reducetasks = 80;
    conf.set("mapred.tasktracker.reduce.tasks.maximum", "" + reducetasks);
    conf.set("mapred.tasktracker.reduce", "" + reducetasks);
    conf.set("mapred.reduce.tasks", "" + reducetasks);

    /*
     * how much virtual memory the entire process tree of each map/reduce
     * task will use
     */
    conf.set("mapred.job.map.memory.mb", "4000");
    conf.set("mapred.job.reduce.memory.mb", "4000");

    conf.set("dfs.replication", "1");

    /* reduce I/O load */
    conf.set("mapred.child.java.opts", "-Xmx1400M");
    conf.set("io.sort.mb", "300");
    conf.set("io.sort.factor", "30");

    JobClient.runJob(conf);
}
From source file: jobimtext.thesaurus.distributional.hadoop.mapreduce.SimCounts1WithFeatures.java
License: Apache License

@SuppressWarnings("deprecation")
public static void main(String[] args) throws Exception {
    JobConf conf = HadoopUtil.generateJobConf(args);

    /* set the newly defined types to be used */
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    if (args.length > 3) {
        conf.setInt("threshold", Integer.parseInt(args[3]));
    }

    /* number of milliseconds before killing an unresponsive task */
    conf.set("mapred.task.timeout", "600000");

    /* change the block size to 128 MB */
    conf.set("dfs.block.size", "134217728");

    /* set the maximum number of tasks per node */
    int maptasks = 200;

    /*
     * Number of map tasks to deploy on each machine: 0.5 to 2 *
     * (cores/node).
     */
    conf.set("mapred.tasktracker.map.tasks.maximum", "" + maptasks);
    conf.set("mapred.tasktracker.map", "" + maptasks);

    /*
     * The default number of map tasks per job. Typically set to a prime
     * several times greater than the number of available hosts.
     */
    conf.set("mapred.map.tasks", "" + maptasks);

    int reducetasks = 20;
    conf.set("mapred.tasktracker.reduce.tasks.maximum", "" + reducetasks);
    conf.set("mapred.tasktracker.reduce", "" + reducetasks);
    conf.set("mapred.reduce.tasks", "" + reducetasks);

    /*
     * how much virtual memory the entire process tree of each map/reduce
     * task will use
     */
    conf.set("mapred.job.map.memory.mb", "4000");
    conf.set("mapred.job.reduce.memory.mb", "4000");

    conf.set("dfs.replication", "1");

    /* reduce I/O load */
    conf.set("mapred.child.java.opts", "-Xmx1400M");
    conf.set("io.sort.mb", "300");
    conf.set("io.sort.factor", "30");

    JobClient.runJob(conf);
}
From source file: jobimtext.thesaurus.distributional.hadoop.mapreduce.SimCountsLog.java
License: Apache License

/**
 * The reducer step sums all float values, i.e. the weight for any
 * (word1, word2) pair sharing a feature.
 */
public static void main(String[] args) throws Exception {
    JobConf conf = HadoopUtil.generateJobConf(args);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(FloatWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(DoubleSumReducer.class);
    conf.setReducerClass(DoubleSumReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    /* number of milliseconds before killing an unresponsive task */
    conf.set("mapred.task.timeout", "600000");

    /* change the block size to 128 MB */
    conf.set("dfs.block.size", "134217728");

    /* set the maximum number of tasks per node */
    int maptasks = 100;

    /*
     * Number of map tasks to deploy on each machine: 0.5 to 2 *
     * (cores/node).
     */
    conf.set("mapred.tasktracker.map.tasks.maximum", "" + maptasks);
    conf.set("mapred.tasktracker.map", "" + maptasks);

    /*
     * The default number of map tasks per job. Typically set to a prime
     * several times greater than the number of available hosts.
     */
    conf.set("mapred.map.tasks", "" + maptasks);

    int reducetasks = 100;
    conf.set("mapred.tasktracker.reduce.tasks.maximum", "" + reducetasks);
    conf.set("mapred.tasktracker.reduce", "" + reducetasks);
    conf.set("mapred.reduce.tasks", "" + reducetasks);

    JobClient.runJob(conf);
}
From source file: jobimtext.thesaurus.distributional.hadoop.mapreduce.TotalWords.java
License: Apache License

@SuppressWarnings("deprecation")
public static void main(String[] args) throws Exception {
    JobConf conf = HadoopUtil.generateJobConf(args);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
}
From source file: junto.algorithm.parallel.AdsorptionHadoop.java
License: Apache License

public static void main(String[] args) throws Exception {
    Hashtable config = ConfigReader.read_config(args);

    String baseInputFilePat = Defaults.GetValueOrDie(config, "hdfs_input_pattern");
    String baseOutputFilePat = Defaults.GetValueOrDie(config, "hdfs_output_base");
    int numIterations = Integer.parseInt(Defaults.GetValueOrDie(config, "iters"));

    String currInputFilePat = baseInputFilePat;
    String currOutputFilePat = "";

    for (int iter = 1; iter <= numIterations; ++iter) {
        JobConf conf = new JobConf(AdsorptionHadoop.class);
        conf.setJobName("adsorption_hadoop");

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);

        conf.setMapperClass(Map.class);
        // conf.setCombinerClass(Reduce.class);
        conf.setReducerClass(Reduce.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        // hyperparameters
        conf.set("mu1", Defaults.GetValueOrDie(config, "mu1"));
        conf.set("mu2", Defaults.GetValueOrDie(config, "mu2"));
        conf.set("mu3", Defaults.GetValueOrDie(config, "mu3"));
        conf.set("keepTopKLabels", Defaults.GetValueOrDefault((String) config.get("keep_top_k_labels"),
                Integer.toString(Integer.MAX_VALUE)));

        if (iter > 1) {
            // output from the last iteration is the input for the current iteration
            currInputFilePat = currOutputFilePat + "/*";
        }

        FileInputFormat.setInputPaths(conf, new Path(currInputFilePat));

        currOutputFilePat = baseOutputFilePat + "_" + iter;
        FileOutputFormat.setOutputPath(conf, new Path(currOutputFilePat));

        JobClient.runJob(conf);
    }
}