List of usage examples for org.apache.hadoop.mapred.FileInputFormat.setInputPaths
public static void setInputPaths(JobConf conf, Path... inputPaths)
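Before the project examples below, here is a minimal, self-contained sketch of the call itself. It is not taken from any of the source files on this page; the class name and the /data/* paths are hypothetical placeholders. Note that setInputPaths is a varargs method that replaces any previously registered input paths, the old mapred API also offers an overload taking a comma-separated String of paths, and addInputPath appends instead of replacing.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class SetInputPathsSketch {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(SetInputPathsSketch.class);
        conf.setJobName("SetInputPathsSketch");

        // Varargs form: replaces any previously set input paths with this list.
        // The paths are hypothetical placeholders.
        FileInputFormat.setInputPaths(conf, new Path("/data/in1"), new Path("/data/in2"));

        // Equivalent String overload: a comma-separated list of paths.
        // FileInputFormat.setInputPaths(conf, "/data/in1,/data/in2");

        // To append rather than replace, use addInputPath:
        // FileInputFormat.addInputPath(conf, new Path("/data/in3"));

        // With TextInputFormat and the default identity mapper/reducer,
        // keys are byte offsets (LongWritable) and values are lines (Text).
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);
        FileOutputFormat.setOutputPath(conf, new Path("/data/out"));

        JobClient.runJob(conf);
    }
}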
From source file:ivory.preprocess.GetTermCount.java
License:Apache License
public int runTool() throws Exception {
    // Create a new JobConf, inheriting from the configuration of this PowerTool.
    JobConf conf = new JobConf(getConf(), GetTermCount.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get(Constants.IndexPath);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    int mapTasks = conf.getInt(Constants.NumMapTasks, 0);
    int reduceTasks = conf.getInt(Constants.NumReduceTasks, 0);

    String collectionName = env.readCollectionName();
    String termDocVectorsPath = env.getTermDocVectorsDirectory();
    String termDfCfPath = env.getTermDfCfDirectory();

    if (!fs.exists(new Path(indexPath))) {
        sLogger.info("index path doesn't exist: skipping!");
        return 0;
    }

    sLogger.info("PowerTool: GetTermCount");
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info(" - NumMapTasks: " + mapTasks);
    sLogger.info(" - NumReduceTasks: " + reduceTasks);
    sLogger.info(" - MinDf: " + conf.getInt(Constants.MinDf, 0));
    sLogger.info(" - MaxDf: " + conf.getInt(Constants.MaxDf, Integer.MAX_VALUE));

    Path outputPath = new Path(termDfCfPath);
    if (fs.exists(outputPath)) {
        sLogger.error("TermDfCf directory exists: skipping!");
        return 0;
    }

    conf.setJobName("GetTermCount:" + collectionName);
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    FileInputFormat.setInputPaths(conf, new Path(termDocVectorsPath));
    FileOutputFormat.setOutputPath(conf, outputPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(PairOfIntLong.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(PairOfIntLong.class);
    conf.setMapperClass(MyMapper.class);
    conf.setCombinerClass(MyCombiner.class);
    conf.setReducerClass(MyReducer.class);

    long startTime = System.currentTimeMillis();
    RunningJob job = JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    Counters counters = job.getCounters();

    // Write out the number of postings.
    int collectionTermCount = (int) counters.findCounter(Statistics.Terms).getCounter();
    env.writeCollectionTermCount(collectionTermCount);

    // NOTE: this value is not the same as the number of postings, because
    // postings for non-English terms are discarded, or as a result of the df cut.
    long collectionLength = counters.findCounter(Statistics.SumOfDocLengths).getCounter();
    env.writeCollectionLength(collectionLength);
    return 0;
}
From source file:ivory.ptc.AnchorTextInvertedIndex.java
License:Apache License
@Override
public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), AnchorTextInvertedIndex.class);
    FileSystem fs = FileSystem.get(conf);

    String inPath = conf.get("Ivory.InputPath");
    String outPath = conf.get("Ivory.OutputPath");
    Path inputPath = new Path(inPath);
    Path outputPath = new Path(outPath);
    int mapTasks = conf.getInt("Ivory.NumMapTasks", 1);
    int reduceTasks = conf.getInt("Ivory.NumReduceTasks", 100);
    String weightingSchemeParameters = conf.get("Ivory.WeightingSchemeParameters");

    LOG.info("BuildAnchorTextInvertedIndex");
    LOG.info(" - input path: " + inPath);
    LOG.info(" - output path: " + outPath);
    LOG.info(" - number of reducers: " + reduceTasks);
    LOG.info(" - weighting scheme: " + conf.get("Ivory.WeightingScheme"));
    LOG.info(" - weighting scheme parameters: " + weightingSchemeParameters);

    String[] params = weightingSchemeParameters.split(PARAMETER_SEPARATER);
    for (String param : params) {
        DistributedCache.addCacheFile(new URI(param), conf);
    }

    conf.setJobName("BuildAnchorTextInvertedIndex");
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);
    conf.set("mapred.child.java.opts", "-Xmx4096m");
    conf.setInt("mapred.task.timeout", 60000000);

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(AnchorTextTarget.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(ArrayListWritable.class);
    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);

    fs.delete(outputPath);
    JobClient.runJob(conf);
    return 0;
}
From source file:ivory.ptc.driver.XMLFormatJudgments.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
        return -1;
    }
    JobConf conf = new JobConf(getConf(), XMLFormatJudgments.class);

    // Command-line arguments
    String inPath = args[0];
    String outPath = args[1];
    String docnoMapping = args[2];

    Path inputPath = new Path(inPath);
    Path outputPath = new Path(outPath);
    int mapTasks = 1;
    int reduceTasks = 1;

    conf.setJobName("FormatPseudoJudgments");
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    DistributedCache.addCacheFile(new URI(docnoMapping), conf);

    FileSystem.get(conf).delete(outputPath);

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setMapOutputKeyClass(PseudoQuery.class);
    conf.setMapOutputValueClass(PseudoJudgments.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(IdentityMapper.class);
    conf.setReducerClass(MyReducer.class);

    JobClient.runJob(conf);
    return 0;
}
From source file:ivory.ptc.driver.XMLFormatQueries.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        printUsage();
        return -1;
    }
    JobConf conf = new JobConf(getConf(), XMLFormatQueries.class);

    // Command-line arguments
    String inPath = args[0];
    String outPath = args[1];

    Path inputPath = new Path(inPath);
    Path outputPath = new Path(outPath);
    int mapTasks = 1;
    int reduceTasks = 1;

    conf.setJobName("FormatPseudoQueries");
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    FileSystem.get(conf).delete(outputPath);

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setMapOutputKeyClass(PseudoQuery.class);
    conf.setMapOutputValueClass(PseudoJudgments.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(IdentityMapper.class);
    conf.setReducerClass(MyReducer.class);

    JobClient.runJob(conf);
    return 0;
}
From source file:ivory.ptc.SortedPseudoTestCollection.java
License:Apache License
public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), SortedPseudoTestCollection.class);
    FileSystem fs = FileSystem.get(conf);

    String inPath = conf.get("Ivory.InputPath");
    String outPath = conf.get("Ivory.OutputPath");
    Path inputPath = new Path(inPath);
    Path outputPath = new Path(outPath);
    int mapTasks = 1;
    int reduceTasks = 1;

    LOG.info("SortedPseudoTestCollection");
    LOG.info(" - Input path: " + conf.get("Ivory.InputPath"));
    LOG.info(" - Output path: " + conf.get("Ivory.OutputPath"));
    LOG.info(" - JudgmentExtractor: " + conf.get("Ivory.JudgmentExtractor"));
    LOG.info(" - JudgmentExtractorParameters: " + conf.get("Ivory.JudgmentExtractorParameters"));
    LOG.info(" - SamplingCriterion: " + conf.get("Ivory.SamplingCriterion"));
    LOG.info(" - SamplingCriterionParameters: " + conf.get("Ivory.SamplingCriterionParameters"));
    LOG.info(" - QueryScorer: " + conf.get("Ivory.QueryScorer"));

    conf.setJobName("SortedPTC");
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);
    conf.set("mapred.child.java.opts", "-Xmx4096m");

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setMapOutputKeyClass(PseudoQuery.class);
    conf.setMapOutputValueClass(PseudoJudgments.class);
    conf.setOutputKeyClass(PseudoQuery.class);
    conf.setOutputValueClass(PseudoJudgments.class);
    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);

    fs.delete(outputPath);
    JobClient.runJob(conf);
    return 0;
}
From source file:ivory.server.RunDistributedRetrievalServers.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        printUsage();
        return -1;
    }

    String configFile = args[0];
    FileSystem fs = FileSystem.get(getConf());
    Document d = DocumentBuilderFactory.newInstance().newDocumentBuilder()
            .parse(fs.open(new Path(configFile)));

    sLogger.info("Reading configuration to determine number of servers to launch:");
    List<String> sids = new ArrayList<String>();
    NodeList servers = d.getElementsByTagName("server");
    for (int i = 0; i < servers.getLength(); i++) {
        Node node = servers.item(i);

        // Get the server id.
        String sid = XMLTools.getAttributeValue(node, "id", null);
        if (sid == null) {
            throw new Exception("Must specify a query id attribute for every server!");
        }
        sLogger.info(" - sid: " + sid);
        sids.add(sid);
    }

    int port = 7000;
    int numServers = sids.size();
    String configPath = args[1];
    if (fs.exists(new Path(configPath))) {
        fs.delete(new Path(configPath), true);
    }

    String fname = appendPath(configPath, "config-" + numServers + ".txt");
    sLogger.info("Writing configuration to: " + fname);
    StringBuffer sb = new StringBuffer();
    for (int n = 0; n < numServers; n++) {
        port++;
        sb.append(sids.get(n) + " " + port + "\n");
    }
    FSDataOutputStream out = fs.create(new Path(fname), true);
    out.writeBytes(sb.toString());
    out.close();

    JobConf conf = new JobConf(RetrievalServer.class);
    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(0);
    conf.setInputFormat(NLineInputFormat.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setMapperClass(ServerMapper.class);
    FileInputFormat.setInputPaths(conf, new Path(fname));

    conf.set("Ivory.ConfigFile", configFile);
    conf.set("Ivory.ConfigPath", configPath);
    conf.setJobName("RetrievalServers");
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    // conf.set("mapred.job.queue.name", "search");

    JobClient client = new JobClient(conf);
    client.submitJob(conf);

    sLogger.info("Waiting for servers to start up...");

    // Poll HDFS for hostnames and ports.
    boolean allStarted = true;
    do {
        allStarted = true;
        for (int n = 0; n < numServers; n++) {
            String f = appendPath(configPath, sids.get(n) + ".host");
            if (!fs.exists(new Path(f))) {
                allStarted = false;
            }
        }
        Thread.sleep(10000);
        sLogger.info(" ...");
    } while (!allStarted);

    // Poll HDFS for the signal that the index is ready.
    boolean allReady = true;
    do {
        allReady = true;
        for (int n = 0; n < numServers; n++) {
            String f = appendPath(configPath, sids.get(n) + ".ready");
            if (!fs.exists(new Path(f))) {
                allReady = false;
            }
        }
        Thread.sleep(10000);
        sLogger.info(" ...");
    } while (!allReady);

    sLogger.info("All servers ready!");
    sLogger.info("Host information:");
    for (int n = 0; n < numServers; n++) {
        String f = appendPath(configPath, sids.get(n) + ".host");
        sLogger.info(" sid=" + sids.get(n) + ", " + FSProperty.readString(fs, f));
    }

    return 0;
}
From source file:ivory.smrf.retrieval.distributed.RunDistributedRetrievalServers.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        printUsage();
        return -1;
    }

    String configFile = args[0];
    FileSystem fs = FileSystem.get(getConf());
    Document d = DocumentBuilderFactory.newInstance().newDocumentBuilder()
            .parse(fs.open(new Path(configFile)));

    sLogger.info("Reading configuration to determine number of servers to launch:");
    List<String> sids = new ArrayList<String>();
    NodeList servers = d.getElementsByTagName("server");
    for (int i = 0; i < servers.getLength(); i++) {
        Node node = servers.item(i);

        // Get the server id.
        String sid = XMLTools.getAttributeValue(node, "id", null);
        if (sid == null) {
            throw new Exception("Must specify a query id attribute for every server!");
        }
        sLogger.info(" - sid: " + sid);
        sids.add(sid);
    }

    int port = 7000;
    int numServers = sids.size();
    String configPath = args[1];
    if (fs.exists(new Path(configPath))) {
        fs.delete(new Path(configPath), true);
    }

    String fname = appendPath(configPath, "config-" + numServers + ".txt");
    sLogger.info("Writing configuration to: " + fname);
    StringBuffer sb = new StringBuffer();
    for (int n = 0; n < numServers; n++) {
        port++;
        sb.append(sids.get(n) + " " + port + "\n");
    }
    FSDataOutputStream out = fs.create(new Path(fname), true);
    out.writeBytes(sb.toString());
    out.close();

    JobConf conf = new JobConf(getConf(), RetrievalServer.class);
    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(0);
    conf.setInputFormat(NLineInputFormat.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setMapperClass(ServerMapper.class);
    FileInputFormat.setInputPaths(conf, new Path(fname));

    conf.set("Ivory.ConfigFile", configFile);
    conf.set("Ivory.ConfigPath", configPath);
    conf.setJobName("RetrievalServers");
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    // conf.set("mapred.job.queue.name", "search");

    JobClient client = new JobClient(conf);
    client.submitJob(conf);

    sLogger.info("Waiting for servers to start up...");

    // Poll HDFS for hostnames and ports.
    boolean allStarted = true;
    do {
        allStarted = true;
        for (int n = 0; n < numServers; n++) {
            String f = appendPath(configPath, sids.get(n) + ".host");
            if (!fs.exists(new Path(f))) {
                allStarted = false;
            }
        }
        Thread.sleep(10000);
        sLogger.info(" ...");
    } while (!allStarted);

    // Poll HDFS for the signal that the index is ready.
    boolean allReady = true;
    do {
        allReady = true;
        for (int n = 0; n < numServers; n++) {
            String f = appendPath(configPath, sids.get(n) + ".ready");
            if (!fs.exists(new Path(f))) {
                allReady = false;
            }
        }
        Thread.sleep(10000);
        sLogger.info(" ...");
    } while (!allReady);

    sLogger.info("All servers ready!");
    sLogger.info("Host information:");
    for (int n = 0; n < numServers; n++) {
        String f = appendPath(configPath, sids.get(n) + ".host");
        sLogger.info(" sid=" + sids.get(n) + ", " + FSProperty.readString(fs, f));
    }

    return 0;
}
From source file:job.uncombine.compressed.BigBuildInvertedIndex.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    //long GB = 1024 * 1024 * 1024;
    //long totalDataSize = 1 * GB;
    int reduceNumArray[] = { 9, 18 };
    int splitSizeMBArray[] = { 64, 128, 256 };
    int xmxArray[] = { 1000, 2000, 3000, 4000 };
    int xmsArray[] = { 0, 1 };
    int ismbArray[] = { 200, 400, 600, 800 };

    for (int splitIndex = 0; splitIndex < splitSizeMBArray.length; splitIndex++) {
        for (int reduceNumIndex = 0; reduceNumIndex < reduceNumArray.length; reduceNumIndex++) {
            for (int xmxIndex = 0; xmxIndex < xmxArray.length; xmxIndex++) {
                for (int xmsIndex = 0; xmsIndex < xmsArray.length; xmsIndex++) {
                    for (int ismbIndex = 0; ismbIndex < ismbArray.length; ismbIndex++) {

                        int reduceNum = reduceNumArray[reduceNumIndex];
                        int splitMB = splitSizeMBArray[splitIndex];
                        int xmx = xmxArray[xmxIndex];
                        int xms = xmsArray[xmsIndex] * xmx;
                        int ismb = ismbArray[ismbIndex];

                        JobConf conf = new JobConf(getConf(), BigBuildInvertedIndex.class);
                        conf.setLong("mapred.min.split.size", SplitTable.getMapred_min_split_size(splitMB));
                        conf.setLong("mapred.max.split.size", SplitTable.getMapred_max_split_size(splitMB));
                        //conf.setInt("my.sample.split.num", (int) (totalDataSize / (splitMB * 1024 * 1024)));
                        conf.setInt("mapred.reduce.tasks", reduceNum);
                        conf.setInt("io.sort.mb", ismb);

                        if (xms == 0)
                            conf.set("mapred.child.java.opts", "-Xmx" + xmx + "m");
                        else
                            conf.set("mapred.child.java.opts", "-Xmx" + xmx + "m -Xms" + xms + "m");

                        conf.setInt("child.monitor.metrics.seconds", 2);
                        conf.setInt("child.monitor.jvm.seconds", 2);
                        conf.setInt("child.monitor.jstat.seconds", 2);

                        conf.setJobName("BigBuildInvertedIndex " + splitMB + "MB "
                                + conf.get("mapred.child.java.opts") + " ismb=" + ismb + " RN=" + reduceNum);

                        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
                        if (otherArgs.length != 2) {
                            System.err.println("Usage: BigBuildInvertedIndex <in> <out>");
                            System.exit(2);
                        }

                        conf.setMapOutputKeyClass(Text.class);
                        conf.setMapOutputValueClass(PairOfInts.class);
                        conf.setOutputKeyClass(Text.class);
                        conf.setOutputValueClass(PairOfWritables.class);
                        SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK);
                        conf.setOutputFormat(MapFileOutputFormat.class);

                        conf.setMapperClass(MyMapper.class);
                        // conf.setCombinerClass(IdentityReducer.class);
                        conf.setReducerClass(MyReducer.class);

                        FileInputFormat.setInputPaths(conf, new Path(otherArgs[0]));
                        FileOutputFormat.setOutputPath(conf, new Path(otherArgs[1]));
                        FileSystem.get(conf).delete(new Path(otherArgs[1]), true);

                        try {
                            JobClient.runJob(conf);
                        } catch (IOException e) {
                            e.printStackTrace();
                        }

                        Thread.sleep(15000);
                    }
                }
            }
        }
    }
    return 0;
}
From source file:jobimtext.thesaurus.distributional.hadoop.mapreduce.AggrPerFt.java
License:Apache License
/**
 * Set the job configuration and classes, and run the job.
 */
@SuppressWarnings("deprecation")
public static void main(String[] args) throws Exception {
    JobConf conf = HadoopUtil.generateJobConf(args);
    // JobConf conf = new JobConf(AggrPerFt.class);
    // conf.setJobName("AggrPerFt");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    /*
     * Use compression.
     */
    // conf.set("mapred.output.compress", "true");
    // conf.set("mapred.map.output.compress", "true");
    // conf.set("mapred.map.output.compression.codec",
    //     "org.apache.hadoop.io.compress.SnappyCodec");
    // conf.set("mapred.output.compression.codec",
    //     "org.apache.hadoop.io.compress.SnappyCodec");

    /* Set the maximum number of tasks per node. */
    int maptasks = 120;
    conf.set("mapred.tasktracker.map.tasks.maximum", "" + maptasks);
    conf.set("mapred.map.tasks", "" + maptasks);
    conf.set("mapred.tasktracker.map", "" + maptasks);

    int reducetasks = 120;
    conf.set("mapred.tasktracker.reduce.tasks.maximum", "" + reducetasks);
    conf.set("mapred.reduce.tasks", "" + reducetasks);
    conf.set("mapred.tasktracker.reduce", "" + reducetasks);

    /*
     * Heap size for the job.
     */
    conf.set("mapred.child.java.opts", "-Xmx1500m");

    /*
     * How much virtual memory the entire process tree of each map/reduce
     * task will use.
     */
    conf.set("mapred.job.map.memory.mb", "2048");
    conf.set("mapred.job.reduce.memory.mb", "2048");

    JobClient.runJob(conf);
}
From source file:jobimtext.thesaurus.distributional.hadoop.mapreduce.AggrPerFtUniquePositions.java
License:Apache License
/**
 * Set the job configuration and classes, and run the job.
 */
@SuppressWarnings("deprecation")
public static void main(String[] args) throws Exception {
    JobConf conf = HadoopUtil.generateJobConf(args);
    // JobConf conf = new JobConf(AggrPerFtUniquePositions.class);
    conf.setJobName("AggrPerFtUniquePositions " + args[0] + " " + args[1]);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    /*
     * Use compression.
     */
    conf.set("mapred.output.compress", "true");
    conf.set("mapred.map.output.compress", "true");
    conf.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.SnappyCodec");
    conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.SnappyCodec");

    /* Set the maximum number of tasks per node. */
    int maptasks = 120;
    conf.set("mapred.tasktracker.map.tasks.maximum", "" + maptasks);
    conf.set("mapred.map.tasks", "" + maptasks);
    conf.set("mapred.tasktracker.map", "" + maptasks);

    int reducetasks = 60;
    conf.set("mapred.tasktracker.reduce.tasks.maximum", "" + reducetasks);
    conf.set("mapred.reduce.tasks", "" + reducetasks);
    conf.set("mapred.tasktracker.reduce", "" + reducetasks);

    /*
     * Heap size for the job.
     */
    conf.set("mapred.child.java.opts", "-Xmx1500m");

    /*
     * How much virtual memory the entire process tree of each map/reduce
     * task will use.
     */
    conf.set("mapred.job.map.memory.mb", "2048");
    conf.set("mapred.job.reduce.memory.mb", "2048");

    JobClient.runJob(conf);
}