List of usage examples for org.apache.hadoop.conf Configuration setLong
public void setLong(String name, long value)
Set the value of the name property to a long.
From source file:nl.cwi.kba2013.apps.chunk_stream_DocExtractor.java
License:Apache License
@Override public int run(String[] args) throws Exception { String in = null;//from w ww.j a v a 2s . c o m String out = null; String traintestFile = null; HashMap<String, Object> run_info = new HashMap<String, Object>(); List<String> other_args = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { try { if ("-i".equals(args[i])) { in = args[++i]; } else if ("-o".equals(args[i])) { out = args[++i]; } else if ("-tt".equals(args[i])) { traintestFile = args[++i]; Log.info("TrainTest"); } else if ("-h".equals(args[i]) || "--help".equals(args[i])) { return printUsage(); } else { other_args.add(args[i]); } } catch (ArrayIndexOutOfBoundsException except) { System.out.println("ERROR: Required parameter missing from " + args[i - 1]); return printUsage(); } } if (other_args.size() > 0 || in == null || out == null) return printUsage(); LOG.info("Tool: " + this.getClass().getName()); LOG.info(" - input path: " + in); LOG.info(" - output path: " + out); Configuration conf = getConf(); conf.set(TrainTest_HDFS, new Path(traintestFile).toUri().toString()); // set time conf.setLong("mapred.task.timeout", 40 * 600000); conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit"); conf.set("mapred.child.java.opts", "-Xmx4096m"); Job job = new Job(conf, "chunk -stream"); job.setJarByClass(chunk_stream_DocExtractor.class); // some weird issues with Thrift classes in the Hadoop distro. job.setUserClassesTakesPrecedence(true); // make the query file available to each mapper. 
DistributedCache.addCacheFile(new URI(new Path(traintestFile) + "#" + TrainTest_HDFS), job.getConfiguration()); DistributedCache.createSymlink(job.getConfiguration()); job.setInputFormatClass(TextInputFormat.class); job.setMapperClass(MyMapper.class); FileInputFormat.addInputPath(job, new Path(in)); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); job.setNumReduceTasks(1); FileSystem.get(conf).delete(new Path(out), true); TextOutputFormat.setOutputPath(job, new Path(out)); LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class); // Let's go int status = job.waitForCompletion(true) ? 0 : 1; Counters c = job.getCounters(); long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue(); run_info.put("elapsed_time", ((double) cputime)); long num_filter_results = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS) .getValue(); run_info.put("num_filter_results", num_filter_results); long num_entity_doc_compares = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS) .getValue(); run_info.put("num_entity_doc_compares", num_entity_doc_compares); long hours = c.findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_INPUT_GROUPS).getValue(); run_info.put("num_stream_hours", hours); return status; }
From source file:nl.cwi.kba2013.apps.FeatureExtractor.java
License:Apache License
@Override public int run(String[] args) throws Exception { String in = null;//from w w w . j a va2 s . com String out = null; String queryfile = null; String contextFile = null; String systemdescription = null; String systemdescription_short = null; String corpus_id = null; String runtag = null; String teamname = null; String annoFile = null; String gcldFile = null; String labelsFile = null; String pprFile = null; String myverFile = null; String wikiFile = null; HashMap<String, Object> run_info = new HashMap<String, Object>(); List<String> other_args = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { try { if ("-i".equals(args[i])) { in = args[++i]; } else if ("-o".equals(args[i])) { out = args[++i]; } else if ("-q".equals(args[i])) { queryfile = args[++i]; } else if ("-r".equals(args[i])) { runtag = args[++i]; } else if ("-l".equals(args[i])) { labelsFile = args[++i]; } else if ("-a".equals(args[i])) { annoFile = args[++i]; } else if ("-t".equals(args[i])) { teamname = args[++i]; } else if ("-d".equals(args[i])) { systemdescription = args[++i]; } else if ("-ds".equals(args[i])) { systemdescription_short = args[++i]; } else if ("-p".equals(args[i])) { pprFile = args[++i]; } else if ("-g".equals(args[i])) { gcldFile = args[++i]; } else if ("-s".equals(args[i])) { myverFile = args[++i]; } else if ("-c".equals(args[i])) { contextFile = args[++i]; } else if ("-w".equals(args[i])) { wikiFile = args[++i]; } else if ("-h".equals(args[i]) || "--help".equals(args[i])) { return printUsage(); } else { other_args.add(args[i]); } } catch (ArrayIndexOutOfBoundsException except) { System.out.println("ERROR: Required parameter missing from " + args[i - 1]); return printUsage(); } } if (other_args.size() > 0 || in == null || out == null || queryfile == null) return printUsage(); if (runtag == null) runtag = "toy_1"; if (teamname == null) teamname = "CompInsights"; if (corpus_id == null) corpus_id = "kba-stream-corpus-2012-cleansed-only"; if (systemdescription == 
null) systemdescription = "Description intentionally left blank."; if (systemdescription_short == null) systemdescription_short = "a two -step classification approach"; LOG.info("Tool: " + this.getClass().getName()); LOG.info(" - input path: " + in); LOG.info(" - output path: " + out); LOG.info(" - runtag: " + runtag); LOG.info(" - teamname: " + teamname); LOG.info(" - corpus_id: " + corpus_id); LOG.info(" - run description: " + systemdescription); Run_info fr = new Run_info.Factory().create(teamname, runtag, systemdescription, systemdescription_short, corpus_id); Map<String, String> Attr = new LinkedHashMap<String, String>(); // Attr.put("trec-kba", ""); /* * Attr.put("LengthTitle", ""); Attr.put("LengthBody", ""); * Attr.put("LengthAnchor", ""); Attr.put("Source", ""); * Attr.put("English", ""); Attr.put("MentionsTitle", ""); * Attr.put("MentionsBody", ""); Attr.put("MentionsAnchor", ""); * Attr.put("FirstPos", ""); Attr.put("LastPos", ""); Attr.put("Spread", * ""); Attr.put("FirstPosNorm", ""); Attr.put("LastPosNorm", ""); * Attr.put("SpreadNorm", ""); // Attr.put("Related", ""); * Attr.put("Relatedtitle", ""); Attr.put("RelatedBody", ""); * Attr.put("RelatedAnchor", ""); Attr.put("ppr", ""); Attr.put("gcld", * ""); Attr.put("partial", ""); Attr.put("s_form", ""); * Attr.put("contxL", "0"); Attr.put("contxR", "0"); Attr.put("cos", * "0"); Attr.put("kl", "0"); Attr.put("jac", "0"); Attr.put("Class", * ""); */ Attr.put("gcld", "0"); Attr.put("jac", "0"); Attr.put("cos", "0"); Attr.put("kl", "0"); Attr.put("ppr", "0"); Attr.put("s_form", "0"); Attr.put("contxR", "0"); Attr.put("contxL", "0"); Attr.put("FirstPos", "0"); Attr.put("LastPos", "0"); Attr.put("LengthBody", "0"); Attr.put("FirstPosNorm", "0"); Attr.put("MentionsBody", "0"); Attr.put("RelatedBody", "0"); Attr.put("Spread", "0"); Attr.put("LastPosNorm", "0"); Attr.put("SpreadNorm", "0"); Attr.put("LengthAnchor", "0"); Attr.put("Source", "0"); Attr.put("LengthTitle", "0"); Attr.put("partial", "0"); 
Attr.put("MentionsAnchor", "0"); Attr.put("Relatedtitle", "0"); Attr.put("English", "0"); Attr.put("RelatedAnchor", "0"); Attr.put("MentionsTitle", "0"); Attr.put("Class", "0"); Configuration conf = getConf(); conf.set(QUERYFILEPATH_HDFS, new Path(queryfile).toUri().toString()); conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString()); conf.set(ANNOFILEPATH_HDFS, new Path(annoFile).toUri().toString()); conf.set(PPR_HDFS, new Path(pprFile).toUri().toString()); conf.set(MYVER, new Path(myverFile).toUri().toString()); conf.set(GCLD_HDFS, new Path(gcldFile).toUri().toString()); conf.set(CONTEXT_HDFS, new Path(contextFile).toUri().toString()); conf.set(WIKI_HDFS, new Path(wikiFile).toUri().toString()); conf.set(RUNTAG, runtag); conf.set(TEAMNAME, teamname); // set time conf.setLong("mapred.task.timeout", 40 * 600000); conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit"); conf.set("mapred.child.java.opts", "-Xmx4096m"); FileSystem fs = FileSystem.get(conf); // Lookup required data from the topic file loadTopicData(queryfile, fr, fs, run_info); Job job = new Job(conf, "Feature Extractor"); job.setJarByClass(FeatureExtractor.class); // some weird issues with Thrift classes in the Hadoop distro. job.setUserClassesTakesPrecedence(true); // make the query file available to each mapper. 
DistributedCache.addCacheFile(new URI(new Path(queryfile) + "#" + QUERYFILEPATH_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(annoFile) + "#" + ANNOFILEPATH_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(pprFile) + "#" + PPR_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(gcldFile) + "#" + GCLD_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(contextFile) + "#" + CONTEXT_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(wikiFile) + "#" + WIKI_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(myverFile) + "#" + MYVER), job.getConfiguration()); DistributedCache.createSymlink(job.getConfiguration()); job.setInputFormatClass(ThriftFileInputFormat.class); job.setMapperClass(MyMapper.class); FileInputFormat.addInputPath(job, new Path(in)); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); // job.setCombinerClass(MyReducer.class); // job.setReducerClass(MyReducer.class); job.setNumReduceTasks(1); FileSystem.get(conf).delete(new Path(out), true); TextOutputFormat.setOutputPath(job, new Path(out)); job.setOutputFormatClass(TextOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); // Let's go int status = job.waitForCompletion(true) ? 
0 : 1; Counters c = job.getCounters(); long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue(); run_info.put("elapsed_time", ((double) cputime)); long num_filter_results = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS) .getValue(); run_info.put("num_filter_results", num_filter_results); long num_entity_doc_compares = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS) .getValue(); run_info.put("num_entity_doc_compares", num_entity_doc_compares); long hours = c.findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_INPUT_GROUPS).getValue(); run_info.put("num_stream_hours", hours); fr.setAdditionalProperties("run_info", run_info); System.out.println("#" + new Run_info.Factory().toJSON(fr)); System.out.println("@RELATION" + " trec-kba" + " "); for (String key : Attr.keySet()) { if (key.equalsIgnoreCase("English")) { System.out.println("@ATTRIBUTE " + key + " " + "{0,1,2}"); } else if (key.equalsIgnoreCase("Class")) { System.out.println("@ATTRIBUTE " + key + " " + "{0,1}"); } else { System.out.println("@ATTRIBUTE " + key + " " + "NUMERIC"); } } System.out.println("\n@DATA"); // Text line = new Text(); // LineReader reader = new LineReader(fs.open(new Path(out // + "/part-r-00000"))); // for (int i = 0; i < num_filter_results; i++) { // reader.readLine(line); // System.out.println(line.toString().split("\t\t")[1]); // } System.out.println("#" + new Run_info.Factory().toPrettyJSON(fr).replaceAll("\\n", "\n#")); return status; }
From source file:nl.cwi.kba2013.apps.FeatureExtractor_DocExtractor.java
License:Apache License
@Override public int run(String[] args) throws Exception { String in = null;// w w w .j av a 2s . c om String out = null; String queryfile = null; String contextFile = null; String systemdescription = null; String systemdescription_short = null; String corpus_id = null; String runtag = null; String teamname = null; String annoFile = null; String gcldFile = null; String labelsFile = null; String pprFile = null; String myverFile = null; String wikiFile = null; String traintestFile = null; HashMap<String, Object> run_info = new HashMap<String, Object>(); List<String> other_args = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { try { if ("-i".equals(args[i])) { in = args[++i]; } else if ("-o".equals(args[i])) { out = args[++i]; } else if ("-q".equals(args[i])) { queryfile = args[++i]; } else if ("-r".equals(args[i])) { runtag = args[++i]; } else if ("-l".equals(args[i])) { labelsFile = args[++i]; } else if ("-a".equals(args[i])) { annoFile = args[++i]; } else if ("-t".equals(args[i])) { teamname = args[++i]; } else if ("-d".equals(args[i])) { systemdescription = args[++i]; } else if ("-ds".equals(args[i])) { systemdescription_short = args[++i]; } else if ("-p".equals(args[i])) { pprFile = args[++i]; } else if ("-g".equals(args[i])) { gcldFile = args[++i]; } else if ("-s".equals(args[i])) { myverFile = args[++i]; } else if ("-c".equals(args[i])) { contextFile = args[++i]; } else if ("-w".equals(args[i])) { wikiFile = args[++i]; } else if ("-tt".equals(args[i])) { traintestFile = args[++i]; } else if ("-h".equals(args[i]) || "--help".equals(args[i])) { return printUsage(); } else { other_args.add(args[i]); } } catch (ArrayIndexOutOfBoundsException except) { System.out.println("ERROR: Required parameter missing from " + args[i - 1]); return printUsage(); } } if (other_args.size() > 0 || in == null || out == null || queryfile == null) return printUsage(); if (runtag == null) runtag = "toy_1"; if (teamname == null) teamname = "CompInsights"; if (corpus_id 
== null) corpus_id = "kba-stream-corpus-2012-cleansed-only"; if (systemdescription == null) systemdescription = "Description intentionally left blank."; if (systemdescription_short == null) systemdescription_short = "a two -step classification approach"; LOG.info("Tool: " + this.getClass().getName()); LOG.info(" - input path: " + in); LOG.info(" - output path: " + out); LOG.info(" - runtag: " + runtag); LOG.info(" - teamname: " + teamname); LOG.info(" - corpus_id: " + corpus_id); LOG.info(" - run description: " + systemdescription); Run_info fr = new Run_info.Factory().create(teamname, runtag, systemdescription, systemdescription_short, corpus_id); Map<String, String> Attr = new LinkedHashMap<String, String>(); // Attr.put("trec-kba", ""); /* * Attr.put("LengthTitle", ""); Attr.put("LengthBody", ""); * Attr.put("LengthAnchor", ""); Attr.put("Source", ""); * Attr.put("English", ""); Attr.put("MentionsTitle", ""); * Attr.put("MentionsBody", ""); Attr.put("MentionsAnchor", ""); * Attr.put("FirstPos", ""); Attr.put("LastPos", ""); Attr.put("Spread", * ""); Attr.put("FirstPosNorm", ""); Attr.put("LastPosNorm", ""); * Attr.put("SpreadNorm", ""); // Attr.put("Related", ""); * Attr.put("Relatedtitle", ""); Attr.put("RelatedBody", ""); * Attr.put("RelatedAnchor", ""); Attr.put("ppr", ""); Attr.put("gcld", * ""); Attr.put("partial", ""); Attr.put("s_form", ""); * Attr.put("contxL", "0"); Attr.put("contxR", "0"); Attr.put("cos", * "0"); Attr.put("kl", "0"); Attr.put("jac", "0"); Attr.put("Class", * ""); */ Attr.put("gcld", "0"); Attr.put("jac", "0"); Attr.put("cos", "0"); Attr.put("kl", "0"); Attr.put("ppr", "0"); Attr.put("s_form", "0"); Attr.put("contxR", "0"); Attr.put("contxL", "0"); Attr.put("FirstPos", "0"); Attr.put("LastPos", "0"); Attr.put("LengthBody", "0"); Attr.put("FirstPosNorm", "0"); Attr.put("MentionsBody", "0"); Attr.put("RelatedBody", "0"); Attr.put("Spread", "0"); Attr.put("LastPosNorm", "0"); Attr.put("SpreadNorm", "0"); Attr.put("LengthAnchor", "0"); 
Attr.put("Source", "0"); Attr.put("LengthTitle", "0"); Attr.put("partial", "0"); Attr.put("MentionsAnchor", "0"); Attr.put("Relatedtitle", "0"); Attr.put("English", "0"); Attr.put("RelatedAnchor", "0"); Attr.put("MentionsTitle", "0"); Attr.put("Class", "0"); Configuration conf = getConf(); conf.set(QUERYFILEPATH_HDFS, new Path(queryfile).toUri().toString()); conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString()); conf.set(ANNOFILEPATH_HDFS, new Path(annoFile).toUri().toString()); conf.set(PPR_HDFS, new Path(pprFile).toUri().toString()); conf.set(MYVER, new Path(myverFile).toUri().toString()); conf.set(GCLD_HDFS, new Path(gcldFile).toUri().toString()); conf.set(CONTEXT_HDFS, new Path(contextFile).toUri().toString()); conf.set(WIKI_HDFS, new Path(wikiFile).toUri().toString()); conf.set(TrainTest_HDFS, new Path(traintestFile).toUri().toString()); conf.set(RUNTAG, runtag); conf.set(TEAMNAME, teamname); // set time conf.setLong("mapred.task.timeout", 40 * 600000); conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit"); conf.set("mapred.child.java.opts", "-Xmx4096m"); FileSystem fs = FileSystem.get(conf); // Lookup required data from the topic file loadTopicData(queryfile, fr, fs, run_info); Job job = new Job(conf, "Feature Extractor"); job.setJarByClass(FeatureExtractor_DocExtractor.class); // some weird issues with Thrift classes in the Hadoop distro. job.setUserClassesTakesPrecedence(true); // make the query file available to each mapper. 
DistributedCache.addCacheFile(new URI(new Path(queryfile) + "#" + QUERYFILEPATH_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(annoFile) + "#" + ANNOFILEPATH_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(pprFile) + "#" + PPR_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(gcldFile) + "#" + GCLD_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(contextFile) + "#" + CONTEXT_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(wikiFile) + "#" + WIKI_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(traintestFile) + "#" + TrainTest_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(myverFile) + "#" + MYVER), job.getConfiguration()); DistributedCache.createSymlink(job.getConfiguration()); /* job.setInputFormatClass(ThriftFileInputFormat.class); job.setMapperClass(MyMapper.class); FileInputFormat.addInputPath(job, new Path(in)); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); // job.setCombinerClass(MyReducer.class); // job.setReducerClass(MyReducer.class); job.setNumReduceTasks(1); FileSystem.get(conf).delete(new Path(out), true); TextOutputFormat.setOutputPath(job, new Path(out)); job.setOutputFormatClass(TextOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); // Let's go int status = job.waitForCompletion(true) ? 
0 : 1; */ //job.setInputFormatClass(ThriftFileInputFormat.class); //job.setInputFormatClass(TextOutputFormat.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setMapperClass(MyMapper.class); FileInputFormat.addInputPath(job, new Path(in)); //job.setMapOutputKeyClass(Text.class); //job.setMapOutputValueClass(StreamItemWritable.class); //job.setMapOutputValueClass(Text.class); //job.setOutputKeyClass(StreamItemWritable.class); // job.setCombinerClass(MyReducer.class); //job.setReducerClass(MyReducer.class); job.setNumReduceTasks(1); FileSystem.get(conf).delete(new Path(out), true); SequenceFileOutputFormat.setOutputPath(job, new Path(out)); //job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); //job.setOutputValueClass(StreamItemWritable.class); job.setOutputValueClass(Text.class); //LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class); LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class); // Let's go int status = job.waitForCompletion(true) ? 
0 : 1; Counters c = job.getCounters(); long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue(); run_info.put("elapsed_time", ((double) cputime)); long num_filter_results = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS) .getValue(); run_info.put("num_filter_results", num_filter_results); long num_entity_doc_compares = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS) .getValue(); run_info.put("num_entity_doc_compares", num_entity_doc_compares); long hours = c.findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_INPUT_GROUPS).getValue(); run_info.put("num_stream_hours", hours); fr.setAdditionalProperties("run_info", run_info); System.out.println("#" + new Run_info.Factory().toJSON(fr)); System.out.println("@RELATION" + " trec-kba" + " "); for (String key : Attr.keySet()) { if (key.equalsIgnoreCase("English")) { System.out.println("@ATTRIBUTE " + key + " " + "{0,1,2}"); } else if (key.equalsIgnoreCase("Class")) { System.out.println("@ATTRIBUTE " + key + " " + "{0,1}"); } else { System.out.println("@ATTRIBUTE " + key + " " + "NUMERIC"); } } System.out.println("\n@DATA"); // Text line = new Text(); // LineReader reader = new LineReader(fs.open(new Path(out // + "/part-r-00000"))); // for (int i = 0; i < num_filter_results; i++) { // reader.readLine(line); // System.out.println(line.toString().split("\t\t")[1]); // } System.out.println("#" + new Run_info.Factory().toPrettyJSON(fr).replaceAll("\\n", "\n#")); return status; }
From source file:nl.cwi.kba2013.apps.KBaDocExtractorFromCleansed.java
License:Apache License
@Override public int run(String[] args) throws Exception { String in = null;//from w w w. j a v a2s . c o m String out = null; String traintestFile = null; List<String> other_args = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { try { if ("-i".equals(args[i])) { in = args[++i]; } else if ("-o".equals(args[i])) { out = args[++i]; } else if ("-t".equals(args[i])) { traintestFile = args[++i]; } else if ("-h".equals(args[i]) || "--help".equals(args[i])) { return printUsage(); } else { other_args.add(args[i]); } } catch (ArrayIndexOutOfBoundsException except) { System.out.println("ERROR: Required parameter missing from " + args[i - 1]); return printUsage(); } } Configuration conf = getConf(); conf.set(TrainTest_HDFS, new Path(traintestFile).toUri().toString()); // set time conf.setLong("mapred.task.timeout", 40 * 600000); conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit"); conf.set("mapred.child.java.opts", "-Xmx4096m"); Job job = new Job(conf, "Annotation Extraction"); job.setJarByClass(KBaDocExtractorFromCleansed.class); // some weird issues with Thrift classes in the Hadoop distro. job.setUserClassesTakesPrecedence(true); DistributedCache.addCacheFile(new URI(new Path(traintestFile) + "#" + TrainTest_HDFS), job.getConfiguration()); DistributedCache.createSymlink(job.getConfiguration()); job.setInputFormatClass(ThriftFileInputFormat.class); job.setMapperClass(MyMapper.class); FileInputFormat.addInputPath(job, new Path(in)); job.setNumReduceTasks(0); FileSystem.get(conf).delete(new Path(out), true); SequenceFileOutputFormat.setOutputPath(job, new Path(out)); job.setOutputKeyClass(Text.class); job.setOutputValueClass(StreamItemWritable.class); LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class); int status = job.waitForCompletion(true) ? 0 : 1; Counters c = job.getCounters(); long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue(); return status; }
From source file:nl.cwi.kba2013.apps.KbaExtractMissingFromRaw.java
License:Apache License
@Override public int run(String[] args) throws Exception { String in = null;// ww w .java 2 s .c o m String out = null; String queryfile = null; String labelsFile = null; String traintestFile = null; List<String> other_args = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { try { if ("-i".equals(args[i])) { in = args[++i]; } else if ("-o".equals(args[i])) { out = args[++i]; } else if ("-q".equals(args[i])) { queryfile = args[++i]; } else if ("-l".equals(args[i])) { labelsFile = args[++i]; } else if ("-t".equals(args[i])) { traintestFile = args[++i]; } else if ("-h".equals(args[i]) || "--help".equals(args[i])) { return printUsage(); } else { other_args.add(args[i]); } } catch (ArrayIndexOutOfBoundsException except) { System.out.println("ERROR: Required parameter missing from " + args[i - 1]); return printUsage(); } } if (other_args.size() > 0 || in == null || out == null || queryfile == null) return printUsage(); LOG.info("Tool: " + this.getClass().getName()); LOG.info(" - input path: " + in); LOG.info(" - output path: " + out); Configuration conf = getConf(); conf.set(QUERYFILEPATH_HDFS, new Path(queryfile).toUri().toString()); conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString()); conf.set(TrainTest_HDFS, new Path(traintestFile).toUri().toString()); // set time conf.setLong("mapred.task.timeout", 40 * 600000); conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit"); conf.set("mapred.child.java.opts", "-Xmx4096m"); Job job = new Job(conf, "Missing Extractor"); job.setJarByClass(KbaExtractMissingFromRaw.class); // some weird issues with Thrift classes in the Hadoop distro. job.setUserClassesTakesPrecedence(true); // make the query file available to each mapper. 
DistributedCache.addCacheFile(new URI(new Path(queryfile) + "#" + QUERYFILEPATH_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(traintestFile) + "#" + TrainTest_HDFS), job.getConfiguration()); DistributedCache.createSymlink(job.getConfiguration()); //let's see it crushing again //job.setInputFormatClass(ThriftFileInputFormat.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setMapperClass(MyMapper.class); //job.setReducerClass(MyReducer.class); FileInputFormat.addInputPath(job, new Path(in)); job.setNumReduceTasks(0); FileSystem.get(conf).delete(new Path(out), true); SequenceFileOutputFormat.setOutputPath(job, new Path(out)); //job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); //job.setOutputValueClass(StreamItemWritable.class); job.setOutputValueClass(Text.class); //LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class); LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class); int status = job.waitForCompletion(true) ? 0 : 1; return status; }
From source file:nl.cwi.kba2013.apps.KbaNameVariantMatch.java
License:Apache License
@Override public int run(String[] args) throws Exception { String in = null;// w ww .j a va 2 s . co m String out = null; String queryfile = null; String labelsFile = null; String traintestFile = null; List<String> other_args = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { try { if ("-i".equals(args[i])) { in = args[++i]; } else if ("-o".equals(args[i])) { out = args[++i]; } else if ("-q".equals(args[i])) { queryfile = args[++i]; } else if ("-l".equals(args[i])) { labelsFile = args[++i]; } else if ("-t".equals(args[i])) { traintestFile = args[++i]; } else if ("-h".equals(args[i]) || "--help".equals(args[i])) { return printUsage(); } else { other_args.add(args[i]); } } catch (ArrayIndexOutOfBoundsException except) { System.out.println("ERROR: Required parameter missing from " + args[i - 1]); return printUsage(); } } if (other_args.size() > 0 || in == null || out == null || queryfile == null) return printUsage(); LOG.info("Tool: " + this.getClass().getName()); LOG.info(" - input path: " + in); LOG.info(" - output path: " + out); Configuration conf = getConf(); conf.set(QUERYFILEPATH_HDFS, new Path(queryfile).toUri().toString()); conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString()); conf.set(TrainTest_HDFS, new Path(traintestFile).toUri().toString()); // set time conf.setLong("mapred.task.timeout", 40 * 600000); conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit"); conf.set("mapred.child.java.opts", "-Xmx4096m"); Job job = new Job(conf, "Feature Extractor"); job.setJarByClass(KbaNameVariantMatch.class); // some weird issues with Thrift classes in the Hadoop distro. job.setUserClassesTakesPrecedence(true); // make the query file available to each mapper. 
DistributedCache.addCacheFile(new URI(new Path(queryfile) + "#" + QUERYFILEPATH_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(traintestFile) + "#" + TrainTest_HDFS), job.getConfiguration()); DistributedCache.createSymlink(job.getConfiguration()); //job.setInputFormatClass(ThriftFileInputFormat.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setMapperClass(MyMapper.class); FileInputFormat.addInputPath(job, new Path(in)); job.setNumReduceTasks(1); FileSystem.get(conf).delete(new Path(out), true); SequenceFileOutputFormat.setOutputPath(job, new Path(out)); //job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); //job.setOutputValueClass(StreamItemWritable.class); job.setOutputValueClass(Text.class); //LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class); LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class); int status = job.waitForCompletion(true) ? 0 : 1; return status; }
From source file:nl.cwi.kba2013.apps.KbaNameVariantMatchFeatureExtractor.java
License:Apache License
@Override public int run(String[] args) throws Exception { String in = null;// w w w . ja v a 2s.c o m String out = null; String queryfile = null; String contextFile = null; String systemdescription = null; String systemdescription_short = null; String corpus_id = null; String runtag = null; String teamname = null; String annoFile = null; String gcldFile = null; String labelsFile = null; String pprFile = null; String myverFile = null; String wikiFile = null; String traintestFile = null; HashMap<String, Object> run_info = new HashMap<String, Object>(); List<String> other_args = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { try { if ("-i".equals(args[i])) { in = args[++i]; } else if ("-o".equals(args[i])) { out = args[++i]; } else if ("-q".equals(args[i])) { queryfile = args[++i]; } else if ("-r".equals(args[i])) { runtag = args[++i]; } else if ("-l".equals(args[i])) { labelsFile = args[++i]; } else if ("-a".equals(args[i])) { annoFile = args[++i]; } else if ("-t".equals(args[i])) { teamname = args[++i]; } else if ("-d".equals(args[i])) { systemdescription = args[++i]; } else if ("-ds".equals(args[i])) { systemdescription_short = args[++i]; } else if ("-p".equals(args[i])) { pprFile = args[++i]; } else if ("-g".equals(args[i])) { gcldFile = args[++i]; } else if ("-s".equals(args[i])) { myverFile = args[++i]; } else if ("-c".equals(args[i])) { contextFile = args[++i]; } else if ("-w".equals(args[i])) { wikiFile = args[++i]; } else if ("-tt".equals(args[i])) { traintestFile = args[++i]; } else if ("-h".equals(args[i]) || "--help".equals(args[i])) { return printUsage(); } else { other_args.add(args[i]); } } catch (ArrayIndexOutOfBoundsException except) { System.out.println("ERROR: Required parameter missing from " + args[i - 1]); return printUsage(); } } if (other_args.size() > 0 || in == null || out == null || queryfile == null) return printUsage(); if (runtag == null) runtag = "toy_1"; if (teamname == null) teamname = "CompInsights"; if (corpus_id 
== null) corpus_id = "kba-stream-corpus-2012-cleansed-only"; if (systemdescription == null) systemdescription = "Description intentionally left blank."; if (systemdescription_short == null) systemdescription_short = "a two -step classification approach"; // LOG.info("Tool: " + this.getClass().getName()); // LOG.info(" - input path: " + in); // LOG.info(" - output path: " + out); // LOG.info(" - runtag: " + runtag); // LOG.info(" - teamname: " + teamname); // LOG.info(" - corpus_id: " + corpus_id); // LOG.info(" - run description: " + systemdescription); Run_info fr = new Run_info.Factory().create(teamname, runtag, systemdescription, systemdescription_short, corpus_id); Map<String, String> Attr = new LinkedHashMap<String, String>(); // Attr.put("trec-kba", ""); /* * Attr.put("LengthTitle", ""); Attr.put("LengthBody", ""); * Attr.put("LengthAnchor", ""); Attr.put("Source", ""); * Attr.put("English", ""); Attr.put("MentionsTitle", ""); * Attr.put("MentionsBody", ""); Attr.put("MentionsAnchor", ""); * Attr.put("FirstPos", ""); Attr.put("LastPos", ""); Attr.put("Spread", * ""); Attr.put("FirstPosNorm", ""); Attr.put("LastPosNorm", ""); * Attr.put("SpreadNorm", ""); // Attr.put("Related", ""); * Attr.put("Relatedtitle", ""); Attr.put("RelatedBody", ""); * Attr.put("RelatedAnchor", ""); Attr.put("ppr", ""); Attr.put("gcld", * ""); Attr.put("partial", ""); Attr.put("s_form", ""); * Attr.put("contxL", "0"); Attr.put("contxR", "0"); Attr.put("cos", * "0"); Attr.put("kl", "0"); Attr.put("jac", "0"); Attr.put("Class", * ""); */ Attr.put("gcld", "0"); Attr.put("jac", "0"); Attr.put("cos", "0"); Attr.put("kl", "0"); Attr.put("ppr", "0"); Attr.put("s_form", "0"); Attr.put("contxR", "0"); Attr.put("contxL", "0"); Attr.put("FirstPos", "0"); Attr.put("LastPos", "0"); Attr.put("LengthBody", "0"); Attr.put("FirstPosNorm", "0"); Attr.put("MentionsBody", "0"); Attr.put("RelatedBody", "0"); Attr.put("Spread", "0"); Attr.put("LastPosNorm", "0"); Attr.put("SpreadNorm", "0"); 
Attr.put("LengthAnchor", "0"); Attr.put("Source", "0"); Attr.put("LengthTitle", "0"); Attr.put("partial", "0"); Attr.put("MentionsAnchor", "0"); Attr.put("Relatedtitle", "0"); Attr.put("English", "0"); Attr.put("RelatedAnchor", "0"); Attr.put("MentionsTitle", "0"); Attr.put("Class", "0"); Configuration conf = getConf(); conf.set(QUERYFILEPATH_HDFS, new Path(queryfile).toUri().toString()); conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString()); conf.set(ANNOFILEPATH_HDFS, new Path(annoFile).toUri().toString()); conf.set(PPR_HDFS, new Path(pprFile).toUri().toString()); conf.set(MYVER, new Path(myverFile).toUri().toString()); conf.set(GCLD_HDFS, new Path(gcldFile).toUri().toString()); conf.set(CONTEXT_HDFS, new Path(contextFile).toUri().toString()); conf.set(WIKI_HDFS, new Path(wikiFile).toUri().toString()); conf.set(TrainTest_HDFS, new Path(traintestFile).toUri().toString()); conf.set(RUNTAG, runtag); conf.set(TEAMNAME, teamname); // set time conf.setLong("mapred.task.timeout", 40 * 600000); conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit"); conf.set("mapred.child.java.opts", "-Xmx4096m"); FileSystem fs = FileSystem.get(conf); // Lookup required data from the topic file loadTopicData(queryfile, fr, fs, run_info); Job job = new Job(conf, "Feature Extractor"); job.setJarByClass(KbaNameVariantMatchFeatureExtractor.class); // some weird issues with Thrift classes in the Hadoop distro. job.setUserClassesTakesPrecedence(true); // make the query file available to each mapper. 
DistributedCache.addCacheFile(new URI(new Path(queryfile) + "#" + QUERYFILEPATH_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(annoFile) + "#" + ANNOFILEPATH_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(pprFile) + "#" + PPR_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(gcldFile) + "#" + GCLD_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(contextFile) + "#" + CONTEXT_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(wikiFile) + "#" + WIKI_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(traintestFile) + "#" + TrainTest_HDFS), job.getConfiguration()); DistributedCache.addCacheFile(new URI(new Path(myverFile) + "#" + MYVER), job.getConfiguration()); DistributedCache.createSymlink(job.getConfiguration()); /* job.setInputFormatClass(ThriftFileInputFormat.class); job.setMapperClass(MyMapper.class); FileInputFormat.addInputPath(job, new Path(in)); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); // job.setCombinerClass(MyReducer.class); // job.setReducerClass(MyReducer.class); job.setNumReduceTasks(1); FileSystem.get(conf).delete(new Path(out), true); TextOutputFormat.setOutputPath(job, new Path(out)); job.setOutputFormatClass(TextOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); // Let's go int status = job.waitForCompletion(true) ? 
0 : 1; */ //job.setInputFormatClass(ThriftFileInputFormat.class); //job.setInputFormatClass(TextOutputFormat.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setMapperClass(MyMapper.class); FileInputFormat.addInputPath(job, new Path(in)); //job.setMapOutputKeyClass(Text.class); //job.setMapOutputValueClass(StreamItemWritable.class); //job.setMapOutputValueClass(Text.class); //job.setOutputKeyClass(StreamItemWritable.class); // job.setCombinerClass(MyReducer.class); //job.setReducerClass(MyReducer.class); job.setNumReduceTasks(1); FileSystem.get(conf).delete(new Path(out), true); SequenceFileOutputFormat.setOutputPath(job, new Path(out)); //job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); //job.setOutputValueClass(StreamItemWritable.class); job.setOutputValueClass(Text.class); //LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class); LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class); // Let's go int status = job.waitForCompletion(true) ? 
0 : 1; Counters c = job.getCounters(); long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue(); run_info.put("elapsed_time", ((double) cputime)); long num_filter_results = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS) .getValue(); run_info.put("num_filter_results", num_filter_results); long num_entity_doc_compares = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS) .getValue(); run_info.put("num_entity_doc_compares", num_entity_doc_compares); long hours = c.findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_INPUT_GROUPS).getValue(); run_info.put("num_stream_hours", hours); fr.setAdditionalProperties("run_info", run_info); System.out.println("#" + new Run_info.Factory().toJSON(fr)); System.out.println("@RELATION" + " trec-kba" + " "); for (String key : Attr.keySet()) { if (key.equalsIgnoreCase("English")) { System.out.println("@ATTRIBUTE " + key + " " + "{0,1,2}"); } else if (key.equalsIgnoreCase("Class")) { System.out.println("@ATTRIBUTE " + key + " " + "{0,1}"); } else { System.out.println("@ATTRIBUTE " + key + " " + "NUMERIC"); } } System.out.println("\n@DATA"); // Text line = new Text(); // LineReader reader = new LineReader(fs.open(new Path(out // + "/part-r-00000"))); // for (int i = 0; i < num_filter_results; i++) { // reader.readLine(line); // System.out.println(line.toString().split("\t\t")[1]); // } System.out.println("#" + new Run_info.Factory().toPrettyJSON(fr).replaceAll("\\n", "\n#")); return status; }
From source file:nl.cwi.kba2013.apps.KBANameVariantMatchTHERank.java
License:Apache License
@Override public int run(String[] args) throws Exception { String in = null;/*from w w w .j a v a 2 s . c o m*/ String out = null; String labelsFile = null; List<String> other_args = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { try { if ("-i".equals(args[i])) { in = args[++i]; } else if ("-o".equals(args[i])) { out = args[++i]; } else if ("-l".equals(args[i])) { labelsFile = args[++i]; } else if ("-h".equals(args[i]) || "--help".equals(args[i])) { return printUsage(); } else { other_args.add(args[i]); } } catch (ArrayIndexOutOfBoundsException except) { System.out.println("ERROR: Required parameter missing from " + args[i - 1]); return printUsage(); } } if (other_args.size() > 0 || in == null || out == null || labelsFile == null) return printUsage(); LOG.info("Tool: " + this.getClass().getName()); LOG.info(" - input path: " + in); LOG.info(" - output path: " + out); Configuration conf = getConf(); conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString()); // set time conf.setLong("mapred.task.timeout", 40 * 600000); conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit"); conf.set("mapred.child.java.opts", "-Xmx4096m"); Job job = new Job(conf, "Feature Extractor"); job.setJarByClass(KBANameVariantMatchTHERank.class); // some weird issues with Thrift classes in the Hadoop distro. job.setUserClassesTakesPrecedence(true); // make the query file available to each mapper. 
DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS), job.getConfiguration()); DistributedCache.createSymlink(job.getConfiguration()); job.setInputFormatClass(ThriftFileInputFormat.class); //job.setInputFormatClass(SequenceFileInputFormat.class); job.setMapperClass(MyMapper.class); job.setCombinerClass(MyReducer.class); job.setReducerClass(MyReducer.class); FileInputFormat.addInputPath(job, new Path(in)); job.setNumReduceTasks(50); FileSystem.get(conf).delete(new Path(out), true); TextOutputFormat.setOutputPath(job, new Path(out)); //job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); //job.setOutputValueClass(StreamItemWritable.class); job.setOutputValueClass(Text.class); //LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class); LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class); int status = job.waitForCompletion(true) ? 0 : 1; return status; }
From source file:nl.cwi.kba2013.apps.KBANER.java
License:Apache License
@Override public int run(String[] args) throws Exception { String in = null;/* ww w. j av a 2s . c o m*/ String out = null; String labelsFile = null; List<String> other_args = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { try { if ("-i".equals(args[i])) { in = args[++i]; } else if ("-o".equals(args[i])) { out = args[++i]; } else if ("-l".equals(args[i])) { labelsFile = args[++i]; } else if ("-h".equals(args[i]) || "--help".equals(args[i])) { return printUsage(); } else { other_args.add(args[i]); } } catch (ArrayIndexOutOfBoundsException except) { System.out.println("ERROR: Required parameter missing from " + args[i - 1]); return printUsage(); } } if (other_args.size() > 0 || in == null || out == null || labelsFile == null) return printUsage(); LOG.info("Tool: " + this.getClass().getName()); LOG.info(" - input path: " + in); LOG.info(" - output path: " + out); Configuration conf = getConf(); conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString()); // set time conf.setLong("mapred.task.timeout", 40 * 600000); conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit"); conf.set("mapred.child.java.opts", "-Xmx4096m"); Job job = new Job(conf, "Feature Extractor"); job.setJarByClass(KBANER.class); // some weird issues with Thrift classes in the Hadoop distro. job.setUserClassesTakesPrecedence(true); // make the query file available to each mapper. 
DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS), job.getConfiguration()); DistributedCache.createSymlink(job.getConfiguration()); job.setInputFormatClass(ThriftFileInputFormat.class); //job.setInputFormatClass(SequenceFileInputFormat.class); job.setMapperClass(MyMapper.class); job.setCombinerClass(MyReducer.class); job.setReducerClass(MyReducer.class); FileInputFormat.addInputPath(job, new Path(in)); job.setNumReduceTasks(0); FileSystem.get(conf).delete(new Path(out), true); TextOutputFormat.setOutputPath(job, new Path(out)); //job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); //job.setOutputValueClass(StreamItemWritable.class); job.setOutputValueClass(Text.class); //LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class); LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class); int status = job.waitForCompletion(true) ? 0 : 1; return status; }
From source file:nl.cwi.kba2013.apps.ReadGzip.java
License:Apache License
@Override public int run(String[] args) throws Exception { String in = null;// w ww . ja va 2 s. c o m String out = null; HashMap<String, Object> run_info = new HashMap<String, Object>(); List<String> other_args = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { try { if ("-i".equals(args[i])) { in = args[++i]; } else if ("-o".equals(args[i])) { out = args[++i]; } else if ("-h".equals(args[i]) || "--help".equals(args[i])) { return printUsage(); } else { other_args.add(args[i]); } } catch (ArrayIndexOutOfBoundsException except) { System.out.println("ERROR: Required parameter missing from " + args[i - 1]); return printUsage(); } } if (other_args.size() > 0 || in == null || out == null) return printUsage(); LOG.info("Tool: " + this.getClass().getName()); LOG.info(" - input path: " + in); LOG.info(" - output path: " + out); Configuration conf = getConf(); // set time conf.setLong("mapred.task.timeout", 40 * 600000); FileSystem fs = FileSystem.get(conf); // Lookup required data from the topic file Job job = new Job(conf, "Feature Extractor"); job.setJarByClass(ReadGzip.class); // some weird issues with Thrift classes in the Hadoop distro. job.setUserClassesTakesPrecedence(true); // make the query file available to each mapper. job.setInputFormatClass(ThriftFileInputFormat.class); job.setMapperClass(MyMapper.class); FileInputFormat.addInputPath(job, new Path(in)); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); // job.setCombinerClass(MyReducer.class); // job.setReducerClass(MyReducer.class); job.setNumReduceTasks(0); FileSystem.get(conf).delete(new Path(out), true); TextOutputFormat.setOutputPath(job, new Path(out)); job.setOutputFormatClass(TextOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); // Let's go int status = job.waitForCompletion(true) ? 
0 : 1; Counters c = job.getCounters(); long cputime = c.findCounter(org.apache.hadoop.mapred.Task.Counter.CPU_MILLISECONDS).getValue(); run_info.put("elapsed_time_secs", ((double) cputime / 1000d)); long num_filter_results = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS) .getValue(); run_info.put("num_filter_results", num_filter_results); long num_entity_doc_compares = c.findCounter(org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS) .getValue(); run_info.put("num_entity_doc_compares", num_entity_doc_compares); long hours = c.findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_INPUT_GROUPS).getValue(); run_info.put("num_stream_hours", hours); return status; }