List of usage examples for org.apache.hadoop.conf Configuration setBoolean
public void setBoolean(String name, boolean value)
Sets the value of the name property to a boolean.
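Before the project-specific examples, a minimal sketch of the call itself (the property name my.feature.enabled is only an illustrative placeholder): setBoolean stores the boolean under the given property name, and getBoolean reads it back, returning the supplied default when the key is absent.

import org.apache.hadoop.conf.Configuration;

public class SetBooleanExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // store a boolean under a (hypothetical) property name
        conf.setBoolean("my.feature.enabled", true);

        // read it back; the second argument is the default used when the key is not set
        boolean enabled = conf.getBoolean("my.feature.enabled", false);
        System.out.println("my.feature.enabled = " + enabled); // prints true
    }
}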
From source file:de.tudarmstadt.ukp.dkpro.bigdata.collocations.CollocDriver.java
License:Apache License
/**
 * pass1: generate collocations, ngrams
 */
private static long generateCollocations(Path input, Path output, Configuration baseConf, boolean emitUnigrams,
        int maxNGramSize, int reduceTasks, int minSupport, Window mode, int winsize)
        throws IOException, ClassNotFoundException, InterruptedException {

    Configuration con = new Configuration(baseConf);
    con.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    con.setInt(CollocMapper.MAX_SHINGLE_SIZE, maxNGramSize);
    con.setInt(CollocReducer.MIN_SUPPORT, minSupport);
    con.set(WINDOW_TYPE, mode.toString());
    con.setInt(WINDOW_SIZE, winsize);

    if (mode.toString().equalsIgnoreCase("DOCUMENT")) {
        con.setInt("mapred.job.map.memory.mb", 3000);
        con.set("mapred.child.java.opts", "-Xmx2900M");
        con.set("mapred.reduce.child.java.opts", "-Xmx8000M");
        con.setInt("mapred.job.reduce.memory.mb", 8120);
    } else {
        con.setInt("mapred.job.map.memory.mb", 2000);
        con.set("mapred.child.java.opts", "-Xmx1900M");
        con.set("mapred.reduce.child.java.opts", "-Xmx2900M");
        con.setInt("mapred.job.reduce.memory.mb", 3000);
    }
    con.setBoolean("mapred.compress.map.output", true);
    con.setStrings("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec");
    con.setBoolean("mapred.compress.output", true);
    con.setStrings("mapred.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec");
    con.setInt("mapred.task.timeout", 6000000);
    con.setInt("io.sort.factor", 50);
    con.setInt("mapreduce.map.tasks", 256);
    con.setInt("dfs.replication", 1);

    Job job = new Job(con);
    job.setJobName(CollocDriver.class.getSimpleName() + ".generateCollocations:" + input);
    job.setJarByClass(CollocDriver.class);

    job.setMapOutputKeyClass(GramKey.class);
    job.setMapOutputValueClass(Gram.class);
    job.setPartitionerClass(GramKeyPartitioner.class);
    job.setGroupingComparatorClass(GramKeyGroupComparator.class);

    job.setOutputKeyClass(Gram.class);
    job.setOutputValueClass(Gram.class);

    job.setCombinerClass(CollocCombiner.class);

    FileInputFormat.setInputPaths(job, input);

    Path outputPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(CollocMapper.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setReducerClass(CollocReducer.class);
    job.setNumReduceTasks(512);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

    return job.getCounters().findCounter(CollocMapper.Count.NGRAM_TOTAL).getValue();
}
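The mapred.* compression keys set above are the pre-Hadoop-2 names; they still resolve through the deprecation layer, but newer code usually writes the mapreduce.* equivalents. A hedged sketch follows; the replacement key names should be double-checked against the mapred-default.xml of the Hadoop release in use.

import org.apache.hadoop.conf.Configuration;

public class MapOutputCompressionExample {
    public static void main(String[] args) {
        Configuration con = new Configuration();
        // newer names for the compression switches used in generateCollocations() above
        con.setBoolean("mapreduce.map.output.compress", true);              // replaces mapred.compress.map.output
        con.set("mapreduce.map.output.compress.codec",
                "org.apache.hadoop.io.compress.DefaultCodec");
        con.setBoolean("mapreduce.output.fileoutputformat.compress", true); // replaces the mapred.compress.output key above
        con.set("mapreduce.output.fileoutputformat.compress.codec",
                "org.apache.hadoop.io.compress.DefaultCodec");
        System.out.println(con.get("mapreduce.map.output.compress"));       // prints "true"
    }
}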
From source file:de.tudarmstadt.ukp.dkpro.bigdata.collocations.CollocDriver.java
License:Apache License
/**
 * pass2: perform the LLR calculation
 */
private static void computeNGramsPruneByLLR(Path output, Configuration baseConf, long nGramTotal,
        boolean emitUnigrams, float minValue, int reduceTasks)
        throws IOException, InterruptedException, ClassNotFoundException {

    Configuration conf = new Configuration(baseConf);
    conf.setLong(AssocReducer.NGRAM_TOTAL, nGramTotal);
    conf.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    conf.setFloat(AssocReducer.MIN_VALUE, minValue);
    conf.setInt("mapred.job.map.memory.mb", 1280);
    conf.setInt("mapred.job.reduce.memory.mb", 2560);
    conf.set("mapred.reduce.child.java.opts", "-Xmx2G");
    conf.setInt("mapred.task.timeout", 6000000);
    conf.set(AssocReducer.ASSOC_METRIC, "llr");

    Job job = new Job(conf);
    job.setJobName(CollocDriver.class.getSimpleName() + ".computeNGrams: " + output + " pruning: " + minValue);
    job.setJarByClass(CollocDriver.class);

    job.setMapOutputKeyClass(Gram.class);
    job.setMapOutputValueClass(Gram.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);

    FileInputFormat.setInputPaths(job, new Path(output, SUBGRAM_OUTPUT_DIRECTORY));
    Path outPath = new Path(output, NGRAM_OUTPUT_DIRECTORY + "_llr");
    FileOutputFormat.setOutputPath(job, outPath);

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class);
    job.setReducerClass(AssocReducer.class);
    job.setNumReduceTasks(reduceTasks);

    // Defines additional single text based output 'text' for the job
    MultipleOutputs.addNamedOutput(job, "contingency", TextOutputFormat.class, Text.class, Text.class);

    // Defines additional multi sequencefile based output 'sequence' for the job
    MultipleOutputs.addNamedOutput(job, "llr", TextOutputFormat.class, Text.class, DoubleWritable.class);
    MultipleOutputs.addNamedOutput(job, "pmi", TextOutputFormat.class, Text.class, DoubleWritable.class);
    MultipleOutputs.addNamedOutput(job, "chi", TextOutputFormat.class, Text.class, DoubleWritable.class);
    MultipleOutputs.addNamedOutput(job, "dice", TextOutputFormat.class, Text.class, DoubleWritable.class);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
From source file:edu.berkeley.chukwa_xtrace.TestXtrAdaptor.java
License:Apache License
public void testXtrAdaptor() throws IOException, ChukwaAgent.AlreadyRunningException, InterruptedException {
    Configuration conf = new Configuration();
    File baseDir = new File(System.getProperty("test.build.data", "/tmp"));
    conf.set("chukwaAgent.checkpoint.dir", baseDir.getCanonicalPath());
    conf.setBoolean("chukwaAgent.checkpoint.enabled", false);
    conf.set("chukwaAgent.control.port", "0");
    ChukwaAgent agent = new ChukwaAgent(conf);

    ChunkCatcherConnector chunks = new ChunkCatcherConnector();
    chunks.start();

    System.setProperty("xtrace.reporter", "edu.berkeley.xtrace.reporting.TcpReporter");
    System.setProperty("xtrace.tcpdest", "localhost:7831");

    assertEquals(0, agent.adaptorCount());
    agent.processAddCommand("add edu.berkeley.chukwa_xtrace.XtrAdaptor XTrace TcpReportSource 0");
    assertEquals(1, agent.adaptorCount());

    XTraceContext.startTrace("test", "testtrace", "atag");
    XTraceContext.logEvent("test", "label");

    Chunk c = chunks.waitForAChunk();
    String report = new String(c.getData());
    assertTrue(report.contains("Agent: test"));
    assertTrue(report.contains("Tag: atag"));
    System.out.println(report);

    System.out.println("-- next chunk --- ");
    c = chunks.waitForAChunk();
    report = new String(c.getData());
    assertTrue(report.contains("Agent: test"));
    assertTrue(report.contains("Label: label"));
    System.out.println(report);

    System.out.println("OK");
    agent.shutdown();
}
From source file:edu.indiana.d2i.htrc.io.DataAPISilvermapleConf.java
License:Apache License
@Override
public void configurate(Configuration conf, int maxIdsPerReq) {
    // conf.setInt(HTRCConstants.MAX_ID_RETRIEVED, 100);
    conf.setInt(HTRCConstants.MAX_ID_RETRIEVED, maxIdsPerReq);
    conf.set(HTRCConstants.DATA_API_URL_DELIMITOR, "|");
    conf.set(HTRCConstants.DATA_API_CLIENTID, "drhtrc");
    conf.set(HTRCConstants.DATA_API_CLIENTSECRETE, "d0ct0r.htrc");
    conf.set(HTRCConstants.DATA_API_TOKENLOC,
            "https://silvermaple.pti.indiana.edu:25443/oauth2/token?grant_type=client_credentials");
    conf.setBoolean(HTRCConstants.DATA_API_SELFSIGNED, false);
    conf.set(HTRCConstants.HOSTS_SEPARATEDBY_COMMA, "silvermaple.pti.indiana.edu:25443");
}
From source file:edu.indiana.d2i.htrc.io.SparseVectorsToMemcached.java
License:Apache License
private void parallelTransform() throws IOException, ClassNotFoundException, InterruptedException {
    Job job = new Job(getConf(), "Create sparse vectors from HTRC data storage, store them in MemCached");
    job.setJarByClass(SparseVectorsToMemcached.class);

    Configuration conf = job.getConfiguration();
    setupConfiguration(conf);

    // no speculation
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(IDInputFormat.class);
    job.setOutputFormatClass(MemCachedOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(SparseVectorUtil.Text2VectorMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, new Path(idListDir));

    job.waitForCompletion(true);
}
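This example (and the nearly identical one that follows) disables speculative execution through the legacy key names. On Hadoop 2.x and later the same setBoolean switch is usually written with the replacement keys; a minimal sketch, assuming those key names are available in the target release:

import org.apache.hadoop.conf.Configuration;

public class DisableSpeculationExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // newer names; the mapred.*.speculative.execution keys remain as deprecated aliases
        conf.setBoolean("mapreduce.map.speculative", false);
        conf.setBoolean("mapreduce.reduce.speculative", false);
        System.out.println(conf.getBoolean("mapreduce.map.speculative", true)); // prints false
    }
}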
From source file:edu.indiana.d2i.htrc.io.SVFromHDFS2Memcached.java
License:Apache License
private void parallelTransform() throws IOException, ClassNotFoundException, InterruptedException {
    Job job = new Job(getConf(), "Create sparse vectors from HTRC data storage, store them in MemCached");
    job.setJarByClass(SVFromHDFS2Memcached.class);

    Configuration conf = job.getConfiguration();
    setupConfiguration(conf);

    // no speculation
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(MemCachedOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(SparseVectorUtil.Text2VectorMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, new Path(vecDir));

    job.waitForCompletion(true);
}
From source file:edu.isi.mavuno.app.distsim.ContextToContext.java
License:Apache License
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String contextPath = MavunoUtils.getRequiredParam("Mavuno.ContextToContext.ContextPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.ContextToContext.CorpusClass", conf);
    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.ContextToContext.CorpusPath", conf);
    String extractorClass = MavunoUtils.getRequiredParam("Mavuno.ContextToContext.ExtractorClass", conf);
    String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.ContextToContext.ExtractorArgs", conf);
    int minMatches = Integer.parseInt(MavunoUtils.getRequiredParam("Mavuno.ContextToContext.MinMatches", conf));
    boolean harvestGlobalStats = Boolean
            .parseBoolean(MavunoUtils.getRequiredParam("Mavuno.ContextToContext.GlobalStats", conf));
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.ContextToContext.OutputPath", conf);

    MavunoUtils.createDirectory(conf, outputPath);

    sLogger.info("Tool name: ContextToContext");
    sLogger.info(" - Context path: " + contextPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Output path: " + outputPath);
    sLogger.info(" - Extractor class: " + extractorClass);
    sLogger.info(" - Extractor arguments: " + extractorArgs);
    sLogger.info(" - Min matches: " + minMatches);
    sLogger.info(" - Harvest global stats: " + harvestGlobalStats);

    // context to pattern
    conf.set("Mavuno.ContextToPattern.ContextPath", contextPath);
    conf.set("Mavuno.ContextToPattern.CorpusPath", corpusPath);
    conf.set("Mavuno.ContextToPattern.CorpusClass", corpusClass);
    conf.set("Mavuno.ContextToPattern.ExtractorClass", extractorClass);
    conf.set("Mavuno.ContextToPattern.ExtractorArgs", extractorArgs);
    conf.setInt("Mavuno.ContextToPattern.MinMatches", minMatches);
    conf.setBoolean("Mavuno.ContextToPattern.GlobalStats", harvestGlobalStats);
    conf.set("Mavuno.ContextToPattern.OutputPath", outputPath);
    new ContextToPattern(conf).run();

    // pattern to context
    conf.set("Mavuno.PatternToContext.PatternPath", outputPath + "/pattern-stats");
    conf.set("Mavuno.PatternToContext.CorpusPath", corpusPath);
    conf.set("Mavuno.PatternToContext.CorpusClass", corpusClass);
    conf.set("Mavuno.PatternToContext.ExtractorClass", extractorClass);
    conf.set("Mavuno.PatternToContext.ExtractorArgs", extractorArgs);
    conf.setInt("Mavuno.PatternToContext.MinMatches", minMatches);
    conf.setBoolean("Mavuno.PatternToContext.GlobalStats", harvestGlobalStats);
    conf.set("Mavuno.PatternToContext.OutputPath", outputPath);
    new PatternToContext(conf).run();

    return 0;
}
From source file:edu.isi.mavuno.app.distsim.PatternToPattern.java
License:Apache License
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String patternPath = MavunoUtils.getRequiredParam("Mavuno.PatternToPattern.PatternPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.PatternToPattern.CorpusClass", conf);
    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.PatternToPattern.CorpusPath", conf);
    String extractorClass = MavunoUtils.getRequiredParam("Mavuno.PatternToPattern.ExtractorClass", conf);
    String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.PatternToPattern.ExtractorArgs", conf);
    int minMatches = Integer.parseInt(MavunoUtils.getRequiredParam("Mavuno.PatternToPattern.MinMatches", conf));
    boolean harvestGlobalStats = Boolean
            .parseBoolean(MavunoUtils.getRequiredParam("Mavuno.PatternToPattern.GlobalStats", conf));
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.PatternToPattern.OutputPath", conf);

    MavunoUtils.createDirectory(conf, outputPath);

    sLogger.info("Tool name: PatternToPattern");
    sLogger.info(" - Pattern path: " + patternPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Output path: " + outputPath);
    sLogger.info(" - Context class: " + extractorClass);
    sLogger.info(" - Context arguments: " + extractorArgs);
    sLogger.info(" - Min matches: " + minMatches);
    sLogger.info(" - Harvest global stats: " + harvestGlobalStats);

    // pattern to context
    conf.set("Mavuno.PatternToContext.PatternPath", patternPath);
    conf.set("Mavuno.PatternToContext.CorpusPath", corpusPath);
    conf.set("Mavuno.PatternToContext.CorpusClass", corpusClass);
    conf.set("Mavuno.PatternToContext.ExtractorClass", extractorClass);
    conf.set("Mavuno.PatternToContext.ExtractorArgs", extractorArgs);
    conf.setInt("Mavuno.PatternToContext.MinMatches", minMatches);
    conf.setBoolean("Mavuno.PatternToContext.GlobalStats", harvestGlobalStats);
    conf.set("Mavuno.PatternToContext.OutputPath", outputPath);
    new PatternToContext(conf).run();

    // context to pattern
    conf.set("Mavuno.ContextToPattern.ContextPath", outputPath + "/context-stats");
    conf.set("Mavuno.ContextToPattern.CorpusPath", corpusPath);
    conf.set("Mavuno.ContextToPattern.CorpusClass", corpusClass);
    conf.set("Mavuno.ContextToPattern.ExtractorClass", extractorClass);
    conf.set("Mavuno.ContextToPattern.ExtractorArgs", extractorArgs);
    conf.setInt("Mavuno.ContextToPattern.MinMatches", minMatches);
    conf.setBoolean("Mavuno.ContextToPattern.GlobalStats", harvestGlobalStats);
    conf.set("Mavuno.ContextToPattern.OutputPath", outputPath);
    new ContextToPattern(conf).run();

    return 0;
}
From source file:edu.isi.mavuno.app.ie.HarvestEspressoContexts.java
License:Apache License
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestEspressoContexts.InputPath", conf);
    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.HarvestEspressoContexts.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.HarvestEspressoContexts.CorpusClass", conf);
    String extractorClass = MavunoUtils.getRequiredParam("Mavuno.HarvestEspressoContexts.ExtractorClass", conf);
    String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.HarvestEspressoContexts.ExtractorArgs", conf);
    String scorerClass = MavunoUtils.getRequiredParam("Mavuno.HarvestEspressoContexts.ScorerClass", conf);
    String scorerArgs = MavunoUtils.getRequiredParam("Mavuno.HarvestEspressoContexts.ScorerArgs", conf);
    int numPatterns = Integer
            .parseInt(MavunoUtils.getRequiredParam("Mavuno.HarvestEspressoContexts.NumPatterns", conf));
    int minMatches = Integer
            .parseInt(MavunoUtils.getRequiredParam("Mavuno.HarvestEspressoContexts.MinMatches", conf));
    String baseOutputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestEspressoContexts.OutputPath", conf);

    String numIterations = MavunoUtils.getOptionalParam("Mavuno.HarvestEspressoContexts.NumIterations", conf);
    int iterations = 1;
    if (numIterations != null) {
        iterations = Integer.parseInt(numIterations);
    }

    MavunoUtils.createDirectory(conf, baseOutputPath);

    sLogger.info("Tool name: HarvestEspressoContexts");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Extractor class: " + extractorClass);
    sLogger.info(" - Extractor args: " + extractorArgs);
    sLogger.info(" - Scorer class: " + scorerClass);
    sLogger.info(" - Scorer args: " + scorerArgs);
    sLogger.info(" - Number of patterns: " + numPatterns);
    sLogger.info(" - Minimum matches: " + minMatches);
    sLogger.info(" - Iterations: " + iterations);
    sLogger.info(" - Output path: " + baseOutputPath);

    // initial sub output path
    MavunoUtils.createDirectory(conf, baseOutputPath + "/0");
    MavunoUtils.createDirectory(conf, baseOutputPath + "/0/contexts-scored");

    // examples -> sequence file
    conf.set("Mavuno.ExamplesToSequenceFile.InputPath", inputPath);
    conf.set("Mavuno.ExamplesToSequenceFile.OutputPath",
            baseOutputPath + "/0/contexts-scored/scored-contexts-raw");
    new ExamplesToSequenceFile(conf).run();

    // iterate procedure
    for (int i = 1; i <= iterations; i++) {
        // previous output path (input to current iteration)
        String prevOutputPath = baseOutputPath + "/" + (i - 1);

        // current output path
        String curOutputPath = baseOutputPath + "/" + i;
        MavunoUtils.createDirectory(conf, curOutputPath);

        // seeds -> patterns
        conf.set("Mavuno.ContextToPattern.ContextPath", prevOutputPath + "/contexts-scored/scored-contexts-raw");
        conf.set("Mavuno.ContextToPattern.CorpusPath", corpusPath);
        conf.set("Mavuno.ContextToPattern.CorpusClass", corpusClass);
        conf.set("Mavuno.ContextToPattern.ExtractorClass", extractorClass);
        conf.set("Mavuno.ContextToPattern.ExtractorArgs", extractorArgs);
        conf.setInt("Mavuno.ContextToPattern.MinMatches", minMatches);
        conf.setBoolean("Mavuno.ContextToPattern.GlobalStats", true);
        conf.set("Mavuno.ContextToPattern.OutputPath", curOutputPath + "/patterns");
        new ContextToPattern(conf).run();

        // score patterns
        conf.set("Mavuno.ComputePatternScores.InputPath", curOutputPath + "/patterns");
        conf.set("Mavuno.ComputePatternScores.ContextScorerClass", null);
        conf.set("Mavuno.ComputePatternScores.PatternScorerClass", scorerClass);
        conf.set("Mavuno.ComputePatternScores.PatternScorerArgs", scorerArgs);
        conf.set("Mavuno.ComputePatternScores.OutputPath", curOutputPath + "/patterns-scored");
        new ComputePatternScores(conf).run();

        // only retain top-(k * i) patterns
        if (numPatterns >= 0) {
            conf.set("Mavuno.GetTopResults.InputPath", curOutputPath + "/patterns-scored/scored-patterns");
            conf.set("Mavuno.GetTopResults.OutputPath", curOutputPath + "/patterns-scored-top");
            conf.setInt("Mavuno.GetTopResults.NumResults", numPatterns * i);
            conf.setBoolean("Mavuno.GetTopResults.SequenceFileOutputFormat", true);
            new GetTopResults(conf).run();
        }

        // patterns -> contexts
        if (numPatterns >= 0) {
            conf.set("Mavuno.PatternToContext.PatternPath", curOutputPath + "/patterns-scored-top");
        } else {
            conf.set("Mavuno.PatternToContext.PatternPath", curOutputPath + "/patterns-scored/scored-patterns-raw");
        }
        conf.set("Mavuno.PatternToContext.CorpusPath", corpusPath);
        conf.set("Mavuno.PatternToContext.CorpusClass", corpusClass);
        conf.set("Mavuno.PatternToContext.ExtractorClass", extractorClass);
        conf.set("Mavuno.PatternToContext.ExtractorArgs", extractorArgs);
        conf.setInt("Mavuno.PatternToContext.MinMatches", minMatches);
        conf.setBoolean("Mavuno.PatternToContext.GlobalStats", true);
        conf.set("Mavuno.PatternToContext.OutputPath", curOutputPath + "/contexts");
        new PatternToContext(conf).run();

        // score contexts
        conf.set("Mavuno.ComputeContextScores.InputPath", curOutputPath + "/contexts");
        conf.set("Mavuno.ComputeContextScores.PatternScorerClass", null);
        conf.set("Mavuno.ComputeContextScores.ContextScorerClass", scorerClass);
        conf.set("Mavuno.ComputeContextScores.ContextScorerArgs", scorerArgs);
        conf.set("Mavuno.ComputeContextScores.OutputPath", curOutputPath + "/contexts-scored");
        new ComputeContextScores(conf).run();

        // delete previous output path
        MavunoUtils.removeDirectory(conf, prevOutputPath);
    }

    return 0;
}
From source file:edu.isi.mavuno.app.ie.HarvestEspressoPatterns.java
License:Apache License
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestEspressoPatterns.InputPath", conf);
    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.HarvestEspressoPatterns.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.HarvestEspressoPatterns.CorpusClass", conf);
    String extractorClass = MavunoUtils.getRequiredParam("Mavuno.HarvestEspressoPatterns.ExtractorClass", conf);
    String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.HarvestEspressoPatterns.ExtractorArgs", conf);
    String scorerClass = MavunoUtils.getRequiredParam("Mavuno.HarvestEspressoPatterns.ScorerClass", conf);
    String scorerArgs = MavunoUtils.getRequiredParam("Mavuno.HarvestEspressoPatterns.ScorerArgs", conf);
    int numContexts = Integer
            .parseInt(MavunoUtils.getRequiredParam("Mavuno.HarvestEspressoPatterns.NumContexts", conf));
    int minMatches = Integer
            .parseInt(MavunoUtils.getRequiredParam("Mavuno.HarvestEspressoPatterns.MinMatches", conf));
    String baseOutputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestEspressoPatterns.OutputPath", conf);

    String numIterations = MavunoUtils.getOptionalParam("Mavuno.HarvestEspressoPatterns.NumIterations", conf);
    int iterations = 1;
    if (numIterations != null) {
        iterations = Integer.parseInt(numIterations);
    }

    MavunoUtils.createDirectory(conf, baseOutputPath);

    sLogger.info("Tool name: HarvestEspressoPatterns");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Extractor class: " + extractorClass);
    sLogger.info(" - Extractor args: " + extractorArgs);
    sLogger.info(" - Scorer class: " + scorerClass);
    sLogger.info(" - Scorer args: " + scorerArgs);
    sLogger.info(" - Number of contexts: " + numContexts);
    sLogger.info(" - Minimum matches: " + minMatches);
    sLogger.info(" - Iterations: " + iterations);
    sLogger.info(" - Output path: " + baseOutputPath);

    // initial sub output path
    MavunoUtils.createDirectory(conf, baseOutputPath + "/0");
    MavunoUtils.createDirectory(conf, baseOutputPath + "/0/patterns-scored");

    // patterns -> sequence file
    conf.set("Mavuno.ExamplesToSequenceFile.InputPath", inputPath);
    conf.set("Mavuno.ExamplesToSequenceFile.OutputPath",
            baseOutputPath + "/0/patterns-scored/scored-patterns-raw");
    new ExamplesToSequenceFile(conf).run();

    // iterate procedure
    for (int i = 1; i <= iterations; i++) {
        // previous output path (input to current iteration)
        String prevOutputPath = baseOutputPath + "/" + (i - 1);

        // current output path
        String curOutputPath = baseOutputPath + "/" + i;
        MavunoUtils.createDirectory(conf, curOutputPath);

        // seeds -> contexts
        conf.set("Mavuno.PatternToContext.PatternPath", prevOutputPath + "/patterns-scored/scored-patterns-raw");
        conf.set("Mavuno.PatternToContext.CorpusPath", corpusPath);
        conf.set("Mavuno.PatternToContext.CorpusClass", corpusClass);
        conf.set("Mavuno.PatternToContext.ExtractorClass", extractorClass);
        conf.set("Mavuno.PatternToContext.ExtractorArgs", extractorArgs);
        conf.setInt("Mavuno.PatternToContext.MinMatches", minMatches);
        conf.setBoolean("Mavuno.PatternToContext.GlobalStats", true);
        conf.set("Mavuno.PatternToContext.OutputPath", curOutputPath + "/contexts");
        new PatternToContext(conf).run();

        // score contexts
        conf.set("Mavuno.ComputeContextScores.InputPath", curOutputPath + "/contexts");
        conf.set("Mavuno.ComputeContextScores.PatternScorerClass", null);
        conf.set("Mavuno.ComputeContextScores.ContextScorerClass", scorerClass);
        conf.set("Mavuno.ComputeContextScores.ContextScorerArgs", scorerArgs);
        conf.set("Mavuno.ComputeContextScores.OutputPath", curOutputPath + "/contexts-scored");
        new ComputeContextScores(conf).run();

        // only retain top-(k * i) contexts
        if (numContexts >= 0) {
            conf.set("Mavuno.GetTopResults.InputPath", curOutputPath + "/contexts-scored/scored-contexts");
            conf.set("Mavuno.GetTopResults.OutputPath", curOutputPath + "/contexts-scored-top");
            conf.setInt("Mavuno.GetTopResults.NumResults", numContexts * i);
            conf.setBoolean("Mavuno.GetTopResults.SequenceFileOutputFormat", true);
            new GetTopResults(conf).run();
        }

        // contexts -> patterns
        if (numContexts >= 0) {
            conf.set("Mavuno.ContextToPattern.ContextPath", curOutputPath + "/contexts-scored-top");
        } else {
            conf.set("Mavuno.ContextToPattern.ContextPath", curOutputPath + "/contexts-scored/scored-contexts-raw");
        }
        conf.set("Mavuno.ContextToPattern.CorpusPath", corpusPath);
        conf.set("Mavuno.ContextToPattern.CorpusClass", corpusClass);
        conf.set("Mavuno.ContextToPattern.ExtractorClass", extractorClass);
        conf.set("Mavuno.ContextToPattern.ExtractorArgs", extractorArgs);
        conf.setInt("Mavuno.ContextToPattern.MinMatches", minMatches);
        conf.setBoolean("Mavuno.ContextToPattern.GlobalStats", true);
        conf.set("Mavuno.ContextToPattern.OutputPath", curOutputPath + "/patterns");
        new ContextToPattern(conf).run();

        // score patterns
        conf.set("Mavuno.ComputePatternScores.InputPath", curOutputPath + "/patterns");
        conf.set("Mavuno.ComputePatternScores.ContextScorerClass", null);
        conf.set("Mavuno.ComputePatternScores.PatternScorerClass", scorerClass);
        conf.set("Mavuno.ComputePatternScores.PatternScorerArgs", scorerArgs);
        conf.set("Mavuno.ComputePatternScores.OutputPath", curOutputPath + "/patterns-scored");
        new ComputePatternScores(conf).run();

        // delete previous output path
        MavunoUtils.removeDirectory(conf, prevOutputPath);
    }

    return 0;
}