List of usage examples for org.apache.hadoop.conf Configuration setInt
public void setInt(String name, int value)
Set the value of the name property to an int.
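Before the source-file examples below, here is a minimal, self-contained sketch of the call itself; the property key example.retry.count is made up for illustration and is not a real Hadoop property. setInt stores the integer under the given name, and getInt reads it back, returning the supplied default when the key is absent:

import org.apache.hadoop.conf.Configuration;

public class SetIntExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // Store an int under a property name (hypothetical key, for illustration only).
        conf.setInt("example.retry.count", 5);

        // Read it back; the second argument is the default used when the key is absent.
        int retries = conf.getInt("example.retry.count", 3);
        int missing = conf.getInt("example.unset.key", 3);

        System.out.println(retries); // 5
        System.out.println(missing); // 3
    }
}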
From source file:com.datasalt.utils.mapred.counter.TestMapRedCounter.java
License:Apache License
public void testWithMinimumCountOtherThan1(boolean withCombiner)
        throws IOException, InterruptedException, ClassNotFoundException, CloneNotSupportedException {
    Configuration conf = BaseConfigurationFactory.getInstance().getConf();
    Job job;
    /*
     * Set minimum count
     */
    conf.setInt(MapRedCounter.MINIMUM_COUNT_FOR_GROUP_CONF_PREFIX + "0", 2);
    conf.setInt(MapRedCounter.MINIMUM_COUNT_FOR_GROUP_CONF_PREFIX + "1", 2);
    conf.setInt(MapRedCounter.MINIMUM_COUNT_FOR_GROUP_CONF_PREFIX + "2", 2);
    if (withCombiner) {
        job = MapRedCounter.buildMapRedCounterJob("counter", SequenceFileOutputFormat.class, OUTPUT_COUNT, conf);
    } else {
        job = MapRedCounter.buildMapRedCounterJobWithoutCombiner("counter", SequenceFileOutputFormat.class,
                OUTPUT_COUNT, conf);
    }
    MapRedCounter.addInput(job, new Path(SINGLE_LINE_FILE), TextInputFormat.class, TestMapper.class);
    job.waitForCompletion(true);

    HashMap<String, Long> itemCount = itemCountAsMap(getFs(),
            OUTPUT_COUNT + "/" + MapRedCounter.Outputs.COUNTFILE + "/part-r-00000");
    HashMap<String, LongPairWritable> itemGroupCount = itemGroupCountAsMap(getFs(),
            OUTPUT_COUNT + "/" + MapRedCounter.Outputs.COUNTDISTINCTFILE + "/part-r-00000");

    assertCount(2, "2:c6d3:c", itemCount);
    assertCount(2, "2:c6d3:b", itemCount);
    assertCount(2, "2:c6d3:a", itemCount);
    assertCount(2, "1:c3d2:a", itemCount);
    assertCount(2, "1:c2d1:a", itemCount);

    assertGroupCount(6, 3, "2:c6d3", itemGroupCount);
    assertGroupCount(2, 1, "1:c3d2", itemGroupCount);
    assertGroupCount(2, 1, "1:c2d1", itemGroupCount);
}
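The driver above stores the thresholds with setInt; the matching read happens on the task side through the task's Configuration. The following is a minimal sketch of that read-back pattern, assuming the standard new-API Mapper; the class name, the key prefix constant, and the default value are hypothetical stand-ins for the project's actual TestMapper and MapRedCounter.MINIMUM_COUNT_FOR_GROUP_CONF_PREFIX:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Illustrative mapper: reads back a per-group minimum count that the driver stored with setInt.
public class MinimumCountAwareMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

    // Hypothetical key prefix standing in for MapRedCounter.MINIMUM_COUNT_FOR_GROUP_CONF_PREFIX.
    private static final String MINIMUM_COUNT_PREFIX = "mapred.counter.minimum.count.group.";

    private int minimumCountGroup0;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        // Same key layout the driver used; 1 is just a fallback default for this sketch.
        minimumCountGroup0 = conf.getInt(MINIMUM_COUNT_PREFIX + "0", 1);
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // ... use minimumCountGroup0 when deciding what to emit ...
    }
}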
From source file:com.datasalt.utils.mapred.crossproduct.TestCrossProductMapRed.java
License:Apache License
public void test(boolean twoSteps) throws Exception {
    createFirstDataSet();
    createSecondDataSet();
    Configuration conf = getConf();
    FileSystem fS = FileSystem.get(conf);
    if (twoSteps) {
        /*
         * Here we are saying that 1-list elements is enough to collapse the JVM's heap - just for testing
         */
        conf.setInt(CrossProductMapRed.SPLIT_DATASET_SIZE_CONF, 1);
    }
    CrossProductMapRed crossProduct = new CrossProductMapRed("Test", conf);
    crossProduct.setLeftInputPath(new Path(INPUT_1));
    crossProduct.setLeftInputFormat(TextInputFormat.class);
    crossProduct.setLeftInputMapper(Map.class);
    crossProduct.setRightInputPath(new Path(INPUT_2));
    crossProduct.setRightInputFormat(TextInputFormat.class);
    crossProduct.setRightInputMapper(Map.class);
    crossProduct.setOutputPath(new Path(OUTPUT));
    crossProduct.setOutputFormat(SequenceFileOutputFormat.class);
    crossProduct.memoryAwareRun();

    SequenceFile.Reader reader;
    CrossProductExtraKey groupKey = new CrossProductExtraKey();
    CrossProductPair data = new CrossProductPair();
    Text txt = new Text();
    Text txt2 = new Text();

    if (twoSteps) {
        reader = new SequenceFile.Reader(fS, new Path(OUTPUT, "EXTRA" + "/" + "part-r-00000"), conf);
        /*
         * Assert intermediate "big groups" output
         */
        for (int i = 0; i < 9; i++) {
            reader.next(groupKey);
            reader.getCurrentValue(data);
            if (i < 3) {
                ser.deser(txt, data.getRight());
                switch (i) {
                case 0:
                    assertEquals(txt.toString(), "pere");
                    break;
                case 1:
                    assertEquals(txt.toString(), "eric");
                    break;
                case 2:
                    assertEquals(txt.toString(), "ivan");
                    break;
                }
            } else {
                ser.deser(txt, data.getLeft());
                switch (i) {
                case 3:
                    assertEquals(txt.toString(), "beer");
                    break;
                case 4:
                    assertEquals(txt.toString(), "beer");
                    break;
                case 5:
                    assertEquals(txt.toString(), "beer");
                    break;
                case 6:
                    assertEquals(txt.toString(), "wine");
                    break;
                case 7:
                    assertEquals(txt.toString(), "wine");
                    break;
                case 8:
                    assertEquals(txt.toString(), "wine");
                    break;
                }
            }
        }
        reader.close();
    }

    /*
     * Assert final output
     */
    Counter count = Counter.createWithDistinctElements();
    Path finalOutput = new Path(OUTPUT, "part-r-00000");
    if (twoSteps) {
        finalOutput = new Path(crossProduct.getBigGroupsOutput(), "part-r-00000");
    }
    reader = new SequenceFile.Reader(fS, finalOutput, conf);
    for (int i = 0; i < 6; i++) {
        reader.next(data);
        ser.deser(txt, data.getLeft());
        ser.deser(txt2, data.getRight());
        count.in(txt.toString()).count(txt2.toString());
    }
    Count counts = count.getCounts();
    List<String> beerResults = counts.get("beer").getDistinctListAsStringList();
    List<String> wineResults = counts.get("wine").getDistinctListAsStringList();
    for (List<String> list : new List[] { beerResults, wineResults }) {
        assertEquals(list.contains("pere"), true);
        assertEquals(list.contains("ivan"), true);
        assertEquals(list.contains("eric"), true);
    }
    HadoopUtils.deleteIfExists(fS, new Path(INPUT_1));
    HadoopUtils.deleteIfExists(fS, new Path(INPUT_2));
    HadoopUtils.deleteIfExists(fS, new Path(OUTPUT));
    if (twoSteps) {
        HadoopUtils.deleteIfExists(fS, crossProduct.getBigGroupsOutput());
    }
}
From source file:com.datatorrent.contrib.frauddetect.FrauddetectApplicationTest.java
License:Open Source License
@Test
public void testApplication() throws Exception {
    Application application = new Application();
    Configuration conf = new Configuration(false);
    conf.set(Application.MONGO_HOST_PROPERTY, "localhost");
    conf.set(Application.MONGO_HOST_PROPERTY, "localhost");
    conf.setInt(Application.BIN_THRESHOLD_PROPERTY, 20);
    conf.setInt(Application.AVG_THRESHOLD_PROPERTY, 1200);
    conf.setInt(Application.CC_THRESHOLD_PROPERTY, 420);
    LocalMode lma = LocalMode.newInstance();
    application.populateDAG(lma.getDAG(), conf);
    lma.getController().run(120000);
}
From source file:com.datatorrent.demos.mroperator.WordCountMRApplicationTest.java
License:Open Source License
@Test
public void testSomeMethod() throws Exception {
    LocalMode lma = LocalMode.newInstance();
    Configuration conf = new Configuration(false);
    conf.set("dt.application.WordCountDemo.operator.Mapper.dirName", testMeta.testDir);
    conf.setInt("dt.application.WordCountDemo.operator.Mapper.partitionCount", 1);
    conf.set("dt.application.WordCountDemo.operator.Console.filePath", testMeta.testDir);
    conf.set("dt.application.WordCountDemo.operator.Console.outputFileName", "output.txt");

    lma.prepareDAG(new NewWordCountApplication(), conf);
    LocalMode.Controller lc = lma.getController();
    lc.setHeartbeatMonitoringEnabled(false);
    lc.run(5000);
    lc.shutdown();

    List<String> readLines = FileUtils.readLines(new File(testMeta.testDir + "/output.txt"));
    Map<String, Integer> readMap = Maps.newHashMap();
    Iterator<String> itr = readLines.iterator();
    while (itr.hasNext()) {
        String[] splits = itr.next().split("=");
        readMap.put(splits[0], Integer.valueOf(splits[1]));
    }
    Map<String, Integer> expectedMap = Maps.newHashMap();
    expectedMap.put("1", 2);
    expectedMap.put("2", 2);
    expectedMap.put("3", 2);
    Assert.assertEquals("expected reduced data ", expectedMap, readMap);
    LOG.info("read lines {}", readLines);
}
From source file:com.datatorrent.stram.client.StramClientUtils.java
License:Apache License
public static Configuration addDTSiteResources(Configuration conf) {
    addDTLocalResources(conf);
    FileSystem fs = null;
    File targetGlobalFile;
    try {
        fs = newFileSystemInstance(conf);
        // after getting the dfsRootDirectory config parameter, redo the entire process with the global config
        // load global settings from DFS
        targetGlobalFile = new File(String.format("/tmp/dt-site-global-%s.xml",
                UserGroupInformation.getLoginUser().getShortUserName()));
        org.apache.hadoop.fs.Path hdfsGlobalPath = new org.apache.hadoop.fs.Path(
                StramClientUtils.getDTDFSConfigDir(fs, conf), StramClientUtils.DT_SITE_GLOBAL_XML_FILE);
        LOG.debug("Copying global dt-site.xml from {} to {}", hdfsGlobalPath, targetGlobalFile.getAbsolutePath());
        fs.copyToLocalFile(hdfsGlobalPath, new org.apache.hadoop.fs.Path(targetGlobalFile.toURI()));
        addDTSiteResources(conf, targetGlobalFile);
        if (!isDevelopmentMode()) {
            // load node local config file
            addDTSiteResources(conf, new File(StramClientUtils.getConfigDir(), StramClientUtils.DT_SITE_XML_FILE));
        }
        // load user config file
        addDTSiteResources(conf, new File(StramClientUtils.getUserDTDirectory(), StramClientUtils.DT_SITE_XML_FILE));
    } catch (IOException ex) {
        // ignore
        LOG.debug("Caught exception when loading configuration: {}: moving on...", ex.getMessage());
    } finally {
        // Cannot delete the file here because addDTSiteResource, which eventually calls Configuration.reloadConfiguration,
        // does not actually reload the configuration. The file is actually read later and it needs to exist.
        //
        //if (targetGlobalFile != null) {
        //  targetGlobalFile.delete();
        //}
        IOUtils.closeQuietly(fs);
    }

    // Validate loggers-level settings
    String loggersLevel = conf.get(DTLoggerFactory.DT_LOGGERS_LEVEL);
    if (loggersLevel != null) {
        String targets[] = loggersLevel.split(",");
        Preconditions.checkArgument(targets.length > 0, "zero loggers level");
        for (String target : targets) {
            String parts[] = target.split(":");
            Preconditions.checkArgument(parts.length == 2, "incorrect " + target);
            Preconditions.checkArgument(ConfigValidator.validateLoggersLevel(parts[0], parts[1]),
                    "incorrect " + target);
        }
    }
    convertDeprecatedProperties(conf);

    //
    // The ridiculous default RESOURCEMANAGER_CONNECT_MAX_WAIT_MS from hadoop is 15 minutes (!!!!), which actually
    // translates to 20 minutes with the connect interval. That means if there is anything wrong with YARN or if
    // YARN is not running, the caller has to wait for up to 20 minutes until it gets an error.
    // We are overriding this to be 10 seconds maximum.
    //
    int rmConnectMaxWait = conf.getInt(YarnConfiguration.RESOURCEMANAGER_CONNECT_MAX_WAIT_MS,
            YarnConfiguration.DEFAULT_RESOURCEMANAGER_CONNECT_MAX_WAIT_MS);
    if (rmConnectMaxWait > RESOURCEMANAGER_CONNECT_MAX_WAIT_MS_OVERRIDE) {
        LOG.info("Overriding {} assigned value of {} to {} because the assigned value is too big.",
                YarnConfiguration.RESOURCEMANAGER_CONNECT_MAX_WAIT_MS, rmConnectMaxWait,
                RESOURCEMANAGER_CONNECT_MAX_WAIT_MS_OVERRIDE);
        conf.setInt(YarnConfiguration.RESOURCEMANAGER_CONNECT_MAX_WAIT_MS, RESOURCEMANAGER_CONNECT_MAX_WAIT_MS_OVERRIDE);
        int rmConnectRetryInterval = conf.getInt(YarnConfiguration.RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS,
                YarnConfiguration.DEFAULT_RESOURCEMANAGER_CONNECT_MAX_WAIT_MS);
        int defaultRetryInterval = Math.max(500, RESOURCEMANAGER_CONNECT_MAX_WAIT_MS_OVERRIDE / 5);
        if (rmConnectRetryInterval > defaultRetryInterval) {
            LOG.info("Overriding {} assigned value of {} to {} because the assigned value is too big.",
                    YarnConfiguration.RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS, rmConnectRetryInterval,
                    defaultRetryInterval);
            conf.setInt(YarnConfiguration.RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS, defaultRetryInterval);
        }
    }
    LOG.info(" conf object in stramclient {}", conf);
    return conf;
}
From source file:com.elex.dmp.vectorizer.DictionaryVectorizer.java
License:Apache License
/**
 * Create a partial vector using a chunk of features from the input documents. The input documents have to be
 * in the {@link SequenceFile} format.
 *
 * @param input
 *          input directory of the documents in {@link SequenceFile} format
 * @param baseConf
 *          job configuration
 * @param maxNGramSize
 *          maximum size of ngrams to generate
 * @param dictionaryFilePath
 *          location of the chunk of features and the id's
 * @param output
 *          output directory where the partial vectors have to be created
 * @param dimension
 * @param sequentialAccess
 *          output vectors should be optimized for sequential access
 * @param namedVectors
 *          output vectors should be named, retaining key (doc id) as a label
 * @param numReducers
 *          the desired number of reducer tasks
 */
private static void makePartialVectors(Path input, Configuration baseConf, int maxNGramSize,
        Path dictionaryFilePath, Path output, int dimension, boolean sequentialAccess, boolean namedVectors,
        int numReducers) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt(PartialVectorMerger.DIMENSION, dimension);
    conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
    conf.setBoolean(PartialVectorMerger.NAMED_VECTOR, namedVectors);
    conf.setInt(MAX_NGRAMS, maxNGramSize);
    DistributedCache.setCacheFiles(new URI[] { dictionaryFilePath.toUri() }, conf);

    Job job = new Job(conf);
    job.setJobName("DictionaryVectorizer::MakePartialVectors: input-folder: " + input
            + ", dictionary-file: " + dictionaryFilePath);
    job.setJarByClass(DictionaryVectorizer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(StringTuple.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setReducerClass(TFPartialVectorReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(numReducers);

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
From source file:com.elex.dmp.vectorizer.DictionaryVectorizer.java
License:Apache License
/**
 * Count the frequencies of words in parallel using Map/Reduce. The input documents have to be in
 * {@link SequenceFile} format.
 */
private static void startWordCounting(Path input, Path output, Configuration baseConf, int minSupport)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt(MIN_SUPPORT, minSupport);

    Job job = new Job(conf);
    job.setJobName("DictionaryVectorizer::WordCount: input-folder: " + input);
    job.setJarByClass(DictionaryVectorizer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(TermCountMapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setCombinerClass(TermCountCombiner.class);
    job.setReducerClass(TermCountReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
From source file:com.elex.dmp.vectorizer.FixDictionaryVectorizer.java
License:Apache License
/**
 * Create a partial vector using a chunk of features from the input documents. The input documents have to be
 * in the {@link SequenceFile} format.
 *
 * @param input
 *          input directory of the documents in {@link SequenceFile} format
 * @param baseConf
 *          job configuration
 * @param maxNGramSize
 *          maximum size of ngrams to generate
 * @param dictionaryFilePath
 *          location of the chunk of features and the id's
 * @param output
 *          output directory where the partial vectors have to be created
 * @param dimension
 * @param sequentialAccess
 *          output vectors should be optimized for sequential access
 * @param namedVectors
 *          output vectors should be named, retaining key (doc id) as a label
 * @param numReducers
 *          the desired number of reducer tasks
 */
private static void makePartialVectors(Path input, Configuration baseConf, int maxNGramSize,
        Path dictionaryFilePath, Path output, int dimension, boolean sequentialAccess, boolean namedVectors,
        int numReducers) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt(PartialVectorMerger.DIMENSION, dimension);
    conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
    conf.setBoolean(PartialVectorMerger.NAMED_VECTOR, namedVectors);
    conf.setInt(MAX_NGRAMS, maxNGramSize);
    DistributedCache.setCacheFiles(new URI[] { dictionaryFilePath.toUri() }, conf);

    Job job = new Job(conf);
    job.setJobName("DictionaryVectorizer::MakePartialVectors: input-folder: " + input
            + ", dictionary-file: " + dictionaryFilePath);
    job.setJarByClass(FixDictionaryVectorizer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(StringTuple.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setReducerClass(TFPartialVectorReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(numReducers);

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
From source file:com.elex.dmp.vectorizer.FixDictionaryVectorizer.java
License:Apache License
/**
 * Count the frequencies of words in parallel using Map/Reduce. The input documents have to be in
 * {@link SequenceFile} format.
 */
private static void startWordCounting(Path input, Path output, Configuration baseConf, int minSupport)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt(MIN_SUPPORT, minSupport);

    Job job = new Job(conf);
    job.setJobName("DictionaryVectorizer::WordCount: input-folder: " + input);
    job.setJarByClass(FixDictionaryVectorizer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(TermCountMapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setCombinerClass(TermCountCombiner.class);
    job.setReducerClass(TermCountReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
From source file:com.ery.hadoop.mrddx.client.MRJOBClient.java
@Override
public void run(Map<String, String> paramMap) throws Exception {
    // license
    // License.checkLicense();

    // Copy the supplied parameters into the MR job Configuration.
    Configuration conf = new Configuration();
    for (String key : paramMap.keySet()) {
        String value = paramMap.get(key);
        if (null != value) {
            // Unescape literal "\n" and "\r" sequences in the value.
            value = value.replaceAll("\\\\n", "\n");
            value = value.replaceAll("\\\\r", "\r");
            conf.set(key, value);
            paramMap.put(key, value);
        }
    }
    String debug = paramMap.get(MRConfiguration.INTERNAL_JOB_LOG_DEBUG);
    if (null != debug) {
        String rownum = paramMap.get(MRConfiguration.INTERNAL_JOB_LOG_DEBUG_ROWNUM);
        conf.setInt(MRConfiguration.INTERNAL_JOB_LOG_DEBUG, Integer.parseInt(debug));
        conf.setInt(MRConfiguration.INTERNAL_JOB_LOG_DEBUG_ROWNUM, Integer.parseInt(rownum));
    }
    this.printParameter(paramMap);

    MRJOBService mrJobService = new MRJOBService();
    Job job = Job.getInstance(conf);
    job.setJarByClass(MRJOBService.class);
    mrJobService.run(paramMap, job);

    // if (mrJobService.isJobRun(conf)) {
    // } else {
    //     JobConf jobConf = new JobConf(conf, MRJOBService.class);
    //     mrJobService.run(paramMap, jobConf);
    // }
}