Example usage for org.apache.hadoop.conf Configuration setInt

List of usage examples for org.apache.hadoop.conf Configuration setInt

Introduction

On this page you can find example usage for org.apache.hadoop.conf Configuration setInt.

Prototype

public void setInt(String name, int value) 

Document

Set the value of the name property to an int.
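
Before the project examples below, here is a minimal, self-contained sketch showing a typical setInt call paired with the matching getInt read-back; the property name "my.app.retry.count" and its values are made up purely for illustration.

import org.apache.hadoop.conf.Configuration;

public class SetIntExample {
    public static void main(String[] args) {
        // Start from an empty configuration (no default resources loaded).
        Configuration conf = new Configuration(false);
        // Store an int under a property name; Configuration keeps it as a string internally.
        conf.setInt("my.app.retry.count", 3);
        // Read it back; the second argument is the default returned when the property is unset.
        int retries = conf.getInt("my.app.retry.count", 1);
        System.out.println("retries = " + retries); // prints: retries = 3
    }
}

The project examples below follow the same pattern: integer job parameters are set on a Configuration before a Job or application is built from it.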

Usage

From source file:com.datasalt.utils.mapred.counter.TestMapRedCounter.java

License:Apache License

public void testWithMinimumCountOtherThan1(boolean withCombiner)
        throws IOException, InterruptedException, ClassNotFoundException, CloneNotSupportedException {
    Configuration conf = BaseConfigurationFactory.getInstance().getConf();
    Job job;

    /*
     * Set minimum count
     */
    conf.setInt(MapRedCounter.MINIMUM_COUNT_FOR_GROUP_CONF_PREFIX + "0", 2);
    conf.setInt(MapRedCounter.MINIMUM_COUNT_FOR_GROUP_CONF_PREFIX + "1", 2);
    conf.setInt(MapRedCounter.MINIMUM_COUNT_FOR_GROUP_CONF_PREFIX + "2", 2);

    if (withCombiner) {
        job = MapRedCounter.buildMapRedCounterJob("counter", SequenceFileOutputFormat.class, OUTPUT_COUNT,
                conf);
    } else {
        job = MapRedCounter.buildMapRedCounterJobWithoutCombiner("counter", SequenceFileOutputFormat.class,
                OUTPUT_COUNT, conf);
    }

    MapRedCounter.addInput(job, new Path(SINGLE_LINE_FILE), TextInputFormat.class, TestMapper.class);

    job.waitForCompletion(true);

    HashMap<String, Long> itemCount = itemCountAsMap(getFs(),
            OUTPUT_COUNT + "/" + MapRedCounter.Outputs.COUNTFILE + "/part-r-00000");
    HashMap<String, LongPairWritable> itemGroupCount = itemGroupCountAsMap(getFs(),
            OUTPUT_COUNT + "/" + MapRedCounter.Outputs.COUNTDISTINCTFILE + "/part-r-00000");

    assertCount(2, "2:c6d3:c", itemCount);
    assertCount(2, "2:c6d3:b", itemCount);
    assertCount(2, "2:c6d3:a", itemCount);

    assertCount(2, "1:c3d2:a", itemCount);

    assertCount(2, "1:c2d1:a", itemCount);

    assertGroupCount(6, 3, "2:c6d3", itemGroupCount);
    assertGroupCount(2, 1, "1:c3d2", itemGroupCount);
    assertGroupCount(2, 1, "1:c2d1", itemGroupCount);
}

From source file:com.datasalt.utils.mapred.crossproduct.TestCrossProductMapRed.java

License:Apache License

public void test(boolean twoSteps) throws Exception {
    createFirstDataSet();
    createSecondDataSet();

    Configuration conf = getConf();
    FileSystem fS = FileSystem.get(conf);

    if (twoSteps) {
        /*
         * Here we are saying that one list element is enough to fill up the JVM's heap - just for testing
         */
        conf.setInt(CrossProductMapRed.SPLIT_DATASET_SIZE_CONF, 1);
    }

    CrossProductMapRed crossProduct = new CrossProductMapRed("Test", conf);
    crossProduct.setLeftInputPath(new Path(INPUT_1));
    crossProduct.setLeftInputFormat(TextInputFormat.class);
    crossProduct.setLeftInputMapper(Map.class);

    crossProduct.setRightInputPath(new Path(INPUT_2));
    crossProduct.setRightInputFormat(TextInputFormat.class);
    crossProduct.setRightInputMapper(Map.class);

    crossProduct.setOutputPath(new Path(OUTPUT));
    crossProduct.setOutputFormat(SequenceFileOutputFormat.class);

    crossProduct.memoryAwareRun();
    SequenceFile.Reader reader;
    CrossProductExtraKey groupKey = new CrossProductExtraKey();
    CrossProductPair data = new CrossProductPair();
    Text txt = new Text();
    Text txt2 = new Text();
    if (twoSteps) {

        reader = new SequenceFile.Reader(fS, new Path(OUTPUT, "EXTRA" + "/" + "part-r-00000"), conf);

        /*
         * Assert intermediate "big groups" output
         */
        for (int i = 0; i < 9; i++) {
            reader.next(groupKey);
            reader.getCurrentValue(data);

            if (i < 3) {
                ser.deser(txt, data.getRight());
                switch (i) {
                case 0:
                    assertEquals(txt.toString(), "pere");
                    break;
                case 1:
                    assertEquals(txt.toString(), "eric");
                    break;
                case 2:
                    assertEquals(txt.toString(), "ivan");
                    break;
                }
            } else {
                ser.deser(txt, data.getLeft());
                switch (i) {
                case 3:
                    assertEquals(txt.toString(), "beer");
                    break;
                case 4:
                    assertEquals(txt.toString(), "beer");
                    break;
                case 5:
                    assertEquals(txt.toString(), "beer");
                    break;
                case 6:
                    assertEquals(txt.toString(), "wine");
                    break;
                case 7:
                    assertEquals(txt.toString(), "wine");
                    break;
                case 8:
                    assertEquals(txt.toString(), "wine");
                    break;
                }
            }
        }

        reader.close();
    }

    /*
     * Assert final output
     */

    Counter count = Counter.createWithDistinctElements();

    Path finalOutput = new Path(OUTPUT, "part-r-00000");
    if (twoSteps) {
        finalOutput = new Path(crossProduct.getBigGroupsOutput(), "part-r-00000");
    }
    reader = new SequenceFile.Reader(fS, finalOutput, conf);

    for (int i = 0; i < 6; i++) {
        reader.next(data);
        ser.deser(txt, data.getLeft());
        ser.deser(txt2, data.getRight());
        count.in(txt.toString()).count(txt2.toString());
    }

    Count counts = count.getCounts();
    List<String> beerResults = counts.get("beer").getDistinctListAsStringList();
    List<String> wineResults = counts.get("wine").getDistinctListAsStringList();
    for (List<String> list : new List[] { beerResults, wineResults }) {
        assertEquals(list.contains("pere"), true);
        assertEquals(list.contains("ivan"), true);
        assertEquals(list.contains("eric"), true);
    }

    HadoopUtils.deleteIfExists(fS, new Path(INPUT_1));
    HadoopUtils.deleteIfExists(fS, new Path(INPUT_2));
    HadoopUtils.deleteIfExists(fS, new Path(OUTPUT));
    if (twoSteps) {
        HadoopUtils.deleteIfExists(fS, crossProduct.getBigGroupsOutput());
    }
}

From source file:com.datatorrent.contrib.frauddetect.FrauddetectApplicationTest.java

License:Open Source License

@Test
public void testApplication() throws Exception {
    Application application = new Application();

    Configuration conf = new Configuration(false);
    conf.set(Application.MONGO_HOST_PROPERTY, "localhost");
    conf.setInt(Application.BIN_THRESHOLD_PROPERTY, 20);
    conf.setInt(Application.AVG_THRESHOLD_PROPERTY, 1200);
    conf.setInt(Application.CC_THRESHOLD_PROPERTY, 420);
    LocalMode lma = LocalMode.newInstance();

    application.populateDAG(lma.getDAG(), conf);
    lma.getController().run(120000);
}

From source file:com.datatorrent.demos.mroperator.WordCountMRApplicationTest.java

License:Open Source License

@Test
public void testSomeMethod() throws Exception {
    LocalMode lma = LocalMode.newInstance();
    Configuration conf = new Configuration(false);
    conf.set("dt.application.WordCountDemo.operator.Mapper.dirName", testMeta.testDir);
    conf.setInt("dt.application.WordCountDemo.operator.Mapper.partitionCount", 1);
    conf.set("dt.application.WordCountDemo.operator.Console.filePath", testMeta.testDir);
    conf.set("dt.application.WordCountDemo.operator.Console.outputFileName", "output.txt");
    lma.prepareDAG(new NewWordCountApplication(), conf);
    LocalMode.Controller lc = lma.getController();
    lc.setHeartbeatMonitoringEnabled(false);
    lc.run(5000);
    lc.shutdown();
    List<String> readLines = FileUtils.readLines(new File(testMeta.testDir + "/output.txt"));
    Map<String, Integer> readMap = Maps.newHashMap();
    Iterator<String> itr = readLines.iterator();
    while (itr.hasNext()) {
        String[] splits = itr.next().split("=");
        readMap.put(splits[0], Integer.valueOf(splits[1]));
    }
    Map<String, Integer> expectedMap = Maps.newHashMap();
    expectedMap.put("1", 2);
    expectedMap.put("2", 2);
    expectedMap.put("3", 2);
    Assert.assertEquals("expected reduced data ", expectedMap, readMap);
    LOG.info("read lines {}", readLines);
}

From source file:com.datatorrent.stram.client.StramClientUtils.java

License:Apache License

public static Configuration addDTSiteResources(Configuration conf) {
    addDTLocalResources(conf);
    FileSystem fs = null;
    File targetGlobalFile;
    try {
        fs = newFileSystemInstance(conf);
        // after getting the dfsRootDirectory config parameter, redo the entire process with the global config
        // load global settings from DFS
        targetGlobalFile = new File(String.format("/tmp/dt-site-global-%s.xml",
                UserGroupInformation.getLoginUser().getShortUserName()));
        org.apache.hadoop.fs.Path hdfsGlobalPath = new org.apache.hadoop.fs.Path(
                StramClientUtils.getDTDFSConfigDir(fs, conf), StramClientUtils.DT_SITE_GLOBAL_XML_FILE);
        LOG.debug("Copying global dt-site.xml from {} to {}", hdfsGlobalPath,
                targetGlobalFile.getAbsolutePath());
        fs.copyToLocalFile(hdfsGlobalPath, new org.apache.hadoop.fs.Path(targetGlobalFile.toURI()));
        addDTSiteResources(conf, targetGlobalFile);
        if (!isDevelopmentMode()) {
            // load node local config file
            addDTSiteResources(conf,
                    new File(StramClientUtils.getConfigDir(), StramClientUtils.DT_SITE_XML_FILE));
        }
        // load user config file
        addDTSiteResources(conf,
                new File(StramClientUtils.getUserDTDirectory(), StramClientUtils.DT_SITE_XML_FILE));
    } catch (IOException ex) {
        // ignore
        LOG.debug("Caught exception when loading configuration: {}: moving on...", ex.getMessage());
    } finally {
        // Cannot delete the file here because addDTSiteResource which eventually calls Configuration.reloadConfiguration
        // does not actually reload the configuration.  The file is actually read later and it needs to exist.
        //
        //if (targetGlobalFile != null) {
        //targetGlobalFile.delete();
        //}
        IOUtils.closeQuietly(fs);
    }

    //Validate loggers-level settings
    String loggersLevel = conf.get(DTLoggerFactory.DT_LOGGERS_LEVEL);
    if (loggersLevel != null) {
        String targets[] = loggersLevel.split(",");
        Preconditions.checkArgument(targets.length > 0, "zero loggers level");
        for (String target : targets) {
            String parts[] = target.split(":");
            Preconditions.checkArgument(parts.length == 2, "incorrect " + target);
            Preconditions.checkArgument(ConfigValidator.validateLoggersLevel(parts[0], parts[1]),
                    "incorrect " + target);
        }
    }
    convertDeprecatedProperties(conf);

    //
    // The ridiculous default RESOURCEMANAGER_CONNECT_MAX_WAIT_MS from hadoop is 15 minutes (!!!!), which actually translates to 20 minutes with the connect interval.
    // That means if there is anything wrong with YARN or if YARN is not running, the caller has to wait for up to 20 minutes until it gets an error.
    // We are overriding this to be 10 seconds maximum.
    //

    int rmConnectMaxWait = conf.getInt(YarnConfiguration.RESOURCEMANAGER_CONNECT_MAX_WAIT_MS,
            YarnConfiguration.DEFAULT_RESOURCEMANAGER_CONNECT_MAX_WAIT_MS);
    if (rmConnectMaxWait > RESOURCEMANAGER_CONNECT_MAX_WAIT_MS_OVERRIDE) {
        LOG.info("Overriding {} assigned value of {} to {} because the assigned value is too big.",
                YarnConfiguration.RESOURCEMANAGER_CONNECT_MAX_WAIT_MS, rmConnectMaxWait,
                RESOURCEMANAGER_CONNECT_MAX_WAIT_MS_OVERRIDE);
        conf.setInt(YarnConfiguration.RESOURCEMANAGER_CONNECT_MAX_WAIT_MS,
                RESOURCEMANAGER_CONNECT_MAX_WAIT_MS_OVERRIDE);
        int rmConnectRetryInterval = conf.getInt(YarnConfiguration.RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS,
                YarnConfiguration.DEFAULT_RESOURCEMANAGER_CONNECT_MAX_WAIT_MS);
        int defaultRetryInterval = Math.max(500, RESOURCEMANAGER_CONNECT_MAX_WAIT_MS_OVERRIDE / 5);
        if (rmConnectRetryInterval > defaultRetryInterval) {
            LOG.info("Overriding {} assigned value of {} to {} because the assigned value is too big.",
                    YarnConfiguration.RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS, rmConnectRetryInterval,
                    defaultRetryInterval);
            conf.setInt(YarnConfiguration.RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS, defaultRetryInterval);
        }
    }
    LOG.info(" conf object in stramclient {}", conf);
    return conf;
}

From source file:com.elex.dmp.vectorizer.DictionaryVectorizer.java

License:Apache License

/**
 * Create a partial vector using a chunk of features from the input documents. The input documents have to be
 * in the {@link SequenceFile} format
 * 
 * @param input
 *          input directory of the documents in {@link SequenceFile} format
 * @param baseConf
 *          job configuration
 * @param maxNGramSize
 *          maximum size of ngrams to generate
 * @param dictionaryFilePath
 *          location of the chunk of features and the IDs
 * @param output
 *          output directory where the partial vectors have to be created
 * @param dimension
 * @param sequentialAccess
 *          output vectors should be optimized for sequential access
 * @param namedVectors
 *          output vectors should be named, retaining key (doc id) as a label
 * @param numReducers 
 *          the desired number of reducer tasks
 */
private static void makePartialVectors(Path input, Configuration baseConf, int maxNGramSize,
        Path dictionaryFilePath, Path output, int dimension, boolean sequentialAccess, boolean namedVectors,
        int numReducers) throws IOException, InterruptedException, ClassNotFoundException {

    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt(PartialVectorMerger.DIMENSION, dimension);
    conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
    conf.setBoolean(PartialVectorMerger.NAMED_VECTOR, namedVectors);
    conf.setInt(MAX_NGRAMS, maxNGramSize);
    DistributedCache.setCacheFiles(new URI[] { dictionaryFilePath.toUri() }, conf);

    Job job = new Job(conf);
    job.setJobName("DictionaryVectorizer::MakePartialVectors: input-folder: " + input + ", dictionary-file: "
            + dictionaryFilePath);
    job.setJarByClass(DictionaryVectorizer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(StringTuple.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    FileInputFormat.setInputPaths(job, input);

    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setReducerClass(TFPartialVectorReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(numReducers);

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded)
        throw new IllegalStateException("Job failed!");
}

From source file:com.elex.dmp.vectorizer.DictionaryVectorizer.java

License:Apache License

/**
 * Count the frequencies of words in parallel using Map/Reduce. The input documents have to be in
 * {@link SequenceFile} format
 */
private static void startWordCounting(Path input, Path output, Configuration baseConf, int minSupport)
        throws IOException, InterruptedException, ClassNotFoundException {

    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt(MIN_SUPPORT, minSupport);

    Job job = new Job(conf);

    job.setJobName("DictionaryVectorizer::WordCount: input-folder: " + input);
    job.setJarByClass(DictionaryVectorizer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(TermCountMapper.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setCombinerClass(TermCountCombiner.class);
    job.setReducerClass(TermCountReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded)
        throw new IllegalStateException("Job failed!");
}

From source file:com.elex.dmp.vectorizer.FixDictionaryVectorizer.java

License:Apache License

/**
 * Create a partial vector using a chunk of features from the input documents. The input documents have to be
 * in the {@link SequenceFile} format
 * 
 * @param input
 *          input directory of the documents in {@link SequenceFile} format
 * @param baseConf
 *          job configuration
 * @param maxNGramSize
 *          maximum size of ngrams to generate
 * @param dictionaryFilePath
 *          location of the chunk of features and the IDs
 * @param output
 *          output directory where the partial vectors have to be created
 * @param dimension
 * @param sequentialAccess
 *          output vectors should be optimized for sequential access
 * @param namedVectors
 *          output vectors should be named, retaining key (doc id) as a label
 * @param numReducers 
 *          the desired number of reducer tasks
 */
private static void makePartialVectors(Path input, Configuration baseConf, int maxNGramSize,
        Path dictionaryFilePath, Path output, int dimension, boolean sequentialAccess, boolean namedVectors,
        int numReducers) throws IOException, InterruptedException, ClassNotFoundException {

    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt(PartialVectorMerger.DIMENSION, dimension);
    conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
    conf.setBoolean(PartialVectorMerger.NAMED_VECTOR, namedVectors);
    conf.setInt(MAX_NGRAMS, maxNGramSize);
    DistributedCache.setCacheFiles(new URI[] { dictionaryFilePath.toUri() }, conf);

    Job job = new Job(conf);
    job.setJobName("DictionaryVectorizer::MakePartialVectors: input-folder: " + input + ", dictionary-file: "
            + dictionaryFilePath);
    job.setJarByClass(FixDictionaryVectorizer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(StringTuple.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    FileInputFormat.setInputPaths(job, input);

    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setReducerClass(TFPartialVectorReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(numReducers);

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded)
        throw new IllegalStateException("Job failed!");
}

From source file:com.elex.dmp.vectorizer.FixDictionaryVectorizer.java

License:Apache License

/**
 * Count the frequencies of words in parallel using Map/Reduce. The input documents have to be in
 * {@link SequenceFile} format
 */
private static void startWordCounting(Path input, Path output, Configuration baseConf, int minSupport)
        throws IOException, InterruptedException, ClassNotFoundException {

    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt(MIN_SUPPORT, minSupport);

    Job job = new Job(conf);

    job.setJobName("DictionaryVectorizer::WordCount: input-folder: " + input);
    job.setJarByClass(FixDictionaryVectorizer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(TermCountMapper.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setCombinerClass(TermCountCombiner.class);
    job.setReducerClass(TermCountReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded)
        throw new IllegalStateException("Job failed!");
}

From source file:com.ery.hadoop.mrddx.client.MRJOBClient.java

@Override
public void run(Map<String, String> paramMap) throws Exception {
    // license
    // License.checkLicense();

    // Build the configuration for the MR job
    Configuration conf = new Configuration();
    // Copy the caller's parameters into the configuration, unescaping newline and carriage-return sequences
    for (String key : paramMap.keySet()) {
        String value = paramMap.get(key);
        if (null != value) {
            value = value.replaceAll("\\\\n", "\n");
            value = value.replaceAll("\\\\r", "\r");
            conf.set(key, value);
            paramMap.put(key, value);
        }
    }

    String debug = paramMap.get(MRConfiguration.INTERNAL_JOB_LOG_DEBUG);
    if (null != debug) {
        String rownum = paramMap.get(MRConfiguration.INTERNAL_JOB_LOG_DEBUG_ROWNUM);
        conf.setInt(MRConfiguration.INTERNAL_JOB_LOG_DEBUG, Integer.parseInt(debug));
        conf.setInt(MRConfiguration.INTERNAL_JOB_LOG_DEBUG_ROWNUM, Integer.parseInt(rownum));
    }

    // Print the parameters
    this.printParameter(paramMap);
    MRJOBService mrJobService = new MRJOBService();

    // Create the job from the conf and run it via MRJOBService
    Job job = Job.getInstance(conf);
    job.setJarByClass(MRJOBService.class);
    mrJobService.run(paramMap, job);

    // if (mrJobService.isJobRun(conf)) {
    // } else {
    // JobConf jobConf = new JobConf(conf, MRJOBService.class);
    // mrJobService.run(paramMap, jobConf);
    // }
}