Example usage for org.apache.hadoop.conf Configuration setLong

List of usage examples for org.apache.hadoop.conf Configuration setLong

Introduction

On this page you can find example usages of org.apache.hadoop.conf.Configuration.setLong.

Prototype

public void setLong(String name, long value) 

Document

Set the value of the name property to a long.
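
Before the full examples below, here is a minimal sketch of the set/get round trip; the property name example.max.records is made up for illustration and is not a standard Hadoop key.

import org.apache.hadoop.conf.Configuration;

public class SetLongExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // Store a long value under a hypothetical property name.
        conf.setLong("example.max.records", 1000000L);

        // Read it back; the second argument is the default returned
        // when the property is missing or cannot be parsed as a long.
        long maxRecords = conf.getLong("example.max.records", 100L);
        System.out.println("max records = " + maxRecords);
    }
}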

Usage

From source file:org.apache.mahout.utils.vectors.tfidf.TFIDFConverter.java

License:Apache License

/**
 * Create a partial tfidf vector using a chunk of features from the input vectors. The input vectors have to
 * be in the {@link SequenceFile} format.
 *
 * @param input
 *          input directory of the vectors in {@link SequenceFile} format
 * @param featureCount
 *          Number of unique features in the dataset
 * @param vectorCount
 *          Number of vectors in the dataset
 * @param minDf
 *          The minimum document frequency. Default 1
 * @param maxDFPercent
 *          The max percentage of vectors for the DF. Can be used to remove really high frequency features.
 *          Expressed as an integer between 0 and 100. Default 99
 * @param dictionaryFilePath
 *          location of the chunk of features and their ids
 * @param output
 *          output directory where the partial vectors will be created
 * @throws IOException
 * @throws ClassNotFoundException 
 * @throws InterruptedException 
 */
private static void makePartialVectors(Path input, Long featureCount, Long vectorCount, int minDf,
        int maxDFPercent, Path dictionaryFilePath, Path output, boolean sequentialAccess)
        throws IOException, InterruptedException, ClassNotFoundException {

    Configuration conf = new Configuration();
    // this conf parameter needs to be set to enable serialization of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setLong(FEATURE_COUNT, featureCount);
    conf.setLong(VECTOR_COUNT, vectorCount);
    conf.setInt(MIN_DF, minDf);
    conf.setInt(MAX_DF_PERCENTAGE, maxDFPercent);
    conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
    DistributedCache.setCacheFiles(new URI[] { dictionaryFilePath.toUri() }, conf);

    Job job = new Job(conf);
    job.setJobName(": MakePartialVectors: input-folder: " + input + ", dictionary-file: "
            + dictionaryFilePath.toString());
    job.setJarByClass(TFIDFConverter.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    FileInputFormat.setInputPaths(job, input);

    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setReducerClass(TFIDFPartialVectorReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    HadoopUtil.overwriteOutput(output);

    job.waitForCompletion(true);
}
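
On the reducer side, values stored with setLong are typically read back in setup() via Configuration.getLong. A hedged sketch, assuming the same key constants as above (the defaults are illustrative and not taken from TFIDFPartialVectorReducer):

// Sketch only: mirrors the keys set above; defaults are illustrative.
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    long featureCount = conf.getLong(FEATURE_COUNT, -1);
    long vectorCount = conf.getLong(VECTOR_COUNT, -1);
    int minDf = conf.getInt(MIN_DF, 1);
    int maxDfPercent = conf.getInt(MAX_DF_PERCENTAGE, 99);
    // ... use these parameters to initialize the TF-IDF computation
}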

From source file:org.apache.mahout.vectorizer.HighDFWordsPruner.java

License:Apache License

private static void pruneVectorsPartial(Path input, Path output, Path dictionaryFilePath, long maxDF,
        long minDF, Configuration baseConf) throws IOException, InterruptedException, ClassNotFoundException {

    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set to enable serialization of conf
    // values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setLong(MAX_DF, maxDF);
    conf.setLong(MIN_DF, minDF);
    DistributedCache.setCacheFiles(new URI[] { dictionaryFilePath.toUri() }, conf);

    Job job = HadoopUtil.prepareJob(input, output, SequenceFileInputFormat.class, Mapper.class, null, null,
            WordsPrunerReducer.class, Text.class, VectorWritable.class, SequenceFileOutputFormat.class, conf);
    job.setJobName(
            ": Prune Vectors: input-folder: " + input + ", dictionary-file: " + dictionaryFilePath.toString());

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}

From source file:org.apache.mahout.vectorizer.tfidf.TFIDFConverter.java

License:Apache License

/**
 * Create a partial tfidf vector using a chunk of features from the input vectors. The input vectors have to
 * be in the {@link SequenceFile} format.
 *
 * @param input
 *          input directory of the vectors in {@link SequenceFile} format
 * @param featureCount
 *          Number of unique features in the dataset
 * @param vectorCount
 *          Number of vectors in the dataset
 * @param minDf
 *          The minimum document frequency. Default 1
 * @param maxDF
 *          The max percentage of vectors for the DF. Can be used to remove really high frequency features.
 *          Expressed as an integer between 0 and 100. Default 99
 * @param dictionaryFilePath
 *          location of the chunk of features and their ids
 * @param output
 *          output directory where the partial vectors will be created
 * @param sequentialAccess
 *          output vectors should be optimized for sequential access
 * @param namedVector
 *          output vectors should be named, retaining key (doc id) as a label
 */
private static void makePartialVectors(Path input, Configuration baseConf, Long featureCount, Long vectorCount,
        int minDf, long maxDF, Path dictionaryFilePath, Path output, boolean sequentialAccess,
        boolean namedVector) throws IOException, InterruptedException, ClassNotFoundException {

    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set to enable serialization of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setLong(FEATURE_COUNT, featureCount);
    conf.setLong(VECTOR_COUNT, vectorCount);
    conf.setInt(MIN_DF, minDf);
    conf.setLong(MAX_DF, maxDF);
    conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
    conf.setBoolean(PartialVectorMerger.NAMED_VECTOR, namedVector);
    DistributedCache.addCacheFile(dictionaryFilePath.toUri(), conf);

    Job job = new Job(conf);
    job.setJobName(": MakePartialVectors: input-folder: " + input + ", dictionary-file: "
            + dictionaryFilePath.toString());
    job.setJarByClass(TFIDFConverter.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    FileInputFormat.setInputPaths(job, input);

    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setReducerClass(TFIDFPartialVectorReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}

From source file:org.apache.mnemonic.hadoop.MneConfigHelper.java

License:Apache License

public static void setSlotKeyId(Configuration conf, String prefix, long keyid) {
    conf.setLong(getConfigName(prefix, SLOT_KEY_ID), keyid);
}

From source file:org.apache.mnemonic.hadoop.MneConfigHelper.java

License:Apache License

public static void setMemPoolSize(Configuration conf, String prefix, long size) {
    conf.setLong(getConfigName(prefix, MEM_POOL_SIZE), size);
}
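
Setters like these usually have matching getters built on Configuration.getLong. A sketch of what such a companion might look like, assuming the same getConfigName(prefix, key) naming scheme; the 0L default is an illustrative choice, not taken from MneConfigHelper:

public static long getMemPoolSize(Configuration conf, String prefix) {
    // 0L is a placeholder default for illustration.
    return conf.getLong(getConfigName(prefix, MEM_POOL_SIZE), 0L);
}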

From source file:org.apache.mrql.Config.java

License:Apache License

/** store the configuration parameters */
public static void write(Configuration conf) {
    conf.setBoolean("mrql.hadoop.mode", hadoop_mode);
    conf.setBoolean("mrql.local.mode", local_mode);
    conf.setBoolean("mrql.distributed.mode", distributed_mode);
    conf.setBoolean("mrql.map.reduce.mode", map_reduce_mode);
    conf.setBoolean("mrql.bsp.mode", bsp_mode);
    conf.setBoolean("mrql.spark.mode", spark_mode);
    conf.setBoolean("mrql.flink.mode", flink_mode);
    conf.setBoolean("mrql.interactive", interactive);
    conf.setBoolean("mrql.compile.functional.arguments", compile_functional_arguments);
    conf.setBoolean("mrql.trace", trace);
    conf.setInt("mrql.nodes", nodes);
    conf.setInt("mrql.mapjoin.size", mapjoin_size);
    conf.setInt("mrql.in.mapper.size", map_cache_size);
    conf.setInt("mrql.max.bag.size.print", max_bag_size_print);
    conf.setLong("mrql.max.materialized.bag", max_materialized_bag);
    conf.setInt("mrql.bsp.msg.size", bsp_msg_size);
    conf.setLong("mrql.range.split.size", range_split_size);
    conf.setInt("mrql.max.merged.streams", max_merged_streams);
    conf.set("mrql.tmp.directory", tmpDirectory);
    conf.setBoolean("mrql.use.combiner", use_combiner);
    conf.setBoolean("mrql.group.join.opt", groupJoinOpt);
    conf.setBoolean("mrql.self.join.opt", selfJoinOpt);
    conf.setBoolean("mrql.trace.execution", trace_execution);
    conf.setBoolean("mrql.trace.exp.execution", trace_exp_execution);
    conf.setBoolean("mrql.quiet.execution", quiet_execution);
    conf.setBoolean("mrql.testing", testing);
    conf.setBoolean("mrql.info", info);
    conf.setInt("mrql.stream.window", stream_window);
}

From source file:org.apache.mrql.MapReducePlan.java

License:Apache License

/** Set hadoop map min and max split size based on number of requested nodes */
public static void setupSplits(DataSet[] dsv, Configuration conf) throws IOException {
    int len = 0;
    for (DataSet ds : dsv)
        len += ds.source.size();
    long[] sizes = new long[len];
    int i = 0;
    for (DataSet ds : dsv)
        for (DataSource s : ds.source) {
            sizes[i] = s.size(conf);
            i++;
        }
    long total_size = 0;
    for (long size : sizes)
        total_size += size;
    long split_size = Math.max(total_size / Config.nodes, 1024L);
    int tasks = 0;
    do { // adjust split_size
        tasks = 0;
        for (long size : sizes)
            tasks += (int) Math.ceil(size / (double) split_size);
        if (tasks > Config.nodes)
            split_size = (long) Math.ceil((double) split_size * 1.01);
    } while (tasks > Config.nodes);
    conf.setLong("mapred.min.split.size", split_size);
    conf.setLong("mapred.max.split.size", split_size);
}
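
mapred.min.split.size and mapred.max.split.size are the old-style property names; on Hadoop 2.x and later the same limits are normally set through the mapreduce.input.fileinputformat.split.* keys, which the deprecated names map to:

// Equivalent calls using the newer property names (Hadoop 2.x+).
conf.setLong("mapreduce.input.fileinputformat.split.minsize", split_size);
conf.setLong("mapreduce.input.fileinputformat.split.maxsize", split_size);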

From source file:org.apache.nutch.crawl.TODOTestCrawlDbStates.java

License:Apache License

/**
 * NUTCH-1564 AdaptiveFetchSchedule: sync_delta forces immediate re-fetch for
 * documents not modified
 * <p>
 * Problem: documents not modified for a longer time are fetched in every
 * cycle because of an error in the SYNC_DELTA calculation of
 * {@link AdaptiveFetchSchedule}. <br>
 * The next fetch time should always be in the future, never in the past.
 * </p>
 */
@Test
public void testAdaptiveFetchScheduleSyncDelta() {
    LOG.info("NUTCH-1564 test SYNC_DELTA calculation of AdaptiveFetchSchedule");
    Context context = CrawlDBTestUtil.createContext();
    Configuration conf = context.getConfiguration();
    conf.setLong("db.fetch.interval.default", 172800); // 2 days
    conf.setLong("db.fetch.schedule.adaptive.min_interval", 86400); // 1 day
    conf.setLong("db.fetch.schedule.adaptive.max_interval", 604800); // 7 days
    conf.setLong("db.fetch.interval.max", 604800); // 7 days
    conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl.AdaptiveFetchSchedule");
    ContinuousCrawlTestUtil crawlUtil = new CrawlTestFetchScheduleNotModifiedFetchTime(context);
    crawlUtil.setInterval(FetchSchedule.SECONDS_PER_DAY / 3);
    try {
        if (!crawlUtil.run(100)) {
            fail("failed: sync_delta calculation with AdaptiveFetchSchedule");
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}

From source file:org.apache.nutch.mapreduce.GenerateJob.java

License:Apache License

@Override
protected void setup(Map<String, Object> args) throws Exception {
    super.setup(args);

    Params params = new Params(args);
    Configuration conf = getConf();

    String crawlId = params.get(ARG_CRAWL, conf.get(PARAM_CRAWL_ID));
    String batchId = params.get(ARG_BATCH, NutchUtil.generateBatchId());
    boolean reGenerate = params.getBoolean(ARG_REGENERATE, false);
    long topN = params.getLong(ARG_TOPN, Long.MAX_VALUE);
    boolean filter = params.getBoolean(ARG_FILTER, true);
    boolean norm = params.getBoolean(ARG_NORMALIZE, true);
    long pseudoCurrTime = params.getLong(ARG_CURTIME, startTime);

    String nutchTmpDir = conf.get(PARAM_NUTCH_TMP_DIR, PATH_NUTCH_TMP_DIR);

    conf.set(PARAM_CRAWL_ID, crawlId);
    conf.set(PARAM_BATCH_ID, batchId);
    conf.setLong(GENERATE_TIME_KEY, startTime); // seems unused (or should pseudoCurrTime be used here?)
    conf.setLong(PARAM_GENERATOR_CUR_TIME, pseudoCurrTime);
    conf.setBoolean(PARAM_GENERATE_REGENERATE, reGenerate);
    conf.setLong(PARAM_GENERATOR_TOP_N, topN);
    conf.setBoolean(PARAM_GENERATE_FILTER, filter);
    conf.setBoolean(PARAM_GENERATE_NORMALISE, norm);

    URLUtil.HostGroupMode hostGroupMode = conf.getEnum(PARAM_GENERATOR_COUNT_MODE,
            URLUtil.HostGroupMode.BY_HOST);
    conf.setEnum(PARTITION_MODE_KEY, hostGroupMode);

    LOG.info(Params.format("className", this.getClass().getSimpleName(), "crawlId", crawlId, "batchId", batchId,
            "filter", filter, "norm", norm, "pseudoCurrTime", DateTimeUtil.format(pseudoCurrTime), "topN", topN,
            "reGenerate", reGenerate, PARAM_GENERATOR_COUNT_MODE, hostGroupMode, PARTITION_MODE_KEY,
            hostGroupMode, "nutchTmpDir", nutchTmpDir));

    Files.write(Paths.get(PATH_LAST_BATCH_ID), (batchId + "\n").getBytes(), StandardOpenOption.CREATE,
            StandardOpenOption.WRITE);
}

From source file:org.apache.oozie.action.hadoop.LauncherHelper.java

License:Apache License

public static void setupYarnRestartHandling(Configuration launcherJobConf, Configuration actionConf,
        String launcherTag, long launcherTime) throws NoSuchAlgorithmException {
    launcherJobConf.setLong(LauncherMain.OOZIE_JOB_LAUNCH_TIME, launcherTime);
    // Tags are limited to 100 chars, so we hash them to make sure they fit (the actionId otherwise has no maximum length)
    String tag = getTag(launcherTag);
    // keep oozie.child.mapreduce.job.tags instead of mapreduce.job.tags to avoid killing the launcher itself;
    // mapreduce.job.tags should only go to the child job launched by the launcher.
    actionConf.set(LauncherMain.CHILD_MAPREDUCE_JOB_TAGS, tag);
}