List of usage examples for org.apache.hadoop.conf.Configuration.setLong
public void setLong(String name, long value)
Set the value of the name property to a long.
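Before the source-file examples below, here is a minimal, self-contained sketch of the call itself. The property key "my.app.max.records" is a made-up name used only for illustration, not a real Hadoop key; the long is stored as its string form and can be read back with getLong.

import org.apache.hadoop.conf.Configuration;

public class SetLongDemo {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Store a long under a property name; internally it is kept as the string "1000000".
        conf.setLong("my.app.max.records", 1000000L);
        // Read it back; the second argument is the default returned when the key is absent.
        long maxRecords = conf.getLong("my.app.max.records", -1L);
        System.out.println(maxRecords); // prints 1000000
    }
}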
From source file:org.apache.mahout.utils.vectors.tfidf.TFIDFConverter.java
License:Apache License
/**
 * Create a partial tfidf vector using a chunk of features from the input vectors. The input vectors have to
 * be in the {@link SequenceFile} format
 *
 * @param input
 *          input directory of the vectors in {@link SequenceFile} format
 * @param featureCount
 *          Number of unique features in the dataset
 * @param vectorCount
 *          Number of vectors in the dataset
 * @param minDf
 *          The minimum document frequency. Default 1
 * @param maxDFPercent
 *          The max percentage of vectors for the DF. Can be used to remove really high frequency features.
 *          Expressed as an integer between 0 and 100. Default 99
 * @param dictionaryFilePath
 *          location of the chunk of features and the id's
 * @param output
 *          output directory where the partial vectors have to be created
 * @throws IOException
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
private static void makePartialVectors(Path input, Long featureCount, Long vectorCount, int minDf,
    int maxDFPercent, Path dictionaryFilePath, Path output, boolean sequentialAccess)
    throws IOException, InterruptedException, ClassNotFoundException {
  Configuration conf = new Configuration();
  // this conf parameter needs to be set to enable serialisation of conf values
  conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
      + "org.apache.hadoop.io.serializer.WritableSerialization");
  conf.setLong(FEATURE_COUNT, featureCount);
  conf.setLong(VECTOR_COUNT, vectorCount);
  conf.setInt(MIN_DF, minDf);
  conf.setInt(MAX_DF_PERCENTAGE, maxDFPercent);
  conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
  DistributedCache.setCacheFiles(new URI[] { dictionaryFilePath.toUri() }, conf);

  Job job = new Job(conf);
  job.setJobName(": MakePartialVectors: input-folder: " + input + ", dictionary-file: "
      + dictionaryFilePath.toString());
  job.setJarByClass(TFIDFConverter.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(VectorWritable.class);
  FileInputFormat.setInputPaths(job, input);
  FileOutputFormat.setOutputPath(job, output);

  job.setMapperClass(Mapper.class);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setReducerClass(TFIDFPartialVectorReducer.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  HadoopUtil.overwriteOutput(output);
  job.waitForCompletion(true);
}
From source file:org.apache.mahout.vectorizer.HighDFWordsPruner.java
License:Apache License
private static void pruneVectorsPartial(Path input, Path output, Path dictionaryFilePath, long maxDF,
    long minDF, Configuration baseConf) throws IOException, InterruptedException, ClassNotFoundException {

  Configuration conf = new Configuration(baseConf);
  // this conf parameter needs to be set to enable serialisation of conf values
  conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
      + "org.apache.hadoop.io.serializer.WritableSerialization");
  conf.setLong(MAX_DF, maxDF);
  conf.setLong(MIN_DF, minDF);
  DistributedCache.setCacheFiles(new URI[] { dictionaryFilePath.toUri() }, conf);

  Job job = HadoopUtil.prepareJob(input, output, SequenceFileInputFormat.class, Mapper.class, null, null,
      WordsPrunerReducer.class, Text.class, VectorWritable.class, SequenceFileOutputFormat.class, conf);
  job.setJobName(
      ": Prune Vectors: input-folder: " + input + ", dictionary-file: " + dictionaryFilePath.toString());

  HadoopUtil.delete(conf, output);

  boolean succeeded = job.waitForCompletion(true);
  if (!succeeded) {
    throw new IllegalStateException("Job failed!");
  }
}
From source file:org.apache.mahout.vectorizer.tfidf.TFIDFConverter.java
License:Apache License
/**
 * Create a partial tfidf vector using a chunk of features from the input vectors. The input vectors have to
 * be in the {@link SequenceFile} format
 *
 * @param input
 *          input directory of the vectors in {@link SequenceFile} format
 * @param featureCount
 *          Number of unique features in the dataset
 * @param vectorCount
 *          Number of vectors in the dataset
 * @param minDf
 *          The minimum document frequency. Default 1
 * @param maxDF
 *          The max percentage of vectors for the DF. Can be used to remove really high frequency features.
 *          Expressed as an integer between 0 and 100. Default 99
 * @param dictionaryFilePath
 *          location of the chunk of features and the id's
 * @param output
 *          output directory where the partial vectors have to be created
 * @param sequentialAccess
 *          output vectors should be optimized for sequential access
 * @param namedVector
 *          output vectors should be named, retaining key (doc id) as a label
 */
private static void makePartialVectors(Path input, Configuration baseConf, Long featureCount,
    Long vectorCount, int minDf, long maxDF, Path dictionaryFilePath, Path output, boolean sequentialAccess,
    boolean namedVector) throws IOException, InterruptedException, ClassNotFoundException {

  Configuration conf = new Configuration(baseConf);
  // this conf parameter needs to be set to enable serialisation of conf values
  conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
      + "org.apache.hadoop.io.serializer.WritableSerialization");
  conf.setLong(FEATURE_COUNT, featureCount);
  conf.setLong(VECTOR_COUNT, vectorCount);
  conf.setInt(MIN_DF, minDf);
  conf.setLong(MAX_DF, maxDF);
  conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
  conf.setBoolean(PartialVectorMerger.NAMED_VECTOR, namedVector);
  DistributedCache.addCacheFile(dictionaryFilePath.toUri(), conf);

  Job job = new Job(conf);
  job.setJobName(": MakePartialVectors: input-folder: " + input + ", dictionary-file: "
      + dictionaryFilePath.toString());
  job.setJarByClass(TFIDFConverter.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(VectorWritable.class);
  FileInputFormat.setInputPaths(job, input);
  FileOutputFormat.setOutputPath(job, output);

  job.setMapperClass(Mapper.class);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setReducerClass(TFIDFPartialVectorReducer.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);
  HadoopUtil.delete(conf, output);

  boolean succeeded = job.waitForCompletion(true);
  if (!succeeded) {
    throw new IllegalStateException("Job failed!");
  }
}
From source file:org.apache.mnemonic.hadoop.MneConfigHelper.java
License:Apache License
public static void setSlotKeyId(Configuration conf, String prefix, long keyid) {
    conf.setLong(getConfigName(prefix, SLOT_KEY_ID), keyid);
}
From source file:org.apache.mnemonic.hadoop.MneConfigHelper.java
License:Apache License
public static void setMemPoolSize(Configuration conf, String prefix, long size) {
    conf.setLong(getConfigName(prefix, MEM_POOL_SIZE), size);
}
From source file:org.apache.mrql.Config.java
License:Apache License
/** store the configuration parameters */
public static void write(Configuration conf) {
    conf.setBoolean("mrql.hadoop.mode", hadoop_mode);
    conf.setBoolean("mrql.local.mode", local_mode);
    conf.setBoolean("mrql.distributed.mode", distributed_mode);
    conf.setBoolean("mrql.map.reduce.mode", map_reduce_mode);
    conf.setBoolean("mrql.bsp.mode", bsp_mode);
    conf.setBoolean("mrql.spark.mode", spark_mode);
    conf.setBoolean("mrql.flink.mode", flink_mode);
    conf.setBoolean("mrql.interactive", interactive);
    conf.setBoolean("mrql.compile.functional.arguments", compile_functional_arguments);
    conf.setBoolean("mrql.trace", trace);
    conf.setInt("mrql.nodes", nodes);
    conf.setInt("mrql.mapjoin.size", mapjoin_size);
    conf.setInt("mrql.in.mapper.size", map_cache_size);
    conf.setInt("mrql.max.bag.size.print", max_bag_size_print);
    conf.setLong("mrql.max.materialized.bag", max_materialized_bag);
    conf.setInt("mrql.bsp.msg.size", bsp_msg_size);
    conf.setLong("mrql.range.split.size", range_split_size);
    conf.setInt("mrql.max.merged.streams", max_merged_streams);
    conf.set("mrql.tmp.directory", tmpDirectory);
    conf.setBoolean("mrql.use.combiner", use_combiner);
    conf.setBoolean("mrql.group.join.opt", groupJoinOpt);
    conf.setBoolean("mrql.self.join.opt", selfJoinOpt);
    conf.setBoolean("mrql.trace.execution", trace_execution);
    conf.setBoolean("mrql.trace.exp.execution", trace_exp_execution);
    conf.setBoolean("mrql.quiet.execution", quiet_execution);
    conf.setBoolean("mrql.testing", testing);
    conf.setBoolean("mrql.info", info);
    conf.setInt("mrql.stream.window", stream_window);
}
From source file:org.apache.mrql.MapReducePlan.java
License:Apache License
/** Set hadoop map min and max split size based on number of requested nodes */
public static void setupSplits(DataSet[] dsv, Configuration conf) throws IOException {
    int len = 0;
    for (DataSet ds : dsv)
        len += ds.source.size();
    long[] sizes = new long[len];
    int i = 0;
    for (DataSet ds : dsv)
        for (DataSource s : ds.source) {
            sizes[i] = s.size(conf);
            i++;
        }
    long total_size = 0;
    for (long size : sizes)
        total_size += size;
    long split_size = Math.max(total_size / Config.nodes, 1024L);
    int tasks = 0;
    do { // adjust split_size
        tasks = 0;
        for (long size : sizes)
            tasks += (int) Math.ceil(size / (double) split_size);
        if (tasks > Config.nodes)
            split_size = (long) Math.ceil((double) split_size * 1.01);
    } while (tasks > Config.nodes);
    conf.setLong("mapred.min.split.size", split_size);
    conf.setLong("mapred.max.split.size", split_size);
}
From source file:org.apache.nutch.crawl.TODOTestCrawlDbStates.java
License:Apache License
/**
 * NUTCH-1564 AdaptiveFetchSchedule: sync_delta forces immediate re-fetch for
 * documents not modified
 * <p>
 * Problem: documents not modified for a longer time are fetched in every
 * cycle because of an error in the SYNC_DELTA calculation of
 * {@link AdaptiveFetchSchedule}. <br>
 * The next fetch time should always be in the future, never in the past.
 * </p>
 */
@Test
public void testAdaptiveFetchScheduleSyncDelta() {
    LOG.info("NUTCH-1564 test SYNC_DELTA calculation of AdaptiveFetchSchedule");
    Context context = CrawlDBTestUtil.createContext();
    Configuration conf = context.getConfiguration();
    conf.setLong("db.fetch.interval.default", 172800); // 2 days
    conf.setLong("db.fetch.schedule.adaptive.min_interval", 86400); // 1 day
    conf.setLong("db.fetch.schedule.adaptive.max_interval", 604800); // 7 days
    conf.setLong("db.fetch.interval.max", 604800); // 7 days
    conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl.AdaptiveFetchSchedule");
    ContinuousCrawlTestUtil crawlUtil = new CrawlTestFetchScheduleNotModifiedFetchTime(context);
    crawlUtil.setInterval(FetchSchedule.SECONDS_PER_DAY / 3);
    try {
        if (!crawlUtil.run(100)) {
            fail("failed: sync_delta calculation with AdaptiveFetchSchedule");
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file:org.apache.nutch.mapreduce.GenerateJob.java
License:Apache License
@Override
protected void setup(Map<String, Object> args) throws Exception {
    super.setup(args);

    Params params = new Params(args);
    Configuration conf = getConf();

    String crawlId = params.get(ARG_CRAWL, conf.get(PARAM_CRAWL_ID));
    String batchId = params.get(ARG_BATCH, NutchUtil.generateBatchId());
    boolean reGenerate = params.getBoolean(ARG_REGENERATE, false);
    long topN = params.getLong(ARG_TOPN, Long.MAX_VALUE);
    boolean filter = params.getBoolean(ARG_FILTER, true);
    boolean norm = params.getBoolean(ARG_NORMALIZE, true);
    long pseudoCurrTime = params.getLong(ARG_CURTIME, startTime);
    String nutchTmpDir = conf.get(PARAM_NUTCH_TMP_DIR, PATH_NUTCH_TMP_DIR);

    conf.set(PARAM_CRAWL_ID, crawlId);
    conf.set(PARAM_BATCH_ID, batchId);
    conf.setLong(GENERATE_TIME_KEY, startTime); // seems not used, (or pseudoCurrTime used?)
    conf.setLong(PARAM_GENERATOR_CUR_TIME, pseudoCurrTime);
    conf.setBoolean(PARAM_GENERATE_REGENERATE, reGenerate);
    conf.setLong(PARAM_GENERATOR_TOP_N, topN);
    conf.setBoolean(PARAM_GENERATE_FILTER, filter);
    conf.setBoolean(PARAM_GENERATE_NORMALISE, norm);

    URLUtil.HostGroupMode hostGroupMode = conf.getEnum(PARAM_GENERATOR_COUNT_MODE, URLUtil.HostGroupMode.BY_HOST);
    conf.setEnum(PARTITION_MODE_KEY, hostGroupMode);

    LOG.info(Params.format("className", this.getClass().getSimpleName(), "crawlId", crawlId, "batchId", batchId,
        "filter", filter, "norm", norm, "pseudoCurrTime", DateTimeUtil.format(pseudoCurrTime), "topN", topN,
        "reGenerate", reGenerate, PARAM_GENERATOR_COUNT_MODE, hostGroupMode, PARTITION_MODE_KEY, hostGroupMode,
        "nutchTmpDir", nutchTmpDir));

    Files.write(Paths.get(PATH_LAST_BATCH_ID), (batchId + "\n").getBytes(), StandardOpenOption.CREATE,
        StandardOpenOption.WRITE);
}
From source file:org.apache.oozie.action.hadoop.LauncherHelper.java
License:Apache License
public static void setupYarnRestartHandling(Configuration launcherJobConf, Configuration actionConf,
        String launcherTag, long launcherTime) throws NoSuchAlgorithmException {
    launcherJobConf.setLong(LauncherMain.OOZIE_JOB_LAUNCH_TIME, launcherTime);
    // Tags are limited to 100 chars so we need to hash them to make sure (the actionId otherwise doesn't have a max length)
    String tag = getTag(launcherTag);
    // keeping the oozie.child.mapreduce.job.tags instead of mapreduce.job.tags to avoid killing the launcher itself.
    // mapreduce.job.tags should only go to the child job launched by the launcher.
    actionConf.set(LauncherMain.CHILD_MAPREDUCE_JOB_TAGS, tag);
}