List of usage examples for org.apache.hadoop.conf Configuration getLong
public long getLong(String name, long defaultValue)
Parameters: name - the name of the property to read; defaultValue - the value to return if the property is not set.
Returns: the value of the name property as a long, or defaultValue if no such property exists.
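Before the examples taken from real projects below, here is a minimal, self-contained sketch of the typical round trip. The property name "my.app.scan.interval.ms" is made up for illustration; only the Configuration set/get calls are the actual Hadoop API.

import org.apache.hadoop.conf.Configuration;

public class GetLongDemo {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // Properties are stored as strings; setLong writes the string form of the value.
        conf.setLong("my.app.scan.interval.ms", 30000L);

        // getLong parses the stored string back into a long.
        long interval = conf.getLong("my.app.scan.interval.ms", 10000L); // 30000
        // For a property that was never set, the supplied default is returned.
        long missing = conf.getLong("my.app.unset.property", -1L);       // -1

        System.out.println(interval + " " + missing);
    }
}

The setup methods below follow the same pattern: read a numeric job parameter with a sensible default, then validate or use it.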
From source file:org.apache.mahout.clustering.lda.cvb.CachingCVB0Mapper.java
License:Apache License
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    log.info("Retrieving configuration");
    Configuration conf = context.getConfiguration();
    float eta = conf.getFloat(CVB0Driver.TERM_TOPIC_SMOOTHING, Float.NaN);
    float alpha = conf.getFloat(CVB0Driver.DOC_TOPIC_SMOOTHING, Float.NaN);
    long seed = conf.getLong(CVB0Driver.RANDOM_SEED, 1234L);
    numTopics = conf.getInt(CVB0Driver.NUM_TOPICS, -1);
    int numTerms = conf.getInt(CVB0Driver.NUM_TERMS, -1);
    int numUpdateThreads = conf.getInt(CVB0Driver.NUM_UPDATE_THREADS, 1);
    int numTrainThreads = conf.getInt(CVB0Driver.NUM_TRAIN_THREADS, 4);
    maxIters = conf.getInt(CVB0Driver.MAX_ITERATIONS_PER_DOC, 10);
    float modelWeight = conf.getFloat(CVB0Driver.MODEL_WEIGHT, 1.0f);

    log.info("Initializing read model");
    Path[] modelPaths = CVB0Driver.getModelPaths(conf);
    if (modelPaths != null && modelPaths.length > 0) {
        readModel = new TopicModel(conf, eta, alpha, null, numUpdateThreads, modelWeight, modelPaths);
    } else {
        log.info("No model files found");
        readModel = new TopicModel(numTopics, numTerms, eta, alpha, RandomUtils.getRandom(seed), null,
                numTrainThreads, modelWeight);
    }

    log.info("Initializing write model");
    writeModel = modelWeight == 1
            ? new TopicModel(numTopics, numTerms, eta, alpha, null, numUpdateThreads)
            : readModel;

    log.info("Initializing model trainer");
    modelTrainer = new ModelTrainer(readModel, writeModel, numTrainThreads, numTopics, numTerms);
    modelTrainer.start();
}
From source file:org.apache.mahout.clustering.lda.cvb.CachingCVB0PerplexityMapper.java
License:Apache License
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    MemoryUtil.startMemoryLogger(5000);
    log.info("Retrieving configuration");
    Configuration conf = context.getConfiguration();
    float eta = conf.getFloat(CVB0Driver.TERM_TOPIC_SMOOTHING, Float.NaN);
    float alpha = conf.getFloat(CVB0Driver.DOC_TOPIC_SMOOTHING, Float.NaN);
    long seed = conf.getLong(CVB0Driver.RANDOM_SEED, 1234L);
    random = RandomUtils.getRandom(seed);
    numTopics = conf.getInt(CVB0Driver.NUM_TOPICS, -1);
    int numTerms = conf.getInt(CVB0Driver.NUM_TERMS, -1);
    int numUpdateThreads = conf.getInt(CVB0Driver.NUM_UPDATE_THREADS, 1);
    int numTrainThreads = conf.getInt(CVB0Driver.NUM_TRAIN_THREADS, 4);
    maxIters = conf.getInt(CVB0Driver.MAX_ITERATIONS_PER_DOC, 10);
    float modelWeight = conf.getFloat(CVB0Driver.MODEL_WEIGHT, 1.0f);
    testFraction = conf.getFloat(CVB0Driver.TEST_SET_FRACTION, 0.1f);

    log.info("Initializing read model");
    Path[] modelPaths = CVB0Driver.getModelPaths(conf);
    if (modelPaths != null && modelPaths.length > 0) {
        readModel = new TopicModel(conf, eta, alpha, null, numUpdateThreads, modelWeight, modelPaths);
    } else {
        log.info("No model files found");
        readModel = new TopicModel(numTopics, numTerms, eta, alpha, RandomUtils.getRandom(seed), null,
                numTrainThreads, modelWeight);
    }

    log.info("Initializing model trainer");
    modelTrainer = new ModelTrainer(readModel, null, numTrainThreads, numTopics, numTerms);

    log.info("Initializing topic vector");
    topicVector = new DenseVector(new double[numTopics]);
}
From source file:org.apache.mahout.clustering.lda.cvb.CVBConfig.java
License:Apache License
public CVBConfig read(Configuration conf) {
    setNumTopics(conf.getInt(NUM_TOPICS_PARAM, 0));
    setNumTerms(conf.getInt(NUM_TERMS_PARAM, 0));
    setAlpha(conf.getFloat(DOC_TOPIC_SMOOTHING_PARAM, 0));
    setEta(conf.getFloat(TERM_TOPIC_SMOOTHING_PARAM, 0));
    setRandomSeed(conf.getLong(RANDOM_SEED_PARAM, 0));
    setTestFraction(conf.getFloat(TEST_SET_FRACTION_PARAM, 0));
    setNumTrainThreads(conf.getInt(NUM_TRAIN_THREADS_PARAM, 0));
    setNumUpdateThreads(conf.getInt(NUM_UPDATE_THREADS_PARAM, 0));
    setMaxItersPerDoc(conf.getInt(MAX_ITERATIONS_PER_DOC_PARAM, 0));
    setModelWeight(conf.getFloat(MODEL_WEIGHT_PARAM, 0));
    setUseOnlyLabeledDocs(conf.getBoolean(ONLY_LABELED_DOCS_PARAM, false));
    setMinRelPreplexityDiff(conf.getFloat(MIN_RELATIVE_PERPLEXITY_DIFF_PARAM, -1));
    setMaxInferenceItersPerDoc(conf.getInt(MAX_INFERENCE_ITERATIONS_PER_DOC_PARAM, 0));
    check();
    return this;
}
From source file:org.apache.mahout.fpm.hadoop.util.SplitByNumberOfMappersTextInputFormat.java
License:Apache License
/**
 * Gets the total number of lines in the file. If Config.NUMBER_OF_LINES_KEY is set, that value
 * is returned directly; otherwise the file is read and its lines are counted.
 *
 * @param conf     hadoop configuration object
 * @param fileName name of the file to count
 * @return the number of lines in the file
 * @throws IOException
 */
private static long getTotalNumberOfLines(Configuration conf, Path fileName) throws IOException {
    long nrLines = conf.getLong(Config.NUMBER_OF_LINES_KEY, -1);
    if (nrLines != -1) {
        return nrLines;
    }

    FileSystem fs = fileName.getFileSystem(conf);
    LineReader lr = null;
    try {
        FSDataInputStream in = fs.open(fileName);
        lr = new LineReader(in, conf);
        Text text = new Text();
        nrLines = 0;
        while (lr.readLine(text) > 0) {
            nrLines++;
        }
        return nrLines;
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        if (lr != null) {
            lr.close(); // also closes the underlying input stream
        }
    }
    return 0;
}
From source file:org.apache.mahout.utils.nlp.collocations.llr.LLRReducer.java
License:Apache License
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    Configuration conf = context.getConfiguration();
    this.ngramTotal = conf.getLong(NGRAM_TOTAL, -1);
    this.minLLRValue = conf.getFloat(MIN_LLR, DEFAULT_MIN_LLR);
    this.emitUnigrams = conf.getBoolean(CollocDriver.EMIT_UNIGRAMS, CollocDriver.DEFAULT_EMIT_UNIGRAMS);

    if (log.isInfoEnabled()) {
        log.info("NGram Total is {}", ngramTotal);
        log.info("Min LLR value is {}", minLLRValue);
        log.info("Emit Unigrams is {}", emitUnigrams);
    }

    if (ngramTotal == -1) {
        throw new IllegalStateException("No NGRAM_TOTAL available in job config");
    }
}
From source file:org.apache.mahout.utils.vectors.tfidf.TFIDFPartialVectorReducer.java
License:Apache License
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    try {
        Configuration conf = context.getConfiguration();
        URI[] localFiles = DistributedCache.getCacheFiles(conf);
        if (localFiles == null || localFiles.length < 1) {
            throw new IllegalArgumentException("missing paths from the DistributedCache");
        }

        vectorCount = conf.getLong(TFIDFConverter.VECTOR_COUNT, 1);
        featureCount = conf.getLong(TFIDFConverter.FEATURE_COUNT, 1);
        minDf = conf.getInt(TFIDFConverter.MIN_DF, 1);
        maxDfPercent = conf.getInt(TFIDFConverter.MAX_DF_PERCENTAGE, 99);
        sequentialAccess = conf.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false);

        Path dictionaryFile = new Path(localFiles[0].getPath());
        FileSystem fs = dictionaryFile.getFileSystem(conf);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, dictionaryFile, conf);
        try {
            IntWritable key = new IntWritable();
            LongWritable value = new LongWritable();
            // key is the feature index, value is the document frequency
            while (reader.next(key, value)) {
                dictionary.put(key.get(), value.get());
            }
        } finally {
            reader.close();
        }
    } catch (IOException e) {
        throw new IllegalStateException(e);
    }
}
From source file:org.apache.mahout.vectorizer.collocations.llr.LLRReducer.java
License:Apache License
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    Configuration conf = context.getConfiguration();
    this.ngramTotal = conf.getLong(NGRAM_TOTAL, -1);
    this.minLLRValue = conf.getFloat(MIN_LLR, DEFAULT_MIN_LLR);
    this.emitUnigrams = conf.getBoolean(CollocDriver.EMIT_UNIGRAMS, CollocDriver.DEFAULT_EMIT_UNIGRAMS);

    log.info("NGram Total: {}, Min LLR value: {}, Emit Unigrams: {}", ngramTotal, minLLRValue, emitUnigrams);

    if (ngramTotal == -1) {
        throw new IllegalStateException("No NGRAM_TOTAL available in job config");
    }
}
From source file:org.apache.mahout.vectorizer.pruner.WordsPrunerReducer.java
License:Apache License
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    Configuration conf = context.getConfiguration();
    //Path[] localFiles = HadoopUtil.getCachedFiles(conf);
    maxDf = conf.getLong(HighDFWordsPruner.MAX_DF, Long.MAX_VALUE);
    minDf = conf.getLong(HighDFWordsPruner.MIN_DF, -1);

    Path dictionaryFile = HadoopUtil.getSingleCachedFile(conf);
    // key is the feature index, value is the document frequency
    for (Pair<IntWritable, LongWritable> record : new SequenceFileIterable<IntWritable, LongWritable>(
            dictionaryFile, true, conf)) {
        dictionary.put(record.getFirst().get(), record.getSecond().get());
    }
}
From source file:org.apache.mahout.vectorizer.tfidf.TFIDFPartialVectorReducer.java
License:Apache License
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    Configuration conf = context.getConfiguration();
    vectorCount = conf.getLong(TFIDFConverter.VECTOR_COUNT, 1);
    featureCount = conf.getLong(TFIDFConverter.FEATURE_COUNT, 1);
    minDf = conf.getInt(TFIDFConverter.MIN_DF, 1);
    maxDf = conf.getLong(TFIDFConverter.MAX_DF, -1);
    sequentialAccess = conf.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false);
    namedVector = conf.getBoolean(PartialVectorMerger.NAMED_VECTOR, false);

    URI[] localFiles = DistributedCache.getCacheFiles(conf);
    Path dictionaryFile = HadoopUtil.findInCacheByPartOfFilename(TFIDFConverter.FREQUENCY_FILE, localFiles);
    // key is the feature index, value is the document frequency
    for (Pair<IntWritable, LongWritable> record : new SequenceFileIterable<IntWritable, LongWritable>(
            dictionaryFile, true, conf)) {
        dictionary.put(record.getFirst().get(), record.getSecond().get());
    }
}
From source file:org.apache.mnemonic.hadoop.MneConfigHelper.java
License:Apache License
public static long getSlotKeyId(Configuration conf, String prefix) {
    return conf.getLong(getConfigName(prefix, SLOT_KEY_ID), 0L);
}