Example usage for org.apache.hadoop.conf Configuration getLong

List of usage examples for org.apache.hadoop.conf Configuration getLong

Introduction

This page lists usage examples for org.apache.hadoop.conf.Configuration.getLong.

Prototype

public long getLong(String name, long defaultValue) 

Document

Get the value of the name property as a long; if no such property exists, the provided default value is returned.

Usage

From source file:org.apache.mahout.clustering.lda.cvb.CachingCVB0Mapper.java

License:Apache License

@Override
protected void setup(Context context) throws IOException, InterruptedException {
    log.info("Retrieving configuration");
    Configuration conf = context.getConfiguration();
    float eta = conf.getFloat(CVB0Driver.TERM_TOPIC_SMOOTHING, Float.NaN);
    float alpha = conf.getFloat(CVB0Driver.DOC_TOPIC_SMOOTHING, Float.NaN);
    long seed = conf.getLong(CVB0Driver.RANDOM_SEED, 1234L);
    numTopics = conf.getInt(CVB0Driver.NUM_TOPICS, -1);
    int numTerms = conf.getInt(CVB0Driver.NUM_TERMS, -1);
    int numUpdateThreads = conf.getInt(CVB0Driver.NUM_UPDATE_THREADS, 1);
    int numTrainThreads = conf.getInt(CVB0Driver.NUM_TRAIN_THREADS, 4);
    maxIters = conf.getInt(CVB0Driver.MAX_ITERATIONS_PER_DOC, 10);
    float modelWeight = conf.getFloat(CVB0Driver.MODEL_WEIGHT, 1.0f);

    log.info("Initializing read model");
    Path[] modelPaths = CVB0Driver.getModelPaths(conf);
    if (modelPaths != null && modelPaths.length > 0) {
        readModel = new TopicModel(conf, eta, alpha, null, numUpdateThreads, modelWeight, modelPaths);
    } else {
        log.info("No model files found");
        readModel = new TopicModel(numTopics, numTerms, eta, alpha, RandomUtils.getRandom(seed), null,
                numTrainThreads, modelWeight);
    }

    log.info("Initializing write model");
    writeModel = modelWeight == 1 ? new TopicModel(numTopics, numTerms, eta, alpha, null, numUpdateThreads)
            : readModel;

    log.info("Initializing model trainer");
    modelTrainer = new ModelTrainer(readModel, writeModel, numTrainThreads, numTopics, numTerms);
    modelTrainer.start();
}

From source file:org.apache.mahout.clustering.lda.cvb.CachingCVB0PerplexityMapper.java

License:Apache License

@Override
protected void setup(Context context) throws IOException, InterruptedException {
    MemoryUtil.startMemoryLogger(5000);

    log.info("Retrieving configuration");
    Configuration conf = context.getConfiguration();
    float eta = conf.getFloat(CVB0Driver.TERM_TOPIC_SMOOTHING, Float.NaN);
    float alpha = conf.getFloat(CVB0Driver.DOC_TOPIC_SMOOTHING, Float.NaN);
    long seed = conf.getLong(CVB0Driver.RANDOM_SEED, 1234L);
    random = RandomUtils.getRandom(seed);
    numTopics = conf.getInt(CVB0Driver.NUM_TOPICS, -1);
    int numTerms = conf.getInt(CVB0Driver.NUM_TERMS, -1);
    int numUpdateThreads = conf.getInt(CVB0Driver.NUM_UPDATE_THREADS, 1);
    int numTrainThreads = conf.getInt(CVB0Driver.NUM_TRAIN_THREADS, 4);
    maxIters = conf.getInt(CVB0Driver.MAX_ITERATIONS_PER_DOC, 10);
    float modelWeight = conf.getFloat(CVB0Driver.MODEL_WEIGHT, 1.0f);
    testFraction = conf.getFloat(CVB0Driver.TEST_SET_FRACTION, 0.1f);

    log.info("Initializing read model");
    Path[] modelPaths = CVB0Driver.getModelPaths(conf);
    if (modelPaths != null && modelPaths.length > 0) {
        readModel = new TopicModel(conf, eta, alpha, null, numUpdateThreads, modelWeight, modelPaths);
    } else {
        log.info("No model files found");
        readModel = new TopicModel(numTopics, numTerms, eta, alpha, RandomUtils.getRandom(seed), null,
                numTrainThreads, modelWeight);
    }

    log.info("Initializing model trainer");
    modelTrainer = new ModelTrainer(readModel, null, numTrainThreads, numTopics, numTerms);

    log.info("Initializing topic vector");
    topicVector = new DenseVector(new double[numTopics]);
}

From source file:org.apache.mahout.clustering.lda.cvb.CVBConfig.java

License:Apache License

public CVBConfig read(Configuration conf) {
    setNumTopics(conf.getInt(NUM_TOPICS_PARAM, 0));
    setNumTerms(conf.getInt(NUM_TERMS_PARAM, 0));
    setAlpha(conf.getFloat(DOC_TOPIC_SMOOTHING_PARAM, 0));
    setEta(conf.getFloat(TERM_TOPIC_SMOOTHING_PARAM, 0));
    setRandomSeed(conf.getLong(RANDOM_SEED_PARAM, 0));
    setTestFraction(conf.getFloat(TEST_SET_FRACTION_PARAM, 0));
    setNumTrainThreads(conf.getInt(NUM_TRAIN_THREADS_PARAM, 0));
    setNumUpdateThreads(conf.getInt(NUM_UPDATE_THREADS_PARAM, 0));
    setMaxItersPerDoc(conf.getInt(MAX_ITERATIONS_PER_DOC_PARAM, 0));
    setModelWeight(conf.getFloat(MODEL_WEIGHT_PARAM, 0));
    setUseOnlyLabeledDocs(conf.getBoolean(ONLY_LABELED_DOCS_PARAM, false));
    setMinRelPreplexityDiff(conf.getFloat(MIN_RELATIVE_PERPLEXITY_DIFF_PARAM, -1));
    setMaxInferenceItersPerDoc(conf.getInt(MAX_INFERENCE_ITERATIONS_PER_DOC_PARAM, 0));
    check();
    return this;
}

From source file:org.apache.mahout.fpm.hadoop.util.SplitByNumberOfMappersTextInputFormat.java

License:Apache License

/**
 * Gets the total number of lines from the file. If Config.NUMBER_OF_LINES_KEY is set, this value is returned.
 *
 * @param conf
 *          hadoop configuration object
 * @param fileName
 *          name of file to count
 * @return the number of lines in the file
 * @throws IOException
 */
private static long getTotalNumberOfLines(Configuration conf, Path fileName) throws IOException {
    long nrLines = conf.getLong(Config.NUMBER_OF_LINES_KEY, -1);
    if (nrLines != -1) {
        return nrLines;
    }

    FileSystem fs = fileName.getFileSystem(conf);
    LineReader lr = null;
    try {
        FSDataInputStream in = fs.open(fileName);
        lr = new LineReader(in, conf);
        Text text = new Text();
        nrLines = 0;
        while (lr.readLine(text) > 0) {
            nrLines++;
        }
        return nrLines;
    } catch (IOException e) {
        e.printStackTrace();
    }
    return 0;
}

From source file:org.apache.mahout.utils.nlp.collocations.llr.LLRReducer.java

License:Apache License

@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    Configuration conf = context.getConfiguration();
    this.ngramTotal = conf.getLong(NGRAM_TOTAL, -1);
    this.minLLRValue = conf.getFloat(MIN_LLR, DEFAULT_MIN_LLR);

    this.emitUnigrams = conf.getBoolean(CollocDriver.EMIT_UNIGRAMS, CollocDriver.DEFAULT_EMIT_UNIGRAMS);

    if (log.isInfoEnabled()) {
        log.info("NGram Total is {}", ngramTotal);
        log.info("Min LLR value is {}", minLLRValue);
        log.info("Emit Unitgrams is {}", emitUnigrams);
    }

    if (ngramTotal == -1) {
        throw new IllegalStateException("No NGRAM_TOTAL available in job config");
    }
}

From source file:org.apache.mahout.utils.vectors.tfidf.TFIDFPartialVectorReducer.java

License:Apache License

@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    try {
        Configuration conf = context.getConfiguration();
        URI[] localFiles = DistributedCache.getCacheFiles(conf);
        if (localFiles == null || localFiles.length < 1) {
            throw new IllegalArgumentException("missing paths from the DistributedCache");
        }

        vectorCount = conf.getLong(TFIDFConverter.VECTOR_COUNT, 1);
        featureCount = conf.getLong(TFIDFConverter.FEATURE_COUNT, 1);
        minDf = conf.getInt(TFIDFConverter.MIN_DF, 1);
        maxDfPercent = conf.getInt(TFIDFConverter.MAX_DF_PERCENTAGE, 99);
        sequentialAccess = conf.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false);

        Path dictionaryFile = new Path(localFiles[0].getPath());
        FileSystem fs = dictionaryFile.getFileSystem(conf);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, dictionaryFile, conf);
        IntWritable key = new IntWritable();
        LongWritable value = new LongWritable();

        // key is feature, value is the document frequency
        while (reader.next(key, value)) {
            dictionary.put(key.get(), value.get());
        }
    } catch (IOException e) {
        throw new IllegalStateException(e);
    }
}

From source file:org.apache.mahout.vectorizer.collocations.llr.LLRReducer.java

License:Apache License

@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    Configuration conf = context.getConfiguration();
    this.ngramTotal = conf.getLong(NGRAM_TOTAL, -1);
    this.minLLRValue = conf.getFloat(MIN_LLR, DEFAULT_MIN_LLR);

    this.emitUnigrams = conf.getBoolean(CollocDriver.EMIT_UNIGRAMS, CollocDriver.DEFAULT_EMIT_UNIGRAMS);

    log.info("NGram Total: {}, Min LLR value: {}, Emit Unigrams: {}", ngramTotal, minLLRValue, emitUnigrams);

    if (ngramTotal == -1) {
        throw new IllegalStateException("No NGRAM_TOTAL available in job config");
    }
}

From source file:org.apache.mahout.vectorizer.pruner.WordsPrunerReducer.java

License:Apache License

@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    Configuration conf = context.getConfiguration();
    //Path[] localFiles = HadoopUtil.getCachedFiles(conf);

    maxDf = conf.getLong(HighDFWordsPruner.MAX_DF, Long.MAX_VALUE);
    minDf = conf.getLong(HighDFWordsPruner.MIN_DF, -1);

    Path dictionaryFile = HadoopUtil.getSingleCachedFile(conf);

    // key is feature, value is the document frequency
    for (Pair<IntWritable, LongWritable> record : new SequenceFileIterable<IntWritable, LongWritable>(
            dictionaryFile, true, conf)) {
        dictionary.put(record.getFirst().get(), record.getSecond().get());
    }
}

From source file:org.apache.mahout.vectorizer.tfidf.TFIDFPartialVectorReducer.java

License:Apache License

@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    Configuration conf = context.getConfiguration();

    vectorCount = conf.getLong(TFIDFConverter.VECTOR_COUNT, 1);
    featureCount = conf.getLong(TFIDFConverter.FEATURE_COUNT, 1);
    minDf = conf.getInt(TFIDFConverter.MIN_DF, 1);
    maxDf = conf.getLong(TFIDFConverter.MAX_DF, -1);
    sequentialAccess = conf.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false);
    namedVector = conf.getBoolean(PartialVectorMerger.NAMED_VECTOR, false);

    URI[] localFiles = DistributedCache.getCacheFiles(conf);
    Path dictionaryFile = HadoopUtil.findInCacheByPartOfFilename(TFIDFConverter.FREQUENCY_FILE, localFiles);
    // key is feature, value is the document frequency
    for (Pair<IntWritable, LongWritable> record : new SequenceFileIterable<IntWritable, LongWritable>(
            dictionaryFile, true, conf)) {
        dictionary.put(record.getFirst().get(), record.getSecond().get());
    }
}

From source file:org.apache.mnemonic.hadoop.MneConfigHelper.java

License:Apache License

public static long getSlotKeyId(Configuration conf, String prefix) {
    return conf.getLong(getConfigName(prefix, SLOT_KEY_ID), 0L);
}