List of usage examples for org.apache.hadoop.conf.Configuration getBoolean

public boolean getBoolean(String name, boolean defaultValue)

Gets the value of the name property as a boolean. If no such property is set, or if the stored value is not a valid boolean, defaultValue is returned.
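A minimal standalone sketch of the call itself (the property names here are hypothetical, chosen only for illustration):

import org.apache.hadoop.conf.Configuration;

public class GetBooleanExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Hypothetical property name, set here so the lookup below succeeds.
        conf.setBoolean("example.feature.enabled", true);

        // Returns the stored value when the property is present and parseable...
        boolean enabled = conf.getBoolean("example.feature.enabled", false);
        // ...and falls back to the supplied default when the property is unset.
        boolean missing = conf.getBoolean("example.other.flag", false);

        System.out.println(enabled);  // true
        System.out.println(missing);  // false (the default)
    }
}

The examples below all follow the same pattern: a Mahout mapper or reducer reads boolean flags out of the job Configuration in its setup() method.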
From source file: org.apache.mahout.utils.nlp.collocations.llr.LLRReducer.java
License: Apache License

@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    Configuration conf = context.getConfiguration();
    this.ngramTotal = conf.getLong(NGRAM_TOTAL, -1);
    this.minLLRValue = conf.getFloat(MIN_LLR, DEFAULT_MIN_LLR);
    this.emitUnigrams = conf.getBoolean(CollocDriver.EMIT_UNIGRAMS, CollocDriver.DEFAULT_EMIT_UNIGRAMS);
    if (log.isInfoEnabled()) {
        log.info("NGram Total is {}", ngramTotal);
        log.info("Min LLR value is {}", minLLRValue);
        log.info("Emit Unigrams is {}", emitUnigrams);
    }
    if (ngramTotal == -1) {
        throw new IllegalStateException("No NGRAM_TOTAL available in job config");
    }
}
From source file: org.apache.mahout.utils.vectors.common.PartialVectorMergeReducer.java
License: Apache License

@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    Configuration conf = context.getConfiguration();
    normPower = conf.getFloat(PartialVectorMerger.NORMALIZATION_POWER, PartialVectorMerger.NO_NORMALIZING);
    dimension = conf.getInt(PartialVectorMerger.DIMENSION, Integer.MAX_VALUE);
    sequentialAccess = conf.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false);
}
From source file: org.apache.mahout.utils.vectors.text.term.TFPartialVectorReducer.java
License: Apache License

@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    Configuration conf = context.getConfiguration();
    try {
        dimension = conf.getInt(PartialVectorMerger.DIMENSION, Integer.MAX_VALUE);
        sequentialAccess = conf.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false);
        maxNGramSize = conf.getInt(DictionaryVectorizer.MAX_NGRAMS, maxNGramSize);
        URI[] localFiles = DistributedCache.getCacheFiles(conf);
        if (localFiles == null || localFiles.length < 1) {
            throw new IllegalArgumentException("missing paths from the DistributedCache");
        }
        Path dictionaryFile = new Path(localFiles[0].getPath());
        FileSystem fs = dictionaryFile.getFileSystem(conf);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, dictionaryFile, conf);
        Writable key = new Text();
        IntWritable value = new IntWritable();
        // key is the word, value is its id
        while (reader.next(key, value)) {
            dictionary.put(key.toString(), value.get());
        }
    } catch (IOException e) {
        throw new IllegalStateException(e);
    }
}
From source file: org.apache.mahout.utils.vectors.tfidf.TFIDFPartialVectorReducer.java
License: Apache License

@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    try {
        Configuration conf = context.getConfiguration();
        URI[] localFiles = DistributedCache.getCacheFiles(conf);
        if (localFiles == null || localFiles.length < 1) {
            throw new IllegalArgumentException("missing paths from the DistributedCache");
        }
        vectorCount = conf.getLong(TFIDFConverter.VECTOR_COUNT, 1);
        featureCount = conf.getLong(TFIDFConverter.FEATURE_COUNT, 1);
        minDf = conf.getInt(TFIDFConverter.MIN_DF, 1);
        maxDfPercent = conf.getInt(TFIDFConverter.MAX_DF_PERCENTAGE, 99);
        sequentialAccess = conf.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false);
        Path dictionaryFile = new Path(localFiles[0].getPath());
        FileSystem fs = dictionaryFile.getFileSystem(conf);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, dictionaryFile, conf);
        IntWritable key = new IntWritable();
        LongWritable value = new LongWritable();
        // key is the feature, value is its document frequency
        while (reader.next(key, value)) {
            dictionary.put(key.get(), value.get());
        }
    } catch (IOException e) {
        throw new IllegalStateException(e);
    }
}
From source file: org.apache.mahout.vectorizer.collocations.llr.LLRReducer.java
License: Apache License

@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    Configuration conf = context.getConfiguration();
    this.ngramTotal = conf.getLong(NGRAM_TOTAL, -1);
    this.minLLRValue = conf.getFloat(MIN_LLR, DEFAULT_MIN_LLR);
    this.emitUnigrams = conf.getBoolean(CollocDriver.EMIT_UNIGRAMS, CollocDriver.DEFAULT_EMIT_UNIGRAMS);
    log.info("NGram Total: {}, Min LLR value: {}, Emit Unigrams: {}", ngramTotal, minLLRValue, emitUnigrams);
    if (ngramTotal == -1) {
        throw new IllegalStateException("No NGRAM_TOTAL available in job config");
    }
}
From source file: org.apache.mahout.vectorizer.common.PartialVectorMergeReducer.java
License: Apache License

@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    Configuration conf = context.getConfiguration();
    normPower = conf.getFloat(PartialVectorMerger.NORMALIZATION_POWER, PartialVectorMerger.NO_NORMALIZING);
    dimension = conf.getInt(PartialVectorMerger.DIMENSION, Integer.MAX_VALUE);
    sequentialAccess = conf.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false);
    namedVector = conf.getBoolean(PartialVectorMerger.NAMED_VECTOR, false);
    logNormalize = conf.getBoolean(PartialVectorMerger.LOG_NORMALIZE, false);
}
From source file: org.apache.mahout.vectorizer.EncodingMapper.java
License: Apache License

@Override
protected void setup(Context context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    sequentialVectors = conf.getBoolean(USE_SEQUENTIAL, false);
    namedVectors = conf.getBoolean(USE_NAMED_VECTORS, false);
    String analyzerName = conf.get(ANALYZER_NAME, StandardAnalyzer.class.getName());
    Analyzer analyzer;
    try {
        analyzer = AnalyzerUtils.createAnalyzer(analyzerName);
    } catch (ClassNotFoundException e) {
        // TODO: hmmm, don't like this approach
        throw new IOException("Unable to create Analyzer for name: " + analyzerName, e);
    }
    String encoderName = conf.get(ENCODER_FIELD_NAME, "text");
    cardinality = conf.getInt(CARDINALITY, 5000);
    String encClass = conf.get(ENCODER_CLASS);
    encoder = ClassUtils.instantiateAs(encClass, FeatureVectorEncoder.class,
        new Class[] { String.class }, new Object[] { encoderName });
    if (encoder instanceof LuceneTextValueEncoder) {
        ((LuceneTextValueEncoder) encoder).setAnalyzer(analyzer);
    }
}
From source file: org.apache.mahout.vectorizer.pruner.PrunedPartialVectorMergeReducer.java
License: Apache License

@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    Configuration conf = context.getConfiguration();
    normPower = conf.getFloat(PartialVectorMerger.NORMALIZATION_POWER, PartialVectorMerger.NO_NORMALIZING);
    logNormalize = conf.getBoolean(PartialVectorMerger.LOG_NORMALIZE, false);
}
From source file: org.apache.mahout.vectorizer.term.TFPartialVectorReducer.java
License: Apache License

@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    Configuration conf = context.getConfiguration();
    dimension = conf.getInt(PartialVectorMerger.DIMENSION, Integer.MAX_VALUE);
    sequentialAccess = conf.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false);
    namedVector = conf.getBoolean(PartialVectorMerger.NAMED_VECTOR, false);
    maxNGramSize = conf.getInt(DictionaryVectorizer.MAX_NGRAMS, maxNGramSize);
    URI[] localFiles = DistributedCache.getCacheFiles(conf);
    Path dictionaryFile = HadoopUtil.findInCacheByPartOfFilename(DictionaryVectorizer.DICTIONARY_FILE, localFiles);
    // key is the word, value is its id
    for (Pair<Writable, IntWritable> record
            : new SequenceFileIterable<Writable, IntWritable>(dictionaryFile, true, conf)) {
        dictionary.put(record.getFirst().toString(), record.getSecond().get());
    }
}
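All of the setup() methods above read booleans that a driver wrote into the job Configuration before submission. A minimal driver-side sketch of that write (the property names are hypothetical; the Mahout jobs above use their own constants, such as PartialVectorMerger.SEQUENTIAL_ACCESS, as keys):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class BooleanFlagDriverSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical flag names, for illustration only.
        conf.setBoolean("example.sequential.access", true);
        conf.setBoolean("example.named.vector", false);

        // The configuration is captured when the job is created; tasks later
        // read the same flags back via context.getConfiguration().getBoolean(...).
        Job job = Job.getInstance(conf, "boolean-flag-example");
        // ... configure mapper/reducer, input and output paths, then submit.
    }
}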