List of usage examples for org.apache.hadoop.conf Configuration getBoolean
public boolean getBoolean(String name, boolean defaultValue)
Get the value of the name property as a boolean.
From source file: org.apache.mahout.clustering.streaming.mapreduce.StreamingKMeansReducer.java
License:Apache License
/**
 * Runs {@link BallKMeans} over the supplied centroids, taking every clustering parameter from the
 * job configuration.
 *
 * <p>The two boolean settings are stored in the configuration as opt-out flags
 * ({@code RANDOM_INIT}, {@code IGNORE_WEIGHTS}); both default to {@code false} and are negated
 * before being handed to {@link BallKMeans}.
 *
 * @param centroids the centroids to cluster
 * @param conf the configuration the clustering parameters are read from
 * @return whatever {@code BallKMeans.cluster(centroids)} returns
 */
public static Iterable<Vector> getBestCentroids(List<Centroid> centroids, Configuration conf) {
  if (log.isInfoEnabled()) {
    log.info("Number of Centroids: {}", centroids.size());
  }

  // Numeric parameters, each with its fallback value.
  final int clusterCount = conf.getInt(DefaultOptionCreator.NUM_CLUSTERS_OPTION, 1);
  final int iterationLimit = conf.getInt(StreamingKMeansDriver.MAX_NUM_ITERATIONS, 10);
  final float trim = conf.getFloat(StreamingKMeansDriver.TRIM_FRACTION, 0.9f);

  // Opt-out flags: the feature is on unless the configuration disables it.
  final boolean randomInit = conf.getBoolean(StreamingKMeansDriver.RANDOM_INIT, false);
  final boolean ignoreWeights = conf.getBoolean(StreamingKMeansDriver.IGNORE_WEIGHTS, false);

  final float testFraction = conf.getFloat(StreamingKMeansDriver.TEST_PROBABILITY, 0.1f);
  final int runCount = conf.getInt(StreamingKMeansDriver.NUM_BALLKMEANS_RUNS, 3);

  BallKMeans clusterer = new BallKMeans(
      StreamingKMeansUtilsMR.searcherFromConfiguration(conf),
      clusterCount,
      iterationLimit,
      trim,
      !randomInit,
      !ignoreWeights,
      testFraction,
      runCount);
  return clusterer.cluster(centroids);
}
From source file:org.apache.mahout.df.mapred.Builder.java
License:Apache License
/**
 * Reads the {@code "mahout.rf.oob"} flag from the configuration.
 *
 * @param conf the configuration to query
 * @return the configured value, or {@code false} when the property is not set
 */
protected static boolean isOobEstimate(Configuration conf) {
  final boolean oobRequested = conf.getBoolean("mahout.rf.oob", false);
  return oobRequested;
}
From source file:org.apache.mahout.df.mapred.partial.PartialBuilder.java
License:Apache License
/**
 * Tells whether the second step of the builder should run.
 *
 * <p>The flag exists for debugging only, which is why the accessor is kept protected.
 *
 * @param conf the configuration to query
 * @return the value of {@code "debug.mahout.rf.partial.step2"}, or {@code true} when the
 *         property is not set
 */
protected static boolean isStep2(Configuration conf) {
  final boolean runSecondStep = conf.getBoolean("debug.mahout.rf.partial.step2", true);
  return runSecondStep;
}
From source file:org.apache.mahout.ga.watchmaker.cd.hadoop.DatasetSplit.java
License:Apache License
static boolean isTraining(Configuration conf) { if (conf.get(TRAINING) == null) { throw new IllegalArgumentException("TRAINING job parameter not found"); }//from w ww. ja v a 2 s. c o m return conf.getBoolean(TRAINING, true); }
From source file:org.apache.mahout.knn.experimental.StreamingKMeansDriver.java
License:Apache License
/**
 * Configures and runs the StreamingKMeans MapReduce job over the input vectors, then optionally
 * summarizes the resulting clustering.
 *
 * @param conf the configuration the job is created from; also consulted for the
 *             {@code "logDebug"} (default {@code false}) and {@code "summarize"} (default
 *             {@code true}) flags
 * @param input the directory pathname for input points
 * @param output the directory pathname for output points
 * @throws InterruptedException if {@code waitForCompletion} reports that the job did not succeed
 */
public static void run(Configuration conf, Path input, Path output)
    throws IOException, InterruptedException, ClassNotFoundException {
  log.info("Starting StreamingKMeans clustering for vectors in {}; results are output to {}",
      input.toString(), output.toString());

  // Prepare Job for submission.
  Job streamingJob = new Job(conf, "StreamingKMeans");

  // Input and output file format.
  streamingJob.setInputFormatClass(SequenceFileInputFormat.class);
  streamingJob.setOutputFormatClass(SequenceFileOutputFormat.class);

  // Mapper output Key and Value classes.
  // We don't really need to output anything as a key, since there will only be 1 reducer.
  streamingJob.setMapOutputKeyClass(IntWritable.class);
  streamingJob.setMapOutputValueClass(CentroidWritable.class);

  // Reducer output Key and Value classes.
  streamingJob.setOutputKeyClass(IntWritable.class);
  streamingJob.setOutputValueClass(CentroidWritable.class);

  // Mapper and Reducer classes.
  streamingJob.setMapperClass(StreamingKMeansMapper.class);
  streamingJob.setReducerClass(StreamingKMeansReducer.class);

  // There is only one reducer so that the intermediate centroids get collected on one
  // machine and are clustered in memory to get the right number of clusters.
  streamingJob.setNumReduceTasks(1);

  // Set input and output paths for the job.
  FileInputFormat.addInputPath(streamingJob, input);
  FileOutputFormat.setOutputPath(streamingJob, output);

  // Set the JAR (so that the required libraries are available) and run.
  streamingJob.setJarByClass(StreamingKMeansDriver.class);

  // Prevent StreamingKMeans class from logging debug output by default.
  // TODO(dfilimon): Remove this completely and configure using log files.
  boolean debugLoggingRequested = conf.getBoolean("logDebug", false);
  if (!debugLoggingRequested) {
    LoggerContext loggerContext = (LoggerContext) LoggerFactory.getILoggerFactory();
    loggerContext.getLogger(StreamingKMeans.class).setLevel(Level.INFO);
  }

  long startMillis = System.currentTimeMillis();
  boolean succeeded = streamingJob.waitForCompletion(true);
  if (!succeeded) {
    throw new InterruptedException("StreamingKMeans interrupted");
  }
  long endMillis = System.currentTimeMillis();

  boolean summarizeRequested = conf.getBoolean("summarize", true);
  if (summarizeRequested) {
    EvaluateClustering.summarize(conf, output, log);
  }

  // The reported duration covers the job only, not the summarization above.
  log.info("StreamingKMeans clustering complete. Results are in {}. Took {} ms", output.toString(),
      endMillis - startMillis);
}
From source file:org.apache.mahout.text.SequenceFilesFromMailArchivesMapper.java
License:Apache License
@Override public void setup(Context context) throws IOException, InterruptedException { Configuration configuration = context.getConfiguration(); // absorb all of the options into the MailOptions object this.options = new MailOptions(); options.setPrefix(configuration.get(KEY_PREFIX_OPTION[1], "")); if (!configuration.get(CHUNK_SIZE_OPTION[0], "").equals("")) { options.setChunkSize(configuration.getInt(CHUNK_SIZE_OPTION[0], 64)); }/*from w w w. j a v a 2s. c om*/ if (!configuration.get(CHARSET_OPTION[0], "").equals("")) { Charset charset = Charset.forName(configuration.get(CHARSET_OPTION[0], "UTF-8")); options.setCharset(charset); } else { Charset charset = Charset.forName("UTF-8"); options.setCharset(charset); } List<Pattern> patterns = Lists.newArrayListWithCapacity(5); // patternOrder is used downstream so that we can know what order the // text is in instead // of encoding it in the string, which // would require more processing later to remove it pre feature // selection. Map<String, Integer> patternOrder = Maps.newHashMap(); int order = 0; if (!configuration.get(FROM_OPTION[1], "").equals("")) { patterns.add(MailProcessor.FROM_PREFIX); patternOrder.put(MailOptions.FROM, order++); } if (!configuration.get(TO_OPTION[1], "").equals("")) { patterns.add(MailProcessor.TO_PREFIX); patternOrder.put(MailOptions.TO, order++); } if (!configuration.get(REFERENCES_OPTION[1], "").equals("")) { patterns.add(MailProcessor.REFS_PREFIX); patternOrder.put(MailOptions.REFS, order++); } if (!configuration.get(SUBJECT_OPTION[1], "").equals("")) { patterns.add(MailProcessor.SUBJECT_PREFIX); patternOrder.put(MailOptions.SUBJECT, order += 1); } options.setStripQuotedText(configuration.getBoolean(STRIP_QUOTED_OPTION[1], false)); options.setPatternsToMatch(patterns.toArray(new Pattern[patterns.size()])); options.setPatternOrder(patternOrder); options.setIncludeBody(configuration.getBoolean(BODY_OPTION[1], false)); options.setSeparator("\n"); if (!configuration.get(SEPARATOR_OPTION[1], 
"").equals("")) { options.setSeparator(configuration.get(SEPARATOR_OPTION[1], "")); } if (!configuration.get(BODY_SEPARATOR_OPTION[1], "").equals("")) { options.setBodySeparator(configuration.get(BODY_SEPARATOR_OPTION[1], "")); } if (!configuration.get(QUOTED_REGEX_OPTION[1], "").equals("")) { options.setQuotedTextPattern(Pattern.compile(configuration.get(QUOTED_REGEX_OPTION[1], ""))); } }
From source file:org.apache.mahout.text.wikipedia.WikipediaDatasetCreatorMapper.java
License:Apache License
@Override protected void setup(Context context) throws IOException, InterruptedException { super.setup(context); Configuration conf = context.getConfiguration(); if (inputCategories == null) { Set<String> newCategories = Sets.newHashSet(); DefaultStringifier<Set<String>> setStringifier = new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(newCategories)); String categoriesStr = conf.get("wikipedia.categories", setStringifier.toString(newCategories)); Set<String> inputCategoriesSet = setStringifier.fromString(categoriesStr); inputCategories = Lists.newArrayList(inputCategoriesSet); inputCategoryPatterns = Lists.newArrayListWithCapacity(inputCategories.size()); for (String inputCategory : inputCategories) { inputCategoryPatterns.add(Pattern.compile(".*\\b" + inputCategory + "\\b.*")); }//w w w.ja va 2 s .co m } exactMatchOnly = conf.getBoolean("exact.match.only", false); if (analyzer == null) { String analyzerStr = conf.get("analyzer.class", WikipediaAnalyzer.class.getName()); analyzer = ClassUtils.instantiateAs(analyzerStr, Analyzer.class); } log.info("Configure: Input Categories size: {} Exact Match: {} Analyzer: {}", inputCategories.size(), exactMatchOnly, analyzer.getClass().getName()); }
From source file:org.apache.mahout.text.wikipedia.WikipediaMapper.java
License:Apache License
/**
 * Initializes the mapper from the job configuration: the set of input categories and the
 * {@code "exact.match.only"}, {@code "all.files"} and {@code "remove.labels"} flags (each
 * defaulting to {@code false}).
 *
 * @param context the task context whose configuration is read
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  super.setup(context);
  Configuration conf = context.getConfiguration();

  Set<String> newCategories = Sets.newHashSet();
  DefaultStringifier<Set<String>> setStringifier =
      new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(newCategories));

  // Default to the serialized empty set so that an unset "wikipedia.categories" property yields
  // an empty category set instead of handing null to fromString.
  String categoriesStr = conf.get("wikipedia.categories", setStringifier.toString(newCategories));
  inputCategories = setStringifier.fromString(categoriesStr);

  exactMatchOnly = conf.getBoolean("exact.match.only", false);
  all = conf.getBoolean("all.files", false);
  removeLabels = conf.getBoolean("remove.labels", false);

  log.info("Configure: Input Categories size: {} All: {} Exact Match: {} Remove Labels from Text: {}",
      inputCategories.size(), all, exactMatchOnly, removeLabels);
}
From source file:org.apache.mahout.text.WikipediaMapper.java
License:Apache License
@Override protected void setup(Context context) throws IOException, InterruptedException { super.setup(context); Configuration conf = context.getConfiguration(); try {//from ww w . jav a 2s. co m if (inputCategories == null) { Set<String> newCategories = new HashSet<String>(); DefaultStringifier<Set<String>> setStringifier = new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(newCategories)); String categoriesStr = setStringifier.toString(newCategories); categoriesStr = conf.get("wikipedia.categories", categoriesStr); inputCategories = setStringifier.fromString(categoriesStr); } exactMatchOnly = conf.getBoolean("exact.match.only", false); all = conf.getBoolean("all.files", true); } catch (IOException ex) { throw new IllegalStateException(ex); } log.info("Configure: Input Categories size: {} All: {} Exact Match: {}", new Object[] { inputCategories.size(), all, exactMatchOnly }); }
From source file:org.apache.mahout.utils.nlp.collocations.llr.CollocReducer.java
License:Apache License
/**
 * Initializes the reducer from the job configuration: stores the minimum support threshold and
 * logs it together with the emit-unigrams setting.
 *
 * @param context the task context whose configuration is read
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  super.setup(context);
  Configuration jobConf = context.getConfiguration();

  this.minSupport = jobConf.getInt(MIN_SUPPORT, DEFAULT_MIN_SUPPORT);
  log.info("Min support is {}", minSupport);

  // The unigram flag is only reported here; it is not stored on the reducer.
  boolean unigramsEnabled =
      jobConf.getBoolean(CollocDriver.EMIT_UNIGRAMS, CollocDriver.DEFAULT_EMIT_UNIGRAMS);
  log.info("Emit Unitgrams is {}", unigramsEnabled);
}