List of usage examples for org.apache.hadoop.conf.Configuration.setFloat

public void setFloat(String name, float value)

Sets the value of the name property to a float.

Parameters:
    name - property name
    value - property value
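Before the project examples below, here is a minimal sketch of the call in isolation. The property name example.sample.rate is hypothetical, used only for illustration; the value is read back with getFloat, whose second argument is the default returned when the property is unset.

import org.apache.hadoop.conf.Configuration;

public class SetFloatExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Store a float under a (hypothetical) property name.
        conf.setFloat("example.sample.rate", 0.25f);
        // Read it back; 1.0f is the default used if the property were unset.
        float rate = conf.getFloat("example.sample.rate", 1.0f);
        System.out.println("example.sample.rate = " + rate);
    }
}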
From source file:org.apache.hcatalog.mapreduce.TestHCatInputFormat.java
License:Apache License
private boolean runJob(float badRecordThreshold) throws Exception {
    Configuration conf = new Configuration();
    conf.setFloat(HCatConstants.HCAT_INPUT_BAD_RECORD_THRESHOLD_KEY, badRecordThreshold);

    Job job = new Job(conf);
    job.setJarByClass(this.getClass());
    job.setMapperClass(MyMapper.class);
    job.setInputFormatClass(HCatInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    HCatInputFormat.setInput(job, "default", "test_bad_records");
    job.setMapOutputKeyClass(HCatRecord.class);
    job.setMapOutputValueClass(HCatRecord.class);
    job.setNumReduceTasks(0);

    Path path = new Path(TEST_DATA_DIR, "test_bad_record_handling_output");
    if (path.getFileSystem(conf).exists(path)) {
        path.getFileSystem(conf).delete(path, true);
    }
    TextOutputFormat.setOutputPath(job, path);

    return job.waitForCompletion(true);
}
From source file:org.apache.kylin.storage.hbase.steps.HFileOutputFormat3.java
License:Apache License
static <V extends Cell> RecordWriter<ImmutableBytesWritable, V> createRecordWriter(
        final TaskAttemptContext context, final OutputCommitter committer)
        throws IOException, InterruptedException {

    // Get the path of the temporary output file
    final Path outputdir = ((FileOutputCommitter) committer).getWorkPath();
    final Configuration conf = context.getConfiguration();
    LOG.debug("Task output path: " + outputdir);
    final FileSystem fs = outputdir.getFileSystem(conf);

    // These configs. are from hbase-*.xml
    final long maxsize = conf.getLong(HConstants.HREGION_MAX_FILESIZE, HConstants.DEFAULT_MAX_FILE_SIZE);

    // Invented config. Add to hbase-*.xml if other than default compression.
    final String defaultCompressionStr = conf.get("hfile.compression", Compression.Algorithm.NONE.getName());
    final Algorithm defaultCompression = AbstractHFileWriter.compressionByName(defaultCompressionStr);
    final boolean compactionExclude = conf.getBoolean("hbase.mapreduce.hfileoutputformat.compaction.exclude",
            false);

    // Create a map from column family to the compression algorithm.
    final Map<byte[], Algorithm> compressionMap = createFamilyCompressionMap(conf);
    final Map<byte[], BloomType> bloomTypeMap = createFamilyBloomTypeMap(conf);
    final Map<byte[], Integer> blockSizeMap = createFamilyBlockSizeMap(conf);

    String dataBlockEncodingStr = conf.get(DATABLOCK_ENCODING_OVERRIDE_CONF_KEY);
    final Map<byte[], DataBlockEncoding> datablockEncodingMap = createFamilyDataBlockEncodingMap(conf);
    final DataBlockEncoding overriddenEncoding;
    if (dataBlockEncodingStr != null) {
        overriddenEncoding = DataBlockEncoding.valueOf(dataBlockEncodingStr);
    } else {
        overriddenEncoding = null;
    }

    return new RecordWriter<ImmutableBytesWritable, V>() {
        // Map of families to writers and how much has been output on the writer.
        private final Map<byte[], WriterLength> writers = new TreeMap<byte[], WriterLength>(
                Bytes.BYTES_COMPARATOR);
        private byte[] previousRow = HConstants.EMPTY_BYTE_ARRAY;
        private final byte[] now = Bytes.toBytes(System.currentTimeMillis());
        private boolean rollRequested = false;

        @Override
        public void write(ImmutableBytesWritable row, V cell) throws IOException {
            KeyValue kv = KeyValueUtil.ensureKeyValue(cell);

            // Null input signals an explicit flush.
            if (row == null && kv == null) {
                rollWriters();
                return;
            }

            byte[] rowKey = CellUtil.cloneRow(kv);
            long length = kv.getLength();
            byte[] family = CellUtil.cloneFamily(kv);
            WriterLength wl = this.writers.get(family);

            // If this is a new column family, make sure its directory exists.
            if (wl == null) {
                fs.mkdirs(new Path(outputdir, Bytes.toString(family)));
            }

            // If this writer's HFile has reached maxsize, request a roll of all writers.
            if (wl != null && wl.written + length >= maxsize) {
                this.rollRequested = true;
            }

            // Rolling may only happen at a row boundary.
            if (rollRequested && Bytes.compareTo(this.previousRow, rowKey) != 0) {
                rollWriters();
            }

            if (wl == null || wl.writer == null) {
                wl = getNewWriter(family, conf);
            }

            kv.updateLatestStamp(this.now);
            wl.writer.append(kv);
            wl.written += length;
            this.previousRow = rowKey;
        }

        private void rollWriters() throws IOException {
            for (WriterLength wl : this.writers.values()) {
                if (wl.writer != null) {
                    LOG.info("Writer=" + wl.writer.getPath()
                            + ((wl.written == 0) ? "" : ", wrote=" + wl.written));
                    close(wl.writer);
                }
                wl.writer = null;
                wl.written = 0;
            }
            this.rollRequested = false;
        }

        @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "BX_UNBOXING_IMMEDIATELY_REBOXED",
                justification = "Not important")
        private WriterLength getNewWriter(byte[] family, Configuration conf) throws IOException {
            WriterLength wl = new WriterLength();
            Path familydir = new Path(outputdir, Bytes.toString(family));
            Algorithm compression = compressionMap.get(family);
            compression = compression == null ? defaultCompression : compression;
            BloomType bloomType = bloomTypeMap.get(family);
            bloomType = bloomType == null ? BloomType.NONE : bloomType;
            Integer blockSize = blockSizeMap.get(family);
            blockSize = blockSize == null ? HConstants.DEFAULT_BLOCKSIZE : blockSize;
            DataBlockEncoding encoding = overriddenEncoding;
            encoding = encoding == null ? datablockEncodingMap.get(family) : encoding;
            encoding = encoding == null ? DataBlockEncoding.NONE : encoding;

            // Disable the block cache on the writer's private configuration.
            Configuration tempConf = new Configuration(conf);
            tempConf.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0.0f);
            HFileContextBuilder contextBuilder = new HFileContextBuilder().withCompression(compression)
                    .withChecksumType(HStore.getChecksumType(conf))
                    .withBytesPerCheckSum(HStore.getBytesPerChecksum(conf)).withBlockSize(blockSize);
            contextBuilder.withDataBlockEncoding(encoding);
            HFileContext hFileContext = contextBuilder.build();

            wl.writer = new StoreFile.WriterBuilder(conf, new CacheConfig(tempConf), fs)
                    .withOutputDir(familydir).withBloomType(bloomType).withComparator(KeyValue.COMPARATOR)
                    .withFileContext(hFileContext).build();

            this.writers.put(family, wl);
            return wl;
        }

        private void close(final StoreFile.Writer w) throws IOException {
            if (w != null) {
                w.appendFileInfo(StoreFile.BULKLOAD_TIME_KEY, Bytes.toBytes(System.currentTimeMillis()));
                w.appendFileInfo(StoreFile.BULKLOAD_TASK_KEY,
                        Bytes.toBytes(context.getTaskAttemptID().toString()));
                w.appendFileInfo(StoreFile.MAJOR_COMPACTION_KEY, Bytes.toBytes(true));
                w.appendFileInfo(StoreFile.EXCLUDE_FROM_MINOR_COMPACTION_KEY,
                        Bytes.toBytes(compactionExclude));
                w.appendTrackedTimestampsToMetadata();
                w.close();
            }
        }

        @Override
        public void close(TaskAttemptContext c) throws IOException, InterruptedException {
            for (WriterLength wl : this.writers.values()) {
                close(wl.writer);
            }
        }
    };
}
From source file:org.apache.mahout.classifier.naivebayes.trainer.NaiveBayesTrainer.java
License:Apache License
public static void trainNaiveBayes(Path input, Configuration conf, Iterable<String> inputLabels, Path output,
        int numReducers, float alphaI, boolean trainComplementary)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf.setFloat(ALPHA_I, alphaI);
    Path labelMapPath = createLabelMapFile(inputLabels, conf, new Path(output, LABEL_MAP));
    Path classVectorPath = new Path(output, CLASS_VECTORS);
    runNaiveBayesByLabelSummer(input, conf, labelMapPath, classVectorPath, numReducers);
    Path weightFilePath = new Path(output, SUM_VECTORS);
    runNaiveBayesWeightSummer(classVectorPath, conf, labelMapPath, weightFilePath, numReducers);
    Path thetaFilePath = new Path(output, THETA_SUM);
    if (trainComplementary) {
        runNaiveBayesThetaComplementarySummer(classVectorPath, conf, weightFilePath, thetaFilePath,
                numReducers);
    } else {
        runNaiveBayesThetaSummer(classVectorPath, conf, weightFilePath, thetaFilePath, numReducers);
    }
}
From source file:org.apache.mahout.classifier.svm.algorithm.parallelalgorithms.ParallelMultiClassifierTrainJob.java
License:Apache License
/**
 * Sets the parameters used by the reducer.
 *
 * @param conf the job configuration
 * @param lambda
 * @param k
 * @param modelFile the path where model files are stored
 * @param hdfsServer the HDFS server address
 */
public static void setReducerParameters(Configuration conf, double lambda, int k, String modelFile,
        String hdfsServer) {
    conf.setFloat(SVMParameters.HADOOP_LAMBDA, (float) lambda);
    conf.setInt(SVMParameters.HADOOP_K, k);
    conf.set(SVMParameters.HADOOP_MODLE_PATH, modelFile);
    conf.set(SVMParameters.HDFS_SERVER, hdfsServer);
}
From source file:org.apache.mahout.clustering.classify.ClusterClassificationDriver.java
License:Apache License
private static void classifyClusterMR(Configuration conf, Path input, Path clustersIn, Path output,
        Double clusterClassificationThreshold, boolean emitMostLikely)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf.setFloat(ClusterClassificationConfigKeys.OUTLIER_REMOVAL_THRESHOLD,
            clusterClassificationThreshold.floatValue());
    conf.setBoolean(ClusterClassificationConfigKeys.EMIT_MOST_LIKELY, emitMostLikely);
    conf.set(ClusterClassificationConfigKeys.CLUSTERS_IN, clustersIn.toUri().toString());

    Job job = new Job(conf, "Cluster Classification Driver running over input: " + input);
    job.setJarByClass(ClusterClassificationDriver.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapperClass(ClusterClassificationMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(WeightedPropertyVectorWritable.class);

    FileInputFormat.addInputPath(job, input);
    FileOutputFormat.setOutputPath(job, output);
    if (!job.waitForCompletion(true)) {
        throw new InterruptedException("Cluster Classification Driver Job failed processing " + input);
    }
}
From source file:org.apache.mahout.clustering.lda.cvb.CVBConfig.java
License:Apache License
public void write(Configuration conf) {
    conf.setInt(NUM_TOPICS_PARAM, numTopics);
    conf.setInt(NUM_TERMS_PARAM, numTerms);
    conf.setFloat(DOC_TOPIC_SMOOTHING_PARAM, alpha);
    conf.setFloat(TERM_TOPIC_SMOOTHING_PARAM, eta);
    conf.setLong(RANDOM_SEED_PARAM, randomSeed);
    conf.setFloat(TEST_SET_FRACTION_PARAM, testFraction);
    conf.setInt(NUM_TRAIN_THREADS_PARAM, numTrainThreads);
    conf.setInt(NUM_UPDATE_THREADS_PARAM, numUpdateThreads);
    conf.setInt(MAX_ITERATIONS_PER_DOC_PARAM, maxItersPerDoc);
    conf.setFloat(MODEL_WEIGHT_PARAM, modelWeight);
    conf.setBoolean(ONLY_LABELED_DOCS_PARAM, useOnlyLabeledDocs);
    conf.setFloat(MIN_RELATIVE_PERPLEXITY_DIFF_PARAM, minRelPreplexityDiff);
    conf.setInt(MAX_INFERENCE_ITERATIONS_PER_DOC_PARAM, maxInferenceItersPerDoc);
}
From source file:org.apache.mahout.clustering.streaming.mapreduce.StreamingKMeansDriver.java
License:Apache License
/**
 * Checks the parameters for a StreamingKMeans job and prepares a Configuration with them.
 *
 * @param conf the Configuration to populate
 * @param numClusters k, the number of clusters at the end
 * @param estimatedNumMapClusters O(k log n), the number of clusters requested from each mapper
 * @param estimatedDistanceCutoff an estimate of the minimum distance that separates two clusters
 *   (can be smaller and will be increased dynamically)
 * @param maxNumIterations the maximum number of iterations of BallKMeans
 * @param trimFraction the fraction of the points to be considered in updating a ball k-means
 * @param randomInit whether to initialize the ball k-means seeds randomly
 * @param ignoreWeights whether to ignore the invalid final ball k-means weights
 * @param testProbability the percentage of vectors assigned to the test set for selecting the best final centers
 * @param numBallKMeansRuns the number of BallKMeans runs in the reducer that determine the centroids to return
 *   (clusters are computed for the training set and the error is computed on the test set)
 * @param measureClass string, name of the distance measure class; theory works for Euclidean-like distances
 * @param searcherClass string, name of the searcher that will be used for nearest neighbor search
 * @param searchSize the number of closest neighbors to look at for selecting the closest one in approximate
 *   nearest neighbor searches
 * @param numProjections the number of projected vectors to use for faster searching (only useful for
 *   ProjectionSearch or FastProjectionSearch); @see org.apache.mahout.math.neighborhood.ProjectionSearch
 */
public static void configureOptionsForWorkers(Configuration conf, int numClusters,
        /* StreamingKMeans */ int estimatedNumMapClusters, float estimatedDistanceCutoff,
        /* BallKMeans */ int maxNumIterations, float trimFraction, boolean randomInit, boolean ignoreWeights,
        float testProbability, int numBallKMeansRuns,
        /* Searcher */ String measureClass, String searcherClass, int searchSize, int numProjections,
        String method, boolean reduceStreamingKMeans) throws ClassNotFoundException {
    // Checking preconditions for the parameters.
    Preconditions.checkArgument(numClusters > 0,
            "Invalid number of clusters requested: " + numClusters + ". Must be: numClusters > 0!");

    // StreamingKMeans
    Preconditions.checkArgument(estimatedNumMapClusters > numClusters, "Invalid number of estimated map "
            + "clusters; There must be more than the final number of clusters (k log n vs k)");
    Preconditions.checkArgument(
            estimatedDistanceCutoff == INVALID_DISTANCE_CUTOFF || estimatedDistanceCutoff > 0,
            "estimatedDistanceCutoff must be equal to -1 or must be greater than 0!");

    // BallKMeans
    Preconditions.checkArgument(maxNumIterations > 0, "Must have at least one BallKMeans iteration");
    Preconditions.checkArgument(trimFraction > 0, "trimFraction must be positive");
    Preconditions.checkArgument(testProbability >= 0 && testProbability < 1,
            "test probability is not in the interval [0, 1)");
    Preconditions.checkArgument(numBallKMeansRuns > 0, "numBallKMeans cannot be negative");

    // Searcher
    if (!searcherClass.contains("Brute")) {
        // These tests only make sense when a relevant searcher is being used.
        Preconditions.checkArgument(searchSize > 0, "Invalid searchSize. Must be positive.");
        if (searcherClass.contains("Projection")) {
            Preconditions.checkArgument(numProjections > 0, "Invalid numProjections. Must be positive");
        }
    }

    // Setting the parameters in the Configuration.
    conf.setInt(DefaultOptionCreator.NUM_CLUSTERS_OPTION, numClusters);
    /* StreamingKMeans */
    conf.setInt(ESTIMATED_NUM_MAP_CLUSTERS, estimatedNumMapClusters);
    if (estimatedDistanceCutoff != INVALID_DISTANCE_CUTOFF) {
        conf.setFloat(ESTIMATED_DISTANCE_CUTOFF, estimatedDistanceCutoff);
    }
    /* BallKMeans */
    conf.setInt(MAX_NUM_ITERATIONS, maxNumIterations);
    conf.setFloat(TRIM_FRACTION, trimFraction);
    conf.setBoolean(RANDOM_INIT, randomInit);
    conf.setBoolean(IGNORE_WEIGHTS, ignoreWeights);
    conf.setFloat(TEST_PROBABILITY, testProbability);
    conf.setInt(NUM_BALLKMEANS_RUNS, numBallKMeansRuns);
    /* Searcher */
    // Checks if the measureClass is available, throws exception otherwise.
    Class.forName(measureClass);
    conf.set(DefaultOptionCreator.DISTANCE_MEASURE_OPTION, measureClass);
    // Checks if the searcherClass is available, throws exception otherwise.
    Class.forName(searcherClass);
    conf.set(SEARCHER_CLASS_OPTION, searcherClass);
    conf.setInt(SEARCH_SIZE_OPTION, searchSize);
    conf.setInt(NUM_PROJECTIONS_OPTION, numProjections);
    conf.set(DefaultOptionCreator.METHOD_OPTION, method);
    conf.setBoolean(REDUCE_STREAMING_KMEANS, reduceStreamingKMeans);

    log.info("Parameters are: [k] numClusters {}; "
            + "[SKM] estimatedNumMapClusters {}; estimatedDistanceCutoff {} "
            + "[BKM] maxNumIterations {}; trimFraction {}; randomInit {}; ignoreWeights {}; "
            + "testProbability {}; numBallKMeansRuns {}; "
            + "[S] measureClass {}; searcherClass {}; searcherSize {}; numProjections {}; "
            + "method {}; reduceStreamingKMeans {}",
            numClusters, estimatedNumMapClusters, estimatedDistanceCutoff, maxNumIterations, trimFraction,
            randomInit, ignoreWeights, testProbability, numBallKMeansRuns, measureClass, searcherClass,
            searchSize, numProjections, method, reduceStreamingKMeans);
}
From source file:org.apache.mahout.clustering.streaming.mapreduce.StreamingKMeansTestMR.java
License:Apache License
private void configure(Configuration configuration) {
    configuration.set(DefaultOptionCreator.DISTANCE_MEASURE_OPTION, distanceMeasureClassName);
    configuration.setInt(StreamingKMeansDriver.SEARCH_SIZE_OPTION, SEARCH_SIZE);
    configuration.setInt(StreamingKMeansDriver.NUM_PROJECTIONS_OPTION, NUM_PROJECTIONS);
    configuration.set(StreamingKMeansDriver.SEARCHER_CLASS_OPTION, searcherClassName);
    configuration.setInt(DefaultOptionCreator.NUM_CLUSTERS_OPTION, 1 << NUM_DIMENSIONS);
    configuration.setInt(StreamingKMeansDriver.ESTIMATED_NUM_MAP_CLUSTERS,
            (1 << NUM_DIMENSIONS) * (int) Math.log(NUM_DATA_POINTS));
    configuration.setFloat(StreamingKMeansDriver.ESTIMATED_DISTANCE_CUTOFF, (float) DISTANCE_CUTOFF);
    configuration.setInt(StreamingKMeansDriver.MAX_NUM_ITERATIONS, MAX_NUM_ITERATIONS);
    // Collapse the Centroids in the reducer.
    configuration.setBoolean(StreamingKMeansDriver.REDUCE_STREAMING_KMEANS, true);
}
From source file:org.apache.mahout.knn.experimental.StreamingKMeansDriver.java
License:Apache License
public static void configureOptionsForWorkers(Configuration conf, int numClusters,
        int estimatedNumMapClusters, float estimatedDistanceCutoff, String measureClass, String searcherClass,
        int searchSize, int numProjections, int maxNumIterations) {
    conf.setInt(DefaultOptionCreator.NUM_CLUSTERS_OPTION, numClusters);
    conf.setInt(ESTIMATED_NUM_MAP_CLUSTERS, estimatedNumMapClusters);
    conf.setFloat(ESTIMATED_DISTANCE_CUTOFF, estimatedDistanceCutoff);
    try {
        Class.forName(measureClass);
    } catch (ClassNotFoundException e) {
        log.error("Measure class not found " + measureClass, e);
    }
    conf.set(DefaultOptionCreator.DISTANCE_MEASURE_OPTION, measureClass);
    try {
        Class.forName(searcherClass);
    } catch (ClassNotFoundException e) {
        log.error("Searcher class not found " + searcherClass, e);
    }
    conf.set(SEARCHER_CLASS_OPTION, searcherClass);
    conf.setInt(SEARCH_SIZE_OPTION, searchSize);
    conf.setInt(NUM_PROJECTIONS_OPTION, numProjections);
    conf.setInt(MAX_NUM_ITERATIONS, maxNumIterations);
    log.info("Parameters are: numClusters {}; estimatedNumMapClusters {}; estimatedDistanceCutoff {}; "
            + "measureClass {}; searcherClass {}; searcherSize {}; numProjections {}; maxNumIterations {}",
            numClusters, estimatedNumMapClusters, estimatedDistanceCutoff, measureClass, searcherClass,
            searchSize, numProjections, maxNumIterations);
}
From source file:org.apache.mahout.regression.penalizedlinear.LinearCrossValidation.java
License:Apache License
private void runPenalizedLinear() throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = getConf();
    conf.setInt(PenalizedLinearKeySet.NUM_CV, parameter.numOfCV);
    conf.setFloat(PenalizedLinearKeySet.ALPHA, parameter.alpha);
    conf.set(PenalizedLinearKeySet.LAMBDA, parameter.lambda);
    conf.setBoolean(PenalizedLinearKeySet.INTERCEPT, parameter.intercept);

    Job job = new Job(conf, "Penalized Linear Regression Driver running over input: " + input);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapperClass(PenalizedLinearMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setReducerClass(PenalizedLinearReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    job.setCombinerClass(PenalizedLinearReducer.class);
    job.setNumReduceTasks(1);
    job.setJarByClass(LinearRegularizePath.class);

    FileInputFormat.addInputPath(job, new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT));
    FileOutputFormat.setOutputPath(job, new Path(output, "output"));
    if (!job.waitForCompletion(true)) {
        throw new InterruptedException("Penalized Linear Regression Job failed processing " + input);
    }

    solver = new PenalizedLinearSolver();
    solver.setAlpha(parameter.alpha);
    solver.setIntercept(parameter.intercept);
    solver.setLambdaString(parameter.lambda);
    solver.initSolver(new Path(output, "output"), getConf());
    solver.crossValidate();
    printInfo(parameter, solver);
}