List of usage examples for org.apache.hadoop.conf.Configuration.setFloat
public void setFloat(String name, float value)
Sets the value of the name property to a float.
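For orientation before the project examples below, here is a minimal, self-contained sketch of the usual round trip: a float property stored with setFloat is read back with the matching Configuration.getFloat, whose second argument is the default returned when the key is absent. The property key and the values used here are illustrative assumptions, not taken from any of the examples that follow.

import org.apache.hadoop.conf.Configuration;

public class SetFloatSketch {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Store a float-valued property (the key name is a hypothetical example).
        conf.setFloat("example.similarity.threshold", 0.75f);
        // Read it back; 0.5f is the default used if the key had never been set.
        float threshold = conf.getFloat("example.similarity.threshold", 0.5f);
        System.out.println("threshold = " + threshold);
    }
}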
From source file:org.apache.mahout.regression.penalizedlinear.LinearRegularizePath.java
License:Apache License
private void runPenalizedLinear() throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = getConf();
    conf.setInt(PenalizedLinearKeySet.NUM_CV, parameter.numOfCV);
    conf.setFloat(PenalizedLinearKeySet.ALPHA, parameter.alpha);
    conf.set(PenalizedLinearKeySet.LAMBDA, parameter.lambda);
    conf.setBoolean(PenalizedLinearKeySet.INTERCEPT, parameter.intercept);
    Job job = new Job(conf, "Penalized Linear Regression Driver running over input: " + input);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapperClass(PenalizedLinearMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setReducerClass(PenalizedLinearReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    job.setCombinerClass(PenalizedLinearReducer.class);
    job.setNumReduceTasks(1);
    job.setJarByClass(LinearRegularizePath.class);
    FileInputFormat.addInputPath(job, new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT));
    FileOutputFormat.setOutputPath(job, new Path(output, "output"));
    if (!job.waitForCompletion(true)) {
        throw new InterruptedException("Penalized Linear Regression Job failed processing " + input);
    }
    solver = new PenalizedLinearSolver();
    solver.setAlpha(parameter.alpha);
    solver.setIntercept(parameter.intercept);
    solver.setLambdaString(parameter.lambda);
    solver.initSolver(new Path(output, "output"), getConf());
    solver.regularizePath(solver.getLambda());
    printInfo(parameter, solver);
}
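The float stored by this driver would normally be read back on the task side with Configuration.getFloat in the mapper's setup method. The sketch below is not the actual PenalizedLinearMapper: the class name, generic types, and the 1.0f default are illustrative assumptions; only the PenalizedLinearKeySet.ALPHA key comes from the example above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.regression.penalizedlinear.PenalizedLinearKeySet;

// Minimal sketch (not the real PenalizedLinearMapper): reading back a float that
// the driver stored with conf.setFloat(PenalizedLinearKeySet.ALPHA, ...).
public class AlphaReadingMapper extends Mapper<Text, VectorWritable, Text, VectorWritable> {
    private float alpha;

    @Override
    protected void setup(Context context) {
        Configuration conf = context.getConfiguration();
        // The second argument is the default used if the property was never set.
        alpha = conf.getFloat(PenalizedLinearKeySet.ALPHA, 1.0f);
    }
}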
From source file:org.apache.mahout.regression.penalizedlinear.PenalizedLinearDriver.java
License:Apache License
private void buildRegressionModelMR(PenalizedLinearParameter parameter, Path input, Path output)
        throws IOException, InterruptedException, ClassNotFoundException {
    Job job = prepareJob(input, output, SequenceFileInputFormat.class, PenalizedLinearMapper.class,
            Text.class, VectorWritable.class, PenalizedLinearReducer.class, Text.class, VectorWritable.class,
            SequenceFileOutputFormat.class);
    job.setJobName("Penalized Linear Regression Driver running over input: " + input);
    job.setNumReduceTasks(1);
    job.setJarByClass(PenalizedLinearDriver.class);
    Configuration conf = job.getConfiguration();
    conf.setInt(PenalizedLinearKeySet.NUM_CV, parameter.getNumOfCV());
    conf.setFloat(PenalizedLinearKeySet.ALPHA, parameter.getAlpha());
    conf.set(PenalizedLinearKeySet.LAMBDA, parameter.getLambda());
    conf.setBoolean(PenalizedLinearKeySet.INTERCEPT, parameter.isIntercept());
    if (!job.waitForCompletion(true)) {
        throw new InterruptedException("Penalized Linear Regression Job failed processing " + input);
    }
}
From source file:org.apache.mahout.utils.nlp.collocations.llr.CollocDriver.java
License:Apache License
/**
 * pass2: perform the LLR calculation
 */
private static void computeNGramsPruneByLLR(Path output, Configuration baseConf, long nGramTotal,
        boolean emitUnigrams, float minLLRValue, int reduceTasks)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(baseConf);
    conf.setLong(LLRReducer.NGRAM_TOTAL, nGramTotal);
    conf.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    Job job = new Job(conf);
    job.setJobName(CollocDriver.class.getSimpleName() + ".computeNGrams: " + output);
    job.setJarByClass(CollocDriver.class);
    job.setMapOutputKeyClass(Gram.class);
    job.setMapOutputValueClass(Gram.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.setInputPaths(job, new Path(output, SUBGRAM_OUTPUT_DIRECTORY));
    Path outPath = new Path(output, NGRAM_OUTPUT_DIRECTORY);
    FileOutputFormat.setOutputPath(job, outPath);
    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setReducerClass(LLRReducer.class);
    job.setNumReduceTasks(reduceTasks);
    conf.setFloat(LLRReducer.MIN_LLR, minLLRValue);
    job.waitForCompletion(true);
}
From source file:org.apache.mahout.utils.SplitInputJob.java
License:Apache License
/**
 * Run job to downsample, randomly permute and split data into test and
 * training sets. This job takes a SequenceFile as input and outputs two
 * SequenceFiles test-r-00000 and training-r-00000 which contain the test and
 * training sets respectively
 *
 * @param initialConf
 * @param inputPath
 *          path to input data SequenceFile
 * @param outputPath
 *          path for output data SequenceFiles
 * @param keepPct
 *          percentage of key value pairs in input to keep. The rest are discarded
 * @param randomSelectionPercent
 *          percentage of key value pairs to allocate to test set. Remainder
 *          are allocated to training set
 */
@SuppressWarnings("rawtypes")
public static void run(Configuration initialConf, Path inputPath, Path outputPath, int keepPct,
        float randomSelectionPercent) throws IOException, ClassNotFoundException, InterruptedException {
    int downsamplingFactor = (int) (100.0 / keepPct);
    initialConf.setInt(DOWNSAMPLING_FACTOR, downsamplingFactor);
    initialConf.setFloat(RANDOM_SELECTION_PCT, randomSelectionPercent);
    // Determine class of keys and values
    FileSystem fs = FileSystem.get(initialConf);
    SequenceFileDirIterator<? extends WritableComparable, Writable> iterator =
            new SequenceFileDirIterator<WritableComparable, Writable>(inputPath, PathType.LIST,
                    PathFilters.partFilter(), null, false, fs.getConf());
    Class<? extends WritableComparable> keyClass;
    Class<? extends Writable> valueClass;
    if (iterator.hasNext()) {
        Pair<? extends WritableComparable, Writable> pair = iterator.next();
        keyClass = pair.getFirst().getClass();
        valueClass = pair.getSecond().getClass();
    } else {
        throw new IllegalStateException("Couldn't determine class of the input values");
    }
    Job job = new Job(new Configuration(initialConf));
    MultipleOutputs.addNamedOutput(job, TRAINING_TAG, SequenceFileOutputFormat.class, keyClass, valueClass);
    MultipleOutputs.addNamedOutput(job, TEST_TAG, SequenceFileOutputFormat.class, keyClass, valueClass);
    job.setJarByClass(SplitInputJob.class);
    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    job.setNumReduceTasks(1);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapperClass(SplitInputMapper.class);
    job.setReducerClass(SplitInputReducer.class);
    job.setSortComparatorClass(SplitInputComparator.class);
    job.setOutputKeyClass(keyClass);
    job.setOutputValueClass(valueClass);
    job.submit();
    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
From source file:org.apache.mahout.utils.vectors.common.PartialVectorMerger.java
License:Apache License
/**
 * Merge all the partial {@link org.apache.mahout.math.RandomAccessSparseVector}s into the complete Document
 * {@link org.apache.mahout.math.RandomAccessSparseVector}
 *
 * @param partialVectorPaths
 *          input directory of the vectors in {@link org.apache.hadoop.io.SequenceFile} format
 * @param output
 *          output directory where the partial vectors have to be created
 * @param normPower
 *          The normalization value. Must be greater than or equal to 0 or equal to {@link #NO_NORMALIZING}
 * @param numReducers
 *          The number of reducers to spawn
 * @throws IOException
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public static void mergePartialVectors(List<Path> partialVectorPaths, Path output, float normPower,
        int dimension, boolean sequentialAccess, int numReducers)
        throws IOException, InterruptedException, ClassNotFoundException {
    if (normPower != NO_NORMALIZING && normPower < 0) {
        throw new IllegalArgumentException("normPower must either be -1 or >= 0");
    }
    Configuration conf = new Configuration();
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setBoolean(SEQUENTIAL_ACCESS, sequentialAccess);
    conf.setInt(DIMENSION, dimension);
    conf.setFloat(NORMALIZATION_POWER, normPower);
    Job job = new Job(conf);
    job.setJobName("PartialVectorMerger::MergePartialVectors");
    job.setJarByClass(PartialVectorMerger.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    FileInputFormat.setInputPaths(job, getCommaSeparatedPaths(partialVectorPaths));
    FileOutputFormat.setOutputPath(job, output);
    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setReducerClass(PartialVectorMergeReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(numReducers);
    HadoopUtil.overwriteOutput(output);
    job.waitForCompletion(true);
}
From source file:org.apache.mahout.vectorizer.common.PartialVectorMerger.java
License:Apache License
/**
 * Merge all the partial {@link org.apache.mahout.math.RandomAccessSparseVector}s into the complete Document
 * {@link org.apache.mahout.math.RandomAccessSparseVector}
 *
 * @param partialVectorPaths
 *          input directory of the vectors in {@link org.apache.hadoop.io.SequenceFile} format
 * @param output
 *          output directory where the partial vectors have to be created
 * @param baseConf
 *          job configuration
 * @param normPower
 *          The normalization value. Must be greater than or equal to 0 or equal to {@link #NO_NORMALIZING}
 * @param dimension cardinality of the vectors
 * @param sequentialAccess
 *          output vectors should be optimized for sequential access
 * @param namedVector
 *          output vectors should be named, retaining key (doc id) as a label
 * @param numReducers
 *          The number of reducers to spawn
 */
public static void mergePartialVectors(Iterable<Path> partialVectorPaths, Path output, Configuration baseConf,
        float normPower, boolean logNormalize, int dimension, boolean sequentialAccess, boolean namedVector,
        int numReducers) throws IOException, InterruptedException, ClassNotFoundException {
    Preconditions.checkArgument(normPower == NO_NORMALIZING || normPower >= 0,
            "If specified normPower must be nonnegative", normPower);
    Preconditions.checkArgument(
            normPower == NO_NORMALIZING || (normPower > 1 && !Double.isInfinite(normPower)) || !logNormalize,
            "normPower must be > 1 and not infinite if log normalization is chosen", normPower);
    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setBoolean(SEQUENTIAL_ACCESS, sequentialAccess);
    conf.setBoolean(NAMED_VECTOR, namedVector);
    conf.setInt(DIMENSION, dimension);
    conf.setFloat(NORMALIZATION_POWER, normPower);
    conf.setBoolean(LOG_NORMALIZE, logNormalize);
    Job job = new Job(conf);
    job.setJobName("PartialVectorMerger::MergePartialVectors");
    job.setJarByClass(PartialVectorMerger.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    FileInputFormat.setInputPaths(job, getCommaSeparatedPaths(partialVectorPaths));
    FileOutputFormat.setOutputPath(job, output);
    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setReducerClass(PartialVectorMergeReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(numReducers);
    HadoopUtil.delete(conf, output);
    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
From source file:org.apache.mahout.vectorizer.HighDFWordsPruner.java
License:Apache License
public static void mergePartialVectors(Iterable<Path> partialVectorPaths, Path output, Configuration baseConf,
        float normPower, boolean logNormalize, int numReducers)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setFloat(PartialVectorMerger.NORMALIZATION_POWER, normPower);
    conf.setBoolean(PartialVectorMerger.LOG_NORMALIZE, logNormalize);
    Job job = new Job(conf);
    job.setJobName("PrunerPartialVectorMerger::MergePartialVectors");
    job.setJarByClass(PartialVectorMerger.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    FileInputFormat.setInputPaths(job, getCommaSeparatedPaths(partialVectorPaths));
    FileOutputFormat.setOutputPath(job, output);
    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setReducerClass(PrunedPartialVectorMergeReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(numReducers);
    HadoopUtil.delete(conf, output);
    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
From source file:org.apache.orc.tools.TestFileDump.java
License:Apache License
@Test
public void testDictionaryThreshold() throws Exception {
    TypeDescription schema = getMyRecordType();
    Configuration conf = new Configuration();
    conf.set(OrcConf.ENCODING_STRATEGY.getAttribute(), "COMPRESSION");
    conf.setFloat(OrcConf.DICTIONARY_KEY_SIZE_THRESHOLD.getAttribute(), 0.49f);
    Writer writer = OrcFile.createWriter(testFilePath,
            OrcFile.writerOptions(conf).fileSystem(fs).setSchema(schema).stripeSize(100000)
                    .compress(CompressionKind.ZLIB).rowIndexStride(1000).bufferSize(10000));
    VectorizedRowBatch batch = schema.createRowBatch(1000);
    Random r1 = new Random(1);
    String[] words = new String[] { "It", "was", "the", "best", "of", "times,", "it", "was", "the", "worst",
            "of", "times,", "it", "was", "the", "age", "of", "wisdom,", "it", "was", "the", "age", "of",
            "foolishness,", "it", "was", "the", "epoch", "of", "belief,", "it", "was", "the", "epoch", "of",
            "incredulity,", "it", "was", "the", "season", "of", "Light,", "it", "was", "the", "season", "of",
            "Darkness,", "it", "was", "the", "spring", "of", "hope,", "it", "was", "the", "winter", "of",
            "despair,", "we", "had", "everything", "before", "us,", "we", "had", "nothing", "before", "us,",
            "we", "were", "all", "going", "direct", "to", "Heaven,", "we", "were", "all", "going", "direct",
            "the", "other", "way" };
    int nextInt = 0;
    for (int i = 0; i < 21000; ++i) {
        // Write out the same string twice, this guarantees the fraction of rows with
        // distinct strings is 0.5
        if (i % 2 == 0) {
            nextInt = r1.nextInt(words.length);
            // Append the value of i to the word, this guarantees when an index or word is repeated
            // the actual string is unique.
            words[nextInt] += "-" + i;
        }
        appendMyRecord(batch, r1.nextInt(), r1.nextLong(), words[nextInt]);
        if (batch.size == batch.getMaxSize()) {
            writer.addRowBatch(batch);
            batch.reset();
        }
    }
    if (batch.size != 0) {
        writer.addRowBatch(batch);
    }
    writer.close();
    PrintStream origOut = System.out;
    String outputFilename = "orc-file-dump-dictionary-threshold.out";
    FileOutputStream myOut = new FileOutputStream(workDir + File.separator + outputFilename);
    // replace stdout and run command
    System.setOut(new PrintStream(myOut));
    FileDump.main(new String[] { testFilePath.toString(), "--rowindex=1,2,3" });
    System.out.flush();
    System.setOut(origOut);
    checkOutput(outputFilename, workDir + File.separator + outputFilename);
}
From source file:org.apache.parquet.hadoop.thrift.TestCorruptThriftRecords.java
License:Apache License
@Test
public void testCanTolerateBadRecords() throws Exception {
    Configuration conf = new Configuration();
    conf.setFloat(UnmaterializableRecordCounter.BAD_RECORD_THRESHOLD_CONF_KEY, 0.1f);
    List<StructWithUnionV2> expected = new ArrayList<StructWithUnionV2>();
    readFile(writeFileWithCorruptRecords(4, expected), conf, "testCanTolerateBadRecords");
    assertEquals(200, ReadMapper.records.size());
    assertEqualsExcepted(expected, ReadMapper.records);
}
From source file:org.apache.parquet.hadoop.thrift.TestCorruptThriftRecords.java
License:Apache License
@Test
public void testThrowsWhenTooManyBadRecords() throws Exception {
    Configuration conf = new Configuration();
    conf.setFloat(UnmaterializableRecordCounter.BAD_RECORD_THRESHOLD_CONF_KEY, 0.1f);
    ArrayList<StructWithUnionV2> expected = new ArrayList<StructWithUnionV2>();
    try {
        readFile(writeFileWithCorruptRecords(300, expected), conf, "testThrowsWhenTooManyBadRecords");
        fail("This should throw");
    } catch (RuntimeException e) {
        // still should have actually read all the valid records
        assertEquals(100, ReadMapper.records.size());
        assertEqualsExcepted(expected.subList(0, 100), ReadMapper.records);
    }
}