List of usage examples for org.apache.hadoop.conf.Configuration.setFloat
public void setFloat(String name, float value)
Sets the value of the name property to a float.
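For orientation before the project examples below, here is a minimal, self-contained sketch of the usual round trip: a float property stored with setFloat is read back with the matching Configuration.getFloat, whose second argument is the default returned when the key is absent. The property key and the values used here are illustrative assumptions, not taken from any of the examples that follow.

import org.apache.hadoop.conf.Configuration;

public class SetFloatSketch {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Store a float-valued property (the key name is a hypothetical example).
        conf.setFloat("example.similarity.threshold", 0.75f);
        // Read it back; 0.5f is the default used if the key had never been set.
        float threshold = conf.getFloat("example.similarity.threshold", 0.5f);
        System.out.println("threshold = " + threshold);
    }
}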
From source file:org.apache.mahout.regression.penalizedlinear.LinearRegularizePath.java
License:Apache License
private void runPenalizedLinear() throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = getConf();
    conf.setInt(PenalizedLinearKeySet.NUM_CV, parameter.numOfCV);
    conf.setFloat(PenalizedLinearKeySet.ALPHA, parameter.alpha);
    conf.set(PenalizedLinearKeySet.LAMBDA, parameter.lambda);
    conf.setBoolean(PenalizedLinearKeySet.INTERCEPT, parameter.intercept);
    Job job = new Job(conf, "Penalized Linear Regression Driver running over input: " + input);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapperClass(PenalizedLinearMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setReducerClass(PenalizedLinearReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    job.setCombinerClass(PenalizedLinearReducer.class);
    job.setNumReduceTasks(1);
    job.setJarByClass(LinearRegularizePath.class);
    FileInputFormat.addInputPath(job, new Path(output, DIRECTORY_CONTAINING_CONVERTED_INPUT));
    FileOutputFormat.setOutputPath(job, new Path(output, "output"));
    if (!job.waitForCompletion(true)) {
        throw new InterruptedException("Penalized Linear Regression Job failed processing " + input);
    }
    solver = new PenalizedLinearSolver();
    solver.setAlpha(parameter.alpha);
    solver.setIntercept(parameter.intercept);
    solver.setLambdaString(parameter.lambda);
    solver.initSolver(new Path(output, "output"), getConf());
    solver.regularizePath(solver.getLambda());
    printInfo(parameter, solver);
}
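The float stored by this driver would normally be read back on the task side with Configuration.getFloat in the mapper's setup method. The sketch below is not the actual PenalizedLinearMapper: the class name, generic types, and the 1.0f default are illustrative assumptions; only the PenalizedLinearKeySet.ALPHA key comes from the example above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.regression.penalizedlinear.PenalizedLinearKeySet;

// Minimal sketch (not the real PenalizedLinearMapper): reading back a float that
// the driver stored with conf.setFloat(PenalizedLinearKeySet.ALPHA, ...).
public class AlphaReadingMapper extends Mapper<Text, VectorWritable, Text, VectorWritable> {
    private float alpha;

    @Override
    protected void setup(Context context) {
        Configuration conf = context.getConfiguration();
        // The second argument is the default used if the property was never set.
        alpha = conf.getFloat(PenalizedLinearKeySet.ALPHA, 1.0f);
    }
}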
From source file:org.apache.mahout.regression.penalizedlinear.PenalizedLinearDriver.java
License:Apache License
private void buildRegressionModelMR(PenalizedLinearParameter parameter, Path input, Path output)
        throws IOException, InterruptedException, ClassNotFoundException {
    Job job = prepareJob(input, output, SequenceFileInputFormat.class, PenalizedLinearMapper.class,
            Text.class, VectorWritable.class, PenalizedLinearReducer.class, Text.class, VectorWritable.class,
            SequenceFileOutputFormat.class);
    job.setJobName("Penalized Linear Regression Driver running over input: " + input);
    job.setNumReduceTasks(1);
    job.setJarByClass(PenalizedLinearDriver.class);
    Configuration conf = job.getConfiguration();
    conf.setInt(PenalizedLinearKeySet.NUM_CV, parameter.getNumOfCV());
    conf.setFloat(PenalizedLinearKeySet.ALPHA, parameter.getAlpha());
    conf.set(PenalizedLinearKeySet.LAMBDA, parameter.getLambda());
    conf.setBoolean(PenalizedLinearKeySet.INTERCEPT, parameter.isIntercept());
    if (!job.waitForCompletion(true)) {
        throw new InterruptedException("Penalized Linear Regression Job failed processing " + input);
    }
}
From source file:org.apache.mahout.utils.nlp.collocations.llr.CollocDriver.java
License:Apache License
/**
 * pass2: perform the LLR calculation
 */
private static void computeNGramsPruneByLLR(Path output, Configuration baseConf, long nGramTotal,
        boolean emitUnigrams, float minLLRValue, int reduceTasks)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(baseConf);
    conf.setLong(LLRReducer.NGRAM_TOTAL, nGramTotal);
    conf.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    Job job = new Job(conf);
    job.setJobName(CollocDriver.class.getSimpleName() + ".computeNGrams: " + output);
    job.setJarByClass(CollocDriver.class);
    job.setMapOutputKeyClass(Gram.class);
    job.setMapOutputValueClass(Gram.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.setInputPaths(job, new Path(output, SUBGRAM_OUTPUT_DIRECTORY));
    Path outPath = new Path(output, NGRAM_OUTPUT_DIRECTORY);
    FileOutputFormat.setOutputPath(job, outPath);
    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setReducerClass(LLRReducer.class);
    job.setNumReduceTasks(reduceTasks);
    conf.setFloat(LLRReducer.MIN_LLR, minLLRValue);
    job.waitForCompletion(true);
}
From source file:org.apache.mahout.utils.SplitInputJob.java
License:Apache License
/**
 * Run job to downsample, randomly permute and split data into test and
 * training sets. This job takes a SequenceFile as input and outputs two
 * SequenceFiles test-r-00000 and training-r-00000 which contain the test and
 * training sets respectively
 *
 * @param initialConf
 * @param inputPath
 *          path to input data SequenceFile
 * @param outputPath
 *          path for output data SequenceFiles
 * @param keepPct
 *          percentage of key value pairs in input to keep. The rest are discarded
 * @param randomSelectionPercent
 *          percentage of key value pairs to allocate to test set. Remainder
 *          are allocated to training set
 */
@SuppressWarnings("rawtypes")
public static void run(Configuration initialConf, Path inputPath, Path outputPath, int keepPct,
        float randomSelectionPercent) throws IOException, ClassNotFoundException, InterruptedException {
    int downsamplingFactor = (int) (100.0 / keepPct);
    initialConf.setInt(DOWNSAMPLING_FACTOR, downsamplingFactor);
    initialConf.setFloat(RANDOM_SELECTION_PCT, randomSelectionPercent);
    // Determine class of keys and values
    FileSystem fs = FileSystem.get(initialConf);
    SequenceFileDirIterator<? extends WritableComparable, Writable> iterator =
            new SequenceFileDirIterator<WritableComparable, Writable>(inputPath, PathType.LIST,
                    PathFilters.partFilter(), null, false, fs.getConf());
    Class<? extends WritableComparable> keyClass;
    Class<? extends Writable> valueClass;
    if (iterator.hasNext()) {
        Pair<? extends WritableComparable, Writable> pair = iterator.next();
        keyClass = pair.getFirst().getClass();
        valueClass = pair.getSecond().getClass();
    } else {
        throw new IllegalStateException("Couldn't determine class of the input values");
    }
    Job job = new Job(new Configuration(initialConf));
    MultipleOutputs.addNamedOutput(job, TRAINING_TAG, SequenceFileOutputFormat.class, keyClass, valueClass);
    MultipleOutputs.addNamedOutput(job, TEST_TAG, SequenceFileOutputFormat.class, keyClass, valueClass);
    job.setJarByClass(SplitInputJob.class);
    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    job.setNumReduceTasks(1);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapperClass(SplitInputMapper.class);
    job.setReducerClass(SplitInputReducer.class);
    job.setSortComparatorClass(SplitInputComparator.class);
    job.setOutputKeyClass(keyClass);
    job.setOutputValueClass(valueClass);
    job.submit();
    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
From source file:org.apache.mahout.utils.vectors.common.PartialVectorMerger.java
License:Apache License
/**
 * Merge all the partial {@link org.apache.mahout.math.RandomAccessSparseVector}s into the complete Document
 * {@link org.apache.mahout.math.RandomAccessSparseVector}
 *
 * @param partialVectorPaths
 *          input directory of the vectors in {@link org.apache.hadoop.io.SequenceFile} format
 * @param output
 *          output directory where the partial vectors have to be created
 * @param normPower
 *          The normalization value. Must be greater than or equal to 0 or equal to {@link #NO_NORMALIZING}
 * @param numReducers
 *          The number of reducers to spawn
 * @throws IOException
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public static void mergePartialVectors(List<Path> partialVectorPaths, Path output, float normPower,
        int dimension, boolean sequentialAccess, int numReducers)
        throws IOException, InterruptedException, ClassNotFoundException {
    if (normPower != NO_NORMALIZING && normPower < 0) {
        throw new IllegalArgumentException("normPower must either be -1 or >= 0");
    }
    Configuration conf = new Configuration();
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setBoolean(SEQUENTIAL_ACCESS, sequentialAccess);
    conf.setInt(DIMENSION, dimension);
    conf.setFloat(NORMALIZATION_POWER, normPower);
    Job job = new Job(conf);
    job.setJobName("PartialVectorMerger::MergePartialVectors");
    job.setJarByClass(PartialVectorMerger.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    FileInputFormat.setInputPaths(job, getCommaSeparatedPaths(partialVectorPaths));
    FileOutputFormat.setOutputPath(job, output);
    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setReducerClass(PartialVectorMergeReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(numReducers);
    HadoopUtil.overwriteOutput(output);
    job.waitForCompletion(true);
}
From source file:org.apache.mahout.vectorizer.common.PartialVectorMerger.java
License:Apache License
/**
 * Merge all the partial {@link org.apache.mahout.math.RandomAccessSparseVector}s into the complete Document
 * {@link org.apache.mahout.math.RandomAccessSparseVector}
 *
 * @param partialVectorPaths
 *          input directory of the vectors in {@link org.apache.hadoop.io.SequenceFile} format
 * @param output
 *          output directory where the partial vectors have to be created
 * @param baseConf
 *          job configuration
 * @param normPower
 *          The normalization value. Must be greater than or equal to 0 or equal to {@link #NO_NORMALIZING}
 * @param dimension cardinality of the vectors
 * @param sequentialAccess
 *          output vectors should be optimized for sequential access
 * @param namedVector
 *          output vectors should be named, retaining key (doc id) as a label
 * @param numReducers
 *          The number of reducers to spawn
 */
public static void mergePartialVectors(Iterable<Path> partialVectorPaths, Path output, Configuration baseConf,
        float normPower, boolean logNormalize, int dimension, boolean sequentialAccess, boolean namedVector,
        int numReducers) throws IOException, InterruptedException, ClassNotFoundException {
    Preconditions.checkArgument(normPower == NO_NORMALIZING || normPower >= 0,
            "If specified normPower must be nonnegative", normPower);
    Preconditions.checkArgument(
            normPower == NO_NORMALIZING || (normPower > 1 && !Double.isInfinite(normPower)) || !logNormalize,
            "normPower must be > 1 and not infinite if log normalization is chosen", normPower);
    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setBoolean(SEQUENTIAL_ACCESS, sequentialAccess);
    conf.setBoolean(NAMED_VECTOR, namedVector);
    conf.setInt(DIMENSION, dimension);
    conf.setFloat(NORMALIZATION_POWER, normPower);
    conf.setBoolean(LOG_NORMALIZE, logNormalize);
    Job job = new Job(conf);
    job.setJobName("PartialVectorMerger::MergePartialVectors");
    job.setJarByClass(PartialVectorMerger.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    FileInputFormat.setInputPaths(job, getCommaSeparatedPaths(partialVectorPaths));
    FileOutputFormat.setOutputPath(job, output);
    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setReducerClass(PartialVectorMergeReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(numReducers);
    HadoopUtil.delete(conf, output);
    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
From source file:org.apache.mahout.vectorizer.HighDFWordsPruner.java
License:Apache License
public static void mergePartialVectors(Iterable<Path> partialVectorPaths, Path output, Configuration baseConf,
        float normPower, boolean logNormalize, int numReducers)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setFloat(PartialVectorMerger.NORMALIZATION_POWER, normPower);
    conf.setBoolean(PartialVectorMerger.LOG_NORMALIZE, logNormalize);
    Job job = new Job(conf);
    job.setJobName("PrunerPartialVectorMerger::MergePartialVectors");
    job.setJarByClass(PartialVectorMerger.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    FileInputFormat.setInputPaths(job, getCommaSeparatedPaths(partialVectorPaths));
    FileOutputFormat.setOutputPath(job, output);
    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setReducerClass(PrunedPartialVectorMergeReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(numReducers);
    HadoopUtil.delete(conf, output);
    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
From source file:org.apache.orc.tools.TestFileDump.java
License:Apache License
@Test
public void testDictionaryThreshold() throws Exception {
    TypeDescription schema = getMyRecordType();
    Configuration conf = new Configuration();
    conf.set(OrcConf.ENCODING_STRATEGY.getAttribute(), "COMPRESSION");
    conf.setFloat(OrcConf.DICTIONARY_KEY_SIZE_THRESHOLD.getAttribute(), 0.49f);
    Writer writer = OrcFile.createWriter(testFilePath,
            OrcFile.writerOptions(conf).fileSystem(fs).setSchema(schema).stripeSize(100000)
                    .compress(CompressionKind.ZLIB).rowIndexStride(1000).bufferSize(10000));
    VectorizedRowBatch batch = schema.createRowBatch(1000);
    Random r1 = new Random(1);
    String[] words = new String[] { "It", "was", "the", "best", "of", "times,", "it", "was", "the", "worst",
            "of", "times,", "it", "was", "the", "age", "of", "wisdom,", "it", "was", "the", "age", "of",
            "foolishness,", "it", "was", "the", "epoch", "of", "belief,", "it", "was", "the", "epoch", "of",
            "incredulity,", "it", "was", "the", "season", "of", "Light,", "it", "was", "the", "season", "of",
            "Darkness,", "it", "was", "the", "spring", "of", "hope,", "it", "was", "the", "winter", "of",
            "despair,", "we", "had", "everything", "before", "us,", "we", "had", "nothing", "before", "us,",
            "we", "were", "all", "going", "direct", "to", "Heaven,", "we", "were", "all", "going", "direct",
            "the", "other", "way" };
    int nextInt = 0;
    for (int i = 0; i < 21000; ++i) {
        // Write out the same string twice, this guarantees the fraction of rows with
        // distinct strings is 0.5
        if (i % 2 == 0) {
            nextInt = r1.nextInt(words.length);
            // Append the value of i to the word, this guarantees when an index or word is repeated
            // the actual string is unique.
            words[nextInt] += "-" + i;
        }
        appendMyRecord(batch, r1.nextInt(), r1.nextLong(), words[nextInt]);
        if (batch.size == batch.getMaxSize()) {
            writer.addRowBatch(batch);
            batch.reset();
        }
    }
    if (batch.size != 0) {
        writer.addRowBatch(batch);
    }
    writer.close();
    PrintStream origOut = System.out;
    String outputFilename = "orc-file-dump-dictionary-threshold.out";
    FileOutputStream myOut = new FileOutputStream(workDir + File.separator + outputFilename);
    // replace stdout and run command
    System.setOut(new PrintStream(myOut));
    FileDump.main(new String[] { testFilePath.toString(), "--rowindex=1,2,3" });
    System.out.flush();
    System.setOut(origOut);
    checkOutput(outputFilename, workDir + File.separator + outputFilename);
}
From source file:org.apache.parquet.hadoop.thrift.TestCorruptThriftRecords.java
License:Apache License
@Test
public void testCanTolerateBadRecords() throws Exception {
    Configuration conf = new Configuration();
    conf.setFloat(UnmaterializableRecordCounter.BAD_RECORD_THRESHOLD_CONF_KEY, 0.1f);
    List<StructWithUnionV2> expected = new ArrayList<StructWithUnionV2>();
    readFile(writeFileWithCorruptRecords(4, expected), conf, "testCanTolerateBadRecords");
    assertEquals(200, ReadMapper.records.size());
    assertEqualsExcepted(expected, ReadMapper.records);
}
From source file:org.apache.parquet.hadoop.thrift.TestCorruptThriftRecords.java
License:Apache License
@Test
public void testThrowsWhenTooManyBadRecords() throws Exception {
    Configuration conf = new Configuration();
    conf.setFloat(UnmaterializableRecordCounter.BAD_RECORD_THRESHOLD_CONF_KEY, 0.1f);
    ArrayList<StructWithUnionV2> expected = new ArrayList<StructWithUnionV2>();
    try {
        readFile(writeFileWithCorruptRecords(300, expected), conf, "testThrowsWhenTooManyBadRecords");
        fail("This should throw");
    } catch (RuntimeException e) {
        // still should have actually read all the valid records
        assertEquals(100, ReadMapper.records.size());
        assertEqualsExcepted(expected.subList(0, 100), ReadMapper.records);
    }
}