public float getFloat(String name, float defaultValue) 

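Get the value of the name property as a float. If no such property exists, defaultValue is returned; if the property is set but its value is not a valid float, a NumberFormatException is thrown.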


From source file:edu.umn.cs.spatialHadoop.operations.Indexer.java

License:Open Source License

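/**
 * Create a partitioner for a particular job.
 * @param ins input paths; their total size drives the number of partitions and they are the source of the point sample
 * @param out output path, used to look up the output file system and its default block size
 * @param job job configuration; read for the "mbr" shape, the indexing overhead and the sampling settings, and updated with the "replicate" flag when one is registered for the partitioner
 * @param partitionerName key in PartitionerClasses (matched in lower case) or, failing that, the name of a class that is a subclass of Partitioner
 * @return the partitioner built from the sampled points, or null if the partitioner class cannot be instantiated
 * @throws IOException propagated from the file system and sampling calls
 * @throws RuntimeException if partitionerName is neither a registered name nor a loadable class name
 */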
public static Partitioner createPartitioner(Path[] ins, Path out, Configuration job, String partitionerName)
        throws IOException {
    // NOTE(review): several closing braces (and possibly other lines) are absent from this
    // listing; the comments below describe only the statements that are visible.
    try {
        Partitioner partitioner = null;
        // Resolve the partitioner class registered under the lower-cased name.
        Class<? extends Partitioner> partitionerClass = PartitionerClasses.get(partitionerName.toLowerCase());
        if (partitionerClass == null) {
            // Try to parse the name as a class name
            try {
                partitionerClass = Class.forName(partitionerName).asSubclass(Partitioner.class);
            } catch (ClassNotFoundException e) {
                // NOTE(review): the ClassNotFoundException is not passed on as the cause.
                throw new RuntimeException("Unknown index type '" + partitionerName + "'");

        // When a replication flag is registered for this partitioner, publish it in the job
        // configuration under the "replicate" key.
        if (PartitionerReplicate.containsKey(partitionerName.toLowerCase())) {
            boolean replicate = PartitionerReplicate.get(partitionerName.toLowerCase());
            job.setBoolean("replicate", replicate);
        // Instantiated reflectively; the resulting checked exceptions are handled by the
        // catch clauses at the end of the method.
        partitioner = partitionerClass.newInstance();

        // t1 marks the start of the interval that is reported as sampling time further down.
        long t1 = System.currentTimeMillis();
        final Rectangle inMBR = (Rectangle) OperationsParams.getShape(job, "mbr");
        // Determine number of partitions
        long inSize = 0;
        for (Path in : ins) {
            inSize += FileUtil.getPathSize(in.getFileSystem(job), in);
        // Estimated output size = total input size inflated by the configured indexing
        // overhead (default 0.1).
        long estimatedOutSize = (long) (inSize * (1.0 + job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.1f)));
        FileSystem outFS = out.getFileSystem(job);
        long outBlockSize = outFS.getDefaultBlockSize(out);
        // One partition per output block, but never fewer than one.
        // NOTE(review): the cast to float limits precision for very large sizes.
        int numPartitions = Math.max(1, (int) Math.ceil((float) estimatedOutSize / outBlockSize));
        LOG.info("Partitioning the space into " + numPartitions + " partitions");

        // Holds the sampled points that are later handed to the partitioner.
        final Vector<Point> sample = new Vector<Point>();
        // Sampling limits: a ratio (default 0.01) and a size (default 100 * 1024 * 1024).
        float sample_ratio = job.getFloat(SpatialSite.SAMPLE_RATIO, 0.01f);
        long sample_size = job.getLong(SpatialSite.SAMPLE_SIZE, 100 * 1024 * 1024);

        LOG.info("Reading a sample of " + (int) Math.round(sample_ratio * 100) + "%");
        // NOTE(review): the body of collect(...) and the end of this anonymous class are missing
        // from the listing; presumably it stores p in sample - confirm against the original source.
        ResultCollector<Point> resultCollector = new ResultCollector<Point>() {
            public void collect(Point p) {
        // The sampler parameters are set on a separate OperationsParams created from job.
        OperationsParams params2 = new OperationsParams(job);
        params2.setFloat("ratio", sample_ratio);
        params2.setLong("size", sample_size);
        params2.setClass("outshape", Point.class, Shape.class);
        Sampler.sample(ins, resultCollector, params2);
        long t2 = System.currentTimeMillis();
        // The elapsed time since t1 is written to standard output rather than to LOG.
        System.out.println("Total time for sampling in millis: " + (t2 - t1));
        LOG.info("Finished reading a sample of " + sample.size() + " records");

        // Build the partitioner from the input MBR, the sample and the target partition count.
        partitioner.createFromPoints(inMBR, sample.toArray(new Point[sample.size()]), numPartitions);

        return partitioner;
    // In the visible code a reflection failure is not rethrown: the caller only gets null.
    } catch (InstantiationException e) {
        return null;
    } catch (IllegalAccessException e) {
        return null;

From source file:edu.umn.cs.spatialHadoop.operations.Repartition.java

License:Open Source License

/**
 * Calculates the number of partitions required to index the given file.
 * @param conf The current configuration which can contain user-defined parameters
 * @param inFileSize The size of the input file in bytes
 * @param outFs The output file system where the index will be written
 * @param outFile The path of the output file which is used to get the output block size.
 * @param blockSize If non-zero, this overrides the default output block size; pass 0 to use the default block size that outFs reports for outFile.
 * @return The number of blocks needed to write the index file
 */
public static int calculateNumberOfPartitions(Configuration conf, long inFileSize, FileSystem outFs,
        Path outFile, long blockSize) {
    final float IndexingOverhead = conf.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.1f);
    long indexedFileSize = (long) (inFileSize * (1 + IndexingOverhead));
    if (blockSize == 0)
        blockSize = outFs.getDefaultBlockSize(outFile);
    return (int) Math.ceil((float) indexedFileSize / blockSize);

From source file:external.nutch.scoring.initial.InitialStaticScoringFilter.java

License:Apache License

public void setConf(Configuration conf) {
    this.conf = conf;
    initialScore = conf.getFloat("initial.static.score", 1.00f);

From source file:gov.jgi.meta.exec.BlastCommand.java

License:Open Source License

/**
 * Creates a new blast command based on values stored in the configuration.
 * <p/>
 * Looks for the following config values: blast.commandline,
 * blast.commandpath, formatdb.commandline, formatdb.commandpath,
 * blast.tmpdir, blast.cleanup, blast.effectivedatabasesize,
 * blast.usescaledevalue, blast.useeffectivesize and blast.useevalue
 * @param config is the hadoop configuration with overriding values
 *               for commandline options and paths
 * @throws IOException if executable can not be found
 */
public BlastCommand(Configuration config) throws IOException {
    String c;

    log.info("initializing new blast command");

    // Each string setting below is overridden only when its key is present in the
    // configuration; otherwise the field keeps the value it already had.
    if ((c = config.get("blast.commandline")) != null) {
        commandLine = c;
    }
    if ((c = config.get("blast.commandpath")) != null) {
        commandPath = c;
    }
    if ((c = config.get("formatdb.commandline")) != null) {
        formatdbCommandLine = c;
    }
    if ((c = config.get("formatdb.commandpath")) != null) {
        formatdbCommandPath = c;
    }

    if ((c = config.get("blast.tmpdir")) != null) {
        tmpDir = c;
    }

    // Typed settings are always assigned, falling back to the defaults given here.
    docleanup = config.getBoolean("blast.cleanup", true);

    effectiveSize = config.getLong("blast.effectivedatabasesize", 0);
    useScaledEValue = config.getBoolean("blast.usescaledevalue", false);
    useEffectiveSize = config.getBoolean("blast.useeffectivesize", false);
    useEValue = config.getFloat("blast.useevalue", 10F);

    /*
     * do sanity check to make sure all paths exist
     *
     * NOTE(review): no such check is present in the code visible here.
     */

    /*
     * if all is good, create a working space inside tmpDir
     */
    tmpDirFile = MetaUtils.createTempDir("blast_", tmpDir);

    log.info("done initializing: tmp dir = " + tmpDirFile);
}

From source file:idgs.ConfVar.java

License:Open Source License

public static Float getFloatVar(Configuration conf, ConfVar variable) {
    require(variable.valClass == Float.class);
    return conf.getFloat(variable.varname, variable.defaultFloatVal);

From source file:io.hops.common.IDsMonitor.java

License:Apache License

public void setConfiguration(Configuration conf) {
    // Forwards to a setConfiguration(...) call.
    setConfiguration(/* NOTE(review): the argument list of this call is cut off in this listing */

From source file:ir.co.bayan.simorq.zal.extractor.nutch.OPICScoringFilter.java

License:Apache License

public void setConf(Configuration conf) {
    this.conf = conf;
    scorePower = conf.getFloat("indexer.score.power", 0.5f);
    internalScoreFactor = conf.getFloat("db.score.link.internal", 1.0f);
    externalScoreFactor = conf.getFloat("db.score.link.external", 1.0f);
    countFiltered = conf.getBoolean("db.score.count.filtered", false);

From source file:it.crs4.seal.recab.RecabTableReducer.java

License:Open Source License

public void setup(Configuration conf) {
    smoothing = conf.getFloat(CONF_SMOOTHING, CONF_SMOOTHING_DEFAULT);
    if (smoothing < 0.0)
        throw new IllegalArgumentException(CONF_SMOOTHING + " can't be less than 0");

    maxQscore = conf.getInt(CONF_MAX_QSCORE, CONF_MAX_QSCORE_DEFAULT);
    if (maxQscore <= 0)
        throw new IllegalArgumentException(CONF_MAX_QSCORE + " must be greater than 0");

From source file:ivory.core.index.BuildLPInvertedIndexDocSorted.java

License:Apache License

public int runTool() throws Exception {
    // NOTE(review): this listing has lost lines (closing braces, a continuation line and,
    // presumably, the job set-up and submission); comments describe only what is visible.
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get(Constants.IndexPath);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    String collectionName = env.readCollectionName();

    int reduceTasks = conf.getInt(Constants.NumReduceTasks, 0);
    int minSplitSize = conf.getInt(Constants.MinSplitSize, 0);
    int collectionDocCount = env.readCollectionDocumentCount();

    // NOTE(review): the default-value argument of this conf.get(...) call is cut off.
    String postingsType = conf.get(Constants.PostingsListsType,
    // The postings list implementation is loaded by name; the cast is unchecked.
    Class<? extends PostingsList> postingsClass = (Class<? extends PostingsList>) Class.forName(postingsType);

    // These are the default values for the LP algorithm.
    // NOTE(review): in the lines visible here these four values are only logged.
    float mapMemoryThreshold = conf.getFloat(Constants.IndexingMapMemoryThreshold, 0.9f);
    float reduceMemoryThreshold = conf.getFloat(Constants.IndexingReduceMemoryThreshold, 0.9f);
    int maxHeap = conf.getInt(Constants.MaxHeap, 2048);
    int maxNDocsBeforeFlush = conf.getInt(Constants.MaxNDocsBeforeFlush, 50000);

    // Log the effective settings before doing any work.
    LOG.info("PowerTool: " + BuildLPInvertedIndexDocSorted.class.getSimpleName());
    LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));
    LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
    LOG.info(String.format(" - %s: %s", Constants.CollectionDocumentCount, collectionDocCount));
    LOG.info(String.format(" - %s: %s", Constants.PostingsListsType, postingsClass.getCanonicalName()));
    LOG.info(String.format(" - %s: %s", Constants.NumReduceTasks, reduceTasks));
    LOG.info(String.format(" - %s: %s", Constants.MinSplitSize, minSplitSize));
    LOG.info(String.format(" - %s: %s", Constants.IndexingMapMemoryThreshold, mapMemoryThreshold));
    LOG.info(String.format(" - %s: %s", Constants.IndexingReduceMemoryThreshold, reduceMemoryThreshold));
    LOG.info(String.format(" - %s: %s", Constants.MaxHeap, maxHeap));
    LOG.info(String.format(" - %s: %s", Constants.MaxNDocsBeforeFlush, maxNDocsBeforeFlush));

    // Create the index directory when it does not exist yet.
    if (!fs.exists(new Path(indexPath))) {
        fs.mkdirs(new Path(indexPath));
    }

    Path inputPath = new Path(env.getIntDocVectorsDirectory());
    Path postingsPath = new Path(env.getPostingsDirectory());

    // Existing postings are left untouched; the tool reports success without indexing.
    if (fs.exists(postingsPath)) {
        LOG.info("Postings already exist: no indexing will be performed.");
        return 0;

    // Store the document count in the configuration that is handed to the job below.
    conf.setInt(Constants.CollectionDocumentCount, collectionDocCount);

    conf.setInt("mapred.min.split.size", minSplitSize);
    // The memory settings below are fixed at 2048 / -Xmx2048m; maxHeap is not consulted
    // (the line that used it is commented out).
    //conf.set("mapred.child.java.opts", "-Xmx" + maxHeap + "m");
    conf.set("mapreduce.map.memory.mb", "2048");
    conf.set("mapreduce.map.java.opts", "-Xmx2048m");
    conf.set("mapreduce.reduce.memory.mb", "2048");
    conf.set("mapreduce.reduce.java.opts", "-Xmx2048m");

    // The job is named "<simple class name>:<collection name>".
    Job job = Job.getInstance(conf, BuildLPInvertedIndexDocSorted.class.getSimpleName() + ":" + collectionName);


    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, postingsPath);




    // NOTE(review): nothing visible runs between capturing startTime and logging the elapsed
    // time; the job submission presumably sat here - confirm against the original source.
    long startTime = System.currentTimeMillis();
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");


    return 0;

From source file:mlbench.bayes.BayesUtils.java

License:Apache License

public static NaiveBayesModel readModelFromDir(Path base, Configuration conf) {

    float alphaI = conf.getFloat(ThetaMapper.ALPHA_I, 1.0f);

    // read feature sums and label sums
    Vector scoresPerLabel = null;
    Vector scoresPerFeature = null;
    for (Pair<Text, VectorWritable> record : new SequenceFileDirIterable<Text, VectorWritable>(
            new Path(base, TrainNaiveBayesJob.WEIGHTS), PathType.LIST, PathFilters.partFilter(), conf)) {
        String key = record.getFirst().toString();
        VectorWritable value = record.getSecond();
        if (key.equals(TrainNaiveBayesJob.WEIGHTS_PER_FEATURE)) {
            scoresPerFeature = value.get();
        } else if (key.equals(TrainNaiveBayesJob.WEIGHTS_PER_LABEL)) {
            scoresPerLabel = value.get();
        }/*w w w.  ja  va2 s  .c  o m*/

    // Preconditions.checkNotNull(scoresPerFeature);
    // Preconditions.checkNotNull(scoresPerLabel);

    Matrix scoresPerLabelAndFeature = new SparseMatrix(scoresPerLabel.size(), scoresPerFeature.size());
    for (Pair<IntWritable, VectorWritable> entry : new SequenceFileDirIterable<IntWritable, VectorWritable>(
            new Path(base, TrainNaiveBayesJob.SUMMED_OBSERVATIONS), PathType.LIST, PathFilters.partFilter(),
            conf)) {
        scoresPerLabelAndFeature.assignRow(entry.getFirst().get(), entry.getSecond().get());

    Vector perlabelThetaNormalizer = scoresPerLabel.like();
     * for (Pair<Text,VectorWritable> entry : new
     * SequenceFileDirIterable<Text,VectorWritable>( new Path(base,
     * TrainNaiveBayesJob.THETAS), PathType.LIST, PathFilters.partFilter(),
     * conf)) { if (entry.getFirst().toString().equals(TrainNaiveBayesJob.
     * LABEL_THETA_NORMALIZER)) { perlabelThetaNormalizer =
     * entry.getSecond().get(); } }
     * Preconditions.checkNotNull(perlabelThetaNormalizer);
    return new NaiveBayesModel(scoresPerLabelAndFeature, scoresPerFeature, scoresPerLabel,
            perlabelThetaNormalizer, alphaI, false);