Example usage for org.apache.hadoop.conf.Configuration.setInt

List of usage examples for org.apache.hadoop.conf.Configuration.setInt

Introduction

On this page you can find example usages of org.apache.hadoop.conf.Configuration.setInt.

Prototype

public void setInt(String name, int value) 

Document

Set the value of the name property to an int.
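
Below is a minimal sketch, assuming a made-up property name ("example.max.retries"), that shows the round trip: a value stored with setInt and read back with getInt, which returns the supplied default when the property is absent.

import org.apache.hadoop.conf.Configuration;

public class SetIntSketch {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // Store an int under a hypothetical property name.
        conf.setInt("example.max.retries", 5);

        // Read it back; the second argument is the default used when the key is unset.
        int retries = conf.getInt("example.max.retries", 1);
        System.out.println("example.max.retries = " + retries); // prints 5

        // An unset property falls back to the default.
        int missing = conf.getInt("example.unset.key", 42);
        System.out.println("example.unset.key = " + missing); // prints 42
    }
}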

Usage

From source file: edu.indiana.d2i.htrc.exp.PartialVectorsFromTokenizedDoc.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
    }

    // all directories are in HDFS
    tokenizedDocDir = args[0];
    dictDir = args[1];
    outputDir = args[2];
    numReducers = Integer.valueOf(args[3]);

    logger.info("PartialVectorsFromTokenizedDoc ");
    logger.info(" - tokenizedDocDir: " + tokenizedDocDir);
    logger.info(" - dictDir: " + dictDir);
    logger.info(" - outputDir: " + outputDir);
    logger.info(" - numReducers: " + numReducers);

    Path tokenizedDocPath = new Path(tokenizedDocDir);
    Path dictPath = new Path(dictDir);
    Path outputPath = new Path(outputDir);

    // get dimension
    Configuration conf = getConf();

    int dimension = 0;
    for (Pair<Writable, IntWritable> record : new SequenceFileIterable<Writable, IntWritable>(dictPath, true,
            conf)) {
        dimension++;
    }
    logger.info("dimension of a vector: " + dimension);

    // submit job
    long t0 = System.currentTimeMillis();
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt(PartialVectorMerger.DIMENSION, dimension);
    DistributedCache.setCacheFiles(new URI[] { dictPath.toUri() }, conf);

    Job job = new Job(conf);
    job.setJobName("PartialVectorsFromTokenizedDoc::MakePartialVectors: input-folder: " + tokenizedDocDir
            + ", dictionary-file: " + dictDir);
    job.setJarByClass(PartialVectorsFromTokenizedDoc.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(StringTuple.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    FileInputFormat.setInputPaths(job, tokenizedDocPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    HadoopUtil.delete(conf, outputPath);

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setReducerClass(TFPartialVectorReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(numReducers);

    job.waitForCompletion(true);

    long t1 = System.currentTimeMillis();
    logger.info("PartialVectorsFromTokenizedDoc takes " + (double) (t1 - t0) / 1000 + " seconds.");

    return 0;
}
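
The pattern above, calling setInt on the job Configuration before submitting the Job, is how such values reach the map and reduce tasks, which read them back through the task context. A minimal, hypothetical mapper sketch (the property key "example.vector.dimension" and the class name are invented for illustration; the example above uses the constant PartialVectorMerger.DIMENSION instead):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class DimensionAwareMapper extends Mapper<Text, Text, Text, IntWritable> {
    private int dimension;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        // Read the value that the driver stored with conf.setInt(...) before job submission.
        dimension = conf.getInt("example.vector.dimension", 0);
    }

    @Override
    protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
        // Emit the configured dimension for each input record (illustration only).
        context.write(key, new IntWritable(dimension));
    }
}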

From source file: edu.indiana.d2i.htrc.io.DataAPIDefaultConf.java

License: Apache License

public void configurate(Configuration conf, int maxIdsPerReq) {
    conf.setInt(HTRCConstants.MAX_ID_RETRIEVED, maxIdsPerReq);
}

From source file: edu.indiana.d2i.htrc.io.DataAPISilvermapleConf.java

License: Apache License

@Override
public void configurate(Configuration conf, int maxIdsPerReq) {
    //      conf.setInt(HTRCConstants.MAX_ID_RETRIEVED, 100);
    conf.setInt(HTRCConstants.MAX_ID_RETRIEVED, maxIdsPerReq);

    conf.set(HTRCConstants.DATA_API_URL_DELIMITOR, "|");
    conf.set(HTRCConstants.DATA_API_CLIENTID, "drhtrc");
    conf.set(HTRCConstants.DATA_API_CLIENTSECRETE, "d0ct0r.htrc");
    conf.set(HTRCConstants.DATA_API_TOKENLOC,
            "https://silvermaple.pti.indiana.edu:25443/oauth2/token?grant_type=client_credentials");
    conf.setBoolean(HTRCConstants.DATA_API_SELFSIGNED, false);
    conf.set(HTRCConstants.HOSTS_SEPARATEDBY_COMMA, "silvermaple.pti.indiana.edu:25443");
}

From source file: edu.indiana.d2i.htrc.io.mem.MemCachedUtil.java

License: Apache License

public static void configHelper(Configuration conf, String memhostsPath) throws IOException {
    List<String> hosts = new ArrayList<String>();
    FileSystem fs = FileSystem.get(conf);
    DataInputStream fsinput = new DataInputStream(fs.open(new Path(memhostsPath)));
    BufferedReader reader = new BufferedReader(new InputStreamReader(fsinput));
    String line = null;
    while ((line = reader.readLine()) != null) {
        hosts.add(line);
    }
    reader.close();
    String[] hostsArray = hosts.toArray(new String[hosts.size()]);

    conf.setInt(HTRCConstants.MEMCACHED_CLIENT_NUM, 1);
    //      conf.setInt(HTRCConstants.MEMCACHED_MAX_EXPIRE, Integer.MAX_VALUE);
    conf.setInt(HTRCConstants.MEMCACHED_MAX_EXPIRE, 60 * 60 * 60); // seconds
    conf.setStrings(HTRCConstants.MEMCACHED_HOSTS, hostsArray);
}

From source file: edu.indiana.d2i.htrc.io.SparseVectorsToMemcached.java

License: Apache License

private void setupConfiguration(Configuration conf) throws ClassNotFoundException, IOException {
    // set dictionary
    conf.set(HTRCConstants.DICTIONARY_PATH, dictDir);

    // set analyzer
    conf.set(DocumentProcessor.ANALYZER_CLASS, analyzerClassName);

    // set data api conf
    conf.setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);
    Utilities.setDataAPIConf(conf, dataAPIConfClassName, maxIdsPerReq);

    // set memcached conf
    MemCachedUtil.configHelper(conf, memHostsPath);
}

From source file: edu.indiana.d2i.htrc.kmeans.MemKMeansUtil.java

License: Apache License

public static void kmeansConfigHelper(Configuration conf, int k) {
    conf.setInt(MemKMeansConfig.CLUSTER_NUM, k);
    conf.set(MemKMeansConfig.KEY_NS, CLUSTER_NAMESPACE);
}

From source file: edu.indiana.d2i.htrc.skmeans.StreamingKMeansAdapterTest.java

License: Apache License

@Test
public static void testCluster() {
    int dimension = 500;

    // construct data samplers centered on the corners of a unit cube
    Matrix mean = new DenseMatrix(8, dimension);
    List<MultiNormal> rowSamplers = Lists.newArrayList();
    for (int i = 0; i < 8; i++) {
        //         mean.viewRow(i).assign(
        //               new double[] { 0.25 * (i & 4), 0.5 * (i & 2), i & 1 });

        double[] random = new double[dimension];
        for (int j = 0; j < random.length; j++) {
            random[j] = Math.random();
        }
        mean.viewRow(i).assign(random);
        rowSamplers.add(new MultiNormal(0.01, mean.viewRow(i)));
    }

    // sample a bunch of data points
    Matrix data = new DenseMatrix(10000, dimension);
    for (MatrixSlice row : data) {
        row.vector().assign(rowSamplers.get(row.index() % 8).sample());
    }

    // cluster the data
    long t0 = System.currentTimeMillis();

    double cutoff = StreamingKMeansAdapter.estimateCutoff(data, 100);
    Configuration conf = new Configuration();
    conf.setInt(StreamingKMeansConfigKeys.MAXCLUSTER, 1000);
    conf.setFloat(StreamingKMeansConfigKeys.CUTOFF, (float) cutoff);
    conf.setClass(StreamingKMeansConfigKeys.DIST_MEASUREMENT, EuclideanDistanceMeasure.class,
            DistanceMeasure.class);
    conf.setInt(StreamingKMeansConfigKeys.VECTOR_DIMENSION, dimension);
    StreamingKMeansAdapter skmeans = new StreamingKMeansAdapter(conf);
    // for (MatrixSlice row : Iterables.skip(data, 1)) {
    // skmeans.cluster(row.vector());
    // }
    for (MatrixSlice row : data) {
        skmeans.cluster(row.vector());
    }

    // validate
    Searcher r = skmeans.getCentroids();

    // StreamingKMeansAdapter skmeans = new StreamingKMeansAdapter();
    // Searcher r = skmeans.cluster(data, 1000, centroidFactory);

    long t1 = System.currentTimeMillis();

    assertEquals("Total weight not preserved", totalWeight(data), totalWeight(r), 1e-9);

    // and verify that each corner of the cube has a centroid very nearby
    for (MatrixSlice row : mean) {
        WeightedVector v = r.search(row.vector(), 1).get(0);
        assertTrue(v.getWeight() < 0.05);
    }
    System.out.printf("%.2f for clustering\n%.1f us per row\n", (t1 - t0) / 1000.0,
            (t1 - t0) / 1000.0 / data.rowSize() * 1e6);

    System.out.println("Done??");
}

From source file: edu.indiana.d2i.htrc.skmeans.StreamingKMeansDriver.java

License: Apache License

private void StreamingKMeansConfigHelper(Configuration conf, String input, int maxCluster) throws IOException {
    // get samples to calculate scale factor
    FileSystem fs = FileSystem.get(conf);
    FileStatus[] status = fs.listStatus(new Path(input), Utilities.HIDDEN_FILE_FILTER);
    int index = 0 + (int) (Math.random() * (status.length));
    SequenceFile.Reader seqReader = new SequenceFile.Reader(fs, status[index].getPath(), conf);

    int count = 0;
    Text key = new Text();
    VectorWritable value = new VectorWritable();
    List<MatrixSlice> slices = new ArrayList<MatrixSlice>();
    while (seqReader.next(key, value) && count < samplesNum) {
        MatrixSlice slice = new MatrixSlice(value.get().clone(), count);
        slices.add(slice);
        count++;
    }

    // set cutoff
    float cutoff = (float) StreamingKmeans.estimateCutoff(slices, samplesNum);
    conf.setFloat(StreamingKMeansConfigKeys.CUTOFF, cutoff);
    logger.info("Scale factor (cutoff) is: " + cutoff);

    // set vector dimension
    int dim = value.get().size();
    conf.setInt(StreamingKMeansConfigKeys.VECTOR_DIMENSION, dim);
    logger.info("Dimemsion of a vector is: " + dim);

    // set maximum #cluster
    conf.setInt(StreamingKMeansConfigKeys.MAXCLUSTER, maxCluster);

    // set distance measurement
    conf.set(StreamingKMeansConfigKeys.DIST_MEASUREMENT, EuclideanDistanceMeasure.class.getName());
}

From source file: edu.indiana.soic.ts.mapreduce.pwd.PairWiseDistance.java

License: Open Source License

public int execJob(Configuration conf, String sequenceFileFullPath, String sequenceFile, String distDir)
        throws Exception {
    /* input parameters */
    LOG.info(sequenceFileFullPath);
    Job job = new Job(conf, "Pairwise-calc-" + sequenceFile);

    /* create the base dir for this job. Delete and recreates if it exists */
    Path hdMainDir = new Path(distDir + "/" + sequenceFile);
    FileSystem fs = FileSystem.get(conf);
    fs.delete(hdMainDir, true);
    Path hdInputDir = new Path(hdMainDir, "data");
    if (!fs.mkdirs(hdInputDir)) {
        throw new IOException("Mkdirs failed to create " + hdInputDir.toString());
    }

    int noOfSequences = getNoOfSequences(sequenceFileFullPath, fs);
    int noOfDivisions = (int) Math.ceil(noOfSequences / (double) blockSize);
    int noOfBlocks = (noOfDivisions * (noOfDivisions + 1)) / 2;
    LOG.info("No of divisions :" + noOfDivisions + "\nNo of blocks :" + noOfBlocks + "\nBlock size :"
            + blockSize);

    // Retrieving the configuration from the job to set the properties
    // Setting properties to the original conf does not work (possible
    // Hadoop bug)
    Configuration jobConf = job.getConfiguration();

    // Input dir in HDFS. Create this in newly created job base dir
    Path inputDir = new Path(hdMainDir, "input");
    if (!fs.mkdirs(inputDir)) {
        throw new IOException("Mkdirs failed to create " + inputDir.toString());
    }

    Long dataPartitionStartTime = System.nanoTime();
    partitionData(sequenceFileFullPath, noOfSequences, blockSize, fs, noOfDivisions, jobConf, inputDir);

    distributeData(blockSize, conf, fs, hdInputDir, noOfDivisions);

    long dataPartTime = (System.nanoTime() - dataPartitionStartTime) / 1000000;
    LOG.info("Data Partition & Scatter Completed in (ms):" + dataPartTime);

    // Output dir in HDFS
    Path hdOutDir = new Path(hdMainDir, "out");

    jobConf.setInt(Constants.BLOCK_SIZE, blockSize);
    jobConf.setInt(Constants.NO_OF_DIVISIONS, noOfDivisions);
    jobConf.setInt(Constants.NO_OF_SEQUENCES, noOfSequences);
    jobConf.set(Constants.DIST_FUNC, distFunc);

    job.setJarByClass(PairWiseDistance.class);
    job.setMapperClass(SWGMap.class);
    job.setReducerClass(SWGReduce.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(SWGWritable.class);
    FileInputFormat.setInputPaths(job, hdInputDir);
    FileOutputFormat.setOutputPath(job, hdOutDir);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(noOfDivisions);

    long startTime = System.currentTimeMillis();
    int exitStatus = job.waitForCompletion(true) ? 0 : 1;
    double executionTime = (System.currentTimeMillis() - startTime) / 1000.0;
    LOG.info("Job Finished in " + executionTime + " seconds");
    LOG.info("# #seq\t#blockS\tTtime\tinput\tdataDistTime\toutput" + noOfSequences + "\t" + noOfBlocks + "\t"
            + executionTime + "\t" + sequenceFileFullPath + "\t" + dataPartTime + "\t" + hdMainDir);

    return exitStatus;
}

From source file: edu.isi.mavuno.app.distsim.ContextToContext.java

License: Apache License

public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String contextPath = MavunoUtils.getRequiredParam("Mavuno.ContextToContext.ContextPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.ContextToContext.CorpusClass", conf);
    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.ContextToContext.CorpusPath", conf);
    String extractorClass = MavunoUtils.getRequiredParam("Mavuno.ContextToContext.ExtractorClass", conf);
    String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.ContextToContext.ExtractorArgs", conf);
    int minMatches = Integer.parseInt(MavunoUtils.getRequiredParam("Mavuno.ContextToContext.MinMatches", conf));
    boolean harvestGlobalStats = Boolean
            .parseBoolean(MavunoUtils.getRequiredParam("Mavuno.ContextToContext.GlobalStats", conf));
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.ContextToContext.OutputPath", conf);

    MavunoUtils.createDirectory(conf, outputPath);

    sLogger.info("Tool name: ContextToContext");
    sLogger.info(" - Context path: " + contextPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Output path: " + outputPath);
    sLogger.info(" - Extractor class: " + extractorClass);
    sLogger.info(" - Extractor arguments: " + extractorArgs);
    sLogger.info(" - Min matches: " + minMatches);
    sLogger.info(" - Harvest global stats: " + harvestGlobalStats);

    // context to pattern
    conf.set("Mavuno.ContextToPattern.ContextPath", contextPath);
    conf.set("Mavuno.ContextToPattern.CorpusPath", corpusPath);
    conf.set("Mavuno.ContextToPattern.CorpusClass", corpusClass);
    conf.set("Mavuno.ContextToPattern.ExtractorClass", extractorClass);
    conf.set("Mavuno.ContextToPattern.ExtractorArgs", extractorArgs);
    conf.setInt("Mavuno.ContextToPattern.MinMatches", minMatches);
    conf.setBoolean("Mavuno.ContextToPattern.GlobalStats", harvestGlobalStats);
    conf.set("Mavuno.ContextToPattern.OutputPath", outputPath);
    new ContextToPattern(conf).run();

    // pattern to context
    conf.set("Mavuno.PatternToContext.PatternPath", outputPath + "/pattern-stats");
    conf.set("Mavuno.PatternToContext.CorpusPath", corpusPath);
    conf.set("Mavuno.PatternToContext.CorpusClass", corpusClass);
    conf.set("Mavuno.PatternToContext.ExtractorClass", extractorClass);
    conf.set("Mavuno.PatternToContext.ExtractorArgs", extractorArgs);
    conf.setInt("Mavuno.PatternToContext.MinMatches", minMatches);
    conf.setBoolean("Mavuno.PatternToContext.GlobalStats", harvestGlobalStats);
    conf.set("Mavuno.PatternToContext.OutputPath", outputPath);
    new PatternToContext(conf).run();

    return 0;
}