List of usage examples for org.apache.hadoop.conf Configuration setInt
public void setInt(String name, int value)
Set the value of the name property to an int.
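Before the per-project examples, a minimal self-contained sketch of the basic round trip: setInt stores the value under the given property name, and getInt reads it back with a caller-supplied default. The property name my.example.max.retries and the values below are made up for illustration.

import org.apache.hadoop.conf.Configuration;

public class SetIntRoundTrip {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // Store an int under a property name (kept internally as a String).
        conf.setInt("my.example.max.retries", 5);

        // Read it back; the second argument is the default returned when the property is unset.
        int retries = conf.getInt("my.example.max.retries", 3);
        System.out.println("my.example.max.retries = " + retries); // prints 5
    }
}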
From source file: edu.indiana.d2i.htrc.exp.PartialVectorsFromTokenizedDoc.java
License: Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
    }

    // all directories are in HDFS
    tokenizedDocDir = args[0];
    dictDir = args[1];
    outputDir = args[2];
    numReducers = Integer.valueOf(args[3]);

    logger.info("PartialVectorsFromTokenizedDoc ");
    logger.info(" - tokenizedDocDir: " + tokenizedDocDir);
    logger.info(" - dictDir: " + dictDir);
    logger.info(" - outputDir: " + outputDir);
    logger.info(" - numReducers: " + numReducers);

    Path tokenizedDocPath = new Path(tokenizedDocDir);
    Path dictPath = new Path(dictDir);
    Path outputPath = new Path(outputDir);

    // get dimension
    Configuration conf = getConf();

    int dimension = 0;
    for (Pair<Writable, IntWritable> record : new SequenceFileIterable<Writable, IntWritable>(dictPath, true,
            conf)) {
        dimension++;
    }
    logger.info("dimension of a vector: " + dimension);

    // submit job
    long t0 = System.currentTimeMillis();

    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt(PartialVectorMerger.DIMENSION, dimension);
    DistributedCache.setCacheFiles(new URI[] { dictPath.toUri() }, conf);

    Job job = new Job(conf);
    job.setJobName("PartialVectorsFromTokenizedDoc::MakePartialVectors: input-folder: " + tokenizedDocDir
            + ", dictionary-file: " + dictDir);
    job.setJarByClass(PartialVectorsFromTokenizedDoc.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(StringTuple.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    FileInputFormat.setInputPaths(job, tokenizedDocPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    HadoopUtil.delete(conf, outputPath);

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setReducerClass(TFPartialVectorReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(numReducers);

    job.waitForCompletion(true);

    long t1 = System.currentTimeMillis();
    logger.info("PartialVectorsFromTokenizedDoc takes " + (double) (t1 - t0) / 1000 + " seconds.");

    return 0;
}
From source file: edu.indiana.d2i.htrc.io.DataAPIDefaultConf.java
License: Apache License

public void configurate(Configuration conf, int maxIdsPerReq) {
    conf.setInt(HTRCConstants.MAX_ID_RETRIEVED, maxIdsPerReq);
}
From source file: edu.indiana.d2i.htrc.io.DataAPISilvermapleConf.java
License: Apache License

@Override
public void configurate(Configuration conf, int maxIdsPerReq) {
    // conf.setInt(HTRCConstants.MAX_ID_RETRIEVED, 100);
    conf.setInt(HTRCConstants.MAX_ID_RETRIEVED, maxIdsPerReq);
    conf.set(HTRCConstants.DATA_API_URL_DELIMITOR, "|");
    conf.set(HTRCConstants.DATA_API_CLIENTID, "drhtrc");
    conf.set(HTRCConstants.DATA_API_CLIENTSECRETE, "d0ct0r.htrc");
    conf.set(HTRCConstants.DATA_API_TOKENLOC,
            "https://silvermaple.pti.indiana.edu:25443/oauth2/token?grant_type=client_credentials");
    conf.setBoolean(HTRCConstants.DATA_API_SELFSIGNED, false);
    conf.set(HTRCConstants.HOSTS_SEPARATEDBY_COMMA, "silvermaple.pti.indiana.edu:25443");
}
From source file: edu.indiana.d2i.htrc.io.mem.MemCachedUtil.java
License: Apache License

public static void configHelper(Configuration conf, String memhostsPath) throws IOException {
    List<String> hosts = new ArrayList<String>();
    FileSystem fs = FileSystem.get(conf);
    DataInputStream fsinput = new DataInputStream(fs.open(new Path(memhostsPath)));
    BufferedReader reader = new BufferedReader(new InputStreamReader(fsinput));
    String line = null;
    while ((line = reader.readLine()) != null) {
        hosts.add(line);
    }
    reader.close();

    String[] hostsArray = hosts.toArray(new String[hosts.size()]);
    conf.setInt(HTRCConstants.MEMCACHED_CLIENT_NUM, 1);
    // conf.setInt(HTRCConstants.MEMCACHED_MAX_EXPIRE, Integer.MAX_VALUE);
    conf.setInt(HTRCConstants.MEMCACHED_MAX_EXPIRE, 60 * 60 * 60); // seconds
    conf.setStrings(HTRCConstants.MEMCACHED_HOSTS, hostsArray);
}
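The values written by configHelper would typically be read back on the consuming side with the matching typed getters. A minimal sketch of that read path, assuming hypothetical property names in place of the project-specific HTRCConstants.MEMCACHED_* keys:

import org.apache.hadoop.conf.Configuration;

public class MemcachedConfigReadSketch {
    // Hypothetical property names standing in for HTRCConstants.MEMCACHED_*.
    static final String MEMCACHED_CLIENT_NUM = "htrc.memcached.client.num";
    static final String MEMCACHED_MAX_EXPIRE = "htrc.memcached.max.expire";
    static final String MEMCACHED_HOSTS = "htrc.memcached.hosts";

    public static void main(String[] args) {
        Configuration conf = new Configuration();
        conf.setInt(MEMCACHED_CLIENT_NUM, 1);
        conf.setInt(MEMCACHED_MAX_EXPIRE, 60 * 60 * 60); // seconds, as in the example above
        conf.setStrings(MEMCACHED_HOSTS, "host1:11211", "host2:11211");

        // setStrings stores a comma-separated list; getStrings splits it back into an array.
        int clientNum = conf.getInt(MEMCACHED_CLIENT_NUM, 1);
        int maxExpire = conf.getInt(MEMCACHED_MAX_EXPIRE, 3600);
        String[] hosts = conf.getStrings(MEMCACHED_HOSTS);

        System.out.println(clientNum + " client(s), expire " + maxExpire + "s, "
                + hosts.length + " host(s)");
    }
}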
From source file: edu.indiana.d2i.htrc.io.SparseVectorsToMemcached.java
License: Apache License

private void setupConfiguration(Configuration conf) throws ClassNotFoundException, IOException {
    // set dictionary
    conf.set(HTRCConstants.DICTIONARY_PATH, dictDir);

    // set analyzer
    conf.set(DocumentProcessor.ANALYZER_CLASS, analyzerClassName);

    // set data api conf
    conf.setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);
    Utilities.setDataAPIConf(conf, dataAPIConfClassName, maxIdsPerReq);

    // set memcached conf
    MemCachedUtil.configHelper(conf, memHostsPath);
}
From source file: edu.indiana.d2i.htrc.kmeans.MemKMeansUtil.java
License: Apache License

public static void kmeansConfigHelper(Configuration conf, int k) {
    conf.setInt(MemKMeansConfig.CLUSTER_NUM, k);
    conf.set(MemKMeansConfig.KEY_NS, CLUSTER_NAMESPACE);
}
From source file: edu.indiana.d2i.htrc.skmeans.StreamingKMeansAdapterTest.java
License: Apache License

@Test
public static void testCluster() {
    int dimension = 500;

    // construct data samplers centered on the corners of a unit cube
    Matrix mean = new DenseMatrix(8, dimension);
    List<MultiNormal> rowSamplers = Lists.newArrayList();
    for (int i = 0; i < 8; i++) {
        // mean.viewRow(i).assign(
        //         new double[] { 0.25 * (i & 4), 0.5 * (i & 2), i & 1 });

        double[] random = new double[dimension];
        for (int j = 0; j < random.length; j++) {
            random[j] = Math.random();
        }
        mean.viewRow(i).assign(random);
        rowSamplers.add(new MultiNormal(0.01, mean.viewRow(i)));
    }

    // sample a bunch of data points
    Matrix data = new DenseMatrix(10000, dimension);
    for (MatrixSlice row : data) {
        row.vector().assign(rowSamplers.get(row.index() % 8).sample());
    }

    // cluster the data
    long t0 = System.currentTimeMillis();

    double cutoff = StreamingKMeansAdapter.estimateCutoff(data, 100);
    Configuration conf = new Configuration();
    conf.setInt(StreamingKMeansConfigKeys.MAXCLUSTER, 1000);
    conf.setFloat(StreamingKMeansConfigKeys.CUTOFF, (float) cutoff);
    conf.setClass(StreamingKMeansConfigKeys.DIST_MEASUREMENT, EuclideanDistanceMeasure.class,
            DistanceMeasure.class);
    conf.setInt(StreamingKMeansConfigKeys.VECTOR_DIMENSION, dimension);
    StreamingKMeansAdapter skmeans = new StreamingKMeansAdapter(conf);
    // for (MatrixSlice row : Iterables.skip(data, 1)) {
    //     skmeans.cluster(row.vector());
    // }
    for (MatrixSlice row : data) {
        skmeans.cluster(row.vector());
    }

    // validate
    Searcher r = skmeans.getCentroids();

    // StreamingKMeansAdapter skmeans = new StreamingKMeansAdapter();
    // Searcher r = skmeans.cluster(data, 1000, centroidFactory);

    long t1 = System.currentTimeMillis();

    assertEquals("Total weight not preserved", totalWeight(data), totalWeight(r), 1e-9);

    // and verify that each corner of the cube has a centroid very nearby
    for (MatrixSlice row : mean) {
        WeightedVector v = r.search(row.vector(), 1).get(0);
        assertTrue(v.getWeight() < 0.05);
    }

    System.out.printf("%.2f for clustering\n%.1f us per row\n", (t1 - t0) / 1000.0,
            (t1 - t0) / 1000.0 / data.rowSize() * 1e6);
    System.out.println("Done??");
}
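Each setter used in the test has a matching typed getter on the consuming side. A minimal sketch of that symmetry, assuming hypothetical property names in place of StreamingKMeansConfigKeys.* and Mahout's DistanceMeasure classes on the classpath:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;

public class StreamingKMeansConfigReadSketch {
    // Hypothetical property names standing in for StreamingKMeansConfigKeys.*.
    static final String MAXCLUSTER = "skmeans.max.cluster";
    static final String CUTOFF = "skmeans.cutoff";
    static final String VECTOR_DIMENSION = "skmeans.vector.dimension";
    static final String DIST_MEASUREMENT = "skmeans.distance.measure";

    public static void main(String[] args) {
        Configuration conf = new Configuration();
        conf.setInt(MAXCLUSTER, 1000);
        conf.setFloat(CUTOFF, 0.5f);
        conf.setInt(VECTOR_DIMENSION, 500);
        conf.setClass(DIST_MEASUREMENT, EuclideanDistanceMeasure.class, DistanceMeasure.class);

        // The consuming side reads each value back with the typed getter and a default.
        int maxClusters = conf.getInt(MAXCLUSTER, 100);
        float cutoff = conf.getFloat(CUTOFF, 1.0f);
        int dimension = conf.getInt(VECTOR_DIMENSION, 2);
        DistanceMeasure measure = ReflectionUtils.newInstance(
                conf.getClass(DIST_MEASUREMENT, EuclideanDistanceMeasure.class, DistanceMeasure.class), conf);

        System.out.printf("maxClusters=%d cutoff=%.2f dimension=%d measure=%s%n",
                maxClusters, cutoff, dimension, measure.getClass().getSimpleName());
    }
}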
From source file: edu.indiana.d2i.htrc.skmeans.StreamingKMeansDriver.java
License: Apache License

private void StreamingKMeansConfigHelper(Configuration conf, String input, int maxCluster) throws IOException {
    // get samples to calculate scale factor
    FileSystem fs = FileSystem.get(conf);
    FileStatus[] status = fs.listStatus(new Path(input), Utilities.HIDDEN_FILE_FILTER);
    int index = 0 + (int) (Math.random() * (status.length));
    SequenceFile.Reader seqReader = new SequenceFile.Reader(fs, status[index].getPath(), conf);

    int count = 0;
    Text key = new Text();
    VectorWritable value = new VectorWritable();
    List<MatrixSlice> slices = new ArrayList<MatrixSlice>();
    while (seqReader.next(key, value) && count < samplesNum) {
        MatrixSlice slice = new MatrixSlice(value.get().clone(), count);
        slices.add(slice);
        count++;
    }

    // set cutoff
    float cutoff = (float) StreamingKmeans.estimateCutoff(slices, samplesNum);
    conf.setFloat(StreamingKMeansConfigKeys.CUTOFF, cutoff);
    logger.info("Scale factor (cutoff) is: " + cutoff);

    // set vector dimension
    int dim = value.get().size();
    conf.setInt(StreamingKMeansConfigKeys.VECTOR_DIMENSION, dim);
    logger.info("Dimension of a vector is: " + dim);

    // set maximum #cluster
    conf.setInt(StreamingKMeansConfigKeys.MAXCLUSTER, maxCluster);

    // set distance measurement
    conf.set(StreamingKMeansConfigKeys.DIST_MEASUREMENT, EuclideanDistanceMeasure.class.getName());
}
From source file: edu.indiana.soic.ts.mapreduce.pwd.PairWiseDistance.java
License: Open Source License

public int execJob(Configuration conf, String sequenceFileFullPath, String sequenceFile, String distDir)
        throws Exception {
    /* input parameters */
    LOG.info(sequenceFileFullPath);

    Job job = new Job(conf, "Pairwise-calc-" + sequenceFile);

    /* create the base dir for this job. Delete and recreates if it exists */
    Path hdMainDir = new Path(distDir + "/" + sequenceFile);
    FileSystem fs = FileSystem.get(conf);
    fs.delete(hdMainDir, true);

    Path hdInputDir = new Path(hdMainDir, "data");
    if (!fs.mkdirs(hdInputDir)) {
        throw new IOException("Mkdirs failed to create " + hdInputDir.toString());
    }

    int noOfSequences = getNoOfSequences(sequenceFileFullPath, fs);
    int noOfDivisions = (int) Math.ceil(noOfSequences / (double) blockSize);
    int noOfBlocks = (noOfDivisions * (noOfDivisions + 1)) / 2;
    LOG.info("No of divisions :" + noOfDivisions + "\nNo of blocks :" + noOfBlocks + "\nBlock size :"
            + blockSize);

    // Retrieving the configuration from the job to set the properties
    // Setting properties to the original conf does not work (possible
    // Hadoop bug)
    Configuration jobConf = job.getConfiguration();

    // Input dir in HDFS. Create this in newly created job base dir
    Path inputDir = new Path(hdMainDir, "input");
    if (!fs.mkdirs(inputDir)) {
        throw new IOException("Mkdirs failed to create " + inputDir.toString());
    }

    Long dataPartitionStartTime = System.nanoTime();
    partitionData(sequenceFileFullPath, noOfSequences, blockSize, fs, noOfDivisions, jobConf, inputDir);
    distributeData(blockSize, conf, fs, hdInputDir, noOfDivisions);
    long dataPartTime = (System.nanoTime() - dataPartitionStartTime) / 1000000;
    LOG.info("Data Partition & Scatter Completed in (ms):" + dataPartTime);

    // Output dir in HDFS
    Path hdOutDir = new Path(hdMainDir, "out");

    jobConf.setInt(Constants.BLOCK_SIZE, blockSize);
    jobConf.setInt(Constants.NO_OF_DIVISIONS, noOfDivisions);
    jobConf.setInt(Constants.NO_OF_SEQUENCES, noOfSequences);
    jobConf.set(Constants.DIST_FUNC, distFunc);

    job.setJarByClass(PairWiseDistance.class);
    job.setMapperClass(SWGMap.class);
    job.setReducerClass(SWGReduce.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(SWGWritable.class);
    FileInputFormat.setInputPaths(job, hdInputDir);
    FileOutputFormat.setOutputPath(job, hdOutDir);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(noOfDivisions);

    long startTime = System.currentTimeMillis();
    int exitStatus = job.waitForCompletion(true) ? 0 : 1;
    double executionTime = (System.currentTimeMillis() - startTime) / 1000.0;
    LOG.info("Job Finished in " + executionTime + " seconds");
    LOG.info("# #seq\t#blockS\tTtime\tinput\tdataDistTime\toutput" + noOfSequences + "\t" + noOfBlocks + "\t"
            + executionTime + "\t" + sequenceFileFullPath + "\t" + dataPartTime + "\t" + hdMainDir);

    return exitStatus;
}
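The ints placed on jobConf above are only useful once the tasks read them back; in the new MapReduce API that usually happens in Mapper.setup via context.getConfiguration(). A minimal sketch of that task-side read, using hypothetical property names and defaults in place of the project-specific Constants.* keys:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class BlockAwareMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
    private int blockSize;
    private int noOfDivisions;
    private int noOfSequences;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        // Each task reads the job-level ints back with getInt and a fallback default.
        blockSize = conf.getInt("pwd.block.size", 1000);
        noOfDivisions = conf.getInt("pwd.no.of.divisions", 1);
        noOfSequences = conf.getInt("pwd.no.of.sequences", 0);
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // ... use blockSize / noOfDivisions / noOfSequences to locate this task's block ...
        context.write(key, value);
    }
}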
From source file: edu.isi.mavuno.app.distsim.ContextToContext.java
License: Apache License

public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String contextPath = MavunoUtils.getRequiredParam("Mavuno.ContextToContext.ContextPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.ContextToContext.CorpusClass", conf);
    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.ContextToContext.CorpusPath", conf);
    String extractorClass = MavunoUtils.getRequiredParam("Mavuno.ContextToContext.ExtractorClass", conf);
    String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.ContextToContext.ExtractorArgs", conf);
    int minMatches = Integer.parseInt(MavunoUtils.getRequiredParam("Mavuno.ContextToContext.MinMatches", conf));
    boolean harvestGlobalStats = Boolean
            .parseBoolean(MavunoUtils.getRequiredParam("Mavuno.ContextToContext.GlobalStats", conf));
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.ContextToContext.OutputPath", conf);

    MavunoUtils.createDirectory(conf, outputPath);

    sLogger.info("Tool name: ContextToContext");
    sLogger.info(" - Context path: " + contextPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Output path: " + outputPath);
    sLogger.info(" - Extractor class: " + extractorClass);
    sLogger.info(" - Extractor arguments: " + extractorArgs);
    sLogger.info(" - Min matches: " + minMatches);
    sLogger.info(" - Harvest global stats: " + harvestGlobalStats);

    // context to pattern
    conf.set("Mavuno.ContextToPattern.ContextPath", contextPath);
    conf.set("Mavuno.ContextToPattern.CorpusPath", corpusPath);
    conf.set("Mavuno.ContextToPattern.CorpusClass", corpusClass);
    conf.set("Mavuno.ContextToPattern.ExtractorClass", extractorClass);
    conf.set("Mavuno.ContextToPattern.ExtractorArgs", extractorArgs);
    conf.setInt("Mavuno.ContextToPattern.MinMatches", minMatches);
    conf.setBoolean("Mavuno.ContextToPattern.GlobalStats", harvestGlobalStats);
    conf.set("Mavuno.ContextToPattern.OutputPath", outputPath);
    new ContextToPattern(conf).run();

    // pattern to context
    conf.set("Mavuno.PatternToContext.PatternPath", outputPath + "/pattern-stats");
    conf.set("Mavuno.PatternToContext.CorpusPath", corpusPath);
    conf.set("Mavuno.PatternToContext.CorpusClass", corpusClass);
    conf.set("Mavuno.PatternToContext.ExtractorClass", extractorClass);
    conf.set("Mavuno.PatternToContext.ExtractorArgs", extractorArgs);
    conf.setInt("Mavuno.PatternToContext.MinMatches", minMatches);
    conf.setBoolean("Mavuno.PatternToContext.GlobalStats", harvestGlobalStats);
    conf.set("Mavuno.PatternToContext.OutputPath", outputPath);
    new PatternToContext(conf).run();

    return 0;
}