List of usage examples for org.apache.hadoop.conf Configuration setFloat
public void setFloat(String name, float value)
Set the value of the name property to a float.
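For orientation, a minimal self-contained sketch of the set/get round trip before the collected examples (the property name "example.sample.rate" and the values here are illustrative, not taken from any source file below):

import org.apache.hadoop.conf.Configuration;

public class SetFloatExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // store a float-valued property
        conf.setFloat("example.sample.rate", 0.25f);
        // read it back; the second argument is the default returned when the property is unset
        float rate = conf.getFloat("example.sample.rate", 1.0f);
        System.out.println("sample rate = " + rate); // prints 0.25
    }
}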
From source file:com.twitter.algebra.nmf.CompositeDMJ.java
License:Apache License
public Job run(Configuration conf, Path mapDirPath, Path matrixInputPaths, Path matrixOutputPath, int atCols,
        boolean aIsMapDir, String inMemCStr, int inMemCRows, int inMemCCols, float alpha1, float alpha2)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);

    conf.set(MATRIXINMEMORY, inMemCStr);
    conf.setInt(MATRIXINMEMORYROWS, inMemCRows);
    conf.setInt(MATRIXINMEMORYCOLS, inMemCCols);
    conf.setFloat(ALPHA1, alpha1);
    conf.setFloat(ALPHA2, alpha2);
    FileSystem fs = FileSystem.get(matrixOutputPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPaths, "compositedmj");

    conf.set(MAPDIRMATRIX, mapDirPath.toString());
    conf.setBoolean(AISMAPDIR, aIsMapDir);

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(CompositeDMJ.class);
    job.setJobName(CompositeDMJ.class.getSimpleName() + "-" + matrixOutputPath.getName());

    matrixOutputPath = fs.makeQualified(matrixOutputPath);
    matrixInputPaths = fs.makeQualified(matrixInputPaths);

    MultipleInputs.addInputPath(job, matrixInputPaths, SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);

    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    // ensures total order (when used with {@link MatrixOutputFormat}),
    RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, atCols);

    job.setNumReduceTasks(0);
    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.submit();
    return job;
}
From source file:com.twitter.algebra.nmf.SampleColsJob.java
License:Apache License
public void run(Configuration conf, Path matrixInputPath, int cols, Path matrixOutputPath, float sampleRate)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);

    conf.setFloat(SAMPLERATE, sampleRate);
    conf.setInt(COLS, cols);
    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPath, "samplecol");

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(SampleColsJob.class);
    job.setJobName(SampleColsJob.class.getSimpleName() + "-" + matrixOutputPath.getName());

    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    FileInputFormat.addInputPath(job, matrixInputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);

    job.setMapperClass(MyMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
}
From source file:com.twitter.algebra.nmf.SampleRowsJob.java
License:Apache License
public void run(Configuration conf, Path matrixInputPath, Path matrixOutputPath, float sampleRate)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);

    conf.setFloat(SAMPLERATE, sampleRate);
    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPath, "samplerows");

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(SampleRowsJob.class);
    job.setJobName(SampleRowsJob.class.getSimpleName() + "-" + matrixOutputPath.getName());

    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    FileInputFormat.addInputPath(job, matrixInputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);

    job.setMapperClass(MyMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
}
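The two sampling jobs above only set SAMPLERATE on the driver side; the mapper reads the value back with getFloat, typically in setup(). A sketch of what that consumer side could look like, assuming the SAMPLERATE constant resolves to the string "samplerate" (the class name, key string, and default value here are hypothetical, not the actual MyMapper from SampleRowsJob):

import java.io.IOException;
import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.mahout.math.VectorWritable;

public class SampleRowsMapper extends Mapper<IntWritable, VectorWritable, IntWritable, VectorWritable> {
    private float sampleRate;
    private final Random random = new Random();

    @Override
    protected void setup(Context context) {
        Configuration conf = context.getConfiguration();
        // read back the value stored by the driver with conf.setFloat(SAMPLERATE, sampleRate)
        sampleRate = conf.getFloat("samplerate", 0.01f);
    }

    @Override
    protected void map(IntWritable key, VectorWritable value, Context context)
            throws IOException, InterruptedException {
        // keep each row with probability sampleRate
        if (random.nextFloat() < sampleRate) {
            context.write(key, value);
        }
    }
}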
From source file:com.zjy.mongo.util.MongoConfigUtil.java
License:Apache License
public static Configuration buildConfiguration(final Map<String, Object> data) {
    Configuration newConf = new Configuration();
    for (Entry<String, Object> entry : data.entrySet()) {
        String key = entry.getKey();
        Object val = entry.getValue();
        if (val instanceof String) {
            newConf.set(key, (String) val);
        } else if (val instanceof Boolean) {
            newConf.setBoolean(key, (Boolean) val);
        } else if (val instanceof Integer) {
            newConf.setInt(key, (Integer) val);
        } else if (val instanceof Float) {
            newConf.setFloat(key, (Float) val);
        } else if (val instanceof DBObject) {
            setDBObject(newConf, key, (DBObject) val);
        } else {
            throw new RuntimeException("can't convert " + val.getClass() + " into any type for Configuration");
        }
    }
    return newConf;
}
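A usage sketch for the helper above; the map keys are illustrative MongoDB-connector-style names and are not verified against MongoConfigUtil's own constants:

import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;

import com.zjy.mongo.util.MongoConfigUtil;

public class BuildConfigurationExample {
    public static void main(String[] args) {
        Map<String, Object> data = new HashMap<String, Object>();
        data.put("mongo.input.uri", "mongodb://localhost:27017/db.collection"); // String  -> conf.set
        data.put("mongo.input.split.create_input_splits", Boolean.TRUE);        // Boolean -> conf.setBoolean
        data.put("mongo.input.limit", 1000);                                    // Integer -> conf.setInt
        data.put("example.sample.rate", 0.1f);                                  // Float   -> conf.setFloat

        Configuration conf = MongoConfigUtil.buildConfiguration(data);
        System.out.println(conf.getFloat("example.sample.rate", 0.0f)); // prints 0.1
    }
}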
From source file:de.tudarmstadt.ukp.dkpro.bigdata.collocations.CollocDriver.java
License:Apache License
/**
 * pass2: perform the LLR calculation
 */
private static void computeNGramsPruneByLLR(Path output, Configuration baseConf, long nGramTotal,
        boolean emitUnigrams, float minValue, int reduceTasks)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(baseConf);
    conf.setLong(AssocReducer.NGRAM_TOTAL, nGramTotal);
    conf.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    conf.setFloat(AssocReducer.MIN_VALUE, minValue);
    conf.setInt("mapred.job.map.memory.mb", 1280);
    conf.setInt("mapred.job.reduce.memory.mb", 2560);
    conf.set("mapred.reduce.child.java.opts", "-Xmx2G");
    conf.setInt("mapred.task.timeout", 6000000);
    conf.set(AssocReducer.ASSOC_METRIC, "llr");

    Job job = new Job(conf);
    job.setJobName(CollocDriver.class.getSimpleName() + ".computeNGrams: " + output + " pruning: " + minValue);
    job.setJarByClass(CollocDriver.class);

    job.setMapOutputKeyClass(Gram.class);
    job.setMapOutputValueClass(Gram.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);

    FileInputFormat.setInputPaths(job, new Path(output, SUBGRAM_OUTPUT_DIRECTORY));
    Path outPath = new Path(output, NGRAM_OUTPUT_DIRECTORY + "_llr");
    FileOutputFormat.setOutputPath(job, outPath);

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class);
    job.setReducerClass(AssocReducer.class);
    job.setNumReduceTasks(reduceTasks);

    // Defines additional single text based output 'text' for the job
    MultipleOutputs.addNamedOutput(job, "contingency", TextOutputFormat.class, Text.class, Text.class);

    // Defines additional multi sequencefile based output 'sequence' for the job
    MultipleOutputs.addNamedOutput(job, "llr", TextOutputFormat.class, Text.class, DoubleWritable.class);
    MultipleOutputs.addNamedOutput(job, "pmi", TextOutputFormat.class, Text.class, DoubleWritable.class);
    MultipleOutputs.addNamedOutput(job, "chi", TextOutputFormat.class, Text.class, DoubleWritable.class);
    MultipleOutputs.addNamedOutput(job, "dice", TextOutputFormat.class, Text.class, DoubleWritable.class);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
From source file:edu.cuhk.hccl.hadoop.HadoopApp.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args == null || args.length < 4) {
        System.out.println("Please specify parameters: input, output, domain, num-reducers!");
        System.exit(-1);
    }
    String input = args[0];
    String output = args[1];
    String domain = args[2];
    int numReducers = Integer.parseInt(args[3]);
    float similarity = Float.parseFloat(args[4]);
    int range = Integer.parseInt(args[5]);

    Job job = new Job(new Configuration(), this.getClass().getSimpleName());
    // Must below the line of job creation
    Configuration conf = job.getConfiguration();

    // Reuse the JVM
    conf.setInt("mapred.job.reuse.jvm.num.tasks", -1);

    conf.setFloat("SIM_THRESHOLD", similarity);
    conf.setInt("SEARCH_RANGE", range);

    if (domain.equalsIgnoreCase("restaurant")) {
        conf.setStrings("ASPECTS", Constant.RESTAURANT_ASPECTS);
        job.setMapperClass(YelpMapper.class);
        job.setInputFormatClass(TextInputFormat.class);

        // args[4] is the business file to select matching business_ids to restaurant
        String busiFile = args[6];
        DistributedCache.addCacheFile(new URI(busiFile), conf);
    } else if (domain.equalsIgnoreCase("hotel")) {
        conf.setStrings("ASPECTS", Constant.TRIPADVISOR_ASPECTS);
        job.setMapperClass(TripAdvisorMapper.class);
        job.setInputFormatClass(SequenceFileInputFormat.class);
    } else {
        System.out.println("Wrong domain type!");
        System.exit(-1);
    }

    job.setJarByClass(HadoopApp.class);
    job.setReducerClass(ReviewReducer.class);
    job.setNumReduceTasks(numReducers);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(UserItemPair.class);
    job.setOutputValueClass(NounPhrase.class);

    // Delete output if exists
    Path outputDir = new Path(output);
    FileSystem hdfs = FileSystem.get(conf);
    if (hdfs.exists(outputDir))
        hdfs.delete(outputDir, true);

    FileInputFormat.setInputPaths(job, new Path(input));
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.waitForCompletion(true);
    return 0;
}
From source file:edu.indiana.d2i.htrc.skmeans.StreamingKMeansAdapterTest.java
License:Apache License
@Test
public static void testCluster() {
    int dimension = 500;

    // construct data samplers centered on the corners of a unit cube
    Matrix mean = new DenseMatrix(8, dimension);
    List<MultiNormal> rowSamplers = Lists.newArrayList();
    for (int i = 0; i < 8; i++) {
        // mean.viewRow(i).assign(
        //         new double[] { 0.25 * (i & 4), 0.5 * (i & 2), i & 1 });

        double[] random = new double[dimension];
        for (int j = 0; j < random.length; j++) {
            random[j] = Math.random();
        }
        mean.viewRow(i).assign(random);
        rowSamplers.add(new MultiNormal(0.01, mean.viewRow(i)));
    }

    // sample a bunch of data points
    Matrix data = new DenseMatrix(10000, dimension);
    for (MatrixSlice row : data) {
        row.vector().assign(rowSamplers.get(row.index() % 8).sample());
    }

    // cluster the data
    long t0 = System.currentTimeMillis();

    double cutoff = StreamingKMeansAdapter.estimateCutoff(data, 100);
    Configuration conf = new Configuration();
    conf.setInt(StreamingKMeansConfigKeys.MAXCLUSTER, 1000);
    conf.setFloat(StreamingKMeansConfigKeys.CUTOFF, (float) cutoff);
    conf.setClass(StreamingKMeansConfigKeys.DIST_MEASUREMENT, EuclideanDistanceMeasure.class,
            DistanceMeasure.class);
    conf.setInt(StreamingKMeansConfigKeys.VECTOR_DIMENSION, dimension);
    StreamingKMeansAdapter skmeans = new StreamingKMeansAdapter(conf);
    // for (MatrixSlice row : Iterables.skip(data, 1)) {
    //     skmeans.cluster(row.vector());
    // }
    for (MatrixSlice row : data) {
        skmeans.cluster(row.vector());
    }

    // validate
    Searcher r = skmeans.getCentroids();
    // StreamingKMeansAdapter skmeans = new StreamingKMeansAdapter();
    // Searcher r = skmeans.cluster(data, 1000, centroidFactory);
    long t1 = System.currentTimeMillis();

    assertEquals("Total weight not preserved", totalWeight(data), totalWeight(r), 1e-9);

    // and verify that each corner of the cube has a centroid very nearby
    for (MatrixSlice row : mean) {
        WeightedVector v = r.search(row.vector(), 1).get(0);
        assertTrue(v.getWeight() < 0.05);
    }
    System.out.printf("%.2f for clustering\n%.1f us per row\n", (t1 - t0) / 1000.0,
            (t1 - t0) / 1000.0 / data.rowSize() * 1e6);

    System.out.println("Done??");
}
From source file:edu.indiana.d2i.htrc.skmeans.StreamingKMeansDriver.java
License:Apache License
private void StreamingKMeansConfigHelper(Configuration conf, String input, int maxCluster) throws IOException {
    // get samples to calculate scale factor
    FileSystem fs = FileSystem.get(conf);
    FileStatus[] status = fs.listStatus(new Path(input), Utilities.HIDDEN_FILE_FILTER);
    int index = 0 + (int) (Math.random() * (status.length));
    SequenceFile.Reader seqReader = new SequenceFile.Reader(fs, status[index].getPath(), conf);

    int count = 0;
    Text key = new Text();
    VectorWritable value = new VectorWritable();
    List<MatrixSlice> slices = new ArrayList<MatrixSlice>();
    while (seqReader.next(key, value) && count < samplesNum) {
        MatrixSlice slice = new MatrixSlice(value.get().clone(), count);
        slices.add(slice);
        count++;
    }

    // set cutoff
    float cutoff = (float) StreamingKmeans.estimateCutoff(slices, samplesNum);
    conf.setFloat(StreamingKMeansConfigKeys.CUTOFF, cutoff);
    logger.info("Scale factor (cutoff) is: " + cutoff);

    // set vector dimension
    int dim = value.get().size();
    conf.setInt(StreamingKMeansConfigKeys.VECTOR_DIMENSION, dim);
    logger.info("Dimemsion of a vector is: " + dim);

    // set maximum #cluster
    conf.setInt(StreamingKMeansConfigKeys.MAXCLUSTER, maxCluster);

    // set distance measurement
    conf.set(StreamingKMeansConfigKeys.DIST_MEASUREMENT, EuclideanDistanceMeasure.class.getName());
}
From source file:edu.rosehulman.CollocDriver.java
License:Apache License
/**
 * pass2: perform the LLR calculation
 */
private static void computeNGramsPruneByLLR(Path output, Configuration baseConf, long nGramTotal,
        boolean emitUnigrams, float minLLRValue, int reduceTasks)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(baseConf);
    conf.setLong(LLRReducer.NGRAM_TOTAL, nGramTotal);
    conf.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    conf.setFloat(LLRReducer.MIN_LLR, minLLRValue);

    Job job = new Job(conf);
    job.setJobName(CollocDriver.class.getSimpleName() + ".computeNGrams: " + output);
    job.setJarByClass(CollocDriver.class);

    job.setMapOutputKeyClass(Gram.class);
    job.setMapOutputValueClass(Gram.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);

    FileInputFormat.setInputPaths(job, new Path(output, SUBGRAM_OUTPUT_DIRECTORY));
    Path outPath = new Path(output, NGRAM_OUTPUT_DIRECTORY);
    FileOutputFormat.setOutputPath(job, outPath);

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setReducerClass(LLRReducer.class);
    job.setNumReduceTasks(reduceTasks);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
From source file:gaffer.accumulo.splitpoints.EstimateSplitPointsDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length < 5) {
        System.err.println("Usage: " + this.getClass().getName()
                + " <mapred_output_directory> <proportion_to_sample> <number_of_tablet_servers> <resulting_split_file> <input_path1>...");
        return 1;
    }

    // Parse arguments
    Path outputPath = new Path(args[0]);
    float proportionToSample = Float.parseFloat(args[1]);
    int numberTabletServers = Integer.parseInt(args[2]);
    Path resultingSplitsFile = new Path(args[3]);
    Path[] inputPaths = new Path[args.length - 4];
    for (int i = 0; i < inputPaths.length; i++) {
        inputPaths[i] = new Path(args[i + 4]);
    }

    // Conf and job
    Configuration conf = getConf();
    conf.setFloat("proportion_to_sample", proportionToSample);
    String jobName = "Estimate split points: input = ";
    for (int i = 0; i < inputPaths.length; i++) {
        jobName += inputPaths[i] + ", ";
    }
    jobName += "output = " + outputPath;
    Job job = Job.getInstance(conf, jobName);
    job.setJarByClass(getClass());

    // Input
    job.setInputFormatClass(SequenceFileInputFormat.class);
    for (int i = 0; i < inputPaths.length; i++) {
        SequenceFileInputFormat.addInputPath(job, inputPaths[i]);
    }

    // Mapper
    job.setMapperClass(EstimateSplitPointsMapper.class);
    job.setMapOutputKeyClass(Key.class);
    job.setMapOutputValueClass(Value.class);

    // Reducer
    job.setReducerClass(EstimateSplitPointsReducer.class);
    job.setOutputKeyClass(Key.class);
    job.setOutputValueClass(Value.class);
    job.setNumReduceTasks(1);

    // Output
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, outputPath);
    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    // Run job
    job.waitForCompletion(true);

    // Successful?
    if (!job.isSuccessful()) {
        System.err.println("Error running job");
        return 1;
    }

    // Number of records output
    // NB In the following line use mapred.Task.Counter.REDUCE_OUTPUT_RECORDS rather than
    // mapreduce.TaskCounter.REDUCE_OUTPUT_RECORDS as this is more compatible with earlier
    // versions of Hadoop.
    @SuppressWarnings("deprecation")
    Counter counter = job.getCounters()
            .findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_OUTPUT_RECORDS);
    long recordsOutput = counter.getValue();
    System.out.println("Number of records output = " + recordsOutput);

    // Work out when to output a split point. The number of split points
    // needed is the number of tablet servers minus 1 (because you don't
    // have to output the start of the first tablet or the end of the
    // last tablet).
    long outputEveryNthRecord = recordsOutput / (numberTabletServers - 1);

    // Read through resulting file, pick out the split points and write to file.
    FileSystem fs = FileSystem.get(conf);
    Path resultsFile = new Path(outputPath, "part-r-00000");
    @SuppressWarnings("deprecation")
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, resultsFile, conf);
    PrintStream splitsWriter = new PrintStream(new BufferedOutputStream(fs.create(resultingSplitsFile, true)));
    Key key = new Key();
    Value value = new Value();
    long count = 0;
    int numberSplitPointsOutput = 0;
    while (reader.next(key, value) && numberSplitPointsOutput < numberTabletServers - 1) {
        count++;
        if (count % outputEveryNthRecord == 0) {
            numberSplitPointsOutput++;
            splitsWriter.println(new String(Base64.encodeBase64(key.getRow().getBytes())));
            System.out.println("Written split point: " + key.getRow());
        }
    }
    reader.close();
    splitsWriter.close();
    System.out.println("Number of split points output = " + numberSplitPointsOutput);
    return 0;
}