List of usage examples for org.apache.hadoop.conf.Configuration#setBoolean

public void setBoolean(String name, boolean value)

Sets the value of the name property to the given boolean.
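Before the project examples below, a minimal self-contained sketch of the typical set/get round trip; the property name used here is made up for illustration:

import org.apache.hadoop.conf.Configuration;

public class SetBooleanExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // Store a boolean flag under a property name (hypothetical name, for illustration only).
        conf.setBoolean("example.feature.enabled", true);

        // Read it back; the second argument is the default returned when the property is unset.
        boolean enabled = conf.getBoolean("example.feature.enabled", false);
        System.out.println("example.feature.enabled = " + enabled);
    }
}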
From source file: edu.isi.mavuno.app.mine.HarvestParaphraseCandidates.java
License: Apache License

public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();
    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.CorpusClass", conf);
    String extractorClass = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.ExtractorClass", conf);
    String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.ExtractorArgs", conf);
    String numResults = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.NumResults", conf);
    String minMatches = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.MinMatches", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestParaphraseCandidates.OutputPath", conf);

    MavunoUtils.createDirectory(conf, outputPath);

    sLogger.info("Tool name: HarvestParaphraseCandidates");
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Extractor class: " + extractorClass);
    sLogger.info(" - Extractor args: " + extractorArgs);
    sLogger.info(" - Min matches: " + minMatches);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("HarvestParaphraseCandidates");

    // harvest all (context, pattern) triples
    conf.set("Mavuno.HarvestContextPatternPairs.CorpusPath", corpusPath);
    conf.set("Mavuno.HarvestContextPatternPairs.CorpusClass", corpusClass);
    conf.set("Mavuno.HarvestContextPatternPairs.ExtractorClass", extractorClass);
    conf.set("Mavuno.HarvestContextPatternPairs.ExtractorArgs", extractorArgs);
    conf.set("Mavuno.HarvestContextPatternPairs.MinMatches", minMatches);
    conf.set("Mavuno.HarvestContextPatternPairs.OutputPath", outputPath + "/triples");
    new HarvestContextPatternPairs(conf).run();

    FileInputFormat.addInputPath(job, new Path(outputPath + "/triples"));
    FileOutputFormat.setOutputPath(job, new Path(outputPath + "/patterns-all"));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    job.setPartitionerClass(ContextPatternWritable.IdContextPartitioner.class);
    job.setMapOutputValueClass(TextLongPairWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(LongWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    job.waitForCompletion(true);

    // combine scores
    // conf.set("Mavuno.CombineScores.InputPath", outputPath + "/patterns-all");
    // conf.set("Mavuno.CombineScores.OutputPath", outputPath + "/patterns");
    // new CombineScores(conf).run();

    // only retain the top paraphrases
    conf.set("Mavuno.GetTopResults.InputPath", outputPath + "/patterns-all");
    conf.set("Mavuno.GetTopResults.OutputPath", outputPath + "/top-k");
    conf.set("Mavuno.GetTopResults.NumResults", numResults);
    conf.setBoolean("Mavuno.GetTopResults.SequenceFileOutputFormat", false);
    new GetTopResults(conf).run();

    MavunoUtils.removeDirectory(conf, outputPath + "/patterns-all");

    return 0;
}
From source file: edu.iu.daal_sgd.SGDDaalLauncher.java
License: Apache License

/**
 * Launches SGD workers.
 */
@Override
public int run(String[] args) throws Exception {
    /* Put shared libraries into the distributed cache */
    Configuration conf = this.getConf();
    Initialize init = new Initialize(conf, args);

    /* Put shared libraries into the distributed cache */
    init.loadDistributedLibs();
    init.loadDistributedLibsExp();

    // load args
    init.loadSysArgs();

    // load app args
    conf.setInt(HarpDAALConstants.FILE_DIM, Integer.parseInt(args[init.getSysArgNum()]));
    conf.setInt(HarpDAALConstants.FEATURE_DIM, Integer.parseInt(args[init.getSysArgNum() + 1]));
    conf.setDouble(Constants.LAMBDA, Double.parseDouble(args[init.getSysArgNum() + 2]));
    conf.setDouble(Constants.EPSILON, Double.parseDouble(args[init.getSysArgNum() + 3]));
    conf.setBoolean(Constants.ENABLE_TUNING, Boolean.parseBoolean(args[init.getSysArgNum() + 4]));
    conf.set(HarpDAALConstants.TEST_FILE_PATH, args[init.getSysArgNum() + 5]);

    // launch job
    System.out.println("Starting Job");
    long perJobSubmitTime = System.currentTimeMillis();
    System.out.println(
            "Start Job#" + " " + new SimpleDateFormat("HH:mm:ss.SSS").format(Calendar.getInstance().getTime()));

    Job mfsgdJob = init.createJob("mfsgdJob", SGDDaalLauncher.class, SGDDaalCollectiveMapper.class);

    // finish job
    boolean jobSuccess = mfsgdJob.waitForCompletion(true);
    System.out.println(
            "End Job#" + " " + new SimpleDateFormat("HH:mm:ss.SSS").format(Calendar.getInstance().getTime()));
    System.out.println(
            "| Job#" + " Finished in " + (System.currentTimeMillis() - perJobSubmitTime) + " miliseconds |");

    if (!jobSuccess) {
        mfsgdJob.killJob();
        System.out.println("mfsgdJob failed");
    }

    return 0;
}
From source file: edu.iu.daal_subgraph.SCDaalLauncher.java
License: Apache License

private Job configureSCJob(String graphDir, String template, String outDir, int numMapTasks,
        boolean useLocalMultiThread, int numThreads, int numCores, String affinity, String omp_opt, int tpc,
        int mem, double memjavaratio, int send_array_limit, int nbr_split_len, boolean rotation_pipeline,
        int numIteration) throws IOException {

    Configuration configuration = getConf();
    Job job = Job.getInstance(configuration, "subgraph counting");
    Configuration jobConfig = job.getConfiguration();

    Path jobOutDir = new Path(outDir);
    FileSystem fs = FileSystem.get(configuration);
    if (fs.exists(jobOutDir)) {
        fs.delete(jobOutDir, true);
    }

    FileInputFormat.setInputPaths(job, graphDir);
    FileOutputFormat.setOutputPath(job, jobOutDir);

    // job.setInputFormatClass(KeyValueTextInputFormat.class);
    // use harp multifile input format to have a better control on num of map tasks
    job.setInputFormatClass(MultiFileInputFormat.class);
    job.setJarByClass(SCDaalLauncher.class);
    job.setMapperClass(SCDaalCollectiveMapper.class);

    JobConf jobConf = (JobConf) job.getConfiguration();
    jobConf.set("mapreduce.framework.name", "map-collective");

    // mapreduce.map.collective.memory.mb
    // 125000
    jobConf.setInt("mapreduce.map.collective.memory.mb", mem);

    // mapreduce.map.collective.java.opts
    // -Xmx120000m -Xms120000m
    // int xmx = (mem - 5000) > (mem * 0.9)
    //     ? (mem - 5000) : (int) Math.ceil(mem * 0.5);
    // int xmx = (int) Math.ceil((mem - 5000) * 0.2);
    int xmx = (int) Math.ceil((mem - 5000) * memjavaratio);
    int xmn = (int) Math.ceil(0.25 * xmx);
    jobConf.set("mapreduce.map.collective.java.opts",
            "-Xmx" + xmx + "m -Xms" + xmx + "m" + " -Xmn" + xmn + "m");

    jobConf.setNumMapTasks(numMapTasks);
    jobConf.setInt("mapreduce.job.max.split.locations", 10000);
    jobConf.setInt("mapreduce.task.timeout", 60000000);

    job.setNumReduceTasks(0);

    jobConfig.setInt(SCConstants.NUM_MAPPERS, numMapTasks);
    jobConfig.set(SCConstants.TEMPLATE_PATH, template);
    jobConfig.set(SCConstants.OUTPUT_PATH, outDir);
    jobConfig.setBoolean(SCConstants.USE_LOCAL_MULTITHREAD, useLocalMultiThread);
    jobConfig.setInt(SCConstants.NUM_THREADS_PER_NODE, numThreads);
    jobConfig.setInt(SCConstants.THREAD_NUM, numThreads);
    jobConfig.setInt(SCConstants.CORE_NUM, numCores);
    jobConfig.set(SCConstants.THD_AFFINITY, affinity);
    jobConfig.set(SCConstants.OMPSCHEDULE, omp_opt);
    jobConfig.setInt(SCConstants.TPC, tpc);
    jobConfig.setInt(SCConstants.SENDLIMIT, send_array_limit);
    jobConfig.setInt(SCConstants.NBRTASKLEN, nbr_split_len);
    jobConfig.setBoolean(SCConstants.ROTATION_PIPELINE, rotation_pipeline);
    jobConfig.setInt(SCConstants.NUM_ITERATION, numIteration);

    return job;
}
From source file: edu.iu.kmeans.sgxsimu.KMeansLauncher.java
License: Apache License

/**
 * Launches all the tasks in order.
 */
@Override
public int run(String[] args) throws Exception {
    /* Put shared libraries into the distributed cache */
    Configuration conf = this.getConf();
    Initialize init = new Initialize(conf, args);

    // load args
    init.loadSysArgs();
    init.loadDistributedLibs();

    // load app args
    conf.setInt(HarpDAALConstants.FILE_DIM, Integer.parseInt(args[init.getSysArgNum()]));
    conf.setInt(HarpDAALConstants.FEATURE_DIM, Integer.parseInt(args[init.getSysArgNum() + 1]));
    conf.setInt(HarpDAALConstants.NUM_CENTROIDS, Integer.parseInt(args[init.getSysArgNum() + 2]));
    conf.setInt(Constants.ENCLAVE_TOTAL, Integer.parseInt(args[init.getSysArgNum() + 3]));
    conf.setInt(Constants.ENCLAVE_PER_THD, Integer.parseInt(args[init.getSysArgNum() + 4]));
    conf.setInt(Constants.ENCLAVE_TASK, Integer.parseInt(args[init.getSysArgNum() + 5]));
    conf.setBoolean(Constants.ENABLE_SIMU, Boolean.parseBoolean(args[init.getSysArgNum() + 6]));

    // config job
    System.out.println("Starting Job");
    long perJobSubmitTime = System.currentTimeMillis();
    System.out.println(
            "Start Job#" + " " + new SimpleDateFormat("HH:mm:ss.SSS").format(Calendar.getInstance().getTime()));

    Job kmeansJob = init.createJob("kmeansJob", KMeansLauncher.class, KMeansCollectiveMapper.class);

    // initialize centroids data
    JobConf thisjobConf = (JobConf) kmeansJob.getConfiguration();
    FileSystem fs = FileSystem.get(conf);
    int nFeatures = Integer.parseInt(args[init.getSysArgNum() + 1]);
    int numCentroids = Integer.parseInt(args[init.getSysArgNum() + 2]);
    Path workPath = init.getWorkPath();

    Path cenDir = new Path(workPath, "centroids");
    fs.mkdirs(cenDir);
    if (fs.exists(cenDir)) {
        fs.delete(cenDir, true);
    }

    Path initCenDir = new Path(cenDir, "init_centroids");
    DataGenerator.generateDenseDataSingle(numCentroids, nFeatures, 1000, 0, " ", initCenDir, fs);
    thisjobConf.set(HarpDAALConstants.CEN_DIR, cenDir.toString());
    thisjobConf.set(HarpDAALConstants.CENTROID_FILE_NAME, "init_centroids");

    // generate data if required
    boolean generateData = Boolean.parseBoolean(args[init.getSysArgNum() + 7]);
    if (generateData) {
        Path inputPath = init.getInputPath();
        int total_points = Integer.parseInt(args[init.getSysArgNum() + 8]);
        int total_files = Integer.parseInt(args[init.getSysArgNum() + 9]);
        String tmpDirPathName = args[init.getSysArgNum() + 10];
        DataGenerator.generateDenseDataMulti(total_points, nFeatures, total_files, 2, 1, ",", inputPath,
                tmpDirPathName, fs);
    }

    // finish job
    boolean jobSuccess = kmeansJob.waitForCompletion(true);
    System.out.println(
            "End Job#" + " " + new SimpleDateFormat("HH:mm:ss.SSS").format(Calendar.getInstance().getTime()));
    System.out.println(
            "| Job#" + " Finished in " + (System.currentTimeMillis() - perJobSubmitTime) + " miliseconds |");

    if (!jobSuccess) {
        kmeansJob.killJob();
        System.out.println("kmeansJob failed");
    }

    return 0;
}
From source file: edu.iu.lda.LDALauncher.java
License: Apache License

private Job configureLDAJob(Path docDir, int numTopics, double alpha, double beta, int numIterations,
        int minBound, int maxBound, int numMapTasks, int numThreadsPerWorker, double scheduleRatio, int mem,
        boolean printModel, Path modelDir, Path outputDir, Configuration configuration, int jobID)
        throws IOException, URISyntaxException {

    configuration.setInt(Constants.NUM_TOPICS, numTopics);
    configuration.setDouble(Constants.ALPHA, alpha);
    configuration.setDouble(Constants.BETA, beta);
    configuration.setInt(Constants.NUM_ITERATIONS, numIterations);
    configuration.setInt(Constants.MIN_BOUND, minBound);
    configuration.setInt(Constants.MAX_BOUND, maxBound);
    configuration.setInt(Constants.NUM_THREADS, numThreadsPerWorker);
    configuration.setDouble(Constants.SCHEDULE_RATIO, scheduleRatio);

    System.out.println("Model Dir Path: " + modelDir.toString());
    configuration.set(Constants.MODEL_DIR, modelDir.toString());
    configuration.setBoolean(Constants.PRINT_MODEL, printModel);

    Job job = Job.getInstance(configuration, "lda_job_" + jobID);
    JobConf jobConf = (JobConf) job.getConfiguration();
    jobConf.set("mapreduce.framework.name", "map-collective");

    // mapreduce.map.collective.memory.mb
    // 125000
    jobConf.setInt("mapreduce.map.collective.memory.mb", mem);

    // mapreduce.map.collective.java.opts
    // -Xmx120000m -Xms120000m
    int xmx = (mem - 5000) > (mem * 0.9) ? (mem - 5000) : (int) Math.ceil(mem * 0.9);
    int xmn = (int) Math.ceil(0.25 * xmx);
    jobConf.set("mapreduce.map.collective.java.opts",
            "-Xmx" + xmx + "m -Xms" + xmx + "m" + " -Xmn" + xmn + "m");

    jobConf.setNumMapTasks(numMapTasks);
    jobConf.setInt("mapreduce.job.max.split.locations", 10000);

    FileInputFormat.setInputPaths(job, docDir);
    FileOutputFormat.setOutputPath(job, outputDir);

    job.setInputFormatClass(MultiFileInputFormat.class);
    job.setJarByClass(LDALauncher.class);
    job.setMapperClass(LDAMPCollectiveMapper.class);
    job.setNumReduceTasks(0);

    return job;
}
From source file: edu.rosehulman.CollocDriver.java
License: Apache License

/**
 * pass1: generate collocations, ngrams
 */
@SuppressWarnings("deprecation")
private static long generateCollocations(Path input, Path output, Configuration baseConf, boolean emitUnigrams,
        int maxNGramSize, int reduceTasks, int minSupport)
        throws IOException, ClassNotFoundException, InterruptedException {

    Configuration con = new Configuration(baseConf);
    con.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    con.setInt(CollocMapper.MAX_SHINGLE_SIZE, maxNGramSize);
    con.setInt(CollocReducer.MIN_SUPPORT, minSupport);

    Job job = new Job(con);
    job.setJobName(CollocDriver.class.getSimpleName() + ".generateCollocations:" + input);
    job.setJarByClass(CollocDriver.class);

    job.setMapOutputKeyClass(GramKey.class);
    job.setMapOutputValueClass(Gram.class);
    job.setPartitionerClass(GramKeyPartitioner.class);
    job.setGroupingComparatorClass(GramKeyGroupComparator.class);

    job.setOutputKeyClass(Gram.class);
    job.setOutputValueClass(Gram.class);

    job.setCombinerClass(CollocCombiner.class);

    FileInputFormat.setInputPaths(job, input);

    Path outputPath = new Path(output, SUBGRAM_OUTPUT_DIRECTORY);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(CollocMapper.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setReducerClass(CollocReducer.class);
    job.setNumReduceTasks(reduceTasks);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

    return job.getCounters().findCounter(CollocMapper.Count.NGRAM_TOTAL).getValue();
}
From source file: edu.rosehulman.CollocDriver.java
License: Apache License

/**
 * pass2: perform the LLR calculation
 */
private static void computeNGramsPruneByLLR(Path output, Configuration baseConf, long nGramTotal,
        boolean emitUnigrams, float minLLRValue, int reduceTasks)
        throws IOException, InterruptedException, ClassNotFoundException {

    Configuration conf = new Configuration(baseConf);
    conf.setLong(LLRReducer.NGRAM_TOTAL, nGramTotal);
    conf.setBoolean(EMIT_UNIGRAMS, emitUnigrams);
    conf.setFloat(LLRReducer.MIN_LLR, minLLRValue);

    Job job = new Job(conf);
    job.setJobName(CollocDriver.class.getSimpleName() + ".computeNGrams: " + output);
    job.setJarByClass(CollocDriver.class);

    job.setMapOutputKeyClass(Gram.class);
    job.setMapOutputValueClass(Gram.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);

    FileInputFormat.setInputPaths(job, new Path(output, SUBGRAM_OUTPUT_DIRECTORY));
    Path outPath = new Path(output, NGRAM_OUTPUT_DIRECTORY);
    FileOutputFormat.setOutputPath(job, outPath);

    job.setMapperClass(Mapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setReducerClass(LLRReducer.class);
    job.setNumReduceTasks(reduceTasks);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
From source file: edu.umd.cloud9.webgraph.driver.ClueWebDriver.java
License: Apache License

public int run(String[] args) throws Exception {
    if (args.length < 6) {
        printUsage();
        return -1;
    }

    Configuration conf = getConf();
    String inputArg = DriverUtil.argValue(args, DriverUtil.CL_INPUT);
    final String inputBase = inputArg.endsWith("/") ? inputArg : inputArg + "/";
    String outputArg = DriverUtil.argValue(args, DriverUtil.CL_OUTPUT);
    final String outputBase = outputArg.endsWith("/") ? outputArg : outputArg + "/";
    final String docnoMapping = DriverUtil.argValue(args, DriverUtil.CL_DOCNO_MAPPING);
    final int fromSegment = Integer.parseInt(DriverUtil.argValue(args, DriverUtil.CL_BEGIN_SEGMENT));
    final int toSegment = Integer.parseInt(DriverUtil.argValue(args, DriverUtil.CL_END_SEGMENT));
    final boolean includeInternalLinks = DriverUtil.argExists(args, DriverUtil.CL_INCLUDE_INTERNAL_LINKS);
    final boolean computeAnchorWeights = DriverUtil.argExists(args, DriverUtil.CL_COMPUTE_WEIGHTS);
    final String normalizer = DriverUtil.argValue(args, DriverUtil.CL_NORMALIZER);

    conf.setInt("Cloud9.Mappers", 2000);
    conf.setInt("Cloud9.Reducers", DriverUtil.DEFAULT_REDUCERS);
    conf.set("Cloud9.DocnoMappingFile", docnoMapping);
    conf.setBoolean("Cloud9.IncludeInternalLinks", includeInternalLinks);
    conf.set("Cloud9.AnchorTextNormalizer", normalizer);

    // Extract link information for each segment separately
    for (int i = fromSegment; i <= toSegment; i++) {
        String inputPath = inputBase + "en." + (i == 10 ? "10" : ("0" + i));
        String outputPath = outputBase + DriverUtil.OUTPUT_EXTRACT_LINKS + "/en." + (i == 10 ? "10" : ("0" + i));
        conf.set("Cloud9.InputPath", inputPath);
        conf.set("Cloud9.OutputPath", outputPath);
        int r = new ClueExtractLinks(conf).run();
        if (r != 0) {
            return -1;
        }
    }

    // Construct the reverse web graph (i.e., collect incoming link information)
    String inputPath = "";
    for (int i = fromSegment; i < toSegment; i++) {
        inputPath += outputBase + DriverUtil.OUTPUT_EXTRACT_LINKS + "/en.0" + i + "/,";
    }
    if (toSegment == 10) {
        inputPath += outputBase + DriverUtil.OUTPUT_EXTRACT_LINKS + "/en.10/";
    } else {
        inputPath += outputBase + DriverUtil.OUTPUT_EXTRACT_LINKS + "/en.0" + toSegment + "/";
    }
    String outputPath = outputBase + DriverUtil.OUTPUT_REVERSE_WEBGRAPH + "/";
    conf.set("Cloud9.InputPath", inputPath);
    conf.set("Cloud9.OutputPath", outputPath);
    conf.setInt("Cloud9.Mappers", 1);
    conf.setInt("Cloud9.Reducers", DriverUtil.DEFAULT_REDUCERS * (toSegment - fromSegment + 1));
    int r = new BuildReverseWebGraph(conf).run();
    if (r != 0) {
        return -1;
    }

    // Construct the web graph
    inputPath = outputBase + DriverUtil.OUTPUT_REVERSE_WEBGRAPH + "/";
    outputPath = outputBase + DriverUtil.OUTPUT_WEBGRAPH + "/";
    conf.set("Cloud9.InputPath", inputPath);
    conf.set("Cloud9.OutputPath", outputPath);
    conf.setInt("Cloud9.Mappers", 1);
    conf.setInt("Cloud9.Reducers", DriverUtil.DEFAULT_REDUCERS * (toSegment - fromSegment + 1));
    r = new BuildWebGraph(conf).run();
    if (r != 0) {
        return -1;
    }

    if (computeAnchorWeights) {
        // Propagating domain names in order to compute anchor weights
        inputPath = outputBase + DriverUtil.OUTPUT_WEBGRAPH + "/";
        outputPath = outputBase + DriverUtil.OUTPUT_HOST_NAMES + "/";
        conf.set("Cloud9.InputPath", inputPath);
        conf.set("Cloud9.OutputPath", outputPath);
        conf.setInt("Cloud9.Mappers", 1);
        conf.setInt("Cloud9.Reducers", DriverUtil.DEFAULT_REDUCERS * (toSegment - fromSegment + 1));
        r = new CollectHostnames(conf).run();
        if (r != 0) {
            return -1;
        }

        // Compute the weights
        inputPath = outputBase + DriverUtil.OUTPUT_REVERSE_WEBGRAPH + "/," + outputBase
                + DriverUtil.OUTPUT_HOST_NAMES + "/";
        outputPath = outputBase + DriverUtil.OUTPUT_WEGIHTED_REVERSE_WEBGRAPH + "/";
        conf.set("Cloud9.InputPath", inputPath);
        conf.set("Cloud9.OutputPath", outputPath);
        conf.setInt("Cloud9.Mappers", 1);
        conf.setInt("Cloud9.Reducers", DriverUtil.DEFAULT_REDUCERS * (toSegment - fromSegment + 1));
        r = new ComputeWeight(conf).run();
        if (r != 0) {
            return -1;
        }
    }

    return 0;
}
From source file: edu.umn.cs.spatialHadoop.indexing.Indexer.java
License: Open Source License

/**
 * Create a partitioner for a particular job
 * @param ins
 * @param out
 * @param job
 * @param partitionerName
 * @return
 * @throws IOException
 */
public static Partitioner createPartitioner(Path[] ins, Path out, Configuration job, String partitionerName)
        throws IOException {
    try {
        Partitioner partitioner;
        Class<? extends Partitioner> partitionerClass = PartitionerClasses.get(partitionerName.toLowerCase());
        if (partitionerClass == null) {
            // Try to parse the name as a class name
            try {
                partitionerClass = Class.forName(partitionerName).asSubclass(Partitioner.class);
            } catch (ClassNotFoundException e) {
                throw new RuntimeException("Unknown index type '" + partitionerName + "'");
            }
        }

        if (PartitionerReplicate.containsKey(partitionerName.toLowerCase())) {
            boolean replicate = PartitionerReplicate.get(partitionerName.toLowerCase());
            job.setBoolean("replicate", replicate);
        }
        partitioner = partitionerClass.newInstance();

        long t1 = System.currentTimeMillis();
        final Rectangle inMBR = (Rectangle) OperationsParams.getShape(job, "mbr");

        // Determine number of partitions
        long inSize = 0;
        for (Path in : ins) {
            inSize += FileUtil.getPathSize(in.getFileSystem(job), in);
        }
        long estimatedOutSize = (long) (inSize * (1.0 + job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.1f)));
        FileSystem outFS = out.getFileSystem(job);
        long outBlockSize = outFS.getDefaultBlockSize(out);

        final List<Point> sample = new ArrayList<Point>();
        float sample_ratio = job.getFloat(SpatialSite.SAMPLE_RATIO, 0.01f);
        long sample_size = job.getLong(SpatialSite.SAMPLE_SIZE, 100 * 1024 * 1024);

        LOG.info("Reading a sample of " + (int) Math.round(sample_ratio * 100) + "%");
        ResultCollector<Point> resultCollector = new ResultCollector<Point>() {
            @Override
            public void collect(Point p) {
                sample.add(p.clone());
            }
        };

        OperationsParams params2 = new OperationsParams(job);
        params2.setFloat("ratio", sample_ratio);
        params2.setLong("size", sample_size);
        if (job.get("shape") != null)
            params2.set("shape", job.get("shape"));
        if (job.get("local") != null)
            params2.set("local", job.get("local"));
        params2.setClass("outshape", Point.class, Shape.class);
        Sampler.sample(ins, resultCollector, params2);

        long t2 = System.currentTimeMillis();
        System.out.println("Total time for sampling in millis: " + (t2 - t1));
        LOG.info("Finished reading a sample of " + sample.size() + " records");

        int partitionCapacity = (int) Math.max(1,
                Math.floor((double) sample.size() * outBlockSize / estimatedOutSize));
        int numPartitions = Math.max(1, (int) Math.ceil((float) estimatedOutSize / outBlockSize));
        LOG.info("Partitioning the space into " + numPartitions + " partitions with capacity of "
                + partitionCapacity);

        partitioner.createFromPoints(inMBR, sample.toArray(new Point[sample.size()]), partitionCapacity);

        return partitioner;
    } catch (InstantiationException e) {
        e.printStackTrace();
        return null;
    } catch (IllegalAccessException e) {
        e.printStackTrace();
        return null;
    }
}
From source file: edu.umn.cs.spatialHadoop.nasa.HDFToText.java
License: Open Source License

/**
 * Performs an HDF to text operation as a MapReduce job and returns total
 * number of points generated.
 * @param inPath
 * @param outPath
 * @param datasetName
 * @param skipFillValue
 * @return
 * @throws IOException
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public static long HDFToTextMapReduce(Path inPath, Path outPath, String datasetName, boolean skipFillValue,
        OperationsParams params) throws IOException, InterruptedException, ClassNotFoundException {
    Job job = new Job(params, "HDFToText");
    Configuration conf = job.getConfiguration();
    job.setJarByClass(HDFToText.class);
    job.setJobName("HDFToText");

    // Set Map function details
    job.setMapperClass(HDFToTextMap.class);
    job.setNumReduceTasks(0);

    // Set input information
    job.setInputFormatClass(SpatialInputFormat3.class);
    SpatialInputFormat3.setInputPaths(job, inPath);
    if (conf.get("shape") == null)
        conf.setClass("shape", NASAPoint.class, Shape.class);
    conf.set("dataset", datasetName);
    conf.setBoolean("skipfillvalue", skipFillValue);

    // Set output information
    job.setOutputFormatClass(TextOutputFormat3.class);
    TextOutputFormat3.setOutputPath(job, outPath);

    // Run the job
    boolean verbose = conf.getBoolean("verbose", false);
    job.waitForCompletion(verbose);
    Counters counters = job.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.MAP_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();
    return resultCount;
}