List of usage examples for org.apache.hadoop.conf.Configuration.setStrings
public void setStrings(String name, String... values)
Sets the array of string values for the name property; the values are stored as a single comma-delimited string.

Parameters:
name - the property name
values - the values to store
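Before the project examples below, a minimal, self-contained sketch of the call may be useful. The property name "my.example.values" and the values here are purely illustrative and are not taken from any of the source files listed on this page:

import org.apache.hadoop.conf.Configuration;

public class SetStringsExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // Store three values under one property; they are kept as a single comma-delimited string.
        conf.setStrings("my.example.values", "alpha", "beta", "gamma");

        // The raw stored value is "alpha,beta,gamma".
        System.out.println(conf.get("my.example.values"));

        // getStrings(...) splits the stored value back into an array.
        for (String v : conf.getStrings("my.example.values")) {
            System.out.println(v);
        }
    }
}

Because the values are joined with commas, values that themselves contain commas generally do not survive the round trip.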
From source file:root.benchmark.LyrlBenchmarkJob.java
License:Apache License
/**
 * {@inheritDoc}
 */
@Override
public int run(String[] args) throws Exception {
    constructParameterList();
    if (parseArguments(args) == null) {
        return -1;
    }
    initializeConfigurationParameters();
    printJobHeader();

    Configuration conf = getConf();

    URI workingURI = new URI(conf.get("fs.default.name"));
    URI inputURI = new URI(vectorDirectory);
    FileSystem workingFS = FileSystem.get(workingURI, conf);
    FileSystem inputFS = FileSystem.get(inputURI, conf);

    Path vectorDirectoryPath = new Path(vectorDirectory);
    if (!inputFS.exists(vectorDirectoryPath)) {
        throw new Exception("Vector directory not found.");
    }
    Path wordDictDirectoryPath = new Path(wordDictDirectory);
    if (!inputFS.exists(wordDictDirectoryPath)) {
        throw new Exception("Word Dictionary directory not found.");
    }
    Path workingDirectoryPath = new Path(workingDirectory);
    if (workingFS.exists(workingDirectoryPath)) {
        throw new Exception("Working Directory already exists.");
    }
    if (!workingFS.mkdirs(workingDirectoryPath)) {
        throw new Exception("Failed to create Working Directory.");
    }

    String[] vectorizationArgs = {
        "-i", vectorDirectory,
        "-wd", wordDictDirectory,
        "-w", workingDirectory,
        "-l", numLevels,
        "-dm", "org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure",
        "-smd", diagScale };
    System.out.println();
    ToolRunner.run(conf, new VectorizationJob(), vectorizationArgs);

    long starttime, stoptime, deltatime;
    starttime = System.currentTimeMillis();

    String[] hapArgs = {
        "-i", workingDirectory + apInputDirectory,
        "-o", workingDirectory + apOutputDirectory,
        "-l", numLevels,
        "-w", workingDirectory,
        "-iter", numIterations,
        "-lambda", lambda };
    ToolRunner.run(conf, new HierarchicalAffinityPropagationJob(), hapArgs);

    stoptime = System.currentTimeMillis();
    deltatime = stoptime - starttime;

    conf.setStrings(CONF_PREFIX + "Dataset", "Lyrl2004");
    conf.setStrings(CONF_PREFIX + "iterations", numIterations);
    conf.setStrings(CONF_PREFIX + "levels", numLevels);
    conf.setStrings(CONF_PREFIX + "lambda", lambda);
    conf.setStrings(CONF_PREFIX + "SMatSeed", diagScale);

    writeTimestamp(conf, workingDirectory + timeStamp, deltatime);

    String[] logJobArgs = {
        "-s", vectorDirectory,
        "-c", workingDirectory + apOutputDirectory,
        "-t", workingDirectory + timeStamp,
        "-f", workingDirectory + dir_dataVectors + dir_dataMetaData + "/vectorName2docId", };
    ToolRunner.run(conf, new LogJob(), logJobArgs);

    // workingFS = FileSystem.get(workingURI, conf);
    // workingFS.mkdirs(new Path(workingDirectory+dir_hiveDataRoot));
    //
    // String [] hiveArgs={
    //     "-sqldb" , url_mysql,
    //     "-sqlusr" , usr_mysql,
    //     "-sqlpsw" , psw_mysql,
    //     "-dataroot" , workingDirectory+dir_hiveDataRoot,
    //     "-ap-out" , workingDirectory+apOutputDirectory,
    //     "-word-dict", workingDirectory+wordDictionaryDirectory,
    //     "-metadata" , workingDirectory+dir_dataVectors+dir_dataMetaData,
    //     "-tf-vec" , workingDirectory+dir_dataVectors+dir_dataTFVectorFile,
    //     "-nwords" , topNWords
    // };
    // ToolRunner.run(conf,new OutputJob(), hiveArgs);

    return 0;
}
From source file:root.benchmark.ReutersBenchmarkJob.java
License:Apache License
/**
 * {@inheritDoc}
 */
@Override
public int run(String[] args) throws Exception {
    constructParameterList();
    if (parseArguments(args) == null) {
        return -1;
    }
    initializeConfigurationParameters();
    printJobHeader();

    Configuration conf = getConf();

    URI workingURI = new URI(conf.get("fs.default.name"));
    URI inputURI = new URI(inputDirectory);
    FileSystem workingFS = FileSystem.get(workingURI, conf);
    FileSystem inputFS = FileSystem.get(inputURI, conf);

    Path inputDirectoryPath = new Path(inputDirectory);
    if (!inputFS.exists(inputDirectoryPath)) {
        throw new Exception("Input directory not found.");
    }
    Path workingDirectoryPath = new Path(workingDirectory);
    if (workingFS.exists(workingDirectoryPath)) {
        throw new Exception("Working Directory already exists.");
    }
    if (!workingFS.mkdirs(workingDirectoryPath)) {
        throw new Exception("Failed to create Working Directory.");
    }

    String[] vectorizationArgs = {
        "-i", inputDirectory,
        "-o", workingDirectory + vectorizationOutputDirectory,
        "-x", exclusionThreshold,
        "-mdf", minimumDocumentFrequency, };
    System.out.println();
    ToolRunner.run(conf, new ReutersVectorizationJob(), vectorizationArgs);

    long starttime, stoptime, deltatime;
    starttime = System.currentTimeMillis();

    String[] canopyArgs = {
        "-i", workingDirectory + vectorizationOutputDirectory + "/vectorFiles/tf-vectors",
        "-o", workingDirectory + canopyOutputDirectory,
        "-dm", distanceMetric,
        "-t1", threshold1,
        "-t2", threshold2 };
    ToolRunner.run(conf, new CanopyJob(), canopyArgs);

    String[] kmeansArgs = {
        "-i", workingDirectory + vectorizationOutputDirectory + "/vectorFiles/tf-vectors",
        "-o", workingDirectory + kmeansOutputDirectory,
        "-c", workingDirectory + canopyOutputDirectory + "/clusters-0-final",
        "-dm", distanceMetric,
        "-cd", convergenceDelta,
        "-mIter", numIterations,
        "-k", "3" };
    ToolRunner.run(conf, new KMeansJob(), kmeansArgs);

    Path input = new Path(workingDirectory + kmeansOutputDirectory);
    Path output = new Path(workingDirectory + outputDirectory);
    boolean sequential = false;
    ClusterOutputPostProcessorDriver.run(input, output, sequential);

    stoptime = System.currentTimeMillis();
    deltatime = stoptime - starttime;

    conf.setStrings(CONF_PREFIX + "Dataset", "Image");
    conf.setStrings(CONF_PREFIX + "Iterations", numIterations);
    conf.setStrings(CONF_PREFIX + "Levels", convergenceDelta);

    writeTimestamp(conf, workingDirectory + timeStamp, deltatime);

    return 0;
}
From source file:root.input.images.ImagesJob.java
License:Apache License
/**
 * {@inheritDoc}
 */
@Override
public int run(String[] args) throws Exception {
    constructParameterList();
    if (parseArguments(args) == null) {
        return -1;
    }
    initializeConfigurationParameters();
    printJobHeader();

    Configuration conf = getConf();

    URI workingURI = new URI(conf.get("fs.default.name"));
    URI inputURI = new URI(inputDirectory);
    FileSystem workingFS = FileSystem.get(workingURI, conf);
    FileSystem inputFS = FileSystem.get(inputURI, conf);

    Path vectorDirectoryPath = new Path(inputDirectory);
    if (!inputFS.exists(vectorDirectoryPath)) {
        throw new Exception("Points directory not found.");
    }
    Path workingDirectoryPath = new Path(workingDirectory);
    if (workingFS.exists(workingDirectoryPath)) {
        throw new Exception("Working Directory already exists.");
    }
    if (!workingFS.mkdirs(workingDirectoryPath)) {
        throw new Exception("Failed to create Working Directory.");
    }

    String[] vectorizationArgs = {
        "-i", inputDirectory,
        "-w", workingDirectory,
        "-smd", diagScale,
        "-l", numLevels,
        "-dm", "org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure" };
    System.out.println();
    ToolRunner.run(conf, new VectorizationJob(), vectorizationArgs);

    long starttime, stoptime, deltatime;
    starttime = System.currentTimeMillis();

    String[] hapArgs = {
        "-i", workingDirectory + apInputDirectory,
        "-o", workingDirectory + apOutputDirectory,
        "-l", numLevels,
        "-w", workingDirectory,
        "-iter", numIterations,
        "-lambda", lambda,
        "-n", N };
    ToolRunner.run(conf, new HierarchicalAffinityPropagationJob(), hapArgs);

    stoptime = System.currentTimeMillis();
    deltatime = stoptime - starttime;

    conf.setStrings(CONF_PREFIX + "Dataset", "Image");
    conf.setStrings(CONF_PREFIX + "Iterations", numIterations);
    conf.setStrings(CONF_PREFIX + "Levels", numLevels);
    conf.setStrings(CONF_PREFIX + "Lambda", lambda);
    conf.setStrings(CONF_PREFIX + "SMatSeed", diagScale);

    writeTimestamp(conf, workingDirectory + timeStamp, deltatime);

    String[] logJobArgs = {
        "-s", inputDirectory,
        "-c", workingDirectory + apOutputDirectory,
        "-t", workingDirectory + timeStamp,
        "-f", workingDirectory + dir_dataVectors + dir_dataMetaData + "/vectorName2docId", };
    ToolRunner.run(conf, new LogJob(), logJobArgs);

    // workingFS = FileSystem.get(workingURI, conf);
    // workingFS.mkdirs(new Path(workingDirectory+dir_hiveDataRoot));
    //
    // String [] hiveArgs={
    //     "-sqldb" , url_mysql,
    //     "-sqlusr" , usr_mysql,
    //     "-sqlpsw" , psw_mysql,
    //     "-dataroot" , workingDirectory+dir_hiveDataRoot,
    //     "-ap-out" , workingDirectory+apOutputDirectory,
    //     "-word-dict", workingDirectory+wordDictionaryDirectory,
    //     "-metadata" , workingDirectory+dir_dataVectors+dir_dataMetaData,
    //     "-tf-vec" , workingDirectory+dir_dataVectors+dir_dataTFVectorFile,
    //     "-nwords" , topNWords
    // };
    // ToolRunner.run(conf,new OutputJob(), hiveArgs);

    return 0;
}
From source file:root.input.lyrl2004.LyrlJob.java
License:Apache License
/**
 * {@inheritDoc}
 */
@Override
public int run(String[] args) throws Exception {
    constructParameterList();
    if (parseArguments(args) == null) {
        return -1;
    }
    initializeConfigurationParameters();
    printJobHeader();

    Configuration conf = getConf();

    URI workingURI = new URI(conf.get("fs.default.name"));
    URI inputURI = new URI(vectorDirectory);
    FileSystem workingFS = FileSystem.get(workingURI, conf);
    FileSystem inputFS = FileSystem.get(inputURI, conf);

    Path vectorDirectoryPath = new Path(vectorDirectory);
    if (!inputFS.exists(vectorDirectoryPath)) {
        throw new Exception("Vector directory not found.");
    }
    Path wordDictDirectoryPath = new Path(wordDictDirectory);
    if (!inputFS.exists(wordDictDirectoryPath)) {
        throw new Exception("Word Dictionary directory not found.");
    }
    Path workingDirectoryPath = new Path(workingDirectory);
    if (workingFS.exists(workingDirectoryPath)) {
        throw new Exception("Working Directory already exists.");
    }
    if (!workingFS.mkdirs(workingDirectoryPath)) {
        throw new Exception("Failed to create Working Directory.");
    }

    String[] vectorizationArgs = {
        "-i", vectorDirectory,
        "-wd", wordDictDirectory,
        "-w", workingDirectory,
        "-l", numLevels,
        "-dm", "org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure",
        "-smd", diagScale };
    System.out.println();
    ToolRunner.run(conf, new VectorizationJob(), vectorizationArgs);

    long starttime, stoptime, deltatime;
    starttime = System.currentTimeMillis();

    String[] hapArgs = {
        "-i", workingDirectory + apInputDirectory,
        "-o", workingDirectory + apOutputDirectory,
        "-l", numLevels,
        "-w", workingDirectory,
        "-iter", numIterations,
        "-lambda", lambda,
        "-n", N };
    ToolRunner.run(conf, new HierarchicalAffinityPropagationJob(), hapArgs);

    stoptime = System.currentTimeMillis();
    deltatime = stoptime - starttime;

    conf.setStrings(CONF_PREFIX + "Dataset", "Lyrl2004");
    conf.setStrings(CONF_PREFIX + "iterations", numIterations);
    conf.setStrings(CONF_PREFIX + "levels", numLevels);
    conf.setStrings(CONF_PREFIX + "lambda", lambda);
    conf.setStrings(CONF_PREFIX + "SMatSeed", diagScale);

    writeTimestamp(conf, workingDirectory + timeStamp, deltatime);

    String[] logJobArgs = {
        "-s", vectorDirectory,
        "-c", workingDirectory + apOutputDirectory,
        "-t", workingDirectory + timeStamp,
        "-f", workingDirectory + dir_dataVectors + dir_dataMetaData + "/vectorName2docId", };
    ToolRunner.run(conf, new LogJob(), logJobArgs);

    // workingFS = FileSystem.get(workingURI, conf);
    // workingFS.mkdirs(new Path(workingDirectory+dir_hiveDataRoot));
    //
    // String [] hiveArgs={
    //     "-sqldb" , url_mysql,
    //     "-sqlusr" , usr_mysql,
    //     "-sqlpsw" , psw_mysql,
    //     "-dataroot" , workingDirectory+dir_hiveDataRoot,
    //     "-ap-out" , workingDirectory+apOutputDirectory,
    //     "-word-dict", workingDirectory+wordDictionaryDirectory,
    //     "-metadata" , workingDirectory+dir_dataVectors+dir_dataMetaData,
    //     "-tf-vec" , workingDirectory+dir_dataVectors+dir_dataTFVectorFile,
    //     "-nwords" , topNWords
    // };
    // ToolRunner.run(conf,new OutputJob(), hiveArgs);

    return 0;
}
From source file:root.input.reuters21578.ReutersJob.java
License:Apache License
/**
 * {@inheritDoc}
 */
@Override
public int run(String[] args) throws Exception {
    constructParameterList();
    if (parseArguments(args) == null) {
        return -1;
    }
    initializeConfigurationParameters();
    printJobHeader();

    Configuration conf = getConf();

    URI workingURI = new URI(conf.get("fs.default.name"));
    URI inputURI = new URI(inputDirectory);
    FileSystem workingFS = FileSystem.get(workingURI, conf);
    FileSystem inputFS = FileSystem.get(inputURI, conf);

    Path inputDirectoryPath = new Path(inputDirectory);
    if (!inputFS.exists(inputDirectoryPath)) {
        throw new Exception("Input directory not found.");
    }
    Path workingDirectoryPath = new Path(workingDirectory);
    if (workingFS.exists(workingDirectoryPath)) {
        throw new Exception("Working Directory already exists.");
    }
    if (!workingFS.mkdirs(workingDirectoryPath)) {
        throw new Exception("Failed to create Working Directory.");
    }

    // workingFS.mkdirs( new Path( workingDirectory + vectorizationOutputDirectory ) );
    // workingFS.mkdirs( new Path( workingDirectory + vectorizationOutputDirectory + apInputDirectory ));
    //
    // String[] similarityMatrixConverterArgs = {
    //     "-i", inputDirectory + "/Ssym.csv",
    //     "-o", workingDirectory + vectorizationOutputDirectory + apInputDirectory,
    //     "-l", numLevels
    // };
    // ToolRunner.run( conf, new SimilarityMatrixConverter(), similarityMatrixConverterArgs );

    String[] vectorizationArgs = {
        "-i", inputDirectory,
        "-o", workingDirectory + vectorizationOutputDirectory,
        "-x", exclusionThreshold,
        "-l", numLevels,
        "-mdf", minimumDocumentFrequency,
        "-dm", distanceMetric,
        "-tf_tfidf", tf_tfidf,
        "-smd", diagScale };
    System.out.println();
    ToolRunner.run(conf, new VectorizationJob(), vectorizationArgs);

    long starttime, stoptime, deltatime;
    starttime = System.currentTimeMillis();

    String[] hapArgs = {
        "-i", workingDirectory + vectorizationOutputDirectory + apInputDirectory,
        "-o", workingDirectory + apOutputDirectory,
        "-l", numLevels,
        "-w", workingDirectory,
        "-iter", numIterations,
        "-lambda", lambda,
        "-n", N };
    ToolRunner.run(conf, new HierarchicalAffinityPropagationJob(), hapArgs);

    stoptime = System.currentTimeMillis();
    deltatime = stoptime - starttime;

    conf.setStrings(CONF_PREFIX + "Dataset", "Reuters");
    conf.setStrings(CONF_PREFIX + "iterations", numIterations);
    conf.setStrings(CONF_PREFIX + "levels", numLevels);
    conf.setStrings(CONF_PREFIX + "lambda", lambda);
    conf.setStrings(CONF_PREFIX + "SMatSeed", diagScale);

    writeTimestamp(conf, workingDirectory + timeStamp, deltatime);

    String[] logJobArgs = {
        "-s", inputDirectory,
        "-c", workingDirectory + apOutputDirectory,
        "-t", workingDirectory + timeStamp,
        "-f", workingDirectory + vectorizationOutputDirectory + dir_dataMetaData + "/fileName2docId" };
    ToolRunner.run(conf, new LogJob(), logJobArgs);

    //
    //
    // Move files into position for HiveJob
    // workingFS = FileSystem.get(workingURI, conf);
    // workingFS.mkdirs(new Path(workingDirectory + dir_hiveDataRoot));
    // Path currWordDict = new Path(workingDirectory
    //     + vectorizationOutputDirectory
    //     + "/vectorFiles/dictionary.file-0");
    // Path nextWordDict = new Path(workingDirectory
    //     + vectorizationOutputDirectory + dir_dataWordDict
    //     + "/dictionary.file-0");
    // workingFS.mkdirs(new Path(workingDirectory
    //     + vectorizationOutputDirectory + dir_dataWordDict));
    // workingFS.rename(currWordDict, nextWordDict);
    //
    // String[] hiveArgs = {
    //     "--sqldb", url_mysql,
    //     "--sqlusr", usr_mysql,
    //     "--sqlpsw", psw_mysql,
    //     "--dataroot", workingDirectory + dir_hiveDataRoot,
    //     "--ap-out", workingDirectory + apOutputDirectory,
    //     "--word-dict", workingDirectory + vectorizationOutputDirectory
    //         + dir_dataWordDict,
    //     "--metadata", workingDirectory + vectorizationOutputDirectory
    //         + dir_dataMetaData,
    //     "--tf-vec", workingDirectory + vectorizationOutputDirectory
    //         + dir_dataTFVectors + tf_tfidf + "-vectors",
    //     "--nwords", topNWords
    // };
    // ToolRunner.run(conf, new OutputJob(), hiveArgs);

    return 0;
}
From source file:simsql.runtime.JoinOp.java
License:Apache License
public void setConfigurations(Configuration conf, RuntimeParameter params) {

    // first, send out the type of join
    conf.setStrings("simsql.joinType", new String[] { joinType.toString().toLowerCase() });

    // set the self-join value
    conf.setBoolean("simsql.isSelfJoin", isSelfJoin);

    // see if we have a Cartesian product
    conf.setBoolean("simsql.joinCartesian", isCartesian);

    // see if we have a pure, map-only merge join
    conf.setBoolean("simsql.isMergeJoin", mergeJoin);

    // if we are able to avoid a sort of the left or of the right, then we need some extra configs that will allow the merge
    if (mergeJoin || sortOnlyRight) {
        conf.setInt("simsql.sortedFileTypeCode", getDB().getTypeCode(getDB().getTableName(leftFile)));
        conf.set("simsql.sortedFileName", leftFile);
        conf.setInt("simsql.sortedFileNumAtts", getDB().getNumAtts(getDB().getTableName(leftFile)));
    } else if (sortOnlyLeft) {
        conf.setInt("simsql.sortedFileTypeCode", getDB().getTypeCode(getDB().getTableName(rightFile)));
        conf.set("simsql.sortedFileName", rightFile);
        conf.setInt("simsql.sortedFileNumAtts", getDB().getNumAtts(getDB().getTableName(rightFile)));
    }

    // find out which relation is the largest.
    long leftSize = getPathsActualSize(getValue("leftInput.inFiles").getStringList().toArray(new String[0]));
    long rightSize = getPathsActualSize(getValue("rightInput.inFiles").getStringList().toArray(new String[0]));

    long smallerSize = 0;
    long largerSize = 0;
    int smallerTypeCode = -1;
    int largerTypeCode = -1;

    if (leftSize < rightSize) {
        smallerSize = leftSize;
        largerSize = rightSize;
        smallerTypeCode = leftTypeCode;
        largerTypeCode = rightTypeCode;
    } else {
        smallerSize = rightSize;
        largerSize = leftSize;
        smallerTypeCode = rightTypeCode;
        largerTypeCode = leftTypeCode;
    }

    // and pass the typecode and size of those relations.
    conf.setInt("simsql.smallerRelation.typeCode", smallerTypeCode);
    conf.setInt("simsql.largerRelation.typeCode", largerTypeCode);
    conf.setLong("simsql.smallerRelation.size", smallerSize);
    conf.setLong("simsql.largerRelation.size", largerSize);
}
From source file:simsql.runtime.MRLoader.java
License:Apache License
public long run(String inputPath, String outputPath, short typeCode, Relation r, int sortAtt) {

    // make a directory for the relation
    Configuration conf = new Configuration();
    FileSystem dfs = null;

    try {
        dfs = FileSystem.get(conf);
    } catch (Exception e) {
        throw new RuntimeException("Cannot access HDFS!", e);
    }

    try {
        // if it exists, destroy it.
        Path path = new Path(outputPath);
        if (dfs.exists(path)) {
            dfs.delete(path, true);
        }
    } catch (Exception e) {
        throw new RuntimeException("Could not create the file to bulk load to!", e);
    }

    // find a file name
    String tempPath = null;
    if (inputPath.startsWith("hdfs:")) {
        tempPath = inputPath.replace("hdfs:", "");
    } else {
        tempPath = "/tempDataFile_" + r.getName();
        try {
            dfs.delete(new Path(tempPath), true);
        } catch (Exception e) {
            // ignore this.
        }

        // upload the text file
        try {
            dfs.copyFromLocalFile(false, true, new Path(inputPath), new Path(tempPath));
            dfs.deleteOnExit(new Path(tempPath));
        } catch (Exception e) {
            throw new RuntimeException("Failed to upload text file " + inputPath + " to HDFS!", e);
        }
    }

    // set up the new job's parameters.
    conf.setBoolean("mapred.compress.map.output", true);
    conf.set("mapred.map.output.compression.codec", RecordCompression.getCodecClass());
    conf.set("io.serializations",
            "simsql.runtime.RecordSerialization,simsql.runtime.RecordKeySerialization,org.apache.hadoop.io.serializer.WritableSerialization");

    conf.setInt("simsql.loader.numAtts", r.getAttributes().size());
    conf.setInt("simsql.loader.typeCode", (int) typeCode);
    conf.setInt("simsql.loader.sortAtt", sortAtt);

    String[] myStrings = new String[r.getAttributes().size()];
    int j = 0;
    for (simsql.compiler.Attribute a : r.getAttributes()) {
        myStrings[j++] = a.getPhysicalRealization().getClass().getName();
    }
    conf.setStrings("simsql.loader.types", myStrings);

    // create a job
    Job job;
    try {
        job = new Job(conf);
    } catch (Exception e) {
        throw new RuntimeException("Unable to create bulk loading job!", e);
    }

    // set the split size (number of mappers)
    long fSize = 0;
    if (inputPath.startsWith("hdfs")) {
        fSize = RelOp.getPathsTotalSize(new String[] { tempPath });
    } else {
        fSize = new File(inputPath).length();
    }
    FileInputFormat.setMinInputSplitSize(job, fSize / (long) numTasks);
    FileInputFormat.setMaxInputSplitSize(job, fSize / (long) numTasks);

    // and the number of reducers
    job.setNumReduceTasks(numTasks);

    // the mapper/reducer/jar
    job.setMapperClass(MRLoaderMapper.class);
    job.setReducerClass(MRLoaderReducer.class);
    job.setJarByClass(MRLoader.class);

    // I/O settings.
    job.setOutputFormatClass(RecordOutputFormat.class);
    job.setMapOutputKeyClass(RecordKey.class);
    job.setMapOutputValueClass(RecordWrapper.class);
    job.setOutputKeyClass(Nothing.class);
    job.setOutputValueClass(Record.class);

    try {
        FileInputFormat.setInputPaths(job, new Path(tempPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));
    } catch (Exception e) {
        throw new RuntimeException("Could not set job inputs/outputs", e);
    }

    job.setGroupingComparatorClass(RecordKeyGroupingComparator.class);
    job.setPartitionerClass(RecordPartitioner.class);
    job.setSortComparatorClass(RecordKeySortComparator.class);
    job.setJobName("MRLoader: " + inputPath + " ==> " + outputPath);

    // run it
    Counters counters;
    try {
        job.waitForCompletion(true);
        counters = job.getCounters();
    } catch (Exception e) {
        throw new RuntimeException("Could not set up bulk loader job!", e);
    }

    // now, delete all the empty part files
    try {
        // get a filesystem
        FileSystem ddfs = FileSystem.get(conf);
        Path outPath = new Path(outputPath);
        if (ddfs.exists(outPath) && ddfs.isDirectory(outPath)) {
            FileStatus fstatus[] = ddfs.listStatus(outPath, new TableFileFilter());
            for (FileStatus ff : fstatus) {
                if (ddfs.getContentSummary(ff.getPath()).getLength() <= 4) { // snappy leaves 4-byte long files around...
                    ddfs.delete(ff.getPath(), true);
                }
            }
        }
    } catch (Exception e) {
        // this isn't disastrous
    }

    // get the counter for the output of the mapper.
    Counter bytesCounter = counters.findCounter(OutputFileSerializer.Counters.BYTES_WRITTEN);
    return bytesCounter.getValue();
}
From source file:simsql.runtime.RelOp.java
License:Apache License
public boolean run(RuntimeParameter params, boolean verbose) {

    ExampleRuntimeParameter pp = (ExampleRuntimeParameter) params;

    // build the jar.
    String jarFile = buildJarFile(params);

    // Get the default configuration object
    Configuration conf = new Configuration();

    // set quite mode on/off
    conf.setQuietMode(!verbose);

    /***
     conf.setBoolean("mapred.task.profile", true);
     conf.set("mapred.task.profile.params", "-agentlib:hprof=cpu=samples,"
             + "heap=sites,depth=8,force=n,thread=y,verbose=n,file=%s");
     ***/

    // tell it how to serialize and deserialize records and recordkeys
    conf.set("io.serializations", getSerializations());
    conf.setBoolean("mapred.compress.map.output", true);

    int ioSortMB = conf.getInt("io.sort.mb", 256);
    conf.set("mapred.map.child.java.opts",
            "-Xmx" + (getMemPerMapper(params) + ioSortMB) + "m -Xms" + (getMemPerMapper(params))
                    + "m -Duser.timezone='America/Chicago' -Djava.net.preferIPv4Stack=true -XX:CompileThreshold=10000 -XX:+DoEscapeAnalysis -XX:+UseNUMA -XX:-EliminateLocks -XX:+UseBiasedLocking -XX:+OptimizeStringConcat -XX:+UseFastAccessorMethods -XX:+UseConcMarkSweepGC -XX:+CMSIncrementalMode -XX:+CMSIncrementalPacing -XX:CMSIncrementalDutyCycleMin=0 -XX:+UseCompressedOops -XX:+AggressiveOpts -XX:-UseStringCache -XX:ErrorFile=/tmp/hs_err_pid%p.log");

    conf.set("mapred.reduce.child.java.opts",
            "-Xmx" + (getMemPerReducer(params) + ioSortMB) + "m -Xms" + (getMemPerMapper(params))
                    + "m -Duser.timezone='America/Chicago' -Djava.net.preferIPv4Stack=true -XX:CompileThreshold=10000 -XX:+DoEscapeAnalysis -XX:+UseNUMA -XX:-EliminateLocks -XX:+UseBiasedLocking -XX:+OptimizeStringConcat -XX:+UseFastAccessorMethods -XX:+UseConcMarkSweepGC -XX:+CMSIncrementalMode -XX:+CMSIncrementalPacing -XX:CMSIncrementalDutyCycleMin=0 -XX:+UseCompressedOops -XX:+AggressiveOpts -XX:-UseStringCache -XX:ErrorFile=/tmp/hs_err_pid%p.log");

    conf.setInt("simsql.input.numSplits", pp.getNumCPUs());
    conf.setInt("mapred.job.reuse.jvm.num.tasks", 1);

    // conf.setBoolean ("mapred.map.tasks.speculative.execution", false);
    // conf.setBoolean ("mapred.reduce.tasks.speculative.execution", false);

    // tell it to use the jar that we just created
    conf.set("mapred.jar", jarFile);

    // conf.set("tmpjars", "file:///usr/lib/hadoop-mapreduce/hadoop-mapreduce-client-core.jar");

    conf.setBoolean("mapred.output.compress", true);
    conf.setStrings("mapred.output.compression.type", new String[] { "RECORD" });

    // use snappy for the intermediate stuff
    conf.set("mapred.map.output.compression.codec", RecordCompression.getCodecClass());

    // do some additional operator-specific configurations
    setConfigurations(conf, params);

    // collect statistics for final relations always
    conf.setBoolean("simsql.collectStats", isFinal || collectStats);

    // figure out what file to map
    String[] inDirs = myInputNetwork.getPipelinedInputFiles();
    inDirs = excludeAnyWhoWillNotBeMapped(inDirs);
    String inSingleString = inDirs[0];
    conf.set("simsql.fileToMap", inSingleString);
    for (int i = 1; i < inDirs.length; i++) {
        inSingleString += "," + inDirs[i];
    }

    // create and name the job
    Job job;
    try {
        job = new Job(conf);
    } catch (Exception e) {
        throw new RuntimeException("Unable to create a new job!", e);
    }
    job.setJobName(getJobName());

    // set the map-reduce input and output types
    job.setMapOutputKeyClass(getMapOutputKeyClass());
    job.setMapOutputValueClass(getMapOutputValueClass());
    job.setOutputKeyClass(getOutputKeyClass());
    job.setOutputValueClass(getOutputValueClass());

    int numReducers = getNumReducers(params);

    job.setMapperClass(getMapperClass());
    job.setReducerClass(getReducerClass());

    // set the number of reducers
    job.setNumReduceTasks(numReducers);

    // set the input and the output formats... these extend FileInputFormat and FileOutputFormat
    job.setInputFormatClass(getInputFormatClass());
    job.setOutputFormatClass(getOutputFormatClass());

    // set the input and output paths
    try {
        System.out.println("input file: " + inSingleString);
        FileInputFormat.setInputPaths(job, inSingleString);
        FileInputFormat.setInputPathFilter(job, TableFileFilter.class);
        FileOutputFormat.setOutputPath(job, new Path(getOutput()));
    } catch (Exception e) {
        throw new RuntimeException("Unable to set up the input/output path for the job.", e);
    }

    // set the split size
    FileInputFormat.setMinInputSplitSize(job, getSplitSize(params));
    FileInputFormat.setMaxInputSplitSize(job, getSplitSize(params));

    // set the various sorting/grouping/mapping classes
    job.setGroupingComparatorClass(getGroupingComparatorClass());
    job.setPartitionerClass(getPartitionerClass());
    job.setSortComparatorClass(getSortComparatorClass());

    // and now, submit the job and wait for things to finish
    int exitCode;
    try {
        exitCode = job.waitForCompletion(verbose) ? 0 : 1;

        // get the output bytes counter.
        Counters c = job.getCounters();
        Counter mx = c.findCounter(OutputFileSerializer.Counters.BYTES_WRITTEN);

        // and use them to set the size of the output relation.
        if (myDB != null) {
            myDB.setTableSize(myDB.getTableName(getOutput()), mx.getValue());
            myDB.setNumAtts(myDB.getTableName(getOutput()), getOutputAttNames().length);
        }
    } catch (Exception e) {
        e.printStackTrace();
        throw new RuntimeException("Unable to run the job", e);
    }

    // now, delete all the empty part files
    try {
        // get a filesystem
        FileSystem dfs = FileSystem.get(conf);
        Path outPath = new Path(getOutput());
        if (dfs.exists(outPath) && dfs.isDirectory(outPath)) {
            FileStatus fstatus[] = dfs.listStatus(outPath, new TableFileFilter());
            for (FileStatus ff : fstatus) {
                if (dfs.getContentSummary(ff.getPath()).getLength() <= 4) { // snappy leaves 4-byte long files around...
                    dfs.delete(ff.getPath(), true);
                }
            }
        }
    } catch (Exception e) {
        // this isn't disastrous
    }

    return (exitCode == 0);
}
From source file:simsql.runtime.VGWrapperOp.java
License:Apache License
public void setConfigurations(Configuration conf, RuntimeParameter params) {

    ExampleRuntimeParameter p = (ExampleRuntimeParameter) params;

    /**
     conf.setBoolean("mapred.task.profile", true);
     conf.set("mapred.task.profile.params", "-agentlib:hprof=cpu=samples,"
             + "heap=sites,depth=10,force=n,thread=y,verbose=n,file=%s");
     **/

    // set the number of iterations
    conf.setInt("simsql.numIterations", p.getNumIterations());

    // set the file name of the VG function.
    conf.setStrings("simsql.functionFile", new String[] { "/simsql/functions/" + getVGFunctions()[0] + ".so" });

    // set the buffer size for data exchange -- 2GB is the maximum because it is a long.
    int bSize = 0;
    if (((p.getMemoryPerCPUInMB() / 2) * 1024L * 1024L) > (long) Integer.MAX_VALUE) {
        bSize = Integer.MAX_VALUE;
    } else {
        bSize = (p.getMemoryPerCPUInMB() / 2) * 1024 * 1024;
    }
    conf.setInt("simsql.dataBufferSize", bSize);

    // set the cross product relations.
    if (crossFiles.size() > 0) {
        conf.setStrings("simsql.crossFiles", crossFiles.toArray(new String[0]));
        conf.setStrings("simsql.crossTypeCodes", crossTypeCodes.toArray(new String[0]));
        conf.setStrings("simsql.crossAttCounts", crossAttCounts.toArray(new String[0]));
    }

    // set the sorted input relations.
    if (sortedInnerFiles.size() > 0) {
        conf.setStrings("simsql.sortedFiles", sortedInnerFiles.toArray(new String[0]));
        conf.setStrings("simsql.sortedTypeCodes", sortedTypeCodes.toArray(new String[0]));
        conf.setStrings("simsql.sortedAttCounts", sortedAttCounts.toArray(new String[0]));
    }

    conf.setBoolean("simsql.runVGWrapperReducer", runVGWrapperReducer);
}
From source file:sourcefiles.BuildPersonalizedPageRankRecords.java
License:Apache License
/**
 * Runs this tool.
 */
@Override
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(
            OptionBuilder.withArgName("num").hasArg().withDescription("number of nodes").create(NUM_NODES));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("sources").create(SOURCES));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT) || !cmdline.hasOption(NUM_NODES)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int n = Integer.parseInt(cmdline.getOptionValue(NUM_NODES));
    String sources = cmdline.getOptionValue(SOURCES);

    LOG.info("Tool name: " + BuildPersonalizedPageRankRecords.class.getSimpleName());
    LOG.info(" - inputDir: " + inputPath);
    LOG.info(" - outputDir: " + outputPath);
    LOG.info(" - numNodes: " + n);
    LOG.info(" - sources: " + sources);

    Configuration conf = getConf();
    conf.setInt(NODE_CNT_FIELD, n);
    conf.setStrings(NODE_SRC_FIELD, sources);
    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);

    Job job = Job.getInstance(conf);
    job.setJobName(BuildPersonalizedPageRankRecords.class.getSimpleName() + ":" + inputPath);
    job.setJarByClass(BuildPersonalizedPageRankRecords.class);

    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    // job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PageRankNodeEnhanced.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(PageRankNodeEnhanced.class);

    job.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    return 0;
}
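A pattern shared by several of the examples above is that the driver stores parameters with setStrings and the map or reduce tasks read them back from the job's Configuration. The sketch below shows only that read side; the mapper class, the property name "example.sources", and the filtering logic are hypothetical and are not taken from any of the projects listed here.

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class SourceFilterMapper extends Mapper<LongWritable, Text, Text, Text> {

    private String[] sources;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Read back the comma-delimited list that the driver stored with conf.setStrings("example.sources", ...).
        sources = context.getConfiguration().getStrings("example.sources", new String[0]);
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Emit only lines that mention one of the configured source ids.
        for (String source : sources) {
            if (value.toString().contains(source)) {
                context.write(new Text(source), value);
                break;
            }
        }
    }
}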