List of usage examples for org.apache.hadoop.conf.Configuration.set()
public void set(String name, String value)

Set the value of the name property.
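For orientation, here is a minimal, self-contained sketch (not drawn from any of the source files below) showing the basic set/get round trip on a Configuration; the class name and the property keys are only illustrative:

import org.apache.hadoop.conf.Configuration;

public class ConfigurationSetExample { // hypothetical example class, not from the listed sources
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // set() stores the value under the given property name as a String
        conf.set("mapreduce.job.reduces", "2");

        // get() returns the stored value, or the supplied default when the property is unset
        System.out.println(conf.get("mapreduce.job.reduces", "1"));
        System.out.println(conf.get("some.unset.property", "fallback"));
    }
}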
From source file:cascading.tap.hadoop.Hfs.java
License:Open Source License
/**
 * Based on the configuration, handles and sets {@link CombineFileInputFormat} as the input
 * format.
 */
private void handleCombineFileInputFormat(Configuration conf) {
    // if combining files, override the configuration to use CombineFileInputFormat
    if (!getUseCombinedInput(conf))
        return;

    // get the prescribed individual input format from the underlying scheme so it can be used by CombinedInputFormat
    String individualInputFormat = conf.get("mapred.input.format.class");

    if (individualInputFormat == null)
        throw new TapException("input format is missing from the underlying scheme");

    if (individualInputFormat.equals(CombinedInputFormat.class.getName())
        && conf.get(CombineFileRecordReaderWrapper.INDIVIDUAL_INPUT_FORMAT) == null)
        throw new TapException(
            "the input format class is already the combined input format but the underlying input format is missing");

    // if safe mode is on (default) throw an exception if the InputFormat is not a FileInputFormat, otherwise log a
    // warning and don't use the CombineFileInputFormat
    boolean safeMode = getCombinedInputSafeMode(conf);

    if (!FileInputFormat.class.isAssignableFrom(conf.getClass("mapred.input.format.class", null))) {
        if (safeMode)
            throw new TapException(
                "input format must be of type org.apache.hadoop.mapred.FileInputFormat, got: " + individualInputFormat);
        else
            LOG.warn(
                "not combining input splits with CombineFileInputFormat, {} is not of type org.apache.hadoop.mapred.FileInputFormat.",
                individualInputFormat);
    } else {
        // set the underlying individual input format
        conf.set(CombineFileRecordReaderWrapper.INDIVIDUAL_INPUT_FORMAT, individualInputFormat);

        // override the input format class
        conf.setClass("mapred.input.format.class", CombinedInputFormat.class, InputFormat.class);
    }
}
From source file:cascading.tap.hadoop.io.StreamedFileSystem.java
License:Open Source License
public static void setMD5SumFor(Configuration conf, String path, String md5Hex) {
    if (md5Hex == null || md5Hex.length() == 0)
        return;

    conf.set(path + ".md5", md5Hex);
}
From source file:cascading.tap.hadoop.util.Hadoop18TapUtil.java
License:Open Source License
/**
 * should only be called if not in a Flow
 *
 * @param conf
 * @throws IOException
 */
public static void setupJob(Configuration conf) throws IOException {
    Path outputPath = FileOutputFormat.getOutputPath(asJobConfInstance(conf));

    if (outputPath == null)
        return;

    if (getFSSafe(conf, outputPath) == null)
        return;

    String taskID = conf.get("mapred.task.id", conf.get("mapreduce.task.id"));

    if (taskID == null) // need to stuff a fake id
    {
        String mapper = conf.getBoolean("mapred.task.is.map", conf.getBoolean("mapreduce.task.is.map", true)) ? "m" : "r";
        String value = String.format("attempt_%012d_0000_%s_000000_0",
            (int) Math.rint(System.currentTimeMillis()), mapper);

        conf.set("mapred.task.id", value);
        conf.set("mapreduce.task.id", value);
    }

    makeTempPath(conf);

    if (writeDirectlyToWorkingPath(conf, outputPath)) {
        LOG.info("writing directly to output path: {}", outputPath);
        setWorkOutputPath(conf, outputPath);
        return;
    }

    // "mapred.work.output.dir"
    Path taskOutputPath = getTaskOutputPath(conf);
    setWorkOutputPath(conf, taskOutputPath);
}
From source file:cascading.tap.hadoop.util.Hadoop18TapUtil.java
License:Open Source License
static void setWorkOutputPath(Configuration conf, Path outputDir) {
    outputDir = new Path(asJobConfInstance(conf).getWorkingDirectory(), outputDir);
    conf.set("mapred.work.output.dir", outputDir.toString());
}
From source file:cascading.tuple.hadoop.HadoopSerializationPlatformTest.java
License:Open Source License
@Test
public void testInputOutputSerialization() throws IOException {
    long time = System.currentTimeMillis();

    Configuration jobConf = new Configuration();

    jobConf.set("io.serializations",
        TestSerialization.class.getName() + "," + WritableSerialization.class.getName()); // disable/replace WritableSerialization class
    jobConf.set("cascading.serialization.tokens",
        "1000=" + BooleanWritable.class.getName() + ",10001=" + Text.class.getName()); // not using Text, just testing parsing

    TupleSerialization tupleSerialization = new TupleSerialization(jobConf);

    File file = new File(getOutputPath("serialization"));

    file.mkdirs();

    file = new File(file, "/test.bytes");

    TupleOutputStream output = new HadoopTupleOutputStream(new FileOutputStream(file, false),
        tupleSerialization.getElementWriter());

    for (int i = 0; i < 501; i++) // 501 is arbitrary
    {
        String aString = "string number " + i;
        double random = Math.random();

        output.writeTuple(new Tuple(i, aString, random, new TestText(aString),
            new Tuple("inner tuple", new BytesWritable("some string".getBytes())),
            new BytesWritable(Integer.toString(i).getBytes("UTF-8")), new BooleanWritable(false)));
    }

    output.close();

    assertEquals("wrong size", 89967L, file.length()); // just makes sure the file size doesnt change from expected

    TupleInputStream input = new HadoopTupleInputStream(new FileInputStream(file),
        tupleSerialization.getElementReader());

    int k = -1;

    for (int i = 0; i < 501; i++) {
        Tuple tuple = input.readTuple();
        int value = tuple.getInteger(0);

        assertTrue("wrong diff", value - k == 1);
        assertTrue("wrong type", tuple.getObject(3) instanceof TestText);
        assertTrue("wrong type", tuple.getObject(4) instanceof Tuple);
        assertTrue("wrong type", tuple.getObject(5) instanceof BytesWritable);

        byte[] bytes = ((BytesWritable) tuple.getObject(5)).getBytes();
        String string = new String(bytes, 0, bytes.length > 1 ? bytes.length - 1 : bytes.length, "UTF-8");
        assertEquals("wrong value", Integer.parseInt(string), i);

        assertTrue("wrong type", tuple.getObject(6) instanceof BooleanWritable);

        k = value;
    }

    input.close();

    System.out.println("time = " + (System.currentTimeMillis() - time));
}
From source file:cascading.tuple.hadoop.SpillableTupleHadoopTest.java
License:Open Source License
private void performListTest(int size, int threshold, CompressionCodec codec, int spills) {
    Configuration jobConf = new Configuration();

    jobConf.set("io.serializations",
        TestSerialization.class.getName() + "," + WritableSerialization.class.getName()); // disable/replace WritableSerialization class
    jobConf.set("cascading.serialization.tokens",
        "1000=" + BooleanWritable.class.getName() + ",10001=" + Text.class.getName()); // not using Text, just testing parsing

    HadoopSpillableTupleList list = new HadoopSpillableTupleList(threshold, codec, jobConf);

    for (int i = 0; i < size; i++) {
        String aString = "string number " + i;
        double random = Math.random();

        list.add(new Tuple(i, aString, random, new Text(aString), new TestText(aString),
            new Tuple("inner tuple", new BytesWritable(aString.getBytes()))));
    }

    assertEquals("not equal: list.size();", size, list.size());
    assertEquals("not equal: list.getNumFiles()", spills, list.spillCount());

    int i = -1;
    int count = 0;

    for (Tuple tuple : list) {
        int value = tuple.getInteger(0);

        assertTrue("wrong diff", value - i == 1);

        assertEquals("wrong value", "string number " + count, tuple.getObject(3).toString());
        assertEquals("wrong value", "string number " + count, tuple.getObject(4).toString());

        assertTrue("wrong type", tuple.getObject(5) instanceof Tuple);

        BytesWritable bytesWritable = (BytesWritable) ((Tuple) tuple.getObject(5)).getObject(1);
        byte[] bytes = bytesWritable.getBytes();
        String actual = new String(bytes, 0, bytesWritable.getLength());

        assertEquals("wrong value", "string number " + count, actual);

        i = value;
        count++;
    }

    assertEquals("not equal: list.size();", size, count);

    Iterator<Tuple> iterator = list.iterator();

    assertEquals("not equal: iterator.next().get(1)", "string number 0", iterator.next().getObject(1));
    assertEquals("not equal: iterator.next().get(1)", "string number 1", iterator.next().getObject(1));
}
From source file:cascading.tuple.hadoop.SpillableTupleHadoopTest.java
License:Open Source License
@Test
public void testSpillMapCompressed() {
    long time = System.currentTimeMillis();

    Configuration jobConf = new Configuration();

    jobConf.set(SpillableProps.SPILL_CODECS, "org.apache.hadoop.io.compress.GzipCodec");

    performMapTest(5, 5, 100, 20, jobConf);
    performMapTest(5, 50, 100, 20, jobConf);
    performMapTest(50, 5, 200, 20, jobConf);
    performMapTest(500, 50, 7000, 20, jobConf);

    System.out.println("time = " + (System.currentTimeMillis() - time));
}
From source file:cascading.tuple.hadoop.SpillableTupleHadoopTest.java
License:Open Source License
private void performMapTest(int numKeys, int listSize, int mapThreshold, int listThreshold, Configuration jobConf) {
    jobConf.set("io.serializations",
        TestSerialization.class.getName() + "," + WritableSerialization.class.getName()); // disable/replace WritableSerialization class
    jobConf.set("cascading.serialization.tokens",
        "1000=" + BooleanWritable.class.getName() + ",10001=" + Text.class.getName()); // not using Text, just testing parsing

    HadoopFlowProcess flowProcess = new HadoopFlowProcess(jobConf);

    HadoopSpillableTupleMap map = new HadoopSpillableTupleMap(SpillableProps.defaultMapInitialCapacity,
        SpillableProps.defaultMapLoadFactor, mapThreshold, listThreshold, flowProcess);

    Set<Integer> keySet = new HashSet<Integer>();
    Random gen = new Random(1);

    for (int i = 0; i < listSize * numKeys; i++) {
        String aString = "string number " + i;
        double random = Math.random();

        double keys = numKeys / 3.0;

        int key = (int) (gen.nextDouble() * keys + gen.nextDouble() * keys + gen.nextDouble() * keys);

        Tuple tuple = new Tuple(i, aString, random, new Text(aString), new TestText(aString),
            new Tuple("inner tuple", new BytesWritable(aString.getBytes())));

        map.get(new Tuple(key)).add(tuple);

        keySet.add(key);
    }

    // the list test above verifies the contents are being serialized, the Map is just a container of lists.
    assertEquals("not equal: map.size();", keySet.size(), map.size());
}
From source file:cgl.hadoop.apps.runner.DataAnalysis.java
License:Open Source License
/**
 * Launch the MapReduce computation.
 * This method first removes any previous working directories and creates a new one,
 * then copies the data (file names) to this new directory and launches the
 * MapReduce (map-only though) computation.
 * @param numReduceTasks - Number of reduce tasks = 0.
 * @param programDir - The directory where the Cap3 program is.
 * @param execName - Name of the executable.
 * @param dataDir - Directory where the data is located.
 * @param outputDir - Output directory to place the output.
 * @param cmdArgs - These are the command line arguments to the Cap3 program.
 * @throws Exception - Throws any exception that occurs in this program.
 */
void launch(int numReduceTasks, String programDir, String execName, String workingDir, String databaseArchive,
        String databaseName, String dataDir, String outputDir, String cmdArgs) throws Exception {

    Configuration conf = new Configuration();
    Job job = new Job(conf, execName);

    // First get the file system handler, delete any previous files, add the
    // files and write the data to it, then pass its name as a parameter to
    // job
    Path hdMainDir = new Path(outputDir);
    FileSystem fs = FileSystem.get(conf);
    fs.delete(hdMainDir, true);

    Path hdOutDir = new Path(hdMainDir, "out");

    // Starting the data analysis.
    Configuration jc = job.getConfiguration();

    jc.set(WORKING_DIR, workingDir);
    jc.set(EXECUTABLE, execName);
    jc.set(PROGRAM_DIR, programDir); // this the name of the executable archive
    jc.set(DB_ARCHIVE, databaseArchive);
    jc.set(DB_NAME, databaseName);
    jc.set(PARAMETERS, cmdArgs);
    jc.set(OUTPUT_DIR, outputDir);

    // using distributed cache
    // flush it
    //DistributedCache.releaseCache(new URI(programDir), jc);
    //DistributedCache.releaseCache(new URI(databaseArchive), jc);
    //DistributedCache.purgeCache(jc);

    // reput the data into cache
    long startTime = System.currentTimeMillis();
    //DistributedCache.addCacheArchive(new URI(databaseArchive), jc);
    DistributedCache.addCacheArchive(new URI(programDir), jc);
    System.out.println(
        "Add Distributed Cache in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    FileInputFormat.setInputPaths(job, dataDir);
    FileOutputFormat.setOutputPath(job, hdOutDir);

    job.setJarByClass(DataAnalysis.class);
    job.setMapperClass(RunnerMap.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);

    job.setInputFormatClass(DataFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setNumReduceTasks(numReduceTasks);

    startTime = System.currentTimeMillis();

    int exitStatus = job.waitForCompletion(true) ? 0 : 1;

    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    //clean the cache
    System.exit(exitStatus);
}
From source file:chapter7.src.InputDriver.java
License:Apache License
public static void runJob(Path input, Path output, String vectorClassName, Configuration config)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = config;
    conf.set("vector.implementation.class.name", vectorClassName);

    Job job = new Job(conf, "Input Driver running over input: " + input);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapperClass(InputMapper.class);
    job.setNumReduceTasks(0);
    job.setJarByClass(InputDriver.class);

    FileInputFormat.addInputPath(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.waitForCompletion(true);
}