List of usage examples for org.apache.hadoop.mapred.Reporter.NULL

Reporter.NULL is a no-op Reporter from the old mapred API. It is passed wherever a Reporter argument is required but no progress or counter reporting is needed, for example when driving mappers, record readers, or record writers outside a real MapReduce task (tests, benchmarks, sequential drivers), as the examples below show.
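All of the examples follow the same basic pattern: obtain a reader or writer from the old mapred API and hand it Reporter.NULL in place of a real task reporter. The following is a minimal sketch of that pattern, not taken from any of the source files below; the class name and the input-path argument are hypothetical, and it assumes the default TextInputFormat configured through JobConf.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class ReporterNullSketch {
    public static void main(String[] args) throws IOException {
        JobConf job = new JobConf();
        // hypothetical input path passed on the command line
        FileInputFormat.setInputPaths(job, new Path(args[0]));

        // default input format is TextInputFormat
        TextInputFormat input = (TextInputFormat) job.getInputFormat();
        InputSplit[] splits = input.getSplits(job, 1);

        // Reporter.NULL stands in for the task's Reporter when reading outside a job
        RecordReader<LongWritable, Text> reader = input.getRecordReader(splits[0], job, Reporter.NULL);
        LongWritable key = reader.createKey();
        Text value = reader.createValue();
        while (reader.next(key, value)) {
            // process (key, value) here
        }
        reader.close();
    }
}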
From source file:org.apache.mahout.df.mapred.partial.PartialSequentialBuilder.java
License:Apache License
@Override
protected void runJob(JobConf job) throws IOException {
    // retrieve the splits
    TextInputFormat input = (TextInputFormat) job.getInputFormat();
    InputSplit[] splits = input.getSplits(job, job.getNumMapTasks());
    log.debug("Nb splits : {}", splits.length);

    InputSplit[] sorted = Arrays.copyOf(splits, splits.length);
    Builder.sortSplits(sorted);

    int numTrees = Builder.getNbTrees(job); // total number of trees

    firstOutput = new PartialOutputCollector(numTrees);
    Reporter reporter = Reporter.NULL;

    firstIds = new int[splits.length];
    sizes = new int[splits.length];

    // to compute firstIds, process the splits in file order
    int firstId = 0;
    long slowest = 0; // duration of slowest map
    for (InputSplit split : splits) {
        int hp = ArrayUtils.indexOf(sorted, split); // hadoop's partition

        RecordReader<LongWritable, Text> reader = input.getRecordReader(split, job, reporter);

        LongWritable key = reader.createKey();
        Text value = reader.createValue();

        Step1Mapper mapper = new MockStep1Mapper(getTreeBuilder(), dataset, getSeed(), hp, splits.length, numTrees);

        long time = System.currentTimeMillis();

        firstIds[hp] = firstId;

        while (reader.next(key, value)) {
            mapper.map(key, value, firstOutput, reporter);
            firstId++;
            sizes[hp]++;
        }

        mapper.close();

        time = System.currentTimeMillis() - time;
        log.info("Duration : {}", DFUtils.elapsedTime(time));

        if (time > slowest) {
            slowest = time;
        }
    }

    log.info("Longest duration : {}", DFUtils.elapsedTime(slowest));
}
From source file:org.apache.mahout.df.mapred.partial.PartialSequentialBuilder.java
License:Apache License
/**
 * The second step uses the trees to predict the rest of the instances outside
 * their own partition
 *
 * @throws IOException
 */
void secondStep(JobConf job, Path forestPath, PredictionCallback callback) throws IOException {
    // retrieve the splits
    TextInputFormat input = (TextInputFormat) job.getInputFormat();
    InputSplit[] splits = input.getSplits(job, job.getNumMapTasks());
    log.debug("Nb splits : {}", splits.length);

    Builder.sortSplits(splits);

    int numTrees = Builder.getNbTrees(job); // total number of trees

    // compute the expected number of outputs
    int total = 0;
    for (int p = 0; p < splits.length; p++) {
        total += Step2Mapper.nbConcerned(splits.length, numTrees, p);
    }

    secondOutput = new PartialOutputCollector(total);
    Reporter reporter = Reporter.NULL;
    long slowest = 0; // duration of slowest map

    for (int partition = 0; partition < splits.length; partition++) {
        InputSplit split = splits[partition];
        RecordReader<LongWritable, Text> reader = input.getRecordReader(split, job, reporter);

        LongWritable key = reader.createKey();
        Text value = reader.createValue();

        // load the output of the 1st step
        int nbConcerned = Step2Mapper.nbConcerned(splits.length, numTrees, partition);
        TreeID[] fsKeys = new TreeID[nbConcerned];
        Node[] fsTrees = new Node[nbConcerned];

        FileSystem fs = forestPath.getFileSystem(job);
        int numInstances = InterResults.load(fs, forestPath, splits.length, numTrees, partition, fsKeys, fsTrees);

        Step2Mapper mapper = new Step2Mapper();
        mapper.configure(partition, dataset, fsKeys, fsTrees, numInstances);

        long time = System.currentTimeMillis();

        while (reader.next(key, value)) {
            mapper.map(key, value, secondOutput, reporter);
        }

        mapper.close();

        time = System.currentTimeMillis() - time;
        log.info("Duration : {}", DFUtils.elapsedTime(time));

        if (time > slowest) {
            slowest = time;
        }
    }

    log.info("Longest duration : {}", DFUtils.elapsedTime(slowest));
}
From source file:org.apache.mahout.df.mapred.partial.Step0JobTest.java
License:Apache License
public void testStep0Mapper() throws Exception {
    Random rng = RandomUtils.getRandom();

    // create a dataset large enough to be split up
    String descriptor = Utils.randomDescriptor(rng, numAttributes);
    double[][] source = Utils.randomDoubles(rng, descriptor, numInstances);
    String[] sData = Utils.double2String(source);

    // write the data to a file
    Path dataPath = Utils.writeDataToTestFile(sData);

    JobConf job = new JobConf();
    job.setNumMapTasks(numMaps);
    FileInputFormat.setInputPaths(job, dataPath);

    // retrieve the splits
    TextInputFormat input = (TextInputFormat) job.getInputFormat();
    InputSplit[] splits = input.getSplits(job, numMaps);

    InputSplit[] sorted = Arrays.copyOf(splits, splits.length);
    Builder.sortSplits(sorted);

    Step0OutputCollector collector = new Step0OutputCollector(numMaps);
    Reporter reporter = Reporter.NULL;

    for (int p = 0; p < numMaps; p++) {
        InputSplit split = sorted[p];
        RecordReader<LongWritable, Text> reader = input.getRecordReader(split, job, reporter);

        LongWritable key = reader.createKey();
        Text value = reader.createValue();

        Step0Mapper mapper = new Step0Mapper();
        mapper.configure(p);

        Long firstKey = null;
        int size = 0;

        while (reader.next(key, value)) {
            if (firstKey == null) {
                firstKey = key.get();
            }

            mapper.map(key, value, collector, reporter);

            size++;
        }

        mapper.close();

        // validate the mapper's output
        assertEquals(p, collector.keys[p]);
        assertEquals(firstKey.longValue(), collector.values[p].getFirstId());
        assertEquals(size, collector.values[p].getSize());
    }
}
From source file:org.apache.mahout.df.mapred.partial.Step0JobTest.java
License:Apache License
public void testProcessOutput() throws Exception {
    Random rng = RandomUtils.getRandom();

    // create a dataset large enough to be split up
    String descriptor = Utils.randomDescriptor(rng, numAttributes);
    double[][] source = Utils.randomDoubles(rng, descriptor, numInstances);

    // each instance label is its index in the dataset
    int labelId = Utils.findLabel(descriptor);
    for (int index = 0; index < numInstances; index++) {
        source[index][labelId] = index;
    }

    String[] sData = Utils.double2String(source);

    // write the data to a file
    Path dataPath = Utils.writeDataToTestFile(sData);

    // prepare a data converter
    Dataset dataset = DataLoader.generateDataset(descriptor, sData);
    DataConverter converter = new DataConverter(dataset);

    JobConf job = new JobConf();
    job.setNumMapTasks(numMaps);
    FileInputFormat.setInputPaths(job, dataPath);

    // retrieve the splits
    TextInputFormat input = (TextInputFormat) job.getInputFormat();
    InputSplit[] splits = input.getSplits(job, numMaps);

    InputSplit[] sorted = Arrays.copyOf(splits, splits.length);
    Builder.sortSplits(sorted);

    Reporter reporter = Reporter.NULL;

    int[] keys = new int[numMaps];
    Step0Output[] values = new Step0Output[numMaps];

    int[] expectedIds = new int[numMaps];

    for (int p = 0; p < numMaps; p++) {
        InputSplit split = sorted[p];
        RecordReader<LongWritable, Text> reader = input.getRecordReader(split, job, reporter);

        LongWritable key = reader.createKey();
        Text value = reader.createValue();

        Long firstKey = null;
        int size = 0;

        while (reader.next(key, value)) {
            if (firstKey == null) {
                firstKey = key.get();
                expectedIds[p] = converter.convert(0, value.toString()).label;
            }

            size++;
        }

        keys[p] = p;
        values[p] = new Step0Output(firstKey, size);
    }

    Step0Output[] partitions = Step0Job.processOutput(keys, values);

    int[] actualIds = Step0Output.extractFirstIds(partitions);

    assertTrue("Expected: " + Arrays.toString(expectedIds) + " But was: " + Arrays.toString(actualIds),
            Arrays.equals(expectedIds, actualIds));
}
From source file:org.apache.mahout.df.mapred.partial.Step1MapperTest.java
License:Apache License
public void testMapper() throws Exception {
    Long seed = null;
    Random rng = RandomUtils.getRandom();

    // prepare the data
    String descriptor = Utils.randomDescriptor(rng, nbAttributes);
    double[][] source = Utils.randomDoubles(rng, descriptor, nbInstances);
    String[] sData = Utils.double2String(source);
    Dataset dataset = DataLoader.generateDataset(descriptor, sData);
    String[][] splits = Utils.splitData(sData, nbMappers);

    MockTreeBuilder treeBuilder = new MockTreeBuilder();

    LongWritable key = new LongWritable();
    Text value = new Text();

    int treeIndex = 0;

    for (int partition = 0; partition < nbMappers; partition++) {
        String[] split = splits[partition];
        treeBuilder.setExpected(DataLoader.loadData(dataset, split));

        // expected number of trees that this mapper will build
        int mapNbTrees = Step1Mapper.nbTrees(nbMappers, nbTrees, partition);

        PartialOutputCollector output = new PartialOutputCollector(mapNbTrees);

        MockStep1Mapper mapper = new MockStep1Mapper(treeBuilder, dataset, seed, partition, nbMappers, nbTrees);

        // make sure the mapper computed firstTreeId correctly
        assertEquals(treeIndex, mapper.getFirstTreeId());

        for (int index = 0; index < split.length; index++) {
            key.set(index);
            value.set(split[index]);
            mapper.map(key, value, output, Reporter.NULL);
        }

        mapper.close();

        // make sure the mapper built all its trees
        assertEquals(mapNbTrees, output.nbOutputs());

        // check the returned keys
        for (TreeID k : output.getKeys()) {
            assertEquals(partition, k.partition());
            assertEquals(treeIndex, k.treeId());

            treeIndex++;
        }
    }
}
From source file:org.apache.mahout.df.mapred.partial.Step2MapperTest.java
License:Apache License
public void testMapper() throws Exception {
    Random rng = RandomUtils.getRandom();

    // prepare the data
    String descriptor = Utils.randomDescriptor(rng, nbAttributes);
    double[][] source = Utils.randomDoubles(rng, descriptor, nbInstances);
    String[] sData = Utils.double2String(source);
    Dataset dataset = DataLoader.generateDataset(descriptor, sData);
    String[][] splits = Utils.splitData(sData, nbMappers);

    // prepare first step output
    TreeID[] keys = new TreeID[nbTrees];
    Node[] trees = new Node[nbTrees];
    int[] sizes = new int[nbMappers];

    int treeIndex = 0;
    for (int partition = 0; partition < nbMappers; partition++) {
        int nbMapTrees = Step1Mapper.nbTrees(nbMappers, nbTrees, partition);

        for (int tree = 0; tree < nbMapTrees; tree++, treeIndex++) {
            keys[treeIndex] = new TreeID(partition, treeIndex);
            // put the partition in the leaf's label
            // this way we can track the outputs
            trees[treeIndex] = new Leaf(partition);
        }

        sizes[partition] = splits[partition].length;
    }

    // store the first step outputs in a file
    FileSystem fs = FileSystem.getLocal(new Configuration());
    Path forestPath = new Path("testdata/Step2MapperTest.forest");
    InterResults.store(fs, forestPath, keys, trees, sizes);

    LongWritable key = new LongWritable();
    Text value = new Text();

    for (int partition = 0; partition < nbMappers; partition++) {
        String[] split = splits[partition];

        // number of trees that will be handled by the mapper
        int nbConcerned = Step2Mapper.nbConcerned(nbMappers, nbTrees, partition);

        PartialOutputCollector output = new PartialOutputCollector(nbConcerned);

        // load the current mapper's (key, tree) pairs
        TreeID[] curKeys = new TreeID[nbConcerned];
        Node[] curTrees = new Node[nbConcerned];
        InterResults.load(fs, forestPath, nbMappers, nbTrees, partition, curKeys, curTrees);

        // simulate the job
        MockStep2Mapper mapper = new MockStep2Mapper(partition, dataset, curKeys, curTrees, split.length);

        for (int index = 0; index < split.length; index++) {
            key.set(index);
            value.set(split[index]);
            mapper.map(key, value, output, Reporter.NULL);
        }

        mapper.close();

        // make sure the mapper did not return its own trees
        assertEquals(nbConcerned, output.nbOutputs());

        // check the returned results
        int current = 0;
        for (int index = 0; index < nbTrees; index++) {
            if (keys[index].partition() == partition) {
                // should not be part of the results
                continue;
            }

            TreeID k = output.getKeys()[current];

            // the tree should receive the partition's index
            assertEquals(partition, k.partition());

            // make sure all the trees of the other partitions are handled in the
            // correct order
            assertEquals(index, k.treeId());

            int[] predictions = output.getValues()[current].getPredictions();

            // all the instances of the partition should be classified
            assertEquals(split.length, predictions.length);
            assertEquals("at least one instance of the partition was not classified", -1,
                    ArrayUtils.indexOf(predictions, -1));

            // the tree must not belong to the mapper's partition
            int treePartition = predictions[0];
            assertFalse("Step2Mapper returned a tree from its own partition", partition == treePartition);

            current++;
        }
    }
}
From source file:org.apache.orc.bench.ColumnProjectionBenchmark.java
License:Apache License
@Benchmark
public void parquet(ExtraCounters counters) throws Exception {
    JobConf conf = new JobConf();
    conf.set("fs.track.impl", TrackingLocalFileSystem.class.getName());
    conf.set("fs.defaultFS", "track:///");
    if ("taxi".equals(dataset)) {
        conf.set("columns", "vendor_id,pickup_time");
        conf.set("columns.types", "int,timestamp");
    } else if ("sales".equals(dataset)) {
        conf.set("columns", "sales_id,customer_id");
        conf.set("columns.types", "bigint,bigint");
    } else if ("github".equals(dataset)) {
        conf.set("columns", "actor,created_at");
        conf.set("columns.types", "struct<avatar_url:string,gravatar_id:string,"
                + "id:int,login:string,url:string>,timestamp");
    } else {
        throw new IllegalArgumentException("Unknown data set " + dataset);
    }
    Path path = Utilities.getVariant(root, dataset, "parquet", compression);
    FileSystem.Statistics statistics = FileSystem.getStatistics("track:///", TrackingLocalFileSystem.class);
    statistics.reset();
    ParquetInputFormat<ArrayWritable> inputFormat = new ParquetInputFormat<>(DataWritableReadSupport.class);

    NullWritable nada = NullWritable.get();
    FileSplit split = new FileSplit(path, 0, Long.MAX_VALUE, new String[] {});
    org.apache.hadoop.mapred.RecordReader<NullWritable, ArrayWritable> recordReader =
            new ParquetRecordReaderWrapper(inputFormat, split, conf, Reporter.NULL);
    ArrayWritable value = recordReader.createValue();
    while (recordReader.next(nada, value)) {
        counters.records += 1;
    }
    recordReader.close();
    counters.bytesRead += statistics.getBytesRead();
    counters.reads += statistics.getReadOps();
    counters.invocations += 1;
}
From source file:org.apache.orc.bench.convert.parquet.ParquetReader.java
License:Apache License
public ParquetReader(Path path, TypeDescription schema, Configuration conf) throws IOException {
    FileSplit split = new FileSplit(path, 0, Long.MAX_VALUE, new String[] {});
    JobConf jobConf = new JobConf(conf);
    reader = new MapredParquetInputFormat().getRecordReader(split, jobConf, Reporter.NULL);
    value = reader.createValue();
    converters = new Converter[schema.getChildren().size()];
    List<TypeDescription> children = schema.getChildren();
    for (int c = 0; c < converters.length; ++c) {
        converters[c] = createConverter(children.get(c));
    }
}
From source file:org.apache.orc.bench.convert.parquet.ParquetWriter.java
License:Apache License
public ParquetWriter(Path path, TypeDescription schema, Configuration conf, CompressionKind compression)
        throws IOException {
    JobConf jobConf = new JobConf(conf);
    Properties tableProperties = Utilities.convertSchemaToHiveConfig(schema);
    this.schema = schema;
    jobConf.set(ParquetOutputFormat.COMPRESSION, getCodec(compression).name());
    writer = new MapredParquetOutputFormat().getHiveRecordWriter(jobConf, path, ParquetHiveRecord.class,
            compression != CompressionKind.NONE, tableProperties, Reporter.NULL);
    record = new ParquetHiveRecord(null, OrcBenchmarkUtilities.createObjectInspector(schema));
}
From source file:org.apache.orc.bench.FullReadBenchmark.java
License:Apache License
@Benchmark
public void parquet(ExtraCounters counters) throws Exception {
    JobConf conf = new JobConf();
    conf.set("fs.track.impl", TrackingLocalFileSystem.class.getName());
    conf.set("fs.defaultFS", "track:///");
    Path path = Utilities.getVariant(root, dataset, "parquet", compression);
    FileSystem.Statistics statistics = FileSystem.getStatistics("track:///", TrackingLocalFileSystem.class);
    statistics.reset();
    ParquetInputFormat<ArrayWritable> inputFormat = new ParquetInputFormat<>(DataWritableReadSupport.class);

    NullWritable nada = NullWritable.get();
    FileSplit split = new FileSplit(path, 0, Long.MAX_VALUE, new String[] {});
    org.apache.hadoop.mapred.RecordReader<NullWritable, ArrayWritable> recordReader =
            new ParquetRecordReaderWrapper(inputFormat, split, conf, Reporter.NULL);
    ArrayWritable value = recordReader.createValue();
    while (recordReader.next(nada, value)) {
        counters.records += 1;
    }
    recordReader.close();
    counters.bytesRead += statistics.getBytesRead();
    counters.reads += statistics.getReadOps();
    counters.invocations += 1;
}