List of usage examples for org.apache.hadoop.mapred.FileInputFormat.setInputPaths
public static void setInputPaths(JobConf conf, Path... inputPaths)
From source file:org.apache.avro.mapred.DelegatingInputFormat.java
License:Apache License
public InputSplit[] getSplits(JobConf conf, int numSplits) throws IOException { JobConf confCopy = new JobConf(conf); List<InputSplit> splits = new ArrayList<InputSplit>(); Map<Path, Class<? extends AvroMapper>> mapperMap = AvroMultipleInputs.getMapperTypeMap(conf); Map<Path, Schema> schemaMap = AvroMultipleInputs.getInputSchemaMap(conf); Map<Schema, List<Path>> schemaPaths = new HashMap<Schema, List<Path>>(); // First, build a map of Schemas to Paths for (Entry<Path, Schema> entry : schemaMap.entrySet()) { if (!schemaPaths.containsKey(entry.getValue())) { schemaPaths.put(entry.getValue(), new LinkedList<Path>()); System.out.println(entry.getValue()); System.out.println(entry.getKey()); }//from www . j ava2 s . c o m schemaPaths.get(entry.getValue()).add(entry.getKey()); } for (Entry<Schema, List<Path>> schemaEntry : schemaPaths.entrySet()) { Schema schema = schemaEntry.getKey(); System.out.println(schema); InputFormat format = (InputFormat) ReflectionUtils.newInstance(AvroInputFormat.class, conf); List<Path> paths = schemaEntry.getValue(); Map<Class<? extends AvroMapper>, List<Path>> mapperPaths = new HashMap<Class<? extends AvroMapper>, List<Path>>(); // Now, for each set of paths that have a common Schema, build // a map of Mappers to the paths they're used for for (Path path : paths) { Class<? extends AvroMapper> mapperClass = mapperMap.get(path); if (!mapperPaths.containsKey(mapperClass)) { mapperPaths.put(mapperClass, new LinkedList<Path>()); } mapperPaths.get(mapperClass).add(path); } // Now each set of paths that has a common InputFormat and Mapper can // be added to the same job, and split together. for (Entry<Class<? extends AvroMapper>, List<Path>> mapEntry : mapperPaths.entrySet()) { paths = mapEntry.getValue(); Class<? extends AvroMapper> mapperClass = mapEntry.getKey(); if (mapperClass == null) { mapperClass = (Class<? 
extends AvroMapper>) conf.getMapperClass(); } FileInputFormat.setInputPaths(confCopy, paths.toArray(new Path[paths.size()])); // Get splits for each input path and tag with InputFormat // and Mapper types by wrapping in a TaggedInputSplit. InputSplit[] pathSplits = format.getSplits(confCopy, numSplits); for (InputSplit pathSplit : pathSplits) { splits.add(new TaggedInputSplit(pathSplit, conf, format.getClass(), mapperClass, schema)); } } } return splits.toArray(new InputSplit[splits.size()]); }
From source file:org.apache.avro.mapred.TestAvroInputFormat.java
License:Apache License
/**
 * Checks which files {@code AvroInputFormat.listStatus} reports for a directory holding one
 * {@code .avro} file and one {@code .txt} file: only the Avro file with the configuration as
 * provided, and both files once {@code IGNORE_FILES_WITHOUT_EXTENSION_KEY} is set to false.
 */
@SuppressWarnings("rawtypes")
@Test
public void testIgnoreFilesWithoutExtension() throws Exception {
    // Fixture: an input directory with one Avro-named file and one text-named file.
    fs.mkdirs(inputDir);
    Path avroPath = new Path(inputDir, "somefile.avro");
    Path txtPath = new Path(inputDir, "someotherfile.txt");
    fs.create(avroPath).close();
    fs.create(txtPath).close();

    FileInputFormat.setInputPaths(conf, inputDir);
    AvroInputFormat format = new AvroInputFormat();

    // With the configuration as provided, only the .avro file is expected.
    FileStatus[] filtered = format.listStatus(conf);
    Assert.assertEquals(1, filtered.length);
    Assert.assertEquals("somefile.avro", filtered[0].getPath().getName());

    // With the filter switched off, both files are expected, in any order.
    conf.setBoolean(AvroInputFormat.IGNORE_FILES_WITHOUT_EXTENSION_KEY, false);
    FileStatus[] unfiltered = format.listStatus(conf);
    Assert.assertEquals(2, unfiltered.length);

    Set<String> fileNames = new HashSet<String>();
    for (FileStatus status : unfiltered) {
        fileNames.add(status.getPath().getName());
    }
    Assert.assertTrue(fileNames.contains("somefile.avro"));
    Assert.assertTrue(fileNames.contains("someotherfile.txt"));
}
From source file:org.apache.avro.mapred.TestAvroMultipleOutputs.java
License:Apache License
@SuppressWarnings("deprecation") public void testJob() throws Exception { JobConf job = new JobConf(); // private static final String UTF8 = "UTF-8"; String dir = System.getProperty("test.dir", ".") + "/mapred"; Path outputPath = new Path(dir + "/out"); outputPath.getFileSystem(job).delete(outputPath); WordCountUtil.writeLinesFile();/*from ww w . j a v a2 s. c om*/ job.setJobName("AvroMultipleOutputs"); AvroJob.setInputSchema(job, Schema.create(Schema.Type.STRING)); AvroJob.setOutputSchema(job, new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema()); AvroJob.setMapperClass(job, MapImpl.class); AvroJob.setReducerClass(job, ReduceImpl.class); FileInputFormat.setInputPaths(job, new Path(dir + "/in")); FileOutputFormat.setOutputPath(job, outputPath); FileOutputFormat.setCompressOutput(job, false); AvroMultipleOutputs.addNamedOutput(job, "myavro", AvroOutputFormat.class, new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema()); AvroMultipleOutputs.addNamedOutput(job, "myavro1", AvroOutputFormat.class, Schema.create(Schema.Type.STRING)); AvroMultipleOutputs.addNamedOutput(job, "myavro2", AvroOutputFormat.class, Schema.create(Schema.Type.STRING)); WordCountUtil.setMeta(job); JobClient.runJob(job); WordCountUtil.validateCountsFile(); }
From source file:org.apache.avro.mapred.TestAvroMultipleOutputs.java
License:Apache License
@SuppressWarnings("deprecation") public void testJob_noreducer() throws Exception { JobConf job = new JobConf(); job.setNumReduceTasks(0);//from w ww . j av a 2s .co m // private static final String UTF8 = "UTF-8"; String dir = System.getProperty("test.dir", ".") + "/mapred"; Path outputPath = new Path(dir + "/out"); outputPath.getFileSystem(job).delete(outputPath); WordCountUtil.writeLinesFile(); job.setJobName("AvroMultipleOutputs_noreducer"); AvroJob.setInputSchema(job, Schema.create(Schema.Type.STRING)); AvroJob.setOutputSchema(job, new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema()); AvroJob.setMapperClass(job, MapImpl.class); FileInputFormat.setInputPaths(job, new Path(dir + "/in")); FileOutputFormat.setOutputPath(job, outputPath); FileOutputFormat.setCompressOutput(job, false); AvroMultipleOutputs.addNamedOutput(job, "myavro2", AvroOutputFormat.class, Schema.create(Schema.Type.STRING)); JobClient.runJob(job); }
From source file:org.apache.avro.mapred.TestAvroTextSort.java
License:Apache License
@Test /**/*from w w w . j a v a2 s . com*/ * Run the identity job on a "bytes" Avro file using AvroAsTextInputFormat * and AvroTextOutputFormat to produce a sorted "bytes" Avro file. */ public void testSort() throws Exception { JobConf job = new JobConf(); String dir = System.getProperty("test.dir", ".") + "/mapred"; Path outputPath = new Path(dir + "/out"); outputPath.getFileSystem(job).delete(outputPath); WordCountUtil.writeLinesBytesFile(); job.setInputFormat(AvroAsTextInputFormat.class); job.setOutputFormat(AvroTextOutputFormat.class); job.setOutputKeyClass(Text.class); FileInputFormat.setInputPaths(job, new Path(dir + "/in")); FileOutputFormat.setOutputPath(job, outputPath); JobClient.runJob(job); WordCountUtil.validateSortedFile(); }
From source file:org.apache.avro.mapred.TestGenericJob.java
License:Apache License
@Test public void testJob() throws Exception { JobConf job = new JobConf(); Path outputPath = new Path(dir + "/out"); outputPath.getFileSystem(job).delete(outputPath); job.setInputFormat(TextInputFormat.class); FileInputFormat.setInputPaths(job, dir + "/in"); job.setMapperClass(AvroTestConverter.class); job.setNumReduceTasks(0);//from www . ja v a 2 s. c om FileOutputFormat.setOutputPath(job, outputPath); System.out.println(createSchema()); AvroJob.setOutputSchema(job, Pair.getPairSchema(Schema.create(Schema.Type.LONG), createSchema())); job.setOutputFormat(AvroOutputFormat.class); JobClient.runJob(job); }
From source file:org.apache.avro.mapred.TestReflectJob.java
License:Apache License
@Test @SuppressWarnings("deprecation") public void testJob() throws Exception { JobConf job = new JobConf(); String dir = System.getProperty("test.dir", ".") + "target/testReflectJob"; Path inputPath = new Path(dir + "/in"); Path outputPath = new Path(dir + "/out"); outputPath.getFileSystem(job).delete(outputPath); inputPath.getFileSystem(job).delete(inputPath); writeLinesFile(new File(dir + "/in")); job.setJobName("reflect"); AvroJob.setInputSchema(job, ReflectData.get().getSchema(Text.class)); AvroJob.setMapOutputSchema(job, new Pair(new Text(""), new Count(0L)).getSchema()); AvroJob.setOutputSchema(job, ReflectData.get().getSchema(WordCount.class)); AvroJob.setMapperClass(job, MapImpl.class); //AvroJob.setCombinerClass(job, ReduceImpl.class); AvroJob.setReducerClass(job, ReduceImpl.class); FileInputFormat.setInputPaths(job, inputPath); FileOutputFormat.setOutputPath(job, outputPath); AvroJob.setReflect(job); // use reflection JobClient.runJob(job);//from w ww. j av a 2 s. c o m validateCountsFile(new File(new File(dir, "out"), "part-00000.avro")); }
From source file:org.apache.avro.mapred.TestSequenceFileReader.java
License:Apache License
@Test public void testSequenceFileInputFormat() throws Exception { JobConf job = new JobConf(); Path output = new Path(System.getProperty("test.dir", ".") + "/seq-out"); output.getFileSystem(job).delete(output); // configure input for Avro from sequence file AvroJob.setInputSequenceFile(job);/* ww w . j a v a 2s.co m*/ FileInputFormat.setInputPaths(job, FILE.toURI().toString()); AvroJob.setInputSchema(job, SCHEMA); // mapper is default, identity // reducer is default, identity // configure output for avro AvroJob.setOutputSchema(job, SCHEMA); FileOutputFormat.setOutputPath(job, output); JobClient.runJob(job); checkFile(new DataFileReader<Pair<Long, CharSequence>>(new File(output.toString() + "/part-00000.avro"), new SpecificDatumReader<Pair<Long, CharSequence>>())); }
From source file:org.apache.avro.mapred.TestSequenceFileReader.java
License:Apache License
@Test public void testNonAvroMapper() throws Exception { JobConf job = new JobConf(); Path output = new Path(System.getProperty("test.dir", ".") + "/seq-out"); output.getFileSystem(job).delete(output); // configure input for non-Avro sequence file job.setInputFormat(SequenceFileInputFormat.class); FileInputFormat.setInputPaths(job, FILE.toURI().toString()); // use a hadoop mapper that emits Avro output job.setMapperClass(NonAvroMapper.class); // reducer is default, identity // configure output for avro FileOutputFormat.setOutputPath(job, output); AvroJob.setOutputSchema(job, SCHEMA); JobClient.runJob(job);/* ww w. j a v a2 s. c o m*/ checkFile(new DataFileReader<Pair<Long, CharSequence>>(new File(output.toString() + "/part-00000.avro"), new SpecificDatumReader<Pair<Long, CharSequence>>())); }
From source file:org.apache.avro.mapred.TestSequenceFileReader.java
License:Apache License
@Test public void testNonAvroMapOnly() throws Exception { JobConf job = new JobConf(); Path output = new Path(System.getProperty("test.dir", ".") + "/seq-out"); output.getFileSystem(job).delete(output); // configure input for non-Avro sequence file job.setInputFormat(SequenceFileInputFormat.class); FileInputFormat.setInputPaths(job, FILE.toURI().toString()); // use a hadoop mapper that emits Avro output job.setMapperClass(NonAvroOnlyMapper.class); // configure output for avro job.setNumReduceTasks(0); // map-only FileOutputFormat.setOutputPath(job, output); AvroJob.setOutputSchema(job, SCHEMA); JobClient.runJob(job);/*w w w.j a v a2s . com*/ checkFile(new DataFileReader<Pair<Long, CharSequence>>(new File(output.toString() + "/part-00000.avro"), new SpecificDatumReader<Pair<Long, CharSequence>>())); }