Example usage for org.apache.hadoop.mapred FileInputFormat setInputPaths

Introduction

On this page you can find example usages of org.apache.hadoop.mapred.FileInputFormat.setInputPaths.

Prototype

public static void setInputPaths(JobConf conf, Path... inputPaths)

Source Link

Document

Sets the array of Paths as the list of inputs for the map-reduce job.

Usage

From source file:org.apache.avro.mapred.DelegatingInputFormat.java

License:Apache License

/**
 * Computes the splits for a job whose inputs were registered through
 * {@code AvroMultipleInputs}.
 *
 * <p>The configured input paths are grouped by schema and then, within each
 * schema, by mapper class. Each group is handed to an {@code AvroInputFormat}
 * instance for splitting, and every split it returns is wrapped in a
 * {@code TaggedInputSplit} that records the input format class, the mapper
 * class and the schema.
 *
 * @param conf      job configuration; the path-to-mapper and path-to-schema
 *                  maps are read from it, and it is passed to each tagged split
 * @param numSplits split-count hint forwarded unchanged to the delegate
 *                  input format
 * @return all tagged splits produced for every (schema, mapper) group
 * @throws IOException if the delegate input format fails to compute splits
 */
public InputSplit[] getSplits(JobConf conf, int numSplits) throws IOException {

    // Input paths are set on a copy so the caller's configuration is not modified.
    JobConf confCopy = new JobConf(conf);
    List<InputSplit> splits = new ArrayList<InputSplit>();

    Map<Path, Class<? extends AvroMapper>> mapperMap = AvroMultipleInputs.getMapperTypeMap(conf);
    Map<Path, Schema> schemaMap = AvroMultipleInputs.getInputSchemaMap(conf);
    Map<Schema, List<Path>> schemaPaths = new HashMap<Schema, List<Path>>();

    // First, build a map of Schemas to Paths
    for (Entry<Path, Schema> entry : schemaMap.entrySet()) {
        if (!schemaPaths.containsKey(entry.getValue())) {
            schemaPaths.put(entry.getValue(), new LinkedList<Path>());
        }

        schemaPaths.get(entry.getValue()).add(entry.getKey());
    }

    for (Entry<Schema, List<Path>> schemaEntry : schemaPaths.entrySet()) {
        Schema schema = schemaEntry.getKey();
        InputFormat format = (InputFormat) ReflectionUtils.newInstance(AvroInputFormat.class, conf);
        List<Path> paths = schemaEntry.getValue();

        Map<Class<? extends AvroMapper>, List<Path>> mapperPaths = new HashMap<Class<? extends AvroMapper>, List<Path>>();

        // Now, for each set of paths that have a common Schema, build
        // a map of Mappers to the paths they're used for
        for (Path path : paths) {
            // May be null when no mapper is registered for this path; HashMap
            // accepts the null key and the fallback is applied further down.
            Class<? extends AvroMapper> mapperClass = mapperMap.get(path);
            if (!mapperPaths.containsKey(mapperClass)) {
                mapperPaths.put(mapperClass, new LinkedList<Path>());
            }

            mapperPaths.get(mapperClass).add(path);
        }

        // Now each set of paths that has a common InputFormat and Mapper can
        // be added to the same job, and split together.
        for (Entry<Class<? extends AvroMapper>, List<Path>> mapEntry : mapperPaths.entrySet()) {
            paths = mapEntry.getValue();
            Class<? extends AvroMapper> mapperClass = mapEntry.getKey();

            // Fall back to the job-wide mapper when the path has none of its own.
            if (mapperClass == null) {
                mapperClass = (Class<? extends AvroMapper>) conf.getMapperClass();
            }

            FileInputFormat.setInputPaths(confCopy, paths.toArray(new Path[paths.size()]));

            // Get splits for each input path and tag with InputFormat
            // and Mapper types by wrapping in a TaggedInputSplit.
            InputSplit[] pathSplits = format.getSplits(confCopy, numSplits);
            for (InputSplit pathSplit : pathSplits) {
                splits.add(new TaggedInputSplit(pathSplit, conf, format.getClass(), mapperClass, schema));
            }
        }
    }

    return splits.toArray(new InputSplit[splits.size()]);
}

From source file:org.apache.avro.mapred.TestAvroInputFormat.java

License:Apache License

/**
 * Creates one {@code .avro} file and one {@code .txt} file in the input
 * directory and checks what {@code AvroInputFormat.listStatus} reports:
 * only the {@code .avro} file at first, then both files once
 * {@code IGNORE_FILES_WITHOUT_EXTENSION_KEY} has been set to {@code false}.
 */
@SuppressWarnings("rawtypes")
@Test
public void testIgnoreFilesWithoutExtension() throws Exception {
    fs.mkdirs(inputDir);

    // Two empty files: one with the Avro extension, one without.
    final Path avroPath = new Path(inputDir, "somefile.avro");
    final Path textPath = new Path(inputDir, "someotherfile.txt");
    fs.create(avroPath).close();
    fs.create(textPath).close();

    FileInputFormat.setInputPaths(conf, inputDir);
    final AvroInputFormat format = new AvroInputFormat();

    // Before the flag is touched only the .avro file is expected.
    FileStatus[] listed = format.listStatus(conf);
    Assert.assertEquals(1, listed.length);
    Assert.assertEquals("somefile.avro", listed[0].getPath().getName());

    // With the flag switched off both files must show up, in any order.
    conf.setBoolean(AvroInputFormat.IGNORE_FILES_WITHOUT_EXTENSION_KEY, false);
    listed = format.listStatus(conf);
    Assert.assertEquals(2, listed.length);

    final Set<String> listedNames = new HashSet<String>();
    for (FileStatus status : listed) {
        listedNames.add(status.getPath().getName());
    }
    Assert.assertTrue(listedNames.contains("somefile.avro"));
    Assert.assertTrue(listedNames.contains("someotherfile.txt"));
}

From source file:org.apache.avro.mapred.TestAvroMultipleOutputs.java

License:Apache License

/**
 * Runs a map-reduce job with {@code MapImpl} and {@code ReduceImpl} that has
 * three named Avro outputs registered ("myavro", "myavro1", "myavro2"), then
 * validates the counts file through {@code WordCountUtil}.
 *
 * @throws Exception if preparing the input, running the job or validating
 *                   the output fails
 */
@SuppressWarnings("deprecation")
public void testJob() throws Exception {
    JobConf job = new JobConf();

    String dir = System.getProperty("test.dir", ".") + "/mapred";
    Path outputPath = new Path(dir + "/out");

    // Clear any previous output. The recursive flag is passed explicitly
    // because the one-argument FileSystem.delete(Path) is deprecated.
    outputPath.getFileSystem(job).delete(outputPath, true);
    WordCountUtil.writeLinesFile();

    job.setJobName("AvroMultipleOutputs");

    AvroJob.setInputSchema(job, Schema.create(Schema.Type.STRING));
    AvroJob.setOutputSchema(job, new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema());

    AvroJob.setMapperClass(job, MapImpl.class);
    AvroJob.setReducerClass(job, ReduceImpl.class);

    FileInputFormat.setInputPaths(job, new Path(dir + "/in"));
    FileOutputFormat.setOutputPath(job, outputPath);
    FileOutputFormat.setCompressOutput(job, false);

    // Named outputs: one with the (Utf8, Long) pair schema, two with a plain string schema.
    AvroMultipleOutputs.addNamedOutput(job, "myavro", AvroOutputFormat.class,
            new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema());
    AvroMultipleOutputs.addNamedOutput(job, "myavro1", AvroOutputFormat.class,
            Schema.create(Schema.Type.STRING));
    AvroMultipleOutputs.addNamedOutput(job, "myavro2", AvroOutputFormat.class,
            Schema.create(Schema.Type.STRING));
    WordCountUtil.setMeta(job);

    JobClient.runJob(job);

    WordCountUtil.validateCountsFile();
}

From source file:org.apache.avro.mapred.TestAvroMultipleOutputs.java

License:Apache License

/**
 * Runs a map-only variant of the multiple-outputs job: the reducer count is
 * set to zero and a single named output ("myavro2") with a string schema is
 * registered.
 *
 * @throws Exception if preparing the input or running the job fails
 */
@SuppressWarnings("deprecation")
public void testJob_noreducer() throws Exception {
    JobConf job = new JobConf();
    job.setNumReduceTasks(0); // map-only

    String dir = System.getProperty("test.dir", ".") + "/mapred";
    Path outputPath = new Path(dir + "/out");

    // Clear any previous output. The recursive flag is passed explicitly
    // because the one-argument FileSystem.delete(Path) is deprecated.
    outputPath.getFileSystem(job).delete(outputPath, true);
    WordCountUtil.writeLinesFile();

    job.setJobName("AvroMultipleOutputs_noreducer");

    AvroJob.setInputSchema(job, Schema.create(Schema.Type.STRING));
    AvroJob.setOutputSchema(job, new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema());

    AvroJob.setMapperClass(job, MapImpl.class);

    FileInputFormat.setInputPaths(job, new Path(dir + "/in"));
    FileOutputFormat.setOutputPath(job, outputPath);
    FileOutputFormat.setCompressOutput(job, false);
    AvroMultipleOutputs.addNamedOutput(job, "myavro2", AvroOutputFormat.class,
            Schema.create(Schema.Type.STRING));
    JobClient.runJob(job);
}

From source file:org.apache.avro.mapred.TestAvroTextSort.java

License:Apache License

@Test
/**/*from w  w w  .  j a  v  a2  s  .  com*/
 * Run the identity job on a "bytes" Avro file using AvroAsTextInputFormat
 * and AvroTextOutputFormat to produce a sorted "bytes" Avro file.
 */
public void testSort() throws Exception {
    JobConf job = new JobConf();
    String dir = System.getProperty("test.dir", ".") + "/mapred";
    Path outputPath = new Path(dir + "/out");

    outputPath.getFileSystem(job).delete(outputPath);
    WordCountUtil.writeLinesBytesFile();

    job.setInputFormat(AvroAsTextInputFormat.class);
    job.setOutputFormat(AvroTextOutputFormat.class);
    job.setOutputKeyClass(Text.class);

    FileInputFormat.setInputPaths(job, new Path(dir + "/in"));
    FileOutputFormat.setOutputPath(job, outputPath);

    JobClient.runJob(job);

    WordCountUtil.validateSortedFile();
}

From source file:org.apache.avro.mapred.TestGenericJob.java

License:Apache License

/**
 * Runs a map-only job that reads text input with {@code TextInputFormat},
 * maps it with {@code AvroTestConverter} and writes Avro output whose schema
 * is a pair of a long and the schema returned by {@code createSchema()}.
 *
 * @throws Exception if clearing the output directory or running the job fails
 */
@Test
public void testJob() throws Exception {
    JobConf job = new JobConf();
    Path outputPath = new Path(dir + "/out");
    // Clear any previous output. The recursive flag is passed explicitly
    // because the one-argument FileSystem.delete(Path) is deprecated.
    outputPath.getFileSystem(job).delete(outputPath, true);

    job.setInputFormat(TextInputFormat.class);
    FileInputFormat.setInputPaths(job, dir + "/in");

    job.setMapperClass(AvroTestConverter.class);
    job.setNumReduceTasks(0); // map-only

    FileOutputFormat.setOutputPath(job, outputPath);
    System.out.println(createSchema());
    AvroJob.setOutputSchema(job, Pair.getPairSchema(Schema.create(Schema.Type.LONG), createSchema()));
    job.setOutputFormat(AvroOutputFormat.class);

    JobClient.runJob(job);
}

From source file:org.apache.avro.mapred.TestReflectJob.java

License:Apache License

/**
 * Runs a word-count style job using Avro reflection: input, map-output and
 * output schemas are derived via {@code ReflectData} / {@code Pair}, the job
 * is switched to reflect mode with {@code AvroJob.setReflect}, and the
 * resulting {@code part-00000.avro} file is validated.
 *
 * @throws Exception if preparing the input, running the job or validating
 *                   the output fails
 */
@Test
@SuppressWarnings("deprecation")
public void testJob() throws Exception {
    JobConf job = new JobConf();
    // The separator before "target" is required: without it the default "."
    // base would yield the directory ".target/testReflectJob".
    String dir = System.getProperty("test.dir", ".") + "/target/testReflectJob";
    Path inputPath = new Path(dir + "/in");
    Path outputPath = new Path(dir + "/out");

    // Clear leftovers from a previous run. The recursive flag is passed
    // explicitly because the one-argument FileSystem.delete(Path) is deprecated.
    outputPath.getFileSystem(job).delete(outputPath, true);
    inputPath.getFileSystem(job).delete(inputPath, true);

    writeLinesFile(new File(dir + "/in"));

    job.setJobName("reflect");

    AvroJob.setInputSchema(job, ReflectData.get().getSchema(Text.class));
    AvroJob.setMapOutputSchema(job, new Pair(new Text(""), new Count(0L)).getSchema());
    AvroJob.setOutputSchema(job, ReflectData.get().getSchema(WordCount.class));

    AvroJob.setMapperClass(job, MapImpl.class);
    AvroJob.setReducerClass(job, ReduceImpl.class);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    AvroJob.setReflect(job); // use reflection

    JobClient.runJob(job);

    validateCountsFile(new File(new File(dir, "out"), "part-00000.avro"));
}

From source file:org.apache.avro.mapred.TestSequenceFileReader.java

License:Apache License

/**
 * Runs a job that reads a sequence file as Avro input (via
 * {@code AvroJob.setInputSequenceFile}) with no mapper or reducer set, writes
 * Avro output using {@code SCHEMA}, and checks the resulting
 * {@code part-00000.avro} file.
 *
 * @throws Exception if clearing the output, running the job or checking the
 *                   output file fails
 */
@Test
public void testSequenceFileInputFormat() throws Exception {
    JobConf job = new JobConf();
    Path output = new Path(System.getProperty("test.dir", ".") + "/seq-out");

    // Clear any previous output. The recursive flag is passed explicitly
    // because the one-argument FileSystem.delete(Path) is deprecated.
    output.getFileSystem(job).delete(output, true);

    // configure input for Avro from sequence file
    AvroJob.setInputSequenceFile(job);
    FileInputFormat.setInputPaths(job, FILE.toURI().toString());
    AvroJob.setInputSchema(job, SCHEMA);

    // mapper is default, identity
    // reducer is default, identity

    // configure output for avro
    AvroJob.setOutputSchema(job, SCHEMA);
    FileOutputFormat.setOutputPath(job, output);

    JobClient.runJob(job);

    checkFile(new DataFileReader<Pair<Long, CharSequence>>(new File(output.toString() + "/part-00000.avro"),
            new SpecificDatumReader<Pair<Long, CharSequence>>()));
}

From source file:org.apache.avro.mapred.TestSequenceFileReader.java

License:Apache License

/**
 * Runs a job that reads a non-Avro sequence file with
 * {@code SequenceFileInputFormat}, maps it with {@code NonAvroMapper}, writes
 * Avro output using {@code SCHEMA}, and checks the resulting
 * {@code part-00000.avro} file.
 *
 * @throws Exception if clearing the output, running the job or checking the
 *                   output file fails
 */
@Test
public void testNonAvroMapper() throws Exception {
    JobConf job = new JobConf();
    Path output = new Path(System.getProperty("test.dir", ".") + "/seq-out");

    // Clear any previous output. The recursive flag is passed explicitly
    // because the one-argument FileSystem.delete(Path) is deprecated.
    output.getFileSystem(job).delete(output, true);

    // configure input for non-Avro sequence file
    job.setInputFormat(SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(job, FILE.toURI().toString());

    // use a hadoop mapper that emits Avro output
    job.setMapperClass(NonAvroMapper.class);

    // reducer is default, identity

    // configure output for avro
    FileOutputFormat.setOutputPath(job, output);
    AvroJob.setOutputSchema(job, SCHEMA);

    JobClient.runJob(job);

    checkFile(new DataFileReader<Pair<Long, CharSequence>>(new File(output.toString() + "/part-00000.avro"),
            new SpecificDatumReader<Pair<Long, CharSequence>>()));
}

From source file:org.apache.avro.mapred.TestSequenceFileReader.java

License:Apache License

/**
 * Runs a map-only job (zero reduce tasks) that reads a non-Avro sequence file
 * with {@code SequenceFileInputFormat}, maps it with
 * {@code NonAvroOnlyMapper}, writes Avro output using {@code SCHEMA}, and
 * checks the resulting {@code part-00000.avro} file.
 *
 * @throws Exception if clearing the output, running the job or checking the
 *                   output file fails
 */
@Test
public void testNonAvroMapOnly() throws Exception {
    JobConf job = new JobConf();
    Path output = new Path(System.getProperty("test.dir", ".") + "/seq-out");

    // Clear any previous output. The recursive flag is passed explicitly
    // because the one-argument FileSystem.delete(Path) is deprecated.
    output.getFileSystem(job).delete(output, true);

    // configure input for non-Avro sequence file
    job.setInputFormat(SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(job, FILE.toURI().toString());

    // use a hadoop mapper that emits Avro output
    job.setMapperClass(NonAvroOnlyMapper.class);

    // configure output for avro
    job.setNumReduceTasks(0); // map-only
    FileOutputFormat.setOutputPath(job, output);
    AvroJob.setOutputSchema(job, SCHEMA);

    JobClient.runJob(job);

    checkFile(new DataFileReader<Pair<Long, CharSequence>>(new File(output.toString() + "/part-00000.avro"),
            new SpecificDatumReader<Pair<Long, CharSequence>>()));
}