Example usage for org.apache.hadoop.mapred FileInputFormat setInputPaths

Introduction

On this page you can find example usage of org.apache.hadoop.mapred FileInputFormat setInputPaths.

Prototype

public static void setInputPaths(JobConf conf, Path... inputPaths) 

Document

Set the array of Paths as the list of inputs for the map-reduce job.
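
Before the project examples below, here is a minimal, self-contained sketch of a typical call site. It is illustrative only: the input and output paths are placeholders, and the identity mapper and reducer from org.apache.hadoop.mapred.lib stand in for real job logic.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public class SetInputPathsSketch {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(SetInputPathsSketch.class);
        conf.setJobName("setInputPaths-sketch");

        // Identity mapper/reducer; the default TextInputFormat produces LongWritable/Text pairs.
        conf.setMapperClass(IdentityMapper.class);
        conf.setReducerClass(IdentityReducer.class);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        // The varargs overload accepts one or more Paths; these directories are placeholders.
        FileInputFormat.setInputPaths(conf, new Path("/data/in1"), new Path("/data/in2"));
        FileOutputFormat.setOutputPath(conf, new Path("/data/out"));

        JobClient.runJob(conf);
    }
}

A sibling overload, setInputPaths(JobConf conf, String commaSeparatedPaths), takes a comma-separated String instead of Path objects; the first example below uses it by passing FILE.toURI().toString().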

Usage

From source file:org.apache.avro.mapred.TestSequenceFileReader.java

License:Apache License

@Test
public void testNonAvroReducer() throws Exception {
    JobConf job = new JobConf();
    Path output = new Path(System.getProperty("test.dir", ".") + "/seq-out");

    output.getFileSystem(job).delete(output);

    // configure input for Avro from sequence file
    AvroJob.setInputSequenceFile(job);
    AvroJob.setInputSchema(job, SCHEMA);
    FileInputFormat.setInputPaths(job, FILE.toURI().toString());

    // mapper is default, identity

    // use a hadoop reducer that consumes Avro input
    AvroJob.setMapOutputSchema(job, SCHEMA);
    job.setReducerClass(NonAvroReducer.class);

    // configure output for non-Avro SequenceFile
    job.setOutputFormat(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, output);

    // output key/value classes are default, LongWritable/Text

    JobClient.runJob(job);

    checkFile(new SequenceFileReader<Long, CharSequence>(new File(output.toString() + "/part-00000")));
}

From source file:org.apache.avro.mapred.TestWeather.java

License:Apache License

/** Uses default mapper with no reduces for a map-only identity job. */
@Test
@SuppressWarnings("deprecation")
public void testMapOnly() throws Exception {
    JobConf job = new JobConf();
    String inDir = System.getProperty("share.dir", "../../../share") + "/test/data";
    Path input = new Path(inDir + "/weather.avro");
    Path output = new Path(System.getProperty("test.dir", "target/test") + "/weather-ident");

    output.getFileSystem(job).delete(output);

    job.setJobName("identity map weather");

    AvroJob.setInputSchema(job, Weather.SCHEMA$);
    AvroJob.setOutputSchema(job, Weather.SCHEMA$);

    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);
    FileOutputFormat.setCompressOutput(job, true);

    job.setNumReduceTasks(0); // map-only

    JobClient.runJob(job);

    // check output is correct
    DatumReader<Weather> reader = new SpecificDatumReader<Weather>();
    DataFileReader<Weather> check = new DataFileReader<Weather>(new File(inDir + "/weather.avro"), reader);
    DataFileReader<Weather> sorted = new DataFileReader<Weather>(
            new File(output.toString() + "/part-00000.avro"), reader);

    for (Weather w : sorted)
        assertEquals(check.next(), w);

    check.close();
    sorted.close();
}

From source file:org.apache.avro.mapred.TestWeather.java

License:Apache License

@Test
@SuppressWarnings("deprecation")
public void testSort() throws Exception {
    JobConf job = new JobConf();
    String inDir = System.getProperty("share.dir", "../../../share") + "/test/data";
    Path input = new Path(inDir + "/weather.avro");
    Path output = new Path(System.getProperty("test.dir", "target/test") + "/weather-sort");

    output.getFileSystem(job).delete(output);

    job.setJobName("sort weather");

    AvroJob.setInputSchema(job, Weather.SCHEMA$);
    AvroJob.setMapOutputSchema(job, Pair.getPairSchema(Weather.SCHEMA$, Schema.create(Type.NULL)));
    AvroJob.setOutputSchema(job, Weather.SCHEMA$);

    AvroJob.setMapperClass(job, SortMapper.class);
    AvroJob.setReducerClass(job, SortReducer.class);

    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);
    FileOutputFormat.setCompressOutput(job, true);
    AvroJob.setOutputCodec(job, SNAPPY_CODEC);

    JobClient.runJob(job);

    // check output is correct
    DatumReader<Weather> reader = new SpecificDatumReader<Weather>();
    DataFileReader<Weather> check = new DataFileReader<Weather>(new File(inDir + "/weather-sorted.avro"),
            reader);
    DataFileReader<Weather> sorted = new DataFileReader<Weather>(
            new File(output.toString() + "/part-00000.avro"), reader);

    for (Weather w : sorted)
        assertEquals(check.next(), w);

    check.close();
    sorted.close();

    // check that AvroMapper and AvroReducer get close() and configure() called
    assertEquals(1, mapCloseCalls.get());
    assertEquals(1, reducerCloseCalls.get());
    assertEquals(1, mapConfigureCalls.get());
    assertEquals(1, reducerConfigureCalls.get());

}

From source file:org.apache.avro.mapred.TestWordCount.java

License:Apache License

@SuppressWarnings("deprecation")
public void testJob() throws Exception {
    JobConf job = new JobConf();
    String dir = System.getProperty("test.dir", ".") + "/mapred";
    Path outputPath = new Path(dir + "/out");

    outputPath.getFileSystem(job).delete(outputPath);
    WordCountUtil.writeLinesFile();

    job.setJobName("wordcount");

    AvroJob.setInputSchema(job, Schema.create(Schema.Type.STRING));
    AvroJob.setOutputSchema(job, new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema());

    AvroJob.setMapperClass(job, MapImpl.class);
    AvroJob.setCombinerClass(job, ReduceImpl.class);
    AvroJob.setReducerClass(job, ReduceImpl.class);

    FileInputFormat.setInputPaths(job, new Path(dir + "/in"));
    FileOutputFormat.setOutputPath(job, outputPath);
    FileOutputFormat.setCompressOutput(job, true);

    WordCountUtil.setMeta(job);

    JobClient.runJob(job);

    WordCountUtil.validateCountsFile();
}

From source file:org.apache.avro.mapred.TestWordCountGeneric.java

License:Apache License

@Test
@SuppressWarnings("deprecation")
public void testJob() throws Exception {
    String dir = System.getProperty("test.dir", ".") + "/mapred";
    Path outputPath = new Path(dir + "/out");
    JobConf job = new JobConf();
    try {
        WordCountUtil.writeLinesFile();

        job.setJobName("wordcount");

        AvroJob.setInputGeneric(job, Schema.create(Schema.Type.STRING));
        AvroJob.setOutputGeneric(job, WordCount.SCHEMA$);

        job.setMapperClass(MapImpl.class);
        job.setCombinerClass(ReduceImpl.class);
        job.setReducerClass(ReduceImpl.class);

        FileInputFormat.setInputPaths(job, new Path(dir + "/in"));
        FileOutputFormat.setOutputPath(job, outputPath);
        FileOutputFormat.setCompressOutput(job, true);

        JobClient.runJob(job);

        WordCountUtil.validateCountsFile();
    } finally {
        outputPath.getFileSystem(job).delete(outputPath);
    }
}

From source file:org.apache.avro.mapred.TestWordCountSpecific.java

License:Apache License

@Test
@SuppressWarnings("deprecation")
public void testJob() throws Exception {
    JobConf job = new JobConf();
    String dir = System.getProperty("test.dir", ".") + "/mapred";
    Path outputPath = new Path(dir + "/out");

    try {
        WordCountUtil.writeLinesFile();

        job.setJobName("wordcount");

        AvroJob.setInputSpecific(job, Schema.create(Schema.Type.STRING));
        AvroJob.setOutputSpecific(job, WordCount.SCHEMA$);

        job.setMapperClass(MapImpl.class);
        job.setCombinerClass(ReduceImpl.class);
        job.setReducerClass(ReduceImpl.class);

        FileInputFormat.setInputPaths(job, new Path(dir + "/in"));
        FileOutputFormat.setOutputPath(job, outputPath);
        FileOutputFormat.setCompressOutput(job, true);

        JobClient.runJob(job);

        WordCountUtil.validateCountsFile();
    } finally {
        outputPath.getFileSystem(job).delete(outputPath);
    }

}

From source file:org.apache.cassandra.bulkloader.CassandraBulkLoader.java

License:Apache License

public static void runJob(String[] args) {
    JobConf conf = new JobConf(CassandraBulkLoader.class);

    if (args.length >= 4) {
        conf.setNumReduceTasks(new Integer(args[3]));
    }

    try {
        // We store the cassandra storage-conf.xml on the HDFS cluster
        DistributedCache.addCacheFile(new URI("/cassandra/storage-conf.xml#storage-conf.xml"), conf);
    } catch (URISyntaxException e) {
        throw new RuntimeException(e);
    }
    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setJobName("CassandraBulkLoader_v2");
    conf.setMapperClass(Map.class);
    conf.setReducerClass(Reduce.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(conf, new Path(args[1]));
    FileOutputFormat.setOutputPath(conf, new Path(args[2]));
    try {
        JobClient.runJob(conf);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}

From source file:org.apache.druid.indexer.hadoop.DatasourceInputFormat.java

License:Apache License

@VisibleForTesting
static Stream<String> getLocations(final List<WindowedDataSegment> segments,
        final org.apache.hadoop.mapred.InputFormat fio, final JobConf conf) {
    return segments.stream().sequential().flatMap((final WindowedDataSegment segment) -> {
        FileInputFormat.setInputPaths(conf, new Path(JobHelper.getURIFromSegment(segment.getSegment())));
        try {
            return Arrays.stream(fio.getSplits(conf, 1))
                    .flatMap((final org.apache.hadoop.mapred.InputSplit split) -> {
                        try {
                            return Arrays.stream(split.getLocations());
                        } catch (final Exception e) {
                            logger.error(e, "Exception getting locations");
                            return Stream.empty();
                        }
                    });
        } catch (final Exception e) {
            logger.error(e, "Exception getting splits");
            return Stream.empty();
        }
    });
}

From source file:org.apache.hawq.pxf.plugins.hive.HiveDataFragmenter.java

License:Apache License

private void fetchMetaData(HiveTablePartition tablePartition) throws Exception {
    InputFormat<?, ?> fformat = makeInputFormat(tablePartition.storageDesc.getInputFormat(), jobConf);
    FileInputFormat.setInputPaths(jobConf, new Path(tablePartition.storageDesc.getLocation()));

    InputSplit[] splits = null;
    try {
        splits = fformat.getSplits(jobConf, 1);
    } catch (org.apache.hadoop.mapred.InvalidInputException e) {
        LOG.debug("getSplits failed on " + e.getMessage());
        return;
    }

    for (InputSplit split : splits) {
        FileSplit fsp = (FileSplit) split;
        String[] hosts = fsp.getLocations();
        String filepath = fsp.getPath().toUri().getPath();

        byte[] locationInfo = HdfsUtilities.prepareFragmentMetadata(fsp);
        Fragment fragment = new Fragment(filepath, hosts, locationInfo, makeUserData(tablePartition));
        fragments.add(fragment);
    }
}

From source file:org.apache.hcatalog.hcatmix.load.HadoopLoadGenerator.java

License:Apache License

/**
 * Prepare input directory/jobConf and launch the hadoop job, for load testing
 *
 * @param confFileName The properties file for the task, should be available in the classpath
 * @param conf
 * @return
 * @throws IOException
 * @throws MetaException
 * @throws TException
 */
public SortedMap<Long, ReduceResult> runLoadTest(String confFileName, Configuration conf)
        throws Exception, MetaException, TException {
    JobConf jobConf;
    if (conf != null) {
        jobConf = new JobConf(conf);
    } else {
        jobConf = new JobConf(new Configuration());
    }
    InputStream confFileIS;
    try {
        confFileIS = HCatMixUtils.getInputStream(confFileName);
    } catch (Exception e) {
        LOG.error("Couldn't load configuration file " + confFileName);
        throw e;
    }
    Properties props = new Properties();
    try {
        props.load(confFileIS);
    } catch (IOException e) {
        LOG.error("Couldn't load properties file: " + confFileName, e);
        throw e;
    }

    LOG.info("Loading configuration file: " + confFileName);
    addToJobConf(jobConf, props, Conf.MAP_RUN_TIME_MINUTES);
    addToJobConf(jobConf, props, Conf.STAT_COLLECTION_INTERVAL_MINUTE);
    addToJobConf(jobConf, props, Conf.THREAD_INCREMENT_COUNT);
    addToJobConf(jobConf, props, Conf.THREAD_INCREMENT_INTERVAL_MINUTES);
    addToJobConf(jobConf, props, Conf.THREAD_COMPLETION_BUFFER_MINUTES);

    int numMappers = Integer
            .parseInt(props.getProperty(Conf.NUM_MAPPERS.propName, "" + Conf.NUM_MAPPERS.defaultValue));
    Path inputDir = new Path(props.getProperty(Conf.INPUT_DIR.propName, Conf.INPUT_DIR.defaultValueStr));
    Path outputDir = new Path(props.getProperty(Conf.OUTPUT_DIR.propName, Conf.OUTPUT_DIR.defaultValueStr));

    jobConf.setJobName(JOB_NAME);
    jobConf.setNumMapTasks(numMappers);
    jobConf.setMapperClass(HCatMapper.class);
    jobConf.setJarByClass(HCatMapper.class);
    jobConf.setReducerClass(HCatReducer.class);
    jobConf.setMapOutputKeyClass(LongWritable.class);
    jobConf.setMapOutputValueClass(IntervalResult.class);
    jobConf.setOutputKeyClass(LongWritable.class);
    jobConf.setOutputValueClass(ReduceResult.class);
    jobConf.setOutputFormat(SequenceFileOutputFormat.class);
    jobConf.set(Conf.TASK_CLASS_NAMES.getJobConfKey(),
            props.getProperty(Conf.TASK_CLASS_NAMES.propName, Conf.TASK_CLASS_NAMES.defaultValueStr));

    fs = FileSystem.get(jobConf);
    Path jarRoot = new Path("/tmp/hcatmix_jar_" + new Random().nextInt());
    HadoopUtils.uploadClasspathAndAddToJobConf(jobConf, jarRoot);
    fs.deleteOnExit(jarRoot);

    FileInputFormat.setInputPaths(jobConf, createInputFiles(inputDir, numMappers));
    if (fs.exists(outputDir)) {
        fs.delete(outputDir, true);
    }
    FileOutputFormat.setOutputPath(jobConf, outputDir);

    // Set up delegation token required for hiveMetaStoreClient in map task
    HiveConf hiveConf = new HiveConf(HadoopLoadGenerator.class);
    HiveMetaStoreClient hiveClient = new HiveMetaStoreClient(hiveConf);
    String tokenStr = hiveClient.getDelegationToken(UserGroupInformation.getCurrentUser().getUserName(),
            "mapred");
    Token<? extends AbstractDelegationTokenIdentifier> token = new Token<DelegationTokenIdentifier>();
    token.decodeFromUrlString(tokenStr);
    token.setService(new Text(METASTORE_TOKEN_SIGNATURE));
    jobConf.getCredentials().addToken(new Text(METASTORE_TOKEN_KEY), token);

    // Submit the job, once the job is complete see output
    LOG.info("Submitted hadoop job");
    RunningJob j = JobClient.runJob(jobConf);
    LOG.info("JobID is: " + j.getJobName());
    if (!j.isSuccessful()) {
        throw new IOException("Job failed");
    }
    return readResult(outputDir, jobConf);
}