List of usage examples for org.apache.hadoop.mapred FileInputFormat setInputPaths
public static void setInputPaths(JobConf conf, Path... inputPaths)
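For orientation, a minimal sketch (not taken from any of the projects listed below; the paths are invented for illustration) showing the varargs overload next to the comma-separated String overload and addInputPath:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;

public class SetInputPathsSketch {
    public static void main(String[] args) {
        JobConf conf = new JobConf();

        // Varargs overload: replaces any previously configured input paths.
        FileInputFormat.setInputPaths(conf, new Path("/data/in1"), new Path("/data/in2"));

        // String overload: a comma-separated list of paths.
        FileInputFormat.setInputPaths(conf, "/data/in1,/data/in2");

        // addInputPath appends a path instead of replacing the list.
        FileInputFormat.addInputPath(conf, new Path("/data/in3"));

        for (Path p : FileInputFormat.getInputPaths(conf)) {
            System.out.println(p);
        }
    }
}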
From source file: cascading.tap.hadoop.MultiInputFormat.java
License: Open Source License

/**
 * Used to set the current JobConf with all sub-job configurations.
 *
 * @param toJob
 * @param fromJobs
 */
public static void addInputFormat(JobConf toJob, JobConf... fromJobs) {
    toJob.setInputFormat(MultiInputFormat.class);

    List<Map<String, String>> configs = new ArrayList<Map<String, String>>();
    List<Path> allPaths = new ArrayList<Path>();

    boolean isLocal = false;

    for (JobConf fromJob : fromJobs) {
        configs.add(getConfig(toJob, fromJob));
        Collections.addAll(allPaths, FileInputFormat.getInputPaths(fromJob));

        if (!isLocal)
            isLocal = fromJob.get("mapred.job.tracker").equalsIgnoreCase("local");
    }

    FileInputFormat.setInputPaths(toJob, (Path[]) allPaths.toArray(new Path[allPaths.size()]));

    try {
        toJob.set("cascading.multiinputformats", Util.serializeBase64(configs));
    } catch (IOException exception) {
        throw new CascadingException("unable to pack input formats", exception);
    }

    if (isLocal)
        toJob.set("mapred.job.tracker", "local");
}
From source file: cascading.tap.hadoop.ZipInputFormatTest.java
License: Open Source License

public void testSplits() throws Exception {
    JobConf job = new JobConf();
    FileSystem currentFs = FileSystem.get(job);

    Path file = new Path(workDir, "test.zip");

    Reporter reporter = Reporter.NULL;

    int seed = new Random().nextInt();
    LOG.info("seed = " + seed);
    Random random = new Random(seed);

    FileInputFormat.setInputPaths(job, file);

    for (int entries = 1; entries < MAX_ENTRIES; entries += random.nextInt(MAX_ENTRIES / 10) + 1) {
        ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
        ZipOutputStream zos = new ZipOutputStream(byteArrayOutputStream);
        long length = 0;

        LOG.debug("creating; zip file with entries = " + entries);

        // for each entry in the zip file
        for (int entryCounter = 0; entryCounter < entries; entryCounter++) {
            // construct zip entries splitting MAX_LENGTH between entries
            long entryLength = MAX_LENGTH / entries;
            ZipEntry zipEntry = new ZipEntry("/entry" + entryCounter + ".txt");
            zipEntry.setMethod(ZipEntry.DEFLATED);
            zos.putNextEntry(zipEntry);

            for (length = entryCounter * entryLength; length < (entryCounter + 1) * entryLength; length++) {
                zos.write(Long.toString(length).getBytes());
                zos.write("\n".getBytes());
            }

            zos.flush();
            zos.closeEntry();
        }

        zos.flush();
        zos.close();

        currentFs.delete(file, true);

        OutputStream outputStream = currentFs.create(file);

        byteArrayOutputStream.writeTo(outputStream);
        outputStream.close();

        ZipInputFormat format = new ZipInputFormat();
        format.configure(job);

        LongWritable key = new LongWritable();
        Text value = new Text();

        InputSplit[] splits = format.getSplits(job, 100);

        BitSet bits = new BitSet((int) length);

        for (int j = 0; j < splits.length; j++) {
            LOG.debug("split[" + j + "]= " + splits[j]);

            RecordReader<LongWritable, Text> reader = format.getRecordReader(splits[j], job, reporter);

            try {
                int count = 0;

                while (reader.next(key, value)) {
                    int v = Integer.parseInt(value.toString());

                    LOG.debug("read " + v);

                    if (bits.get(v))
                        LOG.warn("conflict with " + v + " in split " + j + " at position " + reader.getPos());

                    assertFalse("key in multiple partitions.", bits.get(v));

                    bits.set(v);
                    count++;
                }

                LOG.debug("splits[" + j + "]=" + splits[j] + " count=" + count);
            } finally {
                reader.close();
            }
        }

        assertEquals("some keys in no partition.", length, bits.cardinality());
    }
}
From source file: clusteringblocks.ClusteringBlocks.java

public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    JobConf job = new JobConf(conf, ClusteringBlocks.class);

    Path in = new Path(args[0]);
    Path out = new Path(args[1]);
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    job.setJobName("ClusteringBlocks");
    job.setMapperClass(MapClass.class);
    job.setReducerClass(Reduce.class);

    job.setInputFormat(KeyValueTextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    // job.set("key.value.separator.in.input.line", "");

    JobClient.runJob(job);

    return 0;
}
From source file: cn.edu.xmu.dm.mapreduce.Sort.java
License: Apache License

/**
 * The main driver for the sort program. Invoke this method to submit the
 * map/reduce job.
 *
 * @throws IOException
 *             When there are communication problems with the job tracker.
 */
public int run(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, "Sorter");
    job.setJarByClass(Sort.class);

    JobConf jobConf = new JobConf(getConf(), Sort.class);
    jobConf.setJobName("sorter");

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = jobConf.get("test.sort.reduces_per_host");
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }

    Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = BytesWritable.class;
    Class<? extends Writable> outputValueClass = BytesWritable.class;
    List<String> otherArgs = new ArrayList<String>();
    InputSampler.Sampler<K, V> sampler = null;

    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                jobConf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                num_reduces = Integer.parseInt(args[++i]);
            } else if ("-inFormat".equals(args[i])) {
                inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class);
            } else if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else if ("-outKey".equals(args[i])) {
                outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
            } else if ("-outValue".equals(args[i])) {
                outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class);
            } else if ("-totalOrder".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Set user-supplied (possibly default) job configs
    jobConf.setNumReduceTasks(num_reduces);

    jobConf.setInputFormat(inputFormatClass);
    jobConf.setOutputFormat(outputFormatClass);

    jobConf.setOutputKeyClass(outputKeyClass);
    jobConf.setOutputValueClass(outputValueClass);

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(jobConf, otherArgs.get(0));
    FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs.get(1)));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        jobConf.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        InputSampler.<K, V>writePartitionFile(jobConf, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, jobConf);
        DistributedCache.createSymlink(jobConf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}
From source file: co.cask.hydrator.plugin.CopybookTest.java
License: Apache License

@BeforeClass
public static void setupTest() throws Exception {
    setupBatchArtifacts(BATCH_APP_ARTIFACT_ID, ETLBatchApplication.class);

    // add artifact for batch sources and sinks
    addPluginArtifact(NamespaceId.DEFAULT.artifact("copybookreader-plugins", "1.0.0"), BATCH_APP_ARTIFACT_ID,
            CopybookSource.class);
    FileInputFormat.setInputPaths(new JobConf(), new Path("src/test/resources"));
}
From source file: com.alexholmes.hadooputils.combine.avro.mapred.CombineAvroInputFormatTest.java
License: Apache License

@SuppressWarnings("deprecation")
public void testJob() throws Exception {
    JobConf job = new JobConf();
    String dir = System.getProperty("test.dir", ".") + "/mapred";
    Path outputPath = new Path(dir + "/out");
    outputPath.getFileSystem(job).delete(outputPath);

    WordCountUtil.writeLinesFile();

    job.setJobName("wordcount");

    AvroJob.setInputSchema(job, Schema.create(Schema.Type.STRING));
    AvroJob.setOutputSchema(job, new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema());

    AvroJob.setMapperClass(job, MapImpl.class);
    AvroJob.setCombinerClass(job, ReduceImpl.class);
    AvroJob.setReducerClass(job, ReduceImpl.class);

    job.setInputFormat(CombineAvroInputFormat.class);

    FileInputFormat.setInputPaths(job, new Path(dir + "/in"));
    FileOutputFormat.setOutputPath(job, outputPath);
    FileOutputFormat.setCompressOutput(job, true);

    WordCountUtil.setMeta(job);

    JobClient.runJob(job);

    WordCountUtil.validateCountsFile();
}
From source file: com.alexholmes.hadooputils.combine.seqfile.mapred.CombineSequenceFileJob.java
License: Apache License

/**
 * The driver for the MapReduce job.
 *
 * @param conf              configuration
 * @param inputDirAsString  input directory in CSV-form
 * @param outputDirAsString output directory
 * @return true if the job completed successfully
 * @throws java.io.IOException         if something went wrong
 * @throws java.net.URISyntaxException if a URI wasn't correctly formed
 */
public boolean runJob(final Configuration conf, final String inputDirAsString, final String outputDirAsString)
        throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {

    JobConf job = new JobConf(conf);

    job.setJarByClass(CombineSequenceFileJob.class);
    job.setJobName("seqfilecombiner");

    job.setNumReduceTasks(0);

    job.setMapperClass(IdentityMapper.class);

    job.setInputFormat(CombineSequenceFileInputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(job, inputDirAsString);
    FileOutputFormat.setOutputPath(job, new Path(outputDirAsString));

    Date startTime = new Date();
    System.out.println("Job started: " + startTime);

    RunningJob jobResult = JobClient.runJob(job);

    Date endTime = new Date();
    System.out.println("Job ended: " + endTime);
    System.out.println("The job took "
            + TimeUnit.MILLISECONDS.toSeconds(endTime.getTime() - startTime.getTime()) + " seconds.");

    return jobResult.isSuccessful();
}
From source file: com.alexholmes.hadooputils.sort.Sort.java
License: Apache License

/**
 * The driver for the sort MapReduce job.
 *
 * @param jobConf           sort configuration
 * @param numMapTasks       number of map tasks
 * @param numReduceTasks    number of reduce tasks
 * @param sampler           sampler, if required
 * @param codecClass        the compression codec for compressing final outputs
 * @param mapCodecClass     the compression codec for compressing intermediary map outputs
 * @param createLzopIndexes whether or not a MR job should be launched to create LZOP indexes
 *                          for the job output files
 * @param inputDirAsString  input directory in CSV-form
 * @param outputDirAsString output directory
 * @return true if the job completed successfully
 * @throws IOException        if something went wrong
 * @throws URISyntaxException if a URI wasn't correctly formed
 */
public boolean runJob(final JobConf jobConf, final Integer numMapTasks, final Integer numReduceTasks,
        final InputSampler.Sampler<K, V> sampler, final Class<? extends CompressionCodec> codecClass,
        final Class<? extends CompressionCodec> mapCodecClass, final boolean createLzopIndexes,
        final String inputDirAsString, final String outputDirAsString) throws IOException, URISyntaxException {

    jobConf.setJarByClass(Sort.class);
    jobConf.setJobName("sorter");

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();

    if (numMapTasks != null) {
        jobConf.setNumMapTasks(numMapTasks);
    }

    if (numReduceTasks != null) {
        jobConf.setNumReduceTasks(numReduceTasks);
    } else {
        int numReduces = (int) (cluster.getMaxReduceTasks() * 0.9);
        String sortReduces = jobConf.get("test.sort.reduces_per_host");
        if (sortReduces != null) {
            numReduces = cluster.getTaskTrackers() * Integer.parseInt(sortReduces);
        }

        // Set user-supplied (possibly default) job configs
        jobConf.setNumReduceTasks(numReduces);
    }

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(SortReduce.class);

    jobConf.setInputFormat(SortInputFormat.class);

    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);

    if (mapCodecClass != null) {
        jobConf.setMapOutputCompressorClass(mapCodecClass);
    }

    if (codecClass != null) {
        jobConf.setBoolean("mapred.output.compress", true);
        jobConf.setClass("mapred.output.compression.codec", codecClass, CompressionCodec.class);
    }

    FileInputFormat.setInputPaths(jobConf, inputDirAsString);
    FileOutputFormat.setOutputPath(jobConf, new Path(outputDirAsString));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        jobConf.setPartitionerClass(TotalOrderPartitioner.class);

        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];

        FileSystem fileSystem = FileSystem.get(jobConf);

        if (fileSystem.exists(inputDir) && fileSystem.isFile(inputDir)) {
            inputDir = inputDir.getParent();
        }
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        InputSampler.writePartitionFile(jobConf, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, jobConf);
        DistributedCache.createSymlink(jobConf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + jobConf.getNumReduceTasks() + " reduces.");

    Date startTime = new Date();
    System.out.println("Job started: " + startTime);

    jobResult = JobClient.runJob(jobConf);

    Date endTime = new Date();
    System.out.println("Job ended: " + endTime);
    System.out.println("The job took "
            + TimeUnit.MILLISECONDS.toSeconds(endTime.getTime() - startTime.getTime()) + " seconds.");

    if (jobResult.isSuccessful()) {
        if (createLzopIndexes && codecClass != null && LzopCodec.class.equals(codecClass)) {
            new LzoIndexer(jobConf).index(new Path(outputDirAsString));
        }
        return true;
    }
    return false;
}
From source file: com.benchmark.mapred.dancing.DistributedPentomino.java
License: Apache License

public int run(String[] args) throws Exception {
    JobConf conf;
    int depth = 5;
    int width = 9;
    int height = 10;
    Class<? extends Pentomino> pentClass;
    if (args.length == 0) {
        System.out.println("pentomino <output>");
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    conf = new JobConf(getConf());
    width = conf.getInt("pent.width", width);
    height = conf.getInt("pent.height", height);
    depth = conf.getInt("pent.depth", depth);
    pentClass = conf.getClass("pent.class", OneSidedPentomino.class, Pentomino.class);

    Path output = new Path(args[0]);
    Path input = new Path(output + "_input");
    FileSystem fileSys = FileSystem.get(conf);
    try {
        FileInputFormat.setInputPaths(conf, input);
        FileOutputFormat.setOutputPath(conf, output);
        conf.setJarByClass(PentMap.class);

        conf.setJobName("dancingElephant");
        Pentomino pent = ReflectionUtils.newInstance(pentClass, conf);
        pent.initialize(width, height);
        createInputDirectory(fileSys, input, pent, depth);

        // the keys are the prefix strings
        conf.setOutputKeyClass(Text.class);
        // the values are puzzle solutions
        conf.setOutputValueClass(Text.class);

        conf.setMapperClass(PentMap.class);
        conf.setReducerClass(IdentityReducer.class);

        conf.setNumMapTasks(2000);
        conf.setNumReduceTasks(1);

        JobClient.runJob(conf);
    } finally {
        fileSys.delete(input, true);
    }
    return 0;
}
From source file: com.benchmark.mapred.Grep.java
License: Apache License

public int run(String[] args) throws Exception {
    if (args.length < 4) {
        System.out.println("Grep <inDir> <outDir> <numreduces> <regex> [<group>]");
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    Path tempDir = new Path("grep-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf grepJob = new JobConf(getConf(), Grep.class);

    try {
        Date startIteration = new Date();
        grepJob.setJobName("grep-search");

        FileInputFormat.setInputPaths(grepJob, args[0]);

        grepJob.setMapperClass(RegexMapper.class);
        grepJob.set("mapred.mapper.regex", args[3]);
        if (args.length == 5)
            grepJob.set("mapred.mapper.regex.group", args[4]);

        grepJob.setCombinerClass(LongSumReducer.class);
        grepJob.setReducerClass(LongSumReducer.class);

        FileOutputFormat.setOutputPath(grepJob, new Path(args[1]));
        grepJob.setOutputKeyClass(Text.class);
        grepJob.setOutputValueClass(LongWritable.class);
        grepJob.setNumReduceTasks(Integer.parseInt(args[2]));

        JobClient.runJob(grepJob);

        Date endIteration = new Date();
        System.out.println("The iteration took " + (endIteration.getTime() - startIteration.getTime()) / 1000
                + " seconds.");
    } finally {
        FileSystem.get(grepJob).delete(tempDir, true);
    }
    return 0;
}