Example usage for org.apache.hadoop.mapred FileInputFormat setInputPaths

Introduction

In this page you can find the example usage for org.apache.hadoop.mapred FileInputFormat setInputPaths.

Prototype

public static void setInputPaths(JobConf conf, Path... inputPaths)

Source Link

Document

Set the array of Path s as the list of inputs for the map-reduce job.

Usage

From source file:com.github.gaoyangthu.demo.mapred.dancing.DistributedPentomino.java

License:Apache License

public int run(String[] args) throws Exception {
    JobConf conf;// w  w w. j  a  va  2  s  . c om
    int depth = 5;
    int width = 9;
    int height = 10;
    Class<? extends Pentomino> pentClass;
    if (args.length == 0) {
        System.out.println("Usage: pentomino <output> [-depth #] [-height #] [-width #]");
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    conf = new JobConf(getConf());

    // Pick up the parameters, should the user set these
    width = conf.getInt("pent.width", width);
    height = conf.getInt("pent.height", height);
    depth = conf.getInt("pent.depth", depth);
    pentClass = conf.getClass("pent.class", OneSidedPentomino.class, Pentomino.class);

    for (int i = 0; i < args.length; i++) {
        if (args[i].equalsIgnoreCase("-depth")) {
            depth = Integer.parseInt(args[++i].trim());
        } else if (args[i].equalsIgnoreCase("-height")) {
            height = Integer.parseInt(args[++i].trim());
        } else if (args[i].equalsIgnoreCase("-width")) {
            width = Integer.parseInt(args[++i].trim());
        }
    }

    // Set parameters for MR tasks to pick up either which way the user sets
    // them or not
    conf.setInt("pent.width", width);
    conf.setInt("pent.height", height);
    conf.setInt("pent.depth", depth);

    Path output = new Path(args[0]);
    Path input = new Path(output + "_input");
    FileSystem fileSys = FileSystem.get(conf);
    try {
        FileInputFormat.setInputPaths(conf, input);
        FileOutputFormat.setOutputPath(conf, output);
        conf.setJarByClass(PentMap.class);

        conf.setJobName("dancingElephant");
        Pentomino pent = ReflectionUtils.newInstance(pentClass, conf);
        pent.initialize(width, height);
        createInputDirectory(fileSys, input, pent, depth);

        // the keys are the prefix strings
        conf.setOutputKeyClass(Text.class);
        // the values are puzzle solutions
        conf.setOutputValueClass(Text.class);

        conf.setMapperClass(PentMap.class);
        conf.setReducerClass(IdentityReducer.class);

        conf.setNumMapTasks(2000);
        conf.setNumReduceTasks(1);

        JobClient.runJob(conf);
    } finally {
        fileSys.delete(input, true);
    }
    return 0;
}

From source file:com.github.gaoyangthu.demo.mapred.Grep.java

License:Apache License

public int run(String[] args) throws Exception {
    if (args.length < 3) {
        System.out.println("Grep <inDir> <outDir> <regex> [<group>]");
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }//from   ww  w. j  av a2s. c  o m

    Path tempDir = new Path("grep-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf grepJob = new JobConf(getConf(), Grep.class);

    try {

        grepJob.setJobName("grep-search");

        FileInputFormat.setInputPaths(grepJob, args[0]);

        grepJob.setMapperClass(RegexMapper.class);
        grepJob.set("mapred.mapper.regex", args[2]);
        if (args.length == 4)
            grepJob.set("mapred.mapper.regex.group", args[3]);

        grepJob.setCombinerClass(LongSumReducer.class);
        grepJob.setReducerClass(LongSumReducer.class);

        FileOutputFormat.setOutputPath(grepJob, tempDir);
        grepJob.setOutputFormat(SequenceFileOutputFormat.class);
        grepJob.setOutputKeyClass(Text.class);
        grepJob.setOutputValueClass(LongWritable.class);

        JobClient.runJob(grepJob);

        JobConf sortJob = new JobConf(Grep.class);
        sortJob.setJobName("grep-sort");

        FileInputFormat.setInputPaths(sortJob, tempDir);
        sortJob.setInputFormat(SequenceFileInputFormat.class);

        sortJob.setMapperClass(InverseMapper.class);

        sortJob.setNumReduceTasks(1); // write a single file
        FileOutputFormat.setOutputPath(sortJob, new Path(args[1]));
        sortJob.setOutputKeyComparatorClass // sort by decreasing freq
        (LongWritable.DecreasingComparator.class);

        JobClient.runJob(sortJob);
    } finally {
        FileSystem.get(grepJob).delete(tempDir, true);
    }
    return 0;
}

From source file:com.github.gaoyangthu.demo.mapred.PiEstimator.java

License:Apache License

/**
 * Run a map/reduce job for estimating Pi.
 *
 * @return the estimated value of Pi/*  w ww  .j a  v  a 2  s.c  o m*/
 */
public static BigDecimal estimate(int numMaps, long numPoints, JobConf jobConf) throws IOException {
    //setup job conf
    jobConf.setJobName(PiEstimator.class.getSimpleName());

    jobConf.setInputFormat(SequenceFileInputFormat.class);

    jobConf.setOutputKeyClass(BooleanWritable.class);
    jobConf.setOutputValueClass(LongWritable.class);
    jobConf.setOutputFormat(SequenceFileOutputFormat.class);

    jobConf.setMapperClass(PiMapper.class);
    jobConf.setNumMapTasks(numMaps);

    jobConf.setReducerClass(PiReducer.class);
    jobConf.setNumReduceTasks(1);

    // turn off speculative execution, because DFS doesn't handle
    // multiple writers to the same file.
    jobConf.setSpeculativeExecution(false);

    //setup input/output directories
    final Path inDir = new Path(TMP_DIR, "in");
    final Path outDir = new Path(TMP_DIR, "out");
    FileInputFormat.setInputPaths(jobConf, inDir);
    FileOutputFormat.setOutputPath(jobConf, outDir);

    final FileSystem fs = FileSystem.get(jobConf);
    if (fs.exists(TMP_DIR)) {
        throw new IOException(
                "Tmp directory " + fs.makeQualified(TMP_DIR) + " already exists.  Please remove it first.");
    }
    if (!fs.mkdirs(inDir)) {
        throw new IOException("Cannot create input directory " + inDir);
    }

    try {
        //generate an input file for each map task
        for (int i = 0; i < numMaps; ++i) {
            final Path file = new Path(inDir, "part" + i);
            final LongWritable offset = new LongWritable(i * numPoints);
            final LongWritable size = new LongWritable(numPoints);
            final SequenceFile.Writer writer = SequenceFile.createWriter(fs, jobConf, file, LongWritable.class,
                    LongWritable.class, CompressionType.NONE);
            try {
                writer.append(offset, size);
            } finally {
                writer.close();
            }
            System.out.println("Wrote input for Map #" + i);
        }

        //start a map/reduce job
        System.out.println("Starting Job");
        final long startTime = System.currentTimeMillis();
        JobClient.runJob(jobConf);
        final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
        System.out.println("Job Finished in " + duration + " seconds");

        //read outputs
        Path inFile = new Path(outDir, "reduce-out");
        LongWritable numInside = new LongWritable();
        LongWritable numOutside = new LongWritable();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, inFile, jobConf);
        try {
            reader.next(numInside, numOutside);
        } finally {
            reader.close();
        }

        //compute estimated value
        return BigDecimal.valueOf(4).setScale(20).multiply(BigDecimal.valueOf(numInside.get()))
                .divide(BigDecimal.valueOf(numMaps)).divide(BigDecimal.valueOf(numPoints));
    } finally {
        fs.delete(TMP_DIR, true);
    }
}

From source file:com.hadoop.examples.geolocation.GeoLocationJob.java

License:Apache License

/**
 * @param args/*w  ww  . ja v a2s  .  c  o m*/
 */
public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(GeoLocationJob.class);
    conf.setJobName("geolocationgroup");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(GeoLocationMapper.class);
    conf.setReducerClass(GeoLocationReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);

}

From source file:com.hadoopilluminated.examples.dancing.DistributedPentomino.java

License:Apache License

public int run(String[] args) throws Exception {
    JobConf conf;/* w  w  w .j  a v a 2s. co m*/
    int depth = 5;
    int width = 9;
    int height = 10;
    Class<? extends Pentomino> pentClass;
    if (args.length == 0) {
        System.out.println("pentomino <output>");
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    conf = new JobConf(getConf());
    width = conf.getInt("pent.width", width);
    height = conf.getInt("pent.height", height);
    depth = conf.getInt("pent.depth", depth);
    pentClass = conf.getClass("pent.class", OneSidedPentonimo.class, Pentomino.class);

    Path output = new Path(args[0]);
    Path input = new Path(output + "_input");
    FileSystem fileSys = FileSystem.get(conf);
    try {
        FileInputFormat.setInputPaths(conf, input);
        FileOutputFormat.setOutputPath(conf, output);
        conf.setJarByClass(PentMap.class);

        conf.setJobName("dancingElephant");
        Pentomino pent = ReflectionUtils.newInstance(pentClass, conf);
        pent.initialize(width, height);
        createInputDirectory(fileSys, input, pent, depth);

        // the keys are the prefix strings
        conf.setOutputKeyClass(Text.class);
        // the values are puzzle solutions
        conf.setOutputValueClass(Text.class);

        conf.setMapperClass(PentMap.class);
        conf.setReducerClass(IdentityReducer.class);

        conf.setNumMapTasks(2000);
        conf.setNumReduceTasks(1);

        JobClient.runJob(conf);
    } finally {
        fileSys.delete(input, true);
    }
    return 0;
}

From source file:com.hadoopilluminated.examples.Grep.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length < 3) {
        System.out.println("Grep <inDir> <outDir> <regex> [<group>]");
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }/*from   w ww  .j  a  v a 2 s. co  m*/

    Path tempDir = new Path("grep-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf grepJob = new JobConf(getConf(), Grep.class);

    try {

        grepJob.setJobName("grep-search");

        FileInputFormat.setInputPaths(grepJob, args[0]);

        grepJob.setMapperClass(RegexMapper.class);
        grepJob.set("mapred.mapper.regex", args[2]);
        if (args.length == 4) {
            grepJob.set("mapred.mapper.regex.group", args[3]);
        }

        grepJob.setCombinerClass(LongSumReducer.class);
        grepJob.setReducerClass(LongSumReducer.class);

        FileOutputFormat.setOutputPath(grepJob, tempDir);
        grepJob.setOutputFormat(SequenceFileOutputFormat.class);
        grepJob.setOutputKeyClass(Text.class);
        grepJob.setOutputValueClass(LongWritable.class);

        JobClient.runJob(grepJob);

        JobConf sortJob = new JobConf(getConf(), Grep.class);
        sortJob.setJobName("grep-sort");

        FileInputFormat.setInputPaths(sortJob, tempDir);
        sortJob.setInputFormat(SequenceFileInputFormat.class);

        sortJob.setMapperClass(InverseMapper.class);

        sortJob.setNumReduceTasks(1); // write a single file
        FileOutputFormat.setOutputPath(sortJob, new Path(args[1]));
        sortJob.setOutputKeyComparatorClass // sort by decreasing freq
        (LongWritable.DecreasingComparator.class);

        JobClient.runJob(sortJob);
    } finally {
        FileSystem.get(grepJob).delete(tempDir, true);
    }
    return 0;
}

From source file:com.hdfs.concat.crush.Crush.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    if (!createJobConfAndParseArgs(args)) {
        return 0;
    }//from ww  w  .  ja v  a  2s  .  c o  m

    setFileSystem(FileSystem.get(job));

    FileStatus status = fs.getFileStatus(srcDir);

    if (null == status || !status.isDir()) {
        throw new IllegalArgumentException("No such directory: " + srcDir);
    }

    if (Mode.STAND_ALONE == mode) {
        standAlone();
    } else {
        writeDirs();

        MultipleInputs.addInputPath(job, bucketFiles, SequenceFileInputFormat.class, IdentityMapper.class);
        MultipleInputs.addInputPath(job, counters, CountersInputFormat.class, CountersMapper.class);

        job.setPartitionerClass(CrushPartitioner.class);

        job.setReducerClass(CrushReducer.class);

        job.setOutputKeyComparatorClass(Text.Comparator.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setOutputFormat(SequenceFileOutputFormat.class);

        FileInputFormat.setInputPaths(job, bucketFiles);
        FileOutputFormat.setOutputPath(job, outDir);

        job.set("crush.partition.map", partitionMap.toString());

        if (0 != nBuckets) {
            print(Verbosity.INFO, "\n\nInvoking map reduce\n\n");

            RunningJob completed = JobClient.runJob(job);

            jobCounters = completed.getCounters();
        }

        long eligible = jobCounters.getCounter(MapperCounter.FILES_ELIGIBLE);
        long crushed = jobCounters.getCounter(ReducerCounter.FILES_CRUSHED);

        /*
         * There's no way this cannot hold true if Hadoop is working correctly.
         */
        if (eligible != crushed) {
            throw new AssertionError(format("Files eligible (%d) != files crushed (%d)", eligible, crushed));
        }

        if (Mode.CLONE == mode) {
            cloneOutput();
        } else {
            moveOutput();
        }
    }

    print(Verbosity.INFO, "\n\nDeleting temporary directory");

    fs.delete(tmpDir, true);

    /*
     * If we have printed anything to the console at all, then add a line wrap to bring the cursor back to the beginning.
     */
    print(Verbosity.INFO, "\n\n");

    return 0;
}

From source file:com.hdfs.concat.crush.CrushReducer.java

License:Apache License

@SuppressWarnings("unchecked")
private RecordReader<Object, Object> createRecordReader(int idx, Path inputPath, Reporter reporter)
        throws IOException {

    LOG.info(format("Opening '%s'", inputPath));

    Class<? extends FileInputFormat<?, ?>> cls = (Class<? extends FileInputFormat<?, ?>>) inFormatClsList
            .get(idx);/* w  ww. j av a  2s  .  c o  m*/

    try {
        FileInputFormat.setInputPaths(job, inputPath);

        FileInputFormat<?, ?> instance = cls.newInstance();

        if (instance instanceof JobConfigurable) {
            ((JobConfigurable) instance).configure(job);
        }

        InputSplit[] splits = instance.getSplits(job, 1);

        if (1 != splits.length) {
            throw new IllegalArgumentException("Could not get input splits: " + inputPath);
        }

        return (RecordReader<Object, Object>) instance.getRecordReader(splits[0], job, reporter);
    } catch (RuntimeException e) {
        throw e;
    } catch (IOException e) {
        throw e;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}

From source file:com.hp.hplc.mr.driver.WordCount.java

License:Apache License

/**
 * The main driver for word count map/reduce program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there is communication problems with the
 *                     job tracker.// w w  w . j  a  v a2s  .  c o m
 */
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), WordCount.class);
    conf.setJobName("wordcount");

    // the keys are words (strings)
    conf.setOutputKeyClass(Text.class);
    // the values are counts (ints)
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MapClass.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                conf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                conf.setNumReduceTasks(Integer.parseInt(args[++i]));
                System.out.println("# of reduces: " + conf.getNumReduceTasks());
            } else {
                other_args.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }
    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(conf, other_args.get(0));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    JobClient.runJob(conf);

    return 0;
}

From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.DataPartitionerRemoteMR.java

License:Open Source License

@Override
protected void partitionMatrix(MatrixObject in, String fnameNew, InputInfo ii, OutputInfo oi, long rlen,
        long clen, int brlen, int bclen) throws DMLRuntimeException {
    String jobname = "ParFor-DPMR";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    JobConf job;/*from ww  w .  j  a v  a 2s .  co m*/
    job = new JobConf(DataPartitionerRemoteMR.class);
    if (_pfid >= 0) //use in parfor
        job.setJobName(jobname + _pfid);
    else //use for partition instruction
        job.setJobName("Partition-MR");

    //maintain dml script counters
    Statistics.incrementNoOfCompiledMRJobs();

    try {
        //force writing to disk (typically not required since partitioning only applied if dataset exceeds CP size)
        in.exportData(); //written to disk iff dirty

        Path path = new Path(in.getFileName());

        /////
        //configure the MR job
        MRJobConfiguration.setPartitioningInfo(job, rlen, clen, brlen, bclen, ii, oi, _format, _n, fnameNew,
                _keepIndexes);

        //set mappers, reducers, combiners
        job.setMapperClass(DataPartitionerRemoteMapper.class);
        job.setReducerClass(DataPartitionerRemoteReducer.class);

        if (oi == OutputInfo.TextCellOutputInfo) {
            //binary cell intermediates for reduced IO 
            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(PairWritableCell.class);
        } else if (oi == OutputInfo.BinaryCellOutputInfo) {
            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(PairWritableCell.class);
        } else if (oi == OutputInfo.BinaryBlockOutputInfo) {
            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(PairWritableBlock.class);

            //check Alignment
            if ((_format == PDataPartitionFormat.ROW_BLOCK_WISE_N && rlen > _n && _n % brlen != 0)
                    || (_format == PDataPartitionFormat.COLUMN_BLOCK_WISE_N && clen > _n && _n % bclen != 0)) {
                throw new DMLRuntimeException(
                        "Data partitioning format " + _format + " requires aligned blocks.");
            }
        }

        //set input format 
        job.setInputFormat(ii.inputFormatClass);

        //set the input path and output path 
        FileInputFormat.setInputPaths(job, path);

        //set output path
        MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);
        //FileOutputFormat.setOutputPath(job, pathNew);
        job.setOutputFormat(NullOutputFormat.class);

        //////
        //set optimization parameters

        //set the number of mappers and reducers 
        //job.setNumMapTasks( _numMappers ); //use default num mappers
        long reducerGroups = -1;
        switch (_format) {
        case ROW_WISE:
            reducerGroups = rlen;
            break;
        case COLUMN_WISE:
            reducerGroups = clen;
            break;
        case ROW_BLOCK_WISE:
            reducerGroups = (rlen / brlen) + ((rlen % brlen == 0) ? 0 : 1);
            break;
        case COLUMN_BLOCK_WISE:
            reducerGroups = (clen / bclen) + ((clen % bclen == 0) ? 0 : 1);
            break;
        case ROW_BLOCK_WISE_N:
            reducerGroups = (rlen / _n) + ((rlen % _n == 0) ? 0 : 1);
            break;
        case COLUMN_BLOCK_WISE_N:
            reducerGroups = (clen / _n) + ((clen % _n == 0) ? 0 : 1);
            break;
        default:
            //do nothing
        }
        job.setNumReduceTasks((int) Math.min(_numReducers, reducerGroups));

        //use FLEX scheduler configuration properties
        /*if( ParForProgramBlock.USE_FLEX_SCHEDULER_CONF )
        {
           job.setInt("flex.map.min", 0);
           job.setInt("flex.map.max", _numMappers);
           job.setInt("flex.reduce.min", 0);
           job.setInt("flex.reduce.max", _numMappers);
        }*/

        //disable automatic tasks timeouts and speculative task exec
        job.setInt("mapred.task.timeout", 0);
        job.setMapSpeculativeExecution(false);

        //set up preferred custom serialization framework for binary block format
        if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
            MRJobConfiguration.addBinaryBlockSerializationFramework(job);

        //enables the reuse of JVMs (multiple tasks per MR task)
        if (_jvmReuse)
            job.setNumTasksToExecutePerJvm(-1); //unlimited

        //enables compression - not conclusive for different codecs (empirically good compression ratio, but significantly slower)
        //job.set("mapred.compress.map.output", "true");
        //job.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");

        //set the replication factor for the results
        job.setInt("dfs.replication", _replication);

        //set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

        //set the max number of retries per map task
        //  disabled job-level configuration to respect cluster configuration
        //  note: this refers to hadoop2, hence it never had effect on mr1
        //job.setInt("mapreduce.map.maxattempts", _max_retry);

        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);

        /////
        // execute the MR job   
        JobClient.runJob(job);

        //maintain dml script counters
        Statistics.incrementNoOfExecutedMRJobs();
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    }

    if (DMLScript.STATISTICS && _pfid >= 0) {
        long t1 = System.nanoTime(); //only for parfor 
        Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
    }
}