Example usage for org.apache.hadoop.mapred FileInputFormat getInputPaths

List of usage examples for org.apache.hadoop.mapred FileInputFormat getInputPaths

Introduction

On this page you can find usage examples for org.apache.hadoop.mapred FileInputFormat getInputPaths.

Prototype

public static Path[] getInputPaths(JobConf conf) 

Document

Get the list of input Paths for the map-reduce job.
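
Before looking at the real-world usages below, here is a minimal, self-contained sketch (not taken from any of the projects listed on this page) of how getInputPaths pairs with setInputPaths and addInputPath on a JobConf; the class name and paths are hypothetical placeholders.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;

public class GetInputPathsExample {
    public static void main(String[] args) {
        JobConf conf = new JobConf(GetInputPathsExample.class);

        // Register input paths on the job configuration (hypothetical locations).
        FileInputFormat.setInputPaths(conf, new Path("/data/input-a"));
        FileInputFormat.addInputPath(conf, new Path("/data/input-b"));

        // getInputPaths returns every path previously registered, in order.
        for (Path p : FileInputFormat.getInputPaths(conf)) {
            System.out.println("input path: " + p);
        }
    }
}

Note that getInputPaths only reports the paths recorded in the job configuration; it does not check that they exist on the file system, which is why several of the examples below validate the returned paths themselves.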

Usage

From source file: cascading.tap.hadoop.ZipInputFormat.java

License: Open Source License

protected Path[] listPathsInternal(JobConf jobConf) throws IOException {
    Path[] dirs = FileInputFormat.getInputPaths(jobConf);

    if (dirs.length == 0)
        throw new IOException("no input paths specified in job");

    for (Path dir : dirs) {
        FileSystem fs = dir.getFileSystem(jobConf);

        if (!fs.isFile(dir))
            throw new IOException("does not support directories: " + dir);
    }

    return dirs;
}

From source file: cascading.tap.Hfs.java

License: Open Source License

@Override
public void sourceInit(JobConf conf) throws IOException {
    Path qualifiedPath = getQualifiedPath(conf);

    for (Path exitingPath : FileInputFormat.getInputPaths(conf)) {
        if (exitingPath.equals(qualifiedPath))
            throw new TapException("may not add duplicate paths, found: " + exitingPath);
    }

    FileInputFormat.addInputPath(conf, qualifiedPath);

    super.sourceInit(conf);

    makeLocal(conf, qualifiedPath, "forcing job to local mode, via source: ");

    TupleSerialization.setSerializations(conf); // allows Hfs to be used independent of Flow
}

From source file: cn.edu.xmu.dm.mapreduce.Sort.java

License: Apache License

/**
 * The main driver for the sort program. Invoke this method to submit the
 * map/reduce job.
 * 
 * @throws IOException
 *             When there are communication problems with the job tracker.
 */
public int run(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, "Sorter");
    job.setJarByClass(Sort.class);

    JobConf jobConf = new JobConf(getConf(), Sort.class);
    jobConf.setJobName("sorter");

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = jobConf.get("test.sort.reduces_per_host");
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }
    Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = BytesWritable.class;
    Class<? extends Writable> outputValueClass = BytesWritable.class;
    List<String> otherArgs = new ArrayList<String>();
    InputSampler.Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                jobConf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                num_reduces = Integer.parseInt(args[++i]);
            } else if ("-inFormat".equals(args[i])) {
                inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class);
            } else if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else if ("-outKey".equals(args[i])) {
                outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
            } else if ("-outValue".equals(args[i])) {
                outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class);
            } else if ("-totalOrder".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Set user-supplied (possibly default) job configs
    jobConf.setNumReduceTasks(num_reduces);

    jobConf.setInputFormat(inputFormatClass);
    jobConf.setOutputFormat(outputFormatClass);

    jobConf.setOutputKeyClass(outputKeyClass);
    jobConf.setOutputValueClass(outputValueClass);

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(jobConf, otherArgs.get(0));
    FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs.get(1)));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        jobConf.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        InputSampler.<K, V>writePartitionFile(jobConf, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, jobConf);
        DistributedCache.createSymlink(jobConf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}

From source file: com.alexholmes.hadooputils.sort.Sort.java

License: Apache License

/**
 * The driver for the sort MapReduce job.
 *
 * @param jobConf           sort configuration
 * @param numMapTasks       number of map tasks
 * @param numReduceTasks    number of reduce tasks
 * @param sampler           sampler, if required
 * @param codecClass        the compression codec for compressing final outputs
 * @param mapCodecClass     the compression codec for compressing intermediary map outputs
 * @param createLzopIndexes whether or not a MR job should be launched to create LZOP indexes
 *                          for the job output files
 * @param inputDirAsString  input directory in CSV-form
 * @param outputDirAsString output directory
 * @return true if the job completed successfully
 * @throws IOException        if something went wrong
 * @throws URISyntaxException if a URI wasn't correctly formed
 */
public boolean runJob(final JobConf jobConf, final Integer numMapTasks, final Integer numReduceTasks,
        final InputSampler.Sampler<K, V> sampler, final Class<? extends CompressionCodec> codecClass,
        final Class<? extends CompressionCodec> mapCodecClass, final boolean createLzopIndexes,
        final String inputDirAsString, final String outputDirAsString) throws IOException, URISyntaxException {

    jobConf.setJarByClass(Sort.class);
    jobConf.setJobName("sorter");

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();

    if (numMapTasks != null) {
        jobConf.setNumMapTasks(numMapTasks);
    }
    if (numReduceTasks != null) {
        jobConf.setNumReduceTasks(numReduceTasks);
    } else {
        int numReduces = (int) (cluster.getMaxReduceTasks() * 0.9);
        String sortReduces = jobConf.get("test.sort.reduces_per_host");
        if (sortReduces != null) {
            numReduces = cluster.getTaskTrackers() * Integer.parseInt(sortReduces);
        }

        // Set user-supplied (possibly default) job configs
        jobConf.setNumReduceTasks(numReduces);
    }

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(SortReduce.class);

    jobConf.setInputFormat(SortInputFormat.class);

    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);

    if (mapCodecClass != null) {
        jobConf.setMapOutputCompressorClass(mapCodecClass);
    }

    if (codecClass != null) {
        jobConf.setBoolean("mapred.output.compress", true);
        jobConf.setClass("mapred.output.compression.codec", codecClass, CompressionCodec.class);
    }

    FileInputFormat.setInputPaths(jobConf, inputDirAsString);
    FileOutputFormat.setOutputPath(jobConf, new Path(outputDirAsString));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        jobConf.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];

        FileSystem fileSystem = FileSystem.get(jobConf);

        if (fileSystem.exists(inputDir) && fileSystem.isFile(inputDir)) {
            inputDir = inputDir.getParent();
        }
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        InputSampler.writePartitionFile(jobConf, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, jobConf);
        DistributedCache.createSymlink(jobConf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + jobConf.getNumReduceTasks() + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date endTime = new Date();
    System.out.println("Job ended: " + endTime);
    System.out.println("The job took "
            + TimeUnit.MILLISECONDS.toSeconds(endTime.getTime() - startTime.getTime()) + " seconds.");

    if (jobResult.isSuccessful()) {
        if (createLzopIndexes && codecClass != null && LzopCodec.class.equals(codecClass)) {
            new LzoIndexer(jobConf).index(new Path(outputDirAsString));
        }
        return true;
    }
    return false;
}

From source file: com.benchmark.mapred.Sort.java

License: Apache License

/**
 * The main driver for the sort program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there are communication problems with the
 *                     job tracker.
 */
public int run(String[] args) throws Exception {

    JobConf jobConf = new JobConf(getConf(), Sort.class);
    jobConf.setJobName("sorter");

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = jobConf.get("test.sort.reduces_per_host");
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }
    Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = BytesWritable.class;
    Class<? extends Writable> outputValueClass = BytesWritable.class;
    List<String> otherArgs = new ArrayList<String>();
    InputSampler.Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                jobConf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                num_reduces = Integer.parseInt(args[++i]);
            } else if ("-inFormat".equals(args[i])) {
                inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class);
            } else if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else if ("-outKey".equals(args[i])) {
                outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
            } else if ("-outValue".equals(args[i])) {
                outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class);
            } else if ("-totalOrder".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Set user-supplied (possibly default) job configs
    jobConf.setNumReduceTasks(num_reduces);

    jobConf.setInputFormat(inputFormatClass);
    jobConf.setOutputFormat(outputFormatClass);

    jobConf.setOutputKeyClass(outputKeyClass);
    jobConf.setOutputValueClass(outputValueClass);

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(jobConf, otherArgs.get(0));
    FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs.get(1)));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        jobConf.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        InputSampler.<K, V>writePartitionFile(jobConf, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, jobConf);
        DistributedCache.createSymlink(jobConf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}

From source file: com.cloudera.recordservice.hive.RecordServiceHiveInputFormat.java

License: Apache License

/**
 * Copied HiveInputFormat
 */
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    init(job);

    Path[] dirs = FileInputFormat.getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }
    JobConf newjob = new JobConf(job);
    List<InputSplit> result = new ArrayList<InputSplit>();

    List<Path> currentDirs = new ArrayList<Path>();
    Class<? extends InputFormat> currentInputFormatClass = null;
    TableDesc currentTable = null;
    TableScanOperator currentTableScan = null;

    // for each dir, get the InputFormat, and do getSplits.
    for (Path dir : dirs) {
        PartitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, dir);
        Class<? extends InputFormat> inputFormatClass = part.getInputFileFormatClass();
        TableDesc table = part.getTableDesc();
        TableScanOperator tableScan = null;

        List<String> aliases = mrwork_.getPathToAliases().get(dir.toUri().toString());

        // Make filter pushdown information available to getSplits.
        if ((aliases != null) && (aliases.size() == 1)) {
            Operator op = mrwork_.getAliasToWork().get(aliases.get(0));
            if ((op != null) && (op instanceof TableScanOperator)) {
                tableScan = (TableScanOperator) op;
                // push down projections.
                ColumnProjectionUtils.appendReadColumns(newjob, tableScan.getNeededColumnIDs(),
                        tableScan.getNeededColumns());
                // push down filters
                pushFilters(newjob, tableScan);
            }
        }

        if (!currentDirs.isEmpty() && inputFormatClass.equals(currentInputFormatClass)
                && table.equals(currentTable) && tableScan == currentTableScan) {
            currentDirs.add(dir);
            continue;
        }

        if (!currentDirs.isEmpty()) {
            LOG.info("Generating splits");
            addSplitsForGroup(currentDirs, currentTableScan, newjob,
                    getInputFormatFromCache(currentInputFormatClass, job), currentInputFormatClass,
                    currentDirs.size() * (numSplits / dirs.length), currentTable, result);
        }

        currentDirs.clear();
        currentDirs.add(dir);
        currentTableScan = tableScan;
        currentTable = table;
        currentInputFormatClass = inputFormatClass;
    }

    if (dirs.length != 0) {
        LOG.info("Generating splits");
        addSplitsForGroup(currentDirs, currentTableScan, newjob,
                getInputFormatFromCache(currentInputFormatClass, job), currentInputFormatClass,
                currentDirs.size() * (numSplits / dirs.length), currentTable, result);
    }

    LOG.info("number of splits " + result.size());
    return result.toArray(new HiveInputSplitShim[result.size()]);
}

From source file: com.cloudera.science.avro.streaming.AvroAsJSONInputFormat.java

License: Open Source License

private void loadSchemas(JobConf job) throws IOException {
    this.schemas = Lists.newArrayList();
    SchemaLoader loader = new SchemaLoader(job);
    String schemaLiteral = job.get(SCHEMA_LITERAL);
    if (schemaLiteral != null) {
        schemas.add(loader.loadLiteral(schemaLiteral));
        return;
    } else {
        String[] schemaUrls = job.getStrings(SCHEMA_URL);
        String[] typeNames = job.getStrings(SCHEMA_TYPE_NAME);
        if (schemaUrls != null) {
            for (String schemaUrl : schemaUrls) {
                schemas.add(loader.loadFromUrl(schemaUrl));
            }
        } else if (typeNames != null) {
            for (String typeName : typeNames) {
                schemas.add(loader.loadFromTypeName(typeName));
            }
        } else {
            throw new IllegalArgumentException("No schema information provided");
        }

        if (schemas.size() > 1) {
            // Need to track input paths
            Path[] inputs = FileInputFormat.getInputPaths(job);
            if (inputs.length != schemas.size()) {
                throw new IllegalArgumentException(String.format(
                        "Number of input paths (%d) does not match number of schemas specified (%d)",
                        inputs.length, schemas.size()));
            }
            this.inputPaths = new String[inputs.length];
            for (int i = 0; i < inputs.length; i++) {
                inputPaths[i] = inputs[i].toString();
            }
        }
    }
}

From source file: com.dappervision.hbase.mapred.TypedBytesTableInputFormat.java

License: Apache License

/**
 * Builds a TableRecordReader. If no TableRecordReader was provided, uses
 * the default.
 *
 * @see org.apache.hadoop.mapred.InputFormat#getRecordReader(InputSplit,
 *      JobConf, Reporter)
 */

public void configure(JobConf job) {
    Path[] tableNames = FileInputFormat.getInputPaths(job);
    String colArg = job.get(COLUMN_LIST);
    String[] colNames = colArg.split(" ");
    byte[][] m_cols = new byte[colNames.length][];
    for (int i = 0; i < m_cols.length; i++) {
        m_cols[i] = Base64.decodeBase64(Bytes.toBytes(colNames[i]));
    }
    setInputColumns(m_cols);
    if (job.get(ROW_FILTER_REGEX) != null) {
        LOG.info("Row Regex Filter[" + job.get(ROW_FILTER_REGEX) + "]");
        setRowFilter(new RowFilter(CompareFilter.CompareOp.EQUAL,
                new RegexStringComparator(job.get(ROW_FILTER_REGEX))));
    }
    if (job.get(START_ROW) != null) {
        LOG.info("Start Row[" + job.get(START_ROW) + "]");
        try {
            setStartRow(Base64.decodeBase64(job.get(START_ROW).getBytes("US-ASCII")));
        } catch (UnsupportedEncodingException e) {
            LOG.error("Start Row[" + job.get(START_ROW) + "] - Error");
        }
    }
    if (job.get(STOP_ROW) != null) {
        LOG.info("Stop Row[" + job.get(STOP_ROW) + "]");
        try {
            setStopRow(Base64.decodeBase64(job.get(STOP_ROW).getBytes("US-ASCII")));
        } catch (UnsupportedEncodingException e) {
            LOG.error("Stop Row[" + job.get(STOP_ROW) + "] - Error");
        }
    }
    try {
        setHTable(new HTable(HBaseConfiguration.create(job), tableNames[0].getName()));
    } catch (Exception e) {
        LOG.error(StringUtils.stringifyException(e));
    }
    if (job.get(VALUE_FORMAT) != null && job.get(VALUE_FORMAT).equalsIgnoreCase("singlevalue")) {
        LOG.info("Value Format[" + job.get(VALUE_FORMAT) + "]");
        super.setTableRecordReader(new TypedBytesTableRecordReaderSingleValue());
    } else {
        LOG.info("Value Format[familiescolumns]");
        super.setTableRecordReader(new TypedBytesTableRecordReader());
    }
}

From source file: com.github.dryangkun.hbase.tidx.hive.HiveHBaseTableSnapshotInputFormat.java

License: Apache License

@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    setColumns(job);

    // hive depends on FileSplits, so wrap in HBaseSplit
    Path[] tablePaths = FileInputFormat.getInputPaths(job);

    InputSplit[] results = delegate.getSplits(job, numSplits);
    for (int i = 0; i < results.length; i++) {
        results[i] = new HBaseSplit(results[i], tablePaths[0]);
    }

    return results;
}

From source file: com.linkedin.mapred.AvroUtils.java

License: Open Source License

/**
 * Obtains the Avro input schema from the job's input data.
 * @param conf the job configuration holding the input paths
 * @return the Avro schema read from the first input path
 * @throws IOException if the input cannot be read
 */
public static Schema getAvroInputSchema(JobConf conf) throws IOException {
    Path[] paths = FileInputFormat.getInputPaths(conf);
    if (paths == null) {
        throw new IllegalStateException("input paths do not exist in jobConf!");
    }
    Schema inputSchema = AvroUtils.getSchemaFromFile(conf, paths[0]);
    if (inputSchema == null) {
        throw new IllegalStateException("Input does not have schema info and/or input is missing.");
    }
    return inputSchema;
}