Usage examples for org.apache.hadoop.mapred.FileInputFormat.getInputPaths
public static Path[] getInputPaths(JobConf conf)
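getInputPaths returns the array of input Paths currently registered on a JobConf, i.e. whatever was previously configured with FileInputFormat.setInputPaths or addInputPath. Before the project examples below, here is a minimal, self-contained sketch of that round trip; it is not taken from any of the sources listed here, and the class name and /data/... paths are hypothetical.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;

public class GetInputPathsSketch {
    public static void main(String[] args) {
        // Register two input paths, then add a third (paths are hypothetical).
        JobConf conf = new JobConf();
        FileInputFormat.setInputPaths(conf, new Path("/data/in1"), new Path("/data/in2"));
        FileInputFormat.addInputPath(conf, new Path("/data/in3"));

        // getInputPaths returns every path registered above.
        for (Path p : FileInputFormat.getInputPaths(conf)) {
            System.out.println(p);
        }
    }
}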
From source file:cascading.tap.hadoop.ZipInputFormat.java
License:Open Source License
protected Path[] listPathsInternal(JobConf jobConf) throws IOException {
    Path[] dirs = FileInputFormat.getInputPaths(jobConf);

    if (dirs.length == 0)
        throw new IOException("no input paths specified in job");

    for (Path dir : dirs) {
        FileSystem fs = dir.getFileSystem(jobConf);

        if (!fs.isFile(dir))
            throw new IOException("does not support directories: " + dir);
    }

    return dirs;
}
From source file:cascading.tap.Hfs.java
License:Open Source License
@Override
public void sourceInit(JobConf conf) throws IOException {
    Path qualifiedPath = getQualifiedPath(conf);

    for (Path exitingPath : FileInputFormat.getInputPaths(conf)) {
        if (exitingPath.equals(qualifiedPath))
            throw new TapException("may not add duplicate paths, found: " + exitingPath);
    }

    FileInputFormat.addInputPath(conf, qualifiedPath);

    super.sourceInit(conf);

    makeLocal(conf, qualifiedPath, "forcing job to local mode, via source: ");

    TupleSerialization.setSerializations(conf); // allows Hfs to be used independent of Flow
}
From source file:cn.edu.xmu.dm.mapreduce.Sort.java
License:Apache License
/**
 * The main driver for sort program. Invoke this method to submit the
 * map/reduce job.
 *
 * @throws IOException
 *             When there is communication problems with the job tracker.
 */
public int run(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, "Sorter");
    job.setJarByClass(Sort.class);

    JobConf jobConf = new JobConf(getConf(), Sort.class);
    jobConf.setJobName("sorter");

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = jobConf.get("test.sort.reduces_per_host");
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }
    Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = BytesWritable.class;
    Class<? extends Writable> outputValueClass = BytesWritable.class;
    List<String> otherArgs = new ArrayList<String>();
    InputSampler.Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                jobConf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                num_reduces = Integer.parseInt(args[++i]);
            } else if ("-inFormat".equals(args[i])) {
                inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class);
            } else if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else if ("-outKey".equals(args[i])) {
                outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
            } else if ("-outValue".equals(args[i])) {
                outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class);
            } else if ("-totalOrder".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Set user-supplied (possibly default) job configs
    jobConf.setNumReduceTasks(num_reduces);

    jobConf.setInputFormat(inputFormatClass);
    jobConf.setOutputFormat(outputFormatClass);

    jobConf.setOutputKeyClass(outputKeyClass);
    jobConf.setOutputValueClass(outputValueClass);

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(jobConf, otherArgs.get(0));
    FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs.get(1)));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        jobConf.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        InputSampler.<K, V>writePartitionFile(jobConf, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, jobConf);
        DistributedCache.createSymlink(jobConf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took "
            + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}
From source file:com.alexholmes.hadooputils.sort.Sort.java
License:Apache License
/**
 * The driver for the sort MapReduce job.
 *
 * @param jobConf           sort configuration
 * @param numMapTasks       number of map tasks
 * @param numReduceTasks    number of reduce tasks
 * @param sampler           sampler, if required
 * @param codecClass        the compression codec for compressing final outputs
 * @param mapCodecClass     the compression codec for compressing intermediary map outputs
 * @param createLzopIndexes whether or not a MR job should be launched to create LZOP indexes
 *                          for the job output files
 * @param inputDirAsString  input directory in CSV-form
 * @param outputDirAsString output directory
 * @return true if the job completed successfully
 * @throws IOException        if something went wrong
 * @throws URISyntaxException if a URI wasn't correctly formed
 */
public boolean runJob(final JobConf jobConf, final Integer numMapTasks, final Integer numReduceTasks,
        final InputSampler.Sampler<K, V> sampler, final Class<? extends CompressionCodec> codecClass,
        final Class<? extends CompressionCodec> mapCodecClass, final boolean createLzopIndexes,
        final String inputDirAsString, final String outputDirAsString) throws IOException, URISyntaxException {

    jobConf.setJarByClass(Sort.class);
    jobConf.setJobName("sorter");

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();

    if (numMapTasks != null) {
        jobConf.setNumMapTasks(numMapTasks);
    }
    if (numReduceTasks != null) {
        jobConf.setNumReduceTasks(numReduceTasks);
    } else {
        int numReduces = (int) (cluster.getMaxReduceTasks() * 0.9);
        String sortReduces = jobConf.get("test.sort.reduces_per_host");
        if (sortReduces != null) {
            numReduces = cluster.getTaskTrackers() * Integer.parseInt(sortReduces);
        }

        // Set user-supplied (possibly default) job configs
        jobConf.setNumReduceTasks(numReduces);
    }

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(SortReduce.class);

    jobConf.setInputFormat(SortInputFormat.class);

    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);

    if (mapCodecClass != null) {
        jobConf.setMapOutputCompressorClass(mapCodecClass);
    }

    if (codecClass != null) {
        jobConf.setBoolean("mapred.output.compress", true);
        jobConf.setClass("mapred.output.compression.codec", codecClass, CompressionCodec.class);
    }

    FileInputFormat.setInputPaths(jobConf, inputDirAsString);
    FileOutputFormat.setOutputPath(jobConf, new Path(outputDirAsString));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        jobConf.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];

        FileSystem fileSystem = FileSystem.get(jobConf);

        if (fileSystem.exists(inputDir) && fileSystem.isFile(inputDir)) {
            inputDir = inputDir.getParent();
        }
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        InputSampler.writePartitionFile(jobConf, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, jobConf);
        DistributedCache.createSymlink(jobConf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + jobConf.getNumReduceTasks() + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date endTime = new Date();
    System.out.println("Job ended: " + endTime);
    System.out.println("The job took "
            + TimeUnit.MILLISECONDS.toSeconds(endTime.getTime() - startTime.getTime()) + " seconds.");

    if (jobResult.isSuccessful()) {
        if (createLzopIndexes && codecClass != null && LzopCodec.class.equals(codecClass)) {
            new LzoIndexer(jobConf).index(new Path(outputDirAsString));
        }
        return true;
    }
    return false;
}
From source file:com.benchmark.mapred.Sort.java
License:Apache License
/**
 * The main driver for sort program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there is communication problems with the
 *                     job tracker.
 */
public int run(String[] args) throws Exception {
    JobConf jobConf = new JobConf(getConf(), Sort.class);
    jobConf.setJobName("sorter");

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = jobConf.get("test.sort.reduces_per_host");
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }
    Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = BytesWritable.class;
    Class<? extends Writable> outputValueClass = BytesWritable.class;
    List<String> otherArgs = new ArrayList<String>();
    InputSampler.Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                jobConf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                num_reduces = Integer.parseInt(args[++i]);
            } else if ("-inFormat".equals(args[i])) {
                inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class);
            } else if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else if ("-outKey".equals(args[i])) {
                outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
            } else if ("-outValue".equals(args[i])) {
                outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class);
            } else if ("-totalOrder".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Set user-supplied (possibly default) job configs
    jobConf.setNumReduceTasks(num_reduces);

    jobConf.setInputFormat(inputFormatClass);
    jobConf.setOutputFormat(outputFormatClass);

    jobConf.setOutputKeyClass(outputKeyClass);
    jobConf.setOutputValueClass(outputValueClass);

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(jobConf, otherArgs.get(0));
    FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs.get(1)));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        jobConf.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        InputSampler.<K, V>writePartitionFile(jobConf, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, jobConf);
        DistributedCache.createSymlink(jobConf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took "
            + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}
From source file:com.cloudera.recordservice.hive.RecordServiceHiveInputFormat.java
License:Apache License
/**
 * Copied HiveInputFormat
 */
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    init(job);

    Path[] dirs = FileInputFormat.getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }
    JobConf newjob = new JobConf(job);
    List<InputSplit> result = new ArrayList<InputSplit>();

    List<Path> currentDirs = new ArrayList<Path>();
    Class<? extends InputFormat> currentInputFormatClass = null;
    TableDesc currentTable = null;
    TableScanOperator currentTableScan = null;

    // for each dir, get the InputFormat, and do getSplits.
    for (Path dir : dirs) {
        PartitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, dir);
        Class<? extends InputFormat> inputFormatClass = part.getInputFileFormatClass();
        TableDesc table = part.getTableDesc();
        TableScanOperator tableScan = null;

        List<String> aliases = mrwork_.getPathToAliases().get(dir.toUri().toString());

        // Make filter pushdown information available to getSplits.
        if ((aliases != null) && (aliases.size() == 1)) {
            Operator op = mrwork_.getAliasToWork().get(aliases.get(0));
            if ((op != null) && (op instanceof TableScanOperator)) {
                tableScan = (TableScanOperator) op;
                // push down projections.
                ColumnProjectionUtils.appendReadColumns(newjob, tableScan.getNeededColumnIDs(),
                        tableScan.getNeededColumns());
                // push down filters
                pushFilters(newjob, tableScan);
            }
        }

        if (!currentDirs.isEmpty() && inputFormatClass.equals(currentInputFormatClass)
                && table.equals(currentTable) && tableScan == currentTableScan) {
            currentDirs.add(dir);
            continue;
        }

        if (!currentDirs.isEmpty()) {
            LOG.info("Generating splits");
            addSplitsForGroup(currentDirs, currentTableScan, newjob,
                    getInputFormatFromCache(currentInputFormatClass, job), currentInputFormatClass,
                    currentDirs.size() * (numSplits / dirs.length), currentTable, result);
        }

        currentDirs.clear();
        currentDirs.add(dir);
        currentTableScan = tableScan;
        currentTable = table;
        currentInputFormatClass = inputFormatClass;
    }

    if (dirs.length != 0) {
        LOG.info("Generating splits");
        addSplitsForGroup(currentDirs, currentTableScan, newjob,
                getInputFormatFromCache(currentInputFormatClass, job), currentInputFormatClass,
                currentDirs.size() * (numSplits / dirs.length), currentTable, result);
    }

    LOG.info("number of splits " + result.size());
    return result.toArray(new HiveInputSplitShim[result.size()]);
}
From source file:com.cloudera.science.avro.streaming.AvroAsJSONInputFormat.java
License:Open Source License
private void loadSchemas(JobConf job) throws IOException {
    this.schemas = Lists.newArrayList();
    SchemaLoader loader = new SchemaLoader(job);
    String schemaLiteral = job.get(SCHEMA_LITERAL);
    if (schemaLiteral != null) {
        schemas.add(loader.loadLiteral(schemaLiteral));
        return;
    } else {
        String[] schemaUrls = job.getStrings(SCHEMA_URL);
        String[] typeNames = job.getStrings(SCHEMA_TYPE_NAME);
        if (schemaUrls != null) {
            for (String schemaUrl : schemaUrls) {
                schemas.add(loader.loadFromUrl(schemaUrl));
            }
        } else if (typeNames != null) {
            for (String typeName : typeNames) {
                schemas.add(loader.loadFromTypeName(typeName));
            }
        } else {
            throw new IllegalArgumentException("No schema information provided");
        }

        if (schemas.size() > 1) {
            // Need to track input paths
            Path[] inputs = FileInputFormat.getInputPaths(job);
            if (inputs.length != schemas.size()) {
                throw new IllegalArgumentException(String.format(
                        "Number of input paths (%d) does not match number of schemas specified (%d)",
                        inputs.length, schemas.size()));
            }
            this.inputPaths = new String[inputs.length];
            for (int i = 0; i < inputs.length; i++) {
                inputPaths[i] = inputs[i].toString();
            }
        }
    }
}
From source file:com.dappervision.hbase.mapred.TypedBytesTableInputFormat.java
License:Apache License
/**
 * Builds a TableRecordReader. If no TableRecordReader was provided, uses
 * the default.
 *
 * @see org.apache.hadoop.mapred.InputFormat#getRecordReader(InputSplit,
 *      JobConf, Reporter)
 */
public void configure(JobConf job) {
    Path[] tableNames = FileInputFormat.getInputPaths(job);
    String colArg = job.get(COLUMN_LIST);
    String[] colNames = colArg.split(" ");
    byte[][] m_cols = new byte[colNames.length][];
    for (int i = 0; i < m_cols.length; i++) {
        m_cols[i] = Base64.decodeBase64(Bytes.toBytes(colNames[i]));
    }
    setInputColumns(m_cols);

    if (job.get(ROW_FILTER_REGEX) != null) {
        LOG.info("Row Regex Filter[" + job.get(ROW_FILTER_REGEX) + "]");
        setRowFilter(new RowFilter(CompareFilter.CompareOp.EQUAL,
                new RegexStringComparator(job.get(ROW_FILTER_REGEX))));
    }

    if (job.get(START_ROW) != null) {
        LOG.info("Start Row[" + job.get(START_ROW) + "]");
        try {
            setStartRow(Base64.decodeBase64(job.get(START_ROW).getBytes("US-ASCII")));
        } catch (UnsupportedEncodingException e) {
            LOG.error("Start Row[" + job.get(START_ROW) + "] - Error");
        }
    }

    if (job.get(STOP_ROW) != null) {
        LOG.info("Stop Row[" + job.get(STOP_ROW) + "]");
        try {
            setStopRow(Base64.decodeBase64(job.get(STOP_ROW).getBytes("US-ASCII")));
        } catch (UnsupportedEncodingException e) {
            LOG.error("Stop Row[" + job.get(STOP_ROW) + "] - Error");
        }
    }

    try {
        setHTable(new HTable(HBaseConfiguration.create(job), tableNames[0].getName()));
    } catch (Exception e) {
        LOG.error(StringUtils.stringifyException(e));
    }

    if (job.get(VALUE_FORMAT) != null && job.get(VALUE_FORMAT).equalsIgnoreCase("singlevalue")) {
        LOG.info("Value Format[" + job.get(VALUE_FORMAT) + "]");
        super.setTableRecordReader(new TypedBytesTableRecordReaderSingleValue());
    } else {
        LOG.info("Value Format[familiescolumns]");
        super.setTableRecordReader(new TypedBytesTableRecordReader());
    }
}
From source file:com.github.dryangkun.hbase.tidx.hive.HiveHBaseTableSnapshotInputFormat.java
License:Apache License
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    setColumns(job);

    // hive depends on FileSplits, so wrap in HBaseSplit
    Path[] tablePaths = FileInputFormat.getInputPaths(job);

    InputSplit[] results = delegate.getSplits(job, numSplits);
    for (int i = 0; i < results.length; i++) {
        results[i] = new HBaseSplit(results[i], tablePaths[0]);
    }
    return results;
}
From source file:com.linkedin.mapred.AvroUtils.java
License:Open Source License
/**
 * Obtain the avro input schema from data
 * @param conf
 * @return
 * @throws IOException
 */
public static Schema getAvroInputSchema(JobConf conf) throws IOException {
    Path[] paths = FileInputFormat.getInputPaths(conf);
    if (paths == null) {
        throw new IllegalStateException("input paths do not exist in jobConf!");
    }
    Schema inputSchema = AvroUtils.getSchemaFromFile(conf, paths[0]);
    if (inputSchema == null) {
        throw new IllegalStateException("Input does not have schema info and/or input is missing.");
    }
    return inputSchema;
}