List of usage examples for org.apache.hadoop.mapred.FileInputFormat#getInputPaths
public static Path[] getInputPaths(JobConf conf)
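Before the source-file examples, a minimal sketch (assumed, not taken from any of the sources below) of the round trip between setInputPaths and getInputPaths; the paths and job name are hypothetical placeholders:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;

public class GetInputPathsSketch {
    public static void main(String[] args) {
        JobConf conf = new JobConf();
        conf.setJobName("get-input-paths-sketch"); // hypothetical job name

        // Register two (hypothetical) input directories on the JobConf ...
        FileInputFormat.setInputPaths(conf, new Path("/data/in1"), new Path("/data/in2"));

        // ... and read them back; getInputPaths returns the registered paths,
        // or an empty array if none were set.
        for (Path path : FileInputFormat.getInputPaths(conf)) {
            System.out.println(path);
        }
    }
}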
From source file: HiveKeyIgnoringBAMOutputFormat.java
License: Open Source License
private void setSAMHeaderFrom(JobConf job) throws IOException {
    if (wrappedOutputFormat.getSAMHeader() != null)
        return;

    // XXX: We're not told where to take the SAM header from so we just merge
    // them all. There should probably be a better way of doing this.
    final List<SAMFileHeader> headers = new ArrayList<SAMFileHeader>();

    // The "best" sort order among the headers: unsorted if they're sorted
    // differently, otherwise their common sort order.
    SAMFileHeader.SortOrder sortOrder = null;

    // XXX: it seems that FileInputFormat.getInputPaths(job) will point to
    // the directories of the input tables in the query. I'm not sure if this
    // is always the case.
    for (final Path table : FileInputFormat.getInputPaths(job)) {
        final FileSystem fs = table.getFileSystem(job);
        for (final FileStatus stat : fs.listStatus(table)) {
            if (!stat.isFile())
                throw new IOException("Unexpected directory '" + stat.getPath() + "', expected only files");

            final SAMFileReader r = new SAMFileReader(fs.open(stat.getPath()));
            final SAMFileHeader h = r.getFileHeader();
            r.close();
            headers.add(h);

            if (sortOrder == null) {
                sortOrder = h.getSortOrder();
                continue;
            }
            if (sortOrder == SAMFileHeader.SortOrder.unsorted)
                continue;
            if (sortOrder != h.getSortOrder())
                sortOrder = SAMFileHeader.SortOrder.unsorted;
        }
    }

    wrappedOutputFormat.setSAMHeader(new SamFileHeaderMerger(sortOrder, headers, true).getMergedHeader());
}
From source file: boa.datagen.SeqSort.java
License: Apache License
/**
 * The main driver for the sort program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there are communication problems with the
 * job tracker.
 */
@Override
public int run(String[] args) throws Exception {
    System.out.println(inPath);
    JobConf jobConf = new JobConf(getConf(), SeqSort.class);
    jobConf.setJobName("sorter");

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = jobConf.get("test.sort.reduces_per_host");
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }

    // Set user-supplied (possibly default) job configs
    jobConf.setNumReduceTasks(num_reduces);

    jobConf.setInputFormat(SequenceFileInputFormat.class);
    jobConf.setOutputFormat(SequenceFileOutputFormat.class);

    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(BytesWritable.class);

    SequenceFileOutputFormat.setCompressOutput(jobConf, true);
    SequenceFileOutputFormat.setOutputCompressorClass(jobConf, SnappyCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(jobConf, CompressionType.BLOCK);

    FileInputFormat.setInputPaths(jobConf, inPath);
    FileOutputFormat.setOutputPath(jobConf, new Path(outPath));

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}
From source file: cascading.flow.hadoop.MapReduceFlow.java
License: Open Source License
protected Map<String, Tap> createSources(JobConf jobConf) {
    Path[] paths = FileInputFormat.getInputPaths(jobConf);

    if (paths.length == 0) {
        try {
            paths = org.apache.hadoop.mapreduce.lib.input.FileInputFormat.getInputPaths(new Job(jobConf));
        } catch (IOException exception) {
            throw new CascadingException(exception);
        }
    }

    Map<String, Tap> taps = new HashMap<String, Tap>();

    for (Path path : paths)
        taps.put(path.toString(), new Hfs(new NullScheme(), path.toString()));

    return taps;
}
From source file: cascading.flow.MapReduceFlow.java
License: Open Source License
private Map<String, Tap> createSources(JobConf jobConf) {
    Path[] paths = FileInputFormat.getInputPaths(jobConf);

    Map<String, Tap> taps = new HashMap<String, Tap>();

    for (Path path : paths)
        taps.put(path.toString(), new Hfs(new NullScheme(), path.toString()));

    return taps;
}
From source file: cascading.hbase.helper.TableInputFormat.java
License: Apache License
public void validateInput(JobConf job) throws IOException {
    // expecting exactly one path
    Path[] tableNames = FileInputFormat.getInputPaths(job);
    if (tableNames == null || tableNames.length > 1) {
        throw new IOException("expecting one table name");
    }

    // connected to table?
    if (getHTable() == null) {
        throw new IOException("could not connect to table '" + tableNames[0].getName() + "'");
    }

    // expecting at least one column
    String colArg = job.get(COLUMN_LIST);
    if (colArg == null || colArg.length() == 0) {
        throw new IOException("expecting at least one column");
    }
}
From source file: cascading.scheme.hadoop.TextLine.java
License: Open Source License
@Override
public void sourceConfInit(FlowProcess<? extends Configuration> flowProcess,
        Tap<Configuration, RecordReader, OutputCollector> tap, Configuration conf) {
    if (hasZippedFiles(FileInputFormat.getInputPaths(asJobConfInstance(conf))))
        throw new IllegalStateException(
                "cannot read zip files: " + Arrays.toString(FileInputFormat.getInputPaths(asJobConfInstance(conf))));

    conf.setBoolean("mapred.mapper.new-api", false);
    conf.setClass("mapred.input.format.class", TextInputFormat.class, InputFormat.class);
}
From source file: cascading.scheme.TextLine.java
License: Open Source License
@Override
public void sourceInit(Tap tap, JobConf conf) {
    if (hasZippedFiles(FileInputFormat.getInputPaths(conf)))
        conf.setInputFormat(ZipInputFormat.class);
    else
        conf.setInputFormat(TextInputFormat.class);
}
From source file: cascading.tap.hadoop.Hfs.java
License: Open Source License
protected static void verifyNoDuplicates(Configuration conf) {
    Path[] inputPaths = FileInputFormat.getInputPaths(HadoopUtil.asJobConfInstance(conf));
    Set<Path> paths = new HashSet<Path>((int) (inputPaths.length / .75f));

    for (Path inputPath : inputPaths) {
        if (!paths.add(inputPath))
            throw new TapException("may not add duplicate paths, found: " + inputPath);
    }
}
From source file: cascading.tap.hadoop.io.MultiInputFormat.java
License: Open Source License
/**
 * Used to set the current JobConf with all sub-job configurations.
 *
 * @param toJob
 * @param fromJobs
 */
public static void addInputFormat(JobConf toJob, JobConf... fromJobs) {
    toJob.setInputFormat(MultiInputFormat.class);
    List<Map<String, String>> configs = new ArrayList<Map<String, String>>();
    List<Path> allPaths = new ArrayList<Path>();

    boolean isLocal = false;

    for (JobConf fromJob : fromJobs) {
        if (fromJob.get("mapred.input.format.class") == null)
            throw new CascadingException(
                    "mapred.input.format.class is required, should be set in source Scheme#sourceConfInit");

        configs.add(HadoopUtil.getConfig(toJob, fromJob));
        Collections.addAll(allPaths, FileInputFormat.getInputPaths(fromJob));

        if (!isLocal)
            isLocal = HadoopUtil.isLocal(fromJob);
    }

    if (!allPaths.isEmpty()) // it's possible there aren't any
        FileInputFormat.setInputPaths(toJob, (Path[]) allPaths.toArray(new Path[allPaths.size()]));

    try {
        toJob.set("cascading.multiinputformats", HadoopUtil.serializeBase64(configs, toJob, true));
    } catch (IOException exception) {
        throw new CascadingException("unable to pack input formats", exception);
    }

    if (isLocal)
        HadoopUtil.setLocal(toJob);
}
From source file: cascading.tap.hadoop.MultiInputFormat.java
License: Open Source License
/**
 * Used to set the current JobConf with all sub-job configurations.
 *
 * @param toJob
 * @param fromJobs
 */
public static void addInputFormat(JobConf toJob, JobConf... fromJobs) {
    toJob.setInputFormat(MultiInputFormat.class);
    List<Map<String, String>> configs = new ArrayList<Map<String, String>>();
    List<Path> allPaths = new ArrayList<Path>();

    boolean isLocal = false;

    for (JobConf fromJob : fromJobs) {
        configs.add(getConfig(toJob, fromJob));
        Collections.addAll(allPaths, FileInputFormat.getInputPaths(fromJob));

        if (!isLocal)
            isLocal = fromJob.get("mapred.job.tracker").equalsIgnoreCase("local");
    }

    FileInputFormat.setInputPaths(toJob, (Path[]) allPaths.toArray(new Path[allPaths.size()]));

    try {
        toJob.set("cascading.multiinputformats", Util.serializeBase64(configs));
    } catch (IOException exception) {
        throw new CascadingException("unable to pack input formats", exception);
    }

    if (isLocal)
        toJob.set("mapred.job.tracker", "local");
}