Example usage for org.apache.hadoop.mapred FileInputFormat getInputPaths

Introduction

This page collects usage examples for org.apache.hadoop.mapred.FileInputFormat.getInputPaths from open source projects.

Prototype

public static Path[] getInputPaths(JobConf conf) 

Document

Get the list of input Paths for the map-reduce job.
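
Before the project examples below, here is a minimal, self-contained sketch of how getInputPaths pairs with setInputPaths. The class name and the /tmp paths are placeholders chosen for illustration, not taken from any project on this page.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;

public class GetInputPathsDemo {
    public static void main(String[] args) {
        JobConf conf = new JobConf();

        // Register two input directories on the job configuration.
        FileInputFormat.setInputPaths(conf, new Path("/tmp/in1"), new Path("/tmp/in2"));

        // getInputPaths returns every path registered above.
        for (Path path : FileInputFormat.getInputPaths(conf))
            System.out.println(path);
    }
}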

Usage

From source file:HiveKeyIgnoringBAMOutputFormat.java

License:Open Source License
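
This example resolves the job's input paths with FileInputFormat.getInputPaths, reads the SAM header of every file beneath them, and merges the headers into one, falling back to an unsorted sort order when the headers disagree.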

private void setSAMHeaderFrom(JobConf job) throws IOException {
    if (wrappedOutputFormat.getSAMHeader() != null)
        return;

    // XXX: We're not told where to take the SAM header from so we just merge
    // them all. There should probably be a better way of doing this.

    final List<SAMFileHeader> headers = new ArrayList<SAMFileHeader>();

    // The "best" sort order among the headers: unsorted if they're sorted
    // differently, otherwise their common sort order.
    SAMFileHeader.SortOrder sortOrder = null;

    // XXX: it seems that FileInputFormat.getInputPaths(job) will point to
    // the directories of the input tables in the query. I'm not sure if this
    // is always the case.
    for (final Path table : FileInputFormat.getInputPaths(job)) {
        final FileSystem fs = table.getFileSystem(job);
        for (final FileStatus stat : fs.listStatus(table)) {
            if (!stat.isFile())
                throw new IOException("Unexpected directory '" + stat.getPath() + "', expected only files");

            final SAMFileReader r = new SAMFileReader(fs.open(stat.getPath()));
            final SAMFileHeader h = r.getFileHeader();
            r.close();
            headers.add(h);

            if (sortOrder == null) {
                sortOrder = h.getSortOrder();
                continue;
            }
            if (sortOrder == SAMFileHeader.SortOrder.unsorted)
                continue;
            if (sortOrder != h.getSortOrder())
                sortOrder = SAMFileHeader.SortOrder.unsorted;
        }
    }

    wrappedOutputFormat.setSAMHeader(new SamFileHeaderMerger(sortOrder, headers, true).getMergedHeader());
}

From source file:boa.datagen.SeqSort.java

License:Apache License
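
In this driver, getInputPaths is used only for reporting: it echoes the input the identity-map/identity-reduce sort job will read before the job is submitted.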

/**
 * The main driver for the sort program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there are communication problems with the
 *                     job tracker.
 */
@Override
public int run(String[] args) throws Exception {
    System.out.println(inPath);

    JobConf jobConf = new JobConf(getConf(), SeqSort.class);
    jobConf.setJobName("sorter");

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = jobConf.get("test.sort.reduces_per_host");
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }

    // Set user-supplied (possibly default) job configs
    jobConf.setNumReduceTasks(num_reduces);

    jobConf.setInputFormat(SequenceFileInputFormat.class);
    jobConf.setOutputFormat(SequenceFileOutputFormat.class);

    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(BytesWritable.class);

    SequenceFileOutputFormat.setCompressOutput(jobConf, true);
    SequenceFileOutputFormat.setOutputCompressorClass(jobConf, SnappyCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(jobConf, CompressionType.BLOCK);

    // Make sure there are exactly 2 parameters left.
    FileInputFormat.setInputPaths(jobConf, inPath);
    FileOutputFormat.setOutputPath(jobConf, new Path(outPath));

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}

From source file:cascading.flow.hadoop.MapReduceFlow.java

License:Open Source License
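
This method converts each input path of an existing JobConf into a Cascading source Tap, falling back to the new-API FileInputFormat when the old-API lookup returns no paths.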

protected Map<String, Tap> createSources(JobConf jobConf) {
    Path[] paths = FileInputFormat.getInputPaths(jobConf);

    if (paths.length == 0) {
        try {
            paths = org.apache.hadoop.mapreduce.lib.input.FileInputFormat.getInputPaths(new Job(jobConf));
        } catch (IOException exception) {
            throw new CascadingException(exception);
        }
    }

    Map<String, Tap> taps = new HashMap<String, Tap>();

    for (Path path : paths)
        taps.put(path.toString(), new Hfs(new NullScheme(), path.toString()));

    return taps;
}

From source file:cascading.flow.MapReduceFlow.java

License:Open Source License
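
An earlier variant of the method above: each input path is mapped directly to an Hfs tap, with no new-API fallback.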

private Map<String, Tap> createSources(JobConf jobConf) {
    Path[] paths = FileInputFormat.getInputPaths(jobConf);

    Map<String, Tap> taps = new HashMap<String, Tap>();

    for (Path path : paths)
        taps.put(path.toString(), new Hfs(new NullScheme(), path.toString()));

    return taps;
}

From source file:cascading.hbase.helper.TableInputFormat.java

License:Apache License
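
Here the single input path carries an HBase table name, so validation requires exactly one path, a live table connection, and at least one requested column.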

public void validateInput(JobConf job) throws IOException {
    // expecting exactly one path
    Path[] tableNames = FileInputFormat.getInputPaths(job);
    if (tableNames == null || tableNames.length != 1) {
        throw new IOException("expecting one table name");
    }

    // connected to table?
    if (getHTable() == null) {
        throw new IOException("could not connect to table '" + tableNames[0].getName() + "'");
    }

    // expecting at least one column
    String colArg = job.get(COLUMN_LIST);
    if (colArg == null || colArg.length() == 0) {
        throw new IOException("expecting at least one column");
    }
}

From source file:cascading.scheme.hadoop.TextLine.java

License:Open Source License
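
This scheme inspects the configured input paths up front and refuses to read zip files.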

@Override
public void sourceConfInit(FlowProcess<? extends Configuration> flowProcess,
        Tap<Configuration, RecordReader, OutputCollector> tap, Configuration conf) {
    if (hasZippedFiles(FileInputFormat.getInputPaths(asJobConfInstance(conf))))
        throw new IllegalStateException("cannot read zip files: "
                + Arrays.toString(FileInputFormat.getInputPaths(asJobConfInstance(conf))));

    conf.setBoolean("mapred.mapper.new-api", false);
    conf.setClass("mapred.input.format.class", TextInputFormat.class, InputFormat.class);
}

From source file:cascading.scheme.TextLine.java

License:Open Source License
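
An older variant of the scheme above: it switches to ZipInputFormat when the input paths contain zipped files, and TextInputFormat otherwise.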

@Override
public void sourceInit(Tap tap, JobConf conf) {
    if (hasZippedFiles(FileInputFormat.getInputPaths(conf)))
        conf.setInputFormat(ZipInputFormat.class);
    else
        conf.setInputFormat(TextInputFormat.class);
}

From source file:cascading.tap.hadoop.Hfs.java

License:Open Source License
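
This guard collects the configured input paths into a set and rejects the job if any path is registered twice.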

protected static void verifyNoDuplicates(Configuration conf) {
    Path[] inputPaths = FileInputFormat.getInputPaths(HadoopUtil.asJobConfInstance(conf));
    Set<Path> paths = new HashSet<Path>((int) (inputPaths.length / .75f));

    for (Path inputPath : inputPaths) {
        if (!paths.add(inputPath))
            throw new TapException("may not add duplicate paths, found: " + inputPath);
    }
}

From source file:cascading.tap.hadoop.io.MultiInputFormat.java

License:Open Source License
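
This helper merges several sub-JobConfs into one job: it accumulates their input paths onto the target JobConf and serializes the per-job configurations for MultiInputFormat to replay at runtime.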

/**
 * Used to set the current JobConf with all sub-job configurations.
 *
 * @param toJob
 * @param fromJobs
 */
public static void addInputFormat(JobConf toJob, JobConf... fromJobs) {
    toJob.setInputFormat(MultiInputFormat.class);
    List<Map<String, String>> configs = new ArrayList<Map<String, String>>();
    List<Path> allPaths = new ArrayList<Path>();

    boolean isLocal = false;

    for (JobConf fromJob : fromJobs) {
        if (fromJob.get("mapred.input.format.class") == null)
            throw new CascadingException(
                    "mapred.input.format.class is required, should be set in source Scheme#sourceConfInit");

        configs.add(HadoopUtil.getConfig(toJob, fromJob));
        Collections.addAll(allPaths, FileInputFormat.getInputPaths(fromJob));

        if (!isLocal)
            isLocal = HadoopUtil.isLocal(fromJob);
    }

    if (!allPaths.isEmpty()) // it's possible there aren't any
        FileInputFormat.setInputPaths(toJob, (Path[]) allPaths.toArray(new Path[allPaths.size()]));

    try {
        toJob.set("cascading.multiinputformats", HadoopUtil.serializeBase64(configs, toJob, true));
    } catch (IOException exception) {
        throw new CascadingException("unable to pack input formats", exception);
    }

    if (isLocal)
        HadoopUtil.setLocal(toJob);
}

From source file:cascading.tap.hadoop.MultiInputFormat.java

License:Open Source License
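
An earlier version of the helper above; it reads the mapred.job.tracker property directly to detect local mode.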

/**
 * Used to set the current JobConf with all sub-job configurations.
 *
 * @param toJob
 * @param fromJobs
 */
public static void addInputFormat(JobConf toJob, JobConf... fromJobs) {
    toJob.setInputFormat(MultiInputFormat.class);
    List<Map<String, String>> configs = new ArrayList<Map<String, String>>();
    List<Path> allPaths = new ArrayList<Path>();

    boolean isLocal = false;

    for (JobConf fromJob : fromJobs) {
        configs.add(getConfig(toJob, fromJob));
        Collections.addAll(allPaths, FileInputFormat.getInputPaths(fromJob));

        if (!isLocal)
            // Null-safe comparison: the property may be unset.
            isLocal = "local".equalsIgnoreCase(fromJob.get("mapred.job.tracker"));
    }

    FileInputFormat.setInputPaths(toJob, (Path[]) allPaths.toArray(new Path[allPaths.size()]));

    try {
        toJob.set("cascading.multiinputformats", Util.serializeBase64(configs));
    } catch (IOException exception) {
        throw new CascadingException("unable to pack input formats", exception);
    }

    if (isLocal)
        toJob.set("mapred.job.tracker", "local");
}