Example usage for org.apache.hadoop.mapred FileInputFormat setInputPaths

Introduction

On this page you can find example usages of org.apache.hadoop.mapred.FileInputFormat.setInputPaths.

Prototype

public static void setInputPaths(JobConf conf, Path... inputPaths) 

Document

Set the array of Paths as the list of inputs for the map-reduce job.
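
Before the real-world examples, a minimal sketch of the call in isolation may help; the driver class name and input directories below are placeholders, not taken from any project on this page.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;

public class SetInputPathsSketch {
    public static void main(String[] args) {
        // Any class packaged in the job jar can seed the JobConf.
        JobConf conf = new JobConf(SetInputPathsSketch.class);

        // setInputPaths replaces whatever inputs were configured earlier on this
        // JobConf; use FileInputFormat.addInputPath(conf, path) to append instead.
        FileInputFormat.setInputPaths(conf, new Path("/data/part1"), new Path("/data/part2"));
    }
}

The overwrite semantics matter in the iterative drivers below, which re-point the same JobConf at the previous iteration's output on each pass.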

Usage

From source file: BU.MET.CS755.SpeciesIterDriver2.java

static boolean MRSpeciesRank(String args[], int iterCnt) {
    long newCounterVal = 0;
    long totalLinks = 1; // Initialize to 1 to prevent divide by zero
    long totalIterations = 0;
    Job theJob = null;

    conf = new JobConf(SpeciesIterDriver2.class);
    conf.setJobName("Species Iter");
    conf.setNumReduceTasks(5);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(SpeciesIterMapper2.class);
    conf.setReducerClass(SpeciesIterReducer2.class);

    boolean nextIterationNeeded = true;

    while (nextIterationNeeded || numExtraIterations != 0) {
        long iterationNumber = 0;

        if ((iterCnt == 0) || (iterCnt == 1)) {
            inputpath = args[1] + "0";
        } else {
            inputpath = args[1] + iterCnt;
        }

        iterCnt++;

        conf.set("iterationNumber", Integer.toString(iterCnt));
        conf.set("totalLinks", Long.toString(totalLinks));

        outputpath = args[1] + iterCnt;

        FileInputFormat.setInputPaths(conf, new Path(inputpath));
        FileOutputFormat.setOutputPath(conf, new Path(outputpath));

        try {
            theJob = new Job(conf, "SpeciesIter");
        } catch (Exception e) {
            e.printStackTrace();
        }

        try {
            if (theJob != null) {
                theJob.waitForCompletion(true);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }

        try {
            if ((theJob != null) && theJob.isComplete()) { // theJob may be null if creation failed above
                Counters jobCtrs = theJob.getCounters();

                if (jobCtrs != null) {
                    newCounterVal = jobCtrs.findCounter(ITERATION_COUNTER.ITERATIONS_NEEDED).getValue();
                }

                // If reducer recorded change in species rank, repeat iteration.
                if ((newCounterVal > 0) || (iterCnt == 1)) {
                    nextIterationNeeded = true;
                } else {
                    nextIterationNeeded = false;
                    numExtraIterations--; // Do one extra iteration
                }

                totalLinks = jobCtrs.findCounter(BU.MET.CS755.SpeciesIterDriver2.ITERATION_COUNTER.TOTAL_LINKS)
                        .getValue();
            }

            totalIterations += 1;

            if (totalIterations > 200) {
                System.out.println("too many iterations!!");
                break; // Stop looping; without this the guard only prints.
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    System.out.println("Total iterations = " + totalIterations);

    return true;
}

From source file: BU.MET.CS755.SpeciesIterDriver2.java

static boolean MRSpeciesView(String input, String args[]) {
    Job theJob = null;

    JobConf conf = new JobConf(SpeciesIterDriver2.class);
    conf.setJobName("Species Viewer");

    conf.setOutputKeyClass(FloatWritable.class);
    conf.setOutputValueClass(Text.class);

    inputpath = input;
    outputpath = args[1] + "FinalRanks";

    FileInputFormat.setInputPaths(conf, new Path(inputpath));
    FileOutputFormat.setOutputPath(conf, new Path(outputpath));

    conf.setMapperClass(SpeciesViewerMapper.class);
    conf.setReducerClass(org.apache.hadoop.mapred.lib.IdentityReducer.class);

    try {
        theJob = new Job(conf, "SpeciesIter");
        theJob.waitForCompletion(true);
    } catch (Exception e) {
        e.printStackTrace();
    }

    return true;
}

From source file: buildtestproject.MyFirstMapReduce.java

public static void main(String[] args) throws Exception {
    //Configuration conf = new Configuration();
    JobConf conf = new JobConf(MyFirstMapReduce.class);
    //Job job = Job.getInstance(conf, "word-count-one");
    conf.setJobName("word-count-one");

    conf.setMapperClass(TokenizerMapper.class);
    conf.setCombinerClass(IntSumReducer.class);
    conf.setReducerClass(IntSumReducer.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    //        job.setJarByClass(MyFirstMapReduce.class);
    //        job.setMapperClass(TokenizerMapper.class);
    //        job.setCombinerClass(IntSumReducer.class);
    //        job.setReducerClass(IntSumReducer.class);
    //        job.setOutputKeyClass(Text.class);
    //        job.setOutputValueClass(IntWritable.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    //        FileInputFormat.addInputPath(job, new Path(args[0]));
    //        FileOutputFormat.setOutputPath(job, new Path(args[1]));

    JobClient.runJob(conf);

    //        System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file: Business.DataJoin.java

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    JobConf job = new JobConf(conf, DataJoin.class);

    final File f = new File(MapReduceOne.class.getProtectionDomain().getCodeSource().getLocation().getPath());
    String inFiles = f.getAbsolutePath().replace("/build/classes", "") + "/src/inFiles/";
    String outFiles = f.getAbsolutePath().replace("/build/classes", "") + "/src/outFiles/OutputOne";
    // Use the command-line arguments instead, if provided.
    // Note: args[2] is read below, so require at least three arguments.
    if (args.length > 2) {
        inFiles = args[1];
        outFiles = args[2];
    }
    Path in = new Path(inFiles);
    Path out = new Path(outFiles);
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    job.setJobName("Data Join");
    job.setMapperClass(MapClass.class);
    job.setReducerClass(ReduceClass.class);

    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(TaggedWritable.class);
    job.set("mapred.textoutputformat.separator", ",");

    JobClient.runJob(job);
    return 0;
}

From source file: cascading.flow.hadoop.MapReduceFlowPlatformTest.java

License: Open Source License

@Test
public void testFlow() throws IOException {
    getPlatform().copyFromLocal(inputFileApache);

    JobConf defaultConf = (JobConf) ((BaseHadoopPlatform) getPlatform()).getConfiguration();

    JobConf conf = new JobConf(defaultConf);
    conf.setJobName("mrflow");

    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(IdentityMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(inputFileApache));

    String outputPath = getOutputPath("flowTest");
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    Flow flow = new MapReduceFlow("mrflow", conf, true);

    validateLength(new Hfs(new TextLine(), inputFileApache).openForRead(new HadoopFlowProcess(defaultConf)),
            10);

    flow.complete();

    validateLength(new Hfs(new TextLine(), outputPath).openForRead(new HadoopFlowProcess(defaultConf)), 10);
}

From source file: cascading.flow.hadoop.MapReduceFlowPlatformTest.java

License: Open Source License

@Test
public void testCascade() throws IOException {
    getPlatform().copyFromLocal(inputFileApache);

    // Setup two standard cascading flows that will generate the input for the first MapReduceFlow
    Tap source1 = new Hfs(new TextLine(new Fields("offset", "line")), remove(inputFileApache, false));
    String sinkPath4 = getOutputPath("flow4");
    Tap sink1 = new Hfs(new TextLine(new Fields("offset", "line")), remove(sinkPath4, true), SinkMode.REPLACE);
    Flow firstFlow = getPlatform().getFlowConnector(getProperties()).connect(source1, sink1,
            new Pipe("first-flow"));

    String sinkPath5 = getOutputPath("flow5");
    Tap sink2 = new Hfs(new TextLine(new Fields("offset", "line")), remove(sinkPath5, true), SinkMode.REPLACE);
    Flow secondFlow = getPlatform().getFlowConnector(getProperties()).connect(sink1, sink2,
            new Pipe("second-flow"));

    JobConf defaultConf = HadoopPlanner.createJobConf(getProperties());

    JobConf firstConf = new JobConf(defaultConf);
    firstConf.setJobName("first-mr");

    firstConf.setOutputKeyClass(LongWritable.class);
    firstConf.setOutputValueClass(Text.class);

    firstConf.setMapperClass(IdentityMapper.class);
    firstConf.setReducerClass(IdentityReducer.class);

    firstConf.setInputFormat(TextInputFormat.class);
    firstConf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(firstConf, new Path(remove(sinkPath5, true)));
    String sinkPath1 = getOutputPath("flow1");
    FileOutputFormat.setOutputPath(firstConf, new Path(remove(sinkPath1, true)));

    Flow firstMR = new MapReduceFlow(firstConf, true);

    JobConf secondConf = new JobConf(defaultConf);
    secondConf.setJobName("second-mr");

    secondConf.setOutputKeyClass(LongWritable.class);
    secondConf.setOutputValueClass(Text.class);

    secondConf.setMapperClass(IdentityMapper.class);
    secondConf.setReducerClass(IdentityReducer.class);

    secondConf.setInputFormat(TextInputFormat.class);
    secondConf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(secondConf, new Path(remove(sinkPath1, true)));
    String sinkPath2 = getOutputPath("flow2");
    FileOutputFormat.setOutputPath(secondConf, new Path(remove(sinkPath2, true)));

    Flow secondMR = new MapReduceFlow(secondConf, true);

    Job job = new Job(defaultConf);
    job.setJobName("third-mr");

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(org.apache.hadoop.mapreduce.Mapper.class);
    job.setReducerClass(org.apache.hadoop.mapreduce.Reducer.class);

    job.setInputFormatClass(org.apache.hadoop.mapreduce.lib.input.TextInputFormat.class);
    job.setOutputFormatClass(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class);
    job.getConfiguration().set("mapred.mapper.new-api", "true");
    job.getConfiguration().set("mapred.reducer.new-api", "true");

    org.apache.hadoop.mapreduce.lib.input.FileInputFormat.addInputPath(job, new Path(remove(sinkPath2, true)));
    String sinkPath3 = getOutputPath("flow3");
    org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.setOutputPath(job,
            new Path(remove(sinkPath3, true)));

    Flow thirdMR = new MapReduceFlow(new JobConf(job.getConfiguration()), true);

    CascadeConnector cascadeConnector = new CascadeConnector();

    // pass out of order
    Cascade cascade = cascadeConnector.connect(firstFlow, secondFlow, thirdMR, firstMR, secondMR);

    cascade.complete();

    validateLength(new Hfs(new TextLine(), sinkPath3).openForRead(new HadoopFlowProcess(defaultConf)), 10);
}

From source file: cascading.flow.MapReduceFlowTest.java

License: Open Source License

public void testFlow() throws IOException {
    if (!new File(inputFileApache).exists())
        fail("data file not found");

    copyFromLocal(inputFileApache);

    JobConf defaultConf = MultiMapReducePlanner.getJobConf(getProperties());

    JobConf conf = new JobConf(defaultConf);
    conf.setJobName("mrflow");

    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(IdentityMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(inputFileApache));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath1));

    Flow flow = new MapReduceFlow("mrflow", conf, true);

    validateLength(flow.openSource(), 10);

    flow.complete();

    validateLength(flow.openSink(), 10);
}

From source file: cascading.flow.MapReduceFlowTest.java

License: Open Source License

public void testCascade() throws IOException {
    if (!new File(inputFileApache).exists())
        fail("data file not found");

    copyFromLocal(inputFileApache);

    // Setup two standard cascading flows that will generate the input for the first MapReduceFlow
    Tap source1 = new Hfs(new TextLine(new Fields("offset", "line")), remove(inputFileApache, false));
    Tap sink1 = new Hfs(new TextLine(new Fields("offset", "line")), remove(outputPath4, true), true);
    Flow firstFlow = new FlowConnector(getProperties()).connect(source1, sink1, new Pipe("first-flow"));

    Tap sink2 = new Hfs(new TextLine(new Fields("offset", "line")), remove(outputPath5, true), true);
    Flow secondFlow = new FlowConnector(getProperties()).connect(sink1, sink2, new Pipe("second-flow"));

    JobConf defaultConf = MultiMapReducePlanner.getJobConf(getProperties());

    JobConf firstConf = new JobConf(defaultConf);
    firstConf.setJobName("first-mr");

    firstConf.setOutputKeyClass(LongWritable.class);
    firstConf.setOutputValueClass(Text.class);

    firstConf.setMapperClass(IdentityMapper.class);
    firstConf.setReducerClass(IdentityReducer.class);

    firstConf.setInputFormat(TextInputFormat.class);
    firstConf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(firstConf, new Path(remove(outputPath5, true)));
    FileOutputFormat.setOutputPath(firstConf, new Path(remove(outputPath1, true)));

    Flow firstMR = new MapReduceFlow(firstConf, true);

    JobConf secondConf = new JobConf(defaultConf);
    secondConf.setJobName("second-mr");

    secondConf.setOutputKeyClass(LongWritable.class);
    secondConf.setOutputValueClass(Text.class);

    secondConf.setMapperClass(IdentityMapper.class);
    secondConf.setReducerClass(IdentityReducer.class);

    secondConf.setInputFormat(TextInputFormat.class);
    secondConf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(secondConf, new Path(remove(outputPath1, true)));
    FileOutputFormat.setOutputPath(secondConf, new Path(remove(outputPath2, true)));

    Flow secondMR = new MapReduceFlow(secondConf, true);

    JobConf thirdConf = new JobConf(defaultConf);
    thirdConf.setJobName("third-mr");

    thirdConf.setOutputKeyClass(LongWritable.class);
    thirdConf.setOutputValueClass(Text.class);

    thirdConf.setMapperClass(IdentityMapper.class);
    thirdConf.setReducerClass(IdentityReducer.class);

    thirdConf.setInputFormat(TextInputFormat.class);
    thirdConf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(thirdConf, new Path(remove(outputPath2, true)));
    FileOutputFormat.setOutputPath(thirdConf, new Path(remove(outputPath3, true)));

    Flow thirdMR = new MapReduceFlow(thirdConf, true);

    CascadeConnector cascadeConnector = new CascadeConnector();

    // pass out of order
    Cascade cascade = cascadeConnector.connect(firstFlow, secondFlow, thirdMR, firstMR, secondMR);

    //    cascade.writeDOT( "mrcascade.dot" );

    cascade.complete();

    validateLength(thirdMR.openSink(), 10);
}

From source file: cascading.jdbc.JDBCTap.java

License: Open Source License

@Override
public void sourceInit(JobConf conf) throws IOException {
    // a hack for MultiInputFormat to see that there is a child format
    FileInputFormat.setInputPaths(conf, getPath());

    if (username == null)
        DBConfiguration.configureDB(conf, driverClassName, connectionUrl);
    else
        DBConfiguration.configureDB(conf, driverClassName, connectionUrl, username, password);

    super.sourceInit(conf);
}

From source file: cascading.tap.hadoop.io.MultiInputFormat.java

License: Open Source License

/**
 * Used to set the current JobConf with all sub jobs configurations.
 *
 * @param toJob
 * @param fromJobs
 */
public static void addInputFormat(JobConf toJob, JobConf... fromJobs) {
    toJob.setInputFormat(MultiInputFormat.class);
    List<Map<String, String>> configs = new ArrayList<Map<String, String>>();
    List<Path> allPaths = new ArrayList<Path>();

    boolean isLocal = false;

    for (JobConf fromJob : fromJobs) {
        if (fromJob.get("mapred.input.format.class") == null)
            throw new CascadingException(
                    "mapred.input.format.class is required, should be set in source Scheme#sourceConfInit");

        configs.add(HadoopUtil.getConfig(toJob, fromJob));
        Collections.addAll(allPaths, FileInputFormat.getInputPaths(fromJob));

        if (!isLocal)
            isLocal = HadoopUtil.isLocal(fromJob);
    }

    if (!allPaths.isEmpty()) // it's possible there aren't any
        FileInputFormat.setInputPaths(toJob, (Path[]) allPaths.toArray(new Path[allPaths.size()]));

    try {
        toJob.set("cascading.multiinputformats", HadoopUtil.serializeBase64(configs, toJob, true));
    } catch (IOException exception) {
        throw new CascadingException("unable to pack input formats", exception);
    }

    if (isLocal)
        HadoopUtil.setLocal(toJob);
}