Example usage for org.apache.hadoop.mapred FileInputFormat addInputPath

Introduction

On this page you can find usage examples for org.apache.hadoop.mapred.FileInputFormat.addInputPath.

Prototype

public static void addInputPath(JobConf conf, Path path) 

Document

Add a Path to the list of inputs for the map-reduce job.
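
Before the real-world examples, here is a minimal, self-contained sketch of the call using the classic mapred API. The class name, paths, and job name are hypothetical placeholders, not taken from any project below; note that addInputPath may be called several times to register multiple inputs for one job.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public class AddInputPathExample {

    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(AddInputPathExample.class);
        conf.setJobName("addInputPath example");

        // Each call appends one more Path to the job's input list;
        // both (hypothetical) directories feed the same map phase.
        FileInputFormat.addInputPath(conf, new Path("/data/in/part1"));
        FileInputFormat.addInputPath(conf, new Path("/data/in/part2"));

        // TextInputFormat produces <LongWritable offset, Text line> records,
        // which the identity mapper and reducer pass through unchanged.
        conf.setInputFormat(TextInputFormat.class);
        conf.setMapperClass(IdentityMapper.class);
        conf.setReducerClass(IdentityReducer.class);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);
        conf.setOutputFormat(TextOutputFormat.class);

        FileOutputFormat.setOutputPath(conf, new Path("/data/out"));
        JobClient.runJob(conf);
    }
}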

Usage

From source file: org.apache.nutch.tools.CrawlDBScanner.java

License: Apache License

private void scan(Path crawlDb, Path outputPath, String regex, String status, boolean text) throws IOException {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("CrawlDB scanner: starting at " + sdf.format(start));

    JobConf job = new NutchJob(getConf());

    job.setJobName("Scan : " + crawlDb + " for URLS matching : " + regex);

    job.set("CrawlDBScanner.regex", regex);
    if (status != null)
        job.set("CrawlDBScanner.status", status);

    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(CrawlDBScanner.class);
    job.setReducerClass(CrawlDBScanner.class);

    FileOutputFormat.setOutputPath(job, outputPath);

    // if we want a text dump of the entries
    // in order to check something - better to use the text format and avoid
    // compression
    if (text) {
        job.set("mapred.output.compress", "false");
        job.setOutputFormat(TextOutputFormat.class);
    }
    // otherwise what we will actually create is a mini-crawlDB which can be
    // then used
    // for debugging
    else {
        job.setOutputFormat(MapFileOutputFormat.class);
    }

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(CrawlDatum.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);

    JobClient.runJob(job);

    long end = System.currentTimeMillis();
    LOG.info("CrawlDb scanner: finished at " + sdf.format(end) + ", elapsed: "
            + TimingUtil.elapsedTime(start, end));
}

From source file: org.apache.nutch.tools.FreeGenerator.java

License: Apache License

public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: FreeGenerator <inputDir> <segmentsDir> [-filter] [-normalize]");
        System.err.println("\tinputDir\tinput directory containing one or more input files.");
        System.err.println("\t\tEach text file contains a list of URLs, one URL per line");
        System.err.println("\tsegmentsDir\toutput directory, where new segment will be created");
        System.err.println("\t-filter\trun current URLFilters on input URLs");
        System.err.println("\t-normalize\trun current URLNormalizers on input URLs");
        return -1;
    }
    boolean filter = false;
    boolean normalize = false;
    if (args.length > 2) {
        for (int i = 2; i < args.length; i++) {
            if (args[i].equals("-filter")) {
                filter = true;
            } else if (args[i].equals("-normalize")) {
                normalize = true;
            } else {
                LOG.error("Unknown argument: " + args[i] + ", exiting ...");
                return -1;
            }
        }
    }

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("FreeGenerator: starting at " + sdf.format(start));

    JobConf job = new NutchJob(getConf());
    job.setBoolean(FILTER_KEY, filter);
    job.setBoolean(NORMALIZE_KEY, normalize);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    job.setInputFormat(TextInputFormat.class);
    job.setMapperClass(FG.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Generator.SelectorEntry.class);
    job.setPartitionerClass(URLPartitioner.class);
    job.setReducerClass(FG.class);
    String segName = Generator.generateSegmentName();
    job.setNumReduceTasks(job.getNumMapTasks());
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setOutputKeyComparatorClass(Generator.HashComparator.class);
    FileOutputFormat.setOutputPath(job, new Path(args[1], new Path(segName, CrawlDatum.GENERATE_DIR_NAME)));
    try {
        JobClient.runJob(job);
    } catch (Exception e) {
        LOG.error("FAILED: " + StringUtils.stringifyException(e));
        return -1;
    }
    long end = System.currentTimeMillis();
    LOG.info("FreeGenerator: finished at " + sdf.format(end) + ", elapsed: "
            + TimingUtil.elapsedTime(start, end));
    return 0;
}

From source file: org.apache.pig.test.pigmix.mapreduce.L1.java

License: Apache License

public static void main(String[] args) throws IOException {

    if (args.length != 3) {
        System.out.println("Parameters: inputDir outputDir parallel");
        System.exit(1);
    }
    String inputDir = args[0];
    String outputDir = args[1];
    String parallel = args[2];
    JobConf lp = new JobConf(L1.class);
    lp.setJobName("L1 Load Page Views");
    lp.setInputFormat(TextInputFormat.class);
    lp.setOutputKeyClass(Text.class);
    lp.setOutputValueClass(IntWritable.class);
    lp.setMapperClass(ReadPageViews.class);
    lp.setCombinerClass(Group.class);
    lp.setReducerClass(Group.class);
    Properties props = System.getProperties();
    for (Map.Entry<Object, Object> entry : props.entrySet()) {
        lp.set((String) entry.getKey(), (String) entry.getValue());
    }
    FileInputFormat.addInputPath(lp, new Path(inputDir + "/page_views"));
    FileOutputFormat.setOutputPath(lp, new Path(outputDir + "/L1out"));
    lp.setNumReduceTasks(Integer.parseInt(parallel));
    Job group = new Job(lp);

    JobControl jc = new JobControl("L1 join");
    jc.addJob(group);

    new Thread(jc).start();

    int i = 0;
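    // Poll the JobControl thread: stop on the first failed job, otherwise sleep and periodically dump job states.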
    while (!jc.allFinished()) {
        ArrayList<Job> failures = jc.getFailedJobs();
        if (failures != null && failures.size() > 0) {
            for (Job failure : failures) {
                System.err.println(failure.getMessage());
            }
            break;
        }

        try {
            Thread.sleep(5000);
        } catch (InterruptedException e) {
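            // sleep was interrupted; loop around and poll again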
        }

        if (i % 10000 == 0) {
            System.out.println("Running jobs");
            ArrayList<Job> running = jc.getRunningJobs();
            if (running != null && running.size() > 0) {
                for (Job r : running) {
                    System.out.println(r.getJobName());
                }
            }
            System.out.println("Ready jobs");
            ArrayList<Job> ready = jc.getReadyJobs();
            if (ready != null && ready.size() > 0) {
                for (Job r : ready) {
                    System.out.println(r.getJobName());
                }
            }
            System.out.println("Waiting jobs");
            ArrayList<Job> waiting = jc.getWaitingJobs();
            if (waiting != null && waiting.size() > 0) {
                for (Job r : waiting) {
                    System.out.println(r.getJobName());
                }
            }
            System.out.println("Successful jobs");
            ArrayList<Job> success = jc.getSuccessfulJobs();
            if (success != null && success.size() > 0) {
                for (Job r : success) {
                    System.out.println(r.getJobName());
                }
            }
        }
        i++;
    }
    ArrayList<Job> failures = jc.getFailedJobs();
    if (failures != null && failures.size() > 0) {
        for (Job failure : failures) {
            System.err.println(failure.getMessage());
        }
    }
    jc.stop();
}

From source file: org.apache.pig.test.pigmix.mapreduce.L10.java

License: Apache License

public static void main(String[] args) throws IOException {

    if (args.length != 3) {
        System.out.println("Parameters: inputDir outputDir parallel");
        System.exit(1);
    }
    String inputDir = args[0];
    String outputDir = args[1];
    String parallel = args[2];
    JobConf lp = new JobConf(L10.class);
    lp.setJobName("L10 Load Page Views");
    lp.setInputFormat(TextInputFormat.class);
    lp.setOutputKeyClass(MyType.class);
    lp.setOutputValueClass(Text.class);
    lp.setMapperClass(ReadPageViews.class);
    lp.setReducerClass(Group.class);
    lp.setPartitionerClass(MyPartitioner.class);
    Properties props = System.getProperties();
    for (Map.Entry<Object, Object> entry : props.entrySet()) {
        lp.set((String) entry.getKey(), (String) entry.getValue());
    }
    FileInputFormat.addInputPath(lp, new Path(inputDir + "/page_views"));
    FileOutputFormat.setOutputPath(lp, new Path(outputDir + "/L10out"));
    // Hardcode the parallel to 40 since MyPartitioner assumes it
    lp.setNumReduceTasks(40);
    Job group = new Job(lp);

    JobControl jc = new JobControl("L10 join");
    jc.addJob(group);

    new Thread(jc).start();

    int i = 0;
    while (!jc.allFinished()) {
        ArrayList<Job> failures = jc.getFailedJobs();
        if (failures != null && failures.size() > 0) {
            for (Job failure : failures) {
                System.err.println(failure.getMessage());
            }
            break;
        }

        try {
            Thread.sleep(5000);
        } catch (InterruptedException e) {
        }

        if (i % 10000 == 0) {
            System.out.println("Running jobs");
            ArrayList<Job> running = jc.getRunningJobs();
            if (running != null && running.size() > 0) {
                for (Job r : running) {
                    System.out.println(r.getJobName());
                }
            }
            System.out.println("Ready jobs");
            ArrayList<Job> ready = jc.getReadyJobs();
            if (ready != null && ready.size() > 0) {
                for (Job r : ready) {
                    System.out.println(r.getJobName());
                }
            }
            System.out.println("Waiting jobs");
            ArrayList<Job> waiting = jc.getWaitingJobs();
            if (waiting != null && waiting.size() > 0) {
                for (Job r : waiting) {
                    System.out.println(r.getJobName());
                }
            }
            System.out.println("Successful jobs");
            ArrayList<Job> success = jc.getSuccessfulJobs();
            if (success != null && success.size() > 0) {
                for (Job r : success) {
                    System.out.println(r.getJobName());
                }
            }
        }
        i++;
    }
    ArrayList<Job> failures = jc.getFailedJobs();
    if (failures != null && failures.size() > 0) {
        for (Job failure : failures) {
            System.err.println(failure.getMessage());
        }
    }
    jc.stop();
}

From source file: org.apache.pig.test.pigmix.mapreduce.L11.java

License: Apache License

public static void main(String[] args) throws IOException {

    if (args.length != 3) {
        System.out.println("Parameters: inputDir outputDir parallel");
        System.exit(1);
    }
    String inputDir = args[0];
    String outputDir = args[1];
    String parallel = args[2];
    JobConf lp = new JobConf(L11.class);
    lp.setJobName("L11 Load Page Views");
    lp.setInputFormat(TextInputFormat.class);
    lp.setOutputKeyClass(Text.class);
    lp.setOutputValueClass(Text.class);
    lp.setMapperClass(ReadPageViews.class);
    lp.setCombinerClass(ReadPageViews.class);
    lp.setReducerClass(ReadPageViews.class);
    Properties props = System.getProperties();
    for (Map.Entry<Object, Object> entry : props.entrySet()) {
        lp.set((String) entry.getKey(), (String) entry.getValue());
    }
    FileInputFormat.addInputPath(lp, new Path(inputDir + "/page_views"));
    FileOutputFormat.setOutputPath(lp, new Path(outputDir + "/p"));
    lp.setNumReduceTasks(Integer.parseInt(parallel));
    Job loadPages = new Job(lp);

    JobConf lu = new JobConf(L11.class);
    lu.setJobName("L11 Load Widerow");
    lu.setInputFormat(TextInputFormat.class);
    lu.setOutputKeyClass(Text.class);
    lu.setOutputValueClass(Text.class);
    lu.setMapperClass(ReadWideRow.class);
    lu.setCombinerClass(ReadWideRow.class);
    lu.setReducerClass(ReadWideRow.class);
    props = System.getProperties();
    for (Map.Entry<Object, Object> entry : props.entrySet()) {
        lu.set((String) entry.getKey(), (String) entry.getValue());
    }
    FileInputFormat.addInputPath(lu, new Path(inputDir + "/widerow"));
    FileOutputFormat.setOutputPath(lu, new Path(outputDir + "/wr"));
    lu.setNumReduceTasks(Integer.parseInt(parallel));
    Job loadWideRow = new Job(lu);

    JobConf join = new JobConf(L11.class);
    join.setJobName("L11 Union WideRow and Pages");
    join.setInputFormat(KeyValueTextInputFormat.class);
    join.setOutputKeyClass(Text.class);
    join.setOutputValueClass(Text.class);
    join.setMapperClass(IdentityMapper.class);
    join.setCombinerClass(Union.class);
    join.setReducerClass(Union.class);
    props = System.getProperties();
    for (Map.Entry<Object, Object> entry : props.entrySet()) {
        join.set((String) entry.getKey(), (String) entry.getValue());
    }
    FileInputFormat.addInputPath(join, new Path(outputDir + "/p"));
    FileInputFormat.addInputPath(join, new Path(outputDir + "/wr"));
    FileOutputFormat.setOutputPath(join, new Path(outputDir + "/L11out"));
    join.setNumReduceTasks(Integer.parseInt(parallel));
    Job joinJob = new Job(join);
    joinJob.addDependingJob(loadPages);
    joinJob.addDependingJob(loadWideRow);

    JobControl jc = new JobControl("L11 join");
    jc.addJob(loadPages);
    jc.addJob(loadWideRow);
    jc.addJob(joinJob);

    new Thread(jc).start();

    int i = 0;
    while (!jc.allFinished()) {
        ArrayList<Job> failures = jc.getFailedJobs();
        if (failures != null && failures.size() > 0) {
            for (Job failure : failures) {
                System.err.println(failure.getMessage());
            }
            break;
        }

        try {
            Thread.sleep(5000);
        } catch (InterruptedException e) {
        }

        if (i % 10000 == 0) {
            System.out.println("Running jobs");
            ArrayList<Job> running = jc.getRunningJobs();
            if (running != null && running.size() > 0) {
                for (Job r : running) {
                    System.out.println(r.getJobName());
                }
            }
            System.out.println("Ready jobs");
            ArrayList<Job> ready = jc.getReadyJobs();
            if (ready != null && ready.size() > 0) {
                for (Job r : ready) {
                    System.out.println(r.getJobName());
                }
            }
            System.out.println("Waiting jobs");
            ArrayList<Job> waiting = jc.getWaitingJobs();
            if (waiting != null && waiting.size() > 0) {
                for (Job r : waiting) {
                    System.out.println(r.getJobName());
                }
            }
            System.out.println("Successful jobs");
            ArrayList<Job> success = jc.getSuccessfulJobs();
            if (success != null && success.size() > 0) {
                for (Job r : success) {
                    System.out.println(r.getJobName());
                }
            }
        }
        i++;
    }
    ArrayList<Job> failures = jc.getFailedJobs();
    if (failures != null && failures.size() > 0) {
        for (Job failure : failures) {
            System.err.println(failure.getMessage());
        }
    }
    jc.stop();
}

From source file: org.apache.pig.test.pigmix.mapreduce.L12.java

License: Apache License

public static void main(String[] args) throws IOException {

    if (args.length != 3) {
        System.out.println("Parameters: inputDir outputDir parallel");
        System.exit(1);
    }
    String inputDir = args[0];
    String outputDir = args[1];
    String parallel = args[2];
    JobConf lp = new JobConf(L12.class);
    lp.setJobName("L12 Find Highest Value Page Per User");
    lp.setInputFormat(TextInputFormat.class);
    lp.setOutputKeyClass(Text.class);
    lp.setOutputValueClass(DoubleWritable.class);
    lp.setMapperClass(HighestValuePagePerUser.class);
    lp.setCombinerClass(HighestValuePagePerUser.class);
    lp.setReducerClass(HighestValuePagePerUser.class);
    Properties props = System.getProperties();
    for (Map.Entry<Object, Object> entry : props.entrySet()) {
        lp.set((String) entry.getKey(), (String) entry.getValue());
    }
    FileInputFormat.addInputPath(lp, new Path(inputDir + "/page_views"));
    FileOutputFormat.setOutputPath(lp, new Path(outputDir + "/highest_value_page_per_user"));
    lp.setNumReduceTasks(Integer.parseInt(parallel));
    Job loadPages = new Job(lp);

    JobConf lu = new JobConf(L12.class);
    lu.setJobName("L12 Find Total Timespent per Term");
    lu.setInputFormat(TextInputFormat.class);
    lu.setOutputKeyClass(Text.class);
    lu.setOutputValueClass(LongWritable.class);
    lu.setMapperClass(TotalTimespentPerTerm.class);
    lu.setCombinerClass(TotalTimespentPerTerm.class);
    lu.setReducerClass(TotalTimespentPerTerm.class);
    props = System.getProperties();
    for (Map.Entry<Object, Object> entry : props.entrySet()) {
        lu.set((String) entry.getKey(), (String) entry.getValue());
    }
    FileInputFormat.addInputPath(lu, new Path(inputDir + "/page_views"));
    FileOutputFormat.setOutputPath(lu, new Path(outputDir + "/total_timespent_per_term"));
    lu.setNumReduceTasks(Integer.parseInt(parallel));
    Job loadUsers = new Job(lu);

    JobConf join = new JobConf(L12.class);
    join.setJobName("L12 Find Queries Per Action");
    join.setInputFormat(TextInputFormat.class);
    join.setOutputKeyClass(Text.class);
    join.setOutputValueClass(LongWritable.class);
    join.setMapperClass(QueriesPerAction.class);
    join.setCombinerClass(QueriesPerAction.class);
    join.setReducerClass(QueriesPerAction.class);
    props = System.getProperties();
    for (Map.Entry<Object, Object> entry : props.entrySet()) {
        join.set((String) entry.getKey(), (String) entry.getValue());
    }
    FileInputFormat.addInputPath(join, new Path(inputDir + "/page_views"));
    FileOutputFormat.setOutputPath(join, new Path(outputDir + "/queries_per_action"));
    join.setNumReduceTasks(Integer.parseInt(parallel));
    Job joinJob = new Job(join);

    JobControl jc = new JobControl("L12 join");
    jc.addJob(loadPages);
    jc.addJob(loadUsers);
    jc.addJob(joinJob);

    new Thread(jc).start();

    int i = 0;
    while (!jc.allFinished()) {
        ArrayList<Job> failures = jc.getFailedJobs();
        if (failures != null && failures.size() > 0) {
            for (Job failure : failures) {
                System.err.println(failure.getMessage());
            }
            break;
        }

        try {
            Thread.sleep(5000);
        } catch (InterruptedException e) {
        }

        if (i % 10000 == 0) {
            System.out.println("Running jobs");
            ArrayList<Job> running = jc.getRunningJobs();
            if (running != null && running.size() > 0) {
                for (Job r : running) {
                    System.out.println(r.getJobName());
                }
            }
            System.out.println("Ready jobs");
            ArrayList<Job> ready = jc.getReadyJobs();
            if (ready != null && ready.size() > 0) {
                for (Job r : ready) {
                    System.out.println(r.getJobName());
                }
            }
            System.out.println("Waiting jobs");
            ArrayList<Job> waiting = jc.getWaitingJobs();
            if (waiting != null && waiting.size() > 0) {
                for (Job r : waiting) {
                    System.out.println(r.getJobName());
                }
            }
            System.out.println("Successful jobs");
            ArrayList<Job> success = jc.getSuccessfulJobs();
            if (success != null && success.size() > 0) {
                for (Job r : success) {
                    System.out.println(r.getJobName());
                }
            }
        }
        i++;
    }
    ArrayList<Job> failures = jc.getFailedJobs();
    if (failures != null && failures.size() > 0) {
        for (Job failure : failures) {
            System.err.println(failure.getMessage());
        }
    }
    jc.stop();
}

From source file: org.apache.pig.test.pigmix.mapreduce.L13.java

License: Apache License

public static void main(String[] args) throws IOException {

    if (args.length != 3) {
        System.out.println("Parameters: inputDir outputDir parallel");
        System.exit(1);
    }
    String inputDir = args[0];
    String outputDir = args[1];
    String parallel = args[2];
    JobConf lp = new JobConf(L13.class);
    lp.setJobName("L13 Load Left Page Views");
    lp.setInputFormat(TextInputFormat.class);
    lp.setOutputKeyClass(Text.class);
    lp.setOutputValueClass(Text.class);
    lp.setMapperClass(ReadLeftPageViews.class);
    Properties props = System.getProperties();
    for (Map.Entry<Object, Object> entry : props.entrySet()) {
        lp.set((String) entry.getKey(), (String) entry.getValue());
    }
    FileInputFormat.addInputPath(lp, new Path(inputDir + "/page_views"));
    FileOutputFormat.setOutputPath(lp, new Path(outputDir + "/indexed_left_pages"));
    lp.setNumReduceTasks(0);
    Job loadPages = new Job(lp);

    JobConf lu = new JobConf(L13.class);
    lu.setJobName("L13 Load Right Page Views");
    lu.setInputFormat(TextInputFormat.class);
    lu.setOutputKeyClass(Text.class);
    lu.setOutputValueClass(Text.class);
    lu.setMapperClass(ReadRightPageViews.class);
    props = System.getProperties();
    for (Map.Entry<Object, Object> entry : props.entrySet()) {
        lu.set((String) entry.getKey(), (String) entry.getValue());
    }
    FileInputFormat.addInputPath(lu, new Path(inputDir + "/power_users_samples"));
    FileOutputFormat.setOutputPath(lu, new Path(outputDir + "/indexed_right_pages"));
    lu.setNumReduceTasks(0);
    Job loadUsers = new Job(lu);

    JobConf join = new JobConf(L13.class);
    join.setJobName("L13 Join Two Pages");
    join.setInputFormat(KeyValueTextInputFormat.class);
    join.setOutputKeyClass(Text.class);
    join.setOutputValueClass(Text.class);
    join.setMapperClass(IdentityMapper.class);
    join.setReducerClass(Join.class);
    props = System.getProperties();
    for (Map.Entry<Object, Object> entry : props.entrySet()) {
        join.set((String) entry.getKey(), (String) entry.getValue());
    }
    FileInputFormat.addInputPath(join, new Path(outputDir + "/indexed_left_pages"));
    FileInputFormat.addInputPath(join, new Path(outputDir + "/indexed_right_pages"));
    FileOutputFormat.setOutputPath(join, new Path(outputDir + "/L13out"));
    join.setNumReduceTasks(Integer.parseInt(parallel));
    Job joinJob = new Job(join);
    joinJob.addDependingJob(loadPages);
    joinJob.addDependingJob(loadUsers);

    JobControl jc = new JobControl("L13 join");
    jc.addJob(loadPages);
    jc.addJob(loadUsers);
    jc.addJob(joinJob);

    new Thread(jc).start();

    int i = 0;
    while (!jc.allFinished()) {
        ArrayList<Job> failures = jc.getFailedJobs();
        if (failures != null && failures.size() > 0) {
            for (Job failure : failures) {
                System.err.println(failure.getMessage());
            }
            break;
        }

        try {
            Thread.sleep(5000);
        } catch (InterruptedException e) {
        }

        if (i % 10000 == 0) {
            System.out.println("Running jobs");
            ArrayList<Job> running = jc.getRunningJobs();
            if (running != null && running.size() > 0) {
                for (Job r : running) {
                    System.out.println(r.getJobName());
                }
            }
            System.out.println("Ready jobs");
            ArrayList<Job> ready = jc.getReadyJobs();
            if (ready != null && ready.size() > 0) {
                for (Job r : ready) {
                    System.out.println(r.getJobName());
                }
            }
            System.out.println("Waiting jobs");
            ArrayList<Job> waiting = jc.getWaitingJobs();
            if (waiting != null && waiting.size() > 0) {
                for (Job r : waiting) {
                    System.out.println(r.getJobName());
                }
            }
            System.out.println("Successful jobs");
            ArrayList<Job> success = jc.getSuccessfulJobs();
            if (success != null && success.size() > 0) {
                for (Job r : success) {
                    System.out.println(r.getJobName());
                }
            }
        }
        i++;
    }
    ArrayList<Job> failures = jc.getFailedJobs();
    if (failures != null && failures.size() > 0) {
        for (Job failure : failures) {
            System.err.println(failure.getMessage());
        }
    }
    jc.stop();
}

From source file: org.apache.pig.test.pigmix.mapreduce.L14.java

License: Apache License

public static void main(String[] args) throws IOException {

    if (args.length != 3) {
        System.out.println("Parameters: inputDir outputDir parallel");
        System.exit(1);
    }
    String inputDir = args[0];
    String outputDir = args[1];
    String parallel = args[2];
    JobConf lp = new JobConf(L14.class);
    lp.setJobName("L14 Load Page Views");
    lp.setInputFormat(TextInputFormat.class);
    lp.setOutputKeyClass(Text.class);
    lp.setOutputValueClass(Text.class);
    lp.setMapperClass(ReadPageViews.class);
    Properties props = System.getProperties();
    for (Map.Entry<Object, Object> entry : props.entrySet()) {
        lp.set((String) entry.getKey(), (String) entry.getValue());
    }
    FileInputFormat.addInputPath(lp, new Path(inputDir + "/page_views_sorted"));
    FileOutputFormat.setOutputPath(lp, new Path(outputDir + "/indexed_pages_14"));
    lp.setNumReduceTasks(0);
    Job loadPages = new Job(lp);

    JobConf lu = new JobConf(L14.class);
    lu.setJobName("L14 Load Users");
    lu.setInputFormat(TextInputFormat.class);
    lu.setOutputKeyClass(Text.class);
    lu.setOutputValueClass(Text.class);
    lu.setMapperClass(ReadUsers.class);
    props = System.getProperties();
    for (Map.Entry<Object, Object> entry : props.entrySet()) {
        lu.set((String) entry.getKey(), (String) entry.getValue());
    }
    FileInputFormat.addInputPath(lu, new Path(inputDir + "/users_sorted"));
    FileOutputFormat.setOutputPath(lu, new Path(outputDir + "/indexed_users_14"));
    lu.setNumReduceTasks(0);
    Job loadUsers = new Job(lu);

    JobConf join = new JobConf(L14.class);
    join.setJobName("L14 Join Users and Pages");
    join.setInputFormat(KeyValueTextInputFormat.class);
    join.setOutputKeyClass(Text.class);
    join.setOutputValueClass(Text.class);
    join.setMapperClass(IdentityMapper.class);
    join.setReducerClass(Join.class);
    props = System.getProperties();
    for (Map.Entry<Object, Object> entry : props.entrySet()) {
        join.set((String) entry.getKey(), (String) entry.getValue());
    }
    FileInputFormat.addInputPath(join, new Path(outputDir + "/indexed_pages_14"));
    FileInputFormat.addInputPath(join, new Path(outputDir + "/indexed_users_14"));
    FileOutputFormat.setOutputPath(join, new Path(outputDir + "/L14out"));
    join.setNumReduceTasks(Integer.parseInt(parallel));
    Job joinJob = new Job(join);
    joinJob.addDependingJob(loadPages);
    joinJob.addDependingJob(loadUsers);

    JobControl jc = new JobControl("L14 join");
    jc.addJob(loadPages);
    jc.addJob(loadUsers);
    jc.addJob(joinJob);

    new Thread(jc).start();

    int i = 0;
    while (!jc.allFinished()) {
        ArrayList<Job> failures = jc.getFailedJobs();
        if (failures != null && failures.size() > 0) {
            for (Job failure : failures) {
                System.err.println(failure.getMessage());
            }
            break;
        }

        try {
            Thread.sleep(5000);
        } catch (InterruptedException e) {
        }

        if (i % 10000 == 0) {
            System.out.println("Running jobs");
            ArrayList<Job> running = jc.getRunningJobs();
            if (running != null && running.size() > 0) {
                for (Job r : running) {
                    System.out.println(r.getJobName());
                }
            }
            System.out.println("Ready jobs");
            ArrayList<Job> ready = jc.getReadyJobs();
            if (ready != null && ready.size() > 0) {
                for (Job r : ready) {
                    System.out.println(r.getJobName());
                }
            }
            System.out.println("Waiting jobs");
            ArrayList<Job> waiting = jc.getWaitingJobs();
            if (waiting != null && waiting.size() > 0) {
                for (Job r : waiting) {
                    System.out.println(r.getJobName());
                }
            }
            System.out.println("Successful jobs");
            ArrayList<Job> success = jc.getSuccessfulJobs();
            if (success != null && success.size() > 0) {
                for (Job r : success) {
                    System.out.println(r.getJobName());
                }
            }
        }
        i++;
    }
    ArrayList<Job> failures = jc.getFailedJobs();
    if (failures != null && failures.size() > 0) {
        for (Job failure : failures) {
            System.err.println(failure.getMessage());
        }
    }
    jc.stop();
}

From source file: org.apache.pig.test.pigmix.mapreduce.L15.java

License: Apache License

public static void main(String[] args) throws IOException {

    if (args.length != 3) {
        System.out.println("Parameters: inputDir outputDir parallel");
        System.exit(1);
    }
    String inputDir = args[0];
    String outputDir = args[1];
    String parallel = args[2];
    JobConf lp = new JobConf(L15.class);
    lp.setJobName("L15 Load Page Views");
    lp.setInputFormat(TextInputFormat.class);
    lp.setOutputKeyClass(Text.class);
    lp.setOutputValueClass(Text.class);
    lp.setMapperClass(ReadPageViews.class);
    lp.setCombinerClass(Combiner.class);
    lp.setReducerClass(Group.class);
    Properties props = System.getProperties();
    for (Map.Entry<Object, Object> entry : props.entrySet()) {
        lp.set((String) entry.getKey(), (String) entry.getValue());
    }
    FileInputFormat.addInputPath(lp, new Path(inputDir + "/page_views"));
    FileOutputFormat.setOutputPath(lp, new Path(outputDir + "/L15out"));
    lp.setNumReduceTasks(Integer.parseInt(parallel));
    Job group = new Job(lp);

    JobControl jc = new JobControl("L15 join");
    jc.addJob(group);

    new Thread(jc).start();

    int i = 0;
    while (!jc.allFinished()) {
        ArrayList<Job> failures = jc.getFailedJobs();
        if (failures != null && failures.size() > 0) {
            for (Job failure : failures) {
                System.err.println(failure.getMessage());
            }
            break;
        }

        try {
            Thread.sleep(5000);
        } catch (InterruptedException e) {
        }

        if (i % 10000 == 0) {
            System.out.println("Running jobs");
            ArrayList<Job> running = jc.getRunningJobs();
            if (running != null && running.size() > 0) {
                for (Job r : running) {
                    System.out.println(r.getJobName());
                }
            }
            System.out.println("Ready jobs");
            ArrayList<Job> ready = jc.getReadyJobs();
            if (ready != null && ready.size() > 0) {
                for (Job r : ready) {
                    System.out.println(r.getJobName());
                }
            }
            System.out.println("Waiting jobs");
            ArrayList<Job> waiting = jc.getWaitingJobs();
            if (waiting != null && waiting.size() > 0) {
                for (Job r : waiting) {
                    System.out.println(r.getJobName());
                }
            }
            System.out.println("Successful jobs");
            ArrayList<Job> success = jc.getSuccessfulJobs();
            if (success != null && success.size() > 0) {
                for (Job r : success) {
                    System.out.println(r.getJobName());
                }
            }
        }
        i++;
    }
    ArrayList<Job> failures = jc.getFailedJobs();
    if (failures != null && failures.size() > 0) {
        for (Job failure : failures) {
            System.err.println(failure.getMessage());
        }
    }
    jc.stop();
}

From source file: org.apache.pig.test.pigmix.mapreduce.L16.java

License: Apache License

public static void main(String[] args) throws IOException {

    if (args.length != 3) {
        System.out.println("Parameters: inputDir outputDir parallel");
        System.exit(1);
    }
    String inputDir = args[0];
    String outputDir = args[1];
    String parallel = args[2];
    JobConf lp = new JobConf(L16.class);
    lp.setJobName("L16 Load Page Views");
    lp.setInputFormat(TextInputFormat.class);
    lp.setOutputKeyClass(Text.class);
    lp.setOutputValueClass(Text.class);
    lp.setMapperClass(ReadPageViews.class);
    lp.setReducerClass(Group.class);
    Properties props = System.getProperties();
    for (Map.Entry<Object, Object> entry : props.entrySet()) {
        lp.set((String) entry.getKey(), (String) entry.getValue());
    }
    FileInputFormat.addInputPath(lp, new Path(inputDir + "/page_views"));
    FileOutputFormat.setOutputPath(lp, new Path(outputDir + "/L16out"));
    lp.setNumReduceTasks(Integer.parseInt(parallel));
    Job group = new Job(lp);

    JobControl jc = new JobControl("L16 join");
    jc.addJob(group);

    new Thread(jc).start();

    int i = 0;
    while (!jc.allFinished()) {
        ArrayList<Job> failures = jc.getFailedJobs();
        if (failures != null && failures.size() > 0) {
            for (Job failure : failures) {
                System.err.println(failure.getMessage());
            }
            break;
        }

        try {
            Thread.sleep(5000);
        } catch (InterruptedException e) {
        }

        if (i % 10000 == 0) {
            System.out.println("Running jobs");
            ArrayList<Job> running = jc.getRunningJobs();
            if (running != null && running.size() > 0) {
                for (Job r : running) {
                    System.out.println(r.getJobName());
                }
            }
            System.out.println("Ready jobs");
            ArrayList<Job> ready = jc.getReadyJobs();
            if (ready != null && ready.size() > 0) {
                for (Job r : ready) {
                    System.out.println(r.getJobName());
                }
            }
            System.out.println("Waiting jobs");
            ArrayList<Job> waiting = jc.getWaitingJobs();
            if (waiting != null && waiting.size() > 0) {
                for (Job r : waiting) {
                    System.out.println(r.getJobName());
                }
            }
            System.out.println("Successful jobs");
            ArrayList<Job> success = jc.getSuccessfulJobs();
            if (success != null && success.size() > 0) {
                for (Job r : success) {
                    System.out.println(r.getJobName());
                }
            }
        }
        i++;
    }
    ArrayList<Job> failures = jc.getFailedJobs();
    if (failures != null && failures.size() > 0) {
        for (Job failure : failures) {
            System.err.println(failure.getMessage());
        }
    }
    jc.stop();
}