Java tutorial: running a Hadoop MapReduce WordCount job
A WordCount driver written against the classic org.apache.hadoop.mapred API:

package hadoopProcesses;

import hdfsIO.fileInteractions;
import java.util.List;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

/**
 * @author madhatter
 */
public class testJob {

    public static void start(String[] args) {
        try {
            // The class passed to JobConf identifies the jar that Hadoop
            // ships to the cluster, so it should be a class from this project.
            JobConf conf = new JobConf(testJob.class);
            conf.setJobName("wordcount");

            conf.setOutputKeyClass(Text.class);
            conf.setOutputValueClass(IntWritable.class);

            // map and reducer are this package's Mapper/Reducer implementations.
            conf.setMapperClass(map.class);
            conf.setCombinerClass(reducer.class);
            conf.setReducerClass(reducer.class);

            conf.setInputFormat(TextInputFormat.class);
            conf.setOutputFormat(TextOutputFormat.class);

            FileInputFormat.setInputPaths(conf, new Path(args[1]));

            // Delete any stale output directory; the job fails on startup
            // if the output path already exists.
            FileSystem fs = FileSystem.get(conf);
            Path outputDir = new Path(args[2]);
            fs.delete(outputDir, true);
            FileOutputFormat.setOutputPath(conf, outputDir);

            JobClient.runJob(conf);

            // Print the results from HDFS once the job has finished.
            Path src = new Path(fs.getWorkingDirectory() + "/output/part-00000");
            if (fs.exists(src)) {
                System.out.println("\t\t------ Results ------ ");
                // (Alternatively, stream the file line by line with a
                // BufferedReader wrapped around fs.open(src).)
                List<String> lines = new fileInteractions().readLines(src, conf);
                for (String line : lines) {
                    System.out.println(line);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
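The driver above refers to map and reducer classes from the same package that are not shown in this tutorial. Here is a minimal sketch of what they could look like with the classic API, assuming the standard WordCount logic; the class names follow the driver, but the bodies are illustrative, not the project's actual code, and each class would live in its own source file:

package hadoopProcesses;

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

// Emits (word, 1) for every token in each input line.
public class map extends MapReduceBase
        implements Mapper<LongWritable, Text, Text, IntWritable> {

    private static final IntWritable ONE = new IntWritable(1);
    private final Text word = new Text();

    @Override
    public void map(LongWritable key, Text value,
                    OutputCollector<Text, IntWritable> output, Reporter reporter)
            throws IOException {
        StringTokenizer tokens = new StringTokenizer(value.toString());
        while (tokens.hasMoreTokens()) {
            word.set(tokens.nextToken());
            output.collect(word, ONE);
        }
    }
}

// Sums the counts for each word; this is why the driver can reuse it
// as both the combiner and the reducer.
public class reducer extends MapReduceBase
        implements Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    public void reduce(Text key, Iterator<IntWritable> values,
                       OutputCollector<Text, IntWritable> output, Reporter reporter)
            throws IOException {
        int sum = 0;
        while (values.hasNext()) {
            sum += values.next().get();
        }
        output.collect(key, new IntWritable(sum));
    }
}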
Automatic execution: the same kind of job can also be set up with the newer org.apache.hadoop.mapreduce API, for example when submitting it programmatically from an application:

// create a configuration
Configuration conf = new Configuration();

// create a new job based on the configuration
Job job = new Job(conf);

// here you have to put your mapper class
job.setMapperClass(Mapper.class);

// here you have to put your reducer class
job.setReducerClass(Reducer.class);

// here you have to set the jar which contains your map/reduce classes,
// so the cluster can load the mapper class
job.setJarByClass(Mapper.class);

// key/value types of your reducer output
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);

// this sets the format of your input; it can also be TextInputFormat
job.setInputFormatClass(SequenceFileInputFormat.class);

// same for the output
job.setOutputFormatClass(TextOutputFormat.class);

// here you can set the path of your input
SequenceFileInputFormat.addInputPath(job, new Path("files/toMap/"));

// this deletes a possible existing output path to prevent job failures
FileSystem fs = FileSystem.get(conf);
Path out = new Path("files/out/processed/");
fs.delete(out, true);

// finally set the (now empty) output path
TextOutputFormat.setOutputPath(job, out);

// this waits until the job completes and prints debug output to STDOUT,
// or to whatever has been configured in your log4j properties
job.waitForCompletion(true);

To submit against a remote cluster, the configuration must also point at the JobTracker and the NameNode:

// as defined in your mapred-site.xml
conf.set("mapred.job.tracker", "jobtracker.com:50001");

// as defined in your hdfs-site.xml
conf.set("fs.default.name", "hdfs://namenode.com:9000");

This works as long as hadoop-core.jar is on your application container's classpath. You should add some kind of progress indicator to your web page, though, because a Hadoop job may take minutes to hours to complete. ;)

For YARN (Hadoop 2 and later), the following configurations need to be set instead:

// as defined in your yarn-site.xml
conf.set("yarn.resourcemanager.address", "yarn-manager.com:50001");

// the framework is now "yarn", as defined in your mapred-site.xml
conf.set("mapreduce.framework.name", "yarn");

// as defined in your hdfs-site.xml
conf.set("fs.default.name", "hdfs://namenode.com:9000");
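Putting those pieces together, here is a minimal, self-contained sketch of a new-API WordCount driver aimed at a YARN cluster. The hostnames are carried over from the snippets above, while the class names (yarnWordCount, WordMapper, SumReducer) and the use of args[0]/args[1] for the input and output paths are illustrative assumptions:

package hadoopProcesses;

import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class yarnWordCount {

    // Hypothetical mapper: emits (word, 1) per token.
    public static class WordMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer tokens = new StringTokenizer(value.toString());
            while (tokens.hasMoreTokens()) {
                word.set(tokens.nextToken());
                context.write(word, ONE);
            }
        }
    }

    // Hypothetical reducer: sums the counts per word.
    public static class SumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Cluster addresses are assumptions; use the values from your own
        // yarn-site.xml, mapred-site.xml and hdfs-site.xml.
        conf.set("yarn.resourcemanager.address", "yarn-manager.com:50001");
        conf.set("mapreduce.framework.name", "yarn");
        conf.set("fs.default.name", "hdfs://namenode.com:9000");

        Job job = Job.getInstance(conf, "wordcount");
        job.setJarByClass(yarnWordCount.class);
        job.setMapperClass(WordMapper.class);
        job.setCombinerClass(SumReducer.class);
        job.setReducerClass(SumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));

        // Clear a stale output directory, as in the examples above.
        Path out = new Path(args[1]);
        FileSystem.get(conf).delete(out, true);
        FileOutputFormat.setOutputPath(job, out);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}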