org.pentaho.weblogs.WebLogs.java Source code

Introduction

Here is the source code for org.pentaho.weblogs.WebLogs.java. The class configures a Hadoop MapReduce job that runs in the local JVM and executes Pentaho Kettle transformations as its map and reduce phases.

Source

/*******************************************************************************
 *
 * Pentaho Big Data
 *
 * Copyright (C) 2002-2013 by Pentaho : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package org.pentaho.weblogs;

import java.io.File;
import java.net.URL;
import java.net.URLClassLoader;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.pentaho.di.trans.TransConfiguration;
import org.pentaho.di.trans.TransExecutionConfiguration;
import org.pentaho.di.trans.TransMeta;

/**
 * This is an example Hadoop Map/Reduce application. It configures a job that
 * runs in the local JVM and uses Pentaho Kettle transformations as the map
 * and reduce phases: lines of the web-log input are fed into the mapper
 * transformation, and the reducer transformation aggregates the mapper's
 * output into a locally sorted result.
 * 
 * To run: bin/hadoop jar <i>plugin-jar</i> org.pentaho.weblogs.WebLogs [-m <i>maps</i>] [-r <i>reduces</i>]
 * <i>in-dir</i> <i>out-dir</i>
 */
public class WebLogs extends Configured implements Tool {

    private static final String input = "./junit/weblogs/input/access.log";
    private static final String outputFolder = "./junit/weblogs/output";
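    // NOTE: The two constants above mirror the bundled sample data but are
    // not used by run(); input and output locations come from the arguments.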

    static int printUsage() {
        System.out.println("Weblogs [-m <maps>] [-r <reduces>] <input> <output>");
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    /**
     * The main driver for the WebLogs map/reduce program. Invoke this method to submit the map/reduce job.
     * 
     * @throws Exception
     *           When there are communication problems with the job tracker.
     */
    public int run(String[] args) throws Exception {

        JobConf conf = new JobConf(getConf(), WebLogs.class);
        conf.setJobName("wordcount");
        conf.set("debug", "true");
        conf.setWorkingDirectory(new Path("./"));
        // The input and output paths are set below, once the optional
        // -m/-r flags have been parsed out of the argument list.

        // These are set so the job runs in the same JVM as the
        // debugger - we are not submitting to a remote MapReduce
        // cluster.
        conf.set("mapred.job.tracker", "local");
        conf.set("fs.default.name", "local");

        // The mapper, reducer and (optional) combiner classes come from the Pentaho Big Data plugin jar.
        File jar = new File("./dist/pentaho-big-data-plugin-TRUNK-SNAPSHOT.jar");
        URLClassLoader loader = new URLClassLoader(new URL[] { jar.toURI().toURL() });
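        // Loading the wrapper classes reflectively from the plugin jar avoids
        // a compile-time dependency on the Pentaho Hadoop plugin.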
        conf.setMapperClass(
                (Class<? extends Mapper>) loader.loadClass("org.pentaho.hadoop.mapreduce.GenericTransMap"));
        // conf.setCombinerClass((Class<? extends Reducer>)
        // loader.loadClass("org.pentaho.hadoop.mapreduce.GenericTransReduce"));
        conf.setReducerClass(
                (Class<? extends Reducer>) loader.loadClass("org.pentaho.hadoop.mapreduce.GenericTransReduce"));

        TransExecutionConfiguration transExecConfig = new TransExecutionConfiguration();

        TransMeta mapperTransMeta = new TransMeta("./samples/jobs/hadoop/weblogs-mapper.ktr");
        TransConfiguration mapperTransConfig = new TransConfiguration(mapperTransMeta, transExecConfig);
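        // Each transformation, together with its execution configuration, is
        // serialized to XML and shipped to the tasks via the job configuration.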
        conf.set("transformation-map-xml", mapperTransConfig.getXML());

        TransMeta reducerTransMeta = new TransMeta("./samples/jobs/hadoop/weblogs-reducer.ktr");
        TransConfiguration reducerTransConfig = new TransConfiguration(reducerTransMeta, transExecConfig);
        conf.set("transformation-reduce-xml", reducerTransConfig.getXML());

        // Transformation data interface: rows enter each transformation at its
        // "Injector" step and results are read back from its "Output" step.
        conf.set("transformation-map-input-stepname", "Injector");
        conf.set("transformation-map-output-stepname", "Output");
        conf.set("transformation-reduce-input-stepname", "Injector");
        conf.set("transformation-reduce-output-stepname", "Output");
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);

        List<String> other_args = new ArrayList<String>();
        for (int i = 0; i < args.length; ++i) {
            try {
                if ("-m".equals(args[i])) {
                    conf.setNumMapTasks(Integer.parseInt(args[++i]));
                } else if ("-r".equals(args[i])) {
                    conf.setNumReduceTasks(Integer.parseInt(args[++i]));
                } else {
                    other_args.add(args[i]);
                }
            } catch (NumberFormatException except) {
                System.out.println("ERROR: Integer expected instead of " + args[i]);
                return printUsage();
            } catch (ArrayIndexOutOfBoundsException except) {
                System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
                return printUsage();
            }
        }
        // Make sure there are exactly 2 parameters left.
        if (other_args.size() != 2) {
            System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
            return printUsage();
        }
        FileInputFormat.setInputPaths(conf, other_args.get(0));
        FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

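        // runJob() submits the configured job and blocks until it completes,
        // throwing an exception if the job fails.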
        JobClient.runJob(conf);
        return 0;
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new WebLogs(), args);
        System.exit(res);
    }
}
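
For reference, here is a minimal sketch of how this job can be driven locally against the bundled sample data. The harness class name is hypothetical, and it assumes the ./junit/weblogs sample paths and the plugin jar referenced in run() exist relative to the working directory:

// Hypothetical local-run harness; not part of the original source.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;
import org.pentaho.weblogs.WebLogs;

public class WebLogsLocalRun {
    public static void main(String[] args) throws Exception {
        // Run the WebLogs tool with a single reducer against the sample log.
        int exitCode = ToolRunner.run(new Configuration(), new WebLogs(),
                new String[] { "-r", "1",
                        "./junit/weblogs/input/access.log",
                        "./junit/weblogs/output" });
        System.exit(exitCode);
    }
}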