com.cg.mapreduce.myfpgrowth.PFPGrowth.java Source code

Introduction

Here is the source code for com.cg.mapreduce.myfpgrowth.PFPGrowth.java
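This class is the driver for Parallel FP-Growth: it first runs a parallel counting MapReduce job, then builds the grouped frequency list and pushes it to the DistributedCache, and finally runs the group-dependent FP-Growth job. A minimal sketch of driving it (the HDFS paths, the helper class name RunPFP, and the parameter values below are illustrative assumptions, not part of the original source):

package com.cg.mapreduce.myfpgrowth;

import org.apache.mahout.common.Parameters;

public class RunPFP {
    public static void main(String[] args) throws Exception {
        Parameters params = new Parameters();
        params.set(PFPGrowth.INPUT, "/user/hadoop/transactions"); // hypothetical input: one transaction per line
        params.set(PFPGrowth.OUTPUT, "/user/hadoop/pfp");         // hypothetical output directory
        params.set(PFPGrowth.MIN_SUPPORT, "3");                   // drop features seen fewer than 3 times
        params.set(PFPGrowth.NUM_GROUPS, "1000");                 // number of feature groups for the FP-Growth shards
        PFPGrowth.runPFPGrowth(params);
    }
}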

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.cg.mapreduce.myfpgrowth;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.PriorityQueue;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.Parameters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;

import com.cg.mapreduce.utils.EJob;
import com.google.common.collect.Lists;

/**
 * 
 * Parallel FP Growth Driver Class. Runs each stage of PFPGrowth as described in the paper
 * http://infolab.stanford.edu/~echang/recsys08-69.pdf
 * 
 */
@Deprecated
public final class PFPGrowth {

    public static final String ENCODING = "encoding";
    public static final String F_LIST = "fList";
    public static final String NUM_GROUPS = "numGroups";
    public static final int NUM_GROUPS_DEFAULT = 1000;
    public static final String MAX_PER_GROUP = "maxPerGroup";
    public static final String OUTPUT = "output";
    public static final String MIN_SUPPORT = "minSupport";
    public static final String MAX_HEAPSIZE = "maxHeapSize";
    public static final String INPUT = "input";
    public static final String PFP_PARAMETERS = "pfp.parameters";
    public static final String FILE_PATTERN = "part-*";
    public static final String FPGROWTH = "fpgrowth";
    public static final String FREQUENT_PATTERNS = "frequentpatterns";
    public static final String PARALLEL_COUNTING = "parallelcounting";
    public static final String SPLIT_PATTERN = "splitPattern";
    public static final String USE_FPG2 = "use_fpg2";

    // Splits a transaction line on a comma, pipe or tab, swallowing any surrounding spaces, commas
    // and tabs; for example "milk, bread\tbeer" splits into "milk", "bread", "beer".
    public static final Pattern SPLITTER = Pattern.compile("[ ,\t]*[,|\t][ ,\t]*");

    private PFPGrowth() {
    }

    /**
     * Runs the full Parallel FP-Growth pipeline: the parallel counting job, the generation and
     * caching of the grouped feature list, and the group-dependent FP-Growth job.
     * 
     * @param params
     *          input and output locations, plus optional settings such as minSupport (default 3),
     *          maxHeapSize (50) and numGroups (1000)
     * @throws IOException
     * @throws InterruptedException
     * @throws ClassNotFoundException
     */
    public static void runPFPGrowth(Parameters params)
            throws IOException, InterruptedException, ClassNotFoundException {

        Configuration conf = new Configuration();
        conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
                + "org.apache.hadoop.io.serializer.WritableSerialization");
        startParallelCounting(params, conf);

        genGroupList(params, conf);

        startParallelFPGrowth(params, conf);
    }

    private static void genGroupList(Parameters params, Configuration conf) throws IOException {
        // save feature list to dcache
        List<Pair<String, Long>> fList = readFList(params);
        saveFList(fList, params, conf);
    }

    /**
     * Serializes the fList as (feature, (group, count)) pairs to a SequenceFile, registers that file
     * with the DistributedCache, and stores the computed maxPerGroup value in params.
     */
    public static void saveFList(List<Pair<String, Long>> fList, Parameters params, Configuration conf)
            throws IOException {
        Path flistPath = new Path(params.get(OUTPUT) + "/oldlist", F_LIST);
        FileSystem fs = FileSystem.get(flistPath.toUri(), conf);
        flistPath = fs.makeQualified(flistPath);
        HadoopUtil.delete(conf, flistPath);
        SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, flistPath, Text.class, Pair.class);

        // set param to control group size in MR jobs
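        // e.g. with (hypothetical) 25,000 frequent features and numGroups = 1000, maxPerGroup is 25,
        // so fList ranks 0..24 fall into group 0, ranks 25..49 into group 1, and so on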
        int numGroups = params.getInt(NUM_GROUPS, NUM_GROUPS_DEFAULT);
        int maxPerGroup = fList.size() / numGroups;
        if (fList.size() % numGroups != 0) {
            maxPerGroup++;
        }
        params.set(MAX_PER_GROUP, Integer.toString(maxPerGroup));

        try {
            int group = 0;
            int count = 0;
            for (Pair<String, Long> pair : fList) {
                if (count == maxPerGroup) {
                    group++;
                    count = 0;
                }
                writer.append(new Text(pair.getFirst()), new Pair<Integer, Long>(group, pair.getSecond()));
                //writer.append(new Text(pair.getFirst()), new LongWritable(pair.getSecond()));
                count++; // advance within the current group; without this every item lands in group 0
            }
        } finally {
            writer.close();
        }
        DistributedCache.addCacheFile(flistPath.toUri(), conf);
    }

    /**
     * Reads the feature frequency list produced by the parallel counting job, keeping only features
     * whose count is at least minSupport, sorted by descending frequency.
     * 
     * @return Feature Frequency List
     */
    public static List<Pair<String, Long>> readFList(Parameters params) {
        int minSupport = Integer.valueOf(params.get(MIN_SUPPORT, "3"));
        Configuration conf = new Configuration();

        Path parallelCountingPath = new Path(params.get(OUTPUT), PARALLEL_COUNTING);

        PriorityQueue<Pair<String, Long>> queue = new PriorityQueue<Pair<String, Long>>(11,
                new Comparator<Pair<String, Long>>() {
                    @Override
                    public int compare(Pair<String, Long> o1, Pair<String, Long> o2) {
                        int ret = o2.getSecond().compareTo(o1.getSecond());
                        if (ret != 0) {
                            return ret;
                        }
                        return o1.getFirst().compareTo(o2.getFirst());
                    }
                });

        for (Pair<Text, LongWritable> record : new SequenceFileDirIterable<Text, LongWritable>(
                new Path(parallelCountingPath, FILE_PATTERN), PathType.GLOB, null, null, true, conf)) {
            long value = record.getSecond().get();
            if (value >= minSupport) {
                queue.add(new Pair<String, Long>(record.getFirst().toString(), value));
            }
        }
        List<Pair<String, Long>> fList = Lists.newArrayList();
        while (!queue.isEmpty()) {
            fList.add(queue.poll());
        }
        return fList;
    }

    /**
     * Count the frequencies of various features in parallel using Map/Reduce
     */
    public static void startParallelCounting(Parameters params, Configuration conf)
            throws IOException, InterruptedException, ClassNotFoundException {
        conf.set(PFP_PARAMETERS, params.toString());
        conf.set("mapred.compress.map.output", "true");
        conf.set("mapred.output.compression.type", "BLOCK");

        String input = params.get(INPUT);
        Job job = new Job(conf, "Parallel Counting Driver running over input: " + input);
        job.setJarByClass(PFPGrowth.class);

        //    Job job = initJob(conf);  
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        FileInputFormat.addInputPath(job, new Path(input));
        Path outPath = new Path(params.get(OUTPUT), PARALLEL_COUNTING);
        FileOutputFormat.setOutputPath(job, outPath);

        HadoopUtil.delete(conf, outPath);

        job.setInputFormatClass(TextInputFormat.class);
        job.setMapperClass(ParallelCountingMapper.class);
        job.setCombinerClass(ParallelCountingReducer.class);
        job.setReducerClass(ParallelCountingReducer.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        boolean succeeded = job.waitForCompletion(true);
        if (!succeeded) {
            throw new IllegalStateException("Job failed!");
        }

    }

    /**
     * Run the Parallel FPGrowth Map/Reduce Job to calculate the Top K features of group dependent shards
     */
    public static void startParallelFPGrowth(Parameters params, Configuration conf)
            throws IOException, InterruptedException, ClassNotFoundException {
        conf.set(PFP_PARAMETERS, params.toString());
        conf.set("mapred.compress.map.output", "true");
        conf.set("mapred.output.compression.type", "BLOCK");

        Path input = new Path(params.get(INPUT));
        Job job = new Job(conf, "PFP Growth Driver running over input: " + input);
        job.setJarByClass(PFPGrowth.class);
        //    Job job = initJob(conf);

        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(ArrayList.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        FileInputFormat.addInputPath(job, input);
        Path outPath = new Path(params.get(OUTPUT), FPGROWTH);
        FileOutputFormat.setOutputPath(job, outPath);

        HadoopUtil.delete(conf, outPath);

        job.setInputFormatClass(TextInputFormat.class);
        job.setMapperClass(ParallelFPGrowthMapper.class);
        //job.setCombinerClass(ParallelFPGrowthCombiner.class);
        job.setReducerClass(ParallelFPGrowthReducer.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        boolean succeeded = job.waitForCompletion(true);
        if (!succeeded) {
            throw new IllegalStateException("Job failed!");
        }
    }

    /**
     * Builds a Job for submitting this driver from a local development machine: loads the cluster
     * configuration from a local Hadoop install, packages the compiled classes into a temporary jar
     * via EJob, and sets that jar on the job. Only used when the commented-out initJob(conf) calls
     * in the two start methods above are re-enabled.
     */
    private static Job initJob(Configuration conf) {
        conf.addResource(new Path("D:/program/hadoop-2.6.0/etc/hadoop/core-site.xml"));
        conf.addResource(new Path("D:/program/hadoop-2.6.0/etc/hadoop/hdfs-default.xml"));
        conf.addResource(new Path("D:/program/hadoop-2.6.0/etc/hadoop/hdfs-site.xml"));
        conf.addResource(new Path("D:/program/hadoop-2.6.0/etc/hadoop/yarn-default.xml"));
        conf.addResource(new Path("D:/program/hadoop-2.6.0/etc/hadoop/yarn-site.xml"));
        conf.addResource(new Path("D:/program/hadoop-2.6.0/etc/hadoop/mapred-site.xml"));
        conf.set("HADOOP_USER_NAME", "hadoop");
        conf.set("mapred.reduce.tasks", "3");
        Job job = null;
        try {
            File jarFile = EJob.createTempJar("bin");
            EJob.addClasspath("D:/program/hadoop-2.6.0/etc/hadoop/");
            ClassLoader classLoader = EJob.getClassLoader();
            Thread.currentThread().setContextClassLoader(classLoader);
            job = new Job(conf, "PFP");
            ((JobConf) job.getConfiguration()).setJar(jarFile.toString());
        } catch (IOException e) {
            e.printStackTrace();
        }
        return job;
    }
}
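
After a successful run, the per-item counts are written under output/parallelcounting and the frequent patterns under output/fpgrowth, both as Hadoop SequenceFiles; the grouped feature list lives under output/oldlist/fList. A hedged sketch for dumping the counts produced by the counting job, reusing the same SequenceFileDirIterable call that readFList uses above (the output path and the class name DumpCounts are assumptions):

package com.cg.mapreduce.myfpgrowth;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;

public class DumpCounts {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // hypothetical output directory used when the driver was run
        Path counts = new Path("/user/hadoop/pfp/" + PFPGrowth.PARALLEL_COUNTING, PFPGrowth.FILE_PATTERN);
        for (Pair<Text, LongWritable> record
                : new SequenceFileDirIterable<Text, LongWritable>(counts, PathType.GLOB, null, null, true, conf)) {
            System.out.println(record.getFirst() + "\t" + record.getSecond().get());
        }
    }
}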