Java tutorial: Parallel FP-Growth (PFPGrowth) MapReduce driver
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.cg.mapreduce.myfpgrowth;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.PriorityQueue;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.Parameters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;

import com.cg.mapreduce.utils.EJob;
import com.google.common.collect.Lists;

/**
 * Parallel FP-Growth driver class. Runs each stage of PFPGrowth as described in the paper
 * http://infolab.stanford.edu/~echang/recsys08-69.pdf
 */
@Deprecated
public final class PFPGrowth {

  public static final String ENCODING = "encoding";
  public static final String F_LIST = "fList";
  public static final String NUM_GROUPS = "numGroups";
  public static final int NUM_GROUPS_DEFAULT = 1000;
  public static final String MAX_PER_GROUP = "maxPerGroup";
  public static final String OUTPUT = "output";
  public static final String MIN_SUPPORT = "minSupport";
  public static final String MAX_HEAPSIZE = "maxHeapSize";
  public static final String INPUT = "input";
  public static final String PFP_PARAMETERS = "pfp.parameters";
  public static final String FILE_PATTERN = "part-*";
  public static final String FPGROWTH = "fpgrowth";
  public static final String FREQUENT_PATTERNS = "frequentpatterns";
  public static final String PARALLEL_COUNTING = "parallelcounting";
  public static final String SPLIT_PATTERN = "splitPattern";
  public static final String USE_FPG2 = "use_fpg2";

  public static final Pattern SPLITTER = Pattern.compile("[ ,\t]*[,|\t][ ,\t]*");

  private PFPGrowth() {
  }

  /**
   * Run the full PFPGrowth pipeline over the given input.
   *
   * @param params
   *          input and output locations, plus additional parameters such as minSupport(3),
   *          maxHeapSize(50) and numGroups(1000); the Hadoop configuration is created internally
   * @throws IOException
   * @throws InterruptedException
   * @throws ClassNotFoundException
   */
  public static void runPFPGrowth(Parameters params)
    throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    conf.set("io.serializations",
        "org.apache.hadoop.io.serializer.JavaSerialization,"
        + "org.apache.hadoop.io.serializer.WritableSerialization");
    startParallelCounting(params, conf);
    genGroupList(params, conf);
    startParallelFPGrowth(params, conf);
  }

  private static void genGroupList(Parameters params, Configuration conf) throws IOException {
    // save feature list to the distributed cache
    List<Pair<String, Long>> fList = readFList(params);
    saveFList(fList, params, conf);
  }

  /**
   * Serializes the fList to a SequenceFile, assigning each feature to a group, and adds the file
   * to the DistributedCache so the FP-Growth mappers can read it.
   */
  public static void saveFList(List<Pair<String, Long>> fList, Parameters params, Configuration conf)
    throws IOException {
    Path flistPath = new Path(params.get(OUTPUT) + "/oldlist", F_LIST);
    FileSystem fs = FileSystem.get(flistPath.toUri(), conf);
    flistPath = fs.makeQualified(flistPath);
    HadoopUtil.delete(conf, flistPath);
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, flistPath, Text.class, Pair.class);

    // set param to control group size in MR jobs
    int numGroups = params.getInt(NUM_GROUPS, NUM_GROUPS_DEFAULT);
    int maxPerGroup = fList.size() / numGroups;
    if (fList.size() % numGroups != 0) {
      maxPerGroup++;
    }
    params.set(MAX_PER_GROUP, Integer.toString(maxPerGroup));

    try {
      int group = 0;
      int count = 0;
      for (Pair<String, Long> pair : fList) {
        if (count == maxPerGroup) {
          group++;
          count = 0;
        }
        writer.append(new Text(pair.getFirst()), new Pair<Integer, Long>(group, pair.getSecond()));
        //writer.append(new Text(pair.getFirst()), new LongWritable(pair.getSecond()));
        count++; // advance within the current group; without this every feature would land in group 0
      }
    } finally {
      writer.close();
    }
    DistributedCache.addCacheFile(flistPath.toUri(), conf);
  }

  /**
   * Read the feature frequency list built at the end of the parallel counting job.
   *
   * @return feature frequency list, sorted by descending frequency
   */
  public static List<Pair<String, Long>> readFList(Parameters params) {
    int minSupport = Integer.valueOf(params.get(MIN_SUPPORT, "3"));
    Configuration conf = new Configuration();

    Path parallelCountingPath = new Path(params.get(OUTPUT), PARALLEL_COUNTING);

    PriorityQueue<Pair<String, Long>> queue = new PriorityQueue<Pair<String, Long>>(11,
        new Comparator<Pair<String, Long>>() {
          @Override
          public int compare(Pair<String, Long> o1, Pair<String, Long> o2) {
            int ret = o2.getSecond().compareTo(o1.getSecond());
            if (ret != 0) {
              return ret;
            }
            return o1.getFirst().compareTo(o2.getFirst());
          }
        });

    for (Pair<Text, LongWritable> record
         : new SequenceFileDirIterable<Text, LongWritable>(new Path(parallelCountingPath, FILE_PATTERN),
                                                            PathType.GLOB, null, null, true, conf)) {
      long value = record.getSecond().get();
      if (value >= minSupport) {
        queue.add(new Pair<String, Long>(record.getFirst().toString(), value));
      }
    }

    List<Pair<String, Long>> fList = Lists.newArrayList();
    while (!queue.isEmpty()) {
      fList.add(queue.poll());
    }
    return fList;
  }

  /**
   * Count the frequencies of the individual features in parallel using Map/Reduce.
   */
  public static void startParallelCounting(Parameters params, Configuration conf)
    throws IOException, InterruptedException, ClassNotFoundException {
    conf.set(PFP_PARAMETERS, params.toString());

    conf.set("mapred.compress.map.output", "true");
    conf.set("mapred.output.compression.type", "BLOCK");

    String input = params.get(INPUT);
    Job job = new Job(conf, "Parallel Counting Driver running over input: " + input);
    job.setJarByClass(PFPGrowth.class);
    // Job job = initJob(conf);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    FileInputFormat.addInputPath(job, new Path(input));
    Path outPath = new Path(params.get(OUTPUT), PARALLEL_COUNTING);
    FileOutputFormat.setOutputPath(job, outPath);

    HadoopUtil.delete(conf, outPath);

    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(ParallelCountingMapper.class);
    job.setCombinerClass(ParallelCountingReducer.class);
    job.setReducerClass(ParallelCountingReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
      throw new IllegalStateException("Job failed!");
    }
  }

  /**
   * Run the parallel FP-Growth Map/Reduce job to calculate the top-K frequent patterns of the
   * group-dependent shards.
   */
  public static void startParallelFPGrowth(Parameters params, Configuration conf)
    throws IOException, InterruptedException, ClassNotFoundException {
    conf.set(PFP_PARAMETERS, params.toString());

    conf.set("mapred.compress.map.output", "true");
    conf.set("mapred.output.compression.type", "BLOCK");

    Path input = new Path(params.get(INPUT));
    Job job = new Job(conf, "PFP Growth Driver running over input: " + input);
    job.setJarByClass(PFPGrowth.class);
    // Job job = initJob(conf);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(ArrayList.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    FileInputFormat.addInputPath(job, input);
    Path outPath = new Path(params.get(OUTPUT), FPGROWTH);
    FileOutputFormat.setOutputPath(job, outPath);

    HadoopUtil.delete(conf, outPath);

    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(ParallelFPGrowthMapper.class);
    //job.setCombinerClass(ParallelFPGrowthCombiner.class);
    job.setReducerClass(ParallelFPGrowthReducer.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
      throw new IllegalStateException("Job failed!");
    }
  }

  /**
   * Alternative job setup for submitting from a local (Windows) development machine: loads the
   * cluster configuration from local copies of the Hadoop config files and packages the compiled
   * classes into a temporary jar via EJob. Currently unused (see the commented-out calls above).
   */
  private static Job initJob(Configuration conf) {
    conf.addResource(new Path("D:/program/hadoop-2.6.0/etc/hadoop/core-site.xml"));
    conf.addResource(new Path("D:/program/hadoop-2.6.0/etc/hadoop/hdfs-default.xml"));
    conf.addResource(new Path("D:/program/hadoop-2.6.0/etc/hadoop/hdfs-site.xml"));
    conf.addResource(new Path("D:/program/hadoop-2.6.0/etc/hadoop/yarn-default.xml"));
    conf.addResource(new Path("D:/program/hadoop-2.6.0/etc/hadoop/yarn-site.xml"));
    conf.addResource(new Path("D:/program/hadoop-2.6.0/etc/hadoop/mapred-site.xml"));
    conf.set("HADOOP_USER_NAME", "hadoop");
    conf.set("mapred.reduce.tasks", "3");

    Job job = null;
    try {
      // package the compiled classes under bin/ into a temporary jar so the cluster can run them
      File jarFile = EJob.createTempJar("bin");
      EJob.addClasspath("D:/program/hadoop-2.6.0/etc/hadoop/");
      ClassLoader classLoader = EJob.getClassLoader();
      Thread.currentThread().setContextClassLoader(classLoader);
      job = new Job(conf, "PFP");
      ((JobConf) job.getConfiguration()).setJar(jarFile.toString());
    } catch (IOException e) {
      e.printStackTrace();
    }
    return job;
  }
}
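To show how the driver above is meant to be invoked, here is a minimal usage sketch. It is an illustration only: the class name PFPGrowthExample, the HDFS paths and the parameter values are hypothetical, and it assumes the input directory contains plain-text transactions whose items are separated by commas or tabs, as expected by SPLITTER.

package com.cg.mapreduce.myfpgrowth;

import org.apache.mahout.common.Parameters;

/**
 * Minimal example driver for PFPGrowth.runPFPGrowth(). The paths and values below are
 * placeholders, not part of the class above.
 */
public class PFPGrowthExample {

  public static void main(String[] args) throws Exception {
    Parameters params = new Parameters();
    params.set(PFPGrowth.INPUT, "/user/hadoop/transactions");  // hypothetical HDFS input directory
    params.set(PFPGrowth.OUTPUT, "/user/hadoop/pfp-output");   // hypothetical HDFS output directory
    params.set(PFPGrowth.MIN_SUPPORT, "3");                    // drop features occurring fewer than 3 times
    params.set(PFPGrowth.NUM_GROUPS, "1000");                  // number of feature groups for the FP-Growth job

    // Runs parallel counting, writes the grouped fList to the DistributedCache,
    // then runs the parallel FP-Growth job.
    PFPGrowth.runPFPGrowth(params);
  }
}

After the run, the raw feature counts are written as SequenceFiles under <output>/parallelcounting and the per-group FP-Growth results under <output>/fpgrowth, matching the PARALLEL_COUNTING and FPGROWTH constants used in the driver.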