be.uantwerpen.adrem.disteclat.DistEclatDriver.java Source code

Introduction

Here is the source code for be.uantwerpen.adrem.disteclat.DistEclatDriver.java, the Hadoop driver class that chains the three MapReduce jobs of the DistEclat (distributed Eclat) frequent-itemset miner.
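
The driver implements Hadoop's Tool interface, so it can be launched through ToolRunner, either from the command line with hadoop jar or programmatically. Below is a minimal sketch of a programmatic launch; the launcher class and the option flags (-i, -o, -s, -m, -p) are hypothetical placeholders, since the real flag names are defined in FIMOptions and are not shown on this page.

import org.apache.hadoop.util.ToolRunner;

public class DistEclatLauncher {
    public static void main(String[] args) throws Exception {
        // Hypothetical flag names; the actual ones are defined by FIMOptions.
        String[] driverArgs = {
            "-i", "hdfs:///data/transactions.dat",   // input database
            "-o", "hdfs:///data/disteclat-output",   // output directory
            "-s", "100",                             // minimum support
            "-m", "8",                               // number of mappers
            "-p", "3"                                // prefix (seed) length
        };
        int exitCode = ToolRunner.run(new be.uantwerpen.adrem.disteclat.DistEclatDriver(), driverArgs);
        System.exit(exitCode);
    }
}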

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package be.uantwerpen.adrem.disteclat;

import static be.uantwerpen.adrem.hadoop.util.SplitByKTextInputFormat.NUMBER_OF_CHUNKS;
import static be.uantwerpen.adrem.hadoop.util.Tools.cleanDirs;
import static be.uantwerpen.adrem.hadoop.util.Tools.cleanupAfterJob;
import static be.uantwerpen.adrem.hadoop.util.Tools.prepareJob;
import static be.uantwerpen.adrem.util.FIMOptions.DELIMITER_KEY;
import static be.uantwerpen.adrem.util.FIMOptions.MIN_SUP_KEY;
import static be.uantwerpen.adrem.util.FIMOptions.NUMBER_OF_MAPPERS_KEY;
import static be.uantwerpen.adrem.util.FIMOptions.OUTPUT_DIR_KEY;
import static be.uantwerpen.adrem.util.FIMOptions.PREFIX_LENGTH_KEY;
import static java.io.File.separator;
import static java.lang.System.currentTimeMillis;
import static org.apache.hadoop.filecache.DistributedCache.addCacheFile;
import static org.apache.hadoop.mapreduce.lib.input.FileInputFormat.setInputPaths;
import static org.apache.hadoop.mapreduce.lib.output.MultipleOutputs.addNamedOutput;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import be.uantwerpen.adrem.bigfim.ComputeTidListMapper;
import be.uantwerpen.adrem.eclat.EclatMinerMapper;
import be.uantwerpen.adrem.eclat.EclatMinerReducer;
import be.uantwerpen.adrem.hadoop.util.IntArrayWritable;
import be.uantwerpen.adrem.hadoop.util.IntMatrixWritable;
import be.uantwerpen.adrem.hadoop.util.NoSplitSequenceFileInputFormat;
import be.uantwerpen.adrem.hadoop.util.SplitByKTextInputFormat;
import be.uantwerpen.adrem.util.FIMOptions;

/**
 * Driver class for the DistEclat (distributed Eclat) implementation on the Hadoop framework. DistEclat operates in
 * three steps and starts from a database in vertical format. It first mines frequent itemsets of length X (X-FIs) as
 * seeds, which are then distributed among the available mappers.
 * 
 * The first step reads the vertical database file and reports the frequent singletons. The reducer distributes these
 * singletons into distinct groups. In the next cycle the groups are used to compute the X-FI seeds, which are again
 * distributed among a new batch of mappers. Finally, the mappers compute closed sets on their local subtrees,
 * indicated by the received prefixes.
 */
public class DistEclatDriver implements Tool {

    // output files first MapReduce cycle
    public static final String OSingletonsDistribution = "singletonsDistribution";
    public static final String OSingletonsOrder = "singletonsOrder";
    public static final String OSingletonsTids = "singletonsTids";

    // output files second MapReduce cycle
    public static final String OShortFIs = "shortfis";

    // output files third MapReduce cycle
    private static final String OFis = "fis";

    // default extension for output file of first reducer
    public static final String rExt = "-r-00000";

    public static void main(String[] args) throws Exception {
        ToolRunner.run(new DistEclatDriver(), args);
    }

    @Override
    public int run(String[] args) throws Exception {
        FIMOptions opt = new FIMOptions();
        if (!opt.parseOptions(args)) {
            opt.printHelp();
            return -1;
        }

        String tmpDir1 = opt.outputDir + separator + "tmp1" + separator;
        String tmpDir2 = opt.outputDir + separator + "prefixes" + separator;

        long start = currentTimeMillis();
        cleanDirs(new String[] { opt.outputDir, tmpDir1, tmpDir2 });
        readHorizontalDb(tmpDir1, opt);
        startPrefixComputation(tmpDir1, tmpDir2, opt);
        startMining(tmpDir2, opt);
        cleanupAfterJob(opt);
        long end = currentTimeMillis();

        System.out.println("[DistEclat]: Total time: " + (end - start) / 1000 + "s");
        return 0;
    }

    /**
     * Passes all configuration flags to the Hadoop Configuration framework.
     * 
     * @param conf
     *          the Hadoop configuration
     * @param opt
     *          the user-defined options holding the configuration flags
     */
    private static void setConfigurationValues(Configuration conf, FIMOptions opt) {
        conf.set(DELIMITER_KEY, opt.delimiter);
        conf.setInt(MIN_SUP_KEY, opt.minSup);
        conf.setInt(NUMBER_OF_MAPPERS_KEY, opt.nrMappers);
        conf.setInt(NUMBER_OF_CHUNKS, opt.nrMappers);
        conf.setInt(PREFIX_LENGTH_KEY, opt.prefixLength);
        conf.setStrings(OUTPUT_DIR_KEY, opt.outputDir);
    }
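
    /**
     * Illustrative sketch (not part of the original driver): mappers and reducers can read back the values stored by
     * setConfigurationValues from their task Configuration. The default value used here is an assumption.
     */
    private static int readMinSupFromConf(Configuration conf) {
        // MIN_SUP_KEY was stored with conf.setInt, so it can be read back with getInt
        return conf.getInt(MIN_SUP_KEY, 1);
    }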

    private static void runJob(Job job, String jobName)
            throws ClassNotFoundException, IOException, InterruptedException {
        long start = System.currentTimeMillis();
        job.waitForCompletion(true);
        long end = System.currentTimeMillis();
        System.out.println("Job " + jobName + " took " + (end - start) / 1000 + "s");
    }

    /**
     * Starts the first MapReduce cycle. First, the transaction file is partitioned into a number of chunks that are
     * handed to different mappers. Each mapper reads a chunk and reports the items together with their partial
     * tid-lists. The reducer concatenates the partial tid-lists, discards the infrequent items, sorts the frequent
     * ones by ascending frequency and divides the singletons among the available mappers.
     * 
     * This method generates three files: the frequent singletons (OSingletonsTids), the order file for singletons
     * based on ascending frequency (OSingletonsOrder) and the singletons distribution file (OSingletonsDistribution).
     * 
     * @param outputFile
     *          the directory to which the output is written
     * @param opt
     *          the user-defined options
     * @throws IOException
     * @throws ClassNotFoundException
     * @throws InterruptedException
     */
    private void readHorizontalDb(String outputFile, FIMOptions opt)
            throws IOException, ClassNotFoundException, InterruptedException {
        System.out.println("[ItemReading]: input: " + opt.inputFile + ", output: " + outputFile);

        Job job = prepareJob(new Path(opt.inputFile), new Path(outputFile), SplitByKTextInputFormat.class,
                ComputeTidListMapper.class, Text.class, IntArrayWritable.class, ItemReaderReducer.class,
                IntWritable.class, Writable.class, TextOutputFormat.class);

        job.setJobName("Read Singletons");
        job.setJarByClass(DistEclatDriver.class);
        job.setNumReduceTasks(1);

        Configuration conf = job.getConfiguration();
        setConfigurationValues(conf, opt);

        addNamedOutput(job, OSingletonsDistribution, TextOutputFormat.class, Text.class, Text.class);
        addNamedOutput(job, OSingletonsOrder, TextOutputFormat.class, Text.class, Text.class);
        addNamedOutput(job, OSingletonsTids, SequenceFileOutputFormat.class, IntWritable.class,
                IntMatrixWritable.class);

        runJob(job, "Item Reading");
    }

    /**
     * Starts the second MapReduce cycle. Each mapper gets a list of singletons from which it should start building
     * X-FIs. Each mapper uses Eclat to quickly compute its list of X-FIs. The reducer collects the complete set of
     * X-FIs and divides it into independent sets. All itemsets computed at levels 1 to X are reported directly. The
     * distribution of seeds is obtained by some allocation scheme, e.g., Round-Robin, Lowest-Frequency, ...
     * 
     * This method generates three files: the frequent itemsets from level 1 to X (OFises), the prefix groups
     * (OPrefixGroups) and the prefix distribution file (OPrefixDistribution).
     * 
     * @param inputDir
     *          the directory holding the output of the first cycle
     * @param outputDir
     *          the directory to which the prefix groups are written
     * @param opt
     *          the user-defined options
     * @throws IOException
     * @throws InterruptedException
     * @throws ClassNotFoundException
     * @throws URISyntaxException
     */
    private void startPrefixComputation(String inputDir, String outputDir, FIMOptions opt)
            throws IOException, InterruptedException, ClassNotFoundException, URISyntaxException {

        String inputFile = inputDir + separator + OSingletonsDistribution + rExt;
        String singletonsOrderFile = inputDir + separator + OSingletonsOrder + rExt;
        String singletonsTidsFile = inputDir + separator + OSingletonsTids + rExt;

        System.out.println("[PrefixComputation]: input: " + inputFile);

        Job job = prepareJob(new Path(inputFile), new Path(outputDir), NLineInputFormat.class,
                PrefixComputerMapper.class, Text.class, IntMatrixWritable.class, PrefixComputerReducer.class,
                IntArrayWritable.class, IntMatrixWritable.class, SequenceFileOutputFormat.class);

        job.setJobName("Compute Prefixes");
        job.setJarByClass(DistEclatDriver.class);
        job.setNumReduceTasks(1);

        Configuration conf = job.getConfiguration();
        setConfigurationValues(conf, opt);

        addCacheFile(new URI(singletonsOrderFile.replace(" ", "%20")), job.getConfiguration());
        addCacheFile(new URI(singletonsTidsFile.replace(" ", "%20")), job.getConfiguration());

        runJob(job, "Partition Prefixes");
    }

    /**
     * Starts the third MapReduce cycle. Each mapper reads the prefix groups assigned to it and computes the
     * collection of closed sets. All information is reported to the reducer, which finally writes the output to disk.
     * 
     * @param inputDir
     *          the directory holding the prefix groups
     * @param opt
     *          the user-defined options
     * @throws IOException
     * @throws InterruptedException
     * @throws ClassNotFoundException
     * @throws URISyntaxException
     */
    private void startMining(String inputDir, FIMOptions opt)
            throws IOException, InterruptedException, ClassNotFoundException, URISyntaxException {

        String inputFilesDir = inputDir;
        String outputFile = opt.outputDir + separator + OFis;
        System.out.println("[StartMining]: input: " + inputFilesDir + ", output: " + outputFile);

        Job job = prepareJob(new Path(inputFilesDir), new Path(outputFile), NoSplitSequenceFileInputFormat.class,
                EclatMinerMapper.class, Text.class, Text.class, EclatMinerReducer.class, Text.class, Text.class,
                TextOutputFormat.class);

        job.setJobName("Start Mining");
        job.setJarByClass(DistEclatDriver.class);
        job.setNumReduceTasks(1);

        Configuration conf = job.getConfiguration();
        setConfigurationValues(conf, opt);

        List<Path> inputPaths = new ArrayList<Path>();

        FileStatus[] listStatus = FileSystem.get(conf).globStatus(new Path(inputFilesDir + "bucket*"));
        for (FileStatus fstat : listStatus) {
            inputPaths.add(fstat.getPath());
        }

        if (inputPaths.isEmpty()) {
            System.out.println("[StartMining]: No prefixes to extend further");
            return;
        }

        setInputPaths(job, inputPaths.toArray(new Path[inputPaths.size()]));

        runJob(job, "Mining");
    }

    private Configuration conf;

    @Override
    public Configuration getConf() {
        return conf;
    }

    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
    }
}
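
Once the third job finishes, the mined itemsets end up under <outputDir>/fis as plain text written by TextOutputFormat. Below is a minimal sketch for reading those part files back; the reader class and the output path are hypothetical examples, and the exact line format is determined by EclatMinerReducer, which is not shown on this page.

import java.io.BufferedReader;
import java.io.InputStreamReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ReadMinedItemsets {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Hypothetical output location: <outputDir>/fis, as composed in startMining()
        Path fisDir = new Path("hdfs:///data/disteclat-output/fis");
        for (FileStatus status : fs.listStatus(fisDir)) {
            // Reducer output files follow the usual part-r-NNNNN naming of TextOutputFormat
            if (!status.getPath().getName().startsWith("part-")) {
                continue;
            }
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(status.getPath())))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    System.out.println(line);
                }
            }
        }
    }
}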