be.uantwerpen.adrem.bigfim.BigFIMDriver.java Source code

Introduction

Here is the source code for be.uantwerpen.adrem.bigfim.BigFIMDriver.java.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package be.uantwerpen.adrem.bigfim;

import static be.uantwerpen.adrem.bigfim.AprioriPhaseReducer.COUNTER_GROUPNAME;
import static be.uantwerpen.adrem.bigfim.AprioriPhaseReducer.COUNTER_NRLARGEPREFIXGROUPS;
import static be.uantwerpen.adrem.hadoop.util.SplitByKTextInputFormat.NUMBER_OF_CHUNKS;
import static be.uantwerpen.adrem.hadoop.util.Tools.cleanDirs;
import static be.uantwerpen.adrem.hadoop.util.Tools.cleanupAfterJob;
import static be.uantwerpen.adrem.hadoop.util.Tools.prepareJob;
import static be.uantwerpen.adrem.util.FIMOptions.DELIMITER_KEY;
import static be.uantwerpen.adrem.util.FIMOptions.MIN_SUP_KEY;
import static be.uantwerpen.adrem.util.FIMOptions.NUMBER_OF_LINES_KEY;
import static be.uantwerpen.adrem.util.FIMOptions.NUMBER_OF_MAPPERS_KEY;
import static be.uantwerpen.adrem.util.FIMOptions.OUTPUT_DIR_KEY;
import static be.uantwerpen.adrem.util.FIMOptions.PREFIX_LENGTH_KEY;
import static java.io.File.separator;
import static org.apache.hadoop.filecache.DistributedCache.addCacheFile;
import static org.apache.hadoop.mapreduce.lib.input.FileInputFormat.setInputPaths;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import be.uantwerpen.adrem.eclat.EclatMinerMapper;
import be.uantwerpen.adrem.eclat.EclatMinerReducer;
import be.uantwerpen.adrem.hadoop.util.IntArrayWritable;
import be.uantwerpen.adrem.hadoop.util.IntMatrixWritable;
import be.uantwerpen.adrem.hadoop.util.NoSplitSequenceFileInputFormat;
import be.uantwerpen.adrem.hadoop.util.SplitByKTextInputFormat;
import be.uantwerpen.adrem.util.FIMOptions;

/**
 * Driver class for the BigFIM algorithm. This class calls all necessary map and reduce cycles and eventually writes the
 * output to a subdirectory called 'fis'.
 * 
 * The first phase mines the x-prefixes using a distributed version of Apriori. Each mapper is given a part of the
 * database and computes the frequent words that are one item longer than in the previous pass. In the second phase,
 * the tid lists of these words are computed. The third and last phase runs Eclat on conditional datasets to compute
 * the closed sets.
 */
public class BigFIMDriver implements Tool {

    private static final String OFis = "fis";
    public static final String rExt = "-r-00000";

    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new BigFIMDriver(), args));
    }

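    /**
     * Parses the command-line options and runs the three BigFIM phases in order: the Apriori phase, prefix group
     * creation, and Eclat mining.
     */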
    @Override
    public int run(String[] args) throws Exception {
        FIMOptions opt = new FIMOptions();
        if (!opt.parseOptions(args)) {
            opt.printHelp();
            return -1;
        }

        cleanDirs(new String[] { opt.outputDir });
        long start = System.currentTimeMillis();

        int phase = startAprioriPhase(opt);
        if (canStartPrefixGeneration(opt, phase)) {
            startCreatePrefixGroups(opt, phase);
            if (canStartMining(opt)) {
                startMining(opt);
            } else {
                System.out.println("[BigFIM]: No prefix groups found");
            }
        } else {
            System.out.println("[BigFIM]: No prefixes to extend further");
        }
        cleanupAfterJob(opt);
        long end = System.currentTimeMillis();

        System.out.println("[BigFIM]: Total time: " + (end - start) / 1000 + "s");
        return 0;
    }

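    /**
     * Checks whether prefix groups can be generated, i.e., the requested prefix length has been reached and the
     * trie group directory of the given phase exists.
     */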
    private boolean canStartPrefixGeneration(FIMOptions opt, int phase) throws IOException {
        Path path = new Path(opt.outputDir + separator + "tg" + phase);
        return phase >= opt.prefixLength && path.getFileSystem(new Configuration()).exists(path);
    }

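    /**
     * Checks whether the mining phase can start, i.e., the prefix group directory exists.
     */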
    private boolean canStartMining(FIMOptions opt) throws IOException {
        Path path = new Path(opt.outputDir + separator + "pg");
        return path.getFileSystem(new Configuration()).exists(path);
    }

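    /**
     * Copies the relevant FIM options (delimiter, minimum support, number of mappers/chunks, prefix length, output
     * directory) into the Hadoop configuration shared by all jobs.
     */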
    private static void setConfigurationValues(Configuration conf, FIMOptions opt) {
        conf.set(DELIMITER_KEY, opt.delimiter);
        conf.setInt(MIN_SUP_KEY, opt.minSup);
        conf.setInt(NUMBER_OF_MAPPERS_KEY, opt.nrMappers);
        conf.setInt(NUMBER_OF_CHUNKS, opt.nrMappers);
        conf.setInt(PREFIX_LENGTH_KEY, opt.prefixLength);
        conf.setStrings(OUTPUT_DIR_KEY, opt.outputDir);
    }

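    /**
     * Runs the given job synchronously and reports its wall-clock running time.
     */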
    private static void runJob(Job job, String jobName)
            throws ClassNotFoundException, IOException, InterruptedException {
        long start = System.currentTimeMillis();
        job.waitForCompletion(true);
        long end = System.currentTimeMillis();
        System.out.println("Job " + jobName + " took " + (end - start) / 1000 + "s");
    }

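    /**
     * Runs the Apriori phase: one map-reduce pass per word length and, from the second pass on, one pass per trie
     * group of the previous length, until no more large prefix groups are found. Returns the number of passes run.
     */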
    protected int startAprioriPhase(FIMOptions opt)
            throws IOException, InterruptedException, ClassNotFoundException, URISyntaxException {
        long nrLines = -1;
        int i = 1;
        boolean run = true;
        while (run) {
            if (i == 1) {
                String outputDir = opt.outputDir + separator + "ap" + i;
                String info = "Apriori Phase " + i;
                run = runAprioriPhaseOnce(opt, nrLines, i, info, outputDir, null);
            } else {
                Path path = new Path(opt.outputDir + separator + "tg" + (i - 1));
                for (FileStatus status : path.getFileSystem(new Configuration()).listStatus(path)) {
                    String cacheFile = status.getPath().toString();
                    String trieGroupCount = cacheFile.substring(cacheFile.lastIndexOf('/'), cacheFile.length());
                    trieGroupCount = trieGroupCount.split("-")[1];
                    String outputDir = opt.outputDir + separator + "ap" + i + "-trieGroup" + trieGroupCount;
                    System.out.println("CacheFile " + cacheFile);
                    String info = "Apriori Phase " + i + ", Trie Group " + trieGroupCount;
                    run = runAprioriPhaseOnce(opt, nrLines, i, info, outputDir, cacheFile);
                }
            }

            i++;
        }
        return i - 1;
    }

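    /**
     * Configures and runs a single Apriori pass. Returns false when the prefix length has been reached and the pass
     * produced no more large prefix groups, which stops the Apriori phase.
     */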
    private boolean runAprioriPhaseOnce(FIMOptions opt, long nrLines, int i, String info, String outputDir,
            String cacheFile) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {
        int prefixSize = opt.prefixLength;

        System.out.println("[AprioriPhase]: Phase: " + i + " input: " + opt.inputFile + ", output: " + opt.outputDir);

        Job job = prepareJob(new Path(opt.inputFile), new Path(outputDir), SplitByKTextInputFormat.class,
                AprioriPhaseMapper.class, Text.class, Text.class, AprioriPhaseReducer.class, Text.class,
                IntWritable.class, TextOutputFormat.class);

        job.setJobName(info);
        job.setJarByClass(BigFIMDriver.class);

        job.setNumReduceTasks(1);

        Configuration conf = job.getConfiguration();
        setConfigurationValues(conf, opt);
        if (nrLines != -1) {
            conf.setLong(NUMBER_OF_LINES_KEY, nrLines);
        }

        if (cacheFile != null) {
            addCacheFile(new URI(cacheFile.replace(" ", "%20")), conf);
        }

        runJob(job, info);

        if (prefixSize <= i
                && job.getCounters().findCounter(COUNTER_GROUPNAME, COUNTER_NRLARGEPREFIXGROUPS).getValue() == 0) {
            return false;
        }
        if (prefixSize < i) {
            System.out.println("[AprioriPhase]: Prefix group length updated! Now " + i + " instead of " + prefixSize);
        }
        return true;
    }

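    /**
     * Runs the second phase: for each trie group of the final Apriori pass, a map-reduce job computes the tid lists
     * of the frequent prefixes and writes them out as prefix groups.
     */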
    private void startCreatePrefixGroups(FIMOptions opt, int phase)
            throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        Path path = new Path(opt.outputDir + separator + "tg" + phase);
        for (FileStatus status : path.getFileSystem(new Configuration()).listStatus(path)) {
            String cacheFile = status.getPath().toString();
            String trieGroupCount = cacheFile.substring(cacheFile.lastIndexOf('/'), cacheFile.length());
            trieGroupCount = trieGroupCount.split("-")[1];
            String outputFile = opt.outputDir + separator + "pg-trieGroup" + trieGroupCount;
            System.out.println("[CreatePrefixGroups]: input: " + opt.inputFile + ", output: " + opt.outputDir
                    + ", cache: " + cacheFile);

            Job job = prepareJob(new Path(opt.inputFile), new Path(outputFile), SplitByKTextInputFormat.class,
                    ComputeTidListMapper.class, Text.class, IntArrayWritable.class, ComputeTidListReducer.class,
                    IntArrayWritable.class, IntMatrixWritable.class, SequenceFileOutputFormat.class);

            job.setJobName("Create Prefix Groups");
            job.setJarByClass(BigFIMDriver.class);
            job.setNumReduceTasks(1);

            Configuration conf = job.getConfiguration();
            setConfigurationValues(conf, opt);
            conf.setInt(PREFIX_LENGTH_KEY, phase);

            addCacheFile(new URI(cacheFile.replace(" ", "%20")), conf);

            runJob(job, "Prefix Creation");
        }
    }

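    /**
     * Runs the third phase: Eclat mining on the prefix group bucket files, writing the mined itemsets to the 'fis'
     * subdirectory.
     */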
    private void startMining(FIMOptions opt) throws IOException, ClassNotFoundException, InterruptedException {
        String inputFilesDir = opt.outputDir + separator + "pg" + separator;
        String outputFile = opt.outputDir + separator + OFis;
        System.out.println("[StartMining]: input: " + inputFilesDir + ", output: " + outputFile);

        Job job = prepareJob(new Path(inputFilesDir), new Path(outputFile), NoSplitSequenceFileInputFormat.class,
                EclatMinerMapper.class, Text.class, Text.class, EclatMinerReducer.class, Text.class, Text.class,
                TextOutputFormat.class);

        job.setJobName("Start Mining");
        job.setJarByClass(BigFIMDriver.class);
        job.setNumReduceTasks(1);

        Configuration conf = job.getConfiguration();
        setConfigurationValues(conf, opt);

        List<Path> inputPaths = new ArrayList<Path>();

        FileStatus[] listStatus = FileSystem.get(conf).globStatus(new Path(inputFilesDir + "bucket*"));
        for (FileStatus fstat : listStatus) {
            inputPaths.add(fstat.getPath());
        }

        setInputPaths(job, inputPaths.toArray(new Path[inputPaths.size()]));

        runJob(job, "Mining");
    }

    // Configuration handed in by ToolRunner; stored so getConf() honors the Tool contract.
    private Configuration conf;

    @Override
    public Configuration getConf() {
        return conf;
    }

    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
    }
}
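
Usage

BigFIMDriver is a standard Hadoop Tool, so it can be launched through ToolRunner. Below is a minimal launch sketch; the launcher class name is hypothetical, and the actual command-line flags (input file, output directory, minimum support, and so on) are defined by FIMOptions, so the argument array is simply passed through unchanged.

// Minimal launch sketch, assuming the BigFIM classes are on the classpath.
// The args array must contain whatever flags FIMOptions.parseOptions expects;
// they are not enumerated here.
import org.apache.hadoop.util.ToolRunner;

import be.uantwerpen.adrem.bigfim.BigFIMDriver;

public class BigFIMLauncher {
    public static void main(String[] args) throws Exception {
        // ToolRunner handles the generic Hadoop options before delegating to BigFIMDriver.run
        System.exit(ToolRunner.run(new BigFIMDriver(), args));
    }
}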