be.uantwerpen.adrem.bigfim.AprioriPhaseMapper.java Source code

Introduction

Here is the source code for be.uantwerpen.adrem.bigfim.AprioriPhaseMapper.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package be.uantwerpen.adrem.bigfim;

import static be.uantwerpen.adrem.bigfim.Tools.convertLineToSet;
import static be.uantwerpen.adrem.bigfim.Tools.getSingletonsFromCountTrie;
import static be.uantwerpen.adrem.bigfim.Tools.readCountTrieFromItemSetsFile;
import static be.uantwerpen.adrem.util.FIMOptions.DELIMITER_KEY;
import static org.apache.hadoop.filecache.DistributedCache.getLocalCacheFiles;

import java.io.IOException;
import java.util.List;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import be.uantwerpen.adrem.util.ItemSetTrie;
import be.uantwerpen.adrem.util.ItemSetTrie.SupportCountItemsetTrie;

/**
 * Mapper class for the Apriori phase of BigFIM. Each mapper receives a sub part (horizontal cut) of the dataset and
 * combines a list of base itemsets into candidates of length+1 for its sub database. These candidates are counted in
 * the map function. If no base itemsets are specified, all singletons are counted. The size of the sub part depends
 * on the number of mappers and the size of the original dataset.
 * 
 * <pre>
 * {@code
 * Original Input Per Mapper:
 * 
 * 1 2                                      | Mapper 1
 * 1                                        | Mapper 1
 * 
 * 1 2 3                                    | Mapper 2
 * 1 2                                      | Mapper 2
 * 
 * 1 2                                      | Mapper 3
 * 2 3                                      | Mapper 3
 * 
 * 
 * 
 * Example Phase=1, MinSup=1:
 * ==========================
 * 
 * Input:
 * LongWritable   Text
 * (Offset)       (Transaction)
 * 0              "1 2"                     | Mapper 1
 * 4              "1"                       | Mapper 1
 * 
 * 6              "1 2 3"                   | Mapper 2
 * 12             "1 2"                     | Mapper 2
 * 
 * 16             "1 2"                     | Mapper 3
 * 20             "2 3"                     | Mapper 3
 * 
 * Output:
 * Text           Text
 * (Prefix)       (Item + Support)
 * ""             "1 2"                     | Mapper 1
 * ""             "2 1"                     | Mapper 1
 * 
 * ""             "1 2"                     | Mapper 2
 * ""             "2 2"                     | Mapper 2
 * ""             "3 1"                     | Mapper 2
 * 
 * ""             "1 1"                     | Mapper 3
 * ""             "2 2"                     | Mapper 3
 * ""             "3 1"                     | Mapper 3
 * 
 * 
 * 
 * Example Phase=2, MinSup=1:
 * ==========================
 * 
 * Itemsets:
 * 1
 * 2
 * 3
 * 
 * Itemsets of length+1:
 * 1 2
 * 1 3
 * 2 3
 * 
 * Input:
 * LongWritable   Text
 * (Offset)       (Transaction)
 * 0              "1 2"             | Mapper 1
 * 4              "1"               | Mapper 1
 * 
 * 6              "1 2 3"           | Mapper 2
 * 12             "1 2"             | Mapper 2
 * 
 * 16             "1 2"             | Mapper 3
 * 20             "2 3"             | Mapper 3
 * 
 * Output:
 * Text           Text
 * (Itemset)      (Support)
 * "1"            "2 1"             | Mapper 1
 * 
 * "1"            "2 2"             | Mapper 2
 * "1"            "3 1"             | Mapper 2
 * "2"            "3 1"             | Mapper 2
 * 
 * "1"            "2 1"             | Mapper 3
 * "2"            "3 1"             | Mapper 3
 * }
 * </pre>
 */
public class AprioriPhaseMapper extends Mapper<LongWritable, Text, Text, Text> {

    // Singletons extracted from the cached itemsets file (null when no cached file is present).
    private Set<Integer> singletons;
    // Trie holding the candidate itemsets and their support counts for this phase.
    private ItemSetTrie countTrie;

    // Length of the candidate itemsets counted by this mapper.
    private int phase = 1;
    // Delimiter that separates items within a transaction line.
    private String delimiter;

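    /**
     * Reads the item delimiter from the job configuration and creates an empty count trie. When a file with itemsets
     * from the previous phase is available in the distributed cache, the trie is filled from that file, the current
     * phase is derived from it and the known singletons are collected.
     */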
    @Override
    public void setup(Context context) throws IOException {
        Configuration conf = context.getConfiguration();
        delimiter = conf.get(DELIMITER_KEY, " ");

        Path[] localCacheFiles = getLocalCacheFiles(conf);
        countTrie = new ItemSetTrie.SupportCountItemsetTrie(-1);
        if (localCacheFiles != null) {
            String filename = localCacheFiles[0].toString();
            phase = readCountTrieFromItemSetsFile(filename, countTrie) + 1;
            singletons = getSingletonsFromCountTrie(countTrie);
        }
    }

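    /**
     * Converts one transaction line into a list of items and increments the support counters of all candidate
     * itemsets contained in it.
     */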
    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        List<Integer> items = convertLineToSet(line, phase == 1, singletons, delimiter);
        incrementSubSets(items);
    }

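    /**
     * Writes the supports collected in the count trie to the context once the mapper has processed its complete
     * input split.
     */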
    @Override
    public void cleanup(Context context) throws IOException, InterruptedException {
        recReport(context, new StringBuilder(), countTrie);
    }

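    /**
     * Recursively traverses the count trie and emits, for every counted leaf with a non-zero support, the itemset
     * prefix as key and the last item together with its support as value.
     */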
    private void recReport(Context context, StringBuilder builder, ItemSetTrie trie)
            throws IOException, InterruptedException {
        int length = builder.length();
        for (Entry<Integer, ItemSetTrie> entry : trie.children.entrySet()) {
            ItemSetTrie recTrie = entry.getValue();
            if (recTrie.children.isEmpty()) {
                int support = ((SupportCountItemsetTrie) recTrie).support;
                if (support != 0) {
                    Text key = new Text(builder.substring(0, Math.max(0, builder.length() - 1)));
                    Text value = new Text(recTrie.id + " " + support);
                    context.write(key, value);
                }
            } else {
                builder.append(recTrie.id + " ");
                recReport(context, builder, recTrie);
            }
            builder.setLength(length);
        }
    }

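    /**
     * Increments the supports of the candidate itemsets occurring in the given transaction. In phase 1 every item is
     * counted individually; in later phases the candidate trie is traversed recursively.
     */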
    private void incrementSubSets(List<Integer> items) {
        if (items.size() < phase) {
            return;
        }

        if (phase == 1) {
            for (int i = 0; i < items.size(); i++) {
                ItemSetTrie recTrie = countTrie.getChild(items.get(i));
                recTrie.addTid(1);
            }
            return;
        }

        doRecursiveCount(items, 0, countTrie);
    }

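    /**
     * Walks the candidate trie along the items of the transaction, starting at position ix, and increments the
     * support counter of every leaf that is reached.
     */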
    private void doRecursiveCount(List<Integer> items, int ix, ItemSetTrie trie) {
        for (int i = ix; i < items.size(); i++) {
            ItemSetTrie recTrie = trie.children.get(items.get(i));
            if (recTrie != null) {
                if (recTrie.children.isEmpty()) {
                    recTrie.addTid(1);
                } else {
                    doRecursiveCount(items, i + 1, recTrie);
                }
            }
        }
    }
}
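
For context, here is a minimal sketch of how a mapper like this could be wired into a Hadoop job for the first Apriori phase. It is an illustration only: the driver class name, the input/output paths taken from the command line, and the reducer remark are assumptions, and the project's actual driver additionally handles the minimum support, later phases and placing the previous phase's itemset file in the distributed cache.

package be.uantwerpen.adrem.bigfim;

import static be.uantwerpen.adrem.util.FIMOptions.DELIMITER_KEY;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical driver class, not part of the original source.
public class AprioriPhaseJobSketch {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Items in a transaction line are separated by a single space.
        conf.set(DELIMITER_KEY, " ");

        Job job = Job.getInstance(conf, "BigFIM Apriori phase 1");
        job.setJarByClass(AprioriPhaseJobSketch.class);

        job.setMapperClass(AprioriPhaseMapper.class);
        // A reducer that sums the per-mapper supports (e.g. an AprioriPhaseReducer from the same
        // package, assumed here) would normally be configured as well.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Default TextInputFormat delivers (LongWritable offset, Text line) pairs, as the mapper expects.
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}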