be.uantwerpen.adrem.disteclat.ItemReaderReducer.java Source code

Java tutorial

Introduction

Here is the source code for be.uantwerpen.adrem.disteclat.ItemReaderReducer.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package be.uantwerpen.adrem.disteclat;

import static be.uantwerpen.adrem.disteclat.DistEclatDriver.OShortFIs;
import static be.uantwerpen.adrem.disteclat.DistEclatDriver.OSingletonsDistribution;
import static be.uantwerpen.adrem.disteclat.DistEclatDriver.OSingletonsOrder;
import static be.uantwerpen.adrem.disteclat.DistEclatDriver.OSingletonsTids;
import static be.uantwerpen.adrem.hadoop.util.Tools.createPath;
import static be.uantwerpen.adrem.hadoop.util.Tools.getJobAbsoluteOutputDir;
import static be.uantwerpen.adrem.util.FIMOptions.MIN_SUP_KEY;
import static be.uantwerpen.adrem.util.FIMOptions.NUMBER_OF_MAPPERS_KEY;
import static com.google.common.collect.Lists.newArrayList;
import static com.google.common.collect.Maps.newHashMap;
import static java.lang.Integer.parseInt;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.commons.lang.mutable.MutableInt;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

import be.uantwerpen.adrem.hadoop.util.IntArrayWritable;
import be.uantwerpen.adrem.hadoop.util.IntMatrixWritable;

/**
 * Reducer for the first cycle for DistEclat. It receives the complete set of frequent singletons from the different
 * mappers. The frequent singletons are sorted on ascending frequency and distributed among a number of map-tasks.
 * 
 * <pre>
 * {@code
 * Original Input Per Mapper:
 * 
 * 1 2                                      | Mapper 1
 * 1                                        | Mapper 1
 * 
 * 1 2 3                                    | Mapper 2
 * 1 2                                      | Mapper 2
 * 
 * 1 2                                      | Mapper 3
 * 2 3                                      | Mapper 3
 * 
 * 
 * 
 * Example Phase=1, MinSup=1:
 * ==========================
 * 
 * Input:
 * Text                   Iterable<IntArrayWritable>
 * (Prefix)               (<[Mapper ID, Item, Partial TIDs...]>)
 * ""                     <[1,1,0,1],[1,2,0],[2,1,0,1],[2,2,0,1],[2,3,0],[3,1,0],[3,2,0,1],[3,3,1]>
 * 
 * Output:
 * 
 * ==> OSingletonsOrder
 * IntWritable            Writable (Text)
 * (Empty)                (Sorted items)
 *                        1 2 3
 * 
 * 
 * ==> OSingletonsDistribution
 * IntWritable            Writable (Text)
 * (MapperId)             (Assigned singletons)
 * 1                      1
 * 2                      2
 * 3                      3
 * 
 * ==> OSingletonsTids
 * IntWritable            Writable (IntMatrixWritable)
 * (Singleton)            (Partial TIDs)
 * 1                      [[0,1],[0,1],[0]]
 * 2                      [[0],[0,1],[0,1]]
 * 3                      [[],[0],[1]]
 * }
 * </pre>
 */
public class ItemReaderReducer extends Reducer<Text, IntArrayWritable, IntWritable, Writable> {

    /** Key used for the single record written to the {@code OSingletonsOrder} named output. */
    public static final Text EmptyKey = new Text("");

    /** Number of per-mapper slots kept for every item; read from {@code NUMBER_OF_MAPPERS_KEY} (default 1). */
    private int numberOfMappers;

    /** Minimum support threshold; read from {@code MIN_SUP_KEY} (default -1, i.e. nothing is filtered). */
    private int minSup;

    /**
     * Total number of tids seen per item. Entries of items that turn out to be infrequent are removed in
     * {@link #reportItemsWithLargeSupport(Map, Context)}, so at cleanup time the key set holds only items
     * that passed the support check.
     */
    private final Map<Integer, MutableInt> itemSupports = newHashMap();

    private MultipleOutputs<IntWritable, Writable> mos;

    /** Base output path used for the short frequent itemsets (singletons with their support). */
    private String shortFisFilename;

    /**
     * Reads the number of mappers and the minimum support from the job configuration, creates the
     * {@link MultipleOutputs} instance and computes the base output path of the short-FIs file.
     *
     * @param context
     *          the reducer context
     */
    @Override
    public void setup(Context context) {
        Configuration conf = context.getConfiguration();

        mos = new MultipleOutputs<IntWritable, Writable>(context);
        numberOfMappers = parseInt(conf.get(NUMBER_OF_MAPPERS_KEY, "1"));
        minSup = conf.getInt(MIN_SUP_KEY, -1);

        shortFisFilename = createPath(getJobAbsoluteOutputDir(context), OShortFIs, OShortFIs + "-1");
    }

    /**
     * Groups the incoming partial tid lists per item and writes every item that reaches the minimum support.
     *
     * @param key
     *          the prefix key (not used by this reducer)
     * @param values
     *          records laid out as {@code [mapperId, item, tid, tid, ...]}
     * @param context
     *          the reducer context
     * @throws IOException
     *           if writing to one of the outputs fails
     * @throws InterruptedException
     *           if writing is interrupted
     */
    @Override
    public void reduce(Text key, Iterable<IntArrayWritable> values, Context context)
            throws IOException, InterruptedException {
        Map<Integer, IntArrayWritable[]> map = getPartitionTidListsPerExtension(values);
        reportItemsWithLargeSupport(map, context);
    }

    /**
     * Builds, for every item, an array with one tid list per mapper. Records are expected to be laid out as
     * {@code [mapperId, item, tid, tid, ...]}; tids of several records for the same (mapper, item) pair are
     * appended in arrival order. As a side effect the running support of each item is accumulated in
     * {@link #itemSupports}.
     *
     * @param values
     *          the records received by the reducer
     * @return a map from item to its per-mapper tid lists (array length is {@link #numberOfMappers})
     */
    private Map<Integer, IntArrayWritable[]> getPartitionTidListsPerExtension(Iterable<IntArrayWritable> values) {
        Map<Integer, IntArrayWritable[]> map = newHashMap();
        for (IntArrayWritable iaw : values) {
            Writable[] w = iaw.get();
            int mapperId = ((IntWritable) w[0]).get();
            int item = ((IntWritable) w[1]).get();
            // everything after the two header fields is a tid
            int newTids = w.length - 2;

            IntArrayWritable[] tidList = map.get(item);
            if (tidList == null) {
                tidList = new IntArrayWritable[numberOfMappers];
                // All slots start out sharing one empty placeholder. Within this class a slot is only ever
                // replaced by a fresh instance (below), never modified in place, so the sharing is harmless.
                Arrays.fill(tidList, new IntArrayWritable(new IntWritable[0]));
                map.put(item, tidList);
                itemSupports.put(item, new MutableInt());
            }

            // append the new tids behind the ones already collected for this mapper
            IntWritable[] mapperTids = (IntWritable[]) tidList[mapperId].get();
            IntWritable[] merged = Arrays.copyOf(mapperTids, mapperTids.length + newTids);
            for (int i = 2; i < w.length; i++) {
                merged[mapperTids.length + i - 2] = (IntWritable) w[i];
            }
            tidList[mapperId] = new IntArrayWritable(merged);
            itemSupports.get(item).add(newTids);
        }
        return map;
    }

    /**
     * Writes every item whose total support (sum of the lengths of its per-mapper tid lists) is at least
     * {@link #minSup}: once as {@code item(support)} to the short-FIs file and once with its tid matrix to
     * {@code OSingletonsTids}. Items below the threshold are dropped from {@link #itemSupports}.
     *
     * @param map
     *          the per-mapper tid lists of every item
     * @param context
     *          the reducer context
     * @throws IOException
     *           if writing to one of the outputs fails
     * @throws InterruptedException
     *           if writing is interrupted
     */
    private void reportItemsWithLargeSupport(Map<Integer, IntArrayWritable[]> map, Context context)
            throws IOException, InterruptedException {
        for (Entry<Integer, IntArrayWritable[]> entry : map.entrySet()) {
            final Integer item = entry.getKey();
            final IntArrayWritable[] tids = entry.getValue();

            int support = 0;
            for (IntArrayWritable iaw : tids) {
                support += iaw.get().length;
            }
            if (support < minSup) {
                // infrequent: must not show up in the order/distribution written during cleanup
                itemSupports.remove(item);
                continue;
            }

            // write the item to the short fis file
            mos.write(new IntWritable(1), new Text(item + "(" + support + ")"), shortFisFilename);

            // write the item with the tidlist
            mos.write(OSingletonsTids, new IntWritable(item), new IntMatrixWritable(tids));
        }
    }

    /**
     * Writes the order and the distribution of the frequent singletons and closes the outputs. The outputs
     * are closed even when one of the writes fails, so no record writer is left open.
     *
     * @param context
     *          the reducer context
     * @throws IOException
     *           if writing to or closing one of the outputs fails
     * @throws InterruptedException
     *           if writing or closing is interrupted
     */
    @Override
    public void cleanup(Context context) throws IOException, InterruptedException {
        try {
            List<Integer> sortedSingletons = getSortedSingletons();

            context.setStatus("Writing Singletons");
            writeSingletonsOrders(sortedSingletons);

            context.setStatus("Distributing Singletons");
            writeSingletonsDistribution(sortedSingletons);

            context.setStatus("Finished");
        } finally {
            mos.close();
        }
    }

    /**
     * Gets the list of singletons in ascending order of frequency. The sort is stable, so items with equal
     * support keep the iteration order of the underlying map.
     * 
     * @return the sorted list of singletons
     */
    private List<Integer> getSortedSingletons() {
        List<Integer> items = newArrayList(itemSupports.keySet());

        Collections.sort(items, new Comparator<Integer>() {
            @Override
            public int compare(Integer o1, Integer o2) {
                return itemSupports.get(o1).compareTo(itemSupports.get(o2));
            }
        });

        return items;
    }

    /**
     * Writes the singletons order to the file OSingletonsOrder as one space separated record. When there is
     * no frequent singleton an empty record is written instead of failing on the empty list.
     * 
     * @param sortedSingletons
     *          the sorted singletons
     * @throws IOException
     *           if writing to the output fails
     * @throws InterruptedException
     *           if writing is interrupted
     */
    private void writeSingletonsOrders(List<Integer> sortedSingletons) throws IOException, InterruptedException {
        Text order = new Text(joinEveryNth(sortedSingletons, 0, 1));
        mos.write(OSingletonsOrder, EmptyKey, order);
    }

    /**
     * Writes the singletons distribution to file OSingletonsDistribution. The distribution is obtained using
     * Round-Robin allocation: bucket {@code ix} receives the singletons at positions {@code ix},
     * {@code ix + numberOfMappers}, {@code ix + 2 * numberOfMappers}, ... of the sorted list. Buckets that
     * would stay empty are not written.
     * 
     * @param sortedSingletons
     *          the sorted list of singletons
     * @throws IOException
     *           if writing to the output fails
     * @throws InterruptedException
     *           if writing is interrupted
     */
    private void writeSingletonsDistribution(List<Integer> sortedSingletons)
            throws IOException, InterruptedException {
        // never more buckets than singletons, so every written bucket holds at least one item
        int end = Math.min(numberOfMappers, sortedSingletons.size());

        Text mapperId = new Text();
        Text assignedItems = new Text();

        // Round robin assignment
        for (int ix = 0; ix < end; ix++) {
            mapperId.set(Integer.toString(ix));
            assignedItems.set(joinEveryNth(sortedSingletons, ix, numberOfMappers));
            mos.write(OSingletonsDistribution, mapperId, assignedItems);
        }
    }

    /**
     * Joins every {@code step}-th element of the list, starting at index {@code start}, with single spaces.
     * No trailing separator is produced, so an empty selection yields the empty string.
     *
     * @param items
     *          the items to pick from
     * @param start
     *          index of the first element to include
     * @param step
     *          distance between two included elements; must be positive
     * @return the selected elements separated by single spaces
     */
    private static String joinEveryNth(List<Integer> items, int start, int step) {
        StringBuilder joined = new StringBuilder();
        for (int i = start; i < items.size(); i += step) {
            if (joined.length() > 0) {
                joined.append(' ');
            }
            joined.append(items.get(i));
        }
        return joined.toString();
    }
}