org.apache.mahout.freqtermsets.PFPGrowth.java Source code

Introduction

Here is the source code for org.apache.mahout.freqtermsets.PFPGrowth.java, the driver class that runs each stage of the Parallel FP-Growth (PFPGrowth) algorithm.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.freqtermsets;

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.net.URL;
import java.nio.charset.Charset;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.PriorityQueue;
import java.util.concurrent.Callable;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.math.util.MathUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MMapDirectory;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.Parameters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
import org.apache.mahout.freqtermsets.convertors.string.TopKStringPatterns;
import org.apache.mahout.freqtermsets.fpgrowth.FPGrowth;
import org.apache.mahout.freqtermsets.stream.TimeWeightFunction;
import org.apache.mahout.math.list.IntArrayList;
import org.apache.mahout.math.map.OpenIntObjectHashMap;
import org.apache.mahout.math.map.OpenObjectIntHashMap;
import org.apache.mahout.math.map.OpenObjectLongHashMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.Lists;
import com.google.common.hash.Hashing;
import com.twitter.corpus.data.CSVTweetInputFormat;
import com.twitter.corpus.data.util.PartitionByTimestamp;

/**
 * 
 * Parallel FP Growth Driver Class. Runs each stage of PFPGrowth as described in the paper
 * http://infolab.stanford.edu/~echang/recsys08-69.pdf
 * 
 */
public final class PFPGrowth implements Callable<Void> {
    private static final Logger LOG = LoggerFactory.getLogger(PFPGrowth.class);

    public static final String ENCODING = "encoding";
    public static final String F_LIST = "fList";
    public static final String G_LIST = "gList";
    public static final String NUM_GROUPS = "numGroups";
    public static final int NUM_GROUPS_DEFAULT = 1000;
    public static final String MAX_PER_GROUP = "maxPerGroup";
    public static final String OUTPUT = "output";
    public static final String OUTROOT = "outroot";
    public static final String MIN_SUPPORT = "minSupport";
    public static final String MAX_HEAPSIZE = "maxHeapSize";
    public static final String INPUT = "input";
    public static final String PFP_PARAMETERS = "pfp.parameters";
    public static final String FILE_PATTERN = "part-*";
    public static final String FPGROWTH = "fpgrowth";
    public static final String FREQUENT_PATTERNS = "frequentpatterns";
    public static final String PARALLEL_COUNTING = "parallelcounting";

    // public static final String USE_FPG2 = "use_fpg2";
    // YA
    public static final String PRUNE_PCTILE = "percentile";
    public static final int PRUNE_PCTILE_DEFAULT = 95;
    public static final String MIN_FREQ = "minFreq";
    public static final int MIN_FREQ_DEFAULT = 33;
    // All these settings are cluster-level and cannot be set per job
    // public static final String PSEUDO = "pseudo";
    public static final String COUNT_IN = "countIn";
    public static final String GROUP_FIS_IN = "gfisIn";
    public static final String PARAM_INTERVAL_START = "startTime";
    public static final String PARAM_INTERVAL_END = "endTime";
    // public static final String INDEX_OUT = "index";

    public static final String PARAM_WINDOW_SIZE = "windowSize";
    public static final String PARAM_STEP_SIZE = "stepSize";

    // TODO command line
    // static final boolean FPSTREAM = false;
    public static enum RunningMode {
        Batch, BlockUpdate, SlidingWin
    };

    public static final RunningMode runMode = RunningMode.SlidingWin;

    public static final float FPSTREAM_LINEAR_DECAY_COEFF = 0.9f;
    // private static final double AVG_TOKENS_PER_DOC = 7;
    public static final String PARAM_MAX_PATTERN_LOAD_LAG = "maxLag";
    public static final String DEFAULT_MAX_PATTERN_LOAD_LAG = Long.toString(24 * 3600 * 1000);

    // private static final long DIRICHLET_SMOOTHING_PARAM = 3579L;

    // public static final long TREC2011_MIN_TIMESTAMP = 1296130141000L; // 1297209010000L;
    // public static final long GMT23JAN2011 = 1295740800000L;

    // Not text input anymore
    // public static final String SPLIT_PATTERN = "splitPattern";
    // public static final Pattern SPLITTER = Pattern.compile("[ ,\t]*[,|\t][ ,\t]*");
    // END YA
    // private PFPGrowth() {
    // }

    public static void loadEarlierFHashMaps(JobContext context, Parameters params, long intervalStart,
            OpenIntObjectHashMap<String> idStringMapOut, OpenObjectIntHashMap<String> stringIdMapOut)
            throws IOException {
        // I resist the urge to cache this list because I don't know exactly what would happen
        // when the job is run in Hadoop, where every job has its own JVM: would static
        // fields somehow leak? Can I be sure that a static WeakHashMap used as a cache is mine?
        // In any case, the list would be loaded only twice: once for the mapper and once for the reducer.

        OpenObjectLongHashMap<String> prevFLists = PFPGrowth.readOlderCachedFLists(context.getConfiguration(),
                intervalStart, TimeWeightFunction.getDefault(params));

        LinkedList<String> terms = Lists.newLinkedList();
        prevFLists.keysSortedByValue(terms);
        Iterator<String> termsIter = terms.descendingIterator();
        while (termsIter.hasNext()) {

            String t = termsIter.next();
            int id = Hashing.murmur3_32().hashString(t, Charset.forName("UTF-8")).asInt();
            int c = 0;
            while (idStringMapOut.containsKey(id)) {
                // Best effort
                if (c < t.length()) {
                    id = Hashing.murmur3_32((int) t.charAt(c++)).hashString(t, Charset.forName("UTF-8")).asInt();
                } else {
                    ++id;
                }
            }

            idStringMapOut.put(id, t);
            stringIdMapOut.put(t, id);
        }
    }
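
    // A minimal sketch of how the two maps filled above might be used together, assuming a
    // hypothetical caller with a valid JobContext `ctx` and Parameters `params`:
    //
    //   OpenIntObjectHashMap<String> idToTerm = new OpenIntObjectHashMap<String>();
    //   OpenObjectIntHashMap<String> termToId = new OpenObjectIntHashMap<String>();
    //   PFPGrowth.loadEarlierFHashMaps(ctx, params, intervalStart, idToTerm, termToId);
    //   int id = termToId.get("someTerm");   // term -> murmur3-derived id
    //   String term = idToTerm.get(id);      // id -> term (round trip)
    //
    // The collision loop above perturbs the murmur3 seed with successive characters of the term
    // (and finally falls back to incrementing the id), so the id/term mapping stays one-to-one.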

    public static long readTermFreqHashMap(Configuration conf, OpenObjectLongHashMap<String> fMap)
            throws IOException {

        long totalNterms = 0;

        switch (runMode) {
        case Batch:
        case SlidingWin:
            for (Pair<String, Long> e : readCachedFList(conf)) {
                fMap.put(e.getFirst(), e.getSecond());
                totalNterms += e.getSecond();
            }
            break;
        case BlockUpdate:

            Parameters params = new Parameters(conf.get(PFPGrowth.PFP_PARAMETERS, ""));
            long currWindowStart = Long.parseLong(params.get(PFPGrowth.PARAM_INTERVAL_START));

            OpenObjectLongHashMap<String> prevFLists = readOlderCachedFLists(conf, currWindowStart,
                    TimeWeightFunction.getDefault(params));
            LinkedList<String> terms = Lists.newLinkedList();
            prevFLists.keysSortedByValue(terms);
            Iterator<String> termsIter = terms.descendingIterator();

            while (termsIter.hasNext()) {
                String t = termsIter.next();
                long freq = prevFLists.get(t);
                fMap.put(t, freq);
                totalNterms += freq;
            }
            break;
        }
        return totalNterms;
    }

    // public static List<Pair<String, Long>> readFList(Configuration conf) throws IOException {
    // if (runMode.equals(RunningMode.Batch)) {
    //
    // return readCachedFList(conf);
    //
    // } else {
    //
    // Parameters params = new Parameters(conf.get(PFPGrowth.PFP_PARAMETERS, ""));
    // long currWindowStart = Long.parseLong(params.get(PFPGrowth.PARAM_INTERVAL_START));
    //
    // OpenObjectLongHashMap<String> prevFLists = readOlderCachedFLists(conf,
    // currWindowStart,
    // TimeWeightFunction.getDefault(params));
    // LinkedList<String> terms = Lists.newLinkedList();
    // prevFLists.keysSortedByValue(terms);
    // Iterator<String> termsIter = terms.descendingIterator();
    //
    // List<Pair<String, Long>> result =
    // Lists.<Pair<String, Long>> newArrayListWithCapacity(terms.size());
    // while (termsIter.hasNext()) {
    // String t = termsIter.next();
    // result.add(new Pair<String, Long>(t, prevFLists.get(t)));
    // }
    //
    // return result;
    // }
    // }
    //
    /**
     * Reads the fLists cached from earlier windows out of the DistributedCache and combines them,
     * weighting each window's frequencies by the time-weight function relative to the current window start
     * 
     * @return Combined, time-weighted feature frequency map
     */
    public static OpenObjectLongHashMap<String> readOlderCachedFLists(Configuration conf, long currWindowStart,
            TimeWeightFunction weightFunction) throws IOException {
        OpenObjectLongHashMap<String> list = new OpenObjectLongHashMap<String>();
        Path[] files = DistributedCache.getLocalCacheFiles(conf);
        if (files == null) {
            throw new IOException("Cannot read Frequency list from Distributed Cache");
        }
        for (int i = 0; i < files.length; ++i) {
            FileSystem fs = FileSystem.getLocal(conf);
            Path fListLocalPath = fs.makeQualified(files[i]);
            // Fallback if we are running locally.
            if (!fs.exists(fListLocalPath)) {
                URI[] filesURIs = DistributedCache.getCacheFiles(conf);
                if (filesURIs == null) {
                    throw new IOException("Cannot read Frequency list from Distributed Cache");
                }
                fListLocalPath = new Path(filesURIs[i].getPath());
            }
            long listWindowStart = Long.parseLong(fListLocalPath.getParent().getParent().getName());
            for (Pair<Text, LongWritable> record : new SequenceFileIterable<Text, LongWritable>(fListLocalPath,
                    true, conf)) {
                String token = record.getFirst().toString();

                list.put(token, Math.round(list.get(token)
                        + weightFunction.apply(record.getSecond().get(), listWindowStart, currWindowStart)));
            }
        }
        return list;
    }

    /**
     * Reads the cached fList from the DistributedCache
     * 
     * @return Deserialized feature frequency list
     */
    public static List<Pair<String, Long>> readCachedFList(Configuration conf) throws IOException {
        List<Pair<String, Long>> list = new ArrayList<Pair<String, Long>>();
        Path[] files = DistributedCache.getLocalCacheFiles(conf);
        if (files == null) {
            throw new IOException("Cannot read Frequency list from Distributed Cache");
        }
        if (files.length != 1) {
            throw new IOException("Cannot read Frequency list from Distributed Cache (" + files.length + ")");
        }
        FileSystem fs = FileSystem.getLocal(conf);
        Path fListLocalPath = fs.makeQualified(files[0]);
        // Fallback if we are running locally.
        if (!fs.exists(fListLocalPath)) {
            URI[] filesURIs = DistributedCache.getCacheFiles(conf);
            if (filesURIs == null) {
                throw new IOException("Cannot read Frequency list from Distributed Cache");
            }
            if (filesURIs.length != 1) {
                throw new IOException("Cannot read Frequency list from Distributed Cache (" + files.length + ")");
            }
            fListLocalPath = new Path(filesURIs[0].getPath());
        }
        // Done below, while caching the list
        // // YA: Lang independent stop words removal
        // // FIXMENOT: as below
        // Parameters params = new Parameters(conf.get(PFP_PARAMETERS, ""));
        // int minFr = params.getInt(MIN_FREQ, MIN_FREQ_DEFAULT);
        // int prunePct = params.getInt(PRUNE_PCTILE, PRUNE_PCTILE_DEFAULT);
        //
        // // TODONOT: assert minFr >= minSupport;
        //
        // Iterator<Pair<Text, LongWritable>> tempIter = new SequenceFileIterable<Text, LongWritable>(
        // fListLocalPath, true, conf).iterator();
        // long maxFr = Long.MAX_VALUE;
        // if (tempIter.hasNext()) {
        // maxFr = tempIter.next().getSecond().get() * prunePct / 100;
        // }
        // tempIter = null;
        //
        // for (Pair<Text, LongWritable> record : new SequenceFileIterable<Text, LongWritable>(
        // fListLocalPath, true, conf)) {
        // String token = record.getFirst().toString();
        // char ch0 = token.charAt(0);
        // if ((ch0 != '#' && ch0 != '@')
        // && (record.getSecond().get() < minFr || record.getSecond().get() > maxFr)) {
        // continue;
        // }
        // list.add(new Pair<String, Long>(token, record.getSecond().get()));
        // }
        // // END YA

        for (Pair<Text, LongWritable> record : new SequenceFileIterable<Text, LongWritable>(fListLocalPath, true,
                conf)) {
            list.add(new Pair<String, Long>(record.getFirst().toString(), record.getSecond().get()));
        }

        return list;
    }

    /**
     * Serializes the fList to a SequenceFile at the given path and adds it to the DistributedCache
     * 
     * @param flist
     *          the feature frequency list to serialize
     * @param flistPath
     *          destination of the serialized fList
     */
    public static void saveFList(Iterable<Pair<String, Long>> flist, // Parameters params,
            Configuration conf, Path flistPath) throws IOException {
        FileSystem fs = FileSystem.get(flistPath.toUri(), conf);
        flistPath = fs.makeQualified(flistPath);
        // HadoopUtil.delete(conf, flistPath);
        SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, flistPath, Text.class, LongWritable.class);
        try {
            for (Pair<String, Long> pair : flist) {
                writer.append(new Text(pair.getFirst()), new LongWritable(pair.getSecond()));
            }
        } finally {
            writer.close();
        }
        DistributedCache.addCacheFile(flistPath.toUri(), conf);
    }
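
    // A minimal sketch of the round trip implemented by saveFList/readCachedFList, assuming a
    // driver-side Configuration `conf` and a hypothetical destination path `fListPath`:
    //
    //   List<Pair<String, Long>> flist = ...;         // pruned feature frequency list
    //   PFPGrowth.saveFList(flist, conf, fListPath);  // writes a SequenceFile and registers it
    //   // ... later, inside a mapper or reducer of the submitted job ...
    //   List<Pair<String, Long>> cached = PFPGrowth.readCachedFList(conf);
    //
    // saveFList must run on the driver before job submission so that the DistributedCache ships
    // the file to the tasks; readCachedFList then resolves it via getLocalCacheFiles.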

    // /**
    // * read the feature frequency List which is built at the end of the Parallel counting job
    // *
    // * @param countIn
    // * @param minSupport
    // * @param minFr
    // * @param prunePct
    // *
    // * @return Feature Frequency List
    // */
    // public static List<Pair<String, Long>> readFList(String countIn, int minSupport, int minFr,
    // int prunePct) throws IOException {
    //
    // Path parallelCountingPath = new Path(countIn, PARALLEL_COUNTING);
    //
    // PriorityQueue<Pair<String, Long>> queue = new PriorityQueue<Pair<String, Long>>(11,
    // new Comparator<Pair<String, Long>>() {
    // @Override
    // public int compare(Pair<String, Long> o1, Pair<String, Long> o2) {
    // int ret = o2.getSecond().compareTo(o1.getSecond());
    // if (ret != 0) {
    // return ret;
    // }
    // return o1.getFirst().compareTo(o2.getFirst());
    // }
    // });
    //
    // Path path = new Path(parallelCountingPath, FILE_PATTERN);
    // // if(!FileSystem.get(path.toUri(),conf).exists(path)){
    // // throw new IOException("Cannot find flist file: " + path);
    // // }
    // Configuration conf = new Configuration();
    // // double totalFreq = 0;
    // for (Pair<Text, LongWritable> record : new SequenceFileDirIterable<Text, LongWritable>(
    // path, PathType.GLOB, null, null, true, conf)) {
    // long freq = record.getSecond().get();
    // // totalFreq += freq;
    // queue.add(new Pair<String, Long>(record.getFirst().toString(), freq));
    // }
    // //
    // // YA: language indipendent stop words.. the 5% most frequent
    // // FIXME: this will remove words from only the mostly used lang
    // // i.e. cannot be used for a multilingual task
    // long maxFreq = Long.MAX_VALUE;
    // if (!queue.isEmpty()) {
    // maxFreq = Math.round(1.0f * queue.peek().getSecond() * prunePct / 100);
    // // maxFreq = Math.round(1.0f * queue.size() * prunePct / 100);
    // }
    // // totalFreq += DIRICHLET_SMOOTHING_PARAM;
    // // double pruneProb = prunePct / 100.0;
    // assert minFr >= minSupport;
    // // double numDocs = totalFreq / AVG_TOKENS_PER_DOC;
    //
    // List<Pair<String, Long>> fList = Lists.newArrayList();
    // boolean withinUpperBound = false;
    // // int i=0;
    // while (!queue.isEmpty()) {
    // Pair<String, Long> record = queue.poll();
    // String token = record.getFirst().toString();
    // char ch0 = token.charAt(0);
    // long value = record.getSecond();
    // // Always Count patterns associated with hashtags and mentions,
    // // but for other tokens ONLY those who are within medium range
    // // double prob = (value + DIRICHLET_SMOOTHING_PARAM) / totalFreq;
    // if (!withinUpperBound) {
    // // double idf = MathUtils.log(2, numDocs / value+1) + 1;
    // // withinUpperBound = idf >= prunePct/* value <= maxFreq */;
    // // withinUpperBound = (value/totalFreq) < pruneProb;
    // withinUpperBound = value <= maxFreq;
    // // withinUpperBound = i > maxFreq;
    // // ++i;
    // }
    // if (!(value >= minFr)) { // (withinLoweBound){
    // break;
    // }
    // if ((ch0 == '#' || ch0 == '@')
    // || (withinUpperBound)) { // && withinLoweBound)) {
    // fList.add(record);
    // }
    // }
    // return fList; // This prevents a null exception when using FP2 --> .subList(0, fList.size());
    // }

    public static void appendParallelCountingResults(String countIn, Configuration conf,
            OpenObjectLongHashMap<String> freqMapOut) throws IOException {

        Path parallelCountingPath = new Path(countIn, PARALLEL_COUNTING);

        Path path = new Path(parallelCountingPath, FILE_PATTERN);

        for (Pair<Text, LongWritable> record : new SequenceFileDirIterable<Text, LongWritable>(path, PathType.GLOB,
                null, null, true, conf)) {
            long freq = record.getSecond().get();
            String token = record.getFirst().toString();

            freqMapOut.put(token, freqMapOut.get(token) + freq);
        }

    }

    /**
     * read the feature frequency List which is built at the end of the Parallel counting job
     * 
     * @param countIn
     * 
     * @return Feature Frequency Map
     */
    public static OpenObjectLongHashMap<String> readParallelCountingResults(String countIn, Configuration conf)
            throws IOException {

        OpenObjectLongHashMap<String> result = new OpenObjectLongHashMap<String>();
        Path parallelCountingPath = new Path(countIn, PARALLEL_COUNTING);

        Path path = new Path(parallelCountingPath, FILE_PATTERN);

        for (Pair<Text, LongWritable> record : new SequenceFileDirIterable<Text, LongWritable>(path, PathType.GLOB,
                null, null, true, conf)) {
            long freq = record.getSecond().get();
            String token = record.getFirst().toString();

            result.put(token, freq);
        }

        return result;
    }

    public static List<Pair<String, Long>> pruneParallelCountingResults(OpenObjectLongHashMap<String> freqMap,
            int minSupport, int minFr, int prunePct) throws IOException {

        List<String> termsSorted = Lists.newArrayListWithCapacity(freqMap.size());
        freqMap.keysSortedByValue(termsSorted);

        // FIXME: this will remove words only from the most used language,
        // i.e. it cannot be used for a multilingual task
        // Percentile pruning: pretty aggressive, since the Zipf distribution is very steep
        int maxIx;
        if (prunePct == 100) {
            maxIx = freqMap.size();
        } else {
            maxIx = (int) Math.round(((freqMap.size() - 1) * prunePct / 100.0) + 1);
            long maxFreq = freqMap.get(termsSorted.get(maxIx));
            // Alternatives
            // double maxFreq = (1.0f * MathUtils.log(2,freqArr[0].getSecond()) * prunePct / 100);
            // double maxFreq = (1.0f * totalFreq * prunePct / 100); //FIX this to be weighted percentile

            // This will ALWAYS prune something; even the 100th percentile (i.e. highest rank)
            while (freqMap.get(termsSorted.get(maxIx - 1)) == maxFreq) {
                --maxIx;
            }
        }

        assert minFr >= minSupport;
        int minIx = 0;
        while (freqMap.get(termsSorted.get(minIx)) < minFr) {
            ++minIx;
        }
        List<Pair<String, Long>> result = Lists.newArrayListWithExpectedSize(maxIx - minIx);

        for (int i = maxIx - 1; i >= minIx; --i) {
            String term = termsSorted.get(i);
            result.add(new Pair<String, Long>(term, freqMap.get(term)));
        }

        return result;
    }
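
    // A worked example of the pruning above, with assumed numbers: 100 terms sorted ascending by
    // frequency, minFr = 33 (MIN_FREQ_DEFAULT) and prunePct = 95 (PRUNE_PCTILE_DEFAULT).
    //   - maxIx starts at round((99 * 95 / 100.0) + 1) = 95, so the roughly 5% most frequent terms
    //     (indices 95..99) are dropped as language-independent stop words; the while loop also
    //     drops any lower-ranked term tied with the frequency at that cutoff.
    //   - minIx skips every term with frequency below 33.
    //   - The returned list runs from index maxIx - 1 down to minIx, i.e. descending by frequency.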

    // public static List<Pair<String, Long>> readParallelCountingResults(String countIn, int
    // minSupport, int minFr,
    // int prunePct, Configuration conf) throws IOException {
    //
    // Path parallelCountingPath = new Path(countIn, PARALLEL_COUNTING);
    //
    // Path path = new Path(parallelCountingPath, FILE_PATTERN);
    // // = new Configuration();
    // // if (!FileSystem.get(path.toUri(), conf).exists(path)) {
    // // throw new IOException("Cannot find flist file: " + path);
    // // }
    //
    // // Won't hold in case of steps
    // assert minFr >= minSupport;
    // double totalFreq = 0;
    // List<Pair<String, Long>> freqList = Lists.newLinkedList();
    // for (Pair<Text, LongWritable> record : new SequenceFileDirIterable<Text, LongWritable>(
    // path, PathType.GLOB, null, null, true, conf)) {
    // long freq = record.getSecond().get();
    // String token = record.getFirst().toString();
    //
    // // char ch0 = token.charAt(0);
    // // No special treatment for mentions: || ch0 == '@'
    // // or hashtags: (ch0 == '#') ||
    // if ((freq >= minFr)) {
    // freqList.add(new Pair<String, Long>(token, freq));
    // totalFreq += freq;
    // }
    // }
    //
    // Comparator<Pair<String, Long>> descComp =
    // new Comparator<Pair<String, Long>>() {
    // @Override
    // public int compare(Pair<String, Long> o1, Pair<String, Long> o2) {
    // int ret = o2.getSecond().compareTo(o1.getSecond());
    // if (ret != 0) {
    // return ret;
    // }
    // return o1.getFirst().compareTo(o2.getFirst());
    // }
    // };
    //
    // @SuppressWarnings("unchecked")
    // Pair<String, Long>[] freqArr = freqList.toArray(new Pair[0]);
    // Arrays.sort(freqArr, descComp);
    //
    // List<Pair<String, Long>> result = null; // = Lists.newLinkedList();
    // // YA: language indipendent stop words.. the 5% most frequent
    // // FIXME: this will remove words from only the mostly used lang
    // // i.e. cannot be used for a multilingual task
    // // Percentile: Pretty aggressive since the Zipfe distribution is very steep
    // int minIx = (int) Math.round(1.0f * (freqArr.length + 1) * (100 - prunePct) / 100);
    // long maxFreq = freqArr[minIx].getSecond();
    // // double maxFreq = (1.0f * MathUtils.log(2,freqArr[0].getSecond()) * prunePct / 100);
    // // double maxFreq = (1.0f * totalFreq * prunePct / 100);
    // boolean withinUpperBound = false;
    // for (int i = 0; i < freqArr.length; ++i) {
    // if (!withinUpperBound) {
    // // totalFreq -= freqArr[i].getSecond();
    // // withinUpperBound = totalFreq <= maxFreq;
    // // withinUpperBound = MathUtils.log(2,freqArr[i].getSecond()) <= maxFreq;
    // withinUpperBound = freqArr[i].getSecond() <= maxFreq;
    // if (withinUpperBound) {
    // result = Lists.newArrayListWithCapacity(freqArr.length - i);
    // }
    // }
    //
    // if (withinUpperBound) {
    // result.add(freqArr[i]);
    // }
    // }
    //
    // return result;
    // }
    //
    public static int cacheFList(Parameters params, Configuration conf, String countIn, int minSupport, int minFr,
            int prunePct) throws IOException {

        long startTime = Long.parseLong(params.get(PFPGrowth.PARAM_INTERVAL_START));
        long endTime = Long.parseLong(params.get(PFPGrowth.PARAM_INTERVAL_END));
        long windowSize = Long
                .parseLong(params.get(PFPGrowth.PARAM_WINDOW_SIZE, Long.toString(endTime - startTime)));
        long stepSize = Long.parseLong(params.get(PFPGrowth.PARAM_STEP_SIZE, Long.toString(windowSize)));
        endTime = Math.min(endTime, startTime + windowSize);

        Path cachedPath = new Path(countIn, Long.toString(startTime));
        cachedPath = new Path(cachedPath, Long.toString(Math.min(endTime, startTime + windowSize)));
        cachedPath = new Path(cachedPath, F_LIST);

        FileSystem fs = FileSystem.getLocal(conf);
        int result;
        if (fs.exists(cachedPath)) {
            // assert FPSTREAM;
            result = -1;
            DistributedCache.addCacheFile(cachedPath.toUri(), conf);
        } else {
            OpenObjectLongHashMap<String> freqMap;
            if (runMode.equals(RunningMode.SlidingWin)) {
                freqMap = new OpenObjectLongHashMap<String>();
                while (startTime < endTime) {
                    String stepCount = FilenameUtils.concat(countIn, Long.toString(startTime));
                    stepCount = FilenameUtils.concat(stepCount,
                            Long.toString(Math.min(endTime, startTime + stepSize)));

                    appendParallelCountingResults(stepCount, conf, freqMap);
                    startTime += stepSize;
                }
            } else {
                countIn = FilenameUtils.concat(countIn, Long.toString(startTime));
                countIn = FilenameUtils.concat(countIn, Long.toString(Math.min(endTime, startTime + windowSize)));
                freqMap = readParallelCountingResults(countIn, conf);
            }
            List<Pair<String, Long>> flist = pruneParallelCountingResults(freqMap, minSupport, minFr, prunePct);
            saveFList(flist, // params,
                    conf, cachedPath);
            result = flist.size();
        }
        return result;
    }
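
    // Directory layout assumed by cacheFList, illustrated with the paths it builds above
    // (timestamps are hypothetical epoch milliseconds):
    //
    //   <countIn>/<stepStart>/<stepEnd>/parallelcounting/part-*   raw counts of one counting step
    //   <countIn>/<windowStart>/<windowEnd>/fList                 pruned fList cached for the window
    //
    //   e.g. counts/1296130141000/1296133741000/fList
    //
    // In SlidingWin mode the counts of every step inside [startTime, endTime) are summed before
    // pruning; otherwise a single step's counts are read directly.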

    public static int getGroup(int itemId, int maxPerGroup) { //, long timestamp, long endTimestamp, long stepSize) {
        int result = itemId / maxPerGroup;
        //    if(runMode.equals(RunningMode.SlidingWin)){
        //       long lag = (endTimestamp - timestamp) / stepSize;
        //       // oops, this will cause transactions to be grouped by timestamp, but no according to the amount of lag needed for different items
        //    }
        return result;
    }

    public static IntArrayList getGroupMembers(int groupId, int maxPerGroup, int numFeatures) {
        IntArrayList ret = new IntArrayList();
        int start = groupId * maxPerGroup;
        int end = start + maxPerGroup;
        if (end > numFeatures)
            end = numFeatures;
        for (int i = start; i < end; i++) {
            ret.add(i);
        }
        return ret;
    }

    public static int getGroupFromHash(int attrHash, int numGroups) {
        // TODO: save log(2) as a constant and divide by it; this function is SLOW
        int maskLen = (int) MathUtils.log(2, numGroups - 1) + 1;
        int mask = (int) Math.pow(2, maskLen) - 1;
        int result = 0;

        int attrLSBs = 0;
        int byteMask = 255;
        int numBytes = (maskLen / 8) + 1;

        int leftShifts = 0;
        while (leftShifts < maskLen) { // No folding; the murmur hash seems to be repeating (bound was 32)

            for (int i = 0; i < numBytes; ++i) {
                attrLSBs += attrHash & byteMask;
                byteMask <<= 8;
            }
            result ^= attrLSBs & mask;

            leftShifts += maskLen;
            attrHash >>>= maskLen;
        }

        return result + 1; // group numbers are not zero based
    }
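
    // A worked example with assumed numbers: for numGroups = 1000 (NUM_GROUPS_DEFAULT),
    // maskLen = (int) log2(999) + 1 = 10 and mask = 2^10 - 1 = 1023, so the loop body runs exactly
    // once and the returned group id is (attrHash & 1023) + 1, i.e. a value in 1..1024 taken from
    // the hash's low bits. Because the group comes from a bit mask rather than a modulo, it can
    // slightly exceed numGroups whenever numGroups is not a power of two.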

    // Modulo doesn't work because the hash is not necessarily positive
    // public static int getGroupFromHash(int attrHash, int numGroups) {
    // int result = (attrHash % numGroups) + 1;
    // return result;
    // }

    public static boolean isGroupMember(int groupId, int attrId, int numGroups) {
        int attrGroup = getGroupFromHash(attrId, numGroups);
        return groupId == attrGroup;
    }

    /**
     * Reads the frequent patterns generated by the aggregation job
     * 
     * @return List of top-K patterns for each frequent string feature
     */
    public static List<Pair<String, TopKStringPatterns>> readFrequentPattern(Parameters params) throws IOException {

        Configuration conf = new Configuration();

        Path frequentPatternsPath = new Path(params.get(OUTPUT), FREQUENT_PATTERNS);
        FileSystem fs = FileSystem.get(frequentPatternsPath.toUri(), conf);
        FileStatus[] outputFiles = fs.globStatus(new Path(frequentPatternsPath, FILE_PATTERN));

        List<Pair<String, TopKStringPatterns>> ret = Lists.newArrayList();
        for (FileStatus fileStatus : outputFiles) {
            ret.addAll(FPGrowth.readFrequentPattern(conf, fileStatus.getPath()));
        }
        return ret;
    }

    /**
     * 
     * @param params
     *          params should contain the input and output locations as string values; additional
     *          parameters include minSupport (default 3), maxHeapSize (default 50), numGroups (default 1000)
     * @throws NoSuchAlgorithmException
     * @throws ParseException
     */
    public static void runPFPGrowth(Parameters params) throws IOException, InterruptedException,
            ClassNotFoundException, NoSuchAlgorithmException, ParseException {
        Configuration conf = new Configuration();
        conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
                + "org.apache.hadoop.io.serializer.WritableSerialization");

        long startTime = Long.parseLong(params.get(PFPGrowth.PARAM_INTERVAL_START));
        long endTime = Long.parseLong(params.get(PFPGrowth.PARAM_INTERVAL_END));
        long windowSize = Long
                .parseLong(params.get(PFPGrowth.PARAM_WINDOW_SIZE, Long.toString(endTime - startTime)));
        long stepSize = Long.parseLong(params.get(PFPGrowth.PARAM_STEP_SIZE, Long.toString(windowSize)));
        endTime = Math.min(endTime, startTime + windowSize);

        int minSupport = Integer.valueOf(params.get(MIN_SUPPORT, "3"));
        String countIn = params.get(COUNT_IN);
        if (countIn == null) {
            countIn = params.get(OUTROOT); // PUT);
        }
        int minFr = params.getInt(MIN_FREQ, MIN_FREQ_DEFAULT);
        int prunePct = params.getInt(PRUNE_PCTILE, PRUNE_PCTILE_DEFAULT);

        if (params.get(COUNT_IN) == null) {
            startParallelCounting(params, conf);
        }

        if (params.get(GROUP_FIS_IN) == null) {
            // save feature list to dcache
            // List<Pair<String, Long>> fList = readFList(params);
            // saveFList(fList, params, conf);

            int fListSize = cacheFList(params, conf, countIn, minSupport, minFr, prunePct);

            if (runMode.equals(RunningMode.BlockUpdate)) {
                fListSize = -1;
                Path timeRoot = new Path(countIn).getParent().getParent();
                FileSystem fs = FileSystem.getLocal(conf);
                final long currStartTime = startTime;
                for (FileStatus earlierWindow : fs.listStatus(timeRoot, new PathFilter() {
                    @Override
                    public boolean accept(Path p) {
                        // should have used end time, but it doesn't make a difference,
                        // AS LONG AS windows don't overlap
                        return Long.parseLong(p.getName()) < currStartTime;
                    }
                })) {
                    // TODO: At such low frequency and support, does pruning out items with frequency lower
                    // than minFreq cause losing itemsets that are frequent over a longer time frame?
                    cacheFList(params, conf, fs.listStatus(earlierWindow.getPath())[0].getPath().toString(),
                            minSupport, minFr, prunePct);
                }
            } else {
                // set param to control group size in MR jobs
                int numGroups = params.getInt(PFPGrowth.NUM_GROUPS, PFPGrowth.NUM_GROUPS_DEFAULT);
                int maxPerGroup = fListSize / numGroups;
                if (fListSize % numGroups != 0)
                    maxPerGroup++;
                params.set(MAX_PER_GROUP, Integer.toString(maxPerGroup));
            }
            // fList = null;

            startParallelFPGrowth(params, conf);
        } else {
            cacheFList(params, conf, countIn, minSupport, minFr, prunePct);
        }
        startAggregating(params, conf);

        if (runMode.equals(RunningMode.BlockUpdate)) {
            String indexDirStr;// = params.get(INDEX_OUT);
            // if (indexDirStr == null || indexDirStr.isEmpty()) {
            indexDirStr = FilenameUtils.concat(params.get(OUTPUT), "index");
            // } else {
            // indexDirStr = FilenameUtils.concat(indexDirStr, startTime);
            // indexDirStr = FilenameUtils.concat(indexDirStr, endTime);
            // }
            File indexDir = FileUtils.toFile(new URL(indexDirStr));

            // clean up
            FileUtils.deleteQuietly(indexDir);

            Path seqPath = new Path(params.get(OUTPUT), FREQUENT_PATTERNS);
            Directory earlierIndex = null;

            Path timeRoot = new Path(params.get(OUTPUT)).getParent().getParent();
            FileSystem fs = FileSystem.getLocal(conf);

            long mostRecent = Long.MIN_VALUE;
            Path mostRecentPath = null;
            for (FileStatus earlierWindow : fs.listStatus(timeRoot)) {
                long earlierStart = Long.parseLong(earlierWindow.getPath().getName());
                // should have used end time, but it doesn't make a difference,
                // AS LONG AS windows don't overlap
                if (earlierStart < startTime && earlierStart > mostRecent) {
                    mostRecentPath = earlierWindow.getPath();
                    mostRecent = earlierStart;
                }
            }
            if (mostRecentPath != null) {
                mostRecentPath = fs.listStatus(mostRecentPath)[0].getPath();
                mostRecentPath = new Path(mostRecentPath, "index");
                // earlierIndex = new Directory[1];
                // FIXME: as with anything that involves Lucene, this won't work except on a local machine
                earlierIndex = new MMapDirectory(FileUtils.toFile(mostRecentPath.toUri().toURL()));
            }
        }
        // FIXME: When we want to stream, we have to build the index of earlier window
        // ItemSetIndexBuilder.buildIndex(seqPath, indexDir,
        // startTime, Math.min(endTime, startTime + windowSize), earlierIndex);
    }
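
    // The pipeline above chains: startParallelCounting (skipped when COUNT_IN is given),
    // cacheFList (prune the counts and push the fList to the DistributedCache),
    // startParallelFPGrowth (group-dependent FP-Growth over the transactions, skipped when
    // GROUP_FIS_IN is given), and startAggregating (merge the per-group top-K patterns),
    // with additional fList and Lucene index handling when runMode is BlockUpdate.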

    /**
     * Runs the aggregation job to aggregate the different top-K patterns, grouping each pattern by the
     * features present in it and thus calculating the final top-K frequent patterns for each feature
     */
    public static void startAggregating(Parameters params, Configuration conf)
            throws IOException, InterruptedException, ClassNotFoundException {

        conf.set(PFP_PARAMETERS, params.toString());
        conf.set("mapred.compress.map.output", "true");
        conf.set("mapred.output.compression.type", "BLOCK");
        // YA
        // if(Boolean.parseBoolean(params.get(PFPGrowth.PSEUDO, "false"))){
        // conf.set("mapred.tasktracker.map.tasks.maximum", "6");
        // conf.set("mapred.map.child.java.opts", "-Xmx1000M");
        // conf.set("mapred.tasktracker.reduce.tasks.maximum", "6");
        // conf.set("mapred.reduce.child.java.opts", "-Xmx1000M");
        // }
        conf.setInt("mapred.max.map.failures.percent", 10);
        conf.set("mapred.child.java.opts", "-XX:-UseGCOverheadLimit -XX:+HeapDumpOnOutOfMemoryError");
        // END YA

        String gfisIn = params.get(PFPGrowth.GROUP_FIS_IN, params.get(OUTPUT));

        Path input = new Path(gfisIn, FPGROWTH);
        Job job = new Job(conf, "PFP Aggregator Driver running over input: " + input);
        job.setJarByClass(PFPGrowth.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(TopKStringPatterns.class);

        FileInputFormat.addInputPath(job, input);
        Path outPath = new Path(params.get(OUTPUT), FREQUENT_PATTERNS);

        FileOutputFormat.setOutputPath(job, outPath);

        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setMapperClass(AggregatorMapper.class);
        job.setCombinerClass(AggregatorReducer.class);
        job.setReducerClass(AggregatorReducer.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        // HadoopUtil.delete(conf, outPath);
        boolean succeeded = job.waitForCompletion(true);
        if (!succeeded) {
            throw new IllegalStateException("Job failed!");
        }
    }

    /**
     * Count the frequencies of various features in parallel using Map/Reduce
     */
    public static void startParallelCounting(Parameters params, Configuration conf)
            throws IOException, InterruptedException, ClassNotFoundException {
        conf.set(PFP_PARAMETERS, params.toString());

        conf.set("mapred.compress.map.output", "true");
        conf.set("mapred.output.compression.type", "BLOCK");

        // if(Boolean.parseBoolean(params.get(PFPGrowth.PSEUDO, "false"))){
        // conf.set("mapred.tasktracker.map.tasks.maximum", "3");
        // conf.set("mapred.tasktracker.reduce.tasks.maximum", "3");
        // conf.set("mapred.map.child.java.opts", "-Xmx777M");
        // conf.set("mapred.reduce.child.java.opts", "-Xmx777M");
        // conf.setInt("mapred.max.map.failures.percent", 0);
        // }
        conf.set("mapred.child.java.opts", "-XX:-UseGCOverheadLimit -XX:+HeapDumpOnOutOfMemoryError");

        // String input = params.get(INPUT);
        // Job job = new Job(conf, "Parallel Counting Driver running over input: " + input);
        long startTime = Long.parseLong(params.get(PFPGrowth.PARAM_INTERVAL_START));
        // Long.toString(PFPGrowth.TREC2011_MIN_TIMESTAMP)); //GMT23JAN2011));
        long endTime = Long.parseLong(params.get(PFPGrowth.PARAM_INTERVAL_END));
        // Long.toString(Long.MAX_VALUE));

        long windowSize = Long
                .parseLong(params.get(PFPGrowth.PARAM_WINDOW_SIZE, Long.toString(endTime - startTime)));
        long stepSize = Long.parseLong(params.get(PFPGrowth.PARAM_STEP_SIZE, Long.toString(windowSize)));

        endTime = Math.min(endTime, startTime + windowSize);

        FileSystem fs = FileSystem.get(conf); // TODONE: do I need getLocal(conf)?

        Job[] jobArr = new Job[(int) Math.ceil(windowSize / (double) stepSize)];
        for (int j = 0; startTime < endTime; startTime += stepSize, ++j) {
            long jobEnd = Math.min(endTime, startTime + stepSize);
            Job job = new Job(conf, "Parallel counting running over interval " + startTime + "-" + jobEnd);

            // Path outPath = new Path(params.get(OUTPUT), PARALLEL_COUNTING);
            Path outRoot = new Path(params.get(OUTROOT));
            Path stepOutput = new Path(outRoot, startTime + "");
            stepOutput = new Path(stepOutput, jobEnd + "");
            if (fs.exists(stepOutput)) {
                continue;
            }
            jobArr[j] = job;
            Path outPath = new Path(stepOutput, PARALLEL_COUNTING);
            FileOutputFormat.setOutputPath(job, outPath);
            // HadoopUtil.delete(conf, outPath);

            job.setJarByClass(PFPGrowth.class);

            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(LongWritable.class);

            PartitionByTimestamp.setInputPaths(job, params, conf);
            // FileInputFormat.addInputPath(job, new Path(input));

            // job.setInputFormatClass(HtmlTweetInputFormat.class);
            job.setInputFormatClass(CSVTweetInputFormat.class);
            job.setMapperClass(ParallelCountingMapper.class);
            job.setCombinerClass(ParallelCountingReducer.class);
            job.setReducerClass(ParallelCountingReducer.class);
            job.setOutputFormatClass(SequenceFileOutputFormat.class);

            job.submit();

            // boolean succeeded = job.waitForCompletion(true);
            // if (!succeeded) {
            // throw new IllegalStateException("Job failed!");
            // }
        }

        boolean allCompleted;
        do {
            Thread.sleep(1000);
            allCompleted = true;
            for (int j = 0; j < jobArr.length; ++j) {
                if (jobArr[j] == null) {
                    continue;
                }
                boolean complete = jobArr[j].isComplete();
                allCompleted &= complete;
                if (!complete) {
                    String report = (j + " (" + jobArr[j].getJobName() + "): map "
                            + StringUtils.formatPercent(jobArr[j].mapProgress(), 0) + " reduce "
                            + StringUtils.formatPercent(jobArr[j].reduceProgress(), 0) + " - Tracking: "
                            + jobArr[j].getTrackingURL());
                    LOG.info(report);
                }
            }
        } while (!allCompleted);

        boolean allSuccess = true;
        for (int j = 0; j < jobArr.length; ++j) {
            if (jobArr[j] == null) {
                continue;
            }
            boolean success = jobArr[j].isSuccessful();
            allSuccess &= success;
            if (!success) {
                String report = (j + " (" + jobArr[j].getJobName() + "): FAILED - Tracking: "
                        + jobArr[j].getTrackingURL());
                LOG.info(report);
            }
        }
        if (!allSuccess) {
            throw new IllegalStateException("Job failed!");
        }
    }

    /**
     * Run the Parallel FPGrowth Map/Reduce Job to calculate the Top K features of group dependent shards
     */
    public static void startParallelFPGrowth(Parameters params, Configuration conf)
            throws IOException, InterruptedException, ClassNotFoundException {
        conf.set(PFP_PARAMETERS, params.toString());
        conf.set("mapred.compress.map.output", "true");
        conf.set("mapred.output.compression.type", "BLOCK");

        // YA
        // if(Boolean.parseBoolean(params.get(PFPGrowth.PSEUDO, "false"))){
        // conf.set("mapred.tasktracker.map.tasks.maximum", "6");
        // conf.set("mapred.map.child.java.opts", "-Xmx1000M");
        // conf.set("mapred.tasktracker.reduce.tasks.maximum", "6");
        // conf.set("mapred.reduce.child.java.opts", "-Xmx1000M");
        // }
        conf.setInt("mapred.max.map.failures.percent", 10);
        conf.set("mapred.child.java.opts", "-XX:-UseGCOverheadLimit -XX:+HeapDumpOnOutOfMemoryError");
        // END YA

        long startTime = Long.parseLong(params.get(PFPGrowth.PARAM_INTERVAL_START));
        long endTime = Long.parseLong(params.get(PFPGrowth.PARAM_INTERVAL_END));
        long windowSize = Long
                .parseLong(params.get(PFPGrowth.PARAM_WINDOW_SIZE, Long.toString(endTime - startTime)));
        long stepSize = Long.parseLong(params.get(PFPGrowth.PARAM_STEP_SIZE, Long.toString(windowSize)));
        endTime = Math.min(endTime, startTime + windowSize);
        Job job = new Job(conf, "PFPGrowth running over inerval " + startTime + "-" + endTime);

        job.setJarByClass(PFPGrowth.class);

        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(TransactionTree.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(TopKStringPatterns.class);

        // FileSystem fs = FileSystem.get(conf); //TODONE: do I need?getLocal(conf);
        PartitionByTimestamp.setInputPaths(job, params, conf);
        // FileInputFormat.addInputPath(job, input);

        Path outPath = new Path(params.get(OUTPUT), FPGROWTH);
        FileOutputFormat.setOutputPath(job, outPath);

        // HadoopUtil.delete(conf, outPath);

        // job.setInputFormatClass(HtmlTweetInputFormat.class);
        job.setInputFormatClass(CSVTweetInputFormat.class);
        if (runMode.equals(RunningMode.BlockUpdate)) {
            job.setMapperClass(ParallelFPStreamMapper.class);
            // job.setCombinerClass(ParallelFPStreamCombiner.class);
            job.setCombinerClass(ParallelFPGrowthCombiner.class);
            job.setReducerClass(ParallelFPStreamReducer.class);
        } else {
            job.setMapperClass(ParallelFPGrowthMapper.class);
            job.setCombinerClass(ParallelFPGrowthCombiner.class);
            job.setReducerClass(ParallelFPGrowthReducer.class);
        }
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        boolean succeeded = job.waitForCompletion(true);
        if (!succeeded) {
            throw new IllegalStateException("Job failed!");
        }
    }

    private final Parameters intervalParams;

    public PFPGrowth(Parameters pIntervalParams) throws IOException {
        intervalParams = new Parameters(pIntervalParams.toString());
    }

    @Override
    public Void call() throws Exception {
        runPFPGrowth(intervalParams);
        return null;
    }

}
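
Example Usage

A minimal driver sketch. The paths, timestamps, and the wrapping class are hypothetical; the parameter keys are the String constants defined in PFPGrowth above, runPFPGrowth and the Callable constructor are declared in the source, and Parameters.set is assumed to be the standard Mahout Parameters setter.

import org.apache.mahout.common.Parameters;
import org.apache.mahout.freqtermsets.PFPGrowth;

public class PFPGrowthExample {
    public static void main(String[] args) throws Exception {
        Parameters params = new Parameters();
        params.set(PFPGrowth.INPUT, "/data/tweets/csv");              // hypothetical input location
        params.set(PFPGrowth.OUTROOT, "/data/pfp/counts");            // hypothetical counting output root
        params.set(PFPGrowth.OUTPUT, "/data/pfp/patterns");           // hypothetical patterns output
        params.set(PFPGrowth.MIN_SUPPORT, "3");                       // minimum support (default 3)
        params.set(PFPGrowth.PARAM_INTERVAL_START, "1296130141000");  // hypothetical interval start (ms)
        params.set(PFPGrowth.PARAM_INTERVAL_END, "1296133741000");    // hypothetical interval end (ms)

        // Run the full pipeline: counting, fList caching, parallel FP-Growth, aggregation.
        PFPGrowth.runPFPGrowth(params);

        // Alternatively, submit the driver as a Callable; call() delegates to runPFPGrowth.
        // new PFPGrowth(params).call();
    }
}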