eu.morfeoproject.fast.catalogue.recommender.FPGrowth.java Source code

Java tutorial

Introduction

Here is the source code for eu.morfeoproject.fast.catalogue.recommender.FPGrowth.java

Source

/**
 * Copyright (c) 2008-2011, FAST Consortium
 * 
 * This file is part of FAST Platform.
 * 
 * FAST Platform is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * FAST Platform is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
 * License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * along with FAST Platform. If not, see <http://www.gnu.org/licenses/>.
 * 
 * Info about members and contributors of the FAST Consortium
 * is available at http://fast.morfeo-project.eu
 *
 **/
package eu.morfeoproject.fast.catalogue.recommender;

import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.mahout.common.FileLineIterable;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.Parameters;
import org.apache.mahout.common.StringRecordIterator;
import org.apache.mahout.fpm.pfpgrowth.PFPGrowth;
import org.apache.mahout.fpm.pfpgrowth.convertors.ContextStatusUpdater;
import org.apache.mahout.fpm.pfpgrowth.convertors.SequenceFileOutputCollector;
import org.apache.mahout.fpm.pfpgrowth.convertors.string.StringOutputConverter;
import org.apache.mahout.fpm.pfpgrowth.convertors.string.TopKStringPatterns;

public class FPGrowth {

    protected final Log log = LogFactory.getLog(this.getClass());

    private Parameters params;
    private Configuration conf;

    public FPGrowth(Parameters params) {
        this.params = params;
        this.conf = new Configuration();
    }

    public void rebuild() {
        log.info("Starting Sequential FPGrowth");
        int maxHeapSize = Integer.valueOf(params.get("maxHeapSize", "50"));
        int minSupport = Integer.valueOf(params.get("minSupport", "2"));

        try {
            String output = params.get("output", "fpgrowth-output.dat");
            Path path = new Path(output);
            FileSystem fs = FileSystem.get(this.conf);

            Charset encoding = Charset.forName(params.get("encoding"));
            String input = params.get("input");

            String pattern = params.get("splitPattern", PFPGrowth.SPLITTER.toString());

            SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path, Text.class,
                    TopKStringPatterns.class);

            org.apache.mahout.fpm.pfpgrowth.fpgrowth.FPGrowth<String> fp = new org.apache.mahout.fpm.pfpgrowth.fpgrowth.FPGrowth<String>();
            Set<String> features = new HashSet<String>();

            fp.generateTopKFrequentPatterns(
                    new StringRecordIterator(new FileLineIterable(new File(input), encoding, false), pattern),
                    fp.generateFList(new StringRecordIterator(
                            new FileLineIterable(new File(input), encoding, false), pattern), minSupport),
                    minSupport, maxHeapSize, features,
                    new StringOutputConverter(new SequenceFileOutputCollector<Text, TopKStringPatterns>(writer)),
                    new ContextStatusUpdater(null));
            writer.close();

            if (log.isInfoEnabled()) {
                List<Pair<String, TopKStringPatterns>> frequentPatterns = org.apache.mahout.fpm.pfpgrowth.fpgrowth.FPGrowth
                        .readFrequentPattern(fs, conf, path);
                for (Pair<String, TopKStringPatterns> entry : frequentPatterns) {
                    log.info("Dumping Patterns for Feature: " + entry.getFirst() + " \n"
                            + entry.getSecond().toString());
                }
            }
        } catch (IOException e) {
            log.error(e.toString(), e);
        }
    }

    public TopKStringPatterns getTopKFrequentPatterns(String feature) {
        try {
            String output = params.get("output", "fpgrowth-output.dat");
            Path path = new Path(output);
            FileSystem fs = FileSystem.get(this.conf);

            List<Pair<String, TopKStringPatterns>> frequentPatterns = org.apache.mahout.fpm.pfpgrowth.fpgrowth.FPGrowth
                    .readFrequentPattern(fs, conf, path);
            for (Pair<String, TopKStringPatterns> entry : frequentPatterns) {
                if (entry.getFirst().equals(feature))
                    return entry.getSecond();
            }
        } catch (IOException e) {
            log.error(e.toString(), e);
        }
        return null;
    }

}