edu.ucsb.cs.partitioning.lsh.LshPartitionMain.java Source code

Introduction

Here is the source code for edu.ucsb.cs.partitioning.lsh.LshPartitionMain.java

Source

/**
 * Copyright 2012-2013 The Regents of the University of California
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on
 * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations under
 * the License.
 * 
 * Author: Maha Alabduljalil <maha (at) cs.ucsb.edu>
 */

package edu.ucsb.cs.partitioning.lsh;

import java.io.IOException;
import java.net.URI;

import org.apache.commons.cli.ParseException;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.GenericOptionsParser;

/**
 * This class takes a text input of records (id word1 word2 ..) and partitions
 * them into bins of similar records; with probability &lt; epsilon it will
 * miss mapping similar pairs together. It filters out records that are
 * dissimilar under the Jaccard measure by placing documents' signatures into
 * the same bucket when the documents are likely to be similar and into
 * different buckets when they are likely not. It is a MapReduce job that uses
 * {@link LshMapper} to generate the documents' signatures and
 * {@link LshReducer} to write documents sharing a signature into the same
 * file for all-to-all/approximate comparison later.
 * 
 * @author Maha Alabduljalil
 * 
 */
public class LshPartitionMain {

    public static final String NAMESPACE = "lsh";
    public static final String THRESHOLD_PROPERTY = NAMESPACE + ".sim.threshold";
    public static final float THRESHOLD_VALUE = 0.44f;
    public static final String L_PROPERTY = NAMESPACE + ".l";
    public static final int L_VALUE = 32;
    public static final String K_PROPERTY = NAMESPACE + ".k";
    public static final int K_VALUE = 6;
    public static final String NUM_FEATURES_PROPERTY = NAMESPACE + ".num.features";
    public static final long NUM_FEATURES_VALUE = 6000;
    public static final String NUM_REDUCERS_PROPERTY = NAMESPACE + ".num.reducers";
    public static final int NUM_REDUCERS_VALUE = 3;

    public static void writeLsh(JobConf job, FileSystem fs, LshTable lshTable) {
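        // Serialize the LshTable into a SequenceFile ("lshfile") on HDFS and
        // register it in the DistributedCache so it is available to the map tasks.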
        try {
            Path lshfile = new Path("lshfile");
            NullWritable none = NullWritable.get();
            if (fs.exists(lshfile))
                fs.delete(lshfile, true);
            SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, lshfile, LshTable.class,
                    NullWritable.class, SequenceFile.CompressionType.NONE);
            writer.append(lshTable, none);
            writer.close();
            DistributedCache.addCacheFile(new URI("lshfile"), job);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static void main(String args[]) throws ParseException, IOException {
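        // Configure the LSH partitioning job: parse generic Hadoop options, set the
        // mapper/reducer classes, write the LSH table to the DistributedCache, and run the job.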

        JobConf job = new JobConf();
        job.setJarByClass(LshPartitionMain.class);
        job.setJobName(LshPartitionMain.class.getSimpleName());
        GenericOptionsParser gop = new GenericOptionsParser(job, args);
        args = gop.getRemainingArgs();

        job.setMapperClass(LshMapper.class);
        job.setMapOutputKeyClass(IntArrayWritable.class); // signatures
        job.setMapOutputValueClass(LongWritable.class); // doc IDs
        job.setNumReduceTasks(job.getInt(NUM_REDUCERS_PROPERTY, NUM_REDUCERS_VALUE));
        job.setReducerClass(LshReducer.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(Text.class);

        if (args.length < 1) {
            throw new UnsupportedOperationException("ERROR: input directory not set.");
        }
        String inputDir = args[0];
        FileInputFormat.addInputPath(job, new Path(inputDir));
        Path outputPath = new Path("lsh-jaccard-buckets");
        FileOutputFormat.setOutputPath(job, outputPath);
        FileSystem.get(job).delete(outputPath, true);

        LshTable lshTable = new LshTable(job.getInt(K_PROPERTY, K_VALUE), job.getInt(L_PROPERTY, L_VALUE), 1024,
                job.getLong(NUM_FEATURES_PROPERTY, NUM_FEATURES_VALUE),
                job.getFloat(THRESHOLD_PROPERTY, THRESHOLD_VALUE));

        writeLsh(job, outputPath.getFileSystem(job), lshTable);

        run(job);

    }

    public static void run(JobConf job) throws IOException {
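        // Print a summary of the job configuration (input/output paths, threshold, k, l),
        // then submit the job and wait for it to complete.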

        String ret = stars() + "\n  Running job:  " + job.getJobName() + "\n  Input Path:   {";
        Path inputs[] = FileInputFormat.getInputPaths(job);
        for (int ctr = 0; ctr < inputs.length; ctr++) {
            if (ctr > 0) {
                ret += "\n                ";
            }
            ret += inputs[ctr].toString();
        }
        ret += "}\n";
        ret += "  Output Path:  " + FileOutputFormat.getOutputPath(job) + "\n" + "  Threshold:    "
                + job.getFloat(THRESHOLD_PROPERTY, THRESHOLD_VALUE) + "\n  k:            "
                + job.getInt(K_PROPERTY, K_VALUE) + "\n  l:            " + job.getInt(L_PROPERTY, L_VALUE);
        System.out.println(ret);
        JobClient.runJob(job);
    }

    public static String stars() {
        return new String(new char[77]).replace("\0", "*");
    }
}
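
Example

The defaults above correspond to l = 32 signatures of k = 6 minhashes each and a Jaccard similarity threshold of 0.44, assuming the usual LSH convention that k is the number of minhash values per signature and l is the number of independent signatures (hash tables). Under the standard banding analysis, two documents with Jaccard similarity s fall into the same bucket for at least one signature with probability 1 - (1 - s^k)^l. The sketch below is not part of the original source; it only evaluates that formula for the class defaults to show how the collision probability grows with similarity.

// Standalone sketch (not part of the original source). It evaluates the
// standard LSH banding collision probability for the defaults used by
// LshPartitionMain, assuming k minhashes per signature and l signatures.
public class LshCollisionSketch {

    // Probability that two documents with Jaccard similarity s share at
    // least one of the l signatures, each built from k minhash values.
    static double collisionProbability(double s, int k, int l) {
        return 1.0 - Math.pow(1.0 - Math.pow(s, k), l);
    }

    public static void main(String[] args) {
        int k = 6;  // LshPartitionMain.K_VALUE
        int l = 32; // LshPartitionMain.L_VALUE
        for (double s = 0.1; s <= 0.91; s += 0.1) {
            System.out.printf("s = %.1f -> P(same bucket) = %.4f%n",
                    s, collisionProbability(s, k, l));
        }
    }
}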