mx.itam.metodos.minhashing.MinhashMapper.java Source code

Java tutorial

Introduction

Here is the source code for mx.itam.metodos.minhashing.MinhashMapper.java

Source

package mx.itam.metodos.minhashing;

// This method is based on Broder '97, "Syntactic Clustering of the Web",
// plus LSH as described in Rajaraman, Leskovec and Ullman (2012),
// and on code originally found in org.apache.mahout.clustering.minhash.MinHashMapper,
// available under the Apache License 2.0.

import java.io.IOException;
import java.util.Random;

import mx.itam.metodos.common.IntArrayWritable;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Mapper;

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hasher;
import com.google.common.hash.Hashing;

/**
 * Mapper that turns each input record into a minhash signature and then emits
 * one LSH bucket key per band of that signature.
 *
 * <p>Input: a record id and the integer elements of that record's set.
 * Output: for every band, the pair {@code ("<band>-<bandHash>", id)}, so that
 * records sharing a band hash end up under the same key.
 *
 * <p>The signature has {@code functionsCount} entries; it is cut into
 * consecutive bands of {@code rows} entries each. If {@code functionsCount}
 * is not a multiple of {@code rows}, the leftover entries form one final,
 * shorter band.
 */
public final class MinhashMapper extends Mapper<Text, IntArrayWritable, Text, Text> {

    /** Hash function used to collapse the values of one band into a bucket id. */
    private HashFunction lsh;

    /** The seeded hash functions; one signature entry is computed per function. */
    private HashFunction[] functions;

    /** Number of hash functions, i.e. the length of the minhash signature. */
    private int functionsCount;

    /** Number of signature entries per band. */
    private int rows;

    /** Scratch buffer holding the signature of the record currently being mapped. */
    private int[] hashValues;

    /**
     * Computes the minhash signature of {@code values} and writes one
     * {@code (bandKey, id)} pair per band.
     *
     * <p>Note: when {@code values} yields no elements every signature entry
     * stays at {@link Integer#MAX_VALUE}, so all such records produce the same
     * band keys.
     *
     * @param id     identifier of the record; emitted as the output value
     * @param values elements of the record; each one is cast to {@link IntWritable}
     * @param ctx    Hadoop context the pairs are written to
     */
    @Override
    public void map(Text id, IntArrayWritable values, Context ctx) throws IOException, InterruptedException {
        // Signature entry i = minimum of hash function i over all elements.
        for (int i = 0; i < functionsCount; i++) {
            HashFunction hf = functions[i];
            int min = Integer.MAX_VALUE;
            for (Writable wr : values.get()) {
                IntWritable value = (IntWritable) wr;
                int hash = hf.hashInt(value.get()).asInt();
                if (hash < min) {
                    min = hash;
                }
            }
            hashValues[i] = min;
        }

        Text sketch = new Text();
        Hasher hasher = lsh.newHasher();
        int band = 0;
        int pending = 0; // signature entries fed into the current band so far
        for (int i = 0; i < functionsCount; i++) {
            hasher.putInt(hashValues[i]);
            pending++;
            // Flush exactly when the band holds `rows` entries, so every full
            // band has the same width (the first band is no longer one wider).
            if (pending == rows) {
                emitBand(band, hasher, sketch, id, ctx);
                hasher = lsh.newHasher();
                pending = 0;
                band++;
            }
        }
        // Only emit a trailing band if it actually contains entries; hashing an
        // empty band would give every record the same key for that band.
        if (pending > 0) {
            emitBand(band, hasher, sketch, id, ctx);
        }
    }

    /**
     * Finishes the hash of one band and writes {@code ("<band>-<hash>", id)}.
     * The {@code sketch} instance is reused as the key holder across bands.
     */
    private void emitBand(int band, Hasher hasher, Text sketch, Text id, Context ctx)
            throws IOException, InterruptedException {
        sketch.set(band + "-" + hasher.hash().toString());
        ctx.write(sketch, id);
    }

    /**
     * Reads the band width from the job configuration and builds the hash
     * functions.
     *
     * <p>The seeds come from a {@link Random} created with a fixed seed, so the
     * same sequence of hash functions is produced on every call.
     *
     * @throws IllegalArgumentException if the configured band width is not positive
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        this.functionsCount = 100;
        this.rows = context.getConfiguration().getInt(HadoopMinhashing.ROWS, 10);
        if (rows <= 0) {
            throw new IllegalArgumentException(
                    HadoopMinhashing.ROWS + " must be a positive integer but was " + rows);
        }
        this.hashValues = new int[functionsCount];
        this.functions = new HashFunction[functionsCount];
        Random r = new Random(11);
        for (int i = 0; i < functionsCount; i++) {
            functions[i] = Hashing.murmur3_32(r.nextInt());
        }
        this.lsh = Hashing.murmur3_32(r.nextInt());
    }
}