edu.uci.ics.fuzzyjoin.hadoop.recordpairs.MapBroadcastSelfJoin.java Source code

Java tutorial

Introduction

Here is the source code for edu.uci.ics.fuzzyjoin.hadoop.recordpairs.MapBroadcastSelfJoin.java

Source

/**
 * Copyright 2010-2011 The Regents of the University of California
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations under
 * the License.
 * 
 * Author: Rares Vernica <rares (at) ics.uci.edu>
 */

package edu.uci.ics.fuzzyjoin.hadoop.recordpairs;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

import edu.uci.ics.fuzzyjoin.FuzzyJoinConfig;
import edu.uci.ics.fuzzyjoin.RIDPairSimilarity;
import edu.uci.ics.fuzzyjoin.hadoop.FuzzyJoinDriver;
import edu.uci.ics.fuzzyjoin.hadoop.IntPairWritable;

/**
 * Mapper that joins input records against an in-memory index of RID pairs.
 * The index is loaded once per task in {@link #configure(JobConf)}; for every
 * input record whose RID appears in the index, one output is emitted per
 * indexed pair that the RID participates in.
 * 
 * @author rares
 * 
 *         KEY1: Unused
 * 
 *         VALUE1: Record
 * 
 *         KEY2: RID1, RID2
 * 
 *         VALUE2: Similarity; Record
 */
public class MapBroadcastSelfJoin extends MapReduceBase implements Mapper<Object, Text, IntPairWritable, Text> {

    private static final Log LOG = LogFactory.getLog(MapBroadcastSelfJoin.class);

    /** Maps a RID to every (rid1, rid2, similarity) pair in which it appears. */
    private final HashMap<Integer, ArrayList<RIDPairSimilarity>> joinIndex = new HashMap<Integer, ArrayList<RIDPairSimilarity>>();

    /** Output key instance reused across calls to {@code map}. */
    private final IntPairWritable key = new IntPairWritable();

    /**
     * Reads one join-index file and adds its pairs to {@link #joinIndex}.
     * Each line is split on a single space into
     * {@code rid1 rid2 similarity}.
     * 
     * @param path
     *            location of the join-index file; it is opened with
     *            {@link FileReader}, i.e. as a local file
     * @throws RuntimeException
     *             wrapping any failure to open, read or parse the file
     */
    private void addJoinIndex(Path path) {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(path.toString()));
            String line = null;
            while ((line = reader.readLine()) != null) {
                String[] splits = line.split(" ");
                int rid1 = Integer.parseInt(splits[0]);
                int rid2 = Integer.parseInt(splits[1]);
                float similarity = Float.parseFloat(splits[2]);

                // Normalize the pair so that rid1 holds the larger RID.
                // NOTE(review): confirm that "larger RID first" is the order
                // the downstream consumers of KEY2 expect.
                if (rid1 < rid2) {
                    int r = rid1;
                    rid1 = rid2;
                    rid2 = r;
                }

                RIDPairSimilarity ridSimilarity = new RIDPairSimilarity(rid1, rid2, similarity);

                // Use ArrayList to lower the memory requirements. This causes
                // duplicates to be output. The duplicates need to be
                // eliminated in the Reduce.
                addToJoinIndex(rid1, ridSimilarity);
                addToJoinIndex(rid2, ridSimilarity);
            }
        } catch (Exception e) {
            throw new RuntimeException("Failed to load join index from " + path, e);
        } finally {
            // Always release the file handle, including on parse failures.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    LOG.warn("Failed to close join index file " + path, e);
                }
            }
        }
    }

    /**
     * Appends a pair to the list stored under the given RID, creating the
     * list on first use.
     */
    private void addToJoinIndex(int rid, RIDPairSimilarity ridSimilarity) {
        ArrayList<RIDPairSimilarity> ridSimilarities = joinIndex.get(rid);
        if (ridSimilarities == null) {
            ridSimilarities = new ArrayList<RIDPairSimilarity>();
            joinIndex.put(rid, ridSimilarities);
        }
        ridSimilarities.add(ridSimilarity);
    }

    /**
     * Loads the join index before any record is mapped. The files returned
     * by {@link DistributedCache#getLocalCacheFiles} are used when that call
     * returns a non-null array; otherwise the single path stored under
     * {@link FuzzyJoinDriver#DATA_JOININDEX_PROPERTY} is read.
     * 
     * @param job
     *            the job configuration
     * @throws RuntimeException
     *             if the cache file list cannot be obtained or an index file
     *             cannot be loaded
     */
    @Override
    public void configure(JobConf job) {
        LOG.info("Configure START");
        //
        // read join index
        //
        Path[] cache = null;
        try {
            cache = DistributedCache.getLocalCacheFiles(job);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        if (cache == null) {
            addJoinIndex(new Path(job.get(FuzzyJoinDriver.DATA_JOININDEX_PROPERTY)));
        } else {
            for (Path path : cache) {
                addJoinIndex(path);
            }
        }
        LOG.info("Configure END");
    }

    /**
     * Emits the record once for every indexed pair that contains its RID.
     * The output key is the pair's (rid1, rid2); the output value is the
     * pair's similarity, {@link FuzzyJoinConfig#RECORD_EXTRA_SEPARATOR}, and
     * the original record text. Records whose RID is not in the index
     * produce no output.
     * 
     * @param unused
     *            input key, ignored
     * @param record
     *            input record; the same instance is overwritten and reused
     *            as the output value
     * @param output
     *            collector receiving the (RID pair, similarity + record)
     *            outputs
     * @param reporter
     *            not used
     * @throws IOException
     *             if the collector fails
     */
    public void map(Object unused, Text record, OutputCollector<IntPairWritable, Text> output, Reporter reporter)
            throws IOException {
        // Capture the text first: 'record' is overwritten below.
        String recordStr = record.toString();
        String valueSplit[] = recordStr.split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX);
        int rid = Integer.parseInt(valueSplit[FuzzyJoinConfig.RECORD_KEY]);
        ArrayList<RIDPairSimilarity> set = joinIndex.get(rid);
        if (set != null) {
            for (RIDPairSimilarity ridSimilarity : set) {
                key.set(ridSimilarity.rid1, ridSimilarity.rid2);
                record.set("" + ridSimilarity.similarity + FuzzyJoinConfig.RECORD_EXTRA_SEPARATOR + recordStr);
                output.collect(key, record);
            }
        }
    }
}