com.nomsic.sid.Sid.java Source code

Java tutorial

Introduction

Here is the source code for com.nomsic.sid.Sid.java

Source

/******************************************************************
 *  Copyright 2011 Simon Kelly
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *  
 *  Sid.java
 */
package com.nomsic.sid;

import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.SortedMap;
import java.util.SortedSet;

import org.apache.commons.collections.CollectionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.common.io.Files;

/**
 * Sid builds Srensen similarity indexes between records based on the records
 * fields. 
 * 
 * @author Simon Kelly <simongdkelly@gmail.com>
 */
public class Sid {

    private static final Logger log = LoggerFactory.getLogger(Sid.class);

    /**
     * This map contains entries whose key is a recordId and value is a sorted
     * set of Simsets that the record belongs to i.e. the record is present in
     * each Simset
     */
    private SortedMap<String, Set<Simset>> recordSimsetMap;

    /**
     * This map contains entries whose key is a recordId and value is the number
     * of fields that the record contains.
     */
    private Map<String, Integer> recordFieldCountMap;

    private File recordFile;

    private Collection<SortedSet<String>> recordSets;

    public boolean ignoreFirstLine;

    public Sid() {
        recordSimsetMap = Maps.newTreeMap();
        recordFieldCountMap = Maps.newHashMap();
    }

    private void buildRecordFieldCountMap() {
        for (String recordId : recordSimsetMap.keySet()) {
            int recordFieldCount = getRecordFieldCount(recordId);
            recordFieldCountMap.put(recordId, recordFieldCount);
        }
    }

    /**
     * Given the cardinality map of the record field sets this method creates
     * the Simsets and populates the recordSimsetMap
     * 
     * @param recordSetCardinalityMap
     *            map with key equal to a set of recordIds and value equal to
     *            the number of times that set of recordIds appeared in the
     *            original set of record sets
     */
    private void buildRecordSimsetMap(Map<SortedSet<String>, Integer> recordSetCardinalityMap) {
        for (Entry<SortedSet<String>, Integer> entry : recordSetCardinalityMap.entrySet()) {
            Simset simset = new Simset(entry.getValue(), entry.getKey());
            for (String recordId : simset.getRecordSet()) {
                if (recordSimsetMap.containsKey(recordId)) {
                    boolean add = recordSimsetMap.get(recordId).add(simset);
                    if (!add) {
                        log.error("Simset '{}' not added for record '{}'", simset, recordId);
                    }
                } else {
                    Set<Simset> treeSet = Sets.newHashSet();
                    treeSet.add(simset);
                    recordSimsetMap.put(recordId, treeSet);
                }
            }
        }

    }

    /**
     * Given recordA and recordB calculate the Srensen similarity index
     * between the two records.
     * 
     * @param recordA
     *            the id of recordA
     * @param recordB
     *            the id of recordB
     * @return the Srensen similarity index for recordA and recordB
     */
    private double calculateSimlarity(String recordA, String recordB) {
        Integer recordAFieldCount = recordFieldCountMap.get(recordA);
        Integer recordBFieldCount = recordFieldCountMap.get(recordB);
        int commonFieldCount = countCommonFields(recordA, recordB);
        double qs = (double) (2 * commonFieldCount) / (double) (recordAFieldCount + recordBFieldCount);
        return qs;
    }

    private Set<Match> computeMatches() {
        int recordCount = 1;
        int recordSetCount = 1;
        int matchCount = 1;

        Set<Match> matches = Sets.newHashSet();
        for (Entry<String, Set<Simset>> entry : recordSimsetMap.entrySet()) {
            recordCount++;
            String recordA = entry.getKey();
            for (Simset simset : entry.getValue()) {
                SortedSet<String> potentialSimalarRecords = simset.getRecordSet();
                recordSetCount++;
                for (String recordB : potentialSimalarRecords) {
                    Match match = new Match(recordA, recordB);
                    if (!recordB.equals(recordA) && !matches.contains(match)) {
                        matchCount++;
                        double qs = calculateSimlarity(recordA, recordB);
                        match.setSimilarityIndex(qs);
                        matches.add(match);
                    }
                }
            }
        }
        log.debug("{} records processed", String.valueOf(recordCount));
        log.debug("{} record sets processed", String.valueOf(recordSetCount));
        log.debug("{} matches tested", String.valueOf(matchCount));
        return matches;
    }

    /**
     * Calculate the number of common fields between recordA and recordB by
     * summing the fieldCount for each Simset belonging to recordA that also
     * contains recordB
     * 
     * @param recordA
     * @param recordB
     * @return number of fields in common
     */
    private int countCommonFields(String recordAId, String recordBId) {
        int count = 0;
        Set<Simset> recordASimsets = recordSimsetMap.get(recordAId);
        for (Simset simset : recordASimsets) {
            if (simset.getRecordSet().contains(recordBId)) {
                count += simset.getFieldCount();
            }
        }
        return count;
    }

    /**
     * 
     * @param thresholdSq
     * @return
     * @throws IOException
     */
    public List<Match> getMatches(final double thresholdSq) throws IOException {

        if (recordSets == null) {
            Preconditions.checkNotNull(recordFile);
            FieldRecordProcessor processor = new FieldRecordProcessor();
            processor.setIgnoreFirstLine(ignoreFirstLine);
            recordSets = Files.readLines(recordFile, Charset.defaultCharset(), processor);
        }

        @SuppressWarnings("unchecked")
        Map<SortedSet<String>, Integer> recordSetCardinalityMap = CollectionUtils.getCardinalityMap(recordSets);

        buildRecordSimsetMap(recordSetCardinalityMap);
        buildRecordFieldCountMap();

        Set<Match> matches = computeMatches();
        matches = Sets.filter(matches, new Predicate<Match>() {
            @Override
            public boolean apply(Match m) {
                return m.getSimilarityIndex() > thresholdSq;
            }
        });
        ArrayList<Match> matchList = Lists.newArrayList();
        matchList.addAll(matches);
        Collections.sort(matchList, new Comparator<Match>() {
            @Override
            public int compare(Match o1, Match o2) {
                return o2.getSimilarityIndex().compareTo(o1.getSimilarityIndex());
            }
        });
        return matchList;
    }

    /**
     * Calculates the number of fields for a given record based on the set of
     * Simsets it belongs to.
     * 
     * @param recordId
     * @return fieldCount
     */
    private int getRecordFieldCount(String recordId) {
        int fieldCount = 0;
        for (Simset simset : recordSimsetMap.get(recordId)) {
            fieldCount += simset.getFieldCount();
        }
        return fieldCount;
    }

    public void loadFromFile(String recordFileName) throws IOException {
        loadFromFile(new File(recordFileName));
    }

    public void loadFromFile(File recordFile) throws IOException {
        try {
            log.info("Reading from file '{}'", recordFile.getAbsolutePath());
            String firstLine = Files.readFirstLine(recordFile, Charset.defaultCharset());
            log.debug("File first line: {}", firstLine);
            this.recordFile = recordFile;
        } catch (IOException e) {
            log.error("Error reading from file", e);
            throw e;
        }
    }

    public void loadFromRecordMap(Map<String, String> fieldRecordMap) {
        FieldRecordProcessor processor = new FieldRecordProcessor();
        processor.setIgnoreFirstLine(ignoreFirstLine);
        for (Entry<String, String> entry : fieldRecordMap.entrySet()) {
            processor.addFieldRecord(entry.getKey(), entry.getValue());
        }
        recordSets = processor.getResult();
    }

    public void setIgnoreFirstLine(boolean ignoreFirstLine) {
        this.ignoreFirstLine = ignoreFirstLine;
    }

}