org.thiesen.jiffs.jobs.clusterer.ClusterFinder.java Source code

Introduction

Here is the source code for org.thiesen.jiffs.jobs.clusterer.ClusterFinder.java
Source

/*
 * (c) Copyright 2010 Marcus Thiesen (marcus@thiesen.org)
 *
 *  This file is part of jiffs.
 *
 *  jiffs is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  jiffs is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with jiffs.  If not, see <http://www.gnu.org/licenses/>.
 *
 */

package org.thiesen.jiffs.jobs.clusterer;

import gnu.trove.TObjectIntHashMap;

import java.util.Map;
import java.util.Set;
import java.util.concurrent.Semaphore;
import java.util.concurrent.atomic.AtomicLong;

import com.google.common.collect.Iterables;
import com.google.common.collect.Sets;

/**
 * Task that compares two {@link ClusterItem}s and, when their similarity
 * reaches {@code Clusterer.BASE_SIMILARITY}, records both item URIs under a
 * common cluster id in the shared {@code clusters} map.
 *
 * <p>All reads and writes of the cluster map performed by this class happen
 * while holding a single class-wide lock, so several instances may run
 * concurrently against the same map. One permit of the supplied semaphore is
 * released when {@link #run()} finishes, whether or not it threw; presumably
 * the submitter acquired that permit before enqueueing the task (the
 * acquiring side is not visible in this file).
 */
public class ClusterFinder implements Runnable {

    /** Guards every access this class makes to a cluster map (shared by all instances). */
    private static final Object CLUSTER_LOCK = new Object();

    /** Source of fresh cluster ids; the first id handed out is 1. */
    private static final AtomicLong CLUSTER_COUNT = new AtomicLong();

    private final Map<String, Long> _clusters;
    private final ClusterItem _firstItem;
    private final ClusterItem _secondItem;
    private final Semaphore _maxEnqueuedTasks;

    /**
     * @param maxEnqueedTasks semaphore that gets one permit released when this task finishes
     * @param clusters        shared map from item URI to cluster id; updated in place
     * @param firstItem       first item of the pair to compare
     * @param secondItem      second item of the pair to compare
     */
    public ClusterFinder(Semaphore maxEnqueedTasks, Map<String, Long> clusters, ClusterItem firstItem,
            ClusterItem secondItem) {
        _maxEnqueuedTasks = maxEnqueedTasks;
        _clusters = clusters;
        _firstItem = firstItem;
        _secondItem = secondItem;
    }

    /**
     * Runs the comparison and always releases one semaphore permit afterwards,
     * even if the comparison throws.
     */
    @Override
    public void run() {
        try {
            findCluster();
        } finally {
            _maxEnqueuedTasks.release();
        }
    }

    /**
     * Puts both items into the same cluster when they are similar enough;
     * does nothing otherwise.
     */
    private void findCluster() {
        // The similarity is computed outside the lock so that only the map update is serialized.
        if (computeSimilarity() < Clusterer.BASE_SIMILARITY) {
            return;
        }

        final String firstUri = _firstItem.getUri();
        final String secondUri = _secondItem.getUri();
        synchronized (CLUSTER_LOCK) {
            final Long clusterId = findClusterId(firstUri, secondUri);

            _clusters.put(firstUri, clusterId);
            _clusters.put(secondUri, clusterId);
        }
    }

    /**
     * Builds one term vector per item over the union of both word lists and
     * returns their dot product.
     *
     * NOTE(review): presumably this is the cosine similarity of the two word
     * sets, assuming TermVector.normalize() scales to unit length - confirm
     * against TermVector.
     */
    private double computeSimilarity() {
        final Iterable<String> firstWords = _firstItem.getWords();
        final Iterable<String> secondWords = _secondItem.getWords();

        final Set<String> allWords = makeAllWordsSet(firstWords, secondWords);
        final TObjectIntHashMap<String> wordsByPosition = makeWordsByPositionMap(allWords);

        final TermVector firstVector = makeVector(firstWords, allWords, wordsByPosition);
        final TermVector secondVector = makeVector(secondWords, allWords, wordsByPosition);

        return firstVector.dot(secondVector);
    }

    /**
     * Creates a normalized vector that holds 1.0 at the position of every
     * word in {@code words}.
     *
     * @param words           the words of one item
     * @param allWords        union of the words of both items; its size is passed to the vector
     * @param wordsByPosition position assigned to each word of {@code allWords}
     */
    private TermVector makeVector(final Iterable<String> words, final Set<String> allWords,
            final TObjectIntHashMap<String> wordsByPosition) {
        final TermVector vector = new TermVector(allWords.size());
        for (final String word : words) {
            // Repeated words simply set the same position to 1.0 again.
            vector.put(wordsByPosition.get(word), 1.0D);
        }
        vector.normalize();
        return vector;
    }

    /** Returns the sorted union of both word sequences. */
    private Set<String> makeAllWordsSet(final Iterable<String> firstWords, final Iterable<String> secondWords) {
        final Set<String> allWords = Sets.newTreeSet();
        Iterables.addAll(allWords, firstWords);
        Iterables.addAll(allWords, secondWords);
        return allWords;
    }

    /**
     * Assigns each word a position in iteration order of {@code allWords},
     * counting from 1.
     *
     * NOTE(review): positions run from 1 to allWords.size() while the vectors
     * are created with size allWords.size(); whether TermVector indices are
     * 1-based is not visible here - confirm before changing the start value.
     */
    private TObjectIntHashMap<String> makeWordsByPositionMap(final Set<String> allWords) {
        final TObjectIntHashMap<String> wordsByPosition = new TObjectIntHashMap<String>();
        int position = 1;
        for (final String word : allWords) {
            wordsByPosition.put(word, position++);
        }
        return wordsByPosition;
    }

    /**
     * Determines the cluster id both URIs should share. Must be called while
     * holding {@code CLUSTER_LOCK}.
     *
     * <ul>
     * <li>If {@code firstUri} already has an id, that id wins. If
     * {@code secondUri} has a different id, every URI currently mapped to
     * that other id is moved to the winning id, so the two existing clusters
     * are merged instead of leaving the remaining members behind under the
     * old id.</li>
     * <li>Otherwise, if {@code secondUri} has an id, it is used.</li>
     * <li>Otherwise a fresh id is allocated.</li>
     * </ul>
     *
     * @return the id to store for both URIs, never {@code null}
     */
    private Long findClusterId(String firstUri, String secondUri) {
        final Long firstId = _clusters.get(firstUri);
        final Long secondId = _clusters.get(secondUri);

        if (firstId != null) {
            if (secondId != null && !secondId.equals(firstId)) {
                mergeCluster(secondId, firstId);
            }
            return firstId;
        }
        if (secondId != null) {
            return secondId;
        }

        return Long.valueOf(CLUSTER_COUNT.incrementAndGet());
    }

    /**
     * Moves every URI mapped to {@code absorbedId} over to {@code survivingId}.
     * Must be called while holding {@code CLUSTER_LOCK}.
     */
    private void mergeCluster(final Long absorbedId, final Long survivingId) {
        // Collect the keys first and write afterwards, so the map is never
        // modified while it is being iterated and no Entry.setValue support is needed.
        final Set<String> urisToMove = Sets.newHashSet();
        for (final Map.Entry<String, Long> entry : _clusters.entrySet()) {
            if (absorbedId.equals(entry.getValue())) {
                urisToMove.add(entry.getKey());
            }
        }
        for (final String uri : urisToMove) {
            _clusters.put(uri, survivingId);
        }
    }

}