eu.eexcess.diversityasurement.iaselect.IAselect.java Source code

Introduction

Here is the source code for eu.eexcess.diversityasurement.iaselect.IAselect.java
Source

/* Copyright (C) 2014 
"Kompetenzzentrum fuer wissensbasierte Anwendungen Forschungs- und EntwicklungsgmbH" 
(Know-Center), Graz, Austria, office@know-center.at.
    
Licensees holding valid Know-Center Commercial licenses may use this file in
accordance with the Know-Center Commercial License Agreement provided with 
the Software or, alternatively, in accordance with the terms contained in
a written agreement between Licensees and Know-Center.
    
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
    
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Affero General Public License for more details.
    
You should have received a copy of the GNU Affero General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
 * @author Raoul Rubien
*/

package eu.eexcess.diversityasurement.iaselect;

import java.util.LinkedHashSet;
import java.util.Set;

import org.apache.commons.lang.StringUtils;

/**
 * Implementation of IA-Select described in [Agrawal, R., Gollapudi, S.,
 * Halverson, A., &amp; Ieong, S. (2009). Diversifying search results. In
 * Proceedings of the Second ACM International Conference on Web Search and Data
 * Mining - WSDM 09 (p. 5). New York, New York, USA: ACM Press.
 * http://doi.org/10.1145/1498759.1498766].
 * 
 * @author Raoul Rubien
 */
public class IAselect {

    /**
     * Document with the highest marginal utility seen by {@link #g} since the
     * last reset; {@code null} while nothing has been evaluated.
     */
    private Document maxMarginalUtilityDocument = null;

    /**
     * Highest marginal utility seen by {@link #g} since the last reset. A
     * negative value marks the state "no candidate evaluated yet".
     */
    private double maxMarginalUtility = -1;

    /**
     * IASelect(k, q, C(q), R(q), C(d), P(c|q), V(d|q,c)) - greedy algorithm to
     * diversify(k) and maximize P(S|q).
     * <p>
     * The given set {@code Rq} is not modified; selection works on a copy that
     * keeps the iteration order of {@code Rq}. If {@code Rq} holds fewer than
     * {@code k} documents, all of them are returned in their re-ranked order.
     * 
     * @param k
     *            number of top documents to select and reorder, must not be
     *            negative
     * @param q
     *            query for ranking documents
     * @param Cq
     *            C(q) - set of categories the query belongs to
     * @param Rq
     *            R(q) - candidate documents as ranked by a classical ranking
     *            algorithm
     * @param V
     *            V(d|q,c) - document quality for query q when the intended
     *            category is c
     * @return re-ranked set of min(k, |R(q)|) documents out of R(q), in
     *         selection order
     * @throws IllegalArgumentException
     *             if {@code k} is negative
     * @throws IllegalStateException
     *             if no remaining candidate yields a marginal utility greater
     *             than the initial value of -1, so that {@link #argmax()} has
     *             nothing to return
     */
    Set<Document> IASelect(int k, Query q, Set<Category> Cq, Set<Document> Rq, DocumentQualityValueV V) {
        if (k < 0) {
            throw new IllegalArgumentException("k must not be negative: " + k);
        }

        // Work on a copy: selected documents are removed from R below and the
        // caller's set must not be drained as a side effect.
        Set<Document> R = new LinkedHashSet<Document>(Rq);
        // Never try to select more documents than there are candidates.
        int numToSelect = Math.min(k, R.size());
        Set<Document> S = new LinkedHashSet<Document>(numToSelect);

        ConditionalProbabilityU U = new ConditionalProbabilityU(Cq);
        System.out.println(U);

        while (S.size() < numToSelect) {
            System.out.println("\n[" + S.size() + "] out of [" + k + "] needed documents selected");
            System.out.println(sToString(S));

            // evaluate g(d|q,c,S) for every remaining candidate; g() records
            // the best one in the instance fields read by argmax()
            clearMaxMarginalUtility();
            for (Document d : R) {
                g(d, q, S, U, V);
            }
            Document dMax = argmax();
            S.add(dMax);

            System.out.println(
                    "select maxarg(g(d|q,c,S=" + sToString(S) + ")=" + maxMarginalUtility + ")=" + dMax.name);
            /*
             * for all c in C(d*) AND c in C(q), because P(c|q) always refers
             * to c in C(q), see section 3.1 of the paper
             */
            for (Category c : C(dMax)) {
                if (Cq.contains(c)) {
                    U.updateU(q.getCategory(c), q, S, dMax, V);
                    System.out.println(U);
                }
            }
            R.remove(dMax);
        }
        return S;
    }

    /**
     * Renders the selected documents as {@code S:{d1, d2, ...}} for the
     * progress output.
     * 
     * @param S
     *            already selected documents
     * @return textual representation of S
     */
    private String sToString(Set<Document> S) {
        return new StringBuilder().append("S:{").append(StringUtils.join(S.toArray(new Document[0]), ", "))
                .append("}").toString();
    }

    /**
     * Resets the running maximum so that the next round of {@link #g} calls
     * starts from scratch; also drops the document of the previous round.
     */
    private void clearMaxMarginalUtility() {
        maxMarginalUtility = -1;
        maxMarginalUtilityDocument = null;
    }

    /**
     * fetches the document with the highest marginal utility g(d|q,c,S)
     * recorded since the last reset
     * 
     * @return the document with the highest recorded marginal utility
     * @throws IllegalStateException
     *             if no marginal utility has been recorded since the last
     *             reset
     */
    Document argmax() throws IllegalStateException {
        if (maxMarginalUtility < 0) {
            throw new IllegalStateException("no argmax calculated at this time");
        }
        return maxMarginalUtilityDocument;
    }

    /**
     * greedy algorithm to diversify(k) and maximize P(S|q); uses the
     * categories of the query itself as C(q)
     * 
     * @param k
     *            number of documents to re-rank out of classical ranked
     *            document set, must not be negative
     * @param q
     *            query to re-rank documents
     * @param Rq
     *            R(q) - top documents returned by a classical ranking algorithm
     *            for query q; not modified
     * @param V
     *            document qualities for all d in R(q)
     * @return re-ranked document set S with |S| = min(k, |R(q)|)
     * @throws IllegalArgumentException
     *             if {@code k} is negative
     */
    public Set<Document> IASelect(int k, Query q, Set<Document> Rq, DocumentQualityValueV V) {
        return IASelect(k, q, C(q), Rq, V);
    }

    /**
     * C(m) - fetches the set of categories to which a query|document m belongs to
     * 
     * @param m
     *            also known as q (query) or d (document)
     * @return set of categories m belongs to
     */
    Set<Category> C(MessageCategories m) {
        return m.categories();
    }

    /**
     * g(d|q,c,S) - calculates the marginal utility of d as the sum over all
     * categories c in both C(d) and C(q) of the product:
     * <p>
     * U(c,q,S) * V(d,q,c)
     * <p>
     * The result is not returned; if it exceeds the highest utility recorded
     * so far, d is remembered as the current {@link #argmax()} candidate.
     * 
     * @param d
     *            document
     * @param q
     *            query
     * @param S
     *            already selected documents
     * @param U
     *            U(c|q,S) - conditional probabilities used as weights
     * @param V
     *            V(d|q,c) - document quality values
     */
    void g(Document d, Query q, Set<Document> S, ConditionalProbabilityU U, DocumentQualityValueV V) {
        double sum = 0;
        Set<Category> queryCategories = C(q);

        /*
         * for all c in C(d) AND c in C(q), because P(c|q) always refers to
         * c in C(q), see section 3.1 of the paper
         */
        for (Category c : C(d)) {
            if (queryCategories.contains(c)) {
                sum += U.U(c, q, S) * V.V(d, q, c);
            }
        }
        // strict comparison: on ties the candidate evaluated first is kept
        if (maxMarginalUtility < sum) {
            maxMarginalUtility = sum;
            maxMarginalUtilityDocument = d;
            System.out.println("argmx=" + maxMarginalUtility + " d=" + d.name);
        }
    }
}