Java tutorial
/* Copyright (C) 2014 "Kompetenzzentrum fuer wissensbasierte Anwendungen Forschungs- und EntwicklungsgmbH" (Know-Center), Graz, Austria, office@know-center.at. Licensees holding valid Know-Center Commercial licenses may use this file in accordance with the Know-Center Commercial License Agreement provided with the Software or, alternatively, in accordance with the terms contained in a written agreement between Licensees and Know-Center. This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. * @author Raoul Rubien */ package eu.eexcess.diversityasurement.iaselect; import java.util.LinkedHashSet; import java.util.Set; import org.apache.commons.lang.StringUtils; /** * Implementation of IA-Select described in [Agrawal, R., Gollapudi, S., * Halverson, A., & Ieong, S. (2009). Diversifying search results. In * Proceedings of the Second ACM International Conference on Web Search and Data * Mining - WSDM 09 (p. 5). New York, New York, USA: ACM Press. * http://doi.org/10.1145/1498759.1498766]. * * @author Raoul Rubien */ public class IAselect { private Document maxMarginalUtilityDocument = null; private double maxMarginalUtility = -1; /** * IASelect(k, q, C(q), R(q), C(d), P(c|q), V (d|q, c)) - greedy * algorithm to diversify(k) and maximize P(S|q) * * @param k * top k documents to reorder * @param q * query for ranking documents * @param Cq * C(q) - set of categories query belongs to * @param Rq * R(q) - top k ranked documents * @param Dq C(d) - set of categories * documents belongs to * @param Pcq P(c|q) - distribution of * probability that category c * belongs to query q * @param Vdqc * V(d|q,c) - document quality for query q qhen intended category * is c * @return resorted list of k documents out of R(q) */ Set<Document> IASelect(int k, Query q, Set<Category> Cq, Set<Document> Rq, DocumentQualityValueV V) { Set<Document> R = Rq; Set<Document> S = new LinkedHashSet<Document>(k); ConditionalProbabilityU U = new ConditionalProbabilityU(Cq); System.out.println(U); while (S.size() < k) { System.out.println("\n[" + S.size() + "] out of [" + k + "] needed documents selected"); System.out.println(sToString(S)); clearMaxMarginalUtility(); for (Document d : R) { g(d, q, S, U, V); } Document dMax = argmax(); S.add(dMax); System.out.println( "select maxarg(g(d|q,c,S=" + sToString(S) + ")=" + maxMarginalUtility + ")=" + dMax.name); /** * for all c C(d*) AND "c C(q)" because but P(c|q) always refers * to c C(q), see 3.1 */ for (Category c : C(dMax)) { if (Cq.contains(c)) { U.updateU(q.getCategory(c), q, S, dMax, V); System.out.println(U); } } R.remove(dMax); } return S; } private String sToString(Set<Document> S) { return new StringBuilder().append("S:{").append(StringUtils.join(S.toArray(new Document[0]), ", ")) .append("}").toString(); } private void clearMaxMarginalUtility() { maxMarginalUtility = -1; } /** * fetches the document with the highest marginal utility g(d|q,c,S) * * @return */ Document argmax() throws IllegalStateException { if (maxMarginalUtility < 0) { throw new IllegalStateException("no argmax calculated at this time"); } return maxMarginalUtilityDocument; } /** * greedy algorithm to diversify(k) and maximize P(S|q) * * @param k * number of documents to re-rank out of classical ranked * document set * @param q * query to re-rank documents * @param Rq * R(q) - top documents returned by a classical ranking algorithm * for query q * @param V * document qualities for d R(q) * @return re-ranked document list with |R(q)| = k */ public Set<Document> IASelect(int k, Query q, Set<Document> Rq, DocumentQualityValueV V) { return IASelect(k, q, C(q), Rq, V); } /** * C(m) - fetches the set of categories to which a query|document m belongs to * * @param m * also known as q (query) or d (document) * @return set of categories m belongs to */ Set<Category> C(MessageCategories m) { return m.categories(); } /** * g(d|q,c,S) - calculated the highest marginal utility as a product of: * <p> * U(c,q,S) * V(d,q,c) * * @param d * document * @param q * query * @param S * already selected documents */ void g(Document d, Query q, Set<Document> S, ConditionalProbabilityU U, DocumentQualityValueV V) { double sum = 0; /** * for all c C(d) AND "c C(q)" because but P(c|q) always refers to c * C(q), see 3.1 */ for (Category c : C(d)) { if (C(q).contains(c)) { sum += U.U(c, q, S) * V.V(d, q, c); } } if (maxMarginalUtility < sum) { maxMarginalUtility = sum; maxMarginalUtilityDocument = d; System.out.println("argmx=" + maxMarginalUtility + " d=" + d.name); } } }