// Java tutorial
/*
 * Copyright (C) 2013 Ilya Markov
 *
 * Full copyright notice can be found in LICENSE.
 */
package ch.usi.inf.lidr.merging;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.math.stat.regression.SimpleRegression;

import ch.usi.inf.lidr.utils.ScoredEntity;

/**
 * SAFE results merging/score normalization algorithm.
 * SAFE learns the transformation function between centralized document ranks
 * and centralized document scores.
 * Then this function is applied to source-specific ranks (<code>1</code>, <code>2</code>, ...)
 * to obtain corresponding normalized scores.
 *
 * <p>
 * <b>IMPORTANT:</b> For this class to function correctly,
 * its type <code>U</code>
 * should be the same as the type <code>T</code> of the {@link #normalize(List)} method.
 * </p>
 *
 * @author Ilya Markov
 *
 * @see "Robust result merging using sample-based score estimates",
 *      Milad Shokouhi and Justin Zobel.
 *      <i>ACM Transactions on Information Systems</i>, 27:3, pages 1-29, 2009.
 */
public final class SAFE<U> implements ResultsMerging {

	/**
	 * An abstract class that performs regression
	 * of the type <code>y = a * f(x) + b</code>, where the function
	 * <code>f</code> should be overridden by subclasses.
	 *
	 * @author Ilya Markov
	 */
	private abstract static class Regression {
		/**
		 * The regression.
		 */
		private final SimpleRegression regression = new SimpleRegression();

		/**
		 * Adds the observation <code>(f(x), y)</code>
		 * to the regression data set.
		 *
		 * @param x The independent variable value.
		 * @param y The dependent variable value.
		 */
		public void addData(double x, double y) {
			regression.addData(f(x), y);
		}

		/**
		 * Returns the "predicted" y value associated
		 * with the supplied x value, based on the data that
		 * has been added to the model so far.
		 * In particular, <code>y = a * f(x) + b</code>,
		 * where <code>a</code> and <code>b</code>
		 * are estimated by the regression.
		 *
		 * <p>
		 * Returns 0 if number of added observations is less than 2.
		 * </p>
		 *
		 * @param x The input <code>x</code> value.
		 *
		 * @return The predicted <code>y</code> value.
		 */
		public double predict(double x) {
			// SimpleRegression cannot estimate slope/intercept from fewer
			// than two observations, so fall back to a neutral score.
			if (getN() < 2) {
				return 0;
			}
			return regression.getSlope() * f(x) + regression.getIntercept();
		}

		/**
		 * Returns Pearson's product moment correlation coefficient,
		 * usually denoted as r.
		 *
		 * <p>
		 * Returns 0 if number of added observations is less than 2.
		 * </p>
		 *
		 * @return Pearson's r.
		 */
		public double getR() {
			if (getN() < 2) {
				return 0;
			}
			return regression.getR();
		}

		/**
		 * Returns the number of observations that have been added to the model.
		 *
		 * @return The number of observations that have been added.
		 */
		public long getN() {
			return regression.getN();
		}

		/**
		 * Returns <code>f(x)</code>. Must be implemented by subclasses.
		 *
		 * @param x The <code>x</code> value.
		 *
		 * @return The <code>f(x)</code> value.
		 */
		protected abstract double f(double x);
	}

	/**
	 * The rank ratio.
	 * This ratio means that
	 * if a document has a rank <code>r_s</code> in a sample,
	 * then it should have the rank <code>r_c = (r_s - 0.5) * rankRatio</code>
	 * in the original collection.
	 *
	 * <p>
	 * <b>IMPORTANT:</b> the rank ratio should be set using {@link #setRankRatio(double)}
	 * every time before running normalization.
	 * If it is not set, the ratio of 1 is used.
	 * </p>
	 *
	 * @see #setRankRatio
	 */
	private double rankRatio = 1;

	/**
	 * The ranked list of sample documents.
	 * This list is used as an additional evidence/training data for performing SAFE score normalization.
	 *
	 * <p>
	 * <b>IMPORTANT:</b> the list of sampled documents must be set for each query
	 * using {@link #setSampleDocuments(List)} before running normalization.
	 * </p>
	 *
	 * @see #setSampleDocuments(List)
	 */
	private List<ScoredEntity<U>> sampleScoredDocs = new ArrayList<ScoredEntity<U>>();

	/**
	 * Sets the rank ratio.
	 *
	 * <p>
	 * <b>IMPORTANT:</b> this method should be called
	 * every time before running normalization.
	 * By default, the ratio of 1 is used.
	 * </p>
	 *
	 * @param rankRatio The rank ratio. Should be positive.
	 *
	 * @throws IllegalArgumentException
	 *         if <code>rankRatio</code> is not positive.
	 *
	 * @see #rankRatio
	 */
	public void setRankRatio(double rankRatio) {
		if (rankRatio <= 0) {
			throw new IllegalArgumentException("The rank ratio is not positive: " + rankRatio);
		}
		this.rankRatio = rankRatio;
	}

	/**
	 * Sets the ranked list of sample documents.
	 * Sample documents' scores must be calculated by one single scoring function for a given query.
	 * Moreover, the scores must be calculated within a centralized
	 * index of documents, sampled from all sources of information/search engines.
	 *
	 * <p>
	 * In other words, the following steps must be performed
	 * in order to obtain <code>sampledDocs</code>.
	 * <ol>
	 * <li>Sample a number of documents from each
	 * source of information/search engine.</li>
	 * <li>Create a single index out of all these documents.</li>
	 * <li>Run a given query on this index.</li>
	 * <li>Wrap obtained results into a ranked list of {@link ScoredEntity} objects.</li>
	 * <li>Extract documents belonging to a particular source and
	 * pass them to this method.</li>
	 * </ol>
	 * </p>
	 *
	 * <p>
	 * <b>IMPORTANT:</b> this method must be invoked for each query
	 * before running normalization.
	 * </p>
	 *
	 * @param sampleScoredDocs The ranked list of sample documents.
	 *
	 * @throws NullPointerException
	 *         if <code>sampledDocs</code> is <code>null</code>.
	 *
	 * @see #sampleScoredDocs
	 */
	public void setSampleDocuments(List<ScoredEntity<U>> sampleScoredDocs) {
		if (sampleScoredDocs == null) {
			throw new NullPointerException("The list of sample scored documents is null.");
		}
		this.sampleScoredDocs = sampleScoredDocs;
	}

	/**
	 * <b>IMPORTANT:</b> {@link #setSampleDocuments(List)} must
	 * and {@link #setRankRatio(double)} should be invoked before performing normalization.
	 *
	 * @see ch.usi.inf.lidr.norm.ScoreNormalization#normalize(List<ScoredEntity<T>>)
	 * @see #setSampleDocuments(List)
	 * @see #setRankRatio(double)
	 */
	@Override
	public <T> List<ScoredEntity<T>> normalize(List<ScoredEntity<T>> unnormScoredDocs) {
		if (unnormScoredDocs == null) {
			throw new NullPointerException("The list of scored documents is null.");
		}

		Map<Integer, Double> rank2score = getRank2ScoreMapping(sampleScoredDocs, unnormScoredDocs);
		Regression[] regressions = getRegressions(rank2score);
		Regression hybrid = getBestFitRegression(regressions);

		// With fewer than 3 training observations the fitted transformation
		// is unreliable (2 points always fit a line exactly), so no scores
		// can be estimated and an empty result is returned.
		if (hybrid.getN() < 3) {
			return new ArrayList<ScoredEntity<T>>();
		}

		List<ScoredEntity<T>> normScoredDocs = new ArrayList<ScoredEntity<T>>(unnormScoredDocs.size());
		for (int i = 0; i < unnormScoredDocs.size(); i++) {
			// Source-specific ranks are 1-based, hence i + 1.
			normScoredDocs.add(new ScoredEntity<T>(
					unnormScoredDocs.get(i).getEntity(),
					hybrid.predict(i + 1)));
		}

		// Per-query state must not leak into the next normalization run.
		reset();

		return normScoredDocs;
	}

	/**
	 * For sample documents in <code>sampledDocs</code>
	 * calculates the correspondence between their estimated centralized ranks
	 * (based on {@link #rankRatio}) and centralized scores.
	 * If a document from <code>sampledDocs</code> appears
	 * also in <code>scoredDocs</code> then its true rank is used.
	 */
	private <T> Map<Integer, Double> getRank2ScoreMapping(List<ScoredEntity<U>> sampledDocs,
			List<ScoredEntity<T>> scoredDocs) {
		Map<Integer, Double> rank2score = new HashMap<Integer, Double>();

		Map<T, Integer> scoredDocRanks = getDoc2RankMap(scoredDocs);
		int lastSeenIndex = getOverlapDocs(sampledDocs, scoredDocRanks, rank2score);
		getNonoverlapDocs(sampledDocs, lastSeenIndex, scoredDocs.size(), rank2score);

		return rank2score;
	}

	/**
	 * For each sample document in <code>sampledDocs</code>
	 * searches for a corresponding document in the source-specific
	 * list <code>scoredDocRanks</code>.
	 * If the correspondence is found, the true document rank
	 * is mapped to a centralized score.
	 *
	 * <p>
	 * Note the following situation. Let <code>sampledDocs[i]</code>
	 * and <code>sampledDocs[j]</code> have corresponding documents
	 * in <code>scoredDocRanks</code>. Then all sample documents
	 * between <code>i</code> and <code>j</code> are skipped
	 * and are not used as training data!
	 * This is because it is impossible to estimate their source-specific ranks.
	 * </p>
	 *
	 * <p>
	 * Returns the index of the last sample document
	 * that has a correspondence in a source-specific list <code>scoredDocRanks</code>.
	 * </p>
	 *
	 * @return The index of the last overlapping sample document.
	 */
	private <T> int getOverlapDocs(List<ScoredEntity<U>> sampledDocs,
			Map<T, Integer> scoredDocRanks, Map<Integer, Double> rank2score) {
		int lastSeenIndex = -1;

		for (int i = 0; i < sampledDocs.size(); i++) {
			U document = sampledDocs.get(i).getEntity();
			// Map.containsKey/get accept Object, so the U-typed key can be
			// looked up in a Map<T, Integer> (U and T must match, see class doc).
			if (scoredDocRanks.containsKey(document)) {
				int rank = scoredDocRanks.get(document);
				double score = sampledDocs.get(i).getScore();

				lastSeenIndex = i;
				rank2score.put(rank, score);
			}
		}

		return lastSeenIndex;
	}

	/**
	 * For each sample document after <code>lastSeenIndex</code>
	 * estimates its centralized rank according to the following formula:
	 * <code>r_c = offset + (r_s - 0.5) * rankRatio</code>,
	 * where <code>r_s = i - lastSeenIndex</code> is the 1-based position
	 * of the sample document after the last overlapping one.
	 */
	private <T> void getNonoverlapDocs(List<ScoredEntity<T>> sampledDocs,
			int lastSeenIndex, int offset, Map<Integer, Double> rank2score) {
		for (int i = lastSeenIndex + 1; i < sampledDocs.size(); i++) {
			int rank = (int) (offset + (i - lastSeenIndex - 0.5) * rankRatio);
			double score = sampledDocs.get(i).getScore();
			rank2score.put(rank, score);
		}
	}

	/**
	 * Transforms <code>scoredDocs</code> into a mapping
	 * between documents and their ranks.
	 */
	private <T> Map<T, Integer> getDoc2RankMap(List<ScoredEntity<T>> scoredDocs) {
		Map<T, Integer> doc2rankMap = new HashMap<T, Integer>();

		for (int i = 0; i < scoredDocs.size(); i++) {
			// Ranks are 1-based.
			doc2rankMap.put(scoredDocs.get(i).getEntity(), i + 1);
		}

		return doc2rankMap;
	}

	/**
	 * Creates a number of regressions and fills them
	 * with data from <code>rank2score</code>.
	 */
	private Regression[] getRegressions(Map<Integer, Double> rank2score) {
		Regression[] regressions = getRegressions();

		for (Regression regression : regressions) {
			for (Map.Entry<Integer, Double> entry : rank2score.entrySet()) {
				regression.addData(entry.getKey(), entry.getValue());
			}
		}

		return regressions;
	}

	/**
	 * Returns four different regression models, namely,
	 * linear, log, square root and inverse linear.
	 */
	private Regression[] getRegressions() {
		return new Regression[] {
			// Linear: y = a * x + b.
			new Regression() {
				protected double f(double x) {
					return x;
				}
			},
			// Logarithmic: y = a * ln(x) + b.
			new Regression() {
				protected double f(double x) {
					if (x <= 0) {
						return Double.NEGATIVE_INFINITY;
					}
					return Math.log(x);
				}
			},
			// Square root: y = a * sqrt(x) + b.
			new Regression() {
				protected double f(double x) {
					if (x < 0) {
						return 0;
					}
					return Math.sqrt(x);
				}
			},
			// Inverse linear: y = a / x + b.
			new Regression() {
				protected double f(double x) {
					if (x == 0) {
						return 0;
					}
					return 1 / x;
				}
			}
		};
	}

	/**
	 * Finds a regression with the highest Pearson's r coefficient
	 * among given <code>regressions</code>.
	 */
	private Regression getBestFitRegression(Regression[] regressions) {
		assert regressions != null : "An array of regressions should not be null";
		assert regressions.length > 0 : "An array of regressions must contain at least one element";

		Regression result = regressions[0];
		for (int i = 1; i < regressions.length; i++) {
			if (result.getR() < regressions[i].getR()) {
				result = regressions[i];
			}
		}

		return result;
	}

	/**
	 * Resets {@link #rankRatio} and {@link #sampleScoredDocs}.
	 *
	 * @see #setRankRatio(double)
	 * @see #setSampleDocuments(List)
	 */
	private void reset() {
		rankRatio = 1;
		sampleScoredDocs = new ArrayList<ScoredEntity<U>>();
	}
}