Java tutorial
/* * Copyright Myrrix Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package net.myrrix.online; import java.io.BufferedReader; import java.io.Closeable; import java.io.File; import java.io.IOException; import java.io.Reader; import java.util.Arrays; import java.util.Collection; import java.util.Iterator; import java.util.List; import java.util.NoSuchElementException; import java.util.Queue; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReadWriteLock; import com.google.common.base.CharMatcher; import com.google.common.base.Preconditions; import com.google.common.base.Splitter; import com.google.common.collect.Lists; import com.google.common.net.HostAndPort; import com.google.common.util.concurrent.ThreadFactoryBuilder; import org.apache.commons.math3.util.FastMath; import org.apache.mahout.cf.taste.common.NoSuchItemException; import org.apache.mahout.cf.taste.common.NoSuchUserException; import org.apache.mahout.cf.taste.common.Refreshable; import org.apache.mahout.cf.taste.common.TasteException; import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; import org.apache.mahout.cf.taste.model.DataModel; import org.apache.mahout.cf.taste.model.IDMigrator; import 
org.apache.mahout.cf.taste.recommender.IDRescorer; import org.apache.mahout.cf.taste.recommender.RecommendedItem; import org.apache.mahout.cf.taste.recommender.Rescorer; import org.apache.mahout.common.LongPair; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import net.myrrix.common.ClassUtils; import net.myrrix.common.OneWayMigrator; import net.myrrix.common.math.Solver; import net.myrrix.common.parallel.ExecutorUtils; import net.myrrix.common.ReloadingReference; import net.myrrix.common.MutableRecommendedItem; import net.myrrix.common.collection.FastByIDFloatMap; import net.myrrix.common.collection.FastIDSet; import net.myrrix.common.io.IOUtils; import net.myrrix.common.LangUtils; import net.myrrix.common.MyrrixRecommender; import net.myrrix.common.NotReadyException; import net.myrrix.common.TopN; import net.myrrix.common.collection.FastByIDMap; import net.myrrix.online.candidate.CandidateFilter; import net.myrrix.common.math.SimpleVectorMath; import net.myrrix.online.generation.Generation; import net.myrrix.online.generation.GenerationManager; import net.myrrix.online.generation.IDCluster; /** * <p>The core implementation of {@link org.apache.mahout.cf.taste.recommender.Recommender} and furthermore * {@link MyrrixRecommender} that lies inside the Serving Layer.</p> * * <p>It is useful to note here, again, that the API methods {@link #setPreference(long, long)} * and {@link #removePreference(long, long)}, retained from Apache Mahout, have a somewhat different meaning * than in Mahout. They add to an association strength, rather than replace it. 
See the javadoc.</p> * * @author Sean Owen * @since 1.0 */ public final class ServerRecommender implements MyrrixRecommender, Closeable { private static final Logger log = LoggerFactory.getLogger(ServerRecommender.class); private static final Splitter DELIMITER = Splitter.on(CharMatcher.anyOf(",\t")).trimResults(); // Maybe expose this publicly later private static final double FOLDIN_LEARN_RATE = Double .parseDouble(System.getProperty("model.foldin.learningRate", "1.0")); // Only temporary private static final double BIG_FOLDIN_THRESHOLD = Double .parseDouble(System.getProperty("model.foldin.bigThreshold", "10000.0")); private final GenerationManager generationManager; private final int numCores; private final ReloadingReference<ExecutorService> executor; private final IDMigrator tagHasher; /** * Calls {@link #ServerRecommender(String, String, File, int, ReloadingReference)} for simple local mode, * with no bucket, instance ID 0, and no partitions (partition 0 of 1 total). * * @param localInputDir local input and model file directory */ public ServerRecommender(File localInputDir) { this(null, null, localInputDir, 0, null); } /** * @param bucket bucket that Serving Layer is using for instances * @param instanceID instance ID that the Serving Layer is serving. May be 0 for local mode. * @param localInputDir local input and model file directory * @param partition partition number in a partitioned distributed mode. 0 if not partitioned. 
* @param allPartitions reference to an object that can describe all partitions; only used to get their count */ public ServerRecommender(String bucket, String instanceID, File localInputDir, int partition, ReloadingReference<List<List<HostAndPort>>> allPartitions) { Preconditions.checkNotNull(localInputDir, "No local dir"); if (bucket == null || instanceID == null) { log.info("Creating ServerRecommender with local input dir {}", localInputDir); } else { log.info( "Creating ServerRecommender for bucket {}, instance {} and with local input dir {}, partition {}", bucket, instanceID, localInputDir, partition); } generationManager = ClassUtils.loadInstanceOf("net.myrrix.online.generation.DelegateGenerationManager", GenerationManager.class, new Class<?>[] { String.class, String.class, File.class, int.class, ReloadingReference.class }, new Object[] { bucket, instanceID, localInputDir, partition, allPartitions }); numCores = Runtime.getRuntime().availableProcessors(); executor = new ReloadingReference<ExecutorService>(new Callable<ExecutorService>() { @Override public ExecutorService call() { return Executors.newFixedThreadPool(2 * numCores, new ThreadFactoryBuilder().setDaemon(true).setNameFormat("ServerRecommender-%d").build()); } }); tagHasher = new OneWayMigrator(); } public String getBucket() { return generationManager.getBucket(); } public String getInstanceID() { return generationManager.getInstanceID(); } public GenerationManager getGenerationManager() { return generationManager; } @Deprecated @Override public void refresh(Collection<Refreshable> alreadyRefreshed) { if (alreadyRefreshed != null) { log.warn("Ignoring argument {}", alreadyRefreshed); } refresh(); } @Override public void refresh() { generationManager.refresh(); } @Override public void ingest(File file) throws TasteException { Reader reader = null; try { reader = IOUtils.openReaderMaybeDecompressing(file); ingest(reader); } catch (IOException ioe) { throw new TasteException(ioe); } finally { try { if 
(reader != null) { reader.close(); } } catch (IOException e) { // Can't happen, continue } } } @Override public void ingest(Reader reader) throws TasteException { // See also InputFilesReader BufferedReader buffered = IOUtils.buffer(reader); try { int lines = 0; int badLines = 0; String line; while ((line = buffered.readLine()) != null) { if (badLines > 100) { // Crude check throw new IOException("Too many bad lines; aborting"); } lines++; if (line.isEmpty() || line.charAt(0) == '#') { continue; } Iterator<String> it = DELIMITER.split(line).iterator(); long userID = Long.MIN_VALUE; String itemTag = null; long itemID = Long.MIN_VALUE; String userTag = null; float value; try { String userIDString = it.next(); if (userIDString.startsWith("\"")) { itemTag = userIDString.substring(1, userIDString.length() - 1); } else { userID = Long.parseLong(userIDString); } String itemIDString = it.next(); if (itemIDString.startsWith("\"")) { userTag = itemIDString.substring(1, itemIDString.length() - 1); } else { itemID = Long.parseLong(itemIDString); } if (it.hasNext()) { String valueToken = it.next(); value = valueToken.isEmpty() ? Float.NaN : LangUtils.parseFloat(valueToken); } else { value = 1.0f; } } catch (NoSuchElementException ignored) { log.warn("Ignoring line with too few columns: '{}'", line); badLines++; continue; } catch (IllegalArgumentException iae) { // includes NumberFormatException if (lines == 1) { log.info("Ignoring header line: '{}'", line); } else { log.warn("Ignoring unparseable line: '{}'", line); badLines++; } continue; } boolean remove = Float.isNaN(value); if (itemTag != null) { if (userTag != null) { log.warn("Two tags not allowed: '{}'", line); badLines++; continue; } if (!remove) { setItemTag(itemTag, itemID, value, true); } // else ignore? no support for remove tag yet } else if (userTag != null) { if (!remove) { setUserTag(userID, userTag, value, true); } // else ignore? 
no support for remove tag yet } else { if (remove) { removePreference(userID, itemID, true); } else { setPreference(userID, itemID, value, true); } } if (lines % 1000000 == 0) { log.info("Finished {} lines", lines); } } generationManager.bulkDone(); } catch (IOException ioe) { throw new TasteException(ioe); } } @Override public void close() throws IOException { generationManager.close(); ExecutorService executorService = executor.maybeGet(); if (executorService != null) { ExecutorUtils.shutdownNowAndAwait(executorService); } } /** * @throws NotReadyException if {@link GenerationManager#getCurrentGeneration()} returns null */ private Generation getCurrentGeneration() throws NotReadyException { Generation generation = generationManager.getCurrentGeneration(); if (generation == null) { throw new NotReadyException(); } return generation; } /** * Like {@link #recommend(long, int, IDRescorer)} but supplies no rescorer. */ @Override public List<RecommendedItem> recommend(long userID, int howMany) throws NoSuchUserException, NotReadyException { return recommend(userID, howMany, null); } /** * Like {@link #recommend(long, int, boolean, IDRescorer)} and specifies to not consider known items. */ @Override public List<RecommendedItem> recommend(long userID, int howMany, IDRescorer rescorer) throws NoSuchUserException, NotReadyException { return recommend(userID, howMany, false, rescorer); } /** * @param userID user for which recommendations are to be computed * @param howMany desired number of recommendations * @param considerKnownItems if true, items that the user is already associated to are candidates * for recommendation. Normally this is {@code false}. 
* @param rescorer rescoring function used to modify association strengths before ranking results * @return {@link List} of recommended {@link RecommendedItem}s, ordered from most strongly recommend to least * @throws NoSuchUserException if the user is not known in the model * @throws NotReadyException if the recommender has no model available yet */ @Override public List<RecommendedItem> recommend(long userID, int howMany, boolean considerKnownItems, IDRescorer rescorer) throws NoSuchUserException, NotReadyException { return recommendToMany(new long[] { userID }, howMany, considerKnownItems, rescorer); } @Override public List<RecommendedItem> recommendToMany(long[] userIDs, int howMany, boolean considerKnownItems, IDRescorer rescorer) throws NoSuchUserException, NotReadyException { Preconditions.checkArgument(howMany > 0, "howMany must be positive"); Generation generation = getCurrentGeneration(); FastByIDMap<float[]> X = generation.getX(); Lock xLock = generation.getXLock().readLock(); List<float[]> userFeatures = Lists.newArrayListWithCapacity(userIDs.length); xLock.lock(); try { for (long userID : userIDs) { float[] theUserFeatures = X.get(userID); if (theUserFeatures != null) { userFeatures.add(theUserFeatures); } } } finally { xLock.unlock(); } if (userFeatures.isEmpty()) { throw new NoSuchUserException(Arrays.toString(userIDs)); } FastByIDMap<FastIDSet> knownItemIDs = generation.getKnownItemIDs(); if (knownItemIDs == null && !considerKnownItems) { throw new UnsupportedOperationException("Can't ignore known items because no known items available"); } FastIDSet usersKnownItemIDs = null; if (!considerKnownItems) { Lock knownItemLock = generation.getKnownItemLock().readLock(); knownItemLock.lock(); try { for (long userID : userIDs) { FastIDSet theKnownItemIDs = knownItemIDs.get(userID); if (theKnownItemIDs == null) { continue; } if (usersKnownItemIDs == null) { usersKnownItemIDs = theKnownItemIDs; } else { LongPrimitiveIterator it = usersKnownItemIDs.iterator(); 
while (it.hasNext()) { if (!theKnownItemIDs.contains(it.nextLong())) { it.remove(); } } } if (usersKnownItemIDs.isEmpty()) { break; } } } finally { knownItemLock.unlock(); } } float[][] userFeaturesArray = userFeatures.toArray(new float[userFeatures.size()][]); Lock yLock = generation.getYLock().readLock(); yLock.lock(); try { return multithreadedTopN(userFeaturesArray, usersKnownItemIDs, generation.getUserTagIDs(), rescorer, howMany, generation.getCandidateFilter()); } finally { yLock.unlock(); } } private List<RecommendedItem> multithreadedTopN(final float[][] userFeatures, final FastIDSet userKnownItemIDs, final FastIDSet userTagIDs, final IDRescorer rescorer, final int howMany, CandidateFilter candidateFilter) { Collection<Iterator<FastByIDMap.MapEntry<float[]>>> candidateIterators = candidateFilter .getCandidateIterator(userFeatures); int numIterators = candidateIterators.size(); int parallelism = FastMath.min(numCores, numIterators); final Queue<MutableRecommendedItem> topN = TopN.initialQueue(howMany); if (parallelism > 1) { ExecutorService executorService = executor.get(); final Iterator<Iterator<FastByIDMap.MapEntry<float[]>>> candidateIteratorsIterator = candidateIterators .iterator(); Collection<Future<?>> futures = Lists.newArrayList(); for (int i = 0; i < numCores; i++) { futures.add(executorService.submit(new Callable<Void>() { @Override public Void call() { float[] queueLeastValue = { Float.NEGATIVE_INFINITY }; while (true) { Iterator<FastByIDMap.MapEntry<float[]>> candidateIterator; synchronized (candidateIteratorsIterator) { if (!candidateIteratorsIterator.hasNext()) { break; } candidateIterator = candidateIteratorsIterator.next(); } Iterator<RecommendedItem> partialIterator = new RecommendIterator(userFeatures, candidateIterator, userKnownItemIDs, userTagIDs, rescorer); TopN.selectTopNIntoQueueMultithreaded(topN, queueLeastValue, partialIterator, howMany); } return null; } })); } for (Future<?> future : futures) { try { future.get(); } catch 
(InterruptedException e) { throw new IllegalStateException(e); } catch (ExecutionException e) { throw new IllegalStateException(e.getCause()); } } } else { for (Iterator<FastByIDMap.MapEntry<float[]>> candidateIterator : candidateIterators) { Iterator<RecommendedItem> partialIterator = new RecommendIterator(userFeatures, candidateIterator, userKnownItemIDs, userTagIDs, rescorer); TopN.selectTopNIntoQueue(topN, partialIterator, howMany); } } return TopN.selectTopNFromQueue(topN, howMany); } @Override public List<RecommendedItem> recommendToAnonymous(long[] itemIDs, int howMany) throws NotReadyException, NoSuchItemException { return recommendToAnonymous(itemIDs, howMany, null); } @Override public List<RecommendedItem> recommendToAnonymous(long[] itemIDs, float[] values, int howMany) throws NotReadyException, NoSuchItemException { return recommendToAnonymous(itemIDs, values, howMany, null); } @Override public List<RecommendedItem> recommendToAnonymous(long[] itemIDs, int howMany, IDRescorer rescorer) throws NotReadyException, NoSuchItemException { return recommendToAnonymous(itemIDs, null, howMany, rescorer); } @Override public List<RecommendedItem> recommendToAnonymous(long[] itemIDs, float[] values, int howMany, IDRescorer rescorer) throws NotReadyException, NoSuchItemException { Preconditions.checkArgument(howMany > 0, "howMany must be positive"); float[] anonymousUserFeatures = buildAnonymousUserFeatures(itemIDs, values); FastIDSet userKnownItemIDs = new FastIDSet(itemIDs.length); for (long itemID : itemIDs) { userKnownItemIDs.add(itemID); } float[][] anonymousFeaturesAsArray = { anonymousUserFeatures }; Generation generation = getCurrentGeneration(); Lock yLock = generation.getYLock().readLock(); yLock.lock(); try { return multithreadedTopN(anonymousFeaturesAsArray, userKnownItemIDs, generation.getUserTagIDs(), rescorer, howMany, generation.getCandidateFilter()); } finally { yLock.unlock(); } } private float[] buildAnonymousUserFeatures(long[] itemIDs, float[] 
values) throws NotReadyException, NoSuchItemException { Preconditions.checkArgument(values == null || values.length == itemIDs.length, "Number of values doesn't match number of items"); Generation generation = getCurrentGeneration(); FastByIDMap<float[]> Y = generation.getY(); Solver ytySolver = generation.getYTYSolver(); if (ytySolver == null) { throw new NotReadyException(); } float[] anonymousUserFeatures = null; Lock yLock = generation.getYLock().readLock(); boolean anyItemIDFound = false; for (int j = 0; j < itemIDs.length; j++) { long itemID = itemIDs[j]; float[] itemFeatures; yLock.lock(); try { itemFeatures = Y.get(itemID); } finally { yLock.unlock(); } if (itemFeatures == null) { continue; } anyItemIDFound = true; double[] userFoldIn = ytySolver.solveFToD(itemFeatures); if (anonymousUserFeatures == null) { anonymousUserFeatures = new float[userFoldIn.length]; } double signedFoldInWeight = foldInWeight(0.0, values == null ? 1.0f : values[j]); if (signedFoldInWeight != 0.0) { for (int i = 0; i < anonymousUserFeatures.length; i++) { anonymousUserFeatures[i] += (float) (signedFoldInWeight * userFoldIn[i]); } } } if (!anyItemIDFound) { throw new NoSuchItemException(Arrays.toString(itemIDs)); } return anonymousUserFeatures; } @Override public List<RecommendedItem> mostPopularItems(int howMany) throws NotReadyException { return mostPopularItems(howMany, null); } @Override public List<RecommendedItem> mostPopularItems(int howMany, IDRescorer rescorer) throws NotReadyException { Preconditions.checkArgument(howMany > 0, "howMany must be positive"); Generation generation = getCurrentGeneration(); FastByIDMap<FastIDSet> knownItemIDs = generation.getKnownItemIDs(); if (knownItemIDs == null) { throw new UnsupportedOperationException(); } FastIDSet itemTagIDs = generation.getItemTagIDs(); FastByIDFloatMap itemCounts = new FastByIDFloatMap(); Lock knownItemReadLock = generation.getKnownItemLock().readLock(); knownItemReadLock.lock(); try { // Don't count data from users 
that are really item tags Lock xReadLock = generation.getXLock().readLock(); xReadLock.lock(); try { for (FastByIDMap.MapEntry<FastIDSet> entry : generation.getKnownItemIDs().entrySet()) { long userID = entry.getKey(); if (!itemTagIDs.contains(userID)) { FastIDSet itemIDs = entry.getValue(); synchronized (itemIDs) { LongPrimitiveIterator it = itemIDs.iterator(); while (it.hasNext()) { long itemID = it.nextLong(); itemCounts.increment(itemID, 1.0f); } } } } } finally { xReadLock.unlock(); } } finally { knownItemReadLock.unlock(); } // Filter out 'items' that were really user tags FastIDSet userTagIDs = generation.getUserTagIDs(); Lock yReadLock = generation.getYLock().readLock(); yReadLock.lock(); try { LongPrimitiveIterator it = itemCounts.keySetIterator(); while (it.hasNext()) { if (userTagIDs.contains(it.nextLong())) { it.remove(); } } } finally { yReadLock.unlock(); } return TopN.selectTopN(new MostPopularItemsIterator(itemCounts.entrySet().iterator(), rescorer), howMany); } /** * @param userID user ID whose preference is to be estimated * @param itemID item ID to estimate preference for * @return an estimate of the strength of the association between the user and item. These values are the * same as will be returned from {@link #recommend(long, int)}. They are opaque values and have no interpretation * other than that larger means stronger. The values are typically in the range [0,1] but are not guaranteed * to be so. Note that 0 will be returned if the user or item is not known in the data. * @throws NotReadyException if the recommender has no model available yet */ @Override public float estimatePreference(long userID, long itemID) throws NotReadyException { return estimatePreferences(userID, itemID)[0]; } @Override public float[] estimatePreferences(long userID, long... 
itemIDs) throws NotReadyException { Generation generation = getCurrentGeneration(); FastByIDMap<float[]> X = generation.getX(); float[] userFeatures; Lock xLock = generation.getXLock().readLock(); xLock.lock(); try { userFeatures = X.get(userID); } finally { xLock.unlock(); } if (userFeatures == null) { return new float[itemIDs.length]; // All 0.0f } FastByIDMap<float[]> Y = generation.getY(); Lock yLock = generation.getYLock().readLock(); yLock.lock(); try { float[] result = new float[itemIDs.length]; for (int i = 0; i < itemIDs.length; i++) { long itemID = itemIDs[i]; float[] itemFeatures = Y.get(itemID); if (itemFeatures != null) { float value = (float) SimpleVectorMath.dot(itemFeatures, userFeatures); Preconditions.checkState(LangUtils.isFinite(value), "Bad estimate"); result[i] = value; } // else leave value at 0.0f } return result; } finally { yLock.unlock(); } } @Override public float estimateForAnonymous(long toItemID, long[] itemIDs) throws NotReadyException, NoSuchItemException { return estimateForAnonymous(toItemID, itemIDs, null); } @Override public float estimateForAnonymous(long toItemID, long[] itemIDs, float[] values) throws NotReadyException, NoSuchItemException { Generation generation = getCurrentGeneration(); FastByIDMap<float[]> Y = generation.getY(); Lock yLock = generation.getYLock().readLock(); float[] toItemFeatures; yLock.lock(); try { toItemFeatures = Y.get(toItemID); } finally { yLock.unlock(); } if (toItemFeatures == null) { throw new NoSuchItemException(toItemID); } float[] anonymousUserFeatures = buildAnonymousUserFeatures(itemIDs, values); return (float) SimpleVectorMath.dot(anonymousUserFeatures, toItemFeatures); } /** * Calls {@link #setPreference(long, long, float)} with value 1.0. 
*/ @Override public void setPreference(long userID, long itemID) { setPreference(userID, itemID, 1.0f); } @Override public void setPreference(long userID, long itemID, float value) { setPreference(userID, itemID, value, false); } public void setPreference(long userID, long itemID, float value, boolean bulk) { // Record datum try { generationManager.append(userID, itemID, value, bulk); } catch (IOException ioe) { log.warn("Could not append datum; continuing", ioe); } Generation generation; try { generation = getCurrentGeneration(); } catch (NotReadyException nre) { // Corner case -- no model ready so all we can do is record (above). Don't fail the request. return; } float[] userFeatures = getFeatures(userID, generation.getX(), generation.getXLock()); boolean newItem; Lock yReadLock = generation.getYLock().readLock(); yReadLock.lock(); try { newItem = generation.getY().get(itemID) == null; } finally { yReadLock.unlock(); } if (newItem) { generation.getCandidateFilter().addItem(itemID); } float[] itemFeatures = getFeatures(itemID, generation.getY(), generation.getYLock()); updateFeatures(userFeatures, itemFeatures, value, generation); FastByIDMap<FastIDSet> knownItemIDs = generation.getKnownItemIDs(); if (knownItemIDs != null) { FastIDSet userKnownItemIDs; ReadWriteLock knownItemLock = generation.getKnownItemLock(); Lock knownItemReadLock = knownItemLock.readLock(); knownItemReadLock.lock(); try { userKnownItemIDs = knownItemIDs.get(userID); if (userKnownItemIDs == null) { userKnownItemIDs = new FastIDSet(); Lock knownItemWriteLock = knownItemLock.writeLock(); knownItemReadLock.unlock(); knownItemWriteLock.lock(); try { knownItemIDs.put(userID, userKnownItemIDs); } finally { knownItemReadLock.lock(); knownItemWriteLock.unlock(); } } } finally { knownItemReadLock.unlock(); } synchronized (userKnownItemIDs) { userKnownItemIDs.add(itemID); } } updateClusters(userID, userFeatures, generation.getUserClusters(), generation.getUserClustersLock().readLock()); 
updateClusters(itemID, itemFeatures, generation.getItemClusters(), generation.getItemClustersLock().readLock()); } private static float[] getFeatures(long id, FastByIDMap<float[]> matrix, ReadWriteLock lock) { float[] features; Lock readLock = lock.readLock(); readLock.lock(); try { features = matrix.get(id); if (features == null) { int numFeatures = countFeatures(matrix); if (numFeatures > 0) { features = new float[numFeatures]; Lock writeLock = lock.writeLock(); readLock.unlock(); writeLock.lock(); try { matrix.put(id, features); } finally { readLock.lock(); writeLock.unlock(); } } } } finally { readLock.unlock(); } return features; } private static void updateFeatures(float[] userFeatures, float[] itemFeatures, float value, Generation generation) { if (userFeatures == null || itemFeatures == null) { return; } double signedFoldInWeight = foldInWeight(SimpleVectorMath.dot(userFeatures, itemFeatures), value); if (signedFoldInWeight == 0.0) { return; } // Here, we are using userFeatures, which is a row of X, as if it were a column of X'. // This is multiplied on the left by (X'*X)^-1. That's our left-inverse of X or at least the one // column we need. Which is what the new data point is multiplied on the left by. The result is a column; // we scale to complete the multiplication of the fold-in and add it in. Solver xtxSolver = generation.getXTXSolver(); double[] itemFoldIn = xtxSolver == null ? null : xtxSolver.solveFToD(userFeatures); // Same, but reversed. Multiply itemFeatures, which is a row of Y, on the right by (Y'*Y)^-1. // This is the right-inverse for Y', or at least the row we need. Because of the symmetries we can use // the same method above to carry out the multiply; the result is conceptually a row vector. // The result is scaled and added in. Solver ytySolver = generation.getYTYSolver(); double[] userFoldIn = ytySolver == null ? 
null : ytySolver.solveFToD(itemFeatures); if (itemFoldIn != null) { if (SimpleVectorMath.norm(userFoldIn) > BIG_FOLDIN_THRESHOLD) { log.warn("Item fold in vector is large; reduce -Dmodel.features?"); } for (int i = 0; i < itemFeatures.length; i++) { double delta = signedFoldInWeight * itemFoldIn[i]; Preconditions.checkState(LangUtils.isFinite(delta)); itemFeatures[i] += (float) delta; } } if (userFoldIn != null) { if (SimpleVectorMath.norm(userFoldIn) > BIG_FOLDIN_THRESHOLD) { log.warn("User fold in vector is large; reduce -Dmodel.features?"); } for (int i = 0; i < userFeatures.length; i++) { double delta = signedFoldInWeight * userFoldIn[i]; Preconditions.checkState(LangUtils.isFinite(delta)); userFeatures[i] += (float) delta; } } } private static void updateClusters(long id, float[] featureVector, Collection<IDCluster> clusters, Lock clustersReadLock) { if (featureVector == null || clusters == null || clusters.isEmpty()) { return; } IDCluster closestCentroid; clustersReadLock.lock(); try { closestCentroid = findClosestCentroid(featureVector, clusters); } finally { clustersReadLock.unlock(); } if (closestCentroid == null) { return; } FastIDSet newMembers = closestCentroid.getMembers(); boolean removeFromCurrentCluster; synchronized (newMembers) { // Wasn't already present, so was present elsewhere; find and remove it removeFromCurrentCluster = newMembers.add(id); } if (removeFromCurrentCluster) { clustersReadLock.lock(); try { for (IDCluster cluster : clusters) { FastIDSet oldMembers = cluster.getMembers(); synchronized (oldMembers) { if (oldMembers.remove(id)) { break; } } } } finally { clustersReadLock.unlock(); } } } private static IDCluster findClosestCentroid(float[] vector, Iterable<IDCluster> clusters) { double vectorNorm = SimpleVectorMath.norm(vector); IDCluster closestCentroid = null; double highestDot = Double.NEGATIVE_INFINITY; for (IDCluster cluster : clusters) { double dot = SimpleVectorMath.dot(cluster.getCentroid(), vector) / 
cluster.getCentroidNorm() / vectorNorm; if (LangUtils.isFinite(dot) && dot > highestDot) { highestDot = dot; closestCentroid = cluster; } } return closestCentroid; } private static int countFeatures(FastByIDMap<float[]> M) { // assumes the read lock is held return M.isEmpty() ? 0 : M.entrySet().iterator().next().getValue().length; } /** * This function decides how much of a folded-in user or item vector should be added to a target item or user * vector, respectively, on a new action. The idea is that a positive value should push the current value towards * 1, but not further, and a negative value should push towards 0, but not further. How much to move should be * mostly proportional to the size of the value. 0 should move the result not at all; 2 ought to move twice as * much as 1, etc. This isn't quite possible but can be approximated by moving a fraction 1-1/(1+value) of the * distance towards 1, or 0. */ private static double foldInWeight(double estimate, float value) { Preconditions.checkState(LangUtils.isFinite(estimate)); double signedFoldInWeight; if (value > 0.0f && estimate < 1.0) { double multiplier = 1.0 - FastMath.max(0.0, estimate); signedFoldInWeight = (1.0 - 1.0 / (1.0 + value)) * multiplier; } else if (value < 0.0f && estimate > 0.0) { double multiplier = -FastMath.min(1.0, estimate); signedFoldInWeight = (1.0 - 1.0 / (1.0 - value)) * multiplier; } else { signedFoldInWeight = 0.0; } return FOLDIN_LEARN_RATE * signedFoldInWeight; } @Override public void removePreference(long userID, long itemID) { removePreference(userID, itemID, false); } private void removePreference(long userID, long itemID, boolean bulk) { // Record datum try { generationManager.remove(userID, itemID, bulk); } catch (IOException ioe) { log.warn("Could not append datum; continuing", ioe); } Generation generation; try { generation = getCurrentGeneration(); } catch (NotReadyException nre) { // Corner case -- no model ready so all we can do is record (above). Don't fail the request. 
return; } ReadWriteLock knownItemLock = generation.getKnownItemLock(); boolean removeUser = false; FastByIDMap<FastIDSet> knownItemIDs = generation.getKnownItemIDs(); if (knownItemIDs != null) { Lock knownItemReadLock = knownItemLock.readLock(); FastIDSet userKnownItemIDs; knownItemReadLock.lock(); try { userKnownItemIDs = knownItemIDs.get(userID); } finally { knownItemReadLock.unlock(); } if (userKnownItemIDs == null) { // Doesn't exist? So ignore this request return; } synchronized (userKnownItemIDs) { if (!userKnownItemIDs.remove(itemID)) { // Item unknown, so ignore this request return; } removeUser = userKnownItemIDs.isEmpty(); } } // We can proceed with the request FastByIDMap<float[]> X = generation.getX(); ReadWriteLock xLock = generation.getXLock(); if (removeUser) { Lock knownItemWriteLock = knownItemLock.writeLock(); knownItemWriteLock.lock(); try { knownItemIDs.remove(userID); } finally { knownItemWriteLock.unlock(); } Lock xWriteLock = xLock.writeLock(); xWriteLock.lock(); try { X.remove(userID); } finally { xWriteLock.unlock(); } } } @Override public void setUserTag(long userID, String tag) { setUserTag(userID, tag, 1.0f); } @Override public void setUserTag(long userID, String tag, float value) { setUserTag(userID, tag, value, false); } public void setUserTag(long userID, String tag, float value, boolean bulk) { Preconditions.checkNotNull(tag); Preconditions.checkArgument(!tag.isEmpty()); // Record datum try { generationManager.appendUserTag(userID, tag, value, bulk); } catch (IOException ioe) { log.warn("Could not append datum; continuing", ioe); } Generation generation; try { generation = getCurrentGeneration(); } catch (NotReadyException nre) { // Corner case -- no model ready so all we can do is record (above). Don't fail the request. 
return; } long tagID = tagHasher.toLongID(tag); FastIDSet userTagIDs = generation.getUserTagIDs(); Lock userTagWriteLock = generation.getYLock().writeLock(); userTagWriteLock.lock(); try { userTagIDs.add(tagID); } finally { userTagWriteLock.unlock(); } float[] userFeatures = getFeatures(userID, generation.getX(), generation.getXLock()); float[] tagFeatures = getFeatures(tagID, generation.getY(), generation.getYLock()); updateFeatures(userFeatures, tagFeatures, value, generation); updateClusters(userID, userFeatures, generation.getUserClusters(), generation.getUserClustersLock().readLock()); } @Override public void setItemTag(String tag, long itemID) { setItemTag(tag, itemID, 1.0f); } @Override public void setItemTag(String tag, long itemID, float value) { setItemTag(tag, itemID, value, false); } public void setItemTag(String tag, long itemID, float value, boolean bulk) { Preconditions.checkNotNull(tag); Preconditions.checkArgument(!tag.isEmpty()); // Record datum try { generationManager.appendItemTag(tag, itemID, value, bulk); } catch (IOException ioe) { log.warn("Could not append datum; continuing", ioe); } Generation generation; try { generation = getCurrentGeneration(); } catch (NotReadyException nre) { // Corner case -- no model ready so all we can do is record (above). Don't fail the request. return; } long tagID = tagHasher.toLongID(tag); FastIDSet itemTagIDs = generation.getItemTagIDs(); Lock itemTagWriteLock = generation.getXLock().writeLock(); itemTagWriteLock.lock(); try { itemTagIDs.add(tagID); } finally { itemTagWriteLock.unlock(); } float[] tagFeatures = getFeatures(tagID, generation.getX(), generation.getXLock()); float[] itemFeatures = getFeatures(itemID, generation.getY(), generation.getYLock()); updateFeatures(tagFeatures, itemFeatures, value, generation); updateClusters(itemID, itemFeatures, generation.getItemClusters(), generation.getItemClustersLock().readLock()); } /** * One-argument version of {@link #mostSimilarItems(long[], int)}. 
*/
  @Override
  public List<RecommendedItem> mostSimilarItems(long itemID, int howMany)
      throws NoSuchItemException, NotReadyException {
    return mostSimilarItems(itemID, howMany, null);
  }

  /**
   * One-argument version of {@link #mostSimilarItems(long[], int, Rescorer)}.
   */
  @Override
  public List<RecommendedItem> mostSimilarItems(long itemID, int howMany, Rescorer<LongPair> rescorer)
      throws NoSuchItemException, NotReadyException {

    Preconditions.checkArgument(howMany > 0, "howMany must be positive");

    Generation model = getCurrentGeneration();
    FastByIDMap<float[]> itemVectors = model.getY();

    // Hold the Y read lock for the whole scan, since the iterator walks Y's entries
    Lock readLock = model.getYLock().readLock();
    readLock.lock();
    try {

      float[] targetVector = itemVectors.get(itemID);
      if (targetVector == null) {
        throw new NoSuchItemException(itemID);
      }

      long[] targetIDs = { itemID };
      float[][] targetVectors = { targetVector };
      MostSimilarItemIterator candidates = new MostSimilarItemIterator(itemVectors.entrySet().iterator(),
                                                                       model.getUserTagIDs(),
                                                                       targetIDs,
                                                                       targetVectors,
                                                                       rescorer);
      return TopN.selectTopN(candidates, howMany);

    } finally {
      readLock.unlock();
    }
  }

  /**
   * Like {@link #mostSimilarItems(long[], int, Rescorer)} but uses no rescorer.
   */
  @Override
  public List<RecommendedItem> mostSimilarItems(long[] itemIDs, int howMany)
      throws NoSuchItemException, NotReadyException {
    return mostSimilarItems(itemIDs, howMany, null);
  }

  /**
   * Computes items most similar to an item or items. The returned items have the highest average similarity
   * to the given items.
   *
   * @param itemIDs items for which most similar items are required
   * @param howMany maximum number of similar items to return; fewer may be returned
   * @param rescorer rescoring function used to modify item-item similarities before ranking results
   * @return {@link RecommendedItem}s representing the most similar items, ordered by similarity,
   *  descending. The score associated to it is an opaque value. Larger means more similar, but no further
   *  interpretation may necessarily be applied.
* @throws NoSuchItemException if none of the given items is known in the model; items that are
   *  individually unknown are skipped as long as at least one is known
   * @throws NotReadyException if the recommender has no model available yet
   */
  @Override
  public List<RecommendedItem> mostSimilarItems(long[] itemIDs, int howMany, Rescorer<LongPair> rescorer)
      throws NoSuchItemException, NotReadyException {

    Preconditions.checkArgument(howMany > 0, "howMany must be positive");

    Generation generation = getCurrentGeneration();
    FastByIDMap<float[]> Y = generation.getY();

    Lock yLock = generation.getYLock().readLock();
    yLock.lock();
    try {

      // Collect IDs and feature vectors side by side so the two arrays handed to the iterator stay
      // index-aligned. Previously the full itemIDs array was passed next to a features array that
      // omitted unknown items, so the arrays could differ in length whenever some IDs were unknown.
      long[] knownItemIDs = new long[itemIDs.length];
      float[][] knownItemFeatures = new float[itemIDs.length][];
      int numKnown = 0;
      for (long itemID : itemIDs) {
        float[] features = Y.get(itemID);
        if (features != null) {
          knownItemIDs[numKnown] = itemID;
          knownItemFeatures[numKnown] = features;
          numKnown++;
        }
      }
      if (numKnown == 0) {
        throw new NoSuchItemException(Arrays.toString(itemIDs));
      }
      if (numKnown < itemIDs.length) {
        // Trim the unused tail left by unknown items
        knownItemIDs = Arrays.copyOf(knownItemIDs, numKnown);
        knownItemFeatures = Arrays.copyOf(knownItemFeatures, numKnown);
      }

      return TopN.selectTopN(new MostSimilarItemIterator(Y.entrySet().iterator(),
                                                         generation.getUserTagIDs(),
                                                         knownItemIDs,
                                                         knownItemFeatures,
                                                         rescorer),
                             howMany);

    } finally {
      yLock.unlock();
    }
  }

  @Override
  public float[] similarityToItem(long toItemID, long...
itemIDs) throws TasteException { Generation generation = getCurrentGeneration(); FastByIDMap<float[]> Y = generation.getY(); float[] similarities = new float[itemIDs.length]; Lock yLock = generation.getYLock().readLock(); yLock.lock(); try { float[] toFeatures = Y.get(toItemID); if (toFeatures == null) { throw new NoSuchItemException(toItemID); } double toFeaturesNorm = SimpleVectorMath.norm(toFeatures); boolean anyFound = false; for (int i = 0; i < similarities.length; i++) { float[] features = Y.get(itemIDs[i]); if (features == null) { similarities[i] = Float.NaN; } else { anyFound = true; double featuresNorm = SimpleVectorMath.norm(features); similarities[i] = (float) (SimpleVectorMath.dot(features, toFeatures) / (featuresNorm * toFeaturesNorm)); } } if (!anyFound) { throw new NoSuchItemException(Arrays.toString(itemIDs)); } } finally { yLock.unlock(); } return similarities; } /** * <p>Lists the items that were most influential in recommending a given item to a given user. Exactly how this * is determined is left to the implementation, but, generally this will return items that the user prefers * and that are similar to the given item.</p> * * <p>These values by which the results are ordered are opaque values and have no interpretation * other than that larger means stronger.</p> * * @param userID ID of user who was recommended the item * @param itemID ID of item that was recommended * @param howMany maximum number of items to return * @return {@link List} of {@link RecommendedItem}, ordered from most influential in recommended the given * item to least * @throws NoSuchUserException if the user is not known in the model * @throws NoSuchItemException if the item is not known in the model * @throws NotReadyException if the recommender has no model available yet */ @Override public List<RecommendedItem> recommendedBecause(long userID, long itemID, int howMany) throws NoSuchUserException, NoSuchItemException, NotReadyException { Preconditions.checkArgument(howMany > 
0, "howMany must be positive"); Generation generation = getCurrentGeneration(); FastByIDMap<FastIDSet> knownItemIDs = generation.getKnownItemIDs(); if (knownItemIDs == null) { throw new UnsupportedOperationException("No known item IDs available"); } Lock knownItemLock = generation.getKnownItemLock().readLock(); FastIDSet userKnownItemIDs; knownItemLock.lock(); try { userKnownItemIDs = knownItemIDs.get(userID); } finally { knownItemLock.unlock(); } if (userKnownItemIDs == null) { throw new NoSuchUserException(userID); } FastByIDMap<float[]> Y = generation.getY(); Lock yLock = generation.getYLock().readLock(); yLock.lock(); try { float[] features = Y.get(itemID); if (features == null) { throw new NoSuchItemException(itemID); } FastByIDMap<float[]> toFeatures; synchronized (userKnownItemIDs) { toFeatures = new FastByIDMap<float[]>(userKnownItemIDs.size()); LongPrimitiveIterator it = userKnownItemIDs.iterator(); while (it.hasNext()) { long fromItemID = it.nextLong(); float[] fromFeatures = Y.get(fromItemID); toFeatures.put(fromItemID, fromFeatures); } } return TopN.selectTopN(new RecommendedBecauseIterator(toFeatures.entrySet().iterator(), generation.getUserTagIDs(), features), howMany); } finally { yLock.unlock(); } } @Override public boolean isReady() { try { getCurrentGeneration(); return true; } catch (NotReadyException ignored) { return false; } } @Override public void await() throws InterruptedException { while (!isReady()) { Thread.sleep(1000L); } } @Override public boolean await(long time, TimeUnit unit) throws InterruptedException { Preconditions.checkArgument(time >= 0L, "time must be positive: {}", time); Preconditions.checkNotNull(unit); long waitForMS = TimeUnit.MILLISECONDS.convert(time, unit); long waitIntervalMS = FastMath.min(1000L, waitForMS); long waitUntil = System.currentTimeMillis() + waitForMS; while (!isReady()) { if (System.currentTimeMillis() > waitUntil) { return false; } Thread.sleep(waitIntervalMS); } return true; } @Override public 
FastIDSet getAllUserIDs() throws NotReadyException {
    Generation model = getCurrentGeneration();
    // Keys of X that are item-tag IDs are filtered out by the helper
    return getIDsFromKeys(model.getX(), model.getXLock().readLock(), model.getItemTagIDs());
  }

  /**
   * @return IDs of all keys in the current generation's Y, excluding those in its user-tag ID set
   * @throws NotReadyException if the recommender has no model available yet
   */
  @Override
  public FastIDSet getAllItemIDs() throws NotReadyException {
    Generation model = getCurrentGeneration();
    // Keys of Y that are user-tag IDs are filtered out by the helper
    return getIDsFromKeys(model.getY(), model.getYLock().readLock(), model.getUserTagIDs());
  }

  /**
   * Copies the keys of {@code map} into a new set while holding {@code readLock}, leaving out any key
   * that is contained in {@code tagIDs}.
   *
   * @param map map whose keys are collected
   * @param readLock lock held for the duration of the scan
   * @param tagIDs IDs to exclude from the result
   * @return a new set of the map's keys that are not in {@code tagIDs}
   */
  private static FastIDSet getIDsFromKeys(FastByIDMap<float[]> map, Lock readLock, FastIDSet tagIDs) {
    readLock.lock();
    try {
      FastIDSet result = new FastIDSet(map.size());
      for (LongPrimitiveIterator keys = map.keySetIterator(); keys.hasNext(); ) {
        long key = keys.nextLong();
        if (tagIDs.contains(key)) {
          continue;
        }
        result.add(key);
      }
      return result;
    } finally {
      readLock.unlock();
    }
  }

  /**
   * @return number of user clusters in the current generation
   * @throws NotReadyException if the recommender has no model available yet
   * @throws UnsupportedOperationException if the generation has no user clusters
   */
  @Override
  public int getNumUserClusters() throws NotReadyException {
    Generation model = getCurrentGeneration();
    Collection<IDCluster> userClusters = model.getUserClusters();
    boolean hasClusters = userClusters != null && !userClusters.isEmpty();
    if (!hasClusters) {
      throw new UnsupportedOperationException();
    }
    Lock readLock = model.getUserClustersLock().readLock();
    readLock.lock();
    try {
      return userClusters.size();
    } finally {
      readLock.unlock();
    }
  }

  /**
   * @return number of item clusters in the current generation
   * @throws NotReadyException if the recommender has no model available yet
   * @throws UnsupportedOperationException if the generation has no item clusters
   */
  @Override
  public int getNumItemClusters() throws NotReadyException {
    Generation model = getCurrentGeneration();
    Collection<IDCluster> itemClusters = model.getItemClusters();
    boolean hasClusters = itemClusters != null && !itemClusters.isEmpty();
    if (!hasClusters) {
      throw new UnsupportedOperationException();
    }
    Lock readLock = model.getItemClustersLock().readLock();
    readLock.lock();
    try {
      return itemClusters.size();
    } finally {
      readLock.unlock();
    }
  }

  /**
   * @param n index of the user cluster to retrieve
   * @return a copy of the member IDs of the {@code n}-th user cluster
   * @throws NotReadyException if the recommender has no model available yet
   * @throws UnsupportedOperationException if the generation has no user clusters
   */
  @Override
  public FastIDSet getUserCluster(int n) throws NotReadyException {
    Generation generation = getCurrentGeneration();
    List<IDCluster> clusters = generation.getUserClusters();
    if (clusters == null || clusters.isEmpty()) {
      throw new UnsupportedOperationException();
    }
    Lock lock = generation.getUserClustersLock().readLock();
    FastIDSet members;
    lock.lock();
    try {
      members = clusters.get(n).getMembers();
    }
finally { lock.unlock(); } synchronized (members) { return members.clone(); } } @Override public FastIDSet getItemCluster(int n) throws NotReadyException { Generation generation = getCurrentGeneration(); List<IDCluster> clusters = generation.getItemClusters(); if (clusters == null || clusters.isEmpty()) { throw new UnsupportedOperationException(); } Lock lock = generation.getItemClustersLock().readLock(); FastIDSet members; lock.lock(); try { members = clusters.get(n).getMembers(); } finally { lock.unlock(); } synchronized (members) { return members.clone(); } } /** * @throws UnsupportedOperationException * @deprecated do not call */ @Deprecated @Override public DataModel getDataModel() { throw new UnsupportedOperationException(); } /** * {@code excludeItemIfNotSimilarToAll} is not applicable in this implementation. * * @return {@link #mostSimilarItems(long[], int)} if excludeItemIfNotSimilarToAll is false * @throws UnsupportedOperationException otherwise * @deprecated use {@link #mostSimilarItems(long[], int)} instead */ @Deprecated @Override public List<RecommendedItem> mostSimilarItems(long[] itemIDs, int howMany, boolean excludeItemIfNotSimilarToAll) throws NoSuchItemException, NotReadyException { if (excludeItemIfNotSimilarToAll) { throw new UnsupportedOperationException(); } return mostSimilarItems(itemIDs, howMany); } /** * {@code excludeItemIfNotSimilarToAll} is not applicable in this implementation. 
*
   * @param itemIDs items for which most similar items are required
   * @param howMany maximum number of similar items to return
   * @param rescorer rescoring function passed on to {@link #mostSimilarItems(long[], int, Rescorer)}
   * @param excludeItemIfNotSimilarToAll must be false; true is not supported
   * @return {@link #mostSimilarItems(long[], int, Rescorer)} if excludeItemIfNotSimilarToAll is false
   * @throws UnsupportedOperationException otherwise
   * @deprecated use {@link #mostSimilarItems(long[], int, Rescorer)} instead
   */
  @Deprecated
  @Override
  public List<RecommendedItem> mostSimilarItems(long[] itemIDs,
                                                int howMany,
                                                Rescorer<LongPair> rescorer,
                                                boolean excludeItemIfNotSimilarToAll)
      throws NoSuchItemException, NotReadyException {
    if (excludeItemIfNotSimilarToAll) {
      throw new UnsupportedOperationException();
    }
    // Forward the rescorer; delegating to the two-argument overload silently discarded it
    return mostSimilarItems(itemIDs, howMany, rescorer);
  }

}