com.google.walkaround.wave.server.googleimport.FindRemoteWavesProcessor.java Source code

Java tutorial

Introduction

Here is the source code for com.google.walkaround.wave.server.googleimport.FindRemoteWavesProcessor.java

Source

/*
 * Copyright 2011 Google Inc. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.walkaround.wave.server.googleimport;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Sets;
import com.google.common.primitives.Ints;
import com.google.inject.Inject;
import com.google.walkaround.proto.FindRemoteWavesTask;
import com.google.walkaround.proto.FindWaveletsForRemoteWaveTask;
import com.google.walkaround.proto.ImportSettings;
import com.google.walkaround.proto.ImportTaskPayload;
import com.google.walkaround.proto.ImportWaveletTask;
import com.google.walkaround.proto.RobotSearchDigest;
import com.google.walkaround.proto.gson.FindRemoteWavesTaskGsonImpl;
import com.google.walkaround.proto.gson.FindWaveletsForRemoteWaveTaskGsonImpl;
import com.google.walkaround.proto.gson.ImportTaskPayloadGsonImpl;
import com.google.walkaround.proto.gson.ImportWaveletTaskGsonImpl;
import com.google.walkaround.util.server.RetryHelper;
import com.google.walkaround.util.server.RetryHelper.PermanentFailure;
import com.google.walkaround.util.server.RetryHelper.RetryableFailure;
import com.google.walkaround.util.server.appengine.CheckedDatastore;
import com.google.walkaround.util.server.appengine.CheckedDatastore.CheckedTransaction;
import com.google.walkaround.util.shared.Assert;
import com.google.walkaround.wave.server.auth.StableUserId;
import com.google.walkaround.wave.server.gxp.SourceInstance;

import org.joda.time.LocalDate;
import org.waveprotocol.wave.model.id.IdUtil;
import org.waveprotocol.wave.model.id.WaveId;
import org.waveprotocol.wave.model.id.WaveletId;
import org.waveprotocol.wave.model.util.Pair;

import java.io.IOException;
import java.util.List;
import java.util.Locale;
import java.util.Random;
import java.util.Set;
import java.util.logging.Logger;

import javax.annotation.Nullable;

/**
 * Processes a {@link FindRemoteWavesTask}.
 *
 * @author ohler@google.com (Christian Ohler)
 */
public class FindRemoteWavesProcessor {

    @SuppressWarnings("unused")
    private static final Logger log = Logger.getLogger(FindRemoteWavesProcessor.class.getName());

    private final RobotApi.Factory robotApiFactory;
    private final SourceInstance.Factory sourceInstanceFactory;
    private final StableUserId userId;
    private final PerUserTable perUserTable;
    private final CheckedDatastore datastore;
    private final Random random;

    @Inject
    public FindRemoteWavesProcessor(RobotApi.Factory robotApiFactory, SourceInstance.Factory sourceInstanceFactory,
            StableUserId userId, PerUserTable perUserTable, CheckedDatastore datastore, Random random) {
        this.robotApiFactory = robotApiFactory;
        this.sourceInstanceFactory = sourceInstanceFactory;
        this.userId = userId;
        this.perUserTable = perUserTable;
        this.datastore = datastore;
        this.random = random;
    }

    // This used to be 300 but has been raised.  Some of the comments elsewhere in
    // the code probably still assume 300.
    private static final int MAX_RESULTS = 10000;

    private String getQueryDateRestriction(String facet, long dateDays) {
        LocalDate date = DaysSinceEpoch.toLocalDate(dateDays);
        return String.format("%s:%04d/%02d/%02d", facet, date.getYear(), date.getMonthOfYear(),
                date.getDayOfMonth());
    }

    private List<RobotSearchDigest> searchBetween(SourceInstance instance, long onOrAfterDays, long beforeDays)
            throws IOException {
        RobotApi api = robotApiFactory.create(instance.getApiUrl());
        String query = getQueryDateRestriction("after", onOrAfterDays)
                // The "before" search operator is inclusive (i.e., it means before the
                // end of the day); beforeDays is exclusive.
                + " " + getQueryDateRestriction("before", beforeDays - 1);
        return api.search(query, 0, MAX_RESULTS);
    }

    private long randomBetween(long min, long limit) {
        return min + random.nextInt(Ints.checkedCast(limit - min));
    }

    private List<Pair<Long, Long>> splitInterval(long onOrAfterDays, long beforeDays) {
        Preconditions.checkArgument(onOrAfterDays < beforeDays - 1,
                "Interval invalid or too small to split further: %s, %s", onOrAfterDays, beforeDays);
        // Split into roughly 5 intervals (if possible) because we want a high
        // branching factor (300*5^n reaches 1000, 10000 etc. quite a bit faster
        // than 300*2^n) and the maximum number of tasks GAE lets us add in one
        // transaction is 5.
        //
        // TreeSet for iteration order.
        Set<Long> splitPoints = Sets.newTreeSet();
        for (int i = 0; i < 4; i++) {
            // NOTE(ohler): Randomized strategy because it's simple to implement (the
            // cases where beforeDays - onOrAfterDays < 5 would require some thought
            // otherwise) and to make it unlikely that repeated runs send the same
            // queries to the googlewave.com servers, which seem to have a bug where
            // the result list is sometimes truncated for a query that has been issued
            // previously with a lower maxResults limit (perhaps some incorrect
            // caching).  Randomization means that re-running the "find waves" step
            // several times might have a greater chance to discover all waves.  But
            // I'm not positive whether this helps since I don't understand the bug.
            //
            // Other options include:
            //
            // * Instead of this interval splitting, start with "folder:3" or
            //   "before:2013/01/01" (for all waves), then do "before:<date of oldest
            //   wave returned by previous search>" until no more waves are returned.
            //   However, this relies on the assumption that truncated result lists
            //   are always truncated in such a way that only old waves are missing,
            //   not new waves.  We'd have to verify this.  Also, it's completely
            //   sequential rather than parallelizable.
            //
            // * Follow up every search for "after:A before:B" with another a search
            //   for "after:A before:<date of oldest wave returned by previous
            //   search>".  This could be a good combination of the two but relies on
            //   the same assumption and adds quite a bit more code.
            //
            // * When the user triggers the "find remote waves" task, enqueue N of
            //   them rather than just one, to cover the search space N times with
            //   different random interval splits to improve the likelihood that we
            //   find everything.  Could be good as well but adds code.
            //
            // * Add random negative search terms like -dgzimhmcoblhqfjciezc to the
            //   query that are unlikely to restrict the result set but make the query
            //   unique to avoid the poisoned caches.  Could also do many different
            //   such searches and merge the result sets.  (Can't assert that they are
            //   the same since waves may have been modified and fallen out of the
            //   date range.)  Probably worth implementing.
            //
            // * Fix the bug in googlewave.com or demonstrate that it's not
            //   reproducible.  Unlikely to happen since it's harder than any of these
            //   workarounds.
            splitPoints.add(randomBetween(onOrAfterDays + 1, beforeDays));
        }
        splitPoints.add(beforeDays);
        ImmutableList.Builder<Pair<Long, Long>> out = ImmutableList.builder();
        long left = onOrAfterDays;
        for (long right : splitPoints) {
            Assert.check(left < right, "left=%s, right=%s", left, right);
            out.add(Pair.of(left, right));
            left = right;
        }
        return out.build();
    }

    private List<ImportTaskPayload> makeTasks(SourceInstance instance, List<Pair<Long, Long>> intervals,
            @Nullable ImportSettings autoImportSettings) {
        log.info("intervals=" + intervals + ", settings=" + autoImportSettings);
        ImmutableList.Builder<ImportTaskPayload> accu = ImmutableList.builder();
        for (Pair<Long, Long> interval : intervals) {
            FindRemoteWavesTask task = new FindRemoteWavesTaskGsonImpl();
            task.setInstance(instance.serialize());
            task.setOnOrAfterDays(interval.getFirst());
            task.setBeforeDays(interval.getSecond());
            if (autoImportSettings != null) {
                task.setAutoImportSettings(autoImportSettings);
            }
            ImportTaskPayload payload = new ImportTaskPayloadGsonImpl();
            payload.setFindWavesTask(task);
            accu.add(payload);
        }
        return accu.build();
    }

    public List<ImportTaskPayload> makeRandomTasksForInterval(SourceInstance instance, long onOrAfterDays,
            long beforeDays, @Nullable ImportSettings autoImportSettings) {
        if (onOrAfterDays == beforeDays - 1) {
            return makeTasks(instance, ImmutableList.of(Pair.of(onOrAfterDays, beforeDays)), autoImportSettings);
        } else {
            return makeTasks(instance, splitInterval(onOrAfterDays, beforeDays), autoImportSettings);
        }
    }

    // Transaction limit is 500 entities but let's stay well below that.
    private static final int MAX_WAVELETS_PER_TRANSACTION = 300;

    private void storeResults(List<RemoteConvWavelet> results) throws PermanentFailure {
        for (final List<RemoteConvWavelet> partition : Iterables.partition(results, MAX_WAVELETS_PER_TRANSACTION)) {
            new RetryHelper().run(new RetryHelper.VoidBody() {
                @Override
                public void run() throws RetryableFailure, PermanentFailure {
                    CheckedTransaction tx = datastore.beginTransaction();
                    try {
                        if (perUserTable.addRemoteWavelets(tx, userId, partition)) {
                            tx.commit();
                        }
                    } finally {
                        tx.close();
                    }
                }
            });
        }
        log.info("Successfully added " + results.size() + " remote waves");
    }

    private void scheduleFindWaveletTasks(final SourceInstance instance, List<RobotSearchDigest> results,
            @Nullable final ImportSettings autoImportSettings) throws PermanentFailure {
        for (final List<RobotSearchDigest> partition : Iterables.partition(results,
                // 5 tasks per transaction.
                5)) {
            new RetryHelper().run(new RetryHelper.VoidBody() {
                @Override
                public void run() throws RetryableFailure, PermanentFailure {
                    CheckedTransaction tx = datastore.beginTransaction();
                    try {
                        for (RobotSearchDigest result : partition) {
                            FindWaveletsForRemoteWaveTask task = new FindWaveletsForRemoteWaveTaskGsonImpl();
                            task.setInstance(instance.serialize());
                            task.setWaveDigest(result);
                            if (autoImportSettings != null) {
                                task.setAutoImportSettings(autoImportSettings);
                            }
                            ImportTaskPayload payload = new ImportTaskPayloadGsonImpl();
                            payload.setFindWaveletsTask(task);
                            perUserTable.addTask(tx, userId, payload);
                        }
                        tx.commit();
                    } finally {
                        tx.close();
                    }
                }
            });
        }
        log.info("Successfully scheduled import of " + results.size() + " waves");
    }

    private void scheduleImportTasks(List<RemoteConvWavelet> results, final ImportSettings autoImportSettings)
            throws PermanentFailure {
        for (final List<RemoteConvWavelet> partition : Iterables.partition(results,
                // 5 tasks per transaction.
                5)) {
            new RetryHelper().run(new RetryHelper.VoidBody() {
                @Override
                public void run() throws RetryableFailure, PermanentFailure {
                    CheckedTransaction tx = datastore.beginTransaction();
                    try {
                        for (RemoteConvWavelet wavelet : partition) {
                            ImportWaveletTask task = new ImportWaveletTaskGsonImpl();
                            task.setInstance(wavelet.getSourceInstance().serialize());
                            task.setWaveId(wavelet.getDigest().getWaveId());
                            task.setWaveletId(wavelet.getWaveletId().serialise());
                            task.setSettings(autoImportSettings);
                            ImportTaskPayload payload = new ImportTaskPayloadGsonImpl();
                            payload.setImportWaveletTask(task);
                            perUserTable.addTask(tx, userId, payload);
                        }
                        tx.commit();
                    } finally {
                        tx.close();
                    }
                }
            });
        }
        log.info("Successfully scheduled import of " + results.size() + " waves");
    }

    private List<RemoteConvWavelet> expandPrivateReplies(SourceInstance instance, RobotSearchDigest digest)
            throws IOException {
        RobotApi api = robotApiFactory.create(instance.getApiUrl());
        ImmutableList.Builder<RemoteConvWavelet> wavelets = ImmutableList.builder();
        WaveId waveId = WaveId.deserialise(digest.getWaveId());
        // The robot API only allows access to waves with ids that start with "w".
        if (!waveId.getId().startsWith(IdUtil.WAVE_PREFIX + "+")) {
            log.info("Wave " + waveId + " not accessible through Robot API, skipping");
        } else {
            log.info("Getting wave view for " + waveId);
            List<WaveletId> waveletIds = api.getWaveView(waveId);
            log.info("Wave view for " + waveId + ": " + waveletIds);
            for (WaveletId waveletId : waveletIds) {
                if (IdUtil.isConversationalId(waveletId)) {
                    wavelets.add(new RemoteConvWavelet(instance, digest, waveletId, null, null));
                } else {
                    log.info("Skipping non-conv wavelet " + waveletId);
                }
            }
        }
        return wavelets.build();
    }

    public List<ImportTaskPayload> findWavelets(FindWaveletsForRemoteWaveTask task)
            throws IOException, PermanentFailure {
        SourceInstance instance = sourceInstanceFactory.parseUnchecked(task.getInstance());
        List<RemoteConvWavelet> wavelets = expandPrivateReplies(instance, task.getWaveDigest());
        if (wavelets.isEmpty()) {
            return ImmutableList.of();
        }
        storeResults(wavelets);
        if (task.hasAutoImportSettings()) {
            scheduleImportTasks(wavelets, task.getAutoImportSettings());
        }
        return ImmutableList.of();
    }

    public List<ImportTaskPayload> findWaves(FindRemoteWavesTask task) throws IOException, PermanentFailure {
        SourceInstance instance = sourceInstanceFactory.parseUnchecked(task.getInstance());
        long onOrAfterDays = task.getOnOrAfterDays();
        long beforeDays = task.getBeforeDays();
        List<RobotSearchDigest> results = searchBetween(instance, onOrAfterDays, beforeDays);
        log.info("Search found " + results.size() + " waves");
        if (results.isEmpty()) {
            return ImmutableList.of();
        }
        @Nullable
        ImportSettings autoImportSettings = task.hasAutoImportSettings() ? task.getAutoImportSettings() : null;
        // NOTE(ohler): Having many concurrent tasks like this that all need to write to
        // the PerUserTable leads to a lot of write contention.  We'll just have to
        // keep max-concurrent-requests low.
        scheduleFindWaveletTasks(instance, results, autoImportSettings);
        if (results.size() >= MAX_RESULTS) {
            // Result list is most likely truncated, repeat with smaller intervals.
            log.info("Got " + results.size() + " results between " + onOrAfterDays + " and " + beforeDays
                    + ", splitting");
            if (beforeDays - onOrAfterDays <= 1) {
                throw new RuntimeException("Can't split further; too many results (" + results.size() + ") between "
                        + onOrAfterDays + " and " + beforeDays);
            }
            return makeRandomTasksForInterval(instance, onOrAfterDays, beforeDays, autoImportSettings);
        } else {
            return ImmutableList.of();
        }
    }

}