com.crosstreelabs.cognitio.service.mongo.MongoCatalogueService.java Source code

Java tutorial

Introduction

Here is the source code for com.crosstreelabs.cognitio.service.mongo.MongoCatalogueService.java

Source

/*
 * Copyright 2015 Crosstree Labs.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.crosstreelabs.cognitio.service.mongo;

import com.crosstreelabs.cognitio.api.resource.CatalogueEntry;
import com.crosstreelabs.cognitio.api.resource.Host;
import com.crosstreelabs.cognitio.api.resource.Relocated;
import com.crosstreelabs.cognitio.api.resource.Status;
import com.crosstreelabs.cognitio.api.service.CatalogueService;
import com.crosstreelabs.cognitio.api.service.HostService;
import com.google.common.base.Function;
import com.google.common.collect.Collections2;
import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;
import com.mongodb.MongoClient;
import com.mongodb.WriteConcern;
import com.mongodb.WriteResult;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.ConcurrentLinkedQueue;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
import org.bson.types.ObjectId;
import org.joda.time.DateTime;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class MongoCatalogueService implements CatalogueService {
    private static final Logger LOGGER = LoggerFactory.getLogger(MongoCatalogueService.class);
    private final MongoClient client;
    private final HostService hostService;
    private final DB db;
    private final DBCollection catalogue;
    private final Queue<CatalogueEntry> workPool = new ConcurrentLinkedQueue<>();

    public MongoCatalogueService(final MongoClient client, final HostService hostService) {
        this.client = client;
        this.hostService = hostService;
        db = client.getDB("cognitio");
        if (!db.collectionExists("catalogue")) {
            catalogue = db.createCollection("catalogue", null);
            catalogue.createIndex(new BasicDBObject("location_hash", 1), new BasicDBObject("unique", true));
        } else {
            catalogue = db.getCollection("catalogue");
        }
    }

    @Override
    public CatalogueEntry findForLocation(final String location) {
        DBObject result = catalogue.findOne(new BasicDBObject("location", location));
        if (result == null) {
            return null;
        }
        return fromMongoObject(result);
    }

    @Override
    public List<CatalogueEntry> findAllLocations(final String... locations) {
        DBCursor cursor = catalogue.find(new BasicDBObject("location", new BasicDBObject("$in", locations)));
        List<CatalogueEntry> entries = new ArrayList<>();
        for (DBObject obj : cursor) {
            entries.add(fromMongoObject(obj));
        }
        return entries;
    }

    @Override
    public void save(final CatalogueEntry entry) {
        if (entry == null) {
            throw new IllegalArgumentException("Entry cannot be null");
        }

        // Save the host
        if (entry.host.id != null) {
            hostService.save(entry.host);
        } else {
            // Attempt to find and update a host with the corresponding domain
            Host host = hostService.findByDomain(entry.host.host);
            if (host == null) {
                hostService.save(entry.host);
            } else {
                host.lastRequest = entry.host.lastRequest;
                host.robotsTxt = entry.host.robotsTxt;
                hostService.save(host);
                entry.host = host;
            }
        }

        // Save the entry
        if (entry.id == null) {
            DBObject obj = toMongoObject(entry);
            WriteResult result = catalogue.insert(obj, WriteConcern.JOURNALED);
            entry.id = obj.get("_id").toString();
        } else {
            catalogue.update(new BasicDBObject("_id", new ObjectId(entry.id)), toMongoObject(entry), false, false,
                    WriteConcern.JOURNALED);
        }
    }

    @Override
    public synchronized CatalogueEntry getWorkEntry() {
        if (!workPool.isEmpty()) {
            return workPool.remove();
        }

        // Find hosts that have been recently accessed
        // XXX The delay time needs to be configurable
        int hostDelaySeconds = 45;
        Collection<Host> recentHosts = hostService.recentlyRequested(hostDelaySeconds);
        Set<ObjectId> excludedHostIds = new HashSet<>();
        for (Host host : recentHosts) {
            excludedHostIds.add(new ObjectId(host.id));
        }
        // Find hosts that are currently queued
        List<ObjectId> queuedHosts = catalogue.distinct("host", new BasicDBObject("status", "QUEUED"));
        for (ObjectId id : queuedHosts) {
            excludedHostIds.add(id);
        }

        // Exclude recently visited hosts
        BasicDBObject excludedHosts = new BasicDBObject("host",
                new BasicDBObject("$nin", excludedHostIds.toArray(new ObjectId[0])));
        // Retreive pending (un-indexed) resources
        BasicDBObject unindexed = new BasicDBObject("status", "PENDING");
        // Reindex resources older than 7 days (XXX Make configurable/calculated
        BasicDBObject staleResources = new BasicDBObject("$and",
                Arrays.asList(new BasicDBObject("last_visit", new BasicDBObject("$exists", true)),
                        new BasicDBObject("last_visit",
                                new BasicDBObject("$lte", new DateTime().minusDays(7).toDate())),
                        new BasicDBObject("status", "INDEXED")));
        // Now put it all together
        BasicDBObject and = new BasicDBObject("$and",
                Arrays.asList(excludedHosts, new BasicDBObject("$or", Arrays.asList(unindexed, staleResources))));
        DBObject match = new BasicDBObject("$match", and);
        DBObject sort = new BasicDBObject("$sort", new BasicDBObject("last_updated", 1));
        DBObject group = new BasicDBObject("$group",
                new BasicDBObject("_id", "$host").append("first", new BasicDBObject("$first", "$$ROOT")));
        List<DBObject> pipeline = Arrays.asList(match, group, sort);
        LOGGER.debug("PIPELINE: {}", pipeline);
        Iterable<DBObject> refs = catalogue.aggregate(pipeline).results();

        for (DBObject obj : refs) {
            workPool.add(fromMongoObject((DBObject) obj.get("first")));
        }
        LOGGER.debug("WORK POOL: {}", workPool);

        // Update the objects in mongo to queued state
        BasicDBList ids = new BasicDBList();
        ids.addAll(listIds(workPool));
        catalogue.update(new BasicDBObject("_id", new BasicDBObject("$in", ids)),
                new BasicDBObject("$set", new BasicDBObject("status", "QUEUED")), false, true,
                WriteConcern.JOURNALED);

        if (workPool.isEmpty()) {
            return null;
        }
        return workPool.remove();
    }

    //~ Conversion ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    protected DBObject toMongoObject(final CatalogueEntry entry) {
        if (entry == null) {
            throw new IllegalArgumentException("Entry cannot be null");
        }
        if (entry.host == null || StringUtils.isBlank(entry.host.id)) {
            throw new IllegalArgumentException("Cannot persist catalogue entry without host ref");
        }
        BasicDBObject obj = new BasicDBObject(/*"_id", entry.id*/).append("status", entry.status.toString())
                .append("status_reason", StringUtils.defaultIfBlank(entry.status_reason, ""))
                .append("location", entry.location).append("location_hash", DigestUtils.md5(entry.location))
                .append("host", new ObjectId(entry.host.id))
                .append("path", StringUtils.defaultIfBlank(entry.path, ""))
                .append("relocated", entry.relocated.toString())
                .append("new_location", StringUtils.defaultIfBlank(entry.newLocation, ""))
                .append("initial_parent", StringUtils.defaultIfBlank(entry.initialParent, ""))
                .append("initial_depth", entry.initialDepth)
                .append("initial_parent_title", StringUtils.defaultIfBlank(entry.initialParentTitle, ""));
        if (entry.firstSeen != null) {
            obj.append("first_seen", entry.firstSeen.toDate());
        }
        if (entry.lastVisit != null) {
            obj.append("last_visit", entry.lastVisit.toDate());
        }
        return obj;
    }

    protected CatalogueEntry fromMongoObject(final DBObject obj) {
        if (obj == null) {
            throw new IllegalArgumentException("Object cannot be null");
        }
        CatalogueEntry entry = new CatalogueEntry();
        entry.id = obj.get("_id").toString();
        entry.status = Status.valueOf(obj.get("status").toString());
        entry.status_reason = obj.get("status_reason").toString();
        entry.location = obj.get("location").toString();
        entry.host = hostService.findById(obj.get("host").toString());
        entry.path = obj.get("path").toString();
        entry.relocated = Relocated.valueOf(obj.get("relocated").toString());
        entry.newLocation = obj.get("new_location").toString();
        entry.initialParent = obj.get("initial_parent").toString();
        entry.initialDepth = (int) obj.get("initial_depth");
        entry.initialParentTitle = obj.get("initial_parent_title").toString();
        if (obj.get("first_seen") != null) {
            entry.firstSeen = new DateTime((Date) obj.get("first_seen"));
        }
        if (obj.get("last_visit") != null) {
            entry.lastVisit = new DateTime((Date) obj.get("last_visit"));
        }
        return entry;
    }

    protected Collection<ObjectId> listIds(final Collection<CatalogueEntry> entries) {
        return Collections2.transform(entries, new Function<CatalogueEntry, ObjectId>() {
            @Override
            public ObjectId apply(final CatalogueEntry entry) {
                return new ObjectId(entry.id);
            }
        });
    }
}