Java tutorial
/* * Copyright 2015 Crosstree Labs. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.crosstreelabs.cognitio.service.mongo; import com.crosstreelabs.cognitio.api.resource.CatalogueEntry; import com.crosstreelabs.cognitio.api.resource.Host; import com.crosstreelabs.cognitio.api.resource.Relocated; import com.crosstreelabs.cognitio.api.resource.Status; import com.crosstreelabs.cognitio.api.service.CatalogueService; import com.crosstreelabs.cognitio.api.service.HostService; import com.google.common.base.Function; import com.google.common.collect.Collections2; import com.mongodb.BasicDBList; import com.mongodb.BasicDBObject; import com.mongodb.DB; import com.mongodb.DBCollection; import com.mongodb.DBCursor; import com.mongodb.DBObject; import com.mongodb.MongoClient; import com.mongodb.WriteConcern; import com.mongodb.WriteResult; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Date; import java.util.HashSet; import java.util.List; import java.util.Queue; import java.util.Set; import java.util.concurrent.ConcurrentLinkedQueue; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.StringUtils; import org.bson.types.ObjectId; import org.joda.time.DateTime; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class MongoCatalogueService implements CatalogueService { private static final Logger LOGGER = LoggerFactory.getLogger(MongoCatalogueService.class); private final MongoClient client; private final HostService hostService; private final DB db; private final DBCollection catalogue; private final Queue<CatalogueEntry> workPool = new ConcurrentLinkedQueue<>(); public MongoCatalogueService(final MongoClient client, final HostService hostService) { this.client = client; this.hostService = hostService; db = client.getDB("cognitio"); if (!db.collectionExists("catalogue")) { catalogue = db.createCollection("catalogue", null); catalogue.createIndex(new BasicDBObject("location_hash", 1), new BasicDBObject("unique", true)); } else { catalogue = db.getCollection("catalogue"); } } @Override public CatalogueEntry findForLocation(final String location) { DBObject result = catalogue.findOne(new BasicDBObject("location", location)); if (result == null) { return null; } return fromMongoObject(result); } @Override public List<CatalogueEntry> findAllLocations(final String... locations) { DBCursor cursor = catalogue.find(new BasicDBObject("location", new BasicDBObject("$in", locations))); List<CatalogueEntry> entries = new ArrayList<>(); for (DBObject obj : cursor) { entries.add(fromMongoObject(obj)); } return entries; } @Override public void save(final CatalogueEntry entry) { if (entry == null) { throw new IllegalArgumentException("Entry cannot be null"); } // Save the host if (entry.host.id != null) { hostService.save(entry.host); } else { // Attempt to find and update a host with the corresponding domain Host host = hostService.findByDomain(entry.host.host); if (host == null) { hostService.save(entry.host); } else { host.lastRequest = entry.host.lastRequest; host.robotsTxt = entry.host.robotsTxt; hostService.save(host); entry.host = host; } } // Save the entry if (entry.id == null) { DBObject obj = toMongoObject(entry); WriteResult result = catalogue.insert(obj, WriteConcern.JOURNALED); entry.id = obj.get("_id").toString(); } else { catalogue.update(new BasicDBObject("_id", new ObjectId(entry.id)), toMongoObject(entry), false, false, WriteConcern.JOURNALED); } } @Override public synchronized CatalogueEntry getWorkEntry() { if (!workPool.isEmpty()) { return workPool.remove(); } // Find hosts that have been recently accessed // XXX The delay time needs to be configurable int hostDelaySeconds = 45; Collection<Host> recentHosts = hostService.recentlyRequested(hostDelaySeconds); Set<ObjectId> excludedHostIds = new HashSet<>(); for (Host host : recentHosts) { excludedHostIds.add(new ObjectId(host.id)); } // Find hosts that are currently queued List<ObjectId> queuedHosts = catalogue.distinct("host", new BasicDBObject("status", "QUEUED")); for (ObjectId id : queuedHosts) { excludedHostIds.add(id); } // Exclude recently visited hosts BasicDBObject excludedHosts = new BasicDBObject("host", new BasicDBObject("$nin", excludedHostIds.toArray(new ObjectId[0]))); // Retreive pending (un-indexed) resources BasicDBObject unindexed = new BasicDBObject("status", "PENDING"); // Reindex resources older than 7 days (XXX Make configurable/calculated BasicDBObject staleResources = new BasicDBObject("$and", Arrays.asList(new BasicDBObject("last_visit", new BasicDBObject("$exists", true)), new BasicDBObject("last_visit", new BasicDBObject("$lte", new DateTime().minusDays(7).toDate())), new BasicDBObject("status", "INDEXED"))); // Now put it all together BasicDBObject and = new BasicDBObject("$and", Arrays.asList(excludedHosts, new BasicDBObject("$or", Arrays.asList(unindexed, staleResources)))); DBObject match = new BasicDBObject("$match", and); DBObject sort = new BasicDBObject("$sort", new BasicDBObject("last_updated", 1)); DBObject group = new BasicDBObject("$group", new BasicDBObject("_id", "$host").append("first", new BasicDBObject("$first", "$$ROOT"))); List<DBObject> pipeline = Arrays.asList(match, group, sort); LOGGER.debug("PIPELINE: {}", pipeline); Iterable<DBObject> refs = catalogue.aggregate(pipeline).results(); for (DBObject obj : refs) { workPool.add(fromMongoObject((DBObject) obj.get("first"))); } LOGGER.debug("WORK POOL: {}", workPool); // Update the objects in mongo to queued state BasicDBList ids = new BasicDBList(); ids.addAll(listIds(workPool)); catalogue.update(new BasicDBObject("_id", new BasicDBObject("$in", ids)), new BasicDBObject("$set", new BasicDBObject("status", "QUEUED")), false, true, WriteConcern.JOURNALED); if (workPool.isEmpty()) { return null; } return workPool.remove(); } //~ Conversion ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ protected DBObject toMongoObject(final CatalogueEntry entry) { if (entry == null) { throw new IllegalArgumentException("Entry cannot be null"); } if (entry.host == null || StringUtils.isBlank(entry.host.id)) { throw new IllegalArgumentException("Cannot persist catalogue entry without host ref"); } BasicDBObject obj = new BasicDBObject(/*"_id", entry.id*/).append("status", entry.status.toString()) .append("status_reason", StringUtils.defaultIfBlank(entry.status_reason, "")) .append("location", entry.location).append("location_hash", DigestUtils.md5(entry.location)) .append("host", new ObjectId(entry.host.id)) .append("path", StringUtils.defaultIfBlank(entry.path, "")) .append("relocated", entry.relocated.toString()) .append("new_location", StringUtils.defaultIfBlank(entry.newLocation, "")) .append("initial_parent", StringUtils.defaultIfBlank(entry.initialParent, "")) .append("initial_depth", entry.initialDepth) .append("initial_parent_title", StringUtils.defaultIfBlank(entry.initialParentTitle, "")); if (entry.firstSeen != null) { obj.append("first_seen", entry.firstSeen.toDate()); } if (entry.lastVisit != null) { obj.append("last_visit", entry.lastVisit.toDate()); } return obj; } protected CatalogueEntry fromMongoObject(final DBObject obj) { if (obj == null) { throw new IllegalArgumentException("Object cannot be null"); } CatalogueEntry entry = new CatalogueEntry(); entry.id = obj.get("_id").toString(); entry.status = Status.valueOf(obj.get("status").toString()); entry.status_reason = obj.get("status_reason").toString(); entry.location = obj.get("location").toString(); entry.host = hostService.findById(obj.get("host").toString()); entry.path = obj.get("path").toString(); entry.relocated = Relocated.valueOf(obj.get("relocated").toString()); entry.newLocation = obj.get("new_location").toString(); entry.initialParent = obj.get("initial_parent").toString(); entry.initialDepth = (int) obj.get("initial_depth"); entry.initialParentTitle = obj.get("initial_parent_title").toString(); if (obj.get("first_seen") != null) { entry.firstSeen = new DateTime((Date) obj.get("first_seen")); } if (obj.get("last_visit") != null) { entry.lastVisit = new DateTime((Date) obj.get("last_visit")); } return entry; } protected Collection<ObjectId> listIds(final Collection<CatalogueEntry> entries) { return Collections2.transform(entries, new Function<CatalogueEntry, ObjectId>() { @Override public ObjectId apply(final CatalogueEntry entry) { return new ObjectId(entry.id); } }); } }