// com.jaeksoft.searchlib.crawler.cache.MongoDbCrawlCache — source code
// (header text converted to a comment: the original scraped prose lines
// were bare text above the license block and were not valid Java)

/*
 * License Agreement for OpenSearchServer
 * <p>
 * Copyright (C) 2012-2017 Emmanuel Keller / Jaeksoft
 * <p>
 * http://www.open-search-server.com
 * <p>
 * This file is part of OpenSearchServer.
 * <p>
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * <p>
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * <p>
 * You should have received a copy of the GNU General Public License
 * along with OpenSearchServer.
 * If not, see <http://www.gnu.org/licenses/>.
 */

package com.jaeksoft.searchlib.crawler.cache;

import com.jaeksoft.searchlib.crawler.web.spider.DownloadItem;
import com.jaeksoft.searchlib.index.IndexDocument;
import com.jaeksoft.searchlib.parser.ParserResultItem;
import com.jaeksoft.searchlib.util.ReadWriteLock;
import com.mongodb.MongoClient;
import com.mongodb.MongoClientURI;
import com.mongodb.MongoGridFSException;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import com.mongodb.client.gridfs.GridFSBucket;
import com.mongodb.client.gridfs.GridFSBuckets;
import com.mongodb.client.gridfs.model.GridFSFile;
import com.mongodb.client.gridfs.model.GridFSUploadOptions;
import com.mongodb.client.model.Filters;
import com.mongodb.client.model.UpdateOptions;
import org.bson.BsonArray;
import org.bson.BsonDocument;
import org.bson.BsonObjectId;
import org.bson.BsonString;
import org.bson.BsonValue;
import org.bson.Document;
import org.bson.conversions.Bson;
import org.bson.types.ObjectId;
import org.json.JSONException;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Date;
import java.util.LinkedHashSet;
import java.util.List;

import static com.mongodb.client.model.Filters.eq;

/**
 * Crawl-cache provider backed by MongoDB. Page metadata lives in the
 * {@code meta} collection, the indexing status in {@code indexed}, and the
 * raw page content in the default GridFS bucket, keyed by the meta
 * document's {@code _id}.
 */
public class MongoDbCrawlCache extends CrawlCacheProvider {

    private final static String DEFAULT_DATABASE = "oss_crawl_cache";
    private final static String META_COLLECTION = "meta";
    private final static String INDEXED_COLLECTION = "indexed";

    // Guards the client lifecycle: init()/close() take the write lock,
    // cache operations take the read lock so they may run concurrently but
    // never against a half-closed client. The read lock does NOT serialize
    // cache mutations — MongoDB handles per-document concurrency.
    private final ReadWriteLock rwl = new ReadWriteLock();

    private MongoClient mongoClient = null;
    private MongoCollection<Document> metaCollection = null;
    private MongoCollection<Document> indexedCollection = null;
    private GridFSBucket contentGrid = null;

    public MongoDbCrawlCache() {
        // NOTE(review): LOCAL_FILE looks like a copy/paste leftover from the
        // file-based provider — a MONGODB constant (if CrawlCacheProviderEnum
        // declares one) would be expected here. Confirm before changing.
        super(CrawlCacheProviderEnum.LOCAL_FILE);
    }

    /** Releases the client and clears all derived handles. Caller must hold the write lock. */
    private void closeNoLock() {
        if (mongoClient != null) {
            mongoClient.close();
            mongoClient = null;
            metaCollection = null;
            indexedCollection = null;
            contentGrid = null;
        }
    }

    @Override
    public void close() {
        rwl.w.lock();
        try {
            closeNoLock();
        } finally {
            rwl.w.unlock();
        }
    }

    /** @return the client connect point, or {@code null} when not initialized */
    @Override
    public String getInfos() throws IOException {
        rwl.r.lock();
        try {
            return mongoClient == null ? null : mongoClient.getConnectPoint();
        } finally {
            rwl.r.unlock();
        }
    }

    /**
     * (Re)connects using the given MongoDB connection URI, creating the
     * collections' {@code uri} indexes. Any previous connection is closed.
     *
     * @param configString a connection URI, e.g. mongodb://localhost:27017/oss_crawl_cache
     */
    @Override
    public void init(String configString) throws IOException {
        rwl.w.lock();
        try {
            closeNoLock();
            final MongoClientURI connectionString = new MongoClientURI(configString);
            mongoClient = new MongoClient(connectionString);
            // Fall back to the default database when the URI has no path part.
            final MongoDatabase database = mongoClient.getDatabase(
                    connectionString.getDatabase() == null ? DEFAULT_DATABASE : connectionString.getDatabase());
            metaCollection = database.getCollection(META_COLLECTION);
            metaCollection.createIndex(Document.parse("{\"uri\":1}"));
            indexedCollection = database.getCollection(INDEXED_COLLECTION);
            indexedCollection.createIndex(Document.parse("{\"uri\":1}"));
            contentGrid = GridFSBuckets.create(database);
        } finally {
            rwl.w.unlock();
        }
    }

    /**
     * Deletes cached entries older than the given time, or everything when
     * {@code expiration == 0}. Relies on ObjectIds embedding their creation
     * timestamp, so {@code _id < ObjectId(date)} selects older entries.
     *
     * @param expiration cutoff in epoch milliseconds, or 0 for "flush all"
     * @return the number of meta documents deleted
     */
    @Override
    public long flush(long expiration) throws IOException {
        rwl.r.lock();
        try {
            final Bson filter = expiration == 0 ? Filters.exists("uri")
                    : Filters.lt("_id", new ObjectId(new Date(expiration)));
            // Fix: GridFS file documents carry "filename" (set to the URI at
            // upload time), not "uri" — the original exists("uri") filter
            // never matched any file, so flush-all left all content behind.
            final Bson gridFilter = expiration == 0 ? Filters.exists("filename")
                    : Filters.lt("_id", new ObjectId(new Date(expiration)));
            indexedCollection.deleteMany(filter);
            for (GridFSFile f : contentGrid.find(gridFilter))
                contentGrid.delete(f.getObjectId());
            return metaCollection.deleteMany(filter).getDeletedCount();
        } finally {
            rwl.r.unlock();
        }
    }

    @Override
    public String getConfigurationInformation() {
        return "Please provide the connection URL to the MongoDB instance (Eg.: mongodb://localhost:27017/oss_crawl_cache)";
    }

    @Override
    public Item getItem(URI uri, long expirationTime) throws UnsupportedEncodingException {
        return new MongoDbItem(uri, expirationTime);
    }

    private static final UpdateOptions UPSERT = new UpdateOptions().upsert(true);

    /** One cached URI: its metadata, GridFS content and indexed documents. */
    public class MongoDbItem extends Item {

        private final URI uri;
        private final String uriString;
        // Epoch SECONDS (not millis), to compare against ObjectId.getTimestamp().
        // The int truncation is consistent with the driver's int-seconds API
        // (both overflow together in 2038).
        private final int expirationTime;

        private MongoDbItem(final URI uri, long expirationTime) throws UnsupportedEncodingException {
            this.uri = uri;
            this.uriString = uri.toString();
            this.expirationTime = (int) (expirationTime / 1000);
        }

        /**
         * Upserts the download metadata and stores the page content in
         * GridFS under the meta document's {@code _id}.
         *
         * @return a fresh stream over the stored content
         * @throws IOException when the item's URI does not match the download
         */
        @Override
        public InputStream store(DownloadItem downloadItem) throws IOException, JSONException {
            rwl.r.lock();
            try {
                final URI uri = downloadItem.getUri();
                if (!uri.equals(this.uri))
                    throw new IOException("The URI does not match: " + uri + " / " + this.uri);

                final Document newDocument = Document.parse(downloadItem.getMetaAsJson());
                newDocument.put("uri", uriString);

                BsonValue id = metaCollection.replaceOne(eq("uri", uriString), newDocument, UPSERT)
                        .getUpsertedId();
                if (id == null) {
                    // Fix: getUpsertedId() returns null when the meta document
                    // already existed (a plain replace, no upsert happened).
                    // The original code then passed a null id to GridFS,
                    // breaking every re-crawl of a cached URI. Recover the
                    // existing _id and drop the previous content so the
                    // re-upload does not collide on the files' unique _id.
                    final Document existing = metaCollection.find(eq("uri", uriString)).first();
                    if (existing == null)
                        throw new IOException("Meta document not found after replace: " + uriString);
                    final ObjectId existingId = existing.getObjectId("_id");
                    try {
                        contentGrid.delete(existingId);
                    } catch (MongoGridFSException e) {
                        // No previous content stored for this id — nothing to delete.
                    }
                    id = new BsonObjectId(existingId);
                }

                final GridFSUploadOptions options = new GridFSUploadOptions().metadata(new Document("_id", id));

                contentGrid.uploadFromStream(id, uriString, downloadItem.getContentInputStream(), options);

                return contentGrid.openDownloadStream(id);
            } finally {
                rwl.r.unlock();
            }
        }

        /**
         * Loads the cached entry for this URI, or {@code null} when absent or
         * older than the expiration time (derived from the ObjectId's
         * embedded creation timestamp, in epoch seconds).
         */
        @Override
        public DownloadItem load() throws IOException, JSONException, URISyntaxException {
            rwl.r.lock();
            try {
                final Document foundDocument = metaCollection.find(eq("uri", uri.toString())).first();
                if (foundDocument == null)
                    return null;
                final ObjectId objectId = foundDocument.getObjectId("_id");
                if (expirationTime != 0)
                    if (objectId.getTimestamp() < expirationTime)
                        return null;
                final DownloadItem downloadItem = new DownloadItem(uri, foundDocument);
                downloadItem.setContentInputStream(contentGrid.openDownloadStream(objectId));
                return downloadItem;
            } finally {
                rwl.r.unlock();
            }
        }

        /**
         * Removes this URI from the cache: GridFS content, indexed documents
         * and metadata.
         *
         * @return true when at least one indexed or meta document was deleted
         */
        @Override
        public boolean flush() throws IOException {
            rwl.r.lock();
            try {
                boolean deleted = false;
                final Bson filter = eq("uri", uriString);
                final Document document = metaCollection.find(filter).first();
                if (document == null)
                    return false;
                contentGrid.delete(document.getObjectId("_id"));
                if (indexedCollection.deleteOne(eq("uri", uriString)).getDeletedCount() > 0)
                    deleted = true;
                if (metaCollection.deleteOne(eq("uri", uriString)).getDeletedCount() > 0)
                    deleted = true;
                return deleted;
            } finally {
                rwl.r.unlock();
            }
        }

        /**
         * Upserts the parsed documents for this URI into the indexed
         * collection. Each parser field collapses to a single BsonString,
         * an array of distinct values, or is omitted when empty.
         */
        @Override
        public void store(List<ParserResultItem> parserResults) throws IOException {
            rwl.r.lock();
            try {
                if (parserResults == null || parserResults.isEmpty())
                    return;
                final BsonArray documents = new BsonArray();
                parserResults.forEach(parserResultItem -> {
                    final BsonDocument document = new BsonDocument();
                    final IndexDocument indexDocument = parserResultItem.getParserDocument();
                    indexDocument.forEachFieldValueItem((fieldName, fieldValueItems) -> {
                        // LinkedHashSet: de-duplicate while keeping first-seen order.
                        final LinkedHashSet<String> fieldValues = new LinkedHashSet<>();
                        fieldValueItems.forEach(fvi -> fieldValues.add(fvi.value));
                        final BsonValue values;
                        switch (fieldValues.size()) {
                        case 0:
                            values = null;
                            break;
                        case 1:
                            values = new BsonString(fieldValues.iterator().next());
                            break;
                        default:
                            final BsonArray array = new BsonArray();
                            fieldValues.forEach(value -> array.add(new BsonString(value)));
                            values = array;
                            break;
                        }
                        if (values != null)
                            document.append(fieldName, values);
                    });
                    documents.add(document);
                });
                indexedCollection.replaceOne(eq("uri", uriString),
                        new Document("uri", uriString).append("documents", documents), UPSERT);
            } finally {
                rwl.r.unlock();
            }
        }
    }
}