com.jaeksoft.searchlib.crawler.database.DatabaseCrawlMongoDbThread.java Source code

Java tutorial

Introduction

Here is the source code for com.jaeksoft.searchlib.crawler.database.DatabaseCrawlMongoDbThread.java

Source

/**   
 * License Agreement for OpenSearchServer
 *
 * Copyright (C) 2015 Emmanuel Keller / Jaeksoft
 * 
 * http://www.open-search-server.com
 * 
 * This file is part of OpenSearchServer.
 *
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with OpenSearchServer. 
 *  If not, see <http://www.gnu.org/licenses/>.
 **/

package com.jaeksoft.searchlib.crawler.database;

import java.io.IOException;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;

import org.bson.Document;

import com.jaeksoft.searchlib.Client;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.analysis.LanguageEnum;
import com.jaeksoft.searchlib.crawler.FieldMapContext;
import com.jaeksoft.searchlib.crawler.common.process.CrawlStatus;
import com.jaeksoft.searchlib.function.expression.SyntaxError;
import com.jaeksoft.searchlib.index.IndexDocument;
import com.jaeksoft.searchlib.query.ParseException;
import com.jaeksoft.searchlib.util.InfoCallback;
import com.jaeksoft.searchlib.util.ReadWriteLock;
import com.jaeksoft.searchlib.util.Variables;
import com.jayway.jsonpath.Configuration;
import com.mongodb.MongoClient;
import com.mongodb.client.FindIterable;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoCursor;
import com.mongodb.util.JSON;

/**
 * Crawl thread that reads documents from a MongoDB collection, maps each one
 * to an {@link IndexDocument} through the crawl's field map, and pushes the
 * result into the index of the owning {@link Client} in buffered batches.
 * <p>
 * The counters updated here ({@code pendingIndexDocumentCount},
 * {@code updatedIndexDocumentCount}, {@code ignoredDocumentCount}) are not
 * declared in this class; they are presumably inherited from
 * {@link DatabaseCrawlThread}. Every modification made here is performed
 * while holding the write lock of {@link #rwl}.
 */
public class DatabaseCrawlMongoDbThread extends DatabaseCrawlThread {

    /** Lock taken (write side) around every counter modification in this class. */
    private final ReadWriteLock rwl = new ReadWriteLock();

    /** Private copy of the crawl definition, with the variables already applied. */
    private final DatabaseCrawlMongoDb databaseCrawl;

    /**
     * Creates the thread for one MongoDB crawl.
     * <p>
     * The given crawl definition is duplicated and the variables are applied
     * to the duplicate, so the instance passed by the caller is not modified
     * by {@code applyVariables}.
     *
     * @param client
     *            the client whose index receives the documents
     * @param crawlMaster
     *            the master passed to the parent constructor
     * @param databaseCrawl
     *            the crawl definition to duplicate
     * @param variables
     *            the variables applied to the duplicated definition
     * @param infoCallback
     *            optional progress callback (checked for {@code null} before use)
     */
    public DatabaseCrawlMongoDbThread(Client client, DatabaseCrawlMaster crawlMaster,
            DatabaseCrawlMongoDb databaseCrawl, Variables variables, InfoCallback infoCallback) {
        super(client, crawlMaster, databaseCrawl, infoCallback);
        this.databaseCrawl = (DatabaseCrawlMongoDb) databaseCrawl.duplicate();
        this.databaseCrawl.applyVariables(variables);
    }

    /**
     * Sends the buffered documents to the index once the buffer holds at
     * least {@code limit} documents.
     * <p>
     * After a successful update the counters are adjusted, the buffer is
     * cleared, the progress callback (if any) is notified and the thread
     * sleeps for the delay configured on the crawl definition.
     *
     * @param indexDocumentList
     *            the buffer of documents waiting to be indexed; cleared when
     *            the documents have been sent
     * @param limit
     *            minimum buffer size that triggers an update; pass {@code 0}
     *            to flush any non-empty buffer
     * @return {@code true} if the buffer was sent to the index, {@code false}
     *         if it was empty or smaller than {@code limit}
     * @throws IOException
     *             propagated from the index update
     * @throws SearchLibException
     *             propagated from the index update
     * @throws InterruptedException
     *             propagated from the pause that follows the update
     */
    private boolean index(List<IndexDocument> indexDocumentList, int limit)
            throws IOException, SearchLibException, InterruptedException {
        int i = indexDocumentList.size();
        if (i == 0 || i < limit) {
            return false;
        }
        client.updateDocuments(indexDocumentList);
        rwl.w.lock();
        try {
            // The documents move from "pending" to "updated".
            pendingIndexDocumentCount -= i;
            updatedIndexDocumentCount += i;
        } finally {
            rwl.w.unlock();
        }
        indexDocumentList.clear();
        if (infoCallback != null) {
            infoCallback.setInfo(updatedIndexDocumentCount + " document(s) indexed");
        }
        sleepMs(databaseCrawl.getMsSleep());
        return true;
    }

    /**
     * Iterates over the query result, converts every MongoDB document and
     * indexes the converted documents in batches of the configured buffer
     * size.
     * <p>
     * Each document is serialized to JSON, parsed with the default JsonPath
     * JSON provider and mapped to an {@link IndexDocument} by the field map.
     * When the schema declares a unique field and the mapped document has no
     * content for it, the document is counted as ignored and skipped. The
     * loop stops when the cursor is exhausted or the thread is aborted;
     * whatever remains in the buffer is then flushed.
     * <p>
     * The cursor is always closed before this method returns, including when
     * the loop is left early because of an abort or an exception.
     *
     * @param iterable
     *            the MongoDB query result to crawl
     */
    final private void runner_update(FindIterable<Document> iterable)
            throws SearchLibException, ClassNotFoundException, InstantiationException, IllegalAccessException,
            IOException, ParseException, SyntaxError, URISyntaxException, InterruptedException {
        final int limit = databaseCrawl.getBufferSize();
        // Fetch from MongoDB with the same granularity used for indexing.
        iterable.batchSize(limit);
        DatabaseFieldMap databaseFieldMap = databaseCrawl.getFieldMap();
        List<IndexDocument> indexDocumentList = new ArrayList<IndexDocument>();
        LanguageEnum lang = databaseCrawl.getLang();
        FieldMapContext fieldMapContext = new FieldMapContext(client, lang);
        String uniqueField = client.getSchema().getUniqueField();
        MongoCursor<Document> cursor = iterable.iterator();
        try {
            while (cursor.hasNext() && !isAborted()) {

                String json = JSON.serialize(cursor.next());
                Object document = Configuration.defaultConfiguration().jsonProvider().parse(json);
                IndexDocument indexDocument = new IndexDocument(lang);
                databaseFieldMap.mapJson(fieldMapContext, document, indexDocument);
                if (uniqueField != null && !indexDocument.hasContent(uniqueField)) {
                    // No value for the unique field: count the document as ignored.
                    rwl.w.lock();
                    try {
                        ignoredDocumentCount++;
                    } finally {
                        rwl.w.unlock();
                    }
                    continue;
                }
                indexDocumentList.add(indexDocument);
                rwl.w.lock();
                try {
                    pendingIndexDocumentCount++;
                } finally {
                    rwl.w.unlock();
                }
                if (index(indexDocumentList, limit)) {
                    setStatus(CrawlStatus.CRAWL);
                }

            }
        } finally {
            // Release the cursor even when the loop ends early (abort or
            // exception), instead of leaving it open until the client closes.
            cursor.close();
        }
        // Flush the documents still buffered below the batch limit.
        index(indexDocumentList, 0);
    }

    /**
     * Runs the crawl: opens the MongoDB client, queries the configured
     * collection with the configured criteria, indexes the result and reloads
     * the client when at least one document was updated or deleted.
     * <p>
     * The MongoDB client is closed in all cases.
     *
     * @throws Exception
     *             any error raised while connecting, querying, mapping or
     *             indexing
     */
    @Override
    public void runner() throws Exception {
        setStatus(CrawlStatus.STARTING);

        MongoClient mongoClient = null;
        try {
            mongoClient = databaseCrawl.getMongoClient();
            MongoCollection<Document> collection = databaseCrawl.getCollection(mongoClient);
            FindIterable<Document> iterable = collection.find(databaseCrawl.getCriteriaObject());
            setStatus(CrawlStatus.CRAWL);
            if (iterable != null) {
                runner_update(iterable);
            }
            if (updatedIndexDocumentCount > 0 || updatedDeleteDocumentCount > 0) {
                client.reload();
            }
        } finally {
            if (mongoClient != null) {
                mongoClient.close();
            }
        }

    }
}