fr.eolya.crawler.queue.mongodb.MongoDBSourceQueue.java Source code

Java tutorial

Introduction

Here is the source code for fr.eolya.crawler.queue.mongodb.MongoDBSourceQueue.java

Source

/*
 * Licensed to Eolya and Dominique Bejean under one
 * or more contributor license agreements. 
 * Eolya licenses this file to you under the 
 * Apache License, Version 2.0 (the "License"); 
 * you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package fr.eolya.crawler.queue.mongodb;

import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Date;
import java.util.Map;

import com.mongodb.*;

import fr.eolya.utils.nosql.mongodb.*;
import fr.eolya.crawler.connectors.ISource;
import fr.eolya.crawler.queue.*;

/**
 * A queue of source (web site) definitions backed by a MongoDB collection.
 *
 * <p>Which documents are eligible is decided by the filter built in
 * {@code getQuery} from the settings given to the constructor;
 * {@link #pop()} additionally skips documents whose {@code _poped} flag is
 * already {@code true}.
 */
public class MongoDBSourceQueue implements ISourceQueue {

    private final MongoDBConnection con;
    private final MongoDBDatabase db;

    private final String collName;
    private final MongoDBCollection coll;
    // Guards the find-then-update sequences in pop() and unpop(). It is an
    // instance field, so it only serialises threads sharing this instance.
    private final Object collMonitor = new Object();

    private final boolean test;
    private final boolean interactiveOnly;
    private final boolean suspiciousOnly;
    private final String accountId;
    private final String sourceId;
    private final String engineId;

    /**
     * Creates a queue over the given collection.
     *
     * @param con             the MongoDB connection; it is closed by {@link #close()}
     * @param dbName          name of the MongoDB database holding the queue
     * @param collName        name of the MongoDB collection holding the source documents
     * @param test            if true, select sources with {@code enabled = "2"} instead of
     *                        {@code enabled = "1"} and skip the status/mode filter
     * @param interactiveOnly if true, only select sources whose {@code crawl_mode} differs from
     *                        {@code ISource.CRAWL_PROCESS_MODE_NONE}
     * @param suspiciousOnly  if true, only select sources matching the "suspicious" filter
     *                        (see {@code getQuery})
     * @param accountId       account id to restrict to, or "" for no single-account restriction
     * @param sourceId        source id to restrict to, or "" for no restriction
     * @param engineId        engine id used to select accounts when {@code accountId} is "",
     *                        or "" for all non-deleted accounts
     * @throws UnknownHostException declared for callers; presumably raised while opening the
     *                              database (not visible in this file)
     */
    public MongoDBSourceQueue(MongoDBConnection con, String dbName, String collName, boolean test,
            boolean interactiveOnly, boolean suspiciousOnly, String accountId, String sourceId, String engineId)
            throws UnknownHostException {

        this.con = con;
        this.db = new MongoDBDatabase(this.con, dbName);

        this.collName = collName;
        this.coll = new MongoDBCollection(db, this.collName);

        this.test = test;
        this.interactiveOnly = interactiveOnly;
        this.suspiciousOnly = suspiciousOnly;
        this.accountId = accountId;
        this.sourceId = sourceId;
        this.engineId = engineId;
    }

    /**
     * Closes the underlying connection, if there is one.
     */
    public void close() {
        if (this.con == null) {
            return;
        }
        this.con.close();
    }

    /**
     * Counts the documents currently matching the queue filter.
     *
     * <p>Unlike {@link #pop()}, the count does not exclude documents already
     * flagged {@code _poped}.
     *
     * @return the number of matching documents
     */
    public long size() {
        String query = getQuery(test, interactiveOnly, suspiciousOnly, accountId, sourceId, engineId);
        BasicDBObject docsearch = MongoDBHelper.JSON2BasicDBObject(query);
        DBCursor cur = coll.getColl().find(docsearch);
        try {
            return cur.size();
        } finally {
            // Release the cursor whatever happens; it was never closed before.
            cur.close();
        }
    }

    /**
     * Takes the next source from the queue and flags it as popped.
     *
     * <p>Candidates are the documents matching the queue filter whose
     * {@code _poped} field is missing or {@code false}, ordered by
     * {@code crawl_priority} descending then {@code crawl_nexttime} ascending.
     * The selected document is rewritten with {@code _poped = true}.
     *
     * @return the selected document as a map (as read, i.e. before the flag
     *         was set), or {@code null} when no document is eligible
     */
    public Map<String, Object> pop() {
        String query = getQuery(test, interactiveOnly, suspiciousOnly, accountId, sourceId, engineId);
        query = String.format(
                "{\"$and\": [{\"$or\": [{\"_poped\": { \"$exists\": false }},{\"_poped\": false}]}, %1$s]}", query);

        BasicDBObject docsearch = MongoDBHelper.JSON2BasicDBObject(query);

        // NOTE(review): the read and the update below are two separate calls
        // protected only by an in-process monitor; if several processes share
        // this collection they could pop the same source - confirm whether an
        // atomic findAndModify is needed.
        synchronized (collMonitor) {
            // Only the first document in sort order is consumed, so bound the cursor.
            DBCursor cur = coll.getColl().find(docsearch)
                    .sort(new BasicDBObject("crawl_priority", -1).append("crawl_nexttime", 1))
                    .limit(1);
            try {
                if (cur.hasNext()) {
                    BasicDBObject doc = (BasicDBObject) cur.next();
                    BasicDBObject doc2 = (BasicDBObject) doc.copy();
                    doc2.put("_poped", true);
                    coll.update(doc, doc2);
                    return MongoDBHelper.BasicDBObject2Map(doc);
                }
            } finally {
                cur.close();
            }
        }
        return null;
    }

    /**
     * Clears the popped flag of a source so that {@link #pop()} can return it again.
     *
     * <p>Does nothing when no document has the given id.
     *
     * @param id value of the {@code id} field of the document to release
     */
    public void unpop(int id) {
        String query = String.format("{\"id\": %1$s}", id);
        BasicDBObject docsearch = MongoDBHelper.JSON2BasicDBObject(query);

        synchronized (collMonitor) {
            DBCursor cur = coll.getColl().find(docsearch);
            try {
                if (cur.hasNext()) {
                    BasicDBObject doc = (BasicDBObject) cur.next();
                    BasicDBObject doc2 = (BasicDBObject) doc.copy();
                    doc2.put("_poped", false);
                    coll.update(doc, doc2);
                }
            } finally {
                cur.close();
            }
        }
    }

    /**
     * Wraps two JSON filter clauses into a single {@code $and} clause.
     */
    private static String and(String first, String second) {
        return String.format("{\"$and\": [%1$s, %2$s]}", first, second);
    }

    /**
     * Builds the JSON text of the MongoDB filter selecting the sources that are due for a crawl.
     *
     * <p>The filter is a translation of an earlier SQL statement; the SQL
     * equivalent of each clause is kept in the comments below.
     *
     * @param test            select test sources ({@code enabled = "2"}) instead of enabled ones
     * @param interactiveOnly restrict to sources with a crawl mode other than "none"
     * @param suspiciousOnly  restrict to sources whose last crawl looks suspicious
     * @param accountId       account id to restrict to, or "" to select by account list
     * @param sourceId        source id to restrict to, or "" for no restriction
     * @param engineId        engine id used to select accounts, or "" for all non-deleted accounts
     * @return the filter as a JSON string
     */
    private String getQuery(boolean test, boolean interactiveOnly, boolean suspiciousOnly, String accountId,
            String sourceId, String engineId) {
        // http://docs.mongodb.org/manual/reference/sql-comparison/
        // http://rickosborne.org/download/SQL-to-MongoDB.pdf

        String qDeleted = "{\"deleted\": \"0\"}";

        String query;
        if (test) {
            // SQL: enabled = 2 and deleted = 0
            String qEnabled = "{\"enabled\": \"2\"}";
            query = String.format("{ \"$and\": [%1$s, %2$s]}", qDeleted, qEnabled);
        } else {
            // SQL: enabled = 1 and deleted = 0
            //      and ((not crawl_process_status = 1 and not crawl_process_status = 2)
            //           or (crawl_mode = 2 or crawl_mode = 5 or crawl_mode = 6))
            String qStatus = String.format(
                    "{ \"$and\": [{\"crawl_process_status\": { \"$ne\": \"%1$s\"}}, {\"crawl_process_status\": { \"$ne\": \"%2$s\"}}, {\"crawl_process_status\": { \"$ne\": \"%3$s\"}}]}",
                    ISource.CRAWL_PROCESS_STATUS_CRAWLING, ISource.CRAWL_PROCESS_STATUS_PAUSE_REQUESTED,
                    ISource.CRAWL_PROCESS_STATUS_STOP_REQUESTED);

            String qMode = String.format("{ \"$or\": [{\"crawl_mode\": \"%1$s\"}, {\"crawl_mode\": \"%2$s\"}]}",
                    ISource.CRAWL_PROCESS_MODE_RESET, ISource.CRAWL_PROCESS_MODE_CLEAR);

            String qStatusMode = String.format("{ \"$or\": [%1$s, %2$s]}", qStatus, qMode);

            String qEnabled = "{\"enabled\": \"1\"}";
            query = String.format("{ \"$and\": [%1$s, %2$s, %3$s]}", qDeleted, qEnabled, qStatusMode);
        }

        if (!"".equals(accountId)) {
            // SQL: id_account = <accountId>
            String qAccountId = String.format("{\"id_account\": %1$s}", accountId);
            query = and(qAccountId, query);
        } else {
            String qAccountIds;
            if ("".equals(engineId)) {
                // SQL: id_account in (select id from accounts where deleted = 0)
                qAccountIds = "{\"deleted\": { \"$ne\": \"1\"}}";
            } else {
                // SQL: id_account in (select id from accounts where id_engine = <engineId> and deleted = 0)
                qAccountIds = String.format("{\"$and\": [{\"deleted\": { \"$ne\": \"1\"}}, {\"id_engine\": %1$s}]}",
                        engineId);
            }
            // The SQL sub-select is emulated by reading the account ids first
            // and listing them in an $in clause.
            MongoDBCollection accountsColl = new MongoDBCollection(db, "accounts");
            ArrayList<String> ids = accountsColl.getValues(MongoDBHelper.JSON2BasicDBObject(qAccountIds), "id");
            if (ids != null) {
                StringBuilder qAccounts = new StringBuilder("{ \"id_account\": { \"$in\": [");
                for (int i = 0; i < ids.size(); i++) {
                    if (i > 0) {
                        qAccounts.append(", ");
                    }
                    qAccounts.append(ids.get(i));
                }
                qAccounts.append("]}}");
                query = and(qAccounts.toString(), query);
            }
        }

        if (!"".equals(sourceId)) {
            // SQL: id = <sourceId>
            String qSourceId = String.format("{\"id\": %1$s}", sourceId);
            query = and(qSourceId, query);
        }

        // SQL: (crawl_nexttime is null and crawl_minimal_period != '999999') or crawl_nexttime <= now()
        String qTime1 = "{\"$and\": [{\"crawl_nexttime\": { \"$exists\": false }}, {\"crawl_minimal_period\": { \"$ne\": \"999999\"}}]}";

        // The date range is built as a DBObject and serialised with toString()
        // so that the driver produces the textual form of the two dates.
        Date startDate = new Date(0);
        Date endDate = new Date();
        DBObject qTime2 = new BasicDBObject();
        qTime2.put("crawl_nexttime", new BasicDBObject("$gt", startDate).append("$lte", endDate));

        String qTime = String.format("{\"$or\": [%1$s, %2$s]}", qTime1, qTime2.toString());
        query = and(qTime, query);

        if (interactiveOnly) {
            // SQL: not crawl_mode = 0
            // NOTE(review): the mode value is inserted unquoted here but quoted
            // in the crawl_mode clause of the status/mode filter above - confirm
            // which form matches the stored type.
            String qInteractive = String.format("{\"crawl_mode\": { \"$ne\": %1$s}}",
                    ISource.CRAWL_PROCESS_MODE_NONE);
            query = and(qInteractive, query);
        }

        if (suspiciousOnly) {
            // SQL: (not crawl_lasttime_end is null and crawl_lastpagecount < 3) or not crawl_status = '0'
            // The first clause previously lacked a closing brace (invalid JSON)
            // and tested "$exists": false, the opposite of the SQL above.
            String qSuspicious1 = "{\"$and\": [{\"crawl_lasttime_end\" : { \"$exists\": true }}, {\"crawl_lastpagecount\": { \"$lt\": 3}}]}";
            String qSuspicious = String.format("{\"$or\": [%1$s, {\"crawl_status\": { \"$ne\": \"0\"}}]}",
                    qSuspicious1);
            query = and(qSuspicious, query);
        }
        return query;
    }
}