com.jaeksoft.searchlib.crawler.web.process.WebCrawlMaster.java Source code

Introduction

Here is the source code for com.jaeksoft.searchlib.crawler.web.process.WebCrawlMaster.java. WebCrawlMaster drives OpenSearchServer's web crawl sessions: it builds the list of hosts to fetch, loads the configured sitemaps, and dispatches WebCrawlThread workers until the per-session and per-host URL limits are reached.
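
The sketch below shows one way the class might be driven from application code. It is a minimal illustration only: the WebCrawlMasterExample class, the loadConfig() helper, the example URL and the choice of ListType.NEW_URL are assumptions made for this page; the constructor and manualCrawl() come from the listing in the Source section, and in OpenSearchServer the crawl master is normally reached through the index configuration rather than constructed by hand.

import java.net.URL;

import com.jaeksoft.searchlib.config.Config;
import com.jaeksoft.searchlib.crawler.web.database.HostUrlList.ListType;
import com.jaeksoft.searchlib.crawler.web.process.WebCrawlMaster;

public class WebCrawlMasterExample {

    public static void main(String[] args) throws Exception {
        // Assumption: a Config instance for an existing index is obtained elsewhere.
        Config config = loadConfig();

        // The constructor starts a crawl session by itself when the
        // "crawl enabled" property is set in the WebPropertyManager.
        WebCrawlMaster crawlMaster = new WebCrawlMaster(config);

        // Crawl a single URL on demand; manualCrawl() waits up to 180 seconds
        // for the dedicated WebCrawlThread to finish.
        crawlMaster.manualCrawl(new URL("http://www.example.com/"), ListType.NEW_URL);
    }

    private static Config loadConfig() {
        // Hypothetical helper: obtaining a Config is index-specific and outside
        // the scope of this listing.
        throw new UnsupportedOperationException("hypothetical helper");
    }
}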

Source

/**   
 * License Agreement for OpenSearchServer
 *
 * Copyright (C) 2008-2014 Emmanuel Keller / Jaeksoft
 * 
 * http://www.open-search-server.com
 * 
 * This file is part of OpenSearchServer.
 *
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with OpenSearchServer. 
 *  If not, see <http://www.gnu.org/licenses/>.
 **/

package com.jaeksoft.searchlib.crawler.web.process;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
import java.util.LinkedList;
import java.util.List;
import java.util.Random;
import java.util.Set;

import org.apache.commons.lang3.StringUtils;

import com.jaeksoft.searchlib.Logging;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.config.Config;
import com.jaeksoft.searchlib.crawler.common.database.AbstractManager;
import com.jaeksoft.searchlib.crawler.common.database.FetchStatus;
import com.jaeksoft.searchlib.crawler.common.process.CrawlMasterAbstract;
import com.jaeksoft.searchlib.crawler.common.process.CrawlQueueAbstract;
import com.jaeksoft.searchlib.crawler.common.process.CrawlStatistics;
import com.jaeksoft.searchlib.crawler.common.process.CrawlStatus;
import com.jaeksoft.searchlib.crawler.web.database.HostUrlList;
import com.jaeksoft.searchlib.crawler.web.database.HostUrlList.ListType;
import com.jaeksoft.searchlib.crawler.web.database.LinkItem;
import com.jaeksoft.searchlib.crawler.web.database.NamedItem;
import com.jaeksoft.searchlib.crawler.web.database.UrlCrawlQueue;
import com.jaeksoft.searchlib.crawler.web.database.UrlItem;
import com.jaeksoft.searchlib.crawler.web.database.UrlManager;
import com.jaeksoft.searchlib.crawler.web.database.WebPropertyManager;
import com.jaeksoft.searchlib.crawler.web.database.pattern.PatternListMatcher;
import com.jaeksoft.searchlib.crawler.web.sitemap.SiteMapItem;
import com.jaeksoft.searchlib.crawler.web.sitemap.SiteMapList;
import com.jaeksoft.searchlib.crawler.web.sitemap.SiteMapUrl;
import com.jaeksoft.searchlib.crawler.web.spider.Crawl;
import com.jaeksoft.searchlib.crawler.web.spider.HttpDownloader;
import com.jaeksoft.searchlib.function.expression.SyntaxError;
import com.jaeksoft.searchlib.query.ParseException;
import com.jaeksoft.searchlib.scheduler.TaskManager;

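/**
 * Master process of the web crawler. It selects the hosts and URLs to
 * fetch, loads the configured sitemaps, and spawns {@link WebCrawlThread}
 * workers until the per-session and per-host limits are reached.
 */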
public class WebCrawlMaster extends CrawlMasterAbstract<WebCrawlMaster, WebCrawlThread> {

    private final LinkedList<NamedItem> hostList;

    private Date fetchIntervalDate;

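    /**
     * Criteria used to select the URLs of the current crawl phase:
     * list type, fetch status and date window.
     */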
    public class Selection {

        private final ListType listType;

        private final Date beforeDate;

        private final Date afterDate;

        private final FetchStatus fetchStatus;

        public Selection(ListType listType, FetchStatus fetchStatus, Date beforeDate, Date afterDate) {
            this.listType = listType;
            this.fetchStatus = fetchStatus;
            this.beforeDate = beforeDate;
            this.afterDate = afterDate;
        }
    }

    private Selection selection;

    private int maxUrlPerSession;

    private int maxUrlPerHost;

    private final PatternListMatcher exclusionMatcher;

    private final PatternListMatcher inclusionMatcher;

    private final UrlCrawlQueue urlCrawlQueue;

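    /**
     * Builds the crawl master for the given index configuration and starts
     * the crawl session immediately when crawling is enabled in the web
     * property manager.
     */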
    public WebCrawlMaster(Config config) throws SearchLibException {
        super(config);
        WebPropertyManager propertyManager = config.getWebPropertyManager();
        urlCrawlQueue = new UrlCrawlQueue(config);
        exclusionMatcher = propertyManager.getExclusionEnabled().getValue()
                ? config.getExclusionPatternManager().getPatternListMatcher()
                : null;
        inclusionMatcher = propertyManager.getInclusionEnabled().getValue()
                ? config.getInclusionPatternManager().getPatternListMatcher()
                : null;
        hostList = new LinkedList<NamedItem>();
        if (propertyManager.getCrawlEnabled().getValue()) {
            Logging.info("Webcrawler is starting for " + config.getIndexName());
            start(false);
        }
    }

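    /**
     * Main crawl loop. Each session resets the statistics, loads the
     * sitemaps, extracts the host list, then hands batches of URLs to
     * WebCrawlThread workers until the session limit is reached or the
     * crawl is aborted. The remaining queue is then indexed and the
     * optional "after session" scheduler job is executed.
     */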
    @Override
    public void runner() throws Exception {
        Config config = getConfig();
        WebPropertyManager propertyManager = config.getWebPropertyManager();
        urlCrawlQueue.setMaxBufferSize(propertyManager.getIndexDocumentBufferSize().getValue());
        while (!isAborted()) {

            currentStats = new CrawlStatistics();
            addStatistics(currentStats);
            urlCrawlQueue.setStatistiques(currentStats);

            int threadNumber = propertyManager.getMaxThreadNumber().getValue();
            maxUrlPerSession = propertyManager.getMaxUrlPerSession().getValue();
            maxUrlPerHost = propertyManager.getMaxUrlPerHost().getValue();
            String schedulerJobName = propertyManager.getSchedulerAfterSession().getValue();

            synchronized (hostList) {
                hostList.clear();
            }

            extractSiteMapList();
            extractHostList();

            while (!isAborted()) {

                int howMany = urlLeftPerHost();
                if (howMany <= 0)
                    break;

                NamedItem host = getNextHost();
                if (host == null)
                    break;

                HostUrlList hostUrlList = getNextUrlList(host, howMany);
                if (hostUrlList == null)
                    continue;

                WebCrawlThread crawlThread = new WebCrawlThread(config, this, currentStats, hostUrlList);
                add(crawlThread);

                while (getThreadsCount() >= threadNumber && !isAborted())
                    sleepSec(5);
            }

            setStatus(CrawlStatus.WAITING_CHILD);
            while (getThreadsCount() > 0) {
                waitForChild(1800);
                if (isAborted())
                    break;
            }
            setStatus(CrawlStatus.INDEXATION);
            urlCrawlQueue.index(true);
            if (currentStats.getUrlCount() > 0)
                config.getUrlManager().reload(false, null);
            if (schedulerJobName != null && schedulerJobName.length() > 0) {
                setStatus(CrawlStatus.EXECUTE_SCHEDULER_JOB);
                TaskManager.getInstance().executeJob(config.getIndexName(), schedulerJobName);
            }

            if (isOnce())
                break;
            sleepSec(5);
        }
        urlCrawlQueue.index(true);
        setStatus(CrawlStatus.NOT_RUNNING);
    }

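    /**
     * Builds the list of hosts to crawl, trying in order: URLs flagged
     * FETCH_FIRST, URLs whose fetch interval has expired, then URLs that
     * have never been fetched.
     */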
    private void extractHostList()
            throws IOException, ParseException, SyntaxError, URISyntaxException, ClassNotFoundException,
            InterruptedException, SearchLibException, InstantiationException, IllegalAccessException {
        Config config = getConfig();
        UrlManager urlManager = config.getUrlManager();
        urlManager.reload(false, null);
        setStatus(CrawlStatus.EXTRACTING_HOSTLIST);

        WebPropertyManager propertyManager = config.getWebPropertyManager();
        fetchIntervalDate = AbstractManager.getPastDate(propertyManager.getFetchInterval().getValue(),
                propertyManager.getFetchIntervalUnit().getValue());

        int urlLimit = maxUrlPerSession;
        // First try fetch priority
        selection = new Selection(ListType.PRIORITY_URL, FetchStatus.FETCH_FIRST, null, null);
        urlLimit = urlManager.getHostToFetch(selection.fetchStatus, selection.beforeDate, selection.afterDate,
                urlLimit, maxUrlPerHost, hostList);

        // Second try old URLs
        if (hostList.size() == 0) {
            selection = new Selection(ListType.OLD_URL, null, fetchIntervalDate, null);
            urlLimit = urlManager.getHostToFetch(selection.fetchStatus, selection.beforeDate, selection.afterDate,
                    urlLimit, maxUrlPerHost, hostList);
        }

        // Finally try new unfetched URLs
        if (hostList.size() == 0) {
            selection = new Selection(ListType.NEW_URL, FetchStatus.UN_FETCHED, null, fetchIntervalDate);
            urlLimit = urlManager.getHostToFetch(selection.fetchStatus, selection.beforeDate, selection.afterDate,
                    urlLimit, maxUrlPerHost, hostList);
        }
        currentStats.addHostListSize(hostList.size());

    }

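    /**
     * Loads every configured sitemap, filters the URLs through the
     * inclusion/exclusion pattern lists and inserts the unknown ones
     * into the URL database.
     */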
    private void extractSiteMapList() throws SearchLibException {
        HttpDownloader httpDownloader = null;
        try {
            httpDownloader = getNewHttpDownloader(true);
            SiteMapList siteMapList = getConfig().getSiteMapList();
            if (siteMapList != null && siteMapList.getArray() != null) {
                setStatus(CrawlStatus.LOADING_SITEMAP);
                UrlManager urlManager = getConfig().getUrlManager();
                List<UrlItem> workInsertUrlList = new ArrayList<UrlItem>();
                for (SiteMapItem siteMap : siteMapList.getArray()) {
                    // Reuse the downloader acquired above rather than opening a new one per sitemap
                    Set<SiteMapUrl> siteMapUrlSet = siteMap.load(httpDownloader, null);
                    for (SiteMapUrl siteMapUrl : siteMapUrlSet) {

                        URI uri = siteMapUrl.getLoc();
                        String sUri = uri.toString();
                        URL url;
                        try {
                            url = uri.toURL();
                        } catch (MalformedURLException e) {
                            continue;
                        }

                        if (exclusionMatcher != null)
                            if (exclusionMatcher.matchPattern(url, sUri))
                                continue;
                        if (inclusionMatcher != null)
                            if (!inclusionMatcher.matchPattern(url, sUri))
                                continue;

                        if (!urlManager.exists(sUri)) {
                            workInsertUrlList.add(
                                    urlManager.getNewUrlItem(new LinkItem(sUri, LinkItem.Origin.sitemap, null)));
                        }
                    }
                }
                if (workInsertUrlList.size() > 0)
                    urlManager.updateUrlItems(workInsertUrlList);
            }
        } finally {
            if (httpDownloader != null)
                httpDownloader.release();
        }
    }

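    /**
     * Returns a new HttpDownloader using the configured user agent when
     * none is given, and the configured proxy handler when proxies are
     * requested.
     */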
    public HttpDownloader getNewHttpDownloader(boolean followRedirect, String userAgent, boolean useProxies)
            throws SearchLibException {
        Config config = getConfig();
        WebPropertyManager propertyManager = config.getWebPropertyManager();
        if (StringUtils.isEmpty(userAgent))
            userAgent = propertyManager.getUserAgent().getValue();
        return new HttpDownloader(userAgent, followRedirect, useProxies ? propertyManager.getProxyHandler() : null);
    }

    final public HttpDownloader getNewHttpDownloader(final boolean followRedirect) throws SearchLibException {
        return getNewHttpDownloader(followRedirect, null, true);
    }

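    /**
     * Removes and returns a random host from the host list, or null when
     * the list is empty.
     */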
    private NamedItem getNextHost() {
        synchronized (hostList) {
            int s = hostList.size();
            if (s > 0) {
                NamedItem host = hostList.remove(new Random().nextInt(s));
                if (host != null) {
                    host.setList(hostList);
                    currentStats.incHostCount();
                    return host;
                }
            }
        }
        return null;
    }

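    /** Number of URLs that may still be fetched in the current session. */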
    protected int urlLeft() {
        return (int) (maxUrlPerSession - currentStats.getFetchedCount());
    }

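    /** Same value, capped by the maximum number of URLs allowed per host. */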
    private int urlLeftPerHost() {
        int leftCount = urlLeft();
        if (leftCount < 0)
            return leftCount;
        if (leftCount > maxUrlPerHost)
            leftCount = maxUrlPerHost;
        return leftCount;
    }

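    /**
     * Fetches up to <code>count</code> URLs for the given host according to
     * the current selection (list type, fetch status and date window).
     */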
    private HostUrlList getNextUrlList(NamedItem host, int count)
            throws ParseException, IOException, SyntaxError, URISyntaxException, ClassNotFoundException,
            InterruptedException, SearchLibException, InstantiationException, IllegalAccessException {

        setStatus(CrawlStatus.EXTRACTING_URLLIST);
        setInfo(host.getName());
        UrlManager urlManager = getConfig().getUrlManager();

        List<UrlItem> urlList = new ArrayList<UrlItem>();
        HostUrlList hostUrlList = new HostUrlList(urlList, host);
        hostUrlList.setListType(selection.listType);

        urlManager.getUrlToFetch(host, selection.fetchStatus, selection.beforeDate, selection.afterDate, count,
                urlList);

        setInfo(null);
        return hostUrlList;
    }

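    /** True when the session has already fetched the configured maximum number of URLs. */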
    public boolean isFull() throws SearchLibException {
        return currentStats.getFetchedCount() >= getConfig().getWebPropertyManager().getMaxUrlPerSession()
                .getValue();
    }

    public Crawl getNewCrawl(WebCrawlThread crawlThread) throws SearchLibException {
        return new Crawl(crawlThread);

    }

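    /**
     * Crawls a single URL immediately in a dedicated thread and waits up
     * to 180 seconds for it to complete.
     */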
    public WebCrawlThread manualCrawl(URL url, HostUrlList.ListType listType)
            throws SearchLibException, ParseException, IOException, SyntaxError, URISyntaxException,
            ClassNotFoundException, InterruptedException, InstantiationException, IllegalAccessException {
        Config config = getConfig();
        UrlManager urlManager = config.getUrlManager();
        List<UrlItem> urlItemList = new ArrayList<UrlItem>();
        UrlItem urlItem = urlManager.getUrlToFetch(url);
        if (urlItem == null)
            urlItem = urlManager.getNewUrlItem(new LinkItem(url.toExternalForm(), LinkItem.Origin.manual, null));
        urlItemList.add(urlItem);
        HostUrlList hostUrlList = new HostUrlList(urlItemList, new NamedItem(url.getHost()));
        hostUrlList.setListType(listType);
        WebCrawlThread crawlThread = new WebCrawlThread(config, this, new CrawlStatistics(), hostUrlList);
        crawlThread.execute(180);
        return crawlThread;

    }

    public CrawlQueueAbstract getCrawlQueue() {
        return urlCrawlQueue;
    }

    @Override
    protected WebCrawlThread[] getNewArray(int size) {
        return new WebCrawlThread[size];
    }

}