com.jaeksoft.searchlib.crawler.web.spider.Crawl.java Source code

Introduction

Here is the source code for com.jaeksoft.searchlib.crawler.web.spider.Crawl.java
Source

/**   
 * License Agreement for OpenSearchServer
 *
 * Copyright (C) 2008-2014 Emmanuel Keller / Jaeksoft
 * 
 * http://www.open-search-server.com
 * 
 * This file is part of OpenSearchServer.
 *
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with OpenSearchServer. 
 *  If not, see <http://www.gnu.org/licenses/>.
 **/

package com.jaeksoft.searchlib.crawler.web.spider;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.List;

import org.apache.commons.io.FilenameUtils;

import com.jaeksoft.searchlib.Client;
import com.jaeksoft.searchlib.ClientCatalog;
import com.jaeksoft.searchlib.Logging;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.analysis.LanguageEnum;
import com.jaeksoft.searchlib.config.Config;
import com.jaeksoft.searchlib.crawler.FieldMap;
import com.jaeksoft.searchlib.crawler.common.database.FetchStatus;
import com.jaeksoft.searchlib.crawler.common.database.IndexStatus;
import com.jaeksoft.searchlib.crawler.common.database.ParserStatus;
import com.jaeksoft.searchlib.crawler.web.database.CookieItem;
import com.jaeksoft.searchlib.crawler.web.database.CookieManager;
import com.jaeksoft.searchlib.crawler.web.database.CredentialItem;
import com.jaeksoft.searchlib.crawler.web.database.CredentialManager;
import com.jaeksoft.searchlib.crawler.web.database.HostUrlList;
import com.jaeksoft.searchlib.crawler.web.database.LinkItem;
import com.jaeksoft.searchlib.crawler.web.database.LinkItem.Origin;
import com.jaeksoft.searchlib.crawler.web.database.PatternManager;
import com.jaeksoft.searchlib.crawler.web.database.RobotsTxtStatus;
import com.jaeksoft.searchlib.crawler.web.database.UrlFilterItem;
import com.jaeksoft.searchlib.crawler.web.database.UrlItem;
import com.jaeksoft.searchlib.crawler.web.database.UrlManager;
import com.jaeksoft.searchlib.crawler.web.database.WebPropertyManager;
import com.jaeksoft.searchlib.crawler.web.process.WebCrawlThread;
import com.jaeksoft.searchlib.crawler.web.robotstxt.RobotsTxt;
import com.jaeksoft.searchlib.index.FieldContent;
import com.jaeksoft.searchlib.index.IndexDocument;
import com.jaeksoft.searchlib.parser.HtmlParser;
import com.jaeksoft.searchlib.parser.Parser;
import com.jaeksoft.searchlib.parser.ParserFieldEnum;
import com.jaeksoft.searchlib.parser.ParserResultItem;
import com.jaeksoft.searchlib.parser.ParserSelector;
import com.jaeksoft.searchlib.plugin.IndexPluginList;
import com.jaeksoft.searchlib.schema.FieldValueItem;
import com.jaeksoft.searchlib.streamlimiter.LimitException;
import com.jaeksoft.searchlib.streamlimiter.StreamLimiter;
import com.jaeksoft.searchlib.util.IOUtils;
import com.jaeksoft.searchlib.util.LinkUtils;

public class Crawl {

    private List<IndexDocument> targetIndexDocuments;
    private HostUrlList hostUrlList;
    private final UrlItem urlItem;
    private CredentialManager credentialManager;
    private CookieManager cookieManager;
    private CredentialItem credentialItem;
    private String userAgent;
    private ParserSelector parserSelector;
    private Config config;
    private Parser parser;
    private String error;
    private List<LinkItem> discoverLinks;
    private FieldMap urlFieldMap;
    private URI redirectUrlLocation;
    private boolean inclusionEnabled;
    private boolean exclusionEnabled;
    private boolean robotsTxtEnabled;

    public Crawl(HostUrlList hostUrlList, UrlItem urlItem, Config config, ParserSelector parserSelector)
            throws SearchLibException {
        this.credentialManager = config.getWebCredentialManager();
        this.cookieManager = config.getWebCookieManager();
        this.credentialItem = null;
        WebPropertyManager propertyManager = config.getWebPropertyManager();
        this.hostUrlList = hostUrlList;
        this.targetIndexDocuments = null;
        this.urlFieldMap = config.getWebCrawlerFieldMap();
        this.discoverLinks = null;
        this.urlItem = urlItem;
        this.urlItem.setWhenNow();
        this.userAgent = propertyManager.getUserAgent().getValue();
        this.parser = null;
        this.parserSelector = parserSelector;
        this.config = config;
        this.error = null;
        this.redirectUrlLocation = null;
        this.exclusionEnabled = propertyManager.getExclusionEnabled().getValue();
        this.inclusionEnabled = propertyManager.getInclusionEnabled().getValue();
        this.robotsTxtEnabled = propertyManager.getRobotsTxtEnabled().getValue();
    }

    public Crawl(WebCrawlThread crawlThread) throws SearchLibException {
        this(crawlThread.getHostUrlList(), crawlThread.getCurrentUrlItem(), crawlThread.getConfig(),
                crawlThread.getConfig().getParserSelector());
    }

    protected void parseContent(InputStream inputStream) throws InstantiationException, IllegalAccessException,
            ClassNotFoundException, IOException, SearchLibException, NoSuchAlgorithmException, URISyntaxException {
        if (parserSelector == null) {
            urlItem.setParserStatus(ParserStatus.NOPARSER);
            return;
        }
        String fileName = urlItem.getContentDispositionFilename();
        if (fileName == null) {
            URL url = urlItem.getURL();
            if (url != null)
                fileName = FilenameUtils.getName(url.getFile());
        }
        IndexDocument sourceDocument = new IndexDocument();
        urlItem.populate(sourceDocument);
        Date parserStartDate = new Date();
        // TODO Which language for OCR ?
        parser = parserSelector.parseStream(sourceDocument, fileName, urlItem.getContentBaseType(),
                urlItem.getUrl(), inputStream, null, parserSelector.getWebCrawlerDefaultParser());
        if (parser == null) {
            urlItem.setParserStatus(ParserStatus.NOPARSER);
            return;
        }

        if (parser.getError() != null) {
            urlItem.setParserStatus(ParserStatus.PARSER_ERROR);
            return;
        }
        urlItem.clearInLinks();
        urlItem.clearOutLinks();

        for (ParserResultItem result : parser.getParserResults()) {
            urlItem.addInLinks(result.getFieldContent(ParserFieldEnum.internal_link));
            urlItem.addInLinks(result.getFieldContent(ParserFieldEnum.internal_link_nofollow));
            urlItem.addOutLinks(result.getFieldContent(ParserFieldEnum.external_link));
            urlItem.addOutLinks(result.getFieldContent(ParserFieldEnum.external_link_nofollow));
            urlItem.setLang(result.getFieldValue(ParserFieldEnum.lang, 0));
            urlItem.setLangMethod(result.getFieldValue(ParserFieldEnum.lang_method, 0));
            urlItem.setContentTypeCharset(result.getFieldValue(ParserFieldEnum.charset, 0));
        }
        ParserStatus parsedStatus = ParserStatus.PARSED;
        if (parser instanceof HtmlParser)
            if (!((HtmlParser) parser).isCanonical())
                parsedStatus = ParserStatus.PARSED_NON_CANONICAL;
        urlItem.setParserStatus(parsedStatus);
        String oldMd5size = urlItem.getMd5size();
        String newMd5size = parser.getMd5size();
        urlItem.setMd5size(newMd5size);
        Date oldContentUpdateDate = urlItem.getContentUpdateDate();
        Date newContentUpdateDate = null;
        if (oldContentUpdateDate == null)
            newContentUpdateDate = parserStartDate;
        else {
            if (oldMd5size != null && newMd5size != null)
                if (!oldMd5size.equals(newMd5size))
                    newContentUpdateDate = parserStartDate;
        }
        if (newContentUpdateDate != null)
            urlItem.setContentUpdateDate(newContentUpdateDate);

        for (ParserResultItem result : parser.getParserResults()) {
            FieldContent fieldContent = result.getFieldContent(ParserFieldEnum.meta_robots);
            if (fieldContent != null) {
                List<FieldValueItem> fieldValues = fieldContent.getValues();
                if (fieldValues != null) {
                    for (FieldValueItem item : result.getFieldContent(ParserFieldEnum.meta_robots).getValues())
                        if ("noindex".equalsIgnoreCase(item.getValue())) {
                            urlItem.setIndexStatus(IndexStatus.META_NOINDEX);
                            break;
                        }
                }
            }
        }
    }

    public boolean checkRobotTxtAllow(HttpDownloader httpDownloader)
            throws MalformedURLException, SearchLibException, URISyntaxException, ClassNotFoundException {
        RobotsTxtStatus robotsTxtStatus;
        if (robotsTxtEnabled) {
            RobotsTxt robotsTxt = config.getRobotsTxtCache().getRobotsTxt(httpDownloader, config, urlItem.getURL(),
                    false);
            robotsTxtStatus = robotsTxt.getStatus(userAgent, urlItem);
        } else
            robotsTxtStatus = RobotsTxtStatus.DISABLED;
        urlItem.setRobotsTxtStatus(robotsTxtStatus);
        if (robotsTxtStatus == RobotsTxtStatus.DISABLED || robotsTxtStatus == RobotsTxtStatus.ALLOW)
            return true;
        if (robotsTxtStatus == RobotsTxtStatus.NO_ROBOTSTXT)
            return true;
        urlItem.setFetchStatus(FetchStatus.NOT_ALLOWED);
        return false;
    }

    /**
     * Download the file and extract content informations
     * 
     * @param httpDownloader
     */
    public DownloadItem download(HttpDownloader httpDownloader) {
        synchronized (this) {
            InputStream is = null;
            DownloadItem downloadItem = null;
            try {
                URL url = urlItem.getURL();
                if (url == null)
                    throw new MalformedURLException("Malformed URL: " + urlItem.getUrl());
                // URL normalisation
                URI uri = url.toURI();
                url = uri.toURL();

                credentialItem = credentialManager == null ? null : credentialManager.matchCredential(url);

                List<CookieItem> cookieList = cookieManager.getCookies(url.toExternalForm());
                downloadItem = ClientCatalog.getCrawlCacheManager().loadCache(uri);

                boolean fromCache = (downloadItem != null);

                if (!fromCache)
                    downloadItem = httpDownloader.get(uri, credentialItem, null, cookieList);
                else if (Logging.isDebug)
                    Logging.debug("Crawl cache deliver: " + uri);

                urlItem.setContentDispositionFilename(downloadItem.getContentDispositionFilename());

                urlItem.setContentBaseType(downloadItem.getContentBaseType());

                urlItem.setContentTypeCharset(downloadItem.getContentTypeCharset());

                urlItem.setContentEncoding(downloadItem.getContentEncoding());

                urlItem.setContentLength(downloadItem.getContentLength());

                urlItem.setLastModifiedDate(downloadItem.getLastModified());

                urlItem.setFetchStatus(FetchStatus.FETCHED);

                urlItem.setHeaders(downloadItem.getHeaders());

                Integer code = downloadItem.getStatusCode();
                if (code == null)
                    throw new IOException("Http status is null");

                urlItem.setResponseCode(code);
                redirectUrlLocation = downloadItem.getRedirectLocation();
                if (redirectUrlLocation != null)
                    urlItem.setRedirectionUrl(redirectUrlLocation.toURL().toExternalForm());

                urlItem.setBacklinkCount(config.getUrlManager().countBackLinks(urlItem.getUrl()));

                if (code >= 200 && code < 300) {
                    if (!fromCache)
                        is = ClientCatalog.getCrawlCacheManager().storeCache(downloadItem);
                    else
                        is = downloadItem.getContentInputStream();
                    parseContent(is);
                } else if (code == 301) {
                    urlItem.setFetchStatus(FetchStatus.REDIR_PERM);
                } else if (code > 301 && code < 400) {
                    urlItem.setFetchStatus(FetchStatus.REDIR_TEMP);
                } else if (code >= 400 && code < 500) {
                    urlItem.setFetchStatus(FetchStatus.GONE);
                } else if (code >= 500 && code < 600) {
                    urlItem.setFetchStatus(FetchStatus.HTTP_ERROR);
                }
            } catch (FileNotFoundException e) {
                Logging.info("FileNotFound: " + urlItem.getUrl());
                urlItem.setFetchStatus(FetchStatus.GONE);
                setError("FileNotFound: " + urlItem.getUrl());
            } catch (LimitException e) {
                Logging.warn(e.toString() + " (" + urlItem.getUrl() + ")");
                urlItem.setFetchStatus(FetchStatus.SIZE_EXCEED);
                setError(e.getMessage());
            } catch (InstantiationException e) {
                Logging.error(e.getMessage(), e);
                urlItem.setParserStatus(ParserStatus.PARSER_ERROR);
                setError(e.getMessage());
            } catch (IllegalAccessException e) {
                Logging.error(e.getMessage(), e);
                urlItem.setParserStatus(ParserStatus.PARSER_ERROR);
                setError(e.getMessage());
            } catch (ClassNotFoundException e) {
                Logging.error(e.getMessage(), e);
                urlItem.setParserStatus(ParserStatus.PARSER_ERROR);
                setError(e.getMessage());
            } catch (URISyntaxException e) {
                Logging.warn(e.getMessage(), e);
                urlItem.setFetchStatus(FetchStatus.URL_ERROR);
                setError(e.getMessage());
            } catch (MalformedURLException e) {
                Logging.warn(e.getMessage(), e);
                urlItem.setFetchStatus(FetchStatus.URL_ERROR);
                setError(e.getMessage());
            } catch (IOException e) {
                Logging.error(e.getMessage(), e);
                urlItem.setFetchStatus(FetchStatus.ERROR);
                setError(e.getMessage());
            } catch (IllegalArgumentException e) {
                Logging.error(e.getMessage(), e);
                urlItem.setFetchStatus(FetchStatus.ERROR);
                setError(e.getMessage());
            } catch (Exception e) {
                Logging.error(e.getMessage(), e);
                urlItem.setFetchStatus(FetchStatus.ERROR);
                setError(e.getMessage());
            } finally {
                IOUtils.close(is);
            }
            return downloadItem;
        }
    }

    public void setError(String error) {
        this.error = error;
    }

    public String getUserAgent() {
        return userAgent;
    }

    public Parser getParser() {
        return parser;
    }

    public String getContentType() {
        if (urlItem == null)
            return null;
        return urlItem.getContentBaseType();
    }

    public StreamLimiter getStreamLimiter() {
        if (parser == null)
            return null;
        return parser.getStreamLimiter();
    }

    public String getError() {
        return error;
    }

    public UrlItem getUrlItem() {
        return urlItem;
    }

    public HostUrlList getHostUrlList() {
        return hostUrlList;
    }

    public CredentialItem getCredentialItem() {
        return credentialItem;
    }

    public IndexDocument getTargetIndexDocument(int documentPos)
            throws SearchLibException, IOException, URISyntaxException {
        if (targetIndexDocuments == null)
            getTargetIndexDocuments();
        if (targetIndexDocuments == null)
            return null;
        if (documentPos >= targetIndexDocuments.size())
            return null;
        return targetIndexDocuments.get(documentPos);
    }

    public List<IndexDocument> getTargetIndexDocuments()
            throws SearchLibException, IOException, URISyntaxException {
        synchronized (this) {
            if (targetIndexDocuments != null)
                return targetIndexDocuments;

            targetIndexDocuments = new ArrayList<IndexDocument>(0);

            if (parser == null)
                return targetIndexDocuments;

            List<ParserResultItem> results = parser.getParserResults();
            if (results == null)
                return targetIndexDocuments;

            for (ParserResultItem result : results) {
                IndexDocument targetIndexDocument = new IndexDocument(LanguageEnum.findByCode(urlItem.getLang()));

                IndexDocument urlIndexDocument = new IndexDocument();
                urlItem.populate(urlIndexDocument);
                urlFieldMap.mapIndexDocument(urlIndexDocument, targetIndexDocument);

                if (result != null)
                    result.populate(targetIndexDocument);

                IndexPluginList indexPluginList = config.getWebCrawlMaster().getIndexPluginList();

                if (indexPluginList != null) {
                    if (!indexPluginList.run((Client) config, getContentType(), getStreamLimiter(),
                            targetIndexDocument)) {
                        urlItem.setIndexStatus(IndexStatus.PLUGIN_REJECTED);
                        urlItem.populate(urlIndexDocument);
                        continue;
                    }
                }

                targetIndexDocuments.add(targetIndexDocument);
            }
            return targetIndexDocuments;
        }
    }

    final private static void addDiscoverLink(UrlManager urlManager, PatternManager inclusionManager,
            PatternManager exclusionManager, String href, Origin origin, String parentUrl, URL currentURL,
            UrlFilterItem[] urlFilterList, List<LinkItem> newUrlList) {
        if (href == null)
            return;
        try {
            URL url = currentURL != null ? LinkUtils.getLink(currentURL, href, urlFilterList, false)
                    : LinkUtils.newEncodedURL(href);

            if (exclusionManager != null)
                if (exclusionManager.matchPattern(url))
                    return;
            if (inclusionManager != null)
                if (!inclusionManager.matchPattern(url))
                    return;
            newUrlList.add(new LinkItem(url.toExternalForm(), origin, parentUrl));
        } catch (MalformedURLException e) {
            Logging.warn(href + " " + e.getMessage(), e);
        } catch (URISyntaxException e) {
            Logging.warn(href + " " + e.getMessage(), e);
        }
    }

    final private static void addDiscoverLinks(UrlManager urlManager, PatternManager inclusionManager,
            PatternManager exclusionManager, Collection<String> linkSet, Origin origin, String parentUrl,
            URL currentURL, UrlFilterItem[] urlFilterList, List<LinkItem> newUrlList)
            throws NoSuchAlgorithmException, IOException, SearchLibException {
        if (linkSet == null)
            return;
        for (String link : linkSet)
            addDiscoverLink(urlManager, inclusionManager, exclusionManager, link, origin, parentUrl, currentURL,
                    urlFilterList, newUrlList);
    }

    public List<LinkItem> getDiscoverLinks()
            throws NoSuchAlgorithmException, IOException, SearchLibException, URISyntaxException {
        synchronized (this) {
            if (discoverLinks != null)
                return discoverLinks;
            UrlManager urlManager = config.getUrlManager();
            PatternManager inclusionManager = inclusionEnabled ? config.getInclusionPatternManager() : null;
            PatternManager exclusionManager = exclusionEnabled ? config.getExclusionPatternManager() : null;
            UrlFilterItem[] urlFilterList = config.getUrlFilterList().getArray();
            String parentUrl = urlItem.getUrl();
            URL currentURL = urlItem.getURL();
            if (currentURL == null)
                return discoverLinks;
            discoverLinks = new ArrayList<LinkItem>();
            if (redirectUrlLocation != null) {
                addDiscoverLink(urlManager, inclusionManager, exclusionManager, redirectUrlLocation.toString(),
                        Origin.redirect, parentUrl, currentURL, urlFilterList, discoverLinks);
            }
            if (parser != null && urlItem.getFetchStatus() == FetchStatus.FETCHED)
                addDiscoverLinks(urlManager, inclusionManager, exclusionManager, parser.getDetectedLinks(),
                        Origin.content, parentUrl, currentURL, urlFilterList, discoverLinks);
            urlManager.removeExisting(discoverLinks);
            return discoverLinks;
        }
    }
}