com.ewcms.plugin.crawler.generate.EwcmsContentCrawler.java Source code

Introduction

Here is the source code for com.ewcms.plugin.crawler.generate.EwcmsContentCrawler.java
Source

/**
 * Copyright (c)2010-2011 Enterprise Website Content Management System(EWCMS), All rights reserved.
 * EWCMS PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
 * http://www.ewcms.com
 */

package com.ewcms.plugin.crawler.generate;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;

import org.apache.http.HttpStatus;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.ewcms.content.document.model.Article;
import com.ewcms.content.document.model.Content;
import com.ewcms.content.document.service.ArticleMainServiceable;
import com.ewcms.plugin.crawler.generate.crawler.Page;
import com.ewcms.plugin.crawler.generate.crawler.WebCrawler;
import com.ewcms.plugin.crawler.generate.url.WebURL;
import com.ewcms.plugin.crawler.manager.service.GatherServiceable;
import com.ewcms.plugin.crawler.model.Gather;
import com.ewcms.plugin.crawler.model.Storage;
import com.ewcms.plugin.crawler.util.CrawlerUtil;

/**
 * Crawler that, for every visited page, resolves a title and selects a content block with the
 * selectors configured on a {@link Gather}, and then either
 * <ul>
 * <li>records the page as a {@link Storage} entry when the gather is "local" and the page text
 * matches every configured key, or</li>
 * <li>saves the selected block as a new {@link Article} through the article service.</li>
 * </ul>
 * Collaborators and settings are read from the passing parameters in {@link #onStart()}.
 *
 * @author wu_zhijun
 */
public class EwcmsContentCrawler extends WebCrawler {

    private static final Logger logger = LoggerFactory.getLogger(EwcmsContentCrawler.class);

    /** URLs ending in one of these extensions are never visited. */
    private final static Pattern FILTERS = Pattern.compile(
            ".*(\\.(css|js|bmp|gif|jpe?g|png|tiff?|mid|mp2|mp3|mp4|wav|avi|mov|mpeg|ram|m4v|pdf|rm|smil|wmv|swf|wma|zip|rar|gz))$");

    /** Used when no keys are configured, so the key loop never sees {@code null}. */
    private static final String[] NO_KEYS = new String[0];

    private String[] crawlDomains;
    private ArticleMainServiceable articleMainService;
    private GatherServiceable gatherService;
    private Gather gather;
    private String matchRegex;
    private String filterRegex;
    /** Page extension to accept, lower-cased so it can be compared with the lower-cased URL. */
    private String htmlType;
    private boolean isLocal;
    private String[] keys = NO_KEYS;

    /**
     * Reads the crawl domains from the controller's custom data and the services, selectors and
     * {@link Gather} definition from the passing parameters ("articleMainService",
     * "gatherService", "matchRegex", "filterRegex", "gather").
     */
    @Override
    public void onStart() {
        super.onStart();
        crawlDomains = (String[]) myController.getCustomData();
        articleMainService = (ArticleMainServiceable) getPassingParameters().get("articleMainService");
        gatherService = (GatherServiceable) getPassingParameters().get("gatherService");
        matchRegex = (String) getPassingParameters().get("matchRegex");
        filterRegex = (String) getPassingParameters().get("filterRegex");
        gather = (Gather) getPassingParameters().get("gather");

        // shouldVisit() compares against a lower-cased URL, so normalise the extension the same way.
        String type = gather.getHtmlType();
        htmlType = (type == null) ? null : type.toLowerCase(Locale.ROOT);

        // A null flag is treated as "not local" instead of failing on auto-unboxing.
        isLocal = Boolean.TRUE.equals(gather.getIsLocal());

        keys = NO_KEYS;
        if (isLocal) {
            String rawKeys = gather.getKeys();
            if (rawKeys != null) {
                keys = rawKeys.split(",");
            }
        }
    }

    /**
     * Decides whether a URL should be crawled.
     *
     * @param url candidate URL
     * @return {@code true} when the URL does not end in a filtered extension, contains
     *         {@code "." + htmlType}, and starts with one of the crawl domains (all compared
     *         case-insensitively); {@code false} otherwise
     */
    @Override
    public boolean shouldVisit(WebURL url) {
        // Locale.ROOT keeps the lower-casing independent of the JVM's default locale.
        String href = url.getURL().toLowerCase(Locale.ROOT);
        if (FILTERS.matcher(href).matches()) {
            return false;
        }
        if (!href.contains("." + htmlType)) {
            return false;
        }
        if (crawlDomains == null) {
            return false;
        }
        for (String crawlDomain : crawlDomains) {
            if (crawlDomain != null && href.startsWith(crawlDomain.toLowerCase(Locale.ROOT))) {
                return true;
            }
        }
        return false;
    }

    /**
     * Processes a fetched page: downloads it again with Jsoup, resolves the title, selects the
     * content block and stores the result. Pages without a title or without any element matching
     * {@code matchRegex} (after {@code filterRegex} is excluded) are skipped. An
     * {@link IOException} raised while processing is logged and the page is dropped.
     *
     * @param page page handed over by the crawler framework
     */
    @Override
    public void visit(Page page) {
        String url = page.getWebURL().getURL();
        try {
            page.setContentType("text/html; charset=" + gather.getEncoding());
            Document doc = Jsoup.connect(url).timeout(gather.getTimeOutWait().intValue() * 1000).get();

            String title = resolveTitle(doc);
            if (title == null || title.trim().length() == 0) {
                return;
            }

            Elements elements = doc.select(matchRegex);
            if (filterRegex != null && filterRegex.trim().length() > 0) {
                elements = elements.not(filterRegex);
            }
            if (elements.isEmpty()) {
                return;
            }

            if (isLocal) {
                // Local mode only records where matching pages are; keys are tested against the
                // text of the whole page, so the extracted content is not needed here.
                if (matchesAllKeys(doc.text())) {
                    saveStorage(title, url);
                }
            } else {
                Content content = new Content();
                content.setDetail(extractContent(elements));
                content.setPage(1);
                List<Content> contents = new ArrayList<Content>();
                contents.add(content);

                Article article = new Article();
                article.setTitle(title);
                article.setContents(contents);

                articleMainService.addArticleMainByCrawler(article, gather.getChannelId(),
                        CrawlerUtil.USER_NAME);
            }
        } catch (IOException e) {
            logger.warn("fetch page {} error : {}", url, e.getLocalizedMessage());
        }
    }

    /**
     * Returns the text selected by the gather's title selector when external titles are enabled
     * and the selector yields non-empty text; otherwise the document's own title.
     */
    private String resolveTitle(Document doc) {
        String selector = gather.getTitleRegex();
        if (Boolean.TRUE.equals(gather.getTitleExternal()) && selector != null && selector.length() > 0) {
            String selected = doc.select(selector).text();
            if (selected != null && selected.length() > 0) {
                return selected;
            }
        }
        return doc.title();
    }

    /**
     * Builds the content string for the selected elements, applying the gather's "remove href"
     * and "remove HTML tag" options.
     */
    private String extractContent(Elements elements) {
        String contentText = Jsoup.parse(elements.html()).html();

        if (Boolean.TRUE.equals(gather.getRemoveHref())) {
            // NOTE(review): behaviour kept as-is. select("*").not("a").html() joins the inner HTML
            // of every non-anchor element, so anchors nested inside other elements appear to
            // survive and nested content is repeated. Confirm whether links should be unwrapped
            // or removed instead.
            contentText = Jsoup.parse(contentText).select("*").not("a").html();
        }
        if (Boolean.TRUE.equals(gather.getRemoveHtmlTag())) {
            // Strip tags from the extracted block only; previously the text of the whole page
            // was used here, which discarded the matchRegex/filterRegex selection.
            contentText = Jsoup.parse(contentText).text();
        }
        return contentText;
    }

    /**
     * Returns {@code true} when every configured key, trimmed and used as a regular expression,
     * is found in {@code text}. With no keys configured the result is {@code true}.
     */
    private boolean matchesAllKeys(String text) {
        for (String key : keys) {
            if (!Pattern.compile(key.trim()).matcher(text).find()) {
                return false;
            }
        }
        return true;
    }

    /**
     * Records a matching page through the gather service. Failures are logged and do not
     * propagate to the caller.
     */
    private void saveStorage(String title, String url) {
        Storage storage = new Storage();
        storage.setGatherId(gather.getId());
        storage.setGatherName(gather.getName());
        storage.setTitle(title);
        storage.setUrl(url);
        try {
            gatherService.addStorage(storage);
        } catch (Exception e) {
            // Pass the exception as the last argument so the cause is not lost.
            logger.error("save storage error : {}", e.getLocalizedMessage(), e);
        }
    }

    /**
     * This crawler keeps no per-thread result data.
     *
     * @return always {@code null}
     */
    @Override
    public Object getMyLocalData() {
        return null;
    }

    /**
     * Drops the references obtained in {@link #onStart()} before the crawler exits.
     */
    @Override
    public void onBeforeExit() {
        gather = null;
        matchRegex = null;
        filterRegex = null;
        articleMainService = null;
        gatherService = null;
    }

    /**
     * Logs every non-200 response: 404 as a broken link together with the parent document id,
     * anything else with its status code and description.
     */
    @Override
    protected void handlePageStatusCode(WebURL webUrl, int statusCode, String statusDescription) {
        if (statusCode == HttpStatus.SC_OK) {
            return;
        }
        if (statusCode == HttpStatus.SC_NOT_FOUND) {
            logger.info("Broken link: {} , this link was found in page with docid: {}", webUrl.getURL(),
                    webUrl.getParentDocid());
        } else {
            // Three placeholders need three arguments; an explicit Object[] is used for three or more.
            logger.info("Non success status for link: {} , status code: {} , description: {}",
                    new Object[] { webUrl.getURL(), Integer.valueOf(statusCode), statusDescription });
        }
    }
}