com.webcrawler.MailCrawlerService.java Source code

Introduction

Here is the source code for com.webcrawler.MailCrawlerService.java
Source

//$Id: $
//$Revision: $
//$Date: $

/*
 * +=======================================================================+
 * |                                                                       |
 * |          Copyright (C) 2013-2014 Nomura Research Institute, Ltd.      |
 * |                          All Rights Reserved                          |
 * |                                                                       |
 * |    This document is the sole property of Nomura Research Institute,   |
 * |    Ltd. No part of this document may be reproduced in any form or     |
 * |    by any means - electronic, mechanical, photocopying, recording     |
 * |    or otherwise - without the prior written permission of Nomura      |
 * |    Research Institute, Ltd.                                           |
 * |                                                                       |
 * |    Unless required by applicable law or agreed to in writing,         |
 * |    software distributed under the License is distributed on an        |
 * |    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,       |
 * |    either express or implied.                                         |
 * |                                                                       |
 * +=======================================================================+
 */

package com.webcrawler;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.collections.Predicate;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Concrete implementation of the download service which will prepare absolute url address 
 * for the requested downloadable resources
 */
public class MailCrawlerService implements WebCrawlerService {
    private static final Logger log = Logger.getLogger(MailCrawlerService.class);
    private String searchToken = "";

    public MailCrawlerService(String searchText) {
        this.searchToken = searchText;
    }

    public List<String> getURLList(String startURL) throws IOException {
        return getURLList(Jsoup.connect(startURL).get(), this.searchToken);
    }

    public List<String> getURLList(Document doc, String searchToken) throws IOException {
        Elements linkElements = getLinkElements(doc, "a");
        CollectionUtils.filter(linkElements, getLinkFilterPredicate(getShouldVisitPattern(searchToken)));
        return getAbsoluteMailUrls(linkElements, searchToken);
    }

    /**
     * This method will create the filter criteria for the URL. 
     *
     * @param year the year
     * @return the should visit pattern
     */
    public String getShouldVisitPattern(String year) {
        return "(http://)?[a-zA-Z-._/]+" + year + "[0-9]{2}[.a-z/]+thread";
    }

    /**
     * Gets the regex mail url pattern.
     *
     * @param year the year
     * @return the regex mail url pattern
     */
    public String getRegexMailUrlPattern(String year) {
        return "(http://)?[a-zA-Z-._/]+" + year + "[0-9]{2}[.a-z]+/%[a-zA-Z0-9-._@%\\s]+";
    }

    /**
     * Gets the link filter predicate.
     *
     * @param shouldVisitPattern the should visit pattern
     * @return the link filter predicate
     */
    private Predicate getLinkFilterPredicate(final String shouldVisitPattern) {
        return new Predicate() {
            public boolean evaluate(Object arg0) {
                Pattern pattern = Pattern.compile(shouldVisitPattern);
                Element linkElement = (Element) arg0;
                String absoluteUrl = linkElement.attr("abs:href");
                Matcher matcher = pattern.matcher(absoluteUrl);
                if (matcher.find()) {
                    if (MailCrawlerService.log.isDebugEnabled()) {
                        MailCrawlerService.log.debug("Should be visited: " + absoluteUrl);
                    }
                    return true;
                }
                if (MailCrawlerService.log.isDebugEnabled()) {
                    MailCrawlerService.log.debug("Should not be visited: " + absoluteUrl);
                }
                return false;
            }
        };
    }

    /**
     * Gets the absolute mail urls.
     *
     * @param linkElements the link elements
     * @param searchToken the search token
     * @return the absolute mail urls
     * @throws IOException Signals that an I/O exception has occurred.
     */
    private List<String> getAbsoluteMailUrls(Elements linkElements, String searchToken) throws IOException {
        List<String> absoluteURLList = new ArrayList<String>();
        List<Element> relativeURLList = new ArrayList<Element>();
        for (Element linkElement : linkElements) {
            String absouleUrl = linkElement.attr("abs:href");
            Elements anchorElements = getLinkElements(Jsoup.connect(absouleUrl).get(), "a");
            CollectionUtils.select(anchorElements, getLinkFilterPredicate(getRegexMailUrlPattern(searchToken)),
                    relativeURLList);
        }
        for (Element element : relativeURLList) {
            absoluteURLList.add(element.attr("abs:href"));
        }
        if (log.isDebugEnabled()) {
            log.debug("Absolute URL List: " + absoluteURLList.toString());
        }
        return absoluteURLList;
    }

    /**
     * Gets the link elements.
     *
     * @param doc the doc
     * @param tagSelector the tag selector
     * @return the link elements
     */
    private Elements getLinkElements(Document doc, String tagSelector) {
        return doc.select(tagSelector);
    }
}