Java tutorial
/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.aliuge.crawler.fetcher; import java.io.File; import java.util.List; import java.util.regex.Pattern; import org.aliuge.crawler.exception.QueueException; import org.aliuge.crawler.jobconf.FetchConfig; import org.aliuge.crawler.model.KeyValue; import org.aliuge.crawler.page.Page; import org.aliuge.crawler.page.PageFetchResult; import org.aliuge.crawler.page.Parser; import org.aliuge.crawler.pendingqueue.PendingManager; import org.aliuge.crawler.pendingqueue.PendingPages; import org.aliuge.crawler.pendingqueue.PendingUrls; import org.aliuge.crawler.url.WebURL; import org.aliuge.crawler.util.BloomfilterHelper; import org.aliuge.crawler.util.UrlUtils; import org.aliuge.crawler.worker.Worker; import org.apache.commons.io.FileUtils; import org.apache.commons.lang3.StringUtils; import org.apache.http.HttpStatus; import org.apache.log4j.Logger; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import com.google.common.collect.Lists; public abstract class FetchWorker extends Worker { private static Logger log = Logger.getLogger(FetchWorker.class); protected UrlUtils urlUtils = new UrlUtils(); protected BloomfilterHelper bloomfilterHelper = BloomfilterHelper.getInstance(); /** * url */ protected PendingUrls pendingUrls = null; /** * Page */ protected PendingPages pendingPages = null; /** * ? */ protected DefaultFetcher fetcher; /** * job? */ protected FetchConfig config; /** * ? */ protected Parser parser; /** * robots */ public List<Pattern> fetchFilters = Lists.newArrayList(); public List<KeyValue<Pattern, String>> extractFilters = Lists.newArrayList(); /** * @param conf * ???setFetcherFetcher */ public FetchWorker(String jobTag, FetchConfig config) { this(jobTag, config, new DefaultFetcher(config)); } /** * @param conf * @param fetcher * ?? */ @SuppressWarnings("unchecked") public FetchWorker(String jobTag, FetchConfig config, DefaultFetcher fetcher) { super(jobTag); this.fetcher = fetcher; this.config = config; pendingUrls = PendingManager.getPendingUlr(config.getJobName()); pendingPages = PendingManager.getPendingPages(config.getJobName()); parser = new Parser(config.isFetchBinaryContent()); // List<String> urls1 = config.getFetchUrlFilters(); List<KeyValue<String, String>> urls2 = config.getExtractUrlfilters(); for (String s : urls1) { fetchFilters.add(Pattern.compile(s)); } for (KeyValue<String, String> s : urls2) { extractFilters.add(new KeyValue(Pattern.compile(s.getKey()), s.getValue())); } } public FetchWorker setFetcher(final DefaultFetcher fetcher) { this.fetcher = fetcher; return this; } /** * @desc ? */ public abstract void onSuccessed(); /** * @desc */ public abstract void onFailed(WebURL url); /** * @desc */ public abstract void onIgnored(WebURL url); /** * fetcher filter * * @param url * @return */ public boolean fetchFilter(String url) { if (null == fetchFilters || fetchFilters.size() == 0) { return true; } for (Pattern p : fetchFilters) { if (p.matcher(url).matches()) { return true; } } return false; } /** * extract filter * * @param url * @return */ public boolean extractFilter(String url) { // extractFilters??url if (null == extractFilters || extractFilters.size() == 0) { return true; } for (KeyValue<Pattern, String> p : extractFilters) { if (p.getKey().matcher(url).matches()) { return true; } } return false; } /** * ???url????url * * @param url * @return */ public String extractFilterAndChangeUrl(String url) { // extractFilters??url if (null == extractFilters || extractFilters.size() == 0) { return url; } for (KeyValue<Pattern, String> p : extractFilters) { if (p.getKey().matcher(url).matches()) { if (StringUtils.isNoneBlank(p.getValue())) { String[] pp = p.getValue().split(","); //???url? if (pp.length == 2) return url.replace(pp[0], pp[1]); } return url; } } // ??url??null??url return null; } /** * @param url * @desc */ public void fetchPage(WebURL url) { PageFetchResult result = null; try { if (null != url && StringUtils.isNotBlank(url.getUrl())) { result = fetcher.fetch(url, true); // ?? int statusCode = result.getStatusCode(); if (statusCode == CustomFetchStatus.PageTooBig) { onIgnored(url); return; } if (statusCode != HttpStatus.SC_OK) { onFailed(url); } else { Page page = new Page(url); onSuccessed(); if (!result.fetchContent(page)) { onFailed(url); return; } if (!parser.parse(page, url.getUrl())) { onFailed(url); return; } // ?? String e_url = extractFilterAndChangeUrl(url.getUrl()); if (StringUtils.isNoneBlank(e_url)) { url.setUrl(e_url); page.setWebURL(url); pendingPages.addElement(page); return; } // depth if (url.getDepth() > config.getMaxDepthOfCrawling() && config.getMaxDepthOfCrawling() != -1) { return; } // ???Url?Url Document doc = Jsoup.parse(new String(page.getContentData(), page.getContentCharset()), urlUtils.getBaseUrl(page.getWebURL().getUrl())); Elements links = doc.getElementsByTag("a"); if (!links.isEmpty()) { for (Element link : links) { String linkHref = link.absUrl("href"); // ???url if ((fetchFilter(linkHref) || extractFilter(linkHref)) && !bloomfilterHelper.exist(linkHref)) { WebURL purl = new WebURL(); purl.setName(link.text()); purl.setUrl(linkHref); purl.setDepth((short) (url.getDepth() + 1)); if (purl.getDepth() > config.getMaxDepthOfCrawling() && config.getMaxDepthOfCrawling() != -1) return; try { if (!pendingUrls.addElement(purl)) { FileUtils.writeStringToFile(new File("status/_urls.good"), url.getUrl() + "\n", true); } } catch (QueueException e) { log.error(e.getMessage()); } } } } } } } catch (QueueException e) { onFailed(url); } catch (Exception e) { e.printStackTrace(); onFailed(url); } finally { if (null != result) result.discardContentIfNotConsumed(); } } /** * ???url? * * @param url */ public void fetchPageWhitoutExtractUrl(WebURL url) { PageFetchResult result = null; try { if (null != url && StringUtils.isNotBlank(url.getUrl())) { // ?? if (fetchFilter(url.getUrl())) { // result = fetcher.fetchHeader(url); result = fetcher.fetch(url, true); // ?? int statusCode = result.getStatusCode(); if (statusCode == CustomFetchStatus.PageTooBig) { onIgnored(url); return; } if (statusCode != HttpStatus.SC_OK) { onFailed(url); } else { Page page = new Page(url); pendingUrls.processedSuccess(); if (!result.fetchContent(page)) { onFailed(url); return; } if (!parser.parse(page, url.getUrl())) { onFailed(url); return; } // ?? if (extractFilter(url.getUrl())) { pendingPages.addElement(page); } } } else { onIgnored(url); } } } catch (Exception e) { onFailed(url); } catch (QueueException e) { onFailed(url); } finally { if (null != result) result.discardContentIfNotConsumed(); } } }