Java tutorial
/* * Asqatasun - Automated webpage assessment * Copyright (C) 2008-2015 Asqatasun.org * * This file is part of Asqatasun. * * Asqatasun is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * * Contact us by mail: asqatasun AT asqatasun DOT org */ package org.asqatasun.crawler; import java.io.IOException; import java.net.URI; import java.net.URISyntaxException; import java.util.*; import java.util.regex.Pattern; import org.apache.commons.lang3.StringUtils; import org.apache.log4j.Logger; import org.archive.io.GzipHeader; import org.archive.io.RecordingInputStream; import org.archive.modules.CrawlURI; import org.archive.modules.deciderules.MatchesFilePatternDecideRule; import org.jsoup.Jsoup; import org.jsoup.select.Elements; import org.asqatasun.crawler.framework.AsqatasunCrawlJob; import org.asqatasun.crawler.util.CrawlUtils; import org.asqatasun.entity.audit.*; import org.asqatasun.entity.parameterization.Parameter; import org.asqatasun.entity.service.audit.ContentDataService; import org.asqatasun.entity.service.subject.WebResourceDataService; import org.asqatasun.entity.subject.Page; import org.asqatasun.entity.subject.Site; import org.asqatasun.entity.subject.WebResource; /** * * @author jkowalczyk */ public class CrawlerImpl implements Crawler, ContentWriter { private static final Logger LOGGER = Logger.getLogger(CrawlerImpl.class); private static final String OPEN_HTML_TAG = "<html"; private static final String END_HTML_TAG = "</html>"; private static final int RETRIEVE_WINDOW = 1000; private static final int REL_CANONICAL_PAGE_FAKE_HTTP_STATUS = 900; private static final String REL_CANONICAL_CSS_LIKE_QUERY = "head link[rel=canonical][href]"; private static final String BASE_CSS_LIKE_QUERY = "head base[href]"; private static final String UNREACHABLE_RESOURCE_STR = "Unreachable resource "; private static final String HERITRIX_SITE_FILE_NAME = "asqatasun-crawler-beans-site.xml"; private static final String HERITRIX_PAGE_FILE_NAME = "asqatasun-crawler-beans-page.xml"; /** * This boolean is used to determine whether a page has already been fetched * in case of 1 page audit. */ private boolean isPageAlreadyFetched = false; /** * This webResource is the main webResource of the audit. In case of site * audit, this webresource is the Site instance from which all the Page instances * are linked. */ private WebResource mainWebResource; private SSP lastFetchedSSP; private AsqatasunCrawlJob crawlJob; private final Set<Long> relatedContentSetTemp = new HashSet<>(); private Pattern cssFilePattern = null; public Pattern getCssFilePattern() { if (cssFilePattern == null && crawlJob != null) { cssFilePattern = crawlJob.getCssFilePattern(); } return cssFilePattern; } private Pattern htmlFilePattern = null; public Pattern getHtmlFilePattern() { if (htmlFilePattern == null && crawlJob != null) { htmlFilePattern = crawlJob.getHtmlFilePattern(); } return htmlFilePattern; } private ContentDataService contentDataService; public ContentDataService getContentDataService() { return contentDataService; } @Override public void setContentDataService(ContentDataService contentDataService) { this.contentDataService = contentDataService; } private WebResourceDataService webResourceDataService; public WebResourceDataService getWebResourceDataService() { return webResourceDataService; } @Override public void setWebResourceDataService(WebResourceDataService webResourceDataService) { this.webResourceDataService = webResourceDataService; } private String crawlConfigFilePath = null; public String getCrawlConfigFilePath() { return this.crawlConfigFilePath; } @Override public void setCrawlConfigFilePath(String crawlConfigFilePath) { this.crawlConfigFilePath = crawlConfigFilePath; } private String outputDir = System.getProperty("user.dir") + "/output"; // private String outputDir = System.getenv("PWD") + "/output"; public String getOutputDir() { return this.outputDir; } @Override public void setOutputDir(String outputDir) { this.outputDir = outputDir; } private Set<Parameter> paramSet = null; @Override public Set<Parameter> getParameterSet() { return paramSet; } @Override public void setParameterSet(Set<Parameter> paramSet) { if (this.paramSet == null) { this.paramSet = paramSet; } else { this.paramSet.addAll(paramSet); } } private boolean persistOnTheFly = true; public boolean isPersistOnTheFly() { return persistOnTheFly; } @Override public void setPersistOnTheFly(boolean persistOnTheFly) { this.persistOnTheFly = persistOnTheFly; } private final GzipHeader gzipHeader = new GzipHeader(); public GzipHeader getGzipHeader() { return gzipHeader; } /** * List of referential with no exclusion for rel canonical page */ private final List<String> keepRelCanonicalRefList = new ArrayList<>(); public void setKeepRelCanonicalRefList(List<String> keepRelCanonicalRefList) { this.keepRelCanonicalRefList.addAll(keepRelCanonicalRefList); } private boolean excludeRelCanonical = true; int pageRankCounter = 1; // a counter to determine the rank a page is fetched public CrawlerImpl() { super(); } public String getSiteURL() { return mainWebResource.getURL(); } @Override public void setSiteURL(String siteURL) { mainWebResource = webResourceDataService.createSite(siteURL); mainWebResource = webResourceDataService.saveOrUpdate(mainWebResource); Collection<String> urlList = new ArrayList<>(); urlList.add(siteURL); this.crawlJob = new AsqatasunCrawlJob(urlList, HERITRIX_SITE_FILE_NAME, getOutputDir(), getCrawlConfigFilePath(), paramSet); if (crawlJob.isLaunchable()) { crawlJob.checkXML(); } } /** * * @param siteName * @param siteURL */ @Override public void setSiteURL(String siteName, Collection<String> siteURL) { mainWebResource = webResourceDataService.createSite(siteName); mainWebResource = webResourceDataService.saveOrUpdate(mainWebResource); this.crawlJob = new AsqatasunCrawlJob(siteURL, HERITRIX_PAGE_FILE_NAME, outputDir, crawlConfigFilePath, paramSet); if (crawlJob.isLaunchable()) { crawlJob.checkXML(); } } /** * * @param pageURL */ @Override public void setPageURL(String pageURL) { mainWebResource = webResourceDataService.createPage(pageURL); mainWebResource = webResourceDataService.saveOrUpdate(mainWebResource); Collection<String> urlList = new ArrayList<>(); urlList.add(pageURL); this.crawlJob = new AsqatasunCrawlJob(urlList, HERITRIX_PAGE_FILE_NAME, outputDir, crawlConfigFilePath, paramSet); if (crawlJob.isLaunchable()) { crawlJob.checkXML(); } isPageAlreadyFetched = false; } @Override public WebResource getResult() { crawlJob = null; cssFilePattern = null; htmlFilePattern = null; isPageAlreadyFetched = false; return mainWebResource; } @Override public void run() { updateExcludeRelCanonicalRegardingRef(); pageRankCounter = 1; this.crawlJob.setContentWriter(this); this.crawlJob.launchCrawlJob(); removeOrphanContent(); } @Override public void computeAndPersistSuccessfullFetchedResource(CrawlURI curi, RecordingInputStream recis) throws IOException { LOGGER.debug("Writing " + curi.getURI() + " : " + curi.getFetchStatus() + " " + curi.getContentType() + " " + curi.getFetchDuration() + "ms"); if (curi.getContentType().contains(ContentType.html.getType()) && !curi.getURI().contains("robots.txt")) { saveHtmlContent(curi, recis); } else if (curi.getContentType().contains(ContentType.unknown.getType())) { lastChanceToQualifyUnknownContent(curi, recis); } else if (curi.getContentType().contains(ContentType.css.getType())) { LOGGER.debug("Found css " + curi.getURI() + " last fetched ssp " + lastFetchedSSP.getURI()); boolean compressed = gzipHeader.testGzipMagic(recis.getMessageBodyReplayInputStream()); String cssCode; if (compressed) { cssCode = ""; } else { cssCode = CrawlUtils.convertSourceCodeIntoUtf8(recis, extractCharset(curi, recis)).trim(); } saveStylesheetFromFetchedCss(curi, cssCode); } else if (curi.getContentType().contains(ContentType.img.getType())) { LOGGER.debug("Found Image" + curi.getURI()); byte[] rawImage = CrawlUtils.getImageContent(recis.getMessageBodyReplayInputStream(), CrawlUtils.getImageExtension(curi.getURI())); saveRawImageFromFetchedImage(curi, rawImage); } else { LOGGER.debug("Trashed content " + curi.getURI() + " of type " + curi.getContentType()); // do nothing, we ignore the fetched content when we cannot // categorize it } } /** * * @param curi * @param recis * @throws IOException */ private void saveHtmlContent(CrawlURI curi, RecordingInputStream recis) throws IOException { String charset = extractCharset(curi, recis); LOGGER.debug("Found Html " + curi.getURI() + " with charset " + charset); lastFetchedSSP = saveWebResourceFromFetchedPage(curi.getURI(), charset, curi.getFetchStatus(), CrawlUtils.convertSourceCodeIntoUtf8(recis, charset).trim(), true); } @Override public void computeAndPersistUnsuccessfullFetchedResource(CrawlURI curi) { ContentType resourceContentType = getContentTypeFromUnreacheableResource(curi.getCanonicalString()); switch (resourceContentType) { case misc: case html: LOGGER.debug(UNREACHABLE_RESOURCE_STR + curi.getURI() + " : " + curi.getFetchStatus()); saveWebResourceFromFetchedPage(curi.getURI(), null, curi.getFetchStatus(), null, false); break; case css: LOGGER.debug(UNREACHABLE_RESOURCE_STR + curi.getURI() + " : " + curi.getFetchStatus()); saveStylesheetFromFetchedCss(curi, null); break; case img: LOGGER.debug(UNREACHABLE_RESOURCE_STR + curi.getURI() + " : " + curi.getFetchStatus()); saveRawImageFromFetchedImage(curi, null); break; default: LOGGER.debug( "UNKNOWN_CONTENT" + UNREACHABLE_RESOURCE_STR + curi.getURI() + " : " + curi.getFetchStatus()); break; } } /** * * @param curi * @param charset * @param fetchStatus * @param sourceCode * @param successfullFetch * @return */ private SSP saveWebResourceFromFetchedPage(String uri, String charset, int fetchStatus, String sourceCode, boolean successfullFetch) { Page page; if (mainWebResource instanceof Page) { if (!isPageAlreadyFetched) { page = (Page) mainWebResource; // in case of redirection, we modify the URI of the webresource // to ensure the webresource and its SSP have the same URI. page.setURL(uri); if (successfullFetch) { isPageAlreadyFetched = true; SSP ssp = createSSPFromPage(uri, charset, page, sourceCode); if (persistOnTheFly) { persistSSP(ssp, uri, fetchStatus, page); } return ssp; } else { return lastFetchedSSP; } } else { // in case of one page audit, when a SSP have already been fetched // we don't create SSP anymore. return lastFetchedSSP; } } else { page = webResourceDataService.createPage(uri); page.setParent((Site) mainWebResource); page.setRank(pageRankCounter); pageRankCounter++; SSP ssp = createSSPFromPage(uri, charset, page, sourceCode); if (persistOnTheFly) { persistSSP(ssp, uri, fetchStatus, page); } return ssp; } } /** * * @param curi * @param charset * @param page * @param sourceCode * @return */ private SSP createSSPFromPage(String uri, String charset, Page page, String sourceCode) { SSP ssp = contentDataService.getSSP(uri); ssp.setPage(page); ssp.setCharset(charset); ssp.setSource(sourceCode); return ssp; } /** * * @param ssp * @param uri * @param httpStatusCode * @param page */ private void persistSSP(SSP ssp, String uri, int httpStatusCode, Page page) { webResourceDataService.saveOrUpdate(page); saveAndPersistFetchDataToContent(ssp, uri, httpStatusCode); } /** * * @param curi * @param charset * @param cssCode */ private void saveStylesheetFromFetchedCss(CrawlURI curi, String cssCode) { StylesheetContent newCssContent = contentDataService.getStylesheetContent(null, curi.getURI(), null, cssCode, curi.getFetchStatus()); // A relatedContent has to be linked to a SSP. // At this step, we don't know the relation between // SSP and relatedContent but we have to link this relatedContent to any // (the last) ssp to associate this relatedContent with the current // crawl StylesheetContent returnedCssContent = (StylesheetContent) saveAndPersistFetchDataToContent( (Content) newCssContent, curi.getURI(), curi.getFetchStatus()); persistContentRelationShip(lastFetchedSSP, returnedCssContent); } /** * * @param curi * @param rawImage */ private void saveRawImageFromFetchedImage(CrawlURI curi, byte[] rawImage) { ImageContent newImgContent = contentDataService.getImageContent(null, curi.getURI(), null, rawImage, curi.getFetchStatus()); ImageContent returnedImgContent = (ImageContent) saveAndPersistFetchDataToContent(newImgContent, curi.getURI(), curi.getFetchStatus()); persistContentRelationShip(lastFetchedSSP, returnedImgContent); } /** * This methods enables to get the type of resource from its uri. * In case of unreachable resource (404/403 errors), the return content is * a html page. So we can't use the content type of the returned page to * determine the type of the content we try to reach. In this case, we use * the uri extension, based-on regular expressions. * @param uri * @return */ private ContentType getContentTypeFromUnreacheableResource(String uri) { if (MatchesFilePatternDecideRule.Preset.IMAGES.getPattern().matcher(uri).matches()) { return ContentType.img; } else if (getHtmlFilePattern().matcher(uri).matches()) { return ContentType.html; } else if (getCssFilePattern().matcher(uri).matches()) { return ContentType.css; } return ContentType.misc; } // Bug #154 fix /** * Some resources may have been downloaded by the crawler component but they * are not linked with any webresource. They have to be removed from the * contentList. */ @SuppressWarnings("element-type-mismatch") private void removeOrphanContent() { List<Content> emptyContentSet; Integer nbOfContent = contentDataService.getNumberOfOrphanRelatedContent(mainWebResource).intValue(); Integer i = 0; Logger.getLogger(CrawlerImpl.class.getName()) .debug("remove Orphan related contents " + nbOfContent + " elements"); while (i.compareTo(nbOfContent) < 0) { emptyContentSet = contentDataService.getOrphanRelatedContentList(mainWebResource, 0, RETRIEVE_WINDOW); for (Content content : emptyContentSet) { Logger.getLogger(CrawlerImpl.class.getName()).debug("Removing " + content.getURI()); contentDataService.delete(content.getId()); } i = i + RETRIEVE_WINDOW; } nbOfContent = contentDataService.getNumberOfOrphanContent(mainWebResource).intValue(); i = 0; Logger.getLogger(CrawlerImpl.class.getName()).debug("remove Orphan SSPs " + nbOfContent + " elements"); while (i.compareTo(nbOfContent) < 0) { emptyContentSet = contentDataService.getOrphanContentList(mainWebResource, i, RETRIEVE_WINDOW); for (Content content : emptyContentSet) { contentDataService.delete(content.getId()); } i = i + RETRIEVE_WINDOW; } } /** * This methods add the fetch date and the fetch status to a content and * persist it * * @param content * @param curi */ private Content saveAndPersistFetchDataToContent(Content content, String uri, int status) { // Waiting for a better implementation, we parse here the html content // to detect the presence of the rel=canonical property. // If true, the HttpStatusCode is set arbitrarely to 900 and thus the // page won't be tested while processing if (isRelCanonicalPage(content)) { LOGGER.info("Fetching page with rel canonical " + uri + ". Set Http status to 900"); content.setHttpStatusCode(REL_CANONICAL_PAGE_FAKE_HTTP_STATUS); } else { content.setHttpStatusCode(status); } content.setDateOfLoading(Calendar.getInstance().getTime()); if (persistOnTheFly) { content = contentDataService.saveOrUpdate(content); } return content; } /** * This method update the relcanonical boolean regarding the referential. * For a11y refs, the boolean has to be set to false. */ private void updateExcludeRelCanonicalRegardingRef() { for (Parameter param : paramSet) { if (param.getParameterElement().getParameterElementCode().equals("LEVEL")) { String level = param.getValue().split(";")[0]; if (keepRelCanonicalRefList.contains(level)) { LOGGER.info("Rel canonical pages are kept for ref " + level); excludeRelCanonical = false; } else { LOGGER.info("Rel canonical pages are excluded for ref " + level); } break; } } } /** * Waiting for a better implementation, we parse here the html content * to detect the presence of the rel=canonical property. * @param content * @return whether the current page defines a rel canonical Url and whether * this url is different from the current url. */ public final boolean isRelCanonicalPage(Content content) { // @TODO make this implementation cleaner if (!excludeRelCanonical) { return false; } if (!(content instanceof SSP)) { return false; } if (StringUtils.isBlank(((SSP) content).getSource())) { return false; } Elements relCanonical = Jsoup.parse(((SSP) content).getSource()).select(REL_CANONICAL_CSS_LIKE_QUERY); if (relCanonical.isEmpty() || relCanonical.size() > 1) { return false; } // At this step, we are sure that the rel canonical is defined and // is unique String href = relCanonical.first().attr("href"); if (href.equals(".")) { return false; } if (href.endsWith("/")) { href = href.substring(0, href.length() - 1); } if (href.startsWith("/")) { Elements base = Jsoup.parse(((SSP) content).getSource()).select(BASE_CSS_LIKE_QUERY); if (!base.isEmpty()) { if (StringUtils.endsWith(base.first().attr("href"), "/")) { href = StringUtils.join(base.first().attr("href"), href.substring(1)); } else { href = StringUtils.join(base.first().attr("href") + href); } LOGGER.debug("(BASE CASE) The concat href " + href); } else { URI contractUri; try { contractUri = new URI(content.getURI()); href = StringUtils.join(contractUri.getScheme(), "://", contractUri.getHost(), href); LOGGER.debug("(NORMAL CASE) The concat href " + href); } catch (URISyntaxException ex) { LOGGER.error("Error when creating uri object with url " + content.getURI()); } } } if (href.contains("//")) { href = href.substring(href.indexOf("//") + 2); } String currentUrl = content.getURI(); if (currentUrl.endsWith("/")) { currentUrl = currentUrl.substring(0, currentUrl.length() - 1); } if (currentUrl.contains("//")) { currentUrl = currentUrl.substring(currentUrl.indexOf("//") + 2); } if (currentUrl.equals(href)) { LOGGER.info("rel canonical present but points to itself " + content.getURI()); return false; } return true; } /** * * @param ssp * @param relatedContent */ private void persistContentRelationShip(SSP ssp, RelatedContent relatedContent) { if (!persistOnTheFly) { ssp.addRelatedContent(relatedContent); return; } relatedContentSetTemp.clear(); relatedContentSetTemp.add(((Content) relatedContent).getId()); contentDataService.saveContentRelationShip(ssp, relatedContentSetTemp); } /** * * @param curi * @param recis * @return * @throws IOException */ private String extractCharset(CrawlURI curi, RecordingInputStream recis) throws IOException { if (curi.getContentType().contains("=")) { return curi.getContentType().substring(curi.getContentType().indexOf("=") + 1); } else { return CrawlUtils.extractCharset(recis.getMessageBodyReplayInputStream()); } } /** * Heritrix may return content with an unknow content-type. This content * may be Html and that's what we try to detect here. * The raw content is first converted into UTF-8 and then we search the * $lt;html and $lt;/html$gt;. If present, we deduce it is html. If not, * we do nothing, the content is trashed. * @param curi * @param recis */ private void lastChanceToQualifyUnknownContent(CrawlURI curi, RecordingInputStream recis) throws IOException { String charset = extractCharset(curi, recis); try { String data = CrawlUtils.convertSourceCodeIntoUtf8(recis, charset).trim(); if (StringUtils.containsIgnoreCase(data, OPEN_HTML_TAG) && StringUtils.containsIgnoreCase(data, END_HTML_TAG)) { saveHtmlContent(curi, recis); } } catch (Exception e) { LOGGER.debug("Exception caught when trying to convert unknow Content " + curi.getURI() + " into UTF-8. We deduce this content is not of html type"); } } }