Java tutorial
/* * Copyright (c) NASK, NCSC * * This file is part of HoneySpider Network 2.1. * * This is a free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package pl.nask.hsn2.service.urlfollower; import java.io.IOException; import java.io.InputStream; import java.net.MalformedURLException; import java.net.SocketTimeoutException; import java.net.URISyntaxException; import java.net.URL; import java.net.UnknownHostException; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; import java.util.concurrent.Callable; import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.httpclient.ConnectTimeoutException; import org.apache.commons.httpclient.HttpStatus; import org.apache.commons.httpclient.URIException; import org.apache.commons.httpclient.util.URIUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import pl.nask.hsn2.ContextSizeLimitExceededException; import pl.nask.hsn2.ParameterException; import pl.nask.hsn2.RequiredParameterMissingException; import pl.nask.hsn2.ResourceException; import pl.nask.hsn2.StorageException; import pl.nask.hsn2.bus.api.TimeoutException; import 
pl.nask.hsn2.service.ServiceData; import pl.nask.hsn2.service.ServiceParameters; import pl.nask.hsn2.service.task.NewWebClientUrlObject; import pl.nask.hsn2.service.task.WebClientTaskContext; import pl.nask.hsn2.service.urlfollower.ScriptInterceptor.ScriptElement; import pl.nask.hsn2.wrappers.CookieWrapper; import pl.nask.hsn2.wrappers.RequestWrapper; import com.gargoylesoftware.htmlunit.BrowserVersion; import com.gargoylesoftware.htmlunit.DefaultCredentialsProvider; import com.gargoylesoftware.htmlunit.HttpWebConnection; import com.gargoylesoftware.htmlunit.Page; import com.gargoylesoftware.htmlunit.TextPage; import com.gargoylesoftware.htmlunit.WebClient; import com.gargoylesoftware.htmlunit.WebRequest; import com.gargoylesoftware.htmlunit.html.DomElement; import com.gargoylesoftware.htmlunit.html.HtmlElement; import com.gargoylesoftware.htmlunit.html.HtmlFrame; import com.gargoylesoftware.htmlunit.html.HtmlInlineFrame; import com.gargoylesoftware.htmlunit.html.HtmlPage; import com.gargoylesoftware.htmlunit.util.Cookie; import com.gargoylesoftware.htmlunit.util.UrlUtils; public class WebClientWorker implements Runnable { private static final int ONE_SECOND_IN_MILISECONDS = 1000; private static final String HTML_STRING = "html"; private static final Logger LOGGER = LoggerFactory.getLogger(WebClientWorker.class); private static final String URL_ORIGINAL_STRING = "url_original"; private static final String HREF_STRING = "href"; private static final String SRC_STRING = "src"; private static final int TERMINATION_TIMEOUT = 500; private final ScriptInterceptor scriptInterceptor; private WebClient wc; private final CountDownLatch latch; private final HtmlUnitFollower workerDispatcher; private ServiceParameters taskParams; private Map<Page, ProcessedPage> previousTopPageMap = new HashMap<Page, ProcessedPage>(); private Map<Page, ProcessedPage> previousFramePageMap = new HashMap<Page, ProcessedPage>(); private WebClientTaskContext ctx; private volatile boolean 
interruptProcessing; private Set<CookieWrapper> cookieWrappers; private Set<String> processedSubPages = new HashSet<>(); public WebClientWorker(HtmlUnitFollower dispatcher, CountDownLatch l, WebClientTaskContext ctx, ServiceParameters taskParams) { if (taskParams == null) { throw new IllegalArgumentException("ServiceParameters cannot be null"); } if (l == null) { throw new IllegalArgumentException("CountDownLatch cannot be null"); } if (dispatcher == null) { throw new IllegalArgumentException("HtmlUnitFollower cannot be null"); } latch = l; workerDispatcher = dispatcher; scriptInterceptor = new ScriptInterceptor(taskParams); this.taskParams = taskParams; this.ctx = ctx; } private void initializeWebClient() { String proxy = null; ProxyParamsWrapper proxyParams = null; if (ctx != null && ctx.getCurrentContextServiceData() != null) { proxy = ctx.getCurrentContextServiceData().getProxyUri(); } if (proxy == null || proxy.trim().isEmpty()) { wc = new WebClient(getBrowserVersion()); } else { proxyParams = new ProxyParamsWrapper(proxy); if (proxyParams.isProxy()) { wc = new WebClient(getBrowserVersion(), proxyParams.getHost(), proxyParams.getPort()); if (proxyParams.isSocksProxy()) { wc.getOptions().getProxyConfig().setSocksProxy(true); } if (proxyParams.hasUserCredentials()) { DefaultCredentialsProvider dc = (DefaultCredentialsProvider) wc.getCredentialsProvider(); dc.addCredentials(proxyParams.getUserName(), proxyParams.getUserPswd(), proxyParams.getHost(), proxyParams.getPort(), null); } } else { LOGGER.warn("Incorrect proxy params: {}.proxy disabled.", proxy); wc = new WebClient(getBrowserVersion()); } } // http errors and script errors are not considered an error here wc.getOptions().setRedirectEnabled(false); // don't process activeX! 
wc.getOptions().setActiveXNative(false); wc.getOptions().setJavaScriptEnabled(taskParams.getJsEnable()); wc.getOptions().setHomePage("http://unknown.unknown/"); wc.getOptions().setTimeout(taskParams.getPageTimeoutMillis()); wc.setJavaScriptTimeout(taskParams.getSingleJsTimeoutMillis()); wc.getOptions().setThrowExceptionOnFailingStatusCode(false); // disable script errors wc.getOptions().setThrowExceptionOnScriptError(false); wc.getJavaScriptEngine().getContextFactory().setDebugger(scriptInterceptor); wc.setRefreshHandler( new MetaRedirectHandler(taskParams.getPageTimeoutMillis(), taskParams.getRedirectDepthLimit())); wc.setJavaScriptErrorListener(new JsScriptErrorListener()); wc.addWebWindowListener(new WebWindowListenerImpl(previousTopPageMap, previousFramePageMap)); initializeCookies(); LOGGER.info( "Initialized WebClientWorker with options: [{}, JsEnabled={}, ActiveXNative={},processing_timeout={},page_timeout={},proxy:{}] ", new Object[] { taskParams.getProfile(), wc.getOptions().isJavaScriptEnabled(), wc.getOptions().isActiveXNative(), taskParams.getProcessingTimeout(), taskParams.getPageTimeoutMillis(), proxyParams, }); } private void initializeCookies() { if (cookieWrappers != null) { for (CookieWrapper cookieWrapper : cookieWrappers) { Map<String, String> attributes = cookieWrapper.getAttributes(); Cookie cookie = new Cookie(attributes.get(CookieAttributes.DOMAIN.getName()), cookieWrapper.getName(), cookieWrapper.getValue(), attributes.get(CookieAttributes.PATH.getName()), null, Boolean.valueOf(attributes.get(CookieAttributes.IS_SECURE.getName()))); wc.getCookieManager().addCookie(cookie); } } } /** * Returns browser version according to browser profile set in service parameters.<br> * <br> * Below you can find list of currently supported browsers. 
* <ul> * <li>Internet Explorer 6</li> * <li>Internet Explorer 7</li> * <li>Internet Explorer 8</li> * <li>Firefox 3</li> * <li>Firefox 3.6 - default, if browser name in parameter does not match any of listed names</li> * <li>Firefox 10</li> * <li>Chrome 16</li> * </ul> * * @return Browser version. */ @SuppressWarnings("deprecation") private BrowserVersion getBrowserVersion() { String profileName = "";//taskParams.getProfile(); switch (profileName) { case "Internet Explorer 6": LOGGER.warn("requested deprecated browser version:{}", profileName); return BrowserVersion.INTERNET_EXPLORER_6; case "Internet Explorer 7": LOGGER.warn("requested deprecated browser version:{}", profileName); return BrowserVersion.INTERNET_EXPLORER_7; case "Internet Explorer 8": return BrowserVersion.INTERNET_EXPLORER_8; case "Internet Explorer": //fall through case "Internet Explorer 9": return BrowserVersion.INTERNET_EXPLORER_9; case "Firefox 3.6": LOGGER.warn("requested deprecated browser version:{}", profileName); return BrowserVersion.FIREFOX_3_6; case "Firefox 10": LOGGER.warn("requested deprecated browser version:{}", profileName); return BrowserVersion.FIREFOX_10; case "Firefox": //fall through case "Firefox 17": return BrowserVersion.FIREFOX_17; case "Chrome 16": LOGGER.warn("requested deprecated browser version:{}", profileName); return BrowserVersion.CHROME_16; case "Chrome": return BrowserVersion.CHROME; default: LOGGER.warn("Browser profile '{}' not supported. 
Using default Firefox 3.6 instead.", profileName); return BrowserVersion.INTERNET_EXPLORER_9;//FIREFOX_3_6; } } @Override public final void run() { String workerUrl = workerDispatcher.getUrlForProcessing(); try { initializeWebClient(); processTheUrl(workerUrl); } catch (ConnectTimeoutException e) { LOGGER.warn("Connection timeout for URL '{}'", workerUrl); LOGGER.debug(e.getMessage(), e); workerDispatcher.requestFailed(e); } catch (org.apache.http.conn.ConnectTimeoutException e) { LOGGER.warn("Connection timeout: {}", e.getMessage()); LOGGER.debug("Connection timeout stacktrace: {}", e); workerDispatcher.requestFailed(e); } catch (SocketTimeoutException e) { LOGGER.warn("Socket timeout for URL '{}'", workerUrl); LOGGER.debug(e.getMessage(), e); workerDispatcher.requestFailed(e); } catch (UnknownHostException e) { LOGGER.warn("Unknown host: {}", e.getMessage()); LOGGER.debug("Unknown host stacktrace: {}", e); workerDispatcher.requestFailed("Unknown host: " + e.getMessage()); } catch (IOException e) { LOGGER.warn("IOException: {}", e.getMessage()); LOGGER.debug("IOException for URL '{}' with stacktrace: {}", workerUrl, e); workerDispatcher.requestFailed(e); } catch (TimeoutException e) { LOGGER.warn(e.getMessage()); LOGGER.debug(e.getMessage(), e); workerDispatcher.requestFailed(e); } catch (Exception e) { LOGGER.error("Exception for URL '{}'", workerUrl, e); workerDispatcher.requestFailed(e); } finally { closeAllWindows(); latch.countDown(); } } public final void stopProcessing() { interruptProcessing = true; LOGGER.debug("Setting: 'stop processing'"); stopJavaScripts(); } private void processTheUrl(String url) throws IOException, ParameterException, ResourceException, StorageException, BreakingChainException, ExecutionException, TimeoutException { if (interruptProcessing) { LOGGER.debug("Time limit exceeded. 
{} won't be processed", url); // it's thrown in getInsecurePagesChain() so might be omitted here throw new TimeoutException("Timeout, stopping processing:" + url); } LOGGER.debug("Gathering page {}", url); long startTime = System.currentTimeMillis(); ProcessedPage rootPage = null; try { rootPage = getInsecurePagesChain(url); workerDispatcher.setPage(rootPage); } catch (IOException e) { LOGGER.warn("Exception while gathering page: '{}'", url); LOGGER.debug("Exception while gathering page (stacktrace):", e); throw e; } processPage(rootPage); long pageProcessedTime = System.currentTimeMillis(); LOGGER.debug("Processing of {} took {} ms. ", url, pageProcessedTime - startTime); } private void processPage(ProcessedPage processedPage) throws IOException, ParameterException, ResourceException, StorageException { String reasonFailed = ""; try { int i = wc.waitForBackgroundJavaScript(taskParams.getBackgroundJsTimeoutMillis()); if (i > 0) { LOGGER.warn("There are still {} javascripts runnig in background", i); } restartJavaScript(); long pageGatheredTime = System.currentTimeMillis(); if (processedPage.getClientSideRedirectPage() != null) { processClientRedirectSubPage(processedPage.getClientSideRedirectPage()); handlePage(processedPage); } else if (processedPage.getServerSideRedirectLocation() != null) { processServerRedirectSubPage(processedPage); } else { handlePage(processedPage); } long pageProcessedTime = System.currentTimeMillis(); LOGGER.debug("Inspecting of {} took {} ms. ", processedPage.getRequestedUrl(), pageProcessedTime - pageGatheredTime); } catch (BreakingChainException e) { reasonFailed = "Error when processing " + processedPage.getActualUrl() + "(requested: " + processedPage.getRequestedUrl() + "). 
Some data may be lost!"; LOGGER.error(reasonFailed, e); } finally { if (processedPage != null) { addRequiredAttributesToCurrentContext(processedPage, reasonFailed); processedPage.cleanPage(); } } } public final void stopJavaScripts() { wc.getOptions().setJavaScriptEnabled(false); wc.getJavaScriptEngine().shutdownJavaScriptExecutor(); JsScriptDebugFrame.resetCounter(); LOGGER.debug("JavaScript was stopped."); } private void restartJavaScript() { stopJavaScripts(); wc.getOptions().setJavaScriptEnabled(true); LOGGER.debug("JavaScript was restarted."); } private void handlePage(ProcessedPage processedPage) throws IOException, ParameterException, ResourceException, StorageException { if (processedPage.isHtml()) { LOGGER.debug("Got HTML page, processing. (url={})", processedPage.getRequestedUrl()); handleHtmlPage((HtmlPage) processedPage.getPage()); } else if (processedPage.getPage() instanceof TextPage) { handleTextPage(); } else { LOGGER.warn("Unsupported page type ({}) wile parsing URL ({})", new Object[] { processedPage.getPage().getWebResponse().getContentType(), processedPage.getPage().getWebResponse().getWebRequest().getUrl().toExternalForm() }); } } private void processFramesSubPage(ProcessedPage subPage, String subPageUrl, WebClientOrigin origin) throws IOException, ParameterException, ResourceException, StorageException { boolean processingSubPage = false; try { String oldBaseUrl = getPageLinksForCurrentContext().getBaseUrl(); String newSubPageUrl = null; if (subPage == null) { // HtmlUnit shows about:blank frame page when it can't // follow it (i.e. when it is ftp:// or another protocol). String reasonFailed = ""; if (!subPageUrl.isEmpty()) { // Frame has not been followed, so we have to use url from // source, not from page. // In most cases that means protocol was not supported. 
try { Link frameUrlFromSource = new Link(oldBaseUrl, subPageUrl); newSubPageUrl = frameUrlFromSource.getAbsoluteUrl(); reasonFailed = "Unable to follow url from " + origin.getName(); } catch (URISyntaxException e) { // This is unlikely to happen. LOGGER.debug("Can't create Link for subpage: " + e.getMessage(), e); reasonFailed = "Can't create Link for " + origin.getName(); newSubPageUrl = "about:blank"; } } else { reasonFailed = "Src for " + origin.getName() + " is empty."; newSubPageUrl = "about:blank"; } processingSubPage = prepareSubPage(newSubPageUrl, origin); if (processingSubPage) { addRequiredAttributesToCurrentContext(reasonFailed); } } else { newSubPageUrl = subPage.getActualUrl().toExternalForm(); processingSubPage = prepareSubPage(newSubPageUrl, origin); if (processingSubPage) { processPage(subPage); } } } catch (ContextSizeLimitExceededException e) { LOGGER.debug("Couldn't open subcontext: size limit reached: {}", e.getMessage()); processingSubPage = false; } catch (URIException e) { // Protocol not supported, but object has to be created and url_original set. LOGGER.debug("Protocol not supported (not HTTP/HTTPS) for iframe:" + e.getMessage()); ctx.addAttribute(URL_ORIGINAL_STRING, e.getMessage()); } finally { if (processingSubPage) { ctx.closeSubContext(); } } } private void processClientRedirectSubPage(ProcessedPage subPage) throws IOException, ParameterException, ResourceException, StorageException { boolean processingSubPage = false; try { processingSubPage = prepareSubPage(subPage.getActualUrl().toExternalForm(), WebClientOrigin.CLIENT_REDIRECT); if (processingSubPage) { processPage(subPage); } } catch (ContextSizeLimitExceededException e) { LOGGER.debug("Couldn't open subcontext: size limit reached: {}", e.getMessage()); processingSubPage = true; } catch (URIException e) { // Protocol not supported, but object has to be created and url_original set. 
LOGGER.debug("Protocol not supported (not HTTP/HTTPS) for iframe:" + e.getMessage()); ctx.addAttribute(URL_ORIGINAL_STRING, e.getMessage()); } finally { if (processingSubPage) { ctx.closeSubContext(); } } } private void processServerRedirectSubPage(ProcessedPage processedPage) throws IOException, ParameterException, ResourceException, StorageException, BreakingChainException { boolean processingSubPage = false; try { processingSubPage = prepareSubPage(processedPage.getServerSideRedirectLocation(), WebClientOrigin.SERVER_REDIRECT); if (processingSubPage) { ProcessedPage newSubPage = getInsecurePagesChain(processedPage); processPage(newSubPage); } } catch (ContextSizeLimitExceededException e) { LOGGER.debug("Couldn't open subcontext: size limit reached: {}", e.getMessage()); processingSubPage = false; } catch (URIException e) { // Protocol not supported, but object has to be created and url_original set. LOGGER.debug("Protocol not supported (not HTTP/HTTPS) for iframe:" + e.getMessage()); ctx.addAttribute(URL_ORIGINAL_STRING, e.getMessage()); } catch (ExecutionException e) { processingSubPage = false; } catch (TimeoutException e) { LOGGER.debug("Time limit exceeded: {}", processedPage.getActualUrl()); processingSubPage = false; } finally { if (processingSubPage) { ctx.closeSubContext(); } } } private ProcessedPage getInsecurePagesChain(String url) throws IOException, BreakingChainException, ExecutionException, TimeoutException { Page resultingPage = null; resultingPage = getInsecurePage(url); ProcessedPage chain = previousTopPageMap.get(resultingPage); if (chain == null) { throw new BreakingChainException(resultingPage); } return chain; } private ProcessedPage getInsecurePagesChain(final ProcessedPage processedPage) throws IOException, BreakingChainException, ExecutionException, TimeoutException { final WebRequest req = insecurePagesChaingInitialization(processedPage); ExecutorService ex = Executors.newSingleThreadExecutor(); Future<Page> f = ex.submit(new 
Callable<Page>() { @Override public Page call() throws IOException { return wc.getPage(processedPage.getPage().getEnclosingWindow(), req); } }); Page p = null; try { if (!interruptProcessing) { if (taskParams.getPageTimeoutMillis() <= 0) { p = f.get(); } else { p = f.get(taskParams.getPageTimeoutMillis(), TimeUnit.MILLISECONDS); } } } catch (InterruptedException e) { LOGGER.warn("Gathering {} interrupted", req.getUrl()); Thread.currentThread().interrupt(); } catch (java.util.concurrent.TimeoutException e) { throw new TimeoutException("Timeout when gathering:" + req.getUrl(), e); } finally { if (f != null) { f.cancel(true); } closeExecutorWithJSDisabled(ex); } return insecurePagesChainPostprocessing(processedPage, p); } private void closeExecutorWithJSDisabled(ExecutorService ex) { wc.getOptions().setJavaScriptEnabled(false); try { ex.awaitTermination(TERMINATION_TIMEOUT, TimeUnit.MILLISECONDS); } catch (InterruptedException e) { //ignore } wc.getOptions().setJavaScriptEnabled(true); } private ProcessedPage insecurePagesChainPostprocessing(final ProcessedPage processedPage, Page p) throws BreakingChainException { ProcessedPage chain = null; if (processedPage.isFromFrame()) { chain = previousFramePageMap.get(p); } else { chain = previousTopPageMap.get(p); } if (chain == null) { throw new BreakingChainException("Page: " + p + " dosn't have chain!\nTopMapKey: " + previousTopPageMap.keySet() + "\nFrameMapKey:" + previousFramePageMap.keySet()); } return chain; } private WebRequest insecurePagesChaingInitialization(final ProcessedPage processedPage) throws TimeoutException, MalformedURLException { if (interruptProcessing) { throw new TimeoutException("Overall time limit exceeded url:" + processedPage.getOriginalUrl()); } wc.getOptions().setUseInsecureSSL(true); final WebRequest req = new WebRequest(UrlUtils.toUrlUnsafe(processedPage.getServerSideRedirectLocation())); req.setAdditionalHeader("Accept-Encoding", ""); return req; } private boolean prepareSubPage(String 
subPageUrl, WebClientOrigin origin) throws ContextSizeLimitExceededException, URIException {
        boolean alreadyHandled = !taskParams.isSaveMultiple() && processedSubPages.contains(subPageUrl)
                && !subPageUrl.equals("about:blank");
        if (alreadyHandled) {
            LOGGER.debug("({}) already processed, skipping", subPageUrl);
            return false;
        }

        processedSubPages.add(subPageUrl);
        // Referrer data is read from the current context before the sub-context is opened.
        ServiceData parentServiceData = ctx.getCurrentContextServiceData();
        String referrer = parentServiceData.getUrlForProcessing();
        Long referrerCookiesId = ctx.getCookiesReferenceId();

        ctx.openSubContext();
        ctx.addAttribute("type", "url");
        ctx.addAttribute("origin", origin.getName());
        ctx.addAttribute(URL_ORIGINAL_STRING, subPageUrl);
        ctx.webContextInit(subPageUrl, referrer, referrerCookiesId);
        validateSupportedProtocols(subPageUrl);
        return true;
    }

    /**
     * Handles an HTML page by inspecting all of its elements.
     *
     * @param page page to handle
     */
    private void handleHtmlPage(HtmlPage page)
            throws IOException, ParameterException, ResourceException, StorageException {
        inspectHtmlPage(page);
    }

    /**
     * Handles a text page: marks the current context as not being HTML.
     */
    private void handleTextPage() {
        ctx.addAttribute(HTML_STRING, false);
    }

    /**
     * Sets the page URL as the base URL of the current context and inspects every descendant
     * HTML element of the page.
     *
     * @param htmlPage page to inspect
     */
    private void inspectHtmlPage(HtmlPage htmlPage)
            throws IOException, ParameterException, ResourceException, StorageException {
        String pageUrl = htmlPage.getUrl().toExternalForm();
        getPageLinksForCurrentContext().setBaseUrl(pageUrl);
        for (HtmlElement descendant : htmlPage.getHtmlElementDescendants()) {
            inspect(descendant);
        }
    }

    /**
     * Dispatches a single element by its lower-cased tag name: embedded resources are saved,
     * anchors are collected and frames are processed as sub-pages. Nothing is processed once
     * processing has been interrupted; unknown tags are ignored.
     *
     * @param element element to inspect
     */
    private void inspect(HtmlElement element)
            throws IOException, ParameterException, ResourceException, StorageException {
        String tagName = element.getTagName().toLowerCase();
        if (interruptProcessing) {
            LOGGER.debug("Element [{}] won't be processed (timeout)", tagName);
            return;
        }

        switch (tagName) {
        case "head":
            getPageLinksForCurrentContext().setHeadElement(element);
            break;
        case "base":
            // BASE is honoured only until something has switched base handling off.
            if (!getPageLinksForCurrentContext().getIsBaseTagIgnored()) {
                processBaseTag(element);
            }
            break;
        case "applet":
            processEmbeddedObjectFile(element, "code");
            processEmbeddedObjectFile(element, "classid");
            break;
        case "audio":
        case "source":
            processEmbeddedMultimediaFile(element, SRC_STRING);
            break;
        case "body":
            processEmbeddedImageFile(element, "background");
            break;
        case "command":
            processEmbeddedImageFile(element, "icon");
            break;
        case "embed":
            processEmbeddedObjectFile(element, SRC_STRING);
            break;
        case HTML_STRING:
            // BASE and CODEBASE tags can't influence HTML MANIFEST.
            processEmbeddedOtherFile(element, "manifest");
            break;
        case "img":
            processEmbeddedImageFile(element, SRC_STRING);
            getPageLinksForCurrentContext().addLongdesc(element, "longdesc");
            break;
        case "input":
            if ("image".equalsIgnoreCase(element.getAttribute("type"))) {
                processEmbeddedImageFile(element, SRC_STRING);
            }
            break;
        case "link":
            processLinkTag(element);
            break;
        case "object":
            processObjectTag(element);
            break;
        case "video":
            processEmbeddedImageFile(element, "poster");
            processEmbeddedMultimediaFile(element, SRC_STRING);
            break;
        case "script":
            processEmbeddedOtherFile(element, SRC_STRING);
            break;
        case "a":
        case "area":
            // This is not embedded file but ongoing link.
            getPageLinksForCurrentContext().addAnchor(element, HREF_STRING);
            break;
        case "frame":
            LOGGER.debug("frame:{}", element.asXml());
            getPageLinksForCurrentContext().ignoreBaseTag();
            processFramesSubPage(previousFramePageMap.get(((HtmlFrame) element).getEnclosedPage()),
                    element.getAttribute(SRC_STRING), WebClientOrigin.FRAME);
            break;
        case "iframe":
            LOGGER.debug("iframe:{}", element.asXml());
            getPageLinksForCurrentContext().ignoreBaseTag();
            processFramesSubPage(previousFramePageMap.get(((HtmlInlineFrame) element).getEnclosedPage()),
                    element.getAttribute(SRC_STRING), WebClientOrigin.IFRAME);
            break;
        default:
            // Tag is of no interest.
            break;
        }
    }

    /**
     * Saves the object referenced by the {@code data} attribute and, when {@code classid}
     * starts with {@code http:} or {@code https:}, the object referenced by it as well.
     *
     * @param element OBJECT element
     */
    private void processObjectTag(HtmlElement element)
            throws IOException, ParameterException, ResourceException, StorageException {
        processEmbeddedObjectFile(element, "data");
        String classId = element.getAttribute("classid").toLowerCase();
        if (classId.startsWith("http:") || classId.startsWith("https:")) {
            processEmbeddedObjectFile(element, "classid");
        }
    }

    /**
     * Saves the target of a LINK element: stylesheets as "other" files, icons as images.
     * Other {@code rel} values are ignored.
     *
     * @param element LINK element
     */
    private void processLinkTag(HtmlElement element)
            throws IOException, ParameterException, ResourceException, StorageException {
        String rel = element.getAttribute("rel");
        boolean isStylesheet = "stylesheet".equalsIgnoreCase(rel);
        if (isStylesheet) {
            processEmbeddedOtherFile(element, HREF_STRING);
            return;
        }
        boolean isIcon = "icon".equalsIgnoreCase(rel) || "shortcut icon".equalsIgnoreCase(rel);
        if (isIcon) {
            processEmbeddedImageFile(element, HREF_STRING);
        }
    }

    /**
     * Applies a BASE element: when it sits outside HEAD, base handling is switched off;
     * otherwise a valid {@code href} (with a trailing slash appended if missing) becomes the
     * base URL of the current context.
     *
     * @param element BASE element
     */
    private void processBaseTag(HtmlElement element) {
        if (getPageLinksForCurrentContext().isOutsideOfHeadElement(element)) {
            // BASE element should be inside HEAD, but it isn't.
            getPageLinksForCurrentContext().ignoreBaseTag();
            return;
        }
        String base = element.getAttribute(HREF_STRING);
        if (!properUrl(base)) {
            return;
        }
        String normalizedBase = base.endsWith("/") ? base : base + "/";
        getPageLinksForCurrentContext().setBaseUrl(normalizedBase);
    }

    /**
     * Saves the object file referenced by {@code attributeName} (and any ARCHIVE entries),
     * resolved against CODEBASE when present and the context base URL otherwise. Does nothing
     * beyond disabling BASE handling when saving objects is switched off.
     *
     * @param element element carrying the attribute
     * @param attributeName name of the attribute holding the object location
     */
    private void processEmbeddedObjectFile(HtmlElement element, String attributeName)
            throws IOException, ParameterException, ResourceException, StorageException {
        // Ignore future BASE tags.
getPageLinksForCurrentContext().ignoreBaseTag(); if (taskParams.isSaveObjects()) { // Check if provided attribute exists. String attributeNameChecked = element.getAttribute(attributeName); if (attributeNameChecked == DomElement.ATTRIBUTE_NOT_DEFINED) { return; } // Check for ARCHIVE attribute. String[] archives = null; String archiveAttribute = element.getAttribute("archive"); if (archiveAttribute != DomElement.ATTRIBUTE_NOT_DEFINED) { archives = archiveAttribute.split(" "); } // Check for CODEBASE attribute. String codebase = element.getAttribute("codebase"); if (codebase != DomElement.ATTRIBUTE_NOT_DEFINED && !codebase.endsWith("/")) { codebase += "/"; } // Resolve base path. String basePath = getPageLinksForCurrentContext().getBaseUrl(); if (!(codebase == null || codebase.isEmpty())) { basePath = codebase; } // Save archives if present. if (archives != null) { for (String arch : archives) { String urlTemp = UrlUtils.resolveUrl(basePath, arch); processEmbeddedFile(urlTemp); } } // Save embedded file if provided attribute name exists. String urlTemp = UrlUtils.resolveUrl(basePath, attributeNameChecked); processEmbeddedFile(urlTemp); } } private void processEmbeddedMultimediaFile(HtmlElement element, String attributeName) throws IOException, ParameterException, ResourceException, StorageException { // Ignore future BASE tags. getPageLinksForCurrentContext().ignoreBaseTag(); if (taskParams.isSaveMultimedia()) { // Save embedded file if provided attribute name exists. String attributeNameChecked = element.getAttribute(attributeName); if (attributeNameChecked != DomElement.ATTRIBUTE_NOT_DEFINED) { processEmbeddedFile( UrlUtils.resolveUrl(getPageLinksForCurrentContext().getBaseUrl(), attributeNameChecked)); } } } private void processEmbeddedImageFile(HtmlElement element, String attributeName) throws IOException, ParameterException, ResourceException, StorageException { // Ignore future BASE tags. 
getPageLinksForCurrentContext().ignoreBaseTag(); if (taskParams.isSaveImages()) { // Save embedded file if provided attribute name exists. String attributeNameChecked = element.getAttribute(attributeName); if (attributeNameChecked != DomElement.ATTRIBUTE_NOT_DEFINED) { processEmbeddedFile( UrlUtils.resolveUrl(getPageLinksForCurrentContext().getBaseUrl(), attributeNameChecked)); } } } private void processEmbeddedOtherFile(HtmlElement element, String attributeName) throws IOException, ParameterException, ResourceException, StorageException { // Ignore future BASE tags, for all tags but HTML. if (!element.getTagName().equalsIgnoreCase(HTML_STRING)) { // Basically, this is happening only when tag is HTML and attribute // is MANIFEST. getPageLinksForCurrentContext().ignoreBaseTag(); } if (taskParams.isSaveOthers()) { // Save embedded file if provided attribute name exists. String attributeNameChecked = element.getAttribute(attributeName); if (attributeNameChecked != DomElement.ATTRIBUTE_NOT_DEFINED) { processEmbeddedFile( UrlUtils.resolveUrl(getPageLinksForCurrentContext().getBaseUrl(), attributeNameChecked)); } } } private void processEmbeddedFile(String urlOfFileToSave) throws IOException, ParameterException, ResourceException, StorageException { boolean processingSubPage = false; try { wc.getOptions().setJavaScriptEnabled(false); processingSubPage = prepareSubPage(urlOfFileToSave, WebClientOrigin.EMBEDDED); if (processingSubPage) { // Checks for illegal characters in URI. 
new Link(urlOfFileToSave, ""); ProcessedPage processedPage = new ProcessedPage(wc.getPage(urlOfFileToSave)); processPage(processedPage); } } catch (java.net.URISyntaxException e) { addNoHostFailedInfoToEmbeddedUrlObject(e.getMessage()); LOGGER.warn(e.getMessage()); LOGGER.debug(e.getMessage(), e); } catch (ContextSizeLimitExceededException e) { processingSubPage = false; LOGGER.warn(e.getMessage()); LOGGER.debug(e.getMessage(), e); } catch (java.net.UnknownHostException e) { addNoHostFailedInfoToEmbeddedUrlObject("Host not found: " + e.getMessage()); LOGGER.warn(e.getMessage()); LOGGER.debug(e.getMessage(), e); } catch (org.apache.commons.httpclient.URIException e) { addNoHostFailedInfoToEmbeddedUrlObject(e.getMessage()); LOGGER.warn(e.getMessage()); LOGGER.debug(e.getMessage(), e); } catch (org.apache.http.conn.HttpHostConnectException e) { addNoHostFailedInfoToEmbeddedUrlObject("Connection error: " + e.getMessage()); LOGGER.warn(e.getMessage()); LOGGER.debug(e.getMessage(), e); } catch (Exception e) { addNoHostFailedInfoToEmbeddedUrlObject( "Unknown error while processing embedded resource: " + urlOfFileToSave + "; " + e.getMessage()); LOGGER.warn("Exception while saving embedded file: " + urlOfFileToSave); LOGGER.debug(e.getMessage(), e); } finally { if (processingSubPage) { ctx.closeSubContext(); } wc.getOptions().setJavaScriptEnabled(true); } } /** * Adds additional attributes to new object when embedded resource host does * not exists. 
*/ private void addNoHostFailedInfoToEmbeddedUrlObject(String message) { ctx.addAttribute("active", false); ctx.addAttribute("reason_failed", message); if (taskParams.isAddReferrer()) { String referrer = ctx.getCurrentContextServiceData().getInputReferrer(); if (referrer != null) { ctx.addAttribute("referrer", referrer); } else { ctx.addAttribute("referrer", ""); } } if (taskParams.isAddReferrerCookie()) { Long referrerCookieId = ctx.getCurrentContextServiceData().getInputReferrerCookieId(); if (referrerCookieId != null) { ctx.addReference("referrer_cookie", referrerCookieId); } else { ctx.addReference("referrer_cookie", -1L); } } try { RequestWrapper rw = new RequestWrapper(ctx.getServiceData().getInputUrlOriginal(), ctx.getServiceData().getUrlForProcessing(), null); long referenceId = ctx.saveInDataStore(rw); ctx.addReference("http_request", referenceId); } catch (RequiredParameterMissingException e) { LOGGER.warn("Couldn't create HTTP request wrapper, parameter missing.", e); } catch (StorageException e) { LOGGER.warn("Couldn't write HTTP request object to Data Store.", e); } catch (ParameterException e) { LOGGER.warn("Invalid parameter while writting HTTP request object to Data Store.", e); } } private void validateSupportedProtocols(String absoluteUri) throws URIException { Pattern pattern = Pattern.compile("^([a-zA-Z][a-zA-Z\\-_].+?):"); Matcher matcher = pattern.matcher(absoluteUri); String scheme = ""; if (matcher.find()) { scheme = matcher.group(1); } if (!(scheme.equalsIgnoreCase("http") || scheme.equalsIgnoreCase("https"))) { throw new URIException(absoluteUri); } } public final PageLinks getPageLinksForCurrentContext() { return ctx.getPageLinks(); } public final Long getCookiesReferenceIdForCurrentContext() { return ctx.getCookiesReferenceId(); } /** * Check if provided URL is valid. * * @param url URL to check. * @return True if URL is valid, false otherwise. 
*/ private boolean properUrl(String url) { try { String encoded = URIUtil.encode(url, Link.PROPER_URL_BITSET); new URL(encoded); return true; } catch (Exception e) { return false; } } // FIXME:zmienic na getInsecurePagesChain i zwracac ProcessedPage public final Page getInsecurePage(String url) throws IOException, ExecutionException, TimeoutException { final WebRequest req = insecurePageInitialization(url); long processingTime = System.currentTimeMillis(); ExecutorService ex = Executors.newSingleThreadExecutor(); Future<Page> f = ex.submit(new Callable<Page>() { @Override public Page call() throws IOException { ctx.addTimeAttribute("download_time_start", System.currentTimeMillis()); Page page = wc.getPage(req); ctx.addTimeAttribute("download_time_end", System.currentTimeMillis()); return page; } }); Page page = null; try { if (!interruptProcessing) { if (taskParams.getPageTimeoutMillis() <= 0) { page = f.get(); } else { page = f.get(taskParams.getPageTimeoutMillis(), TimeUnit.MILLISECONDS); } } } catch (InterruptedException e) { LOGGER.warn("Gathering {} interrupted", url); Thread.currentThread().interrupt(); } catch (java.util.concurrent.TimeoutException e) { throw new TimeoutException( "Timeout when gathering (" + taskParams.getPageTimeoutMillis() + " ms):" + url, e); } finally { if (f != null) { f.cancel(true); } closeExecutorWithJSDisabled(ex); } processingTime = System.currentTimeMillis() - processingTime; insecurePagePostprocessing(url, processingTime, page); return page; } private WebRequest insecurePageInitialization(String url) throws TimeoutException, MalformedURLException { if (interruptProcessing) { throw new TimeoutException("Overall time limit exceeded url:" + url); } wc.getOptions().setUseInsecureSSL(true); final WebRequest req = new WebRequest(UrlUtils.toUrlUnsafe(url)); // work-around for bug with deflated content. 
req.setAdditionalHeader("Accept-Encoding", ""); req.setAdditionalHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); return req; } private void insecurePagePostprocessing(String url, long sTime, Page p) throws TimeoutException, IOException { if (interruptProcessing || Thread.currentThread().isInterrupted()) { if (wc.getWebConnection() instanceof HttpWebConnection) { ((HttpWebConnection) wc.getWebConnection()).shutdown(); } throw new TimeoutException("Overall processing time limit exceeded"); } if (p == null) { LOGGER.warn("Retrieving [{}] failed, spent: {}.{} sec.", new Object[] { url, sTime / ONE_SECOND_IN_MILISECONDS, sTime % ONE_SECOND_IN_MILISECONDS }); throw new IOException("Couldn't retrieve page " + url); } LOGGER.debug("Retrieved [{}] in: {}.{} sec.[interruptProcessing={}]", new Object[] { url, sTime / ONE_SECOND_IN_MILISECONDS, sTime % ONE_SECOND_IN_MILISECONDS, interruptProcessing }); } public final Set<CookieWrapper> getCookies() { Set<CookieWrapper> cookies = new HashSet<CookieWrapper>(); for (Cookie cookie : wc.getCookieManager().getCookies()) { Map<String, String> attributes = new HashMap<String, String>(); attributes.put(CookieAttributes.DOMAIN.getName(), cookie.getDomain()); attributes.put(CookieAttributes.PATH.getName(), cookie.getPath()); attributes.put(CookieAttributes.IS_SECURE.getName(), String.valueOf(cookie.isSecure())); cookies.add(new CookieWrapper(cookie.getName(), cookie.getValue(), attributes)); } return cookies; } final ScriptInterceptor getInterceptor() { return scriptInterceptor; } public final void closeAllWindows() { try { stopJavaScripts(); wc.closeAllWindows(); } catch (Exception e) { LOGGER.warn(e.getMessage(), e); } } public final void setContextData(WebClientWorker webClientWorker, ServiceParameters params, String urlForProcessing) { ctx.setServiceParams(params); ctx.setWebClientWorker(webClientWorker); } private void addRequiredAttributesToCurrentContext(String reasonFailed) throws 
ParameterException, ResourceException, StorageException {
        addRequiredAttributesToCurrentContext(null, reasonFailed);
    }

    /**
     * Stores the HTTP request in the data store and fills the current context
     * with the attributes describing the outcome of processing.
     *
     * @param processedPage
     *            Processed page, or {@code null} if there is none.
     * @param reasonFailed
     *            Reason of the failure, used when processing did not succeed.
     */
    private void addRequiredAttributesToCurrentContext(ProcessedPage processedPage, String reasonFailed)
            throws ParameterException, ResourceException, StorageException {
        try {
            RequestWrapper request = composeRequest(processedPage);
            long requestRefId = ctx.saveInDataStore(request);
            ctx.addReference("http_request", requestRefId);

            String inputReferrer = ctx.getCurrentContextServiceData().getInputReferrer();
            if (inputReferrer != null) {
                ctx.addAttribute("referrer", inputReferrer);
            }

            // Processing succeeded only if there is a page and it is complete.
            boolean succeeded = processedPage != null && processedPage.isComplete();
            ctx.addAttribute("active", succeeded);
            if (!succeeded) {
                addAttrsForFailedProcessing(reasonFailed);
            } else {
                addAttrsForSuccessfulProcessing(processedPage);
            }
        } catch (StackOverflowError e) {
            ctx.addWarning("Serious problem with JVM - cannot recover task");
        } catch (NullPointerException e) {
            LOGGER.error("NPE while processing task", e);
            String msg = e.getMessage();
            if (msg == null) {
                // No message in the exception: build one from the processed URL.
                String url = null;
                if (ctx.getCurrentContextServiceData() != null) {
                    url = ctx.getCurrentContextServiceData().getUrlForProcessing();
                }
                msg = "NullPointerException while processing " + url;
            }
            ctx.addAttribute("reason_failed", msg);
        }
    }

    /**
     * Adds the failure reason (a default text if none was given) and the
     * dispatcher's warning, if any, to the current context.
     *
     * @param reasonFailed
     *            Reason of the failure; may be {@code null} or empty.
     */
    private void addAttrsForFailedProcessing(String reasonFailed) {
        String reason;
        if (reasonFailed == null || reasonFailed.isEmpty()) {
            reason = "Unable to access page content. Response or page are unavailable.";
        } else {
            reason = reasonFailed;
        }
        ctx.addAttribute("reason_failed", reason);

        if (workerDispatcher.getWarning() != null) {
            ctx.addWarning(workerDispatcher.getWarning());
            LOGGER.warn("Adding warning to Task : {}", workerDispatcher.getWarning());
        }
    }

    /**
     * Adds the attributes of a successfully processed page: the response code
     * and the HTML flag. HTML content is saved when the task asks for it;
     * non-HTML content is handled as a single file. Cookies are handled in
     * both cases.
     *
     * @param processedPage
     *            Successfully processed page.
     */
    private void addAttrsForSuccessfulProcessing(ProcessedPage processedPage)
            throws StorageException, ResourceException, ParameterException {
        ctx.addAttribute("http_code", processedPage.getResponseCode());
        ctx.addAttribute(HTML_STRING, processedPage.isHtml());

        if (!processedPage.isHtml()) {
            // It's not HTML so download as single file if possible.
            downloadAndStoreSingleFile(processedPage);
        } else if (taskParams.isSaveHtml()) {
            ctx.addTimeAttribute("download_time_start", System.currentTimeMillis());
            InputStream htmlStream = processedPage.getContentAsStream();
            long htmlRefId = ctx.saveInDataStore(htmlStream);
            ctx.addTimeAttribute("download_time_end", System.currentTimeMillis());
            ctx.addReference("html_source", htmlRefId);
        }
        handleCookies();
    }

    /**
     * Creates the request wrapper to be stored for the given page. Without a
     * page the URLs are taken from the current context service data; response
     * details are included only when the dispatcher reports success.
     *
     * @param processedPage
     *            Processed page, or {@code null} if there is none.
     * @return Request wrapper describing the request.
     */
    private RequestWrapper composeRequest(ProcessedPage processedPage) throws RequiredParameterMissingException {
        if (processedPage == null) {
            return new RequestWrapper(ctx.getCurrentContextServiceData().getInputUrlOriginal(),
                    ctx.getCurrentContextServiceData().getUrlForProcessing(), null);
        }
        if (workerDispatcher.isSuccessfull()) {
            return new RequestWrapper(processedPage.getOriginalUrl(),
                    processedPage.getRequestedUrl().toExternalForm(), processedPage.getRequestHeaders(),
                    processedPage.getResponseCode(), processedPage.getResponseHeaders());
        }
        return new RequestWrapper(processedPage.getOriginalUrl(),
                processedPage.getRequestedUrl().toExternalForm(), processedPage.getRequestHeaders());
    }

    /**
     * Saves the composed cookies in the data store and references them as
     * {@code cookie_list}, but only if the task asks for saving cookies and
     * the dispatcher holds at least one cookie.
     */
    private void handleCookies() throws StorageException {
        if (!taskParams.isSaveCookies() || workerDispatcher.getCookies().isEmpty()) {
            return;
        }
        ctx.saveCookiesInDataStore(getComposedCookies());
        ctx.addReference("cookie_list", ctx.getCookiesReferenceId());
    }

    /**
     * Returns the dispatcher's cookies, extended with the cookies read from
     * the data store when the input data carries a referrer cookie id.
     *
     * @return Set of cookies to be saved.
     */
    private Set<CookieWrapper> getComposedCookies()
throws StorageException {
        Set<CookieWrapper> composed = workerDispatcher.getCookies();
        if (ctx.getInputDataInputReferrerCookieId() != null) {
            // Add the cookies stored under the referrer cookie id.
            composed.addAll(ctx.getCookiesFromDataStore(ctx.getInputDataInputReferrerCookieId()));
        }
        return composed;
    }

    /**
     * Used when the reported URL is not an HTML page but some other file. The
     * content is always saved in the data store; a new object is added to the
     * context only when the object type is eligible for extraction and the
     * response code is 200 (OK).
     *
     * @param processedPage
     *            Page whose content is to be stored.
     * @throws StorageException
     * @throws ParameterException
     * @throws ResourceException
     */
    private void downloadAndStoreSingleFile(ProcessedPage processedPage)
            throws StorageException, ParameterException, ResourceException {
        String requestedUrl = processedPage.getRequestedUrl().toExternalForm();

        // Measure how long reading and storing the content takes.
        long startedAt = System.currentTimeMillis();
        InputStream body = processedPage.getContentAsStream();
        String mimeType = processedPage.getContentType();
        long storedContentId = ctx.saveInDataStore(body);
        long finishedAt = System.currentTimeMillis();

        // Process PDF, SWF or other file.
        WebClientObjectType type = WebClientObjectType.forMimeType(mimeType);
        if (!type.isElliglibeForExtract(taskParams) || processedPage.getResponseCode() != HttpStatus.SC_OK) {
            return;
        }

        try {
            NewWebClientUrlObject urlObject = new NewWebClientUrlObject(
                    requestedUrl,
                    null,
                    type.getName(),
                    mimeType,
                    taskParams.isAddReferrer() ? ctx.getCurrentContextServiceData().getInputReferrer() : null,
                    taskParams.isAddReferrerCookie() ? ctx.getInputDataInputReferrerCookieId() : null,
                    storedContentId);
            urlObject.setDownloadTimeStart(startedAt);
            urlObject.setDownloadTimeEnd(finishedAt);
            ctx.newObject(urlObject);
        } catch (URIException e) {
            LOGGER.warn("Not an URL!: {}, msg={}", requestedUrl, e.getMessage());
        }
    }

    /**
     * @return Script sources collected by the script interceptor, grouped by
     *         origin.
     */
    public final Map<String, Map<String, ScriptElement>> getLaunchedScripts() {
        return scriptInterceptor.getSourcesByOrigin();
    }

    /**
     * Disables further processing in the script interceptor.
     */
    public final void closeJsInterceptor() {
        LOGGER.debug("Closing javascript debugger/interceptor");
        scriptInterceptor.disableProcessing();
    }

    /**
     * @return Web client used by this worker.
     */
    public final WebClient getWc() {
        return wc;
    }

    /**
     * Stores the cookies to be used for initialization.
     *
     * @param cookies
     *            Cookies to keep.
     */
    public final void setCookiesForInitialization(Set<CookieWrapper> cookies) {
        cookieWrappers = cookies;
    }
}