Java tutorial
/** * Copyright 2014-2016 Emmanuel Keller / QWAZR * <p> * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * <p> * http://www.apache.org/licenses/LICENSE-2.0 * <p> * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. **/ package com.qwazr.crawler.web.driver; import com.fasterxml.jackson.databind.JsonNode; import com.qwazr.crawler.web.service.WebCrawlDefinition; import com.qwazr.utils.IOUtils; import com.qwazr.utils.StringUtils; import com.qwazr.utils.http.HttpUtils; import com.qwazr.utils.json.JsonMapper; import org.apache.http.client.fluent.Executor; import org.apache.http.client.fluent.Request; import org.apache.http.impl.client.CloseableHttpClient; import org.openqa.selenium.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.css.sac.CSSException; import javax.imageio.ImageIO; import java.awt.image.BufferedImage; import java.io.ByteArrayInputStream; import java.io.Closeable; import java.io.File; import java.io.IOException; import java.net.URI; import java.net.URISyntaxException; import java.security.KeyManagementException; import java.security.KeyStoreException; import java.security.NoSuchAlgorithmException; import java.util.ArrayList; import java.util.Collection; import java.util.List; import java.util.Set; import java.util.concurrent.TimeUnit; final public class BrowserDriver implements WebDriver, Closeable, AdditionalCapabilities.All { protected static final Logger logger = LoggerFactory.getLogger(BrowserDriver.class); private final WebCrawlDefinition.ProxyDefinition currentProxy; private final BrowserDriverEnum type; private final WebDriver driver; BrowserDriver(BrowserDriverEnum type, WebDriver driver, WebCrawlDefinition.ProxyDefinition currentProxy) { this.type = type; this.driver = driver; this.currentProxy = currentProxy; Timeouts timeouts = driver.manage().timeouts(); timeouts.implicitlyWait(1, TimeUnit.MINUTES); timeouts.setScriptTimeout(2, TimeUnit.MINUTES); timeouts.pageLoadTimeout(3, TimeUnit.MINUTES); } @Override public void close() { if (driver == null) return; driver.close(); } @Override final public void get(String sUrl) { driver.get(sUrl); } public BrowserDriverEnum getType() { return type; } public Object executeScript(String javascript, boolean faultTolerant, Object... objects) { try { if (!(driver instanceof JavascriptExecutor)) throw new WebDriverException("The Web driver does not support javascript execution"); JavascriptExecutor js = (JavascriptExecutor) driver; return js.executeScript(javascript, objects); } catch (WebDriverException e) { if (!faultTolerant) throw e; logger.warn(e.getMessage(), e); return null; } } final public BufferedImage getScreenshot() throws IOException { if (!(driver instanceof TakesScreenshot)) throw new WebDriverException("This browser driver does not support screenshot"); TakesScreenshot takesScreenshot = (TakesScreenshot) driver; byte[] data = takesScreenshot.getScreenshotAs(OutputType.BYTES); return ImageIO.read(new ByteArrayInputStream(data)); } @Override final public String getTitle() { return driver.getTitle(); } final public void setSize(int width, int height) { driver.manage().window().setSize(new Dimension(width, height)); } final public void setTimeouts(Integer impWait, Integer pageLoad, Integer script) { Timeouts timeOuts = driver.manage().timeouts(); if (impWait != null) timeOuts.implicitlyWait(impWait, TimeUnit.SECONDS); if (pageLoad != null) timeOuts.pageLoadTimeout(pageLoad, TimeUnit.SECONDS); if (script != null) timeOuts.setScriptTimeout(script, TimeUnit.SECONDS); } final public WebCrawlDefinition.ProxyDefinition getProxy() { return this.currentProxy; } @Override final public List<WebElement> findElements(By by) { return driver.findElements(by); } @Override final public String getWindowHandle() { return driver.getWindowHandle(); } @Override final public String getCurrentUrl() { return driver.getCurrentUrl(); } @Override final public WebElement findElement(By by) { return driver.findElement(by); } @Override final public String getPageSource() { return driver.getPageSource(); } @Override public void quit() { driver.quit(); } @Override final public Set<String> getWindowHandles() { return driver.getWindowHandles(); } @Override final public TargetLocator switchTo() { return driver.switchTo(); } @Override final public Navigation navigate() { return driver.navigate(); } @Override final public Options manage() { return driver.manage(); } /** * Fill a list with all the href attributes found in a tag, relative to the * given rootElement * * @param searchContext the root of the search * @param hrefCollection the collection filled with the href content */ public void findLinks(SearchContext searchContext, Collection<String> hrefCollection) { extractLinks(searchContext, hrefCollection, "a", "href", "data-href"); extractLinks(searchContext, hrefCollection, "div", "data-href"); extractLinks(searchContext, hrefCollection, "frame", "src"); } /** * Iterate over every frames and extract all links. The driver is set to defaultContent() * * @param hrefCollection */ public void findEveryFramesLinks(Collection<String> hrefCollection) { driver.switchTo().defaultContent(); for (int i = 0; i < 100; i++) { try { driver.switchTo().frame(i); findLinks(driver, hrefCollection); } catch (NoSuchFrameException e) { // That's not an error, we just want to iterate over frames break; } } driver.switchTo().defaultContent(); } public void findRssItemLinks(SearchContext searchContext, Collection<String> linkCollection) { List<WebElement> channels = driver.findElements(By.tagName("channel")); for (WebElement channel : channels) { List<WebElement> items = channel.findElements(By.tagName("item")); for (WebElement item : items) { List<WebElement> links = item.findElements(By.tagName("link")); for (WebElement link : links) { linkCollection.add(link.getText()); } } } } public List<String> getRssItemLinks() { ArrayList<String> links = new ArrayList<String>(); findRssItemLinks(driver, links); return links; } @Override public String getTextSafe(WebElement webElement) { if (webElement == null) return null; if (driver instanceof AdditionalCapabilities.SafeText) return ((AdditionalCapabilities.SafeText) driver).getTextSafe(webElement); return webElement.getText(); } public String getSnippet(SearchContext searchContext, int sizeLimit) { List<WebElement> elements = searchContext.findElements(By.tagName("p")); if (elements == null) return null; StringBuilder sb = new StringBuilder(); for (WebElement element : elements) { String text = null; try { text = getTextSafe(element); } catch (CSSException e) { if (logger.isWarnEnabled()) logger.warn("Cannot extract snippet: " + e.getMessage(), e); } if (text == null) continue; text = StringUtils.join(StringUtils.split(text, " \r\n\t"), ' '); sb.append(text); if (!text.endsWith(".")) sb.append(' '); sb.append(' '); if (sb.length() > sizeLimit) break; } if (sb.length() <= sizeLimit) return sb.toString().trim(); int i = 0; int last = -1; for (;;) { i = sb.indexOf(" ", i + 1); if (i == -1 || i > sizeLimit) break; last = i; } if (last == -1) last = sizeLimit; return sb.substring(0, last) + ""; } private void extractLinks(SearchContext searchContext, Collection<String> hrefCollection, String tag, String... attrs) { // Let's look for the a tags List<WebElement> links = searchContext.findElements(By.tagName(tag)); if (links == null || links.isEmpty()) return; // Building the URI list for (WebElement link : links) { for (String attr : attrs) { String href = link.getAttribute(attr); if (href != null) { href = StringUtils.replace(href, " ", "+"); hrefCollection.add(href); break; } } } } private WebElement findElementBy(By by) { try { return driver.findElement(by); } catch (NoSuchElementException e) { return null; } } public String getInnerHtmlByXPath(String xPath) { WebElement element = findElementBy(By.xpath(xPath)); if (element == null) return null; return element.getAttribute("innerHTML"); } public WebElement findElementByXPath(String xPath) { return findElementBy(By.xpath(xPath)); } public WebElement findElementByTagName(String tagName) { return findElementBy(By.tagName(tagName)); } public WebElement findElementByCssSelector(String cssSelector) { return findElementBy(By.cssSelector(cssSelector)); } public List<WebElement> findElementsByXPath(String xPath) { return driver.findElements(By.xpath(xPath)); } public List<WebElement> findElementsByTagName(String tagName) { return driver.findElements(By.tagName(tagName)); } public List<WebElement> findElementsByCssSelector(String cssSelector) { return driver.findElements(By.cssSelector(cssSelector)); } public void setAttribute(WebElement element, String name, String value) { if (driver instanceof AdditionalCapabilities.SetAttribute) ((AdditionalCapabilities.SetAttribute) driver).setAttribute(element, name, value); else this.executeScript("arguments[0].setAttribute(arguments[1], arguments[2])", false, element, name, value); } @Override public void saveBinaryFile(File file) throws IOException { if (file == null) return; if (driver instanceof AdditionalCapabilities.SaveBinaryFile) { ((AdditionalCapabilities.SaveBinaryFile) driver).saveBinaryFile(file); return; } try { httpClientDownload(getCurrentUrl(), null, file); } catch (NoSuchAlgorithmException | KeyStoreException | KeyManagementException | URISyntaxException e) { throw new IOException(e); } } void httpClientDownload(String url, String userAgent, File file) throws NoSuchAlgorithmException, KeyStoreException, KeyManagementException, IOException, URISyntaxException { URI uri = new URI(url); final CloseableHttpClient httpClient = HttpUtils.createHttpClient_AcceptsUntrustedCerts(); try { final Executor executor = Executor.newInstance(httpClient); Request request = Request.Get(uri.toString()).addHeader("Connection", "close").connectTimeout(60000) .socketTimeout(60000); if (userAgent != null) request = request.addHeader("User-Agent", userAgent); if (currentProxy != null) { if (currentProxy.http_proxy != null && !currentProxy.http_proxy.isEmpty()) request = request.viaProxy(currentProxy.http_proxy); if ("https".equals(uri.getScheme()) && currentProxy.ssl_proxy != null && !currentProxy.ssl_proxy.isEmpty()) request = request.viaProxy(currentProxy.ssl_proxy); } executor.execute(request).saveContent(file); } finally { IOUtils.close(httpClient); } } public void savePageSource(String path) throws IOException { savePageSource(new File(path)); } public void savePageSource(File file) throws IOException { IOUtils.writeStringAsFile(getPageSource(), file); } @Override public Integer getStatusCode() { if (driver instanceof AdditionalCapabilities.ResponseHeader) return ((AdditionalCapabilities.ResponseHeader) driver).getStatusCode(); throw new WebDriverException("GetStatusCode is not implemented in " + driver.getClass()); } @Override public String getContentType() { if (driver instanceof AdditionalCapabilities.ResponseHeader) return ((AdditionalCapabilities.ResponseHeader) driver).getContentType(); throw new WebDriverException("GetContentType is not implemented in " + driver.getClass()); } @Override public String getContentDisposition() { if (driver instanceof AdditionalCapabilities.ResponseHeader) return ((AdditionalCapabilities.ResponseHeader) driver).getContentDisposition(); throw new WebDriverException("GetContentDisposition is not implemented in " + driver.getClass()); } @Override public String getContentDispositionFilename() { if (driver instanceof AdditionalCapabilities.ResponseHeader) return ((AdditionalCapabilities.ResponseHeader) driver).getContentDispositionFilename(); throw new WebDriverException("GetContentDispositionFilename is not implemented in " + driver.getClass()); } public String getErrorMessage(Exception error) { if (error == null) return null; String msg = error.getMessage(); if (msg == null) msg = error.toString(); if (msg == null) return error.getClass().getName(); if (!msg.startsWith("{")) return msg; try { JsonNode json = JsonMapper.MAPPER.readTree(msg); JsonNode jsonMsg = json.get("errorMessage"); if (jsonMsg != null && jsonMsg.isTextual()) return jsonMsg.textValue(); jsonMsg = json.get("error"); if (jsonMsg != null && jsonMsg.isTextual()) return jsonMsg.textValue(); return msg; } catch (IOException e) { return msg; } } }