// Java tutorial
/** * License Agreement for OpenSearchServer * * Copyright (C) 2011-2014 Emmanuel Keller / Jaeksoft * * http://www.open-search-server.com * * This file is part of OpenSearchServer. * * OpenSearchServer is free software: you can redistribute it and/or * modify it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * OpenSearchServer is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with OpenSearchServer. * If not, see <http://www.gnu.org/licenses/>. **/ package com.jaeksoft.searchlib.webservice.crawler.webcrawler; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.net.URISyntaxException; import java.util.ArrayList; import java.util.List; import javax.xml.ws.WebServiceException; import org.apache.commons.collections.CollectionUtils; import org.apache.commons.lang.BooleanUtils; import com.jaeksoft.searchlib.Client; import com.jaeksoft.searchlib.ClientFactory; import com.jaeksoft.searchlib.SearchLibException; import com.jaeksoft.searchlib.crawler.web.database.CredentialManager; import com.jaeksoft.searchlib.crawler.web.database.HostUrlList; import com.jaeksoft.searchlib.crawler.web.database.UrlItem; import com.jaeksoft.searchlib.crawler.web.database.UrlManager; import com.jaeksoft.searchlib.crawler.web.database.UrlManager.SearchTemplate; import com.jaeksoft.searchlib.crawler.web.database.WebPropertyManager; import com.jaeksoft.searchlib.crawler.web.database.pattern.PatternItem; import 
com.jaeksoft.searchlib.crawler.web.database.pattern.PatternManager; import com.jaeksoft.searchlib.crawler.web.process.WebCrawlThread; import com.jaeksoft.searchlib.crawler.web.screenshot.ScreenshotManager; import com.jaeksoft.searchlib.crawler.web.spider.Crawl; import com.jaeksoft.searchlib.function.expression.SyntaxError; import com.jaeksoft.searchlib.index.IndexDocument; import com.jaeksoft.searchlib.query.ParseException; import com.jaeksoft.searchlib.request.AbstractSearchRequest; import com.jaeksoft.searchlib.user.Role; import com.jaeksoft.searchlib.user.User; import com.jaeksoft.searchlib.util.IOUtils; import com.jaeksoft.searchlib.util.LinkUtils; import com.jaeksoft.searchlib.web.servlet.restv1.ScreenshotServlet; import com.jaeksoft.searchlib.webservice.CommonListResult; import com.jaeksoft.searchlib.webservice.CommonResult; import com.jaeksoft.searchlib.webservice.CommonServices; import com.jaeksoft.searchlib.webservice.RestApplication; import com.jaeksoft.searchlib.webservice.crawler.CrawlerUtils; import com.jaeksoft.searchlib.webservice.query.document.FieldValueList; public class WebCrawlerImpl extends CommonServices implements RestWebCrawler { @Override public CommonResult run(String use, String login, String key, boolean once) { try { Client client = getLoggedClient(use, login, key, Role.WEB_CRAWLER_START_STOP); ClientFactory.INSTANCE.properties.checkApi(); if (once) return CrawlerUtils.runOnce(client.getWebCrawlMaster()); else { client.getWebPropertyManager().getCrawlEnabled().setValue(true); return CrawlerUtils.runForever(client.getWebCrawlMaster()); } } catch (IOException e) { throw new CommonServiceException(e); } catch (SearchLibException e) { throw new CommonServiceException(e); } catch (InterruptedException e) { throw new CommonServiceException(e); } } @Override public CommonResult stop(String use, String login, String key) { try { Client client = getLoggedClient(use, login, key, Role.WEB_CRAWLER_START_STOP); 
ClientFactory.INSTANCE.properties.checkApi(); client.getWebPropertyManager().getCrawlEnabled().setValue(false); return CrawlerUtils.stop(client.getWebCrawlMaster()); } catch (IOException e) { throw new CommonServiceException(e); } catch (SearchLibException e) { throw new CommonServiceException(e); } catch (InterruptedException e) { throw new CommonServiceException(e); } } @Override public CommonResult status(String use, String login, String key) { try { Client client = getLoggedClientAnyRole(use, login, key, Role.GROUP_WEB_CRAWLER); ClientFactory.INSTANCE.properties.checkApi(); return CrawlerUtils.status(client.getWebCrawlMaster()); } catch (SearchLibException e) { throw new CommonServiceException(e); } catch (InterruptedException e) { throw new CommonServiceException(e); } catch (IOException e) { throw new CommonServiceException(e); } } private AbstractSearchRequest getRequest(UrlManager urlManager, String host) throws SearchLibException, ParseException { AbstractSearchRequest searchRequest = urlManager.getSearchRequest(SearchTemplate.urlExport); searchRequest.setQueryString("*:*"); if (host != null && host.length() > 0) searchRequest.addFilter("host:\"" + host + '"', false); return searchRequest; } public byte[] exportURLs(String use, String login, String key) { try { Client client = getLoggedClientAnyRole(use, login, key, Role.GROUP_WEB_CRAWLER); ClientFactory.INSTANCE.properties.checkApi(); File file = client.getUrlManager().exportURLs(getRequest(client.getUrlManager(), null)); return IOUtils.toByteArray(new FileInputStream(file)); } catch (SearchLibException e) { throw new CommonServiceException(e); } catch (FileNotFoundException e) { throw new CommonServiceException(e); } catch (IOException e) { throw new CommonServiceException(e); } catch (ParseException e) { throw new CommonServiceException(e); } catch (InterruptedException e) { throw new CommonServiceException(e); } } public byte[] exportSiteMap(String use, String host, String login, String key) { try { 
Client client = getLoggedClientAnyRole(use, login, key, Role.GROUP_WEB_CRAWLER); ClientFactory.INSTANCE.properties.checkApi(); File file = client.getUrlManager().exportSiteMap(getRequest(client.getUrlManager(), host)); return IOUtils.toByteArray(new FileInputStream(file)); } catch (SearchLibException e) { throw new CommonServiceException(e); } catch (FileNotFoundException e) { throw new CommonServiceException(e); } catch (IOException e) { throw new CommonServiceException(e); } catch (ParseException e) { throw new CommonServiceException(e); } catch (InterruptedException e) { throw new CommonServiceException(e); } } private CommonResult injectPatterns(String index, String login, String key, boolean replaceAll, List<String> patterns, boolean inclusion) { try { Client client = getLoggedClientAnyRole(index, login, key, Role.WEB_CRAWLER_EDIT_PATTERN_LIST); ClientFactory.INSTANCE.properties.checkApi(); List<PatternItem> patternList = PatternManager.getPatternList(patterns); PatternManager patternManager = inclusion ? 
client.getInclusionPatternManager() : client.getExclusionPatternManager(); patternManager.addList(patternList, replaceAll); int count = PatternManager.countStatus(patternList, PatternItem.Status.INJECTED); return new CommonResult(true, count + " patterns injected"); } catch (SearchLibException e) { throw new CommonServiceException(e); } catch (InterruptedException e) { throw new CommonServiceException(e); } catch (IOException e) { throw new CommonServiceException(e); } } @Override public CommonResult injectPatternsInclusion(String index, String login, String key, boolean replaceAll, List<String> patterns) { return injectPatterns(index, login, key, replaceAll, patterns, true); } @Override public CommonResult injectPatternsExclusion(String index, String login, String key, boolean replaceAll, List<String> patterns) { return injectPatterns(index, login, key, replaceAll, patterns, false); } private CommonResult getPatternStatusResult(WebPropertyManager webPropertyManager) { CommonResult commonResult = new CommonResult(true, null); commonResult.addDetail("inclusion_enabled", webPropertyManager.getInclusionEnabled().getValue()); commonResult.addDetail("exclusion_enabled", webPropertyManager.getExclusionEnabled().getValue()); return commonResult; } @Override public CommonResult getPatternStatus(String index, String login, String key) { try { Client client = getLoggedClientAnyRole(index, login, key, Role.WEB_CRAWLER_EDIT_PATTERN_LIST); ClientFactory.INSTANCE.properties.checkApi(); WebPropertyManager webPropertyManager = client.getWebPropertyManager(); return getPatternStatusResult(webPropertyManager); } catch (SearchLibException e) { throw new CommonServiceException(e); } catch (InterruptedException e) { throw new CommonServiceException(e); } catch (IOException e) { throw new CommonServiceException(e); } } @Override public CommonResult setPatternStatus(String index, String login, String key, Boolean inclusion, Boolean exclusion) { try { Client client = 
getLoggedClientAnyRole(index, login, key, Role.WEB_CRAWLER_EDIT_PATTERN_LIST); ClientFactory.INSTANCE.properties.checkApi(); WebPropertyManager webPropertyManager = client.getWebPropertyManager(); if (inclusion != null) webPropertyManager.getInclusionEnabled().setValue(inclusion); if (exclusion != null) webPropertyManager.getExclusionEnabled().setValue(exclusion); return getPatternStatusResult(webPropertyManager); } catch (SearchLibException e) { throw new CommonServiceException(e); } catch (InterruptedException e) { throw new CommonServiceException(e); } catch (IOException e) { throw new CommonServiceException(e); } } private CommonResult deletePatterns(String index, String login, String key, List<String> patterns, boolean inclusion) { try { Client client = getLoggedClientAnyRole(index, login, key, Role.WEB_CRAWLER_EDIT_PATTERN_LIST); ClientFactory.INSTANCE.properties.checkApi(); PatternManager patternManager = inclusion ? client.getInclusionPatternManager() : client.getExclusionPatternManager(); int count = patternManager.delPattern(patterns); return new CommonResult(true, count + " patterns deleted"); } catch (SearchLibException e) { throw new CommonServiceException(e); } catch (InterruptedException e) { throw new CommonServiceException(e); } catch (IOException e) { throw new CommonServiceException(e); } } @Override public CommonResult deletePatternsInclusion(String index, String login, String key, List<String> deleteList) { return deletePatterns(index, login, key, deleteList, true); } @Override public CommonResult deletePatternsExclusion(String index, String login, String key, List<String> deleteList) { return deletePatterns(index, login, key, deleteList, false); } public CommonListResult<String> extractPatterns(String index, String login, String key, String startsWith, boolean inclusion) { try { Client client = getLoggedClientAnyRole(index, login, key, Role.GROUP_WEB_CRAWLER); ClientFactory.INSTANCE.properties.checkApi(); PatternManager patternManager = 
inclusion ? client.getInclusionPatternManager() : client.getExclusionPatternManager(); List<String> patterns = new ArrayList<String>(); patternManager.getPatterns(startsWith, patterns); return new CommonListResult<String>(patterns); } catch (SearchLibException e) { throw new CommonServiceException(e); } catch (InterruptedException e) { throw new CommonServiceException(e); } catch (IOException e) { throw new CommonServiceException(e); } } @Override public CommonListResult<String> extractPatternsInclusion(String index, String login, String key, String startsWith) { return extractPatterns(index, login, key, startsWith, true); } @Override public CommonListResult<String> extractPatternsExclusion(String index, String login, String key, String startsWith) { return extractPatterns(index, login, key, startsWith, false); } @Override public CommonResult crawl(String use, String login, String key, String url, Boolean returnData) { try { Client client = getLoggedClient(use, login, key, Role.WEB_CRAWLER_START_STOP); ClientFactory.INSTANCE.properties.checkApi(); WebCrawlThread webCrawlThread = client.getWebCrawlMaster().manualCrawl(LinkUtils.newEncodedURL(url), HostUrlList.ListType.MANUAL); if (!webCrawlThread.waitForStart(120)) throw new WebServiceException("Time out reached (120 seconds)"); if (!webCrawlThread.waitForEnd(3600)) throw new WebServiceException("Time out reached (3600 seconds)"); UrlItem urlItem = webCrawlThread.getCurrentUrlItem(); CommonResult cr = null; if (BooleanUtils.isTrue(returnData)) { Crawl crawl = webCrawlThread.getCurrentCrawl(); if (crawl != null) { List<IndexDocument> indexDocuments = crawl.getTargetIndexDocuments(); if (CollectionUtils.isNotEmpty(indexDocuments)) { CommonListResult<List<FieldValueList>> clr = new CommonListResult<List<FieldValueList>>( indexDocuments.size()); for (IndexDocument indexDocument : indexDocuments) { List<FieldValueList> list = FieldValueList.getNewList(indexDocument); if (list != null) clr.items.add(list); } cr = clr; } } 
} String message = urlItem != null ? "Result: " + urlItem.getFetchStatus() + " - " + urlItem.getParserStatus() + " - " + urlItem.getIndexStatus() : null; if (cr == null) cr = new CommonResult(true, message); cr.addDetail("URL", urlItem.getUrl()); cr.addDetail("HttpResponseCode", urlItem.getResponseCode()); cr.addDetail("RobotsTxtStatus", urlItem.getRobotsTxtStatus()); cr.addDetail("FetchStatus", urlItem.getFetchStatus()); cr.addDetail("ParserStatus", urlItem.getParserStatus()); cr.addDetail("IndexStatus", urlItem.getIndexStatus()); cr.addDetail("RedirectionURL", urlItem.getRedirectionUrl()); cr.addDetail("ContentBaseType", urlItem.getContentBaseType()); cr.addDetail("ContentTypeCharset", urlItem.getContentTypeCharset()); cr.addDetail("ContentLength", urlItem.getContentLength()); return cr; } catch (MalformedURLException e) { throw new CommonServiceException(e); } catch (SearchLibException e) { throw new CommonServiceException(e); } catch (ParseException e) { throw new CommonServiceException(e); } catch (IOException e) { throw new CommonServiceException(e); } catch (SyntaxError e) { throw new CommonServiceException(e); } catch (URISyntaxException e) { throw new CommonServiceException(e); } catch (ClassNotFoundException e) { throw new CommonServiceException(e); } catch (InterruptedException e) { throw new CommonServiceException(e); } catch (InstantiationException e) { throw new CommonServiceException(e); } catch (IllegalAccessException e) { throw new CommonServiceException(e); } } @Override public CommonResult crawlPost(String use, String login, String key, String url, Boolean returnData) { return crawl(use, login, key, url, returnData); } public CommonResult captureScreenshot(String use, String login, String key, String url) { try { Client client = getLoggedClientAnyRole(use, login, key, Role.GROUP_WEB_CRAWLER); ClientFactory.INSTANCE.properties.checkApi(); ScreenshotManager screenshotManager = client.getScreenshotManager(); CredentialManager credentialManager = 
client.getWebCredentialManager(); ScreenshotServlet.doCapture(null, screenshotManager, credentialManager, LinkUtils.newEncodedURL(url)); String message = "Captured URL " + url; return new CommonResult(true, message); } catch (MalformedURLException e) { throw new CommonServiceException(e); } catch (SearchLibException e) { throw new CommonServiceException(e); } catch (URISyntaxException e) { throw new CommonServiceException(e); } catch (InterruptedException e) { throw new CommonServiceException(e); } catch (IOException e) { throw new CommonServiceException(e); } } public CommonResult checkScreenshot(String use, String login, String key, String url) { try { Client client = getLoggedClientAnyRole(use, login, key, Role.GROUP_WEB_CRAWLER); ClientFactory.INSTANCE.properties.checkApi(); ScreenshotManager screenshotManager = client.getScreenshotManager(); String message = ScreenshotServlet.doCheck(screenshotManager, LinkUtils.newEncodedURL(url)); return new CommonResult(true, message); } catch (MalformedURLException e) { throw new CommonServiceException(e); } catch (SearchLibException e) { throw new CommonServiceException(e); } catch (InterruptedException e) { throw new CommonServiceException(e); } catch (IOException e) { throw new CommonServiceException(e); } catch (URISyntaxException e) { throw new CommonServiceException(e); } } public static String getCrawlXML(User user, Client client, String url) throws UnsupportedEncodingException { return RestApplication.getRestURL("/index/{index}/crawler/web/crawl", user, client, "url", url); } public static String getCrawlJSON(User user, Client client, String url) throws UnsupportedEncodingException { return RestApplication.getRestURL("/index/{index}/crawler/web/crawl", user, client, "url", url); } @Override public CommonResult injectUrls(String index, String login, String key, boolean replaceAll, List<String> urls) { try { Client client = getLoggedClientAnyRole(index, login, key, Role.WEB_CRAWLER_EDIT_PARAMETERS); 
ClientFactory.INSTANCE.properties.checkApi(); UrlManager urlManager = client.getUrlManager(); CommonResult result = new CommonResult(true, null); if (replaceAll) urlManager.deleteAll(null); urlManager.inject(urls, result); return result; } catch (SearchLibException e) { throw new CommonServiceException(e); } catch (InterruptedException e) { throw new CommonServiceException(e); } catch (IOException e) { throw new CommonServiceException(e); } } @Override public CommonResult getUrls(String index, String login, String key) { try { Client client = getLoggedClientAnyRole(index, login, key, Role.GROUP_WEB_CRAWLER); ClientFactory.INSTANCE.properties.checkApi(); return null; } catch (InterruptedException e) { throw new CommonServiceException(e); } catch (IOException e) { throw new CommonServiceException(e); } } }