com.jaeksoft.searchlib.webservice.crawler.webcrawler.WebCrawlerImpl.java Source code

Java tutorial

Introduction

Here is the source code for com.jaeksoft.searchlib.webservice.crawler.webcrawler.WebCrawlerImpl.java

Source

/**   
 * License Agreement for OpenSearchServer
 *
 * Copyright (C) 2011-2014 Emmanuel Keller / Jaeksoft
 * 
 * http://www.open-search-server.com
 * 
 * This file is part of OpenSearchServer.
 *
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with OpenSearchServer. 
 *  If not, see <http://www.gnu.org/licenses/>.
 **/
package com.jaeksoft.searchlib.webservice.crawler.webcrawler;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;

import javax.xml.ws.WebServiceException;

import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang.BooleanUtils;

import com.jaeksoft.searchlib.Client;
import com.jaeksoft.searchlib.ClientFactory;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.crawler.web.database.CredentialManager;
import com.jaeksoft.searchlib.crawler.web.database.HostUrlList;
import com.jaeksoft.searchlib.crawler.web.database.UrlItem;
import com.jaeksoft.searchlib.crawler.web.database.UrlManager;
import com.jaeksoft.searchlib.crawler.web.database.UrlManager.SearchTemplate;
import com.jaeksoft.searchlib.crawler.web.database.WebPropertyManager;
import com.jaeksoft.searchlib.crawler.web.database.pattern.PatternItem;
import com.jaeksoft.searchlib.crawler.web.database.pattern.PatternManager;
import com.jaeksoft.searchlib.crawler.web.process.WebCrawlThread;
import com.jaeksoft.searchlib.crawler.web.screenshot.ScreenshotManager;
import com.jaeksoft.searchlib.crawler.web.spider.Crawl;
import com.jaeksoft.searchlib.function.expression.SyntaxError;
import com.jaeksoft.searchlib.index.IndexDocument;
import com.jaeksoft.searchlib.query.ParseException;
import com.jaeksoft.searchlib.request.AbstractSearchRequest;
import com.jaeksoft.searchlib.user.Role;
import com.jaeksoft.searchlib.user.User;
import com.jaeksoft.searchlib.util.IOUtils;
import com.jaeksoft.searchlib.util.LinkUtils;
import com.jaeksoft.searchlib.web.servlet.restv1.ScreenshotServlet;
import com.jaeksoft.searchlib.webservice.CommonListResult;
import com.jaeksoft.searchlib.webservice.CommonResult;
import com.jaeksoft.searchlib.webservice.CommonServices;
import com.jaeksoft.searchlib.webservice.RestApplication;
import com.jaeksoft.searchlib.webservice.crawler.CrawlerUtils;
import com.jaeksoft.searchlib.webservice.query.document.FieldValueList;

public class WebCrawlerImpl extends CommonServices implements RestWebCrawler {

    /**
     * Starts the web crawler, either for a single pass or continuously.
     * <p>
     * The client is obtained through {@code getLoggedClient} with the
     * {@code WEB_CRAWLER_START_STOP} role, then the API check is executed.
     *
     * @param use   index selector forwarded to {@code getLoggedClient}
     * @param login login forwarded to {@code getLoggedClient}
     * @param key   API key forwarded to {@code getLoggedClient}
     * @param once  when {@code true}, delegates to {@code CrawlerUtils.runOnce};
     *              otherwise sets the crawl-enabled property to {@code true} and
     *              delegates to {@code CrawlerUtils.runForever}
     * @return the result returned by {@code CrawlerUtils}
     * @throws CommonServiceException wrapping any SearchLibException,
     *                                IOException or InterruptedException
     */
    @Override
    public CommonResult run(String use, String login, String key, boolean once) {
        try {
            Client loggedClient = getLoggedClient(use, login, key, Role.WEB_CRAWLER_START_STOP);
            ClientFactory.INSTANCE.properties.checkApi();
            if (!once) {
                // Continuous mode: flip the crawl-enabled property before starting.
                loggedClient.getWebPropertyManager().getCrawlEnabled().setValue(true);
                return CrawlerUtils.runForever(loggedClient.getWebCrawlMaster());
            }
            return CrawlerUtils.runOnce(loggedClient.getWebCrawlMaster());
        } catch (SearchLibException e) {
            throw new CommonServiceException(e);
        } catch (IOException e) {
            throw new CommonServiceException(e);
        } catch (InterruptedException e) {
            throw new CommonServiceException(e);
        }
    }

    /**
     * Stops the web crawler.
     * <p>
     * Sets the crawl-enabled property to {@code false} and then delegates to
     * {@code CrawlerUtils.stop}.
     *
     * @param use   index selector forwarded to {@code getLoggedClient}
     * @param login login forwarded to {@code getLoggedClient}
     * @param key   API key forwarded to {@code getLoggedClient}
     * @return the result returned by {@code CrawlerUtils.stop}
     * @throws CommonServiceException wrapping any SearchLibException,
     *                                IOException or InterruptedException
     */
    @Override
    public CommonResult stop(String use, String login, String key) {
        try {
            Client loggedClient = getLoggedClient(use, login, key, Role.WEB_CRAWLER_START_STOP);
            ClientFactory.INSTANCE.properties.checkApi();
            WebPropertyManager propertyManager = loggedClient.getWebPropertyManager();
            propertyManager.getCrawlEnabled().setValue(false);
            return CrawlerUtils.stop(loggedClient.getWebCrawlMaster());
        } catch (SearchLibException e) {
            throw new CommonServiceException(e);
        } catch (InterruptedException e) {
            throw new CommonServiceException(e);
        } catch (IOException e) {
            throw new CommonServiceException(e);
        }
    }

    /**
     * Reports the current state of the web crawler.
     *
     * @param use   index selector forwarded to {@code getLoggedClientAnyRole}
     * @param login login forwarded to {@code getLoggedClientAnyRole}
     * @param key   API key forwarded to {@code getLoggedClientAnyRole}
     * @return the result returned by {@code CrawlerUtils.status}
     * @throws CommonServiceException wrapping any SearchLibException,
     *                                IOException or InterruptedException
     */
    @Override
    public CommonResult status(String use, String login, String key) {
        try {
            Client loggedClient = getLoggedClientAnyRole(use, login, key, Role.GROUP_WEB_CRAWLER);
            ClientFactory.INSTANCE.properties.checkApi();
            CommonResult crawlerStatus = CrawlerUtils.status(loggedClient.getWebCrawlMaster());
            return crawlerStatus;
        } catch (IOException e) {
            throw new CommonServiceException(e);
        } catch (SearchLibException e) {
            throw new CommonServiceException(e);
        } catch (InterruptedException e) {
            throw new CommonServiceException(e);
        }
    }

    /**
     * Builds the search request used by the export methods.
     * <p>
     * The request comes from the {@code urlExport} template, matches everything
     * ({@code *:*}) and, when a non-empty host is given, carries an additional
     * filter on the {@code host} field.
     *
     * @param urlManager the manager providing the request template
     * @param host       optional host to restrict to; {@code null} or empty
     *                   means no host filter
     * @return the configured search request
     * @throws SearchLibException propagated from the request construction
     * @throws ParseException     propagated from the request construction
     */
    private AbstractSearchRequest getRequest(UrlManager urlManager, String host)
            throws SearchLibException, ParseException {
        AbstractSearchRequest request = urlManager.getSearchRequest(SearchTemplate.urlExport);
        request.setQueryString("*:*");
        boolean hostMissing = host == null || host.length() == 0;
        if (hostMissing)
            return request;
        // NOTE(review): host is concatenated as-is; a value containing a double
        // quote would alter the filter expression - confirm callers sanitize it.
        request.addFilter("host:\"" + host + '"', false);
        return request;
    }

    /**
     * Exports the URLs of the index and returns the content of the generated
     * file.
     * <p>
     * The export file is produced by {@code UrlManager.exportURLs} using a
     * match-all request without host filter, then read fully into memory.
     *
     * @param use   index selector forwarded to {@code getLoggedClientAnyRole}
     * @param login login forwarded to {@code getLoggedClientAnyRole}
     * @param key   API key forwarded to {@code getLoggedClientAnyRole}
     * @return the bytes of the export file
     * @throws CommonServiceException wrapping any SearchLibException,
     *                                IOException, ParseException or
     *                                InterruptedException
     */
    public byte[] exportURLs(String use, String login, String key) {
        try {
            Client client = getLoggedClientAnyRole(use, login, key, Role.GROUP_WEB_CRAWLER);
            ClientFactory.INSTANCE.properties.checkApi();
            File file = client.getUrlManager().exportURLs(getRequest(client.getUrlManager(), null));
            FileInputStream inputStream = new FileInputStream(file);
            try {
                return IOUtils.toByteArray(inputStream);
            } finally {
                // Always release the file handle, even when reading fails.
                inputStream.close();
            }
        } catch (SearchLibException e) {
            throw new CommonServiceException(e);
        } catch (FileNotFoundException e) {
            throw new CommonServiceException(e);
        } catch (IOException e) {
            throw new CommonServiceException(e);
        } catch (ParseException e) {
            throw new CommonServiceException(e);
        } catch (InterruptedException e) {
            throw new CommonServiceException(e);
        }
    }

    /**
     * Exports a site map and returns the content of the generated file.
     * <p>
     * The file is produced by {@code UrlManager.exportSiteMap} using a
     * match-all request, filtered on {@code host} when one is provided, then
     * read fully into memory.
     *
     * @param use   index selector forwarded to {@code getLoggedClientAnyRole}
     * @param host  optional host restricting the export; {@code null} or empty
     *              means no host filter
     * @param login login forwarded to {@code getLoggedClientAnyRole}
     * @param key   API key forwarded to {@code getLoggedClientAnyRole}
     * @return the bytes of the site map file
     * @throws CommonServiceException wrapping any SearchLibException,
     *                                IOException, ParseException or
     *                                InterruptedException
     */
    public byte[] exportSiteMap(String use, String host, String login, String key) {
        try {
            Client client = getLoggedClientAnyRole(use, login, key, Role.GROUP_WEB_CRAWLER);
            ClientFactory.INSTANCE.properties.checkApi();
            File file = client.getUrlManager().exportSiteMap(getRequest(client.getUrlManager(), host));
            FileInputStream inputStream = new FileInputStream(file);
            try {
                return IOUtils.toByteArray(inputStream);
            } finally {
                // Always release the file handle, even when reading fails.
                inputStream.close();
            }
        } catch (SearchLibException e) {
            throw new CommonServiceException(e);
        } catch (FileNotFoundException e) {
            throw new CommonServiceException(e);
        } catch (IOException e) {
            throw new CommonServiceException(e);
        } catch (ParseException e) {
            throw new CommonServiceException(e);
        } catch (InterruptedException e) {
            throw new CommonServiceException(e);
        }
    }

    /**
     * Adds patterns to either the inclusion or the exclusion pattern list.
     *
     * @param index      index selector forwarded to {@code getLoggedClientAnyRole}
     * @param login      login forwarded to {@code getLoggedClientAnyRole}
     * @param key        API key forwarded to {@code getLoggedClientAnyRole}
     * @param replaceAll forwarded as second argument to
     *                   {@code PatternManager.addList}
     * @param patterns   the raw pattern strings to convert and add
     * @param inclusion  {@code true} targets the inclusion manager,
     *                   {@code false} the exclusion manager
     * @return a successful result whose message states how many patterns ended
     *         up with the {@code INJECTED} status
     * @throws CommonServiceException wrapping any SearchLibException,
     *                                IOException or InterruptedException
     */
    private CommonResult injectPatterns(String index, String login, String key, boolean replaceAll,
            List<String> patterns, boolean inclusion) {
        try {
            Client loggedClient = getLoggedClientAnyRole(index, login, key, Role.WEB_CRAWLER_EDIT_PATTERN_LIST);
            ClientFactory.INSTANCE.properties.checkApi();
            List<PatternItem> items = PatternManager.getPatternList(patterns);
            PatternManager targetManager;
            if (inclusion)
                targetManager = loggedClient.getInclusionPatternManager();
            else
                targetManager = loggedClient.getExclusionPatternManager();
            targetManager.addList(items, replaceAll);
            int injected = PatternManager.countStatus(items, PatternItem.Status.INJECTED);
            String message = injected + " patterns injected";
            return new CommonResult(true, message);
        } catch (IOException e) {
            throw new CommonServiceException(e);
        } catch (SearchLibException e) {
            throw new CommonServiceException(e);
        } catch (InterruptedException e) {
            throw new CommonServiceException(e);
        }
    }

    /**
     * Adds patterns to the inclusion list; see
     * {@link #injectPatterns(String, String, String, boolean, List, boolean)}.
     */
    @Override
    public CommonResult injectPatternsInclusion(String index, String login, String key, boolean replaceAll,
            List<String> patterns) {
        final boolean inclusion = true;
        return injectPatterns(index, login, key, replaceAll, patterns, inclusion);
    }

    /**
     * Adds patterns to the exclusion list; see
     * {@link #injectPatterns(String, String, String, boolean, List, boolean)}.
     */
    @Override
    public CommonResult injectPatternsExclusion(String index, String login, String key, boolean replaceAll,
            List<String> patterns) {
        final boolean inclusion = false;
        return injectPatterns(index, login, key, replaceAll, patterns, inclusion);
    }

    /**
     * Builds a successful result exposing the current values of the
     * inclusion-enabled and exclusion-enabled properties as the details
     * {@code inclusion_enabled} and {@code exclusion_enabled}.
     *
     * @param webPropertyManager the property manager to read from
     * @return the populated result
     */
    private CommonResult getPatternStatusResult(WebPropertyManager webPropertyManager) {
        final CommonResult statusResult = new CommonResult(true, null);
        statusResult.addDetail("inclusion_enabled",
                webPropertyManager.getInclusionEnabled().getValue());
        statusResult.addDetail("exclusion_enabled",
                webPropertyManager.getExclusionEnabled().getValue());
        return statusResult;
    }

    /**
     * Returns whether the inclusion and exclusion pattern lists are enabled.
     *
     * @param index index selector forwarded to {@code getLoggedClientAnyRole}
     * @param login login forwarded to {@code getLoggedClientAnyRole}
     * @param key   API key forwarded to {@code getLoggedClientAnyRole}
     * @return a result carrying the {@code inclusion_enabled} and
     *         {@code exclusion_enabled} details
     * @throws CommonServiceException wrapping any SearchLibException,
     *                                IOException or InterruptedException
     */
    @Override
    public CommonResult getPatternStatus(String index, String login, String key) {
        try {
            Client loggedClient = getLoggedClientAnyRole(index, login, key, Role.WEB_CRAWLER_EDIT_PATTERN_LIST);
            ClientFactory.INSTANCE.properties.checkApi();
            return getPatternStatusResult(loggedClient.getWebPropertyManager());
        } catch (IOException e) {
            throw new CommonServiceException(e);
        } catch (SearchLibException e) {
            throw new CommonServiceException(e);
        } catch (InterruptedException e) {
            throw new CommonServiceException(e);
        }
    }

    /**
     * Enables or disables the inclusion and/or exclusion pattern lists.
     *
     * @param index     index selector forwarded to {@code getLoggedClientAnyRole}
     * @param login     login forwarded to {@code getLoggedClientAnyRole}
     * @param key       API key forwarded to {@code getLoggedClientAnyRole}
     * @param inclusion new value of the inclusion-enabled property, or
     *                  {@code null} to leave it untouched
     * @param exclusion new value of the exclusion-enabled property, or
     *                  {@code null} to leave it untouched
     * @return a result carrying the property values read after the update
     * @throws CommonServiceException wrapping any SearchLibException,
     *                                IOException or InterruptedException
     */
    @Override
    public CommonResult setPatternStatus(String index, String login, String key, Boolean inclusion,
            Boolean exclusion) {
        try {
            Client loggedClient = getLoggedClientAnyRole(index, login, key, Role.WEB_CRAWLER_EDIT_PATTERN_LIST);
            ClientFactory.INSTANCE.properties.checkApi();
            WebPropertyManager propertyManager = loggedClient.getWebPropertyManager();
            if (inclusion != null) {
                propertyManager.getInclusionEnabled().setValue(inclusion);
            }
            if (exclusion != null) {
                propertyManager.getExclusionEnabled().setValue(exclusion);
            }
            return getPatternStatusResult(propertyManager);
        } catch (IOException e) {
            throw new CommonServiceException(e);
        } catch (SearchLibException e) {
            throw new CommonServiceException(e);
        } catch (InterruptedException e) {
            throw new CommonServiceException(e);
        }
    }

    /**
     * Removes patterns from either the inclusion or the exclusion pattern list.
     *
     * @param index     index selector forwarded to {@code getLoggedClientAnyRole}
     * @param login     login forwarded to {@code getLoggedClientAnyRole}
     * @param key       API key forwarded to {@code getLoggedClientAnyRole}
     * @param patterns  the patterns handed to {@code PatternManager.delPattern}
     * @param inclusion {@code true} targets the inclusion manager,
     *                  {@code false} the exclusion manager
     * @return a successful result whose message contains the count returned by
     *         {@code delPattern}
     * @throws CommonServiceException wrapping any SearchLibException,
     *                                IOException or InterruptedException
     */
    private CommonResult deletePatterns(String index, String login, String key, List<String> patterns,
            boolean inclusion) {
        try {
            Client loggedClient = getLoggedClientAnyRole(index, login, key, Role.WEB_CRAWLER_EDIT_PATTERN_LIST);
            ClientFactory.INSTANCE.properties.checkApi();
            PatternManager targetManager;
            if (inclusion)
                targetManager = loggedClient.getInclusionPatternManager();
            else
                targetManager = loggedClient.getExclusionPatternManager();
            int deleted = targetManager.delPattern(patterns);
            String message = deleted + " patterns deleted";
            return new CommonResult(true, message);
        } catch (IOException e) {
            throw new CommonServiceException(e);
        } catch (SearchLibException e) {
            throw new CommonServiceException(e);
        } catch (InterruptedException e) {
            throw new CommonServiceException(e);
        }
    }

    /**
     * Removes patterns from the inclusion list; see
     * {@link #deletePatterns(String, String, String, List, boolean)}.
     */
    @Override
    public CommonResult deletePatternsInclusion(String index, String login, String key, List<String> deleteList) {
        final boolean inclusion = true;
        return deletePatterns(index, login, key, deleteList, inclusion);
    }

    /**
     * Removes patterns from the exclusion list; see
     * {@link #deletePatterns(String, String, String, List, boolean)}.
     */
    @Override
    public CommonResult deletePatternsExclusion(String index, String login, String key, List<String> deleteList) {
        final boolean inclusion = false;
        return deletePatterns(index, login, key, deleteList, inclusion);
    }

    /**
     * Lists patterns from either the inclusion or the exclusion pattern list.
     *
     * @param index      index selector forwarded to {@code getLoggedClientAnyRole}
     * @param login      login forwarded to {@code getLoggedClientAnyRole}
     * @param key        API key forwarded to {@code getLoggedClientAnyRole}
     * @param startsWith forwarded as first argument to
     *                   {@code PatternManager.getPatterns}
     * @param inclusion  {@code true} reads the inclusion manager,
     *                   {@code false} the exclusion manager
     * @return a list result wrapping the collected patterns
     * @throws CommonServiceException wrapping any SearchLibException,
     *                                IOException or InterruptedException
     */
    public CommonListResult<String> extractPatterns(String index, String login, String key, String startsWith,
            boolean inclusion) {
        try {
            Client loggedClient = getLoggedClientAnyRole(index, login, key, Role.GROUP_WEB_CRAWLER);
            ClientFactory.INSTANCE.properties.checkApi();
            PatternManager sourceManager;
            if (inclusion)
                sourceManager = loggedClient.getInclusionPatternManager();
            else
                sourceManager = loggedClient.getExclusionPatternManager();
            // getPatterns fills the list passed as second argument.
            List<String> collected = new ArrayList<String>();
            sourceManager.getPatterns(startsWith, collected);
            return new CommonListResult<String>(collected);
        } catch (IOException e) {
            throw new CommonServiceException(e);
        } catch (SearchLibException e) {
            throw new CommonServiceException(e);
        } catch (InterruptedException e) {
            throw new CommonServiceException(e);
        }
    }

    /**
     * Lists patterns of the inclusion list; see
     * {@link #extractPatterns(String, String, String, String, boolean)}.
     */
    @Override
    public CommonListResult<String> extractPatternsInclusion(String index, String login, String key,
            String startsWith) {
        final boolean inclusion = true;
        return extractPatterns(index, login, key, startsWith, inclusion);
    }

    /**
     * Lists patterns of the exclusion list; see
     * {@link #extractPatterns(String, String, String, String, boolean)}.
     */
    @Override
    public CommonListResult<String> extractPatternsExclusion(String index, String login, String key,
            String startsWith) {
        final boolean inclusion = false;
        return extractPatterns(index, login, key, startsWith, inclusion);
    }

    /**
     * Crawls a single URL synchronously and reports the outcome.
     * <p>
     * A manual crawl is started on the web crawl master, then this method
     * waits for the crawl thread to start (limit 120) and to end (limit 3600);
     * the timeout messages express these limits in seconds. When
     * {@code returnData} is true and the crawl produced target index
     * documents, the result is a list result carrying their field values;
     * otherwise it is a plain result whose message summarizes the fetch,
     * parser and index statuses.
     * <p>
     * The per-URL details (URL, response code, statuses, content information)
     * are only added when the crawl thread exposes a URL item; when it does
     * not, the result is returned without details instead of failing with a
     * {@link NullPointerException}.
     *
     * @param use        index selector forwarded to {@code getLoggedClient}
     * @param login      login forwarded to {@code getLoggedClient}
     * @param key        API key forwarded to {@code getLoggedClient}
     * @param url        the URL to crawl
     * @param returnData when {@code Boolean.TRUE}, the indexed documents are
     *                   returned; {@code null} is treated as false
     * @return the crawl result
     * @throws WebServiceException    if one of the two waits times out
     * @throws CommonServiceException wrapping any checked exception raised
     *                                while crawling
     */
    @Override
    public CommonResult crawl(String use, String login, String key, String url, Boolean returnData) {
        try {
            Client client = getLoggedClient(use, login, key, Role.WEB_CRAWLER_START_STOP);
            ClientFactory.INSTANCE.properties.checkApi();
            WebCrawlThread webCrawlThread = client.getWebCrawlMaster().manualCrawl(LinkUtils.newEncodedURL(url),
                    HostUrlList.ListType.MANUAL);
            if (!webCrawlThread.waitForStart(120))
                throw new WebServiceException("Time out reached (120 seconds)");
            if (!webCrawlThread.waitForEnd(3600))
                throw new WebServiceException("Time out reached (3600 seconds)");
            UrlItem urlItem = webCrawlThread.getCurrentUrlItem();
            CommonResult cr = null;
            if (BooleanUtils.isTrue(returnData)) {
                Crawl crawl = webCrawlThread.getCurrentCrawl();
                if (crawl != null) {
                    List<IndexDocument> indexDocuments = crawl.getTargetIndexDocuments();
                    if (CollectionUtils.isNotEmpty(indexDocuments)) {
                        CommonListResult<List<FieldValueList>> clr = new CommonListResult<List<FieldValueList>>(
                                indexDocuments.size());
                        for (IndexDocument indexDocument : indexDocuments) {
                            List<FieldValueList> list = FieldValueList.getNewList(indexDocument);
                            if (list != null)
                                clr.items.add(list);
                        }
                        cr = clr;
                    }
                }
            }

            String message = urlItem != null
                    ? "Result: " + urlItem.getFetchStatus() + " - " + urlItem.getParserStatus() + " - "
                            + urlItem.getIndexStatus()
                    : null;
            // Fall back to a plain result when no document list was built.
            if (cr == null)
                cr = new CommonResult(true, message);
            // The URL item may be absent (the message above already allows for
            // it), so the details must be guarded as well.
            if (urlItem != null) {
                cr.addDetail("URL", urlItem.getUrl());
                cr.addDetail("HttpResponseCode", urlItem.getResponseCode());
                cr.addDetail("RobotsTxtStatus", urlItem.getRobotsTxtStatus());
                cr.addDetail("FetchStatus", urlItem.getFetchStatus());
                cr.addDetail("ParserStatus", urlItem.getParserStatus());
                cr.addDetail("IndexStatus", urlItem.getIndexStatus());
                cr.addDetail("RedirectionURL", urlItem.getRedirectionUrl());
                cr.addDetail("ContentBaseType", urlItem.getContentBaseType());
                cr.addDetail("ContentTypeCharset", urlItem.getContentTypeCharset());
                cr.addDetail("ContentLength", urlItem.getContentLength());
            }
            return cr;
        } catch (MalformedURLException e) {
            throw new CommonServiceException(e);
        } catch (SearchLibException e) {
            throw new CommonServiceException(e);
        } catch (ParseException e) {
            throw new CommonServiceException(e);
        } catch (IOException e) {
            throw new CommonServiceException(e);
        } catch (SyntaxError e) {
            throw new CommonServiceException(e);
        } catch (URISyntaxException e) {
            throw new CommonServiceException(e);
        } catch (ClassNotFoundException e) {
            throw new CommonServiceException(e);
        } catch (InterruptedException e) {
            throw new CommonServiceException(e);
        } catch (InstantiationException e) {
            throw new CommonServiceException(e);
        } catch (IllegalAccessException e) {
            throw new CommonServiceException(e);
        }
    }

    /**
     * Same operation as
     * {@link #crawl(String, String, String, String, Boolean)}; every argument
     * is forwarded unchanged.
     */
    @Override
    public CommonResult crawlPost(String use, String login, String key, String url, Boolean returnData) {
        final CommonResult crawlResult = crawl(use, login, key, url, returnData);
        return crawlResult;
    }

    /**
     * Captures a screenshot of the given URL through
     * {@code ScreenshotServlet.doCapture}, using the client's screenshot and
     * credential managers.
     *
     * @param use   index selector forwarded to {@code getLoggedClientAnyRole}
     * @param login login forwarded to {@code getLoggedClientAnyRole}
     * @param key   API key forwarded to {@code getLoggedClientAnyRole}
     * @param url   the URL to capture
     * @return a successful result whose message names the captured URL
     * @throws CommonServiceException wrapping any checked exception raised
     *                                during the capture
     */
    public CommonResult captureScreenshot(String use, String login, String key, String url) {
        try {
            Client loggedClient = getLoggedClientAnyRole(use, login, key, Role.GROUP_WEB_CRAWLER);
            ClientFactory.INSTANCE.properties.checkApi();
            ScreenshotManager screenshots = loggedClient.getScreenshotManager();
            CredentialManager credentials = loggedClient.getWebCredentialManager();
            ScreenshotServlet.doCapture(null, screenshots, credentials, LinkUtils.newEncodedURL(url));
            return new CommonResult(true, "Captured URL " + url);
        } catch (URISyntaxException e) {
            throw new CommonServiceException(e);
        } catch (MalformedURLException e) {
            throw new CommonServiceException(e);
        } catch (IOException e) {
            throw new CommonServiceException(e);
        } catch (SearchLibException e) {
            throw new CommonServiceException(e);
        } catch (InterruptedException e) {
            throw new CommonServiceException(e);
        }
    }

    /**
     * Checks the screenshot state of the given URL through
     * {@code ScreenshotServlet.doCheck}.
     *
     * @param use   index selector forwarded to {@code getLoggedClientAnyRole}
     * @param login login forwarded to {@code getLoggedClientAnyRole}
     * @param key   API key forwarded to {@code getLoggedClientAnyRole}
     * @param url   the URL to check
     * @return a successful result whose message is the string returned by
     *         {@code doCheck}
     * @throws CommonServiceException wrapping any checked exception raised
     *                                during the check
     */
    public CommonResult checkScreenshot(String use, String login, String key, String url) {
        try {
            Client loggedClient = getLoggedClientAnyRole(use, login, key, Role.GROUP_WEB_CRAWLER);
            ClientFactory.INSTANCE.properties.checkApi();
            ScreenshotManager screenshots = loggedClient.getScreenshotManager();
            String checkMessage = ScreenshotServlet.doCheck(screenshots, LinkUtils.newEncodedURL(url));
            return new CommonResult(true, checkMessage);
        } catch (SearchLibException e) {
            throw new CommonServiceException(e);
        } catch (InterruptedException e) {
            throw new CommonServiceException(e);
        } catch (URISyntaxException e) {
            throw new CommonServiceException(e);
        } catch (MalformedURLException e) {
            throw new CommonServiceException(e);
        } catch (IOException e) {
            throw new CommonServiceException(e);
        }
    }

    /**
     * Builds the REST URL of the crawl endpoint for the given URL.
     *
     * @param user   forwarded to {@code RestApplication.getRestURL}
     * @param client forwarded to {@code RestApplication.getRestURL}
     * @param url    value of the {@code url} parameter
     * @return the URL built by {@code RestApplication.getRestURL}
     * @throws UnsupportedEncodingException propagated from the URL builder
     */
    public static String getCrawlXML(User user, Client client, String url) throws UnsupportedEncodingException {
        final String crawlPath = "/index/{index}/crawler/web/crawl";
        return RestApplication.getRestURL(crawlPath, user, client, "url", url);
    }

    /**
     * Builds the REST URL of the crawl endpoint for the given URL.
     * <p>
     * NOTE(review): this produces exactly the same URL as the XML variant -
     * presumably the response format is negotiated elsewhere; confirm.
     *
     * @param user   forwarded to {@code RestApplication.getRestURL}
     * @param client forwarded to {@code RestApplication.getRestURL}
     * @param url    value of the {@code url} parameter
     * @return the URL built by {@code RestApplication.getRestURL}
     * @throws UnsupportedEncodingException propagated from the URL builder
     */
    public static String getCrawlJSON(User user, Client client, String url) throws UnsupportedEncodingException {
        final String crawlPath = "/index/{index}/crawler/web/crawl";
        return RestApplication.getRestURL(crawlPath, user, client, "url", url);
    }

    /**
     * Injects URLs into the URL database of the index.
     *
     * @param index      index selector forwarded to {@code getLoggedClientAnyRole}
     * @param login      login forwarded to {@code getLoggedClientAnyRole}
     * @param key        API key forwarded to {@code getLoggedClientAnyRole}
     * @param replaceAll when {@code true}, {@code UrlManager.deleteAll} is
     *                   called before the injection
     * @param urls       the URLs handed to {@code UrlManager.inject}
     * @return the result object that was passed to {@code UrlManager.inject}
     * @throws CommonServiceException wrapping any SearchLibException,
     *                                IOException or InterruptedException
     */
    @Override
    public CommonResult injectUrls(String index, String login, String key, boolean replaceAll, List<String> urls) {
        try {
            Client loggedClient = getLoggedClientAnyRole(index, login, key, Role.WEB_CRAWLER_EDIT_PARAMETERS);
            ClientFactory.INSTANCE.properties.checkApi();
            UrlManager urls2inject = loggedClient.getUrlManager();
            CommonResult injectionResult = new CommonResult(true, null);
            if (replaceAll) {
                urls2inject.deleteAll(null);
            }
            urls2inject.inject(urls, injectionResult);
            return injectionResult;
        } catch (IOException e) {
            throw new CommonServiceException(e);
        } catch (SearchLibException e) {
            throw new CommonServiceException(e);
        } catch (InterruptedException e) {
            throw new CommonServiceException(e);
        }
    }

    /**
     * Performs the login and API checks, then returns {@code null}.
     * <p>
     * NOTE(review): no URL is actually collected here - this looks like an
     * unimplemented stub; confirm what callers expect.
     *
     * @param index index selector forwarded to {@code getLoggedClientAnyRole}
     * @param login login forwarded to {@code getLoggedClientAnyRole}
     * @param key   API key forwarded to {@code getLoggedClientAnyRole}
     * @return always {@code null}
     * @throws CommonServiceException wrapping any IOException or
     *                                InterruptedException
     */
    @Override
    public CommonResult getUrls(String index, String login, String key) {
        try {
            // The returned client is not needed; the call is kept for its checks.
            getLoggedClientAnyRole(index, login, key, Role.GROUP_WEB_CRAWLER);
            ClientFactory.INSTANCE.properties.checkApi();
            return null;
        } catch (IOException e) {
            throw new CommonServiceException(e);
        } catch (InterruptedException e) {
            throw new CommonServiceException(e);
        }
    }

}