com.watchrabbit.crawler.executor.service.CrawlExecutorServiceImpl.java Source code

Java tutorial

Introduction

Here is the source code for com.watchrabbit.crawler.executor.service.CrawlExecutorServiceImpl.java

Source

/*
 * Copyright 2015 Mariusz.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.watchrabbit.crawler.executor.service;

import com.watchrabbit.commons.clock.Stopwatch;
import com.watchrabbit.crawler.api.CrawlForm;
import com.watchrabbit.crawler.api.CrawlResult;
import com.watchrabbit.crawler.api.LinkDto;
import com.watchrabbit.crawler.driver.factory.RemoteWebDriverFactory;
import com.watchrabbit.crawler.driver.service.LoaderService;
import com.watchrabbit.crawler.executor.facade.AuthServiceFacade;
import com.watchrabbit.crawler.executor.facade.ManagerServiceFacade;
import com.watchrabbit.crawler.executor.listener.CrawlListener;
import com.watchrabbit.crawler.executor.strategy.KeywordGenerateStrategy;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.TimeUnit;
import static java.util.stream.Collectors.toList;
import org.apache.commons.lang.StringUtils;
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.remote.RemoteWebDriver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

/**
 * {@link CrawlExecutorService} implementation that processes one page per call
 * using a {@link RemoteWebDriver} borrowed from {@link RemoteWebDriverFactory}.
 *
 * <p>
 * For each {@link CrawlForm} the service loads the page (optionally restoring
 * session cookies and submitting a search keyword), collects the visible
 * absolute links, optionally generates keyword links for gateway pages, and
 * reports the outcome to {@link ManagerServiceFacade}.
 *
 * @author Mariusz
 */
@Service
public class CrawlExecutorServiceImpl implements CrawlExecutorService {

    private static final Logger LOGGER = LoggerFactory.getLogger(CrawlExecutorServiceImpl.class);

    @Autowired
    AuthServiceFacade authServiceFacade;

    @Autowired
    RemoteWebDriverFactory remoteWebDriverFactory;

    @Autowired
    ManagerServiceFacade managerServiceFacade;

    @Autowired
    LoaderService loaderService;

    @Autowired
    KeywordGenerateStrategy keywordGenerateStrategy;

    /**
     * Optional listener invoked after the page is loaded; the initializer is a
     * no-op listener returning 0 that stays in place if nothing is injected.
     */
    @Autowired(required = false)
    CrawlListener crawlListener = (pageId, driver) -> 0;

    /**
     * Loads the page described by {@code form}, gathers its links and reports
     * the result to the manager.
     *
     * <p>
     * Any exception raised while loading or analysing the page is logged and
     * reported through {@link ManagerServiceFacade#onError}; the driver is
     * always handed back to the factory.
     *
     * @param form description of the page to crawl (URL, domain, id, optional
     * keyword and gateway flag)
     */
    @Override
    public void processPage(CrawlForm form) {
        Collection<Cookie> session = authServiceFacade.getSession(form.getDomain());
        RemoteWebDriver driver = remoteWebDriverFactory.produceDriver();
        try {
            Stopwatch stopwatch = Stopwatch.createStarted(() -> enableSession(driver, form, session));
            LOGGER.debug("Finished loading {} in {}", form.getUrl(),
                    stopwatch.getExecutionTime(TimeUnit.MILLISECONDS));

            // Explicit ArrayList: the list is appended to below and
            // Collectors.toList() gives no guarantee that its result is mutable.
            List<LinkDto> links = new ArrayList<>();
            for (String url : collectLinks(driver)) {
                links.add(new LinkDto.Builder().withUrl(url).build());
            }
            if (form.isGateway()) {
                LOGGER.debug("Processing gateway {}", form.getUrl());
                // Each generated keyword becomes a link back to this page carrying that keyword.
                for (String keyword : keywordGenerateStrategy.generateKeywords(form, driver)) {
                    links.add(new LinkDto.Builder().withKeyword(keyword).withUrl(form.getUrl()).build());
                }
            }
            double importanceFactor = crawlListener.accept(form.getId(), driver);
            managerServiceFacade.consumeResult(new CrawlResult.Builder().withDomain(form.getDomain())
                    .withMiliseconds(stopwatch.getExecutionTime(TimeUnit.MILLISECONDS)).withUrl(form.getUrl())
                    .withLinks(links).withId(form.getId()).withImportanceFactor(importanceFactor).build());
        } catch (Exception ex) {
            LOGGER.error("Execption on processing page " + form.getUrl(), ex);
            managerServiceFacade.onError(form);
        } finally {
            remoteWebDriverFactory.returnWebDriver(driver);
        }
    }

    /**
     * Opens the form's URL, restores the session cookies when there are any,
     * and submits the form's keyword through the page's search form if one is
     * found.
     *
     * @param driver driver used to load the page
     * @param form page description providing the URL and optional keyword
     * @param session cookies to install; {@code null} or empty means no session
     */
    private void enableSession(RemoteWebDriver driver, CrawlForm form, Collection<Cookie> session) {
        driver.get(form.getUrl());
        loaderService.waitFor(driver);
        if (session != null && !session.isEmpty()) {
            // The page is loaded first, cookies are replaced, then the page is
            // reloaded so that it is rendered with the restored session.
            driver.manage().deleteAllCookies();
            session.forEach(driver.manage()::addCookie);

            driver.get(form.getUrl());
            loaderService.waitFor(driver);
        }
        if (StringUtils.isNotEmpty(form.getKeyword())) {
            findSearchInput(driver).ifPresent(searchForm -> {
                searchForm.input.sendKeys(form.getKeyword());
                loaderService.waitFor(driver);
                searchForm.submit.click();
                loaderService.waitFor(driver);
            });
        }
    }

    /**
     * Returns the distinct {@code href} values of all displayed anchors whose
     * target starts with {@code "http"}.
     *
     * @param driver driver holding the loaded page
     * @return distinct link targets in encounter order
     */
    private List<String> collectLinks(RemoteWebDriver driver) {
        return driver.findElements(By.xpath("//a")).stream()
                .filter(WebElement::isDisplayed)
                .map(anchor -> anchor.getAttribute("href"))
                .filter(href -> href != null)
                .filter(href -> href.startsWith("http"))
                .distinct()
                .collect(toList());
    }

    /**
     * Looks for the first form that has exactly one displayed text/search
     * input, no displayed password input and a submit control.
     *
     * @param driver driver holding the loaded page
     * @return the input and submit elements of the matching form, or empty if
     * no form qualifies
     */
    private Optional<SearchForm> findSearchInput(RemoteWebDriver driver) {
        for (WebElement form : driver.findElements(By.xpath("//form"))) {
            LOGGER.debug("Looking to form with action {}", form.getAttribute("action"));

            // Single pass over the inputs: one element query per form and one
            // "type" lookup per input instead of repeating both for each filter.
            List<WebElement> inputs = new ArrayList<>();
            boolean hasVisiblePassword = false;
            for (WebElement input : form.findElements(By.xpath(".//input"))) {
                // Constant-first equals so a null attribute value is skipped rather than throwing.
                String type = input.getAttribute("type");
                if ("text".equals(type) || "search".equals(type)) {
                    if (input.isDisplayed()) {
                        inputs.add(input);
                    }
                } else if ("password".equals(type) && input.isDisplayed()) {
                    hasVisiblePassword = true;
                }
            }

            if (inputs.size() == 1 && !hasVisiblePassword) {
                // Prefer a <button type="submit">, fall back to <input type="submit">.
                List<WebElement> submit = form.findElements(By.xpath(".//button[@type='submit']"));
                if (submit.isEmpty()) {
                    submit = form.findElements(By.xpath(".//input[@type='submit']"));
                }
                if (!submit.isEmpty()) {
                    return Optional.of(new SearchForm(inputs.get(0), submit.get(0)));
                }
            }
        }
        LOGGER.error("Cannot find form in gateway page");
        return Optional.empty();
    }

    /**
     * Pair of elements making up a search form: the text input and the control
     * that submits it. Static because it needs no access to the service.
     */
    private static final class SearchForm {

        final WebElement input;

        final WebElement submit;

        SearchForm(WebElement input, WebElement submit) {
            this.input = input;
            this.submit = submit;
        }

    }
}