cc.gospy.example.zhihu.ZhihuUserSpider.java Source code

Java tutorial

Introduction

Here is the source code for cc.gospy.example.zhihu.ZhihuUserSpider.java

Source

/*
 * Copyright 2017 ZhangJiupeng
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cc.gospy.example.zhihu;

import cc.gospy.core.Gospy;
import cc.gospy.core.TaskFilter;
import cc.gospy.core.entity.Result;
import cc.gospy.core.entity.Task;
import cc.gospy.core.fetcher.Fetchers;
import cc.gospy.core.pipeline.Pipelines;
import cc.gospy.core.processor.Processors;
import cc.gospy.core.scheduler.Schedulers;
import cc.gospy.core.util.webdriver.Browser;
import org.openqa.selenium.By;
import org.openqa.selenium.NoSuchWindowException;
import org.openqa.selenium.WebDriverException;
import org.openqa.selenium.WebElement;

import java.util.*;

public class ZhihuUserSpider {
    public static void main(String[] args) {
        String startUrl = "selenium://https://www.zhihu.com/people/giantchen/following?page=1";
        String chromeDriverPath = "D:/Program Files/Chrome Driver/chromedriver.exe";

        // a demo of using selenium
        Gospy.custom()
                .setScheduler(Schedulers.VerifiableScheduler.custom().setPendingTimeInSeconds(300)
                        .setAutoExit(false).build())
                .addFetcher(Fetchers.TransparentFetcher.custom().convertHttpTaskToSelenium().build())
                .addProcessor(Processors.SeleniumProcessor.custom().setDriver(Browser.Chrome, chromeDriverPath)
                        .setWebDriverExecutor((page, driver) -> {
                            Collection<Task> newTasks = new HashSet<>();
                            boolean visited = false;
                            while (!visited) {
                                try {
                                    WebElement nextPageButton = driver
                                            .findElement(By.className("PaginationButton-next"));
                                    while (nextPageButton.isDisplayed()) {
                                        nextPageButton.click();
                                        while (true) {
                                            Thread.sleep(1000);
                                            try {
                                                if (driver.findElement(By.className("UserLink-link")).isEnabled()) {
                                                    visited = true;
                                                    break;
                                                }
                                            } catch (Exception e) {
                                            }
                                        }
                                        Thread.sleep(1000);
                                        List<WebElement> elements = driver.findElements(By.xpath(
                                                "//*[@id='Profile-following']/div[2]/div/div/div/div[2]/h2/div/span/div/div/a"));
                                        elements.forEach(e -> {
                                            Task newTask = new Task("selenium://"
                                                    + e.getAttribute("href").concat("/following?page=1"));
                                            newTask.setDepth(page.getTask().getDepth() + 1);
                                            System.out
                                                    .println("find " + e.getText() + "\t" + e.getAttribute("href"));
                                            newTasks.add(newTask);
                                        });
                                    }
                                } catch (Exception e) {
                                }
                            }
                            User user = new User();
                            user.setDistance(page.getTask().getDepth());
                            user.setUsername(driver.findElement(By.className("ProfileHeader-name")).getText());
                            WebElement detailButton;
                            try {
                                detailButton = driver.findElement(
                                        By.xpath("//*[@id='ProfileHeader']/div/div[2]/div/div[2]/div[3]/button"));
                                detailButton.click();
                            } catch (NoSuchElementException e) {
                                return new Result<>(newTasks, user);
                            }
                            while (!driver.findElement(By.className("ProfileHeader-detail")).isDisplayed())
                                ;
                            for (WebElement e : driver.findElements(By.xpath(
                                    "//*[@id='ProfileHeader']/div/div[2]/div/div[2]/div[2]/span/div/div/span"))) {
                                switch (e.getText().trim()) {
                                case "?":
                                    user.setResidence(e.findElement(By.xpath("./../div/span")).getText());
                                    break;
                                case "":
                                    user.setIndustry(e.findElement(By.xpath("./../div")).getText());
                                    break;
                                case "??":
                                    StringBuffer buffer = new StringBuffer();
                                    e.findElements(By.xpath("./../div/div")).forEach(webElement -> {
                                        buffer.append(webElement.getText()).append("\t");
                                    });
                                    user.setJobs(buffer.toString());
                                    break;
                                case "?":
                                    buffer = new StringBuffer();
                                    e.findElements(By.xpath("./../div/div")).forEach(webElement -> {
                                        buffer.append(webElement.getText()).append("\t");
                                    });
                                    user.setEducations(
                                            buffer.length() > 1 ? buffer.substring(0, buffer.length() - 1) : null);
                                    break;
                                case "":
                                    user.setIntro(
                                            e.findElement(By.xpath("./../div")).getText().replaceAll("\n", " "));
                                    break;
                                default:
                                    System.err.println("unknown detail [" + e.getText() + "]");
                                }
                            }
                            return new Result<>(newTasks, user);
                        }).setTaskFilter(TaskFilter.SELENIUM).build())
                .setExceptionHandler((throwable, task, page) -> {
                    if (throwable.getCause() != null
                            && (throwable.getCause().getClass() == NoSuchWindowException.class
                                    || throwable.getCause().getClass() == WebDriverException.class)) {
                        System.out.println("Browser is closed, exit now!");
                        System.exit(0);
                    }
                    throwable.printStackTrace();
                    if (task != null) {
                        if (!task.getUrl().startsWith("selenium://")) {
                            task.setUrl("selenium://".concat(task.getUrl()));
                            System.err.println("Retry " + task);
                            return Arrays.asList(task);
                        }
                    }
                    return null;
                }).addPipeline(Pipelines.ConsolePipeline.getDefault()).build().addTask(new Task(startUrl)).start();
    }
}