com.github.binlee1990.transformers.spider.PersonCrawler.java Source code

Java tutorial

Introduction

Here is the source code for com.github.binlee1990.transformers.spider.PersonCrawler.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 * <p>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.github.binlee1990.transformers.spider;

import com.alibaba.fastjson.JSON;
import com.github.binlee1990.transformers.dao.mapper.*;
import com.github.binlee1990.transformers.dao.model.*;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import java.util.regex.Pattern;

/**
 * Crawler that visits video detail pages, extracts the video attributes, cast
 * and genres from the HTML, and stores them through the injected mappers.
 *
 * <p>A page is processed at most once per identification code: if a video with
 * the same code is already stored, the page is skipped.
 *
 * @author Yasser Ganjisaffar
 */
public class PersonCrawler extends WebCrawler {
    private static final Pattern IMAGE_EXTENSIONS = Pattern.compile(".*\\.(bmp|gif|jpg|png)$");

    /** URL prefix shared by every video detail page this crawler handles. */
    private static final String VIDEO_URL_PREFIX = "http://www.javlibrary.com/cn/?v=jav";

    /** Format of the release date text on a detail page. */
    private static final String RELEASE_DATE_PATTERN = "yyyy-MM-dd";

    private final ActressMapper actressMapper;
    private final CategoryMapper categoryMapper;
    private final VideoMapper videoMapper;
    private final VideoActressMapper videoActressMapper;
    private final VideoCategoryMapper videoCategoryMapper;

    /**
     * Creates a crawler that persists its findings through the given mappers.
     *
     * @param actressMapper       mapper used to look up and insert actresses
     * @param categoryMapper      mapper used to look up and insert categories
     * @param videoMapper         mapper used to look up and insert videos
     * @param videoActressMapper  mapper used to insert video-to-actress links
     * @param videoCategoryMapper mapper used to insert video-to-category links
     */
    public PersonCrawler(ActressMapper actressMapper, CategoryMapper categoryMapper, VideoMapper videoMapper,
            VideoActressMapper videoActressMapper, VideoCategoryMapper videoCategoryMapper) {
        this.actressMapper = actressMapper;
        this.categoryMapper = categoryMapper;
        this.videoMapper = videoMapper;
        this.videoActressMapper = videoActressMapper;
        this.videoCategoryMapper = videoCategoryMapper;
    }

    /**
     * Decides whether a discovered link should be fetched.
     *
     * @param referringPage the page on which the link was found (unused)
     * @param url           the candidate link
     * @return {@code true} only for non-image URLs that start with the video
     *         detail prefix (compared case-insensitively)
     */
    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        // Locale.ROOT keeps the lower-casing independent of the JVM default locale
        // (e.g. a Turkish locale maps 'I' to a dotless 'i' and would break the prefix match).
        String href = url.getURL().toLowerCase(Locale.ROOT);

        if (IMAGE_EXTENSIONS.matcher(href).matches()) {
            return false;
        }

        return href.startsWith(VIDEO_URL_PREFIX);
    }

    /**
     * Parses a fetched video detail page and stores the video together with
     * its actress and category links. Pages that are not HTML, do not match
     * the video URL prefix, lack an identification code, or describe an
     * already stored video are ignored.
     *
     * @param page the fetched page
     */
    @Override
    public void visit(Page page) {
        String url = page.getWebURL().getURL();

        logger.info(url);
        // Unlike shouldVisit, this comparison is case-sensitive on the raw URL.
        if (!url.startsWith(VIDEO_URL_PREFIX)) {
            return;
        }

        Object parseData = page.getParseData();
        if (!(parseData instanceof HtmlParseData)) {
            return;
        }

        Document doc = Jsoup.parse(((HtmlParseData) parseData).getHtml());

        String videoIdentificationCode = firstText(doc.select("div#video_id td.text"));
        if (videoIdentificationCode == null) {
            // Without the code there is no key to de-duplicate on, so skip the page.
            logger.warn("No video identification code found, skipping " + url);
            return;
        }

        // NOTE(review): the existence check and the insert below are separate mapper
        // calls; concurrent crawlers could both pass the check unless the schema
        // enforces uniqueness - confirm.
        Video queryVideo = new Video();
        queryVideo.setIdentificationCode(videoIdentificationCode);
        if (videoMapper.queryByVideo(queryVideo) != null) {
            return;
        }

        Video video = buildVideo(doc, url, videoIdentificationCode);

        Elements actressElements = doc.select("div#video_cast span.star");
        if (CollectionUtils.isNotEmpty(actressElements)) {
            video.setSingleFemaleFlag(actressElements.size() <= 1);
        }

        videoMapper.insertSelective(video);
        Video storedVideo = videoMapper.queryByVideo(video);
        if (storedVideo == null) {
            logger.warn("Inserted video could not be read back, skipping links for " + url);
            return;
        }
        int videoId = storedVideo.getId();

        logger.info("handle " + videoId + "\n" + JSON.toJSONString(video));

        actressElements.forEach(star -> linkActress(star.text().trim(), videoId));

        Elements categoryElements = doc.select("div#video_genres span.genre");
        categoryElements.forEach(genre -> linkCategory(genre.text().trim(), videoId));
    }

    /** Builds an unsaved {@link Video} from the fields present on the detail page. */
    private Video buildVideo(Document doc, String url, String identificationCode) {
        Video video = new Video();
        video.setUrl(url);
        video.setIdentificationCode(identificationCode);

        Date now = new Date();
        video.setCreateTime(now);
        video.setUpdateTime(now);

        String title = firstText(doc.select("div#video_title a"));
        if (title != null) {
            video.setTitle(title);
        } else {
            logger.warn("No title found on " + url);
        }

        String releaseDate = firstText(doc.select("div#video_date td.text"));
        if (releaseDate != null) {
            try {
                // SimpleDateFormat is not thread-safe, so a fresh instance is used per call.
                video.setReleaseDate(new SimpleDateFormat(RELEASE_DATE_PATTERN).parse(releaseDate));
            } catch (ParseException e) {
                logger.warn("Unparseable release date '" + releaseDate + "' on " + url, e);
            }
        }

        Integer durationMinutes = parseInteger(
                firstText(doc.select("div#video_length span.text")), "duration", url);
        if (durationMinutes != null) {
            video.setDurationMinutes(durationMinutes);
        }

        String director = firstText(doc.select("div#video_director td.text"));
        if (director != null) {
            video.setDirector(director);
        }

        String producer = firstText(doc.select("div#video_maker td.text"));
        if (producer != null) {
            video.setProducer(producer);
        }

        String distributor = firstText(doc.select("div#video_label td.text"));
        if (distributor != null) {
            video.setDistributor(distributor);
        }

        // The three counters are looked up relative to the spans of the favorite box.
        Elements countElements = doc.select("div#video_favorite_edit span");

        Integer countWanted = parseInteger(
                firstText(countElements.select("#subscribed a")), "wanted count", url);
        if (countWanted != null) {
            video.setCountWanted(countWanted);
        }

        Integer countWatched = parseInteger(
                firstText(countElements.select("#watched a")), "watched count", url);
        if (countWatched != null) {
            video.setCountWatched(countWatched);
        }

        Integer countOwned = parseInteger(
                firstText(countElements.select("#owned a")), "owned count", url);
        if (countOwned != null) {
            video.setCountOwned(countOwned);
        }

        String rawScore = firstText(doc.select("div#video_review td.text span.score"));
        if (rawScore != null) {
            // The score is rendered wrapped in parentheses; strip them before parsing.
            String score = StringUtils.replace(StringUtils.replace(rawScore, "(", ""), ")", "").trim();
            if (StringUtils.isNotBlank(score)) {
                try {
                    video.setScore(Float.valueOf(score));
                } catch (NumberFormatException e) {
                    logger.warn("Unparseable score '" + rawScore + "' on " + url, e);
                }
            }
        }

        return video;
    }

    /** Links the video to the named actress, inserting the actress first if she is not stored yet. */
    private void linkActress(String name, int videoId) {
        if (StringUtils.isBlank(name)) {
            return;
        }

        Actress queryActress = new Actress();
        queryActress.setName(name);
        Actress actress = actressMapper.queryByActress(queryActress);
        if (actress == null) {
            Actress created = new Actress();
            created.setName(name);
            actressMapper.insertSelective(created);
            // Re-read the row so the link uses the stored code rather than the
            // code of the local object, on which only the name was set.
            actress = actressMapper.queryByActress(created);
            if (actress == null) {
                logger.warn("Inserted actress '" + name + "' could not be read back, link skipped");
                return;
            }
        }

        VideoActress va = new VideoActress();
        va.setActressCode(actress.getCode());
        va.setVideoId(videoId);
        videoActressMapper.insertSelective(va);
    }

    /** Links the video to the category with the given subtype, inserting the category first if needed. */
    private void linkCategory(String subtype, int videoId) {
        if (StringUtils.isBlank(subtype)) {
            return;
        }

        Category queryCategory = new Category();
        queryCategory.setSubtype(subtype);
        Category category = categoryMapper.queryByCategory(queryCategory);
        if (category == null) {
            Category created = new Category();
            created.setSubtype(subtype);
            categoryMapper.insertSelective(created);
            category = categoryMapper.queryByCategory(created);
            if (category == null) {
                logger.warn("Inserted category '" + subtype + "' could not be read back, link skipped");
                return;
            }
        }

        VideoCategory vc = new VideoCategory();
        vc.setCategoryId(category.getId());
        vc.setCategoryDescription(category.getSubtype());
        vc.setVideoId(videoId);
        videoCategoryMapper.insertSelective(vc);
    }

    /** Returns the text of the first matched element, or {@code null} when nothing matched. */
    private static String firstText(Elements matches) {
        return CollectionUtils.isEmpty(matches) ? null : matches.first().text();
    }

    /**
     * Parses an integer field best-effort.
     *
     * @return the parsed value, or {@code null} when the text is missing, blank or not a number
     */
    private Integer parseInteger(String text, String fieldName, String url) {
        if (StringUtils.isBlank(text)) {
            return null;
        }
        try {
            return Integer.valueOf(text.trim());
        } catch (NumberFormatException e) {
            logger.warn("Unparseable " + fieldName + " '" + text + "' on " + url, e);
            return null;
        }
    }
}