Java tutorial
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * <p>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.github.binlee1990.transformers.spider;

import com.alibaba.fastjson.JSON;
import com.github.binlee1990.transformers.dao.mapper.*;
import com.github.binlee1990.transformers.dao.model.*;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import java.util.regex.Pattern;

/**
 * Crawler that visits video detail pages, extracts the video metadata from the
 * HTML with jsoup selectors and stores the video together with its actress and
 * category links through the injected mappers.
 *
 * <p>A page whose identification code is already stored is skipped. Optional
 * fields that are missing or cannot be parsed are left unset and reported in
 * the log instead of aborting the page.
 *
 * @author Yasser Ganjisaffar
 */
public class PersonCrawler extends WebCrawler {

    private static final Pattern IMAGE_EXTENSIONS = Pattern.compile(".*\\.(bmp|gif|jpg|png)$");

    /** Only URLs starting with this prefix are followed and parsed. */
    private static final String VIDEO_URL_PREFIX = "http://www.javlibrary.com/cn/?v=jav";

    private final ActressMapper actressMapper;
    private final CategoryMapper categoryMapper;
    private final VideoMapper videoMapper;
    private final VideoActressMapper videoActressMapper;
    private final VideoCategoryMapper videoCategoryMapper;

    /**
     * Creates a crawler that persists its findings through the given mappers.
     *
     * @param actressMapper       mapper used to look up and insert actresses
     * @param categoryMapper      mapper used to look up and insert categories
     * @param videoMapper         mapper used to look up and insert videos
     * @param videoActressMapper  mapper used to insert video/actress links
     * @param videoCategoryMapper mapper used to insert video/category links
     */
    public PersonCrawler(ActressMapper actressMapper, CategoryMapper categoryMapper, VideoMapper videoMapper,
                         VideoActressMapper videoActressMapper, VideoCategoryMapper videoCategoryMapper) {
        this.actressMapper = actressMapper;
        this.categoryMapper = categoryMapper;
        this.videoMapper = videoMapper;
        this.videoActressMapper = videoActressMapper;
        this.videoCategoryMapper = videoCategoryMapper;
    }

    /**
     * Decides whether a discovered link should be crawled.
     *
     * @param referringPage page on which the link was found (not inspected)
     * @param url           candidate link
     * @return {@code true} when the lower-cased URL is not an image and starts
     *         with the video detail prefix
     */
    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        // Locale.ROOT keeps the lower-casing independent of the JVM default locale.
        String href = url.getURL().toLowerCase(Locale.ROOT);
        if (IMAGE_EXTENSIONS.matcher(href).matches()) {
            return false;
        }
        return href.startsWith(VIDEO_URL_PREFIX);
    }

    /**
     * Parses a fetched video detail page and stores the video, its actresses and
     * its categories. Pages that are not HTML, do not match the video prefix,
     * carry no identification code, or are already stored are ignored.
     *
     * @param page the fetched page
     */
    @Override
    public void visit(Page page) {
        String url = page.getWebURL().getURL();
        logger.info(url);

        if (!url.startsWith(VIDEO_URL_PREFIX)) {
            return;
        }
        if (!(page.getParseData() instanceof HtmlParseData)) {
            return;
        }

        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        Document doc = Jsoup.parse(htmlParseData.getHtml());

        // Without an identification code the duplicate check below is meaningless,
        // so such a page is skipped rather than failing on a missing element.
        String videoIdentificationCode = firstText(doc.select("div#video_id td.text"));
        if (StringUtils.isBlank(videoIdentificationCode)) {
            logger.warn("skip page without identification code: " + url);
            return;
        }

        Video queryVideo = new Video();
        queryVideo.setIdentificationCode(videoIdentificationCode);
        if (null != videoMapper.queryByVideo(queryVideo)) {
            return;
        }

        Elements actressElements = doc.select("div#video_cast span.star");
        Video video = buildVideo(doc, url, videoIdentificationCode, actressElements);

        videoMapper.insertSelective(video);
        Video persistedVideo = videoMapper.queryByVideo(video);
        if (null == persistedVideo) {
            logger.warn("video could not be read back after insert, links skipped: " + url);
            return;
        }
        int videoId = persistedVideo.getId();
        logger.info("handle " + videoId + "\n" + JSON.toJSONString(video));

        linkActresses(actressElements, videoId);
        linkCategories(doc.select("div#video_genres span.genre"), videoId);
    }

    /** Builds the {@link Video} to insert from the fields found in the document. */
    private Video buildVideo(Document doc, String url, String identificationCode, Elements actressElements) {
        Video video = new Video();
        video.setUrl(url);
        Date now = new Date();
        video.setCreateTime(now);
        video.setUpdateTime(now);

        String title = firstText(doc.select("div#video_title a"));
        if (null == title) {
            logger.warn("no title found on " + url);
        }
        video.setTitle(title);
        video.setIdentificationCode(identificationCode);

        String releaseDate = firstText(doc.select("div#video_date td.text"));
        if (null != releaseDate) {
            // A fresh formatter per call: SimpleDateFormat is not thread-safe.
            SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
            try {
                video.setReleaseDate(sdf.parse(releaseDate));
            } catch (ParseException e) {
                logger.warn("unparseable release date '" + releaseDate + "' on " + url);
            }
        }

        Integer durationMinutes = parseInteger(firstText(doc.select("div#video_length span.text")),
                "duration", url);
        if (null != durationMinutes) {
            video.setDurationMinutes(durationMinutes);
        }

        video.setDirector(firstText(doc.select("div#video_director td.text")));
        video.setProducer(firstText(doc.select("div#video_maker td.text")));
        video.setDistributor(firstText(doc.select("div#video_label td.text")));

        Elements countElements = doc.select("div#video_favorite_edit span");
        if (CollectionUtils.isNotEmpty(countElements)) {
            Integer countWanted = parseInteger(firstText(countElements.select("#subscribed a")),
                    "wanted count", url);
            if (null != countWanted) {
                video.setCountWanted(countWanted);
            }
            Integer countWatched = parseInteger(firstText(countElements.select("#watched a")),
                    "watched count", url);
            if (null != countWatched) {
                video.setCountWatched(countWatched);
            }
            Integer countOwned = parseInteger(firstText(countElements.select("#owned a")),
                    "owned count", url);
            if (null != countOwned) {
                video.setCountOwned(countOwned);
            }
        }

        String score = firstText(doc.select("div#video_review td.text span.score"));
        if (null != score) {
            // The score is rendered wrapped in parentheses; strip them before parsing.
            score = StringUtils.replace(score, "(", "");
            score = StringUtils.replace(score, ")", "");
            if (StringUtils.isNotBlank(score)) {
                try {
                    video.setScore(Float.valueOf(score));
                } catch (NumberFormatException e) {
                    logger.warn("unparseable score '" + score + "' on " + url);
                }
            }
        }

        // The flag stays unset when the page lists no cast at all.
        if (CollectionUtils.isNotEmpty(actressElements)) {
            video.setSingleFemaleFlag(actressElements.size() <= 1);
        }
        return video;
    }

    /** Inserts one video/actress link per non-blank cast entry, creating unknown actresses first. */
    private void linkActresses(Elements actressElements, int videoId) {
        if (CollectionUtils.isEmpty(actressElements)) {
            return;
        }
        for (Element actressElement : actressElements) {
            String name = actressElement.text().trim();
            if (StringUtils.isBlank(name)) {
                continue;
            }
            Actress probe = new Actress();
            probe.setName(name);
            Actress actress = actressMapper.queryByActress(probe);
            if (null == actress) {
                actressMapper.insertSelective(probe);
                // Prefer the stored row so that the link uses the code as persisted;
                // fall back to the in-memory object if the row cannot be read back.
                Actress persisted = actressMapper.queryByActress(probe);
                actress = (null != persisted) ? persisted : probe;
            }
            VideoActress va = new VideoActress();
            va.setActressCode(actress.getCode());
            va.setVideoId(videoId);
            videoActressMapper.insertSelective(va);
        }
    }

    /** Inserts one video/category link per non-blank genre entry, creating unknown categories first. */
    private void linkCategories(Elements categoryElements, int videoId) {
        if (CollectionUtils.isEmpty(categoryElements)) {
            return;
        }
        for (Element categoryElement : categoryElements) {
            String description = categoryElement.text().trim();
            if (StringUtils.isBlank(description)) {
                continue;
            }
            Category probe = new Category();
            probe.setSubtype(description);
            Category category = categoryMapper.queryByCategory(probe);
            if (null == category) {
                categoryMapper.insertSelective(probe);
                category = categoryMapper.queryByCategory(probe);
                if (null == category) {
                    // Without a stored row there is no id to link against.
                    logger.warn("category '" + description + "' could not be read back after insert");
                    continue;
                }
            }
            VideoCategory vc = new VideoCategory();
            vc.setCategoryId(category.getId());
            vc.setCategoryDescription(category.getSubtype());
            vc.setVideoId(videoId);
            videoCategoryMapper.insertSelective(vc);
        }
    }

    /** Returns the text of the first matched element, or {@code null} when nothing matched. */
    private String firstText(Elements matches) {
        if (CollectionUtils.isEmpty(matches)) {
            return null;
        }
        return matches.first().text();
    }

    /** Parses an integer field, returning {@code null} (and logging) when it is absent or malformed. */
    private Integer parseInteger(String raw, String field, String url) {
        if (StringUtils.isBlank(raw)) {
            return null;
        }
        try {
            return Integer.valueOf(raw.trim());
        } catch (NumberFormatException e) {
            logger.warn("unparseable " + field + " '" + raw + "' on " + url);
            return null;
        }
    }
}