// Java tutorial
/** * ########################## GoodCrawler ############################ * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.sbs.goodcrawler.plugin.extract; import java.io.UnsupportedEncodingException; import java.util.HashMap; import java.util.Map; import java.util.Map.Entry; import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.sbs.goodcrawler.exception.QueueException; import org.sbs.goodcrawler.extractor.Extractor; import org.sbs.goodcrawler.job.Page; import org.sbs.goodcrawler.jobconf.ExtractConfig; import org.sbs.goodcrawler.storage.PendingStore.ExtractedPage; import org.sbs.goodcrawler.urlmanager.WebURL; /** * @author shenbaise(shenbaise@outlook.com) * @date 2013-7-7 * extractor for 66ys * de precated . 
use defaultExtractor instead */ @Deprecated public class ExtractorDytt8 extends Extractor { private Log log = LogFactory.getLog(this.getClass()); /** * @param conf */ public ExtractorDytt8(ExtractConfig conf) { super(conf); } /* (non-Javadoc) * @see org.sbs.goodcrawler.extractor.Extractor#onExtract(org.sbs.goodcrawler.job.Page) */ @Override public ExtractedPage<?, ?> onExtract(Page page) { if (null != page) { try { Document doc = Jsoup.parse(new String(page.getContentData(), page.getContentCharset()), urlUtils.getBaseUrl(page.getWebURL().getURL())); if (null != page.getWebURL().getURL() && page.getWebURL().getURL().contains("game/")) return null; // ???Url?Url Elements links = doc.getElementsByTag("a"); if (!links.isEmpty()) { for (Element link : links) { String linkHref = link.absUrl("href"); if (StringUtils.isNotBlank(linkHref) && filterUrls(linkHref)) { try { WebURL url = new WebURL(); url.setURL(linkHref); url.setJobName(conf.jobName); pendingUrls.addUrl(url); } catch (QueueException e) { log.error(e.getMessage()); } catch (Exception e) { log.error(e.getMessage()); } } } } // ?? 
// Map<String, String> selects = conf.getSelects(); Map<String, String> selects = null; ExtractedPage<String, Object> epage = pendingStore.new ExtractedPage<String, Object>(); epage.setUrl(page.getWebURL()); HashMap<String, Object> result = new HashMap<>(); Elements text = doc.select("#Zoom"); if (null == text || text.size() == 0) { return null; } String name = doc.select("h1").text(); name = name.replace("", "").replace("<<", "").replace("", "").replace(">>", ""); result.put("movie", name); // result.put("_id", name); String ts[] = doc.select("h2 a").text().split(" "); if (ts.length >= 2) { result.put("type", ts[1].trim()); } else { result.put("type", "unknow"); } result.put("url", page.getWebURL().getURL()); for (Entry<String, String> entry : selects.entrySet()) { Elements elements = doc.select(entry.getValue()); if (elements.isEmpty()) return null; else { if ("content".equals(entry.getKey())) { for (Element element : elements) { // Elements imgs = element.select("img[src]"); StringBuilder sb = new StringBuilder(); for (Element img : imgs) { sb.append(img.attr("src")).append(";"); } result.put("img", sb.toString()); // ? 
Elements movieInfos = element.select("p"); for (Element info : movieInfos) { String infotext = info.text(); try { String infotext_ = info.html(); int start, end = 0; start = infotext_.indexOf(""); if (start > 0) { end = infotext_.lastIndexOf(""); if (end > 0 && start < end) { result.put("jq", infotext_.substring(start, end)); } else { end = infotext_.lastIndexOf("."); if (end > 0 && start < end) { result.put("jq", infotext_.substring(start, end)); } } } infotext_ = null; } catch (Exception e) { e.printStackTrace(); } if (infotext.startsWith("")) { String ss[] = infotext.split(""); for (String s : ss) { s.trim(); result = getInfoName(s, result); } } else if (infotext.startsWith("?")) { String ss[] = infotext.split("?"); for (String s : ss) { s.trim(); result = getInfoName(s, result); } } else if (infotext.contains("")) { infotext = info.html(); String[] ss = infotext.split("<br />"); for (String s : ss) { s.trim(); result = getInfoName(s, result); } } else if (infotext.contains(":")) { infotext = info.html(); String[] ss = infotext.split("<br />"); for (String s : ss) { s.trim(); result = getInfoName(s, result); } } } // if(result.size()<5){ // result.put("content", value) // } // ? 
Elements elements2 = elements.select("td"); sb.setLength(0); for (Element download : elements2) { sb.append(download.text()).append(";"); } result.put("download", sb.toString()); } } } // result.put(entry.getKey(), elements.html()); } if (StringUtils.isNotBlank((String) result.get("nd"))) { result.put("nd", Integer.parseInt((String) result.get("nd"))); } epage.setMessages(result); try { pendingStore.addExtracedPage(epage); } catch (QueueException e) { log.error(e.getMessage()); } return epage; } catch (UnsupportedEncodingException e) { log.error(e.getMessage()); e.printStackTrace(); } } return null; } /* (non-Javadoc) * @see org.sbs.goodcrawler.extractor.Extractor#beforeExtract(org.sbs.goodcrawler.job.Page) */ @Override public ExtractedPage<?, ?> beforeExtract(Page page) { return null; } /* (non-Javadoc) * @see org.sbs.goodcrawler.extractor.Extractor#afterExtract(org.sbs.goodcrawler.job.Page) */ @Override public ExtractedPage<?, ?> afterExtract(Page page) { return null; } /** * @param args * @desc */ public static void main(String[] args) { } private HashMap<String, String> infoName = new HashMap<>(); { infoName.put("??", "pm"); infoName.put("??", "pm"); infoName.put("??", "pm"); infoName.put("??", "ym"); infoName.put("??", "ym"); infoName.put("??", "ym"); infoName.put("", "gj"); infoName.put("", "gj"); infoName.put("", "gj"); infoName.put("", "gj"); infoName.put("", "gj"); infoName.put("?", "nd"); infoName.put("", "dq"); infoName.put("", "bj"); infoName.put("", "gj"); infoName.put("", "gj"); infoName.put("", "lb"); infoName.put("", "lb"); infoName.put("", "lb"); infoName.put("", "yy"); infoName.put("", "yy"); infoName.put("", "yy"); infoName.put("", "zm"); infoName.put("", "zm"); infoName.put("", "zm"); infoName.put("?", "gs"); infoName.put("", "cc"); infoName.put("IMDB", "imdb-pf"); infoName.put("", "pf"); infoName.put("", "pf"); infoName.put("", "pf"); infoName.put("?", "dx"); infoName.put("", "pc"); infoName.put("", "pc"); infoName.put("", "pc"); 
infoName.put("", "dy"); infoName.put("", "dy"); infoName.put("", "dy"); infoName.put("", "zy"); infoName.put("", "zy"); infoName.put("", "zy"); infoName.put("", "jq"); infoName.put("", "jq"); infoName.put("?", "jq"); infoName.put("?", "xzdz"); infoName.put("", "zzdz"); infoName.put("", "nd"); infoName.put("", "nd"); infoName.put("", "pc"); infoName.put("", "yy"); infoName.put("", "zy"); } public HashMap<String, Object> getInfoName(String s, HashMap<String, Object> map) { try { String temp = null; if (s.contains("")) { String ss[] = s.split(""); if (ss.length >= 2) { temp = infoName.get(ss[0].trim()); if (StringUtils.isNotBlank(temp)) { map.put(temp, ss[1].trim()); } } } else if (s.contains("")) { String ss[] = s.split(":"); if (ss.length >= 2) { temp = infoName.get(ss[0]); if (StringUtils.isNotBlank(temp)) { map.put(temp, ss[1].trim()); } } } else if (s.contains("")) { String ss[] = s.split(""); if (ss.length >= 2) { temp = infoName.get(ss[0]); if (StringUtils.isNotBlank(temp)) { map.put(temp, ss[1].trim()); } } } else { if (s.length() > 6) { temp = infoName.get(s.substring(0, 4)); if (StringUtils.isNotBlank(temp)) { map.put(temp, s.substring(5).trim()); } } } } catch (Exception e) { e.printStackTrace(); } return map; } }