org.sbs.goodcrawler.plugin.extract.ExtractorDytt8.java Source code

Java tutorial

Introduction

Here is the source code for org.sbs.goodcrawler.plugin.extract.ExtractorDytt8.java

Source

/**
 * ##########################  GoodCrawler  ############################
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sbs.goodcrawler.plugin.extract;

import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.sbs.goodcrawler.exception.QueueException;
import org.sbs.goodcrawler.extractor.Extractor;
import org.sbs.goodcrawler.job.Page;
import org.sbs.goodcrawler.jobconf.ExtractConfig;
import org.sbs.goodcrawler.storage.PendingStore.ExtractedPage;
import org.sbs.goodcrawler.urlmanager.WebURL;

/**
 * Extractor for movie detail pages: queues the outgoing links of a page and
 * scrapes the title, category and detail fields into an {@link ExtractedPage}.
 * <p>
 * NOTE(review): the class name points at the dytt8 site while the original
 * header said "extractor for 66ys" - confirm which site this targets.
 * <p>
 * NOTE(review): many string literals in this class are empty ({@code ""}) or
 * {@code "?"}. They look like non-ASCII text that was lost in a charset
 * conversion. They are kept exactly as found; the label parsing cannot work
 * until they are restored from a correctly encoded copy of this file.
 *
 * @author shenbaise(shenbaise@outlook.com)
 * @date 2013-7-7
 * @deprecated use defaultExtractor instead
 */
@Deprecated
public class ExtractorDytt8 extends Extractor {
    private final Log log = LogFactory.getLog(this.getClass());

    /**
     * Maps a field label found on the page to the short key under which its
     * value is stored in the result map.
     * <p>
     * NOTE(review): most keys are mangled (see class comment), so many entries
     * now collide and only the last {@code put} for each key survives. The
     * statements are kept in their original order so the literals can be
     * restored one by one.
     */
    private final HashMap<String, String> infoName = new HashMap<>();
    {
        infoName.put("??", "pm");
        infoName.put("??", "pm");
        infoName.put("??", "pm");
        infoName.put("??", "ym");
        infoName.put("??", "ym");
        infoName.put("??", "ym");
        infoName.put("", "gj");
        infoName.put("", "gj");
        infoName.put("", "gj");
        infoName.put("", "gj");
        infoName.put("", "gj");
        infoName.put("?", "nd");
        infoName.put("", "dq");
        infoName.put("", "bj");
        infoName.put("", "gj");
        infoName.put("", "gj");
        infoName.put("", "lb");
        infoName.put("", "lb");
        infoName.put("", "lb");
        infoName.put("", "yy");
        infoName.put("", "yy");
        infoName.put("", "yy");
        infoName.put("", "zm");
        infoName.put("", "zm");
        infoName.put("", "zm");
        infoName.put("?", "gs");
        infoName.put("", "cc");
        infoName.put("IMDB", "imdb-pf");
        infoName.put("", "pf");
        infoName.put("", "pf");
        infoName.put("", "pf");
        infoName.put("?", "dx");
        infoName.put("", "pc");
        infoName.put("", "pc");
        infoName.put("", "pc");
        infoName.put("", "dy");
        infoName.put("", "dy");
        infoName.put("", "dy");
        infoName.put("", "zy");
        infoName.put("", "zy");
        infoName.put("", "zy");
        infoName.put("", "jq");
        infoName.put("", "jq");
        infoName.put("?", "jq");
        infoName.put("?", "xzdz");
        infoName.put("", "zzdz");
        infoName.put("", "nd");
        infoName.put("", "nd");
        infoName.put("", "pc");
        infoName.put("", "yy");
        infoName.put("", "zy");
    }

    /**
     * Creates the extractor.
     *
     * @param conf extraction configuration handed to the {@link Extractor} base class
     */
    public ExtractorDytt8(ExtractConfig conf) {
        super(conf);
    }

    /**
     * Parses the page, queues every link accepted by {@code filterUrls}, and
     * builds an {@link ExtractedPage} holding the scraped fields
     * ({@code movie}, {@code type}, {@code url}, plus whatever the content
     * selectors yield). The extracted page is also pushed to the pending store.
     *
     * @param page the fetched page; may be {@code null}
     * @return the extracted page, or {@code null} when {@code page} is
     *         {@code null}, its URL contains {@code "game/"}, it has no
     *         {@code #Zoom} element, a configured selector matches nothing, or
     *         the page charset is unsupported
     * @see org.sbs.goodcrawler.extractor.Extractor#onExtract(org.sbs.goodcrawler.job.Page)
     */
    @Override
    public ExtractedPage<?, ?> onExtract(Page page) {
        if (null == page) {
            return null;
        }
        try {
            String pageUrl = page.getWebURL().getURL();
            // Skip "game/" pages before paying for the HTML parse.
            if (null != pageUrl && pageUrl.contains("game/")) {
                return null;
            }
            Document doc = Jsoup.parse(new String(page.getContentData(), page.getContentCharset()),
                    urlUtils.getBaseUrl(pageUrl));

            queueFilteredLinks(doc);

            ExtractedPage<String, Object> epage = pendingStore.new ExtractedPage<String, Object>();
            epage.setUrl(page.getWebURL());

            // Pages without a #Zoom element are not treated as detail pages.
            if (doc.select("#Zoom").isEmpty()) {
                return null;
            }

            HashMap<String, Object> result = new HashMap<>();
            String name = doc.select("h1").text();
            // NOTE(review): the two empty targets are presumably lost bracket
            // characters (see class comment); replacing "" with "" is a no-op.
            name = name.replace("", "").replace("<<", "").replace("", "").replace(">>", "");
            result.put("movie", name);

            String[] ts = doc.select("h2 a").text().split(" ");
            result.put("type", ts.length >= 2 ? ts[1].trim() : "unknow");
            result.put("url", pageUrl);

            // The selector map used to come from conf.getSelects(); that call was
            // disabled and the variable left null, which made the loop below throw
            // a NullPointerException for every detail page. Until the selectors
            // are wired back in, an empty map turns the loop into a no-op.
            Map<String, String> selects = new HashMap<>();
            for (Entry<String, String> entry : selects.entrySet()) {
                Elements elements = doc.select(entry.getValue());
                if (elements.isEmpty()) {
                    return null;
                }
                if ("content".equals(entry.getKey())) {
                    for (Element element : elements) {
                        scrapeContentBlock(element, elements, result);
                    }
                }
            }

            convertNdToInteger(result);

            epage.setMessages(result);
            try {
                pendingStore.addExtracedPage(epage);
            } catch (QueueException e) {
                log.error(e.getMessage(), e);
            }
            return epage;
        } catch (UnsupportedEncodingException e) {
            log.error(e.getMessage(), e);
        }
        return null;
    }

    /**
     * Queues every absolute {@code <a href>} target that passes {@code filterUrls}.
     * A failure on one link is logged and does not stop the remaining links.
     */
    private void queueFilteredLinks(Document doc) {
        for (Element link : doc.getElementsByTag("a")) {
            String linkHref = link.absUrl("href");
            if (StringUtils.isBlank(linkHref) || !filterUrls(linkHref)) {
                continue;
            }
            try {
                WebURL url = new WebURL();
                url.setURL(linkHref);
                url.setJobName(conf.jobName);
                pendingUrls.addUrl(url);
            } catch (Exception e) {
                // Best effort: QueueException and anything else are handled alike.
                log.error(e.getMessage(), e);
            }
        }
    }

    /**
     * Fills {@code result} from one matched content element: image sources
     * ({@code img}), labelled fields from its paragraphs, and the text of the
     * table cells ({@code download}).
     *
     * @param element    the content element being processed
     * @param allMatches every element matched by the content selector; the
     *                   {@code td} lookup runs against this whole set, as in
     *                   the original code
     * @param result     map receiving the scraped values
     */
    private void scrapeContentBlock(Element element, Elements allMatches, HashMap<String, Object> result) {
        StringBuilder sb = new StringBuilder();
        for (Element img : element.select("img[src]")) {
            sb.append(img.attr("src")).append(";");
        }
        result.put("img", sb.toString());

        for (Element info : element.select("p")) {
            captureJqSpan(info, result);
            parseInfoParagraph(info, result);
        }

        sb.setLength(0);
        for (Element download : allMatches.select("td")) {
            sb.append(download.text()).append(";");
        }
        result.put("download", sb.toString());
    }

    /**
     * Stores under {@code "jq"} the part of the paragraph's HTML that lies
     * between a start marker and an end marker, falling back to the last
     * {@code "."} as the end when the end marker is unusable.
     * <p>
     * NOTE(review): both marker literals are empty (see class comment), so
     * {@code indexOf("")} is 0 and nothing is stored until they are restored.
     */
    private void captureJqSpan(Element info, HashMap<String, Object> result) {
        try {
            String html = info.html();
            int start = html.indexOf("");
            if (start <= 0) {
                return;
            }
            int end = html.lastIndexOf("");
            if (end <= 0 || start >= end) {
                end = html.lastIndexOf(".");
            }
            if (end > 0 && start < end) {
                result.put("jq", html.substring(start, end));
            }
        } catch (RuntimeException e) {
            // Best effort: a malformed paragraph must not abort the whole page.
            log.error(e.getMessage(), e);
        }
    }

    /**
     * Splits one paragraph into "label / value" fragments and hands each
     * trimmed fragment to {@link #getInfoName(String, HashMap)}.
     * <p>
     * NOTE(review): the empty prefix and separator literals are mangled (see
     * class comment). As written, the first branch always matches.
     */
    private void parseInfoParagraph(Element info, HashMap<String, Object> result) {
        String infotext = info.text();
        String[] fragments;
        if (infotext.startsWith("")) {
            fragments = infotext.split("");
        } else if (infotext.startsWith("?")) {
            // A bare "?" is an invalid regex; escape it to split on the literal character.
            fragments = infotext.split("\\?");
        } else if (infotext.contains("") || infotext.contains(":")) {
            // Line-oriented layout: split the raw HTML on line breaks.
            fragments = info.html().split("<br />");
        } else {
            return;
        }
        for (String fragment : fragments) {
            // trim() returns a new string; the original discarded that result.
            getInfoName(fragment.trim(), result);
        }
    }

    /**
     * Replaces a textual {@code "nd"} value with its {@code Integer} form. A
     * value that is not a plain integer is left unchanged and a warning is
     * logged, instead of letting {@link NumberFormatException} abort the page.
     */
    private void convertNdToInteger(HashMap<String, Object> result) {
        Object nd = result.get("nd");
        if (!(nd instanceof String) || StringUtils.isBlank((String) nd)) {
            return;
        }
        try {
            result.put("nd", Integer.parseInt((String) nd));
        } catch (NumberFormatException e) {
            log.warn("Value of 'nd' is not an integer, keeping raw text: " + nd, e);
        }
    }

    /**
     * Not used by this extractor.
     *
     * @return always {@code null}
     * @see org.sbs.goodcrawler.extractor.Extractor#beforeExtract(org.sbs.goodcrawler.job.Page)
     */
    @Override
    public ExtractedPage<?, ?> beforeExtract(Page page) {
        return null;
    }

    /**
     * Not used by this extractor.
     *
     * @return always {@code null}
     * @see org.sbs.goodcrawler.extractor.Extractor#afterExtract(org.sbs.goodcrawler.job.Page)
     */
    @Override
    public ExtractedPage<?, ?> afterExtract(Page page) {
        return null;
    }

    /**
     * Intentionally empty; kept so the public surface of the class is unchanged.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // no-op
    }

    /**
     * Parses one "label / value" fragment and, when the label is a key of
     * {@code infoName}, stores the trimmed value in {@code map} under the
     * mapped short key.
     * <p>
     * The fragment is split on the first separator it contains and only the
     * first two pieces are used. If it contains no separator and is longer
     * than six characters, the first four characters are the label and the
     * text from index 5 onward is the value.
     * <p>
     * NOTE(review): the separator literals are mangled (see class comment).
     * As written, {@code contains("")} is always true, so only the first
     * branch is ever taken.
     *
     * @param s   the fragment to parse; {@code null} is ignored
     * @param map the map to fill
     * @return the same {@code map} instance that was passed in
     */
    public HashMap<String, Object> getInfoName(String s, HashMap<String, Object> map) {
        if (s == null || map == null) {
            return map;
        }
        try {
            String[] parts = null;
            if (s.contains("")) {
                parts = s.split("");
            } else if (s.contains("")) {
                parts = s.split(":");
            } else if (s.contains("")) {
                parts = s.split("");
            }

            if (parts != null) {
                if (parts.length >= 2) {
                    // The label is trimmed in every branch so stray whitespace
                    // cannot cause a lookup miss.
                    putMappedField(parts[0].trim(), parts[1].trim(), map);
                }
            } else if (s.length() > 6) {
                // Fixed-width fallback: 4-character label, one separator character, then the value.
                putMappedField(s.substring(0, 4), s.substring(5).trim(), map);
            }
        } catch (RuntimeException e) {
            // Best effort: one bad fragment must not abort the page.
            log.error(e.getMessage(), e);
        }
        return map;
    }

    /** Stores {@code value} under the short key mapped to {@code label}, if there is one. */
    private void putMappedField(String label, String value, HashMap<String, Object> map) {
        String key = infoName.get(label);
        if (StringUtils.isNotBlank(key)) {
            map.put(key, value);
        }
    }
}