org.sbs.goodcrawler.jobconf.ExtractConfig.java Source code

Java tutorial

Introduction

Here is the source code for org.sbs.goodcrawler.jobconf.ExtractConfig.java

Source

/**
 * ########################  SHENBAISE'S WORK  ##########################
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sbs.goodcrawler.jobconf;

import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.sbs.goodcrawler.bootstrap.foreman.FetchForeman;
import org.sbs.goodcrawler.conf.Configuration;
import org.sbs.goodcrawler.exception.ConfigurationException;
import org.sbs.goodcrawler.exception.ExtractException;
import org.sbs.goodcrawler.extractor.GCElement;
import org.sbs.goodcrawler.extractor.selector.AbstractElementCssSelector;
import org.sbs.goodcrawler.extractor.selector.FileElementCssSelector;
import org.sbs.goodcrawler.extractor.selector.IFConditions;
import org.sbs.goodcrawler.extractor.selector.factory.ElementCssSelectorFactory;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

/**
 * Extraction part of a crawl job configuration.
 * <p>
 * Holds the thread count and the {@link ExtractTemplate}s parsed from the
 * {@code <extract>} element of a job configuration document, and applies those
 * templates to fetched pages.
 *
 * @author whiteme
 * @date 20131013
 */
public class ExtractConfig extends Configuration {
    /**
     * Thread count read from the {@code <threadNum>} child of {@code <extract>};
     * stays at 10 when the configuration does not provide a value.
     */
    private int threadNum = 10;
    /**
     * Templates loaded so far, in configuration order.
     */
    private final List<ExtractTemplate> templates = Lists.newArrayList();

    /**
     * Applies every template to the document, regardless of URL, and merges all
     * non-empty results into a single map. When two templates produce the same
     * key, the value from the later template wins.
     *
     * @param document page to extract from
     * @return merged extraction result; empty when no template produced anything
     * @throws ExtractException if a template fails while extracting
     */
    public Map<String, Object> getContentAll(Document document) throws ExtractException {
        Map<String, Object> content = Maps.newHashMap();
        for (ExtractTemplate template : templates) {
            Map<String, Object> m = template.getConten(document);
            if (m != null && m.size() > 0) {
                content.putAll(m);
            }
        }
        return content;
    }

    /**
     * Extracts content using the first template whose URL patterns match
     * {@code url} and which yields a non-empty result.
     * <br>
     * The result of that template is stored in the returned map under the
     * configured {@code indexName}.
     *
     * @param document page to extract from
     * @param url      URL of the page, used to choose the template
     * @return a map with a single {@code indexName} entry, or an empty map when
     *         no matching template produced content
     * @throws ExtractException if a matching template fails while extracting
     */
    public Map<String, Object> getContentSeprator(Document document, String url) throws ExtractException {
        Map<String, Object> content = Maps.newHashMap();
        for (ExtractTemplate template : templates) {
            if (template.urlFilter(url)) {
                Map<String, Object> m = template.getConten(document);
                if (m != null && m.size() > 0) {
                    content.put(indexName, m);
                    return content;
                }
            }
        }
        return content;
    }

    /**
     * Loads the job name, index name, thread count and extraction templates from
     * a job configuration document.
     * <p>
     * Everything is parsed first and only committed to this object once the whole
     * document has been read, so a malformed configuration does not leave this
     * instance half-loaded. Loaded templates are appended to the existing list;
     * calling this method repeatedly accumulates templates.
     *
     * @param doc parsed job configuration document
     * @return this instance, for chaining
     * @throws ConfigurationException if {@code <threadNum>} is not an integer, a
     *                                {@code <url>} pattern is not a valid regular
     *                                expression, a template has no
     *                                {@code <elements>} child, or a selector or
     *                                condition cannot be created
     */
    public ExtractConfig loadConfig(Document doc) throws ConfigurationException {
        Elements extractElement = doc.select("extract");

        int parsedThreadNum = this.threadNum;
        String threadNumText = extractElement.select("threadNum").text();
        if (StringUtils.isNotBlank(threadNumText)) {
            try {
                parsedThreadNum = Integer.parseInt(threadNumText);
            } catch (NumberFormatException e) {
                throw new ConfigurationException(
                        "Invalid <threadNum> value in <extract>: '" + threadNumText + "'");
            }
        }

        List<ExtractTemplate> parsedTemplates = Lists.newArrayList();
        for (Element template : extractElement.select("extract").select("template")) {
            parsedTemplates.add(parseTemplate(template));
        }

        // Commit only after the whole document parsed successfully.
        Elements jobElement = doc.select("job");
        super.jobName = jobElement.attr("name");
        super.indexName = jobElement.attr("indexName");
        this.threadNum = parsedThreadNum;
        this.templates.addAll(parsedTemplates);
        return this;
    }

    /**
     * Builds one {@link ExtractTemplate} from a {@code <template>} element: its
     * name, its {@code <url>} patterns and the {@code <element>} / {@code <if>}
     * children of its {@code <elements>} section.
     */
    private static ExtractTemplate parseTemplate(Element template) throws ConfigurationException {
        String templateName = template.attr("name");
        ExtractTemplate extractTemplate = new ExtractTemplate();
        extractTemplate.setName(templateName);

        // URL patterns deciding which pages this template applies to.
        List<Pattern> patterns = Lists.newArrayList();
        for (Element urlElement : template.select("url")) {
            String regex = urlElement.text();
            try {
                patterns.add(Pattern.compile(regex));
            } catch (PatternSyntaxException e) {
                throw new ConfigurationException("Invalid <url> pattern '" + regex + "' in template '"
                        + templateName + "': " + e.getMessage());
            }
        }
        extractTemplate.setUrlPattern(patterns);

        // Selectors and conditional selector groups.
        // first() yields null when the template has no <elements> child.
        Element elementsRoot = template.select("elements").first();
        if (elementsRoot == null) {
            throw new ConfigurationException(
                    "Template '" + templateName + "' is missing its <elements> section");
        }
        for (Element element : elementsRoot.children()) {
            if ("element".equals(element.tagName())) {
                AbstractElementCssSelector<?> selector = ElementCssSelectorFactory.create(element);
                extractTemplate.addCssSelector(selector);
            } else if ("if".equals(element.tagName())) {
                IFConditions ifConditions = IFConditions.create(element);
                extractTemplate.addConditions(ifConditions);
            }
        }
        return extractTemplate;
    }

    public int getThreadNum() {
        return threadNum;
    }

    public void setThreadNum(int threadNum) {
        this.threadNum = threadNum;
    }

    /**
     * @return the live template list (not a copy)
     */
    public List<ExtractTemplate> getTemplates() {
        return templates;
    }

    /**
     * Describes this configuration; at most the first 10 templates are included.
     */
    @Override
    public String toString() {
        final int maxLen = 10;
        StringBuilder builder = new StringBuilder();
        builder.append("ExtractConfig [threadNum=").append(threadNum).append(", templates=")
                .append(templates.subList(0, Math.min(templates.size(), maxLen)))
                .append(", jobName=").append(jobName).append("]");
        return builder.toString();
    }

    /**
     * Manual smoke test: loads {@code conf/youku_conf.xml}, starts a fetch
     * foreman with it, then fetches one sample page and prints what the matching
     * template extracts.
     */
    public static void main(String[] args) {
        ExtractConfig extractConfig = new ExtractConfig();
        FetchConfig fetchConfig = new FetchConfig();
        Document document;
        try {
            document = Jsoup.parse(new File("conf/youku_conf.xml"), "utf-8");
            // Alternative sample pages:
            //         String url = "http://v.youku.com/v_show/id_XNTgwNDUxNTQw.html";
            //         String url = "http://www.youku.com/show_page/id_zd4edea60e0d011df97c0.html";
            //         String url = "http://www.youku.com/show_page/id_z9510781e2d4411e296ac.html";
            String url = "http://www.youku.com/show_page/id_z34e239382e2911e29013.html";

            System.out.println(extractConfig.loadConfig(document).toString());
            FetchForeman fetchForeman = new FetchForeman();
            fetchForeman.start(fetchConfig.loadConfig(document));
            Map<String, Object> r = extractConfig.getContentSeprator(Jsoup.parse(new URL(url), 10000), url);
            System.out.println(r);
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ConfigurationException e) {
            e.printStackTrace();
        } catch (ExtractException e) {
            e.printStackTrace();
        }
    }
}

/**
 * One extraction template: a name, the URL patterns that decide which pages it
 * applies to, the selectors that are always evaluated, and conditional selector
 * groups that are evaluated only when their condition holds.
 *
 * @author whiteme
 * @date 20131015
 */
@SuppressWarnings("rawtypes")
class ExtractTemplate {
    /**
     * Template name.
     */
    private String name;
    /**
     * Regular expressions a page URL is matched against in {@link #urlFilter(String)}.
     */
    private List<Pattern> urlPattern = Lists.newArrayList();
    /**
     * Selectors evaluated unconditionally, in order.
     */
    private List<GCElement> cssSelectors = Lists.newArrayList();
    /**
     * Conditional selector groups, evaluated after the unconditional selectors.
     */
    private List<IFConditions> conditions = Lists.newArrayList();

    /**
     * Runs this template against a document.
     * <p>
     * First every unconditional selector is evaluated and its non-empty result
     * merged into the content map. Then each condition is tested against the
     * content gathered so far; the selectors of every condition that passes are
     * evaluated and merged the same way. File selectors additionally receive the
     * content gathered so far through {@code setResult} before being evaluated.
     *
     * @param document page to extract from
     * @return the merged content, or {@code null} as soon as a selector marked as
     *         required yields no content
     * @throws ExtractException if anything fails during extraction
     */
    @SuppressWarnings("unchecked")
    public Map<String, Object> getConten(Document document) throws ExtractException {
        try {
            Map<String, Object> gathered = Maps.newHashMap();

            // Pass 1: unconditional selectors.
            for (GCElement candidate : cssSelectors) {
                if (candidate instanceof FileElementCssSelector) {
                    FileElementCssSelector fileSelector = (FileElementCssSelector) candidate;
                    Map<String, Object> extracted = fileSelector.setResult(gathered).setDocument(document)
                            .getContentMap();
                    if (hasNoEntries(extracted)) {
                        if (fileSelector.isRequired()) {
                            return null;
                        }
                    } else {
                        gathered.putAll(extracted);
                    }
                } else if (candidate instanceof AbstractElementCssSelector) {
                    AbstractElementCssSelector plainSelector = (AbstractElementCssSelector) candidate;
                    Map<String, Object> extracted = plainSelector.setDocument(document).getContentMap();
                    if (hasNoEntries(extracted)) {
                        if (plainSelector.isRequired()) {
                            return null;
                        }
                    } else {
                        gathered.putAll(extracted);
                    }
                }
            }

            // Pass 2: selectors guarded by a condition on what pass 1 collected.
            for (IFConditions condition : conditions) {
                if (!condition.test(gathered)) {
                    continue;
                }
                for (AbstractElementCssSelector<?> guarded : condition.getSelectors()) {
                    Map<String, Object> extracted;
                    if (guarded instanceof FileElementCssSelector) {
                        extracted = ((FileElementCssSelector) guarded).setResult(gathered)
                                .setDocument(document).getContentMap();
                    } else {
                        extracted = guarded.setDocument(document).getContentMap();
                    }
                    if (hasNoEntries(extracted)) {
                        if (guarded.isRequired()) {
                            return null;
                        }
                    } else {
                        gathered.putAll(extracted);
                    }
                }
            }
            return gathered;
        } catch (Exception e) {
            // NOTE(review): only the message of the original exception is carried
            // over; the cause itself is printed here but not attached.
            // NOTE(review): the message prefix literal appears to have lost its
            // original encoding; it is kept unchanged.
            e.printStackTrace();
            throw new ExtractException("????" + e.getMessage());
        }
    }

    /**
     * Tells whether a selector result carries no content (null or empty map).
     */
    private static boolean hasNoEntries(Map<String, Object> extracted) {
        return extracted == null || extracted.size() == 0;
    }

    /**
     * Checks whether this template applies to a URL.
     *
     * @param url URL to test
     * @return {@code true} if the whole URL matches at least one configured pattern
     */
    public boolean urlFilter(String url) {
        for (Pattern candidate : urlPattern) {
            boolean wholeUrlMatches = candidate.matcher(url).matches();
            if (wholeUrlMatches) {
                return true;
            }
        }
        return false;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public List<Pattern> getUrlPattern() {
        return urlPattern;
    }

    public void setUrlPattern(List<Pattern> urlPattern) {
        this.urlPattern = urlPattern;
    }

    public void addUrlPattern(Pattern urlPattern) {
        this.urlPattern.add(urlPattern);
    }

    public List<GCElement> getCssSelectors() {
        return cssSelectors;
    }

    public void setCssSelectors(List<GCElement> cssSelectors) {
        this.cssSelectors = cssSelectors;
    }

    public void addCssSelector(AbstractElementCssSelector<?> selector) {
        this.cssSelectors.add(selector);
    }

    public List<IFConditions> getConditions() {
        return conditions;
    }

    public void setConditions(List<IFConditions> conditions) {
        this.conditions = conditions;
    }

    public void addConditions(IFConditions condition) {
        this.conditions.add(condition);
    }

    /**
     * Delegates to {@link Object#clone()}.
     * <p>
     * NOTE(review): this class does not declare {@code Cloneable}, so as written
     * the call ends in {@code CloneNotSupportedException} - confirm whether
     * cloning is actually intended.
     */
    @Override
    protected Object clone() throws CloneNotSupportedException {
        return super.clone();
    }
}