org.aliuge.crawler.jobconf.ExtractConfig.java Source code

Java tutorial

Introduction

Here is the source code for org.aliuge.crawler.jobconf.ExtractConfig.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.aliuge.crawler.jobconf;

import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

import org.aliuge.crawler.exception.ConfigurationException;
import org.aliuge.crawler.exception.ExtractException;
import org.aliuge.crawler.extractor.selector.AbstractElementCssSelector;
import org.aliuge.crawler.extractor.selector.IFConditions;
import org.aliuge.crawler.extractor.selector.factory.ElementCssSelectorFactory;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

/**
 * Extraction part of a crawler job configuration. Holds the configured thread
 * number and the list of {@link ExtractTemplate}s read from the
 * {@code <extract>} section of a job configuration document, and applies those
 * templates to parsed pages.
 *
 * @date 201481
 */
public class ExtractConfig extends Configuration {
    /**
     * Value of {@code <threadNum>} inside {@code <extract>}; stays at 10 when
     * the element is missing or blank.
     */
    private int threadNum = 10;

    /**
     * Templates in the order they were read by {@link #loadConfig(Document)}.
     */
    private final List<ExtractTemplate> templates = Lists.newArrayList();

    /**
     * Runs every template against the document and merges all non-empty
     * results into one map. Templates are applied in list order, so a key
     * produced by a later template overwrites the same key from an earlier one.
     *
     * @param document the parsed page to extract from
     * @return the merged result; empty (never {@code null}) when no template
     *         produced anything
     * @throws ExtractException propagated from
     *         {@code ExtractTemplate#getContentResultMap}
     */
    public Map<String, Object> getContentAll(Document document) throws ExtractException {
        Map<String, Object> merged = Maps.newHashMap();
        for (ExtractTemplate template : templates) {
            Map<String, Object> extracted = template.getContentResultMap(document);
            if (extracted == null || extracted.isEmpty()) {
                continue;
            }
            merged.putAll(extracted);
        }
        return merged;
    }

    /**
     * Extracts with the first template whose URL filter accepts {@code url}
     * and which yields a non-empty result. That result is stored as a single
     * entry keyed by the index name of this configuration.
     *
     * @param document the parsed page to extract from
     * @param url      the page URL, passed to each template's URL filter
     * @return a map with one entry (index name to extracted fields), or an
     *         empty map when no matching template produced a result
     * @throws ExtractException propagated from
     *         {@code ExtractTemplate#getContentResultMap}
     */
    public Map<String, Object> getContentSeprator(Document document, String url) throws ExtractException {
        Map<String, Object> content = Maps.newHashMap();
        for (ExtractTemplate template : templates) {
            if (!template.urlFilter(url)) {
                continue;
            }
            Map<String, Object> extracted = template.getContentResultMap(document);
            if (extracted != null && !extracted.isEmpty()) {
                content.put(super.getIndexName(), extracted);
                // Only the first template that matches and yields data is used.
                break;
            }
        }
        return content;
    }

    /**
     * Populates this configuration from a job configuration document: the job
     * name and index name come from the {@code name} / {@code indexName}
     * attributes of {@code <job>}, the thread number from {@code <threadNum>}
     * inside {@code <extract>}, and one {@link ExtractTemplate} is created per
     * {@code <template>} element.
     *
     * @param doc the parsed job configuration document
     * @return this instance, for chaining
     * @throws NumberFormatException if {@code <threadNum>} is non-blank but
     *         not a valid integer
     * @throws java.util.regex.PatternSyntaxException if a {@code <url>} element
     *         does not contain a valid regular expression
     */
    public ExtractConfig loadConfig(Document doc) {
        Elements jobElement = doc.select("job");
        super.setJobName(jobElement.attr("name"));
        super.setIndexName(jobElement.attr("indexName"));

        Elements extractElement = doc.select("extract");
        String threadNumText = extractElement.select("threadNum").text();
        if (StringUtils.isNotBlank(threadNumText)) {
            this.threadNum = Integer.parseInt(threadNumText);
        }

        Elements templateElements = extractElement.select("extract").select("template");
        for (Element templateElement : templateElements) {
            ExtractTemplate extractTemplate = buildExtractTemplate(templateElement);
            // Kept per template, as in the original flow: with zero templates
            // this configuration is never registered on the parent.
            // NOTE(review): confirm whether it should be registered unconditionally.
            super.setExtractConfig(this);
            this.templates.add(extractTemplate);
        }
        return this;
    }

    /**
     * Builds one template from a {@code <template>} element: URL patterns from
     * its {@code <url>} elements, the name from its {@code name} attribute, and
     * selectors / conditions from the children of its first {@code <elements>}
     * element.
     */
    private ExtractTemplate buildExtractTemplate(Element templateElement) {
        ExtractTemplate extractTemplate = new ExtractTemplate();

        List<Pattern> patterns = Lists.newArrayList();
        for (Element urlElement : templateElement.select("url")) {
            patterns.add(Pattern.compile(urlElement.text()));
        }
        extractTemplate.setUrlPattern(patterns);
        extractTemplate.setName(templateElement.attr("name"));

        // Elements.first() returns null when nothing matched; a template without
        // an <elements> section is kept with no selectors instead of failing
        // with a NullPointerException.
        Element elementsNode = templateElement.select("elements").first();
        if (elementsNode == null) {
            return extractTemplate;
        }

        for (Element child : elementsNode.children()) {
            String tag = child.tagName();
            if ("element".equals(tag)) {
                AbstractElementCssSelector<?> selector = ElementCssSelectorFactory.create(child);
                extractTemplate.addCssSelector(selector);
            } else if ("if".equals(tag)) {
                IFConditions ifConditions = IFConditions.create(child);
                extractTemplate.addConditions(ifConditions);
            }
            // Children with any other tag name are ignored.
        }
        return extractTemplate;
    }

    /**
     * @return the configured thread number
     */
    public int getThreadNum() {
        return threadNum;
    }

    /**
     * @param threadNum the thread number to use
     */
    public void setThreadNum(int threadNum) {
        this.threadNum = threadNum;
    }

    /**
     * @return the live internal template list (not a copy)
     */
    public List<ExtractTemplate> getTemplates() {
        return templates;
    }

    /**
     * Renders the thread number, at most the first 10 templates, and the job
     * name.
     */
    @Override
    public String toString() {
        final int maxLen = 10;
        int shown = Math.min(templates.size(), maxLen);
        return "ExtractConfig [threadNum=" + threadNum
                + ", templates=" + templates.subList(0, shown)
                + ", jobName=" + getJobName() + "]";
    }

}