org.aliuge.crawler.extractor.selector.AbstractElementCssSelector.java Source code

Java tutorial

Introduction

Here is the source code for org.aliuge.crawler.extractor.selector.AbstractElementCssSelector.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.aliuge.crawler.extractor.selector;

import java.util.List;
import java.util.Map;

import org.aliuge.crawler.exception.ExtractException;
import org.aliuge.crawler.extractor.selector.action.SelectorAction;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Base class for selectors that pick elements out of a jsoup {@link Document}
 * with a CSS query and turn them into a value of type {@code T}.
 *
 * <p>This class only stores the selector configuration and offers two helper
 * methods ({@link #getExtractText(Elements)} and
 * {@link #getExtractAttr(Elements, String)}) for reading text or an attribute
 * from an already selected {@link Elements} list. Running the query and
 * producing the final content is left to subclasses.
 *
 * @param <T> type of the content produced by the concrete selector
 */
public abstract class AbstractElementCssSelector<T> {

    /**
     * Value of {@link #attr} that makes {@link #getExtractText(Elements)}
     * return the string form of the selected elements instead of their text.
     */
    private static final String TO_STRING_ATTR = "tostring";

    /**
     * Name of this selector.
     * NOTE(review): presumably the key used by {@link #getContentMap()} - confirm in subclasses.
     */
    protected String name;
    /**
     * CSS selector expression.
     */
    protected String value;
    /**
     * Textual name of what should be extracted (for example {@code "src"} or
     * {@code "text"}); the special value {@code "tostring"} is matched
     * case-insensitively by {@link #getExtractText(Elements)}.
     */
    protected String attr;
    /**
     * Enum form of {@link #attr}. The parameterized constructors resolve it by
     * name and fall back to {@link SelectorAttr#other} when no constant matches.
     * It stays {@code null} after the no-arg constructor until
     * {@link #set$Attr(SelectorAttr)} is called.
     */
    protected SelectorAttr $Attr;
    /**
     * Whether this selector is flagged as required.
     */
    protected boolean isRequired = false;

    /**
     * Optional filter expression. Despite its name it is passed to
     * {@link Element#select(String)}, i.e. it is evaluated as a CSS query
     * against each candidate element, not as a regular expression.
     */
    protected String regex;

    /**
     * Document this selector works on.
     */
    protected Document document;
    /**
     * Flag set to {@code true} whenever a document is assigned through
     * {@link #setDocument(Document)}.
     * NOTE(review): presumably tells subclasses that cached results are stale - confirm.
     */
    protected boolean newDoc = true;

    /**
     * Position of the element to extract from; {@code -1} means "not set".
     */
    protected int index = -1;
    /**
     * Actions registered for this selector (see {@link #addAction(SelectorAction)}).
     */
    protected List<SelectorAction<T>> actions;

    /**
     * Creates an unconfigured selector; use the setters to fill it in.
     */
    public AbstractElementCssSelector() {
    }

    /**
     * Creates a fully configured selector.
     *
     * @param name       selector name
     * @param value      CSS selector expression
     * @param attr       name of what to extract; also resolved to a
     *                   {@link SelectorAttr} constant ({@link SelectorAttr#other}
     *                   when no constant has that name)
     * @param isRequired whether the selector is required
     * @param index      position of the element to use, {@code -1} for none
     * @param regex      optional CSS filter query, may be {@code null}
     */
    public AbstractElementCssSelector(String name, String value, String attr, boolean isRequired, int index,
            String regex) {
        this.name = name;
        this.value = value;
        this.attr = attr;
        SelectorAttr resolved = org.apache.commons.lang3.EnumUtils.getEnum(SelectorAttr.class, attr);
        this.$Attr = (resolved != null) ? resolved : SelectorAttr.other;
        this.isRequired = isRequired;
        this.index = index;
        this.regex = regex;
    }

    /**
     * Creates a selector without a filter query.
     *
     * @param name       selector name
     * @param value      CSS selector expression
     * @param attr       name of what to extract
     * @param isRequired whether the selector is required
     * @param index      position of the element to use, {@code -1} for none
     */
    public AbstractElementCssSelector(String name, String value, String attr, boolean isRequired, int index) {
        // Delegate so that the SelectorAttr.other fallback is applied here as well.
        this(name, value, attr, isRequired, index, null);
    }

    /**
     * Creates a selector without an index and without a filter query.
     *
     * @param name       selector name
     * @param value      CSS selector expression
     * @param attr       name of what to extract
     * @param isRequired whether the selector is required
     */
    public AbstractElementCssSelector(String name, String value, String attr, boolean isRequired) {
        // -1 is the same "not set" default the index field is initialised with.
        this(name, value, attr, isRequired, -1, null);
    }

    /**
     * Produces the content selected by this selector.
     *
     * @return the extracted content
     * @throws ExtractException declared for implementations to signal extraction failures
     */
    public abstract T getContent() throws ExtractException;

    /**
     * Produces the selected content as a map.
     * NOTE(review): presumably keyed by {@link #getName()} - confirm in subclasses.
     *
     * @return map of extracted content
     * @throws ExtractException declared for implementations to signal extraction failures
     */
    public abstract Map<String, T> getContentMap() throws ExtractException;

    /**
     * Registers an action for this selector.
     *
     * @param action the action to add
     */
    public abstract void addAction(SelectorAction<T> action);

    public String getName() {
        return name;
    }

    // The fluent setters keep their raw return type so existing callers and
    // overriding subclasses continue to compile unchanged.
    @SuppressWarnings("rawtypes")
    public AbstractElementCssSelector setName(String name) {
        this.name = name;
        return this;
    }

    public int getIndex() {
        return index;
    }

    public void setIndex(int index) {
        this.index = index;
    }

    public String getValue() {
        return value;
    }

    @SuppressWarnings("rawtypes")
    public AbstractElementCssSelector setValue(String value) {
        this.value = value;
        return this;
    }

    public String getAttr() {
        return attr;
    }

    /**
     * Sets the textual attribute name. This does not re-resolve
     * {@link #get$Attr()}; call {@link #set$Attr(SelectorAttr)} to keep both in sync.
     *
     * @param attr name of what to extract
     * @return this selector
     */
    @SuppressWarnings("rawtypes")
    public AbstractElementCssSelector setAttr(String attr) {
        this.attr = attr;
        return this;
    }

    public boolean isRequired() {
        return isRequired;
    }

    @SuppressWarnings("rawtypes")
    public AbstractElementCssSelector setRequired(boolean isRequired) {
        this.isRequired = isRequired;
        return this;
    }

    public Document getDocument() {
        return document;
    }

    /**
     * Assigns the document to work on and raises the {@link #newDoc} flag.
     *
     * @param document the document to select from
     * @return this selector
     */
    @SuppressWarnings("rawtypes")
    public AbstractElementCssSelector setDocument(Document document) {
        this.document = document;
        this.newDoc = true;
        return this;
    }

    public SelectorAttr get$Attr() {
        return $Attr;
    }

    @SuppressWarnings("rawtypes")
    public AbstractElementCssSelector set$Attr(SelectorAttr $Attr) {
        this.$Attr = $Attr;
        return this;
    }

    @SuppressWarnings("rawtypes")
    public AbstractElementCssSelector setNewDoc(boolean newDoc) {
        this.newDoc = newDoc;
        return this;
    }

    /**
     * Raises the {@link #newDoc} flag. Despite the {@code is} prefix this is
     * not a query: it returns nothing and always sets the flag to {@code true}.
     */
    protected void isNewDoc() {
        this.newDoc = true;
    }

    /**
     * Extracts text from the given selection.
     *
     * <ul>
     * <li>If {@link #attr} is {@code "tostring"} (any case) the string form of
     * the whole selection is returned.</li>
     * <li>Otherwise, if no index is set ({@code -1}) and {@link #regex} is not
     * blank, the text of the first element for which
     * {@code element.select(regex)} finds something is returned, or the empty
     * string when no element qualifies.</li>
     * <li>Otherwise the text of the element at {@link #index} is returned when
     * the index is within bounds, else the text of the first element.</li>
     * </ul>
     *
     * @param elements the selected elements
     * @return the extracted text, or {@code null} when {@code elements} is
     *         {@code null} or empty
     */
    protected String getExtractText(Elements elements) {
        if (elements == null || elements.isEmpty()) {
            return null;
        }
        // Constant-first comparison: attr is null after the no-arg constructor.
        if (TO_STRING_ATTR.equalsIgnoreCase(attr)) {
            return elements.toString();
        }
        if (index == -1 && StringUtils.isNotBlank(this.regex)) {
            for (Element element : elements) {
                if (!element.select(this.regex).isEmpty()) {
                    return element.text();
                }
            }
            return "";
        }
        if (index > -1 && index < elements.size()) {
            return elements.get(index).text();
        }
        return elements.first().text();
    }

    /**
     * Extracts an attribute value from the given selection.
     *
     * <ul>
     * <li>If {@code attr} is {@code "tostring"} (any case) the result of
     * {@code elements.attr(attr)} is returned, i.e. the selection is asked for
     * an attribute with that very name.
     * NOTE(review): this does not mirror {@link #getExtractText(Elements)},
     * which returns the markup in that case - confirm which one is intended.</li>
     * <li>Otherwise, if no index is set ({@code -1}) and {@link #regex} is not
     * blank, the attribute of the first element for which
     * {@code element.select(regex)} finds something is returned, or the empty
     * string when no element qualifies.</li>
     * <li>Otherwise the attribute of the element at {@link #index} is returned
     * when the index is within bounds, else the attribute of the first
     * element.</li>
     * </ul>
     *
     * @param elements the selected elements
     * @param attr     name of the attribute to read
     * @return the attribute value, or {@code null} when {@code elements} or
     *         {@code attr} is {@code null}, or when the fallback to the first
     *         element finds the selection empty
     */
    protected String getExtractAttr(Elements elements, String attr) {
        if (elements == null || attr == null) {
            return null;
        }
        if (TO_STRING_ATTR.equalsIgnoreCase(attr)) {
            return elements.attr(attr);
        }
        if (index == -1 && StringUtils.isNotBlank(this.regex)) {
            for (Element element : elements) {
                if (!element.select(this.regex).isEmpty()) {
                    return element.attr(attr);
                }
            }
            return "";
        }
        if (index > -1 && index < elements.size()) {
            return elements.get(index).attr(attr);
        }
        // Elements.first() yields null for an empty selection; avoid the NPE.
        Element first = elements.first();
        return (first == null) ? null : first.attr(attr);
    }

    public String getRegex() {
        return regex;
    }

    public void setRegex(String regex) {
        this.regex = regex;
    }

    @Override
    public String toString() {
        return "AbstractElementCssSelector [name=" + name + ", value=" + value + ", attr=" + attr + ", $Attr="
                + $Attr + ", isRequired=" + isRequired + ", regex=" + regex + ", newDoc=" + newDoc + ", index="
                + index + "]";
    }

}