com.weibo.datasys.crawler.impl.strategy.rule.parse.XpathMatchRule.java Source code

Java tutorial

Introduction

Here is the source code for com.weibo.datasys.crawler.impl.strategy.rule.parse.XpathMatchRule.java

Source

/**
 *  Copyright (c)  2016-2020 Weibo, Inc.
 *  All rights reserved.
 *
 *  This software is the confidential and proprietary information of Weibo, 
 *  Inc. ("Confidential Information"). You shall not
 *  disclose such Confidential Information and shall use it only in
 *  accordance with the terms of the license agreement you entered into with Weibo.
 */
package com.weibo.datasys.crawler.impl.strategy.rule.parse;

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.dom4j.Document;
import org.dom4j.DocumentHelper;
import org.dom4j.Node;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.weibo.datasys.common.data.CommonData;
import com.weibo.datasys.common.util.StringUtils;
import com.weibo.datasys.crawler.base.entity.Task;
import com.weibo.datasys.crawler.base.strategy.rule.parse.AbstractContentExtractRule;

/**
 * 
 * XPath match rule: extracts field values from XML source text using configured XPath expressions.
 * 
 * 
 */
public class XpathMatchRule extends AbstractContentExtractRule {

    private static final Logger logger = LoggerFactory.getLogger(XpathMatchRule.class);

    /** Prefix that marks a configuration key as an attribute-to-XPath mapping. */
    private static final String XPATH_PREFIX = "xpath:";

    /** Separator placed between values when one XPath selects several of them. */
    private static final char MULTI_VALUE_SPLITER = ';';

    /**
     * Base-node XPath used when the configuration does not supply one.
     * NOTE(review): "//" on its own does not look like a complete XPath 1.0
     * location path (the grammar expects a step after it) - confirm that real
     * configurations always provide "baseNode", or choose a usable default.
     */
    private static final String DEFAULT_BASE_NODE_XPATH = "//";

    /** When false, only the first matched base node produces a record. */
    private boolean isMultiMatch;

    /** XPath selecting the nodes against which the attribute XPaths are evaluated. */
    private String baseNodeXpath = DEFAULT_BASE_NODE_XPATH;

    /** Attribute name to XPath; a LinkedHashMap so attributes are processed in registration order. */
    private final Map<String, String> attrXpathMap = new LinkedHashMap<String, String>();

    /**
     * @param task
     *            passed through to the superclass constructor
     */
    public XpathMatchRule(Task task) {
        super(task);
    }

    /**
     * Configures this rule from a parameter map.
     * <ul>
     * <li>{@code isMultiMatch} - parsed with {@link Boolean#parseBoolean(String)}</li>
     * <li>{@code baseNode} - base-node XPath; the default is used when empty</li>
     * <li>{@code xpath:<attr>} - registers the value as the XPath for {@code <attr>}</li>
     * </ul>
     * Mappings registered earlier are kept; entries with the same attribute
     * name are overwritten.
     *
     * @param paraMap
     *            configuration parameters
     */
    @Override
    public void configWithParameters(Map<String, String> paraMap) {
        setMultiMatch(Boolean.parseBoolean(paraMap.get("isMultiMatch")));

        String configuredBase = paraMap.get("baseNode");
        baseNodeXpath = StringUtils.isEmptyString(configuredBase) ? DEFAULT_BASE_NODE_XPATH : configuredBase;

        for (Entry<String, String> entry : paraMap.entrySet()) {
            String key = entry.getKey();
            if (key.startsWith(XPATH_PREFIX)) {
                attrXpathMap.put(key.substring(XPATH_PREFIX.length()), entry.getValue());
            }
        }
    }

    /**
     * Parses the {@code src} base field of {@code in} as XML and builds one
     * record per node matched by the base-node XPath (only the first match
     * when multi-match is disabled).
     * <p>
     * Any exception raised while parsing or evaluating XPaths is logged and
     * ends processing; records completed before the failure are still returned.
     *
     * @param in
     *            input data whose {@code src} base field holds the XML text
     * @return the extracted records, possibly empty, never {@code null}
     */
    @Override
    public List<CommonData> apply(CommonData in) {
        List<CommonData> fields = new ArrayList<CommonData>();
        String xml = in.getBaseField("src");
        try {
            // NOTE(review): "src" is presumably crawled, untrusted content. Whether
            // DocumentHelper.parseText blocks external entities (XXE) depends on the
            // dom4j version in use - confirm.
            Document doc = DocumentHelper.parseText(xml);
            // Suppression kept on this single declaration: some dom4j versions
            // return a raw List from selectNodes.
            @SuppressWarnings("unchecked")
            List<Node> baseNodes = doc.selectNodes(baseNodeXpath);
            for (Node baseNode : baseNodes) {
                fields.add(extractRecord(baseNode));
                if (!isMultiMatch) {
                    // Single-match mode: stop after the first base node.
                    break;
                }
            }
        } catch (Exception e) {
            logger.error("[XpathMatchRuleError] - ", e);
        }
        return fields;
    }

    /**
     * Builds one record by evaluating every registered attribute XPath
     * relative to {@code baseNode}. The attribute named "id" (case-insensitive)
     * is stored through {@code setId}; every other attribute becomes a base field.
     */
    private CommonData extractRecord(Node baseNode) {
        CommonData record = new CommonData();
        for (Entry<String, String> entry : attrXpathMap.entrySet()) {
            String attr = entry.getKey();
            String value = joinValues(baseNode.selectNodes(entry.getValue()));
            if ("id".equalsIgnoreCase(attr)) {
                record.setId(value);
            } else {
                record.setBaseField(attr, value);
            }
        }
        return record;
    }

    /**
     * Joins the non-empty texts of an XPath result with {@link #MULTI_VALUE_SPLITER}.
     * The list is read as {@code List<?>} because an XPath that evaluates to a
     * string, number or boolean yields that value itself rather than a
     * {@link Node}; such items are appended via {@code toString()} instead of
     * failing with a ClassCastException.
     */
    private static String joinValues(List<?> selected) {
        StringBuilder joined = new StringBuilder();
        boolean hasValue = false;
        for (Object item : selected) {
            if (item == null) {
                continue;
            }
            String text = (item instanceof Node) ? ((Node) item).getText() : item.toString();
            if (StringUtils.isEmptyString(text)) {
                continue;
            }
            if (hasValue) {
                joined.append(MULTI_VALUE_SPLITER);
            }
            joined.append(text);
            hasValue = true;
        }
        return joined.toString();
    }

    /**
     * @param isMultiMatch
     *            {@code true} to produce a record for every matched base node,
     *            {@code false} to stop after the first one
     */
    public void setMultiMatch(boolean isMultiMatch) {
        this.isMultiMatch = isMultiMatch;
    }

    /**
     * Registers (or replaces) the XPath used to extract an attribute.
     * 
     * @param attrName
     *            attribute name; "id" (case-insensitive) is stored as the record id
     * @param xpath
     *            XPath evaluated relative to each base node
     */
    public void addExtractPattern(String attrName, String xpath) {
        this.attrXpathMap.put(attrName, xpath);
    }

}