org.apdplat.extractor.html.ExtractRegular.java Source code

Java tutorial

Introduction

Here is the source code for org.apdplat.extractor.html.ExtractRegular.java

Source

/**
 * 
 * APDPlat - Application Product Development Platform
 * Copyright (c) 2013, ??, yang-shangchuan@qq.com
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 * 
 */

package org.apdplat.extractor.html;

import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Matcher;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.lang.StringUtils;
import org.apdplat.extractor.html.model.CssPath;
import org.apdplat.extractor.html.model.ExtractFunction;
import org.apdplat.extractor.html.model.HtmlTemplate;
import org.apdplat.extractor.html.model.UrlPattern;
import org.codehaus.jackson.map.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;
import redis.clients.jedis.JedisPubSub;

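/**
 * Registry of URL extraction rules.
 * Subscribes to the Redis channel "pr"; when a "CHANGE" message is received
 * the rule set is re-initialized.
 * Initialization:
 * 1. download the complete rule set (JSON) from the configuration web server
 * 2. parse the JSON into rule objects
 * 3. build the lookup map keyed by URL prefix
 *
 * @author ??
 *
 */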
public class ExtractRegular {
    /** Logger shared by all members of this class. */
    private static final Logger LOGGER = LoggerFactory.getLogger(ExtractRegular.class);
    /** JSON mapper used by {@code parseJson} to decode the downloaded rule set. */
    private static final ObjectMapper MAPPER = new ObjectMapper();
    /**
     * The lazily created singleton. It is read outside the {@code synchronized}
     * block in {@code getInstance}, so it must be {@code volatile}: without it the
     * Java memory model allows another thread to observe a non-null reference to
     * an object whose fields are not yet visible.
     */
    private static volatile ExtractRegular extractRegular = null;
    /**
     * Rules grouped by the key produced by {@code urlPrefix}. The reference is
     * replaced as a whole by {@code init}, hence {@code volatile}; {@code null}
     * until a non-empty rule set has been installed.
     */
    private volatile Map<String, List<UrlPattern>> urlPatternMap = null;

    /**
     * Private constructor: instances are only created by the static
     * {@code getInstance} factory methods of this class.
     */
    private ExtractRegular() {
    }

    /**
     * Returns the shared instance, creating and initializing it from the given
     * rules on first use. If an instance already exists it is returned unchanged
     * and {@code urlPatterns} is ignored.
     *
     * @param urlPatterns URL patterns used to initialize a newly created instance
     * @return the shared instance
     */
    public static ExtractRegular getInstance(List<UrlPattern> urlPatterns) {
        if (extractRegular != null) {
            return extractRegular;
        }
        synchronized (ExtractRegular.class) {
            if (extractRegular == null) {
                // Initialize a local instance first and assign the static field last.
                // Assigning the field before init() let callers on the fast path above
                // receive an instance whose rules were not loaded yet, and left a
                // half-built instance published forever if init() threw.
                ExtractRegular instance = new ExtractRegular();
                instance.init(urlPatterns);
                extractRegular = instance;
            }
        }
        return extractRegular;
    }

    /**
     * Returns the shared instance, creating it on first use. A new instance first
     * starts the Redis subscription (skipped by {@code subscribeRedis} when
     * {@code redisHost} is null or {@code redisPort} is below 1) and then loads the
     * rules from {@code serverUrl}. If an instance already exists it is returned
     * unchanged and all arguments are ignored.
     *
     * @param serverUrl URL of the configuration web server that serves the rules as JSON
     * @param redisHost Redis server host
     * @param redisPort Redis server port
     * @return the shared instance
     */
    public static ExtractRegular getInstance(String serverUrl, String redisHost, int redisPort) {
        if (extractRegular != null) {
            return extractRegular;
        }
        synchronized (ExtractRegular.class) {
            if (extractRegular == null) {
                // Build and initialize a local instance, then assign the static field
                // last. Assigning the field before init() let callers on the fast path
                // above receive an instance whose rules were not loaded yet.
                ExtractRegular instance = new ExtractRegular();
                // Subscribe to the Redis channel so later rule changes trigger a reload.
                instance.subscribeRedis(redisHost, redisPort, serverUrl);
                // Load the initial rule set.
                instance.init(serverUrl);
                extractRegular = instance;
            }
        }
        return extractRegular;
    }

    /**
     * Loads the rule set from the configuration web server:
     * 1. downloads the rules as a JSON string from {@code serverUrl}
     * 2. parses the JSON into {@code UrlPattern} objects
     * 3. hands them to {@link #init(List)} to be installed
     *
     * @param serverUrl URL of the configuration web server
     */
    private synchronized void init(String serverUrl) {
        LOGGER.info("URL?");
        LOGGER.info("serverUrl: " + serverUrl);

        // Step 1: fetch the raw JSON document.
        final String payload = downJson(serverUrl);
        LOGGER.info("?URL?");

        // Step 2: turn the JSON into rule objects.
        LOGGER.info("?URL?");
        final List<UrlPattern> parsedPatterns = parseJson(payload);
        LOGGER.info("??URL?");

        // Step 3: install the parsed rules.
        init(parsedPatterns);
    }

    /**
     * Installs the given rules as the active rule set. The rules are grouped by
     * URL prefix via {@code toMap}; the active map is replaced only when the new
     * map is non-empty, so an empty, unparsable or {@code null} rule list leaves
     * the current rules in place.
     *
     * @param urlPatterns URL patterns to install; {@code null} is treated like an empty list
     */
    private synchronized void init(List<UrlPattern> urlPatterns) {
        LOGGER.info("?URL?");
        if (urlPatterns != null) {
            Map<String, List<UrlPattern>> newUrlPatterns = toMap(urlPatterns);
            if (!newUrlPatterns.isEmpty()) {
                // Swap the reference only. The previous map and its lists are NOT
                // cleared: getHtmlTemplate may still be iterating one of those lists
                // on another thread, and clearing it there would raise a
                // ConcurrentModificationException or silently drop matches. The old
                // map becomes garbage once the last reader is done with it.
                urlPatternMap = newUrlPatterns;
            }
        }
        LOGGER.info("??URL?");
    }

    /**
     * Starts a daemon thread that subscribes to the Redis channel "pr" so that
     * rule changes announced there trigger a reload from {@code serverUrl}.
     * Does nothing except log an error when {@code redisHost} is null or
     * {@code redisPort} is below 1.
     *
     * The thread retries every 10 minutes after a failure, and ends when the
     * subscription returns normally or the thread is interrupted.
     *
     * @param redisHost Redis server host
     * @param redisPort Redis server port
     * @param serverUrl URL of the configuration web server, passed to the listener
     */
    private void subscribeRedis(final String redisHost, final int redisPort, final String serverUrl) {
        if (null == redisHost || redisPort < 1) {
            LOGGER.error("redis??!");
            return;
        }
        Thread thread = new Thread(new Runnable() {
            @Override
            public void run() {
                String channel = "pr";
                LOGGER.info("redis??? host:" + redisHost + ",port:" + redisPort + ",channel:"
                        + channel);
                while (true) {
                    JedisPool jedisPool = null;
                    Jedis jedis = null;
                    try {
                        jedisPool = new JedisPool(new JedisPoolConfig(), redisHost, redisPort);
                        jedis = jedisPool.getResource();
                        LOGGER.info("redis?");
                        // Blocks for as long as the subscription is active.
                        jedis.subscribe(new ExtractRegularChangeRedisListener(serverUrl), new String[] { channel });
                        jedisPool.returnResource(jedis);
                        // Returned cleanly; nothing left for the finally block to hand back.
                        jedis = null;
                        LOGGER.info("redis?");
                        return;
                    } catch (Exception e) {
                        // Log the cause as well; it used to be dropped.
                        LOGGER.info("redis????", e);
                    } finally {
                        // Every attempt creates its own pool, so every attempt must
                        // release it; otherwise each retry leaks a pool and a connection.
                        if (jedisPool != null) {
                            if (jedis != null) {
                                try {
                                    jedisPool.returnBrokenResource(jedis);
                                } catch (Exception ex) {
                                    LOGGER.error(ex.getMessage(), ex);
                                }
                            }
                            try {
                                jedisPool.destroy();
                            } catch (Exception ex) {
                                LOGGER.error(ex.getMessage(), ex);
                            }
                        }
                    }
                    try {
                        // Wait 10 minutes before the next attempt.
                        Thread.sleep(600000);
                    } catch (InterruptedException ex) {
                        LOGGER.error(ex.getMessage(), ex);
                        // Restore the interrupt flag and stop retrying instead of
                        // swallowing the interrupt and looping on.
                        Thread.currentThread().interrupt();
                        return;
                    }
                }
            }
        });
        thread.setDaemon(true);
        thread.setName("redis??");
        thread.start();
    }

    /**
     * Redis pub/sub listener that reloads the rules of the enclosing
     * {@code ExtractRegular} when the message "CHANGE" arrives on channel "pr".
     * All other callbacks only write a debug log line.
     *
     * @author ??
     *
     */
    private class ExtractRegularChangeRedisListener extends JedisPubSub {
        /** URL of the configuration web server to reload the rules from. */
        private final String serverUrl;

        public ExtractRegularChangeRedisListener(String serverUrl) {
            this.serverUrl = serverUrl;
        }

        /**
         * Reloads the rules when {@code channel} is "pr" and {@code message} is "CHANGE".
         */
        @Override
        public void onMessage(String channel, String message) {
            LOGGER.debug("onMessage channel:" + channel + " and message:" + message);
            boolean ruleChangeAnnounced = "pr".equals(channel) && "CHANGE".equals(message);
            if (!ruleChangeAnnounced) {
                return;
            }
            synchronized (ExtractRegularChangeRedisListener.class) {
                ExtractRegular.this.init(serverUrl);
            }
        }

        /**
         * Pattern-subscription variant: logs and then delegates to
         * {@link #onMessage(String, String)}.
         */
        @Override
        public void onPMessage(String pattern, String channel, String message) {
            LOGGER.debug("pattern:" + pattern + " and channel:" + channel + " and message:" + message);
            onMessage(channel, message);
        }

        @Override
        public void onPSubscribe(String pattern, int subscribedChannels) {
            LOGGER.debug("psubscribe pattern:" + pattern + " and subscribedChannels:" + subscribedChannels);
        }

        @Override
        public void onPUnsubscribe(String pattern, int subscribedChannels) {
            LOGGER.debug("punsubscribe pattern:" + pattern + " and subscribedChannels:" + subscribedChannels);
        }

        @Override
        public void onSubscribe(String channel, int subscribedChannels) {
            LOGGER.debug("subscribe channel:" + channel + " and subscribedChannels:" + subscribedChannels);
        }

        @Override
        public void onUnsubscribe(String channel, int subscribedChannels) {
            LOGGER.debug("unsubscribe channel:" + channel + " and subscribedChannels:" + subscribedChannels);
        }
    }

    /**
     * Downloads the rule set from the configuration web server with an HTTP GET.
     *
     * @param url address of the configuration web server
     * @return the response body decoded as UTF-8, or an empty string when the
     *         request fails, the status is not 200, or the response has no body
     */
    private String downJson(String url) {
        HttpClient httpClient = new HttpClient();
        GetMethod method = new GetMethod(url);
        try {
            int statusCode = httpClient.executeMethod(method);
            LOGGER.info("??" + statusCode);
            if (statusCode != HttpStatus.SC_OK) {
                LOGGER.error(": " + method.getStatusLine());
                // Do not hand an error page to the JSON parser.
                return "";
            }
            byte[] body = method.getResponseBody();
            if (body == null) {
                // getResponseBody() yields null when no body is available;
                // new String(null, ...) would throw an uncaught NullPointerException.
                return "";
            }
            return new String(body, "utf-8");
        } catch (IOException e) {
            LOGGER.error("" + url, e);
        } finally {
            // Always release the connection.
            method.releaseConnection();
        }
        return "";
    }

    /**
     * Converts the JSON rule document into {@code UrlPattern} objects.
     * Expected layout: a JSON array of objects with "urlPattern" and
     * "pageTemplates"; each page template has "templateName", "tableName" and
     * "cssPaths"; each css path has "cssPath", "fieldName", "fieldDescription"
     * and "extractFunctions"; each extract function has "extractExpression",
     * "fieldName" and "fieldDescription".
     *
     * Parsing is best-effort: every object is attached to its parent before its
     * fields are read, and a failure is logged and confined to the level where
     * it occurred, so a partially populated object can remain in the result.
     *
     * @param json JSON text of the rule set
     * @return the parsed URL patterns; empty when the document cannot be read
     */
    @SuppressWarnings("unchecked")
    private List<UrlPattern> parseJson(String json) {
        List<UrlPattern> urlPatterns = new ArrayList<>();
        try {
            // Jackson returns an untyped List of Maps for a JSON array of objects.
            List<Map<String, Object>> ups = MAPPER.readValue(json, List.class);
            for (Map<String, Object> up : ups) {
                parseUrlPattern(up, urlPatterns);
            }
        } catch (Exception e) {
            LOGGER.error("JSON?", e);
        }
        return urlPatterns;
    }

    /** Parses one URL-pattern object and its page templates, appending it to {@code urlPatterns}. */
    @SuppressWarnings("unchecked")
    private void parseUrlPattern(Map<String, Object> up, List<UrlPattern> urlPatterns) {
        try {
            UrlPattern urlPattern = new UrlPattern();
            urlPatterns.add(urlPattern);
            urlPattern.setUrlPattern(up.get("urlPattern").toString());
            List<Map<String, Object>> pageTemplates = (List<Map<String, Object>>) up.get("pageTemplates");
            for (Map<String, Object> pt : pageTemplates) {
                parseHtmlTemplate(pt, urlPattern);
            }
        } catch (Exception e) {
            LOGGER.error("JSON?", e);
        }
    }

    /** Parses one page-template object and its css paths, attaching it to {@code urlPattern}. */
    @SuppressWarnings("unchecked")
    private void parseHtmlTemplate(Map<String, Object> pt, UrlPattern urlPattern) {
        try {
            HtmlTemplate htmlTemplate = new HtmlTemplate();
            urlPattern.addHtmlTemplate(htmlTemplate);
            htmlTemplate.setTemplateName(pt.get("templateName").toString());
            htmlTemplate.setTableName(pt.get("tableName").toString());
            List<Map<String, Object>> cssPaths = (List<Map<String, Object>>) pt.get("cssPaths");
            for (Map<String, Object> cp : cssPaths) {
                parseCssPath(cp, htmlTemplate);
            }
        } catch (Exception e) {
            LOGGER.error("JSON?", e);
        }
    }

    /** Parses one css-path object and its extract functions, attaching it to {@code htmlTemplate}. */
    @SuppressWarnings("unchecked")
    private void parseCssPath(Map<String, Object> cp, HtmlTemplate htmlTemplate) {
        try {
            CssPath cssPath = new CssPath();
            htmlTemplate.addCssPath(cssPath);
            cssPath.setCssPath(cp.get("cssPath").toString());
            cssPath.setFieldName(cp.get("fieldName").toString());
            cssPath.setFieldDescription(cp.get("fieldDescription").toString());
            List<Map<String, Object>> extractFunctions = (List<Map<String, Object>>) cp
                    .get("extractFunctions");
            for (Map<String, Object> pf : extractFunctions) {
                parseExtractFunction(pf, cssPath);
            }
        } catch (Exception e) {
            LOGGER.error("JSON?", e);
        }
    }

    /** Parses one extract-function object, attaching it to {@code cssPath}. */
    private void parseExtractFunction(Map<String, Object> pf, CssPath cssPath) {
        try {
            ExtractFunction extractFunction = new ExtractFunction();
            cssPath.addExtractFunction(extractFunction);
            extractFunction.setExtractExpression(pf.get("extractExpression").toString());
            extractFunction.setFieldName(pf.get("fieldName").toString());
            extractFunction.setFieldDescription(pf.get("fieldDescription").toString());
        } catch (Exception e) {
            LOGGER.error("JSON?", e);
        }
    }

    /**
     * Groups URL patterns by the prefix of their URL so that a lookup only has
     * to scan the patterns sharing the looked-up URL's prefix. The key is the
     * value returned by {@code urlPrefix} for the pattern's URL. Patterns whose
     * URL cannot be parsed are logged and skipped.
     *
     * @param urlPatterns URL patterns to group
     * @return map from URL prefix key to the patterns with that prefix
     */
    private Map<String, List<UrlPattern>> toMap(List<UrlPattern> urlPatterns) {
        Map<String, List<UrlPattern>> grouped = new ConcurrentHashMap<>();
        for (UrlPattern candidate : urlPatterns) {
            try {
                String prefixKey = urlPrefix(new URL(candidate.getUrlPattern()));
                // Create the bucket for this prefix on first sight.
                if (!grouped.containsKey(prefixKey)) {
                    grouped.put(prefixKey, new ArrayList<UrlPattern>());
                }
                grouped.get(prefixKey).add(candidate);
            } catch (Exception e) {
                LOGGER.error("URL?" + candidate.getUrlPattern(), e);
            }
        }
        return grouped;
    }

    /**
     * Registers several URL patterns by passing each one, in list order, to
     * {@link #addUrlPattern(UrlPattern)}.
     *
     * @param urlPatterns URL patterns to register
     */
    public void addUrlPatterns(List<UrlPattern> urlPatterns) {
        for (UrlPattern each : urlPatterns)
        {
            this.addUrlPattern(each);
        }
    }

    /**
     * Registers one URL pattern under the prefix key of its URL. A pattern whose
     * URL cannot be parsed is logged and not registered.
     *
     * @param urlPattern URL pattern to register
     */
    public void addUrlPattern(UrlPattern urlPattern) {
        try {
            URL url = new URL(urlPattern.getUrlPattern());
            String key = urlPrefix(url);
            // Serialize with init(...) (which locks on this instance too) so the
            // addition cannot land in a map that is being replaced.
            synchronized (this) {
                Map<String, List<UrlPattern>> map = urlPatternMap;
                boolean firstRule = (map == null);
                if (firstRule) {
                    // No rule set installed yet (e.g. the instance was created with an
                    // empty list). Previously this case threw a NullPointerException
                    // that was logged as if the URL were malformed.
                    map = new ConcurrentHashMap<>();
                }
                // Copy-on-write: put a new list instead of appending to the stored one,
                // because getHtmlTemplate may be iterating the stored list on another
                // thread and an in-place add would break that iteration.
                List<UrlPattern> updated = new ArrayList<>();
                List<UrlPattern> current = map.get(key);
                if (current != null) {
                    updated.addAll(current);
                }
                updated.add(urlPattern);
                map.put(key, updated);
                if (firstRule) {
                    urlPatternMap = map;
                }
            }
        } catch (Exception e) {
            LOGGER.error("URL" + urlPattern.getUrlPattern(), e);
        }
    }

    /**
     * Removes the rules registered under the prefix key of the given URL.
     * The whole map entry for that key is removed, i.e. every pattern that
     * shares the same {@code urlPrefix} key, not only the one equal to
     * {@code urlPattern}.
     * NOTE(review): confirm that removing all patterns of the prefix is intended.
     * A URL that cannot be parsed is logged and nothing is removed.
     *
     * @param urlPattern URL (pattern string) whose prefix entry is to be removed
     */
    public void removeUrlPattern(String urlPattern) {
        try {
            URL url = new URL(urlPattern);
            String key = urlPrefix(url);
            Map<String, List<UrlPattern>> map = urlPatternMap;
            // Nothing to remove while no rule set is installed; previously this
            // threw a NullPointerException that was logged as a URL error.
            if (map != null) {
                map.remove(key);
            }
        } catch (Exception e) {
            LOGGER.error("URL" + urlPattern, e);
        }
    }

    /**
     * Builds the lookup key for a URL by concatenating, without separators:
     * the protocol, the host with its dots removed, and the port when the URL
     * specifies one.
     *
     * @param url URL to derive the key from
     * @return the prefix key
     */
    private String urlPrefix(URL url) {
        StringBuilder prefix = new StringBuilder();
        prefix.append(url.getProtocol());
        // Append the host labels back to back, i.e. the host without its dots.
        for (String label : StringUtils.split(url.getHost(), '.')) {
            prefix.append(label);
        }
        // getPort() is -1 when the URL carries no explicit port.
        int port = url.getPort();
        if (port >= 0) {
            prefix.append(port);
        }
        return prefix.toString();
    }

    /**
     * Returns the page templates of every registered URL pattern whose regular
     * expression is found in the given URL. Only patterns stored under the
     * URL's prefix key are tested.
     *
     * @param urlString URL to look up
     * @return the matching page templates; empty when no rule set is installed,
     *         no pattern is registered for the URL's prefix, nothing matches,
     *         or the URL cannot be parsed (which is logged)
     */
    public List<HtmlTemplate> getHtmlTemplate(String urlString) {
        List<HtmlTemplate> pageTemplates = new ArrayList<>();
        // Read the volatile field once so the whole lookup uses one rule set.
        Map<String, List<UrlPattern>> map = urlPatternMap;
        if (map == null) {
            return pageTemplates;
        }
        try {
            URL url = new URL(urlString);
            String key = urlPrefix(url);
            List<UrlPattern> patterns = map.get(key);
            if (patterns == null) {
                // No rules for this prefix is an ordinary outcome. Previously the
                // loop below threw a NullPointerException here, which was caught
                // and logged as an error for every such URL.
                return pageTemplates;
            }
            for (UrlPattern urlPattern : patterns) {
                Matcher matcher = urlPattern.getRegexPattern().matcher(urlString);
                if (matcher.find()) {
                    // Pattern matches: collect its templates.
                    pageTemplates.addAll(urlPattern.getHtmlTemplates());
                }
            }
        } catch (Exception e) {
            LOGGER.error("?URL?" + urlString, e);
        }
        return pageTemplates;
    }

    /**
     * Manual smoke test: loads the rules from a local configuration server
     * (without a Redis subscription, since the host is null) and prints the page
     * templates found for two sample article URLs.
     */
    public static void main(String[] args) throws Exception {
        ExtractRegular registry = ExtractRegular.getInstance(
                "http://localhost:8080/HtmlExtractorServer/api/all_extract_regular.jsp", null, -1);

        String[] sampleUrls = {
            "http://money.163.com/14/0529/19/9TEGPK5T00252G50.html",
            "http://finance.qq.com/a/20140530/004254.htm"
        };
        for (String sampleUrl : sampleUrls) {
            List<HtmlTemplate> found = registry.getHtmlTemplate(sampleUrl);
            for (HtmlTemplate template : found) {
                System.out.println(template);
            }
        }
    }
}