Java tutorial
/** * * APDPlat - Application Product Development Platform * Copyright (c) 2013, ??, yang-shangchuan@qq.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * */ package org.apdplat.extractor.html; import java.io.IOException; import java.net.URL; import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; import java.util.regex.Matcher; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.HttpStatus; import org.apache.commons.httpclient.methods.GetMethod; import org.apache.commons.lang.StringUtils; import org.apdplat.extractor.html.model.CssPath; import org.apdplat.extractor.html.model.ExtractFunction; import org.apdplat.extractor.html.model.HtmlTemplate; import org.apdplat.extractor.html.model.UrlPattern; import org.codehaus.jackson.map.ObjectMapper; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import redis.clients.jedis.Jedis; import redis.clients.jedis.JedisPool; import redis.clients.jedis.JedisPoolConfig; import redis.clients.jedis.JedisPubSub; /** * URL? * Redis?Channelpr??CHANGE??? * ? * 1???web??? * 2?? * 3? * * @author ?? * */ public class ExtractRegular { private static final Logger LOGGER = LoggerFactory.getLogger(ExtractRegular.class); private static final ObjectMapper MAPPER = new ObjectMapper(); private static ExtractRegular extractRegular = null; private volatile Map<String, List<UrlPattern>> urlPatternMap = null; /** * ? */ private ExtractRegular() { } /** * ?? * @param urlPatterns url? * @return ? */ public static ExtractRegular getInstance(List<UrlPattern> urlPatterns) { if (extractRegular != null) { return extractRegular; } synchronized (ExtractRegular.class) { if (extractRegular == null) { extractRegular = new ExtractRegular(); //?? extractRegular.init(urlPatterns); } } return extractRegular; } /** * ?? * * @param serverUrl ??WEB??? * @param redisHost Redis? * @param redisPort Redis?? * @return ? */ public static ExtractRegular getInstance(String serverUrl, String redisHost, int redisPort) { if (extractRegular != null) { return extractRegular; } synchronized (ExtractRegular.class) { if (extractRegular == null) { extractRegular = new ExtractRegular(); //Redis?Channelpr??CHANGE??? extractRegular.subscribeRedis(redisHost, redisPort, serverUrl); //?? extractRegular.init(serverUrl); } } return extractRegular; } /** * ? * 1???web???json * 2??jsonjava * * @param serverUrl ??WEB??? */ private synchronized void init(String serverUrl) { LOGGER.info("URL?"); LOGGER.info("serverUrl: " + serverUrl); //??web??? String json = downJson(serverUrl); LOGGER.info("?URL?"); //? LOGGER.info("?URL?"); List<UrlPattern> urlPatterns = parseJson(json); LOGGER.info("??URL?"); init(urlPatterns); } /** * ? * ? * * @param urlPatterns url? */ private synchronized void init(List<UrlPattern> urlPatterns) { LOGGER.info("?URL?"); //? Map<String, List<UrlPattern>> newUrlPatterns = toMap(urlPatterns); if (!newUrlPatterns.isEmpty()) { Map<String, List<UrlPattern>> oldUrlPatterns = urlPatternMap; urlPatternMap = newUrlPatterns; //?? if (oldUrlPatterns != null) { for (List<UrlPattern> list : oldUrlPatterns.values()) { list.clear(); } oldUrlPatterns.clear(); } } LOGGER.info("??URL?"); } /** * Redis?Channelpr??CHANGE??? */ private void subscribeRedis(final String redisHost, final int redisPort, final String serverUrl) { if (null == redisHost || redisPort < 1) { LOGGER.error("redis??!"); return; } Thread thread = new Thread(new Runnable() { @Override public void run() { String channel = "pr"; LOGGER.info("redis??? host:" + redisHost + ",port:" + redisPort + ",channel:" + channel); while (true) { try { JedisPool jedisPool = new JedisPool(new JedisPoolConfig(), redisHost, redisPort); Jedis jedis = jedisPool.getResource(); LOGGER.info("redis?"); jedis.subscribe(new ExtractRegularChangeRedisListener(serverUrl), new String[] { channel }); jedisPool.returnResource(jedis); LOGGER.info("redis?"); break; } catch (Exception e) { LOGGER.info("redis????"); try { Thread.sleep(600000); } catch (InterruptedException ex) { LOGGER.error(ex.getMessage(), ex); } } } } }); thread.setDaemon(true); thread.setName("redis??"); thread.start(); } /** * Redis???? * * @author ?? * */ private class ExtractRegularChangeRedisListener extends JedisPubSub { private final String serverUrl; public ExtractRegularChangeRedisListener(String serverUrl) { this.serverUrl = serverUrl; } @Override public void onMessage(String channel, String message) { LOGGER.debug("onMessage channel:" + channel + " and message:" + message); if ("pr".equals(channel) && "CHANGE".equals(message)) { synchronized (ExtractRegularChangeRedisListener.class) { init(serverUrl); } } } @Override public void onPMessage(String pattern, String channel, String message) { LOGGER.debug("pattern:" + pattern + " and channel:" + channel + " and message:" + message); onMessage(channel, message); } @Override public void onPSubscribe(String pattern, int subscribedChannels) { LOGGER.debug("psubscribe pattern:" + pattern + " and subscribedChannels:" + subscribedChannels); } @Override public void onPUnsubscribe(String pattern, int subscribedChannels) { LOGGER.debug("punsubscribe pattern:" + pattern + " and subscribedChannels:" + subscribedChannels); } @Override public void onSubscribe(String channel, int subscribedChannels) { LOGGER.debug("subscribe channel:" + channel + " and subscribedChannels:" + subscribedChannels); } @Override public void onUnsubscribe(String channel, int subscribedChannels) { LOGGER.debug("unsubscribe channel:" + channel + " and subscribedChannels:" + subscribedChannels); } } /** * ??WEB?json * * @param url ??WEB?? * @return json */ private String downJson(String url) { // HttpClient HttpClient httpClient = new HttpClient(); // GET GetMethod method = new GetMethod(url); try { // GetMethod int statusCode = httpClient.executeMethod(method); LOGGER.info("??" + statusCode); if (statusCode != HttpStatus.SC_OK) { LOGGER.error(": " + method.getStatusLine()); } // ? String responseBody = new String(method.getResponseBody(), "utf-8"); return responseBody; } catch (IOException e) { LOGGER.error("" + url, e); } finally { // method.releaseConnection(); } return ""; } /** * json?URL??JAVA * * @param json URL?JSON * @return URL?JAVA */ private List<UrlPattern> parseJson(String json) { List<UrlPattern> urlPatterns = new ArrayList<>(); try { List<Map<String, Object>> ups = MAPPER.readValue(json, List.class); for (Map<String, Object> up : ups) { try { UrlPattern urlPattern = new UrlPattern(); urlPatterns.add(urlPattern); urlPattern.setUrlPattern(up.get("urlPattern").toString()); List<Map<String, Object>> pageTemplates = (List<Map<String, Object>>) up.get("pageTemplates"); for (Map<String, Object> pt : pageTemplates) { try { HtmlTemplate htmlTemplate = new HtmlTemplate(); urlPattern.addHtmlTemplate(htmlTemplate); htmlTemplate.setTemplateName(pt.get("templateName").toString()); htmlTemplate.setTableName(pt.get("tableName").toString()); List<Map<String, Object>> cssPaths = (List<Map<String, Object>>) pt.get("cssPaths"); for (Map<String, Object> cp : cssPaths) { try { CssPath cssPath = new CssPath(); htmlTemplate.addCssPath(cssPath); cssPath.setCssPath(cp.get("cssPath").toString()); cssPath.setFieldName(cp.get("fieldName").toString()); cssPath.setFieldDescription(cp.get("fieldDescription").toString()); List<Map<String, Object>> extractFunctions = (List<Map<String, Object>>) cp .get("extractFunctions"); for (Map<String, Object> pf : extractFunctions) { try { ExtractFunction extractFunction = new ExtractFunction(); cssPath.addExtractFunction(extractFunction); extractFunction .setExtractExpression(pf.get("extractExpression").toString()); extractFunction.setFieldName(pf.get("fieldName").toString()); extractFunction .setFieldDescription(pf.get("fieldDescription").toString()); } catch (Exception e) { LOGGER.error("JSON?", e); } } } catch (Exception e) { LOGGER.error("JSON?", e); } } } catch (Exception e) { LOGGER.error("JSON?", e); } } } catch (Exception e) { LOGGER.error("JSON?", e); } } } catch (Exception e) { LOGGER.error("JSON?", e); } return urlPatterns; } /** * url???url? * map+url?? * url??? * * @param urlPatterns url? * @return url?keymap */ private Map<String, List<UrlPattern>> toMap(List<UrlPattern> urlPatterns) { Map<String, List<UrlPattern>> map = new ConcurrentHashMap<>(); for (UrlPattern urlPattern : urlPatterns) { try { URL url = new URL(urlPattern.getUrlPattern()); String key = urlPrefix(url); List<UrlPattern> value = map.get(key); if (value == null) { value = new ArrayList<>(); map.put(key, value); } value.add(urlPattern); } catch (Exception e) { LOGGER.error("URL?" + urlPattern.getUrlPattern(), e); } } return map; } /** * ?URL? * @param urlPatterns URL? */ public void addUrlPatterns(List<UrlPattern> urlPatterns) { for (UrlPattern urlPattern : urlPatterns) { addUrlPattern(urlPattern); } } /** * ?URL? * @param urlPattern URL? */ public void addUrlPattern(UrlPattern urlPattern) { try { URL url = new URL(urlPattern.getUrlPattern()); String key = urlPrefix(url); List<UrlPattern> value = urlPatternMap.get(key); if (value == null) { value = new ArrayList<>(); urlPatternMap.put(key, value); } value.add(urlPattern); } catch (Exception e) { LOGGER.error("URL" + urlPattern.getUrlPattern(), e); } } public void removeUrlPattern(String urlPattern) { try { URL url = new URL(urlPattern); String key = urlPrefix(url); urlPatternMap.remove(key); } catch (Exception e) { LOGGER.error("URL" + urlPattern, e); } } /** * ?url??URL? * * ??+??.)+?? * * @param url * @return */ private String urlPrefix(URL url) { StringBuilder result = new StringBuilder(); result.append(url.getProtocol()); String[] splits = StringUtils.split(url.getHost(), '.'); if (splits.length > 0) { for (String split : splits) { result.append(split); } } if (url.getPort() > -1) { result.append(Integer.toString(url.getPort())); } return result.toString(); } /** * ????URL??? * * @param urlString url * @return ??? */ public List<HtmlTemplate> getHtmlTemplate(String urlString) { List<HtmlTemplate> pageTemplates = new ArrayList<>(); if (urlPatternMap != null) { try { URL url = new URL(urlString); String key = urlPrefix(url); List<UrlPattern> patterns = urlPatternMap.get(key); for (UrlPattern urlPattern : patterns) { Matcher matcher = urlPattern.getRegexPattern().matcher(urlString); if (matcher.find()) { //?? pageTemplates.addAll(urlPattern.getHtmlTemplates()); } } } catch (Exception e) { LOGGER.error("?URL?" + urlString, e); } } return pageTemplates; } public static void main(String[] args) throws Exception { ExtractRegular extractRegular = ExtractRegular .getInstance("http://localhost:8080/HtmlExtractorServer/api/all_extract_regular.jsp", null, -1); List<HtmlTemplate> pageTemplates = extractRegular .getHtmlTemplate("http://money.163.com/14/0529/19/9TEGPK5T00252G50.html"); for (HtmlTemplate pageTemplate : pageTemplates) { System.out.println(pageTemplate); } pageTemplates = extractRegular.getHtmlTemplate("http://finance.qq.com/a/20140530/004254.htm"); for (HtmlTemplate pageTemplate : pageTemplates) { System.out.println(pageTemplate); } } }