Java tutorial
package com.bdx.rainbow.service.etl.analyze; import java.sql.Timestamp; import java.util.ArrayList; import java.util.Collection; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; import org.apache.commons.collections.CollectionUtils; import org.apache.commons.lang.StringUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.bdx.rainbow.entity.etl.OrgiPage; import com.bdx.rainbow.entity.etl.SyjTableBean; import com.bdx.rainbow.etl.analyze.Analyze; import com.bdx.rainbow.etl.entity.seed.HttpSeed; import com.bdx.rainbow.mapper.etl.SyjTableBeanMapper; import com.bdx.rainbow.service.etl.IOrgiPageService; import com.bdx.rainbow.service.etl.util.AnalyzeUtil; import com.bdx.rainbow.service.etl.util.SpringBeanFactory; /** * HTML? * * @author Administrator * */ public class SYJHttpAnalyze implements Analyze<HttpSeed, HttpSeed, Collection<Object>> { private final Logger logger = LoggerFactory.getLogger(SYJHttpAnalyze.class); /** * ???? */ public static final String DOMAIN = "http://app1.sfda.gov.cn/datasearch/face3/"; /** * ? */ private static final String PREFIX_ATTRIBUTE = "setTd"; /** * Entity? */ private static final String PREFIX_ENTITY_PATH = "com.bdx.rainbow.entity.etl."; /** * ??? */ public static final String INSERT_METHOD = "insertBatch"; /** * */ private SyjTableBean syjTableBean; /** * DAO */ private SyjTableBeanMapper syjTableBeanMapper; /** * ?? */ private String correntLock = "correntLock"; /** * ? */ private long corrent = 0l; /** * ?? */ private String errorLock = "errorLock"; /** * ? */ private long error = 0l; public SYJHttpAnalyze() { this.syjTableBeanMapper = (SyjTableBeanMapper) SpringBeanFactory.getSpringBean("syjTableBeanMapper"); } /** * ? * * @param document * @return * @throws Exception */ private Object analyzeLicenseDetail(HttpSeed seed) throws Exception { Document doc = parse(seed.getHtml()); Elements eleTable = doc.select(".listmain table"); // TR Elements eleTrs = eleTable.get(0).select("tr"); // ? Object entity = AnalyzeUtil.getInstant(PREFIX_ENTITY_PATH + syjTableBean.getTableClass()); // tr?trtd? int rowNo = 1; for (int i = 0; i < eleTrs.size(); i++) { Element eleTr = eleTrs.get(i); // ??trtd??nowrapnowrap?true if (i != eleTrs.size() - 1 && (!eleTr.select("td").get(0).hasAttr("nowrap") || !"true".equals(eleTr.select("td").get(0).attr("nowrap")))) { continue; } // td? String tdVal = parseDetailTr(eleTr); // TABLE7411?? if (syjTableBean.getTableClass().equals("TABLE74") && rowNo == 11) { continue; } // entity AnalyzeUtil.executeMethod(entity, PREFIX_ATTRIBUTE + rowNo++, new Object[] { tdVal }, new Class[] { String.class }); } // ?ID, ?createEmpCode String regex = ".+?&Id=(.+?)"; Object obj = AnalyzeUtil.regex(seed.getUrl(), regex); if (null == obj) { // ID AnalyzeUtil.executeMethod(entity, "setContentId", new Object[] { 0l }, new Class[] { Long.class }); } else { // ID AnalyzeUtil.executeMethod(entity, "setContentId", new Object[] { Long.valueOf(obj.toString()) }, new Class[] { Long.class }); } // ? AnalyzeUtil.executeMethod(entity, "setCreateTime", new Object[] { new Timestamp(new Date().getTime()) }, new Class[] { Timestamp.class }); return entity; } /** * ?? */ @Override public Collection<Object> findResult(Collection<HttpSeed> seeds) throws Exception { Collection<Object> pages = new ArrayList<Object>(); if (CollectionUtils.isEmpty(seeds)) { return null; } for (HttpSeed seed : seeds) { // ?? Object t = analyzeLicenseDetail(seed); pages.add(t); } Object tableService = SpringBeanFactory.getSpringBean("tableService"); try { // ? AnalyzeUtil.executeMethod(tableService, INSERT_METHOD, new Object[] { pages, syjTableBean.getTableClass() + "Mapper" }, new Class[] { List.class, String.class }); // synchronized (correntLock) { syjTableBean.setCorrect(syjTableBean.getCorrect() + pages.size()); corrent += pages.size(); // ?2000?? if (corrent > 2000) { try { syjTableBeanMapper.updateByPrimaryKey(syjTableBean); } catch (Exception e) { } corrent = 0l; logger.info("[" + syjTableBean.getTitle() + "]?[" + syjTableBean.getCorrect() + "]/[" + syjTableBean.getTotal() + "]"); } } } catch (Exception e) { throw new Exception(e); } return pages; } /** * ??,? */ @Override public Collection<HttpSeed> findSeed(Collection<HttpSeed> seeds) throws Exception { if (CollectionUtils.isEmpty(seeds)) { return null; } Collection<HttpSeed> seedGroups = new ArrayList<HttpSeed>(); // ?HTMLA for (HttpSeed seed : seeds) { Document doc = parse(seed.getHtml()); Elements drug_elements = doc.select("a[href]"); if (drug_elements.isEmpty()) { return null; } for (Element drug_e : drug_elements) { String href_string = drug_e.attr("href"); String uri = href_string.substring(href_string.indexOf("'") + 1, href_string.lastIndexOf("'")); if (StringUtils.isBlank(uri)) { continue; } seedGroups.add(initDetailHttpSeed(DOMAIN + uri)); } } return seedGroups; } /** * ??? */ @Override public Collection<HttpSeed> findPageSeed(Collection<HttpSeed> seeds) throws Exception { if (CollectionUtils.isEmpty(seeds)) { return null; } Collection<HttpSeed> seedGroups = new ArrayList<HttpSeed>(); for (HttpSeed seed : seeds) { Document doc = parse(seed.getHtml()); // ?URL Elements page_form_elements = doc.select("#pageForm"); if (page_form_elements.isEmpty()) { return null; } Element page_form_e = page_form_elements.get(0); // URL String url = DOMAIN + page_form_e.attr("action"); Elements param_elements = page_form_e.select("input"); // int totalPageNum = this.getTotalPageNum(doc); for (int pageNo = 1; pageNo <= totalPageNum; pageNo++) { // ? Map<String, String> params = new HashMap<String, String>(); for (Element param_e : param_elements) { params.put(param_e.attr("name"), param_e.attr("value")); } // params.put("curstart", String.valueOf(pageNo)); HttpSeed httpSeed = this.initListHttpSeed(url, params); seedGroups.add(httpSeed); } } return seedGroups; } /** * ?? */ @Override public void error(Map<String, Collection<HttpSeed>> seeds) throws Exception { IOrgiPageService orgiPageService = (IOrgiPageService) SpringBeanFactory.getSpringBean("orgiPageService"); for (Entry<String, Collection<HttpSeed>> entry : seeds.entrySet()) { List<HttpSeed> list = new ArrayList<HttpSeed>(entry.getValue()); List<OrgiPage> orgiList = new ArrayList<OrgiPage>(); for (HttpSeed seed : list) { OrgiPage orgiPage = new OrgiPage(); orgiPage.setContent(seed.getHtml()); orgiPage.setResolveType(entry.getKey()); orgiPage.setClassName(syjTableBean.getTableClass()); orgiPage.setStatus((short) 0); orgiPage.setUpdateDate(new Timestamp(new Date().getTime())); orgiList.add(orgiPage); } orgiPageService.saveBatch(orgiList); // synchronized (errorLock) { syjTableBean.setError(syjTableBean.getError() + orgiList.size()); error += orgiList.size(); if (error >= 2000) { try { syjTableBeanMapper.updateByPrimaryKey(syjTableBean); } catch (Exception e) { } error = 0l; logger.info("[" + syjTableBean.getTitle() + "]?[" + syjTableBean.getError() + "]/[" + syjTableBean.getTotal() + "]"); } } } } /** * ?HttpSeed * * @return * @throws Exception */ private HttpSeed initDetailHttpSeed(String url) throws Exception { HttpSeed httpSeed = new HttpSeed(); httpSeed.setUrl(url); httpSeed.setCreateTime(System.currentTimeMillis()); // ?? httpSeed.setResolveTypes(new HashSet<String>()); httpSeed.getResolveTypes().add(HttpSeed.RESOLVETYPE_RESULT); return httpSeed; } /** * ? * * @throws Exception */ private int getTotalPageNum(Document document) throws Exception { // ?table? Element pageTable = document.select("table").get(4); String pageHtml = pageTable.select("tr td").get(0).html(); // ?? String regex = ".+?(.+?).+?"; Object result = AnalyzeUtil.regex(pageHtml, regex); if (null == result) { return 0; } int totalPageNum = Integer.parseInt(result.toString()); return totalPageNum; } /** * ?HttpSeed * * @return * @throws Exception */ private HttpSeed initListHttpSeed(String url, Map<String, String> params) throws Exception { HttpSeed httpSeed = new HttpSeed(); httpSeed.setUrl(url); httpSeed.setParam(params); httpSeed.setCreateTime(System.currentTimeMillis()); // ?? httpSeed.setResolveTypes(new HashSet<String>()); httpSeed.getResolveTypes().add(HttpSeed.RESOLVETYPE_SEED_COMMON); return httpSeed; } /** * HTML?Document * * @param html * @return * @throws Exception */ private Document parse(String html) throws Exception { return Jsoup.parse(html, "UTF-8"); } /** * ??? * * @param eleTrs * @param rowNo * @return */ private String parseDetailTr(Element eleTr) throws Exception { Element eleTd = eleTr.select("td").get(1); // td if (eleTd.children().size() > 0) { return eleTd.child(0).html(); } else { return eleTd.html().trim(); } } /** * @return the syjTableBean */ public SyjTableBean getSyjTableBean() { return syjTableBean; } /** * @param syjTableBean * the syjTableBean to set */ public void setSyjTableBean(SyjTableBean syjTableBean) { this.syjTableBean = syjTableBean; } }