com.bdx.rainbow.service.etl.analyze.SYJHttpAnalyze.java Source code

Java tutorial

Introduction

Here is the source code for com.bdx.rainbow.service.etl.analyze.SYJHttpAnalyze.java

Source

package com.bdx.rainbow.service.etl.analyze;

import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.bdx.rainbow.entity.etl.OrgiPage;
import com.bdx.rainbow.entity.etl.SyjTableBean;
import com.bdx.rainbow.etl.analyze.Analyze;
import com.bdx.rainbow.etl.entity.seed.HttpSeed;
import com.bdx.rainbow.mapper.etl.SyjTableBeanMapper;
import com.bdx.rainbow.service.etl.IOrgiPageService;
import com.bdx.rainbow.service.etl.util.AnalyzeUtil;
import com.bdx.rainbow.service.etl.util.SpringBeanFactory;

/**
 * Analyzer for HTML pages fetched from {@link #DOMAIN}.
 * <p>
 * It implements the three {@link Analyze} extraction steps used by this class:
 * <ul>
 * <li>{@link #findPageSeed(Collection)} - expands a page containing the
 * {@code #pageForm} form into one list-page seed per page number;</li>
 * <li>{@link #findSeed(Collection)} - extracts the detail-page URIs from the
 * links of a list page;</li>
 * <li>{@link #findResult(Collection)} - converts detail pages into entities and
 * stores them through the {@code tableService} bean.</li>
 * </ul>
 * {@link #error(Map)} stores the pages that could not be processed.
 * <p>
 * The entity class, the mapper name and the progress counters all come from
 * the {@link SyjTableBean} set with {@link #setSyjTableBean(SyjTableBean)};
 * it must be set before any of the methods above is called.
 * <p>
 * Progress counters are updated while holding class-wide locks.
 *
 * @author Administrator
 */
public class SYJHttpAnalyze implements Analyze<HttpSeed, HttpSeed, Collection<Object>> {

    private static final Logger logger = LoggerFactory.getLogger(SYJHttpAnalyze.class);

    /**
     * Base URL that form actions and detail URIs found in the pages are
     * appended to.
     */
    public static final String DOMAIN = "http://app1.sfda.gov.cn/datasearch/face3/";

    /**
     * Prefix of the entity setters filled from the detail rows
     * ({@code setTd1}, {@code setTd2}, ...).
     */
    private static final String PREFIX_ATTRIBUTE = "setTd";

    /**
     * Package prefix prepended to {@link SyjTableBean#getTableClass()} to build
     * the entity class name.
     */
    private static final String PREFIX_ENTITY_PATH = "com.bdx.rainbow.entity.etl.";

    /**
     * Name of the method invoked reflectively on the {@code tableService} bean
     * to store a batch of entities.
     */
    public static final String INSERT_METHOD = "insertBatch";

    /**
     * Number of newly counted records after which the progress held in
     * {@link #syjTableBean} is written back through the mapper.
     */
    private static final int FLUSH_THRESHOLD = 2000;

    /** Table class whose detail rows get the special treatment below. */
    private static final String TABLE74 = "TABLE74";

    /** Row number at which filling of a {@link #TABLE74} entity stops. */
    private static final int TABLE74_STOP_ROW = 11;

    /**
     * Index (0-based) of the {@code <table>} element the total page count is
     * read from.
     */
    private static final int PAGER_TABLE_INDEX = 4;

    /**
     * Guards the "correct" counters. A dedicated object is used instead of a
     * String literal: literals are interned, so a literal-based monitor would
     * be shared with any other code synchronizing on an equal literal. The
     * lock is static so that instances still exclude each other.
     */
    private static final Object CORRECT_LOCK = new Object();

    /** Guards the "error" counters; see {@link #CORRECT_LOCK}. */
    private static final Object ERROR_LOCK = new Object();

    /** Describes the table being crawled and carries its progress counters. */
    private SyjTableBean syjTableBean;

    /** DAO used to write the progress counters back. */
    private SyjTableBeanMapper syjTableBeanMapper;

    /** Correctly stored records counted since the last progress write. */
    private long pendingCorrect = 0L;

    /** Failed pages counted since the last progress write. */
    private long pendingError = 0L;

    /**
     * Creates the analyzer and looks up the {@code syjTableBeanMapper} bean.
     */
    public SYJHttpAnalyze() {
        this.syjTableBeanMapper = (SyjTableBeanMapper) SpringBeanFactory.getSpringBean("syjTableBeanMapper");
    }

    /**
     * Builds one entity from a detail page.
     * <p>
     * The rows of the first {@code .listmain table} are walked in order. A row
     * is used when its first cell carries {@code nowrap="true"}; the last row
     * is always used. The value of the n-th used row is passed to the entity's
     * {@code setTd<n>} setter. Finally {@code setContentId} (taken from the
     * seed URL, {@code 0} when it cannot be extracted) and
     * {@code setCreateTime} are invoked.
     *
     * @param seed
     *            detail-page seed providing the HTML and the URL
     * @return the populated entity
     * @throws Exception
     *             if the page has no detail table, a used row has no value
     *             cell, or a reflective call fails
     */
    private Object analyzeLicenseDetail(HttpSeed seed) throws Exception {
        Document doc = parse(seed.getHtml());

        Elements detailTables = doc.select(".listmain table");
        if (detailTables.isEmpty()) {
            throw new IllegalStateException("No '.listmain table' element in detail page: " + seed.getUrl());
        }
        Elements rows = detailTables.get(0).select("tr");

        String tableClass = syjTableBean.getTableClass();
        Object entity = AnalyzeUtil.getInstant(PREFIX_ENTITY_PATH + tableClass);

        // rowNo numbers the rows that are actually used, starting at 1.
        int rowNo = 1;
        int lastIndex = rows.size() - 1;
        for (int i = 0; i <= lastIndex; i++) {
            Element row = rows.get(i);

            // Every row except the last one must start with a nowrap="true" cell.
            if (i != lastIndex && !startsWithNowrapCell(row)) {
                continue;
            }

            // For TABLE74 nothing is set once rowNo has reached 11. rowNo is
            // not advanced here, so the 11th and all later rows are skipped.
            // NOTE(review): confirm that dropping every later row (and not
            // only the 11th) is intended.
            if (TABLE74.equals(tableClass) && rowNo == TABLE74_STOP_ROW) {
                continue;
            }

            String value = parseDetailTr(row);

            AnalyzeUtil.executeMethod(entity, PREFIX_ATTRIBUTE + rowNo, new Object[] { value },
                    new Class[] { String.class });
            rowNo++;
        }

        // The content id is taken from the "Id" request parameter of the URL.
        Object idText = AnalyzeUtil.regex(seed.getUrl(), ".+?&Id=(.+?)");
        Long contentId = (null == idText) ? Long.valueOf(0L) : Long.valueOf(idText.toString());
        AnalyzeUtil.executeMethod(entity, "setContentId", new Object[] { contentId }, new Class[] { Long.class });

        AnalyzeUtil.executeMethod(entity, "setCreateTime", new Object[] { new Timestamp(new Date().getTime()) },
                new Class[] { Timestamp.class });

        return entity;
    }

    /**
     * Tells whether the first {@code <td>} of a row has {@code nowrap="true"}.
     *
     * @param row
     *            table row to inspect
     * @return {@code false} when the row has no {@code <td>} or the attribute
     *         is missing or has another value
     */
    private boolean startsWithNowrapCell(Element row) {
        Elements cells = row.select("td");
        if (cells.isEmpty()) {
            return false;
        }
        Element first = cells.get(0);
        return first.hasAttr("nowrap") && "true".equals(first.attr("nowrap"));
    }

    /**
     * Converts every detail page into an entity, stores the whole batch
     * through the {@code tableService} bean and updates the "correct"
     * progress counter.
     *
     * @param seeds
     *            detail-page seeds
     * @return the entities that were stored, or {@code null} when
     *         {@code seeds} is empty
     * @throws Exception
     *             if a page cannot be analyzed or the batch cannot be stored
     */
    @Override
    public Collection<Object> findResult(Collection<HttpSeed> seeds) throws Exception {
        if (CollectionUtils.isEmpty(seeds)) {
            return null;
        }

        Collection<Object> pages = new ArrayList<Object>(seeds.size());
        for (HttpSeed seed : seeds) {
            pages.add(analyzeLicenseDetail(seed));
        }

        Object tableService = SpringBeanFactory.getSpringBean("tableService");

        try {
            AnalyzeUtil.executeMethod(tableService, INSERT_METHOD,
                    new Object[] { pages, syjTableBean.getTableClass() + "Mapper" },
                    new Class[] { List.class, String.class });

            recordCorrect(pages.size());
        } catch (Exception e) {
            // Same exception type as before, now with context and the cause.
            throw new Exception(
                    "Failed to store analyzed records for table class [" + syjTableBean.getTableClass() + "]", e);
        }

        return pages;
    }

    /**
     * Adds {@code count} to the "correct" counter of {@link #syjTableBean} and
     * writes the progress back once {@link #FLUSH_THRESHOLD} records have
     * accumulated.
     *
     * @param count
     *            number of records just stored
     */
    private void recordCorrect(int count) {
        synchronized (CORRECT_LOCK) {
            syjTableBean.setCorrect(syjTableBean.getCorrect() + count);

            pendingCorrect += count;

            if (pendingCorrect >= FLUSH_THRESHOLD) {
                saveProgress();
                pendingCorrect = 0L;

                // NOTE(review): the "?" in this message looks like a label that
                // lost its non-ASCII characters; the literal is kept as found.
                logger.info("[" + syjTableBean.getTitle() + "]?[" + syjTableBean.getCorrect()
                        + "]/[" + syjTableBean.getTotal() + "]");
            }
        }
    }

    /**
     * Adds {@code count} to the "error" counter of {@link #syjTableBean} and
     * writes the progress back once {@link #FLUSH_THRESHOLD} pages have
     * accumulated.
     *
     * @param count
     *            number of failed pages just stored
     */
    private void recordError(int count) {
        synchronized (ERROR_LOCK) {
            syjTableBean.setError(syjTableBean.getError() + count);

            pendingError += count;

            if (pendingError >= FLUSH_THRESHOLD) {
                saveProgress();
                pendingError = 0L;

                // NOTE(review): the "?" in this message looks like a label that
                // lost its non-ASCII characters; the literal is kept as found.
                logger.info("[" + syjTableBean.getTitle() + "]?[" + syjTableBean.getError()
                        + "]/[" + syjTableBean.getTotal() + "]");
            }
        }
    }

    /**
     * Writes {@link #syjTableBean} back through the mapper. Progress tracking
     * is best-effort: a failure is logged and does not interrupt the crawl.
     */
    private void saveProgress() {
        try {
            syjTableBeanMapper.updateByPrimaryKey(syjTableBean);
        } catch (Exception e) {
            logger.warn("Failed to save progress of [" + syjTableBean.getTitle() + "]", e);
        }
    }

    /**
     * Extracts the detail-page seeds from list pages.
     * <p>
     * For every {@code a[href]} element the text between the first and the
     * last single quote of the {@code href} is taken as the detail URI and
     * appended to {@link #DOMAIN}. Links without such a quoted, non-blank part
     * are skipped, and so are pages without any link.
     *
     * @param seeds
     *            list-page seeds
     * @return the detail seeds, or {@code null} when {@code seeds} is empty or
     *         none of the pages contains a link
     * @throws Exception
     *             if a page cannot be parsed
     */
    @Override
    public Collection<HttpSeed> findSeed(Collection<HttpSeed> seeds) throws Exception {

        if (CollectionUtils.isEmpty(seeds)) {
            return null;
        }

        Collection<HttpSeed> seedGroups = new ArrayList<HttpSeed>();
        boolean foundLinks = false;

        for (HttpSeed seed : seeds) {
            Elements links = parse(seed.getHtml()).select("a[href]");

            // A page without links must not discard what other pages yielded.
            if (links.isEmpty()) {
                continue;
            }
            foundLinks = true;

            for (Element link : links) {
                String uri = extractQuotedUri(link.attr("href"));

                if (StringUtils.isBlank(uri)) {
                    continue;
                }

                seedGroups.add(initDetailHttpSeed(DOMAIN + uri));
            }
        }

        return foundLinks ? seedGroups : null;
    }

    /**
     * Returns the text between the first and the last single quote of
     * {@code href}.
     *
     * @param href
     *            value of an {@code href} attribute
     * @return the quoted text, or {@code null} when {@code href} does not
     *         contain two single quotes
     */
    private static String extractQuotedUri(String href) {
        int open = href.indexOf('\'');
        int close = href.lastIndexOf('\'');
        if (open < 0 || close <= open) {
            return null;
        }
        return href.substring(open + 1, close);
    }

    /**
     * Expands pages containing the {@code #pageForm} form into one list-page
     * seed per page number.
     * <p>
     * Every seed posts to the form's action (appended to {@link #DOMAIN}) with
     * the form's {@code input} values plus {@code curstart} set to the page
     * number, from 1 up to the total read by
     * {@link #getTotalPageNum(Document)}. Pages without the form are skipped.
     *
     * @param seeds
     *            seeds of pages that contain the paging form
     * @return the list-page seeds, or {@code null} when {@code seeds} is empty
     *         or none of the pages contains the form
     * @throws Exception
     *             if a page cannot be parsed or its page count cannot be read
     */
    @Override
    public Collection<HttpSeed> findPageSeed(Collection<HttpSeed> seeds) throws Exception {

        if (CollectionUtils.isEmpty(seeds)) {
            return null;
        }

        Collection<HttpSeed> seedGroups = new ArrayList<HttpSeed>();
        boolean foundForm = false;

        for (HttpSeed seed : seeds) {
            Document doc = parse(seed.getHtml());

            // A page without the form must not discard what other pages yielded.
            Elements pageForms = doc.select("#pageForm");
            if (pageForms.isEmpty()) {
                continue;
            }
            foundForm = true;

            Element pageForm = pageForms.get(0);
            String url = DOMAIN + pageForm.attr("action");

            // The form inputs are the same for every page, so read them once.
            Map<String, String> formParams = new HashMap<String, String>();
            for (Element input : pageForm.select("input")) {
                formParams.put(input.attr("name"), input.attr("value"));
            }

            int totalPageNum = this.getTotalPageNum(doc);

            for (int pageNo = 1; pageNo <= totalPageNum; pageNo++) {
                // Each seed gets its own parameter map.
                Map<String, String> params = new HashMap<String, String>(formParams);
                params.put("curstart", String.valueOf(pageNo));

                seedGroups.add(this.initListHttpSeed(url, params));
            }
        }

        return foundForm ? seedGroups : null;
    }

    /**
     * Stores pages that could not be processed and updates the "error"
     * progress counter.
     * <p>
     * Every seed becomes an {@link OrgiPage} holding its HTML, the resolve
     * type (the map key), the current table class, status {@code 0} and the
     * current time; the pages of one map entry are saved as one batch.
     *
     * @param seeds
     *            failed seeds keyed by resolve type; {@code null}, empty maps
     *            and empty entries are ignored
     * @throws Exception
     *             if a batch cannot be saved
     */
    @Override
    public void error(Map<String, Collection<HttpSeed>> seeds) throws Exception {
        if (null == seeds || seeds.isEmpty()) {
            return;
        }

        IOrgiPageService orgiPageService = (IOrgiPageService) SpringBeanFactory.getSpringBean("orgiPageService");

        for (Entry<String, Collection<HttpSeed>> entry : seeds.entrySet()) {
            Collection<HttpSeed> failedSeeds = entry.getValue();

            // Nothing to store for this resolve type; avoid an empty batch.
            if (CollectionUtils.isEmpty(failedSeeds)) {
                continue;
            }

            List<OrgiPage> orgiList = new ArrayList<OrgiPage>(failedSeeds.size());
            for (HttpSeed seed : failedSeeds) {
                OrgiPage orgiPage = new OrgiPage();
                orgiPage.setContent(seed.getHtml());
                orgiPage.setResolveType(entry.getKey());
                orgiPage.setClassName(syjTableBean.getTableClass());
                orgiPage.setStatus((short) 0);
                orgiPage.setUpdateDate(new Timestamp(new Date().getTime()));

                orgiList.add(orgiPage);
            }

            orgiPageService.saveBatch(orgiList);

            recordError(orgiList.size());
        }
    }

    /**
     * Creates the seed of a detail page; it is resolved as
     * {@link HttpSeed#RESOLVETYPE_RESULT}.
     *
     * @param url
     *            absolute URL of the detail page
     * @return the new seed
     * @throws Exception
     *             declared for symmetry with the other factory method
     */
    private HttpSeed initDetailHttpSeed(String url) throws Exception {
        HttpSeed httpSeed = new HttpSeed();

        httpSeed.setUrl(url);
        httpSeed.setCreateTime(System.currentTimeMillis());
        httpSeed.setResolveTypes(new HashSet<String>());
        httpSeed.getResolveTypes().add(HttpSeed.RESOLVETYPE_RESULT);

        return httpSeed;
    }

    /**
     * Reads the total number of list pages from the page.
     * <p>
     * The number is extracted with a regular expression from the HTML of the
     * first cell of the fifth {@code <table>} element.
     *
     * @param document
     *            parsed page containing the pager
     * @return the total page count, or {@code 0} when the expression does not
     *         match
     * @throws Exception
     *             if the pager table or its cell is missing, or the extracted
     *             text is not a number
     */
    private int getTotalPageNum(Document document) throws Exception {
        Elements tables = document.select("table");
        if (tables.size() <= PAGER_TABLE_INDEX) {
            throw new IllegalStateException("Pager table not found: expected more than " + PAGER_TABLE_INDEX
                    + " <table> elements but found " + tables.size());
        }

        Elements pagerCells = tables.get(PAGER_TABLE_INDEX).select("tr td");
        if (pagerCells.isEmpty()) {
            throw new IllegalStateException("Pager table contains no <td> element");
        }

        String pageHtml = pagerCells.get(0).html();

        // NOTE(review): this pattern looks like it lost non-ASCII characters
        // around the capturing group; the literal is kept exactly as found and
        // should be checked against the original source.
        String regex = ".+?(.+?).+?";

        Object result = AnalyzeUtil.regex(pageHtml, regex);
        if (null == result) {
            return 0;
        }

        return Integer.parseInt(result.toString().trim());
    }

    /**
     * Creates the seed of a list page; it is resolved as
     * {@link HttpSeed#RESOLVETYPE_SEED_COMMON}.
     *
     * @param url
     *            absolute URL the paging form is submitted to
     * @param params
     *            request parameters of the page
     * @return the new seed
     * @throws Exception
     *             declared for symmetry with the other factory method
     */
    private HttpSeed initListHttpSeed(String url, Map<String, String> params) throws Exception {
        HttpSeed httpSeed = new HttpSeed();

        httpSeed.setUrl(url);
        httpSeed.setParam(params);
        httpSeed.setCreateTime(System.currentTimeMillis());
        httpSeed.setResolveTypes(new HashSet<String>());
        httpSeed.getResolveTypes().add(HttpSeed.RESOLVETYPE_SEED_COMMON);

        return httpSeed;
    }

    /**
     * Parses an HTML string into a {@link Document}.
     * <p>
     * The second argument of {@code Jsoup.parse(String, String)} is the base
     * URI used to resolve relative links, not a charset, so {@link #DOMAIN} is
     * passed. An HTML string is already decoded and needs no charset.
     *
     * @param html
     *            HTML source of a page
     * @return the parsed document
     * @throws Exception
     *             if the HTML cannot be parsed
     */
    private Document parse(String html) throws Exception {
        return Jsoup.parse(html, DOMAIN);
    }

    /**
     * Returns the value of a detail row, read from its second {@code <td>}.
     * <p>
     * When the cell has child elements, the inner HTML of the first child is
     * returned as is; otherwise the trimmed inner HTML of the cell.
     *
     * @param eleTr
     *            detail row
     * @return the value of the row
     * @throws Exception
     *             if the row has fewer than two {@code <td>} elements
     */
    private String parseDetailTr(Element eleTr) throws Exception {
        Elements cells = eleTr.select("td");
        if (cells.size() < 2) {
            throw new IllegalStateException(
                    "Detail row has no value cell: expected at least 2 <td> elements but found " + cells.size());
        }

        Element valueCell = cells.get(1);

        if (valueCell.children().size() > 0) {
            return valueCell.child(0).html();
        }
        return valueCell.html().trim();
    }

    /**
     * @return the syjTableBean
     */
    public SyjTableBean getSyjTableBean() {
        return syjTableBean;
    }

    /**
     * @param syjTableBean
     *            the syjTableBean to set
     */
    public void setSyjTableBean(SyjTableBean syjTableBean) {
        this.syjTableBean = syjTableBean;
    }
}