com.weibo.datasys.crawler.impl.strategy.rule.save.PageToFileSaveRule.java Source code

Java tutorial

Introduction

Here is the source code for com.weibo.datasys.crawler.impl.strategy.rule.save.PageToFileSaveRule.java

Source

/**
 *  Copyright (c)  2016-2020 Weibo, Inc.
 *  All rights reserved.
 *
 *  This software is the confidential and proprietary information of Weibo, 
 *  Inc. ("Confidential Information"). You shall not
 *  disclose such Confidential Information and shall use it only in
 *  accordance with the terms of the license agreement you entered into with Weibo.
 */
package com.weibo.datasys.crawler.impl.strategy.rule.save;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Map;

import org.apache.commons.lang.ObjectUtils.Null;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.weibo.datasys.common.util.IOUtil;
import com.weibo.datasys.common.util.StringUtils;
import com.weibo.datasys.crawler.base.entity.CrawlInfo;
import com.weibo.datasys.crawler.base.entity.PageData;
import com.weibo.datasys.crawler.base.entity.ParseInfo;
import com.weibo.datasys.crawler.base.entity.Task;
import com.weibo.datasys.crawler.base.manager.QueueManager.QueueType;
import com.weibo.datasys.crawler.base.strategy.rule.save.AbstractSaveRule;

/**
 * Save rule that writes the content of a crawled page to a file on local disk.
 *
 * <p>
 * Each page is written to
 * {@code <baseDir>/<yyyyMMdd>/<IOUtil.md5ToPath(id, subDirCount, subDirLength)>/<id>.p},
 * where {@code yyyyMMdd} is the date at the time of saving and {@code id} is
 * the seed's {@code "id"} extend field, falling back to the page data id when
 * that field is empty.
 */
public class PageToFileSaveRule extends AbstractSaveRule {

    private static final Logger logger = LoggerFactory.getLogger(PageToFileSaveRule.class);

    /** Root directory under which the dated page directories are created. */
    private File baseDir = new File("page");

    /** Passed to {@code IOUtil.md5ToPath} as its second argument. */
    private int subDirCount = 2;

    /** Passed to {@code IOUtil.md5ToPath} as its third argument. */
    private int subDirLength = 2;

    /**
     * Creates the rule for the given task and registers it under the
     * {@code SAVE_PAGE} queue type.
     *
     * @param task
     *            task this rule belongs to
     */
    public PageToFileSaveRule(Task task) {
        super(task);
        this.type = QueueType.SAVE_PAGE.name();
    }

    /**
     * Configures the rule from its parameter map.
     *
     * <p>
     * Recognised keys:
     * <ul>
     * <li>{@code baseDir} - root output directory; created if it does not exist
     * yet. When absent or empty the current value is kept.</li>
     * <li>{@code subDirCount} / {@code subDirLength} - integers forwarded to
     * {@code IOUtil.md5ToPath}; the current value is passed to
     * {@code StringUtils.parseInt} as the fallback (presumably used when the
     * entry is missing or not a number - confirm against StringUtils).</li>
     * </ul>
     *
     * @param paraMap
     *            configuration parameters of this rule
     */
    @Override
    public void configWithParameters(Map<String, String> paraMap) {
        if (StringUtils.isNotEmpty(paraMap.get("baseDir"))) {
            baseDir = new File(paraMap.get("baseDir"));
            if (!baseDir.exists()) {
                baseDir.mkdirs();
            }
        }
        subDirCount = StringUtils.parseInt(paraMap.get("subDirCount"), subDirCount);
        subDirLength = StringUtils.parseInt(paraMap.get("subDirLength"), subDirLength);
    }

    /**
     * Writes the page carried by {@code parseInfo} to its file.
     *
     * <p>
     * Nothing is done when the crawl info has no valid task or when there is no
     * page data. Any exception raised while building the path or writing the
     * file is logged and not propagated.
     *
     * @param parseInfo
     *            parse result holding the crawl info and the page data to save
     * @return always {@code null}
     */
    @Override
    public Null apply(ParseInfo parseInfo) {
        CrawlInfo crawlInfo = parseInfo.getThisCrawlInfo();
        Task task = crawlInfo.getValidTask();
        if (task == null) {
            return null;
        }
        PageData pageData = parseInfo.getPageData();
        if (pageData != null) {
            try {
                // Prefer the id carried by the seed; fall back to the page's own id.
                String id = crawlInfo.getSeedData().getExtendField("id");
                if (StringUtils.isEmptyString(id)) {
                    id = pageData.getId();
                }
                String path = IOUtil.md5ToPath(id, subDirCount, subDirLength);
                // SimpleDateFormat is not thread-safe, so a fresh instance is
                // created per call instead of being shared.
                SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");
                String today = sdf.format(new Date());
                String pageDirString = baseDir.getPath() + File.separatorChar + today + File.separatorChar + path;
                File pageDir = new File(pageDirString);
                // Result intentionally not checked: if the directory could not be
                // created, opening the file below fails and the error is logged.
                pageDir.mkdirs();
                String pagePath = pageDir.getPath() + File.separatorChar + id + ".p";
                // try-with-resources guarantees the stream (and the underlying file
                // handle) is closed even when write() throws.
                try (OutputStream out = new BufferedOutputStream(new FileOutputStream(pagePath))) {
                    out.write(pageData.getHtml());
                }
            } catch (Exception e) {
                logger.error("[PageToFileSaveRuleError] - ", e);
            }
            // NOTE(review): the page is counted even when the write above failed;
            // kept as in the original - confirm whether failures should be counted.
            countSaveResult("Page", 1);
        }
        return null;
    }

}