net.javacoding.jspider.extension.rule.IncreasingRule.java Source code

Java tutorial

Introduction

Here is the source code for net.javacoding.jspider.extension.rule.IncreasingRule.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package net.javacoding.jspider.extension.rule;

import java.net.MalformedURLException;
import java.net.URL;
import java.text.MessageFormat;
import java.util.Comparator;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.javacoding.jspider.api.model.Decision;
import net.javacoding.jspider.api.model.Site;
import net.javacoding.jspider.core.SpiderContext;
import net.javacoding.jspider.core.event.impl.URLFoundEvent;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import net.javacoding.jspider.core.model.DecisionInternal;
import net.javacoding.jspider.core.rule.impl.BaseRuleImpl;
import net.javacoding.jspider.core.util.URLUtil;
import net.javacoding.jspider.core.util.config.PropertySet;

/**
 * Rule that derives a numbered series of URLs from a single URL whose path
 * matches a configured regular expression.
 *
 * <p>When a URL matches {@link #PATTERN}, the rule builds the first URL of
 * the series from the {@link #REPLACE} template and registers it as a
 * {@link URLFoundEvent}. When that generated URL is later passed to
 * {@link #apply(SpiderContext, Site, URL)}, the next URL of the series is
 * generated, and so on until the counter exceeds {@link #TO}. The series is
 * therefore produced one URL at a time rather than all at once.</p>
 *
 * <p>The {@link #REPLACE} template is a {@link MessageFormat} pattern whose
 * arguments are:</p>
 * <ul>
 * <li>{@code {0}} - the URL that originally matched the pattern;</li>
 * <li>{@code {1}} .. {@code {n}} - the capture groups of the pattern;</li>
 * <li>{@code {n+1}} - the current counter value, running from
 * {@link #FROM} to {@link #TO} inclusive.</li>
 * </ul>
 *
 * <p>NOTE: the counter is handed to {@link MessageFormat} as an
 * {@link Integer}, so a plain <code>{n+1}</code> placeholder is rendered with
 * the default locale's number format and may contain grouping separators for
 * large values. Use a sub-format such as <code>{2,number,#}</code> in the
 * template if that matters.</p>
 *
 * @author changshu.li
 */
public class IncreasingRule extends BaseRuleImpl {

    private static final Log log = LogFactory.getLog(IncreasingRule.class);

    /** Configuration key: first counter value of the series. */
    public static final String FROM = "from";
    /** Configuration key: last counter value of the series (inclusive). */
    public static final String TO = "to";
    /** Configuration key: regular expression the URL path must fully match. */
    public static final String PATTERN = "pattern";
    /** Configuration key: {@link MessageFormat} template for generated URLs. */
    public static final String REPLACE = "replace";
    /** Configuration key: whether the query string takes part in matching. */
    public static final String QUERY_ENABLE = "query.enable";

    protected int from;
    protected int to;
    protected Pattern pattern;
    protected String replace;
    protected boolean queryEnable = false;

    /**
     * Pending series, keyed by the string form of the most recently generated
     * URL. The value is the {@link MessageFormat} argument array of that
     * series: index 0 holds the originally matched URL, the following slots
     * hold the capture groups and the last slot holds the current counter.
     *
     * <p>An entry is only removed when its key URL is passed back to
     * {@link #apply(SpiderContext, Site, URL)}.
     * NOTE(review): if a generated URL never reaches this rule again, its
     * entry stays in the map - confirm this is acceptable for long crawls.</p>
     */
    private final Map<String, Object[]> map = new ConcurrentHashMap<String, Object[]>();

    /**
     * Reads the rule settings from the given configuration. {@link #FROM} and
     * {@link #TO} default to 0 and {@link #QUERY_ENABLE} defaults to false.
     *
     * @param config property set holding the rule configuration
     * @throws NullPointerException if the {@link #PATTERN} property is absent
     */
    public IncreasingRule(PropertySet config) {
        super();
        this.from = config.getInteger(FROM, 0);
        this.to = config.getInteger(TO, 0);
        String regex = config.getString(PATTERN, null);
        if (regex == null) {
            // Pattern.compile(null) would throw the same exception type, but
            // without any hint about which setting is missing.
            throw new NullPointerException(
                    "IncreasingRule requires the '" + PATTERN + "' property");
        }
        this.pattern = Pattern.compile(regex);
        this.replace = config.getString(REPLACE, null);
        this.queryEnable = config.getBoolean(QUERY_ENABLE, false);
        log.info("load pattern " + from + " to " + to);
    }

    /**
     * Continues a pending series if the URL was generated by this rule, or
     * starts a new series if the URL path (plus query string when
     * {@link #QUERY_ENABLE} is set) fully matches the configured pattern.
     * The next URL of the series, if any, is registered with the agent as a
     * {@link URLFoundEvent} whose source is the originally matched URL.
     *
     * @param context     spider context used to register the found URL
     * @param currentSite site being spidered (not used by this rule)
     * @param url         URL under evaluation
     * @return always an accepting decision
     *         ({@link DecisionInternal#RULE_ACCEPT})
     */
    @Override
    public Decision apply(SpiderContext context, Site currentSite, URL url) {
        URL origin = url;
        // A single remove() both tests for and claims the entry. A separate
        // containsKey()/remove() pair could let another thread take the entry
        // in between and leave us with a null array.
        Object[] pts = map.remove(url.toString());
        if (pts != null) {
            origin = (URL) pts[0];
        } else {
            String path = url.getPath();
            if (this.queryEnable && url.getQuery() != null) {
                path += "?" + url.getQuery();
            }
            Matcher mt = pattern.matcher(path);
            if (mt.matches()) {
                int groups = mt.groupCount();
                // slot 0: origin URL, slots 1..groups: captures, last: counter
                pts = new Object[groups + 2];
                pts[0] = url;
                for (int i = 1; i <= groups; i++) {
                    pts[i] = mt.group(i);
                }
            }
        }
        if (pts != null) {
            URL find = this.getNexturl(pts, origin);
            if (find != null) {
                log.debug("Increasing page " + find);
                // Remember the series under the generated URL so that the next
                // step is produced when that URL comes back through apply().
                map.put(find.toString(), pts);
                context.getAgent().registerEvent(origin, new URLFoundEvent(context, origin, find));
            }
        }
        return new DecisionInternal(DecisionInternal.RULE_ACCEPT);
    }

    /**
     * Advances the counter stored in the last slot of {@code pts} and builds
     * the corresponding URL from the {@link #REPLACE} template.
     *
     * <p>The counter starts at {@link #from} when the slot is still
     * {@code null} and is incremented by one otherwise. The array is modified
     * in place, also when the new counter value exceeds {@link #to}.</p>
     *
     * @param pts {@link MessageFormat} arguments of the series; the last
     *            element is the counter slot
     * @param url base URL against which the formatted template is resolved
     * @return the normalized next URL, or {@code null} when the counter has
     *         passed {@link #to} or the formatted text is not a valid URL
     */
    public URL getNexturl(Object[] pts, URL url) {
        int last = pts.length - 1;
        int counter = pts[last] == null ? this.from : ((Integer) pts[last]) + 1;
        pts[last] = Integer.valueOf(counter);
        if (counter > this.to) {
            return null;
        }
        String fm = MessageFormat.format(replace, pts);
        try {
            return URLUtil.normalize(new URL(url, fm));
        } catch (MalformedURLException ex) {
            // Pass the exception as the cause so the stack trace is logged too.
            log.error("Cannot build increasing URL from [" + fm + "] relative to [" + url + "]", ex);
            return null;
        }
    }
}