org.aliuge.crawler.fetcher.Fetcher.java Source code

Java tutorial

Introduction

Here is the source code for org.aliuge.crawler.fetcher.Fetcher.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.aliuge.crawler.fetcher;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.concurrent.TimeUnit;

import org.aliuge.crawler.jobconf.FetchConfig;
import org.aliuge.crawler.util.DateTimeUtil;
import org.aliuge.crawler.util.ProxyIp;
import org.aliuge.crawler.util.WrongUrlLog;
import org.apache.commons.lang.StringUtils;
import org.apache.http.HttpHost;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import com.google.common.collect.Maps;

/**
 * Base class for crawler fetchers that share one static pool of HTTP proxies.
 *
 * <p>The class keeps three pieces of shared static state: the proxy pool
 * ({@link #proxyIps}, entries of the form {@code host:port}), a per-proxy failure
 * counter ({@link #m}) and the flag that keeps the maintenance thread started by
 * {@link #runProxyer()} alive ({@link #proxyerisRuning}).
 *
 * <p>NOTE(review): proxies are selected by writing the JVM-wide system properties
 * {@code http.proxyHost} / {@code http.proxyPort}, so concurrent fetches can
 * overwrite each other's choice. Presumably jsoup's underlying HTTP connection picks
 * these properties up - confirm against the jsoup version in use.
 */
public abstract class Fetcher {
    /** Job configuration shared by all fetchers; assigned by {@link #Fetcher(FetchConfig)}. */
    protected static FetchConfig config;
    private static final Logger slog = Logger.getLogger(Fetcher.class);
    /** Per-job logger, named after the job name concatenated with the index name. */
    public Logger log;

    /** Number of failed fetches recorded per proxy entry ({@code host:port}). */
    public static volatile Map<String, Integer> m = Maps.newConcurrentMap();
    /** Shared proxy pool; {@code null} until a fetcher has been constructed with a config. */
    public static volatile List<String> proxyIps = null;
    /** Set to {@code false} to let the maintenance thread finish after its current round. */
    public static volatile boolean proxyerisRuning = true;

    /** Guards the read-modify-write that increments a failure counter in {@link #m}. */
    private static final Object FAILURE_LOCK = new Object();
    /** Number of attempts made by the retrying {@code goFetchPage} variants. */
    private static final int MAX_FETCH_ATTEMPTS = 5;
    /** A proxy is dropped from the pool once its failure count exceeds this value. */
    private static final int MAX_PROXY_FAILURES = 10;
    /** The pool is refilled when it holds fewer entries than this. */
    private static final int MIN_POOL_SIZE = 30;

    public Fetcher() {
    }

    /**
     * Creates a fetcher for the given job and publishes its configuration and proxy
     * list into the shared static state.
     *
     * <p>The list returned by {@code config.getProxyIps()} is used directly as the
     * shared pool (no copy is made), so entries fetched here are appended to the
     * configuration's own list. When the configured list holds fewer than 10 entries
     * it is topped up from {@code ProxyIp.fetchProxyIps(false)}.
     *
     * @param config job configuration; must not be {@code null}
     */
    @SuppressWarnings("unchecked")
    public Fetcher(FetchConfig config) {
        log = Logger.getLogger(config.getJobName() + config.getIndexName());
        Fetcher.config = config;
        List<String> ips = config.getProxyIps();
        proxyIps = ips;
        if (ips != null && ips.size() < 10) {
            // The element type returned by ProxyIp.fetchProxyIps is not visible here,
            // so the raw type of the original code is kept.
            List fetchIPs = ProxyIp.fetchProxyIps(false);
            if (fetchIPs != null) {
                ips.addAll(fetchIPs);
                slog.info("??Ip" + fetchIPs.size());
            }
        }
    }

    /**
     * Starts the background thread that maintains the proxy pool.
     *
     * <p>While {@link #proxyerisRuning} is {@code true} the thread sleeps five seconds,
     * drops proxies that failed more than {@value #MAX_PROXY_FAILURES} times, refills
     * the pool when it has fewer than {@value #MIN_POOL_SIZE} entries, and stores the
     * pool to disk on every 11th round. The pool is stored once more when the loop
     * ends. An interrupt ends the loop early and leaves the thread's interrupt flag
     * set.
     *
     * @throws Exception declared for backward compatibility; nothing in the body
     *                   throws a checked exception
     */
    public static synchronized void runProxyer() throws Exception {
        Thread t = new Thread(new Runnable() {
            @Override
            public void run() {
                int roundsSinceStore = 0;
                while (proxyerisRuning) {
                    try {
                        TimeUnit.SECONDS.sleep(5);
                    } catch (InterruptedException e) {
                        // Restore the flag and stop polling; re-entering sleep() with the
                        // flag set would throw immediately and spin.
                        Thread.currentThread().interrupt();
                        break;
                    }
                    try {
                        pruneAndRefillProxies();
                    } catch (Exception e) {
                        // Best effort: one failed round must not stop the maintenance thread.
                        slog.error("Proxy pool maintenance round failed", e);
                    }
                    if (roundsSinceStore == 10) {
                        storeProxyIp();
                        roundsSinceStore = 0;
                    } else {
                        roundsSinceStore++;
                    }
                }
                // Final store when the maintenance loop ends.
                storeProxyIp();
            }
        }, "?Ip");
        t.start();
    }

    /**
     * Runs one maintenance round: removes proxies whose failure count exceeds
     * {@value #MAX_PROXY_FAILURES} and, when fewer than {@value #MIN_POOL_SIZE} proxies
     * remain, appends freshly fetched ones and resets all failure counters.
     */
    @SuppressWarnings("unchecked")
    private static void pruneAndRefillProxies() {
        List<String> pool = proxyIps;
        if (pool == null) {
            // No fetcher has published a pool yet; nothing to maintain.
            return;
        }
        for (Map.Entry<String, Integer> entry : m.entrySet()) {
            String ip = entry.getKey();
            Integer failures = entry.getValue();
            if (failures != null && failures.intValue() > MAX_PROXY_FAILURES
                    && StringUtils.isNotBlank(ip)) {
                m.remove(ip);
                pool.remove(ip);
                slog.info("?ip" + ip);
            }
        }
        if (pool.size() < MIN_POOL_SIZE) {
            slog.info("?ip?30??");
            pool.addAll(ProxyIp.fetchProxyIps(false));
            m.clear();
        }
    }

    /**
     * Writes the current proxy pool to the properties file at
     * {@code config.getProxyPath()}, one {@code proxy<index>} key per entry.
     *
     * <p>Does nothing (apart from a warning) when no pool or configuration has been
     * published yet. I/O failures are logged and not propagated.
     */
    private static void storeProxyIp() {
        List<String> pool = proxyIps;
        FetchConfig cfg = config;
        if (pool == null || cfg == null) {
            slog.warn("Proxy pool not stored: no proxy list or configuration available");
            return;
        }
        // Copy first so the indices stay stable if other threads change the list meanwhile.
        String[] snapshot = pool.toArray(new String[0]);
        Properties pro = new Properties();
        for (int i = 0; i < snapshot.length; i++) {
            if (snapshot[i] != null) { // Properties rejects null values
                pro.setProperty("proxy" + i, snapshot[i]);
            }
        }
        File target = new File(cfg.getProxyPath());
        OutputStream out = null;
        try {
            out = new FileOutputStream(target);
            pro.store(out, "?Ip,[" + DateTimeUtil.getDateTime() + "]");
        } catch (FileNotFoundException e) {
            slog.error("?IP????:" + target.toString(), e);
        } catch (IOException e) {
            slog.error("?Ip:", e);
        } finally {
            // out is still null when opening the file failed.
            if (out != null) {
                try {
                    out.close();
                } catch (IOException e) {
                    slog.warn("Failed to close proxy file " + target, e);
                }
            }
        }
    }

    /**
     * Picks a random proxy from the shared pool.
     *
     * @return the proxy as an {@link HttpHost}, or {@code null} when the pool is
     *         missing or empty, or the chosen entry is not a valid {@code host:port}
     *         pair
     */
    public HttpHost getProxyIp() {
        String[] hostPort = parseHostPort(pickRandomEntry(proxyIps));
        if (hostPort == null) {
            return null;
        }
        return new HttpHost(hostPort[0], Integer.parseInt(hostPort[1]));
    }

    /**
     * Fetches a page through a randomly chosen proxy from the shared pool, making up
     * to {@value #MAX_FETCH_ATTEMPTS} attempts with a 30 second timeout each.
     *
     * <p>Every {@link IOException} counts as a failure of the proxy selected for that
     * attempt. When all attempts fail the URL is written to the job's error-URL file
     * via {@code WrongUrlLog}.
     *
     * @param url    page to fetch
     * @param config job configuration supplying the user agent and index name
     * @return the parsed document, or {@code null} when every attempt failed
     */
    public static Document goFetchPage(String url, FetchConfig config) {
        int remaining = MAX_FETCH_ATTEMPTS;
        Document doc = null;
        while (remaining > 0 && doc == null) {
            String proxyIp = setProxy(proxyIps);
            try {
                doc = Jsoup.connect(url).timeout(30000).userAgent(config.getAgent()).get();
            } catch (IOException e) {
                remaining--;
                System.out.println(config.getIndexName() + "\tERROR:?\t" + url + "}\t"
                        + "\tIP\t" + proxyIp + "\t?\t" + (MAX_FETCH_ATTEMPTS - remaining));
                slog.error(config.getIndexName() + "\t?{" + url + "}!" + "\t?ip\t"
                        + proxyIp, e);
                // Null-safe: setProxy returns null for a malformed pool entry.
                addFailedProxy(proxyIp);
            }
        }
        // Still no document here means every attempt failed: record the URL.
        if (doc == null) {
            WrongUrlLog.writeUrl(config.getIndexName() + "_error_url." + DateTimeUtil.getDate() + ".txt",
                    config.getIndexName() + "\t" + url + "\t" + new Date().toString());
        }
        return doc;
    }

    /**
     * Increments the failure counter of a proxy.
     *
     * @param proxyIp proxy entry as returned by {@code setProxy}; {@code null} is
     *                ignored because the default concurrent map rejects null keys
     */
    public static void addFailedProxy(String proxyIp) {
        if (proxyIp == null) {
            return;
        }
        // get-then-put is not atomic on its own; the lock keeps concurrent increments
        // from overwriting each other.
        synchronized (FAILURE_LOCK) {
            Integer previous = m.get(proxyIp);
            m.put(proxyIp, previous == null ? 1 : previous.intValue() + 1);
        }
    }

    /**
     * Fetches a page through a random proxy from the given list, making up to
     * {@value #MAX_FETCH_ATTEMPTS} attempts with a 5 second timeout each and the fixed
     * user agent {@code "Agent"}.
     *
     * @param url      page to fetch
     * @param proxyIps candidate proxies as {@code host:port} strings
     * @return the parsed document, or {@code null} when every attempt failed
     */
    public static Document goFetchPage(String url, List<String> proxyIps) {
        int remaining = MAX_FETCH_ATTEMPTS;
        Document doc = null;
        while (remaining > 0 && doc == null) {
            String proxyIp = setProxy(proxyIps);
            if (proxyIp == null) {
                // First pick was malformed: try once more and keep the result.
                proxyIp = setProxy(proxyIps);
            }
            try {
                doc = Jsoup.connect(url).timeout(5000).userAgent("Agent").get();
            } catch (IOException e) {
                remaining--;
                slog.error("?ip\t" + proxyIp, e);
                addFailedProxy(proxyIp);
            }
        }
        return doc;
    }

    /**
     * Fetches a page once (no retry) through a random proxy from the given list, with
     * a 5 second timeout and the fixed user agent {@code "Agent"}.
     *
     * @param url      page to fetch
     * @param proxyIps candidate proxies as {@code host:port} strings
     * @return the parsed document, or {@code null} when the fetch failed for any reason
     */
    public static Document goFetchPageNoTry(String url, List<String> proxyIps) {
        Document doc = null;
        String proxyIp = setProxy(proxyIps);
        if (proxyIp == null) {
            // First pick was malformed: try once more and keep the result.
            proxyIp = setProxy(proxyIps);
        }
        try {
            doc = Jsoup.connect(url).timeout(5000).userAgent("Agent").get();
        } catch (Exception e) {
            // Deliberately broad: this variant reports any failure as a null result.
            slog.error("Fetch failed for " + url + " via proxy " + proxyIp, e);
        }
        return doc;
    }

    /**
     * Selects a random entry of the given list and installs it as the JVM-wide HTTP
     * proxy through system properties.
     *
     * @param proxyMap candidate proxies as {@code host:port} strings
     * @return the chosen entry; {@code ""} when the list is {@code null} or empty;
     *         {@code null} when the chosen entry is not a valid {@code host:port} pair
     *         (no property is changed in that case)
     */
    public static String setProxy(final List<String> proxyMap) {
        if (null == proxyMap || proxyMap.isEmpty()) {
            return "";
        }
        return applyProxy(pickRandomEntry(proxyMap));
    }

    /**
     * Installs the given {@code host:port} entry as the JVM-wide HTTP proxy through
     * system properties.
     *
     * @param ip_str proxy as a {@code host:port} string
     * @return {@code ip_str} when it was applied; {@code ""} when it is blank;
     *         {@code null} when it is not a valid {@code host:port} pair (no property
     *         is changed in that case)
     */
    public static String setProxy(String ip_str) {
        if (StringUtils.isBlank(ip_str)) {
            return "";
        }
        return applyProxy(ip_str);
    }

    /**
     * Fetches a page without changing the proxy settings, retrying until it succeeds
     * (30 second timeout per attempt, fixed user agent {@code "Agent"}).
     *
     * <p>NOTE(review): the retry loop is unbounded and has no back-off, so a
     * permanently unreachable URL blocks the caller forever. It is left unbounded
     * because callers may rely on the result never being {@code null}.
     *
     * @param url page to fetch
     * @return the parsed document; never {@code null}
     */
    public static Document goFetchPage(String url) {
        Document doc = null;
        while (doc == null) {
            try {
                doc = Jsoup.connect(url).timeout(30000).userAgent("Agent").get();
            } catch (IOException e) {
                slog.error("Fetch failed, retrying: " + url, e);
            }
        }
        return doc;
    }

    /** Returns a random element of the list, or {@code null} when none can be obtained. */
    private static String pickRandomEntry(List<String> pool) {
        if (pool == null) {
            return null;
        }
        int size = pool.size();
        if (size <= 0) {
            return null;
        }
        try {
            return pool.get((int) (Math.random() * size));
        } catch (IndexOutOfBoundsException e) {
            // The maintenance thread can shrink the list between size() and get().
            return null;
        }
    }

    /**
     * Splits a {@code host:port} entry into a trimmed {@code {host, port}} pair, or
     * returns {@code null} when the entry is malformed (wrong shape, blank host,
     * non-numeric port or port outside 1-65535).
     */
    private static String[] parseHostPort(String entry) {
        if (entry == null) {
            return null;
        }
        String[] parts = entry.split(":");
        if (parts.length != 2 || StringUtils.isBlank(parts[0])) {
            return null;
        }
        String host = parts[0].trim();
        String port = parts[1].trim();
        try {
            int portNumber = Integer.parseInt(port);
            if (portNumber < 1 || portNumber > 65535) {
                return null;
            }
        } catch (NumberFormatException e) {
            return null;
        }
        return new String[] { host, port };
    }

    /**
     * Writes the proxy system properties for a valid entry and returns the entry
     * unchanged; returns {@code null} without touching any property otherwise.
     */
    private static String applyProxy(String entry) {
        String[] hostPort = parseHostPort(entry);
        if (hostPort == null) {
            return null;
        }
        System.setProperty("http.maxRedirects", "50");
        System.setProperty("proxySet", "true");
        System.setProperty("http.proxyHost", hostPort[0]);
        System.setProperty("http.proxyPort", hostPort[1]);
        return entry;
    }

    /**
     * Manual smoke test: installs a hard-coded proxy and prints the fetched page.
     *
     * <p>NOTE(review): the host literal below does not look like a valid IPv4 address
     * (last octet is 2040) - confirm the intended proxy before relying on this.
     */
    public static void main(String[] args) throws Exception {
        String url = "http://www.xici.net.co";
        setProxy("60.221.253.2040:8080");
        System.out.println(Jsoup.connect(url).get());
    }

}