// Java tutorial
/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.aliuge.crawler.fetcher; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.util.Date; import java.util.List; import java.util.Map; import java.util.Properties; import java.util.concurrent.TimeUnit; import org.aliuge.crawler.jobconf.FetchConfig; import org.aliuge.crawler.util.DateTimeUtil; import org.aliuge.crawler.util.ProxyIp; import org.aliuge.crawler.util.WrongUrlLog; import org.apache.commons.lang.StringUtils; import org.apache.http.HttpHost; import org.apache.log4j.Logger; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import com.google.common.collect.Maps; public abstract class Fetcher { protected static FetchConfig config; private static Logger slog = Logger.getLogger(Fetcher.class); public Logger log; public static volatile Map<String, Integer> m = Maps.newConcurrentMap(); public static volatile List<String> proxyIps = null; public static volatile boolean proxyerisRuning = true; public Fetcher() { } @SuppressWarnings({ "static-access", "unchecked" }) public Fetcher(FetchConfig config) { log = Logger.getLogger(config.getJobName() + config.getIndexName()); 
this.config = config; List<String> ips = config.getProxyIps(); proxyIps = ips; if (ips != null && ips.size() < 10) { //List fetchIPs = ProxyIp.fetchProxyIps(true); List fetchIPs = ProxyIp.fetchProxyIps(false); proxyIps.addAll(fetchIPs); slog.info("??Ip" + fetchIPs.size() + ""); } // proxyIps = proxyIps.addAll(config.) } public static synchronized void runProxyer() throws Exception { // ?ip? Thread t = new Thread(new Runnable() { @SuppressWarnings("null") @Override public void run() { int count = 0; @SuppressWarnings("unused") boolean flag_fetchProxyIps = false; while (proxyerisRuning) { try { TimeUnit.SECONDS.sleep(5); } catch (InterruptedException e) { e.printStackTrace(); } try { @SuppressWarnings("unused") Iterable<String> it = null; for (String ip : it = m.keySet()) { int t = m.get(ip); if (t > 10 && StringUtils.isNotBlank(ip)) { m.remove(ip); proxyIps.remove(ip); slog.info("?ip" + ip); } } if (proxyIps.size() < 30) { slog.info("?ip?30??"); //proxyIps.addAll(ProxyIp.fetchProxyIps(true)); proxyIps.addAll(ProxyIp.fetchProxyIps(false)); m.clear(); } } catch (Exception e) { e.printStackTrace(); } // 10???ip if (count == 10) { storeProxyIp(); count = 0; } else { count++; } } // ? ??ip storeProxyIp(); } }, "?Ip"); t.start(); } private static void storeProxyIp() { // ? 
???ip Properties pro = new Properties(); for (int i = 0; i < proxyIps.size(); i++) { pro.setProperty("proxy" + i, proxyIps.get(i)); } OutputStream out = null; try { out = new FileOutputStream(new File(config.getProxyPath())); pro.store(out, "?Ip,[" + DateTimeUtil.getDateTime() + "]"); } catch (FileNotFoundException e) { slog.error("?IP????:" + new File(config.getProxyPath()).toString(), e); e.printStackTrace(); } catch (IOException e) { slog.error("?Ip:", e); e.printStackTrace(); } finally { try { out.close(); } catch (IOException e) { } } } public HttpHost getProxyIp() { if (proxyIps != null) { int size = proxyIps.size(); if (size <= 0) return null; int posi = (int) (Math.random() * size); String ip = proxyIps.get(posi); String[] ip_port = ip.split(":"); if (ip_port.length != 2) return null; return new HttpHost(ip_port[0], Integer.valueOf(ip_port[1])); } return null; } /* * ??document */ public static Document goFetchPage(String url, FetchConfig config) { int count = 5; Document doc = null; while (count > 0 && doc == null) { // String proxyIp = setProxy(config.getProxyMap()); String proxyIp = setProxy(proxyIps); try { doc = Jsoup.connect(url).timeout(30000).userAgent(config.getAgent()).get(); } catch (IOException e) { count--; System.out.println(config.getIndexName() + "\tERROR:?\t" + url + "}\t" + "\tIP\t" + proxyIp + "\t?\t" + (5 - count)); slog.error(config.getIndexName() + "\t?{" + url + "}!" + "\t?ip\t" + proxyIp); m.put(proxyIp, m.get(proxyIp) == null ? 1 : m.get(proxyIp) + 1); } } // 3??? if (count <= 0 && doc == null) { WrongUrlLog.writeUrl(config.getIndexName() + "_error_url." + DateTimeUtil.getDate() + ".txt", config.getIndexName() + "\t" + url + "\t" + new Date().toString()); } return doc; } public static void addFailedProxy(String proxyIp) { m.put(proxyIp, m.get(proxyIp) == null ? 
1 : m.get(proxyIp) + 1); } /* * ??document */ public static Document goFetchPage(String url, List<String> proxyIps) { int count = 5; Document doc = null; while (count > 0 && doc == null) { String proxyIp = setProxy(proxyIps); if (proxyIp == null) setProxy(proxyIps); try { doc = Jsoup.connect(url).timeout(5000).userAgent("Agent").get(); } catch (IOException e) { count--; slog.error("?ip\t" + proxyIp); m.put(proxyIp, m.get(proxyIp) == null ? 1 : m.get(proxyIp) + 1); } } return doc; } /* * ??document */ public static Document goFetchPageNoTry(String url, List<String> proxyIps) { Document doc = null; String proxyIp = setProxy(proxyIps); if (proxyIp == null) setProxy(proxyIps); try { doc = Jsoup.connect(url).timeout(5000).userAgent("Agent").get(); } catch (Exception e) { e.printStackTrace(); } return doc; } /** * ??iP * * @param proxyMap * @return */ public static String setProxy(final List<String> proxyMap) { if (null == proxyMap || proxyMap.isEmpty()) return ""; int i = (int) (Math.random() * proxyMap.size()); String ip_text = proxyMap.get(i); String[] ip_port = ip_text.split(":"); if (ip_port.length != 2) return null; String ip = ip_port[0]; String port = ip_port[1]; System.setProperty("http.maxRedirects", "50"); System.getProperties().setProperty("proxySet", "true"); System.setProperty("http.proxyHost", ip); System.setProperty("http.proxyPort", port); return ip_text; } /** * ??iP * * @param proxyMap * @return */ public static String setProxy(String ip_str) { if (!StringUtils.isNotBlank(ip_str)) return ""; String[] ip_port = ip_str.split(":"); if (ip_port.length != 2) return null; String ip = ip_port[0]; String port = ip_port[1]; System.setProperty("http.maxRedirects", "50"); System.getProperties().setProperty("proxySet", "true"); System.setProperty("http.proxyHost", ip); System.setProperty("http.proxyPort", port); return ip_str; } /* * ??document ?ip */ public static Document goFetchPage(String url) { // setProxy(proxyIps); Document doc = null; while (doc == null) { 
try { doc = Jsoup.connect(url).timeout(30000).userAgent("Agent").get(); } catch (IOException e) { e.printStackTrace(); } } return doc; } public static void main(String[] args) throws Exception { String url = "http://www.xici.net.co"; setProxy("60.221.253.2040:8080"); System.out.println(Jsoup.connect(url).get()); ; } }