Java tutorial
/* * Copyright 2011-2016 MSUN.com All right reserved. This software is the confidential and proprietary information of * MSUN.com ("Confidential Information"). You shall not disclose such Confidential Information and shall use it only in * accordance with the terms of the license agreement you entered into with MSUN.com. */ package com.mmj.app.common.util; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang.StringUtils; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.HttpStatus; import org.apache.http.client.HttpClient; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.util.EntityUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * @author luckscript Nov 30, 2014 11:40:40 PM */ public class SpiderHtmlUtils { private static Logger logger = LoggerFactory.getLogger(SpiderHtmlUtils.class); /** * ?URLhtml? * * @param url * @return */ public static String getHtmlByUrl(String url) { if (StringUtils.isEmpty(url)) { return null; } String html = null; HttpClient httpClient = new DefaultHttpClient();// httpClient HttpUriRequest httpget = new HttpGet(url);// get?URL httpget.setHeader("Connection", "keep-alive"); httpget.setHeader("Referer", "http://www.baidu.com"); httpget.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); httpget.setHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36"); try { HttpResponse responce = httpClient.execute(httpget);// responce int responseCode = responce.getStatusLine().getStatusCode();// ? if (responseCode == HttpStatus.SC_OK || responseCode == HttpStatus.SC_MOVED_PERMANENTLY || responseCode == HttpStatus.SC_MOVED_TEMPORARILY) {// 200 ? // HttpEntity entity = responce.getEntity(); if (entity != null) { html = EntityUtils.toString((org.apache.http.HttpEntity) entity);// html?? } } } catch (Exception e) { logger.error("SpiderHtmlUtils:getHtmlByUrl sprider url={} error!!!", url); } finally { httpClient.getConnectionManager().shutdown(); } return html; } public static String fetchTitleHtml(String html) { return parserHtml(html, "head>title"); } public static String fetchDescriptionHtml(String html) { if (StringUtils.isNotEmpty(html)) { Document doc = Jsoup.parse(html); Elements linksElements = doc.select("head>meta[name=Description]"); if (linksElements == null || linksElements.isEmpty()) { return null; } for (Element ele : linksElements) { String contextS = ele.attr("content"); return contextS; } } return null; } public static String parserHtml(String html, String select) { if (StringUtils.isNotEmpty(html)) { Document doc = Jsoup.parse(html); Elements linksElements = doc.select(select); if (linksElements == null || linksElements.isEmpty()) { return null; } for (Element ele : linksElements) { return ele.text(); } } return null; } public static void main(String[] args) { Pattern pattern = Pattern.compile("[http|https]+[://]+[0-9A-Za-z:/[-]_#[?][=][.][&]]*"); Matcher matcher = pattern.matcher("http://haitao.smzdm.com/youhui/307563"); System.out.println(matcher.matches()); } }