Java tutorial
/** * ########################## GoodCrawler ############################ * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.sbs.htmlunit; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.util.Date; import java.util.List; import org.apache.commons.logging.LogFactory; import org.junit.Test; import org.sbs.goodcrawler.fetcher.FetcherInstance; import org.sbs.goodcrawler.fetcher.PageFetcher; import org.sbs.goodcrawler.fetcher.ResynchronizingAjaxController; import org.sbs.goodcrawler.page.PageFetchResult; import org.sbs.url.WebURL; import com.gargoylesoftware.htmlunit.BrowserVersion; import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException; import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController; import com.gargoylesoftware.htmlunit.Page; import com.gargoylesoftware.htmlunit.ScriptResult; import com.gargoylesoftware.htmlunit.WebClient; import com.gargoylesoftware.htmlunit.html.HtmlAnchor; import com.gargoylesoftware.htmlunit.html.HtmlForm; import com.gargoylesoftware.htmlunit.html.HtmlInput; import com.gargoylesoftware.htmlunit.html.HtmlPage; import com.gargoylesoftware.htmlunit.html.HtmlSubmitInput; import com.gargoylesoftware.htmlunit.html.HtmlTextInput; /** * @author shenbaise(shenbaise@outlook.com) * @desc htmlunit */ public class HtmlUnitTest { public static void main(String[] args) { try { // testCrawler(); // baiduTest(); // testGoogle(); // execJavaScript(); // get100Times_htmlunit(); // get100Times_httpclient(); testYouku(); } catch (Exception e) { e.printStackTrace(); } } /** * form?? */ public static void baiduTest() { final WebClient webClient = new WebClient(); webClient.getOptions().setJavaScriptEnabled(true); webClient.getOptions().setThrowExceptionOnFailingStatusCode(true); webClient.getOptions().setThrowExceptionOnScriptError(true); // webClient.getOptions().getProxyConfig().set...(); HtmlPage page; try { page = webClient.getPage("http://www.baidu.com"); final HtmlForm form = page.getFormByName("f"); HtmlInput input = form.getInputByName("wd"); input.setValueAttribute(""); HtmlSubmitInput button = page.getHtmlElementById("su"); Page p = button.click(); // HtmlPage p = button.click(); System.out.println(p.getWebResponse().getContentAsString()); webClient.closeAllWindows(); } catch (FailingHttpStatusCodeException e) { e.printStackTrace(); } catch (MalformedURLException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } /** * google form?? * * @throws FailingHttpStatusCodeException * @throws IOException */ public static void testGoogle() throws FailingHttpStatusCodeException, IOException { WebClient client = new WebClient(); URL url = new URL("http://www.google.com.hk"); HtmlPage page = (HtmlPage) client.getPage(url); HtmlForm form = page.getFormByName("f"); HtmlTextInput input = (HtmlTextInput) form.getInputByName("q"); input.setValueAttribute("james"); HtmlSubmitInput ok = (HtmlSubmitInput) form.getInputByName("btnG"); HtmlPage p = ok.click(); System.out.println(p.asText()); } public void testHomePage() throws FailingHttpStatusCodeException, MalformedURLException, IOException { final WebClient webClient = new WebClient(); final HtmlPage startPage = webClient.getPage("http://www.baidu.com"); System.out.println(":" + startPage.getTitleText()); } /** * js? * * @throws Exception */ public static void execJavaScript() throws Exception { String url = "http://person.sac.net.cn/pages/registration/sac-publicity.html"; // String url="http://www.baidu.com"; // ? final WebClient webClient = new WebClient(BrowserVersion.FIREFOX_17); // final WebClient webClient=new // WebClient(BrowserVersion.FIREFOX_10,"http://myproxyserver",8000); // //? // final WebClient webClient2=new // WebClient(BrowserVersion.INTERNET_EXPLORER_10); // webClient? webClient.getOptions().setJavaScriptEnabled(true); webClient.getOptions().setActiveXNative(false); webClient.getOptions().setCssEnabled(false); webClient.getOptions().setThrowExceptionOnScriptError(false); webClient.setAjaxController(new NicelyResynchronizingAjaxController()); // ?? final HtmlPage page = webClient.getPage(url); System.out.println("page.asText=" + page.asText()); System.out.println("page.getUrl=" + page.getUrl()); // page.executeJavaScript("javascript:searchFinishPerson('6655',2);"); System.out.println("------------------"); ScriptResult sr = page.executeJavaScript("javascript:searchFinishPerson('6655',2);"); HtmlPage newPage = (HtmlPage) sr.getNewPage(); System.out.println("new page.asText=" + newPage.asText()); System.out.println("new page.getUrl=" + newPage.getUrl()); } public static void testYouku() throws Exception { String url = "http://v.youku.com/v_show/id_XNDc2MDkzMTIw.html"; String xurl = "http://v.youku.com/v_vpofficiallistv5/id_119023280_showid_271942_page_2?__rt=1&__ro=listitem_page2"; // String a = "<a page=\"2\">178-101</a>"; // String url="http://www.baidu.com"; // ? final WebClient webClient = new WebClient(BrowserVersion.FIREFOX_17); LogFactory.getFactory().setAttribute("org.apache.commons.logging.Log", "org.apache.commons.logging.impl.NoOpLog"); java.util.logging.Logger.getLogger("net.sourceforge.htmlunit").setLevel(java.util.logging.Level.OFF); webClient.getOptions().setThrowExceptionOnScriptError(false); webClient.getOptions().setThrowExceptionOnFailingStatusCode(false); // final WebClient webClient=new // WebClient(BrowserVersion.FIREFOX_10,"http://myproxyserver",8000); // //? // final WebClient webClient2=new // WebClient(BrowserVersion.INTERNET_EXPLORER_10); // webClient? webClient.getOptions().setJavaScriptEnabled(true); webClient.getOptions().setActiveXNative(false); webClient.getOptions().setCssEnabled(false); webClient.getOptions().setThrowExceptionOnScriptError(false); webClient.waitForBackgroundJavaScript(600 * 1000); webClient.setAjaxController(new NicelyResynchronizingAjaxController()); webClient.getOptions().setJavaScriptEnabled(true); /* webClient.setJavaScriptTimeout(3600*1000); webClient.getOptions().setRedirectEnabled(true); webClient.getOptions().setThrowExceptionOnScriptError(true); webClient.getOptions().setThrowExceptionOnFailingStatusCode(true); webClient.getOptions().setTimeout(3600*1000); webClient.waitForBackgroundJavaScript(600*1000); */ // webClient.waitForBackgroundJavaScript(600*1000); webClient.setAjaxController(new NicelyResynchronizingAjaxController()); // ?? final HtmlPage page = webClient.getPage(url); // getPage()?? webClient.waitForBackgroundJavaScript(1000 * 3); webClient.setJavaScriptTimeout(0); // Thread.sleep(1000 *3L); // String js = "javascript:checkShowFollow('271942','2');"; // ScriptResult sr = page.executeJavaScript(js); // HtmlPage newPage = (HtmlPage) sr.getNewPage(); // System.out.println("new page.asText=" + newPage.asText()); // System.out.println("page.asText=" + page.asText()); // System.out.println("page.getUrl=" + page.getUrl()); List links = (List) page.getByXPath("//*[@id=\"groups_tab\"]/div[1]/ul/li[1]/a"); HtmlAnchor anchor = page.getFirstByXPath("//*[@id=\"rkLst0\"]/table/tbody/tr[1]/td[2]/a"); System.out.println(anchor.getAttribute("href")); // page.getHtmlElementById(id) if (null != links) { System.out.println(links.size()); HtmlAnchor link = (HtmlAnchor) links.get(0); System.out.println(link.asXml()); HtmlPage p = link.click(); webClient.waitForBackgroundJavaScript(1000 * 3L); // webClient.waitForBackgroundJavaScriptStartingBefore(1000L); // Thread.sleep(3000L); System.out.println(p.asText()); } } @Test public void testYouku_n_Thread() throws Exception { new Thread(new Runnable() { @Override public void run() { final String url = "http://v.youku.com/v_show/id_XNDc2MDkzMTIw.html"; String xurl = "http://v.youku.com/v_vpofficiallistv5/id_119023280_showid_271942_page_2?__rt=1&__ro=listitem_page2"; // ? final WebClient webClient = new WebClient(BrowserVersion.FIREFOX_17); LogFactory.getFactory().setAttribute("org.apache.commons.logging.Log", "org.apache.commons.logging.impl.NoOpLog"); java.util.logging.Logger.getLogger("net.sourceforge.htmlunit") .setLevel(java.util.logging.Level.OFF); webClient.getOptions().setThrowExceptionOnScriptError(false); webClient.getOptions().setThrowExceptionOnFailingStatusCode(false); webClient.getOptions().setJavaScriptEnabled(true); webClient.getOptions().setActiveXNative(false); webClient.getOptions().setCssEnabled(false); webClient.getOptions().setThrowExceptionOnScriptError(false); webClient.setJavaScriptTimeout(0); // ? webClient.waitForBackgroundJavaScript(3 * 1000); final ResynchronizingAjaxController ac = new ResynchronizingAjaxController(); webClient.setAjaxController(ac); try { HtmlPage page = webClient.getPage(url); webClient.setJavaScriptTimeout(0); webClient.waitForBackgroundJavaScript(3 * 1000); List links = (List) page.getByXPath("//*[@id=\"groups_tab\"]/div[1]/ul/li[1]/a"); if (null != links) { System.out.println(links.size()); HtmlAnchor link = (HtmlAnchor) links.get(0); System.out.println(link.asXml()); System.err.println("now1 = " + (new Date().toLocaleString())); link.click(); // webClient.waitForBackgroundJavaScript(3*1000); System.out.println(" @@"); System.out.println(" t1 ### " + ac.getResynchronizedCallUlr(10000).toString()); } } catch (FailingHttpStatusCodeException e) { } catch (MalformedURLException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } }, "t1").start(); new Thread(new Runnable() { @Override public void run() { final String url = "http://v.youku.com/v_show/id_XNDc2MDkzMTIw.html"; String xurl = "http://v.youku.com/v_vpofficiallistv5/id_119023280_showid_271942_page_2?__rt=1&__ro=listitem_page2"; // ? final WebClient webClient = new WebClient(BrowserVersion.FIREFOX_17); LogFactory.getFactory().setAttribute("org.apache.commons.logging.Log", "org.apache.commons.logging.impl.NoOpLog"); java.util.logging.Logger.getLogger("net.sourceforge.htmlunit") .setLevel(java.util.logging.Level.OFF); webClient.getOptions().setThrowExceptionOnScriptError(false); webClient.getOptions().setThrowExceptionOnFailingStatusCode(false); webClient.getOptions().setJavaScriptEnabled(true); webClient.getOptions().setActiveXNative(false); webClient.getOptions().setCssEnabled(false); webClient.getOptions().setThrowExceptionOnScriptError(false); webClient.setJavaScriptTimeout(0); webClient.waitForBackgroundJavaScript(3 * 1000); final ResynchronizingAjaxController ac = new ResynchronizingAjaxController(); webClient.setAjaxController(ac); try { HtmlPage page = webClient.getPage(url); webClient.setJavaScriptTimeout(0); webClient.waitForBackgroundJavaScript(3 * 1000); List links = (List) page.getByXPath("//*[@id=\"groups_tab\"]/div[1]/ul/li[1]/a"); if (null != links) { System.out.println(links.size()); HtmlAnchor link = (HtmlAnchor) links.get(0); System.out.println(link.asXml()); System.err.println("now2 = " + (new Date().toLocaleString())); Page p = link.click(); System.out.println(" t2 ### " + ac.getResynchronizedCallUlr().toString()); // System.out.println(p.getUrl().toString()); } } catch (FailingHttpStatusCodeException e) { } catch (MalformedURLException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } }, "t2").start(); } public static void get100Times_htmlunit() { long c = System.currentTimeMillis(); String url = "http://person.sac.net.cn/pages/registration/sac-publicity.html"; // String url="http://www.baidu.com/"; // ? final WebClient webClient = new WebClient(BrowserVersion.FIREFOX_17); // final WebClient webClient=new // WebClient(BrowserVersion.FIREFOX_10,"http://myproxyserver",8000); // //? // final WebClient webClient2=new // WebClient(BrowserVersion.INTERNET_EXPLORER_10); // webClient? webClient.getOptions().setJavaScriptEnabled(true); webClient.getOptions().setActiveXNative(false); webClient.getOptions().setCssEnabled(false); webClient.getOptions().setThrowExceptionOnScriptError(false); webClient.setAjaxController(new NicelyResynchronizingAjaxController()); System.out.println("init costs = " + (System.currentTimeMillis() - c)); c = System.currentTimeMillis(); // ?? try { for (int i = 0; i < 100; i++) { final HtmlPage page = webClient.getPage(url); System.out.println(page.getTitleText()); } } catch (FailingHttpStatusCodeException e) { e.printStackTrace(); } catch (MalformedURLException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } System.out.println("get costs = " + (System.currentTimeMillis() - c)); // about 243643 ms | 113722 ms } public static void get100Times_httpclient() { long c = System.currentTimeMillis(); // String // url="http://person.sac.net.cn/pages/registration/sac-publicity.html"; String url = "http://v.youku.com/v_show/id_XNDc2MDkzMTIw.html"; PageFetcher fetcher = FetcherInstance.getFetcher(); System.out.println("init costs = " + (System.currentTimeMillis() - c)); c = System.currentTimeMillis(); WebURL webUrl = new WebURL(); webUrl.setURL(url); for (int i = 0; i < 1; i++) { PageFetchResult p = fetcher.fetchHeader(webUrl); try { // Jsoup.parse(p.getEntity().getContent(), "gb2312", // "http://person.sac.net.cn"); System.out.println(new String(p.getEntity().getContent().toString())); } catch (IllegalStateException | IOException e) { e.printStackTrace(); } } System.out.println("get costs = " + (System.currentTimeMillis() - c)); // about 22018 ms || 36952 ms } }