Java tutorial
/* * Copyright 2009-2013 Scale Unlimited * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package bixo.fetcher; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import java.io.IOException; import java.util.HashSet; import java.util.Set; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import junit.framework.Assert; import org.apache.http.HttpStatus; import org.apache.http.conn.HttpHostConnectException; import org.eclipse.jetty.http.HttpException; import org.eclipse.jetty.server.Connector; import org.eclipse.jetty.server.Request; import org.eclipse.jetty.server.Response; import org.eclipse.jetty.server.Server; import org.eclipse.jetty.server.bio.SocketConnector; import org.eclipse.jetty.server.handler.AbstractHandler; import org.junit.Test; import bixo.config.FetcherPolicy; import bixo.config.FetcherPolicy.RedirectMode; import bixo.datum.FetchedDatum; import bixo.datum.ScoredUrlDatum; import bixo.exceptions.AbortedFetchException; import bixo.exceptions.AbortedFetchReason; import bixo.exceptions.BaseFetchException; import bixo.exceptions.IOFetchException; import bixo.exceptions.RedirectFetchException; import bixo.exceptions.RedirectFetchException.RedirectExceptionReason; import bixo.fetcher.simulation.SimulationWebServer; import bixo.utils.ConfigUtils; public class SimpleHttpFetcherTest extends SimulationWebServer { private class RedirectResponseHandler extends AbstractHandler { private boolean _permanent; public RedirectResponseHandler() { this(false); } public RedirectResponseHandler(boolean permanent) { super(); _permanent = permanent; } @Override public void handle(String pathInContext, Request baseRequest, HttpServletRequest servletRequest, HttpServletResponse response) throws HttpException, IOException { if (pathInContext.endsWith("base")) { if (_permanent) { if (response instanceof Response) { Response jettyResponse = (Response) response; // Can't use sendRedirect, as that forces it to be a temp redirect. jettyResponse.setStatus(HttpStatus.SC_MOVED_PERMANENTLY); jettyResponse.setHeader("Location", "http://localhost:8089/redirect"); if (servletRequest instanceof Request) { Request request = (Request) servletRequest; request.setHandled(true); } } } else { response.sendRedirect("http://localhost:8089/redirect"); } } else { response.setStatus(HttpStatus.SC_OK); response.setContentType("text/plain"); String content = "redirected"; response.setContentLength(content.length()); response.getOutputStream().write(content.getBytes()); } } } private class LanguageResponseHandler extends AbstractHandler { private String _englishContent; private String _foreignContent; public LanguageResponseHandler(String englishContent, String foreignContent) { _englishContent = englishContent; _foreignContent = foreignContent; } @Override public void handle(String pathInContext, Request baseRequest, HttpServletRequest request, HttpServletResponse response) throws HttpException, IOException { String language = request.getHeader(HttpHeaderNames.ACCEPT_LANGUAGE); String content; if ((language != null) && (language.contains("en"))) { content = _englishContent; } else { content = _foreignContent; } response.setStatus(HttpStatus.SC_OK); response.setContentType("text/plain"); response.setContentLength(content.length()); response.getOutputStream().write(content.getBytes()); } } private class MimeTypeResponseHandler extends AbstractHandler { private String _mimeType; public MimeTypeResponseHandler(String mimeType) { _mimeType = mimeType; } @Override public void handle(String pathInContext, Request baseRequest, HttpServletRequest request, HttpServletResponse response) throws HttpException, IOException { String content = "test"; response.setStatus(HttpStatus.SC_OK); if (_mimeType != null) { response.setContentType(_mimeType); } response.setContentLength(content.length()); response.getOutputStream().write(content.getBytes()); } } @Test public final void testConnectionTimeout() throws Exception { Server server = startServer(new ResourcesResponseHandler(), 8089); BaseFetcher fetcher = new SimpleHttpFetcher(1, ConfigUtils.BIXO_TEST_AGENT); String url = "http://localhost:8088/simple-page.html"; try { fetcher.get(new ScoredUrlDatum(url)); fail("Exception not thrown"); } catch (IOFetchException e) { assertTrue(e.getCause() instanceof HttpHostConnectException); } finally { server.stop(); } } @Test public final void testStaleConnection() throws Exception { Server server = startServer(new ResourcesResponseHandler(), 8089); Connector[] connectors = server.getConnectors(); for (Connector connector : connectors) { if (connector instanceof SocketConnector) { SocketConnector sConnector = (SocketConnector) connector; sConnector.setSoLingerTime(-1); } } BaseFetcher fetcher = new SimpleHttpFetcher(1, ConfigUtils.BIXO_TEST_AGENT); String url = "http://localhost:8089/simple-page.html"; fetcher.get(new ScoredUrlDatum(url)); // TODO KKr - control keep-alive (linger?) value for Jetty, so we can set it // to something short and thus make this sleep delay much shorter. Thread.sleep(2000); fetcher.get(new ScoredUrlDatum(url)); server.stop(); } @Test public final void testSlowServerTermination() throws Exception { // Need to read in more than 2 8K blocks currently, due to how // HttpClientFetcher // is designed...so use 20K bytes. And the duration is 2 seconds, so 10K // bytes/sec. Server server = startServer(new RandomResponseHandler(20000, 2 * 1000L), 8089); // Set up for a minimum response rate of 20000 bytes/second. FetcherPolicy policy = new FetcherPolicy(); policy.setMinResponseRate(20000); BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT); String url = "http://localhost:8089/test.html"; try { fetcher.get(new ScoredUrlDatum(url)); fail("Aborted fetch exception not thrown"); } catch (AbortedFetchException e) { assertEquals(AbortedFetchReason.SLOW_RESPONSE_RATE, e.getAbortReason()); } server.stop(); } @Test public final void testInterruptedFetch() throws Exception { // Need to read in lots of data that we get very slowly Server server = startServer(new RandomResponseHandler(20000, 2 * 1000L), 8089); // Set no response rate, so that doesn't trigger an exception FetcherPolicy policy = new FetcherPolicy(); policy.setMinResponseRate(FetcherPolicy.NO_MIN_RESPONSE_RATE); final BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT); final String[] failMsg = new String[1]; Thread t = new Thread(new Runnable() { @Override public void run() { String url = "http://localhost:8089/test.html"; try { fetcher.get(new ScoredUrlDatum(url)); failMsg[0] = "No exception thrown, should have thrown an aborted by interrupt exception"; } catch (AbortedFetchException e) { if (e.getAbortReason() != AbortedFetchReason.INTERRUPTED) { failMsg[0] = "Wrong abort exception thrown, should have thrown an aborted by interrupt exception"; } } catch (BaseFetchException e) { failMsg[0] = "Wrong exception thrown, should have thrown an aborted by interrupt exception"; } } }); t.start(); t.interrupt(); while (t.isAlive()) { Thread.sleep(100); } server.stop(); if (failMsg[0] != null) { fail(failMsg[0]); } } @Test public final void testNotTerminatingSlowServers() throws Exception { // Return 1K bytes at 2K bytes/second - would normally trigger an // error. Server server = startServer(new RandomResponseHandler(1000, 500), 8089); // Set up for no minimum response rate. FetcherPolicy policy = new FetcherPolicy(); policy.setMinResponseRate(FetcherPolicy.NO_MIN_RESPONSE_RATE); BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT); String url = "http://localhost:8089/test.html"; fetcher.get(new ScoredUrlDatum(url)); server.stop(); } @Test public final void testLargeContent() throws Exception { FetcherPolicy policy = new FetcherPolicy(); Server server = startServer(new RandomResponseHandler(policy.getMaxContentSize() * 2), 8089); BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT); String url = "http://localhost:8089/test.html"; FetchedDatum result = fetcher.get(new ScoredUrlDatum(url)); server.stop(); assertTrue("Content size should be truncated", result.getContentLength() <= policy.getMaxContentSize()); } @Test public final void testTruncationWithKeepAlive() throws Exception { Server server = startServer(new ResourcesResponseHandler(), 8089); FetcherPolicy policy = new FetcherPolicy(); BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT); fetcher.setDefaultMaxContentSize(1000); fetcher.setMaxContentSize("image/png", 5000); ScoredUrlDatum datumToFetch = new ScoredUrlDatum("http://localhost:8089/karlie.html"); FetchedDatum result1 = fetcher.get(datumToFetch); FetchedDatum result2 = fetcher.get(datumToFetch); // Verify that we got the same data from each fetch request. assertEquals(1000, result1.getContentLength()); assertEquals(1000, result2.getContentLength()); byte[] bytes1 = result1.getContentBytes(); byte[] bytes2 = result2.getContentBytes(); for (int i = 0; i < bytes1.length; i++) { assertEquals(bytes1[i], bytes2[i]); } datumToFetch = new ScoredUrlDatum("http://localhost:8089/bixolabs_mining.png"); FetchedDatum result3 = fetcher.get(datumToFetch); assertTrue(result3.getContentLength() > 1000); fetcher.setMaxContentSize("image/png", 1500); try { fetcher.get(datumToFetch); fail("Aborted fetch exception not thrown"); } catch (AbortedFetchException e) { Assert.assertEquals(AbortedFetchReason.CONTENT_SIZE, e.getAbortReason()); } server.stop(); } @Test public final void testLargeHtml() throws Exception { FetcherPolicy policy = new FetcherPolicy(); Server server = startServer(new ResourcesResponseHandler(), 8089); BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT); String url = "http://localhost:8089/karlie.html"; FetchedDatum result = fetcher.get(new ScoredUrlDatum(url)); server.stop(); assertTrue("Content size should be truncated", result.getContentLength() <= policy.getMaxContentSize()); } @Test public final void testContentTypeHeader() throws Exception { FetcherPolicy policy = new FetcherPolicy(); Server server = startServer(new ResourcesResponseHandler(), 8089); BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT); String url = "http://localhost:8089/simple-page.html"; FetchedDatum result = fetcher.get(new ScoredUrlDatum(url)); server.stop(); String contentType = result.getHeaders().getFirst(HttpHeaderNames.CONTENT_TYPE); assertNotNull(contentType); assertEquals("text/html", contentType); } @Test public final void testTempRedirectHandling() throws Exception { FetcherPolicy policy = new FetcherPolicy(); Server server = startServer(new RedirectResponseHandler(), 8089); BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT); String url = "http://localhost:8089/base"; FetchedDatum result = fetcher.get(new ScoredUrlDatum(url)); server.stop(); assertEquals("Redirected URL", "http://localhost:8089/redirect", result.getFetchedUrl()); assertNull(result.getNewBaseUrl()); assertEquals(1, result.getNumRedirects()); } @Test public final void testPermRedirectHandling() throws Exception { FetcherPolicy policy = new FetcherPolicy(); Server server = startServer(new RedirectResponseHandler(true), 8089); BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT); String url = "http://localhost:8089/base"; ScoredUrlDatum scoredUrl = new ScoredUrlDatum(url); scoredUrl.setPayloadValue("payload-field-1", 1); FetchedDatum result = fetcher.get(scoredUrl); server.stop(); assertEquals("Redirected URL", "http://localhost:8089/redirect", result.getFetchedUrl()); assertEquals("New base URL", "http://localhost:8089/redirect", result.getNewBaseUrl()); assertEquals(1, result.getNumRedirects()); assertEquals(1, result.getPayloadValue("payload-field-1")); } @Test public final void testRedirectPolicy() throws Exception { FetcherPolicy policy = new FetcherPolicy(); policy.setRedirectMode(RedirectMode.FOLLOW_TEMP); Server server = startServer(new RedirectResponseHandler(true), 8089); BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT); String url = "http://localhost:8089/base"; try { fetcher.get(new ScoredUrlDatum(url)); fail("Exception should have been thrown"); } catch (RedirectFetchException e) { assertEquals("Redirected URL", "http://localhost:8089/redirect", e.getRedirectedUrl()); assertEquals(RedirectExceptionReason.PERM_REDIRECT_DISALLOWED, e.getReason()); } finally { server.stop(); } // Now try setting the mode to follow none policy.setRedirectMode(RedirectMode.FOLLOW_NONE); server = startServer(new RedirectResponseHandler(false), 8089); fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT); try { fetcher.get(new ScoredUrlDatum(url)); fail("Exception should have been thrown"); } catch (RedirectFetchException e) { assertEquals("Redirected URL", "http://localhost:8089/redirect", e.getRedirectedUrl()); assertEquals(RedirectExceptionReason.TEMP_REDIRECT_DISALLOWED, e.getReason()); } finally { server.stop(); } } @Test public final void testAcceptLanguage() throws Exception { final String englishContent = "English"; final String foreignContent = "Foreign"; FetcherPolicy policy = new FetcherPolicy(); Server server = startServer(new LanguageResponseHandler(englishContent, foreignContent), 8089); BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT); String url = "http://localhost:8089/"; FetchedDatum result = fetcher.get(new ScoredUrlDatum(url)); server.stop(); String contentStr = new String(result.getContentBytes(), 0, result.getContentLength()); assertTrue(englishContent.equals(contentStr)); } @Test public final void testMimeTypeFiltering() throws Exception { FetcherPolicy policy = new FetcherPolicy(); Set<String> validMimeTypes = new HashSet<String>(); validMimeTypes.add("text/html"); policy.setValidMimeTypes(validMimeTypes); Server server = startServer(new MimeTypeResponseHandler("text/xml"), 8089); BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT); String url = "http://localhost:8089/"; try { fetcher.get(new ScoredUrlDatum(url)); fail("Fetch should have failed"); } catch (AbortedFetchException e) { assertEquals(AbortedFetchReason.INVALID_MIMETYPE, e.getAbortReason()); } finally { server.stop(); } } @Test public final void testMimeTypeFilteringNoContentType() throws Exception { FetcherPolicy policy = new FetcherPolicy(); Set<String> validMimeTypes = new HashSet<String>(); validMimeTypes.add("text/html"); validMimeTypes.add(""); // We want unknown (not reported) mime-types too. policy.setValidMimeTypes(validMimeTypes); Server server = startServer(new MimeTypeResponseHandler(null), 8089); BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT); String url = "http://localhost:8089/"; try { fetcher.get(new ScoredUrlDatum(url)); } catch (AbortedFetchException e) { fail("Fetch should not have failed if no mime-type is specified"); } finally { server.stop(); } } @Test public final void testMimeTypeFilteringWithCharset() throws Exception { FetcherPolicy policy = new FetcherPolicy(); Set<String> validMimeTypes = new HashSet<String>(); validMimeTypes.add("text/html"); policy.setValidMimeTypes(validMimeTypes); Server server = startServer(new MimeTypeResponseHandler("text/html; charset=UTF-8"), 8089); BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT); String url = "http://localhost:8089/"; try { fetcher.get(new ScoredUrlDatum(url)); } catch (AbortedFetchException e) { fail("Fetch should have worked"); } finally { server.stop(); } } @Test public final void testHostAddress() throws Exception { FetcherPolicy policy = new FetcherPolicy(); Server server = startServer(new ResourcesResponseHandler(), 8089); BaseFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT); String url = "http://localhost:8089/simple-page.html"; FetchedDatum result = fetcher.get(new ScoredUrlDatum(url)); server.stop(); String hostAddress = result.getHostAddress(); assertNotNull(hostAddress); assertEquals("127.0.0.1", hostAddress); } @Test public final void testAcceptEncoding() throws Exception { FetcherPolicy policy = new FetcherPolicy(); SimpleHttpFetcher fetcher = new SimpleHttpFetcher(1, policy, ConfigUtils.BIXO_TEST_AGENT); System.out.println(fetcher.getAcceptEncoding()); final String acceptEncoding = "bogus"; fetcher.setAcceptEncoding(acceptEncoding); assertEquals(acceptEncoding, fetcher.getAcceptEncoding()); } }