// Java tutorial
/* * This file is part of the Heritrix web crawler (crawler.archive.org). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.modules.extractor; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.List; import org.apache.commons.collections.CollectionUtils; import org.apache.commons.collections.Predicate; import org.apache.commons.httpclient.URIException; import org.archive.modules.CrawlMetadata; import org.archive.modules.CrawlURI; import org.archive.net.UURI; import org.archive.net.UURIFactory; import org.archive.util.Recorder; public class ExtractorHTMLTest extends StringExtractorTestBase { final public static String[] VALID_TEST_DATA = new String[] { "<a href=\"http://www.slashdot.org\">yellow journalism</a> A", "http://www.slashdot.org", "<a href='http://www.slashdot.org'>yellow journalism</a> A", "http://www.slashdot.org", "<a href=http://www.slashdot.org>yellow journalism</a> A", "http://www.slashdot.org", "<a href=\"http://www.slashdot.org\">yellow journalism A", "http://www.slashdot.org", "<a href='http://www.slashdot.org'>yellow journalism A", "http://www.slashdot.org", "<a href=http://www.slashdot.org>yellow journalism A", "http://www.slashdot.org", "<a href=\"http://www.slashdot.org\"/>yellow journalism A", "http://www.slashdot.org", "<a 
href='http://www.slashdot.org'/>yellow journalism A", "http://www.slashdot.org", "<a href=http://www.slashdot.org/>yellow journalism A", "http://www.slashdot.org", "<img src=\"foo.gif\"> IMG", "http://www.archive.org/start/foo.gif", }; @Override protected String[] getValidTestData() { return VALID_TEST_DATA; } @Override protected Extractor makeExtractor() { ExtractorHTML result = new ExtractorHTML(); UriErrorLoggerModule ulm = new UnitTestUriLoggerModule(); result.setLoggerModule(ulm); CrawlMetadata metadata = new CrawlMetadata(); metadata.afterPropertiesSet(); result.setMetadata(metadata); result.setExtractorJS(new ExtractorJS()); result.afterPropertiesSet(); return result; } protected ExtractorHTML getExtractor() { return (ExtractorHTML) extractor; } @Override protected Collection<TestData> makeData(String content, String destURI) throws Exception { List<TestData> result = new ArrayList<TestData>(); UURI src = UURIFactory.getInstance("http://www.archive.org/start/"); CrawlURI euri = new CrawlURI(src, null, null, LinkContext.NAVLINK_MISC); Recorder recorder = createRecorder(content, "UTF-8"); euri.setContentType("text/html"); euri.setRecorder(recorder); euri.setContentSize(content.length()); UURI dest = UURIFactory.getInstance(destURI); LinkContext context = determineContext(content); Hop hop = determineHop(content); CrawlURI link = euri.createCrawlURI(dest, context, hop); result.add(new TestData(euri, link)); euri = new CrawlURI(src, null, null, LinkContext.NAVLINK_MISC); recorder = createRecorder(content, "UTF-8"); euri.setContentType("application/xhtml"); euri.setRecorder(recorder); euri.setContentSize(content.length()); result.add(new TestData(euri, link)); return result; } private static Hop determineHop(String s) { if (s.endsWith(" IMG")) { return Hop.EMBED; } return Hop.NAVLINK; } private static LinkContext determineContext(String s) { if (s.endsWith(" A")) { return HTMLLinkContext.get("a/@href"); } if (s.endsWith(" IMG")) { return 
HTMLLinkContext.get("img/@src"); } return LinkContext.NAVLINK_MISC; } /** * Test a missing whitespace issue found in form * * [HER-1128] ExtractorHTML fails to extract FRAME SRC link without * whitespace before SRC http://webteam.archive.org/jira/browse/HER-1128 */ public void testNoWhitespaceBeforeValidAttribute() throws URIException { expectSingleLink("http://expected.example.com/", "<frame name=\"main\"src=\"http://expected.example.com/\"> "); } /** * Expect the extractor to find the single given URI in the supplied * source material. Fail if that one lik is not found. * * TODO: expand to capture expected Link instance characteristics * (source, hop, context, etc?) * * @param expected String target URI that should be extracted * @param source CharSequence source material to extract * @throws URIException */ protected void expectSingleLink(String expected, CharSequence source) throws URIException { CrawlURI puri = new CrawlURI(UURIFactory.getInstance("http://www.example.com")); getExtractor().extract(puri, source); CrawlURI[] links = puri.getOutLinks().toArray(new CrawlURI[0]); assertTrue("did not find single link", links.length == 1); assertTrue("expected link not found", links[0].getURI().equals(expected)); } /** * Test only extract FORM ACTIONS with METHOD GET * * [HER-1280] do not by default GET form action URLs declared as POST, * because it can cause problems/complaints * http://webteam.archive.org/jira/browse/HER-1280 */ public void testOnlyExtractFormGets() throws URIException { CrawlURI puri = new CrawlURI(UURIFactory.getInstance("http://www.example.com")); CharSequence cs = "<form method=\"get\" action=\"http://www.example.com/ok1\"> " + "<form action=\"http://www.example.com/ok2\" method=\"get\"> " + "<form method=\"post\" action=\"http://www.example.com/notok\"> " + "<form action=\"http://www.example.com/ok3\"> "; getExtractor().extract(puri, cs); // find exactly 3 (not the POST) action URIs assertTrue("incorrect number of links found", 
puri.getOutLinks().size() == 3); } /* * positive and negative tests for uris in meta tag's content attribute */ public void testMetaContentURI() throws URIException { CrawlURI puri = new CrawlURI(UURIFactory.getInstance("http://www.example.com")); CharSequence cs = "<meta property=\"og:video\" content=\"http://www.example.com/absolute.mp4\" /> " + "<meta property=\"og:video\" content=\"/relative.mp4\" /> " + "<meta property=\"og:video:height\" content=\"333\" />" + "<meta property=\"og:video:type\" content=\"video/mp4\" />" + "<meta property=\"strangeproperty\" content=\"notaurl\" meaninglessurl=\"http://www.example.com/shouldnotbeextracted.html\" />"; getExtractor().extract(puri, cs); CrawlURI[] links = puri.getOutLinks().toArray(new CrawlURI[0]); Arrays.sort(links); String dest1 = "http://www.example.com/absolute.mp4"; String dest2 = "http://www.example.com/relative.mp4"; assertTrue("incorrect number of links found", puri.getOutLinks().size() == 2); assertEquals("expected uri in 'content' attribute of meta tag not found", dest1, links[0].getURI()); assertEquals("expected uri in 'content' attribute of meta tag not found", dest2, links[1].getURI()); } /** * Test detection, respect of meta robots nofollow directive */ public void testMetaRobots() throws URIException { CrawlURI puri = new CrawlURI(UURIFactory.getInstance("http://www.example.com")); CharSequence cs = "Blah Blah " + "<meta name='robots' content='index,nofollow'>" + "<a href='blahblah'>blah</a> " + "blahblah"; getExtractor().extract(puri, cs); assertEquals("meta robots content not extracted", "index,nofollow", puri.getData().get(ExtractorHTML.A_META_ROBOTS)); CrawlURI[] links = puri.getOutLinks().toArray(new CrawlURI[0]); assertTrue("link extracted despite meta robots", links.length == 0); } /** * Test that relative URIs with late colons aren't misinterpreted * as absolute URIs with long, illegal scheme components. 
* * See http://webteam.archive.org/jira/browse/HER-1268 * * @throws URIException */ public void testBadRelativeLinks() throws URIException { CrawlURI curi = new CrawlURI(UURIFactory.getInstance("http://www.example.com")); CharSequence cs = "<a href=\"example.html;jsessionid=deadbeef:deadbeed?parameter=this:value\"/>" + "<a href=\"example.html?parameter=this:value\"/>"; getExtractor().extract(curi, cs); assertTrue(CollectionUtils.exists(curi.getOutLinks(), new Predicate() { public boolean evaluate(Object object) { return ((CrawlURI) object).getURI() .indexOf("/example.html;jsessionid=deadbeef:deadbeed?parameter=this:value") >= 0; } })); assertTrue(CollectionUtils.exists(curi.getOutLinks(), new Predicate() { public boolean evaluate(Object object) { return ((CrawlURI) object).getURI().indexOf("/example.html?parameter=this:value") >= 0; } })); } /** * Test if scheme is maintained by speculative hops onto exact * same host * * [HER-1524] speculativeFixup in ExtractorJS should maintain URL scheme */ public void testSpeculativeLinkExtraction() throws URIException { CrawlURI curi = new CrawlURI(UURIFactory.getInstance("https://www.example.com")); CharSequence cs = "<script type=\"text/javascript\">_parameter=\"www.anotherexample.com\";" + "_anotherparameter=\"www.example.com/index.html\"" + ";</script>"; getExtractor().extract(curi, cs); assertTrue(CollectionUtils.exists(curi.getOutLinks(), new Predicate() { public boolean evaluate(Object object) { System.err.println( "comparing: " + ((CrawlURI) object).getURI() + " and https://www.anotherexample.com/"); return ((CrawlURI) object).getURI().equals("http://www.anotherexample.com/"); } })); assertTrue(CollectionUtils.exists(curi.getOutLinks(), new Predicate() { public boolean evaluate(Object object) { return ((CrawlURI) object).getURI().equals("https://www.example.com/index.html"); } })); } /** * test to see if embedded <SCRIPT/> which writes script TYPE * creates any outlinks, e.g. "type='text/javascript'". 
* * [HER-1526] SCRIPT writing script TYPE common trigger of bogus links * (eg. 'text/javascript') * * @throws URIException */ public void testScriptTagWritingScriptType() throws URIException { CrawlURI curi = new CrawlURI(UURIFactory.getInstance("http://www.example.com/en/fiche/dossier/322/")); CharSequence cs = "<script type=\"text/javascript\">" + "var gaJsHost = ((\"https:\" == document.location.protocol) " + "? \"https://ssl.\" : \"http://www.\");" + " document.write(unescape(\"%3Cscript src='\" + gaJsHost + " + "\"google-analytics.com/ga.js' " + "type='text/javascript'%3E%3C/script%3E\"));" + "</script>"; getExtractor().extract(curi, cs); assertEquals(Collections.EMPTY_SET, curi.getOutLinks()); } public void testOutLinksWithBaseHref() throws URIException { CrawlURI puri = new CrawlURI(UURIFactory.getInstance("http://www.example.com/abc/index.html")); puri.setBaseURI(puri.getUURI()); CharSequence cs = "<base href=\"http://www.example.com/\">" + "<a href=\"def/another1.html\">" + "<a href=\"ghi/another2.html\">"; getExtractor().extract(puri, cs); CrawlURI[] links = puri.getOutLinks().toArray(new CrawlURI[0]); Arrays.sort(links); String dest1 = "http://www.example.com/def/another1.html"; String dest2 = "http://www.example.com/ghi/another2.html"; // ensure outlink from base href assertEquals("outlink1 from base href", dest1, links[1].getURI()); assertEquals("outlink2 from base href", dest2, links[2].getURI()); } protected Predicate destinationContainsPredicate(final String fragment) { return new Predicate() { public boolean evaluate(Object object) { return ((CrawlURI) object).getURI().indexOf(fragment) >= 0; } }; } protected Predicate destinationsIsPredicate(final String value) { return new Predicate() { public boolean evaluate(Object object) { return ((CrawlURI) object).getURI().equals(value); } }; } /** * HER-1728 * @throws URIException */ public void testFlashvarsParamValue() throws URIException { CrawlURI curi = new 
CrawlURI(UURIFactory.getInstance("http://www.example.com/")); CharSequence cs = "<object classid=\"clsid:D27CDB6E-AE6D-11cf-96B8-444553540000\" codebase=\"http://download.macromedia.com/pub/shockwave/cabs/flash/swflash.cab#version=9,0,28,0\" id=\"ZoomifySlideshowViewer\" height=\"372\" width=\"590\">\n" + " <param name=\"flashvars\" value=\"zoomifyXMLPath=ParamZoomifySlideshowViewer.xml\">\n" + " <param name=\"menu\" value=\"false\">\n" + " <param name=\"bgcolor\" value=\"#000000\">\n" + " <param name=\"src\" value=\"ZoomifySlideshowViewer.swf\">\n" + " <embed flashvars=\"zoomifyXMLPath=EmbedZoomifySlideshowViewer.xml\" src=\"ZoomifySlideshowViewer.swf\" menu=\"false\" bgcolor=\"#000000\" pluginspage=\"http://www.adobe.com/go/getflashplayer\" type=\"application/x-shockwave-flash\" name=\"ZoomifySlideshowViewer\" height=\"372\" width=\"590\">\n" + "</object> "; getExtractor().extract(curi, cs); String expected = "http://www.example.com/ParamZoomifySlideshowViewer.xml"; assertTrue("outlinks should contain: " + expected, CollectionUtils.exists(curi.getOutLinks(), destinationsIsPredicate(expected))); } /** * HER-1728 * @throws URIException */ public void testFlashvarsEmbedAttribute() throws URIException { CrawlURI curi = new CrawlURI(UURIFactory.getInstance("http://www.example.com/")); CharSequence cs = "<object classid=\"clsid:D27CDB6E-AE6D-11cf-96B8-444553540000\" codebase=\"http://download.macromedia.com/pub/shockwave/cabs/flash/swflash.cab#version=9,0,28,0\" id=\"ZoomifySlideshowViewer\" height=\"372\" width=\"590\">\n" + " <param name=\"flashvars\" value=\"zoomifyXMLPath=ParamZoomifySlideshowViewer.xml\">\n" + " <param name=\"menu\" value=\"false\">\n" + " <param name=\"bgcolor\" value=\"#000000\">\n" + " <param name=\"src\" value=\"ZoomifySlideshowViewer.swf\">\n" + " <embed flashvars=\"zoomifyXMLPath=EmbedZoomifySlideshowViewer.xml\" src=\"ZoomifySlideshowViewer.swf\" menu=\"false\" bgcolor=\"#000000\" pluginspage=\"http://www.adobe.com/go/getflashplayer\" 
type=\"application/x-shockwave-flash\" name=\"ZoomifySlideshowViewer\" height=\"372\" width=\"590\">\n" + "</object> "; getExtractor().extract(curi, cs); String expected = "http://www.example.com/EmbedZoomifySlideshowViewer.xml"; assertTrue("outlinks should contain: " + expected, CollectionUtils.exists(curi.getOutLinks(), destinationsIsPredicate(expected))); } /** * HER-1998 * @throws URIException */ public void testConditionalComment1() throws URIException { CrawlURI curi = new CrawlURI(UURIFactory.getInstance("http://www.example.com/")); CharSequence cs = "<!--[if IE 6]><img src=\"foo.gif\"><![endif]-->" + "<!--[if IE 6]><script src=\"foo.js\"><![endif]-->"; UriErrorLoggerModule ulm = new UnitTestUriLoggerModule(); getExtractor().setLoggerModule(ulm); CrawlMetadata metadata = new CrawlMetadata(); metadata.afterPropertiesSet(); getExtractor().setMetadata(metadata); getExtractor().afterPropertiesSet(); getExtractor().extract(curi, cs); CrawlURI[] links = curi.getOutLinks().toArray(new CrawlURI[0]); Arrays.sort(links); String dest1 = "http://www.example.com/foo.gif"; String dest2 = "http://www.example.com/foo.js"; assertEquals("outlink1 from conditional comment img src", dest1, links[0].getURI()); assertEquals("outlink2 from conditional comment script src", dest2, links[1].getURI()); } }