org.archive.modules.extractor.ExtractorHTMLTest.java Source code

Introduction

Here is the source code for org.archive.modules.extractor.ExtractorHTMLTest.java
Source

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.modules.extractor;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;

import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.collections.Predicate;
import org.apache.commons.httpclient.URIException;
import org.archive.modules.CrawlMetadata;
import org.archive.modules.CrawlURI;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.Recorder;

public class ExtractorHTMLTest extends StringExtractorTestBase {

    /**
     * Fixture table laid out as consecutive pairs: an HTML snippet followed by
     * the URI string expected for it (presumably consumed pairwise by the base
     * test class and passed to {@code makeData(content, destURI)} -- confirm
     * against StringExtractorTestBase).
     *
     * The text trailing the markup matters: {@code determineContext} and
     * {@code determineHop} check whether a snippet ends in " A" or " IMG"
     * to choose the expected link context and hop.
     */
    public static final String[] VALID_TEST_DATA = {
            // closed anchor: double-quoted, single-quoted, unquoted href
            "<a href=\"http://www.slashdot.org\">yellow journalism</a> A",
            "http://www.slashdot.org",
            "<a href='http://www.slashdot.org'>yellow journalism</a> A",
            "http://www.slashdot.org",
            "<a href=http://www.slashdot.org>yellow journalism</a> A",
            "http://www.slashdot.org",

            // unclosed anchor: double-quoted, single-quoted, unquoted href
            "<a href=\"http://www.slashdot.org\">yellow journalism A",
            "http://www.slashdot.org",
            "<a href='http://www.slashdot.org'>yellow journalism A",
            "http://www.slashdot.org",
            "<a href=http://www.slashdot.org>yellow journalism A",
            "http://www.slashdot.org",

            // self-closing anchor: double-quoted, single-quoted, unquoted href
            "<a href=\"http://www.slashdot.org\"/>yellow journalism A",
            "http://www.slashdot.org",
            "<a href='http://www.slashdot.org'/>yellow journalism A",
            "http://www.slashdot.org",
            "<a href=http://www.slashdot.org/>yellow journalism A",
            "http://www.slashdot.org",

            // relative image source
            "<img src=\"foo.gif\"> IMG",
            "http://www.archive.org/start/foo.gif",
    };

    /**
     * Supplies the fixture table used by this test class.
     *
     * @return the {@link #VALID_TEST_DATA} array (the same instance, not a copy)
     */
    @Override
    protected String[] getValidTestData() {
        return VALID_TEST_DATA;
    }

    /**
     * Builds the extractor under test: an {@link ExtractorHTML} wired with a
     * unit-test logger module, an initialised {@link CrawlMetadata} and an
     * {@link ExtractorJS}, with {@code afterPropertiesSet()} invoked once all
     * collaborators have been set.
     *
     * @return the configured extractor
     */
    @Override
    protected Extractor makeExtractor() {
        ExtractorHTML htmlExtractor = new ExtractorHTML();
        UriErrorLoggerModule loggerModule = new UnitTestUriLoggerModule();
        htmlExtractor.setLoggerModule(loggerModule);

        CrawlMetadata crawlMetadata = new CrawlMetadata();
        crawlMetadata.afterPropertiesSet();
        htmlExtractor.setMetadata(crawlMetadata);

        htmlExtractor.setExtractorJS(new ExtractorJS());

        // finalise only after every collaborator is in place
        htmlExtractor.afterPropertiesSet();
        return htmlExtractor;
    }

    /**
     * Convenience accessor returning the {@code extractor} field downcast to
     * {@link ExtractorHTML}. The field is not declared in this class
     * (presumably inherited and populated from {@code makeExtractor()} --
     * confirm against the base test class).
     *
     * @return the extractor under test as an {@link ExtractorHTML}
     */
    protected ExtractorHTML getExtractor() {
        return (ExtractorHTML) extractor;
    }

    /**
     * Produces two test cases for one (content, expected URI) pair: the same
     * content served once as {@code text/html} and once as
     * {@code application/xhtml}. Both cases share a single expected link,
     * which is created from the {@code text/html} source URI.
     *
     * @param content HTML snippet to extract from
     * @param destURI URI expected to be extracted from {@code content}
     * @return the two test cases, {@code text/html} first
     * @throws Exception if a URI cannot be built or the recorder cannot be created
     */
    @Override
    protected Collection<TestData> makeData(String content, String destURI) throws Exception {
        List<TestData> cases = new ArrayList<TestData>();
        UURI sourceUri = UURIFactory.getInstance("http://www.archive.org/start/");

        // case 1: content served as text/html; also the parent of the expected link
        CrawlURI htmlSource = buildSourceCrawlURI(sourceUri, content, "text/html");
        UURI expectedDest = UURIFactory.getInstance(destURI);
        LinkContext expectedContext = determineContext(content);
        Hop expectedHop = determineHop(content);
        CrawlURI expectedLink = htmlSource.createCrawlURI(expectedDest, expectedContext, expectedHop);
        cases.add(new TestData(htmlSource, expectedLink));

        // case 2: identical content served as application/xhtml, same expected link
        CrawlURI xhtmlSource = buildSourceCrawlURI(sourceUri, content, "application/xhtml");
        cases.add(new TestData(xhtmlSource, expectedLink));

        return cases;
    }

    /**
     * Creates a source CrawlURI for {@code src} carrying {@code content} in a
     * fresh UTF-8 recorder, with the given content type and a content size
     * equal to {@code content.length()}.
     */
    private CrawlURI buildSourceCrawlURI(UURI src, String content, String contentType) throws Exception {
        CrawlURI source = new CrawlURI(src, null, null, LinkContext.NAVLINK_MISC);
        Recorder contentRecorder = createRecorder(content, "UTF-8");
        source.setContentType(contentType);
        source.setRecorder(contentRecorder);
        source.setContentSize(content.length());
        return source;
    }

    /**
     * Chooses the hop type expected for a fixture snippet.
     *
     * @param s fixture snippet
     * @return {@link Hop#EMBED} when the snippet ends in " IMG", otherwise
     *         {@link Hop#NAVLINK}
     */
    private static Hop determineHop(String s) {
        return s.endsWith(" IMG") ? Hop.EMBED : Hop.NAVLINK;
    }

    /**
     * Chooses the link context expected for a fixture snippet, keyed off the
     * snippet's trailing marker.
     *
     * @param s fixture snippet
     * @return the {@code a/@href} context for a trailing " A", the
     *         {@code img/@src} context for a trailing " IMG", otherwise
     *         {@link LinkContext#NAVLINK_MISC}
     */
    private static LinkContext determineContext(String s) {
        // the two markers are mutually exclusive suffixes, so at most one branch matches
        String attributePath = null;
        if (s.endsWith(" A")) {
            attributePath = "a/@href";
        } else if (s.endsWith(" IMG")) {
            attributePath = "img/@src";
        }

        if (attributePath == null) {
            return LinkContext.NAVLINK_MISC;
        }
        return HTMLLinkContext.get(attributePath);
    }

    /**
     * Checks that an attribute is still picked up when no whitespace
     * separates it from the closing quote of the preceding attribute.
     * 
     * [HER-1128] ExtractorHTML fails to extract FRAME SRC link without
     * whitespace before SRC http://webteam.archive.org/jira/browse/HER-1128
     */
    public void testNoWhitespaceBeforeValidAttribute() throws URIException {
        final String frameTarget = "http://expected.example.com/";
        // note: src follows directly after name="main" with no separating space
        final CharSequence frameMarkup = "<frame name=\"main\"src=\"http://expected.example.com/\"> ";
        expectSingleLink(frameTarget, frameMarkup);
    }

    /**
     * Expect the extractor to find the single given URI in the supplied
     * source material. Fail if the number of extracted links is not exactly
     * one, or if that one link's URI differs from the expected URI.
     * 
     * The source is extracted against a fresh CrawlURI for
     * {@code http://www.example.com}.
     * 
     * TODO: expand to capture expected Link instance characteristics 
     * (source, hop, context, etc?)
     * 
     * @param expected String target URI that should be extracted
     * @param source CharSequence source material to extract
     * @throws URIException if the fixed source URI cannot be built
     */
    protected void expectSingleLink(String expected, CharSequence source) throws URIException {
        CrawlURI puri = new CrawlURI(UURIFactory.getInstance("http://www.example.com"));
        getExtractor().extract(puri, source);
        CrawlURI[] links = puri.getOutLinks().toArray(new CrawlURI[0]);
        // assertEquals instead of assertTrue(a == b) so a failure reports the actual values
        assertEquals("did not find single link", 1, links.length);
        assertEquals("expected link not found", expected, links[0].getURI());
    }

    /**
     * Test only extract FORM ACTIONS with METHOD GET 
     * 
     * The fixture holds four forms: two with an explicit GET method (in
     * either attribute order), one with no method attribute, and one POST.
     * Exactly three action URIs are expected, i.e. all but the POST form.
     * 
     * [HER-1280] do not by default GET form action URLs declared as POST, 
     * because it can cause problems/complaints 
     * http://webteam.archive.org/jira/browse/HER-1280
     */
    public void testOnlyExtractFormGets() throws URIException {
        CrawlURI puri = new CrawlURI(UURIFactory.getInstance("http://www.example.com"));
        CharSequence cs = "<form method=\"get\" action=\"http://www.example.com/ok1\"> "
                + "<form action=\"http://www.example.com/ok2\" method=\"get\"> "
                + "<form method=\"post\" action=\"http://www.example.com/notok\"> "
                + "<form action=\"http://www.example.com/ok3\"> ";
        getExtractor().extract(puri, cs);
        // find exactly 3 (not the POST) action URIs; assertEquals reports the actual count on failure
        assertEquals("incorrect number of links found", 3, puri.getOutLinks().size());
    }

    /**
     * Positive and negative tests for URIs in a meta tag's content attribute.
     * 
     * Expects exactly two outlinks: the absolute og:video URI and the
     * relative one resolved against http://www.example.com. The non-URI
     * content values ("333", "video/mp4", "notaurl") and the URI in the
     * unrelated "meaninglessurl" attribute must not yield links.
     */
    public void testMetaContentURI() throws URIException {
        CrawlURI puri = new CrawlURI(UURIFactory.getInstance("http://www.example.com"));
        CharSequence cs = "<meta property=\"og:video\" content=\"http://www.example.com/absolute.mp4\" /> "
                + "<meta property=\"og:video\" content=\"/relative.mp4\" /> "
                + "<meta property=\"og:video:height\" content=\"333\" />"
                + "<meta property=\"og:video:type\" content=\"video/mp4\" />"
                + "<meta property=\"strangeproperty\" content=\"notaurl\" meaninglessurl=\"http://www.example.com/shouldnotbeextracted.html\" />";

        getExtractor().extract(puri, cs);

        CrawlURI[] links = puri.getOutLinks().toArray(new CrawlURI[0]);
        // sort so the positional assertions below do not depend on extraction order
        Arrays.sort(links);
        String dest1 = "http://www.example.com/absolute.mp4";
        String dest2 = "http://www.example.com/relative.mp4";

        // assertEquals reports the actual count on failure and guards the indexing below
        assertEquals("incorrect number of links found", 2, links.length);
        assertEquals("expected uri in 'content' attribute of meta tag not found", dest1, links[0].getURI());
        assertEquals("expected uri in 'content' attribute of meta tag not found", dest2, links[1].getURI());
    }

    /**
     * Test detection, respect of meta robots nofollow directive.
     * 
     * The robots meta content must be stored in the CrawlURI's data map
     * under {@link ExtractorHTML#A_META_ROBOTS}, and the anchor that follows
     * the nofollow directive must not produce an outlink.
     */
    public void testMetaRobots() throws URIException {
        CrawlURI puri = new CrawlURI(UURIFactory.getInstance("http://www.example.com"));
        CharSequence cs = "Blah Blah " + "<meta name='robots' content='index,nofollow'>"
                + "<a href='blahblah'>blah</a> " + "blahblah";
        getExtractor().extract(puri, cs);
        assertEquals("meta robots content not extracted", "index,nofollow",
                puri.getData().get(ExtractorHTML.A_META_ROBOTS));
        // assertEquals reports how many links leaked through if this fails
        assertEquals("link extracted despite meta robots", 0, puri.getOutLinks().size());
    }

    /**
     * Test that relative URIs with late colons aren't misinterpreted
     * as absolute URIs with long, illegal scheme components. 
     * 
     * Both hrefs contain a colon after the path; each is expected to show
     * up among the outlinks as a URI containing the href prefixed with "/".
     * 
     * See http://webteam.archive.org/jira/browse/HER-1268
     * 
     * @throws URIException if the source URI cannot be built
     */
    public void testBadRelativeLinks() throws URIException {
        CrawlURI curi = new CrawlURI(UURIFactory.getInstance("http://www.example.com"));
        CharSequence cs = "<a href=\"example.html;jsessionid=deadbeef:deadbeed?parameter=this:value\"/>"
                + "<a href=\"example.html?parameter=this:value\"/>";
        getExtractor().extract(curi, cs);

        String withSessionId = "/example.html;jsessionid=deadbeef:deadbeed?parameter=this:value";
        String withoutSessionId = "/example.html?parameter=this:value";

        // reuse the shared predicate helper rather than duplicating anonymous classes
        assertTrue("outlinks should contain a URI including: " + withSessionId,
                CollectionUtils.exists(curi.getOutLinks(), destinationContainsPredicate(withSessionId)));
        assertTrue("outlinks should contain a URI including: " + withoutSessionId,
                CollectionUtils.exists(curi.getOutLinks(), destinationContainsPredicate(withoutSessionId)));
    }

    /**
     * Test if scheme is maintained by speculative hops onto exact 
     * same host
     * 
     * The page is fetched over https. The speculative link onto the same
     * host (www.example.com) is expected to keep the https scheme, while the
     * speculative link onto a different host (www.anotherexample.com) is
     * expected with http.
     * 
     * [HER-1524] speculativeFixup in ExtractorJS should maintain URL scheme
     */
    public void testSpeculativeLinkExtraction() throws URIException {
        CrawlURI curi = new CrawlURI(UURIFactory.getInstance("https://www.example.com"));
        CharSequence cs = "<script type=\"text/javascript\">_parameter=\"www.anotherexample.com\";"
                + "_anotherparameter=\"www.example.com/index.html\"" + ";</script>";
        getExtractor().extract(curi, cs);

        String otherHost = "http://www.anotherexample.com/";
        String sameHost = "https://www.example.com/index.html";

        // No stderr tracing inside the predicate: the previous debug line named an
        // https URI that did not match the http URI actually being compared.
        assertTrue("outlinks should contain: " + otherHost,
                CollectionUtils.exists(curi.getOutLinks(), destinationsIsPredicate(otherHost)));
        assertTrue("outlinks should contain: " + sameHost,
                CollectionUtils.exists(curi.getOutLinks(), destinationsIsPredicate(sameHost)));
    }

    /**
     * Checks that an inline script which document.write()s another script
     * tag, including its TYPE attribute, yields no outlinks at all -- in
     * particular no bogus link derived from "text/javascript".
     * 
     * [HER-1526] SCRIPT writing script TYPE common trigger of bogus links 
     *   (eg. 'text/javascript')
     *   
     * @throws URIException if the page URI cannot be built
     */
    public void testScriptTagWritingScriptType() throws URIException {
        UURI pageUri = UURIFactory.getInstance("http://www.example.com/en/fiche/dossier/322/");
        CrawlURI page = new CrawlURI(pageUri);

        // analytics-style loader snippet, assembled piece by piece
        String scriptOpen = "<script type=\"text/javascript\">";
        String hostSelection = "var gaJsHost = ((\"https:\" == document.location.protocol) "
                + "? \"https://ssl.\" : \"http://www.\");";
        String writeCall = " document.write(unescape(\"%3Cscript src='\" + gaJsHost + "
                + "\"google-analytics.com/ga.js' "
                + "type='text/javascript'%3E%3C/script%3E\"));";
        String scriptClose = "</script>";
        CharSequence markup = scriptOpen + hostSelection + writeCall + scriptClose;

        getExtractor().extract(page, markup);

        assertEquals(Collections.EMPTY_SET, page.getOutLinks());
    }

    /**
     * Checks that relative hrefs are resolved against a {@code <base href>}
     * (http://www.example.com/) rather than against the page's own URI
     * (http://www.example.com/abc/index.html).
     * 
     * The assertions look at positions 1 and 2 of the sorted outlinks;
     * position 0 is presumably the base href itself, which sorts before the
     * two resolved links -- TODO confirm against ExtractorHTML.
     * 
     * @throws URIException if the page URI cannot be built
     */
    public void testOutLinksWithBaseHref() throws URIException {
        CrawlURI puri = new CrawlURI(UURIFactory.getInstance("http://www.example.com/abc/index.html"));
        puri.setBaseURI(puri.getUURI());
        CharSequence cs = "<base href=\"http://www.example.com/\">" + "<a href=\"def/another1.html\">"
                + "<a href=\"ghi/another2.html\">";
        getExtractor().extract(puri, cs);
        CrawlURI[] links = puri.getOutLinks().toArray(new CrawlURI[0]);
        Arrays.sort(links);
        String dest1 = "http://www.example.com/def/another1.html";
        String dest2 = "http://www.example.com/ghi/another2.html";
        // fail with a readable message instead of ArrayIndexOutOfBoundsException
        assertTrue("expected at least 3 outlinks but found " + links.length, links.length >= 3);
        // ensure outlink from base href
        assertEquals("outlink1 from base href", dest1, links[1].getURI());
        assertEquals("outlink2 from base href", dest2, links[2].getURI());
    }

    /**
     * Builds a predicate over {@link CrawlURI} objects that accepts those
     * whose URI string contains the given fragment.
     * 
     * @param fragment substring to look for in the candidate's URI
     * @return predicate; its argument is cast to {@link CrawlURI}
     */
    protected Predicate destinationContainsPredicate(final String fragment) {
        return new Predicate() {
            public boolean evaluate(Object object) {
                CrawlURI candidate = (CrawlURI) object;
                String destination = candidate.getURI();
                return destination.contains(fragment);
            }
        };
    }

    /**
     * Builds a predicate over {@link CrawlURI} objects that accepts those
     * whose URI string equals the given value exactly.
     * 
     * @param value URI string the candidate's URI must equal
     * @return predicate; its argument is cast to {@link CrawlURI}
     */
    protected Predicate destinationsIsPredicate(final String value) {
        return new Predicate() {
            public boolean evaluate(Object object) {
                CrawlURI candidate = (CrawlURI) object;
                String destination = candidate.getURI();
                return destination.equals(value);
            }
        };
    }

    /**
     * HER-1728: the path carried in the value of a
     * {@code <param name="flashvars">} element is expected among the
     * outlinks, resolved against the page URI.
     * 
     * @throws URIException if the page URI cannot be built
     */
    public void testFlashvarsParamValue() throws URIException {
        UURI pageUri = UURIFactory.getInstance("http://www.example.com/");
        CrawlURI page = new CrawlURI(pageUri);

        String objectOpen = "<object classid=\"clsid:D27CDB6E-AE6D-11cf-96B8-444553540000\" codebase=\"http://download.macromedia.com/pub/shockwave/cabs/flash/swflash.cab#version=9,0,28,0\" id=\"ZoomifySlideshowViewer\" height=\"372\" width=\"590\">\n";
        String params = "    <param name=\"flashvars\" value=\"zoomifyXMLPath=ParamZoomifySlideshowViewer.xml\">\n"
                + "    <param name=\"menu\" value=\"false\">\n"
                + "    <param name=\"bgcolor\" value=\"#000000\">\n"
                + "    <param name=\"src\" value=\"ZoomifySlideshowViewer.swf\">\n";
        String embed = "    <embed flashvars=\"zoomifyXMLPath=EmbedZoomifySlideshowViewer.xml\" src=\"ZoomifySlideshowViewer.swf\" menu=\"false\" bgcolor=\"#000000\" pluginspage=\"http://www.adobe.com/go/getflashplayer\" type=\"application/x-shockwave-flash\" name=\"ZoomifySlideshowViewer\" height=\"372\" width=\"590\">\n";
        String objectClose = "</object> ";
        CharSequence markup = objectOpen + params + embed + objectClose;

        getExtractor().extract(page, markup);

        String wanted = "http://www.example.com/ParamZoomifySlideshowViewer.xml";
        boolean found = CollectionUtils.exists(page.getOutLinks(), destinationsIsPredicate(wanted));
        assertTrue("outlinks should contain: " + wanted, found);
    }

    /**
     * HER-1728: the path carried in the {@code flashvars} attribute of an
     * {@code <embed>} element is expected among the outlinks, resolved
     * against the page URI.
     * 
     * @throws URIException if the page URI cannot be built
     */
    public void testFlashvarsEmbedAttribute() throws URIException {
        UURI pageUri = UURIFactory.getInstance("http://www.example.com/");
        CrawlURI page = new CrawlURI(pageUri);

        String objectOpen = "<object classid=\"clsid:D27CDB6E-AE6D-11cf-96B8-444553540000\" codebase=\"http://download.macromedia.com/pub/shockwave/cabs/flash/swflash.cab#version=9,0,28,0\" id=\"ZoomifySlideshowViewer\" height=\"372\" width=\"590\">\n";
        String params = "    <param name=\"flashvars\" value=\"zoomifyXMLPath=ParamZoomifySlideshowViewer.xml\">\n"
                + "    <param name=\"menu\" value=\"false\">\n"
                + "    <param name=\"bgcolor\" value=\"#000000\">\n"
                + "    <param name=\"src\" value=\"ZoomifySlideshowViewer.swf\">\n";
        String embed = "    <embed flashvars=\"zoomifyXMLPath=EmbedZoomifySlideshowViewer.xml\" src=\"ZoomifySlideshowViewer.swf\" menu=\"false\" bgcolor=\"#000000\" pluginspage=\"http://www.adobe.com/go/getflashplayer\" type=\"application/x-shockwave-flash\" name=\"ZoomifySlideshowViewer\" height=\"372\" width=\"590\">\n";
        String objectClose = "</object> ";
        CharSequence markup = objectOpen + params + embed + objectClose;

        getExtractor().extract(page, markup);

        String wanted = "http://www.example.com/EmbedZoomifySlideshowViewer.xml";
        boolean found = CollectionUtils.exists(page.getOutLinks(), destinationsIsPredicate(wanted));
        assertTrue("outlinks should contain: " + wanted, found);
    }

    /**
     * HER-1998: links inside IE conditional comments
     * ({@code <!--[if IE 6]> ... <![endif]-->}) are expected to be
     * extracted. After sorting, the first two outlinks must be the img src
     * and the script src, both resolved against the page URI.
     * 
     * @throws URIException if the page URI cannot be built
     */
    public void testConditionalComment1() throws URIException {
        CrawlURI curi = new CrawlURI(UURIFactory.getInstance("http://www.example.com/"));

        CharSequence cs = "<!--[if IE 6]><img src=\"foo.gif\"><![endif]-->"
                + "<!--[if IE 6]><script src=\"foo.js\"><![endif]-->";

        // Re-wires logger module and metadata and re-runs afterPropertiesSet() on the
        // shared extractor; presumably redundant with makeExtractor() -- kept as is, TODO confirm.
        UriErrorLoggerModule ulm = new UnitTestUriLoggerModule();
        getExtractor().setLoggerModule(ulm);
        CrawlMetadata metadata = new CrawlMetadata();
        metadata.afterPropertiesSet();
        getExtractor().setMetadata(metadata);
        getExtractor().afterPropertiesSet();

        getExtractor().extract(curi, cs);

        CrawlURI[] links = curi.getOutLinks().toArray(new CrawlURI[0]);
        Arrays.sort(links);

        String dest1 = "http://www.example.com/foo.gif";
        String dest2 = "http://www.example.com/foo.js";

        // fail with a readable message instead of ArrayIndexOutOfBoundsException
        assertTrue("expected at least 2 outlinks but found " + links.length, links.length >= 2);
        assertEquals("outlink1 from conditional comment img src", dest1, links[0].getURI());
        assertEquals("outlink2 from conditional comment script src", dest2, links[1].getURI());

    }

}