bixo.parser.SimpleParserTest.java Source code

Introduction

Here is the source code for bixo.parser.SimpleParserTest.java
Source

/*
 * Copyright 2009-2013 Scale Unlimited
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package bixo.parser;

import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.net.URL;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import junit.framework.Assert;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.ccil.cowan.tagsoup.Parser;
import org.dom4j.Document;
import org.dom4j.DocumentHelper;
import org.dom4j.Node;
import org.dom4j.XPath;
import org.dom4j.io.SAXReader;
import org.hsqldb.lib.StringInputStream;
import org.junit.Test;

import bixo.config.ParserPolicy;
import bixo.datum.ContentBytes;
import bixo.datum.FetchedDatum;
import bixo.datum.HttpHeaders;
import bixo.datum.Outlink;
import bixo.datum.ParsedDatum;
import bixo.fetcher.HttpHeaderNames;

/**
 * Tests for {@link SimpleParser}: outlink extraction, title/text extraction,
 * language detection and markup-preserving output.
 *
 * Fixtures are loaded from the test classpath (the {@code parser-files/}
 * resources plus {@code simple-page.html} / {@code simple-page.txt}).
 */
public class SimpleParserTest {

    /** Content type used both as the Content-Type header and as the datum's content type. */
    private static final String HTML_CONTENT_TYPE = "text/html; charset=utf-8";

    /** Characters that separate terms when comparing extracted text. */
    private static final String TERM_SEPARATORS = "[ \\n\\r\\t]+";

    /**
     * Two outlinks are expected from the base-url fixture; the URL of the first
     * one is not checked yet (see TODO below).
     */
    @Test
    public void testRelativeLinkWithBaseUrl() throws Exception {
        String html = readFromFile("parser-files/base-url.html");
        String url = "http://olddomain.com/base-url.html";
        FetchedDatum fetchedDatum = makeHtmlDatum(url, url, makeHtmlHeaders(), html);

        ParsedDatum parsedDatum = new SimpleParser().parse(fetchedDatum);

        Outlink[] outlinks = parsedDatum.getOutlinks();
        Assert.assertEquals(2, outlinks.length);

        // TODO KKr - reenable this test when Tika parser calls my handler with
        // the <base> element, which is needed to correctly resolve relative links.
        // Assert.assertEquals("http://newdomain.com/link", outlinks[0].getToUrl());
        Assert.assertEquals("link1", outlinks[0].getAnchor());
        Assert.assertEquals("http://domain.com/link", outlinks[1].getToUrl());
        Assert.assertEquals("link2", outlinks[1].getAnchor());
    }

    /**
     * With an absolute Content-Location header, the first outlink is expected
     * to be resolved against that location rather than the fetched URL.
     */
    @Test
    public void testRelativeLinkWithLocationUrl() throws Exception {
        String html = readFromFile("parser-files/relative-urls.html");
        String url = "http://olddomain.com/relative-urls.html";

        HttpHeaders headers = makeHtmlHeaders();
        headers.add(HttpHeaderNames.CONTENT_LOCATION, "http://newdomain.com");
        FetchedDatum fetchedDatum = makeHtmlDatum(url, url, headers, html);

        ParsedDatum parsedDatum = new SimpleParser().parse(fetchedDatum);

        Outlink[] outlinks = parsedDatum.getOutlinks();
        Assert.assertEquals(2, outlinks.length);

        Assert.assertEquals("http://newdomain.com/link1", outlinks[0].getToUrl());
        Assert.assertEquals("link1", outlinks[0].getAnchor());
        // TODO KKr - reenable this test when Tika changes are submitted:
        // Assert.assertEquals("nofollow", outlinks[0].getRelAttributes());
        Assert.assertEquals("http://domain.com/link2", outlinks[1].getToUrl());
        Assert.assertEquals("link2", outlinks[1].getAnchor());
    }

    /**
     * With a relative Content-Location header ("redirected/"), the first outlink
     * is expected under that path on the fetched URL's host.
     */
    @Test
    public void testRelativeLinkWithRelativeLocationUrl() throws Exception {
        String html = readFromFile("parser-files/relative-urls.html");
        String url = "http://olddomain.com/relative-urls.html";

        HttpHeaders headers = makeHtmlHeaders();
        headers.add(HttpHeaderNames.CONTENT_LOCATION, "redirected/");
        FetchedDatum fetchedDatum = makeHtmlDatum(url, url, headers, html);

        ParsedDatum parsedDatum = new SimpleParser().parse(fetchedDatum);

        Outlink[] outlinks = parsedDatum.getOutlinks();
        Assert.assertEquals(2, outlinks.length);

        Assert.assertEquals("http://olddomain.com/redirected/link1", outlinks[0].getToUrl());
        Assert.assertEquals("link1", outlinks[0].getAnchor());
        Assert.assertEquals("http://domain.com/link2", outlinks[1].getToUrl());
        Assert.assertEquals("link2", outlinks[1].getAnchor());
    }

    /**
     * When the datum's second URL argument differs from the first (a redirect),
     * the first outlink is expected to be resolved against the redirected URL.
     */
    @Test
    public void testRelativeLinkWithRedirectUrl() throws Exception {
        String html = readFromFile("parser-files/relative-urls.html");
        String url = "http://olddomain.com/relative-urls.html";
        String redirectedUrl = "http://newdomain.com";
        FetchedDatum fetchedDatum = makeHtmlDatum(url, redirectedUrl, makeHtmlHeaders(), html);

        ParsedDatum parsedDatum = new SimpleParser().parse(fetchedDatum);

        Outlink[] outlinks = parsedDatum.getOutlinks();
        Assert.assertEquals(2, outlinks.length);

        Assert.assertEquals("http://newdomain.com/link1", outlinks[0].getToUrl());
        Assert.assertEquals("link1", outlinks[0].getAnchor());
        Assert.assertEquals("http://domain.com/link2", outlinks[1].getToUrl());
        Assert.assertEquals("link2", outlinks[1].getAnchor());
    }

    /** A parser built with no explicit policy should report only the two anchor links. */
    @Test
    public void testDefaultLinkTypes() throws Exception {
        String html = readFromFile("parser-files/all-link-types.html");
        String url = "http://domain.com/all-link-types.html";
        FetchedDatum fetchedDatum = makeHtmlDatum(url, url, makeHtmlHeaders(), html);

        ParsedDatum parsedDatum = new SimpleParser().parse(fetchedDatum);

        // Verify outlinks are correct (and we only get the a href ones).
        Outlink[] outlinks = parsedDatum.getOutlinks();
        Assert.assertEquals(2, outlinks.length);

        Assert.assertEquals("http://newdomain.com/link1", outlinks[0].getToUrl());
        Assert.assertEquals("link1", outlinks[0].getAnchor());
        Assert.assertEquals("http://domain.com/link2", outlinks[1].getToUrl());
        Assert.assertEquals("link2", outlinks[1].getAnchor());
    }

    /** With every link tag and attribute type enabled, seven outlinks are expected. */
    @Test
    public void testAllLinkTypes() throws Exception {
        String html = readFromFile("parser-files/all-link-types.html");
        String url = "http://domain.com/all-link-types.html";
        FetchedDatum fetchedDatum = makeHtmlDatum(url, url, makeHtmlHeaders(), html);

        ParserPolicy policy = new ParserPolicy(ParserPolicy.DEFAULT_MAX_PARSE_DURATION,
                BaseLinkExtractor.ALL_LINK_TAGS, BaseLinkExtractor.ALL_LINK_ATTRIBUTE_TYPES);
        ParsedDatum parsedDatum = new SimpleParser(policy).parse(fetchedDatum);

        // Verify we get a link for every tag/attribute type, in document order.
        Outlink[] outlinks = parsedDatum.getOutlinks();
        Assert.assertEquals(7, outlinks.length);

        Assert.assertEquals("http://newdomain.com/favicon.ico", outlinks[0].getToUrl());
        Assert.assertEquals("http://newdomain.com/link1", outlinks[1].getToUrl());
        Assert.assertEquals("link1", outlinks[1].getAnchor());
        Assert.assertEquals("http://domain.com/link2", outlinks[2].getToUrl());
        Assert.assertEquals("link2", outlinks[2].getAnchor());
        Assert.assertEquals("http://newdomain.com/giant-prawn.jpg", outlinks[3].getToUrl());
        Assert.assertEquals("http://en.wikipedia.org/wiki/Australia's_Big_Things", outlinks[4].getToUrl());
        Assert.assertEquals("http://newdomain.com/giant-dog.jpg", outlinks[5].getToUrl());
        Assert.assertEquals("http://www.brucelawson.co.uk/index.php/2005/stupid-stock-photography/",
                outlinks[6].getToUrl());
    }

    /** With only a/img/link tags and href/src attributes enabled, four outlinks are expected. */
    @Test
    public void testSomeLinkTypes() throws Exception {
        String html = readFromFile("parser-files/all-link-types.html");
        String url = "http://domain.com/all-link-types.html";
        FetchedDatum fetchedDatum = makeHtmlDatum(url, url, makeHtmlHeaders(), html);

        // Plain HashSets instead of double-brace initialisation, which would
        // create an anonymous serializable subclass per set.
        Set<String> linkTags = new HashSet<String>(Arrays.asList("a", "img", "link"));
        Set<String> linkAttributeTypes = new HashSet<String>(Arrays.asList("href", "src"));

        ParserPolicy policy = new ParserPolicy(ParserPolicy.DEFAULT_MAX_PARSE_DURATION, linkTags,
                linkAttributeTypes);
        ParsedDatum parsedDatum = new SimpleParser(policy).parse(fetchedDatum);

        // Verify we only get links for the selected tags and attributes.
        Outlink[] outlinks = parsedDatum.getOutlinks();
        Assert.assertEquals(4, outlinks.length);

        Assert.assertEquals("http://newdomain.com/favicon.ico", outlinks[0].getToUrl());
        Assert.assertEquals("http://newdomain.com/link1", outlinks[1].getToUrl());
        Assert.assertEquals("link1", outlinks[1].getAnchor());
        Assert.assertEquals("http://domain.com/link2", outlinks[2].getToUrl());
        Assert.assertEquals("link2", outlinks[2].getAnchor());
        Assert.assertEquals("http://newdomain.com/giant-prawn.jpg", outlinks[3].getToUrl());
    }

    /** Title and body text are extracted from the simple-content fixture. */
    @Test
    public void testContentExtraction() throws Exception {
        String html = readFromFile("parser-files/simple-content.html");
        String url = "http://domain.com/simple-content.html";
        FetchedDatum fetchedDatum = makeHtmlDatum(url, url, makeHtmlHeaders(), html);

        ParsedDatum parsedDatum = new SimpleParser().parse(fetchedDatum);

        Assert.assertEquals("Simple", parsedDatum.getTitle());
        compareTermsInStrings("Simple Content", parsedDatum.getParsedText());
    }

    /** Parsed text of simple-page.html must match the terms stored in simple-page.txt. */
    @Test
    public void testHtmlParsing() throws Exception {
        URL path = SimpleParserTest.class.getResource("/simple-page.html");

        BaseParser parser = new SimpleParser();
        ParsedDatum parse = parser.parse(makeFetchedDatum(path));
        Assert.assertNotNull(parse.getParsedText());

        // TODO - add back in title text to simple-page, when we generate this
        String expectedString = readFromFile("simple-page.txt");

        // compareTermsInStrings() ignores leading separators, so the parsed
        // text can be passed through unmodified.
        compareTermsInStrings(expectedString, parse.getParsedText());

        // TODO reenable when Tika bug is fixed re not emitting <img> links.
        // Outlink[] outlinks = parse.getOutlinks();
        // Assert.assertEquals(10, outlinks.length);

        Assert.assertEquals("TransPac Software", parse.getTitle());
    }

    /**
     * Caller-supplied extractors replace the default ones: the parsed text comes
     * from the custom content extractor while the title is still "Simple".
     */
    @SuppressWarnings("serial")
    @Test
    public void testCustomContentExtractor() throws Exception {
        String html = readFromFile("parser-files/simple-content.html");
        String url = "http://domain.com/simple-content.html";
        FetchedDatum fetchedDatum = makeHtmlDatum(url, url, makeHtmlHeaders(), html);

        SimpleParser parser = new SimpleParser(new BaseContentExtractor() {

            @Override
            public String getContent() {
                return "Custom";
            }
        }, new BaseLinkExtractor() {

            @Override
            public Outlink[] getLinks() {
                return new Outlink[0];
            }
        }, new ParserPolicy());

        ParsedDatum parsedDatum = parser.parse(fetchedDatum);

        Assert.assertEquals("Simple", parsedDatum.getTitle());
        compareTermsInStrings("Custom", parsedDatum.getParsedText());
    }

    /** No outlinks are expected from the meta-nofollow fixture. */
    @Test
    public void testLinkExtractorWithMetaTags() throws Exception {
        String html = readFromFile("parser-files/meta-nofollow.html");
        String url = "http://domain.com/meta-nofollow.html";
        FetchedDatum fetchedDatum = makeHtmlDatum(url, url, makeHtmlHeaders(), html);

        ParserPolicy policy = new ParserPolicy(Integer.MAX_VALUE);
        ParsedDatum parsedDatum = new SimpleParser(policy).parse(fetchedDatum);

        // Verify we got no URLs
        Assert.assertEquals(0, parsedDatum.getOutlinks().length);
    }

    /** The Content-Language response header is reported as the document language. */
    @Test
    public void testLanguageDetectionHttpHeader() throws Exception {
        String html = readFromFile("parser-files/simple-content.html");
        String url = "http://domain.com/simple-content.html";

        HttpHeaders headers = makeHtmlHeaders();
        headers.add(HttpHeaderNames.CONTENT_LANGUAGE, "en");
        FetchedDatum fetchedDatum = makeHtmlDatum(url, url, headers, html);

        ParsedDatum parsedDatum = new SimpleParser().parse(fetchedDatum);

        Assert.assertEquals("Simple", parsedDatum.getTitle());
        compareTermsInStrings("Simple Content", parsedDatum.getParsedText());
        Assert.assertEquals("en", parsedDatum.getLanguage());
    }

    /**
     * The language reported for the lang-dc fixture ("ja") must win over the
     * "en" Content-Language response header.
     */
    @Test
    public void testLanguageDetectionDublinCore() throws Exception {
        String html = readFromFile("parser-files/lang-dc.html");
        String url = "http://domain.com/lang-dc.html";

        HttpHeaders headers = makeHtmlHeaders();
        headers.add(HttpHeaderNames.CONTENT_LANGUAGE, "en");
        FetchedDatum fetchedDatum = makeHtmlDatum(url, url, headers, html);

        ParsedDatum parsedDatum = new SimpleParser().parse(fetchedDatum);

        Assert.assertEquals("DublinCore Language Example", parsedDatum.getTitle());
        compareTermsInStrings("DublinCore Language Example Content", parsedDatum.getParsedText());
        Assert.assertEquals("ja", parsedDatum.getLanguage());
    }

    /**
     * The language reported for the lang-http-equiv fixture ("ja") must win over
     * the "en" Content-Language response header.
     */
    @Test
    public void testLanguageDetectionHttpEquiv() throws Exception {
        String html = readFromFile("parser-files/lang-http-equiv.html");
        // URL now matches the fixture being parsed (was a copy of the lang-dc URL).
        String url = "http://domain.com/lang-http-equiv.html";

        HttpHeaders headers = makeHtmlHeaders();
        headers.add(HttpHeaderNames.CONTENT_LANGUAGE, "en");
        FetchedDatum fetchedDatum = makeHtmlDatum(url, url, headers, html);

        ParsedDatum parsedDatum = new SimpleParser().parse(fetchedDatum);

        Assert.assertEquals("SimpleHttpEquiv", parsedDatum.getTitle());
        compareTermsInStrings("SimpleHttpEquiv Content", parsedDatum.getParsedText());
        Assert.assertEquals("ja", parsedDatum.getLanguage());
    }

    /** The data attribute of an object tag is reported as an outlink. */
    @Test
    public void testExtractingObjectTag() throws Exception {
        final String html = "<html><head><title>Title</title></head>"
                + "<body><object data=\"http://domain.com/song.mid\" /></body></html>";
        String url = "http://domain.com/music.html";
        FetchedDatum fetchedDatum = makeHtmlDatum(url, url, makeHtmlHeaders(), html);

        ParserPolicy policy = new ParserPolicy(ParserPolicy.NO_MAX_PARSE_DURATION, BaseLinkExtractor.ALL_LINK_TAGS,
                BaseLinkExtractor.ALL_LINK_ATTRIBUTE_TYPES);
        SimpleParser parser = new SimpleParser(new SimpleContentExtractor(), new SimpleLinkExtractor(), policy,
                true);
        ParsedDatum parsedDatum = parser.parse(fetchedDatum);

        Outlink[] outlinks = parsedDatum.getOutlinks();
        Assert.assertEquals(1, outlinks.length);
        Assert.assertEquals("http://domain.com/song.mid", outlinks[0].getToUrl());
    }

    /**
     * With the parser's boolean flag set to true the parsed text is expected to
     * still be markup, which is re-parsed here to locate the paragraph.
     */
    @Test
    public void testHtmlWithTags() throws Exception {
        final String htmlText = "<html><head><title>Title</title></head>"
                + "<body><p>this is a test</p></body></html>";
        String url = "http://domain.com/page.html";
        FetchedDatum fetchedDatum = makeHtmlDatum(url, url, makeHtmlHeaders(), htmlText);

        SimpleParser parser = new SimpleParser(new ParserPolicy(), true);
        ParsedDatum parsedDatum = parser.parse(fetchedDatum);

        // Now take the resulting HTML, process it using Dom4J. Reading from a
        // character stream avoids any byte-encoding round trip, so no encoding
        // needs to be configured on the reader.
        SAXReader reader = new SAXReader(new Parser());
        String htmlWithMarkup = parsedDatum.getParsedText();
        Document doc = reader.read(new StringReader(htmlWithMarkup));

        // We have to do helicopter stunts since HTML has a global namespace on it, set
        // at the <html> element level.
        XPath xpath = DocumentHelper.createXPath("/xhtml:html/xhtml:body/xhtml:p");
        Map<String, String> namespaceUris = new HashMap<String, String>();
        namespaceUris.put("xhtml", "http://www.w3.org/1999/xhtml");
        xpath.setNamespaceURIs(namespaceUris);

        Node paragraphNode = xpath.selectSingleNode(doc);
        Assert.assertNotNull(paragraphNode);
        Assert.assertEquals("this is a test", paragraphNode.getText());
    }

    /**
     * Creates response headers containing only the HTML Content-Type entry.
     *
     * @return a new, independently modifiable header set
     */
    private static HttpHeaders makeHtmlHeaders() {
        HttpHeaders headers = new HttpHeaders();
        headers.add(HttpHeaderNames.CONTENT_TYPE, HTML_CONTENT_TYPE);
        return headers;
    }

    /**
     * Wraps HTML text in a FetchedDatum, encoding the text as UTF-8.
     *
     * @param url first URL argument passed to the FetchedDatum constructor
     * @param fetchedUrl second URL argument; the tests pass either the same
     *            value as {@code url} or a redirect target
     * @param headers fully populated response headers
     * @param html page content
     * @return datum stamped with the current time
     * @throws IOException if UTF-8 encoding is unavailable
     */
    private static FetchedDatum makeHtmlDatum(String url, String fetchedUrl, HttpHeaders headers, String html)
            throws IOException {
        ContentBytes content = new ContentBytes(html.getBytes("utf-8"));
        return new FetchedDatum(url, fetchedUrl, System.currentTimeMillis(), headers, content,
                HTML_CONTENT_TYPE, 0);
    }

    /**
     * Reads a classpath resource as UTF-8 text.
     *
     * @param filePath resource path relative to the classpath root (no leading slash)
     * @return the resource content
     * @throws IOException if the resource is missing or cannot be read
     */
    private static String readFromFile(String filePath) throws IOException {
        InputStream is = SimpleParserTest.class.getResourceAsStream("/" + filePath);
        if (is == null) {
            throw new IOException("Test resource not found on classpath: /" + filePath);
        }

        try {
            // Decode explicitly as UTF-8: callers re-encode the text as UTF-8, so
            // relying on the platform charset would corrupt non-ASCII fixtures.
            return IOUtils.toString(is, "UTF-8");
        } finally {
            is.close();
        }
    }

    /**
     * Builds a FetchedDatum from a file-based resource URL, with empty headers
     * and a "text/html" content type.
     *
     * @param path URL of a resource that lives on the file system
     * @return datum whose URLs are the external form of {@code path}
     * @throws IOException if the URL is null, not a file URL, or unreadable
     */
    private static FetchedDatum makeFetchedDatum(URL path) throws IOException {
        // FileUtils.toFile() decodes %xx escapes (e.g. spaces in the path),
        // which new File(path.getFile()) does not.
        File file = FileUtils.toFile(path);
        if (file == null) {
            throw new IOException("Resource is missing or not a file URL: " + path);
        }

        // Reads the whole file and closes the underlying stream.
        byte[] bytes = FileUtils.readFileToByteArray(file);

        String url = path.toExternalForm();
        return new FetchedDatum(url, url, System.currentTimeMillis(), new HttpHeaders(),
                new ContentBytes(bytes), "text/html", 0);
    }

    /**
     * Asserts that two strings contain the same sequence of terms, ignoring the
     * amount and kind of whitespace between, before and after the terms.
     *
     * @param expected text with the expected terms
     * @param actual text produced by the code under test
     */
    private static void compareTermsInStrings(String expected, String actual) {
        String[] expectedTerms = splitTerms(expected);
        String[] actualTerms = splitTerms(actual);

        // Compare term by term first so a mismatch reports the offending index
        // rather than just a length difference.
        int compLength = Math.min(expectedTerms.length, actualTerms.length);
        for (int i = 0; i < compLength; i++) {
            Assert.assertEquals("Term at index " + i, expectedTerms[i], actualTerms[i]);
        }

        Assert.assertEquals("Number of terms", expectedTerms.length, actualTerms.length);
    }

    /**
     * Splits text into terms, skipping leading and trailing separators.
     *
     * @param text text to split
     * @return the terms in order; empty if the text contains only separators
     */
    private static String[] splitTerms(String text) {
        // String.split() already drops trailing empty strings but would yield an
        // empty first term for leading separators, so strip those up front.
        String withoutLeading = text.replaceFirst("^" + TERM_SEPARATORS, "");
        if (withoutLeading.length() == 0) {
            return new String[0];
        }
        return withoutLeading.split(TERM_SEPARATORS);
    }

}