org.zilverline.extractors.TestHTMLExtractor.java Source code

Introduction

Here is the source code for org.zilverline.extractors.TestHTMLExtractor.java
Source

/*
 * Copyright 2003-2005 Michael Franken, Zilverline.
 *
 * The contents of this file, or the files included with this file, are subject to
 * the current version of ZILVERLINE Collaborative Source License for the
 * Zilverline Search Engine (the "License"); You may not use this file except in
 * compliance with the License.
 *
 * You may obtain a copy of the License at
 *
 *     http://www.zilverline.org.
 *
 * See the License for the rights, obligations and
 * limitations governing use of the contents of the file.
 *
 * The Original and Upgraded Code is the Zilverline Search Engine. The developer of
 * the Original and Upgraded Code is Michael Franken. Michael Franken owns the
 * copyrights in the portions it created. All Rights Reserved.
 *
 */

package org.zilverline.extractors;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;

import junit.framework.TestCase;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.springframework.util.StringUtils;

import org.zilverline.core.ParsedFileInfo;

/**
 * HTMLExtractorTest: JUnit tests for {@link HTMLExtractor}.
 *
 * <p>
 * The tests exercise extraction from a file, from an input stream and from an in-memory string. The file and stream based tests
 * read the fixture {@link #TEST_FILE}, which is resolved relative to the working directory of the test run.
 * </p>
 * 
 * @author Michael Franken
 */
public class TestHTMLExtractor extends TestCase {
    /** logger for Commons logging. */
    private static final Log log = LogFactory.getLog(TestHTMLExtractor.class);

    /**
     * Relative path of the HTML fixture shared by the file and stream based tests.
     *
     * <p>
     * Forward slashes are used on purpose: java.io accepts them on Windows as well, whereas a backslash is an ordinary file name
     * character on other platforms and would make the fixture impossible to find there.
     * </p>
     */
    private static final String TEST_FILE = "test/data/test.html";

    /**
     * Extracts the fixture through {@code extractInfo(File)} and checks type, size, title, summary and that content can be read
     * from the returned reader.
     */
    public void testExtractInfo() {
        HTMLExtractor hex = new HTMLExtractor();
        File file = new File(TEST_FILE);
        ParsedFileInfo pfi = hex.extractInfo(file);
        assertNotNull(pfi);
        assertEquals("HTML", pfi.getType());
        assertTrue(pfi.getSize() > 0);
        log.debug("Summary: " + pfi.getSummary());
        assertEquals("HTML title", pfi.getTitle());

        // Validate the summary before using its length to size the read buffer, so that a missing
        // summary is reported as an assertion failure rather than a NullPointerException.
        String summary = pfi.getSummary();
        assertNotNull(summary);
        assertTrue(summary.length() > 0);

        char[] text = new char[summary.length()];
        try {
            Reader reader = pfi.getReader();
            assertNotNull(reader);
            try {
                int count = reader.read(text);
                assertTrue(count > 0);
                // Log only the characters actually read; the remainder of the buffer is unset.
                log.debug("Content with length of summary: " + new String(text, 0, count));
            } finally {
                reader.close();
            }
        } catch (IOException e) {
            fail(e.toString());
        }
        assertTrue(StringUtils.countOccurrencesOf(summary, "test") > 0);
    }

    /**
     * Checks that {@code getContent(File)} returns a reader from which at least one character can be read.
     */
    public void testGetContentAsFile() {
        HTMLExtractor hex = new HTMLExtractor();
        File file = new File(TEST_FILE);
        Reader r = hex.getContent(file);
        assertNotNull(r);

        char[] text = new char[50];
        try {
            try {
                int count = r.read(text);
                assertTrue(count > 0);
                // Log only the characters actually read; the remainder of the buffer is unset.
                log.debug("Content with length of 50: " + new String(text, 0, count));
            } finally {
                r.close();
            }
        } catch (IOException e) {
            fail(e.toString());
        }
    }

    /**
     * Checks that {@code getContent(InputStream)} returns non-empty text for the fixture, ending with the expected sentence and
     * containing the word "test".
     */
    public void testGetContentAsInputStream() {
        try {
            HTMLExtractor tex = new HTMLExtractor();
            InputStream is = new FileInputStream(TEST_FILE);
            try {
                String txt = tex.getContent(is);
                assertNotNull(txt);
                log.debug(txt.trim());
                assertTrue(txt.length() > 0);
                assertTrue(txt.trim().endsWith("Dit is een test in een HTML document."));
                assertTrue(StringUtils.countOccurrencesOf(txt, "test") > 0);
            } finally {
                // Release the file handle whether or not the assertions above passed.
                is.close();
            }
        } catch (FileNotFoundException e) {
            fail(e.toString());
        } catch (IOException e) {
            fail(e.toString());
        }
    }

    /**
     * Checks that {@code getContent(String)} strips the markup from an in-memory HTML snippet: the result must start with
     * "here is text" and contain the word "test".
     */
    public void testGetContentAsString() {
        try {
            HTMLExtractor tex = new HTMLExtractor();
            String html = "<HTML><BODY>here is <B>text</B> to test</BODY></HTML>";
            String txt = tex.getContent(html);
            assertNotNull(txt);
            log.debug(txt);
            assertTrue(txt.length() > 0);
            assertTrue(txt.trim().startsWith("here is text"));
            assertTrue(StringUtils.countOccurrencesOf(txt, "test") > 0);
        } catch (Exception e) {
            fail(e.toString());
        }
    }
}