disko.HTMLDocument.java Source code

Introduction

Here is the source code for disko.HTMLDocument.java
Source

/*******************************************************************************
 * Copyright (c) 2005, Kobrix Software, Inc.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Lesser Public License v2.1
 * which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/old-licenses/gpl-2.0.html
 * 
 * Contributors:
 *     Borislav Iordanov - initial API and implementation
 *     Murilo Saraiva de Queiroz - initial API and implementation
 ******************************************************************************/
package disko;

import java.io.File;
import java.lang.ref.WeakReference;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import disko.utils.IntervalTree;

import relex.corpus.TextInterval;

import au.id.jericho.lib.html.Attribute;
import au.id.jericho.lib.html.Attributes;
import au.id.jericho.lib.html.CharacterReference;
import au.id.jericho.lib.html.Element;
import au.id.jericho.lib.html.HTMLElementName;
import au.id.jericho.lib.html.Source;
import au.id.jericho.lib.html.StartTagType;
import au.id.jericho.lib.html.Tag;
import au.id.jericho.lib.html.TextExtractor;

/**
 * A {@link UrlTextDocument} whose content is HTML. The markup is parsed with
 * the Jericho HTML parser and turned into annotations: one {@link MarkupAnn}
 * per element whose tag is in {@link #getDesiredTags()}, and one
 * {@link ParagraphAnn} per paragraph-delimiting element that has non-empty
 * text.
 *
 * <p>
 * The unchecked-warning suppression is kept at class level because Jericho's
 * <code>findAllTags()</code> and <code>Attributes.iterator()</code> are
 * consumed through raw collection types below.
 * </p>
 */
@SuppressWarnings("unchecked")
public class HTMLDocument extends UrlTextDocument {
    public static final String PAGE_ANN = "page";
    private static final Log log = LogFactory.getLog(HTMLDocument.class);

    // NOTE(review): these default sets are public and mutable; every new
    // instance copies whatever they contain at construction time. Left mutable
    // because callers may rely on that.
    public static final Set<String> DEFAULT_TAGS;
    public static final Set<String> DEFAULT_ATTRIBUTES;
    public static final Set<String> DEFAULT_PARAGRAPH_DELIMITING_TAGS;

    static {
        DEFAULT_TAGS = new HashSet<String>();
        DEFAULT_TAGS.add(HTMLElementName.HTML);
        DEFAULT_TAGS.add(HTMLElementName.HEAD);
        DEFAULT_TAGS.add(HTMLElementName.TITLE);
        DEFAULT_TAGS.add(HTMLElementName.META);
        DEFAULT_TAGS.add(HTMLElementName.BODY);
        DEFAULT_TAGS.add(HTMLElementName.UL);
        DEFAULT_TAGS.add(HTMLElementName.OL);
        DEFAULT_TAGS.add(HTMLElementName.LI);
        DEFAULT_TAGS.add(HTMLElementName.TABLE);
        DEFAULT_TAGS.add(HTMLElementName.TH);
        DEFAULT_TAGS.add(HTMLElementName.TR);
        DEFAULT_TAGS.add(HTMLElementName.TD);
        DEFAULT_TAGS.add(HTMLElementName.DIV);
        DEFAULT_TAGS.add(HTMLElementName.SPAN);
        DEFAULT_TAGS.add(HTMLElementName.P);

        DEFAULT_ATTRIBUTES = new HashSet<String>();
        DEFAULT_ATTRIBUTES.add("class");

        DEFAULT_PARAGRAPH_DELIMITING_TAGS = new HashSet<String>();
        DEFAULT_PARAGRAPH_DELIMITING_TAGS.add(HTMLElementName.P);
        DEFAULT_PARAGRAPH_DELIMITING_TAGS.add(HTMLElementName.LI);
        DEFAULT_PARAGRAPH_DELIMITING_TAGS.add(HTMLElementName.TD);
    }

    /** HTML supplied directly through {@link #HTMLDocument(String)}; null otherwise. */
    private String htmlText = null;

    /**
     * When true, only paragraph-delimiting elements reported as leaves by the
     * interval tree produce paragraph annotations. No setter is visible in
     * this class, so the value stays at its default here.
     */
    private boolean ignoreEnclosingParagraphs = true;

    private Set<String> desiredTags = new HashSet<String>(DEFAULT_TAGS);
    private Set<String> desiredAttributes = new HashSet<String>(DEFAULT_ATTRIBUTES);
    private Set<String> paragraphDelimitingTags = new HashSet<String>(DEFAULT_PARAGRAPH_DELIMITING_TAGS);

    public HTMLDocument() {
    }

    /**
     * Creates a document over HTML text that is already in memory;
     * {@link #load()} then uses this text instead of calling the superclass
     * loader.
     *
     * @param htmlText the HTML content of the document
     */
    public HTMLDocument(String htmlText) {
        this.htmlText = htmlText;
    }

    public HTMLDocument(URL url) {
        super(url);
    }

    public HTMLDocument(File f) {
        super(f);
    }

    /**
     * Builds a paragraph annotation spanning the whole element.
     *
     * @param element the paragraph-delimiting element
     * @return the annotation, or null when the element's extracted text is
     *         empty after trimming
     */
    private Ann getParagraphAnn(Element element) {
        // NOTE(review): if TextExtractor already decodes character references,
        // the extra decode below decodes twice (e.g. "&amp;lt;" would end up
        // as "<") - confirm against the Jericho documentation before changing.
        String clean = CharacterReference.decode(new TextExtractor(element).toString()).trim();
        if (clean.length() == 0) {
            return null;
        }
        // Unicode punctuation is replaced because the link parser cannot deal
        // with those characters yet.
        return new ParagraphAnn(element.getBegin(), element.getEnd(), DU.replaceUnicodePunctuation(clean));
    }

    /**
     * Extracts the text content of an HTML fragment using Jericho's text
     * extractor, with Jericho's logging disabled as elsewhere in this class.
     */
    private static String extractText(String markup) {
        Source fragment = new Source(markup);
        fragment.setLogger(null);
        return fragment.getTextExtractor().toString();
    }

    /**
     * Creates the markup annotation for an element, copying over those of its
     * attributes whose lower-cased name is in {@link #getDesiredAttributes()}.
     */
    private MarkupAnn createMarkupAnn(Tag tag, Element element) {
        MarkupAnn markupAnn = new MarkupAnn(element.getBegin(), element.getEnd(), tag.getName());
        Attributes attributes = element.getAttributes();
        for (Iterator it = attributes.iterator(); it.hasNext();) {
            Attribute attribute = (Attribute) it.next();
            // Locale-independent lower-casing: with the default locale a
            // Turkish JVM maps 'I' to a dotless 'i' and "CLASS" would no
            // longer match "class".
            String name = attribute.getName().toLowerCase(java.util.Locale.ENGLISH);
            String value = attribute.getValue();
            if (desiredAttributes.contains(name)) {
                if (log.isDebugEnabled()) {
                    log.debug("Found attribute " + name + ":" + value);
                }
                markupAnn.getAttributes().put(name, value);
            }
        }
        return markupAnn;
    }

    /**
     * Adds a paragraph annotation for every leaf interval of the tree that
     * maps to exactly one element. Intervals with no element are skipped and
     * intervals with several elements are logged as overlaps.
     */
    private void addLeafParagraphs(IntervalTree<Element> paragraphTree, List<Ann> result) {
        Set<TextInterval> visited = new HashSet<TextInterval>();
        for (Iterator<TextInterval> ti = paragraphTree.leafs(); ti.hasNext();) {
            TextInterval current = ti.next();
            if (!visited.add(current)) {
                continue; // this interval was already handled
            }
            List<Element> elements = paragraphTree.get(current);
            if (elements.size() == 1) {
                Ann a = getParagraphAnn(elements.get(0));
                if (a != null) {
                    result.add(a);
                }
            } else if (!elements.isEmpty()) {
                log.error("Paragraph overlap at " + current + " in document " + getUrl());
            }
        }
    }

    /**
     * Parses the given text as HTML and produces its annotations.
     *
     * <p>
     * If no tag is found, the whole text is returned as a single
     * {@link ParagraphAnn}. Otherwise every normal start tag whose name is in
     * {@link #getDesiredTags()} yields a {@link MarkupAnn} spanning its
     * element (start tag through end tag), and elements whose tag is in
     * {@link #getParagraphDelimitingTags()} additionally yield paragraph
     * annotations.
     * </p>
     *
     * @param htmlText the text to annotate
     * @return the annotations found, in a new list
     */
    public List<Ann> htmlAnnotate(String htmlText) {
        Source source = new Source(htmlText);
        source.setLogger(null);
        List allTags = source.findAllTags();
        // Named "result" so it does not shadow the inherited annotations
        // collection used by load().
        List<Ann> result = new ArrayList<Ann>();

        if (allTags.isEmpty()) {
            // Not HTML text: treat it as a single paragraph.
            result.add(new ParagraphAnn(0, htmlText.length(), htmlText));
            return result;
        }

        IntervalTree<Element> paragraphTree = new IntervalTree<Element>(new TextInterval(0, htmlText.length()));

        for (Tag tag : (Iterable<Tag>) allTags) {
            if (tag.getTagType() != StartTagType.NORMAL || !desiredTags.contains(tag.getName())) {
                continue;
            }
            Element element = tag.getElement();
            result.add(createMarkupAnn(tag, element));

            if (!paragraphDelimitingTags.contains(tag.getName())) {
                continue;
            }
            if (ignoreEnclosingParagraphs) {
                // Deferred: paragraphs are emitted from the tree's leaves
                // once all elements have been registered.
                paragraphTree.add(new TextInterval(element.getBegin(), element.getEnd()), element);
            } else {
                Ann a = getParagraphAnn(element);
                if (a != null) {
                    result.add(a);
                }
            }
        }

        if (ignoreEnclosingParagraphs) {
            addLeafParagraphs(paragraphTree, result);
        }
        return result;
    }

    /**
     * Loads the document text, annotates it and records the title.
     *
     * <p>
     * The text is the string given to {@link #HTMLDocument(String)} if there
     * was one, otherwise whatever the superclass loader returns. The title is
     * set to the text content of the TITLE element, without the surrounding
     * tags.
     * </p>
     *
     * @return the raw (HTML) text of the document
     */
    public synchronized String load() {
        String rawText = htmlText == null ? super.load() : htmlText;
        // NOTE(review): every call appends to the inherited annotations
        // collection; unless the superclass clears it, a repeated load() would
        // duplicate annotations - verify against UrlTextDocument.
        annotations.addAll(htmlAnnotate(rawText));
        for (Ann a : annotations) {
            if (a instanceof MarkupAnn && HTMLElementName.TITLE.equals(((MarkupAnn) a).getTag())) {
                // The annotation interval covers the whole element, tags
                // included, so extract the text rather than using the raw
                // substring as the title.
                String titleMarkup = rawText.substring(a.getInterval().getStart(), a.getInterval().getEnd());
                setTitle(extractText(titleMarkup).trim());
            }
        }
        fullText = new WeakReference<String>(rawText);
        return rawText;
    }

    /**
     * Extracts all plain text from this document's body.
     *
     * <p>
     * As a side effect, the document title is set from a TITLE element if one
     * is encountered before the BODY element.
     * </p>
     *
     * @return the extracted text of the first BODY element, or of the whole
     *         document when there is no BODY element
     */
    public String getPlainText() {
        Source source = new Source(this.getFullText());
        source.setLogger(null);
        List<Tag> allTags = source.findAllTags();
        for (Tag tag : allTags) {
            if (tag.getTagType() != StartTagType.NORMAL) {
                continue;
            }
            // equals() rather than ==, consistent with load() and independent
            // of whether the parser hands back the constant instances.
            if (HTMLElementName.TITLE.equals(tag.getName())) {
                setTitle(tag.getElement().getTextExtractor().toString());
            }
            if (HTMLElementName.BODY.equals(tag.getName())) {
                return tag.getElement().getTextExtractor().toString();
            }
        }
        return source.getTextExtractor().toString();
    }

    /**
     * @return the live, mutable set of tag names that produce markup
     *         annotations
     */
    public Set<String> getDesiredTags() {
        return desiredTags;
    }

    /**
     * @return the live, mutable set of (lower-case) attribute names copied
     *         onto markup annotations
     */
    public Set<String> getDesiredAttributes() {
        return desiredAttributes;
    }

    /**
     * @return the live, mutable set of tag names whose elements are treated
     *         as paragraphs
     */
    public Set<String> getParagraphDelimitingTags() {
        return paragraphDelimitingTags;
    }
}