org.asqatasun.rules.elementselector.LinkElementSelector.java Source code

Java tutorial

Introduction

Here is the source code for org.asqatasun.rules.elementselector.LinkElementSelector.java

Source

/*
 *  Asqatasun - Automated webpage assessment
 * Copyright (C) 2008-2015  Asqatasun.org
 * 
 *  This file is part of Asqatasun.
 * 
 *  Asqatasun is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Affero General Public License as
 *  published by the Free Software Foundation, either version 3 of the
 *  License, or (at your option) any later version.
 * 
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Affero General Public License for more details.
 * 
 *  You should have received a copy of the GNU Affero General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 * 
 *  Contact us by mail: asqatasun AT asqatasun DOT org
 */

package org.asqatasun.rules.elementselector;

import java.util.Collection;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.asqatasun.processor.SSPHandler;
import org.asqatasun.ruleimplementation.ElementHandler;
import org.asqatasun.ruleimplementation.ElementHandlerImpl;
import static org.asqatasun.rules.keystore.AttributeStore.TITLE_ATTR;
import static org.asqatasun.rules.keystore.AttributeStore.ARIA_LABEL_ATTR;
import static org.asqatasun.rules.keystore.AttributeStore.ARIA_LABELLEDBY_ATTR;
import org.asqatasun.rules.keystore.CssLikeQueryStore;
import static org.asqatasun.rules.keystore.CssLikeQueryStore.TEXT_LINK_CSS_LIKE_QUERY;
import org.asqatasun.rules.keystore.HtmlElementStore;
import org.asqatasun.rules.textbuilder.LinkTextElementBuilder;
import org.springframework.util.CollectionUtils;

/**
 * Element selector implementation that selects text links.
 * <p>
 * Links that are not part of the scope (by default, links whose text is blank)
 * are dropped. When the context is considered, the remaining links are split
 * between the ones that have a context, exposed through
 * {@link #getNotDecidableElements()}, and the ones that have none, exposed
 * through {@link #getDecidableElements()}. When the context is not considered,
 * every retained link is exposed as decidable.
 * <p>
 * The two result handlers are instance state filled by each call to
 * {@code selectElements}; nothing in this class resets them between calls.
 *
 * @author jkowalczyk
 */
public class LinkElementSelector implements ElementSelector {

    /** 
     * The list of elements that are considered as context of the link. 
     * The presence of the td element in that list enables to deal with the case
     * where the context is handled by a table header as defined in the rule
     */
    private static final String[] PARENT_CONTEXT_ELEMENTS_TAB = { HtmlElementStore.P_ELEMENT,
            HtmlElementStore.H1_ELEMENT, HtmlElementStore.H2_ELEMENT, HtmlElementStore.H3_ELEMENT,
            HtmlElementStore.H4_ELEMENT, HtmlElementStore.H5_ELEMENT, HtmlElementStore.H6_ELEMENT,
            HtmlElementStore.LI_ELEMENT, HtmlElementStore.TD_ELEMENT, };

    /** The heading elements that provide a context when found as a preceding sibling. */
    private static final String[] PREV_SIBLING_CONTEXT_ELEMENTS_TAB = { HtmlElementStore.H1_ELEMENT,
            HtmlElementStore.H2_ELEMENT, HtmlElementStore.H3_ELEMENT, HtmlElementStore.H4_ELEMENT,
            HtmlElementStore.H5_ELEMENT, HtmlElementStore.H6_ELEMENT, };

    /** Collection view of {@link #PARENT_CONTEXT_ELEMENTS_TAB}, only used for lookups. */
    private static final Collection<?> PARENT_CONTEXT_ELEMENTS =
            CollectionUtils.arrayToList(PARENT_CONTEXT_ELEMENTS_TAB);

    /** Collection view of {@link #PREV_SIBLING_CONTEXT_ELEMENTS_TAB}, only used for lookups. */
    private static final Collection<?> PREV_SIBLING_CONTEXT_ELEMENTS =
            CollectionUtils.arrayToList(PREV_SIBLING_CONTEXT_ELEMENTS_TAB);

    /** The links for which no context has been found (or all links when the context is not considered). */
    private final ElementHandler<Element> decidableElements = new ElementHandlerImpl();

    /**
     * @return the handler holding the links without context, or every retained
     * link when the context is not considered
     */
    public ElementHandler<Element> getDecidableElements() {
        return decidableElements;
    }

    /** The links for which a context has been found. */
    private final ElementHandler<Element> notDecidableElements = new ElementHandlerImpl();

    /**
     * @return the handler holding the links that have a context
     */
    public ElementHandler<Element> getNotDecidableElements() {
        return notDecidableElements;
    }

    /* 
     * does the selection split results between the one that have a context 
     * and the one that have not
     */
    private final boolean considerContext;

    /**
     * @return whether the selection splits the links between the ones that
     * have a context and the ones that have not
     */
    public boolean considerContext() {
        return considerContext;
    }

    /* 
     * is a title attribute that differs from the link text (case-insensitive
     * comparison) considered as a context of the link
     */
    private final boolean considerTitleAsContext;

    /**
     * @return whether a title attribute that differs from the link text is
     * considered as a context of the link
     */
    public boolean considerTitleAsContext() {
        return considerTitleAsContext;
    }

    /* The element builder needed to build the link text */
    private final LinkTextElementBuilder linkTextElementBuilder = new LinkTextElementBuilder();

    /**
     * Constructor. The title attribute is considered as a context.
     * 
     * @param considerContext whether the links have to be split regarding
     * the presence of a context
     */
    public LinkElementSelector(boolean considerContext) {
        this(true, considerContext);
    }

    /**
     * Constructor
     * @param considerTitleAsContext whether a title attribute that differs
     * from the link text is considered as a context
     * @param considerContext whether the links have to be split regarding
     * the presence of a context
     */
    public LinkElementSelector(boolean considerTitleAsContext, boolean considerContext) {
        this.considerContext = considerContext;
        this.considerTitleAsContext = considerTitleAsContext;
    }

    /**
     * 
     * @return the css-like query used to perform the initial selection.
     * Can be overridden to select another kind of links.
     */
    protected String getCssLikeQuery() {
        return TEXT_LINK_CSS_LIKE_QUERY;
    }

    /**
     * Selects the links of the page, characterises each of them and stores the
     * result in the two local handlers.
     * 
     * @param sspHandler the handler used to perform the css-like selection
     * @param elementHandler when not null, receives the not decidable elements
     * followed by the decidable ones; may be null
     */
    @Override
    public void selectElements(SSPHandler sspHandler, ElementHandler<Element> elementHandler) {
        // the selection is handled by the two local collections; the
        // elementHandler, when provided, only receives a copy of their content
        Elements elements = sspHandler.beginCssLikeSelection().domCssLikeSelectNodeSet(getCssLikeQuery())
                .getSelectedElements();
        characteriseElements(elements);
        if (elementHandler != null) {
            elementHandler.addAll(notDecidableElements.get());
            elementHandler.addAll(decidableElements.get());
        }
    }

    /**
     * Expose the selectElement without ElementHandler argument to delegate 
     * the null value usage responsibility to the current class
     * @param sspHandler the handler used to perform the css-like selection
     */
    public void selectElements(SSPHandler sspHandler) {
        this.selectElements(sspHandler, null);
    }

    /**
     * @return whether both the decidable and the not decidable handlers are empty
     */
    public boolean isEmpty() {
        return notDecidableElements.isEmpty() && decidableElements.isEmpty();
    }

    /**
     * Characterise each element of the selection.
     * 
     * @param elements the selected links
     */
    protected void characteriseElements(Elements elements) {
        for (Element el : elements) {
            characteriseElement(el);
        }
    }

    /**
     * Drop the element when it is not part of the scope, otherwise store it
     * either as not decidable (the context is considered and the link has one)
     * or as decidable (any other case).
     * 
     * @param element the link to characterise
     */
    protected void characteriseElement(Element element) {
        String linkText = getLinkText(element);
        if (!isLinkPartOfTheScope(element, linkText)) {
            return;
        }
        if (considerContext && doesLinkHaveContext(element, linkText)) {
            notDecidableElements.add(element);
        } else {
            decidableElements.add(element);
        }
    }

    /**
    * 
    * @param linkElement
    * @return the link text
    */
    protected String getLinkText(Element linkElement) {
        return linkTextElementBuilder.buildTextFromElement(linkElement);
    }

    /**
     * 
     * @param linkElement
     * @param linkText
     * @return whether the link is part of the scope, i.e. the link text is not 
     * blank
     */
    protected boolean isLinkPartOfTheScope(Element linkElement, String linkText) {
        return StringUtils.isNotBlank(linkText);
    }

    /**
     * A link is seen as having a context when one of the following is true,
     * checked in that order:
     * <ul>
     * <li>the title is considered as context, and the link has a title
     * attribute that differs from the link text (ignoring case)</li>
     * <li>the link has a non-blank aria-label or aria-labelledby attribute</li>
     * <li>the direct parent of the link has some text of its own</li>
     * <li>the link, or one of its ancestors, is preceded by a sibling of
     * heading type</li>
     * <li>one of the ancestors of the link belongs to the parent context
     * elements</li>
     * </ul>
     * 
     * @param linkElement
     * @param linkText
     * @return whether the current link have a context
     */
    protected boolean doesLinkHaveContext(Element linkElement, String linkText) {
        // does the current link have a title attribute? 
        if (considerTitleAsContext && linkElement.hasAttr(TITLE_ATTR)
                && !StringUtils.equalsIgnoreCase(linkElement.attr(TITLE_ATTR), linkText)) {
            return true;
        }
        if (hasNotBlankAttribute(linkElement, ARIA_LABEL_ATTR)
                || hasNotBlankAttribute(linkElement, ARIA_LABELLEDBY_ATTR)) {
            return true;
        }
        // does the parent of the current link have some text? The parent is
        // null for a detached element, in which case no text can be found
        Element directParent = linkElement.parent();
        if (directParent != null && StringUtils.isNotBlank(directParent.ownText())) {
            return true;
        }
        // does the current element have a previous sibling of heading type?
        if (isOneOfPrecedingSiblingofHeadingType(linkElement)) {
            return true;
        }
        // does one of the parent of the current element have a previous sibling 
        // of heading type or is found in the PARENT_CONTEXT_ELEMENTS list?
        for (Element parent : linkElement.parents()) {
            if (PARENT_CONTEXT_ELEMENTS.contains(parent.tagName())
                    || isOneOfPrecedingSiblingofHeadingType(parent)) {
                return true;
            }
        }
        return false;
    }

    /**
     * 
     * @param element
     * @param attributeName
     * @return whether the element has the given attribute with a non-blank value
     */
    private boolean hasNotBlankAttribute(Element element, String attributeName) {
        return element.hasAttr(attributeName) && StringUtils.isNotBlank(element.attr(attributeName));
    }

    /**
     * Walk back through the preceding element siblings of the given element.
     * 
     * @param element
     * @return whether one of the preceding sibling is of heading type, or 
     * matches at least one element with the headings css-like query
     */
    private boolean isOneOfPrecedingSiblingofHeadingType(Element element) {
        Element prevElementSibling = element.previousElementSibling();
        while (prevElementSibling != null) {
            if (PREV_SIBLING_CONTEXT_ELEMENTS.contains(prevElementSibling.tagName())
                    || !prevElementSibling.select(CssLikeQueryStore.HEADINGS_CSS_LIKE_QUERY).isEmpty()) {
                return true;
            }
            prevElementSibling = prevElementSibling.previousElementSibling();
        }
        return false;
    }

}