org.cee.webreader.client.util.ContentHtmlSanitizer.java Source code

Java tutorial

Introduction

Here is the source code for org.cee.webreader.client.util.ContentHtmlSanitizer.java

Source

package org.cee.webreader.client.util;

/*
 * #%L
 * News Reader
 * %%
 * Copyright (C) 2013 Andreas Behnke
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import com.google.gwt.safehtml.shared.HtmlSanitizer;
import com.google.gwt.safehtml.shared.SafeHtml;
import com.google.gwt.safehtml.shared.SafeHtmlUtils;

/**
 * HTML Sanitizer supporting the following tags and attributes:
 * 
 * Supported Tags - Supported Attributes:
 * 
 * b - class 
 * em - class 
 * i - class 
 * hr - class 
 * ul - class 
 * ol - class 
 * li - class 
 * p - class 
 * h[1-6] - class
 * a - href, target, class 
 * img - src, class
 * 
 * @author andreasbehnke
 */
public final class ContentHtmlSanitizer implements HtmlSanitizer {

    private static final Set<String> TAG_WHITELIST = new HashSet<String>(Arrays.asList("b", "em", "i", "h1", "h2",
            "h3", "h4", "h5", "h6", "hr", "ul", "ol", "li", "p", "a", "img", "br"));

    private static final String[] DEFAULT_ATTRIBUTES = new String[] { "class" };

    private static final String[] A_ATTRIBUTES = new String[] { "class", "href", "target" };

    private static final String[] IMG_ATTRIBUTES = new String[] { "class", "src" };

    private final boolean escapeIllegalTags;

    public ContentHtmlSanitizer(boolean escapeIllegalTags) {
        this.escapeIllegalTags = escapeIllegalTags;
    }

    @Override
    public SafeHtml sanitize(String html) {
        StringBuilder sanitized = new StringBuilder();

        boolean firstSegment = true;
        for (String segment : html.split("<", -1)) {
            if (firstSegment) {
                firstSegment = false;
                sanitized.append(SafeHtmlUtils.htmlEscapeAllowEntities(segment));
                continue;
            }

            int tagStart = 0; // will be 1 if this turns out to be an end tag.
            int tagEnd = segment.indexOf('>');
            String tag = null;
            String tagName = null;
            String tagAttributes = null;
            boolean isValidTag = false;
            if (tagEnd > 0) {
                if (segment.charAt(0) == '/') {
                    tagStart = 1;
                }
                tag = segment.substring(tagStart, tagEnd);
                int index = 0;
                int tagLength = tag.length();
                //find tag name
                while (index <= tagLength) {
                    if (index == tagLength || tag.charAt(index) == ' ' || tag.charAt(index) == '/') {
                        tagName = tag.substring(0, index).toLowerCase();
                        break;
                    }
                    index++;
                }
                if (tagName != null && TAG_WHITELIST.contains(tagName)) {
                    isValidTag = true;
                    tagAttributes = tag.substring(index, tag.length());
                }
            }

            if (isValidTag) {
                // append the tag, not escaping it
                if (tagStart == 0) {
                    sanitized.append('<').append(tagName);
                    String[] validAttributes = null;
                    if (tagName.equalsIgnoreCase("A")) {
                        validAttributes = A_ATTRIBUTES;
                    } else if (tagName.equalsIgnoreCase("IMG")) {
                        validAttributes = IMG_ATTRIBUTES;
                    } else {
                        validAttributes = DEFAULT_ATTRIBUTES;
                    }
                    appendAttributes(sanitized, tagAttributes, Arrays.asList(validAttributes));

                } else {
                    // we had seen an end-tag
                    sanitized.append("</").append(tagName);
                }
                sanitized.append('>').append(SafeHtmlUtils.htmlEscapeAllowEntities(segment.substring(tagEnd + 1)));
            } else {
                if (escapeIllegalTags) {
                    // just escape the whole segment
                    sanitized.append("&lt;").append(SafeHtmlUtils.htmlEscapeAllowEntities(segment));
                } else {
                    sanitized.append(segment.substring(tagEnd + 1));
                }
            }
        }

        return new SafeContentString(sanitized.toString());
    }

    protected void appendAttributes(StringBuilder sanitized, String segment, List<String> validAttributes) {
        int lastSpace = -1;
        int nameStart = 0;
        int valueStart = -1;
        int valueEnd = -1;
        String attributeName = null;
        String attributeValue = null;
        for (int index = 0; index < segment.length(); index++) {
            if (segment.charAt(index) == ' ') {
                lastSpace = index;
            } else if (segment.charAt(index) == '=') {
                //found attribute
                attributeName = segment.substring(nameStart, index).trim().toLowerCase();
                if (validAttributes.contains(attributeName)) {
                    //found valid attribute, extract attribute value
                    valueStart = -1;
                    valueEnd = -1;
                    while (index < segment.length()) {
                        index++;
                        if (segment.charAt(index) == '\'' || segment.charAt(index) == '"') {
                            valueStart = index + 1;
                            break;
                        }
                    }
                    while (index < segment.length()) {
                        index++;
                        if (segment.charAt(index) == '\'' || segment.charAt(index) == '"') {
                            valueEnd = index;
                            break;
                        }
                    }
                    if (valueStart > -1 && valueEnd > -1) {
                        attributeValue = segment.substring(valueStart, valueEnd);
                        sanitized.append(' ').append(attributeName).append("=\"").append(attributeValue)
                                .append("\"");
                    }
                }
            } else if (lastSpace == index - 1) {
                nameStart = index;
            }
        }
    }
}