util.StripHTMLTags.java Source code

Java tutorial

Introduction

Here is the source code for util.StripHTMLTags.java

Source

/**
* Copyright (c) 2001-2012 "Redbasin Networks, INC" [http://redbasin.org]
*
* This file is part of Redbasin OpenDocShare community project.
*
* Redbasin OpenDocShare is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package util;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import java.util.regex.Pattern;
import java.util.regex.Matcher;

/**
 * Strips HTML markup from text, optionally preserving a caller-supplied set of tags.
 *
 * <p>Matching is purely regex based: anything between a {@code <} and the next
 * {@code >} is treated as a tag. This class does not parse HTML.
 *
 * <p>NOTE(review): tags that are preserved keep all of their attributes (including
 * event-handler attributes), so the output must not be treated as sanitized HTML
 * when the input is untrusted.
 *
 * @author Smitha Gudur (smitha@redbasin.com)
 * @version $Revision: 1.1 $
 */
public class StripHTMLTags {

    /** Logger for this class and subclasses */
    protected final Log logger = LogFactory.getLog(getClass());

    /** Exclusion list that preserves paragraph and line-break tags. */
    public final String[] LINEBREAK_ELEMENTS = { "p", "br" };
    /** Exclusion list that preserves paragraph, line-break and bold tags. */
    public final String[] LINEBREAK_BOLD = { "p", "br", "b" };
    /** Exclusion list containing only the empty tag name, i.e. no named tag is preserved. */
    public final String[] NO_HTML = { "" };

    /** Matches any candidate tag: a '<', then the shortest run of characters up to a '>'. */
    private static final Pattern ANY_TAG = Pattern.compile("<.*?>", Pattern.DOTALL);

    /**
     * Strips all tags from the body except those named in the tag array.
     *
     * <p>Complete {@code <style>...</style>} and {@code <script>...</script>} sections
     * are removed first, including their content. Every remaining tag is then removed
     * unless it is an opening, closing or self-closing form of one of the given tag
     * names (with or without attributes). Tag names are compared case-insensitively.
     *
     * <p>Each tag name is spliced unquoted into a regular expression, so regex
     * metacharacters in a name are interpreted as regex syntax.
     *
     * @param body The body or text to strip; may be {@code null}
     * @param tags The tag names to exclude from stripping; {@code null} or an empty
     *             array strips every tag
     * @return String the body with the tags stripped, or {@code null} if body was {@code null}
     */
    public String stripTags(String body, String[] tags) {
        if (null == body) {
            return body;
        }
        Pattern[] tagPatterns = compileTagPatterns(tags);

        StringBuffer bodyStr = new StringBuffer(body);
        bodyStr = stripCSS(bodyStr);
        bodyStr = stripJS(bodyStr);

        // Single pass over an immutable snapshot: text between candidate tags never
        // contains a '<' that could start another match, so removing a tag cannot
        // create a new one and there is no need to rescan from the beginning.
        String text = bodyStr.toString();
        StringBuffer result = new StringBuffer(text.length());
        boolean debug = logger.isDebugEnabled();
        int copiedUpTo = 0;
        Matcher matcher = ANY_TAG.matcher(text);
        while (matcher.find()) {
            String candidate = matcher.group();
            if (debug) {
                logger.debug("Match: " + candidate);
            }
            // Copy the plain text that precedes this candidate tag.
            result.append(text.substring(copiedUpTo, matcher.start()));
            if (isAllowedTag(candidate, tagPatterns, debug)) {
                result.append(candidate);
            } else if (debug) {
                logger.debug("Substituting Match");
            }
            copiedUpTo = matcher.end();
        }
        result.append(text.substring(copiedUpTo));
        return result.toString();
    }

    /**
     * Builds one pattern per tag name. Each pattern accepts the opening, closing and
     * self-closing forms of the tag, with optional attributes after whitespace.
     *
     * @param tags the tag names, may be {@code null}
     * @return the compiled patterns; empty when tags is {@code null}
     */
    private Pattern[] compileTagPatterns(String[] tags) {
        if (tags == null) {
            return new Pattern[0];
        }
        Pattern[] compiled = new Pattern[tags.length];
        for (int i = 0; i < tags.length; i++) {
            // After the name: either an optional slash and '>', or whitespace followed
            // by anything (attributes) up to the closing '>'. Requiring one of these
            // keeps "p" from matching a longer name such as "pre".
            compiled[i] = Pattern.compile("<\\s*/?\\s*" + tags[i] + "(\\s*/?\\s*>|\\s.*?>)",
                    Pattern.DOTALL | Pattern.CASE_INSENSITIVE);
        }
        return compiled;
    }

    /**
     * Tells whether a candidate tag fully matches any of the allowed tag patterns.
     *
     * @param candidate the complete tag text, from '<' through '>'
     * @param tagPatterns the allowed tag patterns
     * @param debug whether to log each pattern that is tried
     * @return true if the candidate must be preserved
     */
    private boolean isAllowedTag(String candidate, Pattern[] tagPatterns, boolean debug) {
        for (int i = 0; i < tagPatterns.length; i++) {
            if (debug) {
                logger.debug("Pattern: " + tagPatterns[i].pattern());
            }
            if (tagPatterns[i].matcher(candidate).matches()) {
                return true;
            }
        }
        return false;
    }

    /**
     * Strips every occurrence of the given pattern from the body, in place.
     *
     * <p>The pattern is compiled with {@link Pattern#DOTALL}. Removal is done in one
     * pass over a snapshot of the buffer; deleting from the buffer while a matcher is
     * still iterating over it would leave the matcher with stale offsets.
     *
     * @param body the body to strip; it is modified and returned
     * @param pattern the regular expression to remove
     * @return StringBuffer the same buffer instance, with all matches removed
     */
    StringBuffer stripPattern(StringBuffer body, String pattern) {
        Pattern mypattern = Pattern.compile(pattern, Pattern.DOTALL);
        String stripped = mypattern.matcher(body.toString()).replaceAll("");
        body.setLength(0);
        body.append(stripped);
        return body;
    }

    /**
     * Strips every complete CSS section, from an opening {@code <style} through the
     * next {@code style>}, matched case-insensitively.
     *
     * @param body The body to strip.
     * @return StringBuffer returns the stripped body.
     */
    StringBuffer stripCSS(StringBuffer body) {
        return stripPattern(body, "<[sS][tT][yY][lL][eE].*?[sS][tT][yY][lL][eE]>");
    }

    /**
     * Strips every complete javascript section, from an opening {@code <script}
     * through the next {@code script>}, matched case-insensitively.
     *
     * @param body The body to strip.
     * @return StringBuffer returns the stripped body.
     */
    StringBuffer stripJS(StringBuffer body) {
        return stripPattern(body, "<[sS][cC][rR][iI][pP][tT].*?[sS][cC][rR][iI][pP][tT]>");
    }
}