Here you can find the source of removeTag(String text)
public static final String removeTag(String text)
//package com.java2s;
/**
 *
 * Copyright (C) 2009-2013 Emmanuel Keller / Jaeksoft
 *
 * http://www.open-search-server.com
 *
 * This file is part of OpenSearchServer.
 *
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with OpenSearchServer.
 * If not, see <http://www.gnu.org/licenses/>.
 **/

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Static helpers that strip HTML-like markup from a string and collapse runs
 * of whitespace.
 *
 * <p>All patterns are compiled once and shared. {@link Pattern} instances are
 * immutable and safe for concurrent use, so no locking is performed; each call
 * creates its own {@link Matcher}.
 */
public class Main {

    /**
     * Alternation of the element names whose closing tag is turned into a
     * sentence boundary (". ") by {@link #removeTag(String)}.
     */
    private static final String BLOCK_TAG_NAMES =
            "p|td|div|h1|h2|h3|h4|h5|h6|hr|li|option|pre|select|table|tbody"
                    + "|textarea|tfoot|thead|th|title|tr|ul";

    /** Any tag-like span: '<', anything but '>', then '>'. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]*>");

    /**
     * A period, optional whitespace, then a {@code <br>}. The whitespace is
     * optional so that "text.<br>" is handled the same way as "text. <br>".
     */
    private static final Pattern DOT_BR_PATTERN = Pattern.compile(
            "\\.\\p{Space}*<br\\p{Space}*/?>", Pattern.CASE_INSENSITIVE);

    /** A period, optional whitespace, then a block-level closing tag. */
    private static final Pattern DOT_BLOCK_END_TAG_PATTERN = Pattern.compile(
            "\\.\\p{Space}*</(" + BLOCK_TAG_NAMES + ")>", Pattern.CASE_INSENSITIVE);

    /** A block-level closing tag. */
    private static final Pattern BLOCK_END_TAG_PATTERN = Pattern.compile(
            "</(" + BLOCK_TAG_NAMES + ")>", Pattern.CASE_INSENSITIVE);

    /** A {@code <br>} or {@code <br/>} tag. */
    private static final Pattern BR_PATTERN = Pattern.compile(
            "<br\\p{Space}*/?>", Pattern.CASE_INSENSITIVE);

    /**
     * A whole script, object or style element including its content. The body
     * is matched lazily up to the closing tag with the same name as the opening
     * one (back-reference), so content containing '<' (e.g. "a < b" in a
     * script) is removed as well.
     */
    private static final Pattern SCRIPT_OBJECT_STYLE_PATTERN = Pattern.compile(
            "<(script|object|style)\\b[^>]*>.*?</\\1\\p{Space}*>",
            Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /**
     * Removes all markup from {@code text} and returns plain text.
     *
     * <p>Steps, in order: whitespace runs are collapsed to a single space;
     * script/object/style elements are dropped with their content; every
     * {@code <br>} and block-level closing tag becomes ". "; all remaining
     * tags are deleted; whitespace is collapsed again.
     *
     * @param text the markup to clean, may be {@code null}
     * @return the text without tags, or {@code null} if {@code text} is
     *         {@code null}
     */
    public static final String removeTag(String text) {
        if (text == null) {
            return null;
        }
        text = replaceConsecutiveSpaces(text, " ");
        text = SCRIPT_OBJECT_STYLE_PATTERN.matcher(text).replaceAll("");
        // A period that already precedes a <br> or a block end tag is folded
        // into a temporary "</p>" so that the next step emits exactly one
        // ". " instead of doubling the period.
        text = DOT_BR_PATTERN.matcher(text).replaceAll("</p>");
        text = DOT_BLOCK_END_TAG_PATTERN.matcher(text).replaceAll("</p>");
        text = BLOCK_END_TAG_PATTERN.matcher(text).replaceAll(". ");
        text = BR_PATTERN.matcher(text).replaceAll(". ");
        text = TAG_PATTERN.matcher(text).replaceAll("");
        return replaceConsecutiveSpaces(text, " ");
    }

    /**
     * Removes every tag from {@code text} except those listed in
     * {@code allowedTags}.
     *
     * <p>A tag is kept only when the complete matched tag text, including the
     * angle brackets and any attributes, is equal (case-sensitively) to one of
     * the entries, e.g. {@code "<b>"} and {@code "</b>"}. Whitespace runs are
     * collapsed to a single space only when {@code allowedTags} is
     * {@code null}.
     *
     * @param text        the markup to clean, may be {@code null}
     * @param allowedTags exact tag strings to keep, or {@code null} to remove
     *                    every tag; {@code null} entries are ignored
     * @return the filtered text, or {@code null} if {@code text} is
     *         {@code null}
     */
    public static final String removeTag(String text, String[] allowedTags) {
        if (text == null) {
            return null;
        }
        if (allowedTags == null) {
            text = replaceConsecutiveSpaces(text, " ");
        }
        // StringBuffer is kept on purpose: the StringBuilder overload of
        // Matcher.appendReplacement only exists since Java 9.
        StringBuffer result = new StringBuffer(text.length());
        Matcher matcher = TAG_PATTERN.matcher(text);
        while (matcher.find()) {
            String tag = matcher.group();
            // The kept tag must be quoted: appendReplacement interprets '$'
            // and '\' in the replacement as group references / escapes.
            String replacement = isAllowed(tag, allowedTags)
                    ? Matcher.quoteReplacement(tag)
                    : "";
            matcher.appendReplacement(result, replacement);
        }
        matcher.appendTail(result);
        return result.toString();
    }

    /** Returns whether {@code tag} equals one of the entries of {@code allowedTags}. */
    private static boolean isAllowed(String tag, String[] allowedTags) {
        if (allowedTags == null) {
            return false;
        }
        for (String allowedTag : allowedTags) {
            if (tag.equals(allowedTag)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Replaces every run of consecutive whitespace characters (as defined by
     * {@link Character#isWhitespace(char)}) with a single occurrence of
     * {@code replace}.
     *
     * @param source  the text to process, may be {@code null}
     * @param replace the replacement for each whitespace run; when
     *                {@code null} the runs are simply removed
     * @return the processed text, or {@code null} if {@code source} is
     *         {@code null}
     */
    public static final String replaceConsecutiveSpaces(String source, String replace) {
        if (source == null) {
            return null;
        }
        int length = source.length();
        StringBuilder target = new StringBuilder(length);
        boolean inWhitespaceRun = false;
        for (int i = 0; i < length; i++) {
            char c = source.charAt(i);
            if (Character.isWhitespace(c)) {
                // Only the first character of a run emits the replacement.
                if (!inWhitespaceRun && replace != null) {
                    target.append(replace);
                }
                inWhitespaceRun = true;
            } else {
                target.append(c);
                inWhitespaceRun = false;
            }
        }
        return target.toString();
    }
}