Here you can find the source of cleanText(String text)
Parameter | Description |
---|---|
text | A string containing left-over WikiMarkup. |
public static String cleanText(String text)
//package com.java2s;
/*
 * Copyright 2010
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universitaet Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

public class Main {

    // Patterns are compiled once instead of on every call (String.replaceAll
    // recompiles its regex each time it is invoked).

    /** Anything between {@code <} and the next {@code >} on the same line. */
    private static final java.util.regex.Pattern TAG =
            java.util.regex.Pattern.compile("<.+?>");

    /** Double-underscore delimited words on a single line, e.g. {@code __TOC__}. */
    private static final java.util.regex.Pattern MAGIC_WORD =
            java.util.regex.Pattern.compile("__.+?__");

    /** Bracketed external links on a single line, e.g. {@code [http://example.org label]}. */
    private static final java.util.regex.Pattern EXTERNAL_LINK =
            java.util.regex.Pattern.compile("\\[http.+?\\]");

    /**
     * Table markup <code>{| ... |}</code>. {@code (?s)} lets {@code .} match line
     * terminators so that tables spanning several lines are removed as a whole.
     */
    private static final java.util.regex.Pattern TABLE =
            java.util.regex.Pattern.compile("(?s)\\{\\|.+?\\|\\}");

    /**
     * Template markup <code>{{ ... }}</code>. {@code (?s)} lets {@code .} match line
     * terminators so that templates spanning several lines are removed as a whole.
     */
    private static final java.util.regex.Pattern TEMPLATE =
            java.util.regex.Pattern.compile("(?s)\\{\\{.+?\\}\\}");

    /** Single characters that are each replaced by one space. */
    private static final java.util.regex.Pattern MARKUP_CHARS =
            java.util.regex.Pattern.compile("[\"'\\[\\]=*|:{}()]");

    /** Runs of two or more whitespace characters. */
    private static final java.util.regex.Pattern WHITESPACE_RUN =
            java.util.regex.Pattern.compile("\\s{2,}");

    /**
     * Clean a string from left-over WikiMarkup (most parsers do not work 100% correct).
     *
     * <p>The steps are applied in this order:
     * <ol>
     *   <li>tags, double-underscore words, bracketed {@code http} links, tables and
     *       templates are each replaced by a single space (tables and templates may
     *       span multiple lines; the other constructs must sit on one line);</li>
     *   <li>every {@code " - "} is replaced by a single space;</li>
     *   <li>each of the characters {@code " ' [ ] = * | : { } ( )} is replaced by a
     *       space;</li>
     *   <li>every run of two or more whitespace characters is collapsed to one space.</li>
     * </ol>
     * The result is not trimmed, and a single whitespace character (including a lone
     * line break) is left as is.
     *
     * @param text A string containing left-over WikiMarkup.
     * @return The cleaned string.
     * @throws NullPointerException if {@code text} is {@code null}
     */
    public static String cleanText(String text) {
        String plainText = text;

        // Multi-character constructs go first, while their delimiters are still intact.
        plainText = TAG.matcher(plainText).replaceAll(" ");
        plainText = MAGIC_WORD.matcher(plainText).replaceAll(" ");
        plainText = EXTERNAL_LINK.matcher(plainText).replaceAll(" ");
        plainText = TABLE.matcher(plainText).replaceAll(" ");
        plainText = TEMPLATE.matcher(plainText).replaceAll(" ");

        // Literal replacement; no regex needed for a fixed string.
        plainText = plainText.replace(" - ", " ");

        // One pass over all stray markup characters instead of one pass per character.
        plainText = MARKUP_CHARS.matcher(plainText).replaceAll(" ");

        // The replacements above leave runs of blanks behind; collapse them last.
        plainText = WHITESPACE_RUN.matcher(plainText).replaceAll(" ");

        return plainText;
    }
}