Java examples for java.lang:String HTML
This method extracts all real words from a String that contains HTML tags.
//package com.java2s;

import java.util.ArrayList;
import java.util.List;

/**
 * Extracts the distinct "real" words from text that may contain HTML tags.
 */
public class Main {

    /** Character that opens an HTML tag. */
    private static final String TAG_BEGIN = "<";

    /** Character that closes an HTML tag. */
    private static final String TAG_END = ">";

    /** Words shorter than this are discarded (the original kept only length &gt; 2). */
    private static final int MIN_WORD_LENGTH = 3;

    /** Contraction fragments; any word containing one of these is not a real word. */
    private static final String[] SPECIAL_SUFFIXES =
            new String[] { "'s", "'re", "'ll", "'ve", "'m" };

    public static void main(String[] argv) throws Exception {
        String content = "java2s.com";
        System.out.println(getListWord(content));
    }

    /**
     * Returns every distinct real word found in {@code content}, in order of
     * first appearance.
     *
     * <p>Processing steps:
     * <ol>
     *   <li>Each {@code <...>} tag is deleted. No separator is inserted in its
     *       place, so {@code "foo<b>bar</b>"} yields the single word
     *       {@code "foobar"}.</li>
     *   <li>Every character that is neither a letter
     *       ({@link Character#isLetter(char)}) nor an apostrophe acts as a word
     *       separator.</li>
     *   <li>A word is kept only if it has at least three characters, contains
     *       none of the contraction fragments ({@code 's}, {@code 're},
     *       {@code 'll}, {@code 've}, {@code 'm}), and has not been seen
     *       before.</li>
     * </ol>
     *
     * <p>A {@code <} without a later {@code >}, or a {@code >} without an
     * earlier {@code <}, is treated as an ordinary separator character rather
     * than aborting the extraction.
     *
     * @param content text possibly containing HTML tags; may be {@code null}
     * @return all real words in a list; empty (never {@code null}) when
     *         {@code content} is {@code null}, empty, or contains no real word
     */
    public static List<String> getListWord(String content) {
        List<String> words = new ArrayList<String>();
        if (content == null || content.isEmpty()) {
            return words;
        }

        String text = stripTags(content);

        // Single pass: accumulate letters/apostrophes, flush on any separator.
        StringBuilder current = new StringBuilder();
        for (int i = 0; i < text.length(); i++) {
            char ch = text.charAt(i);
            if (Character.isLetter(ch) || ch == '\'') {
                current.append(ch);
            } else {
                flushWord(current, words);
            }
        }
        // The text may end in the middle of a word.
        flushWord(current, words);
        return words;
    }

    /**
     * Removes every {@code <...>} span from {@code content}.
     *
     * <p>The closing bracket is searched for <em>after</em> the opening one, so
     * a stray {@code >} earlier in the text neither stops tag removal nor
     * causes an invalid deletion range.
     */
    private static String stripTags(String content) {
        StringBuilder text = new StringBuilder(content.length());
        int pos = 0;
        while (pos < content.length()) {
            int open = content.indexOf(TAG_BEGIN, pos);
            if (open < 0) {
                break;
            }
            int close = content.indexOf(TAG_END, open + 1);
            if (close < 0) {
                // Unterminated '<': leave the remainder untouched.
                break;
            }
            text.append(content, pos, open);
            pos = close + 1;
        }
        text.append(content, pos, content.length());
        return text.toString();
    }

    /**
     * Adds the accumulated word to {@code words} if it qualifies, then resets
     * the accumulator.
     */
    private static void flushWord(StringBuilder current, List<String> words) {
        if (current.length() == 0) {
            return;
        }
        String word = current.toString();
        current.setLength(0);
        if (word.length() >= MIN_WORD_LENGTH
                && isRealWord(word)
                && !words.contains(word)) {
            words.add(word);
        }
    }

    /**
     * Tells whether {@code word} is free of contraction fragments.
     *
     * @param word candidate word, consisting of letters and apostrophes
     * @return {@code false} if the word contains any of the contraction
     *         fragments ({@code 's}, {@code 're}, {@code 'll}, {@code 've},
     *         {@code 'm}); {@code true} otherwise
     */
    private static boolean isRealWord(String word) {
        for (int i = 0; i < SPECIAL_SUFFIXES.length; i++) {
            if (word.contains(SPECIAL_SUFFIXES[i])) {
                return false;
            }
        }
        return true;
    }
}