Extract all real words from a String that contains HTML tags. - Java java.lang

Java examples for java.lang:String HTML

Description

This method extracts all real words from a String that contains HTML tags.

Demo Code


//package com.java2s;
import java.util.ArrayList;
import java.util.List;

/**
 * Demo of extracting the distinct "real" words from a piece of HTML text.
 */
public class Main {
    public static void main(String[] argv) throws Exception {
        String content = "java2s.com";
        System.out.println(getListWord(content));
    }

    private static final String TAG_BEGIN = "<";
    private static final String TAG_END = ">";
    /** Words with fewer characters than this are not reported. */
    private static final int MIN_WORD_LENGTH = 3;
    /** A word containing any of these contraction fragments is not a "real" word. */
    private static final String[] SPECIAL_WORDS = new String[] { "'s", "'re",
            "'ll", "'ve", "'m" };

    /**
     * Returns the distinct real words found in a String that may contain HTML tags.
     *
     * <p>Every {@code <...>} tag is removed first (nothing is inserted in its
     * place, so text on both sides of a tag is joined). The remaining text is
     * split into maximal runs of letters and apostrophes; every other character
     * acts as a separator. A run is kept when it has at least
     * {@value #MIN_WORD_LENGTH} characters, passes {@link #isRealWord(String)},
     * and has not been seen before (comparison is case-sensitive).
     *
     * @param content the text to scan; may be {@code null} or empty
     * @return the words in order of first appearance; an empty list when
     *         {@code content} is {@code null} or contains no qualifying word
     */
    public static List<String> getListWord(String content) {
        List<String> words = new ArrayList<String>();
        if (content == null || content.isEmpty()) {
            return words;
        }

        String text = stripTags(content);

        // Walk one position past the end so the final word is flushed by the
        // same branch that handles a separator.
        int wordStart = -1;
        for (int i = 0; i <= text.length(); i++) {
            boolean inWord = i < text.length() && isWordChar(text.charAt(i));
            if (inWord) {
                if (wordStart < 0) {
                    wordStart = i;
                }
            } else if (wordStart >= 0) {
                addIfRealWord(words, text.substring(wordStart, i));
                wordStart = -1;
            }
        }
        return words;
    }

    /**
     * Removes every tag from the given text in a single pass.
     *
     * <p>A tag starts at a {@code <} and ends at the first {@code >} that
     * follows it. A {@code >} with no preceding {@code <}, or a {@code <} that
     * is never closed, is left in the text untouched.
     */
    private static String stripTags(String content) {
        StringBuilder text = new StringBuilder(content.length());
        int pos = 0;
        while (pos < content.length()) {
            int open = content.indexOf(TAG_BEGIN, pos);
            // Only look for the closing bracket after the opening one, so a
            // stray '>' earlier in the text cannot be mistaken for a tag end.
            int close = (open < 0) ? -1 : content.indexOf(TAG_END, open + 1);
            if (close < 0) {
                break;
            }
            text.append(content, pos, open);
            pos = close + 1;
        }
        text.append(content, pos, content.length());
        return text.toString();
    }

    /** Tells whether the character can be part of a word (a letter or an apostrophe). */
    private static boolean isWordChar(char ch) {
        return Character.isLetter(ch) || ch == '\'';
    }

    /** Appends the candidate to the list when it is long enough, real, and not yet present. */
    private static void addIfRealWord(List<String> words, String candidate) {
        if (candidate.length() >= MIN_WORD_LENGTH && isRealWord(candidate)
                && !words.contains(candidate)) {
            words.add(candidate);
        }
    }

    /**
     * Tells whether a word is free of the contraction fragments listed in
     * {@link #SPECIAL_WORDS}.
     *
     * @param word the candidate word
     * @return {@code false} if the word contains any of the fragments,
     *         {@code true} otherwise
     */
    private static boolean isRealWord(String word) {
        for (String special : SPECIAL_WORDS) {
            if (word.contains(special)) {
                return false;
            }
        }
        return true;
    }
}

Related Tutorials