HashTagSegmenter.java Source code

Java tutorial

Introduction

Here is the source code for HashTagSegmenter.java

Source

/*
 * HashTagSegmenter.java
 *
 * Author: DANNY DELOTT <DANNYDELOTT@gmail.com>
 * Licensed under GPL Version 3
 *
 * A Java class to segment words in a hash tag
*/

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.math.NumberUtils;

/**
 * Splits a hash tag such as {@code "#iwant2eatfood"} into its component words
 * using a word list loaded from a file.
 *
 * <p>The word list file is read line by line and every line is stored verbatim
 * as a dictionary entry. Tokens are lower-cased before lookup and lookups are
 * exact, so entries only match if they are stored in lower case.
 */
public class HashTagSegmenter {

    /**
     * Splits a token into maximal runs of digits/dots and maximal runs of
     * non-digits, e.g. "iwant2eatfood" -> "iwant", "2", "eatfood".
     * Compiled once because the expression never changes.
     */
    private static final Pattern CRUDE_SEGMENT_PATTERN = Pattern.compile("[\\d.]+|\\D+");

    /** Path of the word list file given to the constructor. */
    private final String wordListLocation;

    /** Dictionary of known words; each word is stored as both key and value. */
    private final Hashtable<String, String> wordHashTable;

    /**
     * Creates a segmenter and loads its dictionary.
     *
     * @param wll path of the word list file, one entry per line
     * @throws IOException if the word list cannot be opened or read
     */
    HashTagSegmenter(String wll) throws IOException {
        wordListLocation = wll;
        wordHashTable = getWordListHashTable();
    }

    /**
     * Segments the words of a hash-tagged token (eg: "#iwant2eatfood").
     *
     * <p>The token is lower-cased, the leading '#' is dropped, and the rest is
     * split into numeric and non-numeric runs. Runs accepted by
     * {@code NumberUtils.isNumber} are kept as they are; every other run is
     * segmented against the dictionary (see {@link #segment(String)}).
     *
     * @param text the token to segment
     * @return the segments in order of appearance; an empty list when
     *         {@code text} is {@code null}, empty, or does not start with '#'
     */
    List<String> segmentWordsInHashTaggedToken(String text) {
        List<String> finalSegments = new ArrayList<String>();

        // Guard against null/empty input: charAt(0) would otherwise throw.
        if (text == null || text.length() == 0 || text.charAt(0) != '#') {
            return finalSegments;
        }

        // Locale.ROOT keeps lower-casing independent of the default locale
        // (e.g. "I" must become "i", not the Turkish dotless "ı").
        // Lower-case first, then drop the '#', mirroring the original order.
        String tokenText = text.toLowerCase(Locale.ROOT).substring(1);

        // eg: "iwant2eatfood" -> "iwant", "2", "eatfood"
        Matcher m = CRUDE_SEGMENT_PATTERN.matcher(tokenText);
        while (m.find()) {
            String crudeSegment = m.group();

            if (NumberUtils.isNumber(crudeSegment)) {
                // numbers are kept whole
                finalSegments.add(crudeSegment);
            } else {
                // eg: "iwant" -> "i", "want"; unsegmentable text comes back whole
                finalSegments.addAll(segment(crudeSegment));
            }
        }

        return finalSegments;
    }

    /**
     * Greedily segments {@code text} into dictionary words.
     *
     * <p>Starting at the front, the longest prefix of the remaining text that
     * is a dictionary entry is taken as the next segment. There is no
     * backtracking: as soon as no prefix of the remaining text is in the
     * dictionary, segmentation is abandoned and the whole of {@code text} is
     * returned as a single segment.
     *
     * @param text the text to segment
     * @return the dictionary words covering {@code text}, or a one-element
     *         list holding {@code text} itself when it cannot be fully
     *         segmented (this includes empty text)
     */
    private List<String> segment(String text) {
        List<String> segments = new ArrayList<String>();
        int start = 0;

        while (start < text.length()) {

            // shrink the candidate from the right until it is a known word
            int end = text.length();
            while (end > start && !wordHashTable.containsKey(text.substring(start, end))) {
                end--;
            }

            if (end == start) {
                // nothing starting here is a word: give up and return the text unsegmented
                segments.clear();
                segments.add(text);
                return segments;
            }

            segments.add(text.substring(start, end));
            start = end;
        }

        // only reachable with no segments when text is empty
        if (segments.isEmpty()) {
            segments.add(text);
        }
        return segments;
    }

    /**
     * Reads the word list specified in the constructor.
     *
     * <p>NOTE(review): the file is decoded by {@code FileReader} with the
     * platform default charset; confirm that matches the word list's encoding.
     *
     * @return a table with one entry per line of the file (key == value)
     * @throws IOException if the file cannot be opened or read
     */
    private Hashtable<String, String> getWordListHashTable() throws IOException {
        Hashtable<String, String> tempWordList = new Hashtable<String, String>();
        BufferedReader br = new BufferedReader(new FileReader(new File(wordListLocation)));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                tempWordList.put(line, line);
            }
        } finally {
            // always release the file handle, even when reading fails
            br.close();
        }
        return tempWordList;
    }

    /**
     * Prints every element of the list to standard output, one per line.
     *
     * @param list the items to print
     */
    void printList(List<String> list) {
        for (String item : list) {
            System.out.println(item);
        }
    }

}