org.korecky.nlp.library.BasicTextProcessing.java Source code

Introduction

Here is the source code for org.korecky.nlp.library.BasicTextProcessing.java
Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package org.korecky.nlp.library;

import java.util.Comparator;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;

/**
 * Basic text-processing helpers: text normalization and word-frequency
 * counting.
 *
 * @author vkorecky
 */
public class BasicTextProcessing {

    /**
     * Matches one word: a maximal run of ASCII letters and of the non-ASCII
     * Latin letters listed explicitly by code point. Compiled once because
     * the expression never changes between calls.
     */
    private static final Pattern WORD_PATTERN = Pattern.compile(
            "[a-zA-Z\\u00C1\\u00C2\\u00C4\\u00C7\\u00C9\\u00CB\\u00CD\\u00CE\\u00D3\\u00D4\\u00D6\\u00DA\\u00DC\\u00DD\\u00DF\\u00E1\\u00E2\\u00E4\\u00E7\\u00E9\\u00EB\\u00ED\\u00EE\\u00F3\\u00F4\\u00F6\\u00FA\\u00FC\\u00FD\\u0102\\u0103\\u0104\\u0105\\u0106\\u0107\\u010C\\u010D\\u010E\\u010F\\u0110\\u0111\\u0118\\u0119\\u011A\\u011B\\u0139\\u013A\\u013D\\u013E\\u0141\\u0142\\u0143\\u0144\\u0147\\u0148\\u0150\\u0151\\u0154\\u0155\\u0158\\u0159\\u015A\\u015B\\u015E\\u015F\\u0160\\u0161\\u0162\\u0163\\u0164\\u0165\\u016E\\u016F\\u0170\\u0171\\u0179\\u017A\\u017B\\u017C\\u017D\\u017E]+");

    /**
     * Normalizes text by lower-casing it and, optionally, stripping accents.
     *
     * @param input input string; must not be {@code null}
     * @param removeAccents whether accents should be removed from the
     * lower-cased text
     * @return the normalized string
     * @throws NullPointerException if {@code input} is {@code null}
     */
    public String normalizeText(String input, boolean removeAccents) {
        // Locale.ROOT keeps the result independent of the JVM default locale
        // (e.g. a Turkish default locale would lower-case "I" to a dotless i).
        String output = input.toLowerCase(Locale.ROOT);

        if (removeAccents) {
            output = StringUtils.stripAccents(output);
        }

        return output;
    }

    /**
     * Splits the input text into words and counts the occurrences of each
     * distinct word.
     *
     * <p>If {@code normalizeText} is {@code false} the text is used as is:
     * words differing only in case are counted separately and
     * {@code removeAccents} is ignored.
     *
     * @param input input string; must not be {@code null}
     * @param normalizeText whether the text should be normalized with
     * {@link #normalizeText(String, boolean)} before counting
     * @param removeAccents whether accents should be removed during
     * normalization
     * @return map from word to its number of occurrences, ordered by
     * descending count; words with the same count are ordered by their
     * natural string order
     * @throws NullPointerException if {@code input} is {@code null}
     */
    public TreeMap<String, Long> wordSegmentation(String input, boolean normalizeText, boolean removeAccents) {
        Map<String, Long> counts = new HashMap<>();

        String text = normalizeText ? normalizeText(input, removeAccents) : input;

        // Every successful find() yields one non-empty word.
        Matcher matcher = WORD_PATTERN.matcher(text);
        while (matcher.find()) {
            counts.merge(matcher.group(), 1L, Long::sum);
        }

        // The comparator reads the counts from the (now complete) map, so the
        // sorted view is filled only after counting has finished.
        TreeMap<String, Long> sortedByCount = new TreeMap<>(new WordCountComparator(counts));
        sortedByCount.putAll(counts);
        return sortedByCount;
    }

    /**
     * Orders words by their count in a backing map, highest count first,
     * breaking ties by the natural order of the words themselves.
     *
     * <p>The ordering is consistent with {@code equals}: two words compare as
     * equal only if they are the same string. A {@link TreeMap} locates keys
     * solely through its comparator, so this is what makes lookups such as
     * {@code get} and {@code containsKey} work on a map sorted by this
     * comparator.
     */
    class WordCountComparator implements Comparator<String> {

        Map<String, Long> base;

        /**
         * @param base map supplying the count of each word; it is read on
         * every comparison and not copied
         */
        public WordCountComparator(Map<String, Long> base) {
            this.base = base;
        }

        /**
         * Compares two words by descending count, then alphabetically.
         *
         * @param a first word
         * @param b second word
         * @return a negative value if {@code a} sorts before {@code b}, a
         * positive value if it sorts after, and zero only if both words are
         * equal strings
         */
        @Override
        public int compare(String a, String b) {
            // Words missing from the backing map count as 0 instead of
            // failing on a null unboxing, so looking up an unknown key is safe.
            long countA = base.getOrDefault(a, 0L);
            long countB = base.getOrDefault(b, 0L);

            // Arguments reversed on purpose: larger counts come first.
            int byCount = Long.compare(countB, countA);
            if (byCount != 0) {
                return byCount;
            }
            // Tie-break on the key so distinct words never collapse into one
            // entry while identical words still compare as equal.
            return a.compareTo(b);
        }
    }
}