utils.StringManip.java Source code

Java tutorial

Introduction

Here is the source code for utils.StringManip.java.

Source

/*
 * Copyright (c) 2016 Chris Bellis
 * This software is subject to the MIT License, see LICENSE.txt in the root of the repository.
 */

package utils;

import org.apache.commons.lang.StringUtils;
import resources.StopwordsProvider;

import java.text.NumberFormat;
import java.text.ParsePosition;
import java.util.Arrays;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
 * String manipulator for removing stopwords.
 * Created by chris on 1/5/16.
 */
public class StringManip {
    private StringManip() {
    }

    public static String removeStopwords(String original, Set<String> stopwords) {
        return original.replaceAll(getMultiwordRegexString(stopwords), " ");
    }

    public static String removeStopwords(String original, Pattern p) {
        return p.matcher(original).replaceAll(" ");
    }

    /**
     * Removes stopwords using the default stopwords provider
     * @param original The original string
     * @return The string with the stopwords removed
     */
    public static String removeStopwords(String original) {
        return removeStopwords(original, StopwordsProvider.getProvider().getRegex());
    }

    /**
     * Splits a text into an array of sentences.
     *
     * @param text Text to split into individual sentences
     * @return An array of strings that contain sentences
     */
    public static String[] splitSentences(String text) {
        return text.split("(?<=[.!?])\\s*");
    }

    /**
     * Removes The numbers from a string!
     * @param s The string to remove numbers from
     * @return A string without numbers
     */
    public static String removeNumbers(String s) {
        return Arrays.asList(s.split(" ")).stream().filter(str -> !isNumeric(str)).collect(Collectors.joining(" "));
    }

    public static String removeTerm(String s, String toRemove) {
        return s.replaceAll(getOnlyWordRegex(toRemove), " ");
    }

    public static String getMultiwordRegexString(Set<String> stopwords) {
        return StringUtils.join(
                stopwords.parallelStream().map(StringManip::getOnlyWordRegex).collect(Collectors.toList()), "|");
    }

    /**
     * Creates a string representation of a regex that contains ONLY a word and may or may not be surrounded by whitespace.
     * @param term The term
     * @return The regex string
     */
    private static String getOnlyWordRegex(String term) {
        String newRegex = "\\s*\\b";
        newRegex += term;
        newRegex += "\\b\\s*";
        return newRegex;
    }

    public static String removeSmartQuotes(String str) {
        String retVal = str;
        retVal = retVal.replaceAll("[\u2018\u2019\u201A\u201B\u2032\u2035]", "'");
        retVal = retVal.replaceAll("[\u201C\u201D\u201E\u201F\u2033\u2036]", "\"");
        return retVal;
    }

    /**
     * Checks if a string is numeric
     *
     * @param str String to check if it is a number
     * @return True if the string is a number
     */
    public static boolean isNumeric(String str) {
        NumberFormat formatter = NumberFormat.getInstance();
        ParsePosition pos = new ParsePosition(0);
        formatter.parse(str, pos);
        boolean IsANumber = str.length() == pos.getIndex();

        Pattern p = Pattern.compile("[0-9]");
        Matcher m = p.matcher(str);
        boolean containsNumber = m.matches();

        return IsANumber || containsNumber;
    }
}