edu.illinois.cs.cogcomp.ner.StringStatisticsUtils.MyString.java Source code

Java tutorial

Introduction

Here is the source code for edu.illinois.cs.cogcomp.ner.StringStatisticsUtils.MyString.java

Source

/**
 * This software is released under the University of Illinois/Research and Academic Use License. See
 * the LICENSE file in the root folder for details. Copyright (c) 2016
 *
 * Developed by: The Cognitive Computation Group University of Illinois at Urbana-Champaign
 * http://cogcomp.cs.illinois.edu/
 */
package edu.illinois.cs.cogcomp.ner.StringStatisticsUtils;

import org.apache.commons.lang3.time.FastDateFormat;

import java.text.ParsePosition;

/**
 * Static helpers that normalize token strings: stripping non-alphanumeric characters, masking
 * digits with placeholder tokens, and detecting strings that start with a date.
 *
 * <p>
 * None of the methods accept {@code null}; passing {@code null} results in a
 * {@link NullPointerException}.
 */
public class MyString {
    /**
     * Removes every character that is neither a letter nor a digit. Whitespace and symbols are
     * removed as well, not only punctuation.
     *
     * <p>
     * The string is walked by Unicode code point rather than by UTF-16 {@code char}, so letters and
     * digits outside the Basic Multilingual Plane (stored as surrogate pairs) are kept instead of
     * being dropped.
     *
     * @param s the string to clean.
     * @return a new string holding only the letters and digits of {@code s}, in their original
     *         order.
     */
    public static String cleanPunctuation(String s) {
        StringBuilder res = new StringBuilder(s.length());
        int i = 0;
        while (i < s.length()) {
            int codePoint = s.codePointAt(i);
            if (Character.isLetterOrDigit(codePoint)) {
                res.appendCodePoint(codePoint);
            }
            // advance by one or two chars depending on whether this was a surrogate pair
            i += Character.charCount(codePoint);
        }
        return res.toString();
    }

    /**
     * Produces the surface form used for feature extraction: a string recognized by
     * {@link #isDate(String)} becomes {@code "*DATE*"}; otherwise every digit is replaced as in
     * {@link #normalizeDigits(String)}; a string without digits is returned unchanged.
     *
     * @param s the token text.
     * @return {@code "*DATE*"}, the digit-normalized form of {@code s}, or {@code s} itself.
     */
    public static String normalizeDigitsForFeatureExtraction(String s) {
        if (isDate(s)) {
            // the placeholder contains no digits, so no further normalization is needed
            return "*DATE*";
        }
        return hasDigits(s) ? normalizeDigits(s) : s;
    }

    /** fast date formatter for identifying date instances. */
    static final FastDateFormat df = FastDateFormat.getInstance();

    /**
     * Shared parse position formerly reused by {@link #isDate(String)}.
     *
     * @deprecated a shared mutable {@link ParsePosition} is not safe when {@code isDate} is called
     *             from several threads; {@code isDate} now creates its own instance. The field is
     *             retained only so that existing package-level references keep compiling.
     */
    @Deprecated
    static final ParsePosition pp = new ParsePosition(0);

    /**
     * Tests the string for a date starting at position zero.
     *
     * <p>
     * The check passes when the shared formatter {@code df} can parse a date beginning at index 0.
     * NOTE(review): as with other {@link ParsePosition}-based parsers, text following the parsed
     * date is presumably ignored - confirm against the FastDateFormat documentation.
     *
     * @param s the date string.
     * @return true if it is a date, or false if it is not.
     */
    public static boolean isDate(String s) {
        // A fresh ParsePosition per call keeps concurrent callers from clobbering each other's
        // parse offset; the object is tiny, so the allocation cost is negligible.
        return df.parse(s, new ParsePosition(0)) != null;
    }

    /**
     * Replaces each maximal run of consecutive digits with a single {@code "*D*"} token and copies
     * all other characters unchanged. For example {@code "ab123c4"} becomes {@code "ab*D*c*D*"}.
     *
     * @param s the string to transform.
     * @return a new string with every digit run collapsed to {@code "*D*"}.
     */
    public static String collapseDigits(String s) {
        StringBuilder res = new StringBuilder(s.length() * 2);
        int i = 0;
        while (i < s.length()) {
            char c = s.charAt(i);
            if (Character.isDigit(c)) {
                res.append("*D*");
                // skip the remainder of this digit run so it yields exactly one token
                while (i < s.length() && Character.isDigit(s.charAt(i))) {
                    i++;
                }
            } else {
                res.append(c);
                i++;
            }
        }
        return res.toString();
    }

    /**
     * Replaces every individual digit with a {@code "*D*"} token and copies all other characters
     * unchanged. For example {@code "a12"} becomes {@code "a*D**D*"}.
     *
     * @param s the string to transform.
     * @return a new string with one {@code "*D*"} per digit of {@code s}.
     */
    public static String normalizeDigits(String s) {
        StringBuilder res = new StringBuilder(s.length() * 2);
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (Character.isDigit(c)) {
                res.append("*D*");
            } else {
                res.append(c);
            }
        }
        return res.toString();
    }

    /**
     * Reports whether the string contains at least one digit.
     *
     * @param s the string to scan.
     * @return true if any character of {@code s} is a digit, false otherwise (including for the
     *         empty string).
     */
    public static boolean hasDigits(String s) {
        for (int i = 0; i < s.length(); i++) {
            if (Character.isDigit(s.charAt(i))) {
                return true;
            }
        }
        return false;
    }

}