com.hmsinc.epicenter.classifier.util.ClassifierUtils.java Source code

Introduction

Here is the source code for com.hmsinc.epicenter.classifier.util.ClassifierUtils.java
Source

/**
 * Copyright (C) 2008 University of Pittsburgh
 * 
 * 
 * This file is part of Open EpiCenter
 * 
 *     Open EpiCenter is free software: you can redistribute it and/or modify
 *     it under the terms of the GNU General Public License as published by
 *     the Free Software Foundation, either version 3 of the License, or
 *     (at your option) any later version.
 * 
 *     Open EpiCenter is distributed in the hope that it will be useful,
 *     but WITHOUT ANY WARRANTY; without even the implied warranty of
 *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *     GNU General Public License for more details.
 * 
 *     You should have received a copy of the GNU General Public License
 *     along with Open EpiCenter.  If not, see <http://www.gnu.org/licenses/>.
 * 
 * 
 *   
 */
package com.hmsinc.epicenter.classifier.util;

import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;

/**
 * Static helpers that normalize free-text complaint strings before classification:
 * lowercasing, expansion of a couple of abbreviations, removal of punctuation
 * (and optionally digits), and removal of stopwords and single-character tokens.
 * <p>
 * The class holds no mutable state; all methods are static.
 *
 * @author <a href="mailto:steve.kondik@hmsinc.com">Steve Kondik</a>
 * @version $Id:ClassifierUtils.java 219 2007-07-17 14:37:39Z steve.kondik $
 */
public final class ClassifierUtils {

    /**
     * Default stopword set used by {@link #filter(CharSequence)}. The set is unmodifiable.
     * <p>
     * Entries that are commented out in the list below are NOT part of the set, so those
     * words survive filtering. NOTE(review): presumably they were excluded because they
     * carry meaning in complaint text - confirm before re-enabling any of them.
     */
    public static final Set<String> BASIC_STOPWORDS = makeStopSet("a", "about", "above", "ack", "across", "adj",
            "after", "afterwards", "again", "against", "ago", "albeit", "all", "almost", "alone", "along",
            "already", "also", "although", "always", "am", "ambulance", "among", "amongst", "an", "and", "another",
            "any", "anyhow", "anyone", "anything", "anywhere", "april", "are", "area", "around", "arr", "arrive",
            /*"as",*/ "at", "august", "bad", /*"be",*/ "became", "because", "become", "becomes", "becoming", "bed",
            "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond",
            "bilat", "bilateral", "both", /*"brother",*/ "but", "by", /*"ca",*/ "can", "cannot", "cc", "check",
            "child", /*"co",*/ "could", /*"cp",*/ "dad", "daughter", "day", "days", "december",
            /*"diff", "difficult", "difficulty",*/ "doctor", "down", "dr", "due", "during", "dx", "each", "earlier",
            "eg", "either", "else", "elsewhere", "em", "ems", "enough", "episode", "er", "etc", "even", "ever",
            "every", "everyone", "everything", "everywhere", "except", "father", "february", "female", "few",
            "first", "for", "fordays", "former", "formerly", "friday", "friend", "from", "further", "gave", "gx",
            "had", "has", "have", "having", "he", "hence", "her", "here", "hereafter", "hereby", "herein",
            "hereupon", "hers", "herself", "him", "himself", "his", "hours", "how", "however", "hrs", /*"hx",*/ "i",
            "ie", "if", /*"in",*/ "inc", "indeed", "into", "is", /*"it",*/ "its", "itself", "january", "july",
            "june", "just", "known", "last", "latter", "latterly", "least", "left", "less", "lft", "like", "ls",
            "lt", "ltd", "male", "many", "march", "may", "me", "medic", "meds", "meanwhile", "middle", "might",
            "mom", "monday", "month", "months", "more", "moreover", "mos", "most", "mostly", "mother", "mths",
            "much", "must", "my", "myself", "namely", "near", "neither", "never", "nevertheless", "next", "no",
            "nobody", "none", "noone", "nor", /*"not",*/ "nothing", "november", "now", "nowhere", "october", "of",
            /*"off",*/ "often", "on", "once", "one", "only", "onto", "op", "or", "other", "others", "otherwise",
            "our", "ours", "ourselves", /*"out",*/ "over", "own", "patient", "per", "perhaps", "pm", "pos", "poss",
            "possible", "post", "pn", "pre", "previous", "ps", /*"pt",*/ "pts", "px", /*"rad",*/
            "radiating", "rather", "re", /*"recheck",*/ "ref", "related", "right", "rm", "room", "rt",
            /*"rx",*/ "s", "same", "seem", "seemed", "seeming", "seems", "sent", "september", "several", "severe",
            "she", "should", "side", "sided", "since", /*"sister",*/ "so", "some", "somehow", "someone",
            "something", "sometime", "sometimes", "somewhere", "sp", "squad", "states", "status", "still", "st",
            "sts", "such", "sx", "sxs", "symptom", "symptoms", /*"t",*/ "than", "that", "the", "their", "them",
            "themselves", "then", "thence", "there", "thereafter", "thereby", "therefor", "therein", "thereupon",
            "these", "they", "think", "thinks", "this", "those", "though", "through", "throughout", "thru",
            "thursday", "thus", /*"to",*/ "today", "together", "too", "toward", "towards", "tuesday", "under",
            "unknown", "unspecified", "until", "up", "upon", /*"us",*/ "vb", "very", "via", "walkin", "was", "we",
            "wednesday", "week", "weeks", "well", "were", "what", "whatever", "whatsoever", "when", "whence",
            "whenever", "whensoever", "where", "whereafter", "whereas", "whereat", "whereby", "wherefrom",
            "wherein", "whereinto", "whereof", "whereon", "whereto", "whereunto", "whereupon", "wherever",
            "wherewith", "whether", "which", "whichever", "whichsoever", "while", "whilst", "whither", "who",
            "whoever", "whole", "whom", "whomever", "whomsoever", "whose", "whosoever", "why", "wife",
            /*"will",*/ "wi", "with", "within", "without", "wk", "wks", "would", "work", "xauthor", "xcal", "xdays",
            "xhours", "xhrs", "xnote", "xother", "xsubj", "xweek", "xweeks", "xwk", "xwks", "year", "years", "yes",
            "yesterday", "yet", "you", "your", "yours", "yourself", "yourselves", "yearly", "yrly", "yrs");

    /**
     * Matches "h/a" only when it is not glued to another letter, so that inputs such as
     * "cough/asthma" are not mistaken for the abbreviation.
     */
    private static final Pattern HEADACHE_ABBREVIATION = Pattern.compile("(?<![a-z])h/a(?![a-z])");

    /**
     * Matches "n/v" only when it is not glued to another letter, so that inputs such as
     * "pain/vomiting" are not mistaken for the abbreviation.
     */
    private static final Pattern NAUSEA_VOMITING_ABBREVIATION = Pattern.compile("(?<![a-z])n/v(?![a-z])");

    /** Everything except lowercase ASCII letters and whitespace. */
    private static final Pattern NON_ALPHABETIC = Pattern.compile("[^a-z\\s]");

    /** Everything except lowercase ASCII letters, digits and whitespace. */
    private static final Pattern NON_ALPHANUMERIC = Pattern.compile("[^a-z\\s\\d]");

    /** One or more whitespace characters; used to tokenize the cleaned text. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /**
     * Normalizes a complaint using {@link #BASIC_STOPWORDS}; digits are removed.
     *
     * @param complaint the raw text, may be {@code null}
     * @return the surviving tokens joined by single spaces, or an empty string when
     *         {@code complaint} is {@code null} or nothing survives
     */
    public static CharSequence filter(final CharSequence complaint) {
        return filter(complaint, BASIC_STOPWORDS);
    }

    /**
     * Normalizes a complaint like {@link #filter(CharSequence, Set)} but keeps digits,
     * so numeric tokens of two or more characters survive.
     *
     * @param complaint the raw text, may be {@code null}
     * @param stopwords lowercase tokens to drop; must not be {@code null}
     * @return the surviving tokens joined by single spaces, or an empty string when
     *         {@code complaint} is {@code null} or nothing survives
     */
    public static CharSequence filterAllowNumbers(final CharSequence complaint, final Set<String> stopwords) {
        return normalize(complaint, stopwords, NON_ALPHANUMERIC);
    }

    /**
     * Normalizes a complaint: lowercases it, expands the standalone abbreviations
     * "h/a" and "n/v", replaces every character that is not a lowercase ASCII letter
     * or whitespace with a space, and drops stopwords and single-character tokens.
     *
     * @param complaint the raw text, may be {@code null}
     * @param stopwords lowercase tokens to drop; must not be {@code null}
     * @return the surviving tokens joined by single spaces, or an empty string when
     *         {@code complaint} is {@code null} or nothing survives
     */
    public static CharSequence filter(final CharSequence complaint, final Set<String> stopwords) {
        return normalize(complaint, stopwords, NON_ALPHABETIC);
    }

    /**
     * Shared implementation behind both public filter variants.
     *
     * @param complaint  the raw text, may be {@code null}
     * @param stopwords  tokens to drop
     * @param disallowed pattern of characters to blank out after abbreviation expansion
     * @return the normalized text, never {@code null}
     */
    private static String normalize(final CharSequence complaint, final Set<String> stopwords,
            final Pattern disallowed) {

        if (complaint == null) {
            return "";
        }

        // A fixed locale keeps the result independent of the JVM default. With a Turkish
        // default locale, 'I' lowercases to a dotless i that the a-z filter below would
        // then strip, splitting words apart.
        String text = complaint.toString().toLowerCase(Locale.ENGLISH);

        // Abbreviations must be expanded before '/' is blanked out by the next step.
        text = HEADACHE_ABBREVIATION.matcher(text).replaceAll("headache");
        text = NAUSEA_VOMITING_ABBREVIATION.matcher(text).replaceAll("nausea vomiting");
        text = disallowed.matcher(text).replaceAll(" ");

        final StringBuilder buf = new StringBuilder();
        for (final String token : WHITESPACE.split(text)) {
            // The length check also discards the empty token produced by leading whitespace.
            if (token.length() > 1 && !stopwords.contains(token)) {
                if (buf.length() > 0) {
                    buf.append(' ');
                }
                buf.append(token);
            }
        }
        return buf.toString();
    }

    /**
     * Builds an unmodifiable set from the given words.
     *
     * @param words the stopwords to include
     * @return an unmodifiable set containing exactly the given words
     */
    private static Set<String> makeStopSet(String... words) {
        return Collections.unmodifiableSet(new HashSet<String>(Arrays.asList(words)));
    }

}