edu.isi.pfindr.learn.util.CleanDataUtil.java Source code

Java tutorial

Introduction

Here is the source code for edu.isi.pfindr.learn.util.CleanDataUtil.java

Source

package edu.isi.pfindr.learn.util;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;
import org.tartarus.snowball.ext.englishStemmer;

/*
 * Copyright 2012 University of Southern California
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *    http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * CleanDataUtil.java
 *
 * Utility functions for data pre-processing: stemming, tokenization, and bi-gram construction.
 *
 * @author sharma@isi.edu
 *
 */
public class CleanDataUtil {

    // NOTE(review): stemmer and stopwords are shared mutable static state and the
    // Snowball stemmer is not documented as thread-safe — confirm callers do not
    // invoke the preprocess* methods concurrently.
    private static englishStemmer stemmer = new englishStemmer(); ///////MOVE TO PREPROCESS
    private static Set<String> stopwords = new HashSet<String>(); ///////MOVE TO PREPROCESS
    // Matches any decimal digit; every pipeline drops terms containing digits.
    private static final Pattern digitPattern = Pattern.compile("\\d"); ////MOVE TO PREPROCESS

    private static final Logger logger = Logger.getLogger("AppLogging");

    /**
     * Loads one stopword per line from the given file into the shared
     * {@link #stopwords} set. Blank lines are skipped; surrounding whitespace
     * is trimmed. Errors are logged and swallowed (best-effort load).
     *
     * @param pathname path of the stopword file
     */
    public static void loadStopwordFile(String pathname) {
        logger.info("Loading stop words ..");
        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader(pathname));
            String thisLine;
            while ((thisLine = br.readLine()) != null) {
                thisLine = thisLine.trim();
                if (thisLine.equals(""))
                    continue;
                stopwords.add(thisLine);
            }
        } catch (Exception e) {
            // Preserve original best-effort behavior, but log instead of printStackTrace.
            logger.error("Failed to load stopword file: " + pathname, e);
        } finally {
            if (br != null) {
                try {
                    br.close(); // fix: reader was previously leaked
                } catch (Exception ignored) {
                    // best-effort close; nothing useful to do here
                }
            }
        }
    }

    /**
     * Variant of {@link #preProcessWordsSpecialCharacters(String)} that appends
     * the cleaned text to the supplied builder instead of returning it.
     *
     * @param line    input text
     * @param newLine builder receiving the cleaned text (each emitted token is
     *                preceded by a single space, as before)
     */
    public static void preProcessWordsSpecialCharacters(String line, StringBuilder newLine) {
        // Delegate to the String overload; the two copies previously produced
        // identical output, so keep a single implementation.
        newLine.append(preProcessWordsSpecialCharacters(line));
    }

    /**
     * Replaces the special characters ? ( ) , . ! / : ' ] [ with whitespace,
     * collapses runs of whitespace, and expands hyphenated words: for
     * "aa-bb" the output also contains "aabb", "aa" and "bb".
     *
     * @param line input text
     * @return cleaned text; each emitted token is preceded by a single space
     */
    public static String preProcessWordsSpecialCharacters(String line) {
        // Replace the listed punctuation with spaces, then collapse whitespace.
        line = line.replaceAll("(\\?)|(\\()|(\\))|(\\,)|(\\.)|(\\!)|(\\/)|(\\:)|(\\?)|(\\')|(\\])|(\\[)", " ");
        line = line.replaceAll("\\s+", " ");

        StringBuilder newLine = new StringBuilder();
        for (String eachWord : line.split("\\s+")) {
            newLine.append(" ").append(eachWord);
            if (eachWord.contains("-")) { //if the word contains a hyphen
                // For aaa-bb-ccc additionally emit aaabbccc plus aaa, bb, ccc.
                newLine.append(" ").append(eachWord.replaceAll("\\-", ""));
                for (String hyphenPart : eachWord.split("\\-"))
                    newLine.append(" ").append(hyphenPart);
            }
        }
        return newLine.toString();
    }

    /**
     * Shared tokenize-and-filter pipeline used by all preprocess* methods:
     * tokenizes {@code data}, dropping terms that contain a digit, are
     * stopwords, or are a single character; optionally stems survivors.
     *
     * @param data input text
     * @param stem when true, each surviving term is Snowball-stemmed
     * @return filtered (and possibly stemmed) terms in token order, with duplicates
     */
    private static List<String> filterTokens(String data, boolean stem) {
        List<String> terms = new ArrayList<String>();
        Tokenizer analyzer = new Tokenizer(Version.LUCENE_30);
        TokenStream tokenStream = analyzer.tokenStream("", new StringReader(data));
        try {
            while (tokenStream.incrementToken()) {
                String term = tokenStream.getAttribute(TermAttribute.class).term();
                if (digitPattern.matcher(term).find()) //ignore digits
                    continue;
                if (stopwords.contains(term)) //ignore stopwords
                    continue;
                if (term.length() <= 1) //ignore single letter words
                    continue;
                if (stem) {
                    stemmer.setCurrent(term);
                    stemmer.stem();
                    term = stemmer.getCurrent();
                }
                terms.add(term);
            }
        } catch (Exception e) {
            // Preserve original best-effort behavior, but log instead of printStackTrace.
            logger.error("Error while tokenizing input", e);
        }
        return terms;
    }

    /**
     * Builds the union of unigrams and adjacent-pair bigrams (joined with "_")
     * from the given term list, preserving first-seen order.
     *
     * @param terms stemmed terms in token order
     * @return ordered set of unigrams and "a_b" bigrams
     */
    private static Set<String> addUnigramsAndBigrams(List<String> terms) {
        Set<String> transformedSet = new LinkedHashSet<String>();
        for (int i = 0; i < terms.size(); i++) {
            transformedSet.add(terms.get(i)); //add single words
            if (i + 1 < terms.size()) {
                // Adjacent bigram combined with "_", e.g. "heart_rate".
                String bigram = (terms.get(i) + " " + terms.get(i + 1)).trim().replaceAll("\\s+", "_");
                transformedSet.add(bigram);
            }
        }
        return transformedSet;
    }

    /**
     * Removes stopwords, digits and single-letter terms, stems the rest and
     * returns the distinct stemmed terms joined by spaces (with a trailing
     * space, as before). Order is unspecified (HashSet), as in the original.
     *
     * @param data input text
     * @return space-separated distinct stemmed terms
     */
    public static String preprocessStemAndTokenize(String data) {
        // Set keeps only unique terms. (The original's second digit check was
        // redundant and has been removed — the first check already skips digits.)
        Set<String> transformedSet = new HashSet<String>(filterTokens(data, true));
        StringBuilder strBuilder = new StringBuilder();
        for (String token : transformedSet) {
            strBuilder.append(token).append(" ");
        }
        return strBuilder.toString();
    }

    /**
     * Removes stopwords, digits and single-letter terms WITHOUT stemming.
     *
     * @param data input text
     * @return surviving terms joined by single spaces (no leading/trailing space)
     */
    public static String preprocessRemoveStopWords(String data) {
        return StringUtils.join(filterTokens(data, false), " ");
    }

    /**
     * Replaces the special characters ? ( ) , . ! : / ' ] [ } { ; " with white
     * space and collapses runs of whitespace. (Note: unlike
     * {@link #preProcessWordsSpecialCharacters(String)}, hyphens are untouched.)
     *
     * @param line input text
     * @return cleaned text
     */
    public static String removeSpecificCharacters(String line) {
        line = line.replaceAll(
                "(\\?)|(\\()|(\\))|(\\,)|(\\.)|(\\!)|(\\:)|(\\/)|(\\')|(\\])|(\\[)|(\\})|(\\{)|(\\;)|(\")", " ");
        line = line.replaceAll("\\s+", " ");
        return line;
    }

    /**
     * Removes stopwords/digits/single letters, stems, and returns the ordered
     * set of stemmed unigrams plus adjacent "a_b" bigrams.
     *
     * @param data input text (e.g. a variable description)
     * @return ordered set of unigrams and bigrams
     */
    public static Set<String> preprocessStemAndTokenizeAddBigramsInSet(String data) {
        return addUnigramsAndBigrams(filterTokens(data, true));
    }

    /**
     * Removes stopwords/digits/single letters, stems, and returns the distinct
     * stemmed terms joined by spaces in first-seen order.
     *
     * @param data input text
     * @return space-separated distinct stemmed terms
     */
    public static String preprocessStemAndTokenizeReturnDistinctTokens(String data) {
        Set<String> transformedSet = new LinkedHashSet<String>(filterTokens(data, true));
        return StringUtils.join(transformedSet.toArray(), " ");
    }

    /**
     * Same pipeline as {@link #preprocessStemAndTokenizeAddBigramsInSet(String)}
     * but returns the result as a single space-separated string.
     *
     * @param data input text
     * @return space-separated unigrams and "a_b" bigrams
     */
    public static String preprocessStemAndTokenizeAddBigramsInString(String data) {
        Set<String> transformedSet = preprocessStemAndTokenizeAddBigramsInSet(data);
        return StringUtils.join(transformedSet.toArray(new String[transformedSet.size()]), " ");
    }
}