opennlp.tools.parse_thicket.opinion_processor.StopList.java Source code

Java tutorial

Introduction

Here is the source code for opennlp.tools.parse_thicket.opinion_processor.StopList.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package opennlp.tools.parse_thicket.opinion_processor;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;

import opennlp.tools.stemmer.PStemmer;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Process-wide registry of named word lists ("stop lists") loaded from
 * <code>.vcb</code> files found in the <code>maps</code> sub-directory of
 * {@link #resourceDir}.
 * <p>
 * File format, as read by this class: the first non-empty line of a file is
 * the name of the list, every following non-empty line is one entry of that
 * list. Entries are stored and compared verbatim (no trimming, no case
 * folding at load time).
 * <p>
 * Lists are loaded the first time {@link #getInstance()} or
 * {@link #getInstance(String)} is called after {@link #resourceDir} has been
 * set to a new value. Until a directory has been loaded, every lookup simply
 * reports "not in list".
 */
public class StopList {
    /** Shared logger; kept public for backward compatibility. */
    public static final Log logger = LogFactory.getLog(StopList.class);

    /**
     * Directory that contains the <code>maps</code> folder with the
     * <code>.vcb</code> files. Usually set through {@link #getInstance(String)}.
     */
    public static String resourceDir = null;

    private static final String DEFAULT_STOPLIST = "STANDARD";
    private static final String FIRST_NAMES_LIST = "FIRST_NAMES";
    private static final String ENGLISH_DICTIONARY_LIST = "ENG_DICT";
    // NOTE(review): the unusual capitalisation is kept byte-for-byte because the
    // key has to match the first line of the corresponding .vcb file - confirm
    // against the resource file before "fixing" it.
    private static final String EVENT_WORDS_LIST = "fREQUENTEVENTNAMEWORDS";
    private static final String VOCABULARY_SUFFIX = ".vcb";

    /** List name -> entries of that list. */
    private static final Hashtable<String, HashSet<String>> stopLists = new Hashtable<String, HashSet<String>>();
    private static final PStemmer stemmer = new PStemmer();

    private static StopList instance = null;
    /** Value of {@link #resourceDir} at the time of the last load attempt; guarded by the class lock. */
    private static String loadedDir = null;

    /**
     * Get the StopList singleton instance, loading the word lists from
     * {@link #resourceDir} if that directory has not been loaded yet.
     * 
     * @return The StopList
     */
    static public synchronized StopList getInstance() {
        if (instance == null) {
            instance = new StopList();
        }
        ensureLoaded();
        return instance;
    }

    /**
     * Get the StopList singleton instance after pointing it at a resource
     * directory. Unlike a plain singleton check, the lists are (re)loaded
     * whenever <code>dir</code> differs from the directory loaded last, so a
     * call made after the singleton already exists is not silently ignored.
     * 
     * @param dir
     *            directory containing the <code>maps</code> folder
     * @return The StopList
     */
    static public synchronized StopList getInstance(String dir) {
        resourceDir = dir;
        return getInstance();
    }

    /**
     * Loads the lists from {@link #resourceDir} unless that directory is unset
     * or was already processed.
     */
    private static synchronized void ensureLoaded() {
        String dir = resourceDir;
        if (dir == null || dir.equals(loadedDir)) {
            return;
        }
        // Remember the directory even if loading fails, so that a bad path is
        // reported once instead of being rescanned on every call.
        loadedDir = dir;
        loadStopLists(dir);
    }

    /**
     * Reads every <code>.vcb</code> file in <code>dir/maps</code>. A file that
     * cannot be opened is logged and skipped; the remaining files are still
     * loaded.
     */
    private static void loadStopLists(String dir) {
        File mapsDir = new File(dir, "maps");
        String[] children = mapsDir.list();
        if (children == null) {
            logger.error("Problem reading Stop Lists! Not a readable directory: " + mapsDir);
            return;
        }
        for (int i = 0; i < children.length; i++) {
            String fileName = children[i];
            if (!fileName.endsWith(VOCABULARY_SUFFIX)) {
                continue;
            }
            File vocabulary = new File(mapsDir, fileName);
            try {
                loadStopListFile(vocabulary);
            } catch (IOException e) {
                logger.error("Cannot open stop list file " + vocabulary, e);
            }
        }
    }

    /**
     * Reads one list file and registers it under the name found on its first
     * non-empty line, replacing any list previously registered under that
     * name. Blank lines are ignored. If reading fails part-way, the entries
     * read so far are still registered and the failure is logged.
     * 
     * @throws FileNotFoundException
     *             if the file cannot be opened
     */
    private static void loadStopListFile(File f) throws FileNotFoundException {
        BufferedReader in = new BufferedReader(new FileReader(f));
        String listName = null;
        HashSet<String> words = new HashSet<String>();

        try {
            String line;
            while ((line = in.readLine()) != null) {
                if (line.length() == 0) {
                    // a blank line is neither a list name nor a word
                    continue;
                }
                if (listName == null) {
                    listName = line;
                } else {
                    words.add(line);
                }
            }
        } catch (IOException ioe) {
            logger.error("Error while reading stop list file " + f + "; keeping the entries read so far", ioe);
        } finally {
            try {
                // closing the BufferedReader also closes the wrapped FileReader
                in.close();
            } catch (IOException ioe) {
                logger.warn("Cannot close stop list file " + f, ioe);
            }
        }

        if (listName != null) {
            stopLists.put(listName, words);
        }
    }

    /**
     * Null-safe membership test against a named list.
     * 
     * @return <code>true</code> only if the list exists and contains the word
     */
    private static boolean listContains(String listName, String word) {
        if (listName == null || word == null) {
            return false;
        }
        HashSet<String> words = stopLists.get(listName);
        return words != null && words.contains(word);
    }

    /**
     * Is the given word in the stop words list? Uses the default "STANDARD"
     * stoplist
     * 
     * @param str
     *            The word to check
     * @return is a stop word
     */
    public static boolean isStopWord(String str) {
        return listContains(DEFAULT_STOPLIST, str);
    }

    /**
     * Is the given word in the "FIRST_NAMES" list? The word is upper-cased
     * (locale-independently) before the lookup.
     * 
     * @param str
     *            The word to check; <code>null</code> is never a first name
     * @return is a known first name
     */
    public static boolean isFirstName(String str) {
        if (str == null) {
            return false;
        }
        return listContains(FIRST_NAMES_LIST, str.toUpperCase(Locale.ROOT));
    }

    /**
     * Picks a random entry of the "FIRST_NAMES" list.
     * 
     * @return a lower-cased first name, or <code>null</code> if the list is
     *         not loaded or is empty
     */
    public String getRandomFirstName() {
        HashSet<String> firstNames = stopLists.get(FIRST_NAMES_LIST);
        if (firstNames == null || firstNames.isEmpty()) {
            return null;
        }
        int toSkip = (int) (Math.random() * firstNames.size());
        Iterator<String> iter = firstNames.iterator();
        for (int i = 0; i < toSkip; i++) {
            iter.next();
        }
        return iter.next().toLowerCase(Locale.ROOT);
    }

    /**
     * Is the stem of the given word in the "ENG_DICT" list?
     * 
     * @param str
     *            The word to check; <code>null</code> is reported as common
     * @return is a common English word
     */
    public static boolean isCommonWord(String str) {
        if (str == null) {
            return true;
        }
        String stemmed = "";
        try {
            stemmed = stemmer.stem(str).toLowerCase(Locale.ROOT);
        } catch (Exception e) {
            // Stemming failures are not informative: the word is looked up as
            // the empty string, i.e. it is treated as not common.
            if (logger.isDebugEnabled()) {
                logger.debug("could not stem '" + str + "'", e);
            }
        }
        return listContains(ENGLISH_DICTIONARY_LIST, stemmed);
    }

    /**
     * Is the lower-cased word in the list of frequent event-name words?
     * 
     * @param str
     *            The word to check; <code>null</code> is reported as common
     * @return is a frequent event-name word
     */
    public boolean isCommonEventWord(String str) {
        if (str == null) {
            return true;
        }
        return listContains(EVENT_WORDS_LIST, str.toLowerCase(Locale.ROOT));
    }

    /**
     * Is the given word in the stop words list provided?
     * 
     * @param str
     *            The word to check
     * @param stop_list
     *            the name of the stoplist to check against
     * @return is a stop word
     */
    public static boolean isStopWord(String str, String stop_list) {
        return listContains(stop_list, str);
    }

    /** Instance-level alias of {@link #isStopWord(String)}. */
    public boolean isStopWordAll(String str) {
        return isStopWord(str);
    }

    /**
     * @param name
     *            name of a loaded list
     * @return the live set backing that list, or <code>null</code> if no such
     *         list is loaded
     */
    public HashSet<String> getStopListMap(String name) {
        return stopLists.get(name);
    }

    /**
     * Shape checks shared by both "like" filters. Logs the reason and returns
     * <code>false</code> when the (already lower-cased) like contains
     * characters other than letters, digits and spaces, is purely numeric, or
     * is shorter than four characters.
     */
    private static boolean passesShapeChecks(String like) {
        if (!StringUtils.isAlphanumeric(like.replace(" ", ""))) {
            logger.info("removed isAlphanumeric " + like);
            return false;
        }
        if (StringUtils.isNumeric(like)) {
            logger.info("removed isNumericSpace " + like);
            return false;
        }
        if (like.length() < 4) {
            logger.info("removed too short likes " + like);
            return false;
        }
        return true;
    }

    /**
     * Splits a like into its words. Leading, trailing and repeated spaces do
     * not produce empty words, so the length of the result is the real word
     * count.
     */
    private static String[] splitWords(String like) {
        String trimmed = like.trim();
        if (trimmed.length() == 0) {
            return new String[0];
        }
        return trimmed.split(" +");
    }

    /** @return whether at least one word is a known first name */
    private static boolean containsFirstName(String[] words) {
        for (String word : words) {
            if (isFirstName(word)) {
                return true;
            }
        }
        return false;
    }

    /** @return whether every word is a common English word (vacuously true for no words) */
    private static boolean allCommonWords(String[] words) {
        for (String word : words) {
            if (!isCommonWord(word)) {
                return false;
            }
        }
        return true;
    }

    /** Joins the words that are neither stop words nor shorter than three characters with single spaces. */
    private static String dropStopAndShortWords(String[] words) {
        StringBuilder kept = new StringBuilder();
        for (String word : words) {
            if (isStopWord(word) || word.length() < 3) {
                continue;
            }
            if (kept.length() > 0) {
                kept.append(' ');
            }
            kept.append(word);
        }
        return kept.toString();
    }

    /**
     * Filters a user's likes, separating out entries that are just common
     * English expressions.
     * <p>
     * The result always has two elements: index 0 is the list of likes to use,
     * index 1 the list of single common-English words set aside as potential
     * categories. Inputs with fewer than six likes are returned unfiltered.
     * Otherwise each like is lower-cased and
     * <ul>
     * <li>dropped if it fails the shape checks (non-alphanumeric, numeric,
     * shorter than four characters);</li>
     * <li>moved to the potential categories if it is a single common English
     * word and not a first name;</li>
     * <li>dropped if it is a two-word phrase of common English words without
     * a first name;</li>
     * <li>dropped if it is a lone first name;</li>
     * <li>otherwise kept with its stop words and words shorter than three
     * characters removed (dropped if nothing remains).</li>
     * </ul>
     * Kept likes are de-duplicated in encounter order. If fewer than two likes
     * survive, no reduction is applied and the original list is returned at
     * index 0.
     * 
     * @param userLikes
     *            the likes to filter; <code>null</code> is treated as empty and
     *            <code>null</code> elements are skipped
     * @return two lists: likes to use, potential categories
     */
    public static List<List<String>> preFilterCommonEnglishExpressions(List<String> userLikes) {
        ensureLoaded();

        List<List<String>> results = new ArrayList<List<String>>();
        List<String> potentialCategs = new ArrayList<String>();

        if (userLikes == null || userLikes.size() < 6) {// too short, do not filter
            results.add(userLikes == null ? new ArrayList<String>() : userLikes);
            results.add(potentialCategs);
            return results;
        }

        // insertion-ordered set: removes duplicates without scrambling the order
        LinkedHashSet<String> kept = new LinkedHashSet<String>();
        for (String rawLike : userLikes) {
            if (rawLike == null) {
                continue;
            }
            String like = rawLike.toLowerCase(Locale.ROOT);
            if (!passesShapeChecks(like)) {
                continue;
            }

            String[] words = splitWords(like);
            boolean existFirstName = containsFirstName(words);
            boolean plainEnglish = !existFirstName && allCommonWords(words);

            // The one-word case has to be tested before the "shorter than 3"
            // case: tested afterwards it can never be reached and no potential
            // category would ever be collected.
            if (plainEnglish && words.length == 1) {
                logger.info("moved to category: NoFirstName+AllCommonEng+Short1word " + like);
                potentialCategs.add(like);
                continue;
            }
            if (plainEnglish && words.length < 3) {
                logger.info("moved to category:  NoFirstName+AllCommonEng+ShorterThan3 " + like);
                continue;
            }
            if (existFirstName && words.length == 1) {
                logger.info("removed : only first name, no last name " + like);
                continue;
            }

            String filtered = dropStopAndShortWords(words);
            if (filtered.length() == 0) {
                logger.info("removed : only stop words or too short words " + like);
                continue;
            }
            kept.add(filtered);
        }

        List<String> resultUserLikes = new ArrayList<String>(kept);
        // with fewer than two survivors, do not do reduction
        results.add(resultUserLikes.size() > 1 ? resultUserLikes : userLikes);
        results.add(potentialCategs);
        return results;
    }

    /**
     * Decides whether a single like is specific enough to be used. A like is
     * rejected when it fails the shape checks (non-alphanumeric, numeric,
     * shorter than four characters), when it consists of fewer than three
     * words that are all common English and none of which is a first name, or
     * when it is a lone first name.
     * 
     * @param like
     *            the like to check; <code>null</code> is not acceptable
     * @return whether the like is acceptable
     */
    public static boolean isAcceptableIndividualLikes(String like) {
        if (like == null) {
            return false;
        }
        // makes sure the word lists have been loaded
        getInstance();

        like = like.toLowerCase(Locale.ROOT);
        if (!passesShapeChecks(like)) {
            return false;
        }

        String[] words = splitWords(like);
        boolean existFirstName = containsFirstName(words);
        boolean plainEnglish = !existFirstName && allCommonWords(words);

        // covers the one-word case as well, so no separate check is needed
        if (plainEnglish && words.length < 3) {
            logger.info("  NoFirstName+AllCommonEng+ShorterThan3 " + like);
            return false;
        }
        if (existFirstName && words.length == 1) {
            logger.info("removed : only first name, no last name " + like);
            return false;
        }
        return true;
    }

    /**
     * Small manual smoke test that prints the outcome of a few lookups.
     * 
     * @param args
     *            optional: <code>args[0]</code> is the resource directory that
     *            contains the <code>maps</code> folder; the historical
     *            developer path is used when it is omitted
     */
    public static void main(String[] args) {
        String dir = args != null && args.length > 0 ? args[0]
                : "/Users/borisgalitsky/Documents/workspace/opennlp-similarity/src/test/resources/";
        StopList list = StopList.getInstance(dir);

        System.out.println("isCommonWord(demonstration) = " + isCommonWord("demonstration"));
        System.out.println("getRandomFirstName() = " + list.getRandomFirstName());

        String[] eventWords = { "tour", "dance", "salsa", "center", "family" };
        for (String word : eventWords) {
            System.out.println("isCommonEventWord(" + word + ") = " + list.isCommonEventWord(word));
        }

        String[] likes = { "forest glen", "drive", "house", "Timothy Kloug", "Mamma Mia" };
        for (String like : likes) {
            System.out.println("isAcceptableIndividualLikes(" + like + ") = " + isAcceptableIndividualLikes(like));
        }
    }
}