edu.stanford.muse.index.NEROld.java Source code

Introduction

Here is the source code for edu.stanford.muse.index.NEROld.java
Source

/*
 Copyright (C) 2012 The Stanford MobiSocial Laboratory
    
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
    
   http://www.apache.org/licenses/LICENSE-2.0
    
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/
package edu.stanford.muse.index;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.zip.GZIPInputStream;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jsoup.Jsoup;

import edu.stanford.muse.util.DictUtils;
import edu.stanford.muse.util.Pair;
import edu.stanford.muse.util.Util;
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.util.Triple;

public class NEROld {
    // Shared logger for this class. Note that it is registered under NER.class, not NEROld.class.
    public static Log log = LogFactory.getLog(NER.class);
    // When true, parseAndGetOffsets() replaces every char above 127 with a space before running NER.
    // The static initializer switches this on when the "muse.remove18n" system property is set.
    public static boolean REMOVE_I18N_CHARS = false;

    // lower-cased location name -> location info; filled by readLocationsFreebase() and readLocationsWG()
    public static final Map<String, LocationInfo> locations = new LinkedHashMap<String, LocationInfo>();
    // lower-cased location name -> largest population seen for that name; filled by readLocationsWG()
    public static final Map<String, Long> populations = new LinkedHashMap<String, Long>();
    // lower-cased names that readLocationsWG() skips; filled by readLocationNamesToSuppress()
    public static final Set<String> locationsToSuppress = new LinkedHashSet<String>();
    // entity type -> weight; the static initializer registers a weight for "PERSON" here
    public static Map<String, Integer> defaultTokenTypeWeights = new LinkedHashMap<String, Integer>();
    public static final int MAX_NAME_LENGTH = 1000; // max # of chars in a name. above this we'll drop it. avoids NER returning garbage like sometimes it returns a string of 200K 'a's.

    // One-time class setup: initializes NER, registers the default PERSON weight and reads the
    // "muse.remove18n" system property.
    static {
        // The location readers (readLocationNamesToSuppress / readLocationsFreebase / readLocationsWG)
        // are not invoked here; those calls are disabled.
        try {
            // NOTE(review): this initializes NER, not this class's own classifier field -- confirm intended.
            NER.initialize();
        } catch (Exception e) {
            // keep the original failure as the cause so the real reason shows up in the stack trace
            throw new RuntimeException("Unable to initialize NER", e);
        }
        defaultTokenTypeWeights.put("PERSON", 10000); // 1000 FLAG DEBUG
        if (System.getProperty("muse.remove18n") != null) {
            REMOVE_I18N_CHARS = true;
            log.info("Remove i18n chars = true");
        }
    }

    /**
     * Loads location coordinates from the gzipped classpath resource "locations.gz" into
     * {@link #locations}. Each useful line holds exactly three tab-separated fields
     * (name, latitude, longitude); lines with any other field count are ignored. Entries are
     * keyed by the lower-cased name, so a later line with the same name replaces an earlier one.
     *
     * @throws IOException if the resource is missing or cannot be read
     */
    public static void readLocationsFreebase() throws IOException {
        InputStream raw = NER.class.getClassLoader().getResourceAsStream("locations.gz");
        if (raw == null)
            throw new IOException("Resource not found on classpath: locations.gz");

        LineNumberReader lnr = null;
        try {
            lnr = new LineNumberReader(new InputStreamReader(new GZIPInputStream(raw), "UTF-8"));
            while (true) {
                String line = lnr.readLine();
                if (line == null)
                    break;
                StringTokenizer st = new StringTokenizer(line, "\t");
                if (st.countTokens() == 3) {
                    String locationName = st.nextToken();
                    String canonicalName = locationName.toLowerCase();
                    String lat = st.nextToken();
                    String longi = st.nextToken();
                    locations.put(canonicalName, new LocationInfo(locationName, lat, longi));
                }
            }
        } finally {
            // release the stream whether or not reading succeeded; fall back to the raw stream
            // if the gzip/reader wrappers could not be constructed
            if (lnr != null)
                lnr.close();
            else
                raw.close();
        }
    }

    /**
     * Loads locations from the gzipped classpath resource "WG.locations.txt.gz" into
     * {@link #locations} and {@link #populations}. Each useful line holds exactly four
     * tab-separated fields: name, latitude and longitude (integers, divided by 100 here) and
     * population. Names listed in {@link #locationsToSuppress} are skipped. When a name occurs
     * more than once, the entry with the largest population wins.
     *
     * Best effort: any failure (missing resource, malformed number, I/O error) stops the load,
     * is logged as a warning and is not propagated; entries read before the failure are kept.
     */
    public static void readLocationsWG() {
        InputStream raw = null;
        LineNumberReader lnr = null;
        try {
            raw = NER.class.getClassLoader().getResourceAsStream("WG.locations.txt.gz");
            if (raw == null)
                throw new IOException("Resource not found on classpath: WG.locations.txt.gz");
            lnr = new LineNumberReader(new InputStreamReader(new GZIPInputStream(raw), "UTF-8"));
            while (true) {
                String line = lnr.readLine();
                if (line == null)
                    break;
                StringTokenizer st = new StringTokenizer(line, "\t");
                if (st.countTokens() == 4) {
                    String locationName = st.nextToken();
                    String canonicalName = locationName.toLowerCase();
                    if (locationsToSuppress.contains(canonicalName))
                        continue;
                    String lat = st.nextToken();
                    String longi = st.nextToken();
                    String pop = st.nextToken();
                    long popl = Long.parseLong(pop);
                    float latf = ((float) Integer.parseInt(lat)) / 100.0f;
                    float longif = ((float) Integer.parseInt(longi)) / 100.0f;
                    Long existingPop = populations.get(canonicalName);
                    // keep only the most populous place for each name
                    if (existingPop == null || popl > existingPop) {
                        populations.put(canonicalName, popl);
                        locations.put(canonicalName,
                                new LocationInfo(locationName, Float.toString(latf), Float.toString(longif)));
                    }
                }
            }
        } catch (Exception e) {
            log.warn("Unable to read World Gazetteer file, places info may be inaccurate");
            log.debug(Util.stackTrace(e));
        } finally {
            // always release the stream; a failure while closing is not actionable here
            try {
                if (lnr != null)
                    lnr.close();
                else if (raw != null)
                    raw.close();
            } catch (IOException ignored) {
                // nothing further to do
            }
        }
    }

    /**
     * Loads the names that must not be treated as locations from the gzipped classpath resource
     * "suppress.locations.txt.gz" into {@link #locationsToSuppress}. Only the first
     * whitespace-delimited token of each line is used, lower-cased; lines whose first token
     * starts with '#' are ignored.
     *
     * Best effort: a failure is logged and not propagated. The final size of the set is logged
     * in either case.
     */
    public static void readLocationNamesToSuppress() {
        String suppress_file = "suppress.locations.txt.gz";
        InputStream raw = null;
        LineNumberReader lnr = null;
        try {
            raw = NER.class.getClassLoader().getResourceAsStream(suppress_file);
            if (raw == null)
                throw new IOException("Resource not found on classpath: " + suppress_file);
            lnr = new LineNumberReader(new InputStreamReader(new GZIPInputStream(raw), "UTF-8"));
            while (true) {
                String line = lnr.readLine();
                if (line == null)
                    break;
                StringTokenizer st = new StringTokenizer(line);
                if (st.hasMoreTokens()) {
                    String s = st.nextToken();
                    if (!s.startsWith("#"))
                        locationsToSuppress.add(s.toLowerCase());
                }
            }
        } catch (Exception e) {
            log.warn("Error: unable to read " + suppress_file);
            Util.print_exception(e);
        } finally {
            // always release the stream; a failure while closing is not actionable here
            try {
                if (lnr != null)
                    lnr.close();
                else if (raw != null)
                    raw.close();
            } catch (IOException ignored) {
                // nothing further to do
            }
        }
        log.info(locationsToSuppress.size() + " names to suppress as locations");
    }

    // The CRF model used by this class. Null until initialize() (or main) assigns it, and reset to
    // null by release_classifier(). Declared as a raw type, so results of its calls are unchecked.
    public static CRFClassifier classifier;

    /**
     * Loads the serialized CRF model "ner-eng-ie.crf-3-all2008.ser.gz" from the classpath into
     * {@link #classifier} if it has not been loaded yet. Calling it again while a classifier is
     * present does nothing.
     *
     * @throws IOException            if the model resource is missing or cannot be read
     * @throws ClassCastException     propagated from CRFClassifier.getClassifier
     * @throws ClassNotFoundException propagated from CRFClassifier.getClassifier
     */
    public synchronized static void initialize() throws ClassCastException, IOException, ClassNotFoundException {
        if (classifier != null)
            return;

        String serializedClassifier = "ner-eng-ie.crf-3-all2008.ser.gz";
        InputStream raw = NER.class.getClassLoader().getResourceAsStream(serializedClassifier);
        if (raw == null)
            throw new IOException("NER model not found on classpath: " + serializedClassifier);

        InputStream in = null;
        try {
            in = new GZIPInputStream(raw);
            classifier = CRFClassifier.getClassifier(in);
        } finally {
            // release the model stream once loading has finished or failed; a failure on close
            // must not mask the outcome of the load itself
            try {
                if (in != null)
                    in.close();
                else
                    raw.close();
            } catch (IOException ignored) {
                // nothing further to do
            }
        }
    }

    /**
     * Drops the reference to the loaded classifier. A later call to initialize() will load the
     * model again because it only loads when the field is null.
     */
    public synchronized static void release_classifier() {
        classifier = null;
    }

    // Every entity type label seen so far by parseAndGetOffsets() ("UNKNOWN" stands in for a null
    // type). Reported by printAllTypes().
    public static Set<String> allTypes = new LinkedHashSet<String>();

    /**
     * Runs NER over the text and returns only the tokenizer over the recognized entities,
     * discarding the character offsets. Arguments are passed unchanged to parseAndGetOffsets().
     */
    public synchronized static MyTokenizer parse(String documentText, boolean locationsOnly, boolean orgsOnly,
            Map<String, Integer> locationCounts) {
        Pair<MyTokenizer, List<Triple<String, Integer, Integer>>> tokenizerAndOffsets = parseAndGetOffsets(
                documentText, locationsOnly, orgsOnly, locationCounts);
        return tokenizerAndOffsets.first;
    }

    /**
     * Replaces every char above ASCII 127 with a space, keeping the string length unchanged.
     * Logs the number of replacements when there is at least one.
     *
     * @param s text to clean
     * @return the same string instance when nothing was replaced, otherwise a cleaned copy
     */
    private static String cleanI18NChars(String s) {
        char[] chars = s.toCharArray();
        int nCharsReplaced = 0;
        for (int i = 0; i < chars.length; i++) {
            // char is an unsigned type, so it can never be negative; only the upper bound matters
            if (chars[i] > 127) {
                chars[i] = ' ';
                nCharsReplaced++;
            }
        }
        if (nCharsReplaced == 0)
            return s;

        log.info("i18n char(s) replaced: " + nCharsReplaced);
        return new String(chars);
    }

    /**
     * Runs the classifier over the text and returns the recognized entities together with the
     * raw character offsets.
     *
     * The text is first normalized: non-breaking spaces (0xA0) become plain spaces, chars above
     * 127 are blanked when REMOVE_I18N_CHARS is set, and getSafeText() may append a suffix. None
     * of these steps shifts existing characters, so offsets still index into the caller's text.
     *
     * Only PERSON, ORGANIZATION and LOCATION entities are turned into tokens. A leading "hi " or
     * "dear " (any case) is stripped from a token, and tokens longer than MAX_NAME_LENGTH are
     * dropped. With locationsOnly, only LOCATION tokens found in {@link #locations} are kept;
     * otherwise with orgsOnly, only ORGANIZATION tokens are kept; otherwise all three types are
     * kept and, if locationCounts is non-null, each LOCATION increments its lower-cased entry.
     *
     * Each triple is {@code <entity type, start char offset (inclusive), end char offset (exclusive)>}.
     * see http://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/ie/AbstractSequenceClassifier.html#classifyToCharacterOffsets(java.lang.String)
     *
     * @return pair of (tokenizer over the kept entities, all triples reported by the classifier,
     *         unfiltered)
     */
    private synchronized static Pair<MyTokenizer, List<Triple<String, Integer, Integer>>> parseAndGetOffsets(
            String documentText, boolean locationsOnly, boolean orgsOnly, Map<String, Integer> locationCounts) {
        // 0xA0 is seen often and generates a lot of annoying messages. String.replace also covers
        // an occurrence at index 0 and returns the same string when there is nothing to replace.
        documentText = documentText.replace('\u00A0', ' ');

        // replace i18n chars with space, causes annoying NER messages + perhaps slows down NER?
        if (REMOVE_I18N_CHARS)
            documentText = cleanI18NChars(documentText);

        final List<Pair<String, String>> tokensList = new ArrayList<Pair<String, String>>();

        // Entities are taken from character offsets (phrases) rather than from word-by-word
        // classification.
        try {
            // NOTE(review): this initializes NER, while the classifier used below is this class's
            // own field -- confirm that the field is populated by the time we get here.
            NER.initialize();
        } catch (Exception e) {
            Util.print_exception(e, log);
        }

        documentText = getSafeText(documentText);
        List<Triple<String, Integer, Integer>> triples = classifier.classifyToCharacterOffsets(documentText);
        for (Triple<String, Integer, Integer> t : triples) {
            String type = t.first();
            if (type == null)
                type = "UNKNOWN"; // we see type = null sometimes #!@#$
            allTypes.add(type);
            if (type.equals("PERSON") || type.equals("ORGANIZATION") || type.equals("LOCATION")) {
                String token = documentText.substring(t.second(), t.third());
                // we tend to see a lot of annoying [Hi Sam] or [Dear Caroline] phrases. surprising NER can't handle it already.
                if (token.toLowerCase().startsWith("hi "))
                    token = token.substring("hi ".length()).trim();
                if (token.toLowerCase().startsWith("dear "))
                    token = token.substring("dear ".length()).trim();
                if (token.length() > MAX_NAME_LENGTH) // drop it
                    continue;
                if (locationsOnly) {
                    if (type.equals("LOCATION")) {
                        // only locations we have coordinates for
                        if (locations.containsKey(token.toLowerCase()))
                            tokensList.add(new Pair<String, String>(token, type));
                    }
                } else if (orgsOnly) {
                    if (type.equals("ORGANIZATION"))
                        tokensList.add(new Pair<String, String>(token, type));
                } else {
                    tokensList.add(new Pair<String, String>(token, type));
                    if (locationCounts != null && type.equals("LOCATION")) {
                        Integer I = locationCounts.get(token.toLowerCase());
                        locationCounts.put(token.toLowerCase(), (I == null) ? 1 : I + 1);
                    }
                }
            }
        }

        return new Pair<MyTokenizer, List<Triple<String, Integer, Integer>>>(new NERTokenizer(tokensList), triples);
    }

    /**
     * Returns text that is guaranteed not to name an existing file.
     *
     * Workaround for "Warning: calling makeObjectBankFromString with an existing file name! This
     * will open the file instead.": if the text happens to match an existing filename,
     * makeObjectBankFromString, which is used by classifyToCharacterOffsets, can misbehave. In
     * that case a suffix of characters that should not occur in filenames is appended until no
     * such file exists. Text that does not name a file is returned unchanged.
     */
    private static String getSafeText(String documentText) {
        if (!new File(documentText).exists())
            return documentText;

        // "." and ".." always exist, so they are not worth a warning
        boolean isDotEntry = ".".equals(documentText) || "..".equals(documentText);
        if (!isDotEntry) {
            // would still have problem with "/home" or "\windows\system\" etc.
            log.warn("Muse should be launched from an empty directory to workaround a caveat of name extraction");
        }

        String safeText = documentText;
        do {
            // a number of invalid chars that should not be in filename
            safeText += " % ? * / \\ | > < \" :";
        } while (new File(safeText).exists());
        return safeText;
    }

    /** Logs, via the Indexer log, how many entity types have been seen, then each type on its own line. */
    public static void printAllTypes() {
        Indexer.log.info(allTypes.size() + " NER types");
        java.util.Iterator<String> types = allTypes.iterator();
        while (types.hasNext()) {
            Indexer.log.info(types.next());
        }
    }

    /** Same as namesFromURL(url, false): extracts names from the page without dropping common names. */
    public static List<Pair<String, Float>> namesFromURL(String url)
            throws ClassCastException, IOException, ClassNotFoundException {
        final boolean removeCommonNames = false;
        List<Pair<String, Float>> names = namesFromURL(url, removeCommonNames);
        return names;
    }

    /**
     * Fetches the page at the given URL, reduces it to the plain text of its body and extracts
     * names from that text with namesFromText(text, removeCommonNames).
     *
     * The response bytes are decoded as UTF-8. Only http(s) URLs are supported: the connection
     * is cast to HttpURLConnection, so other schemes fail with ClassCastException.
     *
     * @return a list of {@code <name, weight>} pairs as produced by namesFromText
     * @throws IOException if the page cannot be fetched
     */
    public static List<Pair<String, Float>> namesFromURL(String url, boolean removeCommonNames)
            throws ClassCastException, IOException, ClassNotFoundException {
        // only http conns allowed currently
        Indexer.log.info(url);
        HttpURLConnection conn = (HttpURLConnection) new URL(url).openConnection();
        String text;
        try {
            conn.setInstanceFollowRedirects(true);
            conn.setRequestProperty("User-agent",
                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:6.0.2) Gecko/20100101 Firefox/6.0.2");
            conn.connect();
            Indexer.log.info("url for extracting names:" + conn.getURL());
            InputStream in = conn.getInputStream();
            try {
                byte[] b = Util.getBytesFromStream(in);
                text = new String(b, "UTF-8");
            } finally {
                in.close(); // closing an already-closed stream is harmless
            }
        } finally {
            // release the connection on success and on failure alike
            conn.disconnect();
        }
        text = Util.unescapeHTML(text);
        org.jsoup.nodes.Document doc = Jsoup.parse(text);
        text = doc.body().text();
        return namesFromText(text, removeCommonNames);
    }

    /**
     * Fetches the page at the given URL, reduces it to the plain text of its body and extracts
     * names from that text with namesFromText(text, removeCommonNames). Behaves like
     * namesFromURL(String, boolean) but does not log the URL.
     *
     * The response bytes are decoded as UTF-8. Only http(s) URLs are supported: the connection
     * is cast to HttpURLConnection, so other schemes fail with ClassCastException.
     *
     * @return a list of {@code <name, weight>} pairs as produced by namesFromText
     * @throws IOException if the page cannot be fetched
     */
    public static List<Pair<String, Float>> namesFromArchive(String url, boolean removeCommonNames)
            throws ClassCastException, IOException, ClassNotFoundException {
        // only http conns allowed currently
        HttpURLConnection conn = (HttpURLConnection) new URL(url).openConnection();
        String text;
        try {
            conn.setInstanceFollowRedirects(true);
            conn.setRequestProperty("User-agent",
                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:6.0.2) Gecko/20100101 Firefox/6.0.2");
            conn.connect();

            InputStream in = conn.getInputStream();
            try {
                byte[] b = Util.getBytesFromStream(in);
                text = new String(b, "UTF-8");
            } finally {
                in.close(); // closing an already-closed stream is harmless
            }
        } finally {
            // release the connection on success and on failure alike
            conn.disconnect();
        }
        text = Util.unescapeHTML(text);
        org.jsoup.nodes.Document doc = Jsoup.parse(text);
        text = doc.body().text();
        return namesFromText(text, removeCommonNames);
    }

    /** Same as namesFromURLs(urls, false): extracts names from every URL without dropping common names. */
    public static List<Pair<String, Float>> namesFromURLs(String urls) {
        final boolean removeCommonNames = false;
        List<Pair<String, Float>> names = namesFromURLs(urls, removeCommonNames);
        return names;
    }

    /**
     * Like namesFromURL, but takes multiple whitespace-separated urls and concatenates the
     * per-URL results in order. A URL that fails is reported through Util.print_exception and
     * skipped; the remaining URLs are still processed.
     */
    public static List<Pair<String, Float>> namesFromURLs(String urls, boolean removeCommonNames) {
        List<Pair<String, Float>> allNames = new ArrayList<Pair<String, Float>>();
        for (StringTokenizer urlTokens = new StringTokenizer(urls); urlTokens.hasMoreTokens();) {
            try {
                String nextUrl = urlTokens.nextToken();
                List<Pair<String, Float>> namesForUrl = namesFromURL(nextUrl, removeCommonNames);
                allNames.addAll(namesForUrl);
            } catch (Exception e) {
                Util.print_exception(e);
            }
        }
        return allNames;
    }

    /**
     * Same as namesFromText(text, false): extracts names from the text without dropping common
     * names.
     *
     * @return a list of {@code <name, weight>} pairs
     */
    public static List<Pair<String, Float>> namesFromText(String text)
            throws ClassCastException, IOException, ClassNotFoundException {
        final boolean removeCommonNames = false;
        List<Pair<String, Float>> names = namesFromText(text, removeCommonNames);
        return names;
    }

    /**
     * Extracts names from the text using a fresh weight table that contains only PERSON with a
     * weight of 100. The static defaultTokenTypeWeights field is not consulted here.
     */
    public static List<Pair<String, Float>> namesFromText(String text, boolean removeCommonNames)
            throws ClassCastException, IOException, ClassNotFoundException {
        Map<String, Integer> personWeights = new LinkedHashMap<String, Integer>();
        personWeights.put("PERSON", 100); // 1000 FLAG DEBUG
        List<Pair<String, Float>> names = namesFromText(text, removeCommonNames, personWeights);
        return names;
    }

    /**
     * Extracts names from the text with the given type weights, without length normalization
     * and without positional boost (max_positional_boost = 0).
     */
    public static List<Pair<String, Float>> namesFromText(String text, boolean removeCommonNames,
            Map<String, Integer> tokenTypeWeights) throws ClassCastException, IOException, ClassNotFoundException {
        final boolean normalizeByLength = false;
        final int noPositionalBoost = 0;
        return namesFromText(text, removeCommonNames, tokenTypeWeights, normalizeByLength, noPositionalBoost);
    }

    /**
     * Runs NER over the text and groups the recognized names by entity type.
     *
     * @return pair whose first element maps each category to the list of names in that category
     *         (categories and names in order of appearance), and whose second element is the
     *         list of offset triples returned by parseAndGetOffsets
     */
    public static Pair<Map<String, List<String>>, List<Triple<String, Integer, Integer>>> categoryToNamesMap(
            String text) {
        Pair<MyTokenizer, List<Triple<String, Integer, Integer>>> parsed = parseAndGetOffsets(text, false, false,
                null);
        NERTokenizer entities = (NERTokenizer) parsed.first;

        // bucket every entity under its category, creating the bucket on first use
        Map<String, List<String>> namesByCategory = new LinkedHashMap<String, List<String>>();
        while (entities.hasMoreTokens()) {
            Pair<String, String> entity = entities.nextTokenAndType();
            String name = entity.getFirst();
            String category = entity.getSecond();
            List<String> bucket = namesByCategory.get(category);
            if (bucket == null) {
                bucket = new ArrayList<String>();
                namesByCategory.put(category, bucket);
            }
            bucket.add(name);
        }

        return new Pair<Map<String, List<String>>, List<Triple<String, Integer, Integer>>>(namesByCategory,
                parsed.second);
    }

    /**
     * Returns a list of {@code <name, weighted #occurrences>}, ordered by Util.sortMapByValue.
     *
     * Entities come from NER.parse. An entity is discarded when its trimmed, lower-cased form is
     * a dictionary word, is a common name (only if removeCommonNames is set), or contains '|'.
     * Each remaining occurrence contributes
     * {@code basic weight * positional boost * type boost} to its lower-cased term:
     * <ul>
     * <li>basic weight is 1, or 1/(number of kept entities) when normalizeByLength is set;</li>
     * <li>positional boost is 1 when max_positional_boost is 0 or less, otherwise it decreases
     *     linearly from max_positional_boost for the first entity towards 0;</li>
     * <li>type boost is 1 when tokenTypeWeights is null; otherwise it is the weight registered
     *     for the entity type (1 if none), multiplied by the number of words for multi-word
     *     names, and 0 for names of 3 chars or fewer.</li>
     * </ul>
     * The total for a term is reported under the spelling of its first occurrence.
     */
    public static List<Pair<String, Float>> namesFromText(String text, boolean removeCommonNames,
            Map<String, Integer> tokenTypeWeights, boolean normalizeByLength, int max_positional_boost)
            throws ClassCastException, IOException, ClassNotFoundException {
        NERTokenizer entities = (NERTokenizer) NER.parse(text, false, false, null);

        // step 1: filter out entities that should not be scored
        List<Pair<String, String>> kept = new ArrayList<Pair<String, String>>();
        while (entities.hasMoreTokens()) {
            Pair<String, String> entity = entities.nextTokenAndType();
            String candidate = entity.getFirst().trim().toLowerCase();
            // drop dictionary words, optionally common names, and "terms" with | -- often used as a
            // separator on the web, and totally confuses our indexer's phrase lookup
            if ((DictUtils.fullDictWords != null && DictUtils.fullDictWords.contains(candidate))
                    || (removeCommonNames && DictUtils.topNames != null && DictUtils.topNames.contains(candidate))
                    || candidate.indexOf("|") >= 0)
                continue;
            kept.add(entity);
        }

        // step 2: accumulate a score per lower-cased term (kept entities may differ only in case)
        final int keptCount = kept.size();
        // if normalizing by length, each occurrence of a token has a weight of 1/keptCount
        final float occurrenceWeight = normalizeByLength ? 1.0f / keptCount : 1.0f;
        Map<String, Float> scoreByNormalizedTerm = new LinkedHashMap<String, Float>();
        int position = 0;
        for (Pair<String, String> entity : kept) {
            // higher boost based on earlier position in doc
            float positionFactor = 1.0f;
            if (max_positional_boost > 0) {
                positionFactor = (keptCount - position) * max_positional_boost * 1.0f / keptCount;
                position++;
            }

            // boost based on token type
            float typeFactor = 1.0f;
            if (tokenTypeWeights != null) {
                Integer typeWeight = tokenTypeWeights.get(entity.getSecond());
                typeFactor = (typeWeight != null) ? typeWeight : 1.0f;

                String name = entity.getFirst();
                if (name.length() <= 3) {
                    // names of 3 chars or fewer should not count
                    typeFactor = 0.0f;
                } else {
                    // boost terms that have a space in them, 'cos they're less likely to be ambiguous
                    int wordCount = new StringTokenizer(name).countTokens();
                    if (wordCount > 1)
                        typeFactor *= wordCount;
                }
            }

            float contribution = occurrenceWeight * positionFactor * typeFactor;

            String normalizedTerm = entity.getFirst().toLowerCase();
            Float soFar = scoreByNormalizedTerm.get(normalizedTerm);
            // a term seen for the first time starts from 0
            scoreByNormalizedTerm.put(normalizedTerm, (soFar == null ? 0.0f : soFar) + contribution);
        }

        // step 3: re-key the scores by the original spelling of each term's first occurrence
        Map<String, Float> scoreByOriginalTerm = new LinkedHashMap<String, Float>();
        for (Pair<String, String> entity : kept) {
            String originalTerm = entity.getFirst();
            Float total = scoreByNormalizedTerm.remove(originalTerm.toLowerCase());
            if (total != null)
                scoreByOriginalTerm.put(originalTerm, total);
        }
        assert (scoreByNormalizedTerm.isEmpty());

        List<Pair<String, Float>> ranked = Util.sortMapByValue(scoreByOriginalTerm);
        return ranked;
    }

    /**
     * Classifies the text and returns it with everything except the recognized names redacted,
     * by delegating to retainOnlyNames(text, offsets). Note that
     * classifier.classifyToCharacterOffsets() is compute intensive.
     *
     * An initialization failure is reported through Util.print_exception and not rethrown.
     * NOTE(review): initialization goes through NER, while the classifier used is this class's
     * own field -- confirm that the field is populated by the time this runs.
     */
    public static String retainOnlyNames(String text) {
        try {
            NER.initialize();
        } catch (Exception e) {
            Util.print_exception(e, log);
        }

        // classify a filename-safe variant of the text, but redact the text as given
        List<Triple<String, Integer, Integer>> offsets = classifier.classifyToCharacterOffsets(getSafeText(text));
        return retainOnlyNames(text, offsets);
    }

    /**
     * Returns a copy of the text in which the spans named by the offsets are kept verbatim and
     * every other character that is neither punctuation nor whitespace is replaced by '.'.
     * The result has the same length as the input.
     *
     * The offsets list is only read; it is never modified.
     *
     * @param text    the text to redact
     * @param offsets triples whose second and third elements are the start (inclusive) and end
     *                (exclusive) char offsets of a name, in ascending order; if null, the text is
     *                classified first via retainOnlyNames(text)
     * @return the redacted text
     */
    public static String retainOnlyNames(String text, List<Triple<String, Integer, Integer>> offsets) {
        if (offsets == null)
            return retainOnlyNames(text); // be forgiving

        final int len = text.length();
        int prev_name_end_pos = 0; // pos of first char after previous name
        StringBuilder result = new StringBuilder(len);
        for (Triple<String, Integer, Integer> t : offsets) {
            int begin_pos = t.second();
            int end_pos = t.third();
            if (begin_pos > len || end_pos > len) {
                // TODO: this is unclean. currently happens because we concat body & title together when we previously generated these offsets but now we only have body.
                begin_pos = end_pos = len;
            }
            String filler = text.substring(prev_name_end_pos, begin_pos);
            // \w would only match (redact) english language, so redact everything except punctuation and whitespace
            result.append(filler.replaceAll("[^\\p{Punct}\\s]", "."));
            result.append(text.substring(begin_pos, end_pos));
            prev_name_end_pos = end_pos;
        }

        // redact whatever follows the last name; done explicitly instead of appending a sentinel
        // triple, so that the caller's list is left untouched
        String tail = text.substring(prev_name_end_pos, len);
        result.append(tail.replaceAll("[^\\p{Punct}\\s]", "."));

        return result.toString();
    }

    /**
     * Returns the number of entity spans the classifier reports for the text.
     * Uses the classifier field directly; this method does not initialize it.
     */
    public static int countNames(String text) {
        return classifier.classifyToCharacterOffsets(text).size();
    }

    /** Runs countNames() eight times over the same all-lower-case sentence, ignoring the results. */
    static void trainForLowerCase() {
        final String lowerCaseSentence = "rishabh has so far not taken to drama, so it was wonderful to see him involved and actively participating";
        final int repetitions = 8;
        for (int run = 0; run < repetitions; run++) {
            countNames(lowerCaseSentence);
        }
    }

    /**
     * Smoke test: loads the CRF model from the classpath unconditionally and checks that exactly
     * one name is found in a sample sentence that starts with a capitalized name.
     */
    public static void main(String[] args) throws ClassCastException, IOException, ClassNotFoundException {
        final String modelResource = "ner-eng-ie.crf-3-all2008.ser.gz";
        InputStream modelStream = NER.class.getClassLoader().getResourceAsStream(modelResource);
        classifier = CRFClassifier.getClassifier(new GZIPInputStream(modelStream));
        // trainForLowerCase();
        int nameCount = countNames(
                "Rishabh has so far not taken to drama, so it was wonderful to see him involved and actively participating");
        Util.ASSERT(nameCount == 1); // fails if trained for lower case, passes if not
    }
}