com.screenslicer.core.nlp.Person.java Source code

Introduction

Here is the source code for com.screenslicer.core.nlp.Person.java
Source

/* 
 * ScreenSlicer (TM) -- automatic, zero-config web scraping (TM)
 * Copyright (C) 2013-2014 Machine Publishers, LLC
 * ops@machinepublishers.com | screenslicer.com | machinepublishers.com
 * 717 Martin Luther King Dr W Ste I, Cincinnati, Ohio 45220
 *
 * You can redistribute this program and/or modify it under the terms of the
 * GNU Affero General Public License version 3 as published by the Free
 * Software Foundation. Additional permissions or commercial licensing may be
 * available--see LICENSE file or contact Machine Publishers, LLC for details.
 * 
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License version 3
 * for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * version 3 along with this program. If not, see <http://www.gnu.org/licenses/>.
 * 
 * For general details about how to investigate and report license violations,
 * please see: https://www.gnu.org/licenses/gpl-violation.html
 * and email the author: ops@machinepublishers.com
 * Keep in mind that paying customers have more rights than the AGPL alone offers.
 */
package com.screenslicer.core.nlp;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.util.Span;

import org.apache.commons.io.IOUtils;

import com.screenslicer.api.datatype.Contact;
import com.screenslicer.common.CommonUtil;
import com.screenslicer.common.Log;
import com.screenslicer.core.nlp.resource.NlpResource;

/**
 * Extracts contact details (a person's name, an e-mail address and a phone
 * number) from free text.
 *
 * <p>Names are found two ways: with an OpenNLP name-finder model, and by
 * looking tokens up in bundled word lists (first names, popular first names,
 * surnames and English words). All of these resources are loaded once when the
 * class is initialized. If a resource cannot be loaded the failure is logged
 * and the class degrades instead of failing later: a missing word list is
 * treated as empty and a missing model disables the NLP-based lookup.
 */
public class Person {
    /**
     * Matches a line containing an e-mail address. Group 1 is the address and
     * group 0 is the whole line of text that contains it.
     */
    public static final Pattern email = Pattern.compile(
            "(?<=\r\n|[\r\n]|^).*?([^&\"':><\\s@]+@[^&\"':><\\s@.]+\\.[^&\"':><\\s@]*[^&\"':><\\s@.]+).*(?=\r\n|[\r\n]|$)",
            Pattern.UNICODE_CHARACTER_CLASS);
    /**
     * Matches a line containing a phone number. Group 1 is the number and
     * group 0 is the whole line of text that contains it.
     */
    public static final Pattern phone = Pattern.compile(
            "(?<=\r\n|[\r\n]|^).*?\\s*((?=.?(?:([0-9].*?){7,15}))[-+().0-9\\s]{8,20}[0-9]).*(?=\r\n|[\r\n]|$)",
            Pattern.UNICODE_CHARACTER_CLASS);
    /** Any of these characters disqualifies a string from being a name. */
    private static final Pattern invalidNameChars = Pattern.compile(
            "!|@|#|\\$|%|\\^|&|\\*|\\(|\\)|_|\\+|=|\\{|\\}|\\[|\\]|\\||:|;|\"|'|<|>|\\?|/|\\\\|~|`",
            Pattern.UNICODE_CHARACTER_CLASS);
    /** Delimiter used by the retry pass of {@link #extractName}; compiled once. */
    private static final Pattern looseTokenDelimiter = Pattern.compile("[\\W\\s]|$|^");

    private static final Collection<String> firstNames;
    private static final Collection<String> lastNames;
    private static final Collection<String> firstNamesPopular;
    private static final Collection<String> englishWords;
    /** OpenNLP person-name model, or {@code null} when it could not be loaded. */
    private static final TokenNameFinderModel nameModel;

    static {
        // Same load order as before, so log output on failure keeps its order.
        nameModel = loadNameModel();
        firstNames = loadLines("us-firstnames");
        firstNamesPopular = loadLines("us-firstnames-popular");
        lastNames = loadLines("us-surnames");
        englishWords = loadLines("en-words");
    }

    /**
     * Candidate names gathered during one scan of the text. Every scan gets its
     * own instance so that full names and first-name-only fallbacks are never
     * mixed into the same set.
     */
    private static final class Candidates {
        final Collection<String> nlpNames = new HashSet<String>();
        final Collection<String> nlpFallbacks = new HashSet<String>();
        final Collection<String> dictionaryNames = new HashSet<String>();
        final Collection<String> dictionaryFallbacks = new HashSet<String>();

        /** Full names from the NLP model, or its first-name fallbacks if there are none. */
        Collection<String> bestNlp() {
            return nlpNames.isEmpty() ? nlpFallbacks : nlpNames;
        }

        /** Full names from the word lists, or their first-name fallbacks if there are none. */
        Collection<String> bestDictionary() {
            return dictionaryNames.isEmpty() ? dictionaryFallbacks : dictionaryNames;
        }

        /** True when either source settled on exactly one candidate. */
        boolean isConclusive() {
            return bestNlp().size() == 1 || bestDictionary().size() == 1;
        }

        /** Returns the single unambiguous candidate, preferring NLP results, or null. */
        String unique() {
            if (bestNlp().size() == 1) {
                return bestNlp().iterator().next();
            }
            if (nlpFallbacks.size() == 1) {
                return nlpFallbacks.iterator().next();
            }
            if (bestDictionary().size() == 1) {
                return bestDictionary().iterator().next();
            }
            if (dictionaryFallbacks.size() == 1) {
                return dictionaryFallbacks.iterator().next();
            }
            return null;
        }
    }

    /** Loads the OpenNLP person model; logs and returns null on any failure. */
    private static TokenNameFinderModel loadNameModel() {
        InputStream modelIn = null;
        try {
            modelIn = NlpResource.class.getResourceAsStream("apache-open-nlp/en-ner-person.bin");
            return new TokenNameFinderModel(modelIn);
        } catch (Throwable t) {
            Log.exception(t);
            return null;
        } finally {
            IOUtils.closeQuietly(modelIn);
        }
    }

    /**
     * Reads a line-per-entry resource into a set and closes the stream.
     * Logs and returns an empty set on any failure so lookups never hit null.
     */
    private static Collection<String> loadLines(String resource) {
        InputStream in = null;
        try {
            in = NlpResource.class.getResourceAsStream(resource);
            return new HashSet<String>(IOUtils.readLines(in));
        } catch (Throwable t) {
            Log.exception(t);
            return new HashSet<String>();
        } finally {
            IOUtils.closeQuietly(in);
        }
    }

    /**
     * Extracts an e-mail address, a phone number and a name from the text.
     *
     * <p>The name is first looked for (non-strictly) on the lines holding the
     * e-mail address and phone number; it is used only if those lines agree on
     * exactly one name. Otherwise the whole text is searched in strict mode.
     *
     * @param src text to search
     * @return a contact whose fields are null where nothing was found; an
     *         empty contact when {@code src} is empty
     */
    public static Contact extractContact(String src) {
        if (CommonUtil.isEmpty(src)) {
            return new Contact();
        }
        List<String> lines = new ArrayList<String>();
        String emailMatch = null;
        Matcher matcher = email.matcher(src);
        if (matcher.find()) {
            emailMatch = matcher.group(1);
            lines.add(matcher.group(0));
        }
        String phoneMatch = null;
        matcher = phone.matcher(src);
        if (matcher.find()) {
            phoneMatch = matcher.group(1);
            // E-mail and phone often share a line; do not analyze it twice.
            if (!lines.contains(matcher.group(0))) {
                lines.add(matcher.group(0));
            }
        }
        Collection<String> names = new HashSet<String>();
        for (String line : lines) {
            String name = extractName(line, false, false);
            if (!CommonUtil.isEmpty(name)) {
                names.add(name);
            }
        }
        String name = names.size() == 1 ? names.iterator().next() : null;
        if (name == null) {
            name = extractName(src, true, false);
        }
        Contact person = new Contact();
        person.name = name;
        person.email = emailMatch;
        person.phone = phoneMatch;
        return person;
    }

    /**
     * Finds the single person name mentioned in the text.
     *
     * <p>The text is split into sentences and scanned with the NLP tokenizer.
     * If neither the NLP model nor the word lists settle on exactly one
     * candidate, the text is scanned again using a plain split on non-word
     * characters. NLP results are preferred over word-list results, and full
     * names over first names alone.
     *
     * @param src text to search
     * @param strict if true, first names must be in the popular-first-names
     *        list and a lone first name is never accepted from the word lists
     * @param dictionaryOnly if true, the NLP model is not consulted
     * @return the name, or null if none was found or the result is ambiguous
     */
    public static String extractName(String src, boolean strict, boolean dictionaryOnly) {
        if (CommonUtil.isEmpty(src)) {
            return null;
        }
        // No finder means "word lists only": requested by the caller, or the model failed to load.
        NameFinderME nameFinder = dictionaryOnly || nameModel == null ? null : new NameFinderME(nameModel);
        String[] sentences = NlpUtil.sentences(src);
        Candidates found = new Candidates();
        for (String sentence : sentences) {
            collect(NlpUtil.tokensFromSentence(sentence), strict, nameFinder, found);
        }
        if (!found.isConclusive()) {
            // Retry with a cruder tokenizer, starting from fresh candidate sets.
            found = new Candidates();
            if (nameFinder != null) {
                nameFinder.clearAdaptiveData();
            }
            for (String sentence : sentences) {
                collect(looseTokenDelimiter.split(sentence), strict, nameFinder, found);
            }
        }
        return found.unique();
    }

    /** Adds the name candidates found in one sentence's tokens to {@code found}. */
    private static void collect(String[] tokens, boolean strict, NameFinderME nameFinder, Candidates found) {
        for (int i = 0; i < tokens.length; i++) {
            String first = tokens[i];
            String last = i + 1 < tokens.length ? tokens[i + 1] : null;
            if (isFirstName(first, strict) && isLastName(last) && isFullName(first + " " + last, strict)) {
                found.dictionaryNames.add(first + " " + last);
            } else if (!strict && isFirstName(first, strict)) {
                found.dictionaryFallbacks.add(first);
            }
        }
        if (nameFinder == null) {
            return;
        }
        Span[] spans = nameFinder.find(tokens);
        for (String curName : Span.spansToStrings(spans, tokens)) {
            if (curName.contains(" ") && isFullName(curName, strict)) {
                found.nlpNames.add(curName);
            } else if (isFirstName(curName, strict)) {
                found.nlpFallbacks.add(curName);
            }
        }
    }

    /** True if the string is non-empty and has none of the characters that cannot occur in a name. */
    private static boolean isValidNameChars(String str) {
        return !CommonUtil.isEmpty(str) && !invalidNameChars.matcher(str).find();
    }

    /** Looks the string up in the first-name list selected by {@code strict}. */
    private static boolean isKnownFirstName(String str, boolean strict) {
        return strict ? firstNamesPopular.contains(str) : firstNames.contains(str);
    }

    /**
     * True if the string looks like a multi-word personal name: it starts with
     * a known first name, at least two words are capitalized, and at least one
     * word is either not an ordinary English word or is a recognized name.
     */
    private static boolean isFullName(String str, boolean strict) {
        if (!isValidNameChars(str) || !str.contains(" ")) {
            return false;
        }
        String[] parts = str.split(" ");
        // A string made only of spaces splits into nothing at all.
        if (parts.length == 0 || !firstNames.contains(parts[0])) {
            return false;
        }
        int upper = 0;
        int nonDictionary = 0;
        for (int i = 0; i < parts.length; i++) {
            String part = parts[i];
            if (part.isEmpty()) {
                // Consecutive spaces yield empty parts; they carry no information.
                continue;
            }
            if (Character.isUpperCase(part.charAt(0))) {
                ++upper;
            }
            if (!englishWords.contains(part.toLowerCase(Locale.ROOT))
                    || (i == 0 && isKnownFirstName(part, strict))
                    || (i > 0 && isLastName(part))) {
                ++nonDictionary;
            }
        }
        return upper > 1 && nonDictionary > 0;
    }

    /** True if the string has valid characters, does not start lower-case, and is a known first name. */
    private static boolean isFirstName(String str, boolean strict) {
        if (!isValidNameChars(str) || Character.isLowerCase(str.charAt(0))) {
            return false;
        }
        return isKnownFirstName(str, strict);
    }

    /** True if the string has valid characters, does not start lower-case, and is a known surname. */
    private static boolean isLastName(String str) {
        if (!isValidNameChars(str) || Character.isLowerCase(str.charAt(0))) {
            return false;
        }
        // The surname list is looked up in upper case; use a fixed locale so the
        // lookup does not depend on the JVM's default locale.
        return lastNames.contains(str.toUpperCase(Locale.ROOT));
    }
}