org.languagetool.tokenizers.WordTokenizer.java Source code

Introduction

Here is the source code for org.languagetool.tokenizers.WordTokenizer.java
Source

/* LanguageTool, a natural language style checker
 * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
 * USA
 */
package org.languagetool.tokenizers;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.languagetool.tools.StringTools;

/**
 * Tokenizes a sentence into words. Punctuation and whitespace get their own tokens.
 * The tokenizer is a quite simple character-based one, though it knows
 * about urls and e-mail addresses and will put each of them in one token.
 * A url is recognized if it starts with a known protocol (like
 * {@code http://foobar.org}), with {@code www.}, or if it looks like a domain
 * that is directly followed by a slash (like {@code foobar.org/}).
 * 
 * @author Daniel Naber
 */
public class WordTokenizer implements Tokenizer {

    private static final List<String> PROTOCOLS = Collections
            .unmodifiableList(Arrays.asList("http", "https", "ftp"));
    // NOTE: "$-_" inside the character class is a range (U+0024 to U+005F), not three single
    // characters. Besides the characters spelled out here it therefore also accepts
    // & : ; < = > @ [ ] ^ and the backslash. It is kept as-is because joinUrls() relies on it:
    // the ':' of the protocol separator, for one, is only accepted through this range.
    private static final Pattern URL_CHARS = Pattern.compile("[a-zA-Z0-9/%$-_.+!*'(),\\?#]+");
    // a domain label: at least two characters, not starting with a hyphen
    private static final Pattern DOMAIN_CHARS = Pattern.compile("[a-zA-Z0-9][a-zA-Z0-9-]+");
    private static final Pattern NO_PROTOCOL_URL = Pattern
            .compile("([a-zA-Z0-9][a-zA-Z0-9-]+\\.)?([a-zA-Z0-9][a-zA-Z0-9-]+)\\.([a-zA-Z0-9][a-zA-Z0-9-]+)/.*");
    private static final Pattern E_MAIL = Pattern.compile(
            "(?<!:)\\b[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@((\\[[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\])|(([a-zA-Z\\-0-9]+\\.)+[a-zA-Z]{2,}))\\b");

    // NOTE(review): the '?' characters after the apostrophe and after the backtick in the
    // punctuation line below look like non-ASCII punctuation that was lost in an encoding
    // conversion -- confirm against the upstream file. They are harmless duplicates of '?'.
    private static final String TOKENIZING_CHARACTERS = "\u0020\u00A0\u115f" + "\u1160\u1680"
            + "\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
            + "\u2008\u2009\u200A\u200B\u200c\u200d\u200e\u200f"
            + "\u2028\u2029\u202a\u202b\u202c\u202d\u202e\u202f"
            + "\u205F\u2060\u2061\u2062\u2063\u206A\u206b\u206c\u206d"
            + "\u206E\u206F\u3000\u3164\ufeff\uffa0\ufff9\ufffa\ufffb"
            + ",.;()[]{}=*#+<>!?:/|\\\"'?`???_"
            + "\u2014" // em dash, written as an escape so it survives encoding conversions
            + "\t\n\r";

    /**
     * Get the protocols that the tokenizer knows about.
     * @return currently {@code http}, {@code https}, and {@code ftp}
     * @since 2.1
     */
    public static List<String> getProtocols() {
        return PROTOCOLS;
    }

    /**
     * Checks whether the given token looks like a url: it starts with {@code www.},
     * with one of the known protocols followed by {@code ://}, or it has the form
     * {@code [sub.]domain.tld/...}.
     * @param token the token to check, must not be {@code null}
     * @return {@code true} if the token is considered a url
     * @since 3.0
     */
    public static boolean isUrl(String token) {
        if (token.startsWith("www.")) {
            return true;
        }
        for (String protocol : PROTOCOLS) {
            if (token.startsWith(protocol + "://")) {
                return true;
            }
        }
        return NO_PROTOCOL_URL.matcher(token).matches();
    }

    /**
     * Checks whether the given token as a whole is an e-mail address.
     * @param token the token to check, must not be {@code null}
     * @return {@code true} if the complete token matches the e-mail pattern
     * @since 3.5
     */
    public static boolean isEMail(String token) {
        return E_MAIL.matcher(token).matches();
    }

    /**
     * Splits the text at the tokenizing characters (each of which becomes a token of
     * its own) and then joins the tokens that form e-mail addresses and urls.
     * @param text the text to tokenize, must not be {@code null}
     * @return the list of tokens
     */
    @Override
    public List<String> tokenize(String text) {
        List<String> tokens = new ArrayList<>();
        StringTokenizer st = new StringTokenizer(text, getTokenizingCharacters(), true);
        while (st.hasMoreTokens()) {
            tokens.add(st.nextToken());
        }
        return joinEMailsAndUrls(tokens);
    }

    /**
     * @return The string containing the characters used by the
     * tokenizer to tokenize words.
     * @since 2.5
     */
    public String getTokenizingCharacters() {
        return TOKENIZING_CHARACTERS;
    }

    /**
     * Joins e-mail addresses first and urls afterwards.
     * @param list the tokens as split at the tokenizing characters
     * @return the tokens with e-mail addresses and urls merged into single tokens
     */
    protected List<String> joinEMailsAndUrls(List<String> list) {
        return joinUrls(joinEMails(list));
    }

    /**
     * Merges the tokens that together form an e-mail address into one token.
     * An address is only merged if it starts at the beginning of a token and ends at
     * the end of a token; otherwise the affected tokens are left untouched, so that
     * the concatenation of the returned tokens always equals the concatenation of
     * the given tokens.
     * @param list the tokens to process
     * @return {@code list} itself if it contains no e-mail address, a new list otherwise
     * @since 3.5
     */
    protected List<String> joinEMails(List<String> list) {
        StringBuilder sb = new StringBuilder();
        for (String item : list) {
            sb.append(item);
        }
        String text = sb.toString();
        if (!text.contains("@")) { // explicit check for "@" speeds up method by factor of ~10
            return list;
        }
        Matcher matcher = E_MAIL.matcher(text);
        if (!matcher.find()) {
            return list;
        }
        List<String> result = new ArrayList<>();
        int idx = 0;      // index of the next token that has not been copied to the result yet
        int position = 0; // offset in text at which the token at idx starts
        do {
            int start = matcher.start();
            int end = matcher.end();
            // copy all tokens that end at or before the start of the match
            while (idx < list.size() && position + list.get(idx).length() <= start) {
                result.add(list.get(idx));
                position += list.get(idx).length();
                idx++;
            }
            if (position == start) {
                // the match starts on a token boundary, find out whether it also ends on one
                int afterLast = idx;
                int tokenEnd = position;
                while (afterLast < list.size() && tokenEnd < end) {
                    tokenEnd += list.get(afterLast).length();
                    afterLast++;
                }
                if (tokenEnd == end) {
                    result.add(matcher.group());
                    idx = afterLast;
                    position = tokenEnd;
                }
            }
            // a match that starts or ends inside a token is skipped; its tokens are
            // copied unchanged by the next iteration or by the addAll() below
        } while (matcher.find());
        result.addAll(list.subList(idx, list.size()));
        return result;
    }

    /**
     * Merges the tokens that together form a url into one token. A url begins where
     * {@link #urlStartsAt(int, List)} says so and lasts until
     * {@link #urlEndsAt(int, List, String)} reports its end; the token at which the url
     * ends is not part of the url and is kept as a token of its own. A url that is
     * still open at the end of the list is added as the last token.
     * See rfc1738 and http://stackoverflow.com/questions/1856785/characters-allowed-in-a-url
     * @param l the tokens to process
     * @return a new list with the url tokens merged
     */
    protected List<String> joinUrls(List<String> l) {
        List<String> newList = new ArrayList<>();
        boolean inUrl = false;
        StringBuilder url = new StringBuilder();
        String urlQuote = null; // the token directly in front of the url, e.g. an opening quote
        for (int i = 0; i < l.size(); i++) {
            String current = l.get(i);
            if (urlStartsAt(i, l)) {
                inUrl = true;
                if (i - 1 >= 0) {
                    urlQuote = l.get(i - 1);
                }
                url.append(current);
            } else if (inUrl && urlEndsAt(i, l, urlQuote)) {
                inUrl = false;
                urlQuote = null;
                newList.add(url.toString());
                url.setLength(0);
                newList.add(current);
            } else if (inUrl) {
                url.append(current);
            } else {
                newList.add(current);
            }
        }
        if (url.length() > 0) {
            newList.add(url.toString());
        }
        return newList;
    }

    /**
     * Checks whether a url starts at position {@code i}: a known protocol followed by
     * {@code ://}, the token {@code www} followed by a dot, or a domain with one or two
     * dots that is directly followed by a slash.
     */
    private boolean urlStartsAt(int i, List<String> l) {
        String token = l.get(i);
        int size = l.size();
        if (isProtocol(token) && size > i + 3
                && l.get(i + 1).equals(":") && l.get(i + 2).equals("/") && l.get(i + 3).equals("/")) {
            return true;
        }
        // e.g. www.mydomain.org
        if (size > i + 1 && token.equals("www") && l.get(i + 1).equals(".")) {
            return true;
        }
        // e.g. mydomain.org/ (require slash to avoid missing errors that can be interpreted as domains)
        if (size > i + 3
                && l.get(i + 1).equals(".") // use this order so the regex only gets matched if needed
                && l.get(i + 3).equals("/")
                && DOMAIN_CHARS.matcher(token).matches()
                && DOMAIN_CHARS.matcher(l.get(i + 2)).matches()) {
            return true;
        }
        // e.g. sub.mydomain.org/ (require slash to avoid missing errors that can be interpreted as domains)
        return size > i + 5
                && l.get(i + 1).equals(".") // use this order so the regex only gets matched if needed
                && l.get(i + 3).equals(".")
                && l.get(i + 5).equals("/")
                && DOMAIN_CHARS.matcher(token).matches()
                && DOMAIN_CHARS.matcher(l.get(i + 2)).matches()
                && DOMAIN_CHARS.matcher(l.get(i + 4)).matches();
    }

    private boolean isProtocol(String token) {
        return PROTOCOLS.contains(token);
    }

    /**
     * Checks whether the url that is currently being collected ends at position {@code i},
     * i.e. whether the token at {@code i} is no longer part of the url.
     * @param urlQuote the token in front of the url (may be {@code null}); if it shows up
     *  again in front of whitespace or closing punctuation it is taken as the closing quote
     */
    private boolean urlEndsAt(int i, List<String> l, String urlQuote) {
        String token = l.get(i);
        if (StringTools.isWhitespace(token) || token.equals(")") || token.equals("]")) { // this is guesswork
            return true;
        }
        boolean urlChars = URL_CHARS.matcher(token).matches();
        if (l.size() > i + 1) {
            String nextToken = l.get(i + 1);
            // NOTE(review): the empty strings and the "?" in this list look like typographic
            // quotation marks that were lost in an encoding conversion -- confirm against
            // the upstream file before changing them.
            boolean nextClosesUrl = StringTools.isWhitespace(nextToken)
                    || StringUtils.equalsAny(nextToken, "\"", "", "", "", "", "", "?", "'", ".");
            // sentence punctuation (or the quote that opened the url) directly in front of
            // whitespace or a closing character belongs to the sentence, not to the url
            boolean sentencePunctuation = StringUtils.equalsAny(token, ".", ",", ";", ":", "!", "?")
                    || token.equals(urlQuote);
            return (nextClosesUrl && sentencePunctuation) || !urlChars;
        }
        // last token of the list: a trailing dot is not part of the url
        return !urlChars || token.equals(".");
    }

}