Java tutorial: the GATE Hashtag Tokenizer
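This tutorial walks through the complete source of HashtagTokenizer, a processing resource from the GATE Twitter plugin that splits multi-word hashtags (for example #gateteam) into their component tokens. The PR uses a hidden gazetteer to propose likely words, a depth-first search to find the tokenization with the fewest tokens, and a camel-case fallback for mixed-case tags. A short, hedged usage sketch follows the listing.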
/*
 * HashtagTokenizer.java
 *
 * Copyright (c) 1995-2014, The University of Sheffield. See the file
 * COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 * This file is part of GATE (see http://gate.ac.uk/), and is free software,
 * licenced under the GNU Library General Public License, Version 2, June 1991
 * (in the distribution as file licence.html, and also available at
 * http://gate.ac.uk/gate/licence.html).
 *
 * Mark A. Greenwood, 24 Jan 2014
 */
package gate.twitter;

import static gate.Utils.getAnnotationsAtOffset;
import static gate.Utils.stringFor;
import static org.apache.commons.lang.StringUtils.isAllLowerCase;
import static org.apache.commons.lang.StringUtils.isAllUpperCase;
import static org.apache.commons.lang.StringUtils.isAlpha;
import static org.apache.commons.lang.StringUtils.isNumeric;

import gate.Annotation;
import gate.AnnotationSet;
import gate.Factory;
import gate.FeatureMap;
import gate.Gate;
import gate.LanguageAnalyser;
import gate.Resource;
import gate.annotation.AnnotationSetImpl;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ExecutionInterruptedException;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.util.InvalidOffsetException;

import java.net.URL;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

@CreoleResource(name = "Hashtag Tokenizer", icon = "HashtagTokenizer",
    comment = "Tokenizes Multi-Word Hashtags",
    helpURL = "http://gate.ac.uk/userguide/sec:social:twitter:hashtag")
public class HashtagTokenizer extends AbstractLanguageAnalyser {

  private static final long serialVersionUID = -7848183952807024913L;

  // a comparator that sorts annotations by length, longest first
  private static Comparator<Annotation> lengthComparator =
      new Comparator<Annotation>() {
        @Override
        public int compare(Annotation a1, Annotation a2) {
          long l1 =
              a1.getEndNode().getOffset() - a1.getStartNode().getOffset();
          long l2 =
              a2.getEndNode().getOffset() - a2.getStartNode().getOffset();
          return Long.compare(l2, l1);
        }
      };

  // the gazetteer that looks for likely words
  private LanguageAnalyser gaz;

  // the URL from which the gazetteer is loaded
  private URL gazURL;

  // the names of the input and output annotation sets
  private String inputASName, outputASName;

  // a debug flag
  private Boolean debug = Boolean.FALSE;

  public Boolean getDebug() {
    return debug;
  }

  @CreoleParameter(defaultValue = "false")
  @RunTime
  @Optional
  public void setDebug(Boolean debug) {
    this.debug = debug;
  }

  public URL getGazetteerURL() {
    return gazURL;
  }

  @CreoleParameter(defaultValue = "resources/hashtag/gazetteer/lists.def")
  public void setGazetteerURL(URL gazURL) {
    this.gazURL = gazURL;
  }

  public String getInputASName() {
    return inputASName;
  }

  @CreoleParameter
  @RunTime
  @Optional
  public void setInputASName(String inputASName) {
    this.inputASName = inputASName;
  }

  public String getOutputASName() {
    return outputASName;
  }

  @CreoleParameter
  @RunTime
  @Optional
  public void setOutputASName(String outputASName) {
    this.outputASName = outputASName;
  }

  @Override
  public Resource init() throws ResourceInstantiationException {
    // load and configure the hidden gazetteer
    FeatureMap hidden = Factory.newFeatureMap();
    Gate.setHiddenAttribute(hidden, true);

    FeatureMap params = Factory.newFeatureMap();
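    // the gazetteer is configured to be case-insensitive and to return all
    // matches (not just the longest, whole-word ones) so that the search
    // later sees every possible way of splitting a hashtag into words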
params.put("listsURL", gazURL); params.put("caseSensitive", Boolean.FALSE); params.put("longestMatchOnly", Boolean.FALSE); params.put("wholeWordsOnly", Boolean.FALSE); if (gaz == null) { gaz = (LanguageAnalyser) Factory.createResource("gate.creole.gazetteer.DefaultGazetteer", params, hidden, "Hashtag Tokenizer Gazetteer"); } else { gaz.setParameterValues(params); gaz.reInit(); } return this; } @Override public void execute() throws ExecutionException { AnnotationSet inputAS = document.getAnnotations(inputASName); AnnotationSet outputAS = document.getAnnotations(outputASName); FeatureMap features = Factory.newFeatureMap(); long startTime = System.currentTimeMillis(); fireStatusChanged("Tokenizing Hashtags: " + document.getName()); fireProgressChanged(0); int count = 0; // get all the lookups we are going to use for decomposition... AnnotationSet lookups = new AnnotationSetImpl(document); try { // run the gazetteer to produce the HashtagLookup annotations gaz.setParameterValue("annotationSetName", inputASName); gaz.setDocument(document); gaz.execute(); // get all the hashtags AnnotationSet hashtags = inputAS.get("Hashtag"); for (Annotation hashtag : inputAS.get("Hashtag")) { // for each hashtag in the document... AnnotationSet contained = inputAS.getContained(hashtag.getStartNode().getOffset(), hashtag.getEndNode().getOffset()); //clear away any left overs from previous tags lookups.clear(); // which are the HashtagLookup lookups.addAll(contained.get("HashtagLookup")); // any other Lookups the user has generated lookups.addAll(contained.get("Lookup")); // and any number tokens features = Factory.newFeatureMap(); features.put("kind", "number"); lookups.addAll(contained.get("Token", features)); // the _ appears to be allowed so add them as well features = Factory.newFeatureMap(); features.put("string", "_"); lookups.addAll(contained.get("Token", features)); if (isInterrupted()) { throw new ExecutionInterruptedException( "The execution of the hashtag tokenizer has been abruptly interrupted!"); } // this will hold the best we have seen so far List<List<Annotation>> fewestTokens = new ArrayList<List<Annotation>>(); // get all the lookups that start at the beginning of the hashtag List<Annotation> start = sort( getAnnotationsAtOffset(lookups, hashtag.getStartNode().getOffset() + 1)); for (Annotation a : start) { // for each lookup search for a valid tokenization List<List<Annotation>> found = search(lookups, hashtag.getEndNode().getOffset(), a); if (found != null) { // if we found a contender and it's the best so far store it if (fewestTokens.isEmpty()) { fewestTokens.addAll(found); } else if (found.get(0).size() == fewestTokens.get(0).size()) { fewestTokens.addAll(found); } else if (found.get(0).size() < fewestTokens.get(0).size()) { fewestTokens.clear(); fewestTokens.addAll(found); } } } if (debug && fewestTokens.size() > 1) { System.out.println(stringFor(document, hashtag)); display(fewestTokens); } if (fewestTokens.isEmpty()) { // if we didn't find any sensible tokenizations then let's see if the // hashtag is mized case String tagText = stringFor(document, hashtag).substring(1); if ("mixedCaps".equals(getTokenType(tagText)[1])) { // if we have a mixed case hahstag then let's assume it is // CamelCased and split it accordingly // TODO think about camel case which includes numbers // a list to hold the tokens List<Annotation> found = new ArrayList<Annotation>(); // start looking for token breaks aftert the initial # long begin = hashtag.getStartNode().getOffset() + 1; for (String token : 
tagText.split("((?<=[a-z])(?=[A-Z]))|((?<=[A-Z]{2,})(?=[a-z]))")) { // split the token at the case changes... // create the annotation in the Lookup set and add it to the found // list found.add(lookups.get(lookups.add(begin, (begin += token.length()), "CamelToken", Factory.newFeatureMap()))); } // record the tokenization so we can process it later fewestTokens.add(found); } } if (!fewestTokens.isEmpty()) { // if we found a valid tokenization then... // remove any existing Token annotations inputAS.removeAll(inputAS.get("Token").getContained(hashtag.getStartNode().getOffset(), hashtag.getEndNode().getOffset())); // create a punctuation Token over the # features = Factory.newFeatureMap(); features.put("string", "#"); features.put("length", "1"); features.put("kind", "punctuation"); outputAS.add(hashtag.getStartNode().getOffset(), hashtag.getStartNode().getOffset() + 1, "Token", features); // let's assume that the first one we found is best int prefered = 0; for (int i = 0; i < fewestTokens.size(); ++i) { // check those we have found and skip over any that contain single // letter or mixed case words boolean okay = true; for (Annotation a : fewestTokens.get(i)) { // single letter words are not great if (a.getEndNode().getOffset() - a.getStartNode().getOffset() == 1) okay = false; } if (okay) { // if it contains neither a single letter word or a mixed case // word then we should definitely prefer this one prefered = i; break; } } for (Annotation a : fewestTokens.get(prefered)) { // for each new token... // find where it starts/ends and its length long startOffset = a.getStartNode().getOffset(); long endOffset = a.getEndNode().getOffset(); String length = Long.toString(endOffset - startOffset); // get the actual text String string = stringFor(document, a); // work out what kind of token it is and if it is a word // what its orthography is String[] tokenType = getTokenType(string); String kind = tokenType[0]; String orth = tokenType[1]; // create the new Token annotation features = Factory.newFeatureMap(); features.put("string", string); features.put("length", length); features.put("kind", kind); if (orth != null) features.put("orth", orth); outputAS.add(startOffset, endOffset, "Token", features); if (debug) { // for debug purposes add a matching set of HashtagToken // annotations features = Factory.newFeatureMap(); features.put("string", string); features.put("length", length); features.put("kind", kind); if (orth != null) features.put("orth", orth); outputAS.add(startOffset, endOffset, "HashtagToken", features); } } } else if (debug) { System.err.println(stringFor(document, hashtag)); AnnotationSet tokens = inputAS.get("Token").getContained(hashtag.getStartNode().getOffset() + 1, hashtag.getEndNode().getOffset()); for (Annotation token : tokens) { features = Factory.newFeatureMap(); features.putAll(token.getFeatures()); outputAS.add(token.getStartNode().getOffset(), token.getEndNode().getOffset(), "HashtagToken", features); } } fireProgressChanged(count++ * 100 / hashtags.size()); } fireProcessFinished(); fireStatusChanged("Hashtags in " + document.getName() + " tokenized in " + NumberFormat.getInstance().format((double) (System.currentTimeMillis() - startTime) / 1000) + " seconds!"); } catch (InvalidOffsetException e) { throw new ExecutionException(e); } catch (ResourceInstantiationException e) { throw new ExecutionException(e); } finally { gaz.setDocument(null); if (!debug) { inputAS.removeAll(inputAS.get("HashtagLookup")); } } } @Override public void cleanup() { 
    Factory.deleteResource(gaz);
  }

  private static String[] getTokenType(String string) {
    String kind = "symbol";
    String orth = null;

    if (isAlpha(string)) {
      kind = "word";

      if (isAllLowerCase(string)) orth = "lowercase";
      else if (isAllUpperCase(string)) orth = "allCaps";
      else if (isAllUpperCase(string.substring(0, 1))
          && isAllLowerCase(string.substring(1))) orth = "upperInitial";
      else orth = "mixedCaps";
    } else if (isNumeric(string)) {
      kind = "number";
    } else if (string.matches("(\\p{Punct})+")) {
      kind = "punctuation";
    }

    return new String[] {kind, orth};
  }

  private void display(List<List<Annotation>> found) {
    for (List<Annotation> tokens : found) {
      System.out.print(" ");
      for (Annotation token : tokens) {
        System.out.print(stringFor(document, token) + " ");
      }
      System.out.print("\n");
    }
  }

  /**
   * Depth first search through a set of annotations to find a contiguous
   * set covering a given character range.
   **/
  private List<List<Annotation>> search(AnnotationSet lookups,
      Long endOffset, Annotation token) throws InvalidOffsetException {

    if ("mixedCaps".equals(
        getTokenType(stringFor(lookups.getDocument(), token))[1]))
      return null;

    List<List<Annotation>> shortest = new ArrayList<List<Annotation>>();

    if (token.getEndNode().getOffset().equals(endOffset)) {
      // if the token we are starting from ends at the right place then we
      // can stop and just return the single token as the result
      List<Annotation> found = new ArrayList<Annotation>();
      found.add(token);
      shortest.add(found);
      return shortest;
    }

    if (endOffset - token.getEndNode().getOffset() > 1) {
      // if there are two or more characters after this token then...

      // get the rest of the text of the hashtag, including the last
      // character of this token
      String rest = lookups.getDocument().getContent()
          .getContent(token.getEndNode().getOffset() - 1, endOffset)
          .toString();

      if (rest.substring(1).matches(rest.substring(0, 1) + "+")) {
        // if the rest of the hashtag is just repeats of the last letter of
        // this token then someone has simply leant on the keyboard for
        // emphasis, so... extend the current token to include the rest of
        // the hashtag
        Annotation newToken = lookups.get(lookups.add(
            token.getStartNode().getOffset(), endOffset, "HashtagLookup",
            Factory.newFeatureMap()));

        // return this extended token
        List<Annotation> found = new ArrayList<Annotation>();
        found.add(newToken);
        shortest.add(found);
        return shortest;
      }
    }

    // get all the annotations that start where the current one ends and
    // could be used to cover the next bit of the document
    List<Annotation> next =
        sort(getAnnotationsAtOffset(lookups, token.getEndNode().getOffset()));

    // if there aren't any annotations we can use then return null as we
    // can never fully cover the range from this point
    if (next == null || next.isEmpty()) return null;

    for (Annotation a : next) {
      // use each possible annotation to start a new search
      List<List<Annotation>> part = search(lookups, endOffset, a);

      if (part != null) {
        if (shortest.isEmpty()) {
          shortest.addAll(part);
        } else if (part.get(0).size() == shortest.get(0).size()) {
          shortest.addAll(part);
        } else if (part.get(0).size() < shortest.get(0).size()) {
          shortest.clear();
          shortest.addAll(part);
        }
      }
    }

    // if we didn't find a match then return null
    if (shortest.isEmpty()) return null;

    // add the token we started from to the beginning of each match
    for (List<Annotation> found : shortest) {
      found.add(0, token);
    }

    // return the matches we found
    return shortest;
  }

  /**
   * Returns a list in which the Annotations are sorted by length, longest
   * first, keeping only one annotation for each distinct span.
   **/
  private List<Annotation> sort(AnnotationSet annotations) {
    List<Annotation> sorted = new ArrayList<Annotation>();

    if (annotations == null || annotations.isEmpty()) return sorted;

    sorted.addAll(annotations);
    Collections.sort(sorted, lengthComparator);

    // filter out annotations with the same span: the annotations all start
    // at the same offset, so after sorting those of equal length cover an
    // identical span and we only need to keep the first of each
    List<Annotation> filtered = new ArrayList<Annotation>();
    long length = -1;
    for (Annotation a : sorted) {
      long al = a.getEndNode().getOffset() - a.getStartNode().getOffset();
      if (length == -1 || al != length) {
        filtered.add(a);
        length = al;
      }
    }

    return filtered;
  }
}
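To try the tokenizer outside GATE Developer you can embed it via the GATE API. The sketch below is not part of the plugin: the class name HashtagTokenizerDemo is made up for illustration, the plugin directory path is an assumption about your install, and the PR only does useful work on documents that already carry Hashtag and Token annotations in its input set (normally produced earlier in a Twitter-processing pipeline).

import gate.Document;
import gate.Factory;
import gate.Gate;
import gate.LanguageAnalyser;

import java.io.File;

public class HashtagTokenizerDemo {
  public static void main(String[] args) throws Exception {
    // initialise GATE; assumes gate.home points at a local install
    Gate.init();

    // load the Twitter plugin that provides the Hashtag Tokenizer
    // (the directory name is an assumption -- adjust to your install)
    Gate.getCreoleRegister().registerDirectories(
        new File(Gate.getPluginsHome(), "Twitter").toURI().toURL());

    // a document containing a multi-word hashtag; in a real pipeline the
    // Hashtag and Token annotations the PR consumes would already have
    // been created by earlier components
    Document doc = Factory.newDocument("Nice work by the #gateteam today!");

    // create the PR with its default gazetteer and run it
    LanguageAnalyser tagTokenizer = (LanguageAnalyser) Factory
        .createResource("gate.twitter.HashtagTokenizer");
    try {
      tagTokenizer.setDocument(doc);
      tagTokenizer.execute();

      // print whatever Token annotations now sit in the default set
      System.out.println(doc.getAnnotations().get("Token"));
    } finally {
      tagTokenizer.setDocument(null);
      Factory.deleteResource(tagTokenizer);
      Factory.deleteResource(doc);
    }
  }
}

Note that, just as execute() and cleanup() do above, the demo hands the PR a null document and deletes both resources when finished, so nothing is left registered with GATE.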