gate.creole.splitter.RegexSentenceSplitter.java Source code

Introduction

Here is the source code for gate.creole.splitter.RegexSentenceSplitter.java
Source

/*
 *  Copyright (c) 1995-2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Valentin Tablan, 04 Sep 2007
 *
 *  $Id$
 */
package gate.creole.splitter;

import java.io.*;
import java.net.URL;
import java.util.*;
import java.util.regex.*;

import org.apache.commons.io.IOUtils;

import gate.*;
import gate.creole.*;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.util.*;

/**
 * A fast sentence splitter replacement based on regular expressions.
 * <p>
 * Three pattern lists are loaded at {@link #init()} time, each from a text
 * file with one regular expression per line:
 * <ul>
 * <li><b>internal splits</b> - the matched text belongs to the sentence it
 * terminates (the generated sentence extends to the end of the match);</li>
 * <li><b>external splits</b> - the matched text lies outside the sentence
 * (the generated sentence ends before the match starts);</li>
 * <li><b>non splits</b> - regions in which no split is allowed; any candidate
 * split overlapping such a region is discarded.</li>
 * </ul>
 * For every accepted split a <code>Split</code> annotation (with a
 * <code>kind</code> feature of <code>internal</code> or
 * <code>external</code>) is created and, if there is text between the
 * previous sentence end and the split, a sentence annotation as well.
 */
@CreoleResource(name = "RegEx Sentence Splitter", icon = "sentence-splitter", comment = "A sentence splitter based on regular expressions.", helpURL = "http://gate.ac.uk/userguide/sec:annie:regex-splitter")
public class RegexSentenceSplitter extends AbstractLanguageAnalyser {

    /**
     * Name of the parameter holding the document to process.
     */
    public static final String SPLIT_DOCUMENT_PARAMETER_NAME = "document";

    /**
     * Name of an input annotation set parameter.
     * NOTE(review): this class declares no setter for such a parameter;
     * the constant is kept for backward compatibility.
     */
    public static final String SPLIT_INPUT_AS_PARAMETER_NAME = "inputASName";

    /**
     * Name of the output annotation set parameter
     * (see {@link #setOutputASName(String)}).
     */
    public static final String SPLIT_OUTPUT_AS_PARAMETER_NAME = "outputASName";

    /**
     * Name of the encoding parameter (see {@link #setEncoding(String)}).
     */
    public static final String SPLIT_ENCODING_PARAMETER_NAME = "encoding";

    /**
     * Name of a split list parameter.
     * NOTE(review): the parameters declared by this class are
     * <code>internalSplitListURL</code> and <code>externalSplitListURL</code>;
     * no parameter called <code>splitListURL</code> is defined here. The value
     * is kept unchanged for backward compatibility.
     */
    public static final String SPLIT_SPLIT_LIST_PARAMETER_NAME = "splitListURL";

    /**
     * Name of the non split list parameter
     * (see {@link #setNonSplitListURL(URL)}).
     */
    public static final String SPLIT_NON_SPLIT_LIST_PARAMETER_NAME = "nonSplitListURL";

    /**
     * serialisation ID
     */
    private static final long serialVersionUID = 1L;

    /**
     * Output annotation set name. A <code>null</code> or blank value selects
     * the document's default annotation set.
     */
    protected String outputASName;

    /**
     * Encoding used when reading config files
     */
    protected String encoding;

    /**
     * URL pointing to a file with regex patterns for internal sentence splits.
     */
    protected URL internalSplitListURL;

    /**
     * URL pointing to a file with regex patterns for external sentence splits.
     */
    protected URL externalSplitListURL;

    /**
     * URL pointing to a file with regex patterns for non sentence splits.
     */
    protected URL nonSplitListURL;

    /**
     * Compiled alternation of all internal split patterns; set by
     * {@link #init()}.
     */
    protected Pattern internalSplitsPattern;

    /**
     * Compiled alternation of all external split patterns; set by
     * {@link #init()}.
     */
    protected Pattern externalSplitsPattern;

    /**
     * Compiled alternation of all non split patterns; set by {@link #init()}.
     */
    protected Pattern nonSplitsPattern;

    /**
     * Reads a list of regular expressions (one per line) and compiles them
     * into a single pattern that matches any of them.
     * <p>
     * Lines are trimmed; empty lines and lines starting with <code>//</code>
     * are ignored. Every remaining line is wrapped in a non-capturing group
     * and the groups are joined with <code>|</code>.
     * <p>
     * If the list contains no patterns at all, a pattern that never matches
     * is returned. (Compiling the empty string instead would yield a pattern
     * that matches, with zero length, at every offset of the input.)
     *
     * @param paternsListUrl location of the patterns file
     * @param encoding character encoding used to read the file
     * @return the compiled pattern; never <code>null</code>
     * @throws UnsupportedEncodingException if the encoding is not supported
     * @throws IOException if the file cannot be opened or read
     */
    protected Pattern compilePattern(URL paternsListUrl, String encoding)
            throws UnsupportedEncodingException, IOException {
        StringBuilder patternString = new StringBuilder();
        // keep a handle on the raw stream so that it can still be closed if
        // constructing the reader fails (e.g. unsupported encoding)
        InputStream stream = paternsListUrl.openStream();
        BufferedReader reader = null;

        try {
            reader = new BomStrippingInputStreamReader(stream, encoding);

            String line = reader.readLine();
            while (line != null) {
                line = line.trim();

                // ignore empty lines and comments
                if (line.length() > 0 && !line.startsWith("//")) {
                    if (patternString.length() > 0) {
                        patternString.append('|');
                    }
                    patternString.append("(?:").append(line).append(')');
                }
                // move to next line
                line = reader.readLine();
            }
        } finally {
            IOUtils.closeQuietly(reader);
            // harmless if the reader already closed it
            IOUtils.closeQuietly(stream);
        }

        if (patternString.length() == 0) {
            // no patterns supplied: an empty negative lookahead never matches
            return Pattern.compile("(?!)");
        }
        return Pattern.compile(patternString.toString());
    }

    /**
     * A comparator for MatchResult objects. This is used to find the next match
     * result in a text: results are ordered by their start offset. A null value
     * is used to signify that no more matches are available, hence nulls are the
     * largest value, according to this comparator.
     * @author Valentin Tablan (valyt)
     */
    private static class MatchResultComparator implements Comparator<MatchResult> {

        /* (non-Javadoc)
         * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
         */
        @Override
        public int compare(MatchResult o1, MatchResult o2) {
            if (o1 == null && o2 == null) {
                return 0;
            }
            if (o1 == null) {
                return 1;
            }
            if (o2 == null) {
                return -1;
            }
            //at this point both match results are not null
            int start1 = o1.start();
            int start2 = o2.start();
            return start1 < start2 ? -1 : (start1 == start2 ? 0 : 1);
        }
    }

    /**
     * Runs the splitter over the current document.
     * <p>
     * The internal and external split matchers are advanced in parallel and
     * their matches are processed in ascending start offset order. Each match
     * that is not vetoed by a non split region produces a <code>Split</code>
     * annotation and possibly a sentence annotation. Sentences are only
     * created when a split is accepted, so text following the last accepted
     * split receives no sentence annotation from this method.
     *
     * @throws ExecutionException if no document is set, if an annotation
     * cannot be created, or if the matching process reaches an invalid state
     */
    @Override
    public void execute() throws ExecutionException {
        // NOTE(review): the flag is reset here but it is not polled anywhere
        // in this method, so a running split cannot be interrupted.
        interrupted = false;
        if (document == null) {
            throw new ExecutionException("No document to process!");
        }
        int lastProgress = 0;
        fireProgressChanged(lastProgress);
        //get pointers to the annotation sets
        AnnotationSet outputAS = (outputASName == null || outputASName.trim().length() == 0)
                ? document.getAnnotations()
                : document.getAnnotations(outputASName);

        String docText = document.getContent().toString();

        /* If the document's content is empty or contains only whitespace,
         * there is nothing to sentence-split; still tell the listeners that
         * processing is over, as the progress notification was already sent. */
        if (docText.trim().length() < 1) {
            fireProcessFinished();
            return;
        }

        Matcher internalSplitMatcher = internalSplitsPattern.matcher(docText);
        Matcher externalSplitMatcher = externalSplitsPattern.matcher(docText);

        Matcher nonSplitMatcher = nonSplitsPattern.matcher(docText);
        //store all non split locations in a list of [start, end) pairs; a
        //linked list is used because veto() removes elements from its head
        List<int[]> nonSplits = new LinkedList<int[]>();
        while (nonSplitMatcher.find()) {
            nonSplits.add(new int[] { nonSplitMatcher.start(), nonSplitMatcher.end() });
        }
        //this list holds the pending match of each matcher (at most two)
        List<MatchResult> nextSplitMatches = new ArrayList<MatchResult>(2);
        //initialise matching process
        MatchResult internalMatchResult = null;
        if (internalSplitMatcher.find()) {
            internalMatchResult = internalSplitMatcher.toMatchResult();
            nextSplitMatches.add(internalMatchResult);
        }
        MatchResult externalMatchResult = null;
        if (externalSplitMatcher.find()) {
            externalMatchResult = externalSplitMatcher.toMatchResult();
            nextSplitMatches.add(externalMatchResult);
        }
        MatchResultComparator comparator = new MatchResultComparator();
        int lastSentenceEnd = 0;

        while (!nextSplitMatches.isEmpty()) {
            //see which one matches first
            Collections.sort(nextSplitMatches, comparator);
            MatchResult nextMatch = nextSplitMatches.remove(0);
            //identity comparison is intended: we need to know which matcher
            //produced this particular result object
            if (nextMatch == internalMatchResult) {
                //we have a new internal split; see if it's vetoed or not
                if (!veto(nextMatch, nonSplits)) {
                    try {
                        lastSentenceEnd = applyInternalSplit(outputAS, docText, nextMatch, lastSentenceEnd);
                    } catch (InvalidOffsetException e) {
                        // this should never happen
                        throw new ExecutionException(e);
                    }
                }
                //prepare for next step
                if (internalSplitMatcher.find()) {
                    internalMatchResult = internalSplitMatcher.toMatchResult();
                    nextSplitMatches.add(internalMatchResult);
                } else {
                    internalMatchResult = null;
                }
            } else if (nextMatch == externalMatchResult) {
                //we have a new external split; see if it's vetoed or not
                if (!veto(nextMatch, nonSplits)) {
                    try {
                        lastSentenceEnd = applyExternalSplit(outputAS, docText, nextMatch, lastSentenceEnd);
                    } catch (InvalidOffsetException e) {
                        // this should never happen
                        throw new ExecutionException(e);
                    }
                }
                //prepare for next step
                if (externalSplitMatcher.find()) {
                    externalMatchResult = externalSplitMatcher.toMatchResult();
                    nextSplitMatches.add(externalMatchResult);
                } else {
                    externalMatchResult = null;
                }
            } else {
                //malfunction
                throw new ExecutionException("Invalid state - cannot identify match!");
            }
            //report progress; long arithmetic avoids int overflow of
            //100 * offset on very large documents
            int newProgress = (int) (100L * lastSentenceEnd / docText.length());
            if (newProgress - lastProgress > 20) {
                lastProgress = newProgress;
                fireProgressChanged(lastProgress);
            }
        } //while(!nextMatches.isEmpty()){
        fireProcessFinished();
    }

    /**
     * Creates the annotations for an accepted internal split. The sentence, if
     * any, runs from the first non whitespace character after the previous
     * sentence end up to the end of the split itself.
     *
     * @param outputAS the annotation set to add to
     * @param docText the document text
     * @param split the accepted split
     * @param lastSentenceEnd offset where the previous sentence ended
     * @return the new sentence end offset (the end of the split)
     * @throws InvalidOffsetException if an annotation cannot be created
     */
    private int applyInternalSplit(AnnotationSet outputAS, String docText, MatchResult split,
            int lastSentenceEnd) throws InvalidOffsetException {
        addSplitAnnotation(outputAS, split, "internal");
        int endOffset = split.end();
        //find the first non whitespace character starting from where the
        //last sentence ended
        int sentenceStart = lastSentenceEnd;
        while (sentenceStart < endOffset
                && Character.isWhitespace(Character.codePointAt(docText, sentenceStart))) {
            sentenceStart++;
        }
        //only generate a sentence if there is useful text before the split
        if (sentenceStart < split.start()) {
            addSentenceAnnotation(outputAS, sentenceStart, endOffset);
        }
        return endOffset;
    }

    /**
     * Creates the annotations for an accepted external split. The sentence, if
     * any, runs between the previous sentence end and the start of the split,
     * with space characters trimmed from both ends.
     * <p>
     * NOTE(review): this branch trims with {@link Character#isSpaceChar(int)}
     * whereas the internal branch uses {@link Character#isWhitespace(int)};
     * the two predicates differ (e.g. for tabs and line terminators). The
     * original behaviour is preserved here - confirm whether it is intended.
     *
     * @param outputAS the annotation set to add to
     * @param docText the document text
     * @param split the accepted split
     * @param lastSentenceEnd offset where the previous sentence ended
     * @return the new sentence end offset (the end of the split)
     * @throws InvalidOffsetException if an annotation cannot be created
     */
    private int applyExternalSplit(AnnotationSet outputAS, String docText, MatchResult split,
            int lastSentenceEnd) throws InvalidOffsetException {
        addSplitAnnotation(outputAS, split, "external");
        //find the last non space character, going backward from where the
        //external split starts
        int endOffset = split.start();
        while (endOffset > lastSentenceEnd
                && Character.isSpaceChar(Character.codePointAt(docText, endOffset - 1))) {
            endOffset--;
        }
        //find the first non space character starting from where the last
        //sentence ended
        int sentenceStart = lastSentenceEnd;
        while (sentenceStart < endOffset
                && Character.isSpaceChar(Character.codePointAt(docText, sentenceStart))) {
            sentenceStart++;
        }
        //only generate a sentence if there is useful text between the offsets
        if (sentenceStart < endOffset) {
            addSentenceAnnotation(outputAS, sentenceStart, endOffset);
        }
        return split.end();
    }

    /**
     * Adds a <code>Split</code> annotation covering the given match.
     *
     * @param outputAS the annotation set to add to
     * @param split the match to annotate
     * @param kind value for the <code>kind</code> feature
     * @throws InvalidOffsetException if the annotation cannot be created
     */
    private void addSplitAnnotation(AnnotationSet outputAS, MatchResult split, String kind)
            throws InvalidOffsetException {
        FeatureMap features = Factory.newFeatureMap();
        features.put("kind", kind);
        outputAS.add(Long.valueOf(split.start()), Long.valueOf(split.end()), "Split", features);
    }

    /**
     * Adds a sentence annotation, with no features, over the given span.
     *
     * @param outputAS the annotation set to add to
     * @param start start offset of the sentence
     * @param end end offset of the sentence
     * @throws InvalidOffsetException if the annotation cannot be created
     */
    private void addSentenceAnnotation(AnnotationSet outputAS, int start, int end)
            throws InvalidOffsetException {
        outputAS.add(Long.valueOf(start), Long.valueOf(end),
                ANNIEConstants.SENTENCE_ANNOTATION_TYPE, Factory.newFeatureMap());
    }

    /**
     * Checks whether a possible match is being vetoed by a non split match. A
     * possible match is vetoed if it has any overlap with a veto region.
     *
     * @param split the match result representing the split to be tested
     * @param vetoRegions regions where matches are not allowed, as
     * <code>{start, end}</code> pairs with an exclusive end. For efficiency
     * reasons, this method assumes these regions to be non overlapping and sorted
     * in ascending order.
     * All veto regions that end before the proposed match are removed from the
     * list (again for efficiency reasons). This requires the proposed matches to
     * be sent to this method in ascending order, so as to avoid malfunctions.
     * @return {@code true} iff the proposed split should be ignored
     */
    private boolean veto(MatchResult split, List<int[]> vetoRegions) {
        for (Iterator<int[]> vetoRegIter = vetoRegions.iterator(); vetoRegIter.hasNext();) {
            int[] aVetoRegion = vetoRegIter.next();
            if (aVetoRegion[1] - 1 < split.start()) {
                //current veto region ends before the proposed split starts
                //--> discard the veto region
                vetoRegIter.remove();
            } else if (split.end() - 1 < aVetoRegion[0]) {
                //veto region starts after the split ends
                //-> we can return false
                return false;
            } else {
                //we have overlap
                return true;
            }
        }
        //if we got this far, no veto regions are left: accept the split
        return false;
    }

    /**
     * Validates the init-time parameters and compiles the three pattern lists.
     *
     * @return this resource
     * @throws ResourceInstantiationException if a required parameter is
     * missing, a pattern file cannot be read, or a pattern is not a valid
     * regular expression
     */
    @Override
    public Resource init() throws ResourceInstantiationException {
        super.init();
        try {
            //sanity checks
            if (internalSplitListURL == null) {
                throw new ResourceInstantiationException("No list of internal splits provided!");
            }
            if (externalSplitListURL == null) {
                throw new ResourceInstantiationException("No list of external splits provided!");
            }
            if (nonSplitListURL == null) {
                throw new ResourceInstantiationException("No list of non splits provided!");
            }
            if (encoding == null) {
                throw new ResourceInstantiationException("No encoding provided!");
            }

            //load and compile the three pattern lists
            internalSplitsPattern = compilePattern(internalSplitListURL, encoding);
            externalSplitsPattern = compilePattern(externalSplitListURL, encoding);
            nonSplitsPattern = compilePattern(nonSplitListURL, encoding);
        } catch (UnsupportedEncodingException e) {
            throw new ResourceInstantiationException(e);
        } catch (IOException e) {
            throw new ResourceInstantiationException(e);
        } catch (PatternSyntaxException e) {
            //a malformed pattern in one of the lists is a configuration error
            throw new ResourceInstantiationException(e);
        }

        return this;
    }

    /**
     * @return the outputASName
     */
    public String getOutputASName() {
        return outputASName;
    }

    /**
     * @param outputASName the outputASName to set
     */
    @RunTime
    @Optional
    @CreoleParameter(comment = "The annotation set to be used as output for 'Sentence' and 'Split' annotations")
    public void setOutputASName(String outputASName) {
        this.outputASName = outputASName;
    }

    /**
     * @return the encoding
     */
    public String getEncoding() {
        return encoding;
    }

    /**
     * @param encoding the encoding to set
     */
    @CreoleParameter(comment = "The encoding used for reading the definition files", defaultValue = "UTF-8")
    public void setEncoding(String encoding) {
        this.encoding = encoding;
    }

    /**
     * @return the internalSplitListURL
     */
    public URL getInternalSplitListURL() {
        return internalSplitListURL;
    }

    /**
     * @param internalSplitListURL the internalSplitListURL to set
     */
    @CreoleParameter(defaultValue = "resources/regex-splitter/internal-split-patterns.txt", suffixes = "txt", comment = "The URL to the internal splits pattern list")
    public void setInternalSplitListURL(URL internalSplitListURL) {
        this.internalSplitListURL = internalSplitListURL;
    }

    /**
     * @return the externalSplitListURL
     */
    public URL getExternalSplitListURL() {
        return externalSplitListURL;
    }

    /**
     * @param externalSplitListURL the externalSplitListURL to set
     */
    @CreoleParameter(defaultValue = "resources/regex-splitter/external-split-patterns.txt", comment = "The URL to the external splits pattern list", suffixes = "txt")
    public void setExternalSplitListURL(URL externalSplitListURL) {
        this.externalSplitListURL = externalSplitListURL;
    }

    /**
     * @return the nonSplitListURL
     */
    public URL getNonSplitListURL() {
        return nonSplitListURL;
    }

    /**
     * @param nonSplitListURL the nonSplitListURL to set
     */
    @CreoleParameter(defaultValue = "resources/regex-splitter/non-split-patterns.txt", comment = "The URL to the non splits pattern list", suffixes = "txt")
    public void setNonSplitListURL(URL nonSplitListURL) {
        this.nonSplitListURL = nonSplitListURL;
    }

    /**
     * @return the internalSplitsPattern
     */
    public Pattern getInternalSplitsPattern() {
        return internalSplitsPattern;
    }

    /**
     * @param internalSplitsPattern the internalSplitsPattern to set
     */
    public void setInternalSplitsPattern(Pattern internalSplitsPattern) {
        this.internalSplitsPattern = internalSplitsPattern;
    }
}