com.joliciel.talismane.filters.SentenceHolderImpl.java Source code

Java tutorial

Introduction

Here is the source code for com.joliciel.talismane.filters.SentenceHolderImpl.java

Source

///////////////////////////////////////////////////////////////////////////////
//Copyright (C) 2012 Assaf Urieli
//
//This file is part of Talismane.
//
//Talismane is free software: you can redistribute it and/or modify
//it under the terms of the GNU Affero General Public License as published by
//the Free Software Foundation, either version 3 of the License, or
//(at your option) any later version.
//
//Talismane is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//GNU Affero General Public License for more details.
//
//You should have received a copy of the GNU Affero General Public License
//along with Talismane.  If not, see <http://www.gnu.org/licenses/>.
//////////////////////////////////////////////////////////////////////////////
package com.joliciel.talismane.filters;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * A stretch of text together with the indexes at which sentences end inside
 * that text, able to cut itself up into individual {@link Sentence} objects.
 * <p>
 * A sentence boundary is the index of the last character belonging to a
 * sentence (the character at the boundary is included in the sentence).
 */
class SentenceHolderImpl extends SentenceImpl implements SentenceHolder {
    private static final Log LOG = LogFactory.getLog(SentenceHolderImpl.class);

    /** A run of two or more white space / line break characters. */
    private static final Pattern REPEATED_WHITE_SPACE = Pattern
            .compile("[" + Sentence.WHITE_SPACE + "\n\r]{2,}");
    /** White space / line breaks at the very start of the text (group 1). */
    private static final Pattern OPENING_WHITE_SPACE = Pattern
            .compile("\\A([" + Sentence.WHITE_SPACE + "\n\r]+)");
    /** White space / line breaks at the very end of the text (group 1). */
    private static final Pattern CLOSING_WHITE_SPACE = Pattern
            .compile("([" + Sentence.WHITE_SPACE + "\n\r]+)\\z");

    /** Sentence-final character indexes, kept in ascending order. */
    private final TreeSet<Integer> sentenceBoundaries = new TreeSet<Integer>();

    private FilterService filterService;

    /**
     * Returns the sentence boundaries recorded so far, in ascending order.
     * The returned set is the live internal set, not a copy.
     */
    public Set<Integer> getSentenceBoundaries() {
        return sentenceBoundaries;
    }

    /**
     * Records that a sentence ends at the given index of this holder's text.
     *
     * @param boundary
     *            index of the last character of the sentence
     */
    public void addSentenceBoundary(int boundary) {
        this.sentenceBoundaries.add(boundary);
    }

    /**
     * Splits this holder's text at the recorded sentence boundaries.
     * <p>
     * Each resulting sentence has its opening white space removed and every
     * run of white space reduced to its first character. Closing white space
     * is removed as well, except for a trailing incomplete sentence. If text
     * remains after the last boundary (or there are no boundaries at all and
     * the text is non-empty), that remainder is returned as a final sentence
     * flagged as not complete.
     *
     * @param leftover
     *            an incomplete sentence (presumably carried over from
     *            previously processed text), or <code>null</code>. When
     *            non-null, its text is prepended to the first sentence and
     *            the same object is reused as that first sentence.
     * @return the sentences in text order; empty if there is nothing to split
     */
    @Override
    public List<Sentence> getDetectedSentences(Sentence leftover) {
        if (LOG.isTraceEnabled()) {
            LOG.trace("getDetectedSentences. leftover=" + leftover);
        }

        final String holderText = this.getText();
        final int lastCharIndex = holderText.length() - 1;

        // Is there text after the final boundary that belongs to no complete sentence?
        boolean haveLeftOvers;
        if (this.sentenceBoundaries.isEmpty()) {
            haveLeftOvers = holderText.length() > 0;
        } else {
            int lastSentenceBoundary = this.sentenceBoundaries.last();
            haveLeftOvers = lastSentenceBoundary < lastCharIndex;
            if (LOG.isTraceEnabled()) {
                LOG.trace("haveLeftOvers? " + lastSentenceBoundary + " < " + lastCharIndex + " = "
                        + haveLeftOvers);
            }
        }

        // Treat the end of the text as one extra (incomplete) boundary if needed.
        List<Integer> allBoundaries = new ArrayList<Integer>(this.sentenceBoundaries);
        if (haveLeftOvers) {
            allBoundaries.add(lastCharIndex);
        }

        // NOTE(review): if allBoundaries is empty, a non-null leftover is not
        // returned at all - confirm callers never rely on getting it back.
        List<Sentence> sentences = new ArrayList<Sentence>();
        Sentence pendingLeftover = leftover;
        int currentIndex = 0;

        for (int sentenceBoundary : allBoundaries) {
            boolean isLeftover = haveLeftOvers && sentenceBoundary == lastCharIndex;

            // NOTE(review): a sentence is requested from the service even when
            // the leftover replaces it just below - confirm this is harmless.
            Sentence sentence = filterService.getSentence();
            int leftOverTextLength = 0;
            String text;
            if (pendingLeftover == null) {
                text = holderText.substring(currentIndex, sentenceBoundary + 1);
            } else {
                // only the first sentence absorbs the leftover
                sentence = pendingLeftover;
                String leftoverText = pendingLeftover.getText();
                leftOverTextLength = leftoverText.length();
                text = leftoverText + holderText.substring(currentIndex, sentenceBoundary + 1);
                pendingLeftover = null;
            }

            // work out which characters to drop (trim & duplicate white space)
            int openingWhiteSpaceEnd = endOfOpeningWhiteSpace(text);
            int closingWhiteSpaceStart = isLeftover ? text.length() : startOfClosingWhiteSpace(text);
            Set<Integer> duplicateWhiteSpace = redundantWhiteSpacePositions(text);

            StringBuilder sb = new StringBuilder();
            // position in the holder's own text; only advances past the leftover part
            int holderIndex = currentIndex;
            for (int j = 0; j < text.length(); j++) {
                boolean appendLetter = j >= openingWhiteSpaceEnd && j < closingWhiteSpaceStart
                        && !duplicateWhiteSpace.contains(j);

                if (j >= leftOverTextLength) {
                    // past the leftover text: this character comes from the holder
                    if (appendLetter) {
                        sentence.addOriginalIndex(this.getOriginalIndexes().get(holderIndex));
                    }

                    // Segments are re-keyed to the current output length, even
                    // when the character itself is dropped.
                    // NOTE(review): two segments landing on the same output
                    // position would overwrite each other - confirm intended.
                    if (this.getOriginalTextSegments().containsKey(holderIndex)) {
                        sentence.getOriginalTextSegments().put(sb.length(),
                                this.getOriginalTextSegments().get(holderIndex));
                    }

                    holderIndex++;
                }

                if (appendLetter) {
                    sb.append(text.charAt(j));
                }
            }

            sentence.setText(sb.toString());
            if (LOG.isTraceEnabled()) {
                LOG.trace("sentence.setText |" + sentence.getText() + "|");
            }

            sentence.setComplete(!isLeftover);

            // every sentence receives the holder's full set of newline locations
            for (Entry<Integer, Integer> newlineLocation : this.newlines.entrySet()) {
                sentence.addNewline(newlineLocation.getKey(), newlineLocation.getValue());
            }

            sentence.setFileName(this.getFileName());

            sentences.add(sentence);
            currentIndex = sentenceBoundary + 1;
        }

        return sentences;
    }

    /**
     * Returns the index just past any white space opening the text, or 0 if
     * the text does not start with white space.
     */
    private static int endOfOpeningWhiteSpace(String text) {
        Matcher matcher = OPENING_WHITE_SPACE.matcher(text);
        return matcher.find() ? matcher.end(1) : 0;
    }

    /**
     * Returns the index at which the white space closing the text begins, or
     * the text length if the text does not end with white space.
     */
    private static int startOfClosingWhiteSpace(String text) {
        Matcher matcher = CLOSING_WHITE_SPACE.matcher(text);
        return matcher.find() ? matcher.start(1) : text.length();
    }

    /**
     * Returns the positions of all white space characters that follow another
     * white space character, i.e. everything in a white space run barring its
     * first character.
     */
    private static Set<Integer> redundantWhiteSpacePositions(String text) {
        Set<Integer> positions = new HashSet<Integer>();
        Matcher matcher = REPEATED_WHITE_SPACE.matcher(text);
        while (matcher.find()) {
            int runEnd = matcher.end();
            for (int position = matcher.start() + 1; position < runEnd; position++) {
                positions.add(position);
            }
        }
        return positions;
    }

    /** Returns the service used to create new {@link Sentence} objects. */
    public FilterService getFilterService() {
        return filterService;
    }

    /** Sets the service used to create new {@link Sentence} objects. */
    public void setFilterService(FilterService filterService) {
        this.filterService = filterService;
    }

}