importer.filters.PlayFilter.java Source code

Introduction

Here is the source code for importer.filters.PlayFilter.java
Source

/* This file is part of calliope.
 *
 *  calliope is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  calliope is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with calliope.  If not, see <http://www.gnu.org/licenses/>.
 */

package importer.filters;

import importer.Archive;
import java.util.StringTokenizer;
import java.util.HashSet;
import importer.exception.ImporterException;
import calliope.json.corcode.Range;
import org.json.simple.JSONObject;
import java.util.ArrayList;

/**
 * Import a plain text play
 * 1. Skim through the file looking at all words other than those at the 
 * start of lines or after "\.\s+". Put those words into a dictionary, 
 * but don't add any words that start with a capital.
 * 2. Look at line starts. Recover all words starting with a capital and 
 * ending in . or : followed by \s. Lowercase the word. If it does not 
 * appear in the word list calculate in step 1 then it is a speaker.
 * @author desmond
 */
public class PlayFilter extends Filter {
    HashSet<String> words;
    HashSet<String> speakers;
    HashSet<String> stageKeys;
    int maxLineSyllables;
    static String vowels;
    static String trailing;
    char speakerEnd;
    String sentenceEnd;
    String[] defaultStageKeys = { "Enter", "Exit", "Sennet" };

    public PlayFilter() {
        super();
        words = new HashSet<String>();
        speakers = new HashSet<String>();
        stageKeys = new HashSet<String>();
    }

    /**
     * This MUST be called before calling the filter
     * @param config parameters appropriate for this type of play
     */
    public void configure(JSONObject config) {
        maxLineSyllables = (Integer) config.get(PlayConfig.MAXLINE_SYLLABLES);
        vowels = (String) config.get(PlayConfig.VOWELS);
        trailing = (String) config.get(PlayConfig.TRAILING);
        String se = (String) config.get(PlayConfig.SPEAKER_END);
        speakerEnd = se.charAt(0);
        sentenceEnd = (String) config.get(PlayConfig.SENTENCE_END);
        ArrayList array = (ArrayList) config.get(PlayConfig.STAGE_KEYS);
        for (int i = 0; i < array.size(); i++)
            stageKeys.add((String) array.get(i));
    }

    @Override
    public String getDescription() {
        return "General play filter";
    }

    /**
     * Remove non-letter and hyphen chars from the string
     * @param input the raw punctuated word
     * @return a punctuation-free word (but with hyphens)
     */
    String strip(String input) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < input.length(); i++) {
            char token = input.charAt(i);
            if (Character.isLetter(token) || token == '-')
                sb.append(token);
        }
        return sb.toString();
    }

    /**
     * Is the toke at what looks like sentence-end?
     * @param token the token with perhaps embedded punctuation
     * @return true if it is
     */
    boolean endOfSentence(String token) {
        char last = token.charAt(token.length() - 1);
        return last == '.' || last == '!' || last == '?';
    }

    /**
     * Is the token an ordinary word, i.e. not a name?
     * @param token the token
     * @return true if it's a word
     */
    boolean isWord(String token) {
        return Character.isLowerCase(token.charAt(0));
    }

    /**
     * Construct a list of what you know are definitely words
     * @param input the input text
     * @return error log
     */
    private String buildWordList(String input) {
        StringTokenizer st = new StringTokenizer(input, "\n\t ", true);
        int state = 0;
        while (st.hasMoreElements()) {
            String token = st.nextToken();
            if (token.length() == 0)
                break;
            switch (state) {
            // at line-start
            case 0:
                if (!Character.isWhitespace(token.charAt(0))) {
                    if (endOfSentence(token))
                        state = 2;
                    else
                        state = 1;
                    token = strip(token);
                    if (token.length() == 0)
                        break;
                    if (isWord(token) && !words.contains(token))
                        words.add(token);
                }
                break;
            //not at line-start, not after full stop
            case 1:
                if (!Character.isWhitespace(token.charAt(0))) {
                    if (endOfSentence(token))
                        state = 2;
                    token = strip(token);
                    if (token.length() == 0)
                        break;
                    if (isWord(token)) {
                        if (!words.contains(token))
                            words.add(token);
                    }
                } else if (token.equals("\n"))
                    state = 0;
                break;
            // after sentence end
            case 2:
                if (!Character.isWhitespace(token.charAt(0))) {
                    if (!endOfSentence(token))
                        state = 1;
                    token = strip(token);
                    if (token.length() == 0)
                        break;
                    if (isWord(token)) {
                        if (!words.contains(token))
                            words.add(token);
                    }
                } else if (token.equals("\n"))
                    state = 0;
                break;
            }
        }
        // this will do for now
        return "";
    }

    /** 
     * Is a sentence starting with a speaker?
     * @param sentence the sentence in question
     * @return true if it is a speech
     */
    boolean isSpeech(String sentence) {
        String[] words = sentence.split("[\n\t ]");
        for (int i = 0; i < words.length; i++) {
            String word = words[i] + speakerEnd;
            if (speakers.contains(words[i]) || speakers.contains(word))
                return true;
            else if (words[i].length() > 0)
                return false;
        }
        return false;
    }

    /**
     * Look at line starts. Recover all words starting with a capital and 
     * ending in . or : followed by \s. Lowercase the word. If it occurs in 
     * the configured stagekeys then it is a stage direction.
     * @param input the input text
     * @return the log
     */
    private String identifyStageDirections(String input) throws ImporterException {
        try {
            String[] sentences = input.split(sentenceEnd);
            int offset = 0;
            int stageStart = -1;
            int state = 0;
            int len = 0;
            for (int i = 0; i < sentences.length; i++) {
                char[] current = sentences[i].toCharArray();
                String word1 = firstWord(sentences[i], true);
                int extra = (i < sentences.length - 1) ? 1 : 0;
                switch (state) {
                case 0:
                    if (stageKeys.contains(word1)) {
                        stageStart = offset;
                        state = 1;
                        len = current.length + extra;
                    }
                    break;
                case 1:
                    if (!isSpeech(sentences[i]))
                        len += current.length + extra;
                    else {
                        if (current.length > 0)
                            markup.add("stage", stageStart, len);
                        /*System.out.println("identified stage direction "
                            +input.substring(stageStart,stageStart+len));*/
                        state = 0;
                        len = -1;
                    }
                    break;
                }
                offset += current.length + 1;
            }
            if (state == 1 && len > 0)
                markup.add("stage", stageStart, len);
            return "";
        } catch (Exception e) {
            throw new ImporterException(e);
        }
    }

    /**
     * Can the first word of a sentence be a speaker name?
     * @param sp the first word of the sentence
     * @return true if it's a speaker
     */
    boolean canBeSpeaker(String sp) {
        char last = sp.charAt(sp.length() - 1);
        return last == speakerEnd;
    }

    /**
     * Look at line starts. Recover all words starting with a capital and 
     * ending in speakerEnd. Lowercase the word. If it does not 
     * appear in the word list calculated in step 1 then it is a speaker.
     */
    private String identifySpeakers(String input) throws ImporterException {
        try {
            StringBuilder log = new StringBuilder();
            String[] lines = input.split("\n");
            int offset = 0;
            for (int i = 0; i < lines.length; i++) {
                int index = 0;
                byte[] current = lines[i].getBytes(ENC);
                for (int j = 0; j < lines[i].length(); j++) {
                    char token = lines[i].charAt(j);
                    if (token == speakerEnd || j == lines[i].length() - 1) {
                        index = j + 1;
                        break;
                    } else if (token == ' ' || token == '\t') {
                        index = j;
                        break;
                    }
                }
                if (index > 0) {
                    String speaker = lines[i].substring(0, index);
                    if (canBeSpeaker(speaker)) {
                        String stripped = strip(speaker);
                        String token = stripped.toLowerCase();
                        if (!words.contains(token) && !stageKeys.contains(stripped)) {
                            speakers.add(speaker);
                            // record speaker in markup set
                            markup.add("speaker", lines[i].indexOf(stripped) + offset,
                                    stripped.getBytes(ENC).length);
                        }
                    }
                }
                offset += current.length + 1;
            }
            return log.toString();
        } catch (Exception e) {
            throw new ImporterException(e);
        }
    }

    /**
     * Headings go at the top
     * @param input the input containing just the headings
     * @return the log output if any
     */
    String identifyHeadings(String input) throws ImporterException {
        try {
            int pos = 0;
            String[] lines = input.split("\n");
            for (int i = 0; i < lines.length; i++) {
                byte[] current = lines[i].getBytes(ENC);
                String trimmed = lines[i].trim();
                int local = lines[i].indexOf(trimmed);
                String leading = lines[i].substring(0, local);
                byte[] leadBytes = leading.getBytes(ENC);
                byte[] trimBytes = trimmed.getBytes(ENC);
                markup.add("head", pos + leadBytes.length, trimBytes.length);
                pos += current.length + 1;
            }
            return "";
        } catch (Exception e) {
            throw new ImporterException(e);
        }
    }

    /**
     * Work out if we have a poetic line by counting syllables
     * @param line the line or paragraph to test
     * @return true if it is, else false
     */
    boolean itsALine(String line) {
        int state = 0;
        int numSyllables = 0;
        for (int i = 0; i < line.length(); i++) {
            char token = Character.toLowerCase(line.charAt(i));
            switch (state) {
            case 0: // word start
                if (vowels.indexOf(token) != -1)
                    state = 1;
                else if (Character.isLetter(token))
                    state = 2;
                break;
            case 1: // general vowel state
                if (!Character.isLetter(token)) {
                    numSyllables++;
                    state = 0;
                } else if (vowels.indexOf(token) == -1) {
                    state = 2;
                    numSyllables++;
                }
                break;
            case 2:// consonants
                if (trailing.indexOf(token) != -1)
                    state = 3;
                else if (vowels.indexOf(token) != -1)
                    state = 1;
                else if (!Character.isLetter(token))
                    state = 0;
                break;
            case 3:// trailing vowel after consonant
                if (!Character.isLetter(token))
                    state = 0;
                else if (vowels.indexOf(token) != -1)
                    state = 1;
                else {
                    numSyllables++;
                    state = 2;
                }
                break;
            }
        }
        if (state == 1)
            numSyllables++;
        //System.out.println("detected "+numSyllables+" syllables");
        return numSyllables < maxLineSyllables;
    }

    /**
     * Get the first word of a string
     * @param text the text to get the first word of
     * @param stripPunctuation strip punctuation
     * @return the first word
     */
    String firstWord(String text, boolean stripPunctuation) {
        String word1 = text.trim();
        int pos = 0;
        for (int i = 0; i < word1.length(); i++) {
            char token = word1.charAt(i);
            if (token == speakerEnd) {
                pos = i + 1;
                break;
            } else if (Character.isWhitespace(token)) {
                pos = i;
                break;
            }
        }
        if (pos > 0)
            word1 = word1.substring(0, pos);
        if (stripPunctuation)
            return strip(word1);
        else
            return word1;
    }

    /**
     * Is an entire line just whitespace?
     * @param line the line in question
     * @return true if it is
     */
    boolean isWhitespace(String line) {
        for (int i = 0; i < line.length(); i++) {
            if (!Character.isWhitespace(line.charAt(i)))
                return false;
        }
        return true;
    }

    /**
     * Identify the speaker and break up the body into lines or paragraphs
     * @param input the text of the speech
     * @param offset the offset of the speech
     * @return the log
     */
    String identifySpeech(String input, int offset) throws ImporterException {
        try {
            int speakerOffset = 0;
            byte[] speechBytes = input.getBytes(ENC);
            markup.add("sp", offset, speechBytes.length);
            String word1 = firstWord(input, false);
            if (speakers.contains(word1)) {
                byte[] spBytes = word1.getBytes(ENC);
                int leadIndex = input.indexOf(word1);
                if (leadIndex > 0) {
                    String leading = input.substring(0, leadIndex);
                    leadIndex += leading.getBytes(ENC).length;
                }
                speakerOffset = +spBytes.length + leadIndex;
                byte token = speechBytes[speakerOffset];
                while (speakerOffset < speechBytes.length - 1
                        && (token == ' ' || token == '\t' || token == '\r' || token == '\n')) {
                    speakerOffset++;
                    token = speechBytes[speakerOffset];
                }
            }
            // speakerOffset points to the first non-speaker UTF-8 byte
            String[] lines = input.split("\n");
            for (int i = 0; i < lines.length; i++) {
                byte[] current = lines[i].getBytes(ENC);
                // on other than the 1st line read from start of line
                if (i > 0)
                    speakerOffset = 0;
                if (current.length > speakerOffset) {
                    if (itsALine(lines[i])) {
                        markup.add("l", offset + speakerOffset, current.length - speakerOffset);
                    } else if (!isWhitespace(lines[i]))
                        markup.add("p", offset + speakerOffset, current.length);
                }
                // add 1 for removed CR
                offset += current.length + 1;
            }
            return "";
        } catch (Exception e) {
            throw new ImporterException(e);
        }
    }

    /**
     * Having identified the speakers and stage directions now identify the 
     * intervening speeches
     * @param input the text of the play
     * @return the log
     * @throws ImportException 
     */
    String identifySpeeches(String input) throws ImporterException {
        StringBuilder log = new StringBuilder();
        Range prev = null;
        int rSize = markup.size();
        Range[] ranges = new Range[rSize];
        markup.toArray(ranges);
        for (int i = 0; i < rSize; i++) {
            Range r = ranges[i];
            if (r.name.equals("speaker") || r.name.equals("stage")) {
                if (prev != null)
                    log.append(identifySpeech(input.substring(prev.offset, r.offset), prev.offset));
                if (r.name.equals("stage"))
                    prev = null;
                else
                    prev = r;
            }
        }
        return log.toString();
    }

    /**
     * Convert to standoff properties
     * @param input the raw text input string
     * @param cortext a cortext mvd archive
     * @param corcode a corcode mvd archive
     * @return log output
     */
    @Override
    public String convert(String input, String name, Archive cortex, Archive corcode) throws ImporterException {
        init();
        if (vowels == null)
            throw new ImporterException("configure required before convert");
        StringBuilder log = new StringBuilder();
        log.append(buildWordList(input));
        log.append(identifySpeakers(input));
        log.append(identifyStageDirections(input));
        markup.sort();
        int offset = markup.getFirstOffset();
        if (offset > 0) {
            String headingZone = input.substring(0, offset);
            log.append(identifyHeadings(headingZone));
            markup.sort();
        }
        log.append(identifySpeeches(input));
        markup.sort();
        cortex.put(name, input.toCharArray());
        corcode.put(name, markup.toSTILDocument().toString().toCharArray());
        return log.toString();
    }
}