importer.filters.CceFilter.java Source code

Java tutorial

Introduction

Here is the source code for importer.filters.CceFilter.java

Source

/*
 * This file is part of Importer.
 *
 *  Importer is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  Importer is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with Importer.  If not, see <http://www.gnu.org/licenses/>.
 *  (c) copyright Desmond Schmidt 2015
 */

package importer.filters;

import importer.constants.CSSStyles;
import importer.exception.ImporterException;
import importer.Archive;
import calliope.core.json.corcode.Range;
import java.io.IOException;
import java.io.CharArrayWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.File;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Stack;
import java.util.ArrayList;
import java.util.StringTokenizer;
import org.json.simple.JSONObject;

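/**
 * Convert CCE (Cambridge Conrad Edition) documents for import to HRIT.
 * Each input text is split into plain text, stored in the cortex archive,
 * and markup ranges, stored in the corcode archive as a STIL document.
 * @author desmond
 */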
public class CceFilter extends Filter {
    /** Initialised to true; not referenced anywhere else in this class. */
    static boolean preservingLineBreaks = true;
    /** Value of the written counter at which the current paragraph or heading range starts. */
    int paraStart;
    /** Cleared in init() and set again at the start of convert(). */
    boolean paraSeen;
    /** Heading command code -> name of the range applied to the line it occurs in. */
    HashMap<String, String> headings;
    /** Inline format command code -> name of the range opened by that command. */
    HashMap<String, String> formats;
    /** Character command code -> replacement text written to the output. */
    HashMap<String, String> characters;
    /** Accent command code -> table mapping a base letter to its accented form. */
    HashMap<String, Map<Character, Character>> accents;
    /** Command codes that close the most recently opened inline format. */
    HashSet<String> endcodes;
    /** Range names of the heading commands seen on the line being processed. */
    ArrayList<String> lineCommands;
    /** Inline formats that have been opened and not yet closed, innermost last. */
    Stack<Location> nestedCommands;

/**
 * Build the lookup tables that drive the conversion: character
 * substitutions, accent tables, line-level heading codes, inline format
 * codes and the codes that end an inline format.
 *
 * NOTE(review): in this copy of the file many replacement literals are
 * empty ("" or '') or a bare '?'. An empty character literal is not legal
 * Java, so these look like non-ASCII glyphs that were lost in a charset
 * conversion -- restore them from the upstream source before building.
 */
public CceFilter()
{
    super();
    // character substitutions (aka entities): command code -> replacement text
    // NOTE(review): most replacements below are empty strings in this copy;
    // presumably each held a single special character -- confirm.
    characters = new HashMap<String,String>();
    characters.put(" -- ","  ");
    characters.put("op\"","");
    characters.put("op'","");
    characters.put("---","");
    characters.put("oe","");
    characters.put("ae","");
    characters.put("mi","");
    characters.put("bp","");
    characters.put("di","");
    characters.put("l/","");
    characters.put("ecd","");
    characters.put("acd","");
    characters.put("s6","");
    characters.put("pi","");
    characters.put("s1","");
    characters.put("s2","");
    characters.put("s3","");
    characters.put("s4","");
    characters.put("s5","");
    characters.put("s7","");
    characters.put("|","|");
    characters.put("prg","\n   ");
    characters.put("n","");
    // accents: accent code -> (base letter -> accented letter)
    accents = new HashMap<String,Map<Character,Character>>();
    HashMap<Character,Character> acutes = new HashMap<Character,Character>();
    acutes.put('e','');
    acutes.put('a','');
    acutes.put('i','');
    accents.put("a",acutes);
    HashMap<Character,Character> graves = new HashMap<Character,Character>();
    graves.put('a','');
    graves.put('e','');
    // the grave table is reachable under two codes, "gr" and "g"
    accents.put("gr",graves);
    accents.put("g",graves);
    HashMap<Character,Character> umlauts = new HashMap<Character,Character>();
    umlauts.put('u','');
    umlauts.put('o','');
    umlauts.put('a','');
    umlauts.put('i','');
    accents.put("um",umlauts);
    HashMap<Character,Character> tildes = new HashMap<Character,Character>();
    tildes.put('n','');
    tildes.put('a','');
    tildes.put('o','');
    accents.put("tld",tildes);
    HashMap<Character,Character> macrons = new HashMap<Character,Character>();
    macrons.put('a','?');
    macrons.put('e','');
    macrons.put('i','');
    macrons.put('o','?');
    macrons.put('u','');
    accents.put("mac",macrons);
    HashMap<Character,Character> breves = new HashMap<Character,Character>();
    breves.put('a','');
    breves.put('e','');
    breves.put('i','');
    breves.put('o','?');
    breves.put('u','');
    // NOTE(review): "mac" is reused as the key for breves, icarets and
    // itildes; each put() replaces the previous mapping, so only the last
    // table (itildes) stays reachable under "mac" and the macron, breve and
    // icaret tables are never used. Presumably each table was meant to have
    // its own code -- confirm the intended keys.
    accents.put("mac",breves);
    HashMap<Character,Character> icarets = new HashMap<Character,Character>();
    icarets.put('e','');
    icarets.put('u','');
    accents.put("mac",icarets);
    HashMap<Character,Character> itildes = new HashMap<Character,Character>();
    itildes.put('e','');
    itildes.put('i','');
    itildes.put('u','');
    accents.put("mac",itildes);
    // headings cancelled at line-end: heading code -> range name
    headings = new HashMap<String,String>();
    headings.put("ht","head");
    headings.put("ct","head");
    headings.put("pt","titlePart");
    headings.put("cn","head");
    headings.put("pn","head");
    headings.put("sxt","head");
    headings.put("au","head-italic");
    headings.put("cst","head-italic");
    headings.put("sti","head-italic");
    headings.put("sht","head-italic");
    headings.put("ha","head");
    headings.put("hb","head");
    headings.put("hn","head");
    headings.put("ep","epigraph");
    headings.put("sep","epigraph");
    headings.put("epa","signed");
    headings.put("eph","head");
    headings.put("seph","head-italic");
    headings.put("epha","signed");
    headings.put("de","dedication");
    headings.put("sde","dedication");
    headings.put("cpt","desc");
    headings.put("sx","head");
    headings.put("sx2","head");
        
    // inline formats: format code -> range name; a format stays open until
    // one of the end codes below is read
    formats = new HashMap<String,String>();
    formats.put("cop","dedication");
    formats.put("it","emph");
    formats.put("oi","emph");
    formats.put("bo","bold");
    formats.put("sc","smallcaps");
    formats.put("i","subscript");
    formats.put("ii","subsubscript");
    formats.put("s","superscript");
    formats.put("ss","supersuperscript");
    formats.put("sp","speaker");
    formats.put("sd","stage-italic");
        
    // codes that close the most recently opened inline format
    endcodes = new HashSet<String>();
    endcodes.add("stx");
    endcodes.add("ro");
    endcodes.add("ei");
    endcodes.add("es");
    endcodes.add("/as");
    endcodes.add("/sp");
    endcodes.add("/sd");
    // per-conversion working state, emptied again by init()
    lineCommands = new ArrayList<String>();
    nestedCommands = new Stack<Location>();
}

    /**
     * Provide the human-readable name of this filter.
     * @return the fixed label for the edition this filter handles
     */
    public String getDescription() {
        final String label = "Cambridge Conrad Edition";
        return label;
    }

    /**
     * Accept a JSON configuration document. The body is empty, so the
     * supplied document is ignored by this filter.
     * @param jdoc the configuration document (unused here)
     */
    public void configure(JSONObject jdoc) {
    }

    /**
     * Handle one dot command. Only the page command (second character 'p')
     * is recognised: it records an empty "pb" range at the current write
     * position, annotated with "n" set to whatever follows the first two
     * characters of the line. Any other command is reported on standard
     * output and otherwise ignored. Nothing is written to the text itself.
     * @param line the line containing the command
     * @param txt the character output buffer (not used by this method)
     * @throws Exception declared for callers; see the markup calls below
     */
    void convertDotCommand(String line, CharArrayWriter txt) throws Exception {
        boolean isPageCommand = line.length() > 1 && line.charAt(1) == 'p';
        if (!isPageCommand) {
            System.out.println("unknown dot command " + line + " ignored");
            return;
        }
        // zero-length range: marks a position rather than a span of text
        Range pageBreak = new Range("pb", written, 0);
        pageBreak.addAnnotation("n", line.substring(2));
        markup.add(pageBreak);
    }

    /**
     * Reset the state kept between conversions: run the superclass reset,
     * then empty the pending heading and open-format collections and put
     * the paragraph tracking back to its starting values.
     */
    protected void init() {
        super.init();
        nestedCommands.clear();
        lineCommands.clear();
        paraStart = 0;
        paraSeen = false;
    }

    /**
     * Work out which line terminator a text uses. CR-LF is checked first so
     * that it is not mistaken for a bare LF; a text containing neither is
     * reported as using CR.
     * @param text the text to inspect
     * @return "\r\n", "\n" or "\r"
     */
    String findLineEnding(String text) {
        if (text.contains("\r\n")) {
            return "\r\n";
        }
        return text.contains("\n") ? "\n" : "\r";
    }

    /**
     * Convert one CCE text into plain text plus markup ranges.
     * Each line is classified as a dot command (a leading '.' followed by a
     * letter), a paragraph start (the raw line begins with three spaces) or
     * a continuation line. The resulting text is stored in cortex and the
     * sorted markup, serialised via toSTILDocument(), in corcode, both
     * under the given name.
     * NOTE(review): any exception is printed to System.out and swallowed, so
     * the declared ImporterException is never thrown from here and callers
     * cannot tell that a conversion failed -- confirm this is intended.
     * @param input the raw text input string
     * @param name the name of the new version
     * @param cortex a cortex mvd archive
     * @param corcode a corcode mvd archive
     * @return the log; always the empty string in this implementation
     * @throws ImporterException declared, but not raised by the code shown
     */
    public String convert(String input, String name, Archive cortex, Archive corcode) throws ImporterException {
        try {
            init();
            CharArrayWriter txt = new CharArrayWriter();
            String lastWord = "";
            String firstWord = "";
            String lineEnd = findLineEnding(input);
            String[] lines = input.split(lineEnd);
            // NOTE(review): init() has just set paraSeen to false and it is
            // forced to true here, so the "!paraSeen" branch below can never
            // be taken -- confirm which behaviour is wanted
            paraSeen = true;
            for (int i = 0; i < lines.length; i++) {
                // str is the trimmed line; the untrimmed lines[i] is still
                // needed below to detect paragraph indentation
                String str = lines[i].trim();
                firstWord = getFirstWord(str);
                if (str.startsWith(".") && str.length() > 1 && Character.isLetter(str.charAt(1))) {
                    convertDotCommand(str, txt);
                    // written and lastEndsInHyphen are maintained outside this
                    // class; presumably the character count so far and whether
                    // the last text written ended in a hyphen -- confirm
                    if (!lastEndsInHyphen && written > 0)
                        writeCurrent(txt, SPACE);
                    // don't reset lastWord
                    continue;
                } else if (lines[i].startsWith("   ")) {
                    Range r;
                    if (!paraSeen)
                        paraSeen = true;
                    else if (written > paraStart) {
                        // write previous para range
                        r = new Range("p", paraStart, written - paraStart);
                        markup.add(r);
                    }
                    if (written > 0)
                        writeCurrent(txt, CR);
                    paraStart = written;
                    // markup new paragraphs with 4 spaces for readability
                    r = new Range(CSSStyles.PARA_START, written, 4);
                    markup.add(r);
                    writeLineContents("    " + str, txt);
                } else {
                    if (lastEndsInHyphen) {
                        // classify the hyphen that ended the previous line,
                        // then mark the line break that follows it
                        Range r;
                        if (isHardHyphen(lastWord, firstWord)) {
                            r = new Range(CSSStyles.STRONG, written - 1, 1);
                            markup.add(r);
                        } else {
                            r = new Range(CSSStyles.WEAK, written - 1, 1);
                            markup.add(r);
                        }
                        writeCurrent(txt, CR);
                        r = new Range(CSSStyles.HYPHEN_CR, written - 1, 1);
                        markup.add(r);
                    } else if (written > 0) {
                        writeCurrent(txt, CR);
                        // if only the line break has been written since
                        // paraStart, move paraStart past it
                        if (written == paraStart + 1)
                            paraStart = written;
                    }
                    writeLineContents(str, txt);
                }
                if (!lineCommands.isEmpty()) {
                    // heading codes seen on this line: each becomes a range
                    // from paraStart to the current position, added in
                    // reverse order of occurrence
                    for (int j = lineCommands.size() - 1; j >= 0; j--) {
                        Range r = new Range(lineCommands.get(j), paraStart, written - paraStart);
                        markup.add(r);
                    }
                    lineCommands.clear();
                    // text after a heading starts a fresh range
                    paraStart = written;
                }
                lastWord = getLastWord(str);
            }
            // write closing para range
            if (written > paraStart) {
                Range r = new Range("p", paraStart, written - paraStart);
                markup.add(r);
            }
            // NOTE(review): formats still open in nestedCommands at this
            // point produce no range and are not reported
            markup.sort();
            char[] chars = txt.toCharArray();
            cortex.put(name, chars);
            String json = markup.toSTILDocument().toString();
            corcode.put(name, json.toCharArray());
        } catch (Exception e) {
            e.printStackTrace(System.out);
        }
        return "";
    }

/**
 * Escape quotes when writing text
 * @param input the input text
 * @return the escaped version of input
 */
String escape( String input )
{
    StringBuilder sb = new StringBuilder( input );
    for ( int i=0;i<sb.length();i++ )
    {
        char token = sb.charAt(i);
        if ( token == 34 )
            sb.setCharAt(i,'?');
        else if ( token == 39 )
            sb.setCharAt(i,'');
    }
    return sb.toString();
}

    /**
     * An accent command has been read. Apply it to the last character of the
     * pending text and write the result.
     * <p>If the accent code is unknown, there is no pending text, or the
     * accent table has no entry for the last pending character, the pending
     * text (if any) is written unchanged, followed by the raw accent code,
     * so that no input is lost.
     * @param txt the character output buffer
     * @param pending the text preceding the accent command, may be null
     * @param accent the name of the accent in CCE
     * @throws Exception if writing to the output fails
     */
    void processAccent(CharArrayWriter txt, String pending, String accent) throws Exception {
        Map<Character, Character> map = accents.get(accent);
        if (map != null && pending != null && pending.length() > 0) {
            int lastPos = pending.length() - 1;
            Character accented = map.get(pending.charAt(lastPos));
            if (accented != null) {
                // go through writePending so accented text gets the same
                // treatment as any other pending text
                writePending(txt, pending.substring(0, lastPos) + accented);
                return;
            }
        }
        writePending(txt, pending);
        writeCurrent(txt, accent.toCharArray());
    }

    /**
     * Write out the pending text, if there is any.
     * NOTE(review): the replacement for "--" is an empty string in this copy
     * of the file, which simply deletes double hyphens; presumably a dash
     * glyph was lost in a charset conversion -- confirm and restore.
     * @param txt the character output buffer
     * @param pending the pending text, may be null
     * @return always null, so callers can assign it to clear their pending text
     * @throws IOException if writing to the output fails
     */
    String writePending(CharArrayWriter txt, String pending) throws IOException {
        if (pending != null) {
            writeCurrent(txt, pending.replace("--", "").toCharArray());
        }
        return null;
    }

    /**
     * Write the contents of a line. The line is split on braces; text
     * outside braces is held as pending until the next command or the end
     * of the line, and each braced token is interpreted as an accent,
     * character substitution, heading, inline format or end code.
     * @param line the current line
     * @param txt the character output buffer
     * @return true (always)
     * @throws Exception if writing to the output fails
     */
    boolean writeLineContents(String line, CharArrayWriter txt) throws Exception {
        StringTokenizer st = new StringTokenizer(line, "{}", true);
        int state = 0;
        String pending = null;
        while (st.hasMoreTokens()) {
            String token = st.nextToken();
            switch (state) {
            case 0: // looking for '{'
                if (token.equals("{"))
                    state = 1;
                else {
                    // a heading or unknown command leaves earlier text
                    // pending; write it now or it would be overwritten
                    writePending(txt, pending);
                    pending = token;
                }
                break;
            case 1: // reading command
                if (accents.containsKey(token)) {
                    // the accent modifies the last pending character, so the
                    // pending text must reach processAccent before it is
                    // cleared (processAccent writes it out itself)
                    processAccent(txt, pending, token);
                    pending = null;
                } else if (characters.containsKey(token)) {
                    pending = writePending(txt, pending);
                    writeCurrent(txt, characters.get(token).toCharArray());
                } else if (headings.containsKey(token)) {
                    lineCommands.add(headings.get(token));
                } else if (formats.containsKey(token)) {
                    // flush first so the format starts after the pending text
                    pending = writePending(txt, pending);
                    nestedCommands.push(new Location(formats.get(token), written));
                } else if (endcodes.contains(token) && nestedCommands.size() > 0) {
                    Location loc = nestedCommands.pop();
                    // flush first so the range covers the pending text
                    pending = writePending(txt, pending);
                    Range r = new Range(loc.name, loc.loc, written - loc.loc);
                    markup.add(r);
                } else
                    System.out.println("ignoring unknown command " + token);
                state = 2;
                break;
            case 2: // reading closing brace
                if (!token.equals("}")) {
                    System.out.println("missing closing brace");
                }
                state = 0;
                break;
            }
        }
        writePending(txt, pending);
        return true;
    }

    /**
     * A named position: pairs a range name with the offset at which that
     * range begins, so the range can be completed once its end is known.
     */
    class Location {
        /** offset at which the range begins */
        int loc;
        /** name of the range */
        String name;

        /**
         * Record a named position.
         * @param name the name of the range
         * @param loc the offset at which the range begins
         */
        Location(String name, int loc) {
            this.name = name;
            this.loc = loc;
        }
    }

    /**
     * Read an entire file and decode it as UTF-8.
     * The stream is always closed, and reading is repeated until the buffer
     * is full or the end of the file is reached, because a single read()
     * call is allowed to return fewer bytes than were asked for.
     * @param name the path of the file to read
     * @return the file's contents, or the empty string if it could not be read
     */
    static String readFile(String name) {
        FileInputStream fis = null;
        try {
            File f = new File(name);
            byte[] data = new byte[(int) f.length()];
            fis = new FileInputStream(f);
            int filled = 0;
            while (filled < data.length) {
                int n = fis.read(data, filled, data.length - filled);
                if (n < 0)
                    break; // file turned out shorter than its reported length
                filled += n;
            }
            return new String(data, 0, filled, "UTF-8");
        } catch (Exception e) {
            return "";
        } finally {
            if (fis != null) {
                try {
                    fis.close();
                } catch (IOException ignored) {
                    // the data has already been read (or the read failed);
                    // a failure to close changes nothing for the caller
                }
            }
        }
    }

    /**
     * Write a string to a file as UTF-8, replacing any existing contents.
     * Failures are reported on standard output rather than thrown. The
     * stream is closed even when the write itself fails.
     * @param data the text to write
     * @param name the path of the file to write to
     */
    static void writeFile(String data, String name) {
        FileOutputStream fos = null;
        try {
            fos = new FileOutputStream(new File(name));
            fos.write(data.getBytes("UTF-8"));
        } catch (Exception e) {
            System.out.println(e.getMessage());
        } finally {
            if (fos != null) {
                try {
                    fos.close();
                } catch (IOException e) {
                    System.out.println(e.getMessage());
                }
            }
        }
    }

    /**
     * Command-line test driver: convert the file named by the first argument
     * as version "A1" and write the resulting text and markup to cortex.txt
     * and corcode.txt in the working directory.
     * @param args args[0] is the path of the CCE file to convert
     */
    public static void main(String[] args) {
        // without this check a missing argument surfaces as an
        // ArrayIndexOutOfBoundsException instead of a usable message
        if (args == null || args.length < 1) {
            System.out.println("usage: java importer.filters.CceFilter <input-file>");
            return;
        }
        String input = readFile(args[0]);
        Archive cortex = new Archive("Nostromo", "Joseph Conrad", "TEI/default", "UTF-8");
        Archive corcode = new Archive("Nostromo", "Joseph Conrad", "TEI/default", "UTF-8");
        try {
            String log = new CceFilter().convert(input, "A1", cortex, corcode);
            System.out.println(log);
            writeFile(new String(cortex.get("A1")), "cortex.txt");
            writeFile(new String(corcode.get("A1")), "corcode.txt");
        } catch (Exception e) {
            System.out.println(e.getMessage());
        }
    }
}