com.globalsight.everest.tm.util.TmxLevelSplitter.java Source code

Introduction

Here is the source code for com.globalsight.everest.tm.util.TmxLevelSplitter.java
Source

/**
 *  Copyright 2009 Welocalize, Inc. 
 *  
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  
 *  You may obtain a copy of the License at 
 *  http://www.apache.org/licenses/LICENSE-2.0
 *  
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *  
 */

package com.globalsight.everest.tm.util;

import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.List;

import org.dom4j.Document;
import org.dom4j.Element;
import org.dom4j.ElementHandler;
import org.dom4j.ElementPath;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;

import com.sun.org.apache.regexp.internal.RECompiler;
import com.sun.org.apache.regexp.internal.REProgram;
import com.sun.org.apache.regexp.internal.RESyntaxException;

/**
 * Reads a TMX file and splits it into two files: one that contains the level 1
 * translation units (text only) and one that contains the level 2 translation
 * units (units in which at least one segment holds inline tag elements).
 *
 * <p>
 * The output files are named after the input file: BASE-text.EXT and
 * BASE-tags.EXT. The writers and counters are kept in unsynchronized instance
 * fields, so a single instance must not run several splits concurrently.
 */
public class TmxLevelSplitter {
    /** Number of TU elements whose start tag has been seen in the current split. */
    private int m_entryCount = 0;
    /** Number of TUs written to the text-only (level 1) file. */
    private int m_textCount = 0;
    /** Number of TUs written to the tagged (level 2) file. */
    private int m_tagsCount = 0;

    /** Writer for the text-only (level 1) output file; null while no file is open. */
    private PrintWriter m_writer1;
    /** Writer for the tagged (level 2) output file; null while no file is open. */
    private PrintWriter m_writer2;

    /** Value of the version attribute of the input's root tmx element. */
    private String m_version = "";
    /** The input's header element, copied verbatim into both output files. */
    private Element m_header = null;

    // NOTE(review): this pattern is not referenced anywhere else in this class,
    // and it relies on the JDK-internal com.sun.org.apache.regexp classes.
    // Consider java.util.regex.Pattern (or removal) once other users are ruled out.
    private static final REProgram DTD_SEARCH_PATTERN = createSearchPattern("tmx\\d+\\.dtd");

    /**
     * Compiles a regular expression into a reusable program.
     *
     * @param p_pattern the regular expression source
     * @return the compiled program
     * @throws RuntimeException if the pattern has a syntax error; the original
     *         RESyntaxException is attached as the cause
     */
    private static REProgram createSearchPattern(String p_pattern) {
        try {
            return new RECompiler().compile(p_pattern);
        } catch (RESyntaxException e) {
            // Pattern syntax error. Stop the application, but keep the cause
            // so the stack trace shows where the pattern was rejected.
            throw new RuntimeException(e.getMessage(), e);
        }
    }

    //
    // Constructors
    //

    public TmxLevelSplitter() {
    }

    /**
     * Returns the index of the dot that separates the base name from the
     * extension, or -1 if the last path component contains no dot. Dots that
     * occur in directory components (before the last '/' or '\') are ignored.
     */
    private static int extensionSeparatorIndex(String p_name) {
        int dot = p_name.lastIndexOf('.');
        int lastSeparator = Math.max(p_name.lastIndexOf('/'), p_name.lastIndexOf('\\'));

        return dot > lastSeparator ? dot : -1;
    }

    /**
     * Returns the given file name without its extension.
     *
     * @param p_name a file name, optionally with leading directories
     * @return everything before the extension dot, or the whole name when the
     *         last path component has no dot
     */
    public String getBaseName(String p_name) {
        int dot = extensionSeparatorIndex(p_name);

        return dot < 0 ? p_name : p_name.substring(0, dot);
    }

    /**
     * Returns the extension of the given file name, without the dot.
     *
     * @param p_name a file name, optionally with leading directories
     * @return the text after the extension dot, or the empty string when the
     *         last path component has no dot
     */
    public String getExtension(String p_name) {
        int dot = extensionSeparatorIndex(p_name);

        return dot < 0 ? "" : p_name.substring(dot + 1);
    }

    /**
     * Writes a progress or error message to standard error.
     *
     * @param p_message the message to print
     */
    public void log(String p_message) {
        System.err.println(p_message);
    }

    /**
     * Opens both output files and writes their TMX prolog.
     *
     * @param p_base base name (path without extension) of the output files
     * @param p_extension extension of the output files, without the dot; when
     *        null or empty the output names get no extension
     * @throws Exception if an output file cannot be created or started
     */
    public void startFiles(String p_base, String p_extension) throws Exception {
        // Avoid producing names with a dangling dot when there is no extension.
        String suffix = (p_extension == null || p_extension.length() == 0) ? "" : "." + p_extension;

        m_writer1 = startFile(p_base + "-text" + suffix);
        m_writer2 = startFile(p_base + "-tags" + suffix);
    }

    /**
     * Creates one UTF-8 output file and writes the XML declaration, the TMX
     * declaration, the root element, the stored header and the opening body
     * tag.
     *
     * @param p_filename path of the file to create
     * @return a writer positioned inside the body element
     * @throws IllegalStateException if no header has been read yet
     * @throws Exception if the file cannot be created
     */
    public PrintWriter startFile(String p_filename) throws Exception {
        if (m_header == null) {
            // Fail before creating a file that could only hold a broken prolog.
            throw new IllegalStateException("Cannot start " + p_filename + ": no TMX header has been read");
        }

        log("Starting file " + p_filename);

        PrintWriter writer = new PrintWriter(
                new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(p_filename)), "UTF8"));

        boolean started = false;
        try {
            writer.println("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>");

            // Parser should read and build header.
            Tmx temp = new Tmx();
            temp.setTmxVersion(m_version);

            writer.println(temp.getTmxDeclaration());

            writer.println("<tmx version=\"" + m_version + "\" >");
            writer.println(m_header.asXML());
            writer.println("<body>");

            started = true;
        } finally {
            if (!started) {
                // Do not leak the file handle when the prolog could not be written.
                writer.close();
            }
        }

        return writer;
    }

    /**
     * Writes the closing body and tmx tags to one output file and closes it.
     *
     * @param p_writer the writer to finish; may be null, in which case nothing
     *        happens
     * @return true if the writer reported an I/O error at any point
     */
    private static boolean finishFile(PrintWriter p_writer) {
        if (p_writer == null) {
            return false;
        }

        p_writer.println("</body>");
        p_writer.println("</tmx>");
        p_writer.close();

        // PrintWriter never throws on write; its error flag is the only way to
        // find out that output was lost.
        return p_writer.checkError();
    }

    /**
     * Completes and closes both output files. Writers that were never opened
     * are skipped. Both writer fields are reset to null even on failure.
     *
     * @throws Exception (a java.io.IOException) if either writer reported an
     *         I/O error while the files were being written
     */
    public void closeFiles() throws Exception {
        PrintWriter textWriter = m_writer1;
        PrintWriter tagsWriter = m_writer2;

        m_writer1 = null;
        m_writer2 = null;

        boolean textFailed = finishFile(textWriter);
        boolean tagsFailed = finishFile(tagsWriter);

        if (textFailed || tagsFailed) {
            throw new java.io.IOException("I/O error while writing the split TMX output files");
        }
    }

    /**
     * Closes any open output files without writing the closing tags. Used when
     * a split fails, so that file handles are released while the original
     * failure stays the one reported to the caller.
     */
    private void abandonFiles() {
        if (m_writer1 != null) {
            m_writer1.close();
            m_writer1 = null;
        }

        if (m_writer2 != null) {
            m_writer2.close();
            m_writer2 = null;
        }
    }

    /**
     * Writes one serialized TU to the text-only (level 1) file.
     *
     * @param p_message the XML text to write
     */
    public void writeTextEntry(String p_message) {
        m_writer1.println(p_message);
    }

    /**
     * Writes one serialized TU to the tagged (level 2) file.
     *
     * @param p_message the XML text to write
     */
    public void writeTagsEntry(String p_message) {
        m_writer2.println(p_message);
    }

    /**
     * Tells whether any segment (tuv/seg) of a TU contains a child element.
     *
     * @param p_tu the tu element to inspect
     * @return true if at least one seg has an element child
     */
    public boolean containsTags(Element p_tu) {
        List segs = p_tu.selectNodes("tuv/seg");

        for (int i = 0, max = segs.size(); i < max; i++) {
            if (containsTags2((Element) segs.get(i))) {
                return true;
            }
        }

        return false;
    }

    /**
     * Tells whether a single segment has at least one child node that is an
     * element (as opposed to text or other node types).
     *
     * @param p_seg the seg element to inspect
     * @return true if a child element exists
     */
    public boolean containsTags2(Element p_seg) {
        for (int i = 0, max = p_seg.nodeCount(); i < max; i++) {
            Node child = p_seg.node(i);

            if (child instanceof Element) {
                return true;
            }
        }

        return false;
    }

    /**
     * Splits the given TMX document into a text-only file and a tagged file.
     *
     * <p>
     * The document is read with validation enabled. The output files are
     * opened as soon as the header element has been read; every completed tu
     * element is then written to one of the two files and detached from the
     * tree. On failure the output files are closed (possibly incomplete) and
     * the failure is passed on to the caller.
     *
     * @param p_url location of the TMX document; it is passed to the reader
     *        and also used to derive the output file names
     * @throws Exception if the document cannot be read or parsed, or if the
     *         output files cannot be created or written
     */
    public void split(String p_url) throws Exception {
        final String baseName = getBaseName(p_url);
        final String extension = getExtension(p_url);

        // Start from a clean slate so that an instance can be reused.
        m_entryCount = 0;
        m_textCount = 0;
        m_tagsCount = 0;
        m_version = "";
        m_header = null;

        SAXReader reader = new SAXReader();
        reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser");

        reader.setEntityResolver(DtdResolver.getInstance());
        reader.setValidation(true);

        log("Splitting document `" + p_url + "'");

        // Remember the TMX version as soon as the root element starts.
        reader.addHandler("/tmx", new ElementHandler() {
            public void onStart(ElementPath path) {
                m_version = path.getCurrent().attributeValue("version");
            }

            public void onEnd(ElementPath path) {
            }
        });

        // Once the header is complete, keep it and open the output files.
        reader.addHandler("/tmx/header", new ElementHandler() {
            public void onStart(ElementPath path) {
            }

            public void onEnd(ElementPath path) {
                Element element = path.getCurrent();

                m_header = element;

                try {
                    startFiles(baseName, extension);
                } catch (Exception ex) {
                    log(ex.toString());

                    // Abort the parse instead of terminating the JVM; the
                    // failure reaches the caller of split(), possibly wrapped
                    // by the reader.
                    throw new RuntimeException("Cannot create output files for " + baseName, ex);
                }

                // prune the current element to reduce memory
                element.detach();
            }
        });

        // enable element complete notifications to conserve memory
        reader.addHandler("/tmx/body/tu", new ElementHandler() {
            public void onStart(ElementPath path) {
                ++m_entryCount;
            }

            public void onEnd(ElementPath path) {
                Element element = path.getCurrent();

                if (containsTags(element)) {
                    writeTagsEntry(element.asXML());

                    m_tagsCount++;
                } else {
                    writeTextEntry(element.asXML());

                    m_textCount++;
                }

                // prune the current element to reduce memory
                element.detach();
            }
        });

        boolean parsed = false;
        try {
            // All work happens in the handlers; the returned document is not needed.
            reader.read(p_url);
            parsed = true;
        } finally {
            if (!parsed) {
                // Release the file handles without masking the parse failure.
                abandonFiles();
            }
        }

        closeFiles();

        log("Processed " + m_entryCount + " TUs, " + m_textCount + " level 1 (text), " + m_tagsCount
                + " level 2 (tags)");

        // all done
    }

    /**
     * Command-line entry point: splits the single file named on the command
     * line, or prints a usage message and exits with status 1.
     *
     * @param argv command-line arguments; exactly one file name is expected
     * @throws Exception if the split fails
     */
    public static void main(String[] argv) throws Exception {
        TmxLevelSplitter a = new TmxLevelSplitter();

        if (argv.length != 1) {
            System.err.println(
                    "Usage: TmxLevelSplitter FILE\n" + "\tSplits a FILE into two files, one that contains\n"
                            + "\tthe level 1 segments (text only) and another that\n"
                            + "\tcontains the level 2 segments (with tags).\n"
                            + "\tOutput files are named FILE-text.EXT and FILE-tags.EXT.");
            System.exit(1);
        }

        a.split(argv[0]);
    }
}