Example usage for org.dom4j ElementHandler ElementHandler

List of usage examples for org.dom4j ElementHandler ElementHandler

Introduction

On this page you can find example usages of org.dom4j ElementHandler.

Prototype

ElementHandler

Source Link

Usage

From source file:com.globalsight.everest.tm.util.trados.TradosHtmlTmxToGxml.java

License:Apache License

/**
 * Converts the TMX file at the given URL to GXML and returns the name of
 * the file the result was written to.
 *
 * The input is read with dom4j element handlers so that the header and
 * each TU can be processed and detached as soon as they are complete,
 * instead of keeping the whole document tree in memory.
 *
 * @param p_url location of the TMX file to convert
 * @return the value of m_filename after the conversion
 * @throws Exception if the conversion fails, for example because the
 *         input cannot be read or parsed
 */
public String convertToGxml(String p_url) throws Exception {
    final String baseName = getBaseName(p_url);

    info("Converting TMX file to GXML: `" + p_url + "'");

    startOutputFile(baseName);

    try {
        // Both counters are reported for this run in the summary below,
        // so both start from zero.
        m_entryCount = 0;
        m_errorCount = 0;

        // Reading from a file, need to use Xerces.
        SAXReader reader = new SAXReader();
        reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser");
        reader.setEntityResolver(DtdResolver.getInstance());
        reader.setValidation(true);

        // Remember the TMX version as soon as the root element starts.
        reader.addHandler("/tmx", new ElementHandler() {
            public void onStart(ElementPath path) {
                Element element = path.getCurrent();

                m_version = element.attributeValue("version");
            }

            public void onEnd(ElementPath path) {
            }
        });

        // Replace the old header once it has been read completely.
        reader.addHandler("/tmx/header", new ElementHandler() {
            public void onStart(ElementPath path) {
            }

            public void onEnd(ElementPath path) {
                Element element = path.getCurrent();
                setOldHeader(element);
                createNewHeader();

                // prune the current element to reduce memory
                element.detach();
            }
        });

        // Write out each TU when it is complete, unless one of its
        // segments failed to convert.
        reader.addHandler("/tmx/body/tu", new ElementHandler() {
            public void onStart(ElementPath path) {
                ++m_entryCount;
                m_tuError = false;
            }

            public void onEnd(ElementPath path) {
                Element element = path.getCurrent();

                if (m_tuError) {
                    m_errorCount++;
                } else {
                    writeEntry(element.asXML());
                }

                // prune the current element to reduce memory
                element.detach();

                if (m_entryCount % 1000 == 0) {
                    debug("Entry " + m_entryCount);
                }
            }
        });

        // Rewrite the content of each segment in place.
        reader.addHandler("/tmx/body/tu/tuv/seg", new ElementHandler() {
            public void onStart(ElementPath path) {
            }

            public void onEnd(ElementPath path) {
                Element element = path.getCurrent();

                try {
                    element = removeUtElements(element);

                    String gxml = handleTuv(element.getText());
                    Document doc = parse("<root>" + gxml + "</root>");

                    // Remove old content of seg
                    List content = element.content();
                    for (int i = content.size() - 1; i >= 0; --i) {
                        ((Node) content.get(i)).detach();
                    }

                    // Add the new content. The list is reversed and then
                    // walked from its end, which appears to append the
                    // nodes in their original order.
                    content = doc.getRootElement().content();
                    Collections.reverse(content);
                    for (int i = content.size() - 1; i >= 0; --i) {
                        Node node = (Node) content.get(i);
                        element.add(node.detach());
                    }
                } catch (Throwable ex) {
                    // Any failure flags the enclosing TU; the TU handler
                    // then counts it as an error instead of writing it.
                    m_tuError = true;
                }
            }
        });

        reader.read(p_url);
    } finally {
        // Close the output file even when reading or parsing fails.
        closeOutputFile();
    }

    info("Processed " + m_entryCount + " TUs into file `" + m_filename + "', " + m_errorCount + " errors.");

    return m_filename;
}

From source file:com.globalsight.everest.tm.util.trados.TradosTmxToRtf.java

License:Apache License

/**
 * Converts the TMX file at the given URL to RTF and returns the name of
 * the file the result was written to.
 *
 * The input is read with dom4j element handlers so that the header and
 * each TU can be written out and detached as soon as they are complete.
 *
 * @param p_url location of the TMX file to convert
 * @return the value of m_filename after the conversion
 * @throws Exception if the conversion fails, for example because the
 *         input cannot be read or parsed
 */
public String convertToRtf(String p_url) throws Exception {
    final String baseName = getBaseName(p_url);

    info("Converting TMX file to RTF: `" + p_url + "'");

    startOutputFile(baseName);

    try {
        m_entryCount = 0;

        // Reading from a file, need to use Xerces.
        SAXReader reader = new SAXReader();
        reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser");
        reader.setEntityResolver(DtdResolver.getInstance());
        reader.setValidation(true);

        // Remember the TMX version as soon as the root element starts.
        reader.addHandler("/tmx", new ElementHandler() {
            public void onStart(ElementPath path) {
                Element element = path.getCurrent();

                m_version = element.attributeValue("version");
            }

            public void onEnd(ElementPath path) {
            }
        });

        // Emit the RTF preamble once the TMX header has been read.
        reader.addHandler("/tmx/header", new ElementHandler() {
            public void onStart(ElementPath path) {
            }

            public void onEnd(ElementPath path) {
                Element element = path.getCurrent();
                setOldHeader(element);

                // The paths must be relative to the header element. An
                // absolute "/prop" is resolved from the document root,
                // whose only child is <tmx>, so it never matches.
                Element prop = (Element) element.selectSingleNode("prop[@type='RTFFontTable']");

                if (prop != null) {
                    writeEntry(prop.getText());
                }

                prop = (Element) element.selectSingleNode("prop[@type='RTFStyleSheet']");

                if (prop != null) {
                    writeEntry(prop.getText());
                }

                writeOtherRtfHeader();

                writeDummyParagraph();

                // prune the current element to reduce memory
                element.detach();
            }
        });

        // Write out each TU as one RTF paragraph when it is complete.
        reader.addHandler("/tmx/body/tu", new ElementHandler() {
            public void onStart(ElementPath path) {
                ++m_entryCount;
            }

            public void onEnd(ElementPath path) {
                Element element = path.getCurrent();

                element = removeUtElements(element);

                writeEntry(replaceUnicodeChars(removeRtfParagraphs(element.asXML())));
                writeEntry("\\par");

                // prune the current element to reduce memory
                element.detach();

                if (m_entryCount % 1000 == 0) {
                    debug("Entry " + m_entryCount);
                }
            }
        });

        reader.read(p_url);
    } finally {
        // Close the output file even when reading or parsing fails.
        closeOutputFile();
    }

    info("Processed " + m_entryCount + " TUs into file `" + m_filename + "'");

    return m_filename;
}

From source file:com.globalsight.everest.tm.util.ttx.TtxClean.java

License:Apache License

/**
 * Cleans the TTX file at the given URL to either its source or its target
 * text and returns the name of the file the result was written to.
 *
 * @param p_url location of the TTX file to clean
 * @param p_cleanTarget true to produce the target text, false for the source
 * @param p_encoding encoding for the output when cleaning to target; when
 *        null, "UTF-8" is used. Not used when cleaning to source, where
 *        the encoding comes from the header.
 * @return the value of m_filename after the result has been written
 * @throws Exception if cleaning fails, for example because the input
 *         cannot be read or parsed
 */
public String cleanTtx(String p_url, boolean p_cleanTarget, String p_encoding) throws Exception {
    m_cleanTarget = p_cleanTarget;

    // File is called <file>.<ext>.<ttx>
    final String origName = getBaseName(p_url);
    final String baseName = getBaseName(origName);
    final String extension = getExtension(origName);

    final String direction = m_cleanTarget ? "target" : "source";
    info("Cleaning TTX file to " + direction + ": `" + p_url + "'");

    m_entryCount = 0;

    // Reading from a file, need to use Xerces.
    // Entity resolver and validation are left disabled for TTX input.
    SAXReader ttxReader = new SAXReader();
    ttxReader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser");

    // Fetch the version info early.
    ElementHandler versionHandler = new ElementHandler() {
        public void onStart(ElementPath p_path) {
            m_version = p_path.getCurrent().attributeValue(Ttx.VERSION);
        }

        public void onEnd(ElementPath p_path) {
        }
    };
    ttxReader.addHandler("/TRADOStag", versionHandler);

    // Fetch the header info early.
    ElementHandler headerHandler = new ElementHandler() {
        public void onStart(ElementPath p_path) {
        }

        public void onEnd(ElementPath p_path) {
            setOldHeader(p_path.getCurrent());
        }
    };
    ttxReader.addHandler("/TRADOStag/FrontMatter", headerHandler);

    // Read in the entire file (it's not too big normally).
    Document ttxDocument = ttxReader.read(p_url);

    Element rawBody = (Element) ttxDocument.getRootElement().selectSingleNode("//Body/Raw");

    // Remove <ut>, <df> and pull out one TUV.
    processBody(rawBody);

    String cleanedText = getInnerText(rawBody);

    final String outputEncoding;
    final String outputLocale;

    if (m_cleanTarget) {
        // Target output: caller-supplied encoding, UTF-8 if none given.
        outputEncoding = (p_encoding == null) ? "UTF-8" : p_encoding;
        outputLocale = m_header.getTargetLanguage();
    } else {
        // Source output: reuse original encoding.
        outputEncoding = m_header.getOriginalEncoding();
        outputLocale = m_header.getSourceLanguage();
    }

    startOutputFile(baseName, outputLocale, extension, outputEncoding);
    writeEntry(cleanedText);
    closeOutputFile();

    info("Result written to file `" + m_filename + "'.");

    return m_filename;
}

From source file:com.globalsight.everest.tm.util.ttx.TtxToTmx.java

License:Apache License

/**
 * Converts the TTX file at the given URL to TMX and returns the name of
 * the file the result was written to.
 *
 * The output file is started from inside the FrontMatter handler, after
 * the header has been passed to setTtxHeader. Each Tu element is then
 * cleaned up, written and detached as soon as it is complete.
 *
 * @param p_url location of the TTX file to convert
 * @return the value of m_filename after the conversion
 * @throws Exception if the conversion fails, for example because the
 *         input cannot be read or parsed, or the output file cannot be
 *         started
 */
public String convertTtxToTmx(String p_url) throws Exception {
    final String baseName = getBaseName(p_url);

    info("Converting TTX file to TMX: `" + p_url + "'");

    m_entryCount = 0;

    // Reading from a file, need to use Xerces.
    // Entity resolver and validation are left disabled for TTX input.
    SAXReader reader = new SAXReader();
    reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser");

    // Fetch the version info early.
    reader.addHandler("/TRADOStag", new ElementHandler() {
        public void onStart(ElementPath path) {
            Element element = path.getCurrent();

            m_version = element.attributeValue(Ttx.VERSION);
        }

        public void onEnd(ElementPath path) {
        }
    });

    // Fetch the header info early.
    reader.addHandler("/TRADOStag/FrontMatter", new ElementHandler() {
        public void onStart(ElementPath path) {
        }

        public void onEnd(ElementPath path) {
            Element element = path.getCurrent();

            setTtxHeader(element);

            try {
                startOutputFile(baseName);
            } catch (Exception ex) {
                error(ex.toString());

                // Abort the parse with an exception rather than calling
                // System.exit(), which would terminate the whole JVM.
                // The handler interface declares no checked exceptions,
                // hence the unchecked wrapper; it surfaces to the caller
                // through reader.read() below.
                throw new IllegalStateException("Cannot start output file for `" + baseName + "'", ex);
            }

            // prune the current element to reduce memory
            element.detach();
        }
    });

    // One handler instance is shared by every path a Tu can appear under.
    ElementHandler tuHandler = new ElementHandler() {
        public void onStart(ElementPath path) {
            ++m_entryCount;
        }

        public void onEnd(ElementPath path) {
            Element element = path.getCurrent();

            element = cleanupTu(element);

            writeEntry(element.asXML());

            // prune the current element to reduce memory
            element.detach();

            if (m_entryCount % 50 == 0) {
                debug("Entry " + m_entryCount);
            }
        }
    };

    // Path handlers cannot use "//", so specify all known paths.
    reader.addHandler("/TRADOStag/Body/Raw/Tu", tuHandler);
    reader.addHandler("/TRADOStag/Body/Raw/df/Tu", tuHandler);
    reader.addHandler("/TRADOStag/Body/Raw/ut/Tu", tuHandler);
    reader.addHandler("/TRADOStag/Body/Raw/df/ut/Tu", tuHandler);

    // Read in the entire file (it's not too big normally).
    reader.read(p_url);

    closeOutputFile();

    info("Processed " + m_entryCount + " TUs into file `" + m_filename + "'");

    return m_filename;
}

From source file:com.globalsight.terminology.importer.GTXmlReader.java

License:Apache License

/**
 * Parses the XML file at the given URL, counting its /entries/conceptGrp
 * elements in m_entryCount. Any error raised while filtering or parsing
 * the file is passed on to the caller.
 *
 * @param p_url location of the XML file to analyze
 * @throws Exception if the file cannot be filtered, read or parsed
 */
private void analyzeXml(String p_url) throws Exception {
    SAXReader saxReader = new SAXReader();
    saxReader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser");

    CATEGORY.debug("Analyzing document: " + p_url);

    // Count every concept group as it starts and drop it from the tree
    // once it is complete, to conserve memory.
    ElementHandler conceptGrpCounter = new ElementHandler() {
        public void onStart(ElementPath p_path) {
            m_entryCount++;
        }

        public void onEnd(ElementPath p_path) {
            p_path.getCurrent().detach();
        }
    };
    saxReader.addHandler("/entries/conceptGrp", conceptGrpCounter);

    ImportUtil.filterateXmlIllegal(p_url, m_options.getEncoding());

    // Only the handler's side effects matter; the document is not kept.
    saxReader.read(p_url);
}

From source file:com.globalsight.terminology.importer.GTXmlReaderThread.java

License:Apache License

/**
 * Reads the file named by m_options and puts one Entry per
 * /entries/conceptGrp element into m_results. Reading stops early when
 * m_results.put() returns true. The producer is always marked done at
 * the end, whether reading succeeded or not.
 */
public void run() {
    try {
        SAXReader reader = new SAXReader();
        reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser");

        // enable pruning to call me back as each Element is complete
        reader.addHandler("/entries/conceptGrp", new ElementHandler() {
            public void onStart(ElementPath path) {
            }

            public void onEnd(ElementPath path) {
                Element element = path.getCurrent();

                // prune the current element to reduce memory
                element.detach();

                Document doc = m_factory.createDocument(element);
                Entry entry = new Entry(doc);

                m_result = m_results.hireResult();
                m_result.setResultObject(entry);

                boolean done = m_results.put(m_result);
                m_result = null;

                // Stop reading the XML file: ThreadDeath unwinds out of
                // the parser and is caught below.
                if (done) {
                    throw new ThreadDeath();
                }
            }
        });

        String url = m_options.getFileName();

        reader.read(url);
    } catch (ThreadDeath ignore) {
        CATEGORY.info("ReaderThread: interrupted.");
    } catch (Throwable ex) {
        // Should never happen. The error is still not passed on through
        // m_results, but log it so that a failed read does not go
        // completely unnoticed.
        CATEGORY.error("unexpected error", ex);
    } finally {
        // Hand back a result that was hired but never put.
        if (m_result != null) {
            m_results.fireResult(m_result);
            m_result = null;
        }

        m_results.producerDone();
        m_results = null;

        CATEGORY.debug("ReaderThread: done.");
    }
}

From source file:com.globalsight.terminology.importer.MtfReader.java

License:Apache License

/**
 * Parses the XML file at the given URL, counting its /mtf/conceptGrp
 * elements in m_entryCount. Any error raised while reading or parsing
 * the file is passed on to the caller.
 *
 * @param p_url location of the XML file to analyze
 * @throws Exception if the file cannot be read or parsed
 */
private void analyzeXml(String p_url) throws Exception {
    SAXReader saxReader = new SAXReader();
    saxReader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser");

    CATEGORY.debug("Analyzing document: " + p_url);

    // Count every concept group as it starts and drop it from the tree
    // once it is complete, to conserve memory.
    ElementHandler conceptGrpCounter = new ElementHandler() {
        public void onStart(ElementPath p_path) {
            m_entryCount++;
        }

        public void onEnd(ElementPath p_path) {
            p_path.getCurrent().detach();

            // TODO: validate entry and report errors.
        }
    };
    saxReader.addHandler("/mtf/conceptGrp", conceptGrpCounter);

    // Only the handler's side effects matter; the document is not kept.
    saxReader.read(p_url);
}

From source file:com.globalsight.terminology.importer.MtfReaderThread.java

License:Apache License

/**
 * Reads the file named by m_options and puts one result per
 * /mtf/conceptGrp element into m_results: an Entry when convertMtf and
 * document creation succeed, an error message otherwise. Reading stops
 * early when m_results.put() returns true. The producer is always marked
 * done at the end, whether reading succeeded or not.
 */
public void run() {
    try {
        SAXReader saxReader = new SAXReader();
        saxReader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser");

        // Called back as each concept group starts and completes.
        ElementHandler conceptGrpHandler = new ElementHandler() {
            public void onStart(ElementPath p_path) {
                m_count++;
            }

            public void onEnd(ElementPath p_path) {
                Element conceptGrp = p_path.getCurrent();

                // prune the current element to reduce memory
                conceptGrp.detach();

                m_result = m_results.hireResult();

                try {
                    // Convert MultiTerm to GlobalSight.
                    Element converted = convertMtf(conceptGrp);

                    Document entryDoc = m_factory.createDocument(converted);
                    Entry entry = new Entry(entryDoc);

                    if (CATEGORY.isDebugEnabled()) {
                        CATEGORY.debug(entry.getXml());
                    }

                    m_result.setResultObject(entry);
                } catch (Throwable failure) {
                    // A bad entry becomes an error result; reading goes on.
                    String message = "Entry " + m_count + ": " + failure.getMessage();

                    m_result.setError(message);

                    if (!CATEGORY.isDebugEnabled()) {
                        CATEGORY.warn(message, failure);
                    } else {
                        CATEGORY.debug(message, failure);
                    }
                }

                boolean stopRequested = m_results.put(m_result);
                m_result = null;

                // Stop reading the XML file.
                if (stopRequested) {
                    throw new ThreadDeath();
                }
            }
        };
        saxReader.addHandler("/mtf/conceptGrp", conceptGrpHandler);

        String fileName = m_options.getFileName();

        saxReader.read(fileName);
    } catch (ThreadDeath stopped) {
        CATEGORY.info("ReaderThread: interrupted.");
    } catch (Throwable unexpected) {
        // Should never happen; the error is logged but not passed on
        // through m_results.
        CATEGORY.error("unexpected error", unexpected);
    } finally {
        // Hand back a result that was hired but never put.
        if (m_result != null) {
            m_results.fireResult(m_result);
            m_result = null;
        }

        m_results.producerDone();
        m_results = null;

        CATEGORY.debug("ReaderThread: done.");
    }
}

From source file:com.globalsight.terminology.importer.TbxReader.java

License:Apache License

/**
 * Parses the TBX file at the given URL, counting its
 * /martif/text/body/termEntry elements in m_entryCount. Any error raised
 * while reading or parsing the file is passed on to the caller.
 *
 * @param p_url location of the TBX file to analyze
 * @throws Exception if the file cannot be read or parsed
 */
private void analyzeTbx(String p_url) throws Exception {
    SAXReader saxReader = new SAXReader();
    saxReader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser");

    CATEGORY.debug("Analyzing document: " + p_url);

    // Count every term entry as it starts and drop it from the tree once
    // it is complete, to conserve memory.
    ElementHandler termEntryCounter = new ElementHandler() {
        public void onStart(ElementPath p_path) {
            m_entryCount++;
        }

        public void onEnd(ElementPath p_path) {
            p_path.getCurrent().detach();
        }
    };
    saxReader.addHandler("/martif/text/body/termEntry", termEntryCounter);

    // Only the handler's side effects matter; the document is not kept.
    saxReader.read(p_url);
}

From source file:com.globalsight.terminology.importer.TbxReaderThread.java

License:Apache License

/**
 * Reads the file named by m_options and puts one Entry per
 * /martif/text/body/termEntry element into m_results. Reading stops early
 * when m_results.put() returns true. The producer is always marked done
 * at the end, whether reading succeeded or not.
 */
public void run() {
    try {
        SAXReader reader = new SAXReader();
        reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser");

        // enable pruning to call me back as each Element is complete
        reader.addHandler("/martif/text/body/termEntry", new ElementHandler() {
            public void onStart(ElementPath path) {
            }

            public void onEnd(ElementPath path) {
                Element element = path.getCurrent();

                // prune the current element to reduce memory
                element.detach();

                Document doc = m_factory.createDocument(element);
                Entry entry = new Entry(doc);

                m_result = m_results.hireResult();
                m_result.setResultObject(entry);

                boolean done = m_results.put(m_result);
                m_result = null;

                // Stop reading the TBX file: ThreadDeath unwinds out of
                // the parser and is caught below.
                if (done) {
                    throw new ThreadDeath();
                }
            }
        });

        String url = m_options.getFileName();

        reader.read(url);
    } catch (ThreadDeath ignore) {
        CATEGORY.info("ReaderThread: interrupted.");
    } catch (Throwable ex) {
        // Should never happen. The error is still not passed on through
        // m_results, but log it so that a failed read does not go
        // completely unnoticed.
        CATEGORY.error("unexpected error", ex);
    } finally {
        // Hand back a result that was hired but never put.
        if (m_result != null) {
            m_results.fireResult(m_result);
            m_result = null;
        }

        m_results.producerDone();
        m_results = null;

        CATEGORY.debug("ReaderThread: done.");
    }
}