List of usage examples for org.dom4j ElementHandler ElementHandler
ElementHandler
From source file:com.globalsight.everest.tm.util.trados.TradosHtmlTmxToGxml.java
License:Apache License
/** * Main method to call, returns the new filename of the result. *///from w w w. j a v a 2 s .c o m public String convertToGxml(String p_url) throws Exception { final String baseName = getBaseName(p_url); final String extension = getExtension(p_url); info("Converting TMX file to GXML: `" + p_url + "'"); startOutputFile(baseName); m_entryCount = 0; // Reading from a file, need to use Xerces. SAXReader reader = new SAXReader(); reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser"); reader.setEntityResolver(DtdResolver.getInstance()); reader.setValidation(true); // enable element complete notifications to conserve memory reader.addHandler("/tmx", new ElementHandler() { public void onStart(ElementPath path) { Element element = path.getCurrent(); m_version = element.attributeValue("version"); } public void onEnd(ElementPath path) { } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/header", new ElementHandler() { public void onStart(ElementPath path) { } public void onEnd(ElementPath path) { Element element = path.getCurrent(); setOldHeader(element); createNewHeader(); // prune the current element to reduce memory element.detach(); element = null; } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/body/tu", new ElementHandler() { public void onStart(ElementPath path) { ++m_entryCount; m_tuError = false; } public void onEnd(ElementPath path) { Element element = path.getCurrent(); if (m_tuError) { m_errorCount++; } else { writeEntry(element.asXML()); } // prune the current element to reduce memory element.detach(); element = null; if (m_entryCount % 1000 == 0) { debug("Entry " + m_entryCount); } } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/body/tu/tuv/seg", new ElementHandler() { public void onStart(ElementPath path) { } public void onEnd(ElementPath path) { Element element = path.getCurrent(); try { element = removeUtElements(element); String gxml = handleTuv(element.getText()); Document doc = parse("<root>" + gxml + "</root>"); // Remove old content of seg List content = element.content(); for (int i = content.size() - 1; i >= 0; --i) { ((Node) content.get(i)).detach(); } // Add new GXML content (backwards) content = doc.getRootElement().content(); Collections.reverse(content); for (int i = content.size() - 1; i >= 0; --i) { Node node = (Node) content.get(i); element.add(node.detach()); } } catch (Throwable ex) { m_tuError = true; } } }); Document document = reader.read(p_url); closeOutputFile(); info("Processed " + m_entryCount + " TUs into file `" + m_filename + "', " + m_errorCount + " errors."); return m_filename; }
From source file:com.globalsight.everest.tm.util.trados.TradosTmxToRtf.java
License:Apache License
/** * Main method to call, returns the new filename of the result. *//*w w w . j a v a 2 s. c o m*/ public String convertToRtf(String p_url) throws Exception { final String baseName = getBaseName(p_url); final String extension = getExtension(p_url); info("Converting TMX file to RTF: `" + p_url + "'"); startOutputFile(baseName); m_entryCount = 0; // Reading from a file, need to use Xerces. SAXReader reader = new SAXReader(); reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser"); reader.setEntityResolver(DtdResolver.getInstance()); reader.setValidation(true); // enable element complete notifications to conserve memory reader.addHandler("/tmx", new ElementHandler() { public void onStart(ElementPath path) { Element element = path.getCurrent(); m_version = element.attributeValue("version"); } public void onEnd(ElementPath path) { } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/header", new ElementHandler() { public void onStart(ElementPath path) { } public void onEnd(ElementPath path) { Element element = path.getCurrent(); setOldHeader(element); Element prop = (Element) element.selectSingleNode("/prop[@type='RTFFontTable']"); if (prop != null) writeEntry(prop.getText()); prop = (Element) element.selectSingleNode("/prop[@type='RTFStyleSheet']"); if (prop != null) writeEntry(prop.getText()); writeOtherRtfHeader(); writeDummyParagraph(); // prune the current element to reduce memory element.detach(); element = null; } }); // enable element complete notifications to conserve memory reader.addHandler("/tmx/body/tu", new ElementHandler() { public void onStart(ElementPath path) { ++m_entryCount; } public void onEnd(ElementPath path) { Element element = path.getCurrent(); element = removeUtElements(element); writeEntry(replaceUnicodeChars(removeRtfParagraphs(element.asXML()))); writeEntry("\\par"); // prune the current element to reduce memory element.detach(); element = null; if (m_entryCount % 1000 == 0) { debug("Entry " + m_entryCount); } } }); Document document = reader.read(p_url); closeOutputFile(); info("Processed " + m_entryCount + " TUs into file `" + m_filename + "'"); return m_filename; }
From source file:com.globalsight.everest.tm.util.ttx.TtxClean.java
License:Apache License
/** * Main method to call, returns the new filename of the result. *///from ww w. j av a 2 s .c o m public String cleanTtx(String p_url, boolean p_cleanTarget, String p_encoding) throws Exception { m_cleanTarget = p_cleanTarget; // File is called <file>.<ext>.<ttx> final String origName = getBaseName(p_url); final String baseName = getBaseName(origName); final String extension = getExtension(origName); info("Cleaning TTX file to " + (m_cleanTarget ? "target" : "source") + ": `" + p_url + "'"); m_entryCount = 0; // Reading from a file, need to use Xerces. SAXReader reader = new SAXReader(); reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser"); //reader.setEntityResolver(DtdResolver.getInstance()); //reader.setValidation(true); // Fetch the version info early. reader.addHandler("/TRADOStag", new ElementHandler() { public void onStart(ElementPath path) { Element element = path.getCurrent(); m_version = element.attributeValue(Ttx.VERSION); } public void onEnd(ElementPath path) { } }); // Fetch the header info early. reader.addHandler("/TRADOStag/FrontMatter", new ElementHandler() { public void onStart(ElementPath path) { } public void onEnd(ElementPath path) { Element element = path.getCurrent(); setOldHeader(element); } }); // Read in the entire file (it's not too big normally). Document document = reader.read(p_url); Element body = (Element) document.getRootElement().selectSingleNode("//Body/Raw"); // Remove <ut>, <df> and pull out one TUV. processBody(body); String content = getInnerText(body); String encoding; if (m_cleanTarget) { if (p_encoding != null) { encoding = p_encoding; } else { encoding = "UTF-8"; } } else { // reuse original encoding encoding = m_header.getOriginalEncoding(); } String locale; if (m_cleanTarget) { locale = m_header.getTargetLanguage(); } else { locale = m_header.getSourceLanguage(); } startOutputFile(baseName, locale, extension, encoding); writeEntry(content); closeOutputFile(); info("Result written to file `" + m_filename + "'."); return m_filename; }
From source file:com.globalsight.everest.tm.util.ttx.TtxToTmx.java
License:Apache License
/** * Main method to call, returns the new filename of the result. *///from ww w. j a v a 2s. c om public String convertTtxToTmx(String p_url) throws Exception { final String baseName = getBaseName(p_url); final String extension = getExtension(baseName); info("Converting TTX file to TMX: `" + p_url + "'"); m_entryCount = 0; // Reading from a file, need to use Xerces. SAXReader reader = new SAXReader(); reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser"); //reader.setEntityResolver(DtdResolver.getInstance()); //reader.setValidation(true); // Fetch the version info early. reader.addHandler("/TRADOStag", new ElementHandler() { public void onStart(ElementPath path) { Element element = path.getCurrent(); m_version = element.attributeValue(Ttx.VERSION); } public void onEnd(ElementPath path) { } }); // Fetch the header info early. reader.addHandler("/TRADOStag/FrontMatter", new ElementHandler() { public void onStart(ElementPath path) { } public void onEnd(ElementPath path) { Element element = path.getCurrent(); setTtxHeader(element); try { startOutputFile(baseName); } catch (Exception ex) { error(ex.toString()); System.exit(1); } // prune the current element to reduce memory element.detach(); element = null; } }); ElementHandler tuHandler = new ElementHandler() { public void onStart(ElementPath path) { ++m_entryCount; } public void onEnd(ElementPath path) { Element element = path.getCurrent(); element = cleanupTu(element); writeEntry(element.asXML()); // prune the current element to reduce memory element.detach(); element = null; if (m_entryCount % 50 == 0) { debug("Entry " + m_entryCount); } } }; // Path handlers cannot use "//", sooo specify all known paths. reader.addHandler("/TRADOStag/Body/Raw/Tu", tuHandler); reader.addHandler("/TRADOStag/Body/Raw/df/Tu", tuHandler); reader.addHandler("/TRADOStag/Body/Raw/ut/Tu", tuHandler); reader.addHandler("/TRADOStag/Body/Raw/df/ut/Tu", tuHandler); // Read in the entire file (it's not too big normally). Document document = reader.read(p_url); closeOutputFile(); info("Processed " + m_entryCount + " TUs into file `" + m_filename + "'"); return m_filename; }
From source file:com.globalsight.terminology.importer.GTXmlReader.java
License:Apache License
/** * Reads an XML file and checks for correctness. If there's any * error in the file, an exception is thrown. *///from ww w .j a v a2 s . co m private void analyzeXml(String p_url) throws Exception { SAXReader reader = new SAXReader(); reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser"); CATEGORY.debug("Analyzing document: " + p_url); // enable element complete notifications to conserve memory reader.addHandler("/entries/conceptGrp", new ElementHandler() { public void onStart(ElementPath path) { ++m_entryCount; } public void onEnd(ElementPath path) { Element element = path.getCurrent(); // prune the current element to reduce memory element.detach(); } }); ImportUtil.filterateXmlIllegal(p_url, m_options.getEncoding()); Document document = reader.read(p_url); // all done }
From source file:com.globalsight.terminology.importer.GTXmlReaderThread.java
License:Apache License
public void run() { try {/*from www. j a v a 2 s . c o m*/ SAXReader reader = new SAXReader(); reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser"); // enable pruning to call me back as each Element is complete reader.addHandler("/entries/conceptGrp", new ElementHandler() { public void onStart(ElementPath path) { } public void onEnd(ElementPath path) { Element element = path.getCurrent(); // prune the current element to reduce memory element.detach(); Document doc = m_factory.createDocument(element); Entry entry = new Entry(doc); m_result = m_results.hireResult(); m_result.setResultObject(entry); boolean done = m_results.put(m_result); m_result = null; // Stop reading the TMX file. if (done) { throw new ThreadDeath(); } } }); String url = m_options.getFileName(); reader.read(url); } catch (ThreadDeath ignore) { CATEGORY.info("ReaderThread: interrupted."); } catch (Throwable ignore) { // Should never happen, and I don't know how to handle // this case other than passing the exception in // m_results, which I won't do for now. } finally { if (m_result != null) { m_results.fireResult(m_result); m_result = null; } m_results.producerDone(); m_results = null; CATEGORY.debug("ReaderThread: done."); } }
From source file:com.globalsight.terminology.importer.MtfReader.java
License:Apache License
/** * Reads an XML file and checks for correctness. If there's any * error in the file, an exception is thrown. *//*from www. j av a2 s . c o m*/ private void analyzeXml(String p_url) throws Exception { SAXReader reader = new SAXReader(); reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser"); CATEGORY.debug("Analyzing document: " + p_url); // enable element complete notifications to conserve memory reader.addHandler("/mtf/conceptGrp", new ElementHandler() { public void onStart(ElementPath path) { ++m_entryCount; } public void onEnd(ElementPath path) { Element element = path.getCurrent(); // prune the current element to reduce memory element.detach(); // TODO: validate entry and report errors. } }); Document document = reader.read(p_url); // all done }
From source file:com.globalsight.terminology.importer.MtfReaderThread.java
License:Apache License
public void run() { try {//from www . jav a 2s . c o m SAXReader reader = new SAXReader(); reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser"); // enable pruning to call me back as each Element is complete reader.addHandler("/mtf/conceptGrp", new ElementHandler() { public void onStart(ElementPath path) { m_count++; } public void onEnd(ElementPath path) { Element element = path.getCurrent(); // prune the current element to reduce memory element.detach(); m_result = m_results.hireResult(); try { // Convert MultiTerm to GlobalSight. element = convertMtf(element); Document doc = m_factory.createDocument(element); Entry entry = new Entry(doc); if (CATEGORY.isDebugEnabled()) { CATEGORY.debug(entry.getXml()); } m_result.setResultObject(entry); } catch (Throwable ex) { String msg = "Entry " + m_count + ": " + ex.getMessage(); m_result.setError(msg); if (CATEGORY.isDebugEnabled()) { CATEGORY.debug(msg, ex); } else { CATEGORY.warn(msg, ex); } } boolean done = m_results.put(m_result); m_result = null; // Stop reading the XML file. if (done) { throw new ThreadDeath(); } } }); String url = m_options.getFileName(); Document document = reader.read(url); } catch (ThreadDeath ignore) { CATEGORY.info("ReaderThread: interrupted."); } catch (Throwable ignore) { // Should never happen, and I don't know how to handle // this case other than passing the exception in // m_results, which I won't do for now. CATEGORY.error("unexpected error", ignore); } finally { if (m_result != null) { m_results.fireResult(m_result); m_result = null; } m_results.producerDone(); m_results = null; CATEGORY.debug("ReaderThread: done."); } }
From source file:com.globalsight.terminology.importer.TbxReader.java
License:Apache License
private void analyzeTbx(String p_url) throws Exception { SAXReader reader = new SAXReader(); reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser"); CATEGORY.debug("Analyzing document: " + p_url); // enable element complete notifications to conserve memory reader.addHandler("/martif/text/body/termEntry", new ElementHandler() { public void onStart(ElementPath path) { ++m_entryCount;//w w w . j av a 2s .c o m } public void onEnd(ElementPath path) { Element element = path.getCurrent(); // prune the current element to reduce memory element.detach(); } }); Document document = reader.read(p_url); }
From source file:com.globalsight.terminology.importer.TbxReaderThread.java
License:Apache License
public void run() { try {// w ww. jav a2 s . c o m SAXReader reader = new SAXReader(); reader.setXMLReaderClassName("org.apache.xerces.parsers.SAXParser"); // enable pruning to call me back as each Element is complete reader.addHandler("/martif/text/body/termEntry", new ElementHandler() { public void onStart(ElementPath path) { } public void onEnd(ElementPath path) { Element element = path.getCurrent(); // prune the current element to reduce memory element.detach(); Document doc = m_factory.createDocument(element); Entry entry = new Entry(doc); m_result = m_results.hireResult(); m_result.setResultObject(entry); boolean done = m_results.put(m_result); m_result = null; // Stop reading the TMX file. if (done) { throw new ThreadDeath(); } } }); String url = m_options.getFileName(); Document document = reader.read(url); } catch (ThreadDeath ignore) { CATEGORY.info("ReaderThread: interrupted."); } catch (Throwable ignore) { // Should never happen, and I don't know how to handle // this case other than passing the exception in // m_results, which I won't do for now. } finally { if (m_result != null) { m_results.fireResult(m_result); m_result = null; } m_results.producerDone(); m_results = null; CATEGORY.debug("ReaderThread: done."); } }