Java tutorial
package revaligner.service; import com.aspose.cells.Cell; import com.aspose.cells.Cells; import com.aspose.cells.Workbook; import com.aspose.cells.Worksheet; import com.aspose.cells.WorksheetCollection; import com.aspose.words.Comment; import com.aspose.words.CommentRangeEnd; import com.aspose.words.CommentRangeStart; import com.aspose.words.CompositeNode; import com.aspose.words.ControlChar; import com.aspose.words.FieldStart; import com.aspose.words.FindReplaceOptions; import com.aspose.words.Font; import com.aspose.words.Footnote; import com.aspose.words.NodeCollection; import com.aspose.words.NodeType; import com.aspose.words.Paragraph; import com.aspose.words.Range; import com.aspose.words.Revision; import com.aspose.words.RevisionCollection; import com.aspose.words.Row; import com.aspose.words.Run; import com.aspose.words.Section; import com.aspose.words.SectionCollection; import com.aspose.words.Shape; import com.aspose.words.Table; import com.aspose.words.Underline; import java.awt.Color; import java.io.BufferedOutputStream; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.FileReader; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.PrintStream; import java.io.Serializable; import java.io.Writer; import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Collection; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.TreeMap; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; import org.apache.commons.configuration.BaseConfiguration; import org.apache.commons.configuration.Configuration; import org.apache.commons.io.FileUtils; import org.apache.commons.lang.ArrayUtils; import org.dom4j.Attribute; import org.dom4j.DocumentHelper; import org.dom4j.Element; import org.gs4tr.filters3.extraction.ExtractionSupportImpl; import org.gs4tr.filters3.msoffice.ConvertDOC; import org.gs4tr.filters3.msoffice.MergeDOC; import org.gs4tr.filters3.msoffice.common.AsposeFactory; import org.gs4tr.filters3.msoffice.common.OfficeConfigurationConverterImpl; import org.gs4tr.filters3.msoffice.word.WordDocumentAligner; import org.gs4tr.foundation.locale.Locale; import org.gs4tr.foundation3.xliff.model.XliffDocument; import org.gs4tr.foundation3.xml.XmlParser; import org.gs4tr.tm3.commandline.utils.XliffSegmenter; import org.gs4tr.tm3.segmenter.Segmenter; import org.gs4tr.tm3.segmenter.SegmenterFactory; import org.gs4tr.tm3.wordcounter.strategy.TradosWordCounter; import org.json.JSONArray; import org.jsoup.Jsoup; import org.jsoup.select.Elements; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.NodeList; import org.xmlunit.builder.DiffBuilder; import org.xmlunit.diff.Diff; public class FileAligner implements Serializable { private String sourcefile; private String targetfile; private String alignedfile; private String auto_saved_alignedfile; private String backupsourcefile; private String comparedoc; private String reformattedsourcefile; private String reformattedtargetfile; private String sourcetxlf_nonSeg; private String sourcetxlf_seg; private String reformattedsourcetxlf_nonSeg; private String reformattedsourcetxlf_seg; private String populatedsourcetxlf; private String reformattedtargettxlf_nonSeg; private String reformattedtargettxlf_seg; private String reformattedtargetmapfile; private String targettxlf_nonSeg; private String targettxlf_seg; private String translatedtxlf; private String aligneddoc; private String htmlreportfortranslation; private String htmlreportfortranslation_temp; private String htmlreportforfinalreview; private String prjfolder; private String sourcelanguage; private String targetlanguage; private String creatorid; private String exportfolder; private String prjinfofile; private String tempfolder; private String nbalignerfolder; private boolean isSectionBreakDeletedORInserted = false; private boolean stopexcution = false; private LinkedHashMap<String, List<String>> src_paras; private LinkedHashMap<String, List<String>> trg_paras; private LinkedHashMap<String, List<String>> missing_trg_paras; private LinkedHashMap<String, List<String>> src_segs; private LinkedHashMap<String, List<String>> trg_segs; private LinkedHashMap<String, String> locked_para_seqs; private LinkedHashMap<String, String> aligned_para_seqs; private int nullcnt = 0; private String estimateNBAlignerCompTime = ""; private static HashMap<String, Integer> alignProgress = new HashMap(); private LinkedHashMap<Integer, List> txlftrgsegmap; private LinkedHashMap<Integer, boolean[]> txlftrgsewsmap; private LinkedHashMap<String, List<String>> missing_trg_segs; private LinkedHashMap<String, String> locked_seg_seqs; private LinkedHashMap<String, String> review_seg_seqs; private LinkedHashMap<String, String> ignore_seg_seqs; private int[] replacestyles = new int[3]; private static int doublestrikethrough = 99; private static int needreviewthreshhold = 75; private static double textexpansionthreshold = 2.0D; private String errorreason = ""; private char[] esps = { '.', '?', '!' }; public FileAligner() { applyAsposeLicense(); } private static void applyAsposeLicense() { System.err.println("applying Aspose license.."); com.aspose.words.License license_wordlicense = new com.aspose.words.License(); InputStream license_word = FileAligner.class.getClassLoader().getResourceAsStream("aspose.license"); try { license_wordlicense.setLicense(license_word); } catch (Exception e) { e.printStackTrace(); } com.aspose.cells.License license_excellicense = new com.aspose.cells.License(); InputStream license_excel = FileAligner.class.getClassLoader().getResourceAsStream("aspose.license"); try { license_excellicense.setLicense(license_excel); } catch (Exception e) { e.printStackTrace(); } } public HashMap<String, Integer> getAlignProgress() { return alignProgress; } public void removeAlignProgress(String prjid) { if (alignProgress.containsKey(prjid)) { alignProgress.remove(prjid); } } public void setAlignProgress(String prjid, int alignProgress) { this.alignProgress.put(prjid, Integer.valueOf(alignProgress)); } public void setnullcnt(int cnt) { this.nullcnt = cnt; } public int getnullcnt() { return this.nullcnt; } public void setStopExcution(boolean stop) { this.stopexcution = stop; } public boolean getStopExcution() { return this.stopexcution; } public void setSourceFile(String sourcefile) { this.sourcefile = sourcefile; } public void setBackUpSourceFile(String backupsourcefile) { this.backupsourcefile = backupsourcefile; } public void setTargetFile(String targetfile) { this.targetfile = targetfile; } public void setAlignedFile(String alignedfile) { this.alignedfile = alignedfile; } public void setAutoSavedAlignedFile(String autosavedalignedfile) { this.auto_saved_alignedfile = autosavedalignedfile; } public void replaceAlignedFile(String newalignedfile) throws Exception { new File(this.alignedfile).delete(); FileUtils.copyFileToDirectory(new File(newalignedfile), new File(this.prjfolder)); } public void setReformattedSourceFile(String reformattedsourcefile) { this.reformattedsourcefile = reformattedsourcefile; } public void setReformattedTargetFile(String reformattedtargetfile) { this.reformattedtargetfile = reformattedtargetfile; } public void setReformattedTargetMapFile(String reformattedtargetmapfile) { this.reformattedtargetmapfile = reformattedtargetmapfile; } public void setSourceTxlfSeg(String sourcetxlfseg) { this.sourcetxlf_seg = sourcetxlfseg; } public void setSourceTxlfNonSeg(String sourcetxlfnonseg) { this.sourcetxlf_nonSeg = sourcetxlfnonseg; } public void setReformattedSourceTxlf_NonSeg(String reformattedsourcetxlf_nonSeg) { this.reformattedsourcetxlf_nonSeg = reformattedsourcetxlf_nonSeg; } public void setReformattedSourceTxlf_Seg(String reformattedsourcetxlf_seg) { this.reformattedsourcetxlf_seg = reformattedsourcetxlf_seg; } public void setReformattedTargetTxlf_NonSeg(String reformattedtargetetxlf_nonseg) { this.reformattedtargettxlf_nonSeg = reformattedtargetetxlf_nonseg; } public void setReformattedTargetTxlf_Seg(String reformattedtargetetxlf_seg) { this.reformattedtargettxlf_seg = reformattedtargetetxlf_seg; } public void setPopulatedSourceTxlf(String populatedsourcetxlf) { this.populatedsourcetxlf = populatedsourcetxlf; } public void setHtmlReportForTranslation(String htmlreportfortranslation) { this.htmlreportfortranslation = htmlreportfortranslation; } public void setTempHtmlReportForTranslation(String temphtmlreportfortranslation) { this.htmlreportfortranslation_temp = temphtmlreportfortranslation; } public void setTranslatedTxlf(String translatedtxlf) { this.translatedtxlf = translatedtxlf; } public void setPrjFolder(String prjfolder) { this.prjfolder = prjfolder; } public void setPrjInfoFile(String prjinfofile) { this.prjinfofile = prjinfofile; } public String getPrjInfoFile() { return this.prjinfofile; } public void setSourceLanguage(String sourcelanguage) { this.sourcelanguage = sourcelanguage; } public void setTargetLanguage(String targetlanguage) { this.targetlanguage = targetlanguage; } public void setCreatorID(String creatorid) { this.creatorid = creatorid; } public void setErrorReason(String errorreason) { this.errorreason = errorreason; } public LinkedHashMap<String, List<String>> getAlignedSrcParas() { return this.src_paras; } public LinkedHashMap<String, List<String>> getAlignedTrgParas() { return this.trg_paras; } public LinkedHashMap<String, List<String>> getMissingTrgParas() { return this.missing_trg_paras; } public LinkedHashMap<String, List<String>> getMissingTrgSegs() { return this.missing_trg_segs; } public LinkedHashMap<String, List<String>> getSrcSegs() { return this.src_segs; } public LinkedHashMap<String, List<String>> getTrgSegs() { return this.trg_segs; } public LinkedHashMap<String, String> getLockedParaSeqs() { return this.locked_para_seqs; } public LinkedHashMap<String, String> getLockedSegSeqs() { return this.locked_seg_seqs; } public LinkedHashMap<String, String> getReviewSegSeqs() { return this.review_seg_seqs; } public LinkedHashMap<String, String> getIgnoreSegSeqs() { return this.ignore_seg_seqs; } public LinkedHashMap<String, String> getAligneddParaSeqs() { return this.aligned_para_seqs; } public String getErrorReason() { return this.errorreason; } public void createReformattedDocument(String aligntype) throws Exception { this.errorreason = "Unknown error"; com.aspose.words.Document doc_src = new com.aspose.words.Document(this.sourcefile); File backupsourcefolder = new File(this.prjfolder + "/source_backup"); if (!backupsourcefolder.exists()) { backupsourcefolder.mkdirs(); } this.backupsourcefile = (backupsourcefolder.getAbsolutePath() + File.separator + new File(this.sourcefile).getName()); doc_src.save(this.backupsourcefile); if (!isReplaceStyleAvailable(doc_src)) { this.errorreason = "Cannot reformat source file, replace styles not available"; throw new Exception("cannot reformat source file, replace styles not available"); } System.out.println( "\"underline\" replaced by \"" + Underline.getName(this.replacestyles[0]) + "\" underline"); System.out.println("\"strikethrough\" replaced by \"" + (this.replacestyles[1] == 99 ? "double strikethrough" : Underline.getName(this.replacestyles[1])) + "\""); System.out.println("\"underline + strikethrough\" replaced by \"" + (this.replacestyles[1] == 99 ? Underline.getName(this.replacestyles[0]) + " + double strikethrough" : Underline.getName(this.replacestyles[2])) + "\""); for (int i = 0; i < doc_src.getRevisions().getCount(); i++) { Revision rev = doc_src.getRevisions().get(i); if (((rev.getRevisionType() == 1) || (rev.getRevisionType() == 0) || (rev.getRevisionType() == 4)) && (isTrackedHidden(rev))) { this.errorreason = "There are hidden tracked text in the document"; throw new Exception("there are hidden tracked text in the document"); } if ((rev.getRevisionType() == 2) && (rev.getParentNode().getNodeType() == 21)) { Run run = (Run) rev.getParentNode(); if ((run.isDeleteRevision()) || (run.isInsertRevision())) { Run clone = (Run) run.deepClone(true); rev.accept(); i--; run.setInsertRevision(clone.getInsertRevision()); run.setDeleteRevision(clone.getDeleteRevision()); } } } if (this.replacestyles[1] == doublestrikethrough) { for (int t = 0; t < doc_src.getChildNodes(21, true).getCount(); t++) { Run run = (Run) doc_src.getChildNodes(21, true).get(t); if ((run.getFont().getUnderline() == 1) && (run.getFont().getStrikeThrough() == true)) { run.getFont().setStrikeThrough(false); run.getFont().setUnderline(0); run.getFont().setDoubleStrikeThrough(true); run.getFont().setUnderline(this.replacestyles[0]); } else if (run.getFont().getUnderline() == 1) { run.getFont().setUnderline(0); run.getFont().setUnderline(this.replacestyles[0]); } else if (run.getFont().getStrikeThrough() == true) { run.getFont().setStrikeThrough(false); run.getFont().setDoubleStrikeThrough(true); } } } else { for (int t = 0; t < doc_src.getChildNodes(21, true).getCount(); t++) { Run run = (Run) doc_src.getChildNodes(21, true).get(t); if ((run.getFont().getUnderline() == 1) && (run.getFont().getStrikeThrough() == true)) { run.getFont().setStrikeThrough(false); run.getFont().setUnderline(0); run.getFont().setUnderline(this.replacestyles[2]); } else if (run.getFont().getUnderline() == 1) { run.getFont().setUnderline(0); run.getFont().setUnderline(this.replacestyles[0]); } else if (run.getFont().getStrikeThrough() == true) { run.getFont().setStrikeThrough(false); run.getFont().setUnderline(this.replacestyles[1]); } } } System.out.println("normalizing whitespace after ESMs..."); normalizeSpaceAfterESM(doc_src, Locale.makeLocale(this.sourcelanguage)); if (getStopExcution()) { throw new Exception("job cancelled."); } System.out.println("deleting comments and shape alt text in source..."); DeleteCommentsAndShapeAltText(doc_src); if (getStopExcution()) { throw new Exception("job cancelled."); } System.out.println("rmeove hyperlinks in source..."); if (getStopExcution()) { throw new Exception("job cancelled."); } System.out.println("merging sections in source..."); mergeSectionsInDocuemnt(doc_src); if (getStopExcution()) { throw new Exception("job cancelled."); } System.out.println("converting move tracks to insertion/deletion..."); convertMoveToIDTracks(doc_src); if (getStopExcution()) { throw new Exception("job cancelled."); } doc_src.save(this.sourcefile); System.out.println("rmeove hyperlinks in source..."); System.out.println("merging paragraphs in source..."); mergeRevision(doc_src); mergeParagraphsInDocuemnt(doc_src); System.out.println("creating fake tracks..."); createFakeTrackChanges(doc_src); File reformattedsourcefolder = new File(this.prjfolder + "/source_reformatted"); if (!reformattedsourcefolder.exists()) { reformattedsourcefolder.mkdirs(); } this.reformattedsourcefile = (reformattedsourcefolder.getAbsolutePath() + File.separator + new File(this.sourcefile).getName()); if (new File(this.reformattedsourcefile).exists()) { new File(this.reformattedsourcefile).delete(); } System.out.println("saving source..."); doc_src.save(this.reformattedsourcefile); File reformattedtargetfolder = new File(this.prjfolder + "/target_reformatted"); if (!reformattedtargetfolder.exists()) { reformattedtargetfolder.mkdirs(); } this.reformattedtargetfile = (reformattedtargetfolder.getAbsolutePath() + File.separator + new File(this.targetfile).getName()); if (new File(this.reformattedtargetfile).exists()) { new File(this.reformattedtargetfile).delete(); } com.aspose.words.Document doc_trg = new com.aspose.words.Document(this.targetfile); System.out.println("reformatting target file..."); if (aligntype.equals("sequential")) { reformatTargetFile(doc_trg); } else if (aligntype.equals("auto")) { reformatTargetFile_auto(doc_trg); } doc_trg.save(this.reformattedtargetfile); } private void reformatTargetFile(com.aspose.words.Document doc_trg) throws Exception { ExtractionSupportImpl extractionSupportImpl = new ExtractionSupportImpl( Locale.makeLocale(this.sourcelanguage), Locale.makeLocale(this.targetlanguage)); Configuration config = new BaseConfiguration(); config.setProperty("extraction.tokens.extract", "all"); extractionSupportImpl.setConfiguration(config); normalizeSpaceAfterESM(doc_trg, Locale.makeLocale(this.targetlanguage)); DeleteCommentsAndShapeAltText(doc_trg); convertMoveToIDTracks(doc_trg); while (doc_trg.getRevisions().getCount() != 0) { Revision rev = doc_trg.getRevisions().get(0); rev.reject(); } } private void reformatTargetFile_auto(com.aspose.words.Document doc_trg) throws Exception { normalizeSpaceAfterESM(doc_trg, Locale.makeLocale(this.targetlanguage)); DeleteCommentsAndShapeAltText(doc_trg); convertMoveToIDTracks(doc_trg); while (doc_trg.getRevisions().getCount() != 0) { Revision rev = doc_trg.getRevisions().get(0); rev.reject(); } } private void convertMoveToIDTracks(com.aspose.words.Document doc) throws Exception { com.aspose.words.Document temp = new com.aspose.words.Document(); com.aspose.words.DocumentBuilder db = new com.aspose.words.DocumentBuilder(temp); Paragraph temp_para_del = db.insertParagraph(); temp.startTrackRevisions(""); temp_para_del.remove(); db.writeln(); Paragraph temp_para_ins = (Paragraph) db.getCurrentParagraph().getPreviousSibling(); NodeCollection moveFromStarts = doc.getChildNodes(13, true); NodeCollection moveFromEnds = doc.getChildNodes(14, true); NodeCollection moveToStarts = doc.getChildNodes(15, true); NodeCollection moveToEnds = doc.getChildNodes(16, true); for (int i = 0; i < moveFromStarts.getCount(); i++) { com.aspose.words.Node curNode = moveFromStarts.get(i); while (curNode.getNodeType() != 14) { curNode = curNode.nextPreOrder(doc); if (curNode == null) { break; } if ((curNode != null) && (curNode.getNodeType() == 21)) { Run run = (Run) curNode; Run runClone = new Run(doc, run.getText()); runClone.getFont().setName(run.getFont().getName()); run.getParentNode().insertAfter(runClone, run); run.remove(); doc.startTrackRevisions(""); runClone.remove(); doc.stopTrackRevisions(); curNode = runClone; } else if ((curNode != null) && (curNode.getNodeType() == 8)) { Paragraph para = (Paragraph) curNode; if ((curNode.getPreviousSibling() != null) && (curNode.getPreviousSibling().getNodeType() == 8)) { Paragraph prev_para = (Paragraph) curNode.getPreviousSibling(); prev_para.setDeleteRevision(temp_para_del.getDeleteRevision()); } } } } for (int i = 0; i < moveToStarts.getCount(); i++) { com.aspose.words.Node curNode = moveToStarts.get(i); while (curNode.getNodeType() != 16) { curNode = curNode.nextPreOrder(doc); if (curNode == null) { break; } if ((curNode != null) && (curNode.getNodeType() == 21)) { Run run = (Run) curNode; Run runClone = new Run(doc, run.getText()); runClone.getFont().setName(run.getFont().getName()); doc.startTrackRevisions(""); run.getParentNode().insertAfter(runClone, run); doc.stopTrackRevisions(); run.remove(); curNode = runClone; } else if ((curNode != null) && (curNode.getNodeType() == 8)) { Paragraph para = (Paragraph) curNode; if ((para.getPreviousSibling() != null) && (para.getPreviousSibling().getNodeType() == 8)) { Paragraph prev_para = (Paragraph) para.getPreviousSibling(); prev_para.setInsertRevision(temp_para_ins.getInsertRevision()); } } } } } private void createFakeTrackChanges(com.aspose.words.Document doc) throws Exception { boolean ismovefrom = false; boolean ismoveto = false; HashSet<Footnote> set = new HashSet(); for (int i = 0; i < doc.getChildNodes(0, true).getCount(); i++) { com.aspose.words.Node node = doc.getChildNodes(0, true).get(i); if (node.getNodeType() == 13) { ismovefrom = true; } else if (node.getNodeType() == 14) { ismovefrom = false; } else if (node.getNodeType() == 15) { ismoveto = true; } else if (node.getNodeType() == 16) { ismoveto = false; } else if (node.getNodeType() == 20) { Footnote fn = (Footnote) node; if (ismovefrom) { System.out.println(fn.getText()); set.add(fn); } } } doc.getChildNodes(13, true).clear(); doc.getChildNodes(14, true).clear(); doc.getChildNodes(15, true).clear(); doc.getChildNodes(16, true).clear(); for (int i = 0; i < doc.getChildNodes(21, true).getCount(); i++) { Run run = (Run) doc.getChildNodes(21, true).get(i); Font font = run.getFont(); font.setStrikeThrough(false); if ((run.isInsertRevision()) && (!run.isDeleteRevision())) { Run runClone = new Run(doc, run.getText()); runClone.getFont().setName(run.getFont().getName()); runClone.getFont().setUnderline(1); runClone.getFont().setStrikeThrough(false); run.getParentNode().insertAfter(runClone, run); run.remove(); } else if ((run.isInsertRevision()) && (run.isDeleteRevision())) { run.remove(); i--; } else if (run.isDeleteRevision()) { Run runClone = new Run(doc, run.getText()); runClone.getFont().setName(run.getFont().getName()); runClone.getFont().setStrikeThrough(true); run.getParentNode().insertAfter(runClone, run); run.remove(); } } for (int i = 0; i < doc.getRevisions().getCount(); i++) { Revision rev = doc.getRevisions().get(i); if ((rev.getRevisionType() == 1) && (rev.getParentNode().getNodeType() == 6)) { Row row = (Row) rev.getParentNode(); for (int j = 0; j < row.getChildNodes(8, true).getCount(); j++) { Paragraph p = (Paragraph) row.getChildNodes(8, true).get(j); if (p.isDeleteRevision()) { p.setDeleteRevision(new Paragraph(doc).getDeleteRevision()); } for (int z = 0; z < p.getChildNodes(21, true).getCount(); z++) { Run run = (Run) p.getChildNodes(21, true).get(z); Font font = run.getFont(); if (!font.getStrikeThrough()) { font.setStrikeThrough(true); } } } rev.reject(); i--; } else if ((rev.getRevisionType() == 1) && ((rev.getParentNode().getNodeType() == 22) || (rev.getParentNode().getNodeType() == 23) || (rev.getParentNode().getNodeType() == 24))) { rev.reject(); i--; } else if ((rev.getRevisionType() == 1) && (rev.getParentNode().getNodeType() == 8)) { rev.reject(); i--; } else if ((rev.getRevisionType() == 0) && (rev.getParentNode().getNodeType() == 8)) { rev.accept(); i--; } else if ((rev.getRevisionType() == 1) && (rev.getParentNode().getNodeType() == 18)) { rev.reject(); i--; } else if ((rev.getRevisionType() == 1) && (rev.getParentNode().getNodeType() == 20)) { rev.reject(); i--; } else if ((rev.getRevisionType() == 4) && (rev.getParentNode().getNodeType() == 20)) { Footnote ft = (Footnote) rev.getParentNode(); if (set.contains(ft)) { rev.reject(); i--; } } else if (rev.getRevisionType() == 0) { rev.accept(); } else if (rev.getRevisionType() == 1) { rev.reject(); } } while (doc.getRevisions().getCount() != 0) { if (doc.getRevisions().get(0).getRevisionType() == 0) { doc.getRevisions().get(0).accept(); } else if (doc.getRevisions().get(0).getRevisionType() == 1) { doc.getRevisions().get(0).reject(); } else { doc.getRevisions().get(0).accept(); } } } private void mergeSectionsInDocuemnt(com.aspose.words.Document doc) throws Exception { boolean[] ismovefromsection = new boolean[doc.getSections().getCount()]; int index = -1; boolean ismovefrom = false; for (int i = 0; i < doc.getChildNodes(0, true).getCount(); i++) { com.aspose.words.Node node = doc.getChildNodes(0, true).get(i); if (node.getNodeType() == 13) { ismovefrom = true; } if (node.getNodeType() == 14) { ismovefrom = false; } if (node.getNodeType() == 2) { index++; } if ((node.getNodeType() == 8) && (((Paragraph) node).isEndOfSection())) { if (ismovefrom) { ismovefromsection[index] = true; this.isSectionBreakDeletedORInserted = true; } else { ismovefromsection[index] = false; } } } for (int i = doc.getSections().getCount() - 2; i >= 0; i--) { Section prev_section = doc.getSections().get(i + 1); Section curr_Section = doc.getSections().get(i); int para_count = curr_Section.getChildNodes(8, true).getCount(); if (para_count > 0) { Paragraph lastpara = (Paragraph) curr_Section.getChildNodes(8, true).get(para_count - 1); if ((lastpara.isDeleteRevision()) || (ismovefromsection[i])) { prev_section.prependContent(curr_Section); curr_Section.remove(); } } } } private void mergeRevision(com.aspose.words.Document doc) throws Exception { Paragraph prev = null; for (int i = 0; i < doc.getChildNodes(0, true).getCount(); i++) { com.aspose.words.Node node = doc.getChildNodes(0, true).get(i); if (node.getNodeType() == 21) { Run run = (Run) node; if ((run.isDeleteRevision()) && (run.isInsertRevision())) { run.remove(); i--; } } else if (node.getNodeType() == 8) { Paragraph para = (Paragraph) node; if ((prev != null) && (prev.isInsertRevision()) && (prev.isDeleteRevision())) { for (int j = 0; j < para.getChildNodes().getCount(); j++) { com.aspose.words.Node nd = para.getChildNodes().get(j); if ((nd.getNodeType() != 21) || (!((Run) nd).isDeleteRevision()) || (!((Run) nd).getText().contains(ControlChar.PAGE_BREAK))) { prev.appendChild(nd.deepClone(true)); i++; } } prev.setDeleteRevision(para.getDeleteRevision()); prev.setInsertRevision(para.getInsertRevision()); para.remove(); i--; } else { prev = para; } } } } private void mergeParagraphsInDocuemnt(com.aspose.words.Document doc) throws Exception { ExtractionSupportImpl extractionSupportImpl = new ExtractionSupportImpl( Locale.makeLocale(this.sourcelanguage), Locale.makeLocale(this.targetlanguage)); Configuration config = new BaseConfiguration(); config.setProperty("extraction.tokens.extract", "all"); extractionSupportImpl.setConfiguration(config); Paragraph prev_para = null; Paragraph sample = new Paragraph(doc); Shape shape = null; boolean isStillSplitting = false; boolean ismoveaway = false; boolean ismoveto = false; int toskip = 0; List<Paragraph> toremove = new ArrayList(); for (int i = 0; i < doc.getChildNodes(0, true).getCount(); i++) { com.aspose.words.Node node = doc.getChildNodes(0, true).get(i); if (node.getNodeType() == 18) { shape = (Shape) node; } else if (node.getNodeType() == 13) { ismoveaway = true; } else if (node.getNodeType() == 14) { ismoveaway = false; } else if (node.getNodeType() == 15) { ismoveto = true; } else if (node.getNodeType() == 16) { ismoveto = false; } else if (node.getNodeType() == 8) { if (toskip > 0) { toskip--; } else { Paragraph para = (Paragraph) node; if ((para.isEndOfSection()) && ((para.isInsertRevision()) || (para.isDeleteRevision()))) { this.isSectionBreakDeletedORInserted = true; } if ((para.isInCell()) && (para.getPreviousSibling() == null)) { prev_para = null; isStillSplitting = false; } if ((shape != null) && (shape.getChildNodes(8, true).getCount() > 0) && (shape.getChildNodes(8, true).get(0) == para)) { prev_para = null; isStillSplitting = false; } if (prev_para != null) { if (isStillSplitting) { if ((prev_para.isInsertRevision()) || (ismoveto)) { if (extractionSupportImpl.isExtractable(getParaText(para))) { para.appendChild(new Run(doc, "¶ins;")); } } else { isStillSplitting = false; } } else if (((prev_para.isInsertRevision()) || (ismoveto)) && (!isWholeParaInserted(prev_para))) { if (extractionSupportImpl.isExtractable(getParaText(para))) { para.appendChild(new Run(doc, "¶ins;")); } isStillSplitting = true; } if ((para.isEndOfCell()) || (para.isEndOfHeaderFooter()) || (para.isEndOfSection())) { isStillSplitting = false; } if ((shape != null) && (shape.getChildNodes(8, true).getCount() > 0) && (shape .getChildNodes(8, true).get(shape.getChildNodes(8, true).getCount() - 1) == para)) { isStillSplitting = false; } if (((prev_para.isDeleteRevision()) || (ismoveaway)) && (!isWholeParaDeleted(prev_para))) { if ((para.getChildNodes(21, true).getCount() != 0) || (para.isDeleteRevision())) { prev_para.appendChild(new Run(doc, "¶del;")); i++; for (int j = 0; j < para.getChildNodes().getCount(); j++) { com.aspose.words.Node nd = para.getChildNodes().get(j); if ((nd.getNodeType() != 21) || (!((Run) nd).isDeleteRevision()) || (!((Run) nd).getText().contains(ControlChar.PAGE_BREAK))) { prev_para.appendChild(nd.deepClone(true)); i++; } } if (para.getChildNodes(14, true).getCount() != 0) { ismoveaway = false; } } prev_para.setDeleteRevision(para.getDeleteRevision()); prev_para.setInsertRevision(para.getInsertRevision()); if (!para.isInsertRevision()) { for (int z = 0; z < para.getChildNodes().getCount(); z++) { com.aspose.words.Node temp_nd = para.getChildNodes().get(z); if (temp_nd.getNodeType() == 15) { ismoveto = true; } else if (temp_nd.getNodeType() == 16) { ismoveto = false; } } para.remove(); i--; } else { toremove.add(para); prev_para = para; } } else if ((prev_para.isDeleteRevision()) && (isWholeParaDeleted(prev_para))) { prev_para.setDeleteRevision(sample.getDeleteRevision()); prev_para = para; } else if ((para.isEndOfCell()) || (para.isEndOfHeaderFooter())) { para.setDeleteRevision(sample.getDeleteRevision()); prev_para = null; } else if ((shape != null) && (shape.getChildNodes(8, true).getCount() > 0) && (shape .getChildNodes(8, true).get(shape.getChildNodes(8, true).getCount() - 1) == para)) { para.setDeleteRevision(sample.getDeleteRevision()); prev_para = null; } else { prev_para = para; } } else { if ((isStillSplitting) && (extractionSupportImpl.isExtractable(getParaText(para)))) { para.appendChild(new Run(doc, "¶ins;")); i++; } if (para.isInsertRevision()) { if ((extractionSupportImpl.isExtractable(getParaText(para))) && (!isWholeParaInserted(para))) { isStillSplitting = true; } } else { isStillSplitting = false; } if ((para.isEndOfCell()) || (para.isEndOfHeaderFooter())) { if ((para.getNextSibling() == null) && (para.getPreviousSibling() == null) && (para.isDeleteRevision()) && (isWholeParaDeleted(para))) { para.setDeleteRevision(sample.getDeleteRevision()); } prev_para = null; } else if ((shape != null) && (shape.getChildNodes(8, true).getCount() > 0) && (shape .getChildNodes(8, true).get(shape.getChildNodes(8, true).getCount() - 1) == para)) { if ((para.getNextSibling() == null) && (para.getPreviousSibling() == null) && (para.isDeleteRevision()) && (isWholeParaDeleted(para))) { para.setDeleteRevision(sample.getDeleteRevision()); } prev_para = null; } else { prev_para = para; } } } } else if (node.getNodeType() == 5) { Table tb = (Table) node; if (isWholeTableDeleted(tb, doc)) { toskip = tb.getChildNodes(8, true).getCount(); } } } for (Paragraph p : toremove) { if (p != null) { p.removeAllChildren(); } } doc.save("C:\\Program Files (x86)\\pa\\paprjs\\testprj23\\test.docx"); } private boolean isWholeTableDeleted(Table tb, com.aspose.words.Document doc) { Row sampler = new Row(doc); for (int i = 0; i < tb.getChildNodes(6, true).getCount(); i++) { if (((Row) tb.getChildNodes(6, true).get(i)).getDeleteRevision() == sampler.getDeleteRevision()) { return false; } } return true; } private boolean isWholeParaDeleted(Paragraph para) { for (int i = 0; i < para.getChildNodes(21, true).getCount(); i++) { if (!((Run) para.getChildNodes(21, true).get(i)).isDeleteRevision()) { return false; } } return true; } private boolean isWholeParaInserted(Paragraph para) { for (int i = 0; i < para.getChildNodes(21, true).getCount(); i++) { Run run = (Run) para.getChildNodes(21, true).get(i); if (!run.isInsertRevision()) { return false; } } return true; } public String[] createAlignedXML() throws Exception { System.out.println("creating aligned xml...."); String[] res = new String[1]; this.txlftrgsegmap = new LinkedHashMap(); this.alignedfile = (this.prjfolder + File.separator + "rev_aligned.xml"); this.reformattedtargetmapfile = (this.prjfolder + File.separator + "target_reformatted" + File.separator + ".mp"); StringBuffer sbmp = new StringBuffer(); if (new File(this.alignedfile).exists()) { new File(this.alignedfile).delete(); } SegmenterFactory factory = new SegmenterFactory(); Configuration segconfig = createConfigForSegmenter(false, this.sourcelanguage); Segmenter segmenter = factory.getSegmenter("trados", Locale.makeLocale(this.sourcelanguage), segconfig); org.dom4j.Document document = DocumentHelper.createDocument(); org.dom4j.Element root = document.addElement("alinger"); org.dom4j.Element head = root.addElement("head"); head.addAttribute("src_lang", this.sourcelanguage); head.addAttribute("trg_lang", this.targetlanguage); head.addAttribute("creator", this.creatorid); org.dom4j.Element aligned = root.addElement("aligned"); org.dom4j.Element orphans = root.addElement("orphans"); org.dom4j.Document document_source_formatted_nonSeg = XmlParser .parseXmlFile(this.reformattedsourcetxlf_nonSeg); org.dom4j.Element root_source_formatted_nonSeg = document_source_formatted_nonSeg.getRootElement(); List list_source_formatted_nonSeg = root_source_formatted_nonSeg.selectNodes("//*[name() = 'trans-unit']"); org.dom4j.Document document_target_nonSeg = XmlParser.parseXmlFile(this.reformattedtargettxlf_nonSeg); org.dom4j.Element root_target_nonSeg = document_target_nonSeg.getRootElement(); List list_target_nonSeg = root_target_nonSeg.selectNodes("//*[name() = 'trans-unit']"); org.dom4j.Document document_target_seg = XmlParser.parseXmlFile(this.reformattedtargettxlf_seg); org.dom4j.Element root_target_seg = document_target_seg.getRootElement(); List list_target_seg = root_target_seg.selectNodes("//*[name() = 'group'][@restype = 'x-paragraph']"); int trg_para_count = 0; ExtractionSupportImpl extractionSupportImpl = new ExtractionSupportImpl( Locale.makeLocale(this.sourcelanguage), Locale.makeLocale(this.targetlanguage)); Configuration config = new BaseConfiguration(); config.setProperty("extraction.tokens.extract", "all"); extractionSupportImpl.setConfiguration(config); Workbook wb = new Workbook(); Cells cells = wb.getWorksheets().get(0).getCells(); int cnt = 0; boolean issrcfirsthf = true; boolean istrgfirsthf = true; int gcount = -1; for (int i = 0; i < list_source_formatted_nonSeg.size(); i++) { org.dom4j.Element src_txlf = ((org.dom4j.Element) list_source_formatted_nonSeg.get(i)) .element("source"); String merged_text = getTxlfElementText_withFakeTC(src_txlf); if (extractionSupportImpl .isExtractable(merged_text.replace("&paradel;", "").replace("&parains;", ""))) { gcount++; org.dom4j.Element group = aligned.addElement("group"); group.addAttribute("id", Integer.toString(gcount)); merged_text = trimText(merged_text, true)[0]; org.dom4j.Element merged_src_text = group.addElement("text"); merged_src_text.setText(merged_text.replace("&paradel;", "").replace("&parains;", "")); String[] split_merged_text = merged_text.replaceAll("(&paradel;)+", "&paradel;") .replaceAll("^&paradel;", "").replaceAll("&paradel;$", "").split("&paradel;"); List<String> segmentsGroup = segmentStringWithRevs( merged_text.replaceAll("(&paradel;)+", "&paradel;").replace("&parains;", ""), this.sourcelanguage); List<List<String>> resegmentedGroup = new ArrayList(); resegmentedGroup.add(new ArrayList()); int idx = 0; String orgs; String[] newsegs; for (int s = 0; s < segmentsGroup.size(); s++) { orgs = (String) segmentsGroup.get(s); if (orgs.contains("&paradel;")) { newsegs = orgs.split("&paradel;"); for (int ss = 0; ss < newsegs.length; ss++) { String sss = newsegs[ss]; if (!sss.trim().equals("")) { ((List) resegmentedGroup.get(idx)).add(fixMissingTags(sss)); } if ((((List) resegmentedGroup.get(idx)).size() != 0) && (ss != newsegs.length - 1)) { resegmentedGroup.add(new ArrayList()); idx++; } } if (orgs.trim().endsWith("&paradel;")) { resegmentedGroup.add(new ArrayList()); idx++; } } else { ((List) resegmentedGroup.get(idx)).add(fixMissingTags(orgs)); } } if (split_merged_text.length > resegmentedGroup.size()) { System.out.println(i); System.out.println("merged_text: " + merged_text); for (String smt : split_merged_text) { System.out.println("split_merged_text: " + smt); } for (List<String> smts : resegmentedGroup) { System.out.println("resegmentedGroup: " + smts); } for (String smtss : segmentsGroup) { System.out.println("segmentedGroup: " + smtss); } } for (int j = 0; j < split_merged_text.length; j++) { if (!split_merged_text[j].replaceAll("<(/)*ins>|<(/)*del>", "").trim().equals("")) { split_merged_text[j] = fixMissingTags(split_merged_text[j]); org.dom4j.Element unit = group.addElement("unit"); unit.addAttribute("id", Integer.toString(j)); unit.addAttribute("alignsegs", "false"); org.dom4j.Element src = unit.addElement("src_para"); org.dom4j.Element src_text = src.addElement("text"); boolean ishf = split_merged_text[j].contains("&hf;"); if (!ishf) { issrcfirsthf = false; } boolean isAddedPara = split_merged_text[j].contains("&parains;"); src.addAttribute("added", "" + isAddedPara); String[] trim_result = trimText( split_merged_text[j].replace("&parains;", "").replace("&hf;", ""), false); src.addAttribute("lefttrim", trim_result[1]); src.addAttribute("righttrim", trim_result[2]); split_merged_text[j] = trim_result[0]; int src_tctype_para = TrackChangeHelper.getTrackChangeType(split_merged_text[j]); src.addAttribute("tctype", TrackChangeType.getName(src_tctype_para)); String rejected_src = split_merged_text[j].replaceAll("(?s)<ins>.*?</ins>", "") .replace("<del>", "").replace("</del>", ""); if ((!extractionSupportImpl.isExtractable(rejected_src)) || (ishf)) { unit.addAttribute("locked", "true"); } else { unit.addAttribute("locked", "false"); } src_text.setText(split_merged_text[j]); cells.get(cnt, 0).setHtmlString("<html>" + split_merged_text[j].replace("ins>", "u>") .replace("del>", "strike>").replace("<br> ", "↵<br>") + "</html>"); org.dom4j.Element src_segs = src.addElement("segments"); List<String> segments = (List) resegmentedGroup.get(j); for (int z = 0; z < segments.size(); z++) { String segment_text = trimText((String) segments.get(z), false)[0]; org.dom4j.Element src_seg = src_segs.addElement("src_seg"); src_seg.addAttribute("id", Integer.toString(z)); src_seg.addAttribute("needreview", "false"); src_seg.addAttribute("ignored", "false"); int tctype_seg = TrackChangeHelper.getTrackChangeType(segment_text); src_seg.addAttribute("tctype", TrackChangeType.getName(tctype_seg)); String accepted_t = segment_text.replaceAll("(?s)<del>.*?</del>", "") .replace("<ins>", "").replace("</ins>", ""); src_seg.addAttribute("isExtractable", Boolean.toString(extractionSupportImpl.isExtractable(accepted_t))); String rejected_s = segment_text.replaceAll("(?s)<ins>.*?</ins>", "") .replace("<del>", "").replace("</del>", ""); if ((!extractionSupportImpl.isExtractable(rejected_s)) || (ishf)) { src_seg.addAttribute("locked", "true"); } else { src_seg.addAttribute("locked", "false"); } src_seg.setText(segment_text); } org.dom4j.Element trg = unit.addElement("trg_para"); if ((src_tctype_para != 1) && (!isAddedPara) && (!ishf) && (trg_para_count < list_target_nonSeg.size())) { trg.addAttribute("id", Integer.toString(gcount) + " - " + Integer.toString(j)); org.dom4j.Element trg_text = trg.addElement("text"); org.dom4j.Element trg_txlf = ((org.dom4j.Element) list_target_nonSeg .get(trg_para_count)).element("source"); org.dom4j.Element trg_txlf_seg = (org.dom4j.Element) list_target_seg .get(trg_para_count); while (trg_txlf.getText().contains("&hf;")) { trg_para_count++; trg_txlf = ((org.dom4j.Element) list_target_nonSeg.get(trg_para_count)) .element("source"); trg_txlf_seg = (org.dom4j.Element) list_target_seg.get(trg_para_count); } istrgfirsthf = false; String trg_formatted_text = getTxlfElementText_normal(trg_txlf); trg_text.setText(trg_formatted_text.replace("&hf;", "")); cells.get(cnt, 1) .setHtmlString("<html>" + trg_formatted_text.replace("ins>", "u>").replace("del>", "strike>") .replace("&hf;", "").replace("<br> ", "↵<br>") + "</html>"); cnt++; org.dom4j.Element trg_segs = trg.addElement("segments"); List<String> trgsegs = segmentStringWithRevs(trg_formatted_text, this.targetlanguage); List<org.dom4j.Element> srcsegs = src_segs.elements("src_seg"); int trg_tmp_cnt = 0; for (int z = 0; trg_tmp_cnt < trgsegs.size(); z++) { org.dom4j.Element trg_seg = trg_segs.addElement("trg_seg"); trg_seg.addAttribute("id", Integer.toString(z)); trg_seg.addAttribute("edited", "false"); if ((z < srcsegs.size()) && (((org.dom4j.Element) srcsegs.get(z)) .attributeValue("tctype").equals(TrackChangeType.getName(1)))) { trg_seg.addAttribute("isExtractable", "false"); trg_seg.setText(""); } else { String trgsegtext = ((String) trgsegs.get(trg_tmp_cnt)).replace("&hf;", "") .trim(); trg_seg.addAttribute("isExtractable", Boolean.toString(extractionSupportImpl.isExtractable(trgsegtext))); trg_seg.setText(trgsegtext); String mapid = Integer.toString(gcount) + " - " + Integer.toString(j) + " - " + Integer.toString(z); List t = ((org.dom4j.Element) trg_txlf_seg.elements("trans-unit") .get(trg_tmp_cnt)).content(); sbmp.append(mapid + "\t" + trg_para_count + "\t" + trg_tmp_cnt + "\n"); trg_tmp_cnt++; } } trg_para_count++; } else { trg.addAttribute("id", Integer.toString(gcount) + " - " + Integer.toString(j)); org.dom4j.Element trg_text = trg.addElement("text"); trg_text.setText(""); trg.addElement("segments"); cnt++; } int trgcnt = trg.element("segments").elements("trg_seg").size(); int srccnt = src.element("segments").elements("src_seg").size(); if (trgcnt < srccnt) { for (int x = 1; x <= srccnt - trgcnt; x++) { org.dom4j.Element trg_seg = trg.element("segments").addElement("trg_seg"); trg_seg.addAttribute("id", Integer.toString(trgcnt + x - 1)); trg_seg.addAttribute("edited", "false"); trg_seg.addAttribute("isExtractable", "false"); trg_seg.setText(""); } } } } } } int unitcnt = list_source_formatted_nonSeg.size(); for (int i = trg_para_count; i < list_target_nonSeg.size(); i++) { org.dom4j.Element trg_txlf = ((org.dom4j.Element) list_target_nonSeg.get(trg_para_count)) .element("source"); org.dom4j.Element trg_txlf_seg = (org.dom4j.Element) list_target_seg.get(trg_para_count); if (!trg_txlf.getText().contains("&hf;")) { org.dom4j.Element group = aligned.addElement("group"); group.addAttribute("id", Integer.toString(unitcnt)); group.addElement("text").setText(""); org.dom4j.Element unit = group.addElement("unit"); unit.addAttribute("id", "0"); unit.addAttribute("alignsegs", "false"); unit.addAttribute("locked", "false"); org.dom4j.Element trg = unit.addElement("trg_para"); trg.addAttribute("id", Integer.toString(unitcnt) + " - 0"); org.dom4j.Element trg_text = trg.addElement("text"); String trg_formatted_text = getTxlfElementText_normal(trg_txlf); trg_text.setText(trg_formatted_text.replace("&hf;", "")); org.dom4j.Element trg_segs = trg.addElement("segments"); List<String> trgsegs = segmentStringWithRevs(trg_formatted_text.replace("&hf;", ""), this.targetlanguage); for (int z = 0; z < trgsegs.size(); z++) { org.dom4j.Element trg_seg = trg_segs.addElement("trg_seg"); trg_seg.addAttribute("id", Integer.toString(z)); trg_seg.addAttribute("edited", "false"); trg_seg.addAttribute("isExtractable", Boolean.toString(extractionSupportImpl.isExtractable((String) trgsegs.get(z)))); trg_seg.setText(((String) trgsegs.get(z)).trim()); String mapid = Integer.toString(unitcnt) + " - 0 - " + Integer.toString(z); List t = ((org.dom4j.Element) trg_txlf_seg.elements("trans-unit").get(z)).content(); sbmp.append(mapid + "\t" + trg_para_count + "\t" + z + "\n"); } trg_para_count++; unitcnt++; } } wb.save(this.prjfolder + File.separator + "verifySegsPop.xlsx"); OutputStreamWriter writer = new OutputStreamWriter( new BufferedOutputStream(new FileOutputStream(this.alignedfile)), "UTF8"); document.write(writer); writer.close(); OutputStreamWriter writermp = new OutputStreamWriter( new BufferedOutputStream(new FileOutputStream(this.reformattedtargetmapfile)), "UTF8"); writermp.write(sbmp.toString()); writermp.close(); return res; } public void createAlignedXML_auto(String prjid) throws Exception { System.out.println("creating aligned xml with nbAligner...."); this.alignedfile = (this.prjfolder + File.separator + "rev_aligned.xml"); this.reformattedtargetmapfile = (this.prjfolder + File.separator + "target_reformatted" + File.separator + ".mp"); HashMap<String, String> srcidmap = new HashMap(); this.nbalignerfolder = (this.prjfolder + File.separator + "nbaligner"); if (!new File(this.nbalignerfolder).exists()) { new File(this.nbalignerfolder).mkdir(); } FileUtils.cleanDirectory(new File(this.nbalignerfolder)); String nbsourcefolder = this.nbalignerfolder + File.separator + this.sourcelanguage; new File(nbsourcefolder).mkdir(); org.dom4j.Document nbsource = DocumentHelper.createDocument(); org.dom4j.Element root_src = nbsource.addElement("txml"); root_src.addAttribute("locale", this.sourcelanguage); root_src.addAttribute("version", "1.0"); root_src.addAttribute("segtype", "sentence"); org.dom4j.Element translatable_src = root_src.addElement("translatable"); translatable_src.addAttribute("blockId", "1"); String nbtargetfolder = this.nbalignerfolder + File.separator + this.targetlanguage; new File(nbtargetfolder).mkdir(); org.dom4j.Document nbtarget = DocumentHelper.createDocument(); org.dom4j.Element root_trg = nbtarget.addElement("txml"); root_trg.addAttribute("locale", this.targetlanguage); root_trg.addAttribute("version", "1.0"); root_trg.addAttribute("segtype", "sentence"); org.dom4j.Element translatable_trg = root_trg.addElement("translatable"); translatable_trg.addAttribute("blockId", "0"); if (new File(this.alignedfile).exists()) { new File(this.alignedfile).delete(); } SegmenterFactory factory = new SegmenterFactory(); Configuration segconfig = createConfigForSegmenter(false, this.sourcelanguage); Segmenter segmenter = factory.getSegmenter("trados", Locale.makeLocale(this.sourcelanguage), segconfig); org.dom4j.Document document = DocumentHelper.createDocument(); org.dom4j.Element root = document.addElement("alinger"); org.dom4j.Element head = root.addElement("head"); head.addAttribute("src_lang", this.sourcelanguage); head.addAttribute("trg_lang", this.targetlanguage); head.addAttribute("creator", this.creatorid); org.dom4j.Element aligned = root.addElement("aligned"); org.dom4j.Element orphans = root.addElement("orphans"); org.dom4j.Document document_source_formatted_nonSeg = XmlParser .parseXmlFile(this.reformattedsourcetxlf_nonSeg); org.dom4j.Element root_source_formatted_nonSeg = document_source_formatted_nonSeg.getRootElement(); List list_source_formatted_nonSeg = root_source_formatted_nonSeg.selectNodes("//*[name() = 'trans-unit']"); org.dom4j.Document document_target_nonSeg = XmlParser.parseXmlFile(this.reformattedtargettxlf_nonSeg); org.dom4j.Element root_target_nonSeg = document_target_nonSeg.getRootElement(); List list_target_nonSeg = root_target_nonSeg.selectNodes("//*[name() = 'trans-unit']"); org.dom4j.Document document_target_seg = XmlParser.parseXmlFile(this.reformattedtargettxlf_seg); org.dom4j.Element root_target_seg = document_target_seg.getRootElement(); List list_target_seg = root_target_seg.selectNodes("//*[name() = 'group'][@restype = 'x-paragraph']"); ExtractionSupportImpl extractionSupportImpl_src = new ExtractionSupportImpl( Locale.makeLocale(this.sourcelanguage), Locale.makeLocale(this.sourcelanguage)); Configuration config_src = new BaseConfiguration(); config_src.setProperty("extraction.tokens.extract", "all"); extractionSupportImpl_src.setConfiguration(config_src); ExtractionSupportImpl extractionSupportImpl_trg = new ExtractionSupportImpl( Locale.makeLocale(this.targetlanguage), Locale.makeLocale(this.targetlanguage)); Configuration config_trg = new BaseConfiguration(); config_trg.setProperty("extraction.tokens.extract", "all"); extractionSupportImpl_trg.setConfiguration(config_trg); boolean issrcfirsthf = true; boolean istrgfirsthf = true; int gcount = -1; int segmentId = 0; for (int i = 0; i < list_source_formatted_nonSeg.size(); i++) { org.dom4j.Element src_txlf = ((org.dom4j.Element) list_source_formatted_nonSeg.get(i)) .element("source"); String merged_text = getTxlfElementText_withFakeTC(src_txlf); if (extractionSupportImpl_src .isExtractable(merged_text.replace("&paradel;", "").replace("&parains;", ""))) { gcount++; org.dom4j.Element group = aligned.addElement("group"); group.addAttribute("id", Integer.toString(gcount)); merged_text = trimText(merged_text, true)[0]; org.dom4j.Element merged_src_text = group.addElement("text"); merged_src_text.setText(merged_text.replace("&paradel;", "").replace("&parains;", "")); String[] split_merged_text = merged_text.replaceAll("(&paradel;)+", "&paradel;") .replaceAll("^&paradel;", "").replaceAll("&paradel;$", "").split("&paradel;"); List<String> segmentsGroup = segmentStringWithRevs( merged_text.replaceAll("(&paradel;)+", "&paradel;").replace("&parains;", ""), this.sourcelanguage); List<List<String>> resegmentedGroup = new ArrayList(); resegmentedGroup.add(new ArrayList()); int idx = 0; String orgs; String[] newsegs; for (int s = 0; s < segmentsGroup.size(); s++) { orgs = (String) segmentsGroup.get(s); if (orgs.contains("&paradel;")) { newsegs = orgs.split("&paradel;"); for (int ss = 0; ss < newsegs.length; ss++) { String sss = newsegs[ss]; if (!sss.trim().equals("")) { ((List) resegmentedGroup.get(idx)).add(fixMissingTags(sss)); } if ((((List) resegmentedGroup.get(idx)).size() != 0) && (ss != newsegs.length - 1)) { resegmentedGroup.add(new ArrayList()); idx++; } } if (orgs.trim().endsWith("&paradel;")) { resegmentedGroup.add(new ArrayList()); idx++; } } else { ((List) resegmentedGroup.get(idx)).add(fixMissingTags(orgs)); } } if (split_merged_text.length > resegmentedGroup.size()) { System.out.println(i); System.out.println("merged_text: " + merged_text); for (String smt : split_merged_text) { System.out.println("split_merged_text: " + smt); } for (List<String> smts : resegmentedGroup) { System.out.println("resegmentedGroup: " + smts); } for (String smtss : segmentsGroup) { System.out.println("segmentedGroup: " + smtss); } } for (int j = 0; j < split_merged_text.length; j++) { if (!split_merged_text[j].replaceAll("<(/)*ins>|<(/)*del>", "").trim().equals("")) { split_merged_text[j] = fixMissingTags(split_merged_text[j]); Element unit = group.addElement("unit"); unit.addAttribute("id", Integer.toString(j)); unit.addAttribute("alignsegs", "false"); Element src = unit.addElement("src_para"); org.dom4j.Element src_text = src.addElement("text"); boolean isAddedPara = split_merged_text[j].contains("&parains;"); src.addAttribute("added", "" + isAddedPara); String[] trim_result = trimText(split_merged_text[j].replace("&parains;", ""), false); src.addAttribute("lefttrim", trim_result[1]); src.addAttribute("righttrim", trim_result[2]); split_merged_text[j] = trim_result[0]; int src_tctype_para = TrackChangeHelper.getTrackChangeType(split_merged_text[j]); src.addAttribute("tctype", TrackChangeType.getName(src_tctype_para)); String rejected_src = split_merged_text[j].replaceAll("(?s)<ins>.*?</ins>", "") .replace("<del>", "").replace("</del>", ""); if (!extractionSupportImpl_src.isExtractable(rejected_src)) { unit.addAttribute("locked", "true"); } else { unit.addAttribute("locked", "false"); } src_text.setText(split_merged_text[j]); org.dom4j.Element src_segs = src.addElement("segments"); List<String> segments = (List) resegmentedGroup.get(j); for (int z = 0; z < segments.size(); z++) { String segment_text = trimText((String) segments.get(z), false)[0]; org.dom4j.Element src_seg = src_segs.addElement("src_seg"); src_seg.addAttribute("id", Integer.toString(z)); src_seg.addAttribute("needreview", "false"); src_seg.addAttribute("ignored", "false"); int tctype_seg = TrackChangeHelper.getTrackChangeType(segment_text); src_seg.addAttribute("tctype", TrackChangeType.getName(tctype_seg)); String accepted_t = segment_text.replaceAll("(?s)<del>.*?</del>", "") .replace("<ins>", "").replace("</ins>", ""); src_seg.addAttribute("isExtractable", Boolean.toString(extractionSupportImpl_src.isExtractable(accepted_t))); String rejected_s = segment_text.replaceAll("(?s)<ins>.*?</ins>", "") .replace("<del>", "").replace("</del>", ""); if (!extractionSupportImpl_src.isExtractable(rejected_s)) { src_seg.addAttribute("locked", "true"); } else { src_seg.addAttribute("locked", "false"); org.dom4j.Element segment_src = translatable_src.addElement("segment"); segment_src.addAttribute("segmentId", Integer.toString(segmentId)); srcidmap.put(i + " - " + j + " - " + z, Integer.toString(segmentId)); segmentId++; segment_src.addElement("source").setText(rejected_s); } src_seg.setText(segment_text); } } } } } segmentId = 0; for (int i = 0; i < list_target_nonSeg.size(); i++) { org.dom4j.Element trg_txlf = ((org.dom4j.Element) list_target_nonSeg.get(i)).element("source"); String trg_formatted_text = getTxlfElementText_normal(trg_txlf); List<String> trgsegs = segmentStringWithRevs(trg_formatted_text, this.targetlanguage); for (int j = 0; j < trgsegs.size(); j++) { String trgseg = ((String) trgsegs.get(j)).trim().replaceAll("(\\s)+", " "); if (extractionSupportImpl_trg.isExtractable(trgseg)) { org.dom4j.Element segment_trg = translatable_trg.addElement("segment"); segment_trg.addAttribute("segmentId", Integer.toString(segmentId)); segmentId++; segment_trg.addElement("source").setText(trgseg); } } } OutputStreamWriter writer = new OutputStreamWriter( new BufferedOutputStream( new FileOutputStream(nbsourcefolder + File.separator + this.sourcelanguage + ".txml")), "UTF8"); nbsource.write(writer); writer.close(); writer = new OutputStreamWriter( new BufferedOutputStream( new FileOutputStream(nbtargetfolder + File.separator + this.targetlanguage + ".txml")), "UTF8"); nbtarget.write(writer); writer.close(); String pahtexe = "\\\\10.2.50.190\\AutoAlignerCLI\\AutoAlignerCLI.exe"; ProcessBuilder pb = new ProcessBuilder( new String[] { pahtexe, "-i", this.nbalignerfolder, "-o", this.nbalignerfolder, "-lang_pairs", this.sourcelanguage + "_" + this.targetlanguage, "-lang_detect", "normal", "-identicals", "-match_filenames", "-txml_or_xmx_output", "-docnames_output", "-disallow_src_merging" }); pb.redirectErrorStream(true); Process p = pb.start(); InputStreamReader isr = new InputStreamReader(p.getInputStream()); BufferedReader br = new BufferedReader(isr); boolean sentESTTime = false; boolean alignstart = false; String lineRead; while ((lineRead = br.readLine()) != null) { System.out.println(lineRead); if (lineRead.contains("Aligning...")) { alignstart = true; } else { if ((lineRead.contains("Estimated Time to Completion:")) && (alignstart)) { this.estimateNBAlignerCompTime = lineRead.replace("Estimated Time to Completion: ", "") .replace(" Minute(s)", ""); } if ((!this.estimateNBAlignerCompTime.equals("")) && (!sentESTTime)) { sentESTTime = true; try { int minutes = 200 + Integer.parseInt(this.estimateNBAlignerCompTime); setAlignProgress(prjid, minutes); this.estimateNBAlignerCompTime = ""; } catch (Exception ex) { ex.printStackTrace(); } } } } p.waitFor(); for (File file : new File(this.nbalignerfolder).listFiles()) { if (file.getName().endsWith(".zip")) { UnzipFile.UnZipIt(file.getAbsolutePath(), this.nbalignerfolder); } } String alignedtxml = ""; for (File file : new File(this.nbalignerfolder).listFiles()) { if (file.getName().endsWith(".txml")) { alignedtxml = file.getAbsolutePath(); } } if (alignedtxml.equals("")) { throw new Exception("file didn't aligned by nbaligner"); } HashMap<String, String[]> alignedtrgs = new HashMap(); List<String[]> missingtrgs = new ArrayList(); int src_idx = -1; org.dom4j.Document alignedtxmldoc = XmlParser.parseXmlFile(alignedtxml); org.dom4j.Element root_alignedtxmldoc = alignedtxmldoc.getRootElement(); for (int i = 0; i < root_alignedtxmldoc.elements("translatable").size(); i++) { org.dom4j.Element translatable = (org.dom4j.Element) root_alignedtxmldoc.elements("translatable") .get(i); for (int j = 0; j < translatable.elements("segment").size(); j++) { org.dom4j.Element segment = (org.dom4j.Element) translatable.elements("segment").get(j); org.dom4j.Element source = segment.element("source"); org.dom4j.Element target = segment.element("target"); if ((source != null) && (!source.getTextTrim().equals(""))) { src_idx++; if ((target != null) && (!target.getTextTrim().equals(""))) { String matchscore = target.attributeValue("score"); int trg_idx = Integer.parseInt(target.attributeValue("sent_no")); if (matchscore.equals("0")) { alignedtrgs.put(Integer.toString(src_idx), new String[] { target.getTextTrim(), "1", Integer.toString(trg_idx) }); } else if (target.attribute("original_segments_count") != null) { int merged_cnt = Integer.parseInt(target.attributeValue("original_segments_count")); String trg_idx_str = Integer.toString(trg_idx) + " - " + Integer.toString(trg_idx + merged_cnt - 1); alignedtrgs.put(Integer.toString(src_idx), new String[] { target.getTextTrim(), matchscore, trg_idx_str }); } else { alignedtrgs.put(Integer.toString(src_idx), new String[] { target.getTextTrim(), matchscore, Integer.toString(trg_idx) }); } } } else if ((target != null) && (!target.getTextTrim().equals(""))) { String matchscore = target.attributeValue("score"); int trg_idx = Integer.parseInt(target.attributeValue("sent_no")); missingtrgs.add(new String[] { target.getTextTrim(), Integer.toString(trg_idx) }); } } } int null_idx = 0; List<org.dom4j.Element> groups = aligned.elements("group"); for (int i = 0; i < groups.size(); i++) { org.dom4j.Element group = (org.dom4j.Element) groups.get(i); List<org.dom4j.Element> units = group.elements("unit"); for (int j = 0; j < units.size(); j++) { org.dom4j.Element unit = (org.dom4j.Element) units.get(j); org.dom4j.Element src_para = unit.element("src_para"); org.dom4j.Element src_para_segs = src_para.element("segments"); org.dom4j.Element trg_para = unit.addElement("trg_para"); org.dom4j.Element trg_para_segs = trg_para.addElement("segments"); List<org.dom4j.Element> src_segs = src_para_segs.elements("src_seg"); for (int z = 0; z < src_segs.size(); z++) { org.dom4j.Element src_seg = (org.dom4j.Element) src_segs.get(z); org.dom4j.Element trg_seg = trg_para_segs.addElement("trg_seg"); String mapid = Integer.toString(i) + " - " + Integer.toString(j) + " - " + Integer.toString(z); trg_seg.addAttribute("edited", "false"); String trgsegtext = ""; if (srcidmap.containsKey(mapid)) { String sourceidintxml = (String) srcidmap.get(mapid); if (alignedtrgs.containsKey(sourceidintxml)) { src_seg.addAttribute("locked", "true"); trgsegtext = ((String[]) alignedtrgs.get(sourceidintxml))[0]; String score = ((String[]) alignedtrgs.get(sourceidintxml))[1]; String targetidintxml = ((String[]) alignedtrgs.get(sourceidintxml))[2]; if (Integer.parseInt(score) < needreviewthreshhold) { src_seg.addAttribute("needreview", "true"); } trg_seg.addAttribute("id", targetidintxml); trg_seg.addAttribute("isExtractable", Boolean.toString(extractionSupportImpl_trg.isExtractable(trgsegtext))); } else { trg_seg.addAttribute("id", "n - " + null_idx); null_idx++; trg_seg.addAttribute("isExtractable", "false"); } } else { trg_seg.addAttribute("id", "n - " + null_idx); null_idx++; trg_seg.addAttribute("isExtractable", "false"); } trg_seg.setText(trgsegtext); } } } org.dom4j.Element orp_unit = orphans.addElement("unit"); orp_unit.addAttribute("id", "0"); org.dom4j.Element orp_trg_para = orp_unit.addElement("trg_para"); org.dom4j.Element orp_segments = orp_trg_para.addElement("segments"); for (int i = 0; i < missingtrgs.size(); i++) { String orptrgtext = ((String[]) missingtrgs.get(i))[0]; String orptrgid = ((String[]) missingtrgs.get(i))[1]; org.dom4j.Element orp_trg_seg = orp_segments.addElement("trg_seg"); orp_trg_seg.addAttribute("id", orptrgid); orp_trg_seg.addAttribute("edited", "false"); orp_trg_seg.addAttribute("isExtractable", Boolean.toString(extractionSupportImpl_trg.isExtractable(orptrgtext))); orp_trg_seg.setText(orptrgtext); } OutputStreamWriter oswriter = new OutputStreamWriter( new BufferedOutputStream(new FileOutputStream(this.alignedfile)), "UTF8"); document.write(oswriter); oswriter.close(); } public String[] readnbalignmentreport() throws Exception { System.out.println("reading auto-Aligner report...."); String[] res = new String[1]; String alignmentreport = ""; for (File file : new File(this.nbalignerfolder).listFiles()) { if (file.getName().contains("Alignment_Report")) { alignmentreport = file.getAbsolutePath(); } } if (alignmentreport.equals("")) { throw new Exception("file didn't aligned properly"); } BufferedReader input = new BufferedReader(new FileReader(alignmentreport)); String line = null; while ((line = input.readLine()) != null) { if (line.contains("Alignment Score:")) { res[0] = line.replace("Alignment Score:", "").trim(); } } input.close(); return res; } public void buildTargetContentMap() throws Exception { System.out.println("rebuilding target content map file...."); ExtractionSupportImpl extractionSupportImpl_trg = new ExtractionSupportImpl( Locale.makeLocale(this.targetlanguage), Locale.makeLocale(this.targetlanguage)); Configuration config_trg = new BaseConfiguration(); config_trg.setProperty("extraction.tokens.extract", "all"); extractionSupportImpl_trg.setConfiguration(config_trg); this.txlftrgsegmap = new LinkedHashMap(); this.txlftrgsewsmap = new LinkedHashMap(); org.dom4j.Document document_target_seg = XmlParser.parseXmlFile(this.reformattedtargettxlf_seg); org.dom4j.Element root_target_seg = document_target_seg.getRootElement(); List list_target_para = root_target_seg.selectNodes("//*[name() = 'group'][@restype = 'x-paragraph']"); int segmentId = 1; for (int i = 0; i < list_target_para.size(); i++) { org.dom4j.Element group = (org.dom4j.Element) list_target_para.get(i); for (int j = 0; j < group.elements("trans-unit").size(); j++) { org.dom4j.Element trg_txlf_seg = (org.dom4j.Element) group.elements("trans-unit").get(j); String trgseg = trg_txlf_seg.element("source").getText().trim().replaceAll("(\\s)+", " "); if (extractionSupportImpl_trg.isExtractable(trgseg)) { List tmp_content = new ArrayList(); for (int z = 0; z < trg_txlf_seg.content().size(); z++) { if ((trg_txlf_seg.content().get(z) instanceof org.dom4j.Element)) { tmp_content.add(trg_txlf_seg.content().get(z)); } } this.txlftrgsegmap.put(Integer.valueOf(segmentId), tmp_content); boolean[] seg_attr = { false, false }; if (j == 0) { seg_attr[0] = true; } if (j == group.elements("trans-unit").size() - 1) { seg_attr[1] = true; } this.txlftrgsewsmap.put(Integer.valueOf(segmentId), seg_attr); segmentId++; } } } } public boolean compareAlignedXmls(String trgxml) throws Exception { System.out.println("comparing xmls...."); DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); javax.xml.parsers.DocumentBuilder builder = factory.newDocumentBuilder(); org.w3c.dom.Document document_trg = builder.parse(new File(trgxml)); org.w3c.dom.Element root_trg = document_trg.getDocumentElement(); NodeList list_target_paras_trg = root_trg.getElementsByTagName("trg_para"); while (list_target_paras_trg.getLength() != 0) { list_target_paras_trg.item(0).getParentNode().removeChild(list_target_paras_trg.item(0)); } org.w3c.dom.Document document_src = builder.parse(new File(this.alignedfile)); org.w3c.dom.Element root_src = document_src.getDocumentElement(); NodeList list_target_paras_src = root_src.getElementsByTagName("trg_para"); while (list_target_paras_src.getLength() != 0) { list_target_paras_src.item(0).getParentNode().removeChild(list_target_paras_src.item(0)); } Diff diff = DiffBuilder.compare(document_src).withTest(document_trg).checkForIdentical().build(); if (diff.hasDifferences()) { return false; } return true; } private String fixMissingTags(String s) { String result = s.replaceAll("(?s)<ins>.*?</ins>", "").replaceAll("(?s)<del>.*?</del>", ""); if (result.contains("<ins>")) { result = s + "</ins>"; } else if (result.contains("</ins>")) { result = "<ins>" + s; } else if (result.contains("<del>")) { result = s + "</del>"; } else if (result.contains("</del>")) { result = "<del>" + s; } return s; } private String getTxlfElementText_withFakeTC(org.dom4j.Element src) { String text = ""; ArrayList<String> node_ids = new ArrayList(); int start_idx = -1; int end_idx = 0; for (int j = 0; j < src.content().size(); j++) { if ((src.content().get(j) instanceof org.dom4j.Text)) { if ((((org.dom4j.Text) src.content().get(j)).getText().trim().equals("")) || (start_idx != -1)) { end_idx = j; } else { start_idx = j; } } } for (int j = 0; j < src.content().size(); j++) { if ((src.content().get(j) instanceof org.dom4j.Text)) { text = text + ((org.dom4j.Text) src.content().get(j)).getText().replace("&", "&") .replace("<", "<").replace(">", ">"); } else if ((src.content().get(j) instanceof org.dom4j.Element)) { org.dom4j.Element e = (org.dom4j.Element) src.content().get(j); if ((e.getName().equals("bx")) && (e.attribute("ctype").getValue().equals("x-strike-through"))) { text = text + "<del>"; node_ids.add(e.attribute("rid").getValue()); } else if (e.getName().equals("ex")) { if (node_ids.contains(e.attribute("rid").getValue())) { text = text + "</del>"; node_ids.remove(e.attribute("rid").getValue()); } } else if ((e.getName().equals("bpt")) && (e.attribute("ctype").getValue().equals("x-underlined")) && (e.getText().contains("type=\"1\""))) { text = text + "<ins>"; node_ids.add(e.attribute("rid").getValue()); } else if (e.getName().equals("ept")) { if (node_ids.contains(e.attribute("rid").getValue())) { text = text + "</ins>"; node_ids.remove(e.attribute("rid").getValue()); } } else if ((e.getName().equals("x")) && (e.attribute("ctype").getValue().equals("x-tab"))) { text = text + " "; } else if ((e.getName().equals("x")) && (e.attribute("ctype").getValue().equals("x-paragraphBreak")) && (j > start_idx) && (j < end_idx)) { text = text + " "; } else if ((e.getName().equals("x")) && (e.attribute("ctype").getValue().equals("lb")) && (j > start_idx) && (j < end_idx)) { text = text + "<br> "; } else if ((e.getName().equals("x")) && (e.attribute("ctype").getValue().equals("pb")) && (j > start_idx) && (j < end_idx)) { text = text + " "; } else if ((e.getName().equals("x")) && (e.attribute("equiv-text") != null) && (j > start_idx) && (j < end_idx)) { text = text + e.attributeValue("equiv-text"); } } } if (!text.contains("<ins>")) { if (src.selectNodes("..//*[name() = 'it'][@ctype = 'x-underlined'][@pos = 'open']").size() != 0) { org.dom4j.Node node = (org.dom4j.Node) src .selectNodes("..//*[name() = 'it'][@ctype = 'x-underlined'][@pos = 'open']").get(0); if (node.getText().contains("type=\"1\"")) { text = "<ins>" + text + "</ins>"; } } } else if ((!text.contains("<del>")) && (src .selectNodes("..//*[name() = 'it'][@ctype = 'x-strike-through'][@pos = 'open']").size() != 0)) { text = "<del>" + text + "</del>"; } return text; } private String getTxlfElementText_normal(org.dom4j.Element src) { String text = ""; int start_idx = -1; int end_idx = 0; for (int j = 0; j < src.content().size(); j++) { if ((src.content().get(j) instanceof org.dom4j.Text)) { if ((((org.dom4j.Text) src.content().get(j)).getText().trim().equals("")) || (start_idx != -1)) { end_idx = j; } else { start_idx = j; } } } for (int j = 0; j < src.content().size(); j++) { if ((src.content().get(j) instanceof org.dom4j.Text)) { text = text + ((org.dom4j.Text) src.content().get(j)).getText().replace("&", "&") .replace("<", "<").replace(">", ">"); } else if ((src.content().get(j) instanceof org.dom4j.Element)) { org.dom4j.Element e = (org.dom4j.Element) src.content().get(j); if ((e.getName().equals("x")) && (e.attribute("ctype").getValue().equals("x-tab"))) { text = text + " "; } else if ((e.getName().equals("x")) && (e.attribute("ctype").getValue().equals("x-paragraphBreak")) && (j > start_idx) && (j < end_idx)) { text = text + " "; } else if ((e.getName().equals("x")) && (e.attribute("ctype").getValue().equals("lb")) && (j > start_idx) && (j < end_idx)) { text = text + "<br> "; } else if ((e.getName().equals("x")) && (e.attribute("ctype").getValue().equals("pb")) && (j > start_idx) && (j < end_idx)) { text = text + " "; } else if ((e.getName().equals("x")) && (e.attribute("equiv-text") != null) && (j > start_idx) && (j < end_idx)) { text = text + e.attributeValue("equiv-text"); } } } return text; } private List replacetextinDomObj(List contents) { List clone = new ArrayList(); for (int j = 0; j < contents.size(); j++) { if ((!(contents.get(j) instanceof org.dom4j.Text)) || (!((org.dom4j.Text) contents.get(j)).getText().equals("[skipseg]"))) { if ((contents.get(j) instanceof org.dom4j.Element)) { clone.add(contents.get(j)); } } } return clone; } private String assembleText(List contents) { String text = ""; if (contents == null) { return ""; } int start_idx = -1; int end_idx = 0; for (int j = 0; j < contents.size(); j++) { if ((contents.get(j) instanceof org.dom4j.Text)) { if ((((org.dom4j.Text) contents.get(j)).getText().trim().equals("")) || (start_idx != -1)) { end_idx = j; } else { start_idx = j; } } } for (int j = 0; j < contents.size(); j++) { if ((contents.get(j) instanceof org.dom4j.Text)) { text = text + ((org.dom4j.Text) contents.get(j)).getText().replace("&", "&") .replace("<", "<").replace(">", ">"); } else if ((contents.get(j) instanceof org.dom4j.Element)) { org.dom4j.Element e = (org.dom4j.Element) contents.get(j); if ((e.getName().equals("x")) && (e.attribute("ctype").getValue().equals("x-tab"))) { text = text + " "; } else if ((e.getName().equals("x")) && (e.attribute("ctype").getValue().equals("lb")) && (j > start_idx) && (j < end_idx)) { text = text + "<br> "; } } } return text; } private String getTxlfElementText_noescape(org.dom4j.Element src) { String text = ""; int start_idx = -1; int end_idx = 0; for (int j = 0; j < src.content().size(); j++) { if ((src.content().get(j) instanceof org.dom4j.Text)) { if ((((org.dom4j.Text) src.content().get(j)).getText().trim().equals("")) || (start_idx != -1)) { end_idx = j; } else { start_idx = j; } } } for (int j = 0; j < src.content().size(); j++) { if ((src.content().get(j) instanceof org.dom4j.Text)) { text = text + ((org.dom4j.Text) src.content().get(j)).getText(); } else if ((src.content().get(j) instanceof org.dom4j.Element)) { org.dom4j.Element e = (org.dom4j.Element) src.content().get(j); if ((e.getName().equals("x")) && (e.attribute("ctype").getValue().equals("x-tab"))) { text = text + " "; } else if ((e.getName().equals("x")) && (e.attribute("ctype").getValue().equals("x-paragraphBreak")) && (j > start_idx) && (j < end_idx)) { text = text + " "; } else if ((e.getName().equals("x")) && (e.attribute("ctype").getValue().equals("lb")) && (j > start_idx) && (j < end_idx)) { text = text + "<br> "; } else if ((e.getName().equals("x")) && (e.attribute("ctype").getValue().equals("pb")) && (j > start_idx) && (j < end_idx)) { text = text + " "; } else if ((e.getName().equals("x")) && (e.attribute("equiv-text") != null) && (j > start_idx) && (j < end_idx)) { text = text + e.attributeValue("equiv-text"); } } } return text; } public boolean verifysegments() throws Exception { System.out.println("verifying segments mapping...."); boolean isValid = false; ExtractionSupportImpl extractionSupportImpl = new ExtractionSupportImpl( Locale.makeLocale(this.sourcelanguage), Locale.makeLocale(this.targetlanguage)); Configuration config = new BaseConfiguration(); config.setProperty("extraction.tokens.extract", "all"); extractionSupportImpl.setConfiguration(config); org.dom4j.Document document_source = XmlParser.parseXmlFile(this.sourcetxlf_seg); org.dom4j.Element root_source = document_source.getRootElement(); List list_source = root_source.selectNodes("//*[name() = 'trans-unit']"); int numberOfPara_source = list_source.size(); List<org.dom4j.Element> text_source = new ArrayList(); Iterator iter_source = list_source.iterator(); while (iter_source.hasNext()) { org.dom4j.Element source = ((org.dom4j.Element) iter_source.next()).element("source"); text_source.add(source); } List<String> mergedsegtext = new ArrayList(); org.dom4j.Document alignedxml = XmlParser.parseXmlFile(this.alignedfile); org.dom4j.Element root = alignedxml.getRootElement(); List groups = root.selectNodes("//*[name() = 'group']"); for (int i = 0; i < groups.size(); i++) { org.dom4j.Element group = (org.dom4j.Element) groups.get(i); List units = group.elements("unit"); ArrayList<String> keys = new ArrayList(); ArrayList<String> key_left = new ArrayList(); ArrayList<String> key_right = new ArrayList(); ArrayList<String> org_keys = new ArrayList(); for (int j = 0; j < units.size(); j++) { org.dom4j.Element unit = (org.dom4j.Element) units.get(j); org.dom4j.Element src_para = unit.element("src_para"); if (src_para != null) { List segs = src_para.element("segments").elements("src_seg"); for (int z = 0; z < segs.size(); z++) { org.dom4j.Element seg = (org.dom4j.Element) segs.get(z); keys.add(seg.getText().replaceAll("(?s)<del>.*?</del>", "").replaceAll("<(/)*ins>", "") .replace("<br>", "").replace("<", "<").replace(">", ">").replace("&", "&") .trim()); org_keys.add(seg.getText()); if ((z == 0) && (z == segs.size() - 1)) { key_left.add(src_para.attributeValue("lefttrim")); key_right.add(src_para.attributeValue("righttrim")); } else if (z == 0) { key_left.add(src_para.attributeValue("lefttrim")); key_right.add("true"); } else if (z == segs.size() - 1) { key_left.add("true"); key_right.add(src_para.attributeValue("righttrim")); } else { key_left.add("true"); key_right.add("true"); } } } } SegmenterFactory factory = new SegmenterFactory(); Configuration segconfig = createConfigForSegmenter(false, this.sourcelanguage); Segmenter segmenter = factory.getSegmenter("trados", Locale.makeLocale(this.sourcelanguage), segconfig); List<String> finsegs = segmenter.segment( group.elementText("text").replaceAll("(?s)<del>.*?</del>", "").replaceAll("<(/)*ins>", "") .replace("<br>", "").replace("<", "<").replace(">", ">").replace("&", "&")); ArrayList<ArrayList<Integer>> indices = new ArrayList(); int key_start_index = 0; ArrayList<Integer> indice; for (int k = 0; k < finsegs.size(); k++) { String finsegtext = (String) finsegs.get(k); String combined_key = ""; indice = new ArrayList(); for (int x = key_start_index; x < keys.size(); x++) { combined_key = combined_key + (String) keys.get(x); indice.add(Integer.valueOf(x)); if (combined_key.replace("", " ").trim().replaceAll("(\\s)+", "") .equals(finsegtext.replace("", " ").trim().replaceAll("(\\s)+", ""))) { indices.add(indice); key_start_index = x + 1; break; } } } for (int m = 0; m < indices.size(); m++) { ArrayList<Integer> temp_indice = (ArrayList) indices.get(m); String temp = ""; for (int it : temp_indice) { temp = temp + (String) org_keys.get(it); } mergedsegtext.add(temp); } } List<String> rejectedtexts = new ArrayList(); Workbook wb = new Workbook(); Worksheet ws = wb.getWorksheets().get(0); Cells cells = ws.getCells(); int count = Math.max(text_source.size(), mergedsegtext.size()); int t_count = 0; for (int i = 0; i < count; i++) { String t_src = ""; String t_fom = ""; if (i < text_source.size()) { org.dom4j.Element src = (org.dom4j.Element) text_source.get(i); for (int j = 0; j < src.content().size(); j++) { if ((src.content().get(j) instanceof org.dom4j.Text)) { t_src = t_src + ((org.dom4j.Text) src.content().get(j)).getText().replace("&", "&") .replace("<", "<").replace(">", ">"); } else if ((src.content().get(j) instanceof org.dom4j.Element)) { org.dom4j.Element e = (org.dom4j.Element) src.content().get(j); if ((e.getName().equals("x")) && (e.attribute("ctype").getValue().equals("x-tab"))) { t_src = t_src + " "; } else if ((e.getName().equals("x")) && (e.attribute("ctype").getValue().equals("lb"))) { t_src = t_src + "<br> "; } } } } cells.get(i, 0).setHtmlString(t_src); if (i < mergedsegtext.size()) { t_fom = ((String) mergedsegtext.get(i)).replace("<ins>", "<u>").replace("</ins>", "</u>") .replace("<del>", "<strike>").replace("</del>", "</strike>"); } String accepted_t_fom = t_fom.replaceAll("(?s)<strike>.*?</strike>", "").replace("<u>", "") .replace("</u>", "").replace("&paradel;", "").replace("<br>", ""); String rejected_t_fom = t_fom.replaceAll("(?s)<u>.*?</u>", "").replace("<strike>", "") .replace("</strike>", "").replace("&paradel;", "").replace("<br>", ""); if (extractionSupportImpl.isExtractable(accepted_t_fom)) { String input = "<html>" + t_fom.replace("<br> ", "↵<br>").replace("&paradel;", "<strike>¶</strike><br>") + "</html>"; cells.get(t_count, 1).setHtmlString(input); t_count++; if ((i < mergedsegtext.size()) && (TrackChangeHelper.getTrackChangeType((String) mergedsegtext.get(i)) == 3)) { rejectedtexts.add(rejected_t_fom); } else { rejectedtexts.add(""); } } } wb.save(this.prjfolder + File.separator + "verifySegs.xlsx"); if (numberOfPara_source == t_count) { System.out.println("result: TRUE source: " + numberOfPara_source + " formatted: " + t_count); isValid = true; String timestamp = new SimpleDateFormat("yyyyMMdd'T'HHmmss'Z'").format(new Date()); for (int r = 0; r < list_source.size(); r++) { if (!((String) rejectedtexts.get(r)).equals("")) { org.dom4j.Element transunit = (org.dom4j.Element) list_source.get(r); org.dom4j.Element originalbase = transunit.addElement("alt-trans"); org.dom4j.Element source = transunit.element("source"); org.dom4j.Element target = transunit.element("target"); source.addAttribute("gs4tr:seginfo", "<root username=\"TC Aligner\" timestamp=\"" + timestamp + "\"/>"); if (target != null) { transunit.elements().add(transunit.elements().indexOf(target) + 1, originalbase.clone()); } else { transunit.elements().add(transunit.elements().indexOf(source) + 1, originalbase.clone()); } transunit.remove(originalbase); org.dom4j.Element original = transunit.element("alt-trans"); original.addAttribute("alttranstype", "x-previous-source-version"); original.addAttribute("gs4tr:seginfo", "<root username=\"Original\" timestamp=\"" + timestamp + "\"/>"); org.dom4j.Element original_source = original.addElement("source"); original_source.addText((String) rejectedtexts.get(r)); original.addElement("target"); } } OutputStreamWriter writer = new OutputStreamWriter( new BufferedOutputStream(new FileOutputStream(this.sourcetxlf_seg)), "UTF8"); document_source.write(writer); writer.close(); } else { System.out.println("result: false source: " + numberOfPara_source + " formatted: " + t_count); } return isValid; } public void align() throws Exception { org.dom4j.Document document = DocumentHelper.createDocument(); org.dom4j.Element root = document.addElement("alinger"); org.dom4j.Element head = root.addElement("head"); head.addAttribute("src_lang", this.sourcelanguage); head.addAttribute("trg_lang", this.targetlanguage); head.addAttribute("creator", this.creatorid); ExtractionSupportImpl extractionSupportImpl = new ExtractionSupportImpl( Locale.makeLocale(this.sourcelanguage), Locale.makeLocale(this.targetlanguage)); Configuration config = new BaseConfiguration(); config.setProperty("extraction.tokens.extract", "all"); extractionSupportImpl.setConfiguration(config); org.dom4j.Element content = root.addElement("content"); org.dom4j.Element orphans = root.addElement("orphans"); com.aspose.words.Document doc_src = new com.aspose.words.Document(this.sourcefile); com.aspose.words.Document doc_trg = new com.aspose.words.Document(this.targetfile); doc_src.joinRunsWithSameFormatting(); trimParaLeadingTrailingSpace(doc_src); UnlinkFields(doc_src); doc_src.save(this.sourcefile + ".docx"); doc_trg.joinRunsWithSameFormatting(); trimParaLeadingTrailingSpace(doc_trg); UnlinkFields(doc_trg); int seqnum = 0; int srcparaindex = 0; int srcparaindex_accept = 0; int trgparaindex = 0; int srcparacnt = doc_src.getChildNodes(8, true).getCount(); int trgparacnt = doc_trg.getChildNodes(8, true).getCount(); boolean ismovefrom = false; boolean ismoveto = false; boolean isprvdelpara = false; int prv = 999999; int unitid = 0; for (int i = 0; i < srcparacnt; i++) { Paragraph para_src = (Paragraph) doc_src.getChildNodes(8, true).get(i); String para_text = getParaText(para_src); boolean isExtractable = extractionSupportImpl.isExtractable(para_text); boolean isNumeric = org.gs4tr.foundation3.core.utils.Text.isNumeric(para_text); if ((!para_text.equals("")) && (isExtractable) && (!isNumeric)) { String src_para_text = ""; boolean hasadds = false; boolean hasdels = false; boolean hasnorm = false; for (int j = 0; j < para_src.getChildNodes(0, true).getCount(); j++) { com.aspose.words.Node node = para_src.getChildNodes(0, true).get(j); if (node.getNodeType() == 13) { ismovefrom = true; } else if (node.getNodeType() == 14) { ismovefrom = false; } else if (node.getNodeType() == 15) { ismoveto = true; } else if (node.getNodeType() == 16) { ismoveto = false; } else if (node.getNodeType() == 21) { Run run = (Run) para_src.getChildNodes(0, true).get(j); if (!run.getFont().getName().equals("Wingdings")) { if ((run.isInsertRevision()) && (!run.isDeleteRevision())) { hasadds = true; src_para_text = src_para_text + "<ins>" + run.getText().replace("<", "<").replace(">", ">") + "</ins>"; } else if (run.isDeleteRevision()) { hasdels = true; src_para_text = src_para_text + "<del>" + run.getText().replace("<", "<").replace(">", ">") + "</del>"; } else if (ismoveto) { hasadds = true; src_para_text = src_para_text + "<ins>" + run.getText().replace("<", "<").replace(">", ">") + "</ins>"; } else if (ismovefrom) { hasdels = true; src_para_text = src_para_text + "<del>" + run.getText().replace("<", "<").replace(">", ">") + "</del>"; } else { hasnorm = true; src_para_text = src_para_text + run.getText().replace("<", "<").replace(">", ">"); } } } } org.dom4j.Element unit; org.dom4j.Element src_para; if ((hasadds) && (!hasdels) && (!hasnorm)) { unit = content.addElement("unit"); src_para = unit.addElement("src_para"); unit.addAttribute("id", Integer.toString(unitid++)); src_para.addAttribute("para_type", "insertion"); } else if ((!hasadds) && (!hasdels) && (hasnorm)) { unit = content.addElement("unit"); src_para = unit.addElement("src_para"); unit.addAttribute("id", Integer.toString(unitid++)); src_para.addAttribute("para_type", "regular"); } else if ((!hasadds) && (hasdels) && (!hasnorm)) { unit = content.addElement("unit"); src_para = unit.addElement("src_para"); unit.addAttribute("id", Integer.toString(unitid++)); src_para.addAttribute("para_type", "deletion"); } else { unit = content.addElement("unit"); src_para = unit.addElement("src_para"); unit.addAttribute("id", Integer.toString(unitid++)); src_para.addAttribute("para_type", "mix"); } src_para.addAttribute("para_seq", Integer.toString(srcparaindex)); src_para.addAttribute("para_seq_acpt", Integer.toString(srcparaindex_accept)); if (prv != srcparaindex_accept) { } prv = srcparaindex_accept; src_para.addText(wordToHtml(src_para_text)); if (((hasdels) || (hasnorm)) && (!isNumeric)) { if (trgparaindex < trgparacnt) { Paragraph para_trg = (Paragraph) doc_trg.getChildNodes(8, true).get(trgparaindex); String para_trg_text = getParaText(para_trg); boolean isExtractable_trg = extractionSupportImpl.isExtractable(para_trg_text); if (trgparaindex == trgparacnt - 1) { if ((!para_trg_text.equals("")) && (isExtractable_trg)) { org.dom4j.Element trg_para = unit.addElement("trg_para"); String trg_para_text = ""; for (int j = 0; j < para_trg.getChildNodes(21, true).getCount(); j++) { Run run = (Run) para_trg.getChildNodes(21, true).get(j); if (!run.getFont().getName().equals("Wingdings")) { trg_para_text = trg_para_text + run.getText().replace("<", "<").replace(">", ">"); } } trg_para.addAttribute("para_seq", Integer.toString(trgparaindex)); trg_para.addText(wordToHtml(trg_para_text)); } } else { while ((doc_trg.getChildNodes(8, true).get(trgparaindex).getText().trim().equals("")) || (!extractionSupportImpl.isExtractable( doc_trg.getChildNodes(8, true).get(trgparaindex).getText())) || (org.gs4tr.foundation3.core.utils.Text.isNumeric( doc_trg.getChildNodes(8, true).get(trgparaindex).getText().trim()))) { trgparaindex++; if (trgparaindex == trgparacnt - 1) { break; } } para_trg = (Paragraph) doc_trg.getChildNodes(8, true).get(trgparaindex); para_trg_text = getParaText(para_trg); if ((!para_trg_text.equals("")) && (extractionSupportImpl.isExtractable(para_trg_text)) && (!org.gs4tr.foundation3.core.utils.Text.isNumeric(para_trg_text))) { org.dom4j.Element trg_para = unit.addElement("trg_para"); String trg_para_text = ""; for (int j = 0; j < para_trg.getChildNodes(21, true).getCount(); j++) { Run run = (Run) para_trg.getChildNodes(21, true).get(j); if (!run.getFont().getName().equals("Wingdings")) { trg_para_text = trg_para_text + run.getText().replace("<", "<").replace(">", ">"); } } trg_para.addAttribute("para_seq", Integer.toString(trgparaindex)); trg_para.addText(wordToHtml(trg_para_text)); } } } trgparaindex++; } if (para_src.isDeleteRevision()) { isprvdelpara = true; } else { seqnum++; isprvdelpara = false; } if ((para_src.isEndOfCell()) || (para_src.isEndOfHeaderFooter()) || (para_src.isEndOfSection())) { seqnum++; isprvdelpara = false; } srcparaindex++; if (!isprvdelpara) { srcparaindex_accept++; } } else { for (int j = 0; j < para_src.getChildNodes(0, true).getCount(); j++) { com.aspose.words.Node node = para_src.getChildNodes(0, true).get(j); if (node.getNodeType() == 13) { ismovefrom = true; } else if (node.getNodeType() == 14) { ismovefrom = false; } else if (node.getNodeType() == 15) { ismoveto = true; } else if (node.getNodeType() == 16) { ismoveto = false; } } if (!para_src.isDeleteRevision()) { seqnum++; isprvdelpara = false; srcparaindex_accept++; } srcparaindex++; } } if (trgparaindex < trgparacnt) { for (int i = trgparaindex; i < trgparacnt; i++) { Paragraph para_trg = (Paragraph) doc_trg.getChildNodes(8, true).get(i); if ((!para_trg.getText().trim().equals("")) && (extractionSupportImpl.isExtractable(para_trg.getText()))) { org.dom4j.Element unit = content.addElement("unit"); unit.addAttribute("id", Integer.toString(seqnum)); org.dom4j.Element trg_para = unit.addElement("trg_para"); String trg_para_text = ""; for (int j = 0; j < para_trg.getChildNodes(21, true).getCount(); j++) { Run run = (Run) para_trg.getChildNodes(21, true).get(j); if (!run.getFont().getName().equals("Wingdings")) { trg_para_text = trg_para_text + run.getText().replace("<", "<").replace(">", ">"); } } trg_para.addAttribute("para_seq", Integer.toString(trgparaindex)); trg_para.addText(wordToHtml(trg_para_text)); seqnum++; trgparaindex++; } } } this.alignedfile = (new File(new File(this.sourcefile).getParent()).getParent() + "/rev_aligned.xml"); if (new File(this.alignedfile).exists()) { new File(this.alignedfile).delete(); } OutputStreamWriter writer = new OutputStreamWriter( new BufferedOutputStream(new FileOutputStream(this.alignedfile)), "UTF8"); document.write(writer); writer.close(); } public void readAlignedFile() throws Exception { this.src_paras = new LinkedHashMap(); this.trg_paras = new LinkedHashMap(); this.missing_trg_paras = new LinkedHashMap(); this.locked_para_seqs = new LinkedHashMap(); this.aligned_para_seqs = new LinkedHashMap(); this.src_segs = new LinkedHashMap(); this.trg_segs = new LinkedHashMap(); org.dom4j.Document document = XmlParser.parseXmlFile(this.alignedfile); List<org.dom4j.Element> groups = document.getRootElement().element("aligned").elements("group"); for (int i = 0; i < groups.size(); i++) { org.dom4j.Element group = (org.dom4j.Element) groups.get(i); List<org.dom4j.Element> units = group.elements("unit"); for (int j = 0; j < units.size(); j++) { org.dom4j.Element unit = (org.dom4j.Element) units.get(j); String seq = Integer.toString(i) + " - " + Integer.toString(j); if (unit.attributeValue("locked").equals("true")) { this.locked_para_seqs.put(Integer.toString(this.locked_para_seqs.size()), seq); } if (unit.attributeValue("alignsegs").equals("true")) { this.aligned_para_seqs.put(Integer.toString(this.aligned_para_seqs.size()), seq); } String src = ""; String src_tctype = ""; if (unit.element("src_para") != null) { src = unit.element("src_para").element("text").getText(); src_tctype = unit.element("src_para").attributeValue("tctype"); List<String> st = new ArrayList(); st.add(encodeHtml(src.replaceAll("<ins>(\\s)+</ins>", "<ins> </ins>") .replaceAll("<del>(\\s)+</del>", "<del> </del>"))); st.add(seq); st.add(src_tctype); this.src_paras.put(Integer.toString(this.src_paras.size()), st); List<org.dom4j.Element> src_segs = unit.element("src_para").element("segments") .elements("src_seg"); String segstring = ""; String segstctypestring = ""; for (int t = 0; t < src_segs.size(); t++) { org.dom4j.Element seg = (org.dom4j.Element) src_segs.get(t); if (t == src_segs.size() - 1) { segstring = segstring + seg.getText(); segstctypestring = segstctypestring + seg.attributeValue("tctype"); } else { segstring = segstring + seg.getText() + "|||"; segstctypestring = segstctypestring + seg.attributeValue("tctype") + "|||"; } } List<String> sts = new ArrayList(); sts.add(encodeHtml(segstring)); sts.add(seq); sts.add(segstctypestring); this.src_segs.put(Integer.toString(this.src_segs.size()), sts); } org.dom4j.Element trg_para = unit.element("trg_para"); String trg = trg_para.element("text").getText(); String trg_seq = trg_para.attributeValue("id"); List<String> st = new ArrayList(); st.add(encodeHtml(trg)); st.add(trg_seq); this.trg_paras.put(Integer.toString(this.trg_paras.size()), st); List<org.dom4j.Element> trg_segs = unit.element("trg_para").element("segments").elements("trg_seg"); String segstring = ""; for (int t = 0; t < trg_segs.size(); t++) { org.dom4j.Element seg = (org.dom4j.Element) trg_segs.get(t); if (t == trg_segs.size() - 1) { segstring = segstring + seg.getText(); } else { segstring = segstring + seg.getText() + "|||"; } } List<String> sts = new ArrayList(); sts.add(encodeHtml(segstring)); sts.add(trg_seq); this.trg_segs.put(Integer.toString(this.trg_segs.size()), sts); } } org.dom4j.Element orphans = document.getRootElement().element("orphans"); List<org.dom4j.Element> units = orphans.elements("unit"); for (int i = 0; i < units.size(); i++) { org.dom4j.Element trg_para = ((org.dom4j.Element) units.get(i)).element("trg_para"); String trgtext = trg_para.element("text").getText(); String trg_seq = trg_para.attributeValue("id"); List<String> st = new ArrayList(); st.add(encodeHtml(trgtext)); st.add(trg_seq); this.missing_trg_paras.put(Integer.toString(this.missing_trg_paras.size()), st); List<org.dom4j.Element> trg_segs = trg_para.element("segments").elements("trg_seg"); String segstring = ""; for (int t = 0; t < trg_segs.size(); t++) { org.dom4j.Element seg = (org.dom4j.Element) trg_segs.get(t); if (t == trg_segs.size() - 1) { segstring = segstring + seg.getText(); } else { segstring = segstring + seg.getText() + "|||"; } } List<String> sts = new ArrayList(); sts.add(encodeHtml(segstring)); sts.add(trg_seq); this.trg_segs.put(Integer.toString(this.trg_segs.size()), sts); } } public String createTM(String prjid, boolean reviewed) throws Exception { String tm_add = this.prjfolder + File.separator + (reviewed ? "reviewed_tm_" : "aligned_tm_") + prjid + "_" + this.sourcelanguage + "_" + this.targetlanguage; StringBuffer sb = new StringBuffer(); DateFormat dateFormat = new SimpleDateFormat("yyyyMMdd~HHmmss"); Date date = new Date(); String TNow = dateFormat.format(date); String dummyTMHeader = "%" + TNow + "\t%User System\t%TU=00000000\t%" + this.sourcelanguage + "\t%Wordfast TM v.546/00\t%" + this.targetlanguage + "\t%-----------\t\t\t\t\t .\r\n"; sb.append(dummyTMHeader); org.dom4j.Document document = XmlParser.parseXmlFile(this.alignedfile); List<org.dom4j.Element> groups = document.getRootElement().element("aligned").elements("group"); for (int i = 0; i < groups.size(); i++) { org.dom4j.Element group = (org.dom4j.Element) groups.get(i); List<org.dom4j.Element> units = group.elements("unit"); for (int j = 0; j < units.size(); j++) { org.dom4j.Element unit = (org.dom4j.Element) units.get(j); if (unit.element("src_para") != null) { List<org.dom4j.Element> src_segs = unit.element("src_para").element("segments") .elements("src_seg"); List<org.dom4j.Element> trg_segs = unit.element("trg_para").element("segments") .elements("trg_seg"); for (int t = 0; t < src_segs.size(); t++) { org.dom4j.Element seg = (org.dom4j.Element) src_segs.get(t); if (seg.attributeValue("locked").equals("true")) { String srctext = seg.getText(); String trgtext = ((org.dom4j.Element) trg_segs.get(t)).getText(); String TUline = TNow + "\t" + (reviewed ? "System" : "ALIGN!") + "\t0\t" + this.sourcelanguage + "\t" + srctext + "\t" + this.targetlanguage + "\t" + trgtext; sb.append(TUline); sb.append("\r\n"); } } } } } Writer out = new OutputStreamWriter(new FileOutputStream(tm_add), "UTF-8"); out.write(sb.toString()); out.close(); return tm_add; } public void readAlignedFile_seg(boolean useautosaveddata) throws Exception { this.src_segs = new LinkedHashMap(); this.trg_segs = new LinkedHashMap(); this.missing_trg_segs = new LinkedHashMap(); this.locked_seg_seqs = new LinkedHashMap(); this.review_seg_seqs = new LinkedHashMap(); this.ignore_seg_seqs = new LinkedHashMap(); HashSet<String> lock_tmp = new HashSet(); org.dom4j.Document document = null; if (useautosaveddata) { document = XmlParser.parseXmlFile(this.auto_saved_alignedfile); } else { document = XmlParser.parseXmlFile(this.alignedfile); } List<org.dom4j.Element> groups = document.getRootElement().element("aligned").elements("group"); int maxnullcnt = 0; for (int i = 0; i < groups.size(); i++) { org.dom4j.Element group = (org.dom4j.Element) groups.get(i); List<org.dom4j.Element> units = group.elements("unit"); for (int j = 0; j < units.size(); j++) { org.dom4j.Element unit = (org.dom4j.Element) units.get(j); if (unit.element("src_para") != null) { List<org.dom4j.Element> src_segs = unit.element("src_para").element("segments") .elements("src_seg"); for (int t = 0; t < src_segs.size(); t++) { org.dom4j.Element seg = (org.dom4j.Element) src_segs.get(t); String seq = Integer.toString(i) + " - " + Integer.toString(j) + " - " + Integer.toString(t); List<String> sts = new ArrayList(); sts.add(seq); sts.add(seg.attributeValue("tctype")); sts.add(encodeHtml(seg.getText())); this.src_segs.put(Integer.toString(this.src_segs.size()), sts); if (seg.attributeValue("locked").equals("true")) { this.locked_seg_seqs.put(Integer.toString(this.locked_seg_seqs.size()), seq); lock_tmp.add(seq); } if (seg.attributeValue("needreview").equals("true")) { this.review_seg_seqs.put(Integer.toString(this.review_seg_seqs.size()), seq); } if (seg.attributeValue("ignored").equals("true")) { this.ignore_seg_seqs.put(Integer.toString(this.ignore_seg_seqs.size()), seq); } } } if (unit.element("trg_para") != null) { List<org.dom4j.Element> trg_segs = unit.element("trg_para").element("segments") .elements("trg_seg"); for (int t = 0; t < trg_segs.size(); t++) { if (((org.dom4j.Element) trg_segs.get(t)).attributeValue("id").startsWith("n - ")) { int tmpcnt = Integer.parseInt( ((org.dom4j.Element) trg_segs.get(t)).attributeValue("id").replace("n - ", "")); maxnullcnt = Math.max(maxnullcnt, tmpcnt); } } } } } this.nullcnt = (maxnullcnt + 1); boolean isstart = true; for (int i = 0; i < groups.size(); i++) { org.dom4j.Element group = (org.dom4j.Element) groups.get(i); List<org.dom4j.Element> units = group.elements("unit"); for (int j = 0; j < units.size(); j++) { org.dom4j.Element unit = (org.dom4j.Element) units.get(j); List<org.dom4j.Element> trgsegs = unit.element("trg_para").element("segments").elements("trg_seg"); for (int t = 0; t < trgsegs.size(); t++) { org.dom4j.Element seg = (org.dom4j.Element) trgsegs.get(t); List<String> sts = new ArrayList(); if ((seg.getText().equals("")) && (!seg.attributeValue("id").contains("-"))) { if ((this.trg_segs.size() < this.src_segs.size()) && ((lock_tmp.contains( ((List) this.src_segs.get(Integer.toString(this.trg_segs.size()))).get(0))) || (isstart))) { String seq = "n - " + Integer.toString(this.nullcnt); sts.add(seq); sts.add(""); this.trg_segs.put(Integer.toString(this.trg_segs.size()), sts); this.nullcnt += 1; sts = new ArrayList(); } } else { isstart = false; if (seg.attributeValue("id").contains("-")) { sts.add(seg.attributeValue("id")); sts.add(encodeHtml(seg.getText())); this.trg_segs.put(Integer.toString(this.trg_segs.size()), sts); } else { while ((this.trg_segs.size() < this.src_segs.size()) && (lock_tmp.contains( ((List) this.src_segs.get(Integer.toString(this.trg_segs.size()))).get(0)))) { String seq = "n - " + Integer.toString(this.nullcnt); sts.add(seq); sts.add(""); this.trg_segs.put(Integer.toString(this.trg_segs.size()), sts); this.nullcnt += 1; sts = new ArrayList(); } String seq = Integer.toString(i) + " - " + Integer.toString(j) + " - " + Integer.toString(t); sts.add(seq); sts.add(encodeHtml(seg.getText())); this.trg_segs.put(Integer.toString(this.trg_segs.size()), sts); } } } } } if (this.trg_segs.size() < this.src_segs.size()) { List<String> sts = new ArrayList(); for (int x = this.trg_segs.size(); x < this.src_segs.size(); x++) { String seq = "n - " + Integer.toString(this.nullcnt); sts.add(seq); sts.add(""); this.trg_segs.put(Integer.toString(this.trg_segs.size()), sts); this.nullcnt += 1; sts = new ArrayList(); } } org.dom4j.Element orphans = document.getRootElement().element("orphans"); List<org.dom4j.Element> units = orphans.elements("unit"); for (int i = 0; i < units.size(); i++) { org.dom4j.Element trg_para = ((org.dom4j.Element) units.get(i)).element("trg_para"); List<org.dom4j.Element> trg_segs = trg_para.element("segments").elements("trg_seg"); for (int t = 0; t < trg_segs.size(); t++) { org.dom4j.Element seg = (org.dom4j.Element) trg_segs.get(t); List<String> sts = new ArrayList(); sts.add(seg.attributeValue("id")); sts.add(seg.getText()); this.missing_trg_segs.put(Integer.toString(this.missing_trg_segs.size()), sts); } } } public String readAlignedFileForRatio(String alignedfile) throws Exception { String ratio = "0"; org.dom4j.Document document = XmlParser.parseXmlFile(alignedfile); List<org.dom4j.Element> groups = document.getRootElement().element("aligned").elements("group"); int lockedtargets = 0; int removedtargets = 0; int totaltargets = 0; for (int i = 0; i < groups.size(); i++) { org.dom4j.Element group = (org.dom4j.Element) groups.get(i); List<org.dom4j.Element> units = group.elements("unit"); for (int j = 0; j < units.size(); j++) { org.dom4j.Element unit = (org.dom4j.Element) units.get(j); org.dom4j.Element src_para = unit.element("src_para"); if (src_para != null) { List<org.dom4j.Element> src_segs = src_para.element("segments").elements("src_seg"); for (int z = 0; z < src_segs.size(); z++) { org.dom4j.Element src_seg = (org.dom4j.Element) src_segs.get(z); totaltargets++; if ((src_seg.attribute("locked") != null) && (src_seg.attributeValue("locked").equals("true"))) { lockedtargets++; } } } } } List<org.dom4j.Element> missings = document.getRootElement().element("orphans").elements("unit"); for (int i = 0; i < missings.size(); i++) { List<org.dom4j.Element> trgs = ((org.dom4j.Element) missings.get(i)).element("trg_para") .element("segments").elements("trg_seg"); removedtargets += trgs.size(); } ratio = Double.toString(lockedtargets / (removedtargets + totaltargets)); while (ratio.length() < 4) { ratio = ratio + "0"; } ratio = ratio.substring(0, 4); return ratio; } public void readAlignedFile_seg_auto(boolean useautosaveddata) throws Exception { this.src_segs = new LinkedHashMap(); this.trg_segs = new LinkedHashMap(); this.missing_trg_segs = new LinkedHashMap(); this.locked_seg_seqs = new LinkedHashMap(); this.review_seg_seqs = new LinkedHashMap(); this.ignore_seg_seqs = new LinkedHashMap(); HashSet<String> lock_tmp = new HashSet(); this.auto_saved_alignedfile = (this.alignedfile + ".temp"); org.dom4j.Document document = null; if (useautosaveddata) { document = XmlParser.parseXmlFile(this.auto_saved_alignedfile); } else { document = XmlParser.parseXmlFile(this.alignedfile); } List<org.dom4j.Element> groups = document.getRootElement().element("aligned").elements("group"); int maxnullcnt = 0; for (int i = 0; i < groups.size(); i++) { org.dom4j.Element group = (org.dom4j.Element) groups.get(i); List<org.dom4j.Element> units = group.elements("unit"); for (int j = 0; j < units.size(); j++) { org.dom4j.Element unit = (org.dom4j.Element) units.get(j); if (unit.element("src_para") != null) { List<org.dom4j.Element> src_segs = unit.element("src_para").element("segments") .elements("src_seg"); for (int t = 0; t < src_segs.size(); t++) { org.dom4j.Element seg = (org.dom4j.Element) src_segs.get(t); String seq = Integer.toString(i) + " - " + Integer.toString(j) + " - " + Integer.toString(t); List<String> sts = new ArrayList(); sts.add(seq); sts.add(seg.attributeValue("tctype")); sts.add(encodeHtml(seg.getText())); this.src_segs.put(Integer.toString(this.src_segs.size()), sts); if (seg.attributeValue("locked").equals("true")) { this.locked_seg_seqs.put(Integer.toString(this.locked_seg_seqs.size()), seq); lock_tmp.add(seq); } if (seg.attributeValue("needreview").equals("true")) { this.review_seg_seqs.put(Integer.toString(this.review_seg_seqs.size()), seq); } if (seg.attributeValue("ignored").equals("true")) { this.ignore_seg_seqs.put(Integer.toString(this.ignore_seg_seqs.size()), seq); } } } if (unit.element("trg_para") != null) { List<org.dom4j.Element> trg_segs = unit.element("trg_para").element("segments") .elements("trg_seg"); for (int t = 0; t < trg_segs.size(); t++) { if (((org.dom4j.Element) trg_segs.get(t)).attributeValue("id").startsWith("n - ")) { int tmpcnt = Integer.parseInt( ((org.dom4j.Element) trg_segs.get(t)).attributeValue("id").replace("n - ", "")); maxnullcnt = Math.max(maxnullcnt, tmpcnt); } } } } } this.nullcnt = (maxnullcnt + 1); for (int i = 0; i < groups.size(); i++) { org.dom4j.Element group = (org.dom4j.Element) groups.get(i); List<org.dom4j.Element> units = group.elements("unit"); for (int j = 0; j < units.size(); j++) { org.dom4j.Element unit = (org.dom4j.Element) units.get(j); List<org.dom4j.Element> trgsegs = unit.element("trg_para").element("segments").elements("trg_seg"); for (int t = 0; t < trgsegs.size(); t++) { org.dom4j.Element seg = (org.dom4j.Element) trgsegs.get(t); List<String> sts = new ArrayList(); String seq = seg.attributeValue("id"); sts.add(seq); sts.add(encodeHtml(seg.getText())); this.trg_segs.put(Integer.toString(this.trg_segs.size()), sts); } } } org.dom4j.Element orphans = document.getRootElement().element("orphans"); List<org.dom4j.Element> units = orphans.elements("unit"); for (int i = 0; i < units.size(); i++) { org.dom4j.Element trg_para = ((org.dom4j.Element) units.get(i)).element("trg_para"); List<org.dom4j.Element> trg_segs = trg_para.element("segments").elements("trg_seg"); for (int t = 0; t < trg_segs.size(); t++) { org.dom4j.Element seg = (org.dom4j.Element) trg_segs.get(t); List<String> sts = new ArrayList(); sts.add(seg.attributeValue("id")); sts.add(encodeHtml(seg.getText())); this.missing_trg_segs.put(Integer.toString(this.missing_trg_segs.size()), sts); } } } public void update(JSONArray arr, JSONArray missings, JSONArray locks, JSONArray segaligned, JSONArray targets, JSONArray missing_targets, int cnt) throws Exception { File alignedFile = new File(this.alignedfile); if (!alignedFile.exists()) { throw new FileNotFoundException("Could not find aligned xml file"); } ExtractionSupportImpl extractionSupportImpl = new ExtractionSupportImpl( Locale.makeLocale(this.sourcelanguage), Locale.makeLocale(this.targetlanguage)); Configuration config = new BaseConfiguration(); config.setProperty("extraction.tokens.extract", "all"); extractionSupportImpl.setConfiguration(config); List<JSONArray> trg_list = new ArrayList(); for (int i = 0; i < arr.length(); i++) { trg_list.add(arr.getJSONArray(i)); } List<JSONArray> missings_list = new ArrayList(); for (int i = 0; i < missings.length(); i++) { missings_list.add(missings.getJSONArray(i)); } List<String> locks_list = new ArrayList(); for (int i = 0; i < locks.length(); i++) { locks_list.add(locks.getString(i)); } List<String> segaligned_list = new ArrayList(); for (int i = 0; i < segaligned.length(); i++) { segaligned_list.add(segaligned.getString(i)); } this.nullcnt = cnt; int unitcnt = 0; org.dom4j.Document document = XmlParser.parseXmlFile(this.alignedfile); org.dom4j.Element root = document.getRootElement(); List<org.dom4j.Element> groups = document.getRootElement().element("aligned").elements("group"); for (int i = 0; i < groups.size(); i++) { org.dom4j.Element group = (org.dom4j.Element) groups.get(i); List<org.dom4j.Element> units = group.elements("unit"); for (int j = 0; j < units.size(); j++) { unitcnt++; org.dom4j.Element unit = (org.dom4j.Element) units.get(j); String seq = i + " - " + j; if (locks_list.contains(seq)) { unit.attribute("locked").setValue("true"); } else { unit.attribute("locked").setValue("false"); } if (segaligned_list.contains(seq)) { unit.attribute("alignsegs").setValue("true"); } else { unit.attribute("alignsegs").setValue("false"); } if (unitcnt <= trg_list.size()) { JSONArray segs = (JSONArray) trg_list.get(unitcnt - 1); String trg_para_text = targets.getString(unitcnt - 1); org.dom4j.Element trg_para = unit.element("trg_para"); if (trg_para != null) { trg_para.clearContent(); } else { trg_para = unit.addElement("trg_para"); } trg_para.addAttribute("id", segs.getString(0)); org.dom4j.Element trg_text = trg_para.addElement("text"); org.dom4j.Element trg_segs = trg_para.addElement("segments"); for (int s = 1; s < segs.length(); s++) { org.dom4j.Element trg_seg = trg_segs.addElement("trg_seg"); trg_seg.addAttribute("id", Integer.toString(s - 1)); trg_seg.addAttribute("isExtractable", Boolean.toString(extractionSupportImpl.isExtractable(segs.getString(s)))); trg_seg.setText(decodehtmlstring(segs.getString(s))); } trg_text.setText(decodehtmlstring(trg_para_text)); } else { units.remove(j); j--; } } if (group.elements("unit").size() == 0) { groups.remove(i); i--; } } org.dom4j.Element orphans = root.element("orphans"); orphans.clearContent(); for (int i = 0; i < missings_list.size(); i++) { JSONArray segs = (JSONArray) missings_list.get(i); String trg_para_text = missing_targets.getString(i); org.dom4j.Element unit = orphans.addElement("unit"); unit.addAttribute("id", Integer.toString(i)); org.dom4j.Element trg_para = unit.addElement("trg_para"); trg_para.addAttribute("id", segs.getString(0)); org.dom4j.Element trg_text = trg_para.addElement("text"); org.dom4j.Element trg_segs = trg_para.addElement("segments"); for (int s = 1; s < segs.length(); s++) { org.dom4j.Element trg_seg = trg_segs.addElement("trg_seg"); trg_seg.addAttribute("id", Integer.toString(s)); trg_seg.addAttribute("isExtractable", Boolean.toString(extractionSupportImpl.isExtractable(segs.getString(s)))); trg_seg.setText(decodehtmlstring(segs.getString(s))); } trg_text.setText(decodehtmlstring(trg_para_text)); } new File(this.alignedfile).delete(); OutputStreamWriter writer = new OutputStreamWriter( new BufferedOutputStream(new FileOutputStream(this.alignedfile)), "UTF8"); document.write(writer); writer.close(); } public void update_seg(JSONArray targets, JSONArray trg_seqs, JSONArray missing_targets, JSONArray missing_trg_seqs, JSONArray locks, int cnt, JSONArray edited, JSONArray review, JSONArray ignore) throws Exception { File alignedFile = new File(this.alignedfile); if (!alignedFile.exists()) { throw new FileNotFoundException("Could not find aligned xml file"); } ExtractionSupportImpl extractionSupportImpl = new ExtractionSupportImpl( Locale.makeLocale(this.sourcelanguage), Locale.makeLocale(this.targetlanguage)); Configuration config = new BaseConfiguration(); config.setProperty("extraction.tokens.extract", "all"); extractionSupportImpl.setConfiguration(config); List<String> trg_list = new ArrayList(); for (int i = 0; i < targets.length(); i++) { trg_list.add(targets.getString(i)); } List<String> trg_seq_list = new ArrayList(); for (int i = 0; i < trg_seqs.length(); i++) { trg_seq_list.add(trg_seqs.getString(i)); } List<String> missing_list = new ArrayList(); for (int i = 0; i < missing_targets.length(); i++) { missing_list.add(missing_targets.getString(i)); } List<String> missing_seq_list = new ArrayList(); for (int i = 0; i < missing_trg_seqs.length(); i++) { missing_seq_list.add(missing_trg_seqs.getString(i)); } List<String> locks_list = new ArrayList(); for (int i = 0; i < locks.length(); i++) { locks_list.add(locks.getString(i)); } this.nullcnt = cnt; List<String> edited_list = new ArrayList(); for (int i = 0; i < edited.length(); i++) { edited_list.add(edited.getString(i)); } List<String> review_list = new ArrayList(); for (int i = 0; i < review.length(); i++) { review_list.add(review.getString(i)); } List<String> ignore_list = new ArrayList(); for (int i = 0; i < ignore.length(); i++) { ignore_list.add(ignore.getString(i)); } int segcnt = 0; org.dom4j.Document document = XmlParser.parseXmlFile(this.alignedfile); org.dom4j.Element root = document.getRootElement(); List<org.dom4j.Element> groups = document.getRootElement().element("aligned").elements("group"); for (int i = 0; i < groups.size(); i++) { org.dom4j.Element group = (org.dom4j.Element) groups.get(i); List<org.dom4j.Element> units = group.elements("unit"); for (int j = 0; j < units.size(); j++) { org.dom4j.Element unit = (org.dom4j.Element) units.get(j); unit.addAttribute("alignsegs", "true"); org.dom4j.Element src_para = unit.element("src_para"); if (src_para != null) { List<org.dom4j.Element> src_segs = src_para.element("segments").elements("src_seg"); for (int z = 0; z < src_segs.size(); z++) { org.dom4j.Element src_seg = (org.dom4j.Element) src_segs.get(z); String seq = i + " - " + j + " - " + z; if (locks_list.contains(seq)) { src_seg.attribute("locked").setValue("true"); } else { src_seg.attribute("locked").setValue("false"); } if (review_list.contains(seq)) { src_seg.attribute("needreview").setValue("true"); } else { src_seg.attribute("needreview").setValue("false"); } if (ignore_list.contains(seq)) { src_seg.attribute("ignored").setValue("true"); } else { src_seg.attribute("ignored").setValue("false"); } } org.dom4j.Element trg_para = unit.element("trg_para"); trg_para.remove(trg_para.element("text")); List<org.dom4j.Element> trg_segs = trg_para.element("segments").elements("trg_seg"); for (int z = 0; z < trg_segs.size(); z++) { org.dom4j.Element trg_seg = (org.dom4j.Element) trg_segs.get(z); if (z >= src_segs.size()) { trg_para.element("segments").remove(trg_seg); } else if (segcnt < trg_list.size()) { trg_seg.addAttribute("id", decodehtmlstring((String) trg_seq_list.get(segcnt))); if ((edited_list.contains(decodehtmlstring((String) trg_seq_list.get(segcnt)))) || (decodehtmlstring((String) trg_seq_list.get(segcnt)).startsWith("n - "))) { trg_seg.addAttribute("edited", "true"); } String text = decodehtmlstring((String) trg_list.get(segcnt)); trg_seg.addAttribute("isExtractable", Boolean.toString(extractionSupportImpl.isExtractable(text))); trg_seg.setText(text); segcnt++; } else { trg_para.element("segments").remove(trg_seg); } } if (trg_para.element("segments").elements("trg_seg").size() == 0) { group.remove(unit); } } else { org.dom4j.Element trg_para = unit.element("trg_para"); trg_para.remove(trg_para.element("text")); List<org.dom4j.Element> trg_segs = trg_para.element("segments").elements("trg_seg"); for (int z = 0; z < trg_segs.size(); z++) { org.dom4j.Element trg_seg = (org.dom4j.Element) trg_segs.get(z); if (segcnt < trg_list.size()) { trg_seg.addAttribute("id", decodehtmlstring((String) trg_seq_list.get(segcnt))); if ((edited_list.contains(decodehtmlstring((String) trg_seq_list.get(segcnt)))) || (decodehtmlstring((String) trg_seq_list.get(segcnt)).startsWith("n - "))) { trg_seg.addAttribute("edited", "true"); } String text = decodehtmlstring((String) trg_list.get(segcnt)); trg_seg.addAttribute("isExtractable", Boolean.toString(extractionSupportImpl.isExtractable(text))); trg_seg.setText(text); segcnt++; } else { trg_para.element("segments").remove(trg_seg); } } if (trg_para.element("segments").elements("trg_seg").size() == 0) { group.remove(unit); } } } if (group.elements("unit").size() == 0) { groups.remove(i); i--; } } if (segcnt < trg_list.size()) { org.dom4j.Element group = root.element("aligned").addElement("group"); group.addAttribute("id", Integer.toString(groups.size())); org.dom4j.Element unit = group.addElement("unit"); unit.addAttribute("id", "0"); org.dom4j.Element trg_para = unit.addElement("trg_para"); trg_para.addAttribute("id", Integer.toString(groups.size()) + " - 0"); org.dom4j.Element trgsegs = trg_para.addElement("segments"); for (int x = segcnt; x < trg_list.size(); x++) { String text = decodehtmlstring((String) trg_list.get(x)); org.dom4j.Element trg_seg = trgsegs.addElement("trg_seg"); trg_seg.addAttribute("id", decodehtmlstring((String) trg_seq_list.get(x))); if ((edited_list.contains(decodehtmlstring((String) trg_seq_list.get(x)))) || (decodehtmlstring((String) trg_seq_list.get(x)).startsWith("n - "))) { trg_seg.addAttribute("edited", "true"); } trg_seg.addAttribute("isExtractable", Boolean.toString(extractionSupportImpl.isExtractable(text))); trg_seg.setText(text); } } org.dom4j.Element orphans = root.element("orphans"); orphans.clearContent(); org.dom4j.Element unit = orphans.addElement("unit"); unit.addAttribute("id", "0"); org.dom4j.Element trg_para = unit.addElement("trg_para"); trg_para.addAttribute("id", "0 - 0"); org.dom4j.Element trg_segs = trg_para.addElement("segments"); for (int i = 0; i < missing_list.size(); i++) { org.dom4j.Element trg_seg = trg_segs.addElement("trg_seg"); String text = decodehtmlstring((String) missing_list.get(i)); trg_seg.addAttribute("id", decodehtmlstring((String) missing_seq_list.get(i))); if ((edited_list.contains(decodehtmlstring((String) missing_seq_list.get(i)))) || (decodehtmlstring((String) trg_seq_list.get(i)).startsWith("n - "))) { trg_seg.addAttribute("edited", "true"); } trg_seg.addAttribute("isExtractable", Boolean.toString(extractionSupportImpl.isExtractable(text))); trg_seg.setText(text); } new File(this.alignedfile).delete(); OutputStreamWriter writer = new OutputStreamWriter( new BufferedOutputStream(new FileOutputStream(this.alignedfile)), "UTF8"); document.write(writer); writer.close(); if (new File(this.auto_saved_alignedfile).exists()) { new File(this.auto_saved_alignedfile).delete(); } } public void auto_update_seg(JSONArray targets, JSONArray trg_seqs, JSONArray missing_targets, JSONArray missing_trg_seqs, JSONArray locks, int cnt, JSONArray edited, JSONArray review, JSONArray ignore) throws Exception { File alignedFile = new File(this.alignedfile); if (!alignedFile.exists()) { throw new FileNotFoundException("Could not find aligned xml file"); } ExtractionSupportImpl extractionSupportImpl = new ExtractionSupportImpl( Locale.makeLocale(this.sourcelanguage), Locale.makeLocale(this.targetlanguage)); Configuration config = new BaseConfiguration(); config.setProperty("extraction.tokens.extract", "all"); extractionSupportImpl.setConfiguration(config); List<String> trg_list = new ArrayList(); for (int i = 0; i < targets.length(); i++) { trg_list.add(targets.getString(i)); } List<String> trg_seq_list = new ArrayList(); for (int i = 0; i < trg_seqs.length(); i++) { trg_seq_list.add(trg_seqs.getString(i)); } List<String> missing_list = new ArrayList(); for (int i = 0; i < missing_targets.length(); i++) { missing_list.add(missing_targets.getString(i)); } List<String> missing_seq_list = new ArrayList(); for (int i = 0; i < missing_trg_seqs.length(); i++) { missing_seq_list.add(missing_trg_seqs.getString(i)); } List<String> locks_list = new ArrayList(); for (int i = 0; i < locks.length(); i++) { locks_list.add(locks.getString(i)); } this.nullcnt = cnt; List<String> edited_list = new ArrayList(); for (int i = 0; i < edited.length(); i++) { edited_list.add(edited.getString(i)); } List<String> review_list = new ArrayList(); for (int i = 0; i < review.length(); i++) { review_list.add(review.getString(i)); } List<String> ignore_list = new ArrayList(); for (int i = 0; i < ignore.length(); i++) { ignore_list.add(ignore.getString(i)); } int segcnt = 0; org.dom4j.Document document = XmlParser.parseXmlFile(this.alignedfile); org.dom4j.Element root = document.getRootElement(); List<org.dom4j.Element> groups = document.getRootElement().element("aligned").elements("group"); for (int i = 0; i < groups.size(); i++) { org.dom4j.Element group = (org.dom4j.Element) groups.get(i); List<org.dom4j.Element> units = group.elements("unit"); for (int j = 0; j < units.size(); j++) { org.dom4j.Element unit = (org.dom4j.Element) units.get(j); unit.addAttribute("alignsegs", "true"); org.dom4j.Element src_para = unit.element("src_para"); if (src_para != null) { List<org.dom4j.Element> src_segs = src_para.element("segments").elements("src_seg"); for (int z = 0; z < src_segs.size(); z++) { org.dom4j.Element src_seg = (org.dom4j.Element) src_segs.get(z); String seq = i + " - " + j + " - " + z; if (locks_list.contains(seq)) { src_seg.attribute("locked").setValue("true"); } else { src_seg.attribute("locked").setValue("false"); } if (review_list.contains(seq)) { src_seg.attribute("needreview").setValue("true"); } else { src_seg.attribute("needreview").setValue("false"); } if (ignore_list.contains(seq)) { src_seg.attribute("ignored").setValue("true"); } else { src_seg.attribute("ignored").setValue("false"); } } org.dom4j.Element trg_para = unit.element("trg_para"); trg_para.remove(trg_para.element("text")); List<org.dom4j.Element> trg_segs = trg_para.element("segments").elements("trg_seg"); for (int z = 0; z < trg_segs.size(); z++) { org.dom4j.Element trg_seg = (org.dom4j.Element) trg_segs.get(z); if (z >= src_segs.size()) { trg_para.element("segments").remove(trg_seg); } else if (segcnt < trg_list.size()) { trg_seg.addAttribute("id", decodehtmlstring((String) trg_seq_list.get(segcnt))); if ((edited_list.contains(decodehtmlstring((String) trg_seq_list.get(segcnt)))) || (decodehtmlstring((String) trg_seq_list.get(segcnt)).startsWith("n - "))) { trg_seg.addAttribute("edited", "true"); } String text = decodehtmlstring((String) trg_list.get(segcnt)); trg_seg.addAttribute("isExtractable", Boolean.toString(extractionSupportImpl.isExtractable(text))); trg_seg.setText(text); segcnt++; } else { trg_para.element("segments").remove(trg_seg); } } if (trg_para.element("segments").elements("trg_seg").size() == 0) { group.remove(unit); } } else { org.dom4j.Element trg_para = unit.element("trg_para"); trg_para.remove(trg_para.element("text")); List<org.dom4j.Element> trg_segs = trg_para.element("segments").elements("trg_seg"); for (int z = 0; z < trg_segs.size(); z++) { org.dom4j.Element trg_seg = (org.dom4j.Element) trg_segs.get(z); if (segcnt < trg_list.size()) { trg_seg.addAttribute("id", decodehtmlstring((String) trg_seq_list.get(segcnt))); if ((edited_list.contains(decodehtmlstring((String) trg_seq_list.get(segcnt)))) || (decodehtmlstring((String) trg_seq_list.get(segcnt)).startsWith("n - "))) { trg_seg.addAttribute("edited", "true"); } String text = decodehtmlstring((String) trg_list.get(segcnt)); trg_seg.addAttribute("isExtractable", Boolean.toString(extractionSupportImpl.isExtractable(text))); trg_seg.setText(text); segcnt++; } else { trg_para.element("segments").remove(trg_seg); } } if (trg_para.element("segments").elements("trg_seg").size() == 0) { group.remove(unit); } } } if (group.elements("unit").size() == 0) { groups.remove(i); i--; } } if (segcnt < trg_list.size()) { org.dom4j.Element group = root.element("aligned").addElement("group"); group.addAttribute("id", Integer.toString(groups.size())); org.dom4j.Element unit = group.addElement("unit"); unit.addAttribute("id", "0"); org.dom4j.Element trg_para = unit.addElement("trg_para"); org.dom4j.Element trgsegs = trg_para.addElement("segments"); for (int x = segcnt; x < trg_list.size(); x++) { String text = decodehtmlstring((String) trg_list.get(x)); org.dom4j.Element trg_seg = trgsegs.addElement("trg_seg"); trg_seg.addAttribute("id", decodehtmlstring((String) trg_seq_list.get(x))); if ((edited_list.contains(decodehtmlstring((String) trg_seq_list.get(x)))) || (decodehtmlstring((String) trg_seq_list.get(x)).startsWith("n - "))) { trg_seg.addAttribute("edited", "true"); } trg_seg.addAttribute("isExtractable", Boolean.toString(extractionSupportImpl.isExtractable(text))); trg_seg.setText(text); } } org.dom4j.Element orphans = root.element("orphans"); orphans.clearContent(); org.dom4j.Element unit = orphans.addElement("unit"); unit.addAttribute("id", "0"); org.dom4j.Element trg_para = unit.addElement("trg_para"); org.dom4j.Element trg_segs = trg_para.addElement("segments"); for (int i = 0; i < missing_list.size(); i++) { org.dom4j.Element trg_seg = trg_segs.addElement("trg_seg"); String text = decodehtmlstring((String) missing_list.get(i)); trg_seg.addAttribute("id", decodehtmlstring((String) missing_seq_list.get(i))); if ((edited_list.contains(decodehtmlstring((String) missing_seq_list.get(i)))) || (decodehtmlstring((String) trg_seq_list.get(i)).startsWith("n - "))) { trg_seg.addAttribute("edited", "true"); } trg_seg.addAttribute("isExtractable", Boolean.toString(extractionSupportImpl.isExtractable(text))); trg_seg.setText(text); } this.auto_saved_alignedfile = (this.alignedfile + ".temp"); if (new File(this.auto_saved_alignedfile).exists()) { new File(this.auto_saved_alignedfile).delete(); } OutputStreamWriter writer = new OutputStreamWriter( new BufferedOutputStream(new FileOutputStream(this.auto_saved_alignedfile)), "UTF8"); document.write(writer); writer.close(); } public String decodehtmlstring(String encoded_s) { return Jsoup .parse(encoded_s.replaceAll("<br[^>]*?>", "<br>").replace("&", "&").replace("<", "<") .replace(">", ">").replaceAll("\\s", " ")) .text().replace("", " ").replace("
", ""); } private String encodeHtml(String s) { return s.replace("", "
"); } public HashMap<String, List<String>> updateSingleParagraph(String text) throws Exception { HashMap<String, List<String>> semap = new HashMap(); List<String> trgsegs = segmentStringWithRevs(decodehtmlstring(text), this.targetlanguage); for (int i = 0; i < trgsegs.size(); i++) { System.out.println((String) trgsegs.get(i)); trgsegs.set(i, ((String) trgsegs.get(i)).trim()); } semap.put("trgsegs", trgsegs); return semap; } public HashMap<String, List<String>> findandsegmentpara(String src_segid, String trg_segid) throws Exception { HashMap<String, List<String>> stsegs = new HashMap(); List<String> trgs = new ArrayList(); List<String> srcs = new ArrayList(); File alignedFile = new File(this.alignedfile); if (!alignedFile.exists()) { throw new FileNotFoundException("Could not find aligned xml file"); } org.dom4j.Document document = XmlParser.parseXmlFile(this.alignedfile); org.dom4j.Element root = document.getRootElement(); List<org.dom4j.Element> units = root.element("content").elements("unit"); for (int i = 0; i < units.size(); i++) { org.dom4j.Element unit = (org.dom4j.Element) units.get(i); String trg_seq = ""; if (unit.element("trg_para") != null) { org.dom4j.Element trg = (org.dom4j.Element) unit.element("trg_para").clone(); trg_seq = unit.element("trg_para").attributeValue("para_seq"); if (trg_seq.equals(trg_segid)) { SegmenterFactory factory = new SegmenterFactory(); Configuration segconfig = createConfigForSegmenter(false, this.targetlanguage); Segmenter segmenter = factory.getSegmenter("trados", Locale.makeLocale(this.targetlanguage), segconfig); trgs = segmenter.segment(trg.getText()); } } String src_seq = ""; if (unit.element("src_para") != null) { org.dom4j.Element src = (org.dom4j.Element) unit.element("src_para").clone(); src_seq = unit.element("src_para").attributeValue("para_seq"); if (src_seq.equals(src_segid)) { srcs = segmentStringWithRevs(src.getText(), this.sourcelanguage); } } } stsegs.put("srcsegs", srcs); stsegs.put("trgsegs", trgs); return stsegs; } public List<String> segmentStringWithRevs(String srctext, String language) throws Exception { List<String> srcs = new ArrayList(); Map<Integer, String> tagmap = new TreeMap(); SegmenterFactory factory = new SegmenterFactory(); Configuration config = createConfigForSegmenter(false, language); Segmenter segmenter = factory.getSegmenter("trados", Locale.makeLocale(language), config); srctext = srctext.replace("<", "<").replace(">", ">").replace("&", "&"); Pattern p = Pattern.compile("<(/)*ins>|<del>.*?</del>|<br>|¶del;"); Matcher m = p.matcher(srctext); while (m.find()) { tagmap.put(Integer.valueOf(m.start()), m.group()); } String srctext_notag = srctext.replaceAll("<(/)*ins>|<del>.*?</del>|<br>|¶del;", ""); srcs = segmenter.segment(srctext_notag); List<String> srcs_split = new ArrayList(); int i; int entrycount; int strlen; ArrayList<String> tags; ArrayList<String> faketags; int offset; if (srcs.size() == 0) { srcs.add(srctext); } else { i = 0; entrycount = 0; strlen = ((String) srcs.get(0)).length(); tags = new ArrayList(); faketags = new ArrayList(); offset = 0; for (Map.Entry<Integer, String> entry : tagmap.entrySet()) { entrycount++; int position = ((Integer) entry.getKey()).intValue(); String tag = (String) entry.getValue(); while (((i < srcs.size() - 1) && (strlen - 1 < position + offset)) || ((i == srcs.size() - 1) && (strlen < position + offset))) { if (tags.size() != 0) { for (int t = 0; t < tags.size(); t++) { String remt = (String) tags.get(t); if (remt.equals("<ins>")) { srcs.set(i, (String) srcs.get(i) + "</ins>"); if (i < srcs.size() - 1) { srcs.set(i + 1, "<ins>" + (String) srcs.get(i + 1)); offset += 5; faketags.add("<ins>"); } } else if (remt.equals("</ins>")) { srcs.set(i, "<ins>" + (String) srcs.get(i)); } } } tags = new ArrayList(); tags.addAll(faketags); faketags = new ArrayList(); i++; strlen += ((String) srcs.get(i)).length(); } if ((tag.equals("</ins>")) && (tags.contains("<ins>"))) { tags.remove("<ins>"); } else { tags.add(tag); } String trgstr = (String) srcs.get(i); int insertpnt = position + offset - (strlen - trgstr.length()); trgstr = trgstr.substring(0, insertpnt) + tag + trgstr.substring(insertpnt, trgstr.length()); srcs.set(i, trgstr); strlen += tag.length(); if ((entrycount == tagmap.size()) && (tags.size() != 0)) { for (int t = 0; t < tags.size(); t++) { String remt = (String) tags.get(t); if (remt.equals("<ins>")) { srcs.set(i, (String) srcs.get(i) + "</ins>"); } else if (remt.equals("</ins>")) { srcs.set(i, "<ins>" + (String) srcs.get(i)); } } } } } for (int f = 0; f < srcs.size(); f++) { srcs_split.addAll(segmentStringWithRevs2((String) srcs.get(f), segmenter)); } for (int f = 0; f < srcs_split.size(); f++) { srcs_split.set(f, trimText( ((String) srcs_split.get(f)).replace("&", "&").replace("<", "<") .replace(">", ">").replaceAll("<([/]*)(ins|del|br)>", "<$1$2>"), false)[0]); } return srcs_split; } public List<String> segmentStringWithRevs2(String srctext, Segmenter segmenter) { List<String> srcs = new ArrayList(); Map<Integer, String> tagmap = new TreeMap(); Pattern p = Pattern.compile("<(/)*ins>|<(/)*del>|<br>|¶del;"); Matcher m = p.matcher(srctext); while (m.find()) { tagmap.put(Integer.valueOf(m.start()), m.group()); } String srctext_notag = srctext.replaceAll("<(/)*ins>|<(/)*del>|<br>|¶del;", ""); srcs = segmenter.segment(srctext_notag); List<String> new_srcs = new ArrayList(); if (srcs.size() == 0) { new_srcs.add(srctext); } else { int i = 0; int entrycount = 0; int strlen = ((String) srcs.get(0)).length(); ArrayList<String> tags = new ArrayList(); ArrayList<String> faketags = new ArrayList(); int offset = 0; for (Map.Entry<Integer, String> entry : tagmap.entrySet()) { entrycount++; int position = ((Integer) entry.getKey()).intValue(); String tag = (String) entry.getValue(); while (strlen < position + offset) { if (tags.size() != 0) { for (int t = 0; t < tags.size(); t++) { String remt = (String) tags.get(t); if (remt.equals("<ins>")) { srcs.set(i, (String) srcs.get(i) + "</ins>"); if (i < srcs.size() - 1) { srcs.set(i + 1, "<ins>" + (String) srcs.get(i + 1)); offset += 5; faketags.add("<ins>"); } } else if (remt.equals("<del>")) { srcs.set(i, (String) srcs.get(i) + "</del>"); if (i < srcs.size() - 1) { srcs.set(i + 1, "<del>" + (String) srcs.get(i + 1)); offset += 5; faketags.add("<del>"); } } else if (remt.equals("</ins>")) { srcs.set(i, "<ins>" + (String) srcs.get(i)); } else if (remt.equals("</del>")) { srcs.set(i, "<del>" + (String) srcs.get(i)); } } } tags = new ArrayList(); tags.addAll(faketags); faketags = new ArrayList(); i++; strlen += ((String) srcs.get(i)).length(); } if ((tag.equals("</ins>")) && (tags.contains("<ins>"))) { tags.remove("<ins>"); } else if ((tag.equals("</del>")) && (tags.contains("<del>"))) { tags.remove("<del>"); } else { tags.add(tag); } String trgstr = (String) srcs.get(i); int insertpnt = position + offset - (strlen - trgstr.length()); trgstr = trgstr.substring(0, insertpnt) + tag + trgstr.substring(insertpnt, trgstr.length()); srcs.set(i, trgstr); strlen += tag.length(); if ((entrycount == tagmap.size()) && (tags.size() != 0)) { for (int t = 0; t < tags.size(); t++) { String remt = (String) tags.get(t); if (remt.equals("<ins>")) { srcs.set(i, (String) srcs.get(i) + "</ins>"); } else if (remt.equals("<del>")) { srcs.set(i, (String) srcs.get(i) + "</del>"); } else if (remt.equals("</ins>")) { srcs.set(i, "<ins>" + (String) srcs.get(i)); } else if (remt.equals("</del>")) { srcs.set(i, "<del>" + (String) srcs.get(i)); } } } } new_srcs.addAll(srcs); } return new_srcs; } private static String[] trimText(String text, boolean preservenbsp) { if (!preservenbsp) { text = text.replace("", " "); } String[] t = new String[3]; String patternLEFT = "^[\\s\\t]+"; String patternRIGHT = "[\\s\\t]+$"; String acceptedText = text.replaceAll("(?s)<del>.*?</del>", "").replaceAll("<(/)*ins>", ""); Pattern left = Pattern.compile(patternLEFT); Pattern right = Pattern.compile(patternRIGHT); Matcher m_left = left.matcher(acceptedText); Matcher m_right = right.matcher(acceptedText); if (m_left.find()) { t[1] = "true"; } else { t[1] = "false"; } if (m_right.find()) { t[2] = "true"; } else { t[2] = "false"; } t[0] = text.trim().replaceAll("(?s)<ins>(\\s|<br>)*</ins>$", "") .replaceAll("(?s)<del>(\\s|<br>)*</del>$", "").trim().replaceAll("(?s)(\\s|<br>)+</ins>$", "</ins>") .replaceAll("(?s)(\\s|<br>)+</del>$", "</del>").replaceAll("(?s)^<ins>(\\s|<br>)*</ins>", "") .replaceAll("(?s)^<del>(\\s|<br>)*</del>", "").trim().replaceAll("(?s)^<ins>(\\s|<br>)+", "<ins>") .replaceAll("(?s)^<del>(\\s|<br>)+", "<del>"); return t; } private List trimContents(List contents) { List newcontents = new ArrayList(); int start_idx = 0; boolean start_reach = true; int end_idx = contents.size() - 1; boolean end_reach = true; while ((start_idx < end_idx) && ((start_reach) || (end_reach))) { Object st = contents.get(start_idx); if ((st instanceof org.dom4j.Text)) { if ((((org.dom4j.Text) st).getText().trim().equals("")) && (start_reach)) { start_idx++; } else { start_reach = false; } } else { start_reach = false; } Object ed = contents.get(end_idx); if ((ed instanceof org.dom4j.Text)) { if ((((org.dom4j.Text) ed).getText().trim().equals("")) && (end_reach)) { end_idx--; } else { end_reach = false; } } else { end_reach = false; } } for (int i = start_idx; i <= end_idx; i++) { newcontents.add(contents.get(i)); } return newcontents; } public void exportHtmlLogFileForFinalReview() throws Exception { System.out.println("creating log file for final review...."); List<String> newtrgs = gatherNewTargetsFromTranslatedTxlf(); List<List<String>> notes = gatherNotesFromTranslatedTxlf(); org.jsoup.nodes.Document doc = Jsoup.parse(new File(this.htmlreportfortranslation_temp), "UTF-8", ""); org.jsoup.nodes.Element ttl = doc.select("ttl").first(); for (org.jsoup.nodes.Element radio : doc.select("input[type=\"radio\"]")) { if (radio.hasAttr("disabled")) { radio.removeAttr("disabled"); } } for (org.jsoup.nodes.Element radiotext : doc.select(".filter")) { radiotext.attr("style", "#23407B;"); } ttl.text("RA Report - For Final Reivew"); Elements rows = doc.select("tr"); int newtrgcnt = 0; for (int i = 3; i < rows.size(); i++) { org.jsoup.nodes.Element row = (org.jsoup.nodes.Element) rows.get(i); org.jsoup.nodes.Element trg = row.select("td.trg").first(); org.jsoup.nodes.Element cmt = row.select("th.cmt").first(); String orgtrgtext = trg.text().replace(" ", "<br>").replace("", " "); String newtrgtext = ""; //System.out.println(newtrgs.get(newtrgcnt)); if (!row.attr("id").isEmpty()) { newtrgtext = trimText((String) newtrgs.get(newtrgcnt), false)[0]; List<String> note_list = (List) notes.get(newtrgcnt); String html = ""; for (String note : note_list) { if (note_list.indexOf(note) != note_list.size() - 1) { html = html + "• " + note.replace("\n", "<br>") + "<br>"; } else { html = html + "• " + note.replace("\n", "<br>"); } } cmt.html(html); newtrgcnt++; } if (!orgtrgtext.equals(newtrgtext)) { row.addClass("TrgTracked"); diff_match_patch dmp = new diff_match_patch(); dmp.Diff_EditCost = 6; LinkedList<diff_match_patch.Diff> Diffs = dmp.diff_main(orgtrgtext, newtrgtext); dmp.diff_cleanupSemantic(Diffs); String result = dmp.diff_prettyHtml(Diffs); result = result.replace("<br>", "<br>").replace("<span>", "").replace("</span>", "") .replaceAll("(?s)<ins[^>]*?>", "<ins>").replaceAll("(?s)</ins[^>]*?>", "</ins>") .replaceAll("(?s)<del[^>]*?>", "<del>").replaceAll("(?s)</del[^>]*?>", "</del>"); trg.html(result.replace(" ", " ")); } } this.htmlreportforfinalreview = (new File(this.translatedtxlf).getParent() + File.separator + new File(this.sourcefile).getName().substring(0, new File(this.sourcefile).getName().lastIndexOf(".")) + "_final.html"); if (new File(this.htmlreportforfinalreview).exists()) { new File(this.htmlreportforfinalreview).delete(); } Writer logWriter = new OutputStreamWriter(new FileOutputStream(this.htmlreportforfinalreview), "UTF-8"); logWriter.write(doc.toString().replace(" ", " ")); logWriter.close(); } private List<String> gatherNewTargetsFromTranslatedTxlf() { List<String> trgs = new ArrayList(); org.dom4j.Document document_trans = XmlParser.parseXmlFile(this.translatedtxlf); org.dom4j.Element root_trans = document_trans.getRootElement(); List<com.aspose.words.Node> list_trans = root_trans.selectNodes("//*[name() = 'trans-unit']"); for (int i = 0; i < list_trans.size(); i++) { org.dom4j.Element unit = (org.dom4j.Element) list_trans.get(i); org.dom4j.Element trg = unit.element("target"); String trgtext = getTxlfElementText_noescape(trg).replace("<br> ", "<br>"); trgs.add(trgtext); } return trgs; } private List<List<String>> gatherNotesFromTranslatedTxlf() { List<List<String>> notes = new ArrayList(); org.dom4j.Document document_trans = XmlParser.parseXmlFile(this.translatedtxlf); org.dom4j.Element root_trans = document_trans.getRootElement(); List<com.aspose.words.Node> list_trans = root_trans.selectNodes("//*[name() = 'trans-unit']"); for (int i = 0; i < list_trans.size(); i++) { List<String> note_list = new ArrayList(); org.dom4j.Element unit = (org.dom4j.Element) list_trans.get(i); List<org.dom4j.Element> note_element_list = unit.elements("note"); for (org.dom4j.Element e : note_element_list) { note_list.add(e.getText()); } notes.add(note_list); } return notes; } public void exportHtmlLogFileForTranslation(ArrayList<String[]> reportstats, int[] fr_stats) throws Exception { System.out.println("creating log file for translation...."); StringBuffer sb = new StringBuffer(); sb.append("<html>\n"); sb.append("<head>\n"); sb.append("<meta http-equiv=\"Content-Type\" content=\"text/html;charset=UTF-8\" />\n"); sb.append("<title>Source Changes Report</title>\n"); sb.append("<script src=\"http://ajax.googleapis.com/ajax/libs/jquery/1.11.0/jquery.min.js\"></script>\n"); sb.append("<STYLE type=\"text/css\">\n"); sb.append("body, table {font-family: arial,sans-serif; font-size: 12px;}\n"); sb.append("table {display: block; width:100%;}\n"); sb.append("tbody {display: table; width:100%;}\n"); sb.append("td {background: #FFFFFF; color: #000000;}\n"); sb.append("th {background: #E0E0E0; text-align:center; font-weight:bold}\n"); sb.append("ttl {font-size: 24; font-weight: bold; color : #888}\n"); sb.append( ".lbsrc {float: left;text-align:center;border-radius:16px;box-shadow: 3px 3px 3px #888888;border: 1px solid #CCC;font-size: 12.3px;word-wrap: break-word;line-height: 16px;margin:6px 0px;padding: 2px 7px;width: 6%;outline: 0;vertical-align: baseline;background: #FFFFFF no-repeat 19px 19px;}\n"); sb.append( ".lbtrg {float: left;text-align:center;border-radius:16px;box-shadow: 3px 3px 3px #888888;border: 1px solid #CCC;font-size: 12.3px;word-wrap: break-word;line-height: 16px;margin:6px;padding: 2px 7px;width: 6%;outline: 0;vertical-align: baseline;background: #E0EEEE no-repeat 19px 19px;}\n"); sb.append(".fproperties {font-weight: bold;margin:2px;}\n"); sb.append(".fpropertiescontent {font-weight: bold; color : #bbb}\n"); sb.append(".filter {font-weight: bold;margin:2px;color:#23407B}\n"); sb.append(".sid {width:5%}\n"); sb.append(".act {width:5%;font-size:12px}\n"); sb.append(".src {width:34%;white-space:pre-wrap;vertical-align:top;text-align:left;}\n"); sb.append( ".trg {width:34%;background: #E0EEEE;white-space:pre-wrap;vertical-align:top;text-align:left;}\n"); sb.append(".sco {width:4%}\n"); sb.append(".swc {width:4%}\n"); sb.append(".cmt {width:14%;text-align:left}\n"); sb.append(".tdh {background:#C9C9C9;font-weight:bold;}\n"); sb.append(".wc {background:#EEEEEE;font-weight:bold;text-align:center}\n"); sb.append(".nomatch {font-weight:bold;text-align:center;background: #888;}\n"); sb.append(".nontranslatable {font-weight:bold;text-align:center;background: #888;}\n"); sb.append(".fuzzy {font-weight:bold;text-align:center;background: yellow;}\n"); sb.append(".gold {font-weight:bold;text-align:center;background: #5EDA9E;}\n"); sb.append("ins {color: #1E90FF;text-decoration: underline;}\n"); sb.append("del {color: #990000;text-decoration: line-through;}\n"); sb.append(".accept {border-color: #1E90FF}\n"); sb.append(".reject {border-color: red}\n"); sb.append(".normal {border-color: black}\n"); sb.append( ".option {display:inline-block;cursor:pointer;border-style:solid;border-width:1px;width:10px;height:10px;margin-left:6%;margin-right:6%;border-radius: 2px;-moz-box-shadow:inset 0 0 10px #bbbbbb;-webkit-box-shadow: inset 0 0 10px #bbbbbb;box-shadow:inset 0 0 10px #bbbbbb;}\n"); sb.append(".selected {font-size: 14px}\n"); sb.append(".unselected {font-size: 6px}\n"); sb.append(".topref{float:right;font-weight:bold;color:blue}\n"); sb.append( "#search {float:right;position:fixed;top:1%;right:7px;opacity:0.5;width:40px;Background-color:#ccc;border-style:none;padding:2px}\n"); sb.append("</STYLE>\n"); sb.append("</head>\n"); sb.append("<body>\n"); sb.append("<ttl>RA Report - For Translation</ttl>\n"); sb.append("<br />\n"); sb.append("<br />\n"); sb.append("<br />\n"); sb.append("<div class=\"lbsrc\">source</div>\n"); sb.append("<div class=\"lbtrg\">translation</div>\n"); sb.append("<br />\n"); sb.append("<br />\n"); sb.append("<br />\n"); sb.append("<hr>\n"); sb.append( "<font color=\"1E90FF\">⬛ accept track changes</font> <font color=\"red\">⬛ reject track changes</font> ⬛ show all track changes\n"); sb.append("<br />\n"); sb.append("<br />\n"); sb.append("<div class=\"fproperties\">File Name : <span class=\"fpropertiescontent\">" + new File(this.populatedsourcetxlf).getName() + "</span></div>\n"); sb.append("<div class=\"fproperties\">Language Pair : <span class=\"fpropertiescontent\">" + this.sourcelanguage + " > " + this.targetlanguage + "</span></div>\n"); StringBuffer sb3 = new StringBuffer(); sb3.append( "<table cellspacing=\"0\" cellpadding=\"4\" bgcolor=\"#E0E0E0\" border=\"1\" bordercolor=\"#224466\" style=\"table-layout:fixed\">\n"); sb3.append("<tr>\n"); sb3.append("<th class=\"tdh\">Source</th>\n"); sb3.append("<th class=\"tdh\">Fully Aligned</th>\n"); sb3.append("<th class=\"tdh\">Aligned but Need Review</th>\n"); sb3.append("<th class=\"tdh\">Aligned but Source Tracked</th>\n"); sb3.append("<th class=\"tdh\">Not Aligned</th>\n"); sb3.append("<th class=\"tdh\">Total</th>\n"); sb3.append("</tr>\n"); int generalid = 0; int tc_count = 0; StringBuffer sb2 = new StringBuffer(); sb2.append( "<table cellspacing=\"0\" cellpadding=\"4\" bgcolor=\"#E0E0E0\" border=\"1\" bordercolor=\"#224466\" style=\"table-layout:fixed\">\n"); sb2.append("<tr>\n"); sb2.append("<th class=\"tdh sid\">Segment ID</th>\n"); sb2.append("<th class=\"tdh act\">View</th>\n"); sb2.append("<th class=\"tdh src\">Source</th>\n"); sb2.append("<th class=\"tdh trg\">Target</th>\n"); sb2.append("<th class=\"tdh sco\">Score</th>\n"); sb2.append("<th class=\"tdh swc\">Count</th>\n"); sb2.append("<th class=\"tdh cmt\">Comment</th>\n"); sb2.append("</tr>\n"); int wc_gold = 0; int wc_fuzzy_high = 0; int wc_fuzzy_low = 0; int wc_nomatch = 0; int wc_reps = fr_stats[0]; int wc_fuzzyreps = fr_stats[1]; for (int i = 0; i < reportstats.size(); i++) { String[] rstats = (String[]) reportstats.get(i); String source = rstats[0]; String target = rstats[1]; String score = rstats[2]; String match_type = rstats[3]; String tc_type = rstats[4]; String wc = rstats[5]; String comment = rstats[6]; if ((tc_type.equals("INSERTION")) || (tc_type.equals("MIX"))) { tc_count++; } if (!match_type.equals("N/A")) { generalid++; sb2.append("<tr id=\"" + generalid + "\" class=\"tp" + tc_type + "\">\n"); sb2.append("<th class=\"sid\">" + generalid + "</td>\n"); } else { sb2.append("<tr class=\"tp" + tc_type + "\">\n"); sb2.append("<th class=\"sid\">N/A</td>\n"); } sb2.append( "<th class=\"act\"><div class=\"option accept\"></div><div class=\"option reject\"></div><div class=\"option normal\"></div></td>\n"); sb2.append("<td class=\"src\">" + source.replace(" ", " ") + "</td>\n"); sb2.append("<td class=\"trg\">" + target.replace(" ", " ") + "</td>\n"); if (match_type.equals("exact-match")) { sb2.append("<td class=\"sco gold\">" + score + "</td>\n"); sb2.append("<th class=\"swc\">" + wc + "</td>\n"); wc_gold += Integer.parseInt(wc); } else if (match_type.equals("fuzzy-match")) { sb2.append("<td class=\"sco fuzzy\">" + score + "</td>\n"); sb2.append("<th class=\"swc\">" + wc + "</td>\n"); if (score.equals("75")) { wc_fuzzy_low += Integer.parseInt(wc); } else { wc_fuzzy_high += Integer.parseInt(wc); } } else if (match_type.equals("x-no-match")) { sb2.append("<td class=\"sco nomatch\">" + score + "</td>\n"); sb2.append("<th class=\"swc\">" + wc + "</td>\n"); wc_nomatch += Integer.parseInt(wc); } else { sb2.append("<td class=\"sco nontranslatable\">" + score + "</td>\n"); sb2.append("<th class=\"swc\">" + wc + "</td>\n"); } sb2.append("<th class=\"cmt\">" + comment + "</td>\n"); sb2.append("</tr>\n"); } sb2.append("</table>\n"); sb3.append("<tr>\n"); if (Locale.makeLocale(this.sourcelanguage).isFarEast()) { sb3.append("<th class=\"tdh\">Character Count</th>\n"); } else { sb3.append("<th class=\"tdh\">Word Count</th>\n"); } wc_nomatch = wc_nomatch - wc_reps - wc_fuzzyreps; sb3.append("<td class=\"wc\">" + wc_gold + "</th>\n"); sb3.append("<td class=\"wc\">" + wc_fuzzy_high + "</th>\n"); sb3.append("<td class=\"wc\">" + wc_fuzzy_low + "</th>\n"); sb3.append("<td class=\"wc\">" + wc_nomatch + "</th>\n"); sb3.append("<td class=\"wc\">" + (wc_gold + wc_fuzzy_high + wc_fuzzy_low + wc_nomatch + wc_reps + wc_fuzzyreps) + "</th>\n"); sb3.append("</tr>\n"); sb3.append("</table>\n"); sb.append( "<div class=\"fproperties\">Number of Segments with Track Changes / Total Segments(in TXLF): <span class=\"fpropertiescontent\">" + tc_count + " / " + generalid + "</span></div>\n"); sb.append("<br />\n"); sb.append(sb3); sb.append("<br />\n"); sb.append("<form>\n"); sb.append( "<input type=\"checkbox\" name=\"CKB\" value=\"SIT\"><span class=\"filter\">Show Segments in the TXLF Only</span>\n"); sb.append("<br />\n"); sb.append("<hr>\n"); sb.append( "<input type=\"radio\" name=\"Filter\" value=\"SA\" checked><span class=\"filter\">Show All</span>\n"); sb.append("<br />\n"); sb.append( "<input type=\"radio\" name=\"Filter\" value=\"ST\"><span class=\"filter\">Show Changed Segments</span>\n"); sb.append("<br />\n"); sb.append( "<input type=\"radio\" name=\"Filter\" value=\"STS\"><span class=\"filter\">Show Changed Sources</span>\n"); sb.append("<br />\n"); sb.append( "<input type=\"radio\" name=\"Filter\" value=\"STT\" disabled><span class=\"filter\" style=\"color:#bbbbbb\">Show Changed Targets</span>\n"); sb.append("<br />\n"); sb.append( "<input type=\"radio\" name=\"Filter\" value=\"STSNTT\" disabled><span class=\"filter\" style=\"color:#bbbbbb\">Show Changed Sources with Non-Changed Targets</span>\n"); sb.append("<br />\n"); sb.append( "<input type=\"radio\" name=\"Filter\" value=\"SNTTS\" disabled><span class=\"filter\" style=\"color:#bbbbbb\">Show Non-Changed Sources with Changed Targets</span>\n"); sb.append("</form>\n"); sb.append(sb2); sb.append("<br />\n"); sb.append("<form action=\"javascript:goToSegment();\">\n"); sb.append( "<input type=\"text\" id=\"search\" onfocus=\"searchfocus(this)\" onblur=\"searchnotfocus(this)\">\n"); sb.append("</form>\n"); sb.append("<a class=\"topref\" href=\"#TOP\" title=\"Back to Top\">TOP</a>\n"); sb.append("<br />\n"); sb.append("<hr>\n"); sb.append("<br />\n"); sb.append("<br />\n"); sb.append("<script type=\"text/javascript\">\n"); sb.append("$(document).ready(function(){\n"); sb.append("var norms = $(document).find('.normal');\n"); sb.append("for (var i = 0; i < norms.length; i++) {\n"); sb.append("$(norms[i]).addClass(\"selected\");\n"); sb.append("$(norms[i]).css(\"background-color\",\"black\");\n"); sb.append("}\n"); sb.append("});\n"); sb.append("\n"); sb.append("function goToSegment() {\n"); sb.append("var segid = document.getElementById(\"search\").value;\n"); sb.append("location.href = \"#\" + segid;\n"); sb.append("}\n"); sb.append("\n"); sb.append("function searchfocus(field) {\n"); sb.append("field.style.background = \"red\";\n"); sb.append("}\n"); sb.append("\n"); sb.append("function searchnotfocus(field) {\n"); sb.append("field.style.background = \"#ccc\";\n"); sb.append("}\n"); sb.append("\n"); sb.append("$(\".accept\").click(function() {\n"); sb.append("if(!$(this).hasClass(\"selected\")){\n"); sb.append("$(this).removeClass(\"unselected\")\n"); sb.append("$(this).addClass(\"selected\")\n"); sb.append("$(this).css(\"background-color\",\"1E90FF\");\n"); sb.append("var rej = $(this).parent().find('.reject')[0];\n"); sb.append("var nor = $(this).parent().find('.normal')[0];\n"); sb.append("$(rej).removeClass(\"selected\");\n"); sb.append("$(rej).addClass(\"unselected\");\n"); sb.append("$(rej).css(\"background-color\",\"\");\n"); sb.append("$(nor).removeClass(\"selected\");\n"); sb.append("$(nor).addClass(\"unselected\");\n"); sb.append("$(nor).css(\"background-color\",\"\");\n"); sb.append("\n"); sb.append("var accepts = $(this).parent().parent().find('ins');\n"); sb.append("for (var i = 0; i < accepts.length; i++) {\n"); sb.append("accepts[i].style.display = \"inline\";\n"); sb.append("accepts[i].style.textDecoration = \"none\";\n"); sb.append("}\n"); sb.append("var rejects = $(this).parent().parent().find('del');\n"); sb.append("for (var i = 0; i < rejects.length; i++) {\n"); sb.append("rejects[i].style.display = \"none\";\n"); sb.append("}\n"); sb.append("}\n"); sb.append("});\n"); sb.append("\n"); sb.append("$(\".reject\").click(function() {\n"); sb.append("if(!$(this).hasClass(\"selected\")){\n"); sb.append("$(this).removeClass(\"unselected\")\n"); sb.append("$(this).addClass(\"selected\")\n"); sb.append("$(this).css(\"background-color\",\"red\");\n"); sb.append("var act = $(this).parent().find('.accept')[0];\n"); sb.append("var nor = $(this).parent().find('.normal')[0];\n"); sb.append("$(act).removeClass(\"selected\");\n"); sb.append("$(act).addClass(\"unselected\");\n"); sb.append("$(act).css(\"background-color\",\"\");\n"); sb.append("$(nor).removeClass(\"selected\");\n"); sb.append("$(nor).addClass(\"unselected\");\n"); sb.append("$(nor).css(\"background-color\",\"\");\n"); sb.append("\n"); sb.append("var accepts = $(this).parent().parent().find('ins');\n"); sb.append("for (var i = 0; i < accepts.length; i++) {\n"); sb.append("accepts[i].style.display = \"none\";\n"); sb.append("}\n"); sb.append("var rejects = $(this).parent().parent().find('del');\n"); sb.append("for (var i = 0; i < rejects.length; i++) {\n"); sb.append("rejects[i].style.display = \"inline\";\n"); sb.append("rejects[i].style.textDecoration = \"none\";\n"); sb.append("}\n"); sb.append("}\n"); sb.append("});\n"); sb.append("\n"); sb.append("$(\".normal\").click(function() {\n"); sb.append("if(!$(this).hasClass(\"selected\")){\n"); sb.append("$(this).removeClass(\"unselected\")\n"); sb.append("$(this).addClass(\"selected\")\n"); sb.append("$(this).css(\"background-color\",\"black\");\n"); sb.append("var act = $(this).parent().find('.accept')[0];\n"); sb.append("var rej = $(this).parent().find('.reject')[0];\n"); sb.append("$(act).removeClass(\"selected\");\n"); sb.append("$(act).addClass(\"unselected\");\n"); sb.append("$(act).css(\"background-color\",\"\");\n"); sb.append("$(rej).removeClass(\"selected\");\n"); sb.append("$(rej).addClass(\"unselected\");\n"); sb.append("$(rej).css(\"background-color\",\"\");\n"); sb.append("\n"); sb.append("var accepts = $(this).parent().parent().find('ins');\n"); sb.append("for (var i = 0; i < accepts.length; i++) {\n"); sb.append("accepts[i].style.display = \"inline\";\n"); sb.append("accepts[i].style.textDecoration = \"underline\";\n"); sb.append("}\n"); sb.append("var rejects = $(this).parent().parent().find('del');\n"); sb.append("for (var i = 0; i < rejects.length; i++) {\n"); sb.append("rejects[i].style.display = \"inline\";\n"); sb.append("rejects[i].style.textDecoration = \"line-through\";\n"); sb.append("}\n"); sb.append("}\n"); sb.append("});\n"); sb.append("\n"); sb.append("$(\"input[name='CKB']\").change(function(){"); sb.append("if($(this).is(':checked')){"); sb.append("$('.tpDELETION').hide();"); sb.append("}else{"); sb.append( "if($(\"input[value='SA']\").is(':checked') || $(\"input[value='STS']\").is(':checked') || $(\"input[value='STSNTT']\").is(':checked')){"); sb.append("$('.tpDELETION').show();"); sb.append("}else{"); sb.append("$('.tpDELETION').hide();"); sb.append("}"); sb.append("}"); sb.append("});"); sb.append("\n"); sb.append("$(\"input[name='Filter']\").change(function(){"); sb.append("var value = $(this).val();"); sb.append("if(value == \"SA\"){"); sb.append("var trs = document.getElementsByTagName(\"tr\");"); sb.append("for(i=3;i<trs.length;i++){"); sb.append("var tr = trs[i];"); sb.append("if(!$(tr).hasClass(\"tpDELETION\")){"); sb.append("$(tr).show();"); sb.append("}else{"); sb.append("if($(\"input[name='CKB']\").is(':checked')){"); sb.append("$(tr).hide();"); sb.append("}else{"); sb.append("$(tr).show();"); sb.append("}"); sb.append("}"); sb.append("}"); sb.append("}else if(value == \"ST\"){"); sb.append("var trs = document.getElementsByTagName(\"tr\");"); sb.append("for(i=3;i<trs.length;i++){"); sb.append("var tr = trs[i];"); sb.append("if(!$(tr).hasClass(\"tpDELETION\")){"); sb.append("if(!$(tr).hasClass(\"tpNONE\") || $(tr).hasClass(\"TrgTracked\")){"); sb.append("$(tr).show();"); sb.append("}else{"); sb.append("$(tr).hide();"); sb.append("}"); sb.append("}else{"); sb.append("if($(\"input[name='CKB']\").is(':checked')){"); sb.append("$(tr).hide();"); sb.append("}else{"); sb.append("$(tr).show();"); sb.append("}"); sb.append("}"); sb.append("}"); sb.append("}else if(value == \"STS\"){"); sb.append("var trs = document.getElementsByTagName(\"tr\");"); sb.append("for(i=3;i<trs.length;i++){"); sb.append("var tr = trs[i];"); sb.append("if(!$(tr).hasClass(\"tpDELETION\")){"); sb.append("if($(tr).hasClass(\"tpNONE\")){"); sb.append("$(tr).hide();"); sb.append("}else{"); sb.append("$(tr).show();"); sb.append("}"); sb.append("}else{"); sb.append("if($(\"input[name='CKB']\").is(':checked')){"); sb.append("$(tr).hide();"); sb.append("}else{"); sb.append("$(tr).show();"); sb.append("}"); sb.append("}"); sb.append("}"); sb.append("}else if(value == \"STT\"){"); sb.append("var trs = document.getElementsByTagName(\"tr\");"); sb.append("for(i=3;i<trs.length;i++){"); sb.append("var tr = trs[i];"); sb.append("if(!$(tr).hasClass(\"tpDELETION\")){"); sb.append("if($(tr).hasClass(\"TrgTracked\")){"); sb.append("$(tr).show();"); sb.append("}else{"); sb.append("$(tr).hide();"); sb.append("}"); sb.append("}else{"); sb.append("$(tr).hide();"); sb.append("}"); sb.append("}"); sb.append("}else if(value == \"STSNTT\"){"); sb.append("var trs = document.getElementsByTagName(\"tr\");"); sb.append("for(i=3;i<trs.length;i++){"); sb.append("var tr = trs[i];"); sb.append("if(!$(tr).hasClass(\"tpDELETION\")){"); sb.append("if($(tr).hasClass(\"tpNONE\") || $(tr).hasClass(\"TrgTracked\")){"); sb.append("$(tr).hide();"); sb.append("}else{"); sb.append("$(tr).show();"); sb.append("}"); sb.append("}else{"); sb.append("if($(\"input[name='CKB']\").is(':checked')){"); sb.append("$(tr).hide();"); sb.append("}else{"); sb.append("$(tr).show();"); sb.append("}"); sb.append("}"); sb.append("}"); sb.append("}else if(value == \"SNTTS\"){"); sb.append("var trs = document.getElementsByTagName(\"tr\");"); sb.append("for(i=3;i<trs.length;i++){"); sb.append("var tr = trs[i];"); sb.append("if(!$(tr).hasClass(\"tpDELETION\")){"); sb.append("if($(tr).hasClass(\"tpNONE\") && $(tr).hasClass(\"TrgTracked\")){"); sb.append("$(tr).show();"); sb.append("}else{"); sb.append("$(tr).hide();"); sb.append("}"); sb.append("}else{"); sb.append("$(tr).hide();"); sb.append("}"); sb.append("}"); sb.append("}"); sb.append("});"); sb.append("\n"); sb.append("</script>"); sb.append("</body>\n"); sb.append("</html>\n"); this.htmlreportfortranslation = (new File(this.prjfolder) + File.separator + new File(this.sourcefile) .getName().substring(0, new File(this.sourcefile).getName().lastIndexOf(".")) + "_aligned.html"); this.htmlreportfortranslation_temp = (new File(this.prjfolder) + File.separator + new File(this.sourcefile) .getName().substring(0, new File(this.sourcefile).getName().lastIndexOf(".")) + "_aligned.temp"); if (new File(this.htmlreportfortranslation).exists()) { new File(this.htmlreportfortranslation).delete(); } org.jsoup.nodes.Document doc = Jsoup.parse(sb.toString()); Writer logWriter = new OutputStreamWriter(new FileOutputStream(this.htmlreportfortranslation), "UTF-8"); logWriter.write(doc.toString().replace(" ", " ")); Writer logWriter_temp = new OutputStreamWriter(new FileOutputStream(this.htmlreportfortranslation_temp), "UTF-8"); logWriter_temp.write(doc.toString()); logWriter.close(); logWriter_temp.close(); } public void collectFilesForProjectFile() throws Exception { this.exportfolder = (this.prjfolder + File.separator + "export"); File export = new File(this.exportfolder); if (export.exists()) { FileUtils.cleanDirectory(export); } else { export.mkdirs(); } FileUtils.copyFileToDirectory(new File(this.backupsourcefile), export); FileUtils.copyFileToDirectory(new File(this.targetfile), export); FileUtils.copyFileToDirectory(new File(this.alignedfile), export); FileUtils.copyFileToDirectory(new File(this.prjinfofile), export); } public void collectFilesForTranslationKit(String prjid) throws Exception { this.exportfolder = (this.prjfolder + File.separator + "export"); File export = new File(this.exportfolder); if (export.exists()) { FileUtils.cleanDirectory(export); } else { export.mkdirs(); } FileUtils.copyFileToDirectory(new File(createglpfile(prjid)), export); FileUtils.copyFileToDirectory(new File(this.htmlreportfortranslation), export); } private String createglpfile(String prjid) throws Exception { this.tempfolder = (this.prjfolder + File.separator + "temp"); File temp = new File(this.tempfolder); if (temp.exists()) { FileUtils.cleanDirectory(temp); } else { temp.mkdirs(); } InputStream contentfile = FileAligner.class.getClassLoader().getResourceAsStream("content.xml"); DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); javax.xml.parsers.DocumentBuilder builder = factory.newDocumentBuilder(); org.w3c.dom.Document document = builder.parse(contentfile); org.w3c.dom.Element root = document.getDocumentElement(); org.w3c.dom.Node sourcelang = root.getElementsByTagName("sourceLanguage").item(0); sourcelang.getAttributes().getNamedItem("localeCode").setTextContent(this.sourcelanguage); org.w3c.dom.Node targetlang = root.getElementsByTagName("targetLanguage").item(0); targetlang.getAttributes().getNamedItem("localeCode").setTextContent(this.targetlanguage); org.w3c.dom.Node xliffpath = root.getElementsByTagName("__xliffFile").item(0); xliffpath.getAttributes().getNamedItem("archivePath") .setTextContent(this.targetlanguage + "/txlf/" + new File(this.populatedsourcetxlf).getName()); org.w3c.dom.Node sourcepath = root.getElementsByTagName("__storable").item(0); sourcepath.getAttributes().getNamedItem("archivePath") .setTextContent("source/" + new File(this.backupsourcefile).getName()); org.w3c.dom.Node projectname = root.getElementsByTagName("projectName").item(0); projectname.setTextContent(prjid); TransformerFactory tf = TransformerFactory.newInstance(); Transformer m = tf.newTransformer(); DOMSource source = new DOMSource(document); File dest = new File(temp + File.separator + "content.xml"); StreamResult result = new StreamResult(new FileOutputStream(dest)); m.transform(source, result); File sourcefolder = new File(temp + File.separator + "source"); sourcefolder.mkdir(); FileUtils.copyFileToDirectory(new File(this.backupsourcefile), sourcefolder); File targetlangfolder = new File(temp + File.separator + this.targetlanguage); targetlangfolder.mkdir(); File txlffolder = new File(targetlangfolder + File.separator + "txlf"); txlffolder.mkdir(); FileUtils.copyFileToDirectory(new File(this.populatedsourcetxlf), txlffolder); ZipFile zf = new ZipFile(); String glpfile = this.tempfolder + File.separator + prjid + ".glp"; zf.ZipIt(glpfile, this.tempfolder); contentfile.close(); result.getOutputStream().close(); return glpfile; } public void collectFilesForFinalReview() throws Exception { this.exportfolder = (this.prjfolder + File.separator + "export"); File export = new File(this.exportfolder); if (export.exists()) { FileUtils.cleanDirectory(export); } else { export.mkdirs(); } FileUtils.copyFileToDirectory(new File(this.aligneddoc), export); FileUtils.copyFileToDirectory(new File(this.translatedtxlf), export); FileUtils.copyFileToDirectory(new File(this.htmlreportforfinalreview), export); if (new File(this.comparedoc).exists()) { FileUtils.copyFileToDirectory(new File(this.comparedoc), export); } String reformattedtargetfolder = this.exportfolder + File.separator + "org_target"; if (!new File(reformattedtargetfolder).exists()) { new File(reformattedtargetfolder).mkdir(); } FileUtils.copyFileToDirectory(new File(this.reformattedtargetfile), new File(reformattedtargetfolder)); } public void convertSourceToTxlf(boolean segmentParagraph) throws Exception { String orgtxlf = convertFileToTxlf(this.sourcefile, segmentParagraph, this.sourcelanguage); String newtxlf; if (segmentParagraph) { newtxlf = this.sourcefile + ".nonSeg.txlf"; this.sourcetxlf_nonSeg = newtxlf; } else { newtxlf = this.sourcefile + ".Seg.txlf"; this.sourcetxlf_seg = newtxlf; } if (new File(newtxlf).exists()) { new File(newtxlf).delete(); } new File(orgtxlf).renameTo(new File(newtxlf)); org.dom4j.Document document = XmlParser.parseXmlFile(newtxlf); org.dom4j.Element root = document.getRootElement(); root.element("file").addAttribute("target-language", this.targetlanguage); OutputStreamWriter writer = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(newtxlf)), "UTF8"); document.write(writer); writer.close(); } public void convertReformattedSourceToTxlf(boolean segmentParagraph) throws Exception { String orgtxlf = convertFileToTxlf(this.reformattedsourcefile, segmentParagraph, this.sourcelanguage); String ext = orgtxlf.substring(orgtxlf.lastIndexOf('.'), orgtxlf.length()); String newtxlf; if (segmentParagraph) { newtxlf = this.reformattedsourcefile + ".nonSeg.txlf"; this.reformattedsourcetxlf_nonSeg = newtxlf; } else { newtxlf = this.reformattedsourcefile + ".Seg.txlf"; this.reformattedsourcetxlf_seg = newtxlf; } if (new File(newtxlf).exists()) { new File(newtxlf).delete(); } new File(orgtxlf).renameTo(new File(newtxlf)); org.dom4j.Document document = XmlParser.parseXmlFile(newtxlf); org.dom4j.Element root = document.getRootElement(); root.element("file").addAttribute("target-language", this.targetlanguage); OutputStreamWriter writer = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(newtxlf)), "UTF8"); document.write(writer); writer.close(); } public void convertTargetToTxlf(boolean segmentParagraph) throws Exception { String orgtxlf = convertFileToTxlf(this.targetfile, segmentParagraph, this.targetlanguage); String ext = orgtxlf.substring(orgtxlf.lastIndexOf('.'), orgtxlf.length()); String newtxlf; if (segmentParagraph) { newtxlf = this.targetfile + ".nonSeg.txlf"; this.targettxlf_nonSeg = newtxlf; } else { newtxlf = this.targetfile + ".Seg.txlf"; this.targettxlf_seg = newtxlf; } if (new File(newtxlf).exists()) { new File(newtxlf).delete(); } new File(orgtxlf).renameTo(new File(newtxlf)); org.dom4j.Document document = XmlParser.parseXmlFile(newtxlf); org.dom4j.Element root = document.getRootElement(); root.element("file").addAttribute("target-language", this.targetlanguage); OutputStreamWriter writer = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(newtxlf)), "UTF8"); document.write(writer); writer.close(); } public void convertReformattedTargetToTxlf(boolean segmentParagraph) throws Exception { String orgtxlf = convertFileToTxlf(this.reformattedtargetfile, segmentParagraph, this.targetlanguage); String ext = orgtxlf.substring(orgtxlf.lastIndexOf('.'), orgtxlf.length()); String newtxlf; if (segmentParagraph) { newtxlf = this.reformattedtargetfile + ".nonSeg.txlf"; this.reformattedtargettxlf_nonSeg = newtxlf; } else { newtxlf = this.reformattedtargetfile + ".seg.txlf"; this.reformattedtargettxlf_seg = newtxlf; } if (new File(newtxlf).exists()) { new File(newtxlf).delete(); } new File(orgtxlf).renameTo(new File(newtxlf)); org.dom4j.Document document = XmlParser.parseXmlFile(newtxlf); org.dom4j.Element root = document.getRootElement(); root.element("file").addAttribute("target-language", this.targetlanguage); OutputStreamWriter writer = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(newtxlf)), "UTF8"); document.write(writer); writer.close(); } public String convertFileToTxlf(String filePath, boolean segmentParagraph, String sourcelanguage) throws Exception { com.aspose.words.Document doc = new com.aspose.words.Document(filePath); doc.joinRunsWithSameFormatting(); doc.save(filePath); ArrayList<String> srcs = new ArrayList(); srcs.add(filePath); String orgtxlfname = filePath + ".txlf"; if (new File(orgtxlfname).exists()) { new File(orgtxlfname).delete(); } Locale locale = Locale.makeLocale(sourcelanguage); Configuration config = new BaseConfiguration(); config.setProperty("word.acceptTrackChanges", "true"); config.setProperty("word.extractDropDownList", "false"); config.setProperty("word.extractEquations", "false"); config.setProperty("word.extractComments", "false"); config.setProperty("extraction.tokens.extract", "all"); config.setProperty("word.translateHyperlinkText", "true"); config.setProperty("word.translateHyperlinkValue", "false"); config.setProperty("word.ignoreBiLingualStyles", "true"); ConvertDOC converter = new ConvertDOC(); converter.setConfiguration(config); converter.setIgnoreSuccessfullConversion(true); converter.convert(srcs, locale); segmentTxlf(orgtxlfname, segmentParagraph, sourcelanguage); return orgtxlfname; } public boolean verifyParas() throws Exception { System.out.println("verifying paragraphs mapping...."); boolean isValid = false; ExtractionSupportImpl extractionSupportImpl = new ExtractionSupportImpl( Locale.makeLocale(this.sourcelanguage), Locale.makeLocale(this.targetlanguage)); Configuration config = new BaseConfiguration(); config.setProperty("extraction.tokens.extract", "all"); extractionSupportImpl.setConfiguration(config); org.dom4j.Document document_source = XmlParser.parseXmlFile(this.sourcetxlf_nonSeg); org.dom4j.Element root_source = document_source.getRootElement(); org.dom4j.Document document_formatted = XmlParser.parseXmlFile(this.reformattedsourcetxlf_nonSeg); org.dom4j.Element root_formatted = document_formatted.getRootElement(); List list_source = root_source.selectNodes("//*[name() = 'trans-unit']"); int numberOfPara_source = list_source.size(); List list_formatted = root_formatted.selectNodes("//*[name() = 'trans-unit']"); int numberOfPara_formatted = list_formatted.size(); List<org.dom4j.Element> text_source = new ArrayList(); Iterator iter_source = list_source.iterator(); while (iter_source.hasNext()) { org.dom4j.Element source = ((org.dom4j.Element) iter_source.next()).element("source"); text_source.add(source); } List<org.dom4j.Element> text_formatted = new ArrayList(); Iterator iter_formatted = list_formatted.iterator(); while (iter_formatted.hasNext()) { org.dom4j.Element source = ((org.dom4j.Element) iter_formatted.next()).element("source"); text_formatted.add(source); } Workbook wb = new Workbook(); Worksheet ws = wb.getWorksheets().get(0); Cells cells = ws.getCells(); int count = Math.max(text_source.size(), text_formatted.size()); int t_count = 0; for (int i = 0; i < count; i++) { String t_src = ""; String t_fom = ""; if (i < text_source.size()) { org.dom4j.Element src = (org.dom4j.Element) text_source.get(i); for (int j = 0; j < src.content().size(); j++) { if ((src.content().get(j) instanceof org.dom4j.Text)) { t_src = t_src + ((org.dom4j.Text) src.content().get(j)).getText().replace("&", "&") .replace("<", "<").replace(">", ">"); } else if ((src.content().get(j) instanceof org.dom4j.Element)) { org.dom4j.Element e = (org.dom4j.Element) src.content().get(j); if ((e.getName().equals("x")) && (e.attribute("ctype").getValue().equals("x-tab"))) { t_src = t_src + " "; } else if ((e.getName().equals("x")) && (e.attribute("ctype").getValue().equals("lb"))) { t_src = t_src + "<br> "; } } } } cells.get(i, 0).setHtmlString("<html>" + t_src.trim().replace("<br> ", "↵<br>") + "</html>"); if (i < text_formatted.size()) { org.dom4j.Element src = (org.dom4j.Element) text_formatted.get(i); ArrayList<String> node_ids = new ArrayList(); for (int j = 0; j < src.content().size(); j++) { if ((src.content().get(j) instanceof org.dom4j.Text)) { t_fom = t_fom + ((org.dom4j.Text) src.content().get(j)).getText().replace("&", "&") .replace("<", "<").replace(">", ">"); } else if ((src.content().get(j) instanceof org.dom4j.Element)) { org.dom4j.Element e = (org.dom4j.Element) src.content().get(j); if ((e.getName().equals("bx")) && (e.attribute("ctype").getValue().equals("x-strike-through"))) { t_fom = t_fom + "<strike>"; node_ids.add(e.attribute("rid").getValue()); } else if (e.getName().equals("ex")) { if (node_ids.contains(e.attribute("rid").getValue())) { t_fom = t_fom + "</strike>"; node_ids.remove(e.attribute("rid").getValue()); } } else if ((e.getName().equals("bpt")) && (e.attribute("ctype").getValue().equals("x-underlined")) && (e.getText().contains("type=\"1\""))) { t_fom = t_fom + "<u>"; node_ids.add(e.attribute("rid").getValue()); } else if (e.getName().equals("ept")) { if (node_ids.contains(e.attribute("rid").getValue())) { t_fom = t_fom + "</u>"; node_ids.remove(e.attribute("rid").getValue()); } } else if ((e.getName().equals("x")) && (e.attribute("ctype").getValue().equals("x-tab"))) { t_fom = t_fom + " "; } else if ((e.getName().equals("x")) && (e.attribute("ctype").getValue().equals("lb"))) { t_fom = t_fom + "<br> "; } } } if (!t_fom.contains("<u>")) { if (src.selectNodes("..//*[name() = 'it'][@ctype = 'x-underlined'][@pos = 'open']") .size() != 0) { org.dom4j.Node node = (org.dom4j.Node) src .selectNodes("..//*[name() = 'it'][@ctype = 'x-underlined'][@pos = 'open']").get(0); if (node.getText().contains("type=\"1\"")) { t_fom = "<u>" + t_fom + "</u>"; } } } else if ((!t_fom.contains("<strike>")) && (src.selectNodes("..//*[name() = 'it'][@ctype = 'x-strike-through'][@pos = 'open']") .size() != 0)) { t_fom = "<strike>" + t_fom + "</strike>"; } } String accepted_t_fom = t_fom.replaceAll("(?s)<strike>.*?</strike>", "").replace("<u>", "") .replace("</u>", "").replace("&paradel;", "").replace("&parains;", "") .replace("&hf;", "").replace("<br>", ""); if (extractionSupportImpl.isExtractable(accepted_t_fom)) { String input = "<html>" + t_fom.replace("&hf;", "").replace("&parains;", "") .replace("<br> ", "↵<br>").replace("&paradel;", "<strike>¶</strike><br>") + "</html>"; cells.get(t_count, 1).setHtmlString(input); t_count++; } } wb.save(this.prjfolder + File.separator + "verifyParas.xlsx"); if (numberOfPara_source == t_count) { System.out.println("result: TRUE source: " + numberOfPara_source + " formatted: " + t_count); isValid = true; } else { System.out.println("result: false source: " + numberOfPara_source + " formatted: " + t_count); } return isValid; } public void segmentTxlf(String txlf, boolean segmentParagraph, String srcLangcode) throws Exception { ArrayList<String> txlfs = new ArrayList(); txlfs.add(txlf); Locale sourceLocale = Locale.makeLocale(srcLangcode); Configuration config = createConfigForSegmenter(segmentParagraph, srcLangcode); XliffSegmenter segmenter = new XliffSegmenter(sourceLocale); segmenter.setConfiguration(config); segmenter.segment(txlfs); } private Configuration createConfigForSegmenter(boolean segmentParagraph, String srcLangcode) throws Exception { Configuration config = new BaseConfiguration(); config.setProperty("segmenter.default.paragraphsegmenter", Boolean.valueOf(segmentParagraph)); config.setProperty("segmenter.strategy.trados.rule1enabled", "true"); config.setProperty("segmenter.default.strategy", "trados"); config.setProperty("segmenter.default.simplifier", "fontformat"); config.setProperty("breakiterator.strategy.sentence.trados.esps", createESMString(Locale.makeLocale(srcLangcode))); String abbv = getAbbreviations(srcLangcode); if (!abbv.equals("")) { config.setProperty("abbreviations." + srcLangcode.split("-")[0], abbv); } return config; } private String getAbbreviations(String srcLangcode) throws Exception { String abbv_str = ""; String key = "abbreviations." + srcLangcode.split("-")[0]; InputStream langcodeFile = ProjectManager.class.getClassLoader() .getResourceAsStream("abbreviations.properties"); InputStreamReader inReader = new InputStreamReader(langcodeFile); BufferedReader readbuffer = new BufferedReader(inReader); String line; while ((line = readbuffer.readLine()) != null) { if (line.startsWith(key)) { abbv_str = line.replace(key, ""); } } readbuffer.close(); inReader.close(); return abbv_str; } private int calculateSimilarityScore(String oldtext, String newtext) { DamerauLevenshteinAlgorithm dla = new DamerauLevenshteinAlgorithm(1, 1, 1, 1); double metrics = dla.execute(oldtext, newtext); double maxlength = Math.max(oldtext.length(), newtext.length()); int result = (int) ((maxlength - metrics) / maxlength * 100.0D); return result; } public ArrayList<String[]> populateSourceTxlf() throws Exception { System.out.println("populating source txlf with aligned segments...."); ArrayList<String[]> reportStates = new ArrayList(); ExtractionSupportImpl extractionSupportImpl = new ExtractionSupportImpl( Locale.makeLocale(this.sourcelanguage), Locale.makeLocale(this.targetlanguage)); Configuration config = new BaseConfiguration(); config.setProperty("extraction.tokens.extract", "all"); extractionSupportImpl.setConfiguration(config); Locale locale = Locale.makeLocale(this.sourcelanguage); TradosWordCounter wcounter = new TradosWordCounter(locale, config); org.dom4j.Document document_src = XmlParser.parseXmlFile(this.sourcetxlf_nonSeg); org.dom4j.Element root_src = document_src.getRootElement(); org.dom4j.Document document_src_ingt = XmlParser.parseXmlFile(this.sourcetxlf_nonSeg); org.dom4j.Element root_src_ingt = document_src_ingt.getRootElement(); org.dom4j.Document document_src_seg = XmlParser.parseXmlFile(this.sourcetxlf_seg); org.dom4j.Element root_src_seg = document_src_seg.getRootElement(); List<com.aspose.words.Node> list_source = root_src .selectNodes("//*[name() = 'group'][@restype = 'x-paragraph']"); List<com.aspose.words.Node> list_source_ingt = root_src_ingt .selectNodes("//*[name() = 'group'][@restype = 'x-paragraph']"); List<com.aspose.words.Node> list_source_seg = root_src_seg .selectNodes("//*[name() = 'group'][@restype = 'x-paragraph']"); int count = 0; int totalWC = 0; org.dom4j.Document document = XmlParser.parseXmlFile(this.alignedfile); List<org.dom4j.Element> groups = document.getRootElement().element("aligned").elements("group"); for (int i = 0; i < groups.size(); i++) { org.dom4j.Element group = (org.dom4j.Element) groups.get(i); List<org.dom4j.Element> units = group.elements("unit"); if (((org.dom4j.Element) units.get(0)).element("src_para") != null) { boolean isParaAllSegmented = true; for (int j = 0; j < units.size(); j++) { if (((org.dom4j.Element) units.get(j)).attributeValue("alignsegs").equals("false")) { isParaAllSegmented = false; break; } } String srcTextAccepted = group.elementText("text").replaceAll("(?s)<del>.*?</del>", "") .replaceAll("<(/)*ins>", ""); if (!extractionSupportImpl.isExtractable(srcTextAccepted)) { if (isParaAllSegmented) { for (int j = 0; j < units.size(); j++) { org.dom4j.Element unit = (org.dom4j.Element) units.get(j); List<org.dom4j.Element> srcsegs = unit.element("src_para").element("segments") .elements("src_seg"); List<org.dom4j.Element> trgsegs = unit.element("trg_para").element("segments") .elements("trg_seg"); for (int x = 0; x < srcsegs.size(); x++) { String[] s = new String[7]; s[0] = ((org.dom4j.Element) srcsegs.get(x)).getText(); if (x >= trgsegs.size()) { s[1] = ""; } else { org.dom4j.Element trgseg = (org.dom4j.Element) trgsegs.get(x); String id = trgseg.attributeValue("id"); if (id.startsWith("n - ")) { s[1] = trgseg.getText(); } else { List tmp_contents = new ArrayList(); if (id.contains(" - ")) { int start = Integer.parseInt(id.split(" - ")[0]); int end = Integer.parseInt(id.split(" - ")[1]); tmp_contents.addAll( (Collection) this.txlftrgsegmap.get(Integer.valueOf(start))); for (int su = start + 1; su <= end; su++) { boolean isprevendofpara = ((boolean[]) this.txlftrgsewsmap .get(Integer.valueOf(su - 1)))[1]; boolean iscurrentstartofpara = ((boolean[]) this.txlftrgsewsmap .get(Integer.valueOf(su)))[0]; if ((isprevendofpara) && (iscurrentstartofpara)) { List prevseg = (List) this.txlftrgsegmap .get(Integer.valueOf(su - 1)); int previdx = -1; for (int prev = 0; prev < prevseg.size(); prev++) { org.dom4j.Node prevnode = (org.dom4j.Node) prevseg .get(prev); if (prevnode.getNodeType() == 1) { org.dom4j.Element prevnode_e = (org.dom4j.Element) prevnode; if ((prevnode_e.getName().equals("ws")) && (prevnode_e .attributeValue("pos").equals("after"))) { previdx = prevseg.size() - prev; } } } if (previdx != -1) { tmp_contents.remove(tmp_contents.size() - previdx); } List currseg = (List) this.txlftrgsegmap .get(Integer.valueOf(su)); int curridx = -1; for (int curr = 0; curr < currseg.size(); curr++) { org.dom4j.Node currnode = (org.dom4j.Node) currseg .get(curr); if (currnode.getNodeType() == 1) { org.dom4j.Element currnode_e = (org.dom4j.Element) currnode; if ((currnode_e.getName().equals("ws")) && (currnode_e .attributeValue("pos").equals("before"))) { curridx = curr; } } } if (curridx != -1) { currseg.remove(curridx); } if (Locale.makeLocale(this.targetlanguage).isFarEast()) { tmp_contents.addAll(currseg); } else { tmp_contents.add(DocumentHelper.createText(" ")); tmp_contents.addAll(currseg); } } else { tmp_contents.addAll((Collection) this.txlftrgsegmap .get(Integer.valueOf(su))); } } } else { tmp_contents.addAll((Collection) this.txlftrgsegmap .get(Integer.valueOf(Integer.parseInt(id)))); } s[1] = trimText(assembleText(tmp_contents).replace("<br> ", "↵<br>"), false)[0]; } } s[2] = "N/A"; s[3] = "N/A"; s[4] = ((org.dom4j.Element) srcsegs.get(x)).attributeValue("tctype"); s[5] = "0"; s[6] = ""; reportStates.add(s); } } } else { String[] s = new String[7]; for (int j = 0; j < units.size(); j++) { s[0] = ((org.dom4j.Element) units.get(j)).element("src_para").elementText("text"); if (((org.dom4j.Element) units.get(j)).element("trg_para") != null) { s[1] = ((org.dom4j.Element) units.get(j)).element("trg_para").elementText("text"); } else { s[1] = ""; } s[2] = "N/A"; s[3] = "N/A"; s[4] = ((org.dom4j.Element) units.get(j)).element("src_para").attributeValue("tctype"); s[5] = "0"; s[6] = ""; reportStates.add(s); } } } else { if (isParaAllSegmented) { org.dom4j.Element txlf_group = (org.dom4j.Element) list_source.get(count); org.dom4j.Element txlf_group_ingt = (org.dom4j.Element) list_source_ingt.get(count); org.dom4j.Element txlf_group_seg = (org.dom4j.Element) list_source_seg.get(count); txlf_group.setContent(txlf_group_seg.content()); List transunits = txlf_group.elements("trans-unit"); txlf_group_ingt.setContent(txlf_group_seg.content()); List transunits_ingt = txlf_group_ingt.elements("trans-unit"); ArrayList<String> mergedsegtext = new ArrayList(); ArrayList<List> merged_trg_contents = new ArrayList(); ArrayList<String> mergedsegtctypes = new ArrayList(); ArrayList<String> keys = new ArrayList(); ArrayList<String> key_left = new ArrayList(); ArrayList<String> key_right = new ArrayList(); ArrayList<String> org_keys = new ArrayList(); ArrayList<String> trg_keys = new ArrayList(); ArrayList<List> trg_contents = new ArrayList(); ArrayList<String> src_tctypes = new ArrayList(); ArrayList<String> src_review_stats = new ArrayList(); ArrayList<String> src_ignore_stats = new ArrayList(); ArrayList<Integer> edited_idx = new ArrayList(); for (int j = 0; j < units.size(); j++) { org.dom4j.Element unit = (org.dom4j.Element) units.get(j); org.dom4j.Element src_para = unit.element("src_para"); org.dom4j.Element trg_para = unit.element("trg_para"); List src_segs = src_para.element("segments").elements("src_seg"); for (int z = 0; z < src_segs.size(); z++) { org.dom4j.Element src_seg = (org.dom4j.Element) src_segs.get(z); src_tctypes.add(src_seg.attributeValue("tctype")); src_review_stats.add(src_seg.attributeValue("needreview")); src_ignore_stats.add(src_seg.attributeValue("ignored")); keys.add(src_seg.getText().replaceAll("(?s)<del>.*?</del>", "") .replaceAll("<(/)*ins>", "").replace("<br>", "").trim()); org_keys.add(src_seg.getText()); if (trg_para != null) { List trg_segs = trg_para.element("segments").elements("trg_seg"); if (((org.dom4j.Element) trg_segs.get(z)).attributeValue("edited") .equals("true")) { edited_idx.add(Integer.valueOf(trg_contents.size())); } if (trg_segs.size() > z) { trg_keys.add(((org.dom4j.Element) trg_segs.get(z)).getText()); String id = ((org.dom4j.Element) trg_segs.get(z)).attributeValue("id"); if (id.startsWith("n - ")) { trg_contents.add(new ArrayList()); } else { List tmp_contents = new ArrayList(); if (id.contains(" - ")) { int start = Integer.parseInt(id.split(" - ")[0]); int end = Integer.parseInt(id.split(" - ")[1]); tmp_contents.addAll((Collection) this.txlftrgsegmap .get(Integer.valueOf(start))); for (int su = start + 1; su <= end; su++) { boolean isprevendofpara = ((boolean[]) this.txlftrgsewsmap .get(Integer.valueOf(su - 1)))[1]; boolean iscurrentstartofpara = ((boolean[]) this.txlftrgsewsmap .get(Integer.valueOf(su)))[0]; if ((isprevendofpara) && (iscurrentstartofpara)) { List prevseg = (List) this.txlftrgsegmap .get(Integer.valueOf(su - 1)); int previdx = -1; for (int prev = 0; prev < prevseg.size(); prev++) { org.dom4j.Node prevnode = (org.dom4j.Node) prevseg .get(prev); if (prevnode.getNodeType() == 1) { org.dom4j.Element prevnode_e = (org.dom4j.Element) prevnode; if ((prevnode_e.getName().equals("ws")) && (prevnode_e.attributeValue("pos") .equals("after"))) { previdx = prevseg.size() - prev; } } } if (previdx != -1) { tmp_contents.remove(tmp_contents.size() - previdx); } List currseg = (List) this.txlftrgsegmap .get(Integer.valueOf(su)); int curridx = -1; for (int curr = 0; curr < currseg.size(); curr++) { org.dom4j.Node currnode = (org.dom4j.Node) currseg .get(curr); if (currnode.getNodeType() == 1) { org.dom4j.Element currnode_e = (org.dom4j.Element) currnode; if ((currnode_e.getName().equals("ws")) && (currnode_e.attributeValue("pos") .equals("before"))) { curridx = curr; } } } if (curridx != -1) { currseg.remove(curridx); } if (Locale.makeLocale(this.targetlanguage).isFarEast()) { tmp_contents.addAll(currseg); } else { tmp_contents.add(DocumentHelper.createText(" ")); tmp_contents.addAll(currseg); } } else { tmp_contents.addAll((Collection) this.txlftrgsegmap .get(Integer.valueOf(su))); } } } else { tmp_contents.addAll((Collection) this.txlftrgsegmap .get(Integer.valueOf(Integer.parseInt(id)))); } trg_contents.add(tmp_contents); } } else { trg_keys.add(""); trg_contents.add(new ArrayList()); } } else { trg_keys.add(""); trg_contents.add(new ArrayList()); } if ((z == 0) && (z == src_segs.size() - 1)) { key_left.add(src_para.attributeValue("lefttrim")); key_right.add(src_para.attributeValue("righttrim")); } else if (z == 0) { key_left.add(src_para.attributeValue("lefttrim")); key_right.add("true"); } else if (z == src_segs.size() - 1) { key_left.add("true"); key_right.add(src_para.attributeValue("righttrim")); } else { key_left.add("true"); key_right.add("true"); } } } SegmenterFactory factory = new SegmenterFactory(); Configuration segconfig = createConfigForSegmenter(false, this.sourcelanguage); Segmenter segmenter = factory.getSegmenter("trados", Locale.makeLocale(this.sourcelanguage), segconfig); List<String> finsegs = segmenter .segment(group.elementText("text").replaceAll("(?s)<del>.*?</del>", "") .replaceAll("<(/)*ins>", "").replace("<br>", "").replace("<", "<") .replace(">", ">").replace("&", "&")); ArrayList<ArrayList<Integer>> indices = new ArrayList(); int key_start_index = 0; for (int k = 0; k < finsegs.size(); k++) { String finsegtext = ((String) finsegs.get(k)).replace("&", "&").replace("<", "<") .replace(">", ">"); String combined_key = ""; ArrayList<Integer> indice = new ArrayList(); for (int x = key_start_index; x < keys.size(); x++) { combined_key = combined_key + (String) keys.get(x); indice.add(Integer.valueOf(x)); if (combined_key.replace("", " ").trim().replaceAll("(\\s)+", "") .equals(finsegtext.replace("", " ").trim().replaceAll("(\\s)+", ""))) { indices.add(indice); key_start_index = x + 1; break; } } } ArrayList<Integer> merged_edited_idx = new ArrayList(); ArrayList<String[]> statss = new ArrayList(); for (int m = 0; m < indices.size(); m++) { boolean iscontentsuseable = true; ArrayList<Integer> temp_indice = (ArrayList) indices.get(m); String temp_src = ""; String temp_org_src = ""; String temp_trg = ""; List temp_trg_content = new ArrayList(); int id = 1; int rid = 1; int bxrid = 1; int bptrid = 1; int bxid = 1; int bptid = 1; HashMap<String, String> map_rid = new HashMap(); String temp_tctype = (String) src_tctypes .get(((Integer) temp_indice.get(0)).intValue()); String temp_review_stats = (String) src_review_stats .get(((Integer) temp_indice.get(0)).intValue()); for (Iterator localIterator = temp_indice.iterator(); localIterator.hasNext();) { int it = ((Integer) localIterator.next()).intValue(); temp_tctype = temp_tctype.equals(src_tctypes.get(it)) ? temp_tctype : "MIX"; temp_review_stats = ((String) src_review_stats.get(it)).equals("true") ? "true" : temp_review_stats.equals("true") ? "true" : "false"; String temp_ignore_stats = (String) src_ignore_stats.get(it); if (edited_idx.contains(Integer.valueOf(it))) { iscontentsuseable = false; } temp_src = temp_src + (String) keys.get(it); temp_org_src = temp_org_src + (String) org_keys.get(it); if (temp_ignore_stats.equals("true")) { temp_trg = temp_trg + "[skipseg]"; temp_trg_content.add(DocumentHelper.createText("[skipseg]")); } else { temp_trg = temp_trg + (String) trg_keys.get(it); List trg_content = (List) trg_contents.get(it); for (int nc = 0; nc < trg_content.size(); nc++) { org.dom4j.Node raw = (org.dom4j.Node) trg_content.get(nc); if (raw.getNodeType() == 3) { temp_trg_content.add(raw); } else if (raw.getNodeType() == 1) { org.dom4j.Element rawe = (org.dom4j.Element) raw; if (rawe.getName().equals("source")) { for (int ncc = 0; ncc < rawe.content().size(); ncc++) { org.dom4j.Node node = (org.dom4j.Node) rawe.content().get(ncc); if (node.getNodeType() == 3) { temp_trg_content.add(node); } else if (node.getNodeType() == 1) { org.dom4j.Element e = (org.dom4j.Element) node; if (!e.getName().equals("x")) { if (!e.getName().equals("ph")) { if (e.getName().equals("bx")) { if ((e.attribute("fake") != null) && (e.attributeValue("fake") .equals("true")) && (temp_indice.indexOf( Integer.valueOf(it)) != 0)) { continue; } if ((e.attribute("fake") == null) || (!e.attributeValue("fake") .equals("true"))) { } } else if (e.getName().equals("ex")) { if ((e.attribute("fake") != null) && (e.attributeValue("fake") .equals("true")) && (temp_indice.indexOf(Integer.valueOf( it)) != temp_indice.size() - 1)) { continue; } if ((e.attribute("fake") == null) || (!e.attributeValue("fake") .equals("true"))) { } } else if (e.getName().equals("bpt")) { if ((e.attribute("fake") != null) && (e.attributeValue("fake") .equals("true")) && (temp_indice.indexOf( Integer.valueOf(it)) != 0)) { continue; } if ((e.attribute("fake") == null) || (!e.attributeValue("fake") .equals("true"))) { } } else if (e.getName().equals("ept")) { if ((e.attribute("fake") != null) && (e.attributeValue("fake") .equals("true")) && (temp_indice.indexOf(Integer.valueOf( it)) != temp_indice.size() - 1)) { continue; } if ((e.attribute("fake") == null) || (!e.attributeValue("fake") .equals("true"))) { } } } } if (e.attribute("fake") != null) { e.remove(e.attribute("fake")); } temp_trg_content.add(e); } } } else if (rawe.getName().equals("ws")) { String pos = rawe.attributeValue("pos"); if (pos.equals("before")) { for (int ncc = 0; ncc < rawe.content().size(); ncc++) { org.dom4j.Node node = (org.dom4j.Node) rawe.content() .get(ncc); if (node.getNodeType() == 3) { temp_trg_content.add(0, node); } else if (node.getNodeType() == 1) { org.dom4j.Element e = (org.dom4j.Element) node; if ((!e.getName().equals("x")) && (e.getName().equals("it"))) { if (e.attributeValue("pos").equals("open")) { if ((e.attribute("fake") != null) && (e.attributeValue("fake") .equals("true")) && (temp_indice.indexOf( Integer.valueOf(it)) != 0)) { continue; } if (e.getText().equals("")) { e.setName("bx"); } else { e.setName("bpt"); } if ((e.attribute("fake") == null) || (!e.attributeValue("fake") .equals("true"))) { } } else if (e.attributeValue("pos") .equals("close")) { if ((e.attribute("fake") != null) && (e.attributeValue("fake") .equals("true")) && (temp_indice.indexOf(Integer.valueOf( it)) != temp_indice.size() - 1)) { continue; } if (e.getText().equals("")) { e.setName("ex"); } else { e.setName("ept"); } e.remove(e.attribute("ctype")); if ((e.attribute("fake") == null) || (!e.attributeValue("fake") .equals("true"))) { } } e.remove(e.attribute("pos")); } else { if (e.attribute("fake") != null) { e.remove(e.attribute("fake")); } temp_trg_content.add(0, e); } } } } else if (pos.equals("after")) { for (int ncc = 0; ncc < rawe.content().size(); ncc++) { org.dom4j.Node node = (org.dom4j.Node) rawe.content() .get(ncc); if (node.getNodeType() == 3) { temp_trg_content.add(node); } else if (node.getNodeType() == 1) { org.dom4j.Element e = (org.dom4j.Element) node; if ((!e.getName().equals("x")) && (e.getName().equals("it"))) { if (e.attributeValue("pos").equals("open")) { if ((e.attribute("fake") != null) && (e.attributeValue("fake") .equals("true")) && (temp_indice.indexOf( Integer.valueOf(it)) != 0)) { continue; } if (e.getText().equals("")) { e.setName("bx"); } else { e.setName("bpt"); } if ((e.attribute("fake") == null) || (!e.attributeValue("fake") .equals("true"))) { } } else if (e.attributeValue("pos") .equals("close")) { if ((e.attribute("fake") != null) && (e.attributeValue("fake") .equals("true")) && (temp_indice.indexOf(Integer.valueOf( it)) != temp_indice.size() - 1)) { continue; } if (e.getText().equals("")) { e.setName("ex"); } else { e.setName("ept"); } e.remove(e.attribute("ctype")); if ((e.attribute("fake") == null) || (!e.attributeValue("fake") .equals("true"))) { } } e.remove(e.attribute("pos")); } else { if (e.attribute("fake") != null) { e.remove(e.attribute("fake")); } temp_trg_content.add(e); } } } } } } } } } String[] stats = TrackChangeHelper.getTxlfTrgStatsFromTCType(temp_tctype, temp_trg); if ((stats[0].equals("1")) && (temp_review_stats.equals("true"))) { stats[2] = "fuzzy-match"; } String[] s = new String[7]; s[0] = temp_org_src.replace("<br> ", "↵<br>"); if (iscontentsuseable) { s[1] = trimText(assembleText(temp_trg_content).replace("<br> ", "↵<br>"), false)[0]; } else { s[1] = temp_trg.replace("<br> ", "↵<br>"); } if (s[1].contains("[skipseg]")) { if (s[1].replace("[skipseg]", "").trim().equals("")) { s[1] = ""; temp_trg_content = new ArrayList(); temp_trg_content.add(DocumentHelper.createText("")); temp_trg = ""; stats[0] = "1"; stats[1] = "translated"; stats[2] = "exact-match"; } else { s[1] = s[1].replace("[skipseg]", ""); temp_trg_content = replacetextinDomObj(temp_trg_content); temp_trg = temp_trg.replace("[skipseg]", ""); } } s[2] = stats[0]; s[3] = stats[2]; s[4] = temp_tctype; wcounter = new TradosWordCounter(locale, config); wcounter.countText(((org.dom4j.Element) transunits.get(m)).element("source").getText()); s[5] = Integer.toString(wcounter.getWordCount()); s[6] = ""; totalWC += wcounter.getWordCount(); reportStates.add(s); if (extractionSupportImpl.isExtractable(temp_src)) { mergedsegtext.add(temp_trg); if (!iscontentsuseable) { merged_edited_idx.add(Integer.valueOf(merged_trg_contents.size())); } merged_trg_contents.add(temp_trg_content); mergedsegtctypes.add(temp_tctype); statss.add(stats); } } for (int t = 0; t < transunits.size(); t++) { org.dom4j.Element trans_unit = (org.dom4j.Element) transunits.get(t); org.dom4j.Element trans_unit_ignt = (org.dom4j.Element) transunits_ingt.get(t); trans_unit.addAttribute("gs4tr:editStatus", "leveraged"); org.dom4j.Element source = trans_unit.element("source"); org.dom4j.Element target = trans_unit.addElement("target"); trans_unit.elements().add(source.indexOf(source.getParent()) + 2, target.clone()); trans_unit.remove(target); target = trans_unit.element("target"); org.dom4j.Element target_ignt = trans_unit_ignt.addElement("target"); trans_unit_ignt.elements().add(source.indexOf(source.getParent()) + 2, target_ignt.clone()); trans_unit_ignt.remove(target_ignt); target_ignt = trans_unit_ignt.element("target"); if (merged_edited_idx.contains(Integer.valueOf(t))) { target.setText(((String) mergedsegtext.get(t)).replace("<", "<") .replace(">", ">").replace("&", "&").trim()); target_ignt.setText(((String) mergedsegtext.get(t)).replace("<", "<") .replace(">", ">").replace("&", "&").trim()); } else { target.setContent(trimContents((List) merged_trg_contents.get(t))); target_ignt.setContent(trimContents((List) merged_trg_contents.get(t))); if (!((String[]) statss.get(t))[0].equals("75")) { org.dom4j.Element source_ingt = trans_unit_ignt.element("source"); source_ingt.setContent(trimContents((List) merged_trg_contents.get(t))); } } String[] stats = (String[]) statss.get(t); if (stats[0].equals("1")) { trans_unit.addAttribute("gs4tr:locked", "true"); } target.addAttribute("gs4tr:score", stats[0]); target.addAttribute("state", stats[1]); target.addAttribute("state-qualifier", stats[2]); if (stats[0].equals("0")) { trans_unit.remove(target); } } } else { String trgtext = ""; if (((org.dom4j.Element) units.get(0)).element("trg_para") != null) { trgtext = ((org.dom4j.Element) units.get(0)).element("trg_para").elementText("text"); } String temp_tctype = ((org.dom4j.Element) units.get(0)).element("src_para") .attributeValue("tctype"); for (int j = 1; j < units.size(); j++) { org.dom4j.Element prev_unit = (org.dom4j.Element) units.get(j - 1); org.dom4j.Element unit = (org.dom4j.Element) units.get(j); String src_tctype = unit.element("src_para").attributeValue("tctype"); temp_tctype = temp_tctype.equals(src_tctype) ? temp_tctype : "MIX"; if (unit.element("trg_para") != null) { String Rtrim = prev_unit.element("src_para").attributeValue("righttrim"); String Ltrim = unit.element("src_para").attributeValue("lefttrim"); if ((Rtrim.equals("true")) || (Ltrim.equals("true"))) { trgtext = trgtext + " " + unit.element("trg_para").elementText("text"); } else { trgtext = trgtext + unit.element("trg_para").elementText("text"); } } } org.dom4j.Element txlf_group = (org.dom4j.Element) list_source.get(count); org.dom4j.Element trans_unit = txlf_group.element("trans-unit"); trans_unit.addAttribute("gs4tr:editStatus", "leveraged"); org.dom4j.Element source = trans_unit.element("source"); org.dom4j.Element target = trans_unit.addElement("target"); trans_unit.elements().add(source.indexOf(source.getParent()) + 2, target.clone()); trans_unit.remove(target); target = trans_unit.element("target"); int lb_cnt = 0; String surfix = trgtext; while (surfix.indexOf("<br> ") != -1) { lb_cnt++; int pos = surfix.indexOf("<br> "); String prefix = surfix.substring(0, pos); target.addText(prefix.replace("<", "<").replace(">", ">").replace("&", "&")); org.dom4j.Element x = target.addElement("x"); x.addAttribute("ctype", "lb"); x.addAttribute("id", Integer.toString(lb_cnt)); x.addAttribute("equiv-text", " "); surfix = surfix.substring(pos + 5, surfix.length()); } target.addText(surfix.replace("<", "<").replace(">", ">").replace("&", "&")); String[] stats = TrackChangeHelper.getTxlfTrgStatsFromTCType(temp_tctype, trgtext); target.addAttribute("gs4tr:score", stats[0]); target.addAttribute("state", stats[1]); target.addAttribute("state-qualifier", stats[2]); String[] s = new String[7]; s[0] = group.elementText("text").replace("<br> ", "↵<br>"); s[1] = trgtext.replace("<br> ", "↵<br>"); s[2] = stats[0]; s[3] = stats[2]; s[4] = temp_tctype; wcounter = new TradosWordCounter(locale, config); wcounter.countText(source.getText()); s[5] = Integer.toString(wcounter.getWordCount()); s[6] = ""; totalWC += wcounter.getWordCount(); reportStates.add(s); } count++; } } } root_src.element("file").addAttribute("gs4tr:wordcount", Integer.toString(totalWC)); fixTxlfTrgTags(document_src); this.populatedsourcetxlf = (this.sourcefile + ".txlf"); if (new File(this.populatedsourcetxlf).exists()) { new File(this.populatedsourcetxlf).delete(); } OutputStreamWriter writer = new OutputStreamWriter( new BufferedOutputStream(new FileOutputStream(this.populatedsourcetxlf)), "UTF8"); document_src.write(writer); writer.close(); removeBlankLinesAndNameSpace(this.populatedsourcetxlf); root_src_ingt.element("file").addAttribute("gs4tr:wordcount", Integer.toString(totalWC)); fixTxlfTrgTags(document_src_ingt); String ingtfile = this.sourcefile + ".ingt.txlf"; if (new File(ingtfile).exists()) { new File(ingtfile).delete(); } OutputStreamWriter writer_ingt = new OutputStreamWriter( new BufferedOutputStream(new FileOutputStream(ingtfile)), "UTF8"); document_src_ingt.write(writer_ingt); writer_ingt.close(); removeBlankLinesAndNameSpace(ingtfile); return reportStates; } private void removeBlankLinesAndNameSpace(String file) throws Exception { BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF8")); StringBuffer sb = new StringBuffer(); String line; while ((line = in.readLine()) != null) { if (!line.trim().equals("")) { sb.append(line.replace(" xmlns=\"\"", "")); } } in.close(); OutputStreamWriter writermp = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(file)), "UTF8"); writermp.write(sb.toString()); writermp.close(); } public void fixTxlfTrgTags(org.dom4j.Document doc) throws Exception { org.dom4j.Element root = doc.getRootElement(); List<org.dom4j.Element> list_transunit = root.selectNodes("//*[name() = 'trans-unit']"); for (org.dom4j.Element e : list_transunit) { org.dom4j.Element source = e.element("source"); org.dom4j.Element target = e.element("target"); if ((target != null) && (!target.getText().equals(""))) { HashMap<String, List<Object>> src_map = new HashMap(); boolean tagnodeappear = false; boolean textnodeappear = false; boolean sourcetagsurroundtext = true; int maxid = 1; for (int i = 0; i < source.content().size(); i++) { org.dom4j.Node node = (org.dom4j.Node) source.content().get(i); if (node.getNodeType() == 1) { org.dom4j.Element tag = (org.dom4j.Element) node; maxid = Math.max(maxid, Integer.parseInt(tag.attributeValue("id"))); } } maxid++; for (int i = 0; i < source.content().size(); i++) { org.dom4j.Node node = (org.dom4j.Node) source.content().get(i); if (node.getNodeType() == 1) { if (textnodeappear) { tagnodeappear = true; } org.dom4j.Element tag = (org.dom4j.Element) node; String key = ""; if ((tag.getName().equals("ex")) || (tag.getName().equals("ept"))) { key = tag.getName() + "#" + tag.attributeValue("rid"); } else if (tag.getName().equals("x")) { key = tag.getName() + "#" + tag.attributeValue("ctype"); } else { key = tag.getName(); } if (src_map.containsKey(key)) { ((List) src_map.get(key)).add(node.clone()); } else { List<Object> list = new ArrayList(); list.add(node.clone()); src_map.put(key, list); } } else { if (tagnodeappear) { sourcetagsurroundtext = false; } textnodeappear = true; } } HashMap<String, String> rid_map = new HashMap(); List trg_contents = new ArrayList(); boolean targetnotag = true; for (int i = 0; i < target.content().size(); i++) { org.dom4j.Node node = (org.dom4j.Node) target.content().get(i); if (node.getNodeType() == 1) { org.dom4j.Element tag = (org.dom4j.Element) node; String rid = tag.attributeValue("rid"); String key = ""; if ((tag.getName().equals("ph")) || (tag.getName().equals("bpt")) || (tag.getName().equals("bx"))) { key = tag.getName(); if (src_map.containsKey(key)) { List list = (List) src_map.get(key); trg_contents.add(list.get(0)); targetnotag = false; rid_map.put(rid, ((org.dom4j.Element) list.get(0)).attributeValue("rid")); list.remove(0); if (list.size() == 0) { src_map.remove(key); } } } else if (tag.getName().equals("x")) { key = tag.getName() + "#" + tag.attributeValue("ctype"); if (src_map.containsKey(key)) { List list = (List) src_map.get(key); trg_contents.add(list.get(0)); targetnotag = false; list.remove(0); if (list.size() == 0) { src_map.remove(key); } } else if ((tag.getName().equals("x")) && (tag.attributeValue("ctype").equals("x-tab"))) { org.dom4j.Element tab = DocumentHelper.createElement("x"); tab.addAttribute("ctype", "x-tab"); tab.addAttribute("id", Integer.toString(maxid)); maxid++; tab.addAttribute("equiv-text", " "); trg_contents.add(tab); } else if ((tag.getName().equals("x")) && (tag.attributeValue("ctype").equals("lb"))) { org.dom4j.Element lb = DocumentHelper.createElement("x"); lb.addAttribute("ctype", "lb"); lb.addAttribute("id", Integer.toString(maxid)); maxid++; lb.addAttribute("equiv-text", " "); trg_contents.add(lb); } } else if ((tag.getName().equals("ex")) || (tag.getName().equals("ept"))) { String mapped_rid = (String) rid_map.get(tag.attributeValue("rid")); key = tag.getName() + "#" + mapped_rid; if (src_map.containsKey(key)) { List list = (List) src_map.get(key); trg_contents.add(list.get(0)); rid_map.put(rid, ((org.dom4j.Element) list.get(0)).attributeValue("rid")); list.remove(0); if (list.size() == 0) { src_map.remove(key); } } } } else if (node.getNodeType() == 3) { String text = node.getText(); if (text.contains("<br> ")) { String key = "x#lb"; String[] ss = text.split("<br> "); for (int s = 0; s < ss.length; s++) { trg_contents.add(DocumentHelper.createText(ss[s])); if (s < ss.length - 1) { if (src_map.containsKey(key)) { List list = (List) src_map.get(key); trg_contents.add(list.get(0)); targetnotag = false; list.remove(0); if (list.size() == 0) { src_map.remove(key); } } else { org.dom4j.Element lb = DocumentHelper.createElement("x"); lb.addAttribute("ctype", "lb"); lb.addAttribute("id", Integer.toString(maxid)); maxid++; lb.addAttribute("equiv-text", " "); trg_contents.add(lb); } } } } else if (text.contains("<br>")) { String key = "x#lb"; String[] ss = text.split("<br> "); for (int s = 0; s < ss.length; s++) { trg_contents.add(DocumentHelper.createText(ss[s])); if (s < ss.length - 1) { if (src_map.containsKey(key)) { List list = (List) src_map.get(key); trg_contents.add(list.get(0)); targetnotag = false; list.remove(0); if (list.size() == 0) { src_map.remove(key); } } else { org.dom4j.Element lb = DocumentHelper.createElement("x"); lb.addAttribute("ctype", "lb"); lb.addAttribute("id", Integer.toString(maxid)); maxid++; lb.addAttribute("equiv-text", " "); trg_contents.add(lb); } } } } else { trg_contents.add(node); } } else { trg_contents.add(node); } } while (trg_contents.size() > 0) { org.dom4j.Node nd = (org.dom4j.Node) trg_contents.get(0); if (nd.getNodeType() == 3) { if (nd.getText().trim().equals("")) { trg_contents.remove(0); } else { nd.setText(nd.getText().replaceAll("^(\\s)+", "")); break; } } else if ((nd.getNodeType() == 1) && (nd.getName().equals("x")) && (((org.dom4j.Element) nd).attributeValue("ctype").equals("lb"))) { trg_contents.remove(0); } else { if ((nd.getNodeType() != 1) || (!nd.getName().equals("x")) || (!((org.dom4j.Element) nd).attributeValue("ctype").equals("x-tab"))) { break; } trg_contents.remove(0); } } while (trg_contents.size() > 0) { org.dom4j.Node nd = (org.dom4j.Node) trg_contents.get(trg_contents.size() - 1); if (nd.getNodeType() == 3) { if (nd.getText().trim().equals("")) { trg_contents.remove(trg_contents.size() - 1); } else { nd.setText(nd.getText().replaceAll("(\\s)+$", "")); break; } } else if ((nd.getNodeType() == 1) && (nd.getName().equals("x")) && (((org.dom4j.Element) nd).attributeValue("ctype").equals("lb"))) { trg_contents.remove(trg_contents.size() - 1); } else { if ((nd.getNodeType() != 1) || (!nd.getName().equals("x")) || (!((org.dom4j.Element) nd).attributeValue("ctype").equals("x-tab"))) { break; } trg_contents.remove(trg_contents.size() - 1); } } boolean isnotignoredsegment = true; if (trg_contents.size() == 1) { org.dom4j.Node node = (org.dom4j.Node) trg_contents.get(0); if ((node.getNodeType() == 3) && (node.getText().replaceFirst("", "").equals(""))) { isnotignoredsegment = false; } } if ((targetnotag) && (sourcetagsurroundtext) && (isnotignoredsegment)) { boolean isleadingtag = true; int index = 0; for (int i = 0; i < source.content().size(); i++) { org.dom4j.Node node = (org.dom4j.Node) source.content().get(i); if (node.getNodeType() == 1) { if (isleadingtag) { trg_contents.add(index, node); index++; } else { trg_contents.add(node); } } else { isleadingtag = false; index++; } } } target.setContent(trg_contents); } } } public int[] getRepsAndFuzzyReps() throws Exception { System.out.println("gathering reps and fuzzy reps count...."); int[] fr_stats = new int[2]; return fr_stats; } private List<String[]> integrateNotesIntoSegments(String file) throws Exception { System.out.println("integrating comments...."); List<String[]> notesinfo = new ArrayList<String[]>(); int id = 0; org.dom4j.Document doc = XmlParser.parseXmlFile(file); org.dom4j.Element root = doc.getRootElement(); List transunits = root.selectNodes("//*[name() = 'trans-unit']"); for (Object transunit_obj : transunits) { Element transunit = (Element) transunit_obj; Element target = transunit.element("target"); if (target != null && !target.content().isEmpty()) { List<Element> notes = transunit.elements("note"); for (int i = notes.size() - 1; i >= 0; i--) { Element note = notes.get(i); List content = target.content(); content.add(0, DocumentHelper.createText("[%cmt_" + id + "_s%]")); content.add(DocumentHelper.createText("[%cmt_" + id + "_e%]")); target.setContent(content); String[] noteinfo = new String[3]; noteinfo[0] = note.attribute("from") != null ? note.attributeValue("from") : "Unknown"; noteinfo[1] = note.attribute("timestamp") != null ? note.attributeValue("timestamp") : "Unknown"; noteinfo[2] = note.getText(); notesinfo.add(noteinfo); id++; } } } OutputStreamWriter writer = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(file)), "UTF8"); doc.write(writer); writer.close(); return notesinfo; } public void createTargetFile() throws Exception { System.out.println("creating target file...."); if (!isReplaceStyleAvailable(new com.aspose.words.Document(this.backupsourcefile))) { throw new Exception("cannot create target file, replace styles not available"); } Configuration config = new BaseConfiguration(); MergeDOC merger = new MergeDOC(); merger.setConfiguration(config); merger.setOverwriteOriginalFile(false); XliffDocument xliffDocument = null; String tmp_translatedtxlf = this.translatedtxlf + ".tmp"; FileUtils.copyFile(new File(this.translatedtxlf), new File(tmp_translatedtxlf), true); List<String[]> notesinfo = integrateNotesIntoSegments(tmp_translatedtxlf); xliffDocument = new XliffDocument(new File(tmp_translatedtxlf)); String skeletonFile = this.sourcefile; com.aspose.words.Document document = new AsposeFactory() .createDocumentInstance(new FileInputStream(skeletonFile)); Configuration docConfig = xliffDocument.getConfiguration(new OfficeConfigurationConverterImpl()); WordDocumentAligner aligner = new WordDocumentAligner(document, xliffDocument); aligner.addWordConfiguration(docConfig); aligner.align(); String ext = this.sourcefile.substring(this.sourcefile.lastIndexOf('.'), this.sourcefile.length()); String name = new File(this.sourcefile).getName(); this.aligneddoc = (new File(this.translatedtxlf).getParent() + File.separator + name.substring(0, name.lastIndexOf(".")) + "_" + this.targetlanguage + ext); aligner.writeAlignedDocument(new File(this.aligneddoc)); com.aspose.words.Document trg_doc = new com.aspose.words.Document(this.aligneddoc); if (this.replacestyles[1] == doublestrikethrough) { for (int i = 0; i < trg_doc.getChildNodes(21, true).getCount(); i++) { Run run = (Run) trg_doc.getChildNodes(21, true).get(i); if ((run.getFont().getUnderline() == this.replacestyles[0]) && (run.getFont().getDoubleStrikeThrough() == true)) { run.getFont().setUnderline(1); run.getFont().setDoubleStrikeThrough(false); run.getFont().setStrikeThrough(true); } else if (run.getFont().getUnderline() == this.replacestyles[0]) { run.getFont().setUnderline(1); } else if (run.getFont().getDoubleStrikeThrough() == true) { run.getFont().setDoubleStrikeThrough(false); run.getFont().setStrikeThrough(true); } } } else { for (int i = 0; i < trg_doc.getChildNodes(21, true).getCount(); i++) { Run run = (Run) trg_doc.getChildNodes(21, true).get(i); if (run.getFont().getUnderline() == this.replacestyles[2]) { run.getFont().setUnderline(1); run.getFont().setStrikeThrough(true); } else if (run.getFont().getUnderline() == this.replacestyles[0]) { run.getFont().setUnderline(1); } else if (run.getFont().getUnderline() == this.replacestyles[1]) { run.getFont().setUnderline(0); run.getFont().setStrikeThrough(true); } } } removeIgnoredSegments(trg_doc, Locale.makeLocale(this.targetlanguage)); removeIgnoredParagraphs(trg_doc); insertComments(trg_doc, notesinfo); trg_doc.save(this.aligneddoc); if (xliffDocument != null) { xliffDocument.dispose(); } new File(tmp_translatedtxlf).delete(); } private void insertComments(com.aspose.words.Document doc, List<String[]> notesinfo) throws Exception { String reg = "\\[%cmt_(\\d)+_[se]%\\]"; Pattern regex = Pattern.compile(reg); List<Run> collectedruns = new ArrayList<Run>(); doc.getRange().replace(regex, new ReplaceEvaluatorCommentsPlaceHolders(notesinfo, doc, collectedruns), false); for (Run run : collectedruns) { String[] ss = run.getText().substring(2, run.getText().length() - 2).split("_"); String[] noteinfo = notesinfo.get(Integer.parseInt(ss[1])); if (ss[2].equals("s")) { Comment comment = new Comment(doc); Paragraph para = new Paragraph(doc); Run newrun = new Run(doc, noteinfo[2].replace("\n", ControlChar.LINE_BREAK)); para.getRuns().add(newrun); comment.getParagraphs().add(para); comment.setAuthor(noteinfo[0]); SimpleDateFormat formatter = new SimpleDateFormat("yyyyMMdd'T'HHmmssZ"); Date date = formatter.parse(noteinfo[1].replaceAll("[-:]", "").replaceAll("Z$", "+0000")); comment.setDateTime(date); CommentRangeStart start = new CommentRangeStart(doc, Integer.parseInt(ss[1])); run.getParentNode().insertAfter(comment, run); run.getParentNode().insertAfter(start, comment); } else { CommentRangeEnd end = new CommentRangeEnd(doc, Integer.parseInt(ss[1])); run.getParentNode().insertAfter(end, run); } run.setText(""); } } public void createTargetCompareFile(String preservefmt) throws Exception { System.out.println("creating target compare file...."); boolean iscompareerror = false; String ext = this.sourcefile.substring(this.sourcefile.lastIndexOf('.'), this.sourcefile.length()); String name = new File(this.sourcefile).getName(); this.comparedoc = (new File(this.translatedtxlf).getParent() + File.separator + name.substring(0, name.lastIndexOf(".")) + "_tracked" + ext); com.aspose.words.Document doc_old = new com.aspose.words.Document(this.reformattedtargetfile); String temp_old = new File(this.translatedtxlf).getParent() + File.separator + "old.docx"; doc_old.save(temp_old); doc_old = new com.aspose.words.Document(temp_old); com.aspose.words.Document doc_new = new com.aspose.words.Document(this.aligneddoc); String temp_new = new File(this.translatedtxlf).getParent() + File.separator + "new.docx"; doc_new.save(temp_new); doc_new = new com.aspose.words.Document(temp_new); normalizeSpaceAfterESM(doc_new, Locale.makeLocale(this.targetlanguage)); //just want to delete comments DeleteCommentsAndShapeAltText(doc_new); try { doc_old.compare(doc_new, "user", new Date()); } catch (Exception ex) { iscompareerror = true; ex.printStackTrace(); } if (preservefmt.equals("true")) { new File(this.aligneddoc).delete(); com.aspose.words.Document doc_clone = doc_old.deepClone(); for (int i = 0; i < doc_clone.getRevisions().getCount(); i++) { Revision rev = doc_clone.getRevisions().get(i); if ((rev.getRevisionType() == 2) || (rev.getRevisionType() == 3)) { rev.reject(); i--; } else if ((rev.getParentNode().getText().equals(ControlChar.PARAGRAPH_BREAK)) || (rev.getParentNode().getText().equals(""))) { rev.reject(); i--; } } for (int i = 0; i < doc_clone.getRevisions().getCount(); i++) { Revision rev = doc_clone.getRevisions().get(i); rev.accept(); i--; } doc_clone.save(this.aligneddoc); iscompareerror = false; doc_old = new com.aspose.words.Document(temp_old); try { doc_old.compare(doc_clone, "user", new Date()); } catch (Exception ex) { iscompareerror = true; ex.printStackTrace(); } if (!iscompareerror) { for (int i = 0; i < doc_old.getRevisions().getCount(); i++) { Revision rev = doc_old.getRevisions().get(i); if ((rev.getRevisionType() == 2) || (rev.getRevisionType() == 3)) { rev.reject(); i--; } else if ((rev.getParentNode().getText().equals(ControlChar.PARAGRAPH_BREAK)) || (rev.getParentNode().getText().equals(""))) { rev.reject(); i--; } } doc_old.save(this.comparedoc); } } else if (!iscompareerror) { for (int i = 0; i < doc_old.getRevisions().getCount(); i++) { Revision rev = doc_old.getRevisions().get(i); if ((rev.getRevisionType() == 2) || (rev.getRevisionType() == 3)) { rev.accept(); i--; } } doc_old.save(this.comparedoc); } new File(temp_old).delete(); new File(temp_new).delete(); } private static int getTagPenalty(org.dom4j.Element source) { int penalty = 0; penalty = (int) Math.round(source.elements("ut").size() * 0.5D); return penalty; } private static String wordToHtml(String paratext) { paratext = paratext.replace("<del></del>", "").replace("<ins></ins>", "").replace("</del><del>", "") .replace("</ins><ins>", ""); paratext = paratext.replaceAll("\t", "	").replace(ControlChar.LINE_BREAK, "<cr>↵</cr><br>") .replace(String.valueOf('\036'), "‑").replace(String.valueOf('\037'), "") .replace(String.valueOf('\016'), "").replace(String.valueOf('\f'), "") .replace(String.valueOf('\f'), ""); return paratext; } private static String HtmlToPlain(String paratext) { paratext = paratext.replaceAll("	", " ").replace("<cr>↵</cr><br>", " ").replace("‑", "-") .replace("<", "<").replace(">", ">"); return paratext; } private static void trimParaLeadingTrailingSpace(com.aspose.words.Document doc) { for (int i = 0; i < doc.getChildNodes(8, true).getCount(); i++) { Paragraph para = (Paragraph) doc.getChildNodes(8, true).get(i); if (para.getChildNodes(21, true).getCount() != 0) { Run firstrun = (Run) para.getChildNodes(21, true).get(0); if (firstrun.getText().trim().equals("")) { firstrun.remove(); } } if (para.getChildNodes(21, true).getCount() != 0) { Run lastrun = (Run) para.getChildNodes(21, true).get(para.getChildNodes(21, true).getCount() - 1); if (lastrun.getText().trim().equals("")) { lastrun.remove(); } } } } private static void UnlinkFields(com.aspose.words.Document doc) { for (int i = 0; i < doc.getChildNodes(22, true).getCount(); i++) { com.aspose.words.Node start = doc.getChildNodes(22, true).get(i); com.aspose.words.Node curNode = start; if (((FieldStart) curNode).getFieldType() != 13) { while ((curNode.getNodeType() != 23) && (curNode.getNodeType() != 24)) { curNode = curNode.nextPreOrder(doc); if (curNode == null) { break; } } if (curNode != null) { RemoveSequence(start, curNode); } start.remove(); i--; } } } private static void RemoveSequence(com.aspose.words.Node start, com.aspose.words.Node end) { com.aspose.words.Node curNode = start.nextPreOrder(start.getDocument()); while ((curNode != null) && (!curNode.equals(end))) { com.aspose.words.Node nextNode = curNode.nextPreOrder(start.getDocument()); if (curNode.isComposite()) { if ((!((CompositeNode) curNode).getChildNodes(0, true).contains(end)) && (!((CompositeNode) curNode).getChildNodes(0, true).contains(start))) { nextNode = curNode.getNextSibling(); curNode.remove(); } } else { curNode.remove(); } curNode = nextNode; } } private static String cumulateSegType(String oldtype, String newtype) { if (oldtype.equals("mix")) { return "mix"; } if (oldtype.equals("insertion")) { if (newtype.equals("insertion")) { return "insertion"; } return "mix"; } if (oldtype.equals("deletion")) { if (newtype.equals("deletion")) { return "deletion"; } return "mix"; } if (newtype.equals("regular")) { return "regular"; } return "mix"; } private String getParaText(Paragraph para) throws Exception { String paratext = ""; boolean isInField = false; for (int i = 0; i < para.getChildNodes(0, true).getCount(); i++) { com.aspose.words.Node nd = para.getChildNodes(0, true).get(i); if ((nd.getNodeType() == 21) && (!isInField)) { Run run = (Run) nd; if (!run.getFont().getName().equals("Wingdings")) { paratext = paratext + run.getText(); } } else if (nd.getNodeType() == 22) { isInField = true; } else if (nd.getNodeType() == 24) { isInField = false; } } return paratext.trim(); } private String getParaRejectedText(Paragraph para) throws Exception { String paratext = ""; boolean isInField = false; for (int i = 0; i < para.getChildNodes(0, true).getCount(); i++) { com.aspose.words.Node nd = para.getChildNodes(0, true).get(i); if ((nd.getNodeType() == 21) && (!isInField)) { Run run = (Run) nd; if (!run.getFont().getName().equals("Wingdings")) { if ((!run.isInsertRevision()) || (run.isDeleteRevision())) { paratext = paratext + run.getText(); } } } else if (nd.getNodeType() == 22) { isInField = true; } else if (nd.getNodeType() == 24) { isInField = false; } } return paratext.trim(); } private String getParaAccpetedText(Paragraph para) throws Exception { String paratext = ""; boolean isInField = false; for (int i = 0; i < para.getChildNodes(0, true).getCount(); i++) { com.aspose.words.Node nd = para.getChildNodes(0, true).get(i); if ((nd.getNodeType() == 21) && (!isInField)) { Run run = (Run) nd; if (!run.getFont().getName().equals("Wingdings")) { if (!run.isDeleteRevision()) { paratext = paratext + run.getText(); } } } else if (nd.getNodeType() == 22) { isInField = true; } else if (nd.getNodeType() == 24) { isInField = false; } } return paratext.trim(); } private void DeleteCommentsAndShapeAltText(com.aspose.words.Document doc) { while (doc.getChildNodes(19, true).getCount() != 0) { doc.getChildNodes(19, true).get(0).remove(); } for (int i = 0; i < doc.getChildNodes(0, true).getCount(); i++) { com.aspose.words.Node node = doc.getChildNodes(0, true).get(i); if (node.getNodeType() == 19) { node.remove(); i--; } else if (node.getNodeType() == 18) { Shape shape = (Shape) node; shape.setAlternativeText(""); } } } private void removeHyperlinks(com.aspose.words.Document doc) throws Exception { NodeCollection<FieldStart> starts = doc.getChildNodes(22, true); ArrayList<FieldStart> hyperlinkStarts = new ArrayList(); for (FieldStart start : starts) { if (start.getFieldType() == 88) { hyperlinkStarts.add(start); com.aspose.words.Node currentNode = start.getNextSibling(); com.aspose.words.Node fieldSeparator = null; while ((currentNode != null) && (currentNode.getNodeType() != 23)) { currentNode = currentNode.getNextSibling(); currentNode.getPreviousSibling().remove(); } fieldSeparator = currentNode; while ((currentNode != null) && (currentNode.getNodeType() != 24)) { if (currentNode.getNodeType() == 21) { ((Run) currentNode).getFont().setColor(Color.BLACK); } currentNode = currentNode.getNextSibling(); } if (fieldSeparator != null) { fieldSeparator.remove(); } if (currentNode != null) { currentNode.remove(); } } } for (FieldStart start : hyperlinkStarts) { start.remove(); } } private boolean isReplaceStyleAvailable(com.aspose.words.Document doc) throws Exception { List<Integer> availables = new ArrayList(); //disable for now because WF5 does not display double strike through formatting correctly. //availables.add(Integer.valueOf(doublestrikethrough)); for (int i = 0; i < Underline.getValues().length; i++) { int value = Underline.getValues()[i]; if ((value != 0) && (value != 1)) { availables.add(Integer.valueOf(value)); } } for (int i = 0; i < doc.getChildNodes(21, true).getCount(); i++) { Run run = (Run) doc.getChildNodes(21, true).get(i); if (run.getFont().getDoubleStrikeThrough()) { availables.remove(Integer.valueOf(doublestrikethrough)); } if (availables.contains(Integer.valueOf(run.getFont().getUnderline()))) { availables.remove(Integer.valueOf(run.getFont().getUnderline())); } } if (availables.contains(Integer.valueOf(doublestrikethrough))) { if (availables.size() >= 1) { this.replacestyles[0] = ((Integer) availables.get(1)).intValue(); this.replacestyles[1] = doublestrikethrough; return true; } return false; } if (availables.size() >= 3) { this.replacestyles[0] = ((Integer) availables.get(1)).intValue(); this.replacestyles[1] = ((Integer) availables.get(2)).intValue(); this.replacestyles[2] = ((Integer) availables.get(3)).intValue(); return true; } return false; } private boolean isTrackedHidden(Revision rev) throws Exception { com.aspose.words.Node parent = rev.getParentNode(); if ((parent != null) && (parent.isComposite())) { CompositeNode cn = (CompositeNode) parent; for (int i = 0; i < cn.getChildNodes(21, true).getCount(); i++) { Run run = (Run) cn.getChildNodes(21, true).get(i); if (run.getFont().getHidden()) { return true; } } } return false; } public void normalizeSpaceAfterESM(com.aspose.words.Document doc, Locale locale) throws Exception { String[] esm = createESMString(locale).split(","); for (String sesm : esm) { String reg = "\\" + sesm + "[ ]+"; Pattern regex = Pattern.compile(reg); String rep = sesm; if ((!locale.isFarEast()) || (locale.isKorean())) { rep = rep + " "; } doc.getRange().replace(regex, rep, new FindReplaceOptions()); } for (int i = 0; i < doc.getChildNodes(8, true).getCount(); i++) { Paragraph para = (Paragraph) doc.getChildNodes(8, true).get(i); Run lastrun = null; for (int j = para.getChildNodes(21, true).getCount() - 1; j >= 0; j--) { Run run = (Run) para.getChildNodes(21, true).get(j); if (run != null) { lastrun = run; break; } } if (lastrun != null) { lastrun.setText(lastrun.getText().replaceAll("\\s+$", " ")); } } } public void removeIgnoredSegments(com.aspose.words.Document doc, Locale locale) throws Exception { String[] esm = createESMString(locale).split(","); for (String sesm : esm) { String reg = "\\" + sesm; if ((!locale.isFarEast()) || (locale.isKorean())) { reg = reg + " "; } else { reg = reg + ""; } Pattern regex = Pattern.compile(reg); String rep = sesm; doc.getRange().replace(regex, rep, new FindReplaceOptions()); } } public void removeIgnoredParagraphs(com.aspose.words.Document doc) throws Exception { for (int i = 0; i < doc.getChildNodes(8, true).getCount(); i++) { Paragraph para = (Paragraph) doc.getChildNodes(8, true).get(i); if ((para.getText().contains("")) && (para.getText().replace("", "").replace(" ", "").equals(ControlChar.PARAGRAPH_BREAK))) { para.remove(); i--; } } } private String createESMString(Locale locale) throws Exception { String ESM = ""; for (int i = 0; i < this.esps.length; i++) { char c = this.esps[i]; if (i == 0) { ESM = ESM + c; } else { ESM = ESM + "," + c; } } if ((locale.isJapanese()) || (locale.isKorean()) || (locale.isChinese())) { int i = 0; for (int max = this.esps.length; i < max; i++) { if (this.esps[i] == ':') { ESM = ESM + ","; } else if (this.esps[i] == '.') { ESM = ESM + ","; ESM = ESM + ","; ESM = ESM + ","; } else if (this.esps[i] == '!') { ESM = ESM + ",?"; } else if (this.esps[i] == '?') { ESM = ESM + ","; } } } else if (locale.isHindi()) { ESM = ESM + ","; } if (locale.isArabic()) { int i = 0; for (int max = this.esps.length; i < max; i++) { if (this.esps[i] == '.') { ESM = ESM + ","; } else if (this.esps[i] == '?') { ESM = ESM + ","; } } } return ESM; } private boolean isNormalTextExpansion(String source, String target) { if (source.length() > target.length() * textexpansionthreshold) { return false; } if (target.length() > source.length() * textexpansionthreshold) { return false; } return true; } }