Usage examples for org.jdom2.Element.setAttribute(String, String)
public Element setAttribute(final String name, final String value)
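Sets the value of the named attribute on this element. Any existing attribute with the same name (in no namespace) is replaced; the element itself is returned so calls can be chained.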
From source file:eu.himeros.digitaledition.AlignedQuotationParser.java
License:Open Source License
/**
 * Tokenises one line of text on single spaces and appends a {@code w} element to
 * {@code rootOut} for every token made up only of characters in the ranges
 * U+0380-U+03FF and U+1F00-U+1FFF.
 *
 * <p>Newlines, tabs, commas, semicolons and full stops are stripped from each token
 * first. A non-matching token that ends with {@code -} is remembered (without the
 * hyphen) in {@code hyphenatedFirstPart} and prepended to the next matching token.
 * Every other non-matching token is dropped.
 *
 * @param textLine the line to tokenise
 */
private void parseTextLine(String textLine) {
    for (String rawToken : textLine.split(" ")) {
        String word = rawToken.replaceAll("[\n\t,;.]+", "");
        if (!word.matches("[\u0380-\u03FF\u1F00-\u1FFF]+")) {
            // Not a pure match: only a trailing hyphen is of interest.
            if (word.endsWith("-")) {
                hyphenatedFirstPart = word.substring(0, word.length() - 1);
            }
            continue;
        }
        if (hyphenatedFirstPart != null) {
            // Re-join the word that was split across a hyphen.
            word = hyphenatedFirstPart + word;
            hyphenatedFirstPart = null;
        }
        Element wordEl = new Element("w");
        wordEl.setAttribute("id", String.valueOf(id++));
        wordEl.setAttribute("text", word);
        wordEl.setAttribute("uc", upperTrans.parse(word));
        rootOut.addContent(wordEl);
    }
}
From source file:eu.himeros.digitaledition.AlignedQuotationParser.java
License:Open Source License
private void injectOcc(Element root) { makeOccHm(root);//from w w w. j a v a 2 s. co m List<Element> words = root.getChildren(); for (Element word : words) { String upWord = word.getAttributeValue("uc"); String occ = occHm.get(upWord).toString(); word.setAttribute("occ", occ); } }
From source file:eu.himeros.hocr.FlatXml.java
License:Open Source License
/**
 * Parses {@code inFile}, flattens its {@code ocr_carea} structure into a single
 * {@code ocr_page} div under an XHTML-namespaced root, and writes the pretty-printed
 * result to {@code outFile}.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>{@code head} (when present) and its children are moved to the XHTML namespace
 *       and a {@code title} element with text {@code ocr} is appended.</li>
 *   <li>{@code body} (when present) is moved to the XHTML namespace. Every element with
 *       {@code class='ocr_carea'} is replaced by XML comments marking where the area
 *       and its paragraphs began and ended, with clones of the line elements placed
 *       directly inside a new {@code div class="ocr_page"}. The {@code id} attribute is
 *       removed from lines and their children, and {@code lang} from the children.</li>
 *   <li>The old root's content is moved under a new {@code html} root.</li>
 * </ol>
 *
 * <p>Unlike the previous version, a missing {@code head} or {@code body} is tolerated
 * instead of raising a {@link NullPointerException}, and the output writer is always
 * closed. The output is written as UTF-8 so that the bytes match the encoding JDOM
 * declares in the XML prolog for the default pretty format.
 *
 * @param inFile  source document; its name (minus the first character, with
 *                {@code .html} replaced by {@code .png}) forms the page id
 * @param outFile destination file, overwritten if it exists
 * @throws Exception if the input cannot be parsed or the output cannot be written
 */
private void init(File inFile, File outFile) throws Exception {
    SAXBuilder builder = new SAXBuilder();
    Document doc = builder.build(inFile);
    Element root = doc.getRootElement();
    Namespace oldns = root.getNamespace();
    Element newRoot = new Element("html", "http://www.w3.org/1999/xhtml");
    Namespace xmlns = newRoot.getNamespace();

    Element head = root.getChild("head", oldns);
    if (head != null) {
        head.setNamespace(xmlns);
        for (Element child : head.getChildren()) {
            child.setNamespace(xmlns);
        }
        Element title = new Element("title", xmlns);
        title.addContent("ocr");
        head.addContent(title);
    }

    Element body = root.getChild("body", oldns);
    if (body != null) {
        body.setNamespace(xmlns);

        Element page = new Element("div", xmlns);
        page.setAttribute("class", "ocr_page");
        page.setAttribute("id", "i" + inFile.getName().substring(1).replace(".html", ".png"));

        XPathExpression<Element> xpath = XPathFactory.instance().compile("//*[@class='ocr_carea']",
                Filters.element(), null, Namespace.getNamespace("ns", "http://www.w3.org/1999/xhtml"));
        List<Element> careaElL = xpath.evaluate(body);
        for (Element careaEl : careaElL) {
            // The area and paragraph wrappers survive only as comments; lines are hoisted into the page.
            page.addContent(new Comment("<div class=\"" + careaEl.getAttributeValue("class") + "\" title=\""
                    + careaEl.getAttributeValue("title") + "\">"));
            for (Element pEl : careaEl.getChildren()) {
                page.addContent(new Comment("<p>"));
                for (Element lineEl : pEl.getChildren()) {
                    lineEl.removeAttribute("id");
                    lineEl.setNamespace(xmlns);
                    for (Element child : lineEl.getChildren()) {
                        child.removeAttribute("id");
                        child.removeAttribute("lang");
                        child.removeAttribute("lang", xmlns);
                        child.setNamespace(xmlns);
                    }
                    // Clone because lineEl is still attached to its paragraph.
                    page.addContent(lineEl.clone());
                }
                page.addContent(new Comment("</p>"));
            }
            page.addContent(new Comment("</div>"));
        }

        body.removeContent();
        body.addContent(page);
    }

    newRoot.addContent(root.removeContent());
    doc.detachRootElement();
    doc.setRootElement(newRoot);

    XMLOutputter xmlOutputter = new XMLOutputter(Format.getPrettyFormat());
    // Explicit UTF-8 and guaranteed close: FileWriter would use the platform charset and leak the handle.
    try (BufferedWriter writer = java.nio.file.Files.newBufferedWriter(outFile.toPath(),
            java.nio.charset.StandardCharsets.UTF_8)) {
        xmlOutputter.output(doc, writer);
    }
}
From source file:eu.himeros.hocr.GrcContextFilterMananger.java
License:Open Source License
@Override public void adjustPreviousSuitableElement() { Element prevEl = queue.poll(); Element currEl = queue.peek(); Element nextEl = queue.get(1); try {//from w ww . j a v a2s . co m Element prevInfo = prevEl.getChild("span", prevEl.getNamespace()); Element currInfo = currEl.getChild("span", currEl.getNamespace()); Element nextInfo = nextEl.getChild("span", nextEl.getNamespace()); if (currInfo != null && "UCWORD".equals(currInfo.getAttributeValue("class"))) { String suggestions = ""; try { suggestions = filterSuggestions(currInfo.getText(), prevInfo.getText(), nextInfo.getText(), currInfo.getAttributeValue("title")); } catch (NullPointerException npex) { // } if (suggestions.trim().contains(" ")) { currInfo.setAttribute("title", suggestions); } else if (suggestions.length() > 0) { currInfo.setAttribute("class", "CORRWORD"); currInfo.setAttribute("title", currInfo.getText()); currInfo.setText(suggestions); } } } catch (Exception ex) { ex.printStackTrace(System.err); } }
From source file:eu.himeros.hocr.HocrInfoAggregator.java
License:Open Source License
private void parseOcrWord(Element ocrWord) { String text = ocrWord.getText(); text = adjuster.adjust(new String[] { "monotonic2polytonic", "ocr2u" }, normalizer2.normalize(text)); String upText = low2upL1Trans.parse(text); if (text.endsWith("-")) { ocrWord.setAttribute("idx", "" + id++); hyphenPart1 = ocrWord;//from w w w . ja v a2s . co m return; } else if (hyphenPart1 != null) { text = adjuster.adjust(new String[] { "monotonic2polytonic", "ocr2u" }, normalizer2.normalize(parseOcrHyphenatedWord(hyphenPart1, ocrWord))); upText = low2upL1Trans.parse(text); } Element infoSpan = new Element("span", xmlns); infoSpan.setText(adjuster.adjust(new String[] { "monotonic2polytonic", "ocr2u" }, normalizer2.normalize(ocrWord.getText()))); upText = upText.replaceAll(l1NonAlphabeticFilter, ""); infoSpan.setAttribute("id", "" + id++); Integer occ; occ = ((occ = occHm.get(upText)) == null ? 1 : ++occ); occHm.put(upText, occ); infoSpan.setAttribute("uc", upText); try { ocrWord.getContent(0).detach(); } catch (Exception ex) { } Token token = new Token(text); token = setClassiFicationAndScore(token); infoSpan = setInfoSpanClass(token, infoSpan); ocrWord.addContent(infoSpan); l1Fm.addSuitableElement(ocrWord); l1Fm.adjustPreviousSuitableElement(); if (hyphenPart1 != null) { text = hyphenPart1.getText(); hyphenPart1.getContent(0).detach(); Element infoSpan1 = new Element("span", xmlns); infoSpan1.setAttribute("class", infoSpan.getAttributeValue("class")); infoSpan1.setText(text); hyphenPart1.addContent(infoSpan1); hyphenPart1 = null; //TODO: ??? } }
From source file:eu.himeros.hocr.HocrInfoAggregator.java
License:Open Source License
private Element setInfoSpanClass(Token token, Element infoSpan) { switch (token.getClassification()) { case WORD://from www . j a va 2s. c om infoSpan.setAttribute("class", "WORD"); break; case UCWORD: infoSpan.setAttribute("class", "UCWORD"); infoSpan.setAttribute("title", makeSuggestions(token)); break; case SYLLABICSEQ: infoSpan.setAttribute("class", "SYLLABICSEQ"); infoSpan.setAttribute("title", makeSuggestions(token)); break; case CHARSEQ: infoSpan.setAttribute("class", "CHARSEQ"); infoSpan.setAttribute("title", makeSuggestions(token)); break; case BADONE: infoSpan.setAttribute("class", "BADONE"); infoSpan.setAttribute("title", makeSuggestions(token)); break; case BADMANY: infoSpan.setAttribute("class", "BADMANY"); infoSpan.setAttribute("title", makeSuggestions(token)); break; case L2WORD: infoSpan.setAttribute("class", "L2WORD"); makeSuggestions(token); infoSpan.setAttribute("title", token.getText()); break; } return infoSpan; }
From source file:eu.himeros.hocr.HocrInfoAggregator.java
License:Open Source License
private void updateElements() { xpath = XPathFactory.instance().compile("//ns:span[@uc!='']", Filters.element(), null, Namespace.getNamespace("ns", "http://www.w3.org/1999/xhtml")); List<Element> elements = xpath.evaluate(root); for (Element element : elements) { String uc = element.getAttributeValue("uc"); element.setAttribute("occ", "" + occHm.get(uc)); try {//w w w . j a va 2 s .c o m if (occHm.get(uc) == 1) { element.setAttribute("anchor", nearGtHm.get(uc).getAttributeValue("uc")); element.setAttribute("anchor-id", nearGtHm.get(uc).getAttributeValue("id")); if ("CORRWORD".equals(element.getAttributeValue("class")) | "UCWORD".equals(element.getAttributeValue("class"))) { String title = element.getAttributeValue("title"); title = nearGtHm.get(uc).getAttributeValue("text") + "\u261a " + title; element.setAttribute("title", title); } } } catch (Exception ex) { continue; } } }
From source file:eu.himeros.hocr.HocrInfoAggregator.java
License:Open Source License
private void makeCompliantHocr() { xpath = XPathFactory.instance().compile("//ns:span[@id|@idx]", Filters.element(), null, Namespace.getNamespace("ns", "http://www.w3.org/1999/xhtml")); List<Element> elements = xpath.evaluate(root); int spanId = 0; for (Element span : elements) { if (span.getAttribute("idx") != null) { try { span = span.getChildren().get(0); } catch (Exception ex) { // }/* w ww.j a v a 2 s. c o m*/ } LinkedList<Attribute> attributeLl = new LinkedList(span.getParentElement().getAttributes()); attributeLl.addFirst(new Attribute("id", "w_" + spanId++)); span.getParentElement().setAttributes(attributeLl); String[] suggestions = null; String title = span.getAttributeValue("title"); if (title != null) { suggestions = title.split(" "); } if (suggestions == null) { suggestions = new String[] { "" }; } Element ins = new Element("ins", xmlns); ins.setAttribute("class", "alt"); ins.setAttribute("title", makeNlp(span.getAttributeValue("class"))); ins.setText(span.getText()); span.removeContent(); span.addContent(ins); span.setAttribute("class", "alternatives"); span.removeAttribute("uc"); span.removeAttribute("occ"); span.removeAttribute("title"); span.removeAttribute("anchor"); span.removeAttribute("anchor-id"); span.removeAttribute("id"); span.getParentElement().removeAttribute("idx"); span.removeAttribute("whole"); span.getParentElement().removeAttribute("whole"); if (title == null || "".equals(title)) { continue; } double score = 0.90; for (String suggestion : suggestions) { if (suggestion == null || "".equals(suggestion)) { continue; } Element del = new Element("del", xmlns); del.setAttribute("title", "nlp " + String.format("%.2f", score).replaceAll(",", ".")); score = score - 0.01; suggestion = suggestion.replaceAll(l1PunctMarkFilter, ""); Matcher leftMatcher = l1LeftPunctMarkPattern.matcher(ins.getText()); if (leftMatcher.matches()) { suggestion = leftMatcher.group(1) + suggestion; } Matcher rightMatcher = l1RightPunctMarkPattern.matcher(ins.getText()); 
if (rightMatcher.matches()) { String ngtSymbol = ""; if (suggestion.endsWith("\u261a")) { ngtSymbol = "\u261a"; suggestion = suggestion.substring(0, suggestion.length() - 1); } suggestion = suggestion + rightMatcher.group(1) + ngtSymbol; } ///!!!! if (suggestion.endsWith("\u261a") && ins.getParentElement().getParentElement() .getAttributeValue("lang", Namespace.XML_NAMESPACE) != null) { String buff = suggestion.substring(0, suggestion.length() - 1); sa.align(buff, ins.getText()); double sim = 1 - sa.getEditDistance() / Math.max((double) buff.length(), (double) ins.getText().length()); if (sim > 0.6) { suggestion = ins.getText() + "\u261b"; ins.setText(buff); ins.setAttribute("title", "nlp 0.70"); } } del.addContent(suggestion); span.addContent(del); } } }
From source file:eus.ixa.ixa.pipe.convert.AbsaSemEval.java
License:Apache License
public static String nafToAbsa2015(String inputNAF) throws IOException { Path kafPath = Paths.get(inputNAF); KAFDocument kaf = KAFDocument.createFromFile(kafPath.toFile()); Set<String> reviewIds = getReviewIdsFromXpathAttribute(kaf); // root element in ABSA 2015 and 2016 format Element reviewsElem = new Element("Reviews"); Document doc = new Document(reviewsElem); // creating Reviews children of Review for (String reviewId : reviewIds) { Element reviewElem = new Element("Review"); reviewElem.setAttribute("rid", reviewId); Element sentencesElem = new Element("sentences"); // getting the sentences in the review List<List<WF>> sentencesByReview = getSentencesByReview(kaf, reviewId); for (List<WF> sent : sentencesByReview) { String sentId = sent.get(0).getXpath(); Integer sentNumber = sent.get(0).getSent(); // getting text element from word forms in NAF String textString = NAFUtils.getSentenceStringFromWFs(sent); Element sentenceElem = new Element("sentence"); sentenceElem.setAttribute("id", sentId); Element textElem = new Element("text"); textElem.setText(textString); sentenceElem.addContent(textElem); // creating opinions element for sentence List<Opinion> opinionsBySentence = getOpinionsBySentence(kaf, sentNumber); Element opinionsElem = new Element("Opinions"); if (!opinionsBySentence.isEmpty()) { // getting opinion info from NAF Opinion layer for (Opinion opinion : opinionsBySentence) { Element opinionElem = new Element("Opinion"); // String polarity = opinion.getOpinionExpression().getPolarity(); String category = opinion.getOpinionExpression().getSentimentProductFeature(); String targetString = opinion.getStr(); int fromOffset = opinion.getOpinionTarget().getTerms().get(0).getWFs().get(0).getOffset(); List<WF> targetWFs = opinion.getOpinionTarget().getTerms() .get(opinion.getOpinionTarget().getTerms().size() - 1).getWFs(); int toOffset = targetWFs.get(targetWFs.size() - 1).getOffset() + targetWFs.get(targetWFs.size() - 1).getLength(); 
opinionElem.setAttribute("target", targetString); opinionElem.setAttribute("category", category); // TODO we still do not have polarity here opinionElem.setAttribute("polarity", "na"); opinionElem.setAttribute("from", Integer.toString(fromOffset)); opinionElem.setAttribute("to", Integer.toString(toOffset)); opinionsElem.addContent(opinionElem); }//from ww w . j a v a 2s. c o m } sentenceElem.addContent(opinionsElem); sentencesElem.addContent(sentenceElem); } reviewElem.addContent(sentencesElem); reviewsElem.addContent(reviewElem); } // end of review XMLOutputter xmlOutput = new XMLOutputter(); Format format = Format.getPrettyFormat(); xmlOutput.setFormat(format); return xmlOutput.outputString(doc); }
From source file:eus.ixa.ixa.pipe.convert.AbsaSemEval.java
License:Apache License
public static String nafToAbsa2014(String kafDocument) { KAFDocument kaf = null;/* ww w. j a v a2 s.com*/ try { Path kafPath = Paths.get(kafDocument); kaf = KAFDocument.createFromFile(kafPath.toFile()); } catch (IOException e) { e.printStackTrace(); } Element sentencesElem = new Element("sentences"); Document doc = new Document(sentencesElem); for (List<WF> sent : kaf.getSentences()) { String sentId = sent.get(0).getXpath(); Integer sentNumber = sent.get(0).getSent(); // getting text element from WFs in NAF String textString = NAFUtils.getSentenceStringFromWFs(sent); Element sentenceElem = new Element("sentence"); sentenceElem.setAttribute("id", sentId); Element textElem = new Element("text"); textElem.setText(textString); sentenceElem.addContent(textElem); // creating opinions element for sentence List<Opinion> opinionsBySentence = getOpinionsBySentence(kaf, sentNumber); if (!opinionsBySentence.isEmpty()) { Element aspectTerms = new Element("aspectTerms"); // getting opinion info from NAF Opinion layer for (Opinion opinion : opinionsBySentence) { String polarity = ""; String targetString = opinion.getStr(); int fromOffset = opinion.getOpinionTarget().getTerms().get(0).getWFs().get(0).getOffset(); List<WF> targetWFs = opinion.getOpinionTarget().getTerms() .get(opinion.getOpinionTarget().getTerms().size() - 1).getWFs(); int toOffset = targetWFs.get(targetWFs.size() - 1).getOffset() + targetWFs.get(targetWFs.size() - 1).getLength(); Element aspectTerm = new Element("aspectTerm"); aspectTerm.setAttribute("term", targetString); aspectTerm.setAttribute("polarity", polarity); aspectTerm.setAttribute("from", Integer.toString(fromOffset)); aspectTerm.setAttribute("to", Integer.toString(toOffset)); aspectTerms.addContent(aspectTerm); } sentenceElem.addContent(aspectTerms); } sentencesElem.addContent(sentenceElem); } XMLOutputter xmlOutput = new XMLOutputter(); Format format = Format.getPrettyFormat(); xmlOutput.setFormat(format); return xmlOutput.outputString(doc); }