List of usage examples for org.jdom2 Element getText
public String getText()
From source file:Enrichissement.Jaccard.java
private static boolean comparerObjet(Element triplet1, Element triplet2) { Element uri1 = triplet1.getChild("uri"); Element uri2 = triplet2.getChild("uri"); Element literal1 = triplet1.getChild("literal"); Element literal2 = triplet1.getChild("literal"); if ((uri1 == null && uri2 != null) || (uri2 == null && uri1 != null)) { return false; } else if (uri1 != null && uri2 != null) { String uri1text = uri1.getText(); String uri2text = uri2.getText(); if (!(uri1text.equals(uri2text))) { return false; }//from w w w . ja v a 2 s . c o m } else if (literal1 != null && literal2 != null) { String lang1 = literal1.getAttributeValue("lang", Namespace.XML_NAMESPACE); String lang2 = literal2.getAttributeValue("lang", Namespace.XML_NAMESPACE); String datatype1 = literal1.getAttributeValue("datatype"); String datatype2 = literal2.getAttributeValue("datatype"); String literal1Text = literal1.getText(); String literal2Text = literal2.getText(); if (lang1 != null && lang2 != null) { if (!lang1.equals(lang2)) { return false; } else if (!literal1Text.equals(literal2Text)) { return false; } } else if (datatype1 != null && datatype2 != null) { if (!datatype1.equals(datatype2)) { return false; } else if (!literal1Text.equals(literal2Text)) { return false; } } } return true; }
From source file:es.ucm.fdi.ac.Annotation.java
License:Open Source License
public void loadFromXML(Element element) throws IOException { String labelsAttribute = element.getAttributeValue("labels"); labels.clear();//from w w w. j a v a 2s . c o m for (String labelString : labelsAttribute.split(" ")) { labels.add(Label.valueOf(labelString.toLowerCase())); } author = element.getAttributeValue("author"); target = element.getAttributeValue("target"); localFile = element.getAttributeValue("localFile"); if (localFile != null) { while (localFile.endsWith("/")) { localFile = localFile.substring(0, localFile.length() - 1); } } targetFile = element.getAttributeValue("targetFile"); if (targetFile != null) { while (targetFile.endsWith("/")) { targetFile = targetFile.substring(0, targetFile.length() - 1); } } if (element.getAttributeValue("date") != null) { try { date = dateFormat.parse(element.getAttributeValue("date")); } catch (ParseException ex) { throw new IOException("Error parsing annotation date", ex); } } if (element.getText() != null) { commentary = element.getText().trim(); } }
From source file:es.ucm.fdi.ac.extract.PatternFilter.java
License:Open Source License
/**
 * Loads this filter from its XML element: the element text, trimmed of surrounding
 * whitespace, is handed to {@code setPattern}.
 *
 * @param element the XML element whose text holds the pattern
 * @throws IOException declared by the signature; nothing in this body throws it directly
 */
public void loadFromXML(Element element) throws IOException {
    String rawPattern = element.getText();
    String pattern = rawPattern.trim();
    setPattern(pattern);
}
From source file:es.upm.dit.xsdinferencer.extraction.extractorImpl.TypesExtractorImpl.java
License:Apache License
/** * Recursive method that traverses an element to extract all the possible information from it. * It is recursive because it calls itself for each child of the element (obviously, infinite recursion * is not possible as there are not, or there should not be, parent-child loops). * The index of the current document is necessary in order to add well some information to * the statistics./*w w w. j ava 2 s . co m*/ * @param documentIndex index of current document * @param element the element to traverse (as a JDOM2 {@link Element}) * @param enclosingComplexType the complex type which will contain the current element */ private void traverseElement(int documentIndex, Element element, String enclosingComplexType) { //Elements in the XSI namespace should be ignored if (element.getNamespaceURI().equalsIgnoreCase(XSI_NAMESPACE_URI)) return; List<String> realPathUnfiltered = getRealPathOfElementUnfiltered(element, configuration, false, solvedNamespaceToPrefixMapping); String realPathFiltered = filterAndJoinRealPath(realPathUnfiltered);//Path for the statistics List<String> typePathUnfiltered = getRealPathOfElementUnfiltered(element, configuration, false, solvedNamespaceToPrefixMapping); List<String> suitablePath = getSuitablePath(typePathUnfiltered);//Path for type name inferencing //First, we will register the information of width and depth //The root is in a level whose width is 1, if we did not do the following, that width would be never registered if (element.isRootElement()) { statistics.registerWidth(documentIndex, 1); } statistics.registerDepth(documentIndex, realPathUnfiltered.size()); int width = element.getChildren().size(); if (width > 0) { statistics.registerWidth(documentIndex, width); } TypeNameInferencer typeNameInferencer = configuration.getTypeNameInferencer(); String complexTypeName = typeNameInferencer.inferTypeName(suitablePath, configuration);//Complex type of this element // //Little workaround that ensures that the same complex type is used // //when 
the elements on its path are the same (same name and namespace) but some of them // //use different namespace prefixes // List<String> realPathUnfilteredKey=getRealPathOfElementUnfiltered(element, configuration, false, solvedNamespaceToPrefixMapping); // List<String> suitablePathKey=getSuitablePath(realPathUnfilteredKey);//Path for type name inferencing // String complexTypeNameKey = typeNameInferencer.inferTypeName(suitablePathKey, configuration);//Complex type of this element String complexTypeNameKey = complexTypeName; //The complex type object of this element. ComplexType complexType = complexTypes.get(complexTypeNameKey); if (complexType == null) { complexType = new ComplexType(complexTypeName, null, null, null); complexTypes.put(complexTypeNameKey, complexType); //New complex type } complexType.addSourceNodeNamespaceAndName(element.getNamespaceURI(), element.getName()); //Comment processing for (Comment comment : element.getDescendants(Filters.comment())) { if (comment.getParentElement().equals(element)) complexType.getComments().add(comment.getText()); } //Key to find the corresponding SchemaElement //This key is: if the SchemaElement has an enclosing complex type (i.e., it is not a valid root), its name will be: //enclosingComplexType+typeNamesSeparator+elementName //If the element is a suitable root, the key is the name of the element. String schemaElementKey = (!enclosingComplexType.equals("")) ? enclosingComplexType + configuration.getTypeNamesAncestorsSeparator() + element.getName() : element.getName(); if (configuration.getTypeNameInferencer() instanceof NameTypeNameInferencer) { schemaElementKey = element.getName(); //If we use a name-based type inferencer, the key is the name and we avoid problems. } SchemaElement schemaElement = elements.get(element.getNamespaceURI(), schemaElementKey); if (schemaElement == null) { schemaElement = new SchemaElement(element.getName(), element.getNamespaceURI(), complexType);//Complex type already not known. 
elements.put(element.getNamespaceURI(), schemaElementKey, schemaElement); } boolean wasAlreadyValidRoot = schemaElement.isValidRoot(); schemaElement.setValidRoot(wasAlreadyValidRoot || element.isRootElement()); ComplexTypeStatisticsEntry complexTypeStatisticsEntry = statistics.getComplexTypeInfo().get(complexType); if (complexTypeStatisticsEntry == null) { complexTypeStatisticsEntry = new ComplexTypeStatisticsEntry(xmlDocuments.size()); statistics.getComplexTypeInfo().put(complexType, complexTypeStatisticsEntry); } AttributeListInferencer attributeListInferencer = attributeListInferencers.get(complexTypeName); if (attributeListInferencer == null) { attributeListInferencer = inferencersFactory.getAttributeListInferencerInstance(complexTypeName, configuration, solvedNamespaceToPrefixMapping, statistics); attributeListInferencers.put(complexTypeName, attributeListInferencer); } attributeListInferencer.learnAttributeList(element.getAttributes(), documentIndex); SimpleTypeInferencer simpleTypeInferencer = simpleTypeInferencersOfComplexTypes.get(complexTypeName); if (simpleTypeInferencer == null) { simpleTypeInferencer = inferencersFactory.getSimpleTypeInferencerInstance(complexTypeName, configuration); simpleTypeInferencersOfComplexTypes.put(complexTypeName, simpleTypeInferencer); } simpleTypeInferencer.learnValue(element.getText(), element.getNamespaceURI(), element.getName()); // SchemaElement previousChildSchemaElement=null; //We need to store the previous child in order to add the edge between it and the current child. 
List<SchemaElement> schemaElementChildren = new ArrayList<>(element.getChildren().size()); for (int i = 0; i < element.getChildren().size(); i++) { Element child = element.getChildren().get(i); traverseElement(documentIndex, child, complexTypeName); String childSchemaElementKey = complexTypeName + configuration.getTypeNamesAncestorsSeparator() + child.getName(); if (configuration.getTypeNameInferencer() instanceof NameTypeNameInferencer) { childSchemaElementKey = child.getName(); // If we use the name-based type name inferencer, the name is the key } SchemaElement childSchemaElement = elements.get(child.getNamespaceURI(), childSchemaElementKey);//The SchemaElement object does exist because the method traverseElement is called before this. // if(i==0){ // automaton.addEdge(automaton.getInitialState(), childSchemaElement); // } // else { // automaton.addEdge(previousChildSchemaElement, childSchemaElement); // if(i==(element.getChildren().size()-1)){ // automaton.addEdge(childSchemaElement, automaton.getFinalState()); // } // } complexTypeStatisticsEntry.registerElementCount(childSchemaElement, documentIndex); schemaElementChildren.add(childSchemaElement); // previousChildSchemaElement=childSchemaElement; } ExtendedAutomaton automaton = automatons.get(complexTypeName); if (automaton == null) { automaton = new ExtendedAutomaton(); SchemaElement initialState = new SchemaElement("initial", DEFAULT_PSEUDOELEMENTS_NAMESPACE, null); automaton.setInitialState(initialState); SchemaElement finalState = new SchemaElement("final", DEFAULT_PSEUDOELEMENTS_NAMESPACE, null); automaton.setFinalState(finalState); automatons.put(complexTypeName, automaton); } List<SchemaElement> schemaElementChildrenWithInitialAndFinal = new ArrayList<>(schemaElementChildren); schemaElementChildrenWithInitialAndFinal.add(0, automaton.getInitialState()); schemaElementChildrenWithInitialAndFinal.add(automaton.getFinalState()); automaton.learn(schemaElementChildrenWithInitialAndFinal); 
complexTypeStatisticsEntry.registerSubpatternsFromList(schemaElementChildren); complexTypeStatisticsEntry.registerValueOfNodeCount(element.getText(), schemaElement, documentIndex); statistics.registerElementAtPathCount(realPathFiltered, documentIndex); statistics.registerValueAtPathCount(realPathFiltered, element.getText(), documentIndex); if (enclosingComplexType.equals("")) { statistics.registerRootElementOccurrence(schemaElement); } }
From source file:eu.himeros.cophi.ocr.proofreader.controller.pojo.OcrPageParser.java
License:Open Source License
/**
 * Maps an OCR alternative element onto an {@link Insertion}: the element text becomes the
 * insertion text and the {@code title} attribute becomes its NLP value.
 *
 * @param ocrAlternativeEl the OCR alternative element
 * @return the newly created Insertion
 */
private Insertion parseAlternativeInsertion(Element ocrAlternativeEl) {
    Insertion insertion = new Insertion();
    String text = ocrAlternativeEl.getText();
    insertion.setText(text);
    String nlp = ocrAlternativeEl.getAttributeValue("title");
    insertion.setNlp(nlp);
    return insertion;
}
From source file:eu.himeros.cophi.ocr.proofreader.controller.pojo.OcrPageParser.java
License:Open Source License
/** * Parses an ocr alternative element adn maps it on a Deletion. * @param ocrAlternativeEl the ocr alternative element. * @return the Deletion./* ww w . j a v a 2 s .c o m*/ */ private Deletion parseAlternativeDeletion(Element ocrAlternativeEl) { Deletion alternativeDeletion = new Deletion(); alternativeDeletion.setText(ocrAlternativeEl.getText()); alternativeDeletion.setNlp(ocrAlternativeEl.getAttributeValue("title")); return alternativeDeletion; }
From source file:eu.himeros.hocr.GrcContextFilterMananger.java
License:Open Source License
@Override public void adjustPreviousSuitableElement() { Element prevEl = queue.poll(); Element currEl = queue.peek(); Element nextEl = queue.get(1); try {//from w ww . j a va 2s . c o m Element prevInfo = prevEl.getChild("span", prevEl.getNamespace()); Element currInfo = currEl.getChild("span", currEl.getNamespace()); Element nextInfo = nextEl.getChild("span", nextEl.getNamespace()); if (currInfo != null && "UCWORD".equals(currInfo.getAttributeValue("class"))) { String suggestions = ""; try { suggestions = filterSuggestions(currInfo.getText(), prevInfo.getText(), nextInfo.getText(), currInfo.getAttributeValue("title")); } catch (NullPointerException npex) { // } if (suggestions.trim().contains(" ")) { currInfo.setAttribute("title", suggestions); } else if (suggestions.length() > 0) { currInfo.setAttribute("class", "CORRWORD"); currInfo.setAttribute("title", currInfo.getText()); currInfo.setText(suggestions); } } } catch (Exception ex) { ex.printStackTrace(System.err); } }
From source file:eu.himeros.hocr.HocrInfoAggregator.java
License:Open Source License
private void parseOcrWord(Element ocrWord) { String text = ocrWord.getText(); text = adjuster.adjust(new String[] { "monotonic2polytonic", "ocr2u" }, normalizer2.normalize(text)); String upText = low2upL1Trans.parse(text); if (text.endsWith("-")) { ocrWord.setAttribute("idx", "" + id++); hyphenPart1 = ocrWord;//from w w w .ja v a 2 s . co m return; } else if (hyphenPart1 != null) { text = adjuster.adjust(new String[] { "monotonic2polytonic", "ocr2u" }, normalizer2.normalize(parseOcrHyphenatedWord(hyphenPart1, ocrWord))); upText = low2upL1Trans.parse(text); } Element infoSpan = new Element("span", xmlns); infoSpan.setText(adjuster.adjust(new String[] { "monotonic2polytonic", "ocr2u" }, normalizer2.normalize(ocrWord.getText()))); upText = upText.replaceAll(l1NonAlphabeticFilter, ""); infoSpan.setAttribute("id", "" + id++); Integer occ; occ = ((occ = occHm.get(upText)) == null ? 1 : ++occ); occHm.put(upText, occ); infoSpan.setAttribute("uc", upText); try { ocrWord.getContent(0).detach(); } catch (Exception ex) { } Token token = new Token(text); token = setClassiFicationAndScore(token); infoSpan = setInfoSpanClass(token, infoSpan); ocrWord.addContent(infoSpan); l1Fm.addSuitableElement(ocrWord); l1Fm.adjustPreviousSuitableElement(); if (hyphenPart1 != null) { text = hyphenPart1.getText(); hyphenPart1.getContent(0).detach(); Element infoSpan1 = new Element("span", xmlns); infoSpan1.setAttribute("class", infoSpan.getAttributeValue("class")); infoSpan1.setText(text); hyphenPart1.addContent(infoSpan1); hyphenPart1 = null; //TODO: ??? } }
From source file:eu.himeros.hocr.HocrInfoAggregator.java
License:Open Source License
/**
 * Joins the two halves of a hyphenated word: the text of {@code part1} without its last
 * character, followed by the text of {@code part2}.
 *
 * @param part1 element holding the first half (its last character is dropped)
 * @param part2 element holding the second half
 * @return the joined text, or an empty string if anything goes wrong (for instance a
 *         {@code null} element or an empty first half)
 */
private String parseOcrHyphenatedWord(Element part1, Element part2) {
    try {
        String firstHalf = part1.getText();
        String withoutLastChar = firstHalf.substring(0, firstHalf.length() - 1);
        return withoutLastChar + part2.getText();
    } catch (Exception ex) {
        return "";
    }
}
From source file:eu.himeros.hocr.HocrInfoAggregator.java
License:Open Source License
private void makeCompliantHocr() { xpath = XPathFactory.instance().compile("//ns:span[@id|@idx]", Filters.element(), null, Namespace.getNamespace("ns", "http://www.w3.org/1999/xhtml")); List<Element> elements = xpath.evaluate(root); int spanId = 0; for (Element span : elements) { if (span.getAttribute("idx") != null) { try { span = span.getChildren().get(0); } catch (Exception ex) { // }// w w w.j a va2 s. co m } LinkedList<Attribute> attributeLl = new LinkedList(span.getParentElement().getAttributes()); attributeLl.addFirst(new Attribute("id", "w_" + spanId++)); span.getParentElement().setAttributes(attributeLl); String[] suggestions = null; String title = span.getAttributeValue("title"); if (title != null) { suggestions = title.split(" "); } if (suggestions == null) { suggestions = new String[] { "" }; } Element ins = new Element("ins", xmlns); ins.setAttribute("class", "alt"); ins.setAttribute("title", makeNlp(span.getAttributeValue("class"))); ins.setText(span.getText()); span.removeContent(); span.addContent(ins); span.setAttribute("class", "alternatives"); span.removeAttribute("uc"); span.removeAttribute("occ"); span.removeAttribute("title"); span.removeAttribute("anchor"); span.removeAttribute("anchor-id"); span.removeAttribute("id"); span.getParentElement().removeAttribute("idx"); span.removeAttribute("whole"); span.getParentElement().removeAttribute("whole"); if (title == null || "".equals(title)) { continue; } double score = 0.90; for (String suggestion : suggestions) { if (suggestion == null || "".equals(suggestion)) { continue; } Element del = new Element("del", xmlns); del.setAttribute("title", "nlp " + String.format("%.2f", score).replaceAll(",", ".")); score = score - 0.01; suggestion = suggestion.replaceAll(l1PunctMarkFilter, ""); Matcher leftMatcher = l1LeftPunctMarkPattern.matcher(ins.getText()); if (leftMatcher.matches()) { suggestion = leftMatcher.group(1) + suggestion; } Matcher rightMatcher = l1RightPunctMarkPattern.matcher(ins.getText()); if 
(rightMatcher.matches()) { String ngtSymbol = ""; if (suggestion.endsWith("\u261a")) { ngtSymbol = "\u261a"; suggestion = suggestion.substring(0, suggestion.length() - 1); } suggestion = suggestion + rightMatcher.group(1) + ngtSymbol; } ///!!!! if (suggestion.endsWith("\u261a") && ins.getParentElement().getParentElement() .getAttributeValue("lang", Namespace.XML_NAMESPACE) != null) { String buff = suggestion.substring(0, suggestion.length() - 1); sa.align(buff, ins.getText()); double sim = 1 - sa.getEditDistance() / Math.max((double) buff.length(), (double) ins.getText().length()); if (sim > 0.6) { suggestion = ins.getText() + "\u261b"; ins.setText(buff); ins.setAttribute("title", "nlp 0.70"); } } del.addContent(suggestion); span.addContent(del); } } }