Example usage for org.jdom2 Namespace getNamespace

Introduction

On this page you can find example usages of org.jdom2 Namespace getNamespace.

Prototype

public static Namespace getNamespace(final String prefix, final String uri)

Source Link

Document

This will retrieve (if in existence) or create (if not) a Namespace for the supplied prefix and uri.

Usage

From source file:de.uniba.dsg.ppn.ba.validation.XmlLocator.java

License:Open Source License

/**
 * Searches the line of the element addressed by the given xpath expression
 * in the given file and returns either that line or -1. -1 means that no
 * bpmn element could be determined with the xpath expression, or that the
 * file could not be read or parsed.
 * <p>
 * If the last bracketed part of the expression is a plain number (for
 * example {@code //bpmn:task[2]}), everything from that opening bracket on
 * is cut off the expression and the number is used as index into the list
 * of elements matched by the remaining expression. Expressions without
 * such a number are evaluated unchanged and the first match is used.
 *
 * @param xmlFile
 *            the xml file where the error has to be found
 * @param xpathExpression
 *            the xpath expression to find the error in the file
 * @return line or -1
 */
public int findLine(File xmlFile, String xpathExpression) {
    try {
        Document doc = saxBuilder.build(xmlFile);
        int bracketPosition = xpathExpression.lastIndexOf('[');
        int closingPosition = xpathExpression.lastIndexOf(']');
        int elementPosition = 0;
        // Only look for a position number when there is a "[...]" pair in
        // the right order. Without this guard substring() throws an
        // unchecked StringIndexOutOfBoundsException for expressions that
        // contain no brackets at all.
        if (bracketPosition >= 0 && closingPosition > bracketPosition) {
            try {
                elementPosition = Integer
                        .parseInt(xpathExpression.substring(bracketPosition + 1, closingPosition));
                xpathExpression = xpathExpression.substring(0, bracketPosition);
            } catch (NumberFormatException e) {
                // ignore, because then there's no position number in the
                // xpath expression and the expression needn't be rewritten
            }
        }
        XPathExpression<Element> xpath = xPathFactory.compile(xpathExpression, Filters.element(), null,
                Namespace.getNamespace("bpmn", ConstantHelper.BPMNNAMESPACE));

        List<Element> foundElements = xpath.evaluate(doc);

        // A negative position (e.g. "[-1]") can never address an element.
        if (elementPosition >= 0 && foundElements.size() > elementPosition) {
            // NOTE(review): the cast assumes saxBuilder produces located
            // elements - confirm where saxBuilder is configured.
            return ((LocatedElement) foundElements.get(elementPosition)).getLine();
        }
    } catch (IOException | JDOMException e) {
        LOGGER.debug(ConstantHelper.FILENOTFOUNDMESSAGEWITHCAUSE, xmlFile.getName(), e);
    }
    return -1;
}

From source file:ec.edu.cedia.redi.ldclient.provider.ScopusAuthorProvider.java

License:Apache License

/**
 * Parses the XML result of a publication search and assigns each
 * publication resource found to its author. See
 * <a href="http://api.elsevier.com/documentation/SCOPUSSearchAPI.wadl">Scopus
 * Search API</a>.
 *
 * @param input stream holding the XML search result
 * @param requestUrl URL of the request; the author id is taken from the
 *            text between {@code au-id(} and {@code )&}
 * @param triples model that receives one {@code FOAF.PUBLICATIONS}
 *            statement per publication found
 * @return list of publication resources, each with the API key and accept
 *         parameters appended
 * @throws DataRetrievalException if the XML cannot be read or parsed
 */
private List<String> parseSearchPub(InputStream input, String requestUrl, final Model triples)
        throws DataRetrievalException {
    try {
        final ValueFactory factory = ValueFactoryImpl.getInstance();

        // The author id sits between "au-id(" and the first ")&" of the request URL.
        final int idStart = requestUrl.indexOf("au-id(") + 6;
        final int idEnd = requestUrl.indexOf(")&");
        final URI author = factory.createURI("http://api.elsevier.com/content/author/author_id/",
                requestUrl.substring(idStart, idEnd));

        final Document doc = new SAXBuilder(XMLReaders.NONVALIDATING).build(input);
        final Namespace atom = Namespace.getNamespace("atom", "http://www.w3.org/2005/Atom");
        final XPathExpression<Attribute> selfLinks = XPathFactory.instance().compile(
                "/atom:search-results/atom:entry/atom:link[@ref='self']/@href", Filters.attribute(), null,
                atom);

        final List<String> publications = new ArrayList<>();
        for (Attribute selfLink : selfLinks.evaluate(doc)) {
            final String pubResource = selfLink.getValue();
            triples.add(author, FOAF.PUBLICATIONS, factory.createURI(pubResource));
            publications.add(pubResource + "?apiKey=" + apiKey + "&httpAccept=application/rdf%2Bxml");
        }
        return publications;
    } catch (JDOMException | IOException ex) {
        throw new DataRetrievalException(ex);
    }
}

From source file:ec.edu.cedia.redi.ldclient.provider.ScopusAuthorProvider.java

License:Apache License

/**
 * Maps each author from XML to RDF using the default implementation of
 * {@link AbstractXMLDataProvider#parseResponse}.
 *
 * @see <a href="http://api.elsevier.com/documentation/AUTHORSearchAPI.wadl">Authors
 *      search API.</a>
 *
 * @param input stream holding the XML search result; it is read completely
 * @param resource resource every author found is linked to via
 *            {@code OWL.ONEOF}
 * @param requestUrl URL of the request, passed on to {@code parseResponse}
 * @param triples model that receives the generated statements
 * @param contentType content type, passed on to {@code parseResponse}
 * @return list of resources of authors found, each with the API key, accept
 *         and view parameters appended
 * @throws DataRetrievalException if the XML cannot be read or parsed, or if
 *             the default {@code parseResponse} fails for an author
 */
private List<String> parseResponseAuthorsSearch(InputStream input, String resource, String requestUrl,
        Model triples, String contentType) throws DataRetrievalException {
    try {
        // List of authors to extract profile information such as publications, affiliations, etc.
        List<String> authorResources = new ArrayList<>();
        ValueFactory vf = ValueFactoryImpl.getInstance();
        // Keep the stream content in memory: it is parsed once here and once more per author below.
        byte[] response = IOUtils.toByteArray(input);
        final Document doc = new SAXBuilder(XMLReaders.NONVALIDATING).build(new ByteArrayInputStream(response));
        // get only URI of authors
        XPathExpression<Text> path = XPathFactory.instance().compile(
                "/atom:search-results/atom:entry/prism:url/text()", Filters.textOnly(), null,
                Namespace.getNamespace("atom", "http://www.w3.org/2005/Atom"),
                Namespace.getNamespace("prism", "http://prismstandard.org/namespaces/basic/2.0/"));
        // Map each author XML to RDF using the default implementation of the parseResponse method
        // from AbstractXMLDataProvider.
        List<Text> authorUrlNodes = path.evaluate(doc);
        for (int i = 0; i < authorUrlNodes.size(); i++) {
            // The mappings are set up for the i-th entry before the shared response is parsed again.
            setAuthorXPathMappings(i);
            String authorResource = authorUrlNodes.get(i).getValue();
            super.parseResponse(authorResource, requestUrl, triples, new ByteArrayInputStream(response),
                    contentType);
            authorResources.add(
                    authorResource + "?apiKey=" + apiKey + "&httpAccept=application/rdf%2Bxml&view=ENHANCED");
            triples.add(vf.createURI(authorResource), OWL.ONEOF, vf.createURI(resource));
        }
        return authorResources;
    } catch (JDOMException | IOException ex) {
        // A DataRetrievalException raised by parseResponse is deliberately not caught here:
        // it already has the declared type, and wrapping it again would only bury its message.
        throw new DataRetrievalException(ex);
    }
}

From source file:es.upm.dit.xsdinferencer.generation.generatorimpl.schemageneration.XMLSchemaDocumentGenerator.java

License:Apache License

/**
 * Generates the XSD document for the targetNamespace given at the constructor, taking into
 * account that the main namespace is the one given at the constructor.
 * <p>
 * The document is built in this order: namespace declarations and {@code import} elements, the
 * {@code targetNamespace} and {@code elementFormDefault} attributes, global simple and complex
 * types (only when the target namespace is the main namespace), global attributes and finally
 * global elements or groups.
 *
 * @param schema the schema object
 * @param configuration the inference configuration
 *
 * @return a JDOM2 {@link Document} object containing the XSD contents.
 *
 * @throws IllegalArgumentException if the target namespace is one of the skipped namespaces of
 *             the configuration
 *
 * @see SchemaDocumentGenerator#generateSchemaDocument(Schema, XSDInferenceConfiguration)
 */
@Override
public Document generateSchemaDocument(Schema schema, XSDInferenceConfiguration configuration) {
    //      if(!configuration.getElementsGlobal()==false || 
    //            !configuration.getComplexTypesGlobal()==true ||
    //            !configuration.getSimpleTypesGlobal()==true
    //            )
    //         throw new UnsupportedOperationException("Not implemented yet.");
    //
    // Both the main and the target namespace must be known to the schema.
    checkArgument(schema.getNamespacesToPossiblePrefixMappingModifiable().containsKey(mainNamespace),
            "The main namespace must be a known namespace");
    checkArgument(schema.getNamespacesToPossiblePrefixMappingModifiable().containsKey(targetNamespace),
            "The target namespace must be a known namespace");
    //      checkArgument(!schema.getNamespacesToPossiblePrefixMappingModifiable().containsKey(XSD_NAMESPACE_URI),"The XSD namespace must not be a known namespace");
    //      checkArgument(!schema.getNamespacesToPossiblePrefixMappingModifiable().containsKey(XSI_NAMESPACE_URI),"The XSI namespace must not be a known namespace");
    Map<String, String> namespaceURIToPrefixMappings = schema.getSolvedNamespaceMappings();
    if (configuration.getSkipNamespaces().contains(targetNamespace)) {
        throw new IllegalArgumentException("This is an skipped namespace, so its XSD should not be generated");
    }
    // Inferring a schema for the XSD namespace itself only produces a warning on stderr.
    if (targetNamespace.equals(XSD_NAMESPACE_URI))
        System.err.println(
                "The XML Schema namespace is being considered as a target namespace in your documents. Independing of the inferred schemas, the only valid XSD for an XSD would be the normative one present at its first RFC");
    // The prefix constant may contain a ':' which is stripped to obtain a bare prefix.
    Namespace xsdNamespace = Namespace.getNamespace(XSD_NAMESPACE_PREFIX.replace(":", ""), XSD_NAMESPACE_URI);
    List<Namespace> namespaceDeclarations = getNamespaceDeclarations(namespaceURIToPrefixMappings,
            xsdNamespace);
    Element elementSchema = new Element("schema", xsdNamespace);
    // Every namespace is declared on the schema element; an import element is only added for
    // the namespaces which pass all of the filters below.
    for (int i = 0; i < namespaceDeclarations.size(); i++) {
        Namespace currentNamespace = namespaceDeclarations.get(i);
        elementSchema.addNamespaceDeclaration(currentNamespace);
        String currentNamespaceUri = currentNamespace.getURI();
        // A schema which is not the main one only imports the main namespace.
        if (!targetNamespace.equals(mainNamespace) && !currentNamespaceUri.equals(mainNamespace))
            continue;
        // The XML namespace is only imported if some attribute or element belongs to it.
        if (currentNamespace.equals(Namespace.XML_NAMESPACE)
                && (!schema.getAttributes().containsRow(XSDInferenceConfiguration.XML_NAMESPACE_URI)
                        && !schema.getElements().containsRow(XSDInferenceConfiguration.XML_NAMESPACE_URI))) {
            continue;
        }
        // The XSD namespace is only imported if it is one of the solved namespace mappings.
        if (currentNamespaceUri.equals(XSD_NAMESPACE_URI)
                && !namespaceURIToPrefixMappings.containsKey(XSD_NAMESPACE_URI))
            continue;
        // No import of the target namespace itself, nor of the empty namespace when there is
        // no file name generator.
        if (targetNamespace.equals(currentNamespaceUri)
                || (currentNamespaceUri.equals("") && (fileNameGenerator == null)))
            continue;
        // The empty namespace is skipped when it is not the main one and holds no elements.
        if (currentNamespaceUri.equals("") && !currentNamespaceUri.equals(mainNamespace)
                && !schema.getElements().containsRow(""))
            continue;
        Element importElement = new Element("import", xsdNamespace);
        // An import of the empty namespace carries no "namespace" attribute.
        if (!currentNamespaceUri.equals("")) {
            Attribute namespaceAttr = new Attribute("namespace", currentNamespaceUri);
            importElement.setAttribute(namespaceAttr);
        }
        // schemaLocation is only written if file names can be generated and the namespace is not skipped.
        if (fileNameGenerator != null && !configuration.getSkipNamespaces().contains(currentNamespaceUri)) {
            String fileName = fileNameGenerator.getSchemaDocumentFileName(currentNamespaceUri,
                    namespaceURIToPrefixMappings);
            Attribute schemaLocationAttr = new Attribute("schemaLocation", fileName);
            importElement.setAttribute(schemaLocationAttr);
        }
        elementSchema.addContent(importElement);
    }

    // A schema for the empty namespace has no targetNamespace attribute.
    if (!targetNamespace.equals("")) {
        Attribute targetNamespaceAttr = new Attribute("targetNamespace", targetNamespace);
        elementSchema.setAttribute(targetNamespaceAttr);
    }
    // Types are copied into sorted sets so that they are always visited in comparator order.
    SortedSet<SimpleType> sortedSimpleTypes = new TreeSet<>(new SimpleTypeComparator());
    sortedSimpleTypes.addAll(schema.getSimpleTypes().values());
    SortedSet<ComplexType> sortedComplexTypes = new TreeSet<>(new ComplexTypeComparator());
    sortedComplexTypes.addAll(schema.getComplexTypes().values());
    //CONTINUE FROM HERE: Generate sorted sets for SchemaElement and SchemaAttribute objects and use them where needed.
    Attribute elementFormDefault = new Attribute("elementFormDefault", "qualified");
    elementSchema.setAttribute(elementFormDefault);
    Document resultingDocument = new Document(elementSchema);
    if (targetNamespace.equals(mainNamespace)) {
        //First, we declare global SimpleTypes.
        //If simpleTypesGlobal is true, any enumeration will be declared as a global simple type.
        //if not, simple types of complex types which have attributes but not children will be declared globally
        //(due to limitations of XSD, they may not be declared locally together with the attributes info)
        if (configuration.getSimpleTypesGlobal()) {
            for (SimpleType simpleType : sortedSimpleTypes) {
                // Only non-empty enumerations are declared.
                if (!simpleType.isEnum() || simpleType.isEmpty())
                    continue;
                Element simpleTypeElement = generateSimpleType(simpleType, false, configuration, xsdNamespace);
                elementSchema.addContent(simpleTypeElement);
            }
        } else {
            for (ComplexType complexType : sortedComplexTypes) {
                SimpleType simpleType = complexType.getTextSimpleType();
                // Keep only complex types with attributes, an automaton without nodes and a
                // non-empty enumeration as text simple type.
                if (complexType.getAttributeList().isEmpty() || !(complexType.getAutomaton().nodeCount() == 0)
                        || !simpleType.isEnum() || simpleType.isEmpty())
                    continue;
                Element simpleTypeElement = generateSimpleType(simpleType, false, configuration, xsdNamespace);
                elementSchema.addContent(simpleTypeElement);
            }
        }
        //Global complexType elements are only generated in the main schema (i.e. the one whose targetNamespace is equal to mainNamespace)
        if (configuration.getComplexTypesGlobal()) {
            for (ComplexType complexType : sortedComplexTypes) {
                boolean hasNoChildren = complexType.getRegularExpression().equals(new EmptyRegularExpression());
                boolean hasNoAttributes = complexType.getAttributeList().size() == 0;
                boolean hasNoComments = complexType.getComments().size() == 0;
                //               boolean simpleTypeIsNotEmpty = !complexType.getTextSimpleType().isEmpty();
                // NOTE(review): despite its name, this flag is true when the text simple type is
                // NEITHER empty NOR whitespace-only (see the negation). Confirm which was intended.
                boolean simpleTypeIsWhiteSpaceOnlyOrEmpty = !(complexType.getTextSimpleType().isEmpty()
                        || complexType.getTextSimpleType().consistOnlyOfWhitespaceCharacters());
                if (hasNoChildren && hasNoAttributes && simpleTypeIsWhiteSpaceOnlyOrEmpty && hasNoComments)
                    continue; //Because the elements which are linked to this ComplexType at our internal model 
                              //will be linked to an XSD simple type elsewhere, either a builtin or a custom one.
                Element complexTypeElement = generateComplexType(configuration, complexType, false,
                        targetNamespace, namespaceURIToPrefixMappings, mainNamespace, xsdNamespace);
                elementSchema.addContent(complexTypeElement);
            }
        }
    }
    //If there are many namespaces and the workaround is disabled, we must declare global attributes.
    //If the targetNamespace is not the mainNamespace, we must declare all the attributes.
    //if the target namespace is the main namespace, we do not need to declare anything, because the complex types which hold the attributes
    //are also in the main namespace.
    // "Many namespaces" means: more than one namespace mapping remains after subtracting the skipped ones.
    if ((namespaceURIToPrefixMappings.size() - configuration.getSkipNamespaces().size()) > 1) {

        // Sorted copy of the attributes of the target namespace.
        SortedMap<String, SchemaAttribute> globalAttributeCandidates = new TreeMap<>(
                schema.getAttributes().row(targetNamespace));
        if (!targetNamespace.equals(mainNamespace) && !targetNamespace.equals("")) {
            globalAttributesLoop: for (Map.Entry<String, SchemaAttribute> schemaAttributeEntry : globalAttributeCandidates
                    .entrySet()) {
                SchemaAttribute schemaAttribute = schemaAttributeEntry.getValue();
                //First, we check if the attribute has been already declared when the workaround is disabled.
                //The type should have been already merged.
                // NOTE(review): an earlier comment said the "use" property is updated here, but
                // the code only skips attributes which are already declared.
                if (!configuration.getStrictValidRootDefinitionWorkaround()) {
                    List<Element> alreadyGeneratedAttributeElements = elementSchema.getChildren("attribute",
                            xsdNamespace);
                    for (int i = 0; i < alreadyGeneratedAttributeElements.size(); i++) {
                        Element currentAttributeElement = alreadyGeneratedAttributeElements.get(i);
                        if (currentAttributeElement.getAttributeValue("name")
                                .equals(schemaAttribute.getName())) {
                            continue globalAttributesLoop;
                        }
                    }
                }
                Element attributeOrAttributeGroupElement = generateAttribute(schemaAttribute, true,
                        configuration, namespaceURIToPrefixMappings, targetNamespace, mainNamespace,
                        schemaAttributeEntry.getKey(), xsdNamespace);
                elementSchema.addContent(attributeOrAttributeGroupElement);
            }
        }
    }

    //Now, we declare global elements.
    //An element will be declared globally if and only if: 
    //1-elementsGlobal is true in the configuration
    //2-The element is a valid root
    //3-The element is in a namespace other than the main namespace. Note that the element WILL be surrounded by the corresponding group if the workaround is enabled.
    //Another important remark: Iterating over a set copy implies iterating over DISTINCT SchemaElements, so if two keys pointed to equal SchemaElements, we would generate it only once-
    SortedSet<SchemaElement> schemaElementsAtTargetNamespace = new TreeSet<>(new SchemaElementComparator());
    schemaElementsAtTargetNamespace.addAll(schema.getElements().row(targetNamespace).values());
    globalSchemaElementsLoop: for (SchemaElement schemaElement : schemaElementsAtTargetNamespace) {
        //         if(!configuration.getElementsGlobal()&&
        //               !schemaElement.isValidRoot()&&
        //               (targetNamespace.equals(mainNamespace)||configuration.getStrictValidRootDefinitionWorkaround()))
        // Skip the element when none of the three conditions listed above holds.
        if (!configuration.getElementsGlobal() && !schemaElement.isValidRoot()
                && (targetNamespace.equals(mainNamespace)))
            continue;
        //         for(Element currentElement:elementSchema.getContent(Filters.element("element",xsdNamespace))){
        //            if(schemaElement.getName().equals(currentElement.getAttributeValue("name")))
        //               continue globalSchemaElementsLoop;
        //         }
        // Skip the element if a group with the name that would be generated for it already exists.
        String possibleGroupName = schemaElement.getName() + configuration.getTypeNamesAncestorsSeparator()
                + schemaElement.getType().getName();
        for (Element currentElement : elementSchema.getContent(Filters.element("group", xsdNamespace))) {
            if (possibleGroupName.equals(currentElement.getAttributeValue("name")))
                continue globalSchemaElementsLoop;
        }
        Element elementOrGroupElement = generateElement(schemaElement, true, configuration, targetNamespace,
                mainNamespace, null, namespaceURIToPrefixMappings, xsdNamespace);
        // If an "element" was generated, skip it when an element with the same name is already declared.
        if (elementOrGroupElement.getName().equals("element")) {
            for (Element currentElement : elementSchema.getChildren("element", xsdNamespace)) {
                if (schemaElement.getName().equals(currentElement.getAttributeValue("name")))
                    continue globalSchemaElementsLoop;
            }
        }
        elementSchema.addContent(elementOrGroupElement);
    }
    return resultingDocument;
}

From source file:esiptestbed.mudrod.ontology.pre.AggregateTriples.java

License:Apache License

/**
 * Extracts triples (SubClassOf, equivalentClass) from the {@code owl:Class}
 * children of the root node and writes them through {@code bw}, one
 * comma separated triple per line.
 *
 * @throws IOException if writing a triple fails
 */
public void getAllClass() throws IOException {
    final Namespace owlNs = Namespace.getNamespace("owl", owl_namespace);
    List<?> classElements = rootNode.getChildren("Class", owlNs);

    for (Object classObject : classElements) {
        Element classElement = (Element) classObject;
        final Namespace rdfNs = Namespace.getNamespace("rdf", rdf_namespace);

        // The class is identified by rdf:about or, failing that, by rdf:ID.
        String className = classElement.getAttributeValue("about", rdfNs);
        if (className == null) {
            className = classElement.getAttributeValue("ID", rdfNs);
        }

        List<?> subclassElements = classElement.getChildren("subClassOf",
                Namespace.getNamespace("rdfs", rdfs_namespace));
        for (Object subclassObject : subclassElements) {
            Element subclassElement = (Element) subclassObject;
            String subclassName = subclassElement.getAttributeValue("resource", rdfNs);
            if (subclassName == null) {
                // No direct resource: fall back to the resource of an allValuesFrom child, if any.
                Element allValuesFromEle = findChild("allValuesFrom", subclassElement);
                if (allValuesFromEle == null) {
                    continue;
                }
                subclassName = allValuesFromEle.getAttributeValue("resource", rdfNs);
            }
            bw.write(cutString(className) + ",SubClassOf," + cutString(subclassName) + "\n");
        }

        List<?> equalClassElements = classElement.getChildren("equivalentClass", owlNs);
        for (Object equalClassObject : equalClassElements) {
            Element equalClassElement = (Element) equalClassObject;
            String equalClassElementName = equalClassElement.getAttributeValue("resource", rdfNs);
            if (equalClassElementName == null) {
                continue;
            }
            bw.write(cutString(className) + ",equivalentClass," + cutString(equalClassElementName) + "\n");
        }
    }
}

From source file:eu.himeros.hocr.FlatXml.java

License:Open Source License

/**
 * Reads the document in {@code inFile}, flattens its {@code ocr_carea}
 * blocks into a single {@code ocr_page} div and writes the result, wrapped
 * in a new XHTML {@code html} root, to {@code outFile}.
 * <p>
 * For every element with {@code class='ocr_carea'} the opening and closing
 * tags of the area and of its paragraphs are kept only as comments; the
 * line elements below the paragraphs are cloned into the page div after
 * their {@code id} (and their children's {@code id} and {@code lang})
 * attributes have been removed.
 * <p>
 * A missing {@code head} or {@code body} element is tolerated: the
 * corresponding step is skipped.
 *
 * @param inFile the document to read
 * @param outFile the file the flattened document is written to
 * @throws Exception if the input cannot be parsed or the output cannot be
 *             written
 */
private void init(File inFile, File outFile) throws Exception {
    SAXBuilder builder = new SAXBuilder();
    Document doc = builder.build(inFile);
    Element root = doc.getRootElement();
    Namespace oldns = root.getNamespace();
    Element newRoot = new Element("html", "http://www.w3.org/1999/xhtml");
    Namespace xmlns = newRoot.getNamespace();

    // Move head and its children to the XHTML namespace and add a title.
    // The null check has to come before the first use of head.
    Element head = root.getChild("head", oldns);
    if (head != null) {
        head.setNamespace(xmlns);
        for (Element child : head.getChildren()) {
            child.setNamespace(xmlns);
        }
        Element title = new Element("title", xmlns);
        title.addContent("ocr");
        head.addContent(title);
    }

    Element body = root.getChild("body", oldns);
    if (body != null) {
        body.setNamespace(xmlns);
        Element page = new Element("div", xmlns);
        page.setAttribute("class", "ocr_page");
        page.setAttribute("id", "i" + inFile.getName().substring(1).replace(".html", ".png"));
        XPathExpression<Element> xpath = XPathFactory.instance().compile("//*[@class='ocr_carea']",
                Filters.element(), null, Namespace.getNamespace("ns", "http://www.w3.org/1999/xhtml"));
        List<Element> careaElL = xpath.evaluate(body);
        for (Element careaEl : careaElL) {
            page.addContent(new Comment("<div class=\"" + careaEl.getAttributeValue("class") + "\" title=\""
                    + careaEl.getAttributeValue("title") + "\">"));
            for (Element pEl : careaEl.getChildren()) {
                page.addContent(new Comment("<p>"));
                for (Element lineEl : pEl.getChildren()) {
                    lineEl.removeAttribute("id");
                    lineEl.setNamespace(xmlns);
                    for (Element child : lineEl.getChildren()) {
                        child.removeAttribute("id");
                        child.removeAttribute("lang");
                        child.removeAttribute("lang", xmlns);
                        child.setNamespace(xmlns);
                    }
                    // The line still belongs to its paragraph, so a copy is added to the page.
                    page.addContent(lineEl.clone());
                }
                page.addContent(new Comment("</p>"));
            }
            page.addContent(new Comment("</div>"));
        }
        // Replace the old body content by the flattened page.
        body.removeContent();
        body.addContent(page);
    }

    newRoot.addContent(root.removeContent());
    doc.detachRootElement();
    doc.setRootElement(newRoot);
    XMLOutputter xmlOutputter = new XMLOutputter(Format.getPrettyFormat());
    // Close the writer in every case so that the file handle is released.
    try (BufferedWriter writer = new BufferedWriter(new FileWriter(outFile))) {
        xmlOutputter.output(doc, writer);
    }
}

From source file:eu.himeros.hocr.HocrInfoAggregator.java

License:Open Source License

/**
 * Writes the occurrence count from {@code occHm} into the {@code occ}
 * attribute of every span with a non-empty {@code uc} attribute.
 * <p>
 * Spans whose count is 1 additionally get the {@code anchor} and
 * {@code anchor-id} attributes from the matching {@code nearGtHm} entry;
 * for spans of class {@code CORRWORD} or {@code UCWORD} the entry's
 * {@code text} attribute is also put in front of the title. Any exception
 * raised while doing so is ignored and the next span is processed.
 */
private void updateElements() {
    xpath = XPathFactory.instance().compile("//ns:span[@uc!='']", Filters.element(), null,
            Namespace.getNamespace("ns", "http://www.w3.org/1999/xhtml"));
    List<Element> elements = xpath.evaluate(root);
    for (Element element : elements) {
        String uc = element.getAttributeValue("uc");
        element.setAttribute("occ", String.valueOf(occHm.get(uc)));
        try {
            if (occHm.get(uc) != 1) {
                continue;
            }
            element.setAttribute("anchor", nearGtHm.get(uc).getAttributeValue("uc"));
            element.setAttribute("anchor-id", nearGtHm.get(uc).getAttributeValue("id"));
            String cssClass = element.getAttributeValue("class");
            if ("CORRWORD".equals(cssClass) || "UCWORD".equals(cssClass)) {
                String oldTitle = element.getAttributeValue("title");
                String newTitle = nearGtHm.get(uc).getAttributeValue("text") + "\u261a " + oldTitle;
                element.setAttribute("title", newTitle);
            }
        } catch (Exception ex) {
            // Best effort: a span that cannot be anchored is left as it is.
        }
    }
}

From source file:eu.himeros.hocr.HocrInfoAggregator.java

License:Open Source License

/**
 * Walks over all spans with an {@code id} attribute and aligns the spans
 * collected between two anchored spans with the matching ground truth
 * range.
 * <p>
 * Spans without {@code anchor-id} are collected, unless their {@code uc}
 * attribute is empty. On a span with {@code anchor-id} the collected spans
 * are aligned against the result of {@code makeNearGtAl(start, end)},
 * where {@code end} is the anchor id minus one (but at least 1); then the
 * collection starts anew with {@code start = end + 2}.
 */
public void alignToGroundTruth() {
    xpath = XPathFactory.instance().compile("//ns:span[@id]", Filters.element(), null,
            Namespace.getNamespace("ns", "http://www.w3.org/1999/xhtml"));
    List<Element> elements = xpath.evaluate(root);

    ArrayList<Element> ocrAl = new ArrayList<>();
    int start = 1;
    for (Element element : elements) {
        String anchorId = element.getAttributeValue("anchor-id");
        if (anchorId == null) {
            // Not an anchor: remember the span unless its uc attribute is empty.
            if (!"".equals(element.getAttributeValue("uc"))) {
                ocrAl.add(element);
            }
            continue;
        }
        // The range ends right before the anchor, but never before position 1.
        int end = Math.max(1, Integer.parseInt(anchorId) - 1);
        ArrayList<Element> nearGtAl = makeNearGtAl(start, end);
        makeAlignment(ocrAl, nearGtAl);
        ocrAl = new ArrayList<>();
        start = end + 2;
    }
}

From source file:eu.himeros.hocr.HocrInfoAggregator.java

License:Open Source License

/**
 * Rewrites every span that has an {@code id} or {@code idx} attribute into an
 * "alternatives" span: the span text is moved into an {@code ins} child and each
 * blank-separated token of the former {@code title} attribute becomes a {@code del}
 * child carrying an "nlp" score in its title.
 * <p>
 * The working attributes ({@code uc}, {@code occ}, {@code title}, {@code anchor},
 * {@code anchor-id}, {@code id}, {@code whole}) are removed from the span,
 * {@code idx} and {@code whole} are removed from its parent, and the parent gets a
 * new leading {@code id} attribute of the form {@code w_N}.
 */
private void makeCompliantHocr() {
    xpath = XPathFactory.instance().compile("//ns:span[@id|@idx]", Filters.element(), null,
            Namespace.getNamespace("ns", "http://www.w3.org/1999/xhtml"));
    List<Element> elements = xpath.evaluate(root);
    int spanId = 0;
    for (Element span : elements) {
        // For spans carrying idx, continue with the first child if there is one;
        // otherwise the span itself is processed.
        if (span.getAttribute("idx") != null) {
            try {
                span = span.getChildren().get(0);
            } catch (Exception ex) {
                // no child element: keep working on the span itself
            }
        }
        // Put a running id "w_N" in front of the existing attributes of the parent.
        // NOTE(review): assumes the span has a parent element - confirm for root-level spans.
        LinkedList<Attribute> attributeLl = new LinkedList(span.getParentElement().getAttributes());
        attributeLl.addFirst(new Attribute("id", "w_" + spanId++));
        span.getParentElement().setAttributes(attributeLl);
        // The title is read before it is removed below; its blank-separated tokens are the suggestions.
        String[] suggestions = null;
        String title = span.getAttributeValue("title");
        if (title != null) {
            suggestions = title.split(" ");
        }
        if (suggestions == null) {
            suggestions = new String[] { "" };
        }
        // The former text of the span moves into an <ins class="alt"> element.
        Element ins = new Element("ins", xmlns);
        ins.setAttribute("class", "alt");
        ins.setAttribute("title", makeNlp(span.getAttributeValue("class")));
        ins.setText(span.getText());
        span.removeContent();
        span.addContent(ins);
        span.setAttribute("class", "alternatives");
        // Remove the working attributes from the span and its parent.
        span.removeAttribute("uc");
        span.removeAttribute("occ");
        span.removeAttribute("title");
        span.removeAttribute("anchor");
        span.removeAttribute("anchor-id");
        span.removeAttribute("id");
        span.getParentElement().removeAttribute("idx");
        span.removeAttribute("whole");
        span.getParentElement().removeAttribute("whole");
        // Without a title there are no suggestions to add.
        if (title == null || "".equals(title)) {
            continue;
        }
        // The first suggestion gets 0.90, each following one 0.01 less.
        double score = 0.90;
        for (String suggestion : suggestions) {
            if (suggestion == null || "".equals(suggestion)) {
                continue;
            }
            Element del = new Element("del", xmlns);
            // Commas in the formatted score are replaced so that it always uses a decimal point.
            del.setAttribute("title", "nlp " + String.format("%.2f", score).replaceAll(",", "."));
            score = score - 0.01;
            // Strip what l1PunctMarkFilter matches, then take over what the left/right patterns
            // capture (group 1) from the text of ins.
            suggestion = suggestion.replaceAll(l1PunctMarkFilter, "");
            Matcher leftMatcher = l1LeftPunctMarkPattern.matcher(ins.getText());
            if (leftMatcher.matches()) {
                suggestion = leftMatcher.group(1) + suggestion;
            }
            Matcher rightMatcher = l1RightPunctMarkPattern.matcher(ins.getText());
            if (rightMatcher.matches()) {
                // A trailing \u261a marker has to stay the last character of the suggestion.
                String ngtSymbol = "";
                if (suggestion.endsWith("\u261a")) {
                    ngtSymbol = "\u261a";
                    suggestion = suggestion.substring(0, suggestion.length() - 1);
                }
                suggestion = suggestion + rightMatcher.group(1) + ngtSymbol;
            }
            // For a suggestion marked with \u261a, and only if the grandparent of ins has an
            // xml:lang attribute: when the suggestion is similar enough to the current text,
            // the suggestion (without the marker) becomes the text of ins and the former text,
            // marked with \u261b, becomes the suggestion.
            if (suggestion.endsWith("\u261a") && ins.getParentElement().getParentElement()
                    .getAttributeValue("lang", Namespace.XML_NAMESPACE) != null) {
                String buff = suggestion.substring(0, suggestion.length() - 1);
                sa.align(buff, ins.getText());
                // Similarity is one minus the edit distance divided by the longer length.
                double sim = 1 - sa.getEditDistance()
                        / Math.max((double) buff.length(), (double) ins.getText().length());
                if (sim > 0.6) {

                    suggestion = ins.getText() + "\u261b";
                    ins.setText(buff);
                    ins.setAttribute("title", "nlp 0.70");
                }
            }
            del.addContent(suggestion);
            span.addContent(del);
        }
    }
}

From source file:eu.himeros.hocr.NgtMaker.java

License:Open Source License

/**
 * Parses the given file: resets the parsing state, derives the output file
 * name (the last four characters of the absolute path are replaced by
 * {@code ngt.xml}), feeds every {@code ocr_word} span to
 * {@code parseOcrWord}, then finds the anchors and writes the fragment.
 *
 * @param file the file to parse; it is passed through {@code adjustFile}
 *            first
 * @throws Exception if the file cannot be adjusted, parsed or processed
 */
public void parseDoc(File file) throws Exception {
    adjustFile(file);

    // Reset the state left over from a previous document.
    start = -1;
    end = -1;
    prevValue = -1;
    ocrAl = new ArrayList<>(1000);

    String absolutePath = file.getAbsolutePath();
    String pathWithoutExtension = absolutePath.substring(0, absolutePath.length() - 4);
    outFileName = pathWithoutExtension + "ngt.xml";

    builder = new SAXBuilder();
    doc = builder.build(file);
    root = doc.getRootElement();
    xmlns = root.getNamespace();

    Namespace xhtmlNs = Namespace.getNamespace("ns", "http://www.w3.org/1999/xhtml");
    xpath = XPathFactory.instance().compile("//ns:span[@class='ocr_word']", Filters.element(), null,
            xhtmlNs);
    List<Element> ocrWords = xpath.evaluate(root);
    for (Element ocrWord : ocrWords) {
        parseOcrWord(ocrWord);
    }

    // Two "%%%" entries are appended after the last word.
    ocrAl.add("%%%");
    ocrAl.add("%%%");
    findAnchors();
    writeFragment(start, end);
}