Example usage for org.jsoup.nodes Element tagName

List of usage examples for org.jsoup.nodes Element tagName

Introduction

On this page you can find example usages of the org.jsoup.nodes.Element method tagName.

Prototype

public Element tagName(String tagName) 

Source Link

Document

Change the tag of this element.

Usage

From source file:com.maxl.java.aips2xml.Aips2Xml.java

/**
 * Converts an HTML monograph into a simplified XML-like string.
 *
 * The HTML is parsed with jsoup, selected div/span/sub/sup elements are renamed to
 * custom tags (fi, title, owner, paragraph, ...), presentation attributes are removed,
 * the title is overwritten with {@code med_title}, a {@code swissmedicno5} element
 * holding {@code regnr_str} is inserted after the owner element, and finally the inner
 * HTML of the body is post-processed with a series of string replacements.
 *
 * @param med_title title that is written into the {@code title} element
 * @param html_str  HTML source to convert
 * @param regnr_str text written into the inserted {@code swissmedicno5} element
 * @return the converted markup; the remaining lines are concatenated without line separators
 */
static String convertHtmlToXml(String med_title, String html_str, String regnr_str) {
    Document mDoc = Jsoup.parse(html_str);
    mDoc.outputSettings().escapeMode(EscapeMode.xhtml);
    mDoc.outputSettings().prettyPrint(true);
    mDoc.outputSettings().indentAmount(4);

    // <div id="monographie"> -> <fi>
    mDoc.select("div[id=monographie]").tagName("fi").removeAttr("id");
    // <div class="MonTitle"> -> <title>
    mDoc.select("div[class=MonTitle]").tagName("title").removeAttr("class").removeAttr("id");
    // Clean up the title as far as possible; it is only used for the comparison below.
    String title_str = mDoc.select("title").text().trim().replaceAll("<br />", "").replaceAll("(\\t|\\r?\\n)+",
            "");
    if (!title_str.equals(med_title)) {
        if (SHOW_ERRORS) {
            System.err.println(med_title + " differs from " + title_str);
        }
    }
    // Fallback solution: always use the title passed in by the caller.
    mDoc.select("title").first().text(med_title);
    // <div class="ownerCompany"> -> <owner>
    Element owner_elem = mDoc.select("div[class=ownerCompany]").first();
    if (owner_elem != null) {
        owner_elem.tagName("owner").removeAttr("class");
        // Re-setting the text flattens any nested markup inside <owner>.
        String owner_str = mDoc.select("owner").text();
        mDoc.select("owner").first().text(owner_str);
    } else {
        // No owner in the source: insert an empty one with a language-specific placeholder.
        mDoc.select("title").after("<owner></owner>");
        if (DB_LANGUAGE.equals("de")) {
            mDoc.select("owner").first().text("k.A.");
        } else if (DB_LANGUAGE.equals("fr")) {
            mDoc.select("owner").first().text("n.s.");
        }
    }

    // <div class="paragraph"> -> <paragraph>
    mDoc.select("div[class=paragraph]").tagName("paragraph").removeAttr("class").removeAttr("id");
    // <div class="absTitle"> -> <paragraphtitle>
    mDoc.select("div[class=absTitle]").tagName("paragraphtitle").removeAttr("class");
    // <div class="untertitle1"> -> <paragraphsubtitle>
    mDoc.select("div[class=untertitle1]").tagName("paragraphsubtitle").removeAttr("class");
    // <div class="untertitle"> -> <paragraphsubtitle>
    mDoc.select("div[class=untertitle]").tagName("paragraphsubtitle").removeAttr("class");
    // <div class="shortCharacteristic"> -> <characteristic>
    mDoc.select("div[class=shortCharacteristic]").tagName("characteristic").removeAttr("class");
    // <div class="image"> -> <image>
    mDoc.select("div[class=image]").tagName("image").removeAttr("class");

    // <p class="spacing1"> -> <p> / <p class="noSpacing"> -> <p>
    mDoc.select("p[class]").tagName("p").removeAttr("class");
    // Every <span> becomes <i> and loses its style attribute
    mDoc.select("span").tagName("i").removeAttr("style");
    // <i class="indention1"> -> <i> / <i class="indention2"> -> <i>
    mDoc.select("i[class=indention1]").tagName("i").removeAttr("class");
    mDoc.select("i[class=indention2]").tagName("i").removeAttr("class");
    // mDoc.select("p").select("i").tagName("i");
    // mDoc.select("paragraphtitle").select("i").tagName("para-i");
    // mDoc.select("paragraphsubtitle").select("i").tagName("parasub-i");
    // Replace the content of non-empty titles and subtitles by their plain text.
    Elements elems = mDoc.select("paragraphtitle");
    for (Element e : elems) {
        if (!e.text().isEmpty()) {
            e.text(e.text());
        }
    }
    elems = mDoc.select("paragraphsubtitle");
    for (Element e : elems) {
        if (!e.text().isEmpty()) {
            e.text(e.text());
        }
    }

    // Here we take care of tables
    // <table class="s21"> -> <table>
    mDoc.select("table[class]").removeAttr("class");
    mDoc.select("table").removeAttr("cellspacing").removeAttr("cellpadding").removeAttr("border");
    mDoc.select("colgroup").remove();
    mDoc.select("td").removeAttr("class").removeAttr("colspan").removeAttr("rowspan");
    mDoc.select("tr").removeAttr("class");
    // Drop any remaining classed <div> that carries no text.
    elems = mDoc.select("div[class]");
    for (Element e : elems) {
        if (e.text().isEmpty()) {
            e.remove();
        }
    }

    mDoc.select("tbody").unwrap();
    // Remove nested tables (a nasty table-in-a-table)
    Elements nested_table = mDoc.select("table").select("tr").select("td").select("table");
    if (!nested_table.isEmpty()) {
        nested_table.select("table").unwrap();
    }

    // Here we take care of the images
    mDoc.select("img").removeAttr("style").removeAttr("align").removeAttr("border");

    // Subs and sups
    mDoc.select("sub[class]").tagName("sub").removeAttr("class");
    mDoc.select("sup[class]").tagName("sup").removeAttr("class");
    mDoc.select("td").select("sub").tagName("td-sub");
    mDoc.select("td").select("sup").tagName("td-sup");
    // Inside a <p>, turn <td-sup>/<td-sub> back into plain <sup>/<sub>
    mDoc.select("p").select("td-sup").tagName("sup");
    mDoc.select("p").select("td-sub").tagName("sub");

    // Box
    mDoc.select("div[class=box]").tagName("box").removeAttr("class");

    // Insert swissmedicno5 after <owner> tag (closing tag is now well-formed)
    mDoc.select("owner").after("<swissmedicno5></swissmedicno5>");
    mDoc.select("swissmedicno5").first().text(regnr_str);

    // Remove html, head and body tags
    String xml_str = mDoc.select("body").first().html();

    //xml_str = xml_str.replaceAll("<tbody>", "").replaceAll("</tbody>", "");
    xml_str = xml_str.replace("<sup> </sup>", "");
    xml_str = xml_str.replace("<sub> </sub>", "");
    xml_str = xml_str.replace("<p> <i>", "<p><i>");
    xml_str = xml_str.replace("</p> </td>", "</p></td>");
    xml_str = xml_str.replace("<p> </p>", "<p></p>"); // MUST be improved, the space is not a real space!!
    // NOTE(review): as seen here the pattern is an empty string. An empty regex matches at
    // every position, so this call inserts "- " between all characters. The intended
    // character was presumably lost in an encoding step -- confirm and restore it.
    xml_str = xml_str.replaceAll("", "- ");
    xml_str = xml_str.replace("<br />", "");
    // Drop lines that contain only blanks/tabs.
    xml_str = xml_str.replaceAll("(?m)^[ \t]*\r?\n", "");

    // Remove multiple instances of <p></p>: in a run of consecutive "<p></p>" lines only
    // the first two are kept. Lines are appended without any line separator.
    StringBuilder new_xml = new StringBuilder(xml_str.length());
    Scanner scanner = new Scanner(xml_str);
    int counter = 0;
    while (scanner.hasNextLine()) {
        String line = scanner.nextLine();
        if (line.trim().equals("<p></p>")) {
            counter++;
        } else {
            counter = 0;
        }
        if (counter < 3) {
            new_xml.append(line);
        }
    }
    scanner.close();

    return new_xml.toString();
}

From source file:no.kantega.publishing.admin.content.htmlfilter.ConvertUnderlineToEditorStyleFilter.java

/**
 * Rewrites every {@code <span>} whose inline style declares
 * {@code text-decoration: underline} into a {@code <u>} element.
 *
 * The style attribute of a matching span is removed before the tag is renamed.
 * Spans with a blank style or a different text-decoration are left untouched.
 *
 * @param document the document to filter; it is modified in place
 * @return the same document instance
 */
@Override
public Document runFilter(Document document) {
    for (Element candidate : document.getElementsByTag("span")) {
        String inlineStyle = candidate.attr("style");
        if (!isNotBlank(inlineStyle)) {
            // Nothing to inspect on spans without a style.
            continue;
        }

        String decoration = getSubAttributeValue(inlineStyle, "text-decoration");
        if (!"underline".equalsIgnoreCase(decoration)) {
            continue;
        }

        candidate.removeAttr("style");
        candidate.tagName("u");
    }
    return document;
}

From source file:org.apache.archiva.web.docs.RestDocsServlet.java

/**
 * Serves a documentation resource looked up on the context class loader.
 *
 * Resources ending in ".xsd" are written XML-escaped to the response writer. Any other
 * resource is parsed as HTML; its links and data-href attributes are rewritten to
 * "#&lt;first path segment&gt;/&lt;original value&gt;", code elements get the extra
 * class "nice-code", h1/h2/h3 headers are demoted to h3/h4/h5, and only the
 * {@code div#main} element plus the script elements are written to the response.
 *
 * Responds with 404 when the request has no path info or the resource does not exist.
 *
 * @param req  the request; its path info (without the leading slash) is the resource name
 * @param resp the response the content is written to
 * @throws ServletException declared by the servlet contract
 * @throws IOException      if reading the resource or writing the response fails
 */
@Override
protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {

    logger.debug("docs request to path: {}", req.getPathInfo());

    String path = StringUtils.removeStart(req.getPathInfo(), "/");
    if (path == null) {
        // No path info at all: there is no resource to look up.
        resp.sendError(HttpServletResponse.SC_NOT_FOUND);
        return;
    }

    // NOTE(review): the path comes straight from the request and is resolved against the
    // whole class path -- confirm that exposing arbitrary class path resources is intended.
    InputStream is = Thread.currentThread().getContextClassLoader().getResourceAsStream(path);
    if (is == null) {
        // getResourceAsStream returns null for unknown resources.
        resp.sendError(HttpServletResponse.SC_NOT_FOUND);
        return;
    }

    try {
        if (StringUtils.endsWith(path, ".xsd")) {
            StringEscapeUtils.escapeXml(resp.getWriter(), IOUtils.toString(is));
            //IOUtils.copy( is, resp.getOutputStream() );
            return;
        }

        String startPath = StringUtils.substringBefore(path, "/");

        // replace all links !!
        Document document = Jsoup.parse(is, "UTF-8", "");

        Element body = document.body().child(0);

        Elements links = body.select("a[href]");

        for (Element link : links) {
            link.attr("href", "#" + startPath + "/" + link.attr("href"));
        }

        Elements datalinks = body.select("[data-href]");

        for (Element link : datalinks) {
            link.attr("data-href", "#" + startPath + "/" + link.attr("data-href"));
        }

        Elements codes = body.select("code");

        for (Element code : codes) {
            code.attr("class", code.attr("class") + " nice-code");
        }

        // default generated enunciate use h1/h2/h3 which is quite big so transform to h3/h4/h5
        // The deepest level is renamed first: renaming h1 -> h3 before selecting "h3"
        // would demote the former h1 headers a second time (to h5).

        Elements headers = body.select("h3");

        for (Element header : headers) {
            header.tagName("h5");
        }

        headers = body.select("h2");

        for (Element header : headers) {
            header.tagName("h4");
        }

        headers = body.select("h1");

        for (Element header : headers) {
            header.tagName("h3");
        }

        Document res = new Document("");
        res.appendChild(body.select("div[id=main]").first());

        Elements scripts = body.select("script");
        for (Element script : scripts) {
            res.appendChild(script);
        }
        // Encode explicitly instead of relying on the platform default charset;
        // the input is parsed as UTF-8 above.
        resp.getOutputStream().write(res.outerHtml().getBytes("UTF-8"));
    } finally {
        // The resource stream was previously never closed.
        is.close();
    }

}

From source file:org.asqatasun.rules.doc.utils.rga33.extractor.Rgaa3Extractor.java

/**
 * Rewrites the HTML testcase files found below {@code RGAA3_TESTCASE_PATH}.
 *
 * Each sub-directory name is expected to contain, after removing "Rgaa30Rule" and
 * ".java", six leading digits: theme, criterion and test (two digits each). For every
 * file in such a directory whose name contains "html", the first ".test-detail"
 * element is turned into a {@code div}, emptied, given {@code lang="fr"} when it has no
 * lang attribute, and filled with the raw rule HTML looked up in {@code RGAA3}. The
 * zero-padded dotted key ("01.02.03") is then replaced in the output by the value of
 * {@code getRuleDot()} and the file is overwritten.
 *
 * Documents without a ".test-detail" element are printed to standard output and left
 * unchanged. Entries of the source directory that are not directories are skipped.
 *
 * @throws IOException if the source directory cannot be listed or a file cannot be
 *                     read or written
 */
private static void createTestcaseFiles() throws IOException {
    File srcDir = new File(RGAA3_TESTCASE_PATH);
    File[] ruleDirs = srcDir.listFiles();
    if (ruleDirs == null) {
        // listFiles() returns null when the path is not a directory or cannot be read.
        throw new IOException("Cannot list testcase directory: " + srcDir);
    }
    for (File file : ruleDirs) {
        File[] testcases = file.listFiles();
        if (testcases == null) {
            // Not a directory (or unreadable): there are no testcases to rewrite.
            continue;
        }
        String fileName = file.getName().replace("Rgaa30Rule", "").replace(".java", "");
        String theme = fileName.substring(0, 2);
        String crit = fileName.substring(2, 4);
        String test = fileName.substring(4, 6);
        // Key without leading zeros, e.g. "1-2-3"
        String testKey = Integer.valueOf(theme).toString() + "-" + Integer.valueOf(crit).toString() + "-"
                + Integer.valueOf(test).toString();
        // Zero-padded dotted form, e.g. "01.02.03", that must not appear in the output
        String wrongKey = theme + "." + crit + "." + test;
        for (File testcase : testcases) {
            if (testcase.isFile() && testcase.getName().contains("html")) {
                Document doc = Jsoup.parse(FileUtils.readFileToString(testcase));
                Element detail = doc.select(".test-detail").first();
                if (detail == null) {
                    System.out.println(doc.outerHtml());
                } else {
                    detail.tagName("div");
                    detail.text("");
                    for (Element el : detail.children()) {
                        el.remove();
                    }
                    if (!detail.hasAttr("lang")) {
                        detail.attr("lang", "fr");
                    }
                    detail.append("\n" + RGAA3.get(testKey).ruleRawHtml + "\n");
                    doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml);
                    doc.outputSettings().outline(false);
                    doc.outputSettings().indentAmount(4);
                    String outputHtml = doc.outerHtml();
                    if (outputHtml.contains(wrongKey)) {
                        // Literal replacement: with replaceAll the dots in wrongKey would act
                        // as regex wildcards and '$' or '\' in the replacement would be
                        // interpreted as group references/escapes.
                        outputHtml = outputHtml.replace(wrongKey, RGAA3.get(testKey).getRuleDot());
                    }
                    FileUtils.writeStringToFile(testcase, outputHtml);
                }
            }
        }
    }
}