List of usage examples for the org.jsoup.nodes.Element method tagName
public Element tagName(String tagName)
From source file:com.maxl.java.aips2xml.Aips2Xml.java
static String convertHtmlToXml(String med_title, String html_str, String regnr_str) { Document mDoc = Jsoup.parse(html_str); mDoc.outputSettings().escapeMode(EscapeMode.xhtml); mDoc.outputSettings().prettyPrint(true); mDoc.outputSettings().indentAmount(4); // <div id="monographie"> -> <fi> mDoc.select("div[id=monographie]").tagName("fi").removeAttr("id"); // <div class="MonTitle"> -> <title> mDoc.select("div[class=MonTitle]").tagName("title").removeAttr("class").removeAttr("id"); // Beautify the title to the best of my possibilities ... still not good enough! String title_str = mDoc.select("title").text().trim().replaceAll("<br />", "").replaceAll("(\\t|\\r?\\n)+", "");//from w w w . ja v a2 s .c om if (!title_str.equals(med_title)) if (SHOW_ERRORS) System.err.println(med_title + " differs from " + title_str); // Fallback solution: use title from the header AIPS.xml file - the titles look all pretty good! mDoc.select("title").first().text(med_title); // <div class="ownerCompany"> -> <owner> Element owner_elem = mDoc.select("div[class=ownerCompany]").first(); if (owner_elem != null) { owner_elem.tagName("owner").removeAttr("class"); String owner_str = mDoc.select("owner").text(); mDoc.select("owner").first().text(owner_str); } else { mDoc.select("title").after("<owner></owner>"); if (DB_LANGUAGE.equals("de")) mDoc.select("owner").first().text("k.A."); else if (DB_LANGUAGE.equals("fr")) mDoc.select("owner").first().text("n.s."); } // <div class="paragraph"> -> <paragraph> mDoc.select("div[class=paragraph]").tagName("paragraph").removeAttr("class").removeAttr("id"); // <div class="absTitle"> -> <paragraphTitle> mDoc.select("div[class=absTitle]").tagName("paragraphtitle").removeAttr("class"); // <div class="untertitle1"> -> <paragraphSubTitle> mDoc.select("div[class=untertitle1]").tagName("paragraphsubtitle").removeAttr("class"); // <div class="untertitle"> -> <paragraphSubTitle> mDoc.select("div[class=untertitle]").tagName("paragraphsubtitle").removeAttr("class"); // 
<div class="shortCharacteristic"> -> <characteristic> mDoc.select("div[class=shortCharacteristic]").tagName("characteristic").removeAttr("class"); // <div class="image"> mDoc.select("div[class=image]").tagName("image").removeAttr("class"); // <p class="spacing1"> -> <p> / <p class="noSpacing"> -> <p> mDoc.select("p[class]").tagName("p").removeAttr("class"); // <span style="font-style:italic"> -> <i> mDoc.select("span").tagName("i").removeAttr("style"); // <i class="indention1"> -> <i> / <i class="indention2"> -> <b-i> mDoc.select("i[class=indention1]").tagName("i").removeAttr("class"); mDoc.select("i[class=indention2]").tagName("i").removeAttr("class"); // mDoc.select("p").select("i").tagName("i"); // mDoc.select("paragraphtitle").select("i").tagName("para-i"); // mDoc.select("paragraphsubtitle").select("i").tagName("parasub-i"); Elements elems = mDoc.select("paragraphtitle"); for (Element e : elems) { if (!e.text().isEmpty()) e.text(e.text()); } elems = mDoc.select("paragraphsubtitle"); for (Element e : elems) { if (!e.text().isEmpty()) e.text(e.text()); } // Here we take care of tables // <table class="s21"> -> <table> mDoc.select("table[class]").removeAttr("class"); mDoc.select("table").removeAttr("cellspacing").removeAttr("cellpadding").removeAttr("border"); mDoc.select("colgroup").remove(); mDoc.select("td").removeAttr("class").removeAttr("colspan").removeAttr("rowspan"); mDoc.select("tr").removeAttr("class"); elems = mDoc.select("div[class]"); for (Element e : elems) { if (e.text().isEmpty()) e.remove(); } mDoc.select("tbody").unwrap(); // Remove nested table (a nasty table-in-a-table Elements nested_table = mDoc.select("table").select("tr").select("td").select("table"); if (!nested_table.isEmpty()) { nested_table.select("table").unwrap(); } // Here we take care of the images mDoc.select("img").removeAttr("style").removeAttr("align").removeAttr("border"); // Subs and sups mDoc.select("sub[class]").tagName("sub").removeAttr("class"); 
mDoc.select("sup[class]").tagName("sup").removeAttr("class"); mDoc.select("td").select("sub").tagName("td-sub"); mDoc.select("td").select("sup").tagName("td-sup"); // Remove floating <td-sup> tags mDoc.select("p").select("td-sup").tagName("sup"); mDoc.select("p").select("td-sub").tagName("sub"); // Box mDoc.select("div[class=box]").tagName("box").removeAttr("class"); // Insert swissmedicno5 after <owner> tag mDoc.select("owner").after("<swissmedicno5></swissmedicno5"); mDoc.select("swissmedicno5").first().text(regnr_str); // Remove html, head and body tags String xml_str = mDoc.select("body").first().html(); //xml_str = xml_str.replaceAll("<tbody>", "").replaceAll("</tbody>", ""); xml_str = xml_str.replaceAll("<sup> </sup>", ""); xml_str = xml_str.replaceAll("<sub> </sub>", ""); xml_str = xml_str.replaceAll("<p> <i>", "<p><i>"); xml_str = xml_str.replaceAll("</p> </td>", "</p></td>"); xml_str = xml_str.replaceAll("<p> </p>", "<p></p>"); // MUST be improved, the space is not a real space!! xml_str = xml_str.replaceAll("", "- "); xml_str = xml_str.replaceAll("<br />", ""); xml_str = xml_str.replaceAll("(?m)^[ \t]*\r?\n", ""); // Remove multiple instances of <p></p> Scanner scanner = new Scanner(xml_str); String new_xml_str = ""; int counter = 0; while (scanner.hasNextLine()) { String line = scanner.nextLine(); if (line.trim().equals("<p></p>")) { counter++; } else counter = 0; if (counter < 3) new_xml_str += line; } scanner.close(); return new_xml_str; }
From source file:no.kantega.publishing.admin.content.htmlfilter.ConvertUnderlineToEditorStyleFilter.java
@Override public Document runFilter(Document document) { for (Element span : document.getElementsByTag("span")) { String style = span.attr("style"); if (isNotBlank(style)) { String textDecoration = getSubAttributeValue(style, "text-decoration"); if ("underline".equalsIgnoreCase(textDecoration)) { span.removeAttr("style"); span.tagName("u"); }// w w w.j a v a 2 s. c om } } return document; }
From source file:org.apache.archiva.web.docs.RestDocsServlet.java
@Override protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException { logger.debug("docs request to path: {}", req.getPathInfo()); String path = StringUtils.removeStart(req.getPathInfo(), "/"); InputStream is = Thread.currentThread().getContextClassLoader().getResourceAsStream(path); if (StringUtils.endsWith(path, ".xsd")) { StringEscapeUtils.escapeXml(resp.getWriter(), IOUtils.toString(is)); //IOUtils.copy( is, resp.getOutputStream() ); return;// www .j a v a2 s. co m } String startPath = StringUtils.substringBefore(path, "/"); // replace all links !! Document document = Jsoup.parse(is, "UTF-8", ""); Element body = document.body().child(0); Elements links = body.select("a[href]"); for (Element link : links) { link.attr("href", "#" + startPath + "/" + link.attr("href")); } Elements datalinks = body.select("[data-href]"); for (Element link : datalinks) { link.attr("data-href", "#" + startPath + "/" + link.attr("data-href")); } Elements codes = body.select("code"); for (Element code : codes) { code.attr("class", code.attr("class") + " nice-code"); } //default generated enunciate use h1/h2/h3 which is quite big so transform to h3/h4/h5 Elements headers = body.select("h1"); for (Element header : headers) { header.tagName("h3"); } headers = body.select("h2"); for (Element header : headers) { header.tagName("h4"); } headers = body.select("h3"); for (Element header : headers) { header.tagName("h5"); } Document res = new Document(""); res.appendChild(body.select("div[id=main]").first()); Elements scripts = body.select("script"); for (Element script : scripts) { res.appendChild(script); } resp.getOutputStream().write(res.outerHtml().getBytes()); }
From source file:org.asqatasun.rules.doc.utils.rga33.extractor.Rgaa3Extractor.java
private static void createTestcaseFiles() throws IOException { File srcDir = new File(RGAA3_TESTCASE_PATH); for (File file : srcDir.listFiles()) { String fileName = file.getName().replace("Rgaa30Rule", "").replace(".java", ""); String theme = fileName.substring(0, 2); String crit = fileName.substring(2, 4); String test = fileName.substring(4, 6); String testKey = Integer.valueOf(theme).toString() + "-" + Integer.valueOf(crit).toString() + "-" + Integer.valueOf(test).toString(); String wrongKey = theme + "." + crit + "." + test; for (File testcase : file.listFiles()) { if (testcase.isFile() && testcase.getName().contains("html")) { Document doc = Jsoup.parse(FileUtils.readFileToString(testcase)); Element detail = doc.select(".test-detail").first(); if (detail == null) { System.out.println(doc.outerHtml()); } else { detail.tagName("div"); detail.text(""); for (Element el : detail.children()) { el.remove();// w ww .j a v a2 s . c o m } if (!detail.hasAttr("lang")) { detail.attr("lang", "fr"); } detail.append("\n" + RGAA3.get(testKey).ruleRawHtml + "\n"); doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml); doc.outputSettings().outline(false); doc.outputSettings().indentAmount(4); String outputHtml = doc.outerHtml(); if (outputHtml.contains(wrongKey)) { outputHtml = outputHtml.replaceAll(wrongKey, RGAA3.get(testKey).getRuleDot()); } FileUtils.writeStringToFile(testcase, outputHtml); } } } } }