List of usage examples for org.jsoup.nodes Element remove
public void remove()
From source file:com.astamuse.asta4d.render.RenderUtil.java
private final static void apply(Element target, List<Renderer> rendererList, RenderAction renderAction, int startIndex, int count) { // The renderer list have to be applied recursively because the // transformer will always return a new Element clone. if (startIndex >= count) { return;/*w w w . j a v a2 s . c o m*/ } final Renderer currentRenderer = rendererList.get(startIndex); RendererType rendererType = currentRenderer.getRendererType(); switch (rendererType) { case GO_THROUGH: apply(target, rendererList, renderAction, startIndex + 1, count); return; /* case DEBUG: currentRenderer.getTransformerList().get(0).invoke(target); apply(target, rendererList, renderAction, startIndex + 1, count); return; */ case RENDER_ACTION: ((RenderActionRenderer) currentRenderer).getStyle().apply(renderAction); apply(target, rendererList, renderAction, startIndex + 1, count); return; default: // do nothing break; } String selector = currentRenderer.getSelector(); List<Transformer<?>> transformerList = currentRenderer.getTransformerList(); List<Element> elemList; if (PSEUDO_ROOT_SELECTOR.equals(selector)) { elemList = new LinkedList<Element>(); elemList.add(target); } else { elemList = new ArrayList<>(target.select(selector)); } if (elemList.isEmpty()) { if (rendererType == RendererType.ELEMENT_NOT_FOUND_HANDLER) { elemList.add(target); transformerList.clear(); transformerList.add( new RendererTransformer(((ElementNotFoundHandler) currentRenderer).alternativeRenderer())); } else if (renderAction.isOutputMissingSelectorWarning()) { String creationInfo = currentRenderer.getCreationSiteInfo(); if (creationInfo == null) { creationInfo = ""; } else { creationInfo = " at [ " + creationInfo + " ]"; } logger.warn( "There is no element found for selector [{}]{}, if it is deserved, try Renderer#disableMissingSelectorWarning() " + "to disable this message and Renderer#enableMissingSelectorWarning could enable this warning again in " + "your renderer chain", selector, creationInfo); 
apply(target, rendererList, renderAction, startIndex + 1, count); return; } } else { if (rendererType == RendererType.ELEMENT_NOT_FOUND_HANDLER) { apply(target, rendererList, renderAction, startIndex + 1, count); return; } } Element delayedElement = null; Element resultNode; // TODO we suppose that the element is listed as the order from parent // to children, so we reverse it. Perhaps we need a real order process // to ensure the wanted order. Collections.reverse(elemList); boolean renderForRoot; for (Element elem : elemList) { renderForRoot = PSEUDO_ROOT_SELECTOR.equals(selector) || rendererType == RendererType.ELEMENT_NOT_FOUND_HANDLER; if (!renderForRoot) { // faked group node will be not applied by renderers(only when the current selector is not the pseudo :root) if (elem.tagName().equals(ExtNodeConstants.GROUP_NODE_TAG) && ExtNodeConstants.GROUP_NODE_ATTR_TYPE_FAKE .equals(elem.attr(ExtNodeConstants.GROUP_NODE_ATTR_TYPE))) { continue; } } if (elem == target) { delayedElement = elem; continue; } for (Transformer<?> transformer : transformerList) { resultNode = transformer.invoke(elem); elem.before(resultNode); } // for transformer elem.remove(); } // for element // if the root element is one of the process targets, we can not apply // the left renderers to original element because it will be replaced by // a new element even it is not necessary (that is how Transformer // works). if (delayedElement == null) { apply(target, rendererList, renderAction, startIndex + 1, count); } else { if (rendererType == RendererType.ELEMENT_NOT_FOUND_HANDLER && delayedElement instanceof Document) { delayedElement = delayedElement.child(0); } for (Transformer<?> transformer : transformerList) { resultNode = transformer.invoke(delayedElement); delayedElement.before(resultNode); apply(resultNode, rendererList, renderAction, startIndex + 1, count); } // for transformer delayedElement.remove(); } }
From source file:com.maxl.java.aips2xml.Aips2Xml.java
static String convertHtmlToXml(String med_title, String html_str, String regnr_str) { Document mDoc = Jsoup.parse(html_str); mDoc.outputSettings().escapeMode(EscapeMode.xhtml); mDoc.outputSettings().prettyPrint(true); mDoc.outputSettings().indentAmount(4); // <div id="monographie"> -> <fi> mDoc.select("div[id=monographie]").tagName("fi").removeAttr("id"); // <div class="MonTitle"> -> <title> mDoc.select("div[class=MonTitle]").tagName("title").removeAttr("class").removeAttr("id"); // Beautify the title to the best of my possibilities ... still not good enough! String title_str = mDoc.select("title").text().trim().replaceAll("<br />", "").replaceAll("(\\t|\\r?\\n)+", "");/*from w ww .j av a 2 s . c o m*/ if (!title_str.equals(med_title)) if (SHOW_ERRORS) System.err.println(med_title + " differs from " + title_str); // Fallback solution: use title from the header AIPS.xml file - the titles look all pretty good! mDoc.select("title").first().text(med_title); // <div class="ownerCompany"> -> <owner> Element owner_elem = mDoc.select("div[class=ownerCompany]").first(); if (owner_elem != null) { owner_elem.tagName("owner").removeAttr("class"); String owner_str = mDoc.select("owner").text(); mDoc.select("owner").first().text(owner_str); } else { mDoc.select("title").after("<owner></owner>"); if (DB_LANGUAGE.equals("de")) mDoc.select("owner").first().text("k.A."); else if (DB_LANGUAGE.equals("fr")) mDoc.select("owner").first().text("n.s."); } // <div class="paragraph"> -> <paragraph> mDoc.select("div[class=paragraph]").tagName("paragraph").removeAttr("class").removeAttr("id"); // <div class="absTitle"> -> <paragraphTitle> mDoc.select("div[class=absTitle]").tagName("paragraphtitle").removeAttr("class"); // <div class="untertitle1"> -> <paragraphSubTitle> mDoc.select("div[class=untertitle1]").tagName("paragraphsubtitle").removeAttr("class"); // <div class="untertitle"> -> <paragraphSubTitle> mDoc.select("div[class=untertitle]").tagName("paragraphsubtitle").removeAttr("class"); 
// <div class="shortCharacteristic"> -> <characteristic> mDoc.select("div[class=shortCharacteristic]").tagName("characteristic").removeAttr("class"); // <div class="image"> mDoc.select("div[class=image]").tagName("image").removeAttr("class"); // <p class="spacing1"> -> <p> / <p class="noSpacing"> -> <p> mDoc.select("p[class]").tagName("p").removeAttr("class"); // <span style="font-style:italic"> -> <i> mDoc.select("span").tagName("i").removeAttr("style"); // <i class="indention1"> -> <i> / <i class="indention2"> -> <b-i> mDoc.select("i[class=indention1]").tagName("i").removeAttr("class"); mDoc.select("i[class=indention2]").tagName("i").removeAttr("class"); // mDoc.select("p").select("i").tagName("i"); // mDoc.select("paragraphtitle").select("i").tagName("para-i"); // mDoc.select("paragraphsubtitle").select("i").tagName("parasub-i"); Elements elems = mDoc.select("paragraphtitle"); for (Element e : elems) { if (!e.text().isEmpty()) e.text(e.text()); } elems = mDoc.select("paragraphsubtitle"); for (Element e : elems) { if (!e.text().isEmpty()) e.text(e.text()); } // Here we take care of tables // <table class="s21"> -> <table> mDoc.select("table[class]").removeAttr("class"); mDoc.select("table").removeAttr("cellspacing").removeAttr("cellpadding").removeAttr("border"); mDoc.select("colgroup").remove(); mDoc.select("td").removeAttr("class").removeAttr("colspan").removeAttr("rowspan"); mDoc.select("tr").removeAttr("class"); elems = mDoc.select("div[class]"); for (Element e : elems) { if (e.text().isEmpty()) e.remove(); } mDoc.select("tbody").unwrap(); // Remove nested table (a nasty table-in-a-table Elements nested_table = mDoc.select("table").select("tr").select("td").select("table"); if (!nested_table.isEmpty()) { nested_table.select("table").unwrap(); } // Here we take care of the images mDoc.select("img").removeAttr("style").removeAttr("align").removeAttr("border"); // Subs and sups mDoc.select("sub[class]").tagName("sub").removeAttr("class"); 
mDoc.select("sup[class]").tagName("sup").removeAttr("class"); mDoc.select("td").select("sub").tagName("td-sub"); mDoc.select("td").select("sup").tagName("td-sup"); // Remove floating <td-sup> tags mDoc.select("p").select("td-sup").tagName("sup"); mDoc.select("p").select("td-sub").tagName("sub"); // Box mDoc.select("div[class=box]").tagName("box").removeAttr("class"); // Insert swissmedicno5 after <owner> tag mDoc.select("owner").after("<swissmedicno5></swissmedicno5"); mDoc.select("swissmedicno5").first().text(regnr_str); // Remove html, head and body tags String xml_str = mDoc.select("body").first().html(); //xml_str = xml_str.replaceAll("<tbody>", "").replaceAll("</tbody>", ""); xml_str = xml_str.replaceAll("<sup> </sup>", ""); xml_str = xml_str.replaceAll("<sub> </sub>", ""); xml_str = xml_str.replaceAll("<p> <i>", "<p><i>"); xml_str = xml_str.replaceAll("</p> </td>", "</p></td>"); xml_str = xml_str.replaceAll("<p> </p>", "<p></p>"); // MUST be improved, the space is not a real space!! xml_str = xml_str.replaceAll("", "- "); xml_str = xml_str.replaceAll("<br />", ""); xml_str = xml_str.replaceAll("(?m)^[ \t]*\r?\n", ""); // Remove multiple instances of <p></p> Scanner scanner = new Scanner(xml_str); String new_xml_str = ""; int counter = 0; while (scanner.hasNextLine()) { String line = scanner.nextLine(); if (line.trim().equals("<p></p>")) { counter++; } else counter = 0; if (counter < 3) new_xml_str += line; } scanner.close(); return new_xml_str; }
From source file:com.iorga.iraj.servlet.AgglomeratorServlet.java
private void parseResource(final ServletConfig config, final String path) throws IOException, URISyntaxException { //TODO catch the modifications on the path itself final URL pathUrl = config.getServletContext().getResource(path); long lastModified = pathUrl.openConnection().getLastModified(); final InputStream targetIS = pathUrl.openStream(); final Document document = Jsoup.parse(targetIS, "UTF-8", ""); final Elements elements = document.getElementsByAttribute(ATTRIBUTE_NAME); for (final Element element : elements) { // each element which defines iraj-agglomerate // retrieve the suffix final String suffix = element.attr(ATTRIBUTE_NAME); final String urlAttribute = element.attr(URL_ATTRIBUTE_ATTRIBUTE_NAME); String src = StringUtils.removeEndIgnoreCase(element.attr(urlAttribute), suffix); String prefix = ""; if (!src.startsWith("/")) { // this is not an absolute file, let's add the prefix from the given path prefix = StringUtils.substringBeforeLast(path, "/") + "/"; src = prefix + src;//from w w w. j a v a2 s . c om } // searching all scripts inside the folder defined by src attribute lastModified = searchAndAppendAfter(config, element, src, prefix, suffix, urlAttribute, lastModified); // finally remove it element.remove(); } caches.put(path, new ParsedResourceCacheEntry(path, document, lastModified)); }
From source file:ac.simons.oembed.Oembed.java
/** * Parses the given html document into a document and processes * all anchor elements. If a valid anchor is found, it tries to * get an oembed response for it's url and than render the result * into the document replacing the given anchor.<br> * It returns the html representation of the new document.<br> * If there's an error or no oembed result for an url, the anchor tag * will be left as it was. //from w w w . ja v a 2 s .c o m * @param document The document that should be checked for links to transform * @return the transformed document */ public Document transformDocument(final Document document) { boolean changedBaseUri = false; if (document.baseUri() == null && this.getBaseUri() != null) { document.setBaseUri(this.getBaseUri()); changedBaseUri = true; } for (Element a : document.getElementsByTag("a")) { final String href = a.absUrl("href"); try { String renderedRespose = null; final OembedResponse oembedResponse = this.transformUrl(href); // There was no response or an exception happened if (oembedResponse == null) continue; // There is a handler for this response else if (this.getHandler().containsKey(oembedResponse.getSource())) this.getHandler().get(oembedResponse.getSource()).handle(document, a, oembedResponse); // Try to render the response itself and replace the current anchor else if ((renderedRespose = oembedResponse.render()) != null) { a.before(renderedRespose); a.remove(); } } catch (OembedException e) { logger.warn(String.format("Skipping '%s': %s", href, e.getMessage())); } } if (changedBaseUri) document.setBaseUri(null); return document; }
From source file:com.jimplush.goose.ContentExtractor.java
/** * remove any divs that looks like non-content, clusters of links, or paras with no gusto * * @param node/*from w w w. j ava 2 s. c om*/ * @return */ private Element cleanupNode(Element node) { if (logger.isDebugEnabled()) { logger.debug("Starting cleanup Node"); } node = addSiblings(node); Elements nodes = node.children(); for (Element e : nodes) { if (e.tagName().equals("p")) { continue; } if (logger.isDebugEnabled()) { logger.debug("CLEANUP NODE: " + e.id() + " class: " + e.attr("class")); } boolean highLinkDensity = isHighLinkDensity(e); if (highLinkDensity) { if (logger.isDebugEnabled()) { logger.debug("REMOVING NODE FOR LINK DENSITY: " + e.id() + " class: " + e.attr("class")); } e.remove(); continue; } // now check for word density // grab all the paragraphs in the children and remove ones that are too small to matter Elements subParagraphs = e.getElementsByTag("p"); for (Element p : subParagraphs) { if (p.text().length() < 25) { p.remove(); } } // now that we've removed shorty paragraphs let's make sure to exclude any first paragraphs that don't have paras as // their next siblings to avoid getting img bylines // first let's remove any element that now doesn't have any p tags at all Elements subParagraphs2 = e.getElementsByTag("p"); if (subParagraphs2.size() == 0 && !e.tagName().equals("td")) { if (logger.isDebugEnabled()) { logger.debug("Removing node because it doesn't have any paragraphs"); } e.remove(); continue; } //if this node has a decent enough gravityScore we should keep it as well, might be content int topNodeScore = getScore(node); int currentNodeScore = getScore(e); float thresholdScore = (float) (topNodeScore * .08); if (logger.isDebugEnabled()) { logger.debug("topNodeScore: " + topNodeScore + " currentNodeScore: " + currentNodeScore + " threshold: " + thresholdScore); } if (currentNodeScore < thresholdScore) { if (!e.tagName().equals("td")) { if (logger.isDebugEnabled()) { logger.debug("Removing node due to low threshold score"); } 
e.remove(); } else { if (logger.isDebugEnabled()) { logger.debug("Not removing TD node"); } } continue; } } return node; }
From source file:com.normalexception.app.rx8club.fragment.pm.PrivateMessageViewFragment.java
/** * Format the user post by removing the vb style quotes and the * duplicate youtube links/* www . j a va 2s. c o m*/ * @param innerPost The element that contains the inner post * @return The formatted string */ private String formatUserPost(Elements innerPost) { // Remove the duplicate youtube links (this is caused by a plugin on // the forum that embeds youtube videos automatically) for (Element embedded : innerPost.select("div[id^=ame_doshow_post_]")) embedded.remove(); // Remove the vbulletin quotes String upost = Utils.reformatQuotes(innerPost.html()); return upost; }
From source file:com.normalexception.app.rx8club.fragment.thread.ThreadFragment.java
/** * Format the user post by removing the vb style quotes and the * duplicate youtube links/*from ww w . ja v a 2s. co m*/ * @param innerPost The element that contains the inner post * @return The formatted string */ private String formatUserPost(Elements innerPost) { try { Element ipost = innerPost.select("td[class=alt1]").select("div[id^=post_message]").first(); // Only if there is a post to key off of if (ipost != null) { // Remove the duplicate youtube links (this is caused by a plugin on // the forum that embeds youtube videos automatically) for (Element embedded : ipost.select("div[id^=ame_doshow_post_]")) embedded.remove(); // Remove the vbulletin quotes return Utils.reformatQuotes(ipost.html()); } else { return null; } } catch (Exception e) { Log.e(TAG, "Error Parsing Post", e); return null; } }
From source file:im.ene.lab.attiq.ui.activities.ItemDetailActivity.java
/**
 * Handles an {@link ItemCommentsEvent}: renders the comments into the comments web view, or
 * hides that view and shows a zero count when the event carries no comments.
 * <p>
 * Each comment is rendered from the {@code html/comment.html} asset and appended to the
 * {@code content} element of {@code html/comments.html}. The edit block of a comment is removed
 * unless the comment belongs to the current user.
 *
 * @param event event carrying the comments of the displayed item
 */
@SuppressWarnings("unused")
public void onEventMainThread(ItemCommentsEvent event) {
    if (UIUtil.isEmpty(event.comments)) {
        mCommentCount.setText("0");
        mCommentInfo.setText(getString(R.string.article_comment, 0, getString(R.string.comment_plural)));
        mCommentsView.setVisibility(View.GONE);
        return;
    }

    mCommentsView.setVisibility(View.VISIBLE);
    List<Comment> comments = event.comments;
    mCommentCount.setText(String.valueOf(comments.size()));
    String info = comments.size() == 1 ? getString(R.string.comment_singular)
            : getString(R.string.comment_plural);
    // FIXME should use plural strings
    mCommentInfo.setText(getString(R.string.article_comment, comments.size(), info));

    try {
        final String html = IOUtil.readAssets("html/comments.html");
        // The per-comment template does not change between comments: read it once instead of
        // once per comment.
        final String commentTemplate = IOUtil.readAssets("html/comment.html");

        Document fullBody = Jsoup.parse(html);
        Element content = fullBody.getElementById("content");

        for (Comment comment : comments) {
            String commentHtml = commentTemplate
                    .replace("{user_icon_url}", comment.getUser().getProfileImageUrl())
                    .replace("{user_name}", comment.getUser().getId())
                    .replace("{comment_time}", TimeUtil.commentTime(comment.getCreatedAt()))
                    .replace("{article_uuid}", mItemUuid).replace("{comment_id}", comment.getId());

            Document commentDoc = Jsoup.parse(commentHtml);
            Element eComment = commentDoc.getElementsByClass("comment-box").first();
            eComment.getElementsByClass("message").first().append(comment.getRenderedBody());
            // remove comment edit block if it is not from current user
            if (mMyProfile == null || !mMyProfile.getId().equals(comment.getUser().getId())) {
                String commentId = "comment_{comment_id}_{user_name}"
                        .replace("{comment_id}", comment.getId())
                        .replace("{user_name}", comment.getUser().getId());
                Element commentEditor = commentDoc.getElementById(commentId);
                // getElementById returns null when the template has no such block; in that
                // case there is nothing to remove
                if (commentEditor != null) {
                    commentEditor.remove();
                }
            }

            content.appendChild(eComment);
        }

        String result = fullBody.outerHtml();
        mCommentsView.loadDataWithBaseURL("http://qiita.com/", result, null, null, null);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file:moose.com.ac.ArticleViewActivity.java
private void filterImg(String str) { Document mDocument = Jsoup.parse(str); Elements imgs = mDocument.select("img"); for (int imgIndex = 0; imgIndex < imgs.size(); imgIndex++) { Element img = imgs.get(imgIndex); String src = img.attr("src").trim(); if (TextUtils.isEmpty(src)) continue; Uri parsedUri = Uri.parse(src);// w w w.jav a2 s . co m if ("file".equals(parsedUri.getScheme())) continue; if (parsedUri.getPath() == null) continue; if (!"http".equals(parsedUri.getScheme())) { parsedUri = parsedUri.buildUpon().scheme("http").authority("www.acfun.tv").build(); } // url may have encoded path parsedUri = parsedUri.buildUpon().path(parsedUri.getPath()).build(); src = parsedUri.toString(); Log.i(TAG, "image src:" + src); img.attr("org", src); if (CommonUtil.getMode() == 1 && !CommonUtil.isWifiConnected(mContext)) {// Log.i(TAG, "[?]"); img.after("<div style=\"width: 100%;text-align: center;\"><br><p>[]</p></div>"); } else { Log.i(TAG, "[?]"); StringBuilder builder = new StringBuilder(); builder.append("<div style='text-align: center;'><br>") .append("<img src='file:///android_asset/loading.gif'").append("name = '").append(src) .append("'\n;onclick = window.JsBridge.showImage('").append(src).append("')") .append(" alt=' '/>\n").append("</div>"); img.after(builder.toString()); Log.i(TAG, "image:table:-" + builder.toString()); } /*if (CommonUtil.getMode() == 1 && !CommonUtil.isWifiConnected(mContext)) { img.after("<p >[]</p>"); } else if (!src.contains(Config.AC_EMOTION)) { StringBuilder builder = new StringBuilder(); builder.append("<div style=\"width: 100%;text-align: center;\"><br><img src=\"") .append(src) .append("\" width=: 100%;height:auto\"") .append(" alt=\" \"/>\n") .append("</div>"); Log.i(TAG, "index image:" + builder.toString()); img.after(builder.toString()); } else { img.after("<img src=\"" + src + "\" alt=\" \"/>\n"); }*/ img.remove(); //img.removeAttr("style"); HtmlBody = mDocument.toString(); Log.i(TAG, "??html:" + HtmlBody); } }
From source file:org.asqatasun.rules.doc.utils.rga33.extractor.Rgaa3Extractor.java
private static void createTestcaseFiles() throws IOException { File srcDir = new File(RGAA3_TESTCASE_PATH); for (File file : srcDir.listFiles()) { String fileName = file.getName().replace("Rgaa30Rule", "").replace(".java", ""); String theme = fileName.substring(0, 2); String crit = fileName.substring(2, 4); String test = fileName.substring(4, 6); String testKey = Integer.valueOf(theme).toString() + "-" + Integer.valueOf(crit).toString() + "-" + Integer.valueOf(test).toString(); String wrongKey = theme + "." + crit + "." + test; for (File testcase : file.listFiles()) { if (testcase.isFile() && testcase.getName().contains("html")) { Document doc = Jsoup.parse(FileUtils.readFileToString(testcase)); Element detail = doc.select(".test-detail").first(); if (detail == null) { System.out.println(doc.outerHtml()); } else { detail.tagName("div"); detail.text(""); for (Element el : detail.children()) { el.remove(); }// w w w. j ava 2 s . c om if (!detail.hasAttr("lang")) { detail.attr("lang", "fr"); } detail.append("\n" + RGAA3.get(testKey).ruleRawHtml + "\n"); doc.outputSettings().escapeMode(Entities.EscapeMode.xhtml); doc.outputSettings().outline(false); doc.outputSettings().indentAmount(4); String outputHtml = doc.outerHtml(); if (outputHtml.contains(wrongKey)) { outputHtml = outputHtml.replaceAll(wrongKey, RGAA3.get(testKey).getRuleDot()); } FileUtils.writeStringToFile(testcase, outputHtml); } } } } }