List of usage examples for org.jsoup.nodes.Element.remove()
public void remove()
From source file:org.b3log.symphony.util.Markdowns.java
/** * Gets the safe HTML content of the specified content. * * @param content the specified content// ww w .j a va2s .com * @param baseURI the specified base URI, the relative path value of href will starts with this URL * @return safe HTML content */ public static String clean(final String content, final String baseURI) { final Document.OutputSettings outputSettings = new Document.OutputSettings(); outputSettings.prettyPrint(false); final String tmp = Jsoup.clean(content, baseURI, Whitelist.relaxed().addAttributes(":all", "id", "target", "class") .addTags("span", "hr", "kbd", "samp", "tt", "del", "s", "strike", "u") .addAttributes("iframe", "src", "width", "height", "border", "marginwidth", "marginheight") .addAttributes("audio", "controls", "src") .addAttributes("video", "controls", "src", "width", "height") .addAttributes("source", "src", "media", "type") .addAttributes("object", "width", "height", "data", "type") .addAttributes("param", "name", "value") .addAttributes("input", "type", "disabled", "checked").addAttributes("embed", "src", "type", "width", "height", "wmode", "allowNetworking"), outputSettings); final Document doc = Jsoup.parse(tmp, baseURI, Parser.htmlParser()); final Elements ps = doc.getElementsByTag("p"); for (final Element p : ps) { p.removeAttr("style"); } final Elements iframes = doc.getElementsByTag("iframe"); for (final Element iframe : iframes) { final String src = StringUtils.deleteWhitespace(iframe.attr("src")); if (StringUtils.startsWithIgnoreCase(src, "javascript") || StringUtils.startsWithIgnoreCase(src, "data:")) { iframe.remove(); } } final Elements objs = doc.getElementsByTag("object"); for (final Element obj : objs) { final String data = StringUtils.deleteWhitespace(obj.attr("data")); if (StringUtils.startsWithIgnoreCase(data, "data:") || StringUtils.startsWithIgnoreCase(data, "javascript")) { obj.remove(); continue; } final String type = StringUtils.deleteWhitespace(obj.attr("type")); if (StringUtils.containsIgnoreCase(type, 
"script")) { obj.remove(); } } final Elements embeds = doc.getElementsByTag("embed"); for (final Element embed : embeds) { final String data = StringUtils.deleteWhitespace(embed.attr("src")); if (StringUtils.startsWithIgnoreCase(data, "data:") || StringUtils.startsWithIgnoreCase(data, "javascript")) { embed.remove(); continue; } } final Elements as = doc.getElementsByTag("a"); for (final Element a : as) { a.attr("rel", "nofollow"); final String href = a.attr("href"); if (href.startsWith(Latkes.getServePath())) { continue; } a.attr("target", "_blank"); } final Elements audios = doc.getElementsByTag("audio"); for (final Element audio : audios) { audio.attr("preload", "none"); } final Elements videos = doc.getElementsByTag("video"); for (final Element video : videos) { video.attr("preload", "none"); } String ret = doc.body().html(); ret = ret.replaceAll("(</?br\\s*/?>\\s*)+", "<br>"); // patch for Jsoup issue return ret; }
From source file:org.sakaiproject.nakamura.files.migrator.PageMigrator.java
/**
 * Turns a widget element into a cell of the current row and removes the element from its document.
 *
 * <p>The element's {@code id} attribute is split on {@code '_'}: the second token is taken as the widget
 * type and the third, when present, as the widget id (otherwise a new id is generated). The target column
 * is derived from the element's {@code block_image_left} / {@code block_image_right} classes and from
 * whether {@code leftSideColumn} is positive.
 *
 * @throws JSONException declared for the JSON helpers this method delegates to
 */
void extractWidget(JSONObject originalStructure, String contentId, Set<String> widgetsUsed, String ref,
        JSONObject currentPage, JSONObject currentRow, int leftSideColumn, Element widgetElement)
        throws JSONException {
    final String[] idTokens = widgetElement.attr("id").split("_");
    final String widgetType = idTokens[1];
    final String widgetId = (idTokens.length > 2) ? idTokens[2] : generateWidgetId();

    final int columnIndex;
    if (widgetElement.hasClass("block_image_left")) {
        columnIndex = 0;
    } else {
        // A positive leftSideColumn shifts every non-left widget one column further.
        final int shift = (leftSideColumn > 0) ? 1 : 0;
        columnIndex = widgetElement.hasClass("block_image_right") ? shift + 1 : shift;
    }

    generateNewCell(widgetId, widgetType, currentPage, currentRow, columnIndex,
            getJSONObjectOrNull(originalStructure, widgetId));
    widgetsUsed.add(widgetId);

    if ("discussion".equals(widgetType)) {
        migrateDiscussionWidget(contentId, ref, currentPage, widgetId);
    }

    widgetElement.remove();
}
From source file:org.silverpeas.mobile.server.servlets.PublicationContentServlet.java
private void displayWysiwyg(String html, HttpServletRequest request, HttpServletResponse response, String instanceId) throws IOException { html = "<html><body>" + html + "</body></html>"; Document doc = Jsoup.parse(html); Elements body = doc.getElementsByTag("body"); if (!body.isEmpty()) { html = body.first().html();//from w w w . ja v a 2 s .c o m } Elements images = doc.getElementsByTag("img"); for (Element img : images) { String source = img.attr("src"); String newSource = source; if (source.contains("/silverpeas")) { // need to convert in dataurl newSource = convertSpImageUrlToDataUrl(source); } img.attr("src", newSource); } Elements embeds = doc.getElementsByTag("embed"); for (Element embed : embeds) { String htmlPart = embed.outerHtml(); if (htmlPart.contains("flash")) { String attachmentId = htmlPart .substring(htmlPart.indexOf("attachmentId/") + "attachmentId/".length()); attachmentId = attachmentId.substring(0, attachmentId.indexOf("/")); SimpleDocument attachment = AttachmentServiceProvider.getAttachmentService().searchDocumentById( new SimpleDocumentPK(attachmentId), getUserInSession(request).getUserPreferences().getLanguage()); String type = attachment.getContentType(); String url = getServletContext().getContextPath() + "/services/spmobile/Attachment"; url = url + "?id=" + attachmentId + "&instanceId=" + instanceId + "&lang=" + getUserInSession(request).getUserPreferences().getLanguage() + "&userId=" + getUserInSession(request).getId(); if (type.equals("audio/mpeg") || type.equals("audio/ogg") || type.equals("audio/wav")) { embed.parent().append("<audio controls><source src='" + url + "' type='" + type + "'></audio>"); embed.remove(); } else if (type.equals("video/mp4") || type.equals("video/ogg") || type.equals("video/webm")) { embed.parent() .append("<video controls='controls'><source src='" + url + "' type='" + type + "' />"); embed.remove(); } } } html = doc.outerHtml(); OutputStreamWriter out = new OutputStreamWriter(response.getOutputStream(), 
"UTF-8"); writeContainer(out, html); out.flush(); }
From source file:org.silverpeas.mobile.server.servlets.PublicationContentServlet.java
private void displayFormView(Writer out, PublicationDetail pub, UserDetail user, String ua) throws Exception { PublicationTemplate pubTemplate = PublicationTemplateManager.getInstance() .getPublicationTemplate(pub.getInstanceId() + ":" + pub.getInfoId()); DataRecord xmlData = pubTemplate.getRecordSet().getRecord(pub.getId()); PagesContext xmlContext = new PagesContext("myForm", "0", user.getUserPreferences().getLanguage(), false, pub.getInstanceId(), "useless"); xmlContext.setObjectId(pub.getId()); xmlContext.setDesignMode(false);//from w w w . j ava2 s . co m xmlContext.setBorderPrinted(false); xmlContext.setContentLanguage(user.getUserPreferences().getLanguage()); xmlContext.setCreation(false); StringWriter generatedHtml = new StringWriter(); PrintWriter outTmp = new PrintWriter(generatedHtml); Form xmlForm = pubTemplate.getViewForm(); if (xmlForm instanceof XmlForm) { Method m = XmlForm.class.getDeclaredMethod("display", new Class[] { PrintWriter.class, PagesContext.class, DataRecord.class }); m.setAccessible(true); m.invoke(xmlForm, outTmp, xmlContext, xmlData); outTmp.flush(); } else if (xmlForm instanceof HtmlForm) { String html = ((HtmlForm) xmlForm).toString(xmlContext, xmlData); outTmp.write(html); outTmp.flush(); } String html = generatedHtml.toString(); Document doc = Jsoup.parse(html); Elements images = doc.getElementsByTag("img"); for (Element img : images) { if (img.attr("class").equals("preview-file")) { // remove preview for files img.remove(); } else if (img.attr("src").startsWith("/silverpeas/attached_file/componentId/")) { // convert url to dataurl String data = img.attr("src"); data = convertImageAttachmentUrl(data, data); img.attr("src", data); } } Elements links = doc.getElementsByTag("a"); for (Element link : links) { if (link.attr("href").startsWith("/silverpeas/attached_file/componentId/")) { // link to file String url = link.attr("href"); String attachmentId = url.substring(url.indexOf("attachmentId/") + "attachmentId/".length()); 
attachmentId = attachmentId.substring(0, attachmentId.indexOf("/")); url = getServletContext().getContextPath() + "/services/spmobile/Attachment"; url = url + "?id=" + attachmentId + "&instanceId=" + pub.getInstanceId() + "&lang=" + user.getUserPreferences().getLanguage() + "&userId=" + user.getId(); link.attr("href", url); link.attr("target", "_self"); if (link.attr("id").startsWith("player")) { boolean playable = false; SimpleDocument attachment = AttachmentServiceProvider.getAttachmentService().searchDocumentById( new SimpleDocumentPK(attachmentId), user.getUserPreferences().getLanguage()); String type = attachment.getContentType(); if (type.contains("mp4") || type.contains("ogg") || type.contains("webm")) { playable = true; } if (playable) { String style = link.attr("style"); String width = style.substring(style.indexOf("width") + "width".length() + 1); width = width.substring(0, width.indexOf("px")); String height = style.substring(style.indexOf("height") + "height".length() + 1); height = height.substring(0, height.indexOf("px")); link.parent().append("<video width='" + width + "' height='" + height + "' controls='controls'><source src='" + url + "' type='" + type + "' />"); link.remove(); } else { // display image instead of video player String style = "display:block; width:150px; height:98px; background-repeat: no-repeat; "; style += "background-image: url(data:image/jpeg;base64," + "/9j/4AAQSkZJRgABAQEBLAEsAAD" + "/4QYfRXhpZgAATU0AKgAAAAgAAAAAAA4AAgIBAAQAAAABAAAALAICAAQAAAABAAAF6wAAAAD/2P" + 
"/gABBKRklGAAEBAAABAAEAAP/bAEMACAYGBwYFCAcHBwkJCAoMFA0MCwsMGRITDxQdGh8eHRocHCAkLicgIiwjHBwoNyksMDE0NDQfJzk9ODI8LjM0Mv/AAAsIAEAAYgEBEQD/xAAfAAABBQEBAQEBAQAAAAAAAAAAAQIDBAUGBwgJCgv/xAC1EAACAQMDAgQDBQUEBAAAAX0BAgMABBEFEiExQQYTUWEHInEUMoGRoQgjQrHBFVLR8CQzYnKCCQoWFxgZGiUmJygpKjQ1Njc4OTpDREVGR0hJSlNUVVZXWFlaY2RlZmdoaWpzdHV2d3h5eoOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4eLj5OXm5+jp6vHy8/T19vf4+fr/2gAIAQEAAD8A98orN1zXdO8OaVLqWqXS29tEOWPUnsAO5PoK8X1H9ouR7t00Xw600APEk8pDMP8AdUcfma3PCXx60nWr6Ow1qzbSp5CFWUvviLehOAV/Hj3r1me4htraS4mlSOGNS7yMcKqjnJPpXjPiL9oSxtL17XQNKfUQpI8+VyiN/uqASR+VQaJ+0TBJeLBr+iSWcZODPA5fb9UIB/In6V7TYX1tqVlDeWc6T20yho5EOQw9qs0UUUUV89/HO6udZ8f6D4WExitWWN+vG+Ryu78AP1Ne2eHvDGkeGNLisdLs4oY0UAuFG9z6sepNec/HHwVpVz4RuPEMFtFb6jZsrGSNQvmqWCkN6nnIPtXB6/4z1K7+Aei2ryuXuLp7SaXJy8cfIBP4r/3zXrnws8F6V4e8IabeR20cmoXlvHcTXLAFsuobaD2AzjitLx34K0nxb4fuoru2iF0kbNBchQHjYDI59OORXnf7OWs3NxpesaPM7NBaPHLDk527924fmoP4mvcKKKKKK8e+N3gXUNZis/EmixvLf2C7JI4/vtGCWDL6lSTx71m+Hf2hbKLT0tvEem3S3sQ2vLbKCrkdyCQVP51zfjH4iat8VpofDPhnS7hLSSQNIH5eTB4LY4VR1/AV6Br3wn+0fCO18OWbq+o2H+kxv0EsvJYfQ5IH0FcZ4K+MVz4KsV8N+K9Muz9i/dRugxLGo6KysRkDsc9KseMfjmdf06TRvCmnXgmvFMRmlUb8HghFUnk9M/pXefB3wNP4N8NSyagmzUr9lkmTr5ajO1frySfrXo1FFFFFcl8RPGsfgXwwdTNv9omklEEEZOAXIJyT6YBr5y1T4qPrFyZ7/wAJ+HZ5ScmRrZtx+pDc1e0v43arosBh0zQNCs4z1EFuy5+uG5q//wANE+Kv+fDTP+/b/wDxVZWrfGS+10D+1fDegXhHAaa2YsPod2abpPxeutDfdpfhjw/aueN8Vswb/vrdmve/hh4//wCE90S4nmtVtry0kCTIhJU5BIYZ+h49qf4n+KfhnwprEWl31xJJdMR5iQJv8oHpu9Pp1rtI5FljWRTlWGR9KdRRRXj37Rn/ACIun/8AYRX/ANFyV8yUUUUV23hD4iXngzw3q9jpkeL7UHj23B5EKqGBIHduePSuRE0lxfiaaRpJZJNzuxyWJPJJr7tsv+PGD/rmv8qnooorx79oz/kRdP8A+wkv/ot6+ZKKKKKKkt/+PiL/AHh/OvvGy/48YP8Armv8qnooorx/9osf8ULp/wD2El/9FyV8xmiiiivSvh58O4vHnhbXWhk8rVLSSI2zsflbIbKt9cDntXCXOm3elaw1jfQPBcwy7JI3GCpBr7msv+PG3/65r/Kp6KKKwPGHhLT/ABpoL6TqO9ULB45Iz80bjOGH5n868nP7Ndpk48STY/69R/8AFUf8M12n/QyTf+Ao/wDiqP8Ahmu0/wChkm/8BR/8VR/wzXaf9DJN/wCAo/8AiqP+Ga7T/oZJv/AUf/FV6f4H8Dab4E0d7GwZ5XlfzJp5PvSN0HA6ADtV3UvCHh7WNUg1PUNJtri8gxsldMnjp
n1x71tgADAGBRRX/9n/2wBDAAMCAgMCAgMDAwMEAwMEBQgFBQQEBQoHBwYIDAoMDAsKCwsNDhIQDQ4RDgsLEBYQERMUFRUVDA8XGBYUGBIUFRT/wgALCABiAJYBAREA/8QAHQABAAICAwEBAAAAAAAAAAAAAAcIAgYEBQkDAf/aAAgBAQAAAAG0o4cWdnJf71sXbBIOQB1VCZO6Dl2uolLekbxbQAgPVbTKAyDu9hvzzq9DeSAi6sV3+iorZmJrm6bSn0PyAK6V+59q5RrDB/eWzkAAUe0sDfrrxVt2zgw8t/kBz7fU2u/NoMPLf5AZMbyTYDDy3+QAvJNgMPLf5AcnYdVvJNgMfLrjAdn6GUKuNMAEL6mBukwfL6gAAB//xAAoEAABAwQBBAIBBQAAAAAAAAAFAwQGAAECByAWFzAyEzYQERIxNED/2gAIAQEAAQUC4OXaLJB1tWPNsxmxgBTO1/1tT4i2GIONsR5DMTPgZlTxEyKIlgcPE50XHaWeLIyDU5MQhrieLCXkgNox4S7elp6ZZ6ScqJSXWJKPIayni2Lrw7kdZJRnSo1G+P42QOSGS7ZJRZ1E9MjUkgdZWte0qaYg5Y2U+Zv4NjAszsX11MMIsRZv25FE9J2EcbOVnc4lGy4vdeJ6ymyUeVQcpOkpNMR8YbCmTmaynHH9uPhmWqkyiysMkY1QdrmQFlodBWkTTva17SzUnzrZxCRsMxeszxRSJQ9nE2nik2zipB/1yfrrk/XXJ+uuT9dcn665P11yfqJbKKNClSjYY6MLx6QN5KN5ZeufvzY/3dgbIxD2UUyWz1D9Q5Zeufvzte+N73vf8ah+ocsvXP38WofqHLL1z9+aCfzLSGOu40RrUP1Dlf8AhbG6avMbhkqQksaaSgcfj7qNkNWM12UT5yLVww887Js67Js67Js67Js67Js67Js67Js6jeshsed1mlgr/o//xABAEAACAQICAg0JBwQDAAAAAAABAgMABBESITEFEBMiMDI1QUJRYXGSFCBSc5GTocHRFSQzYnKx8AYjQFOBguH/2gAIAQEABj8C8xpriVIIl1vIcAKyi5eftiiOHxoIl8IZDqWdcnx1VjzbRmup47eIdKQ4VlWeWb80cRw+NCOC+VZTqjmBQ/Hg57y4bLDCuZqVcHkLthBax6l/nXQe8v47Zz0I03TD/nGnntpF2QiXSwjXB/DUWx19LnsJDkQufwT9Knvp9KxjQo1seYUBg1zcOd5EnEjHyFY3WyccUnoxxZx7cRTXCMt9bLxmiGDL2lai2I2QlMkUm9t5X1q3o93BQRLoEtwA3cAT9K2QvmAMwIhX8o1n+dm3eJCMkb5Zco5sRpr+mcxP3iLdpO1gi/U1c3uAM8spTN1KMNHx2tOmr6K13ghmzR4dHnFRSaiyg8DMkS5p4G3dB14Y4j2E1Ilzj5DcYCQjoEamoS206XEZ6cbYimlvJ1U9GIHF27hTMq/eLuTBV9Ec3sFWvkyljsaBgOfc8MD+wPtqWxvWyWUzZxJ/rf6fShLBIs0Z1NGcQaZriVXuMN5bq2/Y/LvpQ++e5lMkzDorjvqwGrgpLzYtlguX0vA/EY9nUaI+zrsHrgGYe1a02b24OuW6OX/2jIG8ovnGDzkauxeqsDqp7nYZkTHSbV9A/wCpor9nXqHn3JSR8KG6W3kcZ1yXJw+GuikH924f8W4YaW7OwcHILG5aztFYiMRaCw6ya5WuvHXK11465WuvHXK11465WuvHXK11465WuvHUEV/cNd2kjBG3TjL2g7Qt3z3Nz0o4cN530l7ah1jJylZBpB8803fwEH61/en2P2McSX2qSUaRF2fq/amd2LuxxLMcSaX17/Lzz3Ue/gMQcCK0nHaX17/Lzz3Ue/g19e/y8891Hv4BE1ZiFxp7S6TfDSrjiuvWNpfXv8uAdWGDAkEHm4C2RRmZpFAA76a2uRlYaY5RrjP85qa1vI8rDiuOK46xUQniaIvIzqG9E8/ANdq8t
lPIcX3PDKx68OuuU5/diuU5/diuU5/diuU5/diuU5/diuU5/diuU5/dilu80l3cJxDLhlTtA2hnRXw0jMP8j//EACkQAQABAgQFBQADAQAAAAAAAAERACExQVGhMGFxgcEQkbHw8SBA0eH/2gAIAQEAAT8h/gaMJJ65aQQ5hV90J7U7aYSl3eVAARVxMPTGLtFf9eVIIG38EUizoQToQBeQtTwpnFsYvI5tqGV8eQythOvVgVdRQs7kF+k9aLL8wM1xT2V5UgkaWK4Q+1MscmpJLprmw9Vqx5C25PIDFe7NEIvmcHd8FFc+XAmOg5i84pi4DU5RuawMxt04LavP4HPc9lM0dXFDet7vTGj7wTDcwjrL3qVZAjkA+2dSJrYXCjuU+3orABIRp09vYjpO1KwSedQeDFroZeIfQTFTExBy5KzLoxk5xCV65BbVBERI3J+TDmVPHGe5hjoN3k0AWlm4HvkjkUh5lCtAEvMBOkKJTci9MlSnchnJEZNVvhWj06LpcoLHagIwBAacJi+NsdirO9nlV9gwubuNBJywDvd2GsOBUoZlk3dqRBKIRMaVmWsRu1OjY1oksYkXe3enxWZZ+b2qaCHYG0fB78KdcCg98VdLRJuZV+9r97X6Wv3tfpa/W1+tqFk8hFYzlpLOPpnXvDOU1IXTHtikAIAHEjE66/z278VvHAcXY/yVckjtTa/TSlpwlA1XgTb58VvXAMIRImJSqom6vBm3z4rev6U2+fFb1wLv78IUh4sJyTfYbcGYyzUincGghUonAQFioFhV9IA+tPNJZgzGr8x2zvQpH6hYIh24C+wR0rGTcMV9x819x819x819x819x819x819x81LC0mCZAzNV9MYAsDDrWP9f//aAAgBAQAAABD8n/2U/wDyv/jH/uJ/4A//AP7/AL+X9/7+/wDf3+f8AP8Af/8A/8QAKBABAAIBAwMDBAMBAAAAAAAAAREhADFBYTBRcYGR8BAgocFAULHx/9oACAEBAAE/EPsQZM4/dUDxOO04Bp4IYQnEC1xJ7sAHSQtzMpHj6D+Gvm7E+wJwytc9+JPOWr4DzmYMlMkzBMHsXJ6nSchaXAKLdEHLi1Qt26BKRJXtJKAAaWmxxgfCDCFvdmMJBzDhMkIgeoNREEgwA3V5EFhXuQnYl2yaqUpIzBX/AFSTCClI3ezO4U9fU1QTXthgloQKzfy2JZQBhOl1+pvDo9yZDvkGsBClU3Jhr8mNxNxZOICEk5yNzIAHmFBMDYGExTJbxX5/dlUU3J0HUlBvHYfRHpiJEdRO2BxOJ71kePjwxUqD7LB6T0VXX3gAcpDhiXWmTB2SIbaBsMJC1ETypj2ybGUw8sXyeWEAO0Q5MeVTUTXNbmzloALE73AY05c5CSOtsFqDVz3NNiIx9HAWJN+gBU9hBsLE5woWZa7KTk74VtQSABABsRXRgNt5xXmVTJRiwqjcya8iaMGB4VzQKH58xLMEhiQit4lFmYJWAISLoAOybl5YWYJMstHYXYGIvWB69d4d3VDSdUkt5ws0MCCZC2Gw72lyeigBpqOBGfMFFaRuggTpnw79ZB878ZP8D8Z8O/WfEP1nyT9ZdPzPGBSPwdgNJozAE1v6CZSm0pKPVEuBlsY2IVuWWKhZ95OAbNN9BdRRc5J3o0JGsFd7bdeGjYSTZUKquq4JRtLf4EionmLogyImjjlCUFVdV/pVwKrgVOD2QSEAx64IcV9IY3oWk1sWdFfx0sWuMwJAdxOgJ8H1VEGTG1LCaJ9R0kh01hy2yMwfSnxKVUMb5SjLMsHRNu/QlEFAlKHSWwLt0O3bt27du2tci4ZBW2iE2jIO3OHkiM4WSU846pue/Qgutdeeh/37/wD/2Q==);"; link.attr("style", style); } } } } // remove all scripts Elements scripts = doc.getElementsByTag("script"); for 
(Element script : scripts) { script.remove(); } html = doc.outerHtml(); writeContainer(out, html); out.flush(); }
From source file:webcralwerproject1.Webcrawler.java
public String contentprocessor() { File folder = new File(DirectoryName + "/" + crawlcount); FileWriter f_write = null;//from w w w.j a va 2 s .c o m Elements p, c = null; String contentprocessfile = "./crawler" + crawlcount + "content.html"; if (!folder.exists()) { } else { try { File[] listOfFiles = folder.listFiles(); f_write = new FileWriter(contentprocessfile, true); //Open repo directory and loop through all files for (File file : listOfFiles) { if (file.isFile()) { File input = new File(file.getAbsolutePath()); Document doc = Jsoup.parse(input, "UTF-8"); String title = doc.select("title").toString(); Elements n = doc.select("nav").remove(); // String d =doc.select("div.id"); doc.select("head").remove(); doc.select("link").remove(); doc.select("style").remove(); doc.select("meta").remove(); doc.select("script").remove(); doc.select("figure").remove(); doc.select("img").remove(); doc.select("footer").remove(); doc.select("input[type = search]").remove(); doc.select("form").remove(); doc.select("button").remove(); doc.select("video").remove(); doc.select("div:empty").remove(); doc.select("div#footer").remove(); doc.select("div#id").remove(); doc.select("div#nav").remove(); doc.select("div#navigation").remove(); doc.select("div.footer").remove(); doc.select("div.header").remove(); doc.select("li > a[href]").remove(); Elements linksOnPage = doc.select("body a[href]"); for (Element link : linksOnPage) { if (link.html() == null) { link.remove();//<a></a> } else if (link.html().length() <= 4) {// does not contains title of the page link.remove(); } else { int child = link.parentNode().childNodeSize(); if (child == 1) {//only element remove link.remove(); } } } f_write.write(doc.text()); } f_write.write("<br>"); } f_write.close(); } catch (Exception e) { System.out.println("Inside Contentprocessor" + e); } return contentprocessfile; } return null; }