Example usage for org.jsoup.nodes Element remove

List of usage examples for org.jsoup.nodes Element remove

Introduction

This page collects usage examples of the org.jsoup.nodes.Element.remove() method.

Prototype

public void remove() 

Source Link

Document

Remove (delete) this node from the DOM tree.

Usage

From source file:org.b3log.symphony.util.Markdowns.java

/**
 * Sanitizes the given HTML and post-processes the result for display.
 *
 * <p>The content is first filtered through Jsoup's relaxed whitelist, extended with a set of
 * inline, media and embedding tags/attributes. The filtered markup is then re-parsed and:
 * <ul>
 *   <li>{@code style} attributes are stripped from every {@code <p>};</li>
 *   <li>{@code <iframe>}, {@code <object>} and {@code <embed>} elements whose URL starts with
 *       {@code javascript} or {@code data:} are removed, as are {@code <object>} elements whose
 *       {@code type} mentions {@code script};</li>
 *   <li>every {@code <a>} gets {@code rel="nofollow"}, and {@code target="_blank"} unless its
 *       {@code href} starts with the serve path;</li>
 *   <li>{@code <audio>} and {@code <video>} get {@code preload="none"}.</li>
 * </ul>
 *
 * @param content the HTML content to sanitize
 * @param baseURI the base URI passed to Jsoup for resolving relative URLs
 * @return the sanitized inner HTML of the document body
 */
public static String clean(final String content, final String baseURI) {
    final Whitelist allowed = Whitelist.relaxed()
            .addAttributes(":all", "id", "target", "class")
            .addTags("span", "hr", "kbd", "samp", "tt", "del", "s", "strike", "u")
            .addAttributes("iframe", "src", "width", "height", "border", "marginwidth", "marginheight")
            .addAttributes("audio", "controls", "src")
            .addAttributes("video", "controls", "src", "width", "height")
            .addAttributes("source", "src", "media", "type")
            .addAttributes("object", "width", "height", "data", "type")
            .addAttributes("param", "name", "value")
            .addAttributes("input", "type", "disabled", "checked")
            .addAttributes("embed", "src", "type", "width", "height", "wmode", "allowNetworking");
    final Document.OutputSettings compactOutput = new Document.OutputSettings().prettyPrint(false);

    final String filtered = Jsoup.clean(content, baseURI, allowed, compactOutput);
    final Document doc = Jsoup.parse(filtered, baseURI, Parser.htmlParser());

    // Inline styles on paragraphs are never kept.
    doc.getElementsByTag("p").removeAttr("style");

    for (final Element frame : doc.getElementsByTag("iframe")) {
        if (startsWithScriptableScheme(frame.attr("src"))) {
            frame.remove();
        }
    }

    for (final Element object : doc.getElementsByTag("object")) {
        // The type attribute is only consulted when the data URL itself looks harmless.
        if (startsWithScriptableScheme(object.attr("data"))
                || StringUtils.containsIgnoreCase(StringUtils.deleteWhitespace(object.attr("type")), "script")) {
            object.remove();
        }
    }

    for (final Element embedded : doc.getElementsByTag("embed")) {
        if (startsWithScriptableScheme(embedded.attr("src"))) {
            embedded.remove();
        }
    }

    for (final Element anchor : doc.getElementsByTag("a")) {
        anchor.attr("rel", "nofollow");

        // Links that do not point back at the serve path open in a new window.
        if (!anchor.attr("href").startsWith(Latkes.getServePath())) {
            anchor.attr("target", "_blank");
        }
    }

    doc.getElementsByTag("audio").attr("preload", "none");
    doc.getElementsByTag("video").attr("preload", "none");

    // patch for Jsoup issue: collapse any run of <br> variants into a single <br>
    return doc.body().html().replaceAll("(</?br\\s*/?>\\s*)+", "<br>");
}

/**
 * Tells whether the URL, once all whitespace is stripped, starts (case-insensitively) with
 * {@code javascript} or {@code data:}.
 *
 * @param rawUrl the attribute value to inspect
 * @return {@code true} if the URL starts with one of the two rejected prefixes
 */
private static boolean startsWithScriptableScheme(final String rawUrl) {
    final String compact = StringUtils.deleteWhitespace(rawUrl);

    return StringUtils.startsWithIgnoreCase(compact, "javascript")
            || StringUtils.startsWithIgnoreCase(compact, "data:");
}

From source file:org.sakaiproject.nakamura.files.migrator.PageMigrator.java

/**
 * Turns a widget element into a cell of the current row and detaches the element from the DOM.
 *
 * <p>The element's {@code id} attribute is split on {@code '_'}: the second token is used as the
 * widget type and the third, when present, as the widget id (otherwise a fresh id is generated).
 * The target column is derived from the element's {@code block_image_left} /
 * {@code block_image_right} classes and from {@code leftSideColumn}.
 *
 * @param originalStructure structure looked up for an existing entry under the widget id
 * @param contentId         forwarded to the discussion-widget migration
 * @param widgetsUsed       receives the id of the extracted widget
 * @param ref               forwarded to the discussion-widget migration
 * @param currentPage       page the new cell is generated for
 * @param currentRow        row the new cell is generated for
 * @param leftSideColumn    when greater than zero, shifts non-left widgets one column to the right
 * @param widgetElement     the widget element; removed from its document on return
 * @throws JSONException declared by the signature; raised by the JSON helpers this method calls
 */
void extractWidget(JSONObject originalStructure, String contentId, Set<String> widgetsUsed, String ref,
        JSONObject currentPage, JSONObject currentRow, int leftSideColumn, Element widgetElement)
        throws JSONException {
    final String[] idTokens = widgetElement.attr("id").split("_");
    final String widgetType = idTokens[1];
    final String widgetId;
    if (idTokens.length > 2) {
        widgetId = idTokens[2];
    } else {
        widgetId = generateWidgetId();
    }

    // A populated left-side column pushes every non-left widget one column to the right.
    final int shift = leftSideColumn > 0 ? 1 : 0;
    final int columnIndex;
    if (widgetElement.hasClass("block_image_left")) {
        columnIndex = 0;
    } else if (widgetElement.hasClass("block_image_right")) {
        columnIndex = shift + 1;
    } else {
        columnIndex = shift;
    }

    final JSONObject existingWidget = getJSONObjectOrNull(originalStructure, widgetId);
    generateNewCell(widgetId, widgetType, currentPage, currentRow, columnIndex, existingWidget);
    widgetsUsed.add(widgetId);

    if ("discussion".equals(widgetType)) {
        migrateDiscussionWidget(contentId, ref, currentPage, widgetId);
    }

    widgetElement.remove();
}

From source file:org.silverpeas.mobile.server.servlets.PublicationContentServlet.java

/**
 * Rewrites a WYSIWYG HTML fragment and writes it to the servlet response.
 *
 * <p>The fragment is wrapped in {@code <html><body>}, parsed, and then:
 * <ul>
 *   <li>every {@code <img>} whose {@code src} contains {@code /silverpeas} has that source
 *       replaced by the result of {@code convertSpImageUrlToDataUrl};</li>
 *   <li>every {@code <embed>} whose markup mentions {@code flash} and references an attachment
 *       with an audio (mpeg/ogg/wav) or video (mp4/ogg/webm) content type is replaced by an
 *       HTML5 {@code <audio>} / {@code <video>} element pointing at the mobile attachment
 *       service. Embeds that cannot be resolved to such an attachment are left untouched.</li>
 * </ul>
 *
 * @param html       the HTML fragment to display
 * @param request    the current request, used to look up the user in session
 * @param response   the response whose output stream receives the result (as UTF-8)
 * @param instanceId component instance id added to the generated attachment URLs
 * @throws IOException if writing to the response fails
 */
private void displayWysiwyg(String html, HttpServletRequest request, HttpServletResponse response,
        String instanceId) throws IOException {
    Document doc = Jsoup.parse("<html><body>" + html + "</body></html>");

    for (Element img : doc.getElementsByTag("img")) {
        String source = img.attr("src");
        String newSource = source;
        if (source.contains("/silverpeas")) {
            // need to convert in dataurl
            newSource = convertSpImageUrlToDataUrl(source);
        }
        img.attr("src", newSource);
    }

    final String idMarker = "attachmentId/";
    for (Element embed : doc.getElementsByTag("embed")) {
        String htmlPart = embed.outerHtml();
        if (!htmlPart.contains("flash")) {
            continue;
        }

        // The attachment id is the path segment following "attachmentId/". Skip the embed
        // instead of failing with an index error when the marker or the closing '/' is missing.
        int markerPos = htmlPart.indexOf(idMarker);
        if (markerPos < 0) {
            continue;
        }
        int idStart = markerPos + idMarker.length();
        int idEnd = htmlPart.indexOf("/", idStart);
        if (idEnd < 0) {
            continue;
        }
        String attachmentId = htmlPart.substring(idStart, idEnd);

        SimpleDocument attachment = AttachmentServiceProvider.getAttachmentService().searchDocumentById(
                new SimpleDocumentPK(attachmentId),
                getUserInSession(request).getUserPreferences().getLanguage());
        if (attachment == null) {
            // Unknown attachment: keep the original embed rather than throwing.
            continue;
        }
        String type = attachment.getContentType();

        // NOTE(review): the query parameters and the content type are concatenated into the
        // markup unescaped - confirm they can never contain quotes or '&'.
        String url = getServletContext().getContextPath() + "/services/spmobile/Attachment";
        url = url + "?id=" + attachmentId + "&instanceId=" + instanceId + "&lang="
                + getUserInSession(request).getUserPreferences().getLanguage() + "&userId="
                + getUserInSession(request).getId();

        // Constant-first equals keeps a null content type from throwing.
        if ("audio/mpeg".equals(type) || "audio/ogg".equals(type) || "audio/wav".equals(type)) {
            embed.parent().append("<audio controls><source src='" + url + "' type='" + type + "'></audio>");
            embed.remove();
        } else if ("video/mp4".equals(type) || "video/ogg".equals(type) || "video/webm".equals(type)) {
            embed.parent()
                    .append("<video controls='controls'><source src='" + url + "' type='" + type + "' />");
            embed.remove();
        }
    }

    OutputStreamWriter out = new OutputStreamWriter(response.getOutputStream(), "UTF-8");
    writeContainer(out, doc.outerHtml());
    out.flush();
}

From source file:org.silverpeas.mobile.server.servlets.PublicationContentServlet.java

private void displayFormView(Writer out, PublicationDetail pub, UserDetail user, String ua) throws Exception {

    PublicationTemplate pubTemplate = PublicationTemplateManager.getInstance()
            .getPublicationTemplate(pub.getInstanceId() + ":" + pub.getInfoId());
    DataRecord xmlData = pubTemplate.getRecordSet().getRecord(pub.getId());

    PagesContext xmlContext = new PagesContext("myForm", "0", user.getUserPreferences().getLanguage(), false,
            pub.getInstanceId(), "useless");
    xmlContext.setObjectId(pub.getId());
    xmlContext.setDesignMode(false);//from   w w  w . j ava2  s . co m
    xmlContext.setBorderPrinted(false);
    xmlContext.setContentLanguage(user.getUserPreferences().getLanguage());
    xmlContext.setCreation(false);

    StringWriter generatedHtml = new StringWriter();
    PrintWriter outTmp = new PrintWriter(generatedHtml);

    Form xmlForm = pubTemplate.getViewForm();
    if (xmlForm instanceof XmlForm) {
        Method m = XmlForm.class.getDeclaredMethod("display",
                new Class[] { PrintWriter.class, PagesContext.class, DataRecord.class });
        m.setAccessible(true);
        m.invoke(xmlForm, outTmp, xmlContext, xmlData);
        outTmp.flush();
    } else if (xmlForm instanceof HtmlForm) {
        String html = ((HtmlForm) xmlForm).toString(xmlContext, xmlData);
        outTmp.write(html);
        outTmp.flush();
    }
    String html = generatedHtml.toString();

    Document doc = Jsoup.parse(html);
    Elements images = doc.getElementsByTag("img");
    for (Element img : images) {
        if (img.attr("class").equals("preview-file")) {
            // remove preview for files
            img.remove();
        } else if (img.attr("src").startsWith("/silverpeas/attached_file/componentId/")) {
            // convert url to dataurl
            String data = img.attr("src");
            data = convertImageAttachmentUrl(data, data);
            img.attr("src", data);
        }
    }
    Elements links = doc.getElementsByTag("a");
    for (Element link : links) {
        if (link.attr("href").startsWith("/silverpeas/attached_file/componentId/")) {
            // link to file
            String url = link.attr("href");
            String attachmentId = url.substring(url.indexOf("attachmentId/") + "attachmentId/".length());
            attachmentId = attachmentId.substring(0, attachmentId.indexOf("/"));
            url = getServletContext().getContextPath() + "/services/spmobile/Attachment";
            url = url + "?id=" + attachmentId + "&instanceId=" + pub.getInstanceId() + "&lang="
                    + user.getUserPreferences().getLanguage() + "&userId=" + user.getId();
            link.attr("href", url);
            link.attr("target", "_self");

            if (link.attr("id").startsWith("player")) {

                boolean playable = false;

                SimpleDocument attachment = AttachmentServiceProvider.getAttachmentService().searchDocumentById(
                        new SimpleDocumentPK(attachmentId), user.getUserPreferences().getLanguage());
                String type = attachment.getContentType();
                if (type.contains("mp4") || type.contains("ogg") || type.contains("webm")) {
                    playable = true;
                }

                if (playable) {
                    String style = link.attr("style");
                    String width = style.substring(style.indexOf("width") + "width".length() + 1);
                    width = width.substring(0, width.indexOf("px"));
                    String height = style.substring(style.indexOf("height") + "height".length() + 1);
                    height = height.substring(0, height.indexOf("px"));
                    link.parent().append("<video width='" + width + "' height='" + height
                            + "' controls='controls'><source src='" + url + "' type='" + type + "' />");
                    link.remove();
                } else {
                    // display image instead of video player
                    String style = "display:block; width:150px; height:98px; background-repeat: no-repeat; ";
                    style += "background-image: url(data:image/jpeg;base64," + "/9j/4AAQSkZJRgABAQEBLAEsAAD"
                            + "/4QYfRXhpZgAATU0AKgAAAAgAAAAAAA4AAgIBAAQAAAABAAAALAICAAQAAAABAAAF6wAAAAD/2P"
                            + "/gABBKRklGAAEBAAABAAEAAP/bAEMACAYGBwYFCAcHBwkJCAoMFA0MCwsMGRITDxQdGh8eHRocHCAkLicgIiwjHBwoNyksMDE0NDQfJzk9ODI8LjM0Mv/AAAsIAEAAYgEBEQD/xAAfAAABBQEBAQEBAQAAAAAAAAAAAQIDBAUGBwgJCgv/xAC1EAACAQMDAgQDBQUEBAAAAX0BAgMABBEFEiExQQYTUWEHInEUMoGRoQgjQrHBFVLR8CQzYnKCCQoWFxgZGiUmJygpKjQ1Njc4OTpDREVGR0hJSlNUVVZXWFlaY2RlZmdoaWpzdHV2d3h5eoOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4eLj5OXm5+jp6vHy8/T19vf4+fr/2gAIAQEAAD8A98orN1zXdO8OaVLqWqXS29tEOWPUnsAO5PoK8X1H9ouR7t00Xw600APEk8pDMP8AdUcfma3PCXx60nWr6Ow1qzbSp5CFWUvviLehOAV/Hj3r1me4htraS4mlSOGNS7yMcKqjnJPpXjPiL9oSxtL17XQNKfUQpI8+VyiN/uqASR+VQaJ+0TBJeLBr+iSWcZODPA5fb9UIB/In6V7TYX1tqVlDeWc6T20yho5EOQw9qs0UUUUV89/HO6udZ8f6D4WExitWWN+vG+Ryu78AP1Ne2eHvDGkeGNLisdLs4oY0UAuFG9z6sepNec/HHwVpVz4RuPEMFtFb6jZsrGSNQvmqWCkN6nnIPtXB6/4z1K7+Aei2ryuXuLp7SaXJy8cfIBP4r/3zXrnws8F6V4e8IabeR20cmoXlvHcTXLAFsuobaD2AzjitLx34K0nxb4fuoru2iF0kbNBchQHjYDI59OORXnf7OWs3NxpesaPM7NBaPHLDk527924fmoP4mvcKKKKKK8e+N3gXUNZis/EmixvLf2C7JI4/vtGCWDL6lSTx71m+Hf2hbKLT0tvEem3S3sQ2vLbKCrkdyCQVP51zfjH4iat8VpofDPhnS7hLSSQNIH5eTB4LY4VR1/AV6Br3wn+0fCO18OWbq+o2H+kxv0EsvJYfQ5IH0FcZ4K+MVz4KsV8N+K9Muz9i/dRugxLGo6KysRkDsc9KseMfjmdf06TRvCmnXgmvFMRmlUb8HghFUnk9M/pXefB3wNP4N8NSyagmzUr9lkmTr5ajO1frySfrXo1FFFFFcl8RPGsfgXwwdTNv9omklEEEZOAXIJyT6YBr5y1T4qPrFyZ7/wAJ+HZ5ScmRrZtx+pDc1e0v43arosBh0zQNCs4z1EFuy5+uG5q//wANE+Kv+fDTP+/b/wDxVZWrfGS+10D+1fDegXhHAaa2YsPod2abpPxeutDfdpfhjw/aueN8Vswb/vrdmve/hh4//wCE90S4nmtVtry0kCTIhJU5BIYZ+h49qf4n+KfhnwprEWl31xJJdMR5iQJv8oHpu9Pp1rtI5FljWRTlWGR9KdRRRXj37Rn/ACIun/8AYRX/ANFyV8yUUUUV23hD4iXngzw3q9jpkeL7UHj23B5EKqGBIHduePSuRE0lxfiaaRpJZJNzuxyWJPJJr7tsv+PGD/rmv8qnooorx79oz/kRdP8A+wkv/ot6+ZKKKKKKkt/+PiL/AHh/OvvGy/48YP8Armv8qnooorx/9osf8ULp/wD2El/9FyV8xmiiiivSvh58O4vHnhbXWhk8rVLSSI2zsflbIbKt9cDntXCXOm3elaw1jfQPBcwy7JI3GCpBr7msv+PG3/65r/Kp6KKKwPGHhLT/ABpoL6TqO9ULB45Iz80bjOGH5n868nP7Ndpk48STY/69R/8AFUf8M12n/QyTf+Ao/wDiqP8Ahmu0/wChkm/8BR/8VR/wzXaf9DJN/wCAo/8AiqP+Ga7T/oZJv/AUf/FV6f4H8Dab4E0d7GwZ5XlfzJp5PvSN0HA6ADt
V3UvCHh7WNUg1PUNJtri8gxsldMnjpn1x71tgADAGBRRX/9n/2wBDAAMCAgMCAgMDAwMEAwMEBQgFBQQEBQoHBwYIDAoMDAsKCwsNDhIQDQ4RDgsLEBYQERMUFRUVDA8XGBYUGBIUFRT/wgALCABiAJYBAREA/8QAHQABAAICAwEBAAAAAAAAAAAAAAcIAgYEBQkDAf/aAAgBAQAAAAG0o4cWdnJf71sXbBIOQB1VCZO6Dl2uolLekbxbQAgPVbTKAyDu9hvzzq9DeSAi6sV3+iorZmJrm6bSn0PyAK6V+59q5RrDB/eWzkAAUe0sDfrrxVt2zgw8t/kBz7fU2u/NoMPLf5AZMbyTYDDy3+QAvJNgMPLf5AcnYdVvJNgMfLrjAdn6GUKuNMAEL6mBukwfL6gAAB//xAAoEAABAwQBBAIBBQAAAAAAAAAFAwQGAAECByAWFzAyEzYQERIxNED/2gAIAQEAAQUC4OXaLJB1tWPNsxmxgBTO1/1tT4i2GIONsR5DMTPgZlTxEyKIlgcPE50XHaWeLIyDU5MQhrieLCXkgNox4S7elp6ZZ6ScqJSXWJKPIayni2Lrw7kdZJRnSo1G+P42QOSGS7ZJRZ1E9MjUkgdZWte0qaYg5Y2U+Zv4NjAszsX11MMIsRZv25FE9J2EcbOVnc4lGy4vdeJ6ymyUeVQcpOkpNMR8YbCmTmaynHH9uPhmWqkyiysMkY1QdrmQFlodBWkTTva17SzUnzrZxCRsMxeszxRSJQ9nE2nik2zipB/1yfrrk/XXJ+uuT9dcn665P11yfqJbKKNClSjYY6MLx6QN5KN5ZeufvzY/3dgbIxD2UUyWz1D9Q5Zeufvzte+N73vf8ah+ocsvXP38WofqHLL1z9+aCfzLSGOu40RrUP1Dlf8AhbG6avMbhkqQksaaSgcfj7qNkNWM12UT5yLVww887Js67Js67Js67Js67Js67Js67Js6jeshsed1mlgr/o//xABAEAACAQICAg0JBwQDAAAAAAABAgMABBESITEFEBMiMDI1QUJRYXGSFCBSc5GTocHRFSQzYnKx8AYjQFOBguH/2gAIAQEABj8C8xpriVIIl1vIcAKyi5eftiiOHxoIl8IZDqWdcnx1VjzbRmup47eIdKQ4VlWeWb80cRw+NCOC+VZTqjmBQ/Hg57y4bLDCuZqVcHkLthBax6l/nXQe8v47Zz0I03TD/nGnntpF2QiXSwjXB/DUWx19LnsJDkQufwT9Knvp9KxjQo1seYUBg1zcOd5EnEjHyFY3WyccUnoxxZx7cRTXCMt9bLxmiGDL2lai2I2QlMkUm9t5X1q3o93BQRLoEtwA3cAT9K2QvmAMwIhX8o1n+dm3eJCMkb5Zco5sRpr+mcxP3iLdpO1gi/U1c3uAM8spTN1KMNHx2tOmr6K13ghmzR4dHnFRSaiyg8DMkS5p4G3dB14Y4j2E1Ilzj5DcYCQjoEamoS206XEZ6cbYimlvJ1U9GIHF27hTMq/eLuTBV9Ec3sFWvkyljsaBgOfc8MD+wPtqWxvWyWUzZxJ/rf6fShLBIs0Z1NGcQaZriVXuMN5bq2/Y/LvpQ++e5lMkzDorjvqwGrgpLzYtlguX0vA/EY9nUaI+zrsHrgGYe1a02b24OuW6OX/2jIG8ovnGDzkauxeqsDqp7nYZkTHSbV9A/wCpor9nXqHn3JSR8KG6W3kcZ1yXJw+GuikH924f8W4YaW7OwcHILG5aztFYiMRaCw6ya5WuvHXK11465WuvHXK11465WuvHXK11465WuvHUEV/cNd2kjBG3TjL2g7Qt3z3Nz0o4cN530l7ah1jJylZBpB8803fwEH61/en2P2McSX2qSUaRF2fq/amd2LuxxLMcSaX17/Lzz3Ue/gMQcCK0nHaX17/Lzz3Ue/g19e/y8891Hv4BE1ZiFxp7S6TfDSrjiuvWNpfXv8uAdWGDAkEHm4C2RRmZpFAA76a2uRlYaY5RrjP85qa1vI8
rDiuOK46xUQniaIvIzqG9E8/ANdq8tlPIcX3PDKx68OuuU5/diuU5/diuU5/diuU5/diuU5/diuU5/diuU5/dilu80l3cJxDLhlTtA2hnRXw0jMP8j//EACkQAQABAgQFBQADAQAAAAAAAAERACExQVGhMGFxgcEQkbHw8SBA0eH/2gAIAQEAAT8h/gaMJJ65aQQ5hV90J7U7aYSl3eVAARVxMPTGLtFf9eVIIG38EUizoQToQBeQtTwpnFsYvI5tqGV8eQythOvVgVdRQs7kF+k9aLL8wM1xT2V5UgkaWK4Q+1MscmpJLprmw9Vqx5C25PIDFe7NEIvmcHd8FFc+XAmOg5i84pi4DU5RuawMxt04LavP4HPc9lM0dXFDet7vTGj7wTDcwjrL3qVZAjkA+2dSJrYXCjuU+3orABIRp09vYjpO1KwSedQeDFroZeIfQTFTExBy5KzLoxk5xCV65BbVBERI3J+TDmVPHGe5hjoN3k0AWlm4HvkjkUh5lCtAEvMBOkKJTci9MlSnchnJEZNVvhWj06LpcoLHagIwBAacJi+NsdirO9nlV9gwubuNBJywDvd2GsOBUoZlk3dqRBKIRMaVmWsRu1OjY1oksYkXe3enxWZZ+b2qaCHYG0fB78KdcCg98VdLRJuZV+9r97X6Wv3tfpa/W1+tqFk8hFYzlpLOPpnXvDOU1IXTHtikAIAHEjE66/z278VvHAcXY/yVckjtTa/TSlpwlA1XgTb58VvXAMIRImJSqom6vBm3z4rev6U2+fFb1wLv78IUh4sJyTfYbcGYyzUincGghUonAQFioFhV9IA+tPNJZgzGr8x2zvQpH6hYIh24C+wR0rGTcMV9x819x819x819x819x819x819x81LC0mCZAzNV9MYAsDDrWP9f//aAAgBAQAAABD8n/2U/wDyv/jH/uJ/4A//AP7/AL+X9/7+/wDf3+f8AP8Af/8A/8QAKBABAAIBAwMDBAMBAAAAAAAAAREhADFBYTBRcYGR8BAgocFAULHx/9oACAEBAAE/EPsQZM4/dUDxOO04Bp4IYQnEC1xJ7sAHSQtzMpHj6D+Gvm7E+wJwytc9+JPOWr4DzmYMlMkzBMHsXJ6nSchaXAKLdEHLi1Qt26BKRJXtJKAAaWmxxgfCDCFvdmMJBzDhMkIgeoNREEgwA3V5EFhXuQnYl2yaqUpIzBX/AFSTCClI3ezO4U9fU1QTXthgloQKzfy2JZQBhOl1+pvDo9yZDvkGsBClU3Jhr8mNxNxZOICEk5yNzIAHmFBMDYGExTJbxX5/dlUU3J0HUlBvHYfRHpiJEdRO2BxOJ71kePjwxUqD7LB6T0VXX3gAcpDhiXWmTB2SIbaBsMJC1ETypj2ybGUw8sXyeWEAO0Q5MeVTUTXNbmzloALE73AY05c5CSOtsFqDVz3NNiIx9HAWJN+gBU9hBsLE5woWZa7KTk74VtQSABABsRXRgNt5xXmVTJRiwqjcya8iaMGB4VzQKH58xLMEhiQit4lFmYJWAISLoAOybl5YWYJMstHYXYGIvWB69d4d3VDSdUkt5ws0MCCZC2Gw72lyeigBpqOBGfMFFaRuggTpnw79ZB878ZP8D8Z8O/WfEP1nyT9ZdPzPGBSPwdgNJozAE1v6CZSm0pKPVEuBlsY2IVuWWKhZ95OAbNN9BdRRc5J3o0JGsFd7bdeGjYSTZUKquq4JRtLf4EionmLogyImjjlCUFVdV/pVwKrgVOD2QSEAx64IcV9IY3oWk1sWdFfx0sWuMwJAdxOgJ8H1VEGTG1LCaJ9R0kh01hy2yMwfSnxKVUMb5SjLMsHRNu/QlEFAlKHSWwLt0O3bt27du2tci4ZBW2iE2jIO3OHkiM4WSU846pue/Qgutdeeh/37/wD/2Q==);";
                    link.attr("style", style);
                }
            }
        }
    }

    // remove all scripts
    Elements scripts = doc.getElementsByTag("script");
    for (Element script : scripts) {
        script.remove();
    }
    html = doc.outerHtml();
    writeContainer(out, html);
    out.flush();
}

From source file:webcralwerproject1.Webcrawler.java

/**
 * Extracts the text of every crawled page of the current crawl and appends it to one file.
 *
 * <p>Each regular file in {@code DirectoryName + "/" + crawlcount} is parsed as UTF-8 HTML.
 * Navigation, header/footer, media, form and script elements are removed, links that are very
 * short or are the only child of their parent are dropped, and the remaining text is appended
 * to {@code ./crawler<crawlcount>content.html}, followed by {@code <br>}.
 *
 * <p>Errors are reported on standard output and do not propagate; the output path is still
 * returned in that case.
 *
 * @return the path of the output file, or {@code null} if the crawl folder does not exist
 */
public String contentprocessor() {
    File folder = new File(DirectoryName + "/" + crawlcount);
    String contentprocessfile = "./crawler" + crawlcount + "content.html";
    if (!folder.exists()) {
        return null;
    }

    // Selectors of everything that is not considered page content, removed in this order.
    final String[] boilerplateSelectors = { "nav", "head", "link", "style", "meta", "script", "figure", "img",
            "footer", "input[type = search]", "form", "button", "video", "div:empty", "div#footer", "div#id",
            "div#nav", "div#navigation", "div.footer", "div.header", "li > a[href]" };

    FileWriter f_write = null;
    try {
        File[] listOfFiles = folder.listFiles();
        f_write = new FileWriter(contentprocessfile, true);
        if (listOfFiles == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            listOfFiles = new File[0];
        }

        //Open repo directory and loop through all files
        for (File file : listOfFiles) {
            if (file.isFile()) {
                Document doc = Jsoup.parse(file.getAbsoluteFile(), "UTF-8");
                for (String selector : boilerplateSelectors) {
                    doc.select(selector).remove();
                }

                for (Element link : doc.select("body a[href]")) {
                    // Drop links whose inner HTML is at most 4 characters long (including empty
                    // <a></a>), and links that are the only child node of their parent.
                    if (link.html().length() <= 4 || link.parentNode().childNodeSize() == 1) {
                        link.remove();
                    }
                }
                f_write.write(doc.text());
            }
            // NOTE(review): the separator is written for every directory entry, not only for
            // regular files - kept as in the original; confirm this is intended.
            f_write.write("<br>");
        }
    } catch (Exception e) {
        System.out.println("Inside Contentprocessor" + e);
    } finally {
        // Always release the writer, even when parsing or writing failed part-way.
        if (f_write != null) {
            try {
                f_write.close();
            } catch (Exception e) {
                System.out.println("Inside Contentprocessor" + e);
            }
        }
    }

    return contentprocessfile;
}