Example usage for org.jsoup.nodes Document body

List of usage examples for org.jsoup.nodes Document body

Introduction

On this page you can find example usages of the org.jsoup.nodes.Document.body() method.

Prototype

public Element body() 

Source Link

Document

Accessor to the document's body element.

Usage

From source file:org.jsweet.input.typescriptdef.visitor.DocFiller.java

/**
 * Fills in documentation for a type declaration that has none, using the content
 * returned by {@code getTypeContent} for the "mdn" source.
 *
 * Only types whose current module is the configured LANG or DOM package are
 * processed. The fetched content is parsed as HTML and its body is traversed by
 * three grabbers in turn (table format, definition-list format, main
 * description). Any failure is reported through the context rather than
 * propagated.
 *
 * @param typeDeclaration the declaration being visited
 */
@Override
public void visitTypeDeclaration(TypeDeclaration typeDeclaration) {
    if (typeDeclaration.getDocumentation() != null) {
        // Already documented: nothing to fill in.
        return;
    }

    String moduleName = getCurrentModuleName();
    boolean supportedModule = JSweetDefTranslatorConfig.LANG_PACKAGE.equals(moduleName)
            || JSweetDefTranslatorConfig.DOM_PACKAGE.equals(moduleName);
    if (!supportedModule) {
        return;
    }
    this.currentModule = moduleName;

    String content = getTypeContent(context.cacheDir, "mdn", moduleName, typeDeclaration.getName());
    if (content == null) {
        return;
    }

    try {
        // The second argument of Jsoup.parse(String, String) is a base URI, not a
        // charset. The content is already a decoded String, so the single-argument
        // overload is the right one here.
        Document doc = Jsoup.parse(content);
        NodeTraversor traversor;
        traversor = new NodeTraversor(new MdnTableFormatGrabber(this, typeDeclaration));
        traversor.traverse(doc.body());
        traversor = new NodeTraversor(new MdnDefinitionListFormatGrabber(this, typeDeclaration));
        traversor.traverse(doc.body());
        traversor = new NodeTraversor(new MdnMainDescriptionGrabber(this, typeDeclaration));
        traversor.traverse(doc.body());
    } catch (Throwable t) {
        // Best effort: a documentation failure for one type is reported, not rethrown.
        context.reportError("cannot fill documentation for " + context.getTypeName(typeDeclaration),
                typeDeclaration.getToken(), t);
    }
}

From source file:org.mar9000.space2latex.WikiPage.java

/**
 * Loads a wiki page, together with its image table, from a previously saved file.
 *
 * The file content is parsed as an HTML body fragment. It must contain a
 * {@code <page>} element (with {@code title} and {@code id} attributes) holding a
 * {@code <content>} element, and a {@code <wikiimages>} element whose
 * {@code <wikiimage>} children each wrap an {@code ac:image} element.
 *
 * @param file the file to read
 * @return the loaded page, with {@code pageContent} and {@code images} populated
 * @throws IOException if the file cannot be read or one of the required elements is missing
 */
public static WikiPage loadForFormat(File file) throws IOException {
    String fileContent = IOUtils.readFileAsString(file);
    Document doc = Jsoup.parseBodyFragment(fileContent);
    // Maintain input string.
    doc.outputSettings().prettyPrint(false);
    Element body = doc.body();

    Element pageElement = requireFirst(body, "page", file);
    String title = pageElement.attr("title");
    String id = pageElement.attr("id");
    Element pageContent = requireFirst(pageElement, "content", file);
    WikiPage page = new WikiPage(null, title, id, pageContent.html());
    page.pageContent = pageContent;

    // Images, keyed by the outer HTML of their ac:image element.
    Element imageList = requireFirst(body, "wikiimages", file);
    for (Element imageElement : imageList.select("wikiimage")) {
        WikiImage image = new WikiImage();
        String acKey = requireFirst(imageElement, "ac|image", file).outerHtml();
        image.filename = imageElement.attr("pageid") + "/" + imageElement.attr("filename");
        page.images.put(acKey, image);
    }
    return page;
}

/**
 * Returns the first element under {@code scope} matching {@code cssQuery}, or fails
 * with a descriptive IOException instead of a later NullPointerException.
 */
private static Element requireFirst(Element scope, String cssQuery, File file) throws IOException {
    Element match = scope.select(cssQuery).first();
    if (match == null) {
        throw new IOException("Malformed wiki page file " + file + ": no element matching '" + cssQuery + "'");
    }
    return match;
}

From source file:org.niord.core.publication.PublicationUtils.java

/**
 * Updates the message publications from the publication, parameters and link
 *
 * @param message the message/*from   ww  w . ja v a2 s  . c  o m*/
 * @param publication the publication to extract
 * @param parameters the optional parameters
 * @param link the optional link
 * @param lang either a specific language or null for all languages
 * @return the message publication or null if not found
 */
public static MessageVo updateMessagePublications(MessageVo message, SystemPublicationVo publication,
        String parameters, String link, String lang) {
    // Sanity check
    if (message == null || publication == null) {
        return null;
    }

    boolean internal = publication.getMessagePublication() == MessagePublication.INTERNAL;

    message.getDescs().stream().filter(msgDesc -> lang == null || lang.equals(msgDesc.getLang()))
            .forEach(msgDesc -> {

                String updatedPubHtml = computeMessagePublication(publication, parameters, link,
                        msgDesc.getLang());

                String pubHtml = internal ? msgDesc.getInternalPublication() : msgDesc.getPublication();
                pubHtml = StringUtils.defaultIfBlank(pubHtml, "");

                Document doc = Jsoup.parseBodyFragment(pubHtml);
                String pubAttr = "[publication=" + publication.getPublicationId() + "]";
                Element e = doc.select("a" + pubAttr + ",span" + pubAttr).first();
                if (e != null) {
                    // TODO: Is there a better way to replace an element?
                    e.replaceWith(Jsoup.parse(updatedPubHtml).body().child(0));
                    pubHtml = doc.body().html();
                } else {
                    pubHtml += " " + updatedPubHtml;
                }
                // Lastly, clean up html for artifacts often added by TinyMCE
                if (StringUtils.isNotBlank(pubHtml)) {
                    pubHtml = pubHtml.replace("<p>", "").replace("</p>", "").trim();
                    if (internal) {
                        msgDesc.setInternalPublication(pubHtml);
                    } else {
                        msgDesc.setPublication(pubHtml);
                    }
                }
            });

    return message;
}

From source file:org.niord.core.util.TextUtils.java

/**
 * Converts the text from html to plain text
 * @param html the html/* ww  w.j ava2s. c om*/
 * @return the plain text version
 */
public static String html2txt(String html) {
    if (StringUtils.isNotBlank(html)) {
        try {
            Document doc = Jsoup.parse(html);
            return new HtmlToPlainText().getPlainText(doc.body());
        } catch (Exception ignored) {
        }
    }
    // If blank, or if any error occurs, return the original html
    return html;
}

From source file:org.sakaiproject.lessonbuildertool.cc.PrintHandler.java

/**
 * Rewrites package-relative references in an HTML fragment so that they point
 * below {@code baseUrl}, since the fragment is going to be inserted inline in a
 * page that is not in resources.
 *
 * Two modes are supported:
 * <ul>
 * <li>If the string starts with a {@code <!--fixups:o1,o2,...-->} header (inserted
 * by the lessons exporter), the header is removed and at each listed offset the
 * three characters found there are replaced by {@code baseUrl}.</li>
 * <li>Otherwise the fragment is parsed with jsoup and every {@code src} and
 * {@code href} value that does not start with "http" and is contained in one of
 * the {@code fileNames} values is replaced by {@code baseUrl} plus that value.</li>
 * </ul>
 *
 * @param htmlString the HTML fragment; may be null
 * @return the rewritten fragment; null for null input; the input unchanged when a
 *         fixups header is present but not terminated
 */
public String fixupInlineReferences(String htmlString) {
    if (htmlString == null) {
        return null;
    }

    // These fixups are inserted as identified by the lessons exporter
    final String fixupHeader = "<!--fixups:";
    if (htmlString.startsWith(fixupHeader)) {
        int fixend = htmlString.indexOf("-->");
        if (fixend < 0) {
            // Without a terminator the offsets cannot be read; previously this
            // surfaced as a StringIndexOutOfBoundsException.
            log.warn("unterminated fixups header, leaving html unchanged");
            return htmlString;
        }
        String fixString = htmlString.substring(fixupHeader.length(), fixend);
        htmlString = htmlString.substring(fixend + 3);
        String[] fixups = fixString.split(",");
        // iterate backwards since once we fix something, offsets
        // further in the string are bad
        for (int i = fixups.length - 1; i >= 0; i--) {
            String fixup = fixups[i];
            // these are offsets of a URL. The URL is for a file in attachments, so we need
            // to map it to a full URL. The file should be attachments/item-xx.html in the
            // package. relFixup will have added ../ to it to get to the base.
            try {
                int offset = Integer.parseInt(fixup);
                htmlString = htmlString.substring(0, offset) + baseUrl + htmlString.substring(offset + 3);
            } catch (Exception e) {
                // A bad offset only skips that one fixup.
                log.info("exception " + e);
            }
        }
        return htmlString;
    }

    /*
     Otherwise use jsoup to look for references that weren't identified by fixups.
     Using full class names here because of conflict in names.
     I think the ideal here would be that this only updates resources that are in the manifest, but really any
     relative resources are going to be incorrect pulled out of a package and need an update.
     */
    org.jsoup.nodes.Document doc = Jsoup.parseBodyFragment(htmlString);
    org.jsoup.select.Elements hrefs = doc.select("[href]");
    org.jsoup.select.Elements srcs = doc.select("[src]");

    log.debug("BaseURL is: {}", baseUrl);

    // Have to look for both href and src tags
    for (org.jsoup.nodes.Element element : srcs) {
        String src = element.attr("src");
        String packagedFile = isPackageRelative(src) ? findPackagedFile(src) : null;
        if (packagedFile != null) {
            String target = baseUrl + packagedFile;
            // Log the value that is actually written, not baseUrl + src.
            log.debug(String.format("Updating tag %s: <%s> to <%s>", element.tagName(), src, target));
            element.attr("src", target);
        }
    }

    for (org.jsoup.nodes.Element element : hrefs) {
        String href = element.attr("href");
        String packagedFile = isPackageRelative(href) ? findPackagedFile(href) : null;
        if (packagedFile != null) {
            String target = baseUrl + packagedFile;
            log.debug(String.format("Updating a: <%s> to <%s> (%s)", href, target, element.text()));
            element.attr("href", target);
        }
    }
    return doc.body().html();
}

/**
 * Tells whether a reference is a candidate for rewriting: non-empty and not
 * starting with "http". Empty values are excluded because every file name
 * "contains" the empty string and would match the first manifest entry.
 */
private static boolean isPackageRelative(String reference) {
    return reference != null && !reference.isEmpty() && !reference.startsWith("http");
}

/**
 * Returns the first {@code fileNames} value containing the given reference, or
 * null when there is none. Entries with a null key or value are ignored.
 */
private String findPackagedFile(String reference) {
    for (Map.Entry<String, String> entry : fileNames.entrySet()) {
        if (entry.getKey() != null && entry.getValue() != null && entry.getValue().contains(reference)) {
            return entry.getValue();
        }
    }
    return null;
}

From source file:org.sakaiproject.util.impl.FormattedTextImpl.java

/**
 * Strips HTML from the given text, optionally unescaping escape sequences as well.
 *
 * When {@code stripEscapeSequences} is false this delegates to the two-argument
 * overload. Otherwise the text is parsed with jsoup and the body's text is
 * returned, with non-breaking spaces turned into regular spaces and extra
 * whitespace eliminated.
 *
 * @param text the text to strip
 * @param smartSpacing whether smart spacing is added before stripping
 * @param stripEscapeSequences whether escape sequences are unescaped too
 * @return the stripped text
 */
@Override
public String stripHtmlFromText(String text, boolean smartSpacing, boolean stripEscapeSequences) {
    // KNL-1267   --bbailla2
    if (stripEscapeSequences) {
        String source = smartSpacing ? addSmartSpacing(text) : text;

        // body().text() removes any html tags and unescapes any escape characters
        String plainText = org.jsoup.Jsoup.parse(source).body().text();

        // &nbsp; becomes char code 160, which java doesn't treat as whitespace, so map it to ' '
        // Could there be others like this?
        final char nonBreakingSpace = (char) 160;
        String normalized = plainText.replace(nonBreakingSpace, ' ');
        return eliminateExtraWhiteSpace(normalized);
    }
    return stripHtmlFromText(text, smartSpacing);
}

From source file:org.symphonyoss.client.util.MlMessageParser.java

/**
 * Parses a message and stores the results in this parser's fields.
 *
 * A clone of the parsed document is kept in {@code originalDoc} before any
 * {@code errors} elements are removed. The root element is the first
 * {@code messageML} element or, failing that, the first {@code div}. Its child
 * nodes are passed to {@code stripTags} to fill {@code textDoc}, which is then
 * split on whitespace into {@code textChunks}.
 *
 * @param message the raw message markup
 * @throws SymException if neither a messageML nor a div element is found
 */
public void parseMessage(String message) throws SymException {
    Document parsed = Jsoup.parse(message);
    originalDoc = parsed.clone();

    Element errors = parsed.body().getElementsByTag("errors").first();
    if (errors != null && errors.outerHtml() != null) {
        logger.debug("Errors found in message: {}", errors.outerHtml());
    }
    // The errors elements are not part of the message content
    parsed.select("errors").remove();

    elementMessageML = parsed.select("messageML").first();
    if (elementMessageML == null) {
        // Fall back to the first div when there is no messageML wrapper
        elementMessageML = parsed.select("div").first();
    }

    if (elementMessageML == null) {
        logger.error("Could not parse document for message {}", message);
        throw new SymException("Malformed message");
    }
    if (elementMessageML.outerHtml() != null) {
        logger.debug("Doc parsed: {}", elementMessageML.outerHtml());
    }

    textDoc = new StringBuilder();
    stripTags(textDoc, elementMessageML.childNodes());
    textChunks = textDoc.toString().split("\\s+");
}

From source file:org.tinymediamanager.scraper.ofdb.OfdbMetadataProvider.java

/**
 * Scrapes the metadata of a movie from ofdb.de.
 *
 * The detail page URL is taken from the provider id in the options, from a
 * previous search result, or from a search by IMDB id. From the detail page the
 * IMDB id, title, year, original title, genres and rating are read; the plot and
 * the cast are read from two further pages. Failures on those two further pages
 * are logged and do not abort the scrape.
 *
 * @param options the scrape options; only {@code MediaType.MOVIE} is supported
 * @return the scraped metadata
 * @throws UnsupportedMediaTypeException if the options are not for a movie
 * @throws Exception if no detail URL can be determined, or the detail page cannot
 *         be fetched or parsed
 */
@Override
public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception {
    LOGGER.debug("getMetadata() " + options.toString());

    if (options.getType() != MediaType.MOVIE) {
        throw new UnsupportedMediaTypeException(options.getType());
    }

    // we have 3 entry points here
    // a) getMetadata has been called with an ofdbId
    // b) getMetadata has been called with an imdbId
    // c) getMetadata has been called from a previous search

    String detailUrl = "";

    // case a) and c)
    if (StringUtils.isNotBlank(options.getId(getProviderInfo().getId())) || options.getResult() != null) {

        if (StringUtils.isNotBlank(options.getId(getProviderInfo().getId()))) {
            detailUrl = "http://www.ofdb.de/view.php?page=film&fid=" + options.getId(getProviderInfo().getId());
        } else {
            detailUrl = options.getResult().getUrl();
        }
    }

    // case b)
    if (options.getResult() == null && StringUtils.isNotBlank(options.getId(MediaMetadata.IMDB))) {
        MediaSearchOptions searchOptions = new MediaSearchOptions(MediaType.MOVIE);
        searchOptions.setImdbId(options.getId(MediaMetadata.IMDB));
        try {
            List<MediaSearchResult> results = search(searchOptions);
            if (results != null && !results.isEmpty()) {
                options.setResult(results.get(0));
                detailUrl = options.getResult().getUrl();
            }
        } catch (Exception e) {
            LOGGER.warn("failed IMDB search: " + e.getMessage());
        }
    }

    // we can only work further if we got a search result on ofdb.de
    if (StringUtils.isBlank(detailUrl)) {
        throw new Exception("We did not get any useful movie url");
    }

    MediaMetadata md = new MediaMetadata(providerInfo.getId());
    // generic Elements used all over
    Elements el = null;
    String ofdbId = StrgUtils.substr(detailUrl, "film\\/(\\d+),");
    if (StringUtils.isBlank(ofdbId)) {
        ofdbId = StrgUtils.substr(detailUrl, "fid=(\\d+)");
    }

    try {
        LOGGER.trace("get details page");
        Document doc = loadPage(detailUrl);

        if (doc.getAllElements().size() < 10) {
            throw new Exception("meh - we did not receive a valid web page");
        }

        // parse details

        // IMDB ID "http://www.imdb.com/Title?1194173"
        el = doc.getElementsByAttributeValueContaining("href", "imdb.com");
        if (!el.isEmpty()) {
            String imdbNumber = StrgUtils.substr(el.first().attr("href"), "\\?(\\d+)");
            // Only store the id when digits were extracted; otherwise a bare "tt" would be stored.
            if (StringUtils.isNotBlank(imdbNumber)) {
                md.setId(MediaMetadata.IMDB, "tt" + imdbNumber);
            }
        }

        // title / year
        // <meta property="og:title" content="Bourne Vermchtnis, Das (2012)" />
        el = doc.getElementsByAttributeValue("property", "og:title");
        if (!el.isEmpty()) {
            String[] ty = parseTitle(el.first().attr("content"));
            md.setTitle(StrgUtils.removeCommonSortableName(ty[0]));
            try {
                md.setYear(Integer.parseInt(ty[1]));
            } catch (Exception ignored) {
                // no usable year in the title; the fallback below is tried
            }
        }
        // another year position
        if (md.getYear() == 0) {
            // <a href="view.php?page=blaettern&Kat=Jahr&Text=2012">2012</a>
            el = doc.getElementsByAttributeValueContaining("href", "Kat=Jahr");
            try {
                md.setYear(Integer.parseInt(el.first().text()));
            } catch (Exception ignored) {
                // no year link either; the year stays unset
            }
        }

        // original title (has to be searched with a regexp)
        // <tr valign="top">
        // <td nowrap=""><font class="Normal" face="Arial,Helvetica,sans-serif"
        // size="2">Originaltitel:</font></td>
        // <td>&nbsp;&nbsp;</td>
        // <td width="99%"><font class="Daten" face="Arial,Helvetica,sans-serif"
        // size="2"><b>Brave</b></font></td>
        // </tr>
        String originalTitle = StrgUtils.substr(doc.body().html(), "(?s)Originaltitel.*?<b>(.*?)</b>");
        if (!originalTitle.isEmpty()) {
            md.setOriginalTitle(StrgUtils.removeCommonSortableName(originalTitle));
        }

        // Genre: <a href="view.php?page=genre&Genre=Action">Action</a>
        el = doc.getElementsByAttributeValueContaining("href", "page=genre");
        for (Element g : el) {
            md.addGenre(getTmmGenre(g.text()));
        }

        // rating
        // <div itemtype="http://schema.org/AggregateRating" itemscope
        // itemprop="aggregateRating">Note: <span
        // itemprop="ratingValue">6.73</span><meta
        // itemprop="worstRating" content="1" />
        el = doc.getElementsByAttributeValue("itemprop", "ratingValue");
        if (!el.isEmpty()) {
            String r = el.text();
            if (!r.isEmpty()) {
                try {
                    md.setRating(Float.parseFloat(r));
                } catch (Exception e) {
                    LOGGER.debug("could not parse rating");
                }
            }
        }

        // get PlotLink; open url and parse
        // <a href="plot/22523,31360,Die-Bourne-Identitt"><b>[mehr]</b></a>
        LOGGER.trace("parse plot");
        el = doc.getElementsByAttributeValueMatching("href", "plot\\/\\d+,");
        if (!el.isEmpty()) {
            String plotUrl = BASE_URL + "/" + el.first().attr("href");
            try {
                Document plot = loadPage(plotUrl);
                // first Blocksatz is plot
                Elements block = plot.getElementsByClass("Blocksatz");
                if (!block.isEmpty()) {
                    // text() removes all html stuff
                    md.setPlot(stripPlotHeader(block.first().text()));
                } else {
                    LOGGER.debug("no plot block found");
                }
            } catch (Exception e) {
                LOGGER.error("failed to get plot page: " + e.getMessage());
            }
        }

        // http://www.ofdb.de/view.php?page=film_detail&fid=226745
        LOGGER.debug("parse actor detail");
        String movieDetail = BASE_URL + "/view.php?page=film_detail&fid=" + ofdbId;
        Document castDoc = null;
        try {
            castDoc = loadPage(movieDetail);
        } catch (Exception e) {
            LOGGER.error("failed to get detail page: " + e.getMessage());
        }

        if (castDoc != null) {
            parseCast(castDoc.getElementsContainingOwnText("Regie"), MediaCastMember.CastType.DIRECTOR, md);
            parseCast(castDoc.getElementsContainingOwnText("Darsteller"), MediaCastMember.CastType.ACTOR, md);
            parseCast(castDoc.getElementsContainingOwnText("Stimme/Sprecher"), MediaCastMember.CastType.ACTOR,
                    md);
            parseCast(castDoc.getElementsContainingOwnText("Synchronstimme (deutsch)"),
                    MediaCastMember.CastType.ACTOR, md);
            parseCast(castDoc.getElementsContainingOwnText("Drehbuchautor(in)"),
                    MediaCastMember.CastType.WRITER, md);
            parseCast(castDoc.getElementsContainingOwnText("Produzent(in)"), MediaCastMember.CastType.PRODUCER,
                    md);
        }
    } catch (Exception e) {
        LOGGER.error("Error parsing " + detailUrl);
        throw e;
    }

    return md;
}

/**
 * Fetches a page and parses it as UTF-8 HTML, closing the stream even when
 * parsing fails.
 */
private static Document loadPage(String pageUrl) throws Exception {
    InputStream in = new Url(pageUrl).getInputStream();
    try {
        return Jsoup.parse(in, "UTF-8", "");
    } finally {
        if (in != null) {
            in.close();
        }
    }
}

/**
 * Removes everything up to and including the "Mal gelesen" header (plus the one
 * separator character after it) from the plot text. Text without that header is
 * returned unchanged instead of losing its first characters.
 */
private static String stripPlotHeader(String plotText) {
    final String marker = "Mal gelesen";
    int markerPos = plotText.indexOf(marker);
    if (markerPos < 0) {
        return plotText;
    }
    int start = Math.min(markerPos + marker.length() + 1, plotText.length());
    return plotText.substring(start);
}

From source file:poe.trade.assist.SearchView.java

/**
 * Rebuilds the head of the given HTML page and appends the assist script to its body.
 *
 * All existing child elements of the head are removed and replaced by a fixed
 * set: charset and viewport meta tags, a title, the packed script and the dark
 * stylesheet. Asset paths are prefixed with the value of {@code htmlDirectory()}.
 *
 * @param html the page markup to rewrite
 * @return the rewritten document serialized as a string
 */
private String addHeadElements(String html) {
    final String assetDir = htmlDirectory();
    final Document page = Jsoup.parse(html);
    final Element headElement = page.head();

    // Replace everything in the <head>
    for (Element stale : headElement.children()) {
        stale.remove();
    }

    Element charsetMeta = headElement.appendElement("meta");
    charsetMeta.attr("charset", "utf-8");

    Element viewportMeta = headElement.appendElement("meta");
    viewportMeta.attr("name", "viewport");
    viewportMeta.attr("content", "width=device-width");

    Element title = headElement.appendElement("title");
    title.text("poe.trade.assist");

    Element packedScript = headElement.appendElement("script");
    packedScript.attr("type", "text/javascript");
    packedScript.attr("src", assetDir + "packed.js");

    Element stylesheet = headElement.appendElement("link");
    stylesheet.attr("rel", "stylesheet");
    stylesheet.attr("href", assetDir + "packed_dark.css");

    // The assist script goes at the end of the body
    Element assistScript = page.body().appendElement("script");
    assistScript.attr("type", "text/javascript");
    assistScript.attr("src", assetDir + "assist.js");

    return page.toString();
}

From source file:sachin.bws.site.Template.java

/**
 * Determines the template of a page from the class names on its body element.
 *
 * The class names are checked from last to first and the first one contained in
 * {@code templates} is returned.
 *
 * @param doc the parsed page
 * @return the matching template name, or "****" when the body is missing or none
 *         of its class names is a known template
 */
public String getTemplate(Document doc) {
    if (doc.body() == null) {
        return "****";
    }
    // Class names in HTML are separated by any run of whitespace (spaces, tabs,
    // line breaks), not only by a single space.
    String[] classNames = doc.body().attr("class").trim().split("\\s+");
    for (int i = classNames.length - 1; i >= 0; i--) {
        if (templates.contains(classNames[i])) {
            return classNames[i];
        }
    }
    return "****";
}