List of usage examples for org.jsoup.nodes.Document.body()
public Element body()
From source file:org.jsweet.input.typescriptdef.visitor.DocFiller.java
/**
 * Fills in documentation for a type declaration that has none.
 *
 * <p>Only acts when the current module name equals the LANG or DOM package. In that case the
 * module is remembered in {@code currentModule}, the "mdn" content for the type is looked up via
 * {@code getTypeContent}, parsed with jsoup, and three grabbers are run over the document body in
 * order: table format, definition-list format, main description. Any {@link Throwable} raised
 * while parsing or traversing is reported through {@code context.reportError}.
 *
 * @param typeDeclaration the type declaration being visited
 */
@Override
public void visitTypeDeclaration(TypeDeclaration typeDeclaration) {
    // Nothing to do when the declaration is already documented.
    if (typeDeclaration.getDocumentation() != null) {
        return;
    }

    String moduleName = getCurrentModuleName();
    boolean supportedModule = JSweetDefTranslatorConfig.LANG_PACKAGE.equals(moduleName)
            || JSweetDefTranslatorConfig.DOM_PACKAGE.equals(moduleName);
    if (!supportedModule) {
        return;
    }

    this.currentModule = moduleName;
    String content = getTypeContent(context.cacheDir, "mdn", moduleName, typeDeclaration.getName());
    if (content == null) {
        return;
    }

    try {
        // NOTE(review): the two-String overload of Jsoup.parse takes a base URI as its second
        // argument, so "UTF-8" is presumably meant as a charset but acts as a base URI - confirm.
        Document doc = Jsoup.parse(content, "UTF-8");

        new NodeTraversor(new MdnTableFormatGrabber(this, typeDeclaration)).traverse(doc.body());
        new NodeTraversor(new MdnDefinitionListFormatGrabber(this, typeDeclaration)).traverse(doc.body());
        new NodeTraversor(new MdnMainDescriptionGrabber(this, typeDeclaration)).traverse(doc.body());
    } catch (Throwable t) {
        context.reportError("cannot fill documentation for " + context.getTypeName(typeDeclaration),
                typeDeclaration.getToken(), t);
    }
}
From source file:org.mar9000.space2latex.WikiPage.java
public static WikiPage loadForFormat(File file) throws IOException { String fileContent = IOUtils.readFileAsString(file); Document doc = Jsoup.parseBodyFragment(fileContent); // Maintain input string. doc.outputSettings().prettyPrint(false); Element body = doc.body(); Element pageElement = body.select("page").first(); String title = pageElement.attr("title"); String id = pageElement.attr("id"); Element pageContent = pageElement.select("content").first(); WikiPage page = new WikiPage(null, title, id, pageContent.html()); page.pageContent = pageContent;/*from w w w . j a va 2s . co m*/ // Images. Elements images = body.select("wikiimages").first().select("wikiimage"); for (Element imageElement : images) { WikiImage image = new WikiImage(); String acKey = imageElement.select("ac|image").first().outerHtml(); image.filename = imageElement.attr("pageid") + "/" + imageElement.attr("filename"); page.images.put(acKey, image); } return page; }
From source file:org.niord.core.publication.PublicationUtils.java
/** * Updates the message publications from the publication, parameters and link * * @param message the message/*from ww w . ja v a2 s . c o m*/ * @param publication the publication to extract * @param parameters the optional parameters * @param link the optional link * @param lang either a specific language or null for all languages * @return the message publication or null if not found */ public static MessageVo updateMessagePublications(MessageVo message, SystemPublicationVo publication, String parameters, String link, String lang) { // Sanity check if (message == null || publication == null) { return null; } boolean internal = publication.getMessagePublication() == MessagePublication.INTERNAL; message.getDescs().stream().filter(msgDesc -> lang == null || lang.equals(msgDesc.getLang())) .forEach(msgDesc -> { String updatedPubHtml = computeMessagePublication(publication, parameters, link, msgDesc.getLang()); String pubHtml = internal ? msgDesc.getInternalPublication() : msgDesc.getPublication(); pubHtml = StringUtils.defaultIfBlank(pubHtml, ""); Document doc = Jsoup.parseBodyFragment(pubHtml); String pubAttr = "[publication=" + publication.getPublicationId() + "]"; Element e = doc.select("a" + pubAttr + ",span" + pubAttr).first(); if (e != null) { // TODO: Is there a better way to replace an element? e.replaceWith(Jsoup.parse(updatedPubHtml).body().child(0)); pubHtml = doc.body().html(); } else { pubHtml += " " + updatedPubHtml; } // Lastly, clean up html for artifacts often added by TinyMCE if (StringUtils.isNotBlank(pubHtml)) { pubHtml = pubHtml.replace("<p>", "").replace("</p>", "").trim(); if (internal) { msgDesc.setInternalPublication(pubHtml); } else { msgDesc.setPublication(pubHtml); } } }); return message; }
From source file:org.niord.core.util.TextUtils.java
/** * Converts the text from html to plain text * @param html the html/* ww w.j ava2s. c om*/ * @return the plain text version */ public static String html2txt(String html) { if (StringUtils.isNotBlank(html)) { try { Document doc = Jsoup.parse(html); return new HtmlToPlainText().getPlainText(doc.body()); } catch (Exception ignored) { } } // If blank, or if any error occurs, return the original html return html; }
From source file:org.sakaiproject.lessonbuildertool.cc.PrintHandler.java
public String fixupInlineReferences(String htmlString) { // and fix relative URLs to absolute, since this is going to be inserted inline // in a page that's not in resources. // These fixups are inserted as identified by the lessons exporter if (htmlString.startsWith("<!--fixups:")) { int fixend = htmlString.indexOf("-->"); String fixString = htmlString.substring(11, fixend); htmlString = htmlString.substring(fixend + 3); String[] fixups = fixString.split(","); // iterate backwards since once we fix something, offsets // further in the string are bad for (int i = (fixups.length - 1); i >= 0; i--) { String fixup = fixups[i]; // these are offsets of a URL. The URL is for a file in attachments, so we need // to map it to a full URL. The file should be attachments/item-xx.html in the // package. relFixup will have added ../ to it to get to the base. try { int offset = Integer.parseInt(fixup); htmlString = htmlString.substring(0, offset) + baseUrl + htmlString.substring(offset + 3); } catch (Exception e) { log.info("exception " + e); }/* ww w . java2s . c om*/ } //Otherwise try jsoup to do the fixups } else { /* Now we need to go through the string looking for other references that weren't identified by the fixups Using full class names here because of conflict in names I think the ideal here would be that this only updates resources that are in the manifest, but really any relative resources are going to be incorrect pulled out of a package and need an update. 
*/ org.jsoup.nodes.Document doc = Jsoup.parseBodyFragment(htmlString); org.jsoup.select.Elements hrefs = doc.select("[href]"); org.jsoup.select.Elements srcs = doc.select("[src]"); log.debug("BaseURL is: {}", baseUrl); // Have to look for both href and src tags for (org.jsoup.nodes.Element element : srcs) { String src = element.attr("src"); if (src != null && !src.startsWith("http")) { for (Map.Entry<String, String> entry : fileNames.entrySet()) { if (entry.getKey() != null && entry.getValue() != null && entry.getValue().contains(src)) { // Found key, set it and stop looking log.debug(String.format("Updating tag %s: <%s> to <%s>", element.tagName(), src, baseUrl + src)); element.attr("src", baseUrl + entry.getValue()); break; } } } } for (org.jsoup.nodes.Element element : hrefs) { String href = element.attr("href"); if (href != null && !href.startsWith("http")) { for (Map.Entry<String, String> entry : fileNames.entrySet()) { if (entry.getKey() != null && entry.getValue() != null && entry.getValue().contains(href)) { // Found key, set it and stop looking log.debug(String.format("Updating a: <%s> to <%s> (%s)", href, baseUrl + href, element.text())); element.attr("href", baseUrl + entry.getValue()); break; } } } } htmlString = doc.body().html(); } return htmlString; }
From source file:org.sakaiproject.util.impl.FormattedTextImpl.java
@Override public String stripHtmlFromText(String text, boolean smartSpacing, boolean stripEscapeSequences) { // KNL-1267 --bbailla2 if (!stripEscapeSequences) { return stripHtmlFromText(text, smartSpacing); }// w w w .j a v a 2 s. c o m if (smartSpacing) { text = addSmartSpacing(text); } org.jsoup.nodes.Document document = org.jsoup.Jsoup.parse(text); org.jsoup.nodes.Element body = document.body(); //remove any html tags, unescape any escape characters String strippedText = body.text(); // are converted to char code 160, java doesn't treat it like whitespace, so replace it with ' ' //Could there be others like this? strippedText = strippedText.replace((char) 160, ' '); strippedText = eliminateExtraWhiteSpace(strippedText); return strippedText; }
From source file:org.symphonyoss.client.util.MlMessageParser.java
public void parseMessage(String message) throws SymException { Document doc = Jsoup.parse(message); originalDoc = doc.clone();// w w w . j a v a 2 s . c om Element elementErrors = doc.body().getElementsByTag("errors").first(); if (elementErrors != null) { if (elementErrors.outerHtml() != null) logger.debug("Errors found in message: {}", elementErrors.outerHtml()); } //Lets remove the errors elements doc.select("errors").remove(); elementMessageML = doc.select("messageML").first(); if (elementMessageML == null) elementMessageML = doc.select("div").first(); if (elementMessageML != null) { if (elementMessageML.outerHtml() != null) logger.debug("Doc parsed: {}", elementMessageML.outerHtml()); } else { logger.error("Could not parse document for message {}", message); throw new SymException("Malformed message"); } textDoc = new StringBuilder(); stripTags(textDoc, elementMessageML.childNodes()); textChunks = textDoc.toString().split("\\s+"); }
From source file:org.tinymediamanager.scraper.ofdb.OfdbMetadataProvider.java
@Override public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception { LOGGER.debug("getMetadata() " + options.toString()); if (options.getType() != MediaType.MOVIE) { throw new UnsupportedMediaTypeException(options.getType()); }/*w w w . j a v a 2 s . co m*/ // we have 3 entry points here // a) getMetadata has been called with an ofdbId // b) getMetadata has been called with an imdbId // c) getMetadata has been called from a previous search String detailUrl = ""; // case a) and c) if (StringUtils.isNotBlank(options.getId(getProviderInfo().getId())) || options.getResult() != null) { if (StringUtils.isNotBlank(options.getId(getProviderInfo().getId()))) { detailUrl = "http://www.ofdb.de/view.php?page=film&fid=" + options.getId(getProviderInfo().getId()); } else { detailUrl = options.getResult().getUrl(); } } // case b) if (options.getResult() == null && StringUtils.isNotBlank(options.getId(MediaMetadata.IMDB))) { MediaSearchOptions searchOptions = new MediaSearchOptions(MediaType.MOVIE); searchOptions.setImdbId(options.getId(MediaMetadata.IMDB)); try { List<MediaSearchResult> results = search(searchOptions); if (results != null && !results.isEmpty()) { options.setResult(results.get(0)); detailUrl = options.getResult().getUrl(); } } catch (Exception e) { LOGGER.warn("failed IMDB search: " + e.getMessage()); } } // we can only work further if we got a search result on ofdb.de if (StringUtils.isBlank(detailUrl)) { throw new Exception("We did not get any useful movie url"); } MediaMetadata md = new MediaMetadata(providerInfo.getId()); // generic Elements used all over Elements el = null; String ofdbId = StrgUtils.substr(detailUrl, "film\\/(\\d+),"); if (StringUtils.isBlank(ofdbId)) { ofdbId = StrgUtils.substr(detailUrl, "fid=(\\d+)"); } Url url; try { LOGGER.trace("get details page"); url = new Url(detailUrl); InputStream in = url.getInputStream(); Document doc = Jsoup.parse(in, "UTF-8", ""); in.close(); if (doc.getAllElements().size() < 10) { throw 
new Exception("meh - we did not receive a valid web page"); } // parse details // IMDB ID "http://www.imdb.com/Title?1194173" el = doc.getElementsByAttributeValueContaining("href", "imdb.com"); if (!el.isEmpty()) { md.setId(MediaMetadata.IMDB, "tt" + StrgUtils.substr(el.first().attr("href"), "\\?(\\d+)")); } // title / year // <meta property="og:title" content="Bourne Vermchtnis, Das (2012)" /> el = doc.getElementsByAttributeValue("property", "og:title"); if (!el.isEmpty()) { String[] ty = parseTitle(el.first().attr("content")); md.setTitle(StrgUtils.removeCommonSortableName(ty[0])); try { md.setYear(Integer.parseInt(ty[1])); } catch (Exception ignored) { } } // another year position if (md.getYear() == 0) { // <a href="view.php?page=blaettern&Kat=Jahr&Text=2012">2012</a> el = doc.getElementsByAttributeValueContaining("href", "Kat=Jahr"); try { md.setYear(Integer.parseInt(el.first().text())); } catch (Exception ignored) { } } // original title (has to be searched with a regexp) // <tr valign="top"> // <td nowrap=""><font class="Normal" face="Arial,Helvetica,sans-serif" // size="2">Originaltitel:</font></td> // <td> </td> // <td width="99%"><font class="Daten" face="Arial,Helvetica,sans-serif" // size="2"><b>Brave</b></font></td> // </tr> String originalTitle = StrgUtils.substr(doc.body().html(), "(?s)Originaltitel.*?<b>(.*?)</b>"); if (!originalTitle.isEmpty()) { md.setOriginalTitle(StrgUtils.removeCommonSortableName(originalTitle)); } // Genre: <a href="view.php?page=genre&Genre=Action">Action</a> el = doc.getElementsByAttributeValueContaining("href", "page=genre"); for (Element g : el) { md.addGenre(getTmmGenre(g.text())); } // rating // <div itemtype="http://schema.org/AggregateRating" itemscope // itemprop="aggregateRating">Note: <span // itemprop="ratingValue">6.73</span><meta // itemprop="worstRating" content="1" /> el = doc.getElementsByAttributeValue("itemprop", "ratingValue"); if (!el.isEmpty()) { String r = el.text(); if (!r.isEmpty()) { try { 
md.setRating(Float.parseFloat(r)); } catch (Exception e) { LOGGER.debug("could not parse rating"); } } } // get PlotLink; open url and parse // <a href="plot/22523,31360,Die-Bourne-Identitt"><b>[mehr]</b></a> LOGGER.trace("parse plot"); el = doc.getElementsByAttributeValueMatching("href", "plot\\/\\d+,"); if (!el.isEmpty()) { String plotUrl = BASE_URL + "/" + el.first().attr("href"); try { url = new Url(plotUrl); in = url.getInputStream(); Document plot = Jsoup.parse(in, "UTF-8", ""); in.close(); Elements block = plot.getElementsByClass("Blocksatz"); // first // Blocksatz // is plot String p = block.first().text(); // remove all html stuff p = p.substring(p.indexOf("Mal gelesen") + 12); // remove "header" md.setPlot(p); } catch (Exception e) { LOGGER.error("failed to get plot page: " + e.getMessage()); } } // http://www.ofdb.de/view.php?page=film_detail&fid=226745 LOGGER.debug("parse actor detail"); String movieDetail = BASE_URL + "/view.php?page=film_detail&fid=" + ofdbId; doc = null; try { url = new Url(movieDetail); in = url.getInputStream(); doc = Jsoup.parse(in, "UTF-8", ""); in.close(); } catch (Exception e) { LOGGER.error("failed to get detail page: " + e.getMessage()); } if (doc != null) { parseCast(doc.getElementsContainingOwnText("Regie"), MediaCastMember.CastType.DIRECTOR, md); parseCast(doc.getElementsContainingOwnText("Darsteller"), MediaCastMember.CastType.ACTOR, md); parseCast(doc.getElementsContainingOwnText("Stimme/Sprecher"), MediaCastMember.CastType.ACTOR, md); parseCast(doc.getElementsContainingOwnText("Synchronstimme (deutsch)"), MediaCastMember.CastType.ACTOR, md); parseCast(doc.getElementsContainingOwnText("Drehbuchautor(in)"), MediaCastMember.CastType.WRITER, md); parseCast(doc.getElementsContainingOwnText("Produzent(in)"), MediaCastMember.CastType.PRODUCER, md); } } catch (Exception e) { LOGGER.error("Error parsing " + detailUrl); throw e; } return md; }
From source file:poe.trade.assist.SearchView.java
private String addHeadElements(String html) { String htmlDirectory = htmlDirectory(); Document doc = Jsoup.parse(html); Element head = doc.head();/*from w ww . j av a2s .co m*/ // Replace everthing in the <head> head.children().stream().forEach(e -> e.remove()); head.appendElement("meta").attr("charset", "utf-8"); head.appendElement("meta").attr("name", "viewport").attr("content", "width=device-width"); head.appendElement("title").text("poe.trade.assist"); head.appendElement("script").attr("type", "text/javascript").attr("src", htmlDirectory + "packed.js"); head.appendElement("link").attr("rel", "stylesheet").attr("href", htmlDirectory + "packed_dark.css"); doc.body().appendElement("script").attr("type", "text/javascript").attr("src", htmlDirectory + "assist.js"); String cleanHTML = doc.toString(); // try { // FileUtils.writeStringToFile(new File("test"), cleanHTML); // } catch (IOException e1) { // // TODO Auto-generated catch block // e1.printStackTrace(); // } return cleanHTML; }
From source file:sachin.bws.site.Template.java
public String getTemplate(Document doc) { String bodyClass[] = doc.body().attr("class").split("\\u0020"); List<String> classes = Arrays.asList(bodyClass); Collections.reverse(classes); for (String t : classes) { if (templates.contains(t)) { return t; }//from www . ja v a 2 s . c om } return "****"; }