List of usage examples for org.jsoup.nodes.Element#select
public Elements select(String cssQuery)
From source file:org.opens.tanaguru.rules.rgaa30.Rgaa30Rule110102.java
/** * This method linked each label which have an input child on a page to its * form in a map./*from w ww . j av a 2 s .c o m*/ */ private void putLabelElementHandlerIntoTheMap() { for (Element el : labelElementHandler.get()) { Element tmpElement = el.parent(); while (StringUtils.isNotBlank(tmpElement.tagName())) { if (tmpElement.tagName().equals(FORM_TAG)) { if (labelFormMap.containsKey(tmpElement)) { Elements els = el.select(FORM_ELEMENT_WITH_ID_CSS_LIKE_QUERY); if (!els.isEmpty()) { labelFormMap.get(tmpElement).add(el); } } else { Elements els = el.select(FORM_ELEMENT_WITH_ID_CSS_LIKE_QUERY); if (!els.isEmpty()) { ElementHandler<Element> labelElement = new ElementHandlerImpl(); labelElement.add(el); labelFormMap.put(tmpElement, labelElement); } } break; } tmpElement = tmpElement.parent(); } } }
From source file:org.sakaiproject.nakamura.files.migrator.PageMigrator.java
boolean isEmpty(Element htmlElement) { // filter out TinyMCE instances htmlElement.select(".mceEditor").remove(); String htmlContent = htmlElement.text().trim(); String[] elementNames = new String[] { "img", "iframe", "frame", "input", "select", "option" }; boolean containsElement = false; for (String elementName : elementNames) { if (!htmlElement.select(elementName).isEmpty()) { containsElement = true;/*from w w w . j ava 2s.co m*/ } } return !(htmlElement.hasText() || containsElement); }
From source file:org.sakaiproject.nakamura.files.migrator.PageMigrator.java
/**
 * Builds a row-based JSON page from the entry stored under {@code ref} in
 * {@code originalStructure}.
 *
 * <p>The entry's {@code "page"} string is parsed as HTML ({@code EMPTY_DIV} is parsed when the
 * key is absent). When the structure has a {@code "resources"} key, every {@code img} whose
 * {@code src} starts with {@code "p/"} and passes {@code hasImageExtension} gets its
 * {@code src} cut right after the last dot. The children of {@code body} are then processed in
 * order: children without a {@code .widget_inline} descendant are appended to the current HTML
 * block, while the others flush the block through {@code addRowToPage} and hand each widget to
 * {@code extractWidget}.
 *
 * @param originalStructure the structure holding the entry and, optionally, {@code "resources"}
 * @param contentId passed through to {@code extractWidget}
 * @param widgetsUsed passed through to {@code extractWidget}
 * @param ref key of the entry to migrate
 * @return the new page object, created with an empty {@code "rows"} array and then filled by
 *         the helper calls
 * @throws JSONException if a JSON access fails
 */
JSONObject migratePage(JSONObject originalStructure, String contentId, Set<String> widgetsUsed, String ref)
        throws JSONException {
    JSONObject legacyWidget = originalStructure.getJSONObject(ref);

    Document page;
    if (!legacyWidget.has("page")) {
        page = Jsoup.parse(EMPTY_DIV);
    } else {
        page = Jsoup.parse(legacyWidget.getString("page"));
        if (originalStructure.has("resources")) {
            LOGGER.debug("This may be an IMS-CP document. Looking for images...");
            Elements images = page.select("img");
            LOGGER.debug("Found {} images to process.", images.size());
            for (Element image : images) {
                String imgSrc = image.attr("src");
                LOGGER.debug("Looking at image path '{}'", imgSrc);
                if (!imgSrc.startsWith("p/") || !hasImageExtension(imgSrc)) {
                    continue;
                }
                // Keep everything up to and including the last dot.
                int cut = imgSrc.lastIndexOf(".") + 1;
                image.attr("src", imgSrc.substring(0, cut));
            }
        }
    }

    Document htmlBlock = Jsoup.parse(EMPTY_DIV);
    JSONObject migratedPage = new JSONObject();
    migratedPage.put("rows", new JSONArray());
    JSONObject row = generateEmptyRow(1);

    Elements topLevelElements = page.select("body").first().children();
    for (Element topLevelElement : topLevelElements) {
        if (topLevelElement.select(".widget_inline").isEmpty()) {
            // Plain content: collect it in the current HTML block.
            htmlBlock.select("div").first().appendChild(topLevelElement);
            continue;
        }

        // Flush what has been collected so far, then start a fresh block.
        addRowToPage(row, migratedPage, 0, htmlBlock.select("body").first());
        htmlBlock = Jsoup.parse(EMPTY_DIV);

        int leftSideColumn = 0;
        if (topLevelElement.select(".widget_inline.block_image_left").size() > 0) {
            leftSideColumn = 1;
        }
        int rightSideColumn = 0;
        if (topLevelElement.select(".widget_inline.block_image_right").size() > 0) {
            rightSideColumn = 1;
        }
        int numColumns = 1 + leftSideColumn + rightSideColumn;
        boolean multiColumn = numColumns > 1;

        if (multiColumn) {
            row = addRowToPage(row, migratedPage, numColumns, htmlBlock.select("body").first());
        }
        for (Element widgetElement : topLevelElement.select(".widget_inline")) {
            extractWidget(originalStructure, contentId, widgetsUsed, ref, migratedPage, row,
                    leftSideColumn, widgetElement);
        }
        if (multiColumn) {
            row = addRowToPage(row, migratedPage, 1, htmlBlock.select("body").first());
        }
    }

    addRowToPage(row, migratedPage, 1, htmlBlock.select("body").first());
    ensureRowPresent(migratedPage);
    return migratedPage;
}
From source file:org.sbs.goodcrawler.jobconf.ExtractConfig.java
/** * ????/* www . j a v a2 s . c om*/ * @param doc * @return * @throws ConfigurationException */ public ExtractConfig loadConfig(Document doc) throws ConfigurationException { Elements extractElement = doc.select("extract"); super.jobName = doc.select("job").attr("name"); super.indexName = doc.select("job").attr("indexName"); String temp = extractElement.select("threadNum").text(); if (StringUtils.isNotBlank(temp)) { this.threadNum = Integer.parseInt(temp); } Elements templateElement = extractElement.select("extract").select("template"); Iterator<Element> it = templateElement.iterator(); while (it.hasNext()) { Element template = it.next(); ExtractTemplate extractTemplate = new ExtractTemplate(); // ?Url???? Elements urlPatternElement = template.select("url"); List<Pattern> patterns = Lists.newArrayList(); for (Element urlElement : urlPatternElement) { patterns.add(Pattern.compile(urlElement.text())); } extractTemplate.setUrlPattern(patterns); extractTemplate.setName(template.attr("name")); // ??? Elements selectElement = template.select("elements").first().children(); for (Element element : selectElement) { if ("element".equals(element.tagName())) { AbstractElementCssSelector<?> selector = ElementCssSelectorFactory.create(element); extractTemplate.addCssSelector(selector); } else if ("if".equals(element.tagName())) { IFConditions ifConditions = IFConditions.create(element); extractTemplate.addConditions(ifConditions); } } this.templates.add(extractTemplate); } return this; }
From source file:org.sbs.goodcrawler.plugin.extract.ExtractorDytt8.java
/**
 * Extracts movie information from a crawled page and queues the links found on it.
 *
 * <p>The page bytes are decoded with the page's charset and parsed with jsoup. Every anchor
 * whose absolute URL is non-blank and accepted by {@code filterUrls} is pushed to
 * {@code pendingUrls}. The extracted values are collected in a map, set on a new
 * {@code ExtractedPage} and handed to {@code pendingStore}.
 *
 * <p>NOTE(review): {@code selects} is assigned {@code null} and later dereferenced by
 * {@code selects.entrySet()}, which throws a NullPointerException for any page that gets that
 * far; only {@code UnsupportedEncodingException} is caught. The commented-out
 * {@code conf.getSelects()} call may be the intended source — confirm.
 *
 * <p>NOTE(review): several string literals below are empty or {@code "?"}; they look like
 * non-ASCII text lost in transcoding — confirm against the original file before relying on
 * the branch conditions.
 *
 * @param page the crawled page; may be {@code null}
 * @return the extracted page, or {@code null} when {@code page} is {@code null}, its URL
 *         contains {@code "game/"}, no {@code #Zoom} element exists, a configured selector
 *         matches nothing, or the content charset is unsupported
 */
@Override
public ExtractedPage<?, ?> onExtract(Page page) {
    if (null != page) {
        try {
            Document doc = Jsoup.parse(new String(page.getContentData(), page.getContentCharset()),
                    urlUtils.getBaseUrl(page.getWebURL().getURL()));
            // Pages whose URL contains "game/" are not extracted.
            if (null != page.getWebURL().getURL() && page.getWebURL().getURL().contains("game/"))
                return null;
            // Queue every accepted link of the page for later crawling.
            Elements links = doc.getElementsByTag("a");
            if (!links.isEmpty()) {
                for (Element link : links) {
                    String linkHref = link.absUrl("href");
                    if (StringUtils.isNotBlank(linkHref) && filterUrls(linkHref)) {
                        try {
                            WebURL url = new WebURL();
                            url.setURL(linkHref);
                            url.setJobName(conf.jobName);
                            pendingUrls.addUrl(url);
                        } catch (QueueException e) {
                            log.error(e.getMessage());
                        } catch (Exception e) {
                            log.error(e.getMessage());
                        }
                    }
                }
            }
            // Content extraction.
            // Map<String, String> selects = conf.getSelects();
            Map<String, String> selects = null;
            ExtractedPage<String, Object> epage = pendingStore.new ExtractedPage<String, Object>();
            epage.setUrl(page.getWebURL());
            HashMap<String, Object> result = new HashMap<>();
            // Without a #Zoom element the page is skipped.
            Elements text = doc.select("#Zoom");
            if (null == text || text.size() == 0) {
                return null;
            }
            String name = doc.select("h1").text();
            name = name.replace("", "").replace("<<", "").replace("", "").replace(">>", "");
            result.put("movie", name);
            // result.put("_id", name);
            // The second space-separated token of the "h2 a" text is stored as the type.
            String ts[] = doc.select("h2 a").text().split(" ");
            if (ts.length >= 2) {
                result.put("type", ts[1].trim());
            } else {
                result.put("type", "unknow");
            }
            result.put("url", page.getWebURL().getURL());
            // NOTE(review): selects is null at this point, so this loop header throws.
            for (Entry<String, String> entry : selects.entrySet()) {
                Elements elements = doc.select(entry.getValue());
                if (elements.isEmpty())
                    return null;
                else {
                    if ("content".equals(entry.getKey())) {
                        for (Element element : elements) {
                            // Image sources, joined with ';'.
                            Elements imgs = element.select("img[src]");
                            StringBuilder sb = new StringBuilder();
                            for (Element img : imgs) {
                                sb.append(img.attr("src")).append(";");
                            }
                            result.put("img", sb.toString());
                            // Per-paragraph movie information.
                            Elements movieInfos = element.select("p");
                            for (Element info : movieInfos) {
                                String infotext = info.text();
                                try {
                                    String infotext_ = info.html();
                                    int start, end = 0;
                                    start = infotext_.indexOf("");
                                    if (start > 0) {
                                        end = infotext_.lastIndexOf("");
                                        if (end > 0 && start < end) {
                                            result.put("jq", infotext_.substring(start, end));
                                        } else {
                                            end = infotext_.lastIndexOf(".");
                                            if (end > 0 && start < end) {
                                                result.put("jq", infotext_.substring(start, end));
                                            }
                                        }
                                    }
                                    infotext_ = null;
                                } catch (Exception e) {
                                    e.printStackTrace();
                                }
                                // NOTE(review): startsWith("") is true for every string, so with
                                // the literals as they appear here only this first branch runs.
                                // The value returned by s.trim() is discarded in each loop below.
                                if (infotext.startsWith("")) {
                                    String ss[] = infotext.split("");
                                    for (String s : ss) {
                                        s.trim();
                                        result = getInfoName(s, result);
                                    }
                                } else if (infotext.startsWith("?")) {
                                    String ss[] = infotext.split("?");
                                    for (String s : ss) {
                                        s.trim();
                                        result = getInfoName(s, result);
                                    }
                                } else if (infotext.contains("")) {
                                    infotext = info.html();
                                    String[] ss = infotext.split("<br />");
                                    for (String s : ss) {
                                        s.trim();
                                        result = getInfoName(s, result);
                                    }
                                } else if (infotext.contains(":")) {
                                    infotext = info.html();
                                    String[] ss = infotext.split("<br />");
                                    for (String s : ss) {
                                        s.trim();
                                        result = getInfoName(s, result);
                                    }
                                }
                            }
                            // if(result.size()<5){
                            // result.put("content", value)
                            // }
                            // Text of every table cell, joined with ';', stored as "download".
                            Elements elements2 = elements.select("td");
                            sb.setLength(0);
                            for (Element download : elements2) {
                                sb.append(download.text()).append(";");
                            }
                            result.put("download", sb.toString());
                        }
                    }
                }
                // result.put(entry.getKey(), elements.html());
            }
            // Replace the "nd" string with its integer value when present.
            if (StringUtils.isNotBlank((String) result.get("nd"))) {
                result.put("nd", Integer.parseInt((String) result.get("nd")));
            }
            epage.setMessages(result);
            try {
                pendingStore.addExtracedPage(epage);
            } catch (QueueException e) {
                log.error(e.getMessage());
            }
            return epage;
        } catch (UnsupportedEncodingException e) {
            log.error(e.getMessage());
            e.printStackTrace();
        }
    }
    return null;
}
From source file:org.schabi.newpipe.extractor.services.youtube.YoutubeStreamExtractor.java
@Override public int getLikeCount() throws ParsingException { String likesString = ""; try {/*from ww w . ja v a 2 s . co m*/ Element button = doc.select("button.like-button-renderer-like-button").first(); try { likesString = button.select("span.yt-uix-button-content").first().text(); } catch (NullPointerException e) { //if this ckicks in our button has no content and thefore likes/dislikes are disabled return -1; } return Integer.parseInt(likesString.replaceAll("[^\\d]", "")); } catch (NumberFormatException nfe) { throw new ParsingException("failed to parse likesString \"" + likesString + "\" as integers", nfe); } catch (Exception e) { throw new ParsingException("Could not get like count", e); } }
From source file:org.schabi.newpipe.extractor.services.youtube.YoutubeStreamExtractor.java
@Override public int getDislikeCount() throws ParsingException { String dislikesString = ""; try {// w w w . ja v a 2s . c o m Element button = doc.select("button.like-button-renderer-dislike-button").first(); try { dislikesString = button.select("span.yt-uix-button-content").first().text(); } catch (NullPointerException e) { //if this kicks in our button has no content and therefore likes/dislikes are disabled return -1; } return Integer.parseInt(dislikesString.replaceAll("[^\\d]", "")); } catch (NumberFormatException nfe) { throw new ParsingException("failed to parse dislikesString \"" + dislikesString + "\" as integers", nfe); } catch (Exception e) { throw new ParsingException("Could not get dislike count", e); } }
From source file:org.schabi.newpipe.extractor.services.youtube.YoutubeStreamExtractor.java
@Override public Vector<StreamPreviewInfo> getRelatedVideos() throws ParsingException { try {/*from www . j a va2 s.c o m*/ Vector<StreamPreviewInfo> relatedVideos = new Vector<>(); for (Element li : doc.select("ul[id=\"watch-related\"]").first().children()) { // first check if we have a playlist. If so leave them out if (li.select("a[class*=\"content-link\"]").first() != null) { relatedVideos.add(extractVideoPreviewInfo(li)); } } return relatedVideos; } catch (Exception e) { throw new ParsingException("Could not get related videos", e); } }
From source file:org.schabi.newpipe.extractor.services.youtube.YoutubeStreamExtractor.java
/**Provides information about links to other videos on the video page, such as related videos. * This is encapsulated in a StreamPreviewInfo object, * which is a subset of the fields in a full StreamInfo.*/ private StreamPreviewInfo extractVideoPreviewInfo(Element li) throws ParsingException { StreamPreviewInfo info = new StreamPreviewInfo(); try {//w ww .j a v a 2 s . com info.webpage_url = li.select("a.content-link").first().attr("abs:href"); info.id = Parser.matchGroup1("v=([0-9a-zA-Z-]*)", info.webpage_url); //todo: check NullPointerException causing info.title = li.select("span.title").first().text(); //this page causes the NullPointerException, after finding it by searching for "tjvg": //https://www.youtube.com/watch?v=Uqg0aEhLFAg //this line is unused //String views = li.select("span.view-count").first().text(); //Log.i(TAG, "title:"+info.title); //Log.i(TAG, "view count:"+views); try { info.view_count = Long .parseLong(li.select("span.view-count").first().text().replaceAll("[^\\d]", "")); } catch (Exception e) {//related videos sometimes have no view count info.view_count = 0; } info.uploader = li.select("span.g-hovercard").first().text(); info.duration = YoutubeParsingHelper.parseDurationString(li.select("span.video-time").first().text()); Element img = li.select("img").first(); info.thumbnail_url = img.attr("abs:src"); // Sometimes youtube sends links to gif files which somehow seem to not exist // anymore. Items with such gif also offer a secondary image source. So we are going // to use that if we caught such an item. if (info.thumbnail_url.contains(".gif")) { info.thumbnail_url = img.attr("data-thumb"); } if (info.thumbnail_url.startsWith("//")) { info.thumbnail_url = "https:" + info.thumbnail_url; } } catch (Exception e) { throw new ParsingException("Could not get video preview info", e); } return info; }
From source file:org.schabi.newpipe.services.youtube.YoutubeVideoExtractor.java
@Override public VideoInfo getVideoInfo() { videoInfo = super.getVideoInfo(); //todo: replace this with a call to getVideoId, if possible videoInfo.id = matchGroup1("v=([0-9a-zA-Z_-]{11})", pageUrl); videoInfo.age_limit = 0;/*from w w w . j av a 2 s . c o m*/ //average rating try { videoInfo.average_rating = playerArgs.getString("avg_rating"); } catch (JSONException e) { e.printStackTrace(); } //--------------------------------------- // extracting information from html page //--------------------------------------- // Determine what went wrong when the Video is not available if (videoInfo.videoAvailableStatus == VideoInfo.VIDEO_UNAVAILABLE) { if (doc.select("h1[id=\"unavailable-message\"]").first().text().contains("GEMA")) { videoInfo.videoAvailableStatus = VideoInfo.VIDEO_UNAVAILABLE_GEMA; } } String likesString = ""; String dislikesString = ""; try { // likes likesString = doc.select("button.like-button-renderer-like-button").first() .select("span.yt-uix-button-content").first().text(); videoInfo.like_count = Integer.parseInt(likesString.replaceAll("[^\\d]", "")); // dislikes dislikesString = doc.select("button.like-button-renderer-dislike-button").first() .select("span.yt-uix-button-content").first().text(); videoInfo.dislike_count = Integer.parseInt(dislikesString.replaceAll("[^\\d]", "")); } catch (NumberFormatException nfe) { Log.e(TAG, "failed to parse likesString \"" + likesString + "\" and dislikesString \"" + dislikesString + "\" as integers"); } catch (Exception e) { // if it fails we know that the video does not offer dislikes. e.printStackTrace(); videoInfo.like_count = 0; videoInfo.dislike_count = 0; } // next video videoInfo.nextVideo = extractVideoPreviewInfo( doc.select("div[class=\"watch-sidebar-section\"]").first().select("li").first()); // related videos Vector<VideoPreviewInfo> relatedVideos = new Vector<>(); for (Element li : doc.select("ul[id=\"watch-related\"]").first().children()) { // first check if we have a playlist. 
If so leave them out if (li.select("a[class*=\"content-link\"]").first() != null) { relatedVideos.add(extractVideoPreviewInfo(li)); } } //todo: replace conversion videoInfo.relatedVideos = relatedVideos; //videoInfo.relatedVideos = relatedVideos.toArray(new VideoPreviewInfo[relatedVideos.size()]); return videoInfo; }