List of usage examples for org.jsoup.nodes Element select
public Elements select(String cssQuery)
From source file:org.bigmouth.tfc.v1.PageIteratorImpl.java
protected void initPagination() { Elements paginationEle = this.asynSearchDoc.select("div.J_TItems .pagination"); if (CollectionUtils.isNotEmpty(paginationEle)) { Element firstPagination = paginationEle.get(0); Elements aEles = firstPagination.select("a"); if (CollectionUtils.isNotEmpty(aEles)) { for (Element element : aEles) { String href = element.attr("href"); if (StringHelper.isNotBlank(href)) { String url = Constants.PROTOCOL_PREFIX + href; String text = element.text(); int pageNo = NumberUtils.toInt(text, -1); if (pageNo != -1) { this.elementData.add(new PageImpl(url, pageNo)); }//from w w w . j av a 2 s .c om } } } } }
From source file:org.cellcore.code.engine.page.GathererDataExtractor.java
/**
 * Fetches {@code langUrl + cardId} and builds one {@link CardName} per {@code .cardItem} row
 * whose language is not rejected by {@code skip(...)}.
 *
 * <p>Per row, the first cell supplies the name and (through the {@code href} of its first
 * link) the multiverse id, the second cell the language and the third cell the translated
 * language. The multiverse id is whatever follows the first {@code '='} of the {@code href}.
 *
 * @param cardId appended to {@code langUrl} to form the page address
 * @param card   card assigned to every created {@link CardName}
 * @return the names found; empty when the page has no matching, non-skipped rows
 * @throws IOException if the page cannot be fetched
 */
private Set<CardName> fetchOtherNames(String cardId, Card card) throws IOException {
    Set<CardName> translations = new HashSet<CardName>();
    Document languagesPage = Jsoup.connect(langUrl + cardId).get();

    for (Element row : languagesPage.select(".cardItem")) {
        Elements cells = row.select("td");
        String language = cells.get(1).text();
        if (skip(language)) {
            continue;
        }

        Element nameCell = cells.get(0);
        String translatedName = nameCell.text();
        String href = nameCell.select("a").get(0).attr("href");
        // Everything after the first '=' (the whole href when there is none).
        String multiverseId = href.substring(href.indexOf("=") + 1);
        String translatedLanguage = cells.get(2).text();

        CardName entry = new CardName();
        entry.setTranslatedLang(translatedLanguage);
        entry.setLanguage(language);
        entry.setName(translatedName);
        entry.setMultiverseId(multiverseId);
        entry.setCard(card);
        translations.add(entry);
    }
    return translations;
}
From source file:org.codeexample.anchorlinks.CVAnchorContentIndexingFilter.java
/**
 * Adds to {@code anchors} the {@code href} value, minus its first character, of every element
 * under {@code rootElement} that matches {@code anchorPattern}.
 *
 * <p>The first character is dropped unconditionally (presumably the leading {@code '#'} of an
 * in-page link; confirm against the patterns callers pass). Elements without an {@code href}
 * are skipped: jsoup reports a missing attribute as the empty string, and calling
 * {@code substring(1)} on it would throw {@link StringIndexOutOfBoundsException}.
 *
 * @param rootElement   element whose subtree is searched
 * @param anchorPattern CSS query selecting the link elements
 * @param anchors       set that receives the extracted values
 */
public void getAnchorsImpl(Element rootElement, String anchorPattern, Set<String> anchors) {
    for (Element element : rootElement.select(anchorPattern)) {
        String href = element.attr("href");
        if (href.isEmpty()) {
            // Nothing to strip; an empty href carries no anchor name.
            continue;
        }
        anchors.add(href.substring(1));
    }
}
From source file:org.dronix.android.unisannio.fragment.AvvisiIngFragment.java
/**
 * Downloads the feed at {@code URL} (10 second timeout) and converts every {@code item}
 * element into a {@link NewsIng} built from its title, link, description and pubDate.
 *
 * <p>A child element that is missing from an item contributes an empty string instead of
 * causing a {@link NullPointerException}.
 *
 * @return the parsed entries; an empty list if another {@link IOException} occurred while
 *         fetching; {@code null} if a {@link SocketException} occurred
 */
public List<NewsIng> getNews() {
    List<NewsIng> newsList = new ArrayList<NewsIng>();
    try {
        Document doc = Jsoup.connect(URL).timeout(10000).get();
        for (Element item : doc.select("item")) {
            String title = firstText(item, "title");
            String description = firstText(item, "description");
            String link = firstText(item, "link");
            String pubDate = firstText(item, "pubDate");
            newsList.add(new NewsIng(title, link, description, pubDate, ""));
        }
    } catch (SocketException e) {
        // Kept as-is: existing callers may rely on null to detect a connection failure.
        return null;
    } catch (IOException e) {
        e.printStackTrace();
    }
    return newsList;
}

/** Returns the text of the first element under {@code parent} matching {@code cssQuery}, or "" if none matches. */
private String firstText(Element parent, String cssQuery) {
    Element match = parent.select(cssQuery).first();
    return match == null ? "" : match.text();
}
From source file:org.dronix.android.unisannio.fragment.TabThree.java
public List<NewsIng> getNews() { List<NewsIng> newsList = new ArrayList<NewsIng>(); try {// w ww.j a v a 2 s . c o m Document doc = Jsoup.connect(URL).timeout(10000).get(); Elements newsItems = doc.select("item"); for (Element e : newsItems) { String title = e.select("title").first().text(); String description = e.select("description").first().text(); String link = e.select("link").first().text(); String author = e.select("author").first().text(); newsList.add(new NewsIng(title, link, description, "", author)); } } catch (SocketException e) { return null; } catch (IOException e) { e.printStackTrace(); } /* * for (News n : newsList) { Log.i("NEWS", n.getDate() + " " + * n.getBody()); } */ return newsList; }
From source file:org.keionline.keionline.ArticleView.java
private String getContent(String url) throws IOException { Document doc = Jsoup.connect(url).userAgent("Mozilla").get(); Element data = doc.getElementsByClass("node").first();// get the third content div, Elements select = data.select("img"); // Change the links to absolute!! so that images work for (Element e : select) { e.attr("src", e.absUrl("src")); }// w ww . jav a 2s .c o m select = data.select("a"); for (Element e : select) { e.attr("href", e.absUrl("href")); } Element info = data.getElementsByClass("submitted").first(); info.after("<hr>"); String cont = data.toString(); cont = CSS + cont + "</body>"; content = cont; return cont; }
From source file:org.loklak.api.search.WeiboUserInfo.java
/**
 * Handles a GET request by scraping profile fields from a fetched page and printing them as JSON.
 *
 * <p>When {@code post.isDoS_blackout()} is true the request is answered with status 503 and
 * nothing else happens. Otherwise the address read with {@code post.get("url", "")} is fetched
 * with jsoup and every element whose {@code class} attribute equals {@code "li_1 clearfix"} is
 * inspected:
 * <ul>
 *   <li>If it contains no element whose {@code href} contains {@code loc=infblog}, the trimmed
 *       text of its first {@code pt_detail} element is stored under {@code "pro"} (overwritten
 *       on every iteration) and additionally under a key selected by the text of its first
 *       {@code pt_title S_txt2} element. For the {@code "Domain Name"} label the link text is
 *       used instead when a {@code loc=infdomain} link is present.</li>
 *   <li>Otherwise the text of its {@code a} elements is stored under {@code "Blog"}.</li>
 * </ul>
 * The object is written to the response with {@code obj.toString(2)} after the response
 * character encoding is set to UTF-8.
 *
 * <p>NOTE(review): several {@code case ""} labels are repeated and one label is {@code "??"}.
 * Duplicate case labels do not compile; these look like non-ASCII labels lost during
 * extraction and need to be restored from the upstream source.
 *
 * <p>NOTE(review): {@code replace("t", "")} and {@code replace("rn", "")} as written delete the
 * letter "t" and the sequence "rn" from the value. Possibly these were tab and CR/LF escapes
 * whose backslashes were lost; confirm against the upstream source.
 *
 * <p>NOTE(review): the results of {@code first()} are dereferenced without a null check, and
 * the fetched address comes from the request without validation.
 *
 * <p>NOTE(review): in this single-line layout the {@code // manage DoS} and {@code //print JSON}
 * line comments comment out the remainder of their physical lines; the original line breaks
 * must be restored for the method to compile.
 */
@Override protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException { Query post = RemoteAccess.evaluate(request); // manage DoS if (post.isDoS_blackout()) { response.sendError(503, "your request frequency is too high"); return;/*from w w w .j a v a2 s .c om*/ } String url = post.get("url", ""); JSONObject obj = new JSONObject(); Document doc = Jsoup.connect(url).get(); Elements infos; infos = doc.getElementsByAttributeValue("class", "li_1 clearfix"); if (infos != null) { Element info; String profile; for (int i = 0; i < infos.size(); i++) { info = infos.get(i); if (info.getElementsByAttributeValueContaining("href", "loc=infblog").size() == 0) { profile = info.getElementsByAttributeValue("class", "pt_detail").first().text().trim(); obj.put("pro", profile); switch (info.getElementsByAttributeValue("class", "pt_title S_txt2").first().text()) { case "Nickname": obj.put("username", profile); break; case "Location": obj.put("Address", profile); break; case "Gender": obj.put("Gender", profile); break; case "??": obj.put("Sexuality", profile.replace("t", "").replace("rn", "")); break; case "": obj.put("Relationship", profile.replace("t", "").replace("rn", "")); break; case "Birthday": obj.put("Birthday", profile); break; case "": obj.put("Blood", profile); break; case "Domain Name": if (info.getElementsByAttributeValueContaining("href", "loc=infdomain").size() != 0) profile = info.select("a").text(); obj.put("Personaldomain", profile); break; case "": obj.put("Profile", profile); break; case "Registration": obj.put("Registertime", profile.replace("t", "").replace("rn", "")); break; case "Email": obj.put("Email", profile); break; case "QQ": obj.put("Qq", profile); break; case "": obj.put("College", profile.replace("t", "").replace("rn", "")); break; case "Tags": obj.put("Tag", profile.replace("t", "").replace("rn", "")); break; } } else { String blogurl = info.select("a").text(); obj.put("Blog", blogurl); } } } 
//print JSON response.setCharacterEncoding("UTF-8"); PrintWriter sos = response.getWriter(); sos.print(obj.toString(2)); sos.println(); }
From source file:org.mar9000.space2latex.WikiPage.java
public static void downloadWikiPageImages(WikiPage page) throws MalformedURLException { String pageUrl = page.json.getJSONObject(JSON_LINKS_ATTR).getString(JSON_SELF_ATTR); Document document = Jsoup.parseBodyFragment(page.storage); document.outputSettings().prettyPrint(false); Elements images = document.select("ac|image"); if (images.size() > 0) LOGGER.info(" Download images:"); for (Element element : images) { String downloadURL = null; String imageKey = null;//from ww w . ja v a 2s . c o m // Attachment? Elements refs = element.select("ri|attachment"); WikiImage image = new WikiImage(); image.pageId = page.id; image.acImage = element.outerHtml(); // if (refs.size() > 0) { // Attachment. Element riAttachment = refs.get(0); imageKey = riAttachment.attr("ri:filename"); Elements riPages = riAttachment.select("ri|page"); // Thumbnails are not found with "child/attachment" URL schema. boolean isThumbnail = "true".equals(element.attr("ac:thumbnail")); String queryURL = null; if (!isThumbnail) { queryURL = pageUrl + "/child/attachment?filename=" + URLEncoder.encode(imageKey); } else { // For thumbnail we construct directly the downloadURL without queryURL. /* Some pages have thumbnail images for better online reading. * Here we download always the attached file to embed readable imagesinto the pdf. downloadURL = pageUrl.substring(0, pageUrl.indexOf("/rest/api")) + "/download/thumbnails/" + page.id + "/" + URLEncoder.encode(imageKey); */ downloadURL = pageUrl.substring(0, pageUrl.indexOf("/rest/api")) + "/download/attachments/" + page.id + "/" + URLEncoder.encode(imageKey); } if (riPages.size() > 0) { // The attachment is related with another page. 
Element riPage = riPages.get(0); String space = riPage.attr("ri:space-key"); String contentTitle = riPage.attr("ri:content-title").replaceAll(" ", "%20"); String self = page.json.getJSONObject(JSON_LINKS_ATTR).getString(JSON_SELF_ATTR); String newQueryURL = self.substring(0, self.lastIndexOf('/')) + "?title=" + contentTitle + "&spaceKey=" + space; JSONObject jsonNewQuery = ConfluenceRESTUtils.getURLResponse(newQueryURL); if (jsonNewQuery.getInt(JSON_SIZE_ATTR) == 0) throw new RuntimeException( "Page \"" + contentTitle + "\" in space " + space + " not found."); JSONObject jsonNewPage = (JSONObject) jsonNewQuery.getJSONArray(JSON_RESULTS_ATTR).get(0); image.pageId = jsonNewPage.getString(JSON_ID_ATTR); // Overwrite queryURL. String newPageUrl = jsonNewPage.getJSONObject(JSON_LINKS_ATTR).getString(JSON_SELF_ATTR); queryURL = newPageUrl + "/child/attachment?filename=" + URLEncoder.encode(imageKey); } if (!isThumbnail) downloadURL = getAttachmentDownloadURL(queryURL); } else { refs = element.select("ri|url"); if (refs.size() > 0) { // URL. downloadURL = refs.get(0).attr("ri:value"); URL tempURL = new URL(downloadURL); String urlPath = tempURL.getPath(); imageKey = urlPath.substring(urlPath.lastIndexOf('/') + 1); } else { throw new RuntimeException("Image format unknown: " + element.toString()); } } // Download the image data. image.filename = imageKey.replace(' ', '_'); // Space are not handled by LaTeX. if (downloadURL != null) { LOGGER.info(" about to download image {}/{}", new Object[] { image.pageId, image.filename }); image.data = IOUtils.getImageFromURL(downloadURL); } else { LOGGER.info(" NULL download URL for page/image: {}/{}", new Object[] { image.pageId, image.filename }); } page.images.put(imageKey, image); } }
From source file:org.mar9000.space2latex.WikiPage.java
public static WikiPage loadForFormat(File file) throws IOException { String fileContent = IOUtils.readFileAsString(file); Document doc = Jsoup.parseBodyFragment(fileContent); // Maintain input string. doc.outputSettings().prettyPrint(false); Element body = doc.body(); Element pageElement = body.select("page").first(); String title = pageElement.attr("title"); String id = pageElement.attr("id"); Element pageContent = pageElement.select("content").first(); WikiPage page = new WikiPage(null, title, id, pageContent.html()); page.pageContent = pageContent;//from w w w.j av a 2 s . c om // Images. Elements images = body.select("wikiimages").first().select("wikiimage"); for (Element imageElement : images) { WikiImage image = new WikiImage(); String acKey = imageElement.select("ac|image").first().outerHtml(); image.filename = imageElement.attr("pageid") + "/" + imageElement.attr("filename"); page.images.put(acKey, image); } return page; }
From source file:org.metaservice.demo.wordpress.WordpressParser.java
@Override public List<VersionEntry> parse(Reader s, ArchiveAddress archiveParameters) throws ParserException { try {//from w w w. j a va 2s. c o m Document document = Jsoup.parse(IOUtils.toString(s), "http://wordpress.org/download/release-archive/"); ArrayList<VersionEntry> result = new ArrayList<>(); Elements tables = document.select("table.widefat"); for (Element table : tables) { Elements rows = table.select("tr"); // System.err.println(rows); for (Element row : rows) { Elements columns = row.select("td"); if (columns.size() > 0) { VersionEntry versionEntry = new VersionEntry(); versionEntry.setName(columns.get(0).text().trim()); versionEntry.setZip(columns.select("a[href$=zip]").attr("href")); versionEntry.setTar(columns.select("a[href$=tar.gz]").attr("href")); versionEntry.setIis(columns.select("a[href$=IIS.zip]").attr("href")); result.add(versionEntry); } } } return result; } catch (IOException e) { throw new ParserException(e); } }