List of usage examples for the org.jsoup.nodes.Element method select
public Elements select(String cssQuery)
From source file:edu.rowan.app.carousel.CarouselFetch.java
@Override protected CarouselFeature[] doInBackground(Void... params) { String rowanURL = "http://rowan.edu"; ArrayList<CarouselFeature> cfeatures = new ArrayList<CarouselFeature>(); long lastUpdated = prefs.getLong(LAST_UPDATE, -1); if (lastUpdated > 0) { long timeDiff = Calendar.getInstance().getTimeInMillis() - lastUpdated; int hours = (int) (timeDiff / (60 * 60 * 1000)); if (hours < UPDATE_INTERVAL) { // just load saved features cfeatures.addAll(loadFeaturesFromPreferences()); // System.out.println("Loaded features from prefernces"); return cfeatures.toArray(new CarouselFeature[cfeatures.size()]); }/*from www.ja v a 2s . c om*/ } // ELSE: Attempt to update // but check if we have available connection try { // Download + Parse Rowan's homepage for features //Toast.makeText(context, "Updating CarouselView", Toast.LENGTH_SHORT).show(); DOUH CAN"T DO THIS Document document = Jsoup.connect(rowanURL).get(); Elements features = document.select(".feature "); for (Element feature : features) { String title = feature.select(".title a span").first().text(); String description = feature.select(".description a").first().text(); Element link = feature.select("a").first(); String linkURL = link.attr("abs:href"); String imageURL = link.select("img").first().attr("abs:src"); CarouselFeature cFeature = new CarouselFeature(title, description, linkURL, imageURL, RECEIVER, context); cfeatures.add(cFeature); } saveDataToPreferences(cfeatures); } catch (IOException e1) { e1.printStackTrace(); return null; } return cfeatures.toArray(new CarouselFeature[cfeatures.size()]); }
From source file:com.mycompany.grabberrasskazov.threads.ThreadForPageSave.java
public void indexStory(String pageUrl) { try {//w w w. j av a2 s . co m String oldId = pageUrl.replace(GlobalVars.mainSite, ""); if (!mainBean.storyExists(oldId)) { Stories r = new Stories(); Document doc = Jsoup.connect(pageUrl) .userAgent("Opera/9.80 (X11; Linux x86_64) " + "Presto/2.12.388 Version/12.16").get(); Elements nameBlockElements = doc.select("b:containsOwn(?)"); Element nameBlock = nameBlockElements.get(0); nameBlock = nameBlock.parent().parent(); nameBlockElements = nameBlock.select("td:eq(1)"); nameBlock = nameBlockElements.get(0); String storyName = nameBlock.text(); r.setStoryName(storyName); // Start of processing writer Elements writerBlockElements = doc.select("b:containsOwn(?:)"); Element writerBlock = writerBlockElements.get(0); writerBlock = writerBlock.parent().parent(); writerBlockElements = writerBlock.select("td:eq(1)"); writerBlock = writerBlockElements.get(0); String writersUrl = writerBlock.select("a:eq(0)").attr("href"); String writersName = writerBlock.select("a:eq(0)").text(); String writersContacts = writerBlock.select("a:eq(1)").attr("href"); StoryWriters storyWriter = new StoryWriters(); storyWriter.setOldId(writersUrl); storyWriter.setWriterEmail(writersContacts); storyWriter.setWriterName(writersName); storyWriter = mainBean.saveWriter(storyWriter); Set<StoriesToWritersRelations> storiesToWritersRelationses = new HashSet<StoriesToWritersRelations>(); StoriesToWritersRelations storiesToWritersRelations = new StoriesToWritersRelations(); storiesToWritersRelations.setStories(r); storiesToWritersRelations.setStoryWriters(storyWriter); r.setStoriesToWritersRelationses(storiesToWritersRelationses); // End of processing writer Set<StoriesToCategoriessRelations> catsRelationses = new HashSet<>(); Elements katsInfo = doc.select("a[href*=ras.shtml?kat]"); for (Element kat : katsInfo) { String katId = kat.attr("href"); StoryCategories cat = mainBean.getCat(katId); StoriesToCategoriessRelations catsRelations = new 
StoriesToCategoriessRelations(); catsRelations.setStoryCategories(cat); catsRelations.setStories(r); catsRelationses.add(catsRelations); } r.setStoriesToCategoriessRelationses(catsRelationses); Elements textBlocks = doc.select("p[align=justify]"); Element textBlock = textBlocks.get(0); String textStr = textBlock.html(); r.setStoryText(textStr.replace("\"", "'")); r.setOldId(oldId); mainBean.saveStory(r); } } catch (IOException ex) { ex.printStackTrace(); } }
From source file:cn.cuizuoli.appranking.service.GooglePlayService.java
/**
 * Fetches a Google Play ranking page and converts each card on it into an {@link AppRanking}.
 *
 * <p>Only feed types whose media type is {@code MediaType.GOOGLE} are processed. The URL comes
 * from {@code getHotUrl} for {@code Category.ALL} and from {@code getUrl} otherwise. Any
 * exception is logged and the entries collected so far are returned.
 *
 * @param feedType the feed to load
 * @param category the category to load
 * @return the rankings in page order, numbered from 1; empty when nothing was loaded
 */
public List<AppRanking> getAppRankingList(FeedType feedType, Category category) {
    List<AppRanking> appRankingList = new ArrayList<AppRanking>();
    try {
        if (feedType.getMediaType() != MediaType.GOOGLE) {
            return appRankingList;
        }

        String url = (category == Category.ALL) ? getHotUrl(feedType) : getUrl(feedType, category);
        log.info("Google Play -> " + url);
        if (!StringUtils.isNotBlank(url)) {
            return appRankingList;
        }

        Document doc = appRankingRestTemplate.getForObject(url, Document.class);
        int rank = 0;
        for (Element card : doc.select(".card-list>.card")) {
            rank++;

            Elements titleNode = card.select(".details .title");
            Elements coverNode = card.select(".cover .cover-image");

            String appId = card.attr("data-docid");
            String name = titleNode.attr("title");
            String uri = titleNode.attr("href");
            String artist = card.select(".details .subtitle").attr("title");
            String price = card.select(".details button.price.buy>span").text();
            String smallCover = coverNode.attr("data-cover-small");
            String largeCover = coverNode.attr("data-cover-large");

            AppRanking entry = new AppRanking();
            entry.setAppId(appId);
            entry.setDeviceType(DeviceType.ANDROID);
            entry.setCountry(Country.JAPAN);
            entry.setMediaType(MediaType.GOOGLE);
            entry.setFeedType(feedType);
            entry.setRanking(rank);
            entry.setTitle(name + " - " + artist);
            entry.setCategory(category.getCode());
            entry.setUri(GOOGLE_PLAY_DOMAIN + uri);
            entry.setName(name);
            entry.setArtist(artist);
            entry.setPrice(price);
            // The small cover feeds both of the smaller image slots.
            entry.setImage53(smallCover);
            entry.setImage75(smallCover);
            entry.setImage100(largeCover);
            appRankingList.add(entry);
        }
    } catch (HttpStatusCodeException e) {
        log.error(ExceptionUtils.getFullStackTrace(e));
    } catch (Exception e) {
        log.error(ExceptionUtils.getFullStackTrace(e));
    }
    return appRankingList;
}
From source file:me.vertretungsplan.parser.ESchoolParser.java
private void parseTable(Element table, SubstitutionScheduleDay day) { for (Element th : table.select("th[colspan=10]")) { String lesson;//from w w w . j ava 2 s. c o m Pattern pattern = Pattern.compile("(\\d+)\\. Stunde"); Matcher matcher = pattern.matcher(th.text()); if (matcher.find()) { lesson = matcher.group(1); } else { lesson = th.text(); } // skip over table headers Element row = th.parent().nextElementSibling().nextElementSibling(); while (row != null && row.select("th").size() == 0) { Substitution subst = new Substitution(); subst.setLesson(lesson); Elements columns = row.select("td"); String[] classes = columns.get(0).text().split(", |\\+"); subst.setClasses(new HashSet<>(Arrays.asList(classes))); subst.setPreviousTeacher(getPreviousValue(columns.get(1))); subst.setTeacher(getNewValue(columns.get(1))); subst.setPreviousSubject(getPreviousValue(columns.get(2))); subst.setSubject(getNewValue(columns.get(2))); subst.setPreviousRoom(getPreviousValue(columns.get(3))); subst.setRoom(getNewValue(columns.get(3))); if (columns.get(4).text().isEmpty()) { subst.setType("Vertretung"); subst.setColor(colorProvider.getColor("Vertretung")); } else { String desc = columns.get(4).text(); subst.setDesc(desc); String recognizedType = recognizeType(desc); if (recognizedType == null) recognizedType = "Vertretung"; subst.setType(recognizedType); subst.setColor(colorProvider.getColor(recognizedType)); } day.addSubstitution(subst); row = row.nextElementSibling(); } } }
From source file:org.apache.marmotta.ldclient.provider.phpbb.PHPBBTopicProvider.java
/** * Return a mapping table mapping from RDF properties to XPath Value Mappers. Each entry in the map is evaluated * in turn; in case the XPath expression yields a result, the property is added for the processed resource. * * @return/*from ww w. j a v a 2 s. com*/ * @param requestUrl */ @Override protected Map<String, JSoupMapper> getMappings(String resource, String requestUrl) { URI uri = null; try { uri = new URI(requestUrl); Map<String, String> params = new HashMap<String, String>(); for (NameValuePair p : URLEncodedUtils.parse(uri, "UTF-8")) { params.put(p.getName(), p.getValue()); } if (params.containsKey("t")) { Map<String, JSoupMapper> postMappings = new HashMap<String, JSoupMapper>(); if (params.containsKey("start")) { // when start is set, we only take the replies; we are in a second or further page of the topic postMappings.put(Namespaces.NS_SIOC + "container_of", new PHPBBPostIdMapper("div#pagecontent table td.gensmall a[name]")); } else { // otherwise we also take the initial title, creator and date for the topic postMappings.put(Namespaces.NS_DC + "title", new CssTextLiteralMapper("div#pageheader a.titles")); postMappings.put(Namespaces.NS_DC + "creator", new CssTextLiteralMapper(new CssSelectorMapper.Selector() { @Override public Elements select(Element node) { final Element first = node.select("div#pagecontent table b.postauthor").first(); if (first != null) return new Elements(first); return new Elements(); } })); postMappings.put(Namespaces.NS_DC + "date", new PHPBBDateMapper("div#pagecontent table td.gensmall div") { @Override public Elements select(Element htmlDoc) { final Elements sel = super.select(htmlDoc); if (sel.size() > 0) { final Element e = sel.get(1); if (e != null) return new Elements(e); } return new Elements(); } }); postMappings.put(Namespaces.NS_SIOC + "has_container", new PHPBBForumHrefMapper("p.breadcrumbs a") { @Override public Elements select(Element htmlDoc) { final Element select = super.select(htmlDoc).last(); return select 
!= null ? new Elements(select) : new Elements(); } }); postMappings.put(Namespaces.NS_SIOC + "container_of", new PHPBBPostIdMapper("div#pagecontent table td.gensmall a[name]")); } return postMappings; } else throw new RuntimeException( "the requested resource does not seem to identify a PHPBB topic (t=... parameter missing)"); } catch (URISyntaxException e) { throw new RuntimeException( "the requested resource does not seem to identify a PHPBB topic (URI syntax error)"); } }
From source file:me.vertretungsplan.parser.UntisCommonParser.java
static String findLastChange(Element doc, SubstitutionScheduleData scheduleData) { String lastChange = null;//from w ww.j a v a2 s. c om boolean lastChangeLeft = false; if (scheduleData != null) { if (scheduleData.getData().has("stand_links")) { // backwards compatibility lastChangeLeft = scheduleData.getData().optBoolean("stand_links", false); } else { lastChangeLeft = scheduleData.getData().optBoolean(PARAM_LAST_CHANGE_LEFT, false); } } if (doc.select("table.mon_head").size() > 0) { Element monHead = doc.select("table.mon_head").first(); lastChange = findLastChangeFromMonHeadTable(monHead); } else if (lastChangeLeft) { final String bodyHtml = doc.select("body").size() > 0 ? doc.select("body").html() : doc.html(); lastChange = bodyHtml.substring(0, bodyHtml.indexOf("<p>") - 1); } else { List<Node> childNodes; if (doc instanceof Document) { childNodes = ((Document) doc).body().childNodes(); } else { childNodes = doc.childNodes(); } for (Node node : childNodes) { if (node instanceof Comment) { Comment comment = (Comment) node; if (comment.getData().contains("<table class=\"mon_head\">")) { Document commentedDoc = Jsoup.parse(comment.getData()); Element monHead = commentedDoc.select("table.mon_head").first(); lastChange = findLastChangeFromMonHeadTable(monHead); break; } } } } return lastChange; }
From source file:com.glluch.profilesparser.ProfileHtmlReader.java
private ArrayList<String> ul2array(Element list) { Elements llist = list.select("li"); ArrayList<String> l = new ArrayList<>(); for (Element li : llist) { l.add(li.ownText());//from ww w.jav a 2 s.c o m } return l; }
From source file:de.geeksfactory.opacclient.apis.TouchPoint.java
static List<LentItem> parse_medialist(Document doc) { List<LentItem> media = new ArrayList<>(); Elements copytrs = doc.select(".data tr"); DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN); int trs = copytrs.size(); if (trs == 1) { return null; }/* w w w. j a v a 2 s . c o m*/ assert (trs > 0); for (int i = 1; i < trs; i++) { Element tr = copytrs.get(i); LentItem item = new LentItem(); if (tr.text().contains("keine Daten")) { return null; } item.setTitle(tr.select(".account-display-title").select("b, strong").text().trim()); try { item.setAuthor(tr.select(".account-display-title").html().split("<br[ /]*>")[1].trim()); String[] col3split = tr.select(".account-display-state").html().split("<br[ /]*>"); String deadline = Jsoup.parse(col3split[0].trim()).text().trim(); if (deadline.contains(":")) { // BSB Munich: <span class="hidden-sm hidden-md hidden-lg">Flligkeitsdatum : </span>26.02.2016<br> deadline = deadline.split(":")[1].trim(); } if (deadline.contains("-")) { // Chemnitz: 22.07.2015 - 20.10.2015<br> deadline = deadline.split("-")[1].trim(); } try { item.setDeadline(fmt.parseLocalDate(deadline).toString()); } catch (IllegalArgumentException e1) { e1.printStackTrace(); } if (col3split.length > 1) item.setHomeBranch(col3split[1].trim()); if (tr.select("a").size() > 0) { for (Element link : tr.select("a")) { String href = link.attr("abs:href"); Map<String, String> hrefq = getQueryParamsFirst(href); if (hrefq.get("methodToCall").equals("renewal")) { item.setProlongData(href.split("\\?")[1]); item.setRenewable(true); break; } } } } catch (Exception ex) { ex.printStackTrace(); } media.add(item); } return media; }
From source file:com.crawler.app.run.JellyfishCrawlerSiteCB.java
/** * This function is called when a page is fetched and ready to be processed * by your program.//from w ww .java 2s.c o m */ @Override public void visit(Page page) { String url = page.getWebURL().getURL(); //logger.info("URL: ", url); String host = "127.0.0.1"; String port = "3306"; String dbName = "crawler"; String dbUser = "root"; String dbPwd = ""; MysqlCrawler.createConn(host, port, dbName, dbUser, dbPwd); System.out.println("\n URL visit: " + url); String href = url.toLowerCase(); if (href.startsWith("http://careerbuilder.vn/vi/tim-viec-lam") && href.endsWith(".html")) { if (page.getParseData() instanceof HtmlParseData) { HtmlParseData htmlParseData = (HtmlParseData) page.getParseData(); String text = htmlParseData.getText(); String html = htmlParseData.getHtml(); String title = htmlParseData.getTitle(); Document doc = Jsoup.parse(html, "UTF-8"); //doc.outputSettings().escapeMode(EscapeMode.xhtml); Element body = doc.body(); Elements listDetail = body.select("section div[class=MyJobLeft]"); String jobUrl = url; String jobName = listDetail.select("h1").html(); String companyName = listDetail.select("div[class=tit_company]").html(); String jobLocation = listDetail.select( "div[class=box2Detail] ul[class=DetailJobNew] p[class=fl_left] b[itemprop=jobLocation] a") .html(); String companyAddress = listDetail .select("div[class=box1Detail] p[class=TitleDetailNew] label[itemprop=addressLocality]") .html(); String companyContact = listDetail .select("div[class=box1Detail] p[class=TitleDetailNew] label strong").html(); String companyPhone = listDetail .select("div[class=col-lg-6 col-md-6 col-sm-12] p[id=company_contact]").html(); String companyWebsite = listDetail .select("div[class=col-lg-6 col-md-6 col-sm-12] p a[id=company_website]").html(); if (listDetail.isEmpty() || jobName.isEmpty()) { listDetail = body.select("div[id=main_content] div[id=main_content_right]"); jobName = listDetail.select("h1 p").html(); companyName = listDetail.select("div[class=intro_company] 
div[class=title_into] p").html(); jobLocation = listDetail .select("div[class=intro_job] ul[class=left_380] p[itemprop=jobLocation] a").html(); if (listDetail.isEmpty() || jobName.isEmpty()) { listDetail = body.select("div[id=main_content] div[class=content_right]"); jobName = listDetail.select("h1").html(); companyName = listDetail .select("div[class=intro_company] div[class=title_into] p[class=title_comp]") .html(); Elements gCompanyWebList = listDetail .select("div[class=intro_company] div[class=title_into] p"); if (!gCompanyWebList.isEmpty() && gCompanyWebList.size() > 1) companyWebsite = gCompanyWebList.get(1).html(); jobLocation = listDetail .select("div[class=intro_job] ul[class=left_380] p[itemprop=jobLocation] a").html(); } } jobName = listDetail.select("h1 a").html(); if (jobName.isEmpty()) jobName = listDetail.select("h1 p").html(); if (jobName.isEmpty()) jobName = listDetail.select("h1").html(); System.out.println("\n Title : " + jobName); try { Integer siteID = 3; //String companyWebsite = ""; /* MysqlCrawler.getInstance().insertJFHRContents( siteID , jobUrl , jobName , jobLocation , companyName , companyAddress , companyPhone , companyContact , companyWebsite); */ //System.exit(1); } catch (Exception ex) { //System.out.println("\n Fail I : " + i); System.out.println("\n Ex : " + ex); } } } /* Header[] responseHeaders = page.getFetchResponseHeaders(); if (responseHeaders != null) { logger.debug("Response headers:"); for (Header header : responseHeaders) { logger.debug("\t{}: {}", header.getName(), header.getValue()); } } */ logger.debug("============="); }
From source file:virgil.meanback.HistoryInfo.java
/** * * @param url//from ww w . j av a 2 s .com * @return * @throws Exception */ @SuppressWarnings("") public Stock parse(String url) throws Exception { java.util.logging.Logger.getLogger("com.gargoylesoftware.htmlunit").setLevel(Level.OFF); Stock stock = new Stock(); List<DayInfo> list = new ArrayList<>(); /** * HtmlUnitweb? */ WebClient wc = new WebClient(BrowserVersion.CHROME); wc.getOptions().setUseInsecureSSL(true); wc.getOptions().setJavaScriptEnabled(true); // ?JStrue wc.getOptions().setCssEnabled(false); // ?css? wc.getOptions().setThrowExceptionOnScriptError(false); // js?? wc.getOptions().setTimeout(50000); // 10S0? wc.getOptions().setDoNotTrackEnabled(false); HtmlPage page = wc.getPage(url); HtmlElement documentElement = page.getDocumentElement(); Document doc = Jsoup.parse(documentElement.asXml()); String name = doc.select("#BIZ_IS_Name").text(); String code = doc.select(".BIZ_IS_price_id span").text(); code = code.substring(code.indexOf("(") + 2, code.length() - 1); Elements els = doc.select("#BIZ_hq_historySearch tbody tr"); stock.setCode(code); stock.setName(name); int count = 0; for (Element el : els) { if (!el.html().contains("sum")) { DayInfo dayInfo = new DayInfo(); String dateString = el.select("td.e1").text(); SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd"); Date date = format.parse(dateString); String open = el.select("td").eq(1).text(); String close = el.select("td").eq(2).text(); double cd = Double.parseDouble(close); String low = el.select("td").eq(5).text(); String high = el.select("td").eq(6).text(); String volume = el.select("td").eq(7).text(); dayInfo.setClose(close); dayInfo.setDateString(dateString); dayInfo.setHigh(high); dayInfo.setLow(low); dayInfo.setOpen(open); dayInfo.setVolume(volume); dayInfo.setDate(date); list.add(dayInfo); count++; if (list.size() > 79) { break; } } } stock.setList(list); return stock; }