List of usage examples for org.jsoup.nodes Element attr
public String attr(String attributeKey)
From source file:edu.ucla.cs.scai.swim.qa.ontology.dbpedia.DBpediaOntologyOld.java
/**
 * Populates {@code category} from the direct children of {@code e} and recurses into nested lists.
 *
 * <p>An {@code a} child with a non-empty {@code href} supplies the label (its text) and the URI
 * ({@code CLASSES_BASE_URI} followed by that text); the category is then registered in
 * {@code map} under its label and the pair is printed to standard output. For a {@code ul}
 * child, every {@code li} inside it becomes a new {@link DBpediaCategory} that is traversed
 * recursively and afterwards linked to {@code category} as parent and sub-class.
 *
 * @param e        element whose direct children are inspected
 * @param category category that is filled in from those children
 * @param map      index from label to category, extended with every labelled category
 */
private void traverseHierarchy(Element e, DBpediaCategory category, HashMap<String, DBpediaCategory> map) {
    for (Element child : e.children()) {
        String tag = child.tag().getName();

        if (tag.equals("a")) {
            String href = child.attr("href");
            if (href == null || href.length() == 0) {
                continue;
            }
            String linkText = child.text();
            String uri = CLASSES_BASE_URI + linkText;
            category.setLabel(linkText);
            category.setUri(uri);
            map.put(category.getLabel(), category);
            System.out.println(linkText + "\t" + uri);
            continue;
        }

        if (!tag.equals("ul")) {
            continue;
        }

        for (Element item : child.children()) {
            if (!item.tagName().equals("li")) {
                continue;
            }
            DBpediaCategory subCategory = new DBpediaCategory();
            traverseHierarchy(item, subCategory, map);
            // The parent set is (re)created only after the subtree has been traversed.
            subCategory.parents = new HashSet<>();
            subCategory.parents.add(category);
            category.getSubClasses().add(subCategory);
        }
    }
}
From source file:qhindex.controller.SearchAuthorWorksController.java
/**
 * Builds an {@link AuthorWork} from one row of an author's list of works.
 *
 * <p>The title and work URL come from the first {@code td.gsc_a_t > a} link. When at least two
 * {@code td.gsc_a_t > div} elements are present, the first is used as the authors text and the
 * second as the publisher text. When a {@code td.gsc_a_c > a} link is present, its {@code href}
 * is stored as the citations URL and its text is parsed as the citation count; a count that
 * cannot be parsed is reported and left at 0.
 *
 * @param authorWorkElements the row element describing a single work
 * @return the populated work
 * @throws IOException declared by the signature; presumably propagated from
 *                     {@code handlePublicationMedium} - confirm against that method
 */
private AuthorWork extractAuthorWorkData(Element authorWorkElements) throws IOException {
    AuthorWork work = new AuthorWork();

    Element titleLink = authorWorkElements.select("td.gsc_a_t > a").get(0);
    work.setTitle(titleLink.text());
    String workUrl = titleLink.attr("href");

    Elements details = authorWorkElements.select("td.gsc_a_t > div");
    if (details.size() > 1) {
        String publisherInGoogle = details.get(1).text();
        work.setPublisherInGoogle(publisherInGoogle);
        work.setPublisher(handlePublicationMedium(publisherInGoogle, workUrl));
        work.setAuthors(details.get(0).text());
    }

    Elements citationLinks = authorWorkElements.select("td.gsc_a_c > a");
    if (citationLinks.isEmpty()) {
        return work;
    }

    Element citationLink = citationLinks.get(0);
    work.setCitationsUrl(citationLink.attr("href"));

    int citationCount = 0;
    try {
        String citationText = citationLink.text();
        if (!citationText.isEmpty()) {
            citationCount = Integer.parseInt(citationText);
        }
    } catch (Exception ex) {
        Debug.print("Exception while extracting author work data: " + ex.toString());
        resultsMsg += "Exception while extracting author work data.\n";
    }
    work.setCitations(citationCount);
    return work;
}
From source file:cn.wanghaomiao.xpath.core.XpathEvaluator.java
/** * ?xpath/*ww w . ja v a 2 s . c om*/ * * @param xpath * @param root * @return */ public List<JXNode> evaluate(String xpath, Elements root) throws NoSuchAxisException, NoSuchFunctionException { List<JXNode> res = new LinkedList<JXNode>(); Elements context = root; List<Node> xpathNodes = getXpathNodeTree(xpath); for (int i = 0; i < xpathNodes.size(); i++) { Node n = xpathNodes.get(i); LinkedList<Element> contextTmp = new LinkedList<Element>(); if (n.getScopeEm() == ScopeEm.RECURSIVE || n.getScopeEm() == ScopeEm.CURREC) { if (n.getTagName().startsWith("@")) { for (Element e : context) { //? String key = n.getTagName().substring(1); if (key.equals("*")) { res.add(JXNode.t(e.attributes().toString())); } else { String value = e.attr(key); if (StringUtils.isNotBlank(value)) { res.add(JXNode.t(value)); } } //?? for (Element dep : e.getAllElements()) { if (key.equals("*")) { res.add(JXNode.t(dep.attributes().toString())); } else { String value = dep.attr(key); if (StringUtils.isNotBlank(value)) { res.add(JXNode.t(value)); } } } } } else if (n.getTagName().endsWith("()")) { //??text() res.add(JXNode.t(context.text())); } else { Elements searchRes = context.select(n.getTagName()); for (Element e : searchRes) { Element filterR = filter(e, n); if (filterR != null) { contextTmp.add(filterR); } } context = new Elements(contextTmp); if (i == xpathNodes.size() - 1) { for (Element e : contextTmp) { res.add(JXNode.e(e)); } } } } else { if (n.getTagName().startsWith("@")) { for (Element e : context) { String key = n.getTagName().substring(1); if (key.equals("*")) { res.add(JXNode.t(e.attributes().toString())); } else { String value = e.attr(key); if (StringUtils.isNotBlank(value)) { res.add(JXNode.t(value)); } } } } else if (n.getTagName().endsWith("()")) { res = (List<JXNode>) callFunc(n.getTagName().substring(0, n.getTagName().length() - 2), context); } else { for (Element e : context) { Elements filterScope = e.children(); if (StringUtils.isNotBlank(n.getAxis())) { filterScope 
= getAxisScopeEls(n.getAxis(), e); } for (Element chi : filterScope) { Element fchi = filter(chi, n); if (fchi != null) { contextTmp.add(fchi); } } } context = new Elements(contextTmp); if (i == xpathNodes.size() - 1) { for (Element e : contextTmp) { res.add(JXNode.e(e)); } } } } } return res; }
From source file:gov.medicaid.screening.dao.impl.SocialWorkLicenseDAOBean.java
/** * Retrieves all results from the source site. * * @param searchCriteria the search criteria. * @return the providers matched/*from w w w . j a v a 2 s.com*/ * @throws URISyntaxException if the URL could not be correctly constructed * @throws IOException for any I/O related errors * @throws ServiceException for any other errors encountered */ private SearchResult<License> getAllResults(SocialWorkCriteria searchCriteria) throws URISyntaxException, IOException, ServiceException { DefaultHttpClient client = new DefaultHttpClient(getLaxSSLConnectionManager()); client.setRedirectStrategy(new LaxRedirectStrategy()); HttpGet getSearchPage = new HttpGet(new URIBuilder(getSearchURL()).build()); HttpResponse response = client.execute(getSearchPage); verifyAndAuditCall(getSearchURL(), response); Document page = Jsoup.parse(EntityUtils.toString(response.getEntity())); String licenseNo = ""; if (searchCriteria instanceof SocialWorkLicenseSearchByLicenseNumberCriteria) { licenseNo = "" + ((SocialWorkLicenseSearchByLicenseNumberCriteria) searchCriteria).getLicenseNumber(); } String level = "none"; if (searchCriteria.getLevel() != null) { level = Util.defaultString(searchCriteria.getLevel().getName()); } HttpPost search = new HttpPost(new URIBuilder(getSearchURL()).build()); HttpEntity entity = postForm(getSearchURL(), client, search, buildParams(searchCriteria, page, licenseNo, level, null), true); page = Jsoup.parse(EntityUtils.toString(entity)); List<License> allLicenses = new ArrayList<License>(); // check if detail page (single match) if (page.select("#lblFormTitle").text().equals("License Details")) { allLicenses.add(parseLicenseDetail(page)); } else { Elements rows = page.select(RESULT_ROWS_SELECTOR); while (rows.size() > 0) { for (Element row : rows) { License license = parseLicense(row.children()); if (license != null) { allLicenses.add(license); } } rows.clear(); // check for next page Element currentPage = page.select("#_ctl7_grdSearchResults tr.TablePager 
span").first(); getLog().log(Level.DEBUG, "Current page is: " + currentPage.text()); Element pageLink = currentPage.nextElementSibling(); if (pageLink != null && pageLink.hasAttr("href")) { getLog().log(Level.DEBUG, "There are more results, getting the next page."); String target = parseEventTarget(pageLink.attr("href")); entity = postForm(getSearchURL(), client, search, buildParams(searchCriteria, page, licenseNo, level, target), true); page = Jsoup.parse(EntityUtils.toString(entity)); rows = page.select(RESULT_ROWS_SELECTOR); } } } SearchResult<License> results = new SearchResult<License>(); results.setItems(allLicenses); return results; }
From source file:com.gumtreescraper.scraper.GumtreeScraper.java
public void scrapeWithJSoup(List<Gumtree> gumtrees, String url) throws IOException { // openSite(url); // waitForPageToLoad(); String nextPageUrl = url;/*ww w .ja v a 2 s . c o m*/ boolean needContinue = true; do { try { Document doc = Jsoup.connect(nextPageUrl).timeout(getTimeout() * 1000).userAgent("Mozilla") // .userAgent("Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36") .get(); Elements adElements = doc.select("#srchrslt-adtable > li"); int size = adElements.size(); for (int i = 0; i < size; i++) { Element ad = adElements.get(i); if (!isOwner(ad)) { continue; } Element linkElement = ad.select("h6.rs-ad-title > a").first(); if (linkElement == null) { System.out.print(ad); continue; } String adUrl = linkElement.attr("href"); Gumtree gumtree = new Gumtree(); gumtree.setUrl(BASE_URL + adUrl); gumtrees.add(gumtree); if (i == size - 1) { // last element Elements adDateElements = ad.select("div.rs-ad-date"); if (adDateElements.isEmpty()) { continue; } if (!needToScrapeNextPage(adDateElements.first().text().trim())) { needContinue = false; } } } Elements nextElements = doc.select("a.rs-paginator-btn.next"); if (nextElements.isEmpty()) { break; } nextPageUrl = BASE_URL + nextElements.first().attr("href"); System.out.println("next page: " + nextPageUrl); } catch (Exception oex) { System.out.println(oex); } } while (true && needContinue); }
From source file:net.GoTicketing.GoTicketing.java
/** * ??//from w w w . j a va 2s . co m * @throws Exception */ private void praseImageCaptchaSrc() throws Exception { Document doc = Jsoup.parse(TicketingPageHTML); Element img = doc.getElementById("idRandomPic"); if (img == null) throw new Exception("Can't get image captcha source !"); //out.println(host + img.attr("src")); ImageCaptchaSrc = host + img.attr("src"); }
From source file:se.vgregion.portal.iframe.controller.CSViewController.java
/**
 * Adds the {@code loginForm:j_idcl} entry to the dynamic field values.
 *
 * <p>The value is the {@code id} attribute of the button that
 * {@code findButtonWithIdWhichStartsWith} returns for the prefix {@code loginForm:j_idt} in
 * the dynamic-fields document of the given portlet configuration.
 *
 * @param dynamicFieldValueMap map that receives the additional field
 * @param portletConfig        configuration used to obtain the dynamic-fields document
 * @throws Exception declared by the signature; presumably propagated from the document lookup
 */
private void addSpecialFieldForRaindance(Map<String, String> dynamicFieldValueMap, PortletConfig portletConfig)
        throws Exception {
    Document fieldsDocument = getDynamicFieldsDocument(portletConfig);
    Element loginButton = findButtonWithIdWhichStartsWith(fieldsDocument, "loginForm:j_idt");
    String buttonId = loginButton.attr("id");
    dynamicFieldValueMap.put("loginForm:j_idcl", buttonId);
}
From source file:mml.handler.post.MMLPostHTMLHandler.java
/** * Parse a paragraph. These may be "p" or "hN" elements, often with classes * @param p the paragraph/heading element from the document fragment * @param defaultName the default name for the property *///from w w w . j av a2 s . c o m private void parsePara(Element p, String defaultName) throws JSONException { List<Node> children = p.childNodes(); String name = p.attr("class"); if (name == null || name.length() == 0) name = defaultName; if (isLineFormat(name) || prevWasMilestone) ensure(1, false); else ensure(2, true); int offset = sb.length(); Range r = new Range(name, offset, 0); stil.add(r); for (Node child : children) { if (child instanceof Element) { String nName = child.nodeName().toLowerCase(); if (nName.equals("span")) parseSpan((Element) child); else parseOtherElement((Element) child); } else if (child instanceof TextNode) { TextNode tn = (TextNode) child; sb.append(tn.getWholeText()); } } if (isLineFormat(name)) ensure(1, true); else ensure(2, true); this.stil.updateLen(r, sb.length() - offset); prevWasMilestone = false; }
From source file:mml.handler.post.MMLPostHTMLHandler.java
/** * Parse a codeblock/* w w w . j a v a 2 s . c o m*/ * @param elem the element to parse * @throws a JSON exception */ private void parsePre(Element elem) throws JSONException { if (elem.hasText()) { int offset = sb.length(); String name = elem.attr("class"); if (name == null || name.length() == 0) name = "pre"; Range r = new Range(name, offset, 0); stil.add(r); if (elem.hasAttr("class")) { List<Node> children = elem.childNodes(); for (Node child : children) { if (child instanceof Element) { if (child.nodeName().equals("span")) parseSpan((Element) child); else parseOtherElement((Element) child); } else if (child instanceof TextNode) sb.append(((TextNode) child).getWholeText()); } } else sb.append(elem.text()); this.stil.updateLen(r, sb.length() - offset); } prevWasMilestone = false; ensure(1, false); }
From source file:com.github.binlee1990.spider.movie.spider.MovieCrawler.java
/**
 * Copies the cover image URL from the page onto the film.
 *
 * <p>Uses the {@code src} attribute of the first element matching {@code .fm-intro img[src]};
 * the film is left untouched when nothing matches or the attribute is blank.
 *
 * @param doc  parsed page to read the cover from
 * @param film film that receives the cover URL
 */
private void setFilmCover(Document doc, Film film) {
    Elements covers = doc.select(".fm-intro img[src]");
    if (CollectionUtils.isEmpty(covers)) {
        return;
    }

    String coverUrl = covers.get(0).attr("src");
    if (StringUtils.isNotBlank(coverUrl)) {
        film.setCoverUrl(coverUrl);
    }
}