List of usage examples for org.jsoup.nodes Element attr
public String attr(String attributeKey)
From source file:mml.handler.post.MMLPostHTMLHandler.java
/**
 * Fallback handler for an element that is not a span, p or div.
 * This may happen but should not.
 * <p>
 * A range is registered in {@code stil}, named after the element's class
 * attribute, or after its node name when the class is empty. Text-node
 * children are appended verbatim to {@code sb}; element children are handled
 * recursively by this same method. Finally the range length is set to the
 * amount of text appended.
 *
 * @param elem an element that is not a span, p or div
 * @throws JSONException declared by the original signature; presumably raised
 *         by the range bookkeeping - confirm against Range/stil
 */
private void parseOtherElement(Element elem) throws JSONException {
    final int start = sb.length();
    final String cls = elem.attr("class");
    final String rangeName = (cls == null || cls.isEmpty()) ? elem.nodeName() : cls;
    final Range range = new Range(rangeName, start, 0);
    stil.add(range);
    for (Node kid : elem.childNodes()) {
        if (kid instanceof TextNode) {
            // keep the text exactly as it appears in the source, whitespace included
            sb.append(((TextNode) kid).getWholeText());
        } else if (kid instanceof Element) {
            parseOtherElement((Element) kid);
        }
    }
    stil.updateLen(range, sb.length() - start);
    prevWasMilestone = false;
}
From source file:net.GoTicketing.GoTicketing.java
/** * ??//from www .j a va2 s .c o m * @throws Exception */ private void praseVoiceCaptchaSrc() throws Exception { Document doc = Jsoup.parse(TicketingPageHTML); Element voc = doc.getElementsByTag("audio").last(); if (voc == null) throw new Exception("Can't get voice captcha source !"); //out.println(host + voc.attr("src").substring(1)); VoiceCaptchaSrc = host + voc.attr("src").substring(1); }
From source file:mml.handler.post.MMLPostHTMLHandler.java
/**
 * Parse a div (section).
 * <p>
 * Registers a range in {@code stil} named after the div's class attribute
 * ("section" when the class is empty), then dispatches every child element
 * by tag name: p and h&lt;digit&gt; go to {@code parsePara}, span to
 * {@code parseSpan}, pre to {@code parsePre}, and anything else to
 * {@code parseOtherElement}. Children that are not elements are skipped.
 *
 * @param div the div
 * @throws JSONException
 */
private void parseDiv(Element div) throws JSONException {
    final int start = sb.length();
    final String cls = div.attr("class");
    final String rangeName = (cls == null || cls.isEmpty()) ? "section" : cls;
    final Range range = new Range(rangeName, start, 0);
    stil.add(range);

    for (Node node : div.childNodes()) {
        if (!(node instanceof Element)) {
            continue;
        }
        final Element kid = (Element) node;
        final String tag = node.nodeName().toLowerCase();
        if ("p".equals(tag)) {
            parsePara(kid, "p");
        } else if (tag.matches("(h|H)\\d")) {
            parsePara(kid, tag);
        } else if ("span".equals(tag)) {
            parseSpan(kid);
        } else if ("pre".equals(tag)) {
            parsePre(kid);
        } else {
            parseOtherElement(kid);
        }
    }

    ensure(3, true);
    stil.updateLen(range, sb.length() - start);
    prevWasMilestone = false;
}
From source file:com.weavers.duqhan.business.impl.ProductServiceImpl.java
@Override public List<StatusBean> getTempProductLinks(String link) { boolean status = true; //success String startDate = new Date().toString(); Logger.getLogger(ProductServiceImpl.class.getName()).log(Level.SEVERE, "(==I==)DATE: " + startDate + "Product link collection start.....\n For the link ( " + link + " )"); Elements productUrlList = null; List<StatusBean> statusBeans = new ArrayList<>(); // Elements nexturl = null; boolean contd = true; String productList = link /*"https://www.aliexpress.com/wholesale?minPrice=&maxPrice=&isBigSale=n&isFreeShip=y&isFavorite=all&isMobileExclusive=n&isLocalReturn=n&shipFromCountry=&shipCompanies=&SearchText=jwelry+for+women&CatId=1509&g=y&initiative_id=SB_20170330225112&needQuery=n&isrefine=y"*/; Temtproductlinklist temtproductlinklist; Temtproductlinklist savedTemtproductlinklist; String nexturl = null;/*from ww w . j av a 2 s . co m*/ String firstPart = null; String secondPart = null; int[] pageNumber = new int[199]; Random randomObj1 = new Random(); for (int i = 0; i < 198; i++) { pageNumber[i] = (randomObj1.ints(2, 200).findFirst().getAsInt()); } try { Document doc = Jsoup.connect(productList).get(); productUrlList = doc.select("div.ui-pagination-navi a"); if (!productUrlList.isEmpty()) { nexturl = productUrlList.get(0).attr("abs:href"); firstPart = nexturl.split(".html")[0]; firstPart = firstPart.substring(0, firstPart.length() - 1); secondPart = nexturl.split(".html")[1]; secondPart = ".html" + secondPart; for (int i = 0; i < 198; i++) { nexturl = firstPart + pageNumber[i] + secondPart; doc = Jsoup.connect(nexturl).get(); productUrlList = doc.select(".son-list .list-item .pic a[href]"); //=================== Random sleep START ===================// Random randomObj = new Random(); TimeUnit.SECONDS.sleep(randomObj.ints(30, 60).findFirst().getAsInt()); //=================== Random sleep END =====================// if (!productUrlList.isEmpty()) { for (Element element : productUrlList) { temtproductlinklist = 
temtproductlinklistDao .getTemtproductlinklistByLink(element.attr("abs:href")); if (temtproductlinklist == null) { StatusBean statusBean = new StatusBean(); temtproductlinklist = new Temtproductlinklist(); temtproductlinklist.setLink(element.attr("abs:href")); temtproductlinklist.setStatus(0); //System.out.println("element.toString()" + element.attr("abs:href")); savedTemtproductlinklist = temtproductlinklistDao.save(temtproductlinklist); statusBean.setStatus(String.valueOf(savedTemtproductlinklist.getStatus())); statusBean.setStatusCode(savedTemtproductlinklist.getLink()); statusBean.setId(savedTemtproductlinklist.getId()); statusBeans.add(statusBean); } } } } } } catch (Exception ex) { status = false; //failure System.out.println("(=============================================)DATE: " + new Date().toString() + "Product link collection get exception.....\n Which started on: " + startDate + "\n" + ex.getLocalizedMessage()); Logger.getLogger(ProductServiceImpl.class.getName()).log(Level.SEVERE, "(==E==)DATE: " + new Date().toString() + "Product link collection get exception.....\n Which started on: " + startDate + "\n", ex); String body = "DATE: " + new Date().toString() + "Product link collection get exception.....\nNext link not found.\n Which started on: " + startDate; // MailSender.sendEmail("krisanu.nandi@pkweb.in", "Error", body, "subhendu.sett@pkweb.in"); } if (status) { System.out.println("=============================================DATE: " + new Date().toString() + "Product link collection end.....\n Which started on: " + startDate); Logger.getLogger(ProductServiceImpl.class.getName()).log(Level.SEVERE, "(==I==)DATE: " + new Date().toString() + "Product link collection end.....\n Which started on: " + startDate); String body = "DATE: " + new Date().toString() + "Product link collection end.....\n Which started on: " + startDate; // MailSender.sendEmail("krisanu.nandi@pkweb.in", "Success", body, "subhendu.sett@pkweb.in"); } return statusBeans; }
From source file:mml.handler.post.MMLPostHTMLHandler.java
/** * Parse a span with a class or not/* ww w . java2 s .c om*/ * @param span the span in HTML */ private void parseSpan(Element span) throws JSONException { if (span.hasText()) { int offset = sb.length(); String name = span.attr("class"); Range r = new Range(name, offset, 0); if (name == null || name.length() == 0) name = "span"; if (isMilestone(name)) { pages.add(r); sb.append(span.text()); sb.append("\n"); pages.updateLen(r, sb.length() - offset); prevWasMilestone = true; } else if (name.equals("soft-hyphen")) { stil.add(r); // get previous word int i = sb.length() - 1; while (i > 0 && !Character.isWhitespace(sb.charAt(i))) i--; if (i > 0) i++; String prev = clean(sb.substring(i), true); // get next word String next = clean(nextWord(span), false); if (this.speller.isHardHyphen(prev, next)) r.name = "hard-hyphen"; sb.append(span.text()); stil.updateLen(r, sb.length() - offset); } else // span may contain other spans { stil.add(r); List<Node> children = span.childNodes(); for (Node child : children) { if (child instanceof Element) { String nName = child.nodeName().toLowerCase(); if (nName.equals("span")) parseSpan((Element) child); else parseOtherElement((Element) child); } else if (child instanceof TextNode) { TextNode tn = (TextNode) child; sb.append(tn.text()); } } if (isLineFormat(name)) ensure(1, false); stil.updateLen(r, sb.length() - offset); } } // else strangely no text: ignore it }
From source file:com.zacwolf.commons.email.Email.java
private void prepare(final org.jsoup.nodes.Document doc) { removeComments(doc);//Remove any comments from the html of the message to reduce the size //Change the title to match the subject of the email if (doc.getElementsByTag("title").size() > 0) doc.getElementsByTag("title").first().html(getSubject()); //Replace the contents of any tags with class="date" with the current date if (doc.getElementsByClass("date").size() > 0) { for (org.jsoup.nodes.Element datelem : doc.getElementsByClass("date")) { SimpleDateFormat df = new SimpleDateFormat("MMMMMMMMMM d, yyyy"); if (datelem.hasAttr("format")) { try { df = new SimpleDateFormat(datelem.attr("format")); } catch (Exception ee) { } //throw it away and just go back to the default format; datelem.html(df.format(TimeUtils.getGMTtime())); }//from w w w . jav a 2 s. co m } } //tables need the border-spacing: style attribute; added for GMail compatiblity for (org.jsoup.nodes.Element tbl : doc.getElementsByTag("table")) if (!tbl.attr("style").contains("border-spacing:")) tbl.attr("style", tbl.attr("style") + (!tbl.attr("style").endsWith(";") ? ";" : "") + "border-spacing:0;"); }
From source file:com.ferasinfotech.gwreader.ScreenSlidePageFragment.java
/** * Alternate Factory method for this fragment class. Constructs a new fragment for the given page number, * and HTML story element./*from w w w . j av a 2 s . c o m*/ */ public static ScreenSlidePageFragment create(int pageNumber, int numPages, org.jsoup.nodes.Element story) { int story_id = -1; String name = ""; String summary = ""; String headline = ""; String cover_photo_url = ""; String story_string = ""; long createdAt; ScreenSlidePageFragment fragment = new ScreenSlidePageFragment(); Bundle args = new Bundle(); if (pageNumber == 0) { story_id = 0; name = "Grasswire Help"; headline = "Usage Instructions"; cover_photo_url = "android.resource://com.ferasinfotech.gwreader/" + R.drawable.gw_logo; summary = "Swipe right and left to read each story.\n\n" + "Scroll down to read facts and associated news items (tweets and links) for each story.\n\n" + "Tap on a news items within a story and you'll be able to follow web links, view tweets via the Twitter app, or watch videos.\n\n" + "A long press on a story's cover photo will launch the device browser to view or edit the story on the Grasswire mobile site.\n\n" + "A long press on the image above will launch the Grasswire main page.\n\n" + "App Version: " + BuildConfig.VERSION_NAME + "\n\n"; } else { // doing a story page, Element 'story' is the story data Elements e_list; org.jsoup.nodes.Element tag; story_id = Integer.valueOf(story.attr("data-story-id")); e_list = story.getElementsByClass("feature__tag"); tag = e_list.get(0); name = tag.text() + " (" + pageNumber + "/" + numPages + ")"; e_list = story.getElementsByClass("story__summary"); tag = e_list.get(0); summary = tag.html().replace("<br />", "\r"); e_list = story.getElementsByClass("feature__text"); tag = e_list.get(0); headline = tag.text(); e_list = story.getElementsByClass("feature__image"); tag = e_list.get(0); cover_photo_url = tag.attr("src"); story_string = story.toString(); } args.putInt(ARG_PAGE, pageNumber); args.putInt(ARG_STORY_ID, story_id); 
args.putString(ARG_TITLE, name); args.putString(ARG_SUMMARY, summary); args.putString(ARG_HEADLINE, headline); args.putString(ARG_COVER_PHOTO, cover_photo_url); args.putString(ARG_STORY_STRING, "<html><head></head><body>" + story_string + "</body></html>"); fragment.setArguments(args); return fragment; }
From source file:com.digitalpebble.storm.crawler.bolt.JSoupParserBolt.java
@Override public void execute(Tuple tuple) { byte[] content = tuple.getBinaryByField("content"); String url = tuple.getStringByField("url"); Metadata metadata = (Metadata) tuple.getValueByField("metadata"); // check that its content type is HTML // look at value found in HTTP headers boolean CT_OK = false; String httpCT = metadata.getFirstValue(HttpHeaders.CONTENT_TYPE); if (StringUtils.isNotBlank(httpCT)) { if (httpCT.toLowerCase().contains("html")) { CT_OK = true;/*from www . jav a 2 s . c o m*/ } } // simply ignore cases where the content type has not been set // TODO sniff content with Tika? else { CT_OK = true; } if (!CT_OK) { String errorMessage = "Exception content-type " + httpCT + " for " + url; RuntimeException e = new RuntimeException(errorMessage); handleException(url, e, metadata, tuple, "content-type checking", errorMessage); return; } LOG.info("Parsing : starting {}", url); long start = System.currentTimeMillis(); String charset = getContentCharset(content, metadata); // get the robots tags from the fetch metadata RobotsTags robotsTags = new RobotsTags(metadata); Map<String, List<String>> slinks; String text; DocumentFragment fragment; try (ByteArrayInputStream bais = new ByteArrayInputStream(content)) { org.jsoup.nodes.Document jsoupDoc = Jsoup.parse(bais, charset, url); fragment = JSoupDOMBuilder.jsoup2HTML(jsoupDoc); // extracts the robots directives from the meta tags robotsTags.extractMetaTags(fragment); // store a normalised representation in metadata // so that the indexer is aware of it robotsTags.normaliseToMetadata(metadata); // do not extract the links if no follow has been set // and we are in strict mode if (robotsTags.isNoFollow() && robots_noFollow_strict) { slinks = new HashMap<String, List<String>>(0); } else { Elements links = jsoupDoc.select("a[href]"); slinks = new HashMap<String, List<String>>(links.size()); for (Element link : links) { // abs:href tells jsoup to return fully qualified domains // for // relative urls. 
// e.g.: /foo will resolve to http://shopstyle.com/foo String targetURL = link.attr("abs:href"); // nofollow boolean noFollow = "nofollow".equalsIgnoreCase(link.attr("rel")); // remove altogether if (noFollow && robots_noFollow_strict) { continue; } // link not specifically marked as no follow // but whole page is if (!noFollow && robotsTags.isNoFollow()) { noFollow = true; } String anchor = link.text(); if (StringUtils.isNotBlank(targetURL)) { // any existing anchors for the same target? List<String> anchors = slinks.get(targetURL); if (anchors == null) { anchors = new LinkedList<String>(); slinks.put(targetURL, anchors); } // track the anchors only if no follow is false if (!noFollow && StringUtils.isNotBlank(anchor)) { anchors.add(anchor); } } } } text = jsoupDoc.body().text(); } catch (Throwable e) { String errorMessage = "Exception while parsing " + url + ": " + e; handleException(url, e, metadata, tuple, "content parsing", errorMessage); return; } // store identified charset in md metadata.setValue("parse.Content-Encoding", charset); long duration = System.currentTimeMillis() - start; LOG.info("Parsed {} in {} msec", url, duration); List<Outlink> outlinks = toOutlinks(url, metadata, slinks); ParseResult parse = new ParseResult(); parse.setOutlinks(outlinks); // parse data of the parent URL ParseData parseData = parse.get(url); parseData.setMetadata(metadata); parseData.setText(text); parseData.setContent(content); // apply the parse filters if any try { parseFilters.filter(url, content, fragment, parse); } catch (RuntimeException e) { String errorMessage = "Exception while running parse filters on " + url + ": " + e; handleException(url, e, metadata, tuple, "content filtering", errorMessage); return; } if (emitOutlinks) { for (Outlink outlink : outlinks) { collector.emit(StatusStreamName, tuple, new Values(outlink.getTargetURL(), outlink.getMetadata(), Status.DISCOVERED)); } } // emit each document/subdocument in the ParseResult object // there should be at 
least one ParseData item for the "parent" URL for (Map.Entry<String, ParseData> doc : parse) { ParseData parseDoc = doc.getValue(); collector.emit(tuple, new Values(doc.getKey(), parseDoc.getContent(), parseDoc.getMetadata(), parseDoc.getText())); } collector.ack(tuple); eventCounter.scope("tuple_success").incr(); }
From source file:com.fluidops.iwb.provider.HTMLProvider.java
@Override public void gather(List<Statement> res) throws Exception { String url = config.url;/*w ww . j a va 2 s . c o m*/ Document doc = Jsoup.connect(url).get(); Elements links = doc.select("a[href]"); Elements media = doc.select("[src]"); Elements imports = doc.select("link[href]"); // Elements article = // doc.select("div.wrapper").select("div.box-shadow").select("div#content.cols").select("div.cl").select("div.crm").select("article").select("section.article").select("div.textblock").select("table"); Elements article = doc.getElementsByTag("tbody").select("tr"); Elements tableElem; URI nameURI = null; URI roadsURI = null; URI sideURI = null; URI totalURI = null; File file = new File("HTMLdata.txt"); PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(file))); out.println("Media"); print("\nMedia: (%d)", media.size()); for (Element el : media) { if (el.tagName().equals("img")) { print(" * %s: <%s> %sx%s (%s)", el.tagName(), el.attr("abs:src"), el.attr("width"), el.attr("height"), trim(el.attr("alt"), 20)); out.printf(" \n * %s: <%s> %sx%s (%s)", el.tagName(), el.attr("abs:src"), el.attr("width"), el.attr("height"), trim(el.attr("alt"), 20)); out.println(); } else { print(" * %s: <%s>", el.tagName(), el.attr("abs:src")); out.printf(" \n * %s: <%s>", el.tagName(), el.attr("abs:src")); out.println(); } } out.println("Imports"); print("\nImports: (%d)", imports.size()); for (Element link : imports) { print(" * %s <%s> (%s)", link.tagName(), link.attr("abs:href"), link.attr("rel")); out.printf(" * %s <%s> (%s)", link.tagName(), link.attr("abs:href"), link.attr("rel")); out.println(); } out.println("Links"); print("\nLinks: (%d)", links.size()); for (Element link : links) { print(" * a: <%s> (%s)", link.attr("abs:href"), trim(link.text(), 35)); out.printf(" * a: <%s> (%s)", link.attr("abs:href"), link.text()); out.println(); } /* * out.println("Custom text"); print("\nCustom: (%d)",customArt.size()); * for (Element custom:customArt){ * out.printf(" * a 
(%s): (%s)",custom.tagName(),custom.text()); * out.println(); } */ out.println("Article"); print("\nArticle: (%d)", article.size()); for (int i = 3; i < article.size() - 2; i++) { tableElem = article.get(i).select("td"); out.println(); if (i == 3) { nameURI = ProviderUtils.objectToUri(tableElem.get(0).text()); roadsURI = ProviderUtils.objectToUri(tableElem.get(1).text()); sideURI = ProviderUtils.objectToUri(tableElem.get(2).text()); totalURI = ProviderUtils.objectToUri(tableElem.get(3).text()); } else { res.add(ProviderUtils.createStatement(ProviderUtils.objectToUri(tableElem.get(0).text()), RDF.TYPE, nameURI)); res.add(ProviderUtils.createLiteralStatement(ProviderUtils.objectToUri(tableElem.get(0).text()), RDFS.LABEL, tableElem.get(0).text())); res.add(ProviderUtils.createLiteralStatement(ProviderUtils.objectToUri(tableElem.get(0).text()), roadsURI, tableElem.get(1).text())); res.add(ProviderUtils.createLiteralStatement(ProviderUtils.objectToUri(tableElem.get(0).text()), sideURI, tableElem.get(2).text())); res.add(ProviderUtils.createLiteralStatement(ProviderUtils.objectToUri(tableElem.get(0).text()), totalURI, tableElem.get(3).text())); for (Element el : tableElem) { out.printf("\n * (%s): (%s)", el.tagName(), el.text()); out.println(); } } out.println(); out.printf("\n * a (%s) (%d): (%s)", article.get(i).tagName(), tableElem.size(), article.get(i).text()); out.println(); } out.close(); }
From source file:de.geeksfactory.opacclient.apis.Open.java
private String getCoverUrl(Element img) { String[] parts = img.attr("sources").split("\\|"); // Example: SetSimpleCover|a|https://vlb.de/GetBlob.aspx?strIsbn=9783868511291& // size=S|a|http://www.buchhandel.de/default.aspx?strframe=titelsuche& // caller=vlbPublic&func=DirectIsbnSearch&isbn=9783868511291& // nSiteId=11|c|SetNoCover|a|/DesktopModules/OCLC.OPEN.PL.DNN // .BaseLibrary/StyleSheets/Images/Fallbacks/emptyURL.gif?4.2.0.0|a| for (int i = 0; i + 2 < parts.length; i++) { if (parts[i].equals("SetSimpleCover")) { String url = parts[i + 2].replace("&", "&"); try { HttpURLConnection conn = (HttpURLConnection) new URL(url).openConnection(); conn.setRequestMethod("HEAD"); int code = conn.getResponseCode(); if (code == 200) { return url; }/*from ww w . j a v a 2 s. co m*/ } catch (IOException e) { e.printStackTrace(); } } } return null; }