List of usage examples for the org.jsoup.nodes.Element method attr
public String attr(String attributeKey)
From source file:net.meiolania.apps.habrahabr.fragments.qa.loader.QaLoader.java
@Override public ArrayList<QaData> loadInBackground() { ArrayList<QaData> data = new ArrayList<QaData>(); try {//www. ja v a 2s . co m String readyUrl = url.replace("%page%", String.valueOf(page)); Log.i(TAG, "Loading a page: " + readyUrl); Document document = Jsoup.connect(readyUrl).get(); Elements qaList = document.select("div.post"); for (Element qa : qaList) { QaData qaData = new QaData(); Element title = qa.select("a.post_title").first(); Element hubs = qa.select("div.hubs").first(); Element answers = qa.select("div.informative").first(); Element date = qa.select("div.published").first(); Element author = qa.select("div.author > a").first(); Element score = qa.select("span.score").first(); qaData.setTitle(title.text()); qaData.setUrl(title.attr("abs:href")); qaData.setHubs(hubs.text()); qaData.setAnswers(answers.text()); qaData.setDate(date.text()); qaData.setAuthor(author.text()); qaData.setScore(score.text()); data.add(qaData); } } catch (IOException e) { } return data; }
From source file:net.meiolania.apps.habrahabr.fragments.users.loader.UsersLoader.java
@Override public ArrayList<UsersData> loadInBackground() { ArrayList<UsersData> data = new ArrayList<UsersData>(); try {//from w ww . j a v a 2 s . c om Log.i(TAG, "Loading a page: " + url); Document document = Jsoup.connect(url).get(); Elements users = document.select("div.user"); for (Element user : users) { UsersData usersData = new UsersData(); Element rating = user.select("div.rating").first(); Element karma = user.select("div.karma").first(); Element avatar = user.select("div.avatar > a > img").first(); Element name = user.select("div.userlogin > div.username > a").first(); Element lifetime = user.select("div.info > div.lifetime").first(); usersData.setName(name.text()); usersData.setUrl(name.attr("abs:href")); usersData.setRating(rating.text()); usersData.setKarma(karma.text()); usersData.setAvatar(avatar.attr("src")); usersData.setLifetime(lifetime.text()); data.add(usersData); } } catch (IOException e) { } return data; }
From source file:net.meiolania.apps.habrahabr.fragments.users.loader.UsersShowLoader.java
@Override public UsersFullData loadInBackground() { UsersFullData data = new UsersFullData(); try {//from ww w . ja v a 2 s . c o m Log.i(TAG, "Loading a page: " + url); Document document = Jsoup.connect(url).get(); Element avatar = document.select("a.avatar > img").first(); Element karma = document.select("div.karma > div.score > div.num").first(); Element rating = document.select("div.rating > div.num").first(); Element birthday = document.select("dd.bday").first(); Element fullname = document.select("div.fullname").first(); Element summary = document.select("dd.summary").first(); Element interests = document.select("dl.interests > dd").first(); data.setAvatar(avatar.attr("src")); data.setKarma(karma.text()); data.setRating(rating.text()); data.setBirthday(birthday != null ? birthday.text() : ""); data.setFullname(fullname != null ? fullname.text() : ""); data.setSummary(summary != null ? summary.text() : ""); data.setInterests(interests != null ? interests.text() : ""); } catch (IOException e) { } return data; }
From source file:net.trustie.model.OpenHubProject_Model.java
private void handleQuickRef(Element quickRef) { Elements itemNames = quickRef.select("dt"); Elements itemValues = quickRef.select("dd"); Element e = null;// www . j a va2 s . co m Element eValue = null; for (int i = 0; i < itemNames.size(); i++) { e = itemNames.get(i); eValue = itemValues.get(i); String refName = e.text(); switch (refName) { case "Organization:": { this.organization = eValue.text(); break; } case "Project Links:": { Elements links = eValue.select("a"); String[] tmp = new String[links.size()]; Element ele = null; for (int j = 0; j < links.size(); j++) { ele = links.get(j); tmp[j] = ele.text() + Seperator.SOURCE_SEPERATOR + ele.attr("href"); } this.projectLinks = StringUtils.join(tmp, Seperator.OSSEAN_SEPERATOR); break; } case "Code Locations:": { Elements locs = eValue.select("a"); if (locs.size() == 0) { this.codeLocation = eValue.text(); } else { Element link = locs.get(0); this.codeLocation = link.text() + Seperator.SOURCE_SEPERATOR + link.attr("href"); } break; } case "Licenses:": { Elements links = eValue.select("a"); List<String> listLicenses = new ArrayList<String>(); // String[] tmp = new String[links.size()]; for (int j = 0; j < links.size(); j++) { listLicenses.add(links.get(j).text()); // tmp[j] = links.get(j).text(); } this.licenses = StringHandler.combineTags(listLicenses); break; } case "Similar Projects:": { // System.out.println(eValue); Elements projects = eValue.select("td[width=49%]"); // System.out.println(projects.size()); String[] tmp = new String[projects.size()]; Element ele = null; for (int j = 0; j < projects.size(); j++) { ele = projects.get(j); Element project = ele.select("a").get(0); tmp[j] = project.text() + Seperator.SOURCE_SEPERATOR + project.attr("href"); } this.similarProjects = StringUtils.join(tmp, Seperator.OSSEAN_SEPERATOR); break; } case "Managers:": { if ("Become the first manager for BugSystem".equals(eValue.text())) { break; } Elements users = eValue.select("a"); String[] tmp = new String[users.size()]; 
Element ele = null; for (int j = 0; j < users.size(); j++) { ele = users.get(j); tmp[j] = ele.text() + Seperator.SOURCE_SEPERATOR + ele.attr("href"); } this.managers = StringUtils.join(tmp, Seperator.OSSEAN_SEPERATOR); break; } default: { break; } } } }
From source file:net.trustie.model.SFProject_Model.java
/**
 * Post-processing hook that completes this model from the crawled page.
 *
 * <p>Records the page URL, its MD5, the collection timestamp and a digest over
 * {@code urlMd5 + lastUpdate + feature + downloadCount + stars}. When the document has a
 * {@code body}, pages whose body id is {@code pg_project} are dispatched on the body class to
 * the matching extractor and their time fields are normalised; any other page has its name,
 * description and features read directly from the body. In both cases the two time fields are
 * finally passed through {@code DateHandler.formatAllTypeDate}.
 *
 * @param page the crawled page supplying the URL, the parsed HTML and the crawl time
 */
public void afterProcess(Page page) {
    this.url = page.getPageUrl();
    this.urlMd5 = DigestUtils.md5Hex(page.getPageUrl());
    this.collectTime = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date());
    this.pageMd5 = DigestUtils.md5Hex(urlMd5 + lastUpdate + feature + downloadCount + stars);

    Document doc = page.getHtml().getDocument();
    Elements bodyEles = doc.select("body");
    if (bodyEles.isEmpty()) {
        // Nothing below applies to a document without a body.
        return;
    }

    Element body = bodyEles.get(0);
    if ("pg_project".equals(body.attr("id"))) {
        // The body class tells which page layout is being parsed.
        switch (body.attr("class")) {
        case "bluesteel user":
            extractPageBluesteelUser(doc);
            break;
        case "enterprise user":
            extractPageEnterpriseUser(doc);
            break;
        default:
            // Other layouts need no dedicated extraction.
            break;
        }

        // Values containing "ago" are handed to getTime (presumably to turn a relative time
        // into an absolute one - verify against getTime).
        if (lastUpdate.contains("ago")) {
            this.lastUpdate = getTime(lastUpdate);
        }
        if (registeredTime.contains("ago")) {
            this.registeredTime = getTime(registeredTime);
        }
        // Empty values are replaced by an all-zero timestamp.
        if (lastUpdate.isEmpty()) {
            this.lastUpdate = "0000-00-00 00:00:00";
        }
        if (registeredTime.isEmpty()) {
            this.registeredTime = "0000-00-00 00:00:00";
        }
    } else {
        this.name = body.select("div#proj_header div.proj-title h2").text();
        this.desc = body.select("div#top_left div#home_intro div#proj-overview p").text();
        this.feature = body.select("div#top_left div#home_intro div#proj-overview ul").text();
    }

    this.lastUpdate = DateHandler.formatAllTypeDate(lastUpdate, page.getTime());
    this.registeredTime = DateHandler.formatAllTypeDate(registeredTime, page.getTime());
}
From source file:nl.surfsara.warcexamples.datascience.WordCountMapper.java
@Override public void map(LongWritable key, WarcRecord value, Context context) throws IOException, InterruptedException { context.setStatus(Counters.CURRENT_RECORD + ": " + key.get()); // Only process http response content. Note that the outlinks can also be found in the wat metadata. if ("application/http; msgtype=response".equals(value.header.contentTypeStr)) { // org.jwat.warc.WarcRecord is kind enough to also parse http headers for us: HttpHeader httpHeader = value.getHttpHeader(); if (httpHeader == null) { // No header so we are unsure that the content is text/html: NOP } else {//from w w w.j a v a2 s . c o m if (httpHeader.contentType != null && httpHeader.contentType.contains("text/html")) { // Note that if you really want to do this right; you should look at the character encoding as well. // We'll leave that as an exercise for you ;-). context.getCounter(Counters.NUM_HTTP_RESPONSE_RECORDS).increment(1); // Get the html payload Payload payload = value.getPayload(); if (payload == null) { // NOP } else { String warcContent = IOUtils.toString(payload.getInputStreamComplete()); if (warcContent == null && "".equals(warcContent)) { // NOP } else { try { //Remove all HTML from warcContent, right now it seems to empty the pages completely. Please do test for yourself //warcContent = Jsoup.parse(warcContent).text(); } catch (Exception e) { } String targetURI = value.header.warcTargetUriStr; //Write Word Count Mapping context.write(new Text(targetURI), new Text(parseToString(countWords(warcContent)))); //Write Links Document doc = Jsoup.parse(warcContent); Elements links = doc.select("a"); for (Element link : links) { String absHref = link.attr("abs:href"); // Omit nulls and empty strings if (absHref != null && !("".equals(absHref))) { context.write(new Text(targetURI), new Text(absHref)); } } } } } } } }
From source file:nl.surfsara.warcexamples.hadoop.warc.HrefExtracter.java
@Override public void map(LongWritable key, WarcRecord value, Context context) throws IOException, InterruptedException { context.setStatus(Counters.CURRENT_RECORD + ": " + key.get()); // Only process http response content. Note that the outlinks can also be found in the wat metadata. if ("application/http; msgtype=response".equals(value.header.contentTypeStr)) { // org.jwat.warc.WarcRecord is kind enough to also parse http headers for us: HttpHeader httpHeader = value.getHttpHeader(); if (httpHeader == null) { // No header so we are unsure that the content is text/html: NOP } else {// ww w . j a v a 2 s . c om if (httpHeader.contentType != null && httpHeader.contentType.contains("text/html")) { // Note that if you really want to do this right; you should look at the character encoding as well. // We'll leave that as an exercise for you ;-). context.getCounter(Counters.NUM_HTTP_RESPONSE_RECORDS).increment(1); // Get the html payload Payload payload = value.getPayload(); if (payload == null) { // NOP } else { String warcContent = IOUtils.toString(payload.getInputStreamComplete()); if (warcContent == null || "".equals(warcContent)) { // NOP } else { String targetURI = value.header.warcTargetUriStr; Document doc = Jsoup.parse(warcContent); Elements links = doc.select("a"); for (Element link : links) { String absHref = link.attr("abs:href"); // Omit nulls and empty strings if (absHref != null && !("".equals(absHref))) { context.write(new Text(targetURI), new Text(absHref)); } } } } } } } }
From source file:no.kantega.publishing.admin.content.htmlfilter.CleanupFormHtmlFilter.java
/**
 * Gives every {@code input} element whose {@code type} attribute is blank the type
 * {@code text}.
 *
 * @param document the document to clean up; modified in place
 * @return the same document instance
 */
@Override
public Document runFilter(Document document) {
    for (Element field : document.getElementsByTag("input")) {
        if (!isBlank(field.attr("type"))) {
            // An explicit type is left untouched.
            continue;
        }
        field.attr("type", "text");
    }
    return document;
}
From source file:no.kantega.publishing.admin.content.htmlfilter.ContextPathFilter.java
private void fixContextPathForAttribute(Elements elements, String attribute) { for (Element element : elements) { String attributeValue = element.attr(attribute); if (isNotBlank(attributeValue)) { if (attributeValue.startsWith("../")) { attributeValue = rootUrlToken + "/" + attributeValue .substring(attributeValue.lastIndexOf("../") + 3, attributeValue.length()); element.attr(attribute, attributeValue); }//from w w w .ja v a 2s . c o m if (contextPath.length() > 0) { if (attributeValue.startsWith(contextPath + "/")) { attributeValue = rootUrlToken + attributeValue.substring(contextPath.length(), attributeValue.length()); element.attr(attribute, attributeValue); } } } } }
From source file:no.kantega.publishing.admin.content.htmlfilter.ConvertUnderlineToEditorStyleFilter.java
/**
 * Turns every {@code span} whose inline style declares {@code text-decoration: underline}
 * into a {@code u} element, dropping the span's {@code style} attribute in the process.
 *
 * @param document the document to convert; modified in place
 * @return the same document instance
 */
@Override
public Document runFilter(Document document) {
    for (Element candidate : document.getElementsByTag("span")) {
        String inlineStyle = candidate.attr("style");
        // Short-circuit keeps the sub-attribute lookup away from blank styles.
        boolean underlined = isNotBlank(inlineStyle)
                && "underline".equalsIgnoreCase(getSubAttributeValue(inlineStyle, "text-decoration"));
        if (underlined) {
            candidate.removeAttr("style");
            candidate.tagName("u");
        }
    }
    return document;
}