List of usage examples for org.jsoup.nodes.Element.text()
public String text()
From source file:com.maxl.java.aips2xml.Aips2Xml.java
static String convertHtmlToXml(String med_title, String html_str, String regnr_str) { Document mDoc = Jsoup.parse(html_str); mDoc.outputSettings().escapeMode(EscapeMode.xhtml); mDoc.outputSettings().prettyPrint(true); mDoc.outputSettings().indentAmount(4); // <div id="monographie"> -> <fi> mDoc.select("div[id=monographie]").tagName("fi").removeAttr("id"); // <div class="MonTitle"> -> <title> mDoc.select("div[class=MonTitle]").tagName("title").removeAttr("class").removeAttr("id"); // Beautify the title to the best of my possibilities ... still not good enough! String title_str = mDoc.select("title").text().trim().replaceAll("<br />", "").replaceAll("(\\t|\\r?\\n)+", "");/*from ww w . j a va 2 s.com*/ if (!title_str.equals(med_title)) if (SHOW_ERRORS) System.err.println(med_title + " differs from " + title_str); // Fallback solution: use title from the header AIPS.xml file - the titles look all pretty good! mDoc.select("title").first().text(med_title); // <div class="ownerCompany"> -> <owner> Element owner_elem = mDoc.select("div[class=ownerCompany]").first(); if (owner_elem != null) { owner_elem.tagName("owner").removeAttr("class"); String owner_str = mDoc.select("owner").text(); mDoc.select("owner").first().text(owner_str); } else { mDoc.select("title").after("<owner></owner>"); if (DB_LANGUAGE.equals("de")) mDoc.select("owner").first().text("k.A."); else if (DB_LANGUAGE.equals("fr")) mDoc.select("owner").first().text("n.s."); } // <div class="paragraph"> -> <paragraph> mDoc.select("div[class=paragraph]").tagName("paragraph").removeAttr("class").removeAttr("id"); // <div class="absTitle"> -> <paragraphTitle> mDoc.select("div[class=absTitle]").tagName("paragraphtitle").removeAttr("class"); // <div class="untertitle1"> -> <paragraphSubTitle> mDoc.select("div[class=untertitle1]").tagName("paragraphsubtitle").removeAttr("class"); // <div class="untertitle"> -> <paragraphSubTitle> mDoc.select("div[class=untertitle]").tagName("paragraphsubtitle").removeAttr("class"); // 
<div class="shortCharacteristic"> -> <characteristic> mDoc.select("div[class=shortCharacteristic]").tagName("characteristic").removeAttr("class"); // <div class="image"> mDoc.select("div[class=image]").tagName("image").removeAttr("class"); // <p class="spacing1"> -> <p> / <p class="noSpacing"> -> <p> mDoc.select("p[class]").tagName("p").removeAttr("class"); // <span style="font-style:italic"> -> <i> mDoc.select("span").tagName("i").removeAttr("style"); // <i class="indention1"> -> <i> / <i class="indention2"> -> <b-i> mDoc.select("i[class=indention1]").tagName("i").removeAttr("class"); mDoc.select("i[class=indention2]").tagName("i").removeAttr("class"); // mDoc.select("p").select("i").tagName("i"); // mDoc.select("paragraphtitle").select("i").tagName("para-i"); // mDoc.select("paragraphsubtitle").select("i").tagName("parasub-i"); Elements elems = mDoc.select("paragraphtitle"); for (Element e : elems) { if (!e.text().isEmpty()) e.text(e.text()); } elems = mDoc.select("paragraphsubtitle"); for (Element e : elems) { if (!e.text().isEmpty()) e.text(e.text()); } // Here we take care of tables // <table class="s21"> -> <table> mDoc.select("table[class]").removeAttr("class"); mDoc.select("table").removeAttr("cellspacing").removeAttr("cellpadding").removeAttr("border"); mDoc.select("colgroup").remove(); mDoc.select("td").removeAttr("class").removeAttr("colspan").removeAttr("rowspan"); mDoc.select("tr").removeAttr("class"); elems = mDoc.select("div[class]"); for (Element e : elems) { if (e.text().isEmpty()) e.remove(); } mDoc.select("tbody").unwrap(); // Remove nested table (a nasty table-in-a-table Elements nested_table = mDoc.select("table").select("tr").select("td").select("table"); if (!nested_table.isEmpty()) { nested_table.select("table").unwrap(); } // Here we take care of the images mDoc.select("img").removeAttr("style").removeAttr("align").removeAttr("border"); // Subs and sups mDoc.select("sub[class]").tagName("sub").removeAttr("class"); 
mDoc.select("sup[class]").tagName("sup").removeAttr("class"); mDoc.select("td").select("sub").tagName("td-sub"); mDoc.select("td").select("sup").tagName("td-sup"); // Remove floating <td-sup> tags mDoc.select("p").select("td-sup").tagName("sup"); mDoc.select("p").select("td-sub").tagName("sub"); // Box mDoc.select("div[class=box]").tagName("box").removeAttr("class"); // Insert swissmedicno5 after <owner> tag mDoc.select("owner").after("<swissmedicno5></swissmedicno5"); mDoc.select("swissmedicno5").first().text(regnr_str); // Remove html, head and body tags String xml_str = mDoc.select("body").first().html(); //xml_str = xml_str.replaceAll("<tbody>", "").replaceAll("</tbody>", ""); xml_str = xml_str.replaceAll("<sup> </sup>", ""); xml_str = xml_str.replaceAll("<sub> </sub>", ""); xml_str = xml_str.replaceAll("<p> <i>", "<p><i>"); xml_str = xml_str.replaceAll("</p> </td>", "</p></td>"); xml_str = xml_str.replaceAll("<p> </p>", "<p></p>"); // MUST be improved, the space is not a real space!! xml_str = xml_str.replaceAll("", "- "); xml_str = xml_str.replaceAll("<br />", ""); xml_str = xml_str.replaceAll("(?m)^[ \t]*\r?\n", ""); // Remove multiple instances of <p></p> Scanner scanner = new Scanner(xml_str); String new_xml_str = ""; int counter = 0; while (scanner.hasNextLine()) { String line = scanner.nextLine(); if (line.trim().equals("<p></p>")) { counter++; } else counter = 0; if (counter < 3) new_xml_str += line; } scanner.close(); return new_xml_str; }
From source file:crawler.AScraper.java
@Transformer(inputChannel = "channel3", outputChannel = "channel4") public Artwork convert(Element payload) throws ParseException, MalformedURLException { Matcher m = patter.matcher(payload.text()); if (m.find()) { String year = m.group("year"); String month = m.group("month"); String day = m.group("day"); int id = Integer.parseInt(m.group("id")); String model = m.group("model").split("[\\s\\[\\]]")[0]; URL link = new URL(payload.attr("href")); DateFormat format = new SimpleDateFormat("yyyy-MM-dd"); format.setTimeZone(TimeZone.getTimeZone("GMT+8")); Date date = format.parse(String.format("%s-%s-%s", year, month, day)); String thread_title = payload.text(); return new Artwork(thread_title, id, -1, -1, null, link, null, model, date); } else {//from ww w .ja va2 s .c om LOG.error(payload.text()); return null; } }
From source file:io.seldon.importer.articles.dynamicextractors.FirstElementTextValueDynamicExtractor.java
@Override public String extract(AttributeDetail attributeDetail, String url, Document articleDoc) throws Exception { String attrib_value = null;// w w w . jav a2 s . c om if ((attributeDetail.extractor_args != null) && (attributeDetail.extractor_args.size() >= 1)) { String cssSelector = attributeDetail.extractor_args.get(0); Element element = articleDoc.select(cssSelector).first(); if (StringUtils.isNotBlank(cssSelector)) { if (element != null) { attrib_value = element.text(); } } } if ((attrib_value != null) && (attributeDetail.extractor_args != null) && (attributeDetail.extractor_args.size() >= 2)) { String regexSelector = attributeDetail.extractor_args.get(1); pattern = Pattern.compile(regexSelector); Matcher m = pattern.matcher(attrib_value); m.find(); attrib_value = m.group(1); } return attrib_value; }
From source file:org.brunocvcunha.taskerbox.impl.crawler.SniptAction.java
/**
 * Walks every anchor below {@code .grid-block} and triggers the action for each id that
 * {@code canAct} accepts.
 *
 * <p>The id is the anchor's {@code href} with the {@code http://snipt.org/} prefix removed; the
 * title passed on is {@code id + " - " + anchor text}. After each accepted id the handled set is
 * serialized and the task sleeps for {@code FETCH_INTERVAL}.
 *
 * @param entry document to scan
 */
@Override
public void action(final Document entry) {
    log.debug("Validating " + entry.title());

    for (final Element anchor : entry.select(".grid-block").select("a")) {
        final String id = anchor.attr("href").replace("http://snipt.org/", "");
        if (!canAct(id)) {
            continue;
        }

        final String title = id + " - " + anchor.text();
        addAct(id);
        spreadAction(id, title);
        serializeAlreadyAct();
        sleep(FETCH_INTERVAL);
    }
}
From source file:org.jasig.portlet.proxy.search.AnchorSearchStrategy.java
@Override public List<SearchResult> search(SearchRequest searchQuery, EventRequest request, Document document) { List<SearchResult> results = new ArrayList<SearchResult>(); final String[] whitelistRegexes = request.getPreferences().getValues("anchorWhitelistRegex", new String[] {}); String searchTerms = searchQuery.getSearchTerms().toLowerCase(); Elements links = document.select("a[href]"); for (Element link : links) { String linkUrl = link.attr("abs:href"); for (String searchTerm : searchTerms.split(" ")) { if (link.text().toLowerCase().contains(searchTerm)) { log.debug("found a match, term: [" + searchTerm + "], anchor URL: [" + linkUrl + "], anchor text: [" + link.text() + "]"); SearchResult result = new SearchResult(); result.setTitle(link.text()); result.setSummary(link.text()); PortletUrl pUrl = new PortletUrl(); pUrl.setPortletMode(PortletMode.VIEW.toString()); pUrl.setType(PortletUrlType.RENDER); pUrl.setWindowState(WindowState.MAXIMIZED.toString()); PortletUrlParameter param = new PortletUrlParameter(); param.setName("proxy.url"); param.getValue().add(linkUrl); pUrl.getParam().add(param); new SearchUtil().updateUrls(linkUrl, request, whitelistRegexes); result.setPortletUrl(pUrl); results.add(result);//from w ww .j a va 2 s .com } } } return results; }
From source file:com.normalexception.app.rx8club.task.AdminTask.java
/**
 * Posts the admin action selected by {@code doType}.
 *
 * <p>For {@code DELETE_THREAD} the {@code deleteResponse} is sent along; every other type sends
 * {@code null}. For {@code MOVE_THREAD} the response is additionally parsed: the thread title is
 * read from the {@code title} input and every {@code option} of the {@code destforumid} select is
 * stored in {@code selectOptions} (option text mapped to its integer value). I/O failures are
 * logged and otherwise ignored.
 *
 * @param params unused
 * @return always {@code null}
 */
@Override
protected Void doInBackground(Void... params) {
    try {
        Log.d(TAG, progressText.get(doType));

        if (this.doType != DELETE_THREAD) {
            HtmlFormUtils.adminTypePost(doType, token, thread, null);
        } else {
            HtmlFormUtils.adminTypePost(doType, token, thread, deleteResponse);
        }

        // Only a move needs the response: it carries the list of destination forums.
        if (this.doType != MOVE_THREAD) {
            return null;
        }

        final String responseUrl = HtmlFormUtils.getResponseUrl();
        Log.d(TAG, "Response: " + responseUrl);

        final Document responseDoc = Jsoup.parse(HtmlFormUtils.getResponseContent());
        threadTitle = HtmlFormUtils.getInputElementValueByName(responseDoc, "title");
        Log.d(TAG, "Thread Title: " + threadTitle);

        for (Element option : responseDoc.select("select[name=destforumid] > option")) {
            final String forumName = option.text();
            selectOptions.put(forumName, Integer.parseInt(option.attr("value")));
        }
        Log.d(TAG, "Parsed " + selectOptions.keySet().size() + " options");
    } catch (ClientProtocolException e) {
        Log.e(TAG, e.getMessage(), e);
    } catch (IOException e) {
        Log.e(TAG, e.getMessage(), e);
    }
    return null;
}
From source file:com.anhao.spring.service.impl.PhotosServiceImpl.java
private void getWallpaperTags(String wallpaperId) { String wallpaperUrl = "http://alpha.wallhaven.cc/wallpaper/" + wallpaperId; Document docDetails = getWallpaperHtmlDocument(wallpaperUrl); Elements Tags = docDetails.select("#tags li"); for (Element tag : Tags) { //iduuid ?wallhavenID String photosId = jobPhotosDAO.findByWallpaperId(wallpaperId); //tagUUID Element tagName = tag.select(".tagname").first(); String TagId = tagDAO.findByTagName(tagName.text()); System.out.println("wallpaperId:" + wallpaperId + "====tag name " + tagName.text()); PhotosTag photosTag = new PhotosTag(); photosTag.setPhotoId(photosId);// w ww. java 2s . com photosTag.setTagId(TagId); photostagDAO.add(photosTag); } }
From source file:com.isoftstone.proxy.api.sdk.KuaidailiProxySDK.java
private List<ProxyVo> parseHtml(Document doc) { Elements eles = doc.select("#list table tr"); List<ProxyVo> proxyList = new ArrayList<ProxyVo>(); for (int i = 1; i < eles.size(); i++) { Element ele = eles.get(i); Element ipEle = ele.select("td:eq(0)").first(); Element portEle = ele.select("td:eq(1)").first(); ProxyVo proxyVo = new ProxyVo(); proxyVo.setProxyIp(ipEle.text()); proxyVo.setProxyPort(Integer.parseInt(portEle.text())); proxyList.add(proxyVo);//www .j av a 2s . co m } return proxyList; }
From source file:lolth.autohome.buy.AutohomeBuyInfoListTaskFetch.java
@Override protected void parsePage(Document doc, FetchTask task) throws Exception { Elements lis = doc.select("li.price-item"); for (Element li : lis) { AutohomeBuyInfoBean bean = new AutohomeBuyInfoBean(); bean.setUrl(task.getUrl());// w w w . j a va 2 s . co m bean.setForumId(task.getExtra()); // post id Elements id = li.select("div.price-share a.share"); if (!id.isEmpty()) { String idStr = id.first().attr("data-target"); idStr = StringUtils.substringAfterLast(idStr, "_"); if (StringUtils.isBlank(idStr)) { continue; } bean.setId(idStr); } // Elements user = li.select("div.user-name a"); if (!user.isEmpty()) { String userUrl = user.first().absUrl("href"); String userId = StringUtils.substringAfterLast(userUrl, "/"); String userName = user.first().text(); bean.setUserId(userId); bean.setUserUrl(userUrl); bean.setUserName(userName); } // ? Elements postTime = li.select("div.user-name span"); if (!postTime.isEmpty()) { bean.setPostTime(StringUtils.trim(StringUtils.substringBefore(postTime.first().text(), "?"))); } Elements dataLis = li.select("div.price-item-bd li"); for (Element dataLi : dataLis) { String data = dataLi.text(); if (StringUtils.startsWith(data, "")) { bean.setCar(StringUtils.trim(StringUtils.substringAfter(data, ""))); } if (StringUtils.startsWith(data, "")) { bean.setPrice(StringUtils.trim(StringUtils.substringAfter(data, ""))); } if (StringUtils.startsWith(data, "")) { bean.setGuidePrice(StringUtils.trim(StringUtils.substringAfter(data, ""))); } if (StringUtils.startsWith(data, "?")) { bean.setTotalPrice(StringUtils.trim(StringUtils.substringAfter(data, ""))); } if (StringUtils.startsWith(data, "")) { bean.setPurchaseTax(StringUtils.trim(StringUtils.substringAfter(data, ""))); } if (StringUtils.startsWith(data, "?")) { bean.setCommercialInsurance(StringUtils.trim(StringUtils.substringAfter(data, ""))); } if (StringUtils.startsWith(data, "")) { bean.setVehicleUseTax(StringUtils.trim(StringUtils.substringAfter(data, ""))); } if 
(StringUtils.startsWith(data, "")) { bean.setCompulsoryInsurance(StringUtils.trim(StringUtils.substringAfter(data, ""))); } if (StringUtils.startsWith(data, "")) { bean.setLicenseFee(StringUtils.trim(StringUtils.substringAfter(data, ""))); } if (StringUtils.startsWith(data, "?")) { bean.setPromotion(StringUtils.trim(StringUtils.substringAfter(data, ""))); } if (StringUtils.startsWith(data, "")) { bean.setBuyTime(StringUtils.trim(StringUtils.substringAfter(data, ""))); } if (StringUtils.startsWith(data, "")) { String area = StringUtils.trim(StringUtils.substringAfter(data, "")); String[] pAndC = StringUtils.splitByWholeSeparator(area, ",", 2); if (pAndC.length == 1) { bean.setBuyProvince(pAndC[0]); bean.setBuyCity(pAndC[0]); } if (pAndC.length == 2) { bean.setBuyProvince(pAndC[0]); bean.setBuyCity(pAndC[1]); } } if (StringUtils.startsWith(data, "")) { Elements level = dataLi.select("span.level"); // if (!level.isEmpty()) { bean.setSellerComment(level.first().text()); } // ? Elements seller = dataLi.select("a.title"); if (!seller.isEmpty()) { String sellerUrl = seller.first().absUrl("href"); String sellerName = seller.first().text(); String sellerId = StringUtils.substringAfterLast(sellerUrl, "/"); bean.setSellerId(sellerId); bean.setSellerName(sellerName); bean.setSellerUrl(sellerUrl); } // ? Elements sellerPhone = dataLi.select("em.phone-num"); if (!sellerPhone.isEmpty()) { bean.setSellerPhone(sellerPhone.first().text()); } // ? // Elements sellerAddress = dataLi.select("em.phone-num"); } if (StringUtils.startsWith(data, "?")) { bean.setBuyFeeling(StringUtils.trim(StringUtils.substringAfter(data, ""))); } } log.debug("Bean : {}", bean); bean.persistOnNotExist(); } }
From source file:me.vertretungsplan.parser.UntisInfoHeadlessParser.java
@Override public SubstitutionSchedule getSubstitutionSchedule() throws IOException, JSONException, CredentialInvalidException { new LoginHandler(scheduleData, credential, cookieProvider).handleLogin(executor, cookieStore); SubstitutionSchedule v = SubstitutionSchedule.fromData(scheduleData); Document doc = Jsoup.parse(httpGet(url, data.optString(PARAM_ENCODING, null))); doc.setBaseUri(url);/*from ww w . j av a 2 s . com*/ Elements dayElems = doc.select("#vertretung > p > b, #vertretung > b"); Elements frames = doc.select("frame[src*=w00]"); if (dayElems.size() == 0 && frames.size() > 0) { // doc is embedded in frame doc = Jsoup.parse(httpGet(frames.get(0).absUrl("src"), data.optString(PARAM_ENCODING, null))); dayElems = doc.select("#vertretung > p > b, #vertretung > b"); } for (Element dayElem : dayElems) { SubstitutionScheduleDay day = new SubstitutionScheduleDay(); day.setLastChangeString(""); String date = dayElem.text(); day.setDateString(date); day.setDate(ParserUtils.parseDate(date)); Element next; if (dayElem.parent().tagName().equals("p")) { next = dayElem.parent().nextElementSibling().nextElementSibling(); } else { next = dayElem.parent().select("p").first().nextElementSibling(); } parseDay(day, next, v, null); } v.setClasses(getAllClasses()); v.setTeachers(getAllTeachers()); return v; }