Example usage for org.jsoup.nodes.Element.text()

List of usage examples for org.jsoup.nodes.Element.text()

Introduction

On this page you can find example usages of org.jsoup.nodes.Element.text().

Prototype

public String text() 

Source Link

Document

Gets the combined text of this element and all its children.

Usage

From source file: com.maxl.java.aips2xml.Aips2Xml.java

/**
 * Converts the HTML of one monograph into the XML dialect produced by this tool.
 *
 * <p>The HTML is parsed with jsoup, known {@code div} classes are renamed to
 * dedicated tags ({@code fi}, {@code title}, {@code owner}, {@code paragraph}, ...),
 * presentation attributes are stripped, a {@code swissmedicno5} element carrying
 * {@code regnr_str} is inserted after {@code owner}, and the markup of the
 * {@code body} element is finally cleaned up as plain text.
 *
 * <p>The returned string is the concatenation of the cleaned-up lines
 * <em>without</em> line separators; in every run of consecutive {@code <p></p>}
 * lines only the first two are kept.
 *
 * <p>The method dereferences the first {@code title}, {@code owner},
 * {@code swissmedicno5} and {@code body} elements without null checks, so the
 * input is expected to contain a {@code div} with class {@code MonTitle} (or a
 * {@code title} element).
 *
 * @param med_title title written into the {@code title} element, replacing
 *                  whatever title text the HTML contained
 * @param html_str  HTML source of the monograph
 * @param regnr_str text written into the inserted {@code swissmedicno5} element
 * @return the converted markup as a single string
 */
static String convertHtmlToXml(String med_title, String html_str, String regnr_str) {
    Document mDoc = Jsoup.parse(html_str);
    mDoc.outputSettings().escapeMode(EscapeMode.xhtml);
    mDoc.outputSettings().prettyPrint(true);
    mDoc.outputSettings().indentAmount(4);

    // <div id="monographie"> -> <fi>
    mDoc.select("div[id=monographie]").tagName("fi").removeAttr("id");
    // <div class="MonTitle"> -> <title>
    mDoc.select("div[class=MonTitle]").tagName("title").removeAttr("class").removeAttr("id");
    // The title found in the HTML is only compared against med_title for diagnostics.
    String title_str = mDoc.select("title").text().trim().replaceAll("<br />", "").replaceAll("(\\t|\\r?\\n)+",
            "");
    if (!title_str.equals(med_title) && SHOW_ERRORS) {
        System.err.println(med_title + " differs from " + title_str);
    }
    // The caller-supplied title always wins over the one found in the HTML.
    mDoc.select("title").first().text(med_title);

    // <div class="ownerCompany"> -> <owner>
    Element owner_elem = mDoc.select("div[class=ownerCompany]").first();
    if (owner_elem != null) {
        owner_elem.tagName("owner").removeAttr("class");
        // Replace the owner's children with its plain text.
        String owner_str = mDoc.select("owner").text();
        mDoc.select("owner").first().text(owner_str);
    } else {
        // No owner in the HTML: insert an empty one and fill in a language-specific placeholder.
        mDoc.select("title").after("<owner></owner>");
        if (DB_LANGUAGE.equals("de")) {
            mDoc.select("owner").first().text("k.A.");
        } else if (DB_LANGUAGE.equals("fr")) {
            mDoc.select("owner").first().text("n.s.");
        }
    }

    // <div class="paragraph"> -> <paragraph>
    mDoc.select("div[class=paragraph]").tagName("paragraph").removeAttr("class").removeAttr("id");
    // <div class="absTitle"> -> <paragraphtitle>
    mDoc.select("div[class=absTitle]").tagName("paragraphtitle").removeAttr("class");
    // <div class="untertitle1"> -> <paragraphsubtitle>
    mDoc.select("div[class=untertitle1]").tagName("paragraphsubtitle").removeAttr("class");
    // <div class="untertitle"> -> <paragraphsubtitle>
    mDoc.select("div[class=untertitle]").tagName("paragraphsubtitle").removeAttr("class");
    // <div class="shortCharacteristic"> -> <characteristic>
    mDoc.select("div[class=shortCharacteristic]").tagName("characteristic").removeAttr("class");
    // <div class="image"> -> <image>
    mDoc.select("div[class=image]").tagName("image").removeAttr("class");

    // <p class="..."> -> <p>
    mDoc.select("p[class]").tagName("p").removeAttr("class");
    // Every <span> becomes <i>, regardless of its style attribute.
    mDoc.select("span").tagName("i").removeAttr("style");
    // <i class="indention1"> / <i class="indention2"> -> <i>
    mDoc.select("i[class=indention1]").tagName("i").removeAttr("class");
    mDoc.select("i[class=indention2]").tagName("i").removeAttr("class");

    // Flatten non-empty paragraph titles and subtitles to plain text (drops nested markup).
    Elements elems = mDoc.select("paragraphtitle");
    for (Element e : elems) {
        if (!e.text().isEmpty()) {
            e.text(e.text());
        }
    }
    elems = mDoc.select("paragraphsubtitle");
    for (Element e : elems) {
        if (!e.text().isEmpty()) {
            e.text(e.text());
        }
    }

    // Tables: strip presentation attributes and column groups.
    mDoc.select("table[class]").removeAttr("class");
    mDoc.select("table").removeAttr("cellspacing").removeAttr("cellpadding").removeAttr("border");
    mDoc.select("colgroup").remove();
    mDoc.select("td").removeAttr("class").removeAttr("colspan").removeAttr("rowspan");
    mDoc.select("tr").removeAttr("class");
    // Drop any remaining classed <div> that carries no text.
    elems = mDoc.select("div[class]");
    for (Element e : elems) {
        if (e.text().isEmpty()) {
            e.remove();
        }
    }

    mDoc.select("tbody").unwrap();
    // Unwrap tables nested inside a table cell.
    Elements nested_table = mDoc.select("table").select("tr").select("td").select("table");
    if (!nested_table.isEmpty()) {
        nested_table.select("table").unwrap();
    }

    // Images: strip presentation attributes.
    mDoc.select("img").removeAttr("style").removeAttr("align").removeAttr("border");

    // Subs and sups: inside table cells they get dedicated tag names ...
    mDoc.select("sub[class]").tagName("sub").removeAttr("class");
    mDoc.select("sup[class]").tagName("sup").removeAttr("class");
    mDoc.select("td").select("sub").tagName("td-sub");
    mDoc.select("td").select("sup").tagName("td-sup");
    // ... except when they sit inside a <p>, where the plain names are restored.
    mDoc.select("p").select("td-sup").tagName("sup");
    mDoc.select("p").select("td-sub").tagName("sub");

    // Box
    mDoc.select("div[class=box]").tagName("box").removeAttr("class");

    // Insert <swissmedicno5> after <owner>. The fragment is now well-formed; the
    // previous literal lacked the final '>' and relied on parser leniency.
    mDoc.select("owner").after("<swissmedicno5></swissmedicno5>");
    mDoc.select("swissmedicno5").first().text(regnr_str);

    // Keep only the markup inside <body> (drops the html, head and body tags).
    String xml_str = mDoc.select("body").first().html();

    xml_str = xml_str.replaceAll("<sup> </sup>", "");
    xml_str = xml_str.replaceAll("<sub> </sub>", "");
    xml_str = xml_str.replaceAll("<p> <i>", "<p><i>");
    xml_str = xml_str.replaceAll("</p> </td>", "</p></td>");
    xml_str = xml_str.replaceAll("<p> </p>", "<p></p>"); // MUST be improved, the space is not a real space!!
    // NOTE(review): the pattern below is an empty string as it stands, which makes
    // replaceAll insert "- " at every position. It looks like a non-ASCII character
    // was lost from this literal - confirm against the original file.
    xml_str = xml_str.replaceAll("", "- ");
    xml_str = xml_str.replaceAll("<br />", "");
    // Remove blank lines.
    xml_str = xml_str.replaceAll("(?m)^[ \t]*\r?\n", "");

    // Collapse runs of "<p></p>" lines: only the first two lines of each run are kept.
    // Lines are appended without separators, so the result is a single line.
    Scanner scanner = new Scanner(xml_str);
    StringBuilder new_xml_str = new StringBuilder(xml_str.length());
    int counter = 0;
    while (scanner.hasNextLine()) {
        String line = scanner.nextLine();
        if (line.trim().equals("<p></p>")) {
            counter++;
        } else {
            counter = 0;
        }
        if (counter < 3) {
            new_xml_str.append(line);
        }
    }
    scanner.close();

    return new_xml_str.toString();
}

From source file: crawler.AScraper.java

/**
 * Turns a link element into an {@code Artwork}.
 *
 * <p>The element's text is matched against the {@code patter} field, which must
 * provide the named groups {@code year}, {@code month}, {@code day}, {@code id}
 * and {@code model}. The date is parsed as {@code yyyy-MM-dd} in the GMT+8 zone,
 * the link is taken from the element's {@code href} attribute, and the full
 * element text becomes the artwork's title.
 *
 * @param payload element whose text and {@code href} describe the artwork
 * @return the artwork, or {@code null} (after logging the text) when the text
 *         does not match the pattern
 * @throws ParseException        if the extracted date cannot be parsed
 * @throws MalformedURLException if the {@code href} value is not a valid URL
 */
@Transformer(inputChannel = "channel3", outputChannel = "channel4")
public Artwork convert(Element payload) throws ParseException, MalformedURLException {
    final Matcher matcher = patter.matcher(payload.text());
    if (!matcher.find()) {
        // Nothing recognisable in the text: report it and give up.
        LOG.error(payload.text());
        return null;
    }

    final String isoDay = String.format("%s-%s-%s",
            matcher.group("year"), matcher.group("month"), matcher.group("day"));
    final int id = Integer.parseInt(matcher.group("id"));
    // Only the part of the model before the first whitespace or square bracket is kept.
    final String model = matcher.group("model").split("[\\s\\[\\]]")[0];
    final URL link = new URL(payload.attr("href"));

    final DateFormat dayFormat = new SimpleDateFormat("yyyy-MM-dd");
    dayFormat.setTimeZone(TimeZone.getTimeZone("GMT+8"));
    final Date date = dayFormat.parse(isoDay);

    return new Artwork(payload.text(), id, -1, -1, null, link, null, model, date);
}

From source file: io.seldon.importer.articles.dynamicextractors.FirstElementTextValueDynamicExtractor.java

/**
 * Extracts the text of the first element matching a CSS selector, optionally
 * narrowed down by a regular expression.
 *
 * <p>{@code extractor_args[0]} is the CSS selector; {@code extractor_args[1]},
 * when present, is a regular expression whose first capturing group becomes the
 * result.
 *
 * @param attributeDetail supplies {@code extractor_args}
 * @param url             not used by this extractor
 * @param articleDoc      document the selector is evaluated against
 * @return the extracted value, or {@code null} when there are no arguments, the
 *         selector is blank, or no element matches the selector
 * @throws IllegalStateException if a regular expression is given but does not
 *                               match the selected text
 * @throws Exception             declared by the overridden method
 */
@Override
public String extract(AttributeDetail attributeDetail, String url, Document articleDoc) throws Exception {

    String attrib_value = null;

    if ((attributeDetail.extractor_args != null) && (attributeDetail.extractor_args.size() >= 1)) {
        String cssSelector = attributeDetail.extractor_args.get(0);
        // Check for a blank selector BEFORE querying: jsoup rejects empty queries.
        if (StringUtils.isNotBlank(cssSelector)) {
            Element element = articleDoc.select(cssSelector).first();
            if (element != null) {
                attrib_value = element.text();
            }
        }
    }

    if ((attrib_value != null) && (attributeDetail.extractor_args != null)
            && (attributeDetail.extractor_args.size() >= 2)) {
        String regexSelector = attributeDetail.extractor_args.get(1);
        pattern = Pattern.compile(regexSelector);
        Matcher m = pattern.matcher(attrib_value);
        if (!m.find()) {
            // Same exception type Matcher.group would raise, but with usable context.
            throw new IllegalStateException(
                    "Regex [" + regexSelector + "] did not match extracted text [" + attrib_value + "]");
        }
        attrib_value = m.group(1);
    }

    return attrib_value;
}

From source file: org.brunocvcunha.taskerbox.impl.crawler.SniptAction.java

/**
 * Walks every anchor inside {@code .grid-block} elements of the page and, for
 * each id that {@code canAct} accepts, records it, spreads the action,
 * serializes the acted-on state and then sleeps for {@code FETCH_INTERVAL}.
 *
 * <p>The id is the anchor's {@code href} with the {@code http://snipt.org/}
 * prefix removed; the title is {@code "<id> - <anchor text>"}.
 *
 * @param entry parsed page to scan
 */
@Override
public void action(final Document entry) {
    log.debug("Validating " + entry.title());

    for (final Element anchor : entry.select(".grid-block").select("a")) {
        final String id = anchor.attr("href").replace("http://snipt.org/", "");
        final String title = id + " - " + anchor.text();

        if (!canAct(id)) {
            continue;
        }

        addAct(id);
        spreadAction(id, title);
        serializeAlreadyAct();
        sleep(FETCH_INTERVAL);
    }
}

From source file: org.jasig.portlet.proxy.search.AnchorSearchStrategy.java

/**
 * Searches the anchors of a document for the given search terms.
 *
 * <p>The lower-cased search string is split on single spaces. For every
 * {@code a[href]} element and every term contained in the element's lower-cased
 * text, one result is added - an anchor matching several terms therefore yields
 * several results. Each result uses the anchor text as both title and summary
 * and links to a maximized VIEW-mode render URL carrying the anchor's absolute
 * URL in the {@code proxy.url} parameter.
 *
 * @param searchQuery supplies the search terms
 * @param request     supplies the {@code anchorWhitelistRegex} preference values
 * @param document    document whose anchors are searched
 * @return the results, in document order; never {@code null}
 */
@Override
public List<SearchResult> search(SearchRequest searchQuery, EventRequest request, Document document) {
    final List<SearchResult> matches = new ArrayList<SearchResult>();
    final String[] whitelistRegexes = request.getPreferences().getValues("anchorWhitelistRegex",
            new String[] {});
    final String loweredQuery = searchQuery.getSearchTerms().toLowerCase();

    for (Element anchor : document.select("a[href]")) {
        final String anchorUrl = anchor.attr("abs:href");
        final String anchorText = anchor.text();
        final String loweredText = anchorText.toLowerCase();

        for (String term : loweredQuery.split(" ")) {
            if (!loweredText.contains(term)) {
                continue;
            }
            log.debug("found a match, term: [" + term + "], anchor URL: [" + anchorUrl
                    + "], anchor text: [" + anchorText + "]");

            final SearchResult match = new SearchResult();
            match.setTitle(anchorText);
            match.setSummary(anchorText);

            final PortletUrl renderUrl = newMaximizedRenderUrl(anchorUrl);

            new SearchUtil().updateUrls(anchorUrl, request, whitelistRegexes);

            match.setPortletUrl(renderUrl);
            matches.add(match);
        }
    }
    return matches;
}

/** Builds a maximized VIEW-mode render URL whose {@code proxy.url} parameter holds {@code targetUrl}. */
private PortletUrl newMaximizedRenderUrl(String targetUrl) {
    final PortletUrl renderUrl = new PortletUrl();
    renderUrl.setPortletMode(PortletMode.VIEW.toString());
    renderUrl.setType(PortletUrlType.RENDER);
    renderUrl.setWindowState(WindowState.MAXIMIZED.toString());

    final PortletUrlParameter proxyParam = new PortletUrlParameter();
    proxyParam.setName("proxy.url");
    proxyParam.getValue().add(targetUrl);
    renderUrl.getParam().add(proxyParam);
    return renderUrl;
}

From source file: com.normalexception.app.rx8club.task.AdminTask.java

/**
 * Posts the admin action selected by {@code doType}.
 *
 * <p>For {@code DELETE_THREAD} the {@code deleteResponse} field is sent along;
 * every other type posts {@code null} in its place. For {@code MOVE_THREAD} the
 * response content is additionally parsed: the value of the input named
 * {@code title} is stored in {@code threadTitle}, and every option of the
 * {@code destforumid} select is added to {@code selectOptions}, keyed by the
 * option text with the numeric {@code value} attribute as value.
 *
 * <p>I/O failures are logged and otherwise ignored.
 *
 * @param params unused
 * @return always {@code null}
 */
@Override
protected Void doInBackground(Void... params) {
    try {
        Log.d(TAG, progressText.get(doType));

        final boolean deleting = this.doType == DELETE_THREAD;
        if (deleting) {
            HtmlFormUtils.adminTypePost(doType, token, thread, deleteResponse);
        } else {
            HtmlFormUtils.adminTypePost(doType, token, thread, null);
        }

        if (this.doType == MOVE_THREAD) {
            final String responseUrl = HtmlFormUtils.getResponseUrl();
            Log.d(TAG, "Response: " + responseUrl);

            final Document moveForm = Jsoup.parse(HtmlFormUtils.getResponseContent());

            threadTitle = HtmlFormUtils.getInputElementValueByName(moveForm, "title");
            Log.d(TAG, "Thread Title: " + threadTitle);

            // One map entry per destination forum offered by the form.
            final Elements destinations = moveForm.select("select[name=destforumid] > option");
            for (Element option : destinations) {
                final int forumId = Integer.parseInt(option.attr("value"));
                selectOptions.put(option.text(), forumId);
            }

            Log.d(TAG, "Parsed " + selectOptions.keySet().size() + " options");
        }
    } catch (ClientProtocolException protocolError) {
        Log.e(TAG, protocolError.getMessage(), protocolError);
    } catch (IOException ioError) {
        Log.e(TAG, ioError.getMessage(), ioError);
    }
    return null;
}

From source file: com.anhao.spring.service.impl.PhotosServiceImpl.java

/**
 * Fetches the detail page of a wallpaper and stores one photo/tag association
 * for every tag listed under {@code #tags li}.
 *
 * <p>List items without a {@code .tagname} child are skipped. The photo id is
 * looked up once per call, and only when the page lists at least one tag.
 *
 * @param wallpaperId id appended to the wallhaven wallpaper URL and used to look
 *                    up the stored photo id
 */
private void getWallpaperTags(String wallpaperId) {
    String wallpaperUrl = "http://alpha.wallhaven.cc/wallpaper/" + wallpaperId;
    Document docDetails = getWallpaperHtmlDocument(wallpaperUrl);
    Elements tags = docDetails.select("#tags li");
    if (tags.isEmpty()) {
        return;
    }

    // The photo id depends only on wallpaperId, so resolve it once instead of once per tag.
    String photosId = jobPhotosDAO.findByWallpaperId(wallpaperId);

    for (Element tag : tags) {
        Element tagName = tag.select(".tagname").first();
        if (tagName == null) {
            // No tag name inside this list item; dereferencing it would throw.
            continue;
        }

        String tagText = tagName.text();
        String tagId = tagDAO.findByTagName(tagText);

        System.out.println("wallpaperId:" + wallpaperId + "====tag name " + tagText);

        PhotosTag photosTag = new PhotosTag();
        photosTag.setPhotoId(photosId);
        photosTag.setTagId(tagId);
        photostagDAO.add(photosTag);
    }
}

From source file: com.isoftstone.proxy.api.sdk.KuaidailiProxySDK.java

/**
 * Reads proxy entries from the rows matched by {@code #list table tr}.
 *
 * <p>The first matched row is skipped (presumably the table header - confirm
 * against the page). In every other row the first cell is taken as the IP and
 * the second as the port. Rows that lack either cell, or whose port is not an
 * integer, are skipped instead of aborting the whole parse.
 *
 * @param doc parsed proxy listing page
 * @return the proxies found, in row order; never {@code null}
 */
private List<ProxyVo> parseHtml(Document doc) {
    Elements rows = doc.select("#list table tr");
    List<ProxyVo> proxyList = new ArrayList<ProxyVo>();
    for (int i = 1; i < rows.size(); i++) {
        Element row = rows.get(i);
        Element ipEle = row.select("td:eq(0)").first();
        Element portEle = row.select("td:eq(1)").first();
        if (ipEle == null || portEle == null) {
            // Row without the expected cells (e.g. a header or spacer row).
            continue;
        }

        int port;
        try {
            port = Integer.parseInt(portEle.text().trim());
        } catch (NumberFormatException ignored) {
            // Non-numeric port: the row is unusable, but the remaining rows may be fine.
            continue;
        }

        ProxyVo proxyVo = new ProxyVo();
        proxyVo.setProxyIp(ipEle.text());
        proxyVo.setProxyPort(port);
        proxyList.add(proxyVo);
    }
    return proxyList;
}

From source file: lolth.autohome.buy.AutohomeBuyInfoListTaskFetch.java

/**
 * Parses every {@code li.price-item} of the page into an
 * {@code AutohomeBuyInfoBean} and persists it via {@code persistOnNotExist()}.
 *
 * <p>For each item the post id, user, post time and the labelled data lines
 * under {@code div.price-item-bd li} are read. An item whose share link yields a
 * blank id is skipped entirely.
 *
 * <p>NOTE(review): most label literals below are empty strings or {@code "?"}.
 * They look like non-ASCII labels that were lost in an encoding conversion -
 * confirm against the original file. As written, {@code startsWith(data, "")}
 * is true for every non-null line, so each such branch runs for every data line.
 *
 * @param doc  parsed listing page
 * @param task fetch task supplying the URL and the forum id ({@code getExtra()})
 * @throws Exception declared by the overridden method
 */
@Override
protected void parsePage(Document doc, FetchTask task) throws Exception {
    Elements lis = doc.select("li.price-item");

    for (Element li : lis) {
        AutohomeBuyInfoBean bean = new AutohomeBuyInfoBean();
        bean.setUrl(task.getUrl());
        bean.setForumId(task.getExtra());

        // post id: the part of the share link's data-target after the last '_'
        Elements id = li.select("div.price-share a.share");
        if (!id.isEmpty()) {
            String idStr = id.first().attr("data-target");
            idStr = StringUtils.substringAfterLast(idStr, "_");
            if (StringUtils.isBlank(idStr)) {
                // Without an id the item is skipped.
                continue;
            }

            bean.setId(idStr);
        }

        // user: the id is the last path segment of the profile URL
        Elements user = li.select("div.user-name a");
        if (!user.isEmpty()) {
            String userUrl = user.first().absUrl("href");
            String userId = StringUtils.substringAfterLast(userUrl, "/");
            String userName = user.first().text();

            bean.setUserId(userId);
            bean.setUserUrl(userUrl);
            bean.setUserName(userName);
        }

        // post time: the span text before the first "?" separator (literal possibly garbled)
        Elements postTime = li.select("div.user-name span");
        if (!postTime.isEmpty()) {
            bean.setPostTime(StringUtils.trim(StringUtils.substringBefore(postTime.first().text(), "?")));
        }

        // Labelled data lines: each branch tests the line's prefix and stores the remainder.
        Elements dataLis = li.select("div.price-item-bd li");
        for (Element dataLi : dataLis) {
            String data = dataLi.text();

            if (StringUtils.startsWith(data, "")) {
                bean.setCar(StringUtils.trim(StringUtils.substringAfter(data, "")));
            }

            if (StringUtils.startsWith(data, "")) {
                bean.setPrice(StringUtils.trim(StringUtils.substringAfter(data, "")));
            }

            if (StringUtils.startsWith(data, "")) {
                bean.setGuidePrice(StringUtils.trim(StringUtils.substringAfter(data, "")));
            }

            if (StringUtils.startsWith(data, "?")) {
                bean.setTotalPrice(StringUtils.trim(StringUtils.substringAfter(data, "")));
            }

            if (StringUtils.startsWith(data, "")) {
                bean.setPurchaseTax(StringUtils.trim(StringUtils.substringAfter(data, "")));
            }

            if (StringUtils.startsWith(data, "?")) {
                bean.setCommercialInsurance(StringUtils.trim(StringUtils.substringAfter(data, "")));
            }

            if (StringUtils.startsWith(data, "")) {
                bean.setVehicleUseTax(StringUtils.trim(StringUtils.substringAfter(data, "")));
            }
            if (StringUtils.startsWith(data, "")) {
                bean.setCompulsoryInsurance(StringUtils.trim(StringUtils.substringAfter(data, "")));
            }
            if (StringUtils.startsWith(data, "")) {
                bean.setLicenseFee(StringUtils.trim(StringUtils.substringAfter(data, "")));
            }
            if (StringUtils.startsWith(data, "?")) {
                bean.setPromotion(StringUtils.trim(StringUtils.substringAfter(data, "")));
            }
            if (StringUtils.startsWith(data, "")) {
                bean.setBuyTime(StringUtils.trim(StringUtils.substringAfter(data, "")));
            }
            if (StringUtils.startsWith(data, "")) {
                // Purchase area: "province,city", or a single value used for both.
                String area = StringUtils.trim(StringUtils.substringAfter(data, ""));
                String[] pAndC = StringUtils.splitByWholeSeparator(area, ",", 2);

                if (pAndC.length == 1) {
                    bean.setBuyProvince(pAndC[0]);
                    bean.setBuyCity(pAndC[0]);
                }

                if (pAndC.length == 2) {
                    bean.setBuyProvince(pAndC[0]);
                    bean.setBuyCity(pAndC[1]);
                }

            }
            if (StringUtils.startsWith(data, "")) {
                Elements level = dataLi.select("span.level");
                // seller comment: text of span.level
                if (!level.isEmpty()) {
                    bean.setSellerComment(level.first().text());
                }

                // seller: the id is the last path segment of the seller URL
                Elements seller = dataLi.select("a.title");
                if (!seller.isEmpty()) {
                    String sellerUrl = seller.first().absUrl("href");
                    String sellerName = seller.first().text();
                    String sellerId = StringUtils.substringAfterLast(sellerUrl, "/");

                    bean.setSellerId(sellerId);
                    bean.setSellerName(sellerName);
                    bean.setSellerUrl(sellerUrl);
                }

                // seller phone
                Elements sellerPhone = dataLi.select("em.phone-num");
                if (!sellerPhone.isEmpty()) {
                    bean.setSellerPhone(sellerPhone.first().text());
                }

                // seller address is not extracted
                // Elements sellerAddress = dataLi.select("em.phone-num");

            }
            if (StringUtils.startsWith(data, "?")) {
                bean.setBuyFeeling(StringUtils.trim(StringUtils.substringAfter(data, "")));
            }
        }

        log.debug("Bean : {}", bean);

        bean.persistOnNotExist();
    }
}

From source file: me.vertretungsplan.parser.UntisInfoHeadlessParser.java

/**
 * Logs in, downloads the substitution plan page and builds the schedule from it.
 *
 * <p>Day headings are the {@code <b>} elements directly under {@code #vertretung}
 * or under one of its {@code <p>} children. If the page has none but contains a
 * {@code frame} whose {@code src} includes {@code w00}, the first such frame is
 * downloaded and searched instead. For each heading a day is created from the
 * heading text and handed to {@code parseDay} together with the element that
 * starts the day's content.
 *
 * @return the schedule, with classes and teachers filled in
 * @throws IOException                propagated from the calls made here
 * @throws JSONException              propagated from the calls made here
 * @throws CredentialInvalidException propagated from the calls made here
 */
@Override
public SubstitutionSchedule getSubstitutionSchedule()
        throws IOException, JSONException, CredentialInvalidException {
    new LoginHandler(scheduleData, credential, cookieProvider).handleLogin(executor, cookieStore);

    final SubstitutionSchedule schedule = SubstitutionSchedule.fromData(scheduleData);
    final String dayHeadingQuery = "#vertretung > p > b, #vertretung > b";

    Document page = Jsoup.parse(httpGet(url, data.optString(PARAM_ENCODING, null)));
    page.setBaseUri(url);
    Elements dayHeadings = page.select(dayHeadingQuery);

    final Elements frames = page.select("frame[src*=w00]");
    if (dayHeadings.isEmpty() && !frames.isEmpty()) {
        // The plan is embedded in a frame: load that frame and look there instead.
        final String frameUrl = frames.get(0).absUrl("src");
        page = Jsoup.parse(httpGet(frameUrl, data.optString(PARAM_ENCODING, null)));
        dayHeadings = page.select(dayHeadingQuery);
    }

    for (Element heading : dayHeadings) {
        final SubstitutionScheduleDay day = new SubstitutionScheduleDay();
        day.setLastChangeString("");

        final String dateText = heading.text();
        day.setDateString(dateText);
        day.setDate(ParserUtils.parseDate(dateText));

        // Heading wrapped in a <p>: content starts two siblings after that <p>.
        // Otherwise: content starts right after the container's first <p>.
        final Element container = heading.parent();
        final Element contentStart = container.tagName().equals("p")
                ? container.nextElementSibling().nextElementSibling()
                : container.select("p").first().nextElementSibling();

        parseDay(day, contentStart, schedule, null);
    }

    schedule.setClasses(getAllClasses());
    schedule.setTeachers(getAllTeachers());
    return schedule;
}