Example usage for org.jsoup.nodes Element attr

List of usage examples for org.jsoup.nodes Element attr

Introduction

This page collects usage examples for the attr method of org.jsoup.nodes.Element.

Prototype

public String attr(String attributeKey) 

Source Link

Document

Get an attribute's value by its key.

Usage

From source file:com.dajodi.scandic.JSoupScraper.java

/**
 * Collects the name/value pairs of every {@code <input>} element inside the form
 * whose id is {@code aspnetForm}.
 *
 * @param inStream stream containing the HTML page; decoded with {@code HTTP.UTF_8}
 * @return map from each input's {@code name} attribute to its {@code value} attribute;
 *         an input without a {@code value} attribute maps to the empty string
 * @throws ScandicHtmlException wrapping any exception raised while parsing or walking
 *         the page, including the failure caused by a missing {@code aspnetForm} element
 */
@Override
public Map<String, String> scrapeFormInputFields(InputStream inStream) {

    try {
        Document doc = Jsoup.parse(inStream, HTTP.UTF_8, "");

        Element form = doc.body().getElementById("aspnetForm");

        Elements inputNodes = form.getElementsByTag("input");
        Map<String, String> inputMap = new HashMap<String, String>();

        for (Element element : inputNodes) {

            // Element.attr() never returns null: an absent attribute is reported as "".
            // Presence therefore has to be tested with hasAttr(), otherwise nameless
            // inputs end up in the map under the "" key.
            if (element.hasAttr("name")) {
                inputMap.put(element.attr("name"), element.attr("value"));
            } else {
                //TODO: remove me
                Log.d("Something weird");
            }
        }

        doc.empty();
        return inputMap;
    } catch (Exception e) {
        throw new ScandicHtmlException(e);
    }
}

From source file:de.tudarmstadt.ukp.argumentation.data.roomfordebate.NYTimesArticleExtractor.java

/**
 * Builds an {@link Article} from the HTML of one debate contribution page.
 *
 * <p>The contribution itself is read from the first {@code article.rfd} element; the
 * debate title, debate URL and debate description come from the
 * {@code div.nytint-discussion-overview} section of the whole document.
 *
 * @param html raw HTML of the page
 * @return the populated article
 * @throws ParseException if the publication date matches neither
 *         {@code "MMM dd, yyyy, hh:mm aaa"} nor {@code "MMM dd, yyyy"}
 * @throws IOException if the page contains no {@code article.rfd} element
 */
public Article extractArticle(String html) throws ParseException, IOException {
    Article article = new Article();

    Document page = Jsoup.parse(html, getBaseName());

    Elements articleNodes = page.select("article.rfd");
    if (articleNodes.isEmpty()) {
        throw new IOException("Cannot find article.rfd element");
    }
    Element content = articleNodes.get(0);

    // publication time; a leading "Updated " marker is stripped before parsing
    String pubDate = content.select("p.pubdate").text().replaceAll("Updated[\\s]+", "");
    try {
        article.setTimestamp(
                new SimpleDateFormat("MMM dd, yyyy, hh:mm aaa", Locale.ENGLISH).parse(pubDate));
    } catch (ParseException noTimeOfDay) {
        // date-only variant, e.g. "June 24, 2015"
        article.setTimestamp(new SimpleDateFormat("MMM dd, yyyy", Locale.ENGLISH).parse(pubDate));
    }

    // headline
    String headline = content.select("h1").text();
    article.setTitle(TextCleaningUtils.normalizeWithParagraphs(headline));

    // body text: one line per paragraph
    StringBuilder paragraphs = new StringBuilder();
    for (Element paragraph : content.select("div.nytint-post > p")) {
        paragraphs.append(paragraph.text()).append("\n");
    }
    article.setText(TextCleaningUtils.normalizeWithParagraphs(paragraphs.toString()));

    // debate title
    String debateTitle = page.select("div.nytint-discussion-overview > h2").text();
    article.setDebateTitle(TextCleaningUtils.normalizeWithParagraphs(debateTitle));

    // debate url (iterator().next() throws NoSuchElementException when the link is absent)
    Element debateLink = page.select("div.nytint-discussion-overview > h2 > a").iterator().next();
    article.setDebateUrl(debateLink.attr("href"));

    // url of this document
    article.setUrl(page.select("meta[name=communityAssetURL]").attr("content"));

    // debate description: the first child node of the overview paragraph, as text
    Element overviewParagraph = page.select("div.nytint-discussion-overview > p").iterator().next();
    TextNode leadingText = (TextNode) overviewParagraph.childNodes().iterator().next();
    article.setDebateDescription(TextCleaningUtils.normalizeWithParagraphs(leadingText.text()));

    // author, taken from the alt text of the mugshot image
    Element mugshot = content.select("div.nytint-mugshots > img").iterator().next();
    article.setAuthor(mugshot.attr("alt"));

    // topics: the href of every tag link
    for (Element tagLink : content.select("p.nytint-tags > a")) {
        article.getTopics().add(tagLink.attr("href"));
    }

    return article;
}

From source file:org.wallride.web.support.Posts.java

/**
 * Appends a width hint to the {@code src} of images served from the configured media
 * URL prefix.
 *
 * <p>For every {@code <img>} whose {@code src} starts with
 * {@code wallRideProperties.getMediaUrlPrefix()} and whose inline {@code style} contains
 * {@code "width: <n>px;"}, the suffix {@code "?w=<2n>"} is appended to the {@code src}.
 * All other images are left untouched.
 *
 * @param html HTML to process
 * @return the inner HTML of the parsed document's body after rewriting
 */
protected String parse(String html) {
    // Compiled once per call rather than once per matching image.
    Pattern widthPattern = Pattern.compile("width: ([0-9]+)px;");

    Document document = Jsoup.parse(html);
    for (Element image : document.select("img")) {
        String src = image.attr("src");
        if (!src.startsWith(wallRideProperties.getMediaUrlPrefix())) {
            continue;
        }
        Matcher matcher = widthPattern.matcher(image.attr("style"));
        if (matcher.find()) {
            // Twice the styled pixel width is requested.
            String replaced = src + "?w=" + Integer.parseInt(matcher.group(1)) * 2;
            image.attr("src", replaced);
        }
    }
    return document.body().html();
}

From source file:web.analyzer.utils.Utils.java

/**
 * Classifies every {@code a[href]} link of a document as internal or external.
 *
 * <p>Each link is resolved to its absolute form; links rejected by
 * {@code isValidUrl} are ignored. A link is "internal" when its host equals
 * {@code hostName} ignoring case, otherwise "external".
 *
 * @param doc      parsed document to scan
 * @param hostName host name the links are compared against
 * @return the classified links together with the internal and external totals
 * @throws IOException if an accepted href cannot be turned into a {@link URL}
 */
public LinkResult getLinks(Document doc, String hostName) throws IOException {
    List<Link> classified = new ArrayList<Link>();
    int internalCount = 0;
    int externalCount = 0;

    for (Element anchor : doc.select("a[href]")) {
        String absoluteHref = anchor.attr("abs:href");
        if (!isValidUrl(absoluteHref)) {
            continue;
        }

        boolean sameHost = new URL(absoluteHref).getHost().equalsIgnoreCase(hostName);
        if (sameHost) {
            internalCount++;
        } else {
            externalCount++;
        }
        classified.add(new Link(absoluteHref, sameHost ? "internal" : "external"));
    }

    return new LinkResult(classified, internalCount, externalCount);
}

From source file:com.liato.bankdroid.banking.banks.Bioklubben.java

/**
 * Fetches the start page, extracts the {@code __VIEWSTATE} and {@code __EVENTVALIDATION}
 * hidden fields from it, and assembles the form data for the login POST.
 *
 * @return the login package holding the HTTP client, the POST parameters, the fetched
 *         page and the target URL
 * @throws BankException if either hidden field, or its {@code value} attribute, is
 *         missing from the start page
 * @throws ClientProtocolException declared by the signature; presumably raised by the
 *         HTTP client — confirm against {@code Urllib.open}
 * @throws IOException declared by the signature; presumably raised by the HTTP client —
 *         confirm against {@code Urllib.open}
 */
@Override
protected LoginPackage preLogin() throws BankException, ClientProtocolException, IOException {
    urlopen = new Urllib(context, CertificateReader.getCertificates(context, R.raw.cert_bioklubben));
    urlopen.setAllowCircularRedirects(true);
    response = urlopen.open("http://bioklubben.sf.se/Start.aspx");

    Document d = Jsoup.parse(response);

    // Element.attr() returns "" (never null) for an absent attribute, so presence of
    // the value attribute has to be checked with hasAttr().
    Element e = d.getElementById("__VIEWSTATE");
    if (e == null || !e.hasAttr("value")) {
        throw new BankException(res.getText(R.string.unable_to_find).toString() + " ViewState.");
    }
    String viewState = e.attr("value");

    e = d.getElementById("__EVENTVALIDATION");
    if (e == null || !e.hasAttr("value")) {
        throw new BankException(res.getText(R.string.unable_to_find).toString() + " EventValidation.");
    }
    String eventValidation = e.attr("value");

    List<NameValuePair> postData = new ArrayList<NameValuePair>();
    postData.add(
            new BasicNameValuePair("__EVENTTARGET", "ctl00$ContentPlaceHolder1$LoginUserControl$LogonButton"));
    postData.add(new BasicNameValuePair("__EVENTARGUMENT", ""));
    postData.add(new BasicNameValuePair("__VIEWSTATE", viewState));
    postData.add(new BasicNameValuePair("__EVENTVALIDATION", eventValidation));
    postData.add(new BasicNameValuePair("ctl00_toolkitscriptmanager_HiddenField", ""));
    postData.add(new BasicNameValuePair("ctl00$toolkitscriptmanager",
            "ctl00$UpdatePanel|ctl00$ContentPlaceHolder1$LoginUserControl$LogonButton"));
    postData.add(
            new BasicNameValuePair("ctl00$ContentPlaceHolder1$LoginUserControl$LoginNameTextBox", username));
    postData.add(
            new BasicNameValuePair("ctl00$ContentPlaceHolder1$LoginUserControl$PasswordTextBox", password));
    return new LoginPackage(urlopen, postData, response, "http://bioklubben.sf.se/Start.aspx");
}

From source file:com.jimplush.goose.outputformatters.DefaultOutputFormatter.java

/**
 * Removes from the document every element under the top node that carries a
 * {@code gravityScore} attribute whose integer value is below 1. Despite the method
 * name, a score of zero is removed as well as negative scores.
 */
private void removeNodesWithNegativeScores() {
    for (Element scored : this.topNode.select("*[gravityScore]")) {
        if (Integer.parseInt(scored.attr("gravityScore")) >= 1) {
            continue;
        }
        scored.remove();
    }
}

From source file:com.bluedragon.search.index.crawl.handler.FileHandlerHTMLImpl.java

/**
 * Runs around all the internal links and pulls out all the URLs
 * @param doc/*  w w w  .  j  a v a 2s  .c o  m*/
 * @param baseUri
 */
private void setAnchors(Document doc, String baseUri) {
    Elements links = doc.select("a[href]");
    for (Element link : links) {
        if (baseUri != null)
            link.setBaseUri(baseUri);

        String newLink = link.attr("abs:href");
        if (newLink.indexOf("#") != -1)
            newLink = newLink.substring(0, newLink.indexOf("#"));

        anchors.add(newLink);
    }
}

From source file:cn.cuizuoli.appranking.service.GooglePlayService.java

/**
 * Loads a Google Play chart page and converts each {@code .card-list>.card} entry into
 * an {@link AppRanking}, numbered from 1 in page order.
 *
 * <p>Nothing is fetched unless the feed's media type is {@code MediaType.GOOGLE} and a
 * non-blank URL can be determined. Any exception is logged and whatever was collected
 * up to that point is returned.
 *
 * @param feedType feed to load
 * @param category category to load; {@code Category.ALL} selects the "hot" URL
 * @return the rankings found, possibly empty; never {@code null}
 */
public List<AppRanking> getAppRankingList(FeedType feedType, Category category) {
    List<AppRanking> rankings = new ArrayList<AppRanking>();
    try {
        if (feedType.getMediaType() != MediaType.GOOGLE) {
            return rankings;
        }

        String url = (category == Category.ALL) ? getHotUrl(feedType) : getUrl(feedType, category);
        log.info("Google Play -> " + url);
        if (!StringUtils.isNotBlank(url)) {
            return rankings;
        }

        Document page = appRankingRestTemplate.getForObject(url, Document.class);
        int rank = 1;
        for (Element card : page.select(".card-list>.card")) {
            Elements titleNode = card.select(".details .title");
            Elements coverNode = card.select(".cover .cover-image");

            String appId = card.attr("data-docid");
            String name = titleNode.attr("title");
            String uri = titleNode.attr("href");
            String artist = card.select(".details .subtitle").attr("title");
            String price = card.select(".details button.price.buy>span").text();
            String smallCover = coverNode.attr("data-cover-small");
            String largeCover = coverNode.attr("data-cover-large");

            AppRanking entry = new AppRanking();
            entry.setAppId(appId);
            entry.setDeviceType(DeviceType.ANDROID);
            entry.setCountry(Country.JAPAN);
            entry.setMediaType(MediaType.GOOGLE);
            entry.setFeedType(feedType);
            entry.setRanking(rank);
            entry.setTitle(name + " - " + artist);
            entry.setCategory(category.getCode());
            entry.setUri(GOOGLE_PLAY_DOMAIN + uri);
            entry.setName(name);
            entry.setArtist(artist);
            entry.setPrice(price);
            // The small cover fills both the 53 and 75 slots, the large cover the 100 slot.
            entry.setImage53(smallCover);
            entry.setImage75(smallCover);
            entry.setImage100(largeCover);
            rankings.add(entry);
            rank++;
        }
    } catch (Exception e) {
        // HTTP status failures and all other errors are logged the same way.
        log.error(ExceptionUtils.getFullStackTrace(e));
    }
    return rankings;
}

From source file:blackman.matt.board.Post.java

/**
 * Formats the HTML of a post body so it displays correctly.
 *
 * <p>Elements of class {@code heading} and {@code quote} are wrapped in coloured
 * {@code <font>} tags, links of the form {@code /.../index.html} and reply links are
 * prefixed with the site address, and the content of {@code toolong} elements is
 * blanked. The fragment of each reply link's href (the text after the first
 * {@code #}) is added to {@code repliedTo}.
 *
 * @param post The unformatted text of the post.
 * @return A formatted version of the post.
 */
private String formatPostBody(String post) {
    Document formattedText = Jsoup.parse(post);
    Pattern p = Pattern.compile("^/.*/index\\.html");

    // Red Text
    Elements redTexts = formattedText.getElementsByClass("heading");
    for (Element text : redTexts) {
        text.wrap("<font color=\"#AF0A0F\"><strong></strong></font>");
    }

    // Green text
    Elements greenTexts = formattedText.getElementsByClass("quote");
    for (Element text : greenTexts) {
        text.wrap("<font color=\"#789922\"></font>");
    }

    // Board Links
    Elements boardLinks = formattedText.select("a");
    for (Element link : boardLinks) {
        String url = link.attr("href");
        Matcher m = p.matcher(url);
        if (m.matches()) {
            link.attr("href", "http://8chan.co" + url);
        }
    }

    // Reply links (the attribute selector needs its closing bracket)
    Elements replyLinks = formattedText.select("a[onclick^=highlightReply]");
    for (Element reply : replyLinks) {
        String href = reply.attr("href");

        // Only record a reply target when the href actually carries a "#id" fragment.
        String[] parts = href.split("#");
        if (parts.length > 1) {
            repliedTo.add(parts[1]);
        }

        // Rewrite this reply link only; Elements.attr(key, value) on boardLinks would
        // overwrite the href of every link in the post.
        reply.attr("href", "http://8chan.co" + href);
    }

    // Post too long text removal
    Elements tooLongs = formattedText.getElementsByClass("toolong");
    for (Element text : tooLongs) {
        text.text("");
    }

    return formattedText.toString();
}

From source file:jobhunter.dice.Client.java

/**
 * Downloads the job page at {@code url} and fills a new {@link Job} from its
 * {@code twitter:text:*} meta tags.
 *
 * <p>Position, company name, address and salary are taken from their respective tags;
 * the description is the HTML-unescaped content of the {@code job_description_web} and
 * {@code skills} tags concatenated in document order. Progress is reported through
 * {@code update} in three steps.
 *
 * @return the populated job
 * @throws IOException if the page cannot be fetched
 * @throws URISyntaxException declared by the signature; nothing in this method body
 *         raises it directly
 */
public Job execute() throws IOException, URISyntaxException {
    l.debug("Connecting to {}", url);

    update("Connecting", 1L);
    final Document page = Jsoup.connect(url).get();

    update("Parsing HTML", 2L);
    final Job job = Job.of();
    job.setPortal(DicePlugin.portal);
    job.setLink(url);

    StringBuilder descriptionText = new StringBuilder();

    for (Element meta : page.getElementsByTag("meta")) {
        l.debug("Checking {}", meta.toString());

        // A tag has exactly one name, so at most one branch applies.
        String key = meta.attr("name");
        if ("twitter:text:job_title".equals(key)) {
            job.setPosition(meta.attr("content"));
        } else if ("twitter:text:company".equals(key)) {
            job.getCompany().setName(meta.attr("content"));
        } else if ("twitter:text:city".equals(key)) {
            job.setAddress(meta.attr("content"));
        } else if ("twitter:text:salary".equals(key)) {
            job.setSalary(meta.attr("content"));
        } else if ("twitter:text:job_description_web".equals(key)
                || "twitter:text:skills".equals(key)) {
            descriptionText.append(StringEscapeUtils.unescapeHtml4(meta.attr("content")));
        }
    }

    job.setDescription(descriptionText.toString());

    update("Done", 3L);
    return job;
}