Example usage for org.jsoup.nodes.Element.attr

List of usage examples for org.jsoup.nodes.Element.attr

Introduction

On this page you can find example usages of org.jsoup.nodes.Element.attr.

Prototype

public String attr(String attributeKey) 

Source Link

Document

Get an attribute's value by its key.

Usage

From source file:net.meiolania.apps.habrahabr.fragments.qa.loader.QaLoader.java

/**
 * Downloads one page of the Q&amp;A listing and converts every {@code div.post}
 * element into a {@link QaData} item.
 *
 * <p>The {@code %page%} placeholder in {@code url} is replaced with the current
 * {@code page} value before the request is made. Sub-elements that are missing
 * from a post yield an empty string instead of a {@link NullPointerException},
 * so a single malformed post does not abort the whole page.
 *
 * @return the items parsed from the page; empty when the request fails or the
 *         page contains no posts (never {@code null})
 */
@Override
public ArrayList<QaData> loadInBackground() {
    ArrayList<QaData> data = new ArrayList<QaData>();

    // Built outside the try block so the failure log can name the exact URL.
    String readyUrl = url.replace("%page%", String.valueOf(page));

    try {
        Log.i(TAG, "Loading a page: " + readyUrl);

        Document document = Jsoup.connect(readyUrl).get();

        Elements qaList = document.select("div.post");

        for (Element qa : qaList) {
            QaData qaData = new QaData();

            // first() returns null when the selector matches nothing.
            Element title = qa.select("a.post_title").first();
            Element hubs = qa.select("div.hubs").first();
            Element answers = qa.select("div.informative").first();
            Element date = qa.select("div.published").first();
            Element author = qa.select("div.author > a").first();
            Element score = qa.select("span.score").first();

            qaData.setTitle(textOrEmpty(title));
            // "abs:" asks jsoup to resolve the href against the document's base URI.
            qaData.setUrl(title != null ? title.attr("abs:href") : "");
            qaData.setHubs(textOrEmpty(hubs));
            qaData.setAnswers(textOrEmpty(answers));
            qaData.setDate(textOrEmpty(date));
            qaData.setAuthor(textOrEmpty(author));
            qaData.setScore(textOrEmpty(score));

            data.add(qaData);
        }
    } catch (IOException e) {
        // Best effort: keep returning whatever was collected, but record the failure.
        Log.e(TAG, "Failed to load a page: " + readyUrl, e);
    }

    return data;
}

/** Returns the element's text, or an empty string when the element is absent. */
private static String textOrEmpty(Element element) {
    return element != null ? element.text() : "";
}

From source file:net.meiolania.apps.habrahabr.fragments.users.loader.UsersLoader.java

/**
 * Downloads the users listing at {@code url} and converts every
 * {@code div.user} element into a {@link UsersData} item.
 *
 * <p>Sub-elements that are missing from a user entry yield an empty string
 * instead of a {@link NullPointerException}, so a single malformed entry does
 * not abort the whole page.
 *
 * @return the items parsed from the page; empty when the request fails or the
 *         page contains no users (never {@code null})
 */
@Override
public ArrayList<UsersData> loadInBackground() {
    ArrayList<UsersData> data = new ArrayList<UsersData>();

    try {
        Log.i(TAG, "Loading a page: " + url);

        Document document = Jsoup.connect(url).get();
        Elements users = document.select("div.user");

        for (Element user : users) {
            UsersData usersData = new UsersData();

            // first() returns null when the selector matches nothing.
            Element rating = user.select("div.rating").first();
            Element karma = user.select("div.karma").first();
            Element avatar = user.select("div.avatar > a > img").first();
            Element name = user.select("div.userlogin > div.username > a").first();
            Element lifetime = user.select("div.info > div.lifetime").first();

            usersData.setName(textOrEmpty(name));
            // "abs:" asks jsoup to resolve the href against the document's base URI.
            usersData.setUrl(name != null ? name.attr("abs:href") : "");
            usersData.setRating(textOrEmpty(rating));
            usersData.setKarma(textOrEmpty(karma));
            usersData.setAvatar(avatar != null ? avatar.attr("src") : "");
            usersData.setLifetime(textOrEmpty(lifetime));

            data.add(usersData);
        }
    } catch (IOException e) {
        // Best effort: keep returning whatever was collected, but record the failure.
        Log.e(TAG, "Failed to load a page: " + url, e);
    }

    return data;
}

/** Returns the element's text, or an empty string when the element is absent. */
private static String textOrEmpty(Element element) {
    return element != null ? element.text() : "";
}

From source file:net.meiolania.apps.habrahabr.fragments.users.loader.UsersShowLoader.java

/**
 * Downloads the user profile page at {@code url} and fills a
 * {@link UsersFullData} with the values found on it.
 *
 * <p>Every field falls back to an empty string when its element is missing
 * from the page, so an incomplete profile never raises a
 * {@link NullPointerException}.
 *
 * @return the populated profile data; a freshly constructed, unpopulated
 *         instance when the request fails (never {@code null})
 */
@Override
public UsersFullData loadInBackground() {
    UsersFullData data = new UsersFullData();

    try {
        Log.i(TAG, "Loading a page: " + url);

        Document document = Jsoup.connect(url).get();

        // first() returns null when the selector matches nothing.
        Element avatar = document.select("a.avatar > img").first();
        Element karma = document.select("div.karma > div.score > div.num").first();
        Element rating = document.select("div.rating > div.num").first();
        Element birthday = document.select("dd.bday").first();
        Element fullname = document.select("div.fullname").first();
        Element summary = document.select("dd.summary").first();
        Element interests = document.select("dl.interests > dd").first();

        data.setAvatar(avatar != null ? avatar.attr("src") : "");
        data.setKarma(karma != null ? karma.text() : "");
        data.setRating(rating != null ? rating.text() : "");
        data.setBirthday(birthday != null ? birthday.text() : "");
        data.setFullname(fullname != null ? fullname.text() : "");
        data.setSummary(summary != null ? summary.text() : "");
        data.setInterests(interests != null ? interests.text() : "");
    } catch (IOException e) {
        // Best effort: return the unpopulated object, but record the failure.
        Log.e(TAG, "Failed to load a page: " + url, e);
    }

    return data;
}

From source file:net.trustie.model.OpenHubProject_Model.java

/**
 * Reads the definition list inside {@code quickRef} and copies the recognised
 * entries into this model's fields.
 *
 * <p>{@code dt} elements are treated as labels and {@code dd} elements as their
 * values, paired by position. Labels that are not handled below are ignored.
 * Link-valued entries are stored as {@code text + SOURCE_SEPERATOR + href},
 * joined with {@code OSSEAN_SEPERATOR}.
 *
 * @param quickRef the element containing the {@code dt}/{@code dd} pairs
 */
private void handleQuickRef(Element quickRef) {
    Elements itemNames = quickRef.select("dt");
    Elements itemValues = quickRef.select("dd");

    // Stop at the shorter list so a label without a matching value cannot
    // raise an IndexOutOfBoundsException.
    int pairCount = Math.min(itemNames.size(), itemValues.size());

    for (int i = 0; i < pairCount; i++) {
        String refName = itemNames.get(i).text();
        Element eValue = itemValues.get(i);

        switch (refName) {
        case "Organization:": {
            this.organization = eValue.text();
            break;
        }
        case "Project Links:": {
            this.projectLinks = joinLinks(eValue.select("a"));
            break;
        }
        case "Code Locations:": {
            Elements locs = eValue.select("a");
            // Without a link, fall back to the plain text of the value.
            this.codeLocation = locs.isEmpty() ? eValue.text() : formatLink(locs.get(0));
            break;
        }
        case "Licenses:": {
            List<String> listLicenses = new ArrayList<String>();
            for (Element license : eValue.select("a")) {
                listLicenses.add(license.text());
            }
            this.licenses = StringHandler.combineTags(listLicenses);
            break;
        }
        case "Similar Projects:": {
            List<Element> projects = new ArrayList<Element>();
            for (Element cell : eValue.select("td[width=49%]")) {
                Element project = cell.select("a").first();
                // A cell without a link carries no project reference; skip it
                // instead of failing on get(0).
                if (project != null) {
                    projects.add(project);
                }
            }
            this.similarProjects = joinLinks(projects);
            break;
        }
        case "Managers:": {
            // NOTE(review): this placeholder text is hard-coded for a project named
            // "BugSystem"; presumably other projects show their own name here -
            // confirm whether a prefix match was intended.
            if ("Become the first manager for BugSystem".equals(eValue.text())) {
                break;
            }
            this.managers = joinLinks(eValue.select("a"));
            break;
        }
        default: {
            break;
        }
        }
    }
}

/** Formats one anchor as {@code text + SOURCE_SEPERATOR + href}. */
private static String formatLink(Element anchor) {
    return anchor.text() + Seperator.SOURCE_SEPERATOR + anchor.attr("href");
}

/** Formats every anchor with {@link #formatLink} and joins the results with {@code OSSEAN_SEPERATOR}. */
private static String joinLinks(List<Element> anchors) {
    String[] formatted = new String[anchors.size()];
    for (int j = 0; j < formatted.length; j++) {
        formatted[j] = formatLink(anchors.get(j));
    }
    return StringUtils.join(formatted, Seperator.OSSEAN_SEPERATOR);
}

From source file:net.trustie.model.SFProject_Model.java

/**
 * Post-processing hook for a crawled project page.
 *
 * <p>Records the page URL, its MD5, the collection timestamp and a page MD5
 * built from the current field values, then inspects the {@code body} element:
 * pages whose body id is {@code pg_project} are dispatched by body class to the
 * matching extractor and have their date fields normalised; any other page has
 * its name, description and feature list read directly from the body. Both
 * date fields are finally passed through {@code DateHandler.formatAllTypeDate}.
 * Nothing beyond the bookkeeping fields is touched when the document has no
 * {@code body} element.
 *
 * @param page the crawled page supplying the URL, HTML document and time
 */
public void afterProcess(Page page) {
    this.url = page.getPageUrl();
    this.urlMd5 = DigestUtils.md5Hex(page.getPageUrl());

    SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    this.collectTime = timestampFormat.format(new Date());
    this.pageMd5 = DigestUtils.md5Hex(urlMd5 + lastUpdate + feature + downloadCount + stars);

    Document doc = page.getHtml().getDocument();
    Elements bodies = doc.select("body");
    if (bodies.isEmpty()) {
        return;
    }

    Element body = bodies.first();
    boolean isProjectPage = "pg_project".equals(body.attr("id"));

    if (!isProjectPage) {
        // Other layout: read name, description and features straight from the body.
        this.name = body.select("div#proj_header div.proj-title h2").text();
        this.desc = body.select("div#top_left div#home_intro div#proj-overview p").text();
        this.feature = body.select("div#top_left div#home_intro div#proj-overview ul").text();
    } else {
        // The body class selects which extractor applies; other classes are ignored.
        switch (body.attr("class")) {
        case "bluesteel user":
            extractPageBluesteelUser(doc);
            break;
        case "enterprise user":
            extractPageEnterpriseUser(doc);
            break;
        default:
            break;
        }

        // Values containing "ago" are converted through getTime first.
        if (lastUpdate.contains("ago")) {
            this.lastUpdate = getTime(lastUpdate);
        }
        if (registeredTime.contains("ago")) {
            this.registeredTime = getTime(registeredTime);
        }

        // Empty dates are replaced with an all-zero placeholder timestamp.
        if (lastUpdate.isEmpty()) {
            this.lastUpdate = "0000-00-00 00:00:00";
        }
        if (registeredTime.isEmpty()) {
            this.registeredTime = "0000-00-00 00:00:00";
        }
    }

    this.lastUpdate = DateHandler.formatAllTypeDate(lastUpdate, page.getTime());
    this.registeredTime = DateHandler.formatAllTypeDate(registeredTime, page.getTime());
}

From source file:nl.surfsara.warcexamples.datascience.WordCountMapper.java

/**
 * Emits, for every HTML HTTP-response record, one word-count entry and one
 * entry per outgoing link, all keyed by the record's target URI.
 *
 * <p>Records are skipped when they are not HTTP responses, have no parsed HTTP
 * header, are not {@code text/html}, have no payload, or have null/empty
 * content. The {@code NUM_HTTP_RESPONSE_RECORDS} counter is incremented for
 * every {@code text/html} response, including those skipped later for having
 * no payload or content.
 *
 * @param key     the record key; used only for the status message
 * @param value   the WARC record to process
 * @param context the Hadoop context that receives status, counters and output
 * @throws IOException          if reading the payload or writing output fails
 * @throws InterruptedException if writing output is interrupted
 */
@Override
public void map(LongWritable key, WarcRecord value, Context context) throws IOException, InterruptedException {
    context.setStatus(Counters.CURRENT_RECORD + ": " + key.get());

    // Only process http response content. Note that the outlinks can also be found in the wat metadata.
    if (!"application/http; msgtype=response".equals(value.header.contentTypeStr)) {
        return;
    }

    // org.jwat.warc.WarcRecord is kind enough to also parse http headers for us:
    HttpHeader httpHeader = value.getHttpHeader();
    if (httpHeader == null) {
        // No header so we are unsure that the content is text/html: NOP
        return;
    }
    if (httpHeader.contentType == null || !httpHeader.contentType.contains("text/html")) {
        return;
    }

    // Note that if you really want to do this right; you should look at the character encoding as well.
    // We'll leave that as an exercise for you ;-).
    context.getCounter(Counters.NUM_HTTP_RESPONSE_RECORDS).increment(1);

    // Get the html payload
    Payload payload = value.getPayload();
    if (payload == null) {
        return;
    }

    String warcContent = IOUtils.toString(payload.getInputStreamComplete());
    // Fixed: this used '&&', which can never be true, so null/empty content
    // fell through to the processing below.
    if (warcContent == null || "".equals(warcContent)) {
        return;
    }

    String targetURI = value.header.warcTargetUriStr;

    // Write Word Count Mapping. Counting runs on the raw content: stripping the
    // markup first with Jsoup.parse(warcContent).text() was tried by the original
    // author and reported to empty the pages completely.
    context.write(new Text(targetURI), new Text(parseToString(countWords(warcContent))));

    // Write Links
    Document doc = Jsoup.parse(warcContent);
    Elements links = doc.select("a");
    for (Element link : links) {
        String absHref = link.attr("abs:href");
        // Omit nulls and empty strings
        if (absHref != null && !("".equals(absHref))) {
            context.write(new Text(targetURI), new Text(absHref));
        }
    }
}

From source file:nl.surfsara.warcexamples.hadoop.warc.HrefExtracter.java

/**
 * Emits one {@code (target URI, absolute href)} pair for every anchor found in
 * an HTML HTTP-response record.
 *
 * <p>Records are skipped when they are not HTTP responses, have no parsed HTTP
 * header, are not {@code text/html}, have no payload, or have null/empty
 * content. The {@code NUM_HTTP_RESPONSE_RECORDS} counter is incremented for
 * every {@code text/html} response before the payload is examined.
 *
 * @param key     the record key; used only for the status message
 * @param value   the WARC record to process
 * @param context the Hadoop context that receives status, counters and output
 * @throws IOException          if reading the payload or writing output fails
 * @throws InterruptedException if writing output is interrupted
 */
@Override
public void map(LongWritable key, WarcRecord value, Context context) throws IOException, InterruptedException {
    context.setStatus(Counters.CURRENT_RECORD + ": " + key.get());

    // Anything other than an http response is out of scope. (The outlinks can
    // also be found in the wat metadata.)
    boolean isHttpResponse = "application/http; msgtype=response".equals(value.header.contentTypeStr);
    if (!isHttpResponse) {
        return;
    }

    // WarcRecord parses the http headers for us; without them we cannot tell
    // whether the content is text/html.
    HttpHeader header = value.getHttpHeader();
    if (header == null) {
        return;
    }

    // Character encoding is deliberately not inspected here.
    boolean isHtml = header.contentType != null && header.contentType.contains("text/html");
    if (!isHtml) {
        return;
    }
    context.getCounter(Counters.NUM_HTTP_RESPONSE_RECORDS).increment(1);

    Payload htmlPayload = value.getPayload();
    if (htmlPayload == null) {
        return;
    }

    String html = IOUtils.toString(htmlPayload.getInputStreamComplete());
    if (html == null || "".equals(html)) {
        return;
    }

    String targetURI = value.header.warcTargetUriStr;
    for (Element anchor : Jsoup.parse(html).select("a")) {
        String absHref = anchor.attr("abs:href");
        // Omit nulls and empty strings
        if (absHref == null || "".equals(absHref)) {
            continue;
        }
        context.write(new Text(targetURI), new Text(absHref));
    }
}

From source file:no.kantega.publishing.admin.content.htmlfilter.CleanupFormHtmlFilter.java

/**
 * Gives every {@code input} element whose {@code type} attribute is blank an
 * explicit {@code type="text"}.
 *
 * @param document the document to clean up; modified in place
 * @return the same document instance
 */
@Override
public Document runFilter(Document document) {
    for (Element field : document.getElementsByTag("input")) {
        if (!isBlank(field.attr("type"))) {
            continue;
        }
        field.attr("type", "text");
    }
    return document;
}

From source file:no.kantega.publishing.admin.content.htmlfilter.ContextPathFilter.java

/**
 * Rewrites the given attribute on each element so that relative and
 * context-path-prefixed values start with {@code rootUrlToken} instead.
 *
 * <p>A value starting with {@code ../} becomes {@code rootUrlToken + "/"}
 * followed by whatever comes after its last {@code ../}. A value starting with
 * {@code contextPath + "/"} (only checked when {@code contextPath} is
 * non-empty) has that context path replaced by {@code rootUrlToken}. Blank
 * values are left alone.
 *
 * @param elements  the elements to update in place
 * @param attribute the name of the attribute to rewrite
 */
private void fixContextPathForAttribute(Elements elements, String attribute) {
    for (Element element : elements) {
        String value = element.attr(attribute);
        if (!isNotBlank(value)) {
            continue;
        }

        if (value.startsWith("../")) {
            // Drop everything up to and including the last "../".
            int tailStart = value.lastIndexOf("../") + 3;
            value = rootUrlToken + "/" + value.substring(tailStart);
            element.attr(attribute, value);
        }

        if (contextPath.length() > 0 && value.startsWith(contextPath + "/")) {
            value = rootUrlToken + value.substring(contextPath.length());
            element.attr(attribute, value);
        }
    }
}

From source file:no.kantega.publishing.admin.content.htmlfilter.ConvertUnderlineToEditorStyleFilter.java

/**
 * Converts every {@code span} whose inline style sets
 * {@code text-decoration} to {@code underline} (case-insensitive) into a
 * {@code u} element, removing the whole {@code style} attribute.
 *
 * @param document the document to transform; modified in place
 * @return the same document instance
 */
@Override
public Document runFilter(Document document) {
    for (Element span : document.getElementsByTag("span")) {
        String inlineStyle = span.attr("style");
        if (!isNotBlank(inlineStyle)) {
            continue;
        }

        boolean underlined = "underline".equalsIgnoreCase(getSubAttributeValue(inlineStyle, "text-decoration"));
        if (!underlined) {
            continue;
        }

        span.removeAttr("style");
        span.tagName("u");
    }
    return document;
}