Example usage for org.jsoup.nodes Element attr

List of usage examples for org.jsoup.nodes Element attr

Introduction

This page collects usage examples for the org.jsoup.nodes Element attr method.

Prototype

public String attr(String attributeKey) 

Source Link

Document

Get an attribute's value by its key.

Usage

From source file:com.screenslicer.core.util.Util.java

public static WebElement toElement(RemoteWebDriver driver, HtmlNode htmlNode, Element body)
        throws ActionFailed {
    if (body == null) {
        body = Util.openElement(driver, null, null, null);
    }//from   w w w  . jav  a 2 s. c  om
    if (!CommonUtil.isEmpty(htmlNode.id)) {
        WebElement element = toElement(driver, body.getElementById(htmlNode.id));
        if (element != null) {
            return element;
        }
    }
    List<Elements> selected = new ArrayList<Elements>();
    if (!CommonUtil.isEmpty(htmlNode.tagName)) {
        selected.add(body.getElementsByTag(htmlNode.tagName));
    } else if (!CommonUtil.isEmpty(htmlNode.href)) {
        selected.add(body.getElementsByTag("a"));
    }
    if (!CommonUtil.isEmpty(htmlNode.name)) {
        selected.add(body.getElementsByAttributeValue("name", htmlNode.name));
    }
    if (!CommonUtil.isEmpty(htmlNode.type)) {
        selected.add(body.getElementsByAttributeValue("type", htmlNode.type));
    }
    if (!CommonUtil.isEmpty(htmlNode.value)) {
        selected.add(body.getElementsByAttributeValue("value", htmlNode.value));
    }
    if (!CommonUtil.isEmpty(htmlNode.title)) {
        selected.add(body.getElementsByAttributeValue("title", htmlNode.title));
    }
    if (htmlNode.classes != null && htmlNode.classes.length > 0) {
        Map<Element, Integer> found = new HashMap<Element, Integer>();
        for (int i = 0; i < htmlNode.classes.length; i++) {
            Elements elements = body.getElementsByClass(htmlNode.classes[i]);
            for (Element element : elements) {
                if (!found.containsKey(element)) {
                    found.put(element, 0);
                }
                found.put(element, found.get(element) + 1);
            }
        }
        Elements elements = new Elements();
        for (int i = htmlNode.classes.length; i > 0; i--) {
            for (Map.Entry<Element, Integer> entry : found.entrySet()) {
                if (entry.getValue() == i) {
                    elements.add(entry.getKey());
                }
            }
            if (!elements.isEmpty()) {
                break;
            }
        }
        selected.add(elements);
    }
    if (!CommonUtil.isEmpty(htmlNode.href)) {
        Elements hrefs = body.getElementsByAttribute("href");
        Elements toAdd = new Elements();
        String currentUrl = driver.getCurrentUrl();
        String hrefGiven = htmlNode.href;
        for (Element href : hrefs) {
            String hrefFound = href.attr("href");
            if (hrefGiven.equalsIgnoreCase(hrefFound)) {
                toAdd.add(href);
            } else {
                String uriGiven = Util.toCanonicalUri(currentUrl, hrefGiven);
                String uriFound = Util.toCanonicalUri(currentUrl, hrefFound);
                if (uriGiven.equalsIgnoreCase(uriFound)) {
                    toAdd.add(href);
                }
            }
        }
        selected.add(toAdd);
    }
    if (!CommonUtil.isEmpty(htmlNode.innerText)) {
        selected.add(body.getElementsMatchingText(Pattern.quote(htmlNode.innerText)));
    }
    if (htmlNode.multiple != null) {
        selected.add(body.getElementsByAttribute("multiple"));
    }
    Map<Element, Integer> votes = new HashMap<Element, Integer>();
    for (Elements elements : selected) {
        for (Element element : elements) {
            if (!Util.isHidden(element)) {
                if (!votes.containsKey(element)) {
                    votes.put(element, 0);
                }
                votes.put(element, votes.get(element) + 1);
            }
        }
    }
    int maxVote = 0;
    Element maxElement = null;
    for (Map.Entry<Element, Integer> entry : votes.entrySet()) {
        if (entry.getValue() > maxVote) {
            maxVote = entry.getValue();
            maxElement = entry.getKey();
        }
    }
    return toElement(driver, maxElement);
}

From source file:org.kitesdk.spring.hbase.example.service.WebPageSnapshotService.java

/**
 * Parse the outlinks from a href tags in the document, and return them as a
 * list/* w  w  w  .j  a  va  2s. co  m*/
 *
 * @param doc The document to parse
 * @return The list of outlinks as URL strings.
 */
private List<String> getOutlinksFromDocument(Document doc) {
    List<String> outlinks = new ArrayList<String>();
    Elements linkElements = doc.select("a[href]");
    for (Element linkElement : linkElements) {
        outlinks.add(linkElement.attr("href").trim());
    }
    return outlinks;
}

From source file:com.pemikir.youtubeplus.youtube.YoutubeExtractor.java

private VideoInfoItem extractVideoInfoItem(Element li) {
    VideoInfoItem info = new VideoInfoItem();
    info.webpage_url = li.select("a[class*=\"content-link\"]").first().attr("abs:href");
    try {//from ww  w .  j a  v  a  2  s .  co m
        Pattern p = Pattern.compile("v=([0-9a-zA-Z-]*)");
        Matcher m = p.matcher(info.webpage_url);
        m.find();
        info.id = m.group(1);
    } catch (Exception e) {
        e.printStackTrace();
    }
    info.title = li.select("span[class=\"title\"]").first().text();

    info.uploader = li.select("span[class=\"g-hovercard\"]").first().text();

    info.duration = li.select("span[class=\"video-time\"]").first().text();

    Element img = li.select("img").first();
    info.thumbnail_url = img.attr("abs:src");
    // Sometimes youtube sends links to gif files witch somehow seam to not exist
    // anymore. Items with such gif also offer a secondary image source. So we are going
    // to use that if we caught such an item.
    if (info.thumbnail_url.contains(".gif")) {
        info.thumbnail_url = img.attr("data-thumb");
    }

    return info;
}

From source file:com.bdx.rainbow.service.etl.analyze.SYJHttpAnalyze.java

/**
 * ???//w ww.  ja  v a  2 s  . c o m
 */
@Override
public Collection<HttpSeed> findPageSeed(Collection<HttpSeed> seeds) throws Exception {

    if (CollectionUtils.isEmpty(seeds)) {
        return null;
    }

    Collection<HttpSeed> seedGroups = new ArrayList<HttpSeed>();

    for (HttpSeed seed : seeds) {
        Document doc = parse(seed.getHtml());

        // ?URL
        Elements page_form_elements = doc.select("#pageForm");
        if (page_form_elements.isEmpty()) {
            return null;
        }

        Element page_form_e = page_form_elements.get(0);
        // URL
        String url = DOMAIN + page_form_e.attr("action");
        Elements param_elements = page_form_e.select("input");

        // 
        int totalPageNum = this.getTotalPageNum(doc);

        for (int pageNo = 1; pageNo <= totalPageNum; pageNo++) {

            // ?
            Map<String, String> params = new HashMap<String, String>();
            for (Element param_e : param_elements) {
                params.put(param_e.attr("name"), param_e.attr("value"));
            }
            // 
            params.put("curstart", String.valueOf(pageNo));

            HttpSeed httpSeed = this.initListHttpSeed(url, params);

            seedGroups.add(httpSeed);
        }
    }

    return seedGroups;
}

From source file:org.confab.PhpBB3Parser.java

public void postForumThread(Forum targetForum, Post newPost, User user) {
    Utilities.debug("postForumThread");

    try {//  w ww . ja  v  a 2s.  c  om
        String reply_page = targetForum.rootURL() + "newthread.php?do=newthread&f=" + targetForum.id;
        Utilities.debug("GET: " + reply_page);
        HttpGet httpget = new HttpGet(reply_page);
        HttpResponse response = httpclient.execute(httpget, user.httpContext);
        HttpEntity entity = response.getEntity();
        Document page = Jsoup.parse(EntityUtils.toString(entity));
        EntityUtils.consume(entity);
        assert page != null;

        // TODO: need check to make sure we're on the right page.  HttpEntity's
        // can just contain garbage and jsoup will still consume it

        // Make sure we're logged in before going any further
        Element username_box = page.select("input[name=vb_login_username]").first();
        assert username_box == null;
        Element password_box = page.select("input[name=vb_login_password]").first();
        assert password_box == null;

        // Construct POST 
        HttpPost httppost = new HttpPost(targetForum.rootURL() + "newthread.php");
        List<NameValuePair> nvps = new ArrayList<NameValuePair>();

        // TODO: fix subject
        nvps.add(new BasicNameValuePair("subject", "hello world"));
        nvps.add(new BasicNameValuePair("message", newPost.message));

        // Find the form - we can parse the rest of the needed elements from it
        Element reply_form = page.select("form[action*=newthread.php?do=postthread&f=]").first();
        assert reply_form != null;
        String[] vals_array = { "s", "securitytoken", "f", "do", "posthash", "poststarttime", "loggedinuser" };
        List<String> vals = Arrays.asList(vals_array);
        for (String val : vals) {
            Element el = reply_form.select("input[name=" + val + "]").first();
            assert el != null : val;
            nvps.add(new BasicNameValuePair(val, el.attr("value")));
        }
        httppost.setEntity(new UrlEncodedFormEntity(nvps, HTTP.UTF_8));

        // Execute the POST 
        Utilities.debug("Executing POST");
        response = httpclient.execute(httppost, user.httpContext);
        Utilities.debug("POST response: " + response.getStatusLine());
        assert response.getStatusLine().getStatusCode() == 302;
    } catch (IOException e) {
        System.out.println(e);
    }
    Utilities.debug("end postForumThread");
}

From source file:tkbautobooking.BookingSystem.java

private void praseBookingPage() throws Exception {

    Document doc = Jsoup.parse(BookingPageHTML);
    Element class_selector = doc.getElementById("class_selector");

    if (class_selector == null)
        throw new Exception("Prase Booking Page fail !");

    classMap = new TreeMap<>();
    for (Element option : class_selector.getElementsByTag("option")) {
        if (option.attr("value").equals(""))
            continue;

        classMap.put(option.attr("value"), option.text().replace("", " "));
    }/*ww w  . j  a v a 2  s . c o m*/
}

From source file:org.confab.PhpBB3Parser.java

public void createPost(Post replyTo, Post newPost, User user) {
    Utilities.debug("createPost");

    try {/*from  www.j  a va2  s .  co  m*/
        String reply_page = replyTo.rootURL() + "newreply.php?do=newreply&noquote=1&p=" + replyTo.id;
        HttpGet httpget = new HttpGet(reply_page);
        HttpResponse response = httpclient.execute(httpget, user.httpContext);
        HttpEntity entity = response.getEntity();
        Document page = Jsoup.parse(EntityUtils.toString(entity));
        EntityUtils.consume(entity);
        assert page != null;

        // TODO: need check to make sure we're on the right page.  HttpEntity's
        // can just contain garbage and jsoup will still consume it

        // Make sure we're logged in before going any further
        Element username_box = page.select("input[name=vb_login_username]").first();
        assert username_box == null;
        Element password_box = page.select("input[name=vb_login_password]").first();
        assert password_box == null;

        // Construct POST 
        HttpPost httppost = new HttpPost(replyTo.rootURL() + "newreply.php");
        List<NameValuePair> nvps = new ArrayList<NameValuePair>();

        // There is a title param but think it's optional..
        //nvps.add(new BasicNameValuePair("title", "");

        nvps.add(new BasicNameValuePair("message", newPost.message));

        // Find the form - we can parse the rest of the needed elements from it
        Element reply_form = page.select("form[action*=newreply.php?do=postreply&t=]").first();
        assert reply_form != null;
        String[] vals_array = { "s", "securitytoken", "do", "t", "p", "specifiedpost", "posthash",
                "poststarttime", "loggedinuser", "multiquoteempty" };
        List<String> vals = Arrays.asList(vals_array);
        for (String val : vals) {
            Element el = reply_form.select("input[name=" + val + "]").first();
            assert el != null : val;
            nvps.add(new BasicNameValuePair(val, el.attr("value")));
        }
        httppost.setEntity(new UrlEncodedFormEntity(nvps, HTTP.UTF_8));

        // Execute the POST 
        Utilities.debug("Executing POST");
        response = httpclient.execute(httppost, user.httpContext);
        Utilities.debug("POST response: " + response.getStatusLine());
        assert response.getStatusLine().getStatusCode() == 302;
    } catch (IOException e) {
        System.out.println(e);
    }
    Utilities.debug("end createPost");
}

From source file:org.apache.karaf.cave.server.storage.CaveRepositoryImpl.java

/**
 * Proxy a HTTP URL locally./*  ww  w.  j  a v  a 2s  .c  o  m*/
 *
 * @param url    the HTTP URL to proxy.
 * @param filter regex filter. Only artifacts URL matching the filter will be considered.
 * @throws Exception in case of proxy failure.
 */
private void proxyHttp(String url, String filter) throws Exception {
    LOGGER.debug("Proxying HTTP URL {}", url);
    HttpClient httpClient = new DefaultHttpClient();

    HttpGet httpGet = new HttpGet(url);
    HttpResponse response = httpClient.execute(httpGet);
    HttpEntity entity = response.getEntity();

    if (entity != null) {
        if (entity.getContentType().getValue().equals("application/java-archive")
                || entity.getContentType().getValue().equals("application/octet-stream")) {
            // I have a jar/binary, potentially a resource
            try {
                if ((filter == null) || (url.matches(filter))) {
                    Resource resource = new DataModelHelperImpl().createResource(new URL(url));
                    if (resource != null) {
                        obrRepository.addResource(resource);
                        obrRepository.setLastModified(System.currentTimeMillis());
                    }
                }
            } catch (IllegalArgumentException e) {
                LOGGER.warn(e.getMessage());
            }
        } else {
            // try to find link to "browse"
            try {
                Document document = Jsoup.connect(url).get();

                Elements links = document.select("a");
                if (links.size() > 1) {
                    for (int i = 1; i < links.size(); i++) {
                        Element link = links.get(i);
                        String absoluteHref = link.attr("abs:href");
                        this.proxyHttp(absoluteHref, filter);
                    }
                }
            } catch (UnsupportedMimeTypeException e) {
                // ignore
            }
        }
    }
}

From source file:com.thesmartweb.swebrank.WebParser.java

/**
 * Method to get the number of links (total, internal)
 * @param link_html the url to parse// www.  ja v  a2 s.  c  om
 * @return the number of links
 */
public int[] getnlinks(String link_html) {
    int[] nlinks = new int[2];
    nlinks[0] = 0;//total number of links
    nlinks[1] = 0;//number of internal links 
    try {
        Document doc = Jsoup.connect(link_html).timeout(10 * 1000).get();
        Elements links = doc.select("a[href]");
        nlinks[0] = links.size();
        //----we check if a link is internal or not (abs is used to get the whole link (abs stands for abs)
        for (Element link : links) {
            if (link.attr("abs:href").contains(link_html)) {
                nlinks[1]++;
            }
        }
        return nlinks;
    } catch (Exception ex) {
        Logger.getLogger(com.thesmartweb.swebrank.WebParser.class.getName()).log(Level.SEVERE, null, ex);
        return nlinks;
    }

}

From source file:crawler.AScraper.java

@Transformer(inputChannel = "channel3", outputChannel = "channel4")
public Artwork convert(Element payload) throws ParseException, MalformedURLException {
    Matcher m = patter.matcher(payload.text());
    if (m.find()) {
        String year = m.group("year");
        String month = m.group("month");
        String day = m.group("day");
        int id = Integer.parseInt(m.group("id"));
        String model = m.group("model").split("[\\s\\[\\]]")[0];
        URL link = new URL(payload.attr("href"));
        DateFormat format = new SimpleDateFormat("yyyy-MM-dd");
        format.setTimeZone(TimeZone.getTimeZone("GMT+8"));
        Date date = format.parse(String.format("%s-%s-%s", year, month, day));
        String thread_title = payload.text();
        return new Artwork(thread_title, id, -1, -1, null, link, null, model, date);
    } else {/*from  ww  w.  j a  v  a2s.  c o  m*/
        LOG.error(payload.text());
        return null;
    }

}