Example usage for org.jsoup.nodes Element attr

List of usage examples for org.jsoup.nodes Element attr

Introduction

On this page you can find example usages of org.jsoup.nodes.Element.attr(String).

Prototype

public String attr(String attributeKey) 

Source Link

Document

Get an attribute's value by its key.

Usage

From source file:mml.handler.post.MMLPostHTMLHandler.java

/**
 * Fallback parser for an element that is not a span, p or div. This may
 * happen but should not.
 * <p>
 * A range named after the element's {@code class} attribute (or after its
 * node name when the class is empty) is added to {@code stil}. Child elements
 * are handled recursively, the whole text of child text nodes is appended to
 * {@code sb}, and the range length is finally set to the number of characters
 * appended while processing this element.
 *
 * @param elem an element that is not a span, p or div
 * @throws JSONException declared by this method; presumably raised by the
 *         range bookkeeping - TODO confirm against Range/stil
 */
private void parseOtherElement(Element elem) throws JSONException {
    final int start = sb.length();
    String rangeName = elem.attr("class");
    if (rangeName == null || rangeName.isEmpty()) {
        rangeName = elem.nodeName();
    }
    final Range range = new Range(rangeName, start, 0);
    stil.add(range);
    for (Node kid : elem.childNodes()) {
        if (kid instanceof Element) {
            parseOtherElement((Element) kid);
        } else if (kid instanceof TextNode) {
            TextNode textNode = (TextNode) kid;
            sb.append(textNode.getWholeText());
        }
    }
    // the range covers everything appended since 'start'
    stil.updateLen(range, sb.length() - start);
    prevWasMilestone = false;
}

From source file:net.GoTicketing.GoTicketing.java

/**
 * Extracts the voice-captcha audio location from the ticketing page.
 * <p>
 * Parses {@code TicketingPageHTML}, takes the last {@code <audio>} element
 * and stores {@code host} followed by the element's {@code src} attribute
 * (without its first character) in {@code VoiceCaptchaSrc}.
 *
 * @throws Exception if the page contains no {@code <audio>} element, or the
 *                   element has no (or an empty) {@code src} attribute
 */
private void praseVoiceCaptchaSrc() throws Exception {
    Document doc = Jsoup.parse(TicketingPageHTML);
    Element voc = doc.getElementsByTag("audio").last();
    if (voc == null)
        throw new Exception("Can't get voice captcha source !");

    // attr() returns "" for a missing attribute; substring(1) on an empty
    // string would fail with an unrelated StringIndexOutOfBoundsException.
    String src = voc.attr("src");
    if (src.isEmpty())
        throw new Exception("Can't get voice captcha source !");

    // The first character of src is dropped (presumably a leading '/' that
    // host already ends with - TODO confirm).
    VoiceCaptchaSrc = host + src.substring(1);
}

From source file:mml.handler.post.MMLPostHTMLHandler.java

/**
 * Parse a div (section).
 * <p>
 * A range named after the div's {@code class} attribute (or "section" when
 * the class is empty) is added to {@code stil}. Each child element is
 * dispatched by its lower-cased node name: paragraphs and headings go to
 * {@code parsePara}, spans to {@code parseSpan}, pre blocks to
 * {@code parsePre}, and anything else to {@code parseOtherElement}.
 * Children that are not elements are skipped.
 *
 * @param div the div
 * @throws JSONException declared by this method and by the parsers it calls
 */
private void parseDiv(Element div) throws JSONException {
    final int start = sb.length();
    String rangeName = div.attr("class");
    if (rangeName == null || rangeName.isEmpty()) {
        rangeName = "section";
    }
    final Range range = new Range(rangeName, start, 0);
    stil.add(range);
    for (Node kid : div.childNodes()) {
        if (!(kid instanceof Element)) {
            continue;
        }
        final Element el = (Element) kid;
        final String tag = kid.nodeName().toLowerCase();
        if (tag.equals("p")) {
            parsePara(el, "p");
        } else if (tag.matches("(h|H)\\d")) {
            parsePara(el, tag);
        } else if (tag.equals("span")) {
            parseSpan(el);
        } else if (tag.equals("pre")) {
            parsePre(el);
        } else {
            parseOtherElement(el);
        }
    }
    ensure(3, true);
    // the range covers everything appended since 'start'
    stil.updateLen(range, sb.length() - start);
    prevWasMilestone = false;
}

From source file:com.weavers.duqhan.business.impl.ProductServiceImpl.java

/**
 * Collects product links from randomly chosen result pages reachable from
 * {@code link} and stores every link that is not yet known.
 * <p>
 * The first pagination anchor ({@code div.ui-pagination-navi a}) of the start
 * page is used as a URL template: the character directly before the literal
 * {@code ".html"} is replaced by a page number. For each of 198 random page
 * numbers in [2, 200) the page is fetched, the method sleeps for a random
 * 30-59 seconds, and every {@code .son-list .list-item .pic a[href]} link
 * that the DAO does not already know is saved with status 0.
 * <p>
 * Any exception aborts the crawl, is logged, and the links collected so far
 * are still returned.
 *
 * @param link URL of the first result page
 * @return one bean per newly saved link; the bean's status code field
 *         carries the saved link itself
 */
@Override
public List<StatusBean> getTempProductLinks(String link) {
    boolean status = true; //success
    String startDate = new Date().toString();
    // NOTE(review): informational messages are logged at SEVERE throughout;
    // kept as-is because the logging configuration is not visible here.
    Logger.getLogger(ProductServiceImpl.class.getName()).log(Level.SEVERE,
            "(==I==)DATE: " + startDate + "Product link collection start.....\n For the link ( " + link + " )");
    List<StatusBean> statusBeans = new ArrayList<>();

    // Choose the pages to visit up front: 198 random page numbers in [2, 200).
    Random random = new Random();
    int[] pageNumber = new int[198];
    for (int i = 0; i < pageNumber.length; i++) {
        pageNumber[i] = 2 + random.nextInt(198);
    }
    try {
        Document doc = Jsoup.connect(link).get();
        Elements productUrlList = doc.select("div.ui-pagination-navi a");
        if (!productUrlList.isEmpty()) {
            String nexturl = productUrlList.get(0).attr("abs:href");
            // Literal search: String.split(".html") would treat '.' as a
            // regex wildcard and fail when nothing follows ".html".
            int extensionAt = nexturl.indexOf(".html");
            if (extensionAt < 1) {
                throw new IllegalStateException("Unexpected pagination link: " + nexturl);
            }
            // Everything before the character that precedes ".html"
            // (that character is replaced by the page number below).
            String firstPart = nexturl.substring(0, extensionAt - 1);
            // ".html" plus whatever follows it (query string etc.).
            String secondPart = nexturl.substring(extensionAt);
            for (int i = 0; i < pageNumber.length; i++) {
                nexturl = firstPart + pageNumber[i] + secondPart;
                doc = Jsoup.connect(nexturl).get();
                productUrlList = doc.select(".son-list .list-item .pic a[href]");
                //=================== Random sleep START ===================//
                TimeUnit.SECONDS.sleep(30 + random.nextInt(30));
                //=================== Random sleep END =====================//

                for (Element element : productUrlList) {
                    String href = element.attr("abs:href");
                    if (temtproductlinklistDao.getTemtproductlinklistByLink(href) != null) {
                        continue; // link already stored
                    }
                    Temtproductlinklist temtproductlinklist = new Temtproductlinklist();
                    temtproductlinklist.setLink(href);
                    temtproductlinklist.setStatus(0);
                    Temtproductlinklist savedTemtproductlinklist = temtproductlinklistDao
                            .save(temtproductlinklist);
                    StatusBean statusBean = new StatusBean();
                    statusBean.setStatus(String.valueOf(savedTemtproductlinklist.getStatus()));
                    statusBean.setStatusCode(savedTemtproductlinklist.getLink());
                    statusBean.setId(savedTemtproductlinklist.getId());
                    statusBeans.add(statusBean);
                }
            }
        }
    } catch (Exception ex) {
        if (ex instanceof InterruptedException) {
            // keep the interrupt visible to the caller / thread pool
            Thread.currentThread().interrupt();
        }
        status = false; //failure
        System.out.println("(=============================================)DATE: " + new Date().toString()
                + "Product link collection get exception.....\n Which started on: " + startDate + "\n"
                + ex.getLocalizedMessage());
        Logger.getLogger(ProductServiceImpl.class.getName()).log(Level.SEVERE,
                "(==E==)DATE: " + new Date().toString()
                        + "Product link collection get exception.....\n Which started on: " + startDate + "\n",
                ex);
    }
    if (status) {
        System.out.println("=============================================DATE: " + new Date().toString()
                + "Product link collection end.....\n Which started on: " + startDate);
        Logger.getLogger(ProductServiceImpl.class.getName()).log(Level.SEVERE, "(==I==)DATE: "
                + new Date().toString() + "Product link collection end.....\n Which started on: " + startDate);
    }
    return statusBeans;
}

From source file:mml.handler.post.MMLPostHTMLHandler.java

/**
 * Parse a span with a class or not/* ww w .  java2  s  .c om*/
 * @param span the span in HTML
 */
private void parseSpan(Element span) throws JSONException {
    if (span.hasText()) {
        int offset = sb.length();
        String name = span.attr("class");
        Range r = new Range(name, offset, 0);
        if (name == null || name.length() == 0)
            name = "span";
        if (isMilestone(name)) {
            pages.add(r);
            sb.append(span.text());
            sb.append("\n");
            pages.updateLen(r, sb.length() - offset);
            prevWasMilestone = true;
        } else if (name.equals("soft-hyphen")) {
            stil.add(r);
            // get previous word
            int i = sb.length() - 1;
            while (i > 0 && !Character.isWhitespace(sb.charAt(i)))
                i--;
            if (i > 0)
                i++;
            String prev = clean(sb.substring(i), true);
            // get next word
            String next = clean(nextWord(span), false);
            if (this.speller.isHardHyphen(prev, next))
                r.name = "hard-hyphen";
            sb.append(span.text());
            stil.updateLen(r, sb.length() - offset);
        } else // span may contain other spans
        {
            stil.add(r);
            List<Node> children = span.childNodes();
            for (Node child : children) {
                if (child instanceof Element) {
                    String nName = child.nodeName().toLowerCase();
                    if (nName.equals("span"))
                        parseSpan((Element) child);
                    else
                        parseOtherElement((Element) child);
                } else if (child instanceof TextNode) {
                    TextNode tn = (TextNode) child;
                    sb.append(tn.text());
                }
            }
            if (isLineFormat(name))
                ensure(1, false);
            stil.updateLen(r, sb.length() - offset);
        }
    }
    // else strangely no text: ignore it
}

From source file:com.zacwolf.commons.email.Email.java

/**
 * Normalises an HTML message body before it is sent.
 * <ul>
 * <li>removes comments to reduce the size of the message;</li>
 * <li>sets the first {@code <title>} to the subject of the email;</li>
 * <li>replaces the contents of every element with {@code class="date"} by
 * the current date, formatted with the element's {@code format} attribute
 * when present and valid, otherwise with the default pattern;</li>
 * <li>adds {@code border-spacing:0;} to the style of every table that does
 * not declare a border-spacing (GMail compatibility).</li>
 * </ul>
 *
 * @param doc the parsed message body; modified in place
 */
private void prepare(final org.jsoup.nodes.Document doc) {
    removeComments(doc);//Remove any comments from the html of the message to reduce the size
    //Change the title to match the subject of the email
    final org.jsoup.nodes.Element title = doc.getElementsByTag("title").first();
    if (title != null)
        title.html(getSubject());
    //Replace the contents of any tags with class="date" with the current date
    for (org.jsoup.nodes.Element datelem : doc.getElementsByClass("date")) {
        SimpleDateFormat df = new SimpleDateFormat("MMMMMMMMMM d, yyyy");
        if (datelem.hasAttr("format")) {
            try {
                df = new SimpleDateFormat(datelem.attr("format"));
            } catch (IllegalArgumentException ignored) {
                // invalid pattern: fall back to the default format
            }
        }
        // Written for every date element, not only those carrying a format
        // attribute; previously elements without one were left untouched.
        datelem.html(df.format(TimeUtils.getGMTtime()));
    }
    //tables need the border-spacing: style attribute; added for GMail compatiblity
    for (org.jsoup.nodes.Element tbl : doc.getElementsByTag("table")) {
        final String style = tbl.attr("style");
        if (!style.contains("border-spacing:")) {
            // no separator for an empty style, so the value never starts with ';'
            final String separator = (style.isEmpty() || style.endsWith(";")) ? "" : ";";
            tbl.attr("style", style + separator + "border-spacing:0;");
        }
    }
}

From source file:com.ferasinfotech.gwreader.ScreenSlidePageFragment.java

/**
 * Alternate Factory method for this fragment class. Constructs a new fragment for the given page number,
 * and HTML story element./*from w  w  w .  j av a  2 s  .  c  o m*/
 */
public static ScreenSlidePageFragment create(int pageNumber, int numPages, org.jsoup.nodes.Element story) {
    int story_id = -1;
    String name = "";
    String summary = "";
    String headline = "";
    String cover_photo_url = "";
    String story_string = "";
    long createdAt;

    ScreenSlidePageFragment fragment = new ScreenSlidePageFragment();
    Bundle args = new Bundle();
    if (pageNumber == 0) {
        story_id = 0;
        name = "Grasswire Help";
        headline = "Usage Instructions";
        cover_photo_url = "android.resource://com.ferasinfotech.gwreader/" + R.drawable.gw_logo;
        summary = "Swipe right and left to read each story.\n\n"
                + "Scroll down to read facts and associated news items (tweets and links) for each story.\n\n"
                + "Tap on a news items within a story and you'll be able to follow web links, view tweets via the Twitter app, or watch videos.\n\n"
                + "A long press on a story's cover photo will launch the device browser to view or edit the story on the Grasswire mobile site.\n\n"
                + "A long press on the image above will launch the Grasswire main page.\n\n" + "App Version: "
                + BuildConfig.VERSION_NAME + "\n\n";
    } else {

        // doing a story page, Element 'story' is the story data

        Elements e_list;
        org.jsoup.nodes.Element tag;

        story_id = Integer.valueOf(story.attr("data-story-id"));
        e_list = story.getElementsByClass("feature__tag");
        tag = e_list.get(0);
        name = tag.text() + " (" + pageNumber + "/" + numPages + ")";
        e_list = story.getElementsByClass("story__summary");
        tag = e_list.get(0);
        summary = tag.html().replace("<br />", "\r");
        e_list = story.getElementsByClass("feature__text");
        tag = e_list.get(0);
        headline = tag.text();
        e_list = story.getElementsByClass("feature__image");
        tag = e_list.get(0);
        cover_photo_url = tag.attr("src");
        story_string = story.toString();

    }

    args.putInt(ARG_PAGE, pageNumber);
    args.putInt(ARG_STORY_ID, story_id);
    args.putString(ARG_TITLE, name);
    args.putString(ARG_SUMMARY, summary);
    args.putString(ARG_HEADLINE, headline);
    args.putString(ARG_COVER_PHOTO, cover_photo_url);
    args.putString(ARG_STORY_STRING, "<html><head></head><body>" + story_string + "</body></html>");
    fragment.setArguments(args);
    return fragment;
}

From source file:com.digitalpebble.storm.crawler.bolt.JSoupParserBolt.java

/**
 * Parses the HTML content of a fetched page with jsoup and emits the result.
 * <p>
 * Steps, in order:
 * <ol>
 * <li>reads the {@code content}, {@code url} and {@code metadata} fields of
 * the tuple;</li>
 * <li>rejects the tuple through {@code handleException} when the HTTP
 * Content-Type is set and does not mention "html";</li>
 * <li>parses the content, extracts the robots directives and, unless
 * forbidden by them in strict mode, collects the {@code a[href]} links
 * with their anchor texts;</li>
 * <li>runs the parse filters;</li>
 * <li>emits the outlinks on the status stream (when {@code emitOutlinks} is
 * set) and every document of the parse result on the default stream, then
 * acks the tuple.</li>
 * </ol>
 * Parsing or filtering failures are passed to {@code handleException} and
 * end the processing of the tuple.
 *
 * @param tuple the incoming tuple
 */
@Override
public void execute(Tuple tuple) {

    byte[] content = tuple.getBinaryByField("content");
    String url = tuple.getStringByField("url");
    Metadata metadata = (Metadata) tuple.getValueByField("metadata");

    // check that its content type is HTML
    // look at value found in HTTP headers
    boolean CT_OK = false;
    String httpCT = metadata.getFirstValue(HttpHeaders.CONTENT_TYPE);
    if (StringUtils.isNotBlank(httpCT)) {
        if (httpCT.toLowerCase().contains("html")) {
            CT_OK = true;
        }
    }
    // simply ignore cases where the content type has not been set
    // TODO sniff content with Tika?
    else {
        CT_OK = true;
    }

    if (!CT_OK) {
        String errorMessage = "Exception content-type " + httpCT + " for " + url;
        RuntimeException e = new RuntimeException(errorMessage);
        handleException(url, e, metadata, tuple, "content-type checking", errorMessage);
        return;
    }

    LOG.info("Parsing : starting {}", url);

    long start = System.currentTimeMillis();

    String charset = getContentCharset(content, metadata);

    // get the robots tags from the fetch metadata
    RobotsTags robotsTags = new RobotsTags(metadata);

    Map<String, List<String>> slinks;
    String text;
    DocumentFragment fragment;
    try (ByteArrayInputStream bais = new ByteArrayInputStream(content)) {
        org.jsoup.nodes.Document jsoupDoc = Jsoup.parse(bais, charset, url);

        fragment = JSoupDOMBuilder.jsoup2HTML(jsoupDoc);

        // extracts the robots directives from the meta tags
        robotsTags.extractMetaTags(fragment);

        // store a normalised representation in metadata
        // so that the indexer is aware of it
        robotsTags.normaliseToMetadata(metadata);

        // do not extract the links if no follow has been set
        // and we are in strict mode
        if (robotsTags.isNoFollow() && robots_noFollow_strict) {
            slinks = new HashMap<String, List<String>>(0);
        } else {
            Elements links = jsoupDoc.select("a[href]");
            slinks = new HashMap<String, List<String>>(links.size());
            for (Element link : links) {
                // abs:href tells jsoup to return fully qualified domains
                // for
                // relative urls.
                // e.g.: /foo will resolve to http://shopstyle.com/foo
                String targetURL = link.attr("abs:href");

                // nofollow: rel holds a space-separated list of link types
                // (e.g. "nofollow noopener"), so look for a matching token
                // instead of comparing the whole attribute value
                boolean noFollow = false;
                for (String relToken : link.attr("rel").split("\\s+")) {
                    if ("nofollow".equalsIgnoreCase(relToken)) {
                        noFollow = true;
                        break;
                    }
                }
                // remove altogether
                if (noFollow && robots_noFollow_strict) {
                    continue;
                }

                // link not specifically marked as no follow
                // but whole page is
                if (!noFollow && robotsTags.isNoFollow()) {
                    noFollow = true;
                }

                String anchor = link.text();
                if (StringUtils.isNotBlank(targetURL)) {
                    // any existing anchors for the same target?
                    List<String> anchors = slinks.get(targetURL);
                    if (anchors == null) {
                        anchors = new LinkedList<String>();
                        slinks.put(targetURL, anchors);
                    }
                    // track the anchors only if no follow is false
                    if (!noFollow && StringUtils.isNotBlank(anchor)) {
                        anchors.add(anchor);
                    }
                }
            }
        }

        text = jsoupDoc.body().text();

    } catch (Throwable e) {
        String errorMessage = "Exception while parsing " + url + ": " + e;
        handleException(url, e, metadata, tuple, "content parsing", errorMessage);
        return;
    }

    // store identified charset in md
    metadata.setValue("parse.Content-Encoding", charset);

    long duration = System.currentTimeMillis() - start;

    LOG.info("Parsed {} in {} msec", url, duration);

    List<Outlink> outlinks = toOutlinks(url, metadata, slinks);

    ParseResult parse = new ParseResult();
    parse.setOutlinks(outlinks);

    // parse data of the parent URL
    ParseData parseData = parse.get(url);
    parseData.setMetadata(metadata);
    parseData.setText(text);
    parseData.setContent(content);

    // apply the parse filters if any
    try {
        parseFilters.filter(url, content, fragment, parse);
    } catch (RuntimeException e) {

        String errorMessage = "Exception while running parse filters on " + url + ": " + e;
        handleException(url, e, metadata, tuple, "content filtering", errorMessage);
        return;
    }

    if (emitOutlinks) {
        for (Outlink outlink : outlinks) {
            collector.emit(StatusStreamName, tuple,
                    new Values(outlink.getTargetURL(), outlink.getMetadata(), Status.DISCOVERED));
        }
    }

    // emit each document/subdocument in the ParseResult object
    // there should be at least one ParseData item for the "parent" URL

    for (Map.Entry<String, ParseData> doc : parse) {
        ParseData parseDoc = doc.getValue();

        collector.emit(tuple,
                new Values(doc.getKey(), parseDoc.getContent(), parseDoc.getMetadata(), parseDoc.getText()));
    }

    collector.ack(tuple);
    eventCounter.scope("tuple_success").incr();
}

From source file:com.fluidops.iwb.provider.HTMLProvider.java

/**
 * Fetches the configured page, dumps its media, imports and links to
 * {@code HTMLdata.txt}, and turns the table rows of the page into
 * statements appended to {@code res}.
 * <p>
 * Table rows are all {@code tr} elements below any {@code tbody}. Rows 0-2
 * and the last two rows are skipped. Row 3 is read as the header: its first
 * four cells become the type / roads / side / total URIs. Every later row
 * yields a type statement, a label and three literal statements keyed by
 * the text of its first cell.
 *
 * @param res receives the generated statements
 * @throws Exception on network or file errors, or when a processed row has
 *                   fewer than four cells
 */
@Override
public void gather(List<Statement> res) throws Exception {

    String url = config.url;
    Document doc = Jsoup.connect(url).get();
    Elements links = doc.select("a[href]");
    Elements media = doc.select("[src]");
    Elements imports = doc.select("link[href]");
    Elements article = doc.getElementsByTag("tbody").select("tr");
    Elements tableElem;
    URI nameURI = null;
    URI roadsURI = null;
    URI sideURI = null;
    URI totalURI = null;

    File file = new File("HTMLdata.txt");
    PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(file)));
    // try/finally so the file handle is released even when a row is
    // malformed or statement creation fails
    try {
        out.println("Media");
        print("\nMedia: (%d)", media.size());
        for (Element el : media) {
            if (el.tagName().equals("img")) {
                print(" * %s: <%s> %sx%s (%s)", el.tagName(), el.attr("abs:src"), el.attr("width"),
                        el.attr("height"), trim(el.attr("alt"), 20));
                out.printf(" \n * %s: <%s> %sx%s (%s)", el.tagName(), el.attr("abs:src"), el.attr("width"),
                        el.attr("height"), trim(el.attr("alt"), 20));
                out.println();
            } else {
                print(" * %s: <%s>", el.tagName(), el.attr("abs:src"));
                out.printf(" \n * %s: <%s>", el.tagName(), el.attr("abs:src"));
                out.println();
            }

        }

        out.println("Imports");
        print("\nImports: (%d)", imports.size());
        for (Element link : imports) {
            print(" * %s <%s> (%s)", link.tagName(), link.attr("abs:href"), link.attr("rel"));
            out.printf(" * %s <%s> (%s)", link.tagName(), link.attr("abs:href"), link.attr("rel"));
            out.println();
        }

        out.println("Links");
        print("\nLinks: (%d)", links.size());
        for (Element link : links) {
            print(" * a: <%s> (%s)", link.attr("abs:href"), trim(link.text(), 35));
            out.printf(" * a: <%s> (%s)", link.attr("abs:href"), link.text());
            out.println();
        }

        out.println("Article");
        print("\nArticle: (%d)", article.size());

        // rows 0-2 and the last two rows are not data rows
        for (int i = 3; i < article.size() - 2; i++) {
            tableElem = article.get(i).select("td");
            out.println();

            if (i == 3) {
                // header row: column names become the predicate / type URIs
                nameURI = ProviderUtils.objectToUri(tableElem.get(0).text());
                roadsURI = ProviderUtils.objectToUri(tableElem.get(1).text());
                sideURI = ProviderUtils.objectToUri(tableElem.get(2).text());
                totalURI = ProviderUtils.objectToUri(tableElem.get(3).text());

            } else {
                // data row: the first cell identifies the subject
                String rowLabel = tableElem.get(0).text();

                res.add(ProviderUtils.createStatement(ProviderUtils.objectToUri(rowLabel), RDF.TYPE,
                        nameURI));
                res.add(ProviderUtils.createLiteralStatement(ProviderUtils.objectToUri(rowLabel),
                        RDFS.LABEL, rowLabel));
                res.add(ProviderUtils.createLiteralStatement(ProviderUtils.objectToUri(rowLabel),
                        roadsURI, tableElem.get(1).text()));
                res.add(ProviderUtils.createLiteralStatement(ProviderUtils.objectToUri(rowLabel),
                        sideURI, tableElem.get(2).text()));
                res.add(ProviderUtils.createLiteralStatement(ProviderUtils.objectToUri(rowLabel),
                        totalURI, tableElem.get(3).text()));

                for (Element el : tableElem) {
                    out.printf("\n * (%s): (%s)", el.tagName(), el.text());
                    out.println();

                }
            }
            out.println();
            out.printf("\n * a (%s) (%d): (%s)", article.get(i).tagName(), tableElem.size(), article.get(i).text());
            out.println();
        }
    } finally {
        out.close();
    }
}

From source file:de.geeksfactory.opacclient.apis.Open.java

/**
 * Returns the first cover URL listed in the image's {@code sources}
 * attribute that answers a HEAD request with HTTP 200.
 * <p>
 * The attribute is a '|'-separated list; the URL is the value two fields
 * after each {@code SetSimpleCover} marker.
 *
 * @param img the image element carrying the {@code sources} attribute
 * @return a reachable cover URL, or {@code null} when there is none
 */
private String getCoverUrl(Element img) {
    String[] parts = img.attr("sources").split("\\|");
    // Example: SetSimpleCover|a|https://vlb.de/GetBlob.aspx?strIsbn=9783868511291&amp;
    // size=S|a|http://www.buchhandel.de/default.aspx?strframe=titelsuche&amp;
    // caller=vlbPublic&amp;func=DirectIsbnSearch&amp;isbn=9783868511291&amp;
    // nSiteId=11|c|SetNoCover|a|/DesktopModules/OCLC.OPEN.PL.DNN
    // .BaseLibrary/StyleSheets/Images/Fallbacks/emptyURL.gif?4.2.0.0|a|
    for (int i = 0; i + 2 < parts.length; i++) {
        if (!parts[i].equals("SetSimpleCover")) {
            continue;
        }
        String url = parts[i + 2].replace("&amp;", "&");
        HttpURLConnection conn = null;
        try {
            conn = (HttpURLConnection) new URL(url).openConnection();
            conn.setRequestMethod("HEAD");
            if (conn.getResponseCode() == HttpURLConnection.HTTP_OK) {
                return url;
            }
        } catch (IOException e) {
            // unreachable or malformed candidate: report it and try the next one
            e.printStackTrace();
        } finally {
            // release the connection on every path, including the early return
            if (conn != null) {
                conn.disconnect();
            }
        }
    }
    return null;
}