Example usage for org.jsoup.nodes Element attr

List of usage examples for org.jsoup.nodes Element attr

Introduction

This page collects usage examples for org.jsoup.nodes.Element.attr.

Prototype

public String attr(String attributeKey) 

Document

Get an attribute's value by its key.
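
A minimal, self-contained sketch of the call (the HTML fragment, base URI, and class name AttrExample are illustrative, not taken from the examples below):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class AttrExample {
    public static void main(String[] args) {
        // Parse a small HTML fragment with a base URI so relative URLs can be resolved.
        Document doc = Jsoup.parse("<a href='/docs/index.html' title='Docs'>Documentation</a>",
                "https://example.com/");
        Element link = doc.select("a").first();
        System.out.println(link.attr("title"));     // Docs
        System.out.println(link.attr("href"));      // /docs/index.html (as written in the source)
        System.out.println(link.attr("abs:href"));  // https://example.com/docs/index.html
        System.out.println(link.attr("missing"));   // empty string - attr never returns null
    }
}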

Usage

From source file:org.deeplearning4j.patent.DownloadPreprocessPatents.java

/**
 * Get a list of all URLs in a page for zip files
 */
public static List<String> getZipUrlsFromPage(String url) {
    List<String> out = new ArrayList<>();
    try {
        Document doc = Jsoup.connect(url).get();
        Elements links = doc.select("a[href]");

        for (Element e : links) {
            String s = e.attr("href");
            if (s.endsWith(".zip")) {
                if (s.startsWith("http")) {
                    //Absolute link
                    out.add(s);
                } else {
                    //Relative link
                    out.add(e.baseUri() + s);
                }
            }
        }

    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return out;
}
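
Note that jsoup can resolve relative URLs itself: requesting attr("abs:href") (or calling absUrl("href")) returns the link resolved against the document's base URI, which avoids the manual e.baseUri() + s concatenation above. A sketch of that variant of the loop, assuming the same links collection:

for (Element e : links) {
    String abs = e.attr("abs:href"); // empty string if the URL cannot be resolved
    if (abs.endsWith(".zip")) {
        out.add(abs);
    }
}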

From source file:org.eclipse.mylyn.docs.examples.GenerateEPUB.java

public static void main(String[] args) {

    // clean up from last run
    try {
        Files.delete(Paths.get("loremipsum.html"));
        Files.delete(Paths.get("loremipsum.epub"));
    } catch (IOException e1) {
        /* no worries */
    }

    try ( // read MarkDown
            FileReader fr = new FileReader("loremipsum.md");
            // and output HTML
            Writer fw = Files.newBufferedWriter(Paths.get("loremipsum.html"), StandardOpenOption.CREATE)) {

        // generate HTML from markdown
        MarkupParser parser = new MarkupParser();
        parser.setMarkupLanguage(new MarkdownLanguage());
        HtmlDocumentBuilder builder = new HtmlDocumentBuilder(fw);
        parser.setBuilder(builder);
        parser.parse(fr, true);

        // convert any inline equations in the HTML into MathML
        String html = new String(Files.readAllBytes(Paths.get("loremipsum.html")));
        StringBuffer sb = new StringBuffer();
        Matcher m = EQUATION.matcher(html);

        // for each equation
        while (m.find()) {
            // replace the LaTeX code with MathML
            m.appendReplacement(sb, laTeX2MathMl(m.group()));
        }
        m.appendTail(sb);

        // EPUB 2.0 can only handle embedded SVG so we find all referenced
        // SVG files and replace the reference with the actual SVG code
        Document parse = Jsoup.parse(sb.toString(), "UTF-8", Parser.xmlParser());

        Elements select = parse.select("img");
        for (Element element : select) {
            String attr = element.attr("src");
            if (attr.endsWith(".svg")) {
                byte[] svg = Files.readAllBytes(Paths.get(attr));
                element.html(new String(svg));
            }
        }

        // write back the modified HTML-file
        Files.write(Paths.get("loremipsum.html"), sb.toString().getBytes(), StandardOpenOption.WRITE);

        // instantiate a new EPUB version 2 publication
        Publication pub = Publication.getVersion2Instance();

        // include referenced resources (default is false)
        pub.setIncludeReferencedResources(true);

        // title and subject is required
        pub.addTitle("EclipseCon Demo");
        pub.addSubject("EclipseCon Demo");

        // generate table of contents (default is true)
        pub.setGenerateToc(true);
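        // Note: "epub" is an EPUB instance declared as a field in the original class; it is not shown in this snippet.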
        epub.add(pub);

        // add one chapter
        pub.addItem(Paths.get("loremipsum.html").toFile());

        // create the EPUB
        epub.pack(new File("loremipsum.epub"));

    } catch (Exception e) {
        e.printStackTrace();
    }

}

From source file:org.ednovo.gooru.application.util.ResourceImageUtil.java

public Map<String, Object> getResourceMetaData(String url, String resourceTitle, boolean fetchThumbnail) {
    Map<String, Object> metaData = new HashMap<String, Object>();
    ResourceMetadataCo resourceFeeds = null;
    if (url != null && url.contains(VIMEO_VIDEO)) {
        resourceFeeds = getMetaDataFromVimeoVideo(url);
    } else if (url != null && url.contains(YOUTUBE_VIDEO)) {
        resourceFeeds = getYoutubeResourceFeeds(url, null);
    }
    String description = "";
    String title = "";
    String videoDuration = "";
    Set<String> images = new LinkedHashSet<String>();
    if (resourceFeeds == null || resourceFeeds.getUrlStatus() == 404) {
        Document doc = null;
        try {
            if (url != null && (url.contains("http://") || url.contains("https://"))) {
                doc = Jsoup.connect(url).timeout(6000).get();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        if (doc != null) {
            title = doc.title();
            Elements meta = doc.getElementsByTag(META);
            if (meta != null) {
                for (Element element : meta) {
                    if (element.attr(NAME) != null && element.attr(NAME).equalsIgnoreCase(DESCRIPTION)) {
                        description = element.attr(CONTENT);
                        break;
                    }
                }
            }
            metaData.put(DESCRIPTION, description);
            if (fetchThumbnail) {
                Elements media = doc.select("[src]");
                if (media != null) {
                    for (Element src : media) {
                        if (src.tagName().equals(IMG)) {
                            images.add(src.attr("abs:src"));
                        }
                        if (images.size() >= SUGGEST_IMAGE_MAX_SIZE) {
                            break;
                        }
                    }
                }
            }
        }
    } else {
        title = resourceFeeds.getTitle();
        description = resourceFeeds.getDescription();
        videoDuration = resourceFeeds.getDuration().toString();
    }
    if (fetchThumbnail) {
        if (resourceFeeds != null && resourceFeeds.getThumbnail() != null) {
            images.add(resourceFeeds.getThumbnail());
        }
        metaData.put(IMAGES, images);
    }
    metaData.put(TITLE, title);
    metaData.put(DESCRIPTION, description);
    metaData.put(DURATION, videoDuration);
    return metaData;
}

From source file:org.jboss.tools.tycho.sitegenerator.GenerateCompositeSite.java

private void collectChildrenFromRemote(String collectChildrenFromRemoteURL2,
        String collectChildrenFromRemoteRegex2, int collectChildrenFromRemoteLimit2,
        List<String> childSitesList2) throws MojoFailureException {
    Document doc = null;
    try {
        // getLog().debug("Load children from: " +
        // collectChildrenFromRemoteURL2);
        doc = Jsoup.connect(collectChildrenFromRemoteURL2).get();
        // getLog().debug("Regex to match: " +
        // collectChildrenFromRemoteRegex2);
        Elements links = doc.getElementsByTag("a");

        // sort largest (newest) first
        Collections.sort(links, new Comparator<Element>() {
            @Override
            public int compare(Element e1, Element e2) {
                return e2.attr("href").compareTo(e1.attr("href"));
            }
        });

        int linksAdded = 0;
        for (Element link : links) {
            String linkHref = link.attr("href");
            if (collectChildrenFromRemoteRegex2 == null || (linkHref.matches(collectChildrenFromRemoteRegex2)
                    && (linksAdded < collectChildrenFromRemoteLimit2 || collectChildrenFromRemoteLimit2 < 0))) {
                getLog().debug("Adding: " + linkHref);
                childSitesList2.add(collectChildrenFromRemoteURL2 + linkHref);
                linksAdded++;
            }
        }
    } catch (IOException ex) {
        throw new MojoFailureException(ex.getMessage(), ex);
    }
    doc = null;
}

From source file:org.jorge.lolin1dp.io.net.Internet.java

public static List<ArticleWrapper> getNews(String baseUrl, String url) {
    Elements newsHeadLines, newsSubTitles, descVerification;
    try {
        System.out.println("Performing get on " + url);
        Document doc = Jsoup.connect(url).timeout(URL_TIMEOUT_MILLIS).get();
        System.out.println("Get performed on " + url);
        newsHeadLines = doc.select("div.panelizer-view-mode").select("div.node").select("div.node-teaser")
                .select("div.node-article").select("div.field").select("div.field-name-field-article-media")
                .select("div.field-type-file").select("div.field-label-hidden");
        newsSubTitles = doc.select("div.field").select("div.field-name-field-body-medium")
                .select("div.field-type-text-long").select("div.field-label-hidden");
        descVerification = doc.select("div.default-2-3");
    } catch (IOException e) {
        e.printStackTrace(System.err);
        return new ArrayList<ArticleWrapper>();
    }

    final List<ArticleWrapper> ret = new ArrayList<>();
    Boolean addThis = Boolean.TRUE;
    int i = 0;
    for (Element elem : newsHeadLines) {

        Element linkElem = elem.getElementsByTag("a").first(), imageElem = elem.getElementsByTag("img").first();

        if (addThis) {
            final String title = linkElem.attr("title");
            final String link = baseUrl + linkElem.attr("href");
            final String imageLink = baseUrl + imageElem.attr("src");
            final String subtitle;
            if (descVerification.get(i).select("div").size() < 7) {
                subtitle = "";
            } else {
                Element removed = newsSubTitles.remove(0);
                subtitle = removed.text();
            }
            ret.add(new ArticleWrapper(title, link, imageLink, subtitle));
            addThis = Boolean.FALSE;
            i++;
        } else {
            addThis = Boolean.TRUE;
        }
    }

    return ret;
}

From source file:org.jtotus.network.NordnetConnect.java

private boolean connectAndAuth(String user, String password) {
    ArrayList<String> inputList = new ArrayList();

    connector = new NordnetConnector();

    String encryptJS = fetchEncryptionScript("./lib/encrypt.js");
    if (encryptJS == null) {
        encryptJS = connector.getPage(_ECRYPT_JS_);
        if (encryptJS == null) {
            System.err.printf("Failed to get encrypt javascript\n");
            return false;
        }
    }

    String loginPage = connector.getPage(_LOGIN_URL_);
    if (loginPage == null) {
        System.err.printf("Failed to get login page\n");
        return false;
    }

    Document doc = Jsoup.parse(loginPage);
    Elements elements = doc.select("input");

    Iterator<Element> iter = elements.iterator();
    while (iter.hasNext()) {
        Element elem = iter.next();
        inputList.add(elem.attr("name"));
    }

    if (inputList.size() < 2) {
        System.err.printf("Failure: \n %s \n", loginPage);
        return false;
    }

    elements = doc.select("script");
    if (elements.size() < 4) {
        System.err.printf("Incorrect size of script elements\n");
        return false;
    }
    Element elem = elements.get(4);

    String[] data = elem.data().split("'");
    if (data.length < 8) {
        System.err.printf("Incorrect size of splitted elements for pass and login tokens\n");
        return false;
    }
    log.info("Got element: data:" + data[7] + " html:" + data[5]);

    String encryptedPassword = fetchEncryptedPassword(encryptJS, password, data[5].trim() /*pubKey*/,
            data[7].trim() /*sessionId*/);

    loginPage = connector.authenticate(_LOGININPUT_URL_, inputList.get(3), user, inputList.get(5),
            encryptedPassword);

    System.err.printf("login: %s = %s pass: %s = %s\n", inputList.get(3), user, inputList.get(5),
            encryptedPassword);

    if (loginPage == null) {
        System.err.printf("Failed to get authenticate\n");
        return false;
    }

    if (!authenticated()) {
        return false;
    }

    return true;
}

From source file:org.lockss.extractor.JsoupTagExtractor.java

/**
 * extract the <meta...></meta> selectors
 * @param doc the parsed jsoup document
 * @param articleMeta the ArticleMetadata to store the name/content pairs
 */
void extractMetaTags(Document doc, ArticleMetadata articleMeta) {
    Elements metas = doc.select(DEFAULT_META_TAG);
    String name;
    String content;
    for (Element meta : metas) {
        name = meta.attr("name");
        content = meta.attr("content");
        if (!StringUtil.isNullString(content) && !StringUtil.isNullString(name)) {
            content = processHtml(name, content);
            if (theLog.isDebug3())
                theLog.debug3("Add: " + name + " = " + content);
            articleMeta.putRaw(name, content);
        }
    }
}

From source file:org.mar9000.space2latex.WikiPage.java

public static void downloadWikiPageImages(WikiPage page) throws MalformedURLException {
    String pageUrl = page.json.getJSONObject(JSON_LINKS_ATTR).getString(JSON_SELF_ATTR);
    Document document = Jsoup.parseBodyFragment(page.storage);
    document.outputSettings().prettyPrint(false);
    Elements images = document.select("ac|image");
    if (images.size() > 0)
        LOGGER.info("  Download images:");
    for (Element element : images) {
        String downloadURL = null;
        String imageKey = null;
        // Attachment?
        Elements refs = element.select("ri|attachment");
        WikiImage image = new WikiImage();
        image.pageId = page.id;
        image.acImage = element.outerHtml();
        //
        if (refs.size() > 0) { // Attachment.
            Element riAttachment = refs.get(0);
            imageKey = riAttachment.attr("ri:filename");
            Elements riPages = riAttachment.select("ri|page");
            // Thumbnails are not found with "child/attachment" URL schema.
            boolean isThumbnail = "true".equals(element.attr("ac:thumbnail"));
            String queryURL = null;
            if (!isThumbnail) {
                queryURL = pageUrl + "/child/attachment?filename=" + URLEncoder.encode(imageKey);
            } else {
                // For thumbnail we construct directly the downloadURL without queryURL.
                /* Some pages have thumbnail images for better online reading.
                 * Here we always download the attached file to embed readable images into the PDF.
                downloadURL = pageUrl.substring(0, pageUrl.indexOf("/rest/api"))
                      + "/download/thumbnails/" + page.id + "/" + URLEncoder.encode(imageKey);
                */
                downloadURL = pageUrl.substring(0, pageUrl.indexOf("/rest/api")) + "/download/attachments/"
                        + page.id + "/" + URLEncoder.encode(imageKey);
            }
            if (riPages.size() > 0) {
                // The attachment is related with another page.
                Element riPage = riPages.get(0);
                String space = riPage.attr("ri:space-key");
                String contentTitle = riPage.attr("ri:content-title").replaceAll(" ", "%20");
                String self = page.json.getJSONObject(JSON_LINKS_ATTR).getString(JSON_SELF_ATTR);
                String newQueryURL = self.substring(0, self.lastIndexOf('/')) + "?title=" + contentTitle
                        + "&spaceKey=" + space;
                JSONObject jsonNewQuery = ConfluenceRESTUtils.getURLResponse(newQueryURL);
                if (jsonNewQuery.getInt(JSON_SIZE_ATTR) == 0)
                    throw new RuntimeException(
                            "Page \"" + contentTitle + "\" in space " + space + " not found.");
                JSONObject jsonNewPage = (JSONObject) jsonNewQuery.getJSONArray(JSON_RESULTS_ATTR).get(0);
                image.pageId = jsonNewPage.getString(JSON_ID_ATTR);
                // Overwrite queryURL.
                String newPageUrl = jsonNewPage.getJSONObject(JSON_LINKS_ATTR).getString(JSON_SELF_ATTR);
                queryURL = newPageUrl + "/child/attachment?filename=" + URLEncoder.encode(imageKey);
            }
            if (!isThumbnail)
                downloadURL = getAttachmentDownloadURL(queryURL);
        } else {
            refs = element.select("ri|url");
            if (refs.size() > 0) { // URL.
                downloadURL = refs.get(0).attr("ri:value");
                URL tempURL = new URL(downloadURL);
                String urlPath = tempURL.getPath();
                imageKey = urlPath.substring(urlPath.lastIndexOf('/') + 1);
            } else {
                throw new RuntimeException("Image format unknown: " + element.toString());
            }
        }
        // Download the image data.
        image.filename = imageKey.replace(' ', '_'); // Spaces are not handled by LaTeX.
        if (downloadURL != null) {
            LOGGER.info("    about to download image {}/{}", new Object[] { image.pageId, image.filename });
            image.data = IOUtils.getImageFromURL(downloadURL);
        } else {
            LOGGER.info("    NULL download URL for page/image: {}/{}",
                    new Object[] { image.pageId, image.filename });
        }
        page.images.put(imageKey, image);
    }
}

From source file:org.mar9000.space2latex.WikiPage.java

public static WikiPage loadForFormat(File file) throws IOException {
    String fileContent = IOUtils.readFileAsString(file);
    Document doc = Jsoup.parseBodyFragment(fileContent);
    // Maintain input string.
    doc.outputSettings().prettyPrint(false);
    Element body = doc.body();
    Element pageElement = body.select("page").first();
    String title = pageElement.attr("title");
    String id = pageElement.attr("id");
    Element pageContent = pageElement.select("content").first();
    WikiPage page = new WikiPage(null, title, id, pageContent.html());
    page.pageContent = pageContent;
    // Images.
    Elements images = body.select("wikiimages").first().select("wikiimage");
    for (Element imageElement : images) {
        WikiImage image = new WikiImage();
        String acKey = imageElement.select("ac|image").first().outerHtml();
        image.filename = imageElement.attr("pageid") + "/" + imageElement.attr("filename");
        page.images.put(acKey, image);
    }
    return page;
}

From source file:org.metaservice.core.deb.util.GitCache.java

public void runDiscovery() {
    HashSet<String> parsed = new HashSet<>();
    LinkedList<String> toParse = new LinkedList<>();
    HashSet<String> dists = new HashSet<>();
    toParse.add(startString);
    while (toParse.size() > 0) {
        String uri = toParse.pop();
        try {
            String s = clientMetaservice.get(uri);
            if (s == null) {
                LOGGER.error("Couldn't load " + uri + " skipping.");
                continue;
            }
            Document document = Jsoup.parse(s, uri);
            parsed.add(uri);
            for (Element e : document.select("a:contains(next change)")) {
                String href = e.attr("abs:href");
                if (!parsed.contains(href) && !toParse.contains(href)) {
                    LOGGER.info("adding (next) ", href);
                    toParse.push(href);
                }
            }

            for (Element e : document.select("a[href$=/]")) {
                String absHref = e.attr("abs:href");
                String href = e.attr("href");
                if (!dists.contains(href) && !href.startsWith("/")
                        && !href.startsWith(".") /* &&!toParse.contains (href) */) {
                    if (uri.endsWith("dists/") /*&& !href.contains("sid") && !href.contains("experimental")*/) {
                        dists.add(href);
                        LOGGER.info(href);
                        for (String license : licenses) {
                            String url = absHref + license + "/";
                            LOGGER.info("adding (lic) {}", url);
                            toParse.add(url);
                        }
                    }
                    for (String license : licenses) {
                        if (uri.endsWith(license + "/")) {
                            if (href.startsWith("binary-")) {
                                for (String arch : archs) {
                                    if (href.contains(arch)) {
                                        LOGGER.info("adding (archdir) {}", absHref);
                                        toParse.add(absHref);
                                    }
                                }
                            }
                            if (href.startsWith("source")) {
                                LOGGER.info("adding (archdir) {}", absHref);
                                toParse.add(absHref);
                            }
                        }
                    }

                }

            }

            for (Element e : document.select("a[abs:href$=Packages.gz] , a[abs:href$=Sources.gz]")) {
                String href = e.attr("abs:href");
                //only if this seems to be a non duplicate
                if (document.select("a:contains(prev change)").size() == 0
                        || document.select("a:contains(prev change)").get(0).attr("abs:href").equals(document
                                .select("a:contains(prev):not(:contains(change))").get(0).attr("abs:href"))) {
                    LOGGER.info("RESULT processing ... {} {} ", i++, href);
                    processFileToParse(href);
                }
            }
        } catch (RuntimeException exception) {
            LOGGER.error("RUNTIME EXCEPTION ", exception);
            throw exception;
        }
    }
}