List of usage examples for org.jsoup.nodes Element attr
public String attr(String attributeKey)
From source file:org.deeplearning4j.patent.DownloadPreprocessPatents.java
/** * Get a list of all URLs in a page for zip files *//*from www. j a v a 2s . c o m*/ public static List<String> getZipUrlsFromPage(String url) { List<String> out = new ArrayList<>(); try { Document doc = Jsoup.connect(url).get(); Elements links = doc.select("a[href]"); for (Element e : links) { String s = e.attr("href"); if (s.endsWith(".zip")) { if (s.startsWith("http")) { //Absolute link out.add(s); } else { //Relative link out.add(e.baseUri() + s); } } } } catch (IOException e) { throw new RuntimeException(e); } return out; }
From source file:org.eclipse.mylyn.docs.examples.GenerateEPUB.java
public static void main(String[] args) { // clean up from last run try {//from w w w . j a va 2 s . c om Files.delete(Paths.get("loremipsum.html")); Files.delete(Paths.get("loremipsum.epub")); } catch (IOException e1) { /* no worries */ } try ( // read MarkDown FileReader fr = new FileReader("loremipsum.md"); // and output HTML Writer fw = Files.newBufferedWriter(Paths.get("loremipsum.html"), StandardOpenOption.CREATE)) { // generate HTML from markdown MarkupParser parser = new MarkupParser(); parser.setMarkupLanguage(new MarkdownLanguage()); HtmlDocumentBuilder builder = new HtmlDocumentBuilder(fw); parser.setBuilder(builder); parser.parse(fr, true); // convert any inline equations in the HTML into MathML String html = new String(Files.readAllBytes(Paths.get("loremipsum.html"))); StringBuffer sb = new StringBuffer(); Matcher m = EQUATION.matcher(html); // for each equation while (m.find()) { // replace the LaTeX code with MathML m.appendReplacement(sb, laTeX2MathMl(m.group())); } m.appendTail(sb); // EPUB 2.0 can only handle embedded SVG so we find all referenced // SVG files and replace the reference with the actual SVG code Document parse = Jsoup.parse(sb.toString(), "UTF-8", Parser.xmlParser()); Elements select = parse.select("img"); for (Element element : select) { String attr = element.attr("src"); if (attr.endsWith(".svg")) { byte[] svg = Files.readAllBytes(Paths.get(attr)); element.html(new String(svg)); } } // write back the modified HTML-file Files.write(Paths.get("loremipsum.html"), sb.toString().getBytes(), StandardOpenOption.WRITE); // instantiate a new EPUB version 2 publication Publication pub = Publication.getVersion2Instance(); // include referenced resources (default is false) pub.setIncludeReferencedResources(true); // title and subject is required pub.addTitle("EclipseCon Demo"); pub.addSubject("EclipseCon Demo"); // generate table of contents (default is true) pub.setGenerateToc(true); epub.add(pub); // add one chapter 
pub.addItem(Paths.get("loremipsum.html").toFile()); // create the EPUB epub.pack(new File("loremipsum.epub")); } catch (Exception e) { e.printStackTrace(); } }
From source file:org.ednovo.gooru.application.util.ResourceImageUtil.java
/**
 * Builds a metadata map for a resource URL: title, description, video
 * duration, and (optionally) a set of candidate thumbnail image URLs.
 * Vimeo and YouTube URLs are resolved through their feed APIs first;
 * everything else (or a 404 from the feed) falls back to scraping the
 * page itself with jsoup.
 *
 * NOTE(review): the resourceTitle parameter is never read in this body —
 * confirm whether it is still needed by callers.
 *
 * @param url            resource URL; only http/https URLs are scraped
 * @param resourceTitle  unused here (see note above)
 * @param fetchThumbnail when true, also collect image candidates under IMAGES
 * @return map keyed by TITLE, DESCRIPTION, DURATION and optionally IMAGES
 */
public Map<String, Object> getResourceMetaData(String url, String resourceTitle, boolean fetchThumbnail) {
    Map<String, Object> metaData = new HashMap<String, Object>();
    ResourceMetadataCo resourceFeeds = null;
    // Try the provider feed APIs first for known video hosts.
    if (url != null && url.contains(VIMEO_VIDEO)) {
        resourceFeeds = getMetaDataFromVimeoVideo(url);
    } else if (url != null && url.contains(YOUTUBE_VIDEO)) {
        resourceFeeds = getYoutubeResourceFeeds(url, null);
    }
    String description = "";
    String title = "";
    String videoDuration = "";
    // LinkedHashSet: dedupe image URLs while preserving discovery order.
    Set<String> images = new LinkedHashSet<String>();
    // Fall back to scraping when there is no feed or the feed reported 404.
    if (resourceFeeds == null || resourceFeeds.getUrlStatus() == 404) {
        Document doc = null;
        try {
            if (url != null && (url.contains("http://") || url.contains("https://"))) {
                // 6-second timeout so a slow remote host cannot stall the request.
                doc = Jsoup.connect(url).timeout(6000).get();
            }
        } catch (Exception e) {
            // Best-effort scrape: on any failure we proceed with empty metadata.
            e.printStackTrace();
        }
        if (doc != null) {
            title = doc.title();
            // Pull the description from the first <meta name="description"> tag.
            Elements meta = doc.getElementsByTag(META);
            if (meta != null) {
                for (Element element : meta) {
                    if (element.attr(NAME) != null && element.attr(NAME).equalsIgnoreCase(DESCRIPTION)) {
                        description = element.attr(CONTENT);
                        break;
                    }
                }
            }
            metaData.put(DESCRIPTION, description);
            if (fetchThumbnail) {
                // Collect absolute URLs of <img> elements as thumbnail candidates,
                // capped at SUGGEST_IMAGE_MAX_SIZE.
                Elements media = doc.select("[src]");
                if (media != null) {
                    for (Element src : media) {
                        if (src.tagName().equals(IMG)) {
                            images.add(src.attr("abs:src"));
                        }
                        if (images.size() >= SUGGEST_IMAGE_MAX_SIZE) {
                            break;
                        }
                    }
                }
            }
        }
    } else {
        // Feed lookup succeeded: take metadata directly from the feed.
        title = resourceFeeds.getTitle();
        description = resourceFeeds.getDescription();
        videoDuration = resourceFeeds.getDuration().toString();
    }
    if (fetchThumbnail) {
        // Prefer the feed-provided thumbnail when available, then any scraped ones.
        if (resourceFeeds != null && resourceFeeds.getThumbnail() != null) {
            images.add(resourceFeeds.getThumbnail());
        }
        metaData.put(IMAGES, images);
    }
    metaData.put(TITLE, title);
    metaData.put(DESCRIPTION, description);
    metaData.put(DURATION, videoDuration);
    return metaData;
}
From source file:org.jboss.tools.tycho.sitegenerator.GenerateCompositeSite.java
private void collectChildrenFromRemote(String collectChildrenFromRemoteURL2, String collectChildrenFromRemoteRegex2, int collectChildrenFromRemoteLimit2, List<String> childSitesList2) throws MojoFailureException { Document doc = null;/*from w ww . j a v a 2 s. c o m*/ try { // getLog().debug("Load children from: " + // collectChildrenFromRemoteURL2); doc = Jsoup.connect(collectChildrenFromRemoteURL2).get(); // getLog().debug("Regex to match: " + // collectChildrenFromRemoteRegex2); Elements links = doc.getElementsByTag("a"); // sort larges (newest) first Collections.sort(links, new Comparator<Element>() { @Override public int compare(Element e1, Element e2) { return e2.attr("href").compareTo(e1.attr("href")); } }); int linksAdded = 0; for (Element link : links) { String linkHref = link.attr("href"); if (collectChildrenFromRemoteRegex2 == null || (linkHref.matches(collectChildrenFromRemoteRegex2) && (linksAdded < collectChildrenFromRemoteLimit2 || collectChildrenFromRemoteLimit2 < 0))) { getLog().debug("Adding: " + linkHref); childSitesList2.add(collectChildrenFromRemoteURL2 + linkHref); linksAdded++; } } } catch (IOException ex) { throw new MojoFailureException(ex.getMessage(), ex); } doc = null; }
From source file:org.jorge.lolin1dp.io.net.Internet.java
public static List<ArticleWrapper> getNews(String baseUrl, String url) { Elements newsHeadLines, newsSubTitles, descVerification; try {//from w w w . j a v a 2 s . co m System.out.println("Performing get on " + url); Document doc = Jsoup.connect(url).timeout(URL_TIMEOUT_MILLIS).get(); System.out.println("Get performed on " + url); newsHeadLines = doc.select("div.panelizer-view-mode").select("div.node").select("div.node-teaser") .select("div.node-article").select("div.field").select("div.field-name-field-article-media") .select("div.field-type-file").select("div.field-label-hidden"); newsSubTitles = doc.select("div.field").select("div.field-name-field-body-medium") .select("div.field-type-text-long").select("div.field-label-hidden"); descVerification = doc.select("div.default-2-3"); } catch (IOException e) { e.printStackTrace(System.err); return new ArrayList<ArticleWrapper>(); } final List<ArticleWrapper> ret = new ArrayList<>(); Boolean addThis = Boolean.TRUE; int i = 0; for (Element elem : newsHeadLines) { Element linkElem = elem.getElementsByTag("a").first(), imageElem = elem.getElementsByTag("img").first(); if (addThis) { final String title = linkElem.attr("title"); final String link = baseUrl + linkElem.attr("href"); final String imageLink = baseUrl + imageElem.attr("src"); final String subtitle; if (descVerification.get(i).select("div").size() < 7) { subtitle = ""; } else { Element removed = newsSubTitles.remove(0); subtitle = removed.text(); } ret.add(new ArticleWrapper(title, link, imageLink, subtitle)); addThis = Boolean.FALSE; i++; } else { addThis = Boolean.TRUE; } } return ret; }
From source file:org.jtotus.network.NordnetConnect.java
/**
 * Fetches the login page, harvests the form input names and the public
 * key / session id embedded in the page's 5th script element, encrypts
 * the password with the fetched JavaScript, and authenticates.
 *
 * @param user     login name
 * @param password plaintext password, encrypted client-side before sending
 * @return true when authentication succeeded
 */
private boolean connectAndAuth(String user, String password) {
    // Was a raw-type ArrayList; parameterized to avoid unchecked warnings.
    ArrayList<String> inputList = new ArrayList<>();
    connector = new NordnetConnector();
    String encryptJS = fetchEncryptionScript("./lib/encrypt.js");
    if (encryptJS == null) {
        // Local copy missing: fall back to downloading the script.
        encryptJS = connector.getPage(_ECRYPT_JS_);
        if (encryptJS == null) {
            System.err.printf("Failed to get encrypt javascript\n");
            return false;
        }
    }
    String loginPage = connector.getPage(_LOGIN_URL_);
    if (loginPage == null) {
        System.err.printf("Failed to get login page\n");
        return false;
    }
    Document doc = Jsoup.parse(loginPage);
    // Collect the name attribute of every <input>; the user/password field
    // names are referenced positionally below (indices 3 and 5).
    for (Element input : doc.select("input")) {
        inputList.add(input.attr("name"));
    }
    // NOTE(review): the guard only requires 2 inputs but indices 3 and 5 are
    // read later — a layout change would still throw; consider tightening.
    if (inputList.size() < 2) {
        System.err.printf("Failure: \n %s \n", loginPage);
        return false;
    }
    Elements elements = doc.select("script");
    // BUG FIX: elements.get(4) needs at least 5 script elements; the old
    // guard (size() < 4) let size == 4 through and then crashed on get(4).
    if (elements.size() <= 4) {
        System.err.printf("Incorrect size of script elements\n");
        return false;
    }
    Element elem = elements.get(4);
    // The script body is split on single quotes; tokens 5 and 7 hold the
    // public key and session id respectively.
    String[] data = elem.data().split("'");
    if (data.length < 8) {
        System.err.printf("Incorrect size of splitted elements for pass and login tokens\n");
        return false;
    }
    log.info("Got element: data:" + data[7] + " html:" + data[5]);
    String encryptedPassword = fetchEncryptedPassword(encryptJS, password,
            data[5].trim() /*pubKey*/, data[7].trim() /*sessionId*/);
    loginPage = connector.authenticate(_LOGININPUT_URL_, inputList.get(3), user, inputList.get(5),
            encryptedPassword);
    System.err.printf("login: %s = %s pass: %s = %s\n", inputList.get(3), user, inputList.get(5),
            encryptedPassword);
    if (loginPage == null) {
        System.err.printf("Failed to get authenticate\n");
        return false;
    }
    if (!authenticated()) {
        return false;
    }
    return true;
}
From source file:org.lockss.extractor.JsoupTagExtractor.java
/**
 * Extracts the name/content pair from every &lt;meta&gt; tag in the parsed
 * document and stores each as raw article metadata.
 *
 * @param doc the parsed jsoup document
 * @param articleMeta the ArticleMetadata to store the name/content pairs
 */
void extractMetaTags(Document doc, ArticleMetadata articleMeta) {
    for (Element tag : doc.select(DEFAULT_META_TAG)) {
        String key = tag.attr("name");
        String value = tag.attr("content");
        // Skip tags missing either half of the name/content pair.
        if (StringUtil.isNullString(key) || StringUtil.isNullString(value)) {
            continue;
        }
        value = processHtml(key, value);
        if (theLog.isDebug3()) {
            theLog.debug3("Add: " + key + " = " + value);
        }
        articleMeta.putRaw(key, value);
    }
}
From source file:org.mar9000.space2latex.WikiPage.java
/**
 * Downloads every image referenced by an <ac:image> element in the page's
 * storage-format body. Handles three cases: plain attachments on this page,
 * attachments that live on another wiki page (resolved via a REST query by
 * title/space), and direct external URLs. Downloaded images are stored in
 * page.images keyed by the attachment filename or URL basename.
 *
 * @param page the wiki page whose storage body is scanned; mutated in place
 * @throws MalformedURLException if an <ri:url> value is not a valid URL
 */
public static void downloadWikiPageImages(WikiPage page) throws MalformedURLException {
    // REST "self" URL of this page; attachment query URLs are built from it.
    String pageUrl = page.json.getJSONObject(JSON_LINKS_ATTR).getString(JSON_SELF_ATTR);
    Document document = Jsoup.parseBodyFragment(page.storage);
    // Keep the stored markup verbatim so outerHtml() keys match the source.
    document.outputSettings().prettyPrint(false);
    Elements images = document.select("ac|image");
    if (images.size() > 0)
        LOGGER.info(" Download images:");
    for (Element element : images) {
        String downloadURL = null;
        String imageKey = null;
        // Attachment?
        Elements refs = element.select("ri|attachment");
        WikiImage image = new WikiImage();
        image.pageId = page.id;
        // The raw <ac:image> markup is kept so the image can be matched back later.
        image.acImage = element.outerHtml();
        if (refs.size() > 0) {
            // Case 1/2: the image is a wiki attachment.
            Element riAttachment = refs.get(0);
            imageKey = riAttachment.attr("ri:filename");
            Elements riPages = riAttachment.select("ri|page");
            // Thumbnails are not found with "child/attachment" URL schema.
            boolean isThumbnail = "true".equals(element.attr("ac:thumbnail"));
            String queryURL = null;
            if (!isThumbnail) {
                queryURL = pageUrl + "/child/attachment?filename=" + URLEncoder.encode(imageKey);
            } else {
                // For thumbnails we construct the downloadURL directly, without a queryURL.
                /* Some pages have thumbnail images for better online reading.
                 * Here we always download the attached file to embed readable
                 * images into the pdf (thumbnail variant kept for reference):
                 * downloadURL = pageUrl.substring(0, pageUrl.indexOf("/rest/api"))
                 *         + "/download/thumbnails/" + page.id + "/" + URLEncoder.encode(imageKey);
                 */
                downloadURL = pageUrl.substring(0, pageUrl.indexOf("/rest/api")) + "/download/attachments/"
                        + page.id + "/" + URLEncoder.encode(imageKey);
            }
            if (riPages.size() > 0) {
                // Case 2: the attachment belongs to ANOTHER page — resolve that
                // page by title + space key through the REST API, then rebuild
                // the attachment query URL against it.
                Element riPage = riPages.get(0);
                String space = riPage.attr("ri:space-key");
                String contentTitle = riPage.attr("ri:content-title").replaceAll(" ", "%20");
                String self = page.json.getJSONObject(JSON_LINKS_ATTR).getString(JSON_SELF_ATTR);
                String newQueryURL = self.substring(0, self.lastIndexOf('/')) + "?title=" + contentTitle
                        + "&spaceKey=" + space;
                JSONObject jsonNewQuery = ConfluenceRESTUtils.getURLResponse(newQueryURL);
                if (jsonNewQuery.getInt(JSON_SIZE_ATTR) == 0)
                    throw new RuntimeException(
                            "Page \"" + contentTitle + "\" in space " + space + " not found.");
                JSONObject jsonNewPage = (JSONObject) jsonNewQuery.getJSONArray(JSON_RESULTS_ATTR).get(0);
                // The image is recorded under the owning page's id, not this page's.
                image.pageId = jsonNewPage.getString(JSON_ID_ATTR);
                // Overwrite queryURL.
                String newPageUrl = jsonNewPage.getJSONObject(JSON_LINKS_ATTR).getString(JSON_SELF_ATTR);
                queryURL = newPageUrl + "/child/attachment?filename=" + URLEncoder.encode(imageKey);
            }
            // Non-thumbnails resolve their download URL via the attachment query.
            if (!isThumbnail)
                downloadURL = getAttachmentDownloadURL(queryURL);
        } else {
            // Case 3: not an attachment — expect a direct <ri:url> reference.
            refs = element.select("ri|url");
            if (refs.size() > 0) {
                downloadURL = refs.get(0).attr("ri:value");
                URL tempURL = new URL(downloadURL);
                String urlPath = tempURL.getPath();
                // Use the URL's basename as the image key.
                imageKey = urlPath.substring(urlPath.lastIndexOf('/') + 1);
            } else {
                throw new RuntimeException("Image format unknown: " + element.toString());
            }
        }
        // Download the image data.
        image.filename = imageKey.replace(' ', '_'); // Spaces are not handled by LaTeX.
        if (downloadURL != null) {
            LOGGER.info(" about to download image {}/{}", new Object[] { image.pageId, image.filename });
            image.data = IOUtils.getImageFromURL(downloadURL);
        } else {
            // Image entry is still recorded, just without data.
            LOGGER.info(" NULL download URL for page/image: {}/{}",
                    new Object[] { image.pageId, image.filename });
        }
        page.images.put(imageKey, image);
    }
}
From source file:org.mar9000.space2latex.WikiPage.java
public static WikiPage loadForFormat(File file) throws IOException { String fileContent = IOUtils.readFileAsString(file); Document doc = Jsoup.parseBodyFragment(fileContent); // Maintain input string. doc.outputSettings().prettyPrint(false); Element body = doc.body();/*from w ww .java2 s. c om*/ Element pageElement = body.select("page").first(); String title = pageElement.attr("title"); String id = pageElement.attr("id"); Element pageContent = pageElement.select("content").first(); WikiPage page = new WikiPage(null, title, id, pageContent.html()); page.pageContent = pageContent; // Images. Elements images = body.select("wikiimages").first().select("wikiimage"); for (Element imageElement : images) { WikiImage image = new WikiImage(); String acKey = imageElement.select("ac|image").first().outerHtml(); image.filename = imageElement.attr("pageid") + "/" + imageElement.attr("filename"); page.images.put(acKey, image); } return page; }
From source file:org.metaservice.core.deb.util.GitCache.java
/**
 * Crawls a Debian-style repository snapshot site breadth/depth-mixed from
 * startString, following "next change" links and directory listings, and
 * hands every discovered Packages.gz / Sources.gz index to
 * processFileToParse(). Traversal state:
 *  - parsed: URIs already fetched (dedup for "next change" links)
 *  - toParse: work queue (push for depth-first "next" links, add for dirs)
 *  - dists: distribution directory names already expanded
 * Relies on instance fields clientMetaservice, startString, licenses,
 * archs and the counter i.
 */
public void runDiscovery() {
    HashSet<String> parsed = new HashSet<>();
    LinkedList<String> toParse = new LinkedList<>();
    HashSet<String> dists = new HashSet<>();
    toParse.add(startString);
    while (toParse.size() > 0) {
        String uri = toParse.pop();
        try {
            String s = clientMetaservice.get(uri);
            if (s == null) {
                // Fetch failed: log and move on rather than aborting the crawl.
                LOGGER.error("Couldn't load " + uri + " skipping.");
                continue;
            }
            // uri doubles as the base URI so abs:href resolves correctly.
            Document document = Jsoup.parse(s, uri);
            parsed.add(uri);
            // Follow the snapshot "next change" link, depth-first (push).
            for (Element e : document.select("a:contains(next change)")) {
                String href = e.attr("abs:href");
                if (!parsed.contains(href) && !toParse.contains(href)) {
                    // NOTE(review): missing {} placeholder — href is never
                    // printed by this log call.
                    LOGGER.info("adding (next) ", href);
                    toParse.push(href);
                }
            }
            // Expand directory links (hrefs ending in "/").
            for (Element e : document.select("a[href$=/]")) {
                String absHref = e.attr("abs:href");
                String href = e.attr("href");
                // Skip parent/absolute/hidden entries and already-seen dists.
                if (!dists.contains(href) && !href.startsWith("/") && !href.startsWith(".")
                        /* &&!toParse.contains(href) */) {
                    if (uri.endsWith("dists/")
                            /*&& !href.contains("sid") && !href.contains("experimental")*/) {
                        // New distribution dir: queue each license section under it.
                        dists.add(href);
                        LOGGER.info(href);
                        for (String license : licenses) {
                            String url = absHref + license + "/";
                            LOGGER.info("adding (lic) {}", url);
                            toParse.add(url);
                        }
                    }
                    // Inside a license section: queue matching binary-<arch>
                    // and source directories.
                    for (String license : licenses) {
                        if (uri.endsWith(license + "/")) {
                            if (href.startsWith("binary-")) {
                                for (String arch : archs) {
                                    if (href.contains(arch)) {
                                        LOGGER.info("adding (archdir) {}", absHref);
                                        toParse.add(absHref);
                                    }
                                }
                            }
                            if (href.startsWith("source")) {
                                LOGGER.info("adding (archdir) {}", absHref);
                                toParse.add(absHref);
                            }
                        }
                    }
                }
            }
            // Process package/source indexes found on this page.
            for (Element e : document.select("a[abs:href$=Packages.gz] , a[abs:href$=Sources.gz]")) {
                String href = e.attr("abs:href");
                // Only if this seems to be a non-duplicate: either there is no
                // "prev change" link, or it points at the same target as the
                // plain "prev" link (i.e. this snapshot is not a repeat).
                if (document.select("a:contains(prev change)").size() == 0
                        || document.select("a:contains(prev change)").get(0).attr("abs:href").equals(document
                                .select("a:contains(prev):not(:contains(change))").get(0).attr("abs:href"))) {
                    LOGGER.info("RESULT processing ... {} {} ", i++, href);
                    processFileToParse(href);
                }
            }
        } catch (RuntimeException exception) {
            // Log for context, then rethrow — runtime failures abort the crawl.
            LOGGER.error("RUNTIME EXCEPTION ", exception);
            throw exception;
        }
    }
}