List of usage examples for org.jsoup.select.Elements.html()
public String html()
From source file:io.sightly.tck.html.HTMLExtractor.java
/** * Retrieves the content of an element, without its own markup tags, identified by the {@code selector} from the given {@code markup}. * The {@code url} is used only for caching purposes, to avoid parsing multiple times the markup returned for the same resource. * * @param url the url that identifies the markup * @param markup the markup/* ww w.j av a2 s . com*/ * @param selector the selector used for retrieval * @return the contents of the selected element */ public static String innerHTML(String url, String markup, String selector) { ensureMarkup(url, markup); Document document = documents.get(url); Elements elements = document.select(selector); return elements.html(); }
From source file:com.geecko.QuickLyric.lyrics.Genius.java
/**
 * Downloads the lyrics page at {@code url} and builds a {@code Lyrics} object from it.
 *
 * <p>The lyrics are read from the {@code div.lyrics} element and cleaned so that only
 * {@code <br>} tags survive. When {@code artist} is {@code null}, both the title and the
 * artist are read from the page ({@code text_title} / {@code text_artist} classes) if
 * those elements are present.
 *
 * @param url    address of the lyrics page
 * @param artist artist name, or {@code null} to read artist and title from the page
 * @param title  song title; replaced by the page's title when {@code artist} is {@code null}
 * @return a {@code Lyrics} flagged {@code NO_RESULT} on an HTTP status error, {@code ERROR}
 *         on any other I/O failure or when the page has no {@code div.lyrics} element,
 *         {@code NEGATIVE_RESULT} for instrumental tracks, {@code POSITIVE_RESULT} otherwise
 */
public static Lyrics fromURL(String url, String artist, String title) {
    Document lyricsPage;
    String text;
    try {
        lyricsPage = Jsoup.connect(url).get();
        Elements lyricsDiv = lyricsPage.select("div.lyrics");
        if (lyricsDiv.isEmpty()) {
            // No lyrics container: report the error directly instead of throwing an
            // exception only to catch it a few lines below.
            return new Lyrics(Lyrics.ERROR);
        }
        text = Jsoup.clean(lyricsDiv.html(), Whitelist.none().addTags("br")).trim();
    } catch (HttpStatusException e) {
        return new Lyrics(Lyrics.NO_RESULT);
    } catch (IOException | StringIndexOutOfBoundsException e) {
        // StringIndexOutOfBoundsException is still caught so that callers keep seeing an
        // ERROR result (never this exception) if parsing/cleaning ever raises it.
        e.printStackTrace();
        return new Lyrics(Lyrics.ERROR);
    }

    if (artist == null) {
        // Only overwrite the metadata when the page actually provides it; calling get(0)
        // on an empty selection would throw IndexOutOfBoundsException.
        Elements titleElements = lyricsPage.getElementsByClass("text_title");
        if (!titleElements.isEmpty()) {
            title = titleElements.get(0).text();
        }
        Elements artistElements = lyricsPage.getElementsByClass("text_artist");
        if (!artistElements.isEmpty()) {
            artist = artistElements.get(0).text();
        }
    }

    Lyrics result = "[Instrumental]".equals(text)
            ? new Lyrics(Lyrics.NEGATIVE_RESULT)
            : new Lyrics(Lyrics.POSITIVE_RESULT);
    result.setArtist(artist);
    result.setTitle(title);
    result.setText(text);
    result.setURL(url);
    result.setSource("Genius");
    return result;
}
From source file:model.SongMeaningsScraper.java
private static String scrapeLyricsPage(String songURL) { String lyrics = ""; // Try to load page using Jsoup try {/*from w ww. j a va2 s .c o m*/ // Load page into Document Document doc = Jsoup.connect(songURL).get(); // Get lyricBox from page Elements lyricBox = doc.select("#textblock"); // Remove ads lyricBox.get(0).getElementsByTag("div").remove(); // Remove comments ParseUtils.removeComments(lyricBox.get(0)); // We now have almost perfect lyrics. lyrics = lyricBox.html(); /*TextNode t = TextNode.createFromEncoded(lyrics, "songmeanings.net"); lyrics = t.getWholeText(); Remove minimal HTML tags, leaving newlines intact */ lyrics = lyrics.replaceAll("<br />", ""); lyrics = lyrics.replaceAll("<i>", ""); lyrics = lyrics.replaceAll("</i>", ""); lyrics = lyrics.replaceAll("<b>", ""); lyrics = lyrics.replaceAll("</b>", ""); lyrics = lyrics.replaceAll("<p>", ""); lyrics = lyrics.replaceAll("</p>", ""); lyrics = lyrics.replaceAll("<", "<"); lyrics = lyrics.replaceAll(">", ">"); lyrics = lyrics.replaceAll("", "\'"); if (lyrics.contains("Due to copyright restrictions") || lyrics.contains("Due to a publisher block")) { Logger.LogToStatusBar("Copyright restrictions on this track, bailing out!"); return ""; } lyrics = " " + lyrics; //System.out.println(lyrics); } catch (IOException e) { // TODO Auto-generated catch block System.out.println("Lyrics not found!"); } System.out.println("Done"); return lyrics; }
From source file:com.manisha.allmybooksarepacked.service.BookParser.java
/**
 * Reads the book price from the elements selected by {@code PathMapping.PRICE}.
 *
 * <p>Dollar signs and thousands separators are stripped from the selected HTML before it
 * is parsed.
 *
 * @return the parsed price, or {@code null} when the text is blank or cannot be parsed
 */
private Double findPrice() {
    Elements priceElements = doc.select(PathMapping.PRICE);
    String rawPrice = priceElements.html();
    String numericText = rawPrice.replace("$", "").replaceAll(",", "");
    try {
        if (!StringUtils.isNotBlank(numericText)) {
            return null;
        }
        return Double.valueOf(numericText);
    } catch (Exception ignored) {
        // Best effort: an unparsable price is reported as "no price".
        return null;
    }
}
From source file:com.gorsini.searcher.CanalplaySearcher.java
public void check() throws HTMLChangeException, IOException { String url = makeURL("intouchables"); Document doc = Jsoup.connect(url).referrer("http://vod.canalplay.com/").get(); Elements movies = doc.select("div.list_movie"); String html = movies.html(); String previousHTML = null;// ww w . j av a 2 s.c om LOG.finest(html); File file = new File(CHECK_FILENAME); if (file.exists()) { previousHTML = FileUtils.readFileToString(new File(CHECK_FILENAME)); } else { LOG.log(Level.INFO, "sauvegarde check"); FileUtils.writeStringToFile(file, html); } if (previousHTML != null && !html.equals(previousHTML)) { // sauvegarde la nouvelle version pour pouvoir la comparer. FileUtils.writeStringToFile(new File(CHECK_FILENAME + ".new"), html); throw new HTMLChangeException(); } else { LOG.log(Level.INFO, "no change detected into HTML response"); } }
From source file:com.manisha.allmybooksarepacked.service.BookParser.java
/**
 * Extracts the book title from the elements selected by {@code PathMapping.TITLE}.
 *
 * @return the trimmed HTML of the title elements once their {@code span} descendants
 *         have been removed
 */
private String findTitle() {
    final Elements titleElements = doc.select(PathMapping.TITLE);
    // Drop the nested spans so only the title text itself remains.
    final Elements nestedSpans = titleElements.select("span");
    nestedSpans.remove();
    final String titleHtml = titleElements.html();
    return titleHtml.trim();
}
From source file:com.manisha.allmybooksarepacked.service.BookParser.java
/**
 * Extracts the ISBN-10 from the elements selected by {@code PathMapping.ISBN_10}.
 *
 * @return the trimmed HTML of the selected elements once their {@code b} descendants
 *         have been removed
 */
private String findIsbn10() {
    final Elements isbnElements = doc.select(PathMapping.ISBN_10);
    // Remove the bold children before reading the remaining content.
    final Elements boldChildren = isbnElements.select("b");
    boldChildren.remove();
    final String isbnHtml = isbnElements.html();
    return isbnHtml.trim();
}
From source file:com.manisha.allmybooksarepacked.service.BookParser.java
/**
 * Extracts the book language from the elements selected by {@code PathMapping.LANGUAGE}.
 *
 * @return the trimmed HTML of the selected elements once their {@code b} descendants
 *         have been removed
 */
private String findLanguage() {
    final Elements languageElements = doc.select(PathMapping.LANGUAGE);
    // Remove the bold children before reading the remaining content.
    final Elements boldChildren = languageElements.select("b");
    boldChildren.remove();
    final String languageHtml = languageElements.html();
    return languageHtml.trim();
}
From source file:com.manisha.allmybooksarepacked.service.BookParser.java
/**
 * Extracts the publisher name from the elements selected by {@code PathMapping.PUBLISHER}.
 *
 * <p>After the {@code b} descendants are removed, the text is cut at the first
 * {@code "("} and trimmed; the result is then cut at its last {@code ";"} if one is
 * present.
 *
 * @return the publisher text; when the selected HTML contains no {@code "("}, the whole
 *         trimmed HTML is used instead of failing
 */
private String findPublisher() {
    Elements publisher = doc.select(PathMapping.PUBLISHER);
    publisher.select("b").remove();

    // Serialize once instead of calling html() twice.
    String html = publisher.html();

    // indexOf returns -1 when there is no "(", and substring(0, -1) would throw
    // StringIndexOutOfBoundsException; fall back to the full text in that case.
    int parenIndex = html.indexOf("(");
    String str = (parenIndex >= 0 ? html.substring(0, parenIndex) : html).trim();

    int semicolonIndex = str.lastIndexOf(";");
    if (semicolonIndex != -1) {
        str = str.substring(0, semicolonIndex);
    }
    return str;
}
From source file:com.manisha.allmybooksarepacked.service.BookParser.java
private Integer findPages() { Elements pages = doc.select(PathMapping.PAGES_HARDCOVER); pages.select("b").remove(); if (StringUtils.isNotBlank(pages.html())) { try {// www . j ava 2 s . c o m return Integer.valueOf(pages.html().split(" ")[0].replaceAll(",", "")); } catch (Exception ex) { } } else { pages = doc.select(PathMapping.PAGES_PAPERBACK); pages.select("b").remove(); try { return Integer.valueOf(pages.html().split(" ")[0].replaceAll(",", "")); } catch (Exception ex) { } } return null; }