List of usage examples for org.jsoup.nodes.Document.select
public Elements select(String cssQuery)
From source file:com.slidespeech.server.service.TextToSpeechService.java
private static String createXML4Cereproc(String fileName, String speakernotes) throws IOException { List<String> voices = new ArrayList<String>(); try {//from w w w. jav a 2s . c o m Document doc = Jsoup.parse(speakernotes, ""); doc.outputSettings().prettyPrint(false); Elements voiceNodes = doc.select("voice"); for (Element voiceNode : voiceNodes) { String lang = (voiceNode.hasAttr("xml:lang") && !voiceNode.attr("xml:lang").equals("")) ? voiceNode.attr("xml:lang") : "en"; String gender = (voiceNode.hasAttr("gender") && !voiceNode.attr("gender").equals("")) ? voiceNode.attr("gender") : "female"; String voiceName = (voiceNode.hasAttr("name") && !voiceNode.attr("name").equals("")) ? voiceNode.attr("name") : ""; //voice name not set by user -> choose one depending on language and gender if (voiceName.equals("")) { voiceName = "isabella";//default //if(lang.equalsIgnoreCase("en") && gender.equalsIgnoreCase("female")) voiceName = "isabella"; if (lang.equalsIgnoreCase("en") && gender.equalsIgnoreCase("male")) voiceName = "william"; if (lang.equalsIgnoreCase("de")) voiceName = "alex"; voiceNode.attr("name", voiceName); } if (!voices.contains(voiceName)) { voices.add(voiceName); } } BufferedWriter out = new BufferedWriter(new FileWriter(fileName)); out.write(doc.select("body").first().html()); //out.write(doc.select("body").first().html()); out.close(); for (int i = 0; i < voices.size(); i++) { if (voices.get(i).equals("william")) voices.set(i, "/opt/cereproc/cerevoice_william_3.0.5_22k.voice"); if (voices.get(i).equals("isabella")) voices.set(i, "/opt/cereproc/cerevoice_isabella_3.0.3_22k.voice"); if (voices.get(i).equals("alex")) voices.set(i, "/opt/cereproc/cerevoice_alex_3.0.0_beta_22k.voice"); } } catch (Exception e) { //Fallback if ssml parsing fails Writer out = new OutputStreamWriter(new FileOutputStream(fileName)); try { out.write(speakernotes); } finally { out.close(); } voices.add("ssml parsing failed"); } return StringUtils.join(voices, ","); }
From source file:model.SongMeaningsScraper.java
private static String scrapeLyricsPage(String songURL) { String lyrics = ""; // Try to load page using Jsoup try {//from w w w .j av a2s. co m // Load page into Document Document doc = Jsoup.connect(songURL).get(); // Get lyricBox from page Elements lyricBox = doc.select("#textblock"); // Remove ads lyricBox.get(0).getElementsByTag("div").remove(); // Remove comments ParseUtils.removeComments(lyricBox.get(0)); // We now have almost perfect lyrics. lyrics = lyricBox.html(); /*TextNode t = TextNode.createFromEncoded(lyrics, "songmeanings.net"); lyrics = t.getWholeText(); Remove minimal HTML tags, leaving newlines intact */ lyrics = lyrics.replaceAll("<br />", ""); lyrics = lyrics.replaceAll("<i>", ""); lyrics = lyrics.replaceAll("</i>", ""); lyrics = lyrics.replaceAll("<b>", ""); lyrics = lyrics.replaceAll("</b>", ""); lyrics = lyrics.replaceAll("<p>", ""); lyrics = lyrics.replaceAll("</p>", ""); lyrics = lyrics.replaceAll("<", "<"); lyrics = lyrics.replaceAll(">", ">"); lyrics = lyrics.replaceAll("", "\'"); if (lyrics.contains("Due to copyright restrictions") || lyrics.contains("Due to a publisher block")) { Logger.LogToStatusBar("Copyright restrictions on this track, bailing out!"); return ""; } lyrics = " " + lyrics; //System.out.println(lyrics); } catch (IOException e) { // TODO Auto-generated catch block System.out.println("Lyrics not found!"); } System.out.println("Done"); return lyrics; }
From source file:automation.Launcher.java
public static String br2nl(String html) { if (html == null) { return html; }/*from w w w .j a va2 s . c o m*/ Document document = Jsoup.parse(html); document.outputSettings(new Document.OutputSettings().prettyPrint(false));//makes html() preserve linebreaks and spacing document.select("p").prepend("\\n\\n"); document.select("div").prepend("\\n"); // System.out.println(document.html()); document.select("br").append("\\n"); // System.out.println(document.html()); String s = document.html().replaceAll("\\\\n", "\n"); // System.out.println(s); return Jsoup.clean(s, "", Whitelist.none(), new Document.OutputSettings().prettyPrint(false)); }
From source file:com.vaadin.sass.testcases.scss.W3ConformanceTests.java
/**
 * Extracts the CSS of one test page into separate {@code .scss} files.
 *
 * <p>For the page at {@code url}, the following are collected in this order:
 * <ol>
 * <li>the contents of {@code <style type="text/css">} elements,</li>
 * <li>the contents of the files referenced by {@code <link rel="stylesheet">} elements
 * (relative references are resolved against {@code url}),</li>
 * <li>the inline {@code style} attribute of every element, wrapped in {@code .style { }}.</li>
 * </ol>
 * Each collected snippet is written to {@code <basename>.<index>.scss} in {@code targetdir}.
 *
 * @param url       page to download and scan
 * @param targetdir directory receiving the generated files; created if snippets were found
 * @throws Exception if the page or a stylesheet cannot be fetched or a file cannot be written
 */
public static void extractCSS(final URI url, File targetdir) throws Exception {
    Document doc = Jsoup.connect(url.toString()).timeout(20000).get();

    List<String> tests = new ArrayList<String>();

    for (Element e : doc.select("style[type=text/css]")) {
        tests.add(e.data());
    }

    // The document charset does not change while iterating, so look it up once.
    final String encoding = doc.outputSettings().charset().name();
    for (Element e : doc.select("link[rel=stylesheet][href][type=text/css]")) {
        URI cssUri = new URI(e.attr("href"));
        if (!cssUri.isAbsolute()) {
            cssUri = url.resolve(cssUri);
        }
        tests.add(IOUtils.toString(cssUri, encoding));
    }

    for (Element e : doc.select("*[style]")) {
        tests.add(String.format(".style { %s }", e.attr("style")));
    }

    for (final String test : tests) {
        targetdir.mkdirs();
        // indexOf yields the first occurrence, so identical snippets share one file name
        // (and identical content); kept to leave the generated file names unchanged.
        String logfile = String.format("%s.%d.scss", FilenameUtils.getBaseName(url.toString()),
                tests.indexOf(test));
        PrintStream dataLogger = new PrintStream(new File(targetdir, logfile));
        try {
            dataLogger.println("/* Source: " + url + " */");
            dataLogger.println(test);
        } finally {
            // Previously never closed: one leaked file handle per extracted snippet.
            dataLogger.close();
        }
    }
}
From source file:io.seldon.importer.articles.AttributesImporterUtils.java
/**
 * Collects the tags of an article.
 *
 * <p>When the first element matched by {@code tagsCssSelector} has a non-blank
 * {@code content} attribute, the tags are taken from that single element; otherwise they are
 * taken from all matched elements. Extra tags derived from the title are added to the result.
 *
 * @param articleDoc      parsed article
 * @param tagsCssSelector CSS selector locating the tag element(s); a blank selector yields an
 *                        empty set
 * @param title           article title used to derive extra tags
 * @return the combined set of tags, never {@code null}
 */
public static Set<String> getTags(Document articleDoc, String tagsCssSelector, String title) {
    final Set<String> tagSet = new HashSet<String>();
    if (!StringUtils.isNotBlank(tagsCssSelector)) {
        return tagSet;
    }

    final Elements matches = articleDoc.select(tagsCssSelector);
    final Element firstMatch = matches.first();

    // isNotBlank is false for null, so it also covers the explicit null check on the attribute.
    final boolean singleElementHoldsTags = firstMatch != null
            && StringUtils.isNotBlank(firstMatch.attr("content"));

    final List<String> tagsParts = singleElementHoldsTags
            ? AttributesImporterUtils.getTagsPartsFromSingleElement(firstMatch)
            : AttributesImporterUtils.getTagsPartsFromMultipleElement(matches);

    final List<String> titleTagsParts = AttributesImporterUtils.createExtraTagsPartsFromTitle(title, tagsParts);

    tagSet.addAll(tagsParts);
    tagSet.addAll(titleTagsParts);
    return tagSet;
}
From source file:models.NotificationMail.java
/**
 * Limits the width of every image in the document and wraps it in a link to the image itself.
 *
 * <p>{@code max-width:1024px;} is prepended to the existing {@code style} attribute of each
 * {@code img}, and the image is wrapped in an anchor that opens its {@code src} in a new window.
 *
 * @param doc document whose images are modified in place
 */
private static void handleImages(Document doc) {
    for (Element img : doc.select("img")) {
        img.attr("style", "max-width:1024px;" + img.attr("style"));

        // Wrap with a fixed fragment and set href through the attribute API. Formatting the raw
        // src into the fragment let quotes or entity-like sequences in src corrupt the markup.
        // The empty href placeholder keeps the attribute order of the generated anchor.
        img.wrap("<a href=\"\" target=\"_blank\" style=\"border:0;outline:0;\"></a>");
        Element link = img.parent();
        if (link != null) {
            link.attr("href", img.attr("src"));
        }
    }
}
From source file:com.uniteddev.Unity.Downloader.java
public static ArrayList<String> getDirectoryListing(String url) throws IOException { Document doc = Jsoup.connect(Unity.url + Unity.folder + "/" + url).get(); Elements extracted_links = doc.select("a[href]"); ArrayList<String> links = new ArrayList<String>(); for (int i = 0; i < extracted_links.size(); i++) { String check = extracted_links.get(i).attr("href"); if (!(check.contains("?C=N;O=D")) && !(check.contains("?C=M;O=A")) && !(check.contains("?C=D;O=A")) && !(check.contains("?C=S;O=A")) && !(check.contains("content/minecraft/files"))) { links.add(extracted_links.get(i).attr("href")); }//from www . j a v a 2 s.c o m } return links; }
From source file:com.nuance.expertassistant.ContentExtractor.java
public static void extract(Document doc) { final Elements links = doc.getElementsByTag("a"); final Elements ps = doc.select("p"); final String title = doc.title(); print("<section id =\"{}\" title =\"" + stripNonValidXMLCharacters(doc.title()) + "\">"); final Elements elements = doc.select("*"); final ArrayList<String> openHeaderList = new ArrayList<String>(); for (final Element element : elements) { if (element.ownText() == null || element.ownText().isEmpty() || element.ownText().trim() == "") { } else if (element.tagName().toString().contains("a")) { } else if (element.tagName().contains("h1") && element.text() != null && !element.text().isEmpty()) { if (openHeaderList.contains("h1")) { openHeaderList.remove("h1"); print("</section>"); }// ww w. j av a 2 s . com if (openHeaderList.contains("h2")) { openHeaderList.remove("h2"); print("</section>"); } if (openHeaderList.contains("h3")) { openHeaderList.remove("h3"); print("</section>"); } if (openHeaderList.contains("h4")) { openHeaderList.remove("h4"); print("</section>"); } print("<section id =\"{}\" title =\"" + stripNonValidXMLCharacters(element.text()) + "\">"); openHeaderList.add("h1"); } else if (element.tagName().contains("h2") && element.text() != null && !element.text().isEmpty()) { if (openHeaderList.contains("h2")) { openHeaderList.remove("h2"); print("</section>"); } if (openHeaderList.contains("h3")) { openHeaderList.remove("h3"); print("</section>"); } if (openHeaderList.contains("h4")) { openHeaderList.remove("h4"); print("</section>"); } print("<section id =\"{}\" title =\"" + stripNonValidXMLCharacters(element.text()) + "\">"); openHeaderList.add("h2"); } else if (element.tagName().contains("h3") && element.text() != null && !element.text().isEmpty()) { if (openHeaderList.contains("h3")) { openHeaderList.remove("h3"); print("</section>"); } if (openHeaderList.contains("h4")) { openHeaderList.remove("h4"); print("</section>"); } print("<section id =\"{}\" title =\"" + 
stripNonValidXMLCharacters(element.text()) + "\">"); openHeaderList.add("h3"); } else if (element.tagName().contains("h4") && element.text() != null && !element.text().isEmpty()) { if (openHeaderList.contains("h4")) { openHeaderList.remove("h4"); print("</section>"); } print("<section id =\"{}\" title =\"" + stripNonValidXMLCharacters(element.text()) + "\">"); openHeaderList.add("h4"); } else { print("<para>"); print(stripNonValidXMLCharacters(element.ownText())); print("</para>"); } /* * if (element.tagName().contains("img")) { print("<img src=\"" + * element.attr("src") + "\"></img>"); } */ } if (openHeaderList.contains("h1")) { openHeaderList.remove("h1"); print("</section>"); } if (openHeaderList.contains("h2")) { openHeaderList.remove("h2"); print("</section>"); } if (openHeaderList.contains("h3")) { openHeaderList.remove("h3"); print("</section>"); } if (openHeaderList.contains("h4")) { openHeaderList.remove("h4"); print("</section>"); } print("</section>"); }
From source file:index.IndexManager.java
/**
 * Builds the Solr input document for a parsed page.
 *
 * <p>The result carries the page location as {@code id}, the current time in milliseconds,
 * the title, all absolute link and media URLs, the formatted text of heading and emphasis
 * elements, and the page text split into numbered {@code N_text} chunks.
 *
 * @param document parsed page
 * @return the Solr document together with the collected link URLs and media URLs
 */
public static Triple<SolrInputDocument, Collection<String>, Collection<String>> index(Document document) {
    final SolrInputDocument solrDoc = new SolrInputDocument();
    solrDoc.setField("id", document.location());
    solrDoc.setField("time", String.valueOf(System.currentTimeMillis()));
    solrDoc.setField("title", document.title());

    final Set<String> links = document.select("a[href]").stream()
            .map(anchor -> anchor.attr("abs:href"))
            .collect(Collectors.toSet());
    final Set<String> media = document.select("[src]").stream()
            .map(source -> source.attr("abs:src"))
            .collect(Collectors.toSet());

    for (String link : links) {
        solrDoc.addField("link", link);
    }
    for (String mediaUrl : media) {
        solrDoc.addField("media", mediaUrl);
    }

    // Each tag name doubles as the name of the field its text is stored under.
    final String[] textTags = { "h1", "h2", "h3", "strong", "em", "b", "u", "i" };
    for (String tag : textTags) {
        formatText(document.getElementsByTag(tag).stream()).forEach(text -> solrDoc.addField(tag, text));
    }

    // Chunk fields are numbered starting at 1.
    int chunkNumber = 0;
    for (String chunk : chunkToLength(document.text())) {
        chunkNumber++;
        solrDoc.addField(chunkNumber + "_text", chunk);
    }

    return Triple.of(solrDoc, links, media);
}
From source file:io.apiman.tools.i18n.TemplateScanner.java
/** * Scan the given html template using jsoup and find all strings that require translation. This is * done by finding all elements with a "apiman-i18n-key" attribute. * @param file/*from ww w .j av a 2 s . com*/ * @param strings * @throws IOException */ private static void scanFile(File file, TreeMap<String, String> strings) throws IOException { Document doc = Jsoup.parse(file, "UTF-8"); // First, scan for elements with the 'apiman-i18n-key' attribute. These require translating. Elements elements = doc.select("*[apiman-i18n-key]"); for (Element element : elements) { String i18nKey = element.attr("apiman-i18n-key"); boolean isNecessary = false; // Process the element text (if the element has no children) if (strings.containsKey(i18nKey)) { if (hasNoChildren(element)) { isNecessary = true; String elementVal = element.text(); if (elementVal.trim().length() > 0 && !elementVal.contains("{{")) { String currentValue = strings.get(i18nKey); if (!currentValue.equals(elementVal)) { throw new IOException("Duplicate i18n key found with different default values. Key=" + i18nKey + " Value1=" + elementVal + " Value2=" + currentValue); } } } } else { if (hasNoChildren(element)) { String elementVal = element.text(); if (elementVal.trim().length() > 0 && !elementVal.contains("{{")) { isNecessary = true; strings.put(i18nKey, elementVal); } } } // Process the translatable attributes for (String tattr : TRANSLATABLE_ATTRIBUTES) { if (element.hasAttr(tattr)) { String attrValue = element.attr(tattr); if (attrValue.contains("{{")) { continue; } String attrI18nKey = i18nKey + '.' + tattr; String currentAttrValue = strings.get(attrI18nKey); if (currentAttrValue == null) { isNecessary = true; strings.put(attrI18nKey, attrValue); } else if (!currentAttrValue.equals(attrValue)) { throw new IOException( "Duplicate i18n key found with different default values (for attribute '" + tattr + "'). 
Key=" + attrI18nKey + " Value1=" + attrValue + " Value2=" + currentAttrValue); } else { isNecessary = true; } } } if (!isNecessary) { throw new IOException("Detected an unnecessary apiman-i18n-key attribute in file '" + file.getName() + "' on element: " + element); } } // Next, scan all elements to see if the element *should* be marked for translation elements = doc.select("*"); for (Element element : elements) { if (element.hasAttr("apiman-i18n-key") || element.hasAttr("apiman-i18n-skip")) { continue; } if (hasNoChildren(element)) { String value = element.text(); if (value != null && value.trim().length() > 0) { if (!value.contains("{{")) { throw new IOException("Found an element in '" + file.getName() + "' that should be translated: " + element); } } } } // Next scan elements with a translatable attribute and fail if any of those elements // are missing the apiman-i18n-key attribute. for (String tattr : TRANSLATABLE_ATTRIBUTES) { elements = doc.select("*[" + tattr + "]"); for (Element element : elements) { if (element.hasAttr("apiman-i18n-key") || element.hasAttr("apiman-i18n-skip") || element.attr(tattr).contains("{{")) { continue; } else { throw new IOException("In template '" + file.getName() + "', found an element with a '" + tattr + "' attribute but missing 'apiman-i18n-key': " + element); } } } }