Example usage for org.jsoup.nodes Document select

Introduction

This page collects example usages of the select method of org.jsoup.nodes.Document.

Prototype

public Elements select(String cssQuery)

Source Link

Document

Find elements that match the Selector CSS query, with this element as the starting context.

Usage

From source file:com.slidespeech.server.service.TextToSpeechService.java

/**
 * Writes the given SSML speaker notes to {@code fileName} and returns the voices they need.
 *
 * <p>Every {@code voice} element that has no (or an empty) {@code name} attribute gets one
 * assigned from its {@code xml:lang} (default {@code "en"}) and {@code gender} (default
 * {@code "female"}) attributes. The inner HTML of the parsed body is then written to the file.
 * The three known voice names are translated to their voice file paths; any other name is
 * returned unchanged.
 *
 * <p>If anything in that process throws, the raw {@code speakernotes} are written to the file
 * instead and the marker {@code "ssml parsing failed"} is appended to the result.
 *
 * @param fileName     path of the file to (over)write
 * @param speakernotes SSML markup to process
 * @return comma-separated list of voice file paths / voice names, without duplicates
 * @throws IOException if the fallback write fails
 */
private static String createXML4Cereproc(String fileName, String speakernotes) throws IOException {
    List<String> voices = new ArrayList<String>();

    try {
        Document doc = Jsoup.parse(speakernotes, "");
        doc.outputSettings().prettyPrint(false);
        Elements voiceNodes = doc.select("voice");

        for (Element voiceNode : voiceNodes) {
            String lang = (voiceNode.hasAttr("xml:lang") && !voiceNode.attr("xml:lang").equals(""))
                    ? voiceNode.attr("xml:lang")
                    : "en";
            String gender = (voiceNode.hasAttr("gender") && !voiceNode.attr("gender").equals(""))
                    ? voiceNode.attr("gender")
                    : "female";
            String voiceName = (voiceNode.hasAttr("name") && !voiceNode.attr("name").equals(""))
                    ? voiceNode.attr("name")
                    : "";

            // Voice name not set by user -> choose one depending on language and gender.
            if (voiceName.equals("")) {
                voiceName = "isabella"; // default, also covers en/female
                if (lang.equalsIgnoreCase("en") && gender.equalsIgnoreCase("male")) {
                    voiceName = "william";
                }
                if (lang.equalsIgnoreCase("de")) {
                    voiceName = "alex";
                }

                voiceNode.attr("name", voiceName);
            }
            if (!voices.contains(voiceName)) {
                voices.add(voiceName);
            }
        }

        // Close the writer even when write() fails; otherwise the handle would still be open
        // while the fallback below reopens the same file.
        BufferedWriter out = new BufferedWriter(new FileWriter(fileName));
        try {
            out.write(doc.select("body").first().html());
        } finally {
            out.close();
        }

        // Translate known voice names to their voice files; unknown names are left as they are.
        for (int i = 0; i < voices.size(); i++) {
            if (voices.get(i).equals("william")) {
                voices.set(i, "/opt/cereproc/cerevoice_william_3.0.5_22k.voice");
            }
            if (voices.get(i).equals("isabella")) {
                voices.set(i, "/opt/cereproc/cerevoice_isabella_3.0.3_22k.voice");
            }
            if (voices.get(i).equals("alex")) {
                voices.set(i, "/opt/cereproc/cerevoice_alex_3.0.0_beta_22k.voice");
            }
        }
    } catch (Exception e) {
        // Deliberate best-effort fallback: if SSML processing fails, store the notes verbatim.
        Writer out = new OutputStreamWriter(new FileOutputStream(fileName));
        try {
            out.write(speakernotes);
        } finally {
            out.close();
        }
        voices.add("ssml parsing failed");
    }

    return StringUtils.join(voices, ",");
}

From source file:model.SongMeaningsScraper.java

/**
 * Downloads a song page and extracts the lyrics text from its {@code #textblock} element.
 *
 * <p>Nested {@code div} elements and comments are removed from the first matching element,
 * a small set of HTML tags is stripped from the resulting markup, and the {@code &lt;} /
 * {@code &gt;} entities are decoded.
 *
 * @param songURL URL of the page to load
 * @return the lyrics prefixed with a single space; an empty string if the page could not be
 *         loaded, has no {@code #textblock} element, or reports a copyright/publisher block
 */
private static String scrapeLyricsPage(String songURL) {
    String lyrics = "";

    try {
        // Load page into Document
        Document doc = Jsoup.connect(songURL).get();
        // Get lyricBox from page
        Elements lyricBox = doc.select("#textblock");

        if (lyricBox.isEmpty()) {
            // Without this guard get(0) would throw an unchecked IndexOutOfBoundsException.
            System.out.println("Lyrics not found!");
        } else {
            // Remove ads
            lyricBox.get(0).getElementsByTag("div").remove();
            // Remove comments
            ParseUtils.removeComments(lyricBox.get(0));

            // We now have almost perfect lyrics.
            lyrics = lyricBox.html();

            // Remove minimal HTML tags, leaving newlines intact.
            // These are literal strings, so replace() is used instead of the regex-based replaceAll().
            lyrics = lyrics.replace("<br />", "");
            lyrics = lyrics.replace("<i>", "");
            lyrics = lyrics.replace("</i>", "");
            lyrics = lyrics.replace("<b>", "");
            lyrics = lyrics.replace("</b>", "");
            lyrics = lyrics.replace("<p>", "");
            lyrics = lyrics.replace("</p>", "");

            lyrics = lyrics.replace("&lt;", "<");
            lyrics = lyrics.replace("&gt;", ">");
            // NOTE(review): the original pattern here was an empty string, which inserts an
            // apostrophe between every character. It presumably held a typographic apostrophe
            // that was lost to an encoding problem - confirm. Both the Unicode right single
            // quotation mark (U+2019) and the windows-1252 leftover (U+0092) are normalized.
            lyrics = lyrics.replace("\u2019", "'");
            lyrics = lyrics.replace("\u0092", "'");

            if (lyrics.contains("Due to copyright restrictions")
                    || lyrics.contains("Due to a publisher block")) {
                Logger.LogToStatusBar("Copyright restrictions on this track, bailing out!");
                return "";
            }

            lyrics = " " + lyrics;
        }
    } catch (IOException e) {
        // Page could not be loaded; report it and fall through with empty lyrics.
        System.out.println("Lyrics not found!");
    }
    System.out.println("Done");
    return lyrics;
}

From source file:automation.Launcher.java

/**
 * Converts HTML to plain text while keeping line breaks.
 *
 * <p>A literal backslash-n marker is inserted at the start of every {@code p} (twice) and
 * {@code div} (once) and at the end of every {@code br}; the markers are then turned into real
 * newlines and all remaining markup is removed with an empty whitelist.
 *
 * @param html markup to convert, may be {@code null}
 * @return the text with newlines, or {@code null} if {@code html} was {@code null}
 */
public static String br2nl(String html) {
    if (html == null) {
        return null;
    }

    // Pretty printing must be off so that html() preserves line breaks and spacing.
    Document.OutputSettings parseSettings = new Document.OutputSettings().prettyPrint(false);
    Document parsed = Jsoup.parse(html);
    parsed.outputSettings(parseSettings);

    // Mark the places where line breaks belong with an escaped "\n" placeholder.
    parsed.select("p").prepend("\\n\\n");
    parsed.select("div").prepend("\\n");
    parsed.select("br").append("\\n");

    // Swap every placeholder for a real newline character.
    String withNewlines = parsed.html().replace("\\n", "\n");

    Document.OutputSettings cleanSettings = new Document.OutputSettings().prettyPrint(false);
    return Jsoup.clean(withNewlines, "", Whitelist.none(), cleanSettings);
}

From source file:com.vaadin.sass.testcases.scss.W3ConformanceTests.java

/**
 * Downloads the page at {@code url}, collects its CSS and writes each piece to its own
 * {@code .scss} file in {@code targetdir}.
 *
 * <p>Three sources are collected, in this order:
 * <ol>
 * <li>the contents of {@code style} elements with {@code type=text/css},</li>
 * <li>the contents of stylesheets linked with {@code link rel=stylesheet}, relative
 * references being resolved against {@code url},</li>
 * <li>inline {@code style} attributes of any element, each wrapped in {@code .style { }}.</li>
 * </ol>
 *
 * <p>Files are named {@code <base name of url>.<index>.scss}, where the index is the position
 * of the first occurrence of that CSS text in the collected list.
 *
 * @param url       page to download
 * @param targetdir directory receiving the generated files; created if CSS was found
 * @throws Exception if the page or a linked stylesheet cannot be read, or a file cannot be written
 */
public static void extractCSS(final URI url, File targetdir) throws Exception {
    Document doc = Jsoup.connect(url.toString()).timeout(20000).get();

    List<String> tests = new ArrayList<String>();

    for (Element e : doc.select("style[type=text/css]")) {
        tests.add(e.data());
    }

    // The document charset does not change while iterating, so look it up once.
    String encoding = doc.outputSettings().charset().name();
    for (Element e : doc.select("link[rel=stylesheet][href][type=text/css]")) {
        URI cssUri = new URI(e.attr("href"));
        if (!cssUri.isAbsolute()) {
            cssUri = url.resolve(cssUri);
        }
        tests.add(IOUtils.toString(cssUri, encoding));
    }

    for (Element e : doc.select("*[style]")) {
        tests.add(String.format(".style { %s }", e.attr("style")));
    }

    if (tests.isEmpty()) {
        // Nothing to write; as before, the target directory is not created in this case.
        return;
    }
    targetdir.mkdirs();

    String baseName = FilenameUtils.getBaseName(url.toString());
    for (final String test : tests) {
        // indexOf yields the first occurrence, so identical snippets share one file name.
        String logfile = String.format("%s.%d.scss", baseName, tests.indexOf(test));
        PrintStream dataLogger = new PrintStream(new File(targetdir, logfile));
        try {
            dataLogger.println("/* Source: " + url + " */");
            dataLogger.println(test);
        } finally {
            // Close every stream so output is flushed and file handles are not leaked.
            dataLogger.close();
        }
    }
}

From source file:io.seldon.importer.articles.AttributesImporterUtils.java

/**
 * Collects the tags of an article.
 *
 * <p>If the first element matched by {@code tagsCssSelector} has a non-blank {@code content}
 * attribute, the tags are read from that single element; otherwise they are read from all
 * matched elements. Extra tags derived from {@code title} are added to the result.
 *
 * @param articleDoc      parsed article
 * @param tagsCssSelector CSS selector locating the tag element(s); blank means "no tags"
 * @param title           article title used to derive extra tags
 * @return the collected tags; empty if {@code tagsCssSelector} is blank
 */
public static Set<String> getTags(Document articleDoc, String tagsCssSelector, String title) {
    Set<String> result = new HashSet<String>();

    // Guard clause: without a selector there is nothing to look up.
    if (!StringUtils.isNotBlank(tagsCssSelector)) {
        return result;
    }

    Elements matches = articleDoc.select(tagsCssSelector);
    Element firstMatch = matches.first();

    // isNotBlank already rejects null, so no separate null check on the attribute is needed.
    boolean hasContentAttribute = firstMatch != null
            && StringUtils.isNotBlank(firstMatch.attr("content"));

    List<String> parts = hasContentAttribute
            ? AttributesImporterUtils.getTagsPartsFromSingleElement(firstMatch)
            : AttributesImporterUtils.getTagsPartsFromMultipleElement(matches);

    List<String> titleParts = AttributesImporterUtils.createExtraTagsPartsFromTitle(title, parts);
    result.addAll(parts);
    result.addAll(titleParts);

    return result;
}

From source file:models.NotificationMail.java

/**
 * Caps the width of every image in the document and makes each image a link to itself.
 *
 * <p>{@code max-width:1024px;} is put in front of the existing {@code style} attribute, and the
 * image is wrapped in an anchor that opens the image's {@code src} in a new window.
 *
 * @param doc document whose {@code img} elements are modified in place
 */
private static void handleImages(Document doc) {
    for (Element image : doc.select("img")) {
        String cappedStyle = "max-width:1024px;" + image.attr("style");
        image.attr("style", cappedStyle);

        String source = image.attr("src");
        String anchorMarkup = String.format(
                "<a href=\"%s\" target=\"_blank\" style=\"border:0;outline:0;\"></a>", source);
        image.wrap(anchorMarkup);
    }
}

From source file:com.uniteddev.Unity.Downloader.java

/**
 * Lists the link targets found on a directory page below {@code Unity.url + Unity.folder}.
 *
 * <p>Links whose {@code href} contains one of the {@code ?C=...;O=...} query fragments or
 * {@code content/minecraft/files} are left out.
 *
 * @param url path appended (after a slash) to {@code Unity.url + Unity.folder}
 * @return the {@code href} values of the remaining links, in document order
 * @throws IOException if the page cannot be loaded
 */
public static ArrayList<String> getDirectoryListing(String url) throws IOException {
    // An href containing any of these fragments is not a real directory entry.
    final String[] ignoredFragments = { "?C=N;O=D", "?C=M;O=A", "?C=D;O=A", "?C=S;O=A",
            "content/minecraft/files" };

    Document page = Jsoup.connect(Unity.url + Unity.folder + "/" + url).get();
    Elements anchors = page.select("a[href]");

    ArrayList<String> result = new ArrayList<String>();
    for (int idx = 0; idx < anchors.size(); idx++) {
        String href = anchors.get(idx).attr("href");

        boolean ignored = false;
        for (String fragment : ignoredFragments) {
            if (href.contains(fragment)) {
                ignored = true;
                break;
            }
        }

        if (!ignored) {
            result.add(href);
        }
    }
    return result;
}

From source file:com.nuance.expertassistant.ContentExtractor.java

/**
 * Prints the document as nested {@code section} / {@code para} markup.
 *
 * <p>An outer section titled with the document title is opened first. Then every element with
 * non-blank own text is visited in document order:
 * <ul>
 * <li>elements whose tag name contains the letter {@code a} are skipped,</li>
 * <li>{@code h1}..{@code h4} elements close any open section of the same or a deeper level and
 * open a new section titled with the header text,</li>
 * <li>all other elements print their own text as a {@code para}.</li>
 * </ul>
 * Finally all sections that are still open, including the outer one, are closed.
 *
 * @param doc document to print
 */
public static void extract(Document doc) {

    print("<section id =\"{}\" title =\"" + stripNonValidXMLCharacters(doc.title()) + "\">");

    // Header levels ("h1".."h4") whose section is currently open.
    final ArrayList<String> openHeaderList = new ArrayList<String>();

    for (final Element element : doc.select("*")) {
        final String ownText = element.ownText();
        // Compare by content; the former reference comparison (== "") was unreliable.
        if (ownText == null || ownText.trim().isEmpty()) {
            continue;
        }

        final String tagName = element.tagName();
        if (tagName.contains("a")) {
            // NOTE(review): this skips every tag containing the letter 'a' (span, table, ...),
            // not only anchors. Kept as is to preserve the output - confirm the intent.
            continue;
        }

        final int headerLevel = headerLevelOf(tagName);
        final String text = element.text();
        if (headerLevel > 0 && text != null && !text.isEmpty()) {
            // A new header ends every open section of the same or a deeper level.
            closeOpenSections(openHeaderList, headerLevel);
            print("<section id =\"{}\" title =\"" + stripNonValidXMLCharacters(text) + "\">");
            openHeaderList.add("h" + headerLevel);
        } else {
            print("<para>");
            print(stripNonValidXMLCharacters(ownText));
            print("</para>");
        }
    }

    closeOpenSections(openHeaderList, 1);

    // Close the outer document section opened at the top.
    print("</section>");

}

/** Returns the first level 1..4 for which the tag name contains "h" + level, or 0 if none does. */
private static int headerLevelOf(String tagName) {
    for (int level = 1; level <= 4; level++) {
        if (tagName.contains("h" + level)) {
            return level;
        }
    }
    return 0;
}

/** Prints a closing section tag for every open header level from {@code fromLevel} up to 4. */
private static void closeOpenSections(ArrayList<String> openHeaderList, int fromLevel) {
    for (int level = fromLevel; level <= 4; level++) {
        // remove(Object) returns true only if the level was present, i.e. its section is open.
        if (openHeaderList.remove("h" + level)) {
            print("</section>");
        }
    }
}

From source file:index.IndexManager.java

/**
 * Builds a Solr input document from a parsed page.
 *
 * <p>The result carries the page location as {@code id}, the current time in milliseconds as
 * {@code time}, the page title, the absolute targets of all {@code a[href]} links and of all
 * {@code [src]} elements, the formatted text of several heading/emphasis tags, and the page
 * text split into chunks stored as {@code 1_text}, {@code 2_text}, ...
 *
 * @param document parsed page to index
 * @return the Solr document together with the collected link and media URLs
 */
public static Triple<SolrInputDocument, Collection<String>, Collection<String>> index(Document document) {
    final SolrInputDocument solrDoc = new SolrInputDocument();
    solrDoc.setField("id", document.location());
    solrDoc.setField("time", String.valueOf(System.currentTimeMillis()));
    solrDoc.setField("title", document.title());

    // "abs:" makes jsoup resolve the attribute value against the document's base URI.
    final Set<String> links = document.select("a[href]").stream()
            .map(anchor -> anchor.attr("abs:href"))
            .collect(Collectors.toSet());
    final Set<String> media = document.select("[src]").stream()
            .map(source -> source.attr("abs:src"))
            .collect(Collectors.toSet());

    for (String link : links) {
        solrDoc.addField("link", link);
    }
    for (String mediaLink : media) {
        solrDoc.addField("media", mediaLink);
    }

    // Each of these tags is indexed under a field named after the tag, in this order.
    final String[] indexedTags = { "h1", "h2", "h3", "strong", "em", "b", "u", "i" };
    for (final String tag : indexedTags) {
        formatText(document.getElementsByTag(tag).stream()).forEach(value -> solrDoc.addField(tag, value));
    }

    // Chunk fields are numbered starting at 1.
    int chunkNumber = 0;
    for (String chunk : chunkToLength(document.text())) {
        chunkNumber++;
        solrDoc.addField(chunkNumber + "_text", chunk);
    }

    return Triple.of(solrDoc, links, media);
}

From source file:io.apiman.tools.i18n.TemplateScanner.java

/**
 * Scan the given html template using jsoup and find all strings that require translation.  This is
 * done by finding all elements with a "apiman-i18n-key" attribute.
 * @param file/*from ww  w .j  av  a  2 s . com*/
 * @param strings
 * @throws IOException
 */
private static void scanFile(File file, TreeMap<String, String> strings) throws IOException {
    Document doc = Jsoup.parse(file, "UTF-8");

    // First, scan for elements with the 'apiman-i18n-key' attribute.  These require translating.
    Elements elements = doc.select("*[apiman-i18n-key]");
    for (Element element : elements) {
        String i18nKey = element.attr("apiman-i18n-key");
        boolean isNecessary = false;

        // Process the element text (if the element has no children)
        if (strings.containsKey(i18nKey)) {
            if (hasNoChildren(element)) {
                isNecessary = true;
                String elementVal = element.text();
                if (elementVal.trim().length() > 0 && !elementVal.contains("{{")) {
                    String currentValue = strings.get(i18nKey);
                    if (!currentValue.equals(elementVal)) {
                        throw new IOException("Duplicate i18n key found with different default values.  Key="
                                + i18nKey + "  Value1=" + elementVal + "  Value2=" + currentValue);
                    }
                }
            }
        } else {
            if (hasNoChildren(element)) {
                String elementVal = element.text();
                if (elementVal.trim().length() > 0 && !elementVal.contains("{{")) {
                    isNecessary = true;
                    strings.put(i18nKey, elementVal);
                }
            }
        }

        // Process the translatable attributes
        for (String tattr : TRANSLATABLE_ATTRIBUTES) {
            if (element.hasAttr(tattr)) {
                String attrValue = element.attr(tattr);
                if (attrValue.contains("{{")) {
                    continue;
                }
                String attrI18nKey = i18nKey + '.' + tattr;
                String currentAttrValue = strings.get(attrI18nKey);
                if (currentAttrValue == null) {
                    isNecessary = true;
                    strings.put(attrI18nKey, attrValue);
                } else if (!currentAttrValue.equals(attrValue)) {
                    throw new IOException(
                            "Duplicate i18n key found with different default values (for attribute '" + tattr
                                    + "').  Key=" + attrI18nKey + "  Value1=" + attrValue + "  Value2="
                                    + currentAttrValue);
                } else {
                    isNecessary = true;
                }
            }
        }

        if (!isNecessary) {
            throw new IOException("Detected an unnecessary apiman-i18n-key attribute in file '" + file.getName()
                    + "' on element: " + element);
        }
    }

    // Next, scan all elements to see if the element *should* be marked for translation
    elements = doc.select("*");
    for (Element element : elements) {
        if (element.hasAttr("apiman-i18n-key") || element.hasAttr("apiman-i18n-skip")) {
            continue;
        }
        if (hasNoChildren(element)) {
            String value = element.text();
            if (value != null && value.trim().length() > 0) {
                if (!value.contains("{{")) {
                    throw new IOException("Found an element in '" + file.getName()
                            + "' that should be translated:  " + element);
                }
            }
        }
    }

    // Next scan elements with a translatable attribute and fail if any of those elements
    // are missing the apiman-i18n-key attribute.
    for (String tattr : TRANSLATABLE_ATTRIBUTES) {
        elements = doc.select("*[" + tattr + "]");
        for (Element element : elements) {
            if (element.hasAttr("apiman-i18n-key") || element.hasAttr("apiman-i18n-skip")
                    || element.attr(tattr).contains("{{")) {
                continue;
            } else {
                throw new IOException("In template '" + file.getName() + "', found an element with a '" + tattr
                        + "' attribute but missing 'apiman-i18n-key': " + element);
            }
        }
    }

}