Example usage for org.jsoup.nodes Element getElementsByTag

List of usage examples for org.jsoup.nodes Element getElementsByTag

Introduction

On this page you can find example usages of org.jsoup.nodes.Element.getElementsByTag.

Prototype

public Elements getElementsByTag(String tagName) 

Source Link

Document

Finds elements, including and recursively under this element, with the specified tag name.

Usage

From source file:org.xwiki.validator.HTML5DutchWebGuidelinesValidator.java

/**
 * Avoid automatic redirection during interaction with forms.
 *
 * <p>For every form, asserts with {@code Type.ERROR} and key {@code rpd13s4.submit} that the form
 * contains an {@code ELEM_INPUT} element whose {@code ATTR_TYPE} attribute is {@code submit} or
 * {@code image}, and adds a {@code Type.WARNING} with key {@code rpd13s4.select} when one of its
 * {@code select} elements has an {@code onchange} attribute.
 */
public void validateRpd13s4() {
    for (Element form : getElements(ELEM_FORM)) {
        // Stop at the first input that is able to submit the form.
        boolean submitFound = false;
        for (Element input : form.getElementsByTag(ELEM_INPUT)) {
            String inputType = input.attr(ATTR_TYPE);
            submitFound = "submit".equals(inputType) || "image".equals(inputType);
            if (submitFound) {
                break;
            }
        }
        assertTrue(Type.ERROR, "rpd13s4.submit", submitFound);

        // At most one warning per form: report the first select with an onchange attribute.
        for (Element select : form.getElementsByTag("select")) {
            if (select.hasAttr("onchange")) {
                addError(Type.WARNING, -1, -1, "rpd13s4.select");
                break;
            }
        }
    }
}

From source file:ru.redcraft.pinterest4j.core.api.PinAPI.java

/**
 * Extracts the comments of a pin.
 *
 * <p>Requests the pin URL, reads the {@code footer} string of the JSON response, parses it as HTML
 * and builds one comment for every {@code div.comment} element found in it.
 *
 * @param pin the pin whose comments are requested
 * @return the comments extracted from the response
 * @throws PinterestRuntimeException if a {@code JSONException} is raised while reading the response
 */
public List<Comment> getComments(Pin pin) {
    LOG.debug("Getting comments for pin = " + pin);
    List<Comment> comments = new ArrayList<Comment>();
    Document doc = null;
    String axajResponse = null;
    try {
        axajResponse = new APIRequestBuilder(pin.getURL()).setErrorMessage(PIN_API_ERROR).build().getResponse()
                .getEntity(String.class);
        doc = Jsoup.parse(new JSONObject(axajResponse).getString("footer"));
    } catch (JSONException e) {
        throw new PinterestRuntimeException(PIN_API_ERROR + axajResponse, e);
    }
    for (Element comment : doc.select("div.comment")) {
        // parseLong yields the primitive directly; Long.valueOf would box and immediately unbox.
        long id = Long.parseLong(comment.getElementsByClass("DeleteComment").first().attr("data"));
        Element contentMeta = comment.getElementsByClass("CommenterMeta").first();
        // The first link's href, with every slash stripped, is passed to LazyUser as the user key.
        User user = new LazyUser(contentMeta.getElementsByTag("a").first().attr("href").replace("/", ""),
                getApiManager());
        // Links are removed before text() is read, so their text is not part of the comment text.
        contentMeta.getElementsByTag("a").remove();
        String text = contentMeta.text();
        comments.add(new CommentImpl(id, text, user, pin));
    }
    LOG.debug("Comments extracted: " + comments);
    return comments;
}

From source file:Search.DataManipulation.DataParser.java

/**
 * Reads the price from the page's price button.
 *
 * <p>Uses the own text of the last {@code span} inside the first {@code button.price} element. The
 * text {@code install} (compared ignoring case) is reported as {@code "Free"}; otherwise the first
 * token produced by {@code StringUtils.split} is returned.
 *
 * @param dom the parsed page
 * @return {@code "Free"}, the first token of the price text, or an empty string when the text
 *         yields no token
 */
public String getPrice(Document dom) {
    Elements priceButtons = dom.select("button.price");
    Element firstButton = priceButtons.first();
    Elements spans = firstButton.getElementsByTag("span");
    String price = spans.last().ownText();
    if (price.equalsIgnoreCase("install")) {
        return "Free";
    }
    String[] tokens = StringUtils.split(price);
    // An empty or whitespace-only text yields no tokens (assuming Apache Commons Lang semantics);
    // indexing tokens[0] unconditionally would throw ArrayIndexOutOfBoundsException.
    return tokens.length > 0 ? tokens[0] : "";
}

From source file:Search.DataManipulation.DataParser.java

/**
 * Downloads the page's thumbnails and returns them as a JSON array of Base64 strings.
 *
 * <p>Every child of the first element with class {@code thumbnails} contributes the {@code src} of
 * its first {@code img}. Images for which the downloader returns zero bytes are left out.
 *
 * @param dom the parsed page
 * @return the JSON string produced from the list of Base64-encoded images
 * @throws IOException presumably propagated from {@code dataHandler.imageDownloader} - confirm
 */
public String getThumbnails(Document dom) throws IOException {
    List<String> encodedImages = new ArrayList<String>();

    for (Element thumbnail : dom.getElementsByClass("thumbnails").first().children()) {
        String src = thumbnail.getElementsByTag("img").first().attr("src");
        byte[] bytes = dataHandler.imageDownloader(src);

        // Only images that actually carried data are encoded.
        if (bytes.length != 0) {
            encodedImages.add(Base64.getEncoder().encodeToString(bytes));
        }
    }

    return JSONValue.toJSONString(encodedImages);
}

From source file:solarrecorder.SolarRecorder.java

/**
 * Fetches http://envoy/production and records the "Currently" and "Today" rows.
 *
 * <p>Rows are taken from the first {@code tbody} of the element that follows the first {@code h1}.
 * Only rows with exactly two {@code td} cells are considered; the first cell is the name and the
 * last cell is the value.
 *
 * @throws IOException if the page cannot be fetched
 */
private void getProdData() throws IOException {
    org.jsoup.nodes.Document page = Jsoup.connect("http://envoy/production").get();

    Element heading = page.getElementsByTag("h1").first();
    Element table = heading.nextElementSibling();
    for (Element row : table.getElementsByTag("tbody").first().getElementsByTag("tr")) {
        Elements cells = row.getElementsByTag("td");
        if (cells.size() != 2) {
            continue;
        }

        String name = cells.first().text();
        String value = cells.last().text();
        if (name.equals("Currently") || name.equals("Today")) {
            envoyData.add(new EnvoyData(name, value));
        }
    }
}

From source file:solarrecorder.SolarRecorder.java

/**
 * Fetches http://envoy and records the "Number of Microinverters Online" row.
 *
 * <p>For every {@code h2} whose text is "System Statistics", the rows of the first {@code tbody} of
 * the first {@code table} under the heading's parent are scanned; the first cell of a row is the
 * name and the last cell is the value.
 *
 * @throws IOException if the page cannot be fetched
 */
private void getSysData() throws IOException {
    org.jsoup.nodes.Document doc = Jsoup.connect("http://envoy").get();

    for (Element h2 : doc.getElementsByTag("h2")) {
        if (!h2.text().equals("System Statistics")) {
            continue;
        }

        Elements tables = h2.parent().getElementsByTag("table");
        Elements alltr = tables.first().getElementsByTag("tbody").first().getElementsByTag("tr");
        for (Element tr : alltr) {
            Elements alltd = tr.getElementsByTag("td");
            if (alltd.isEmpty()) {
                // A row without td cells has no name/value pair; first() would return null here.
                continue;
            }

            String name = alltd.first().text();
            String value = alltd.last().text();
            if (name.equals("Number of Microinverters Online")) {
                envoyData.add(new EnvoyData(name, value));
            }
        }
    }
}

From source file:us.colloquy.sandbox.FileProcessor.java

/**
 * Sandbox test: finds the .ncx files under ~/Documents/Tolstoy/openDiaries (up to 6 levels deep),
 * prints their directory and title entries, and prints the sorted set of document paths referenced
 * by matching navPoint entries.
 *
 * NOTE(review): the regex literals containing '?' look like mis-encoded (probably non-ASCII)
 * characters; as written they are likely rejected by java.util.regex - confirm against the
 * original source before relying on this test.
 */
@Test
public void listAllUzipedFiles() {
    Path diariesRoot = FileSystems.getDefault()
            .getPath(System.getProperty("user.home") + "/Documents/Tolstoy/openDiaries");

    List<Path> ncxFiles = new ArrayList<>();

    final int searchDepth = 6;

    try (Stream<Path> found = Files.find(diariesRoot, searchDepth,
            (path, attr) -> String.valueOf(path).endsWith(".ncx"))) {
        found.forEach(ncxFiles::add);
    } catch (IOException e) {
        e.printStackTrace();
    }

    System.out.println("files: " + ncxFiles.size());

    // TreeSet keeps the collected paths unique and sorted for the final listing.
    Set<String> uriList = new TreeSet<>();

    try {
        for (Path ncx : ncxFiles) {
            Path parent = ncx.getParent();

            System.out.println("---------------------------------------------");
            System.out.println(parent.toString());

            // Use jsoup to list all files that contain something useful.
            Document doc = Jsoup.parse(ncx.toFile(), "UTF-8");

            for (Element docTitle : doc.getElementsByTag("docTitle")) {
                for (Element titlePart : docTitle.children()) {
                    System.out.println("Title: " + titlePart.text());
                }
            }

            for (Element navPoint : doc.getElementsByTag("navPoint")) {
                for (Element entry : navPoint.children()) {
                    String label = entry.text();
                    if (!StringUtils.isNotEmpty(label)) {
                        continue;
                    }

                    if (label.matches("?")) {
                        System.out.println("------------------");
                    }

                    String url = entry.getElementsByTag("content").attr("src");

                    if (label.matches(".*\\d{1,3}.*[?--?]+.*") && StringUtils.isNotEmpty(url)) {
                        // The fragment part ("#...") of the src is dropped from the stored path.
                        uriList.add(parent.toString() + File.separator + url.replaceAll("#.*", ""));
                    }
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    System.out.println("Size: " + uriList.size());

    uriList.forEach(System.out::println);
}

From source file:us.colloquy.sandbox.TestExtractor.java

/**
 * Sandbox test: parses samples/pisma-1904/OEBPS/Text/single_doc.html with jsoup, builds one
 * Letter per element of class "section" and prints every letter as pretty-printed JSON.
 *
 * <p>Each child of a section is handled according to its class name: "center" is parsed for
 * addressees (and a trailing date/place), "Data" for date and place, "petit" and
 * "Textpetit_otstup" are stored as notes, and everything else is appended to the letter content.
 * An IOException from parsing or JSON writing is printed and otherwise ignored.
 */
@Test
public void useJsoup() {

    String homeDir = System.getProperty("user.home");

    System.out.println(homeDir);

    //JSOUP API allows to extract all  elements of letters in files

    // File input = new File("samples/OEBPS/Text/0001_1006_2001.xhtml");

    File input = new File("samples/pisma-1904/OEBPS/Text/single_doc.html");

    try {
        Document doc = Jsoup.parse(input, "UTF-8");

        List<Letter> letters = new ArrayList<>(); //our model contains only a subset of fields

        // Carried across sections: set from the most recently parsed letter date and handed to
        // RussianDate.parseDateAndPlace for later letters.
        String previousYear = "";

        for (Element element : doc.getElementsByClass("section")) {
            Letter letter = new Letter();

            StringBuilder content = new StringBuilder();

            for (Element child : element.children()) {

                // Debug output: dump every attribute of the child.
                for (Attribute att : child.attributes()) {
                    System.out.println(att.getKey() + " " + att.getValue());
                }

                if ("center".equalsIgnoreCase(child.className())) {
                    // Addressee text: the <strong> text, or the whole child text when that is empty.
                    String toWhom = child.getElementsByTag("strong").text();

                    if (StringUtils.isEmpty(toWhom)) {
                        toWhom = child.text();
                        // System.out.println(toWhom);
                    }

                    // Split on two consecutive whitespace characters or on a comma.
                    String[] toWhomArray = toWhom.split("(\\s\\s)|(,)");

                    for (String to : toWhomArray) {
                        RussianDate.parseToWhom(letter, to); //here we need to recognize a russian name and store that but for now we store the content
                    }

                    //check if there is anything else here and find date and place - it will be replaced if exists below

                    String entireText = child.text();

                    // Whatever remains after removing the addressee text is treated as date/place.
                    String tail = entireText.replace(toWhom, "");

                    if (StringUtils.isNotEmpty(tail)) {
                        RussianDate.parseDateAndPlace(letter, tail, previousYear); //a parser that figures out date and place if they are present
                    }

                    // System.out.println("two whom\t " +  child.getElementsByTag("strong").text() );

                } else if ("Data".equalsIgnoreCase(child.className())) {

                    // NOTE(review): jsoup documents getElementsByTag as returning a collection, so
                    // the null check looks redundant - confirm.
                    if (child.getElementsByTag("em") != null
                            && StringUtils.isNotEmpty(child.getElementsByTag("em").text())) {
                        RussianDate.parseDateAndPlace(letter, child.getElementsByTag("em").text(),
                                previousYear); //most often date and place are enclosed in em tag

                        // Remember the year of this letter (system default time zone) for the next ones.
                        if (letter.getDate() != null) {
                            LocalDate localDate = letter.getDate().toInstant().atZone(ZoneId.systemDefault())
                                    .toLocalDate();
                            int year = localDate.getYear();
                            previousYear = year + "";
                        }
                    }

                    // System.out.println("when and where\t " + child.getElementsByTag("em").text());

                } else if ("petit".equalsIgnoreCase(child.className())
                        || "Textpetit_otstup".equalsIgnoreCase(child.className())) {
                    letter.getNotes().add(child.text());

                } else {
                    //System.out.println(child.text() );

                    // Replace every <sup> element by a text node holding its text in square brackets.
                    // This mutates the DOM before the traversal below.
                    Elements elements = child.getElementsByTag("sup");

                    for (Element e : elements) {
                        String value = e.text();

                        e.replaceWith(new TextNode("[" + value + "]", null));
                    }

                    // NOTE(review): getAllElements() covers child itself and all of its descendants,
                    // so the text of nested elements appears to be appended more than once - confirm
                    // this is intended.
                    for (Element el : child.getAllElements()) {
                        // System.out.println(el.tagName());
                        if ("sup".equalsIgnoreCase(el.tagName())) {
                            content.append(" [" + el.text() + "] ");
                        } else {
                            content.append(el.text());
                        }

                    }

                    content.append("\n");

                }

                //                  System.out.println(child.tag() + "\n" );
                //                  System.out.println(child.outerHtml() + "\n" + child.text());
            }

            letter.setContent(content.toString());
            letters.add(letter);
        }

        ObjectWriter ow = new com.fasterxml.jackson.databind.ObjectMapper().writer().withDefaultPrettyPrinter();

        // Print every collected letter as JSON.
        for (Letter letter : letters) {
            //                if (letter.getDate() == null)
            //                {

            //                        if (StringUtils.isNotEmpty(person.getLastName()))
            //                        {
            String json = ow.writeValueAsString(letter);

            System.out.println(json);
            //                        }

            //}

        }

    } catch (IOException e) {
        e.printStackTrace();
    }

}

From source file:us.colloquy.util.DiaryParser.java

/**
 * Replaces every {@code sup} element found by {@code child.getElementsByTag("sup")} with a text
 * node that holds the element's text in square brackets.
 *
 * @param child the element whose {@code sup} elements are rewritten in place
 */
private static void replaceSupTag(Element child) {
    for (Element sup : child.getElementsByTag("sup")) {
        sup.replaceWith(new TextNode("[" + sup.text() + "]", null));
    }
}

From source file:us.colloquy.util.EpubExtractor.java

/**
 * Collects pointers to the documents referenced by the .ncx files below a directory.
 *
 * <p>Every .ncx file found up to 6 levels below {@code letterDirectory} is parsed with jsoup. The
 * text of the last child of its {@code docTitle} elements becomes the title, and each
 * {@code navPoint} child whose label matches one of the patterns below and whose {@code content}
 * has a non-empty {@code src} is added to {@code uriList} (with the "#..." fragment removed).
 * Exceptions are printed and otherwise ignored.
 *
 * NOTE(review): the string literals containing '?' look like mis-encoded (probably non-ASCII)
 * characters; some of them are likely rejected by java.util.regex as written - confirm against
 * the original source.
 *
 * @param uriList         set that receives one DocumentPointer per accepted entry
 * @param letterDirectory directory that is searched for .ncx files
 * @param useOnlyNumber   when true, labels that merely contain a 1-3 digit number are accepted too
 */
public static void getURIForAllLetters(Set<DocumentPointer> uriList, String letterDirectory,
        boolean useOnlyNumber) {

    Path lettersRoot = FileSystems.getDefault().getPath(letterDirectory);

    List<Path> ncxFiles = new ArrayList<>();

    final int searchDepth = 6;

    try (Stream<Path> found = Files.find(lettersRoot, searchDepth,
            (path, attr) -> String.valueOf(path).endsWith(".ncx"))) {
        found.forEach(ncxFiles::add);
    } catch (IOException e) {
        e.printStackTrace();
    }

    System.out.println("files: " + ncxFiles.size());

    try {
        for (Path ncx : ncxFiles) {
            Path parent = ncx.getParent();

            // Use jsoup to list all files that contain something useful.
            Document doc = Jsoup.parse(ncx.toFile(), "UTF-8");

            // The last child of the docTitle elements wins.
            String title = "";
            for (Element docTitle : doc.getElementsByTag("docTitle")) {
                for (Element titlePart : docTitle.children()) {
                    title = titlePart.text();
                }
            }

            for (Element avantitul : doc.getElementsByTag("avantitul")) {
                for (Element part : avantitul.children()) {
                    String label = part.text();

                    if (StringUtils.isNotEmpty(label) && label.matches(
                            "  ? ? .*")) {
                        System.out.println("------------------   " + label);
                    }
                }
            }

            for (Element navPoint : doc.getElementsByTag("navPoint")) {
                for (Element entry : navPoint.children()) {
                    String label = entry.text();
                    if (!StringUtils.isNotEmpty(label)) {
                        continue;
                    }

                    if (label.matches("?")) {
                        System.out.println("------------------ " + "?" + " -------------------");
                    } else if (label.contains(" ?")) {
                        // Stop looking at the remaining children of this navPoint.
                        break;
                    }

                    String url = entry.getElementsByTag("content").attr("src");
                    boolean hasUrl = StringUtils.isNotEmpty(url);

                    // Accepted: a number followed by letters, or - only when useOnlyNumber is
                    // set - a number alone. The first pattern is always evaluated first.
                    boolean numberWithText = label.matches(".*\\d{1,3}.*[?--?A-Za-z]+.*") && hasUrl;
                    if (numberWithText || (label.matches(".*\\d{1,3}.*") && hasUrl && useOnlyNumber)) {
                        String location = parent.toString() + File.separator + url.replaceAll("#.*", "");
                        uriList.add(new DocumentPointer(location, title));
                    }
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}