Example usage for org.jsoup.nodes Element tagName

List of usage examples for org.jsoup.nodes Element tagName

Introduction

On this page you can find example usages of org.jsoup.nodes.Element.tagName().

Prototype

public String tagName() 

Source Link

Document

Get the name of the tag for this element.

Usage

From source file:edu.ucla.cs.scai.swim.qa.ontology.dbpedia.tipicality.DbpediaCsvDownload.java

/**
 * Walks the direct children of {@code e} and downloads every file linked by an
 * {@code <a>} whose text is "csv" (case-insensitive) found directly inside a
 * {@code <small>} child. {@code <li>} children of {@code <ul>} children are
 * processed recursively.
 *
 * <p>Each file is written to {@code DBpediaOntology.DBPEDIA_CSV_FOLDER} under the
 * last path segment of its URL. A failure on one link is printed and does not
 * stop the remaining downloads.
 *
 * @param e the element whose children are scanned
 * @throws MalformedURLException declared for callers; download failures are caught here
 * @throws IOException declared for callers; download failures are caught here
 */
private static void download(Element e) throws MalformedURLException, IOException {
    for (Element c : e.children()) {
        String tagName = c.tagName();
        if (tagName.equals("small")) {
            for (Element c1 : c.children()) {
                if (c1.tagName().equals("a") && c1.text().equalsIgnoreCase("csv")) {
                    String href = c1.attr("href");
                    System.out.println("Downloading " + href);
                    try {
                        URL remoteFile = new URL(href);
                        String[] s = href.split("\\/");
                        String target = DBpediaOntology.DBPEDIA_CSV_FOLDER + s[s.length - 1];
                        // Close both the network channel and the output file even when the
                        // transfer fails; previously neither was ever closed.
                        try (ReadableByteChannel rbc = Channels.newChannel(remoteFile.openStream());
                                FileOutputStream fos = new FileOutputStream(target)) {
                            fos.getChannel().transferFrom(rbc, 0, Long.MAX_VALUE);
                        }
                    } catch (Exception ex) {
                        // Best effort: report the failure and continue with the next link.
                        ex.printStackTrace();
                    }
                }
            }
        } else if (tagName.equals("ul")) {
            for (Element c1 : c.children()) {
                if (c1.tagName().equals("li")) {
                    download(c1);
                }
            }
        }
    }
}

From source file:io.github.carlomicieli.footballdb.starter.parsers.DraftParser.java

/**
 * Builds a {@link DraftedPlayer} from the cells of a single table row.
 *
 * <p>Cells 0 and 1 are parsed as round and pick number (falling back to -1 via
 * {@code TryConvert.toIntegerOrGet}); cells 2, 3, 4 and 27 supply team, name,
 * position and college respectively.
 *
 * @param e the element to convert; its tag name must be {@code tr}
 * @return the drafted player described by the row
 * @throws IllegalArgumentException if {@code e} is not a {@code tr} element
 */
private DraftedPlayer mapToDraftedPlayer(Element e) {
    final boolean isTableRow = e.tagName().equals("tr");
    if (!isTableRow) {
        throw new IllegalArgumentException("Invalid tag");
    }

    final int roundNumber = TryConvert.toIntegerOrGet(e.child(0).text(), -1);
    final int pickNumber = TryConvert.toIntegerOrGet(e.child(1).text(), -1);
    final String teamName = e.child(2).text();
    final String playerName = e.child(3).text();
    final String position = e.child(4).text();
    final String collegeName = e.child(27).text();

    return Draft.newPick()
            .college(collegeName)
            .name(playerName)
            .round(roundNumber)
            .number(pickNumber)
            .position(position)
            .team(teamName)
            .build();
}

From source file:com.webcrawler.manager.impl.ImageManagerImpl.java

/**
 * Fetches the page at {@code url} on the executor service and returns one
 * {@link ImageDTO} per {@code <img>} element that carries a {@code src} attribute.
 *
 * <p>If the page cannot be fetched or queried, the stack trace is printed and an
 * empty list is returned.
 *
 * @param url address of the page to scan; must be neither {@code null} nor empty
 * @return the images found, possibly empty
 * @throws IllegalArgumentException if {@code url} is {@code null} or empty
 * @throws InterruptedException if the calling thread is interrupted while waiting for the task
 * @throws ExecutionException if the submitted task fails
 */
@Override
public List<ImageDTO> getImageData(final String url)
        throws IOException, IllegalArgumentException, InterruptedException, ExecutionException {

    final boolean urlMissing = (url == null) || url.isEmpty();
    if (urlMissing) {
        throw new IllegalArgumentException("Set URL first");
    }

    final Callable<List<ImageDTO>> fetchTask = new Callable<List<ImageDTO>>() {

        @Override
        public List<ImageDTO> call() throws Exception {
            System.out.println("Retrieving image data from url " + url);

            final List<ImageDTO> found = new ArrayList<ImageDTO>();
            Elements withSrc;
            try {
                withSrc = Jsoup.connect(url).get().select("[src]");
            } catch (Exception e) {
                e.printStackTrace();
                return found;
            }

            // Counts every element with a src attribute, not only <img>.
            System.out.println("# of images: " + withSrc.size());

            for (Element candidate : withSrc) {
                if (!candidate.tagName().equals("img")) {
                    continue;
                }
                final String absoluteSrc = candidate.attr("abs:src");
                final ImageDTO dto = new ImageDTO();
                dto.setUrlAddress(absoluteSrc);
                dto.setFileName(getFileName(absoluteSrc));
                found.add(dto);
            }

            return found;
        }
    };

    return executorService.submit(fetchTask).get();
}

From source file:net.pixomania.crawler.W3C.parser.rules.principalAuthors.PrincipalAuthorsRule1.java

/**
 * Extracts people from the {@code dd} siblings that follow a {@code dt}
 * containing "Principal Author".
 *
 * <p>Each accepted {@code dd} is split on commas; every non-empty fragment is
 * parsed with {@code NameParser}, and the {@code href} of each link in the
 * fragment is stored as an e-mail (when it contains "@") or as a website.
 *
 * @param url address of the specification, used only in log messages
 * @param doc parsed specification document
 * @return the people found, or {@code null} if no {@code dd} matched or nobody could be parsed
 */
@Override
public ArrayList<Person> run(String url, Document doc) {
    ArrayList<Person> editorList = new ArrayList<>();

    Elements editors = doc.select("dt:contains(Principal Author) ~ dd");
    if (editors.isEmpty()) {
        return null;
    }

    // Once set, skip stays set for the following dd elements until one is seen
    // whose next sibling starts with "principal author".
    boolean skip = false;
    for (Element editor : editors) {
        Element prev = editor.previousElementSibling();
        if (prev.tagName().equals("dt")
                && !prev.text().trim().toLowerCase().startsWith("principal author")) {
            skip = true;
        }

        if (skip) {
            Element next = editor.nextElementSibling();
            if (next != null && next.text().trim().toLowerCase().startsWith("principal author")) {
                skip = false;
            }
            // The current dd is skipped either way.
            continue;
        }

        for (String split : editor.html().split(",")) {
            if (split.isEmpty()) {
                continue;
            }

            // Lower-case once instead of once per comparison.
            String lower = split.toLowerCase();
            if (lower.startsWith("(in alphabetic")
                    || lower.startsWith("see acknowl")
                    || lower.startsWith("the w3")
                    || lower.startsWith("(see ac")
                    || lower.startsWith("see participants")
                    || lower.contains("note:")) {
                Log.log("warning", "Spec " + url + " may refer to a different section!");
                continue;
            }
            if (split.equals("WHATWG:") || split.equals("W3C:")) {
                continue;
            }

            Document newdoc = Jsoup.parse(split.replaceAll("\n", ""));
            Person result = NameParser.parse(newdoc.text());
            if (result == null) {
                continue;
            }

            // Query the links once; the original re-ran the selector on every access.
            for (Element link : newdoc.select("a")) {
                String href = link.attr("href");
                if (href.isEmpty()) {
                    continue;
                }
                if (href.contains("@")) {
                    result.setEmail(href.replace("mailto:", ""));
                } else {
                    result.addWebsite(href);
                }
            }

            editorList.add(result);
        }
    }

    if (editorList.isEmpty()) {
        return null;
    }

    return editorList;
}

From source file:com.jimplush.goose.outputformatters.DefaultOutputFormatter.java

/**
 * Depricated use {@link #getFormattedText(Element)}
 * takes an element and turns the P tags into \n\n
 * // todo move this to an output formatter object instead of inline here
 *
 * @return/*w w  w. jav a2s.c  o m*/
 */
@Deprecated
public String getFormattedText() {

    StringBuilder sb = new StringBuilder();

    Elements nodes = topNode.getAllElements();
    for (Element e : nodes) {
        if (e.tagName().equals("p")) {
            String text = StringEscapeUtils.unescapeHtml(e.text()).trim();
            sb.append(text);
            sb.append("\n\n");
        }
    }

    return sb.toString();
}

From source file:net.pixomania.crawler.W3C.parser.rules.editors.EditorsRule7.java

/**
 * Extracts people from the {@code dd} siblings that follow a {@code dt}
 * containing "Authors/Editors" or "Author/Editor".
 *
 * <p>A {@code dd} whose text contains more than two " - " separators causes the
 * whole document to be handed to {@code EditorsRule5} instead. Otherwise the
 * {@code dd} is split on {@code <br />}; each part is parsed with
 * {@code NameParser}, and link targets are stored as e-mail (when the
 * {@code href} contains "@") or website.
 *
 * @param url address of the specification, used only in log messages
 * @param doc parsed specification document
 * @return the people found, or {@code null} if no {@code dd} matched or nobody could be parsed
 */
@Override
public ArrayList<Person> run(String url, Document doc) {
    ArrayList<Person> editorList = new ArrayList<>();

    Elements editors = doc.select("dt:contains(Authors/Editors) ~ dd, dt:contains(Author/Editor) ~ dd");
    if (editors.isEmpty()) {
        return null;
    }

    // Once set, skip stays set for the following dd elements until one is seen
    // whose next sibling starts with one of the accepted headings.
    boolean skip = false;
    for (Element editor : editors) {
        Element prev = editor.previousElementSibling();
        if (prev.tagName().equals("dt")) {
            String heading = prev.text().trim().toLowerCase();
            if (!heading.startsWith("authors/editors") && !heading.startsWith("author/editor")) {
                skip = true;
            }
        }

        if (skip) {
            Element next = editor.nextElementSibling();
            if (next != null) {
                String nextText = next.text().trim().toLowerCase();
                if (nextText.startsWith("authors/editors") || nextText.startsWith("author/editor")) {
                    skip = false;
                }
            }
            // The current dd is skipped either way.
            continue;
        }

        // Read the text once instead of once per comparison.
        String editorText = editor.text();
        if (StringUtils.countMatches(editorText, " - ") > 2) {
            Log.log("warning", url + ": This editor may be a list of editors separated by  - ");
            EditorsRule5 ed5 = new EditorsRule5();

            return ed5.run(url, doc);
        }

        String[] splitted = editor.html().split("<br />|<br clear=\"none\" />");

        if (splitted.length < 2) {
            // Single entry: parse the dd as a whole.
            if (editorText.equals("WHATWG:") || editorText.equals("W3C:")) {
                continue;
            }
            Person result = NameParser.parse(editorText);
            if (result == null) {
                continue;
            }

            // Query the links once; the original re-ran the selector on every access.
            for (Element link : editor.select("a")) {
                String href = link.attr("href");
                if (href.isEmpty()) {
                    continue;
                }
                if (href.contains("@")) {
                    result.setEmail(href.replace("mailto:", ""));
                } else {
                    result.addWebsite(href);
                }
            }

            editorList.add(result);
        } else {
            // Several entries separated by line breaks: parse each fragment on its own.
            for (String split : splitted) {
                if (split.isEmpty()) {
                    continue;
                }
                if (split.equals("WHATWG:") || split.equals("W3C:")) {
                    continue;
                }
                Document newdoc = Jsoup.parse(split.replaceAll("\n", ""));
                Person result = NameParser.parse(newdoc.text());
                if (result == null) {
                    continue;
                }

                for (Element link : newdoc.select("a")) {
                    String href = link.attr("href");
                    if (href.isEmpty()) {
                        continue;
                    }
                    if (href.contains("@")) {
                        result.setEmail(href.replace("mailto:", ""));
                    } else {
                        result.addWebsite(href);
                    }
                }

                editorList.add(result);
            }
        }

        // Stop as soon as the next definition term begins.
        Element next = editor.nextElementSibling();
        if (next != null && next.tagName().equals("dt")) {
            break;
        }
    }

    if (editorList.isEmpty()) {
        return null;
    }

    return editorList;
}

From source file:web.analyzer.utils.Utils.java

/**
 * Collects every element of {@code doc} whose tag name is contained in
 * {@code HEADING_TAG}.
 *
 * <p>The {@code level} stored with each heading is a running counter: it is
 * incremented for every element visited (in the order returned by
 * {@code doc.select("*")}) and reset to zero after each element without children.
 *
 * <p>NOTE(review): this counter is not the DOM depth of the heading. The original
 * code reassigned the iterated collection to {@code ele.children()} inside the
 * loop, which has no effect on a running for-each; that dead store was removed.
 * Presumably a real descent was intended; confirm against callers before
 * changing the values produced here.
 *
 * @param doc the parsed document to scan
 * @return the headings found, in visiting order; never {@code null}
 */
public List<Heading> docHeadingsProcess(Document doc) {
    List<Heading> headingList = new ArrayList<Heading>();
    int level = 0;
    for (Element ele : doc.select("*")) {
        level++;
        if (HEADING_TAG.contains(ele.tagName())) {
            headingList.add(new Heading(ele.tagName(), ele.html(), level));
        }

        // A childless element restarts the counter.
        if (ele.children().size() == 0) {
            level = 0;
        }
    }

    return headingList;
}

From source file:com.obnsoft.ptcm3.MyApplication.java

/**
 * Parses the stored command HTML file and fills {@code mCommands} and
 * {@code mCategories}.
 *
 * <p>Within the content-area element, every {@code TAG_H3} child starts a new
 * category and every {@code TAG_TABLE} child with an empty class name becomes a
 * {@code Command} of the current category. Tables seen before the first heading
 * get category id -1.
 *
 * <p>On an {@link IOException} both fields are set to {@code null}.
 */
private void parseCommandHtml() {
    mCommands = new ArrayList<Command>();
    mCategories = new ArrayList<String>();
    int categoryId = -1;
    try {
        Document document;
        InputStream in = openFileInput(FNAME_CMD_HTML);
        try {
            document = Jsoup.parse(in, "UTF-8", URL_CMD_HTML);
        } finally {
            // Close the stream even when parsing fails; it used to leak in that case.
            in.close();
        }
        Element divContentArea = document.getElementById(ID_CONTENTAREA);
        for (Element e : divContentArea.children()) {
            if (e.tagName().equals(TAG_TABLE)) {
                if (e.className().equals("")) {
                    mCommands.add(new Command(e, categoryId));
                }
            } else if (e.tagName().equals(TAG_H3)) {
                mCategories.add(e.text());
                categoryId++;
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
        mCommands = null;
        mCategories = null;
    }
}

From source file:net.pixomania.crawler.W3C.parser.rules.editors.EditorsRule2.java

/**
 * Extracts people from the {@code dd} siblings that follow a {@code dt}
 * containing "Editor" or "Edition Editor".
 *
 * <p>A {@code dd} is skipped when its preceding {@code dt} does not start with
 * "editor" / "edition editor", contains "version", or ends with "draft:". A
 * {@code dd} whose text contains more than two " - " separators causes the whole
 * document to be handed to {@code EditorsRule5} instead. Otherwise the
 * {@code dd} is split on {@code <br />}; each part is parsed with
 * {@code NameParser}, and link targets are stored as e-mail (when the
 * {@code href} contains "@") or website.
 *
 * @param url address of the specification, used in log messages and passed to {@code EditorsRule5}
 * @param doc parsed specification document
 * @return the people found, or {@code null} if no {@code dd} matched or nobody could be parsed
 */
@Override
public ArrayList<Person> run(String url, Document doc) {
    ArrayList<Person> editorList = new ArrayList<>();

    Elements editors = doc.select("dt:contains(Editor) ~ dd, dt:contains(Edition Editor) ~ dd");
    if (editors.isEmpty()) {
        return null;
    }

    // Once set, skip stays set for the following dd elements until one is seen
    // whose next sibling looks like an editor heading.
    boolean skip = false;
    for (Element editor : editors) {
        Element prev = editor.previousElementSibling();
        if (prev.tagName().equals("dt")) {
            String heading = prev.text().trim().toLowerCase();
            if ((!heading.startsWith("editor") && !heading.startsWith("edition editor"))
                    || heading.contains("version")
                    || heading.endsWith("draft:")) {
                skip = true;
            }
        }

        if (skip) {
            Element next = editor.nextElementSibling();
            if (next != null) {
                String nextText = next.text().trim().toLowerCase();
                if (nextText.startsWith("editor") || nextText.contains("edition editor")) {
                    skip = false;
                }
            }
            // The current dd is skipped either way.
            continue;
        }

        // Read the text once instead of once per comparison.
        String editorText = editor.text();
        if (StringUtils.countMatches(editorText, " - ") > 2) {
            Log.log("warning", "This editor may be a list of editors separated by  - ");
            EditorsRule5 ed5 = new EditorsRule5();

            return ed5.run(url, doc);
        }

        String[] splitted = editor.html().split("<br />|<br clear=\"none\" />");

        if (splitted.length < 2) {
            // Single entry: parse the dd as a whole.
            String lowerText = editorText.toLowerCase();
            if (lowerText.startsWith("(in alphabetic")
                    || lowerText.startsWith("see acknowl")
                    || lowerText.startsWith("the w3")
                    || lowerText.startsWith("(see ac")
                    || lowerText.startsWith("see participants")
                    || lowerText.contains("note:")) {
                Log.log("warning", "Spec " + url + " may refer to a different section!");
                continue;
            }
            if (editorText.equals("WHATWG:") || editorText.equals("W3C:")) {
                continue;
            }
            Person result = NameParser.parse(editorText);
            if (result == null) {
                continue;
            }

            // Query the links once; the original re-ran the selector on every access.
            for (Element link : editor.select("a")) {
                String href = link.attr("href");
                if (href.isEmpty()) {
                    continue;
                }
                if (href.contains("@")) {
                    result.setEmail(href.replace("mailto:", ""));
                } else {
                    result.addWebsite(href);
                }
            }

            editorList.add(result);
        } else {
            // Several entries separated by line breaks: parse each fragment on its own.
            for (String split : splitted) {
                if (split.isEmpty()) {
                    continue;
                }
                String lowerSplit = split.toLowerCase();
                if (lowerSplit.startsWith("(in alphabetic")
                        || lowerSplit.startsWith("see acknowl")
                        || lowerSplit.startsWith("the w3")
                        || lowerSplit.startsWith("(see ac")
                        || lowerSplit.startsWith("see participants")
                        || lowerSplit.contains("note:")) {
                    Log.log("warning", "Spec " + url + " may refer to a different section!");
                    continue;
                }
                if (split.equals("WHATWG:") || split.equals("W3C:")) {
                    continue;
                }
                Document newdoc = Jsoup.parse(split.replaceAll("\n", ""));
                Person result = NameParser.parse(newdoc.text());
                if (result == null) {
                    continue;
                }

                for (Element link : newdoc.select("a")) {
                    String href = link.attr("href");
                    if (href.isEmpty()) {
                        continue;
                    }
                    if (href.contains("@")) {
                        result.setEmail(href.replace("mailto:", ""));
                    } else {
                        result.addWebsite(href);
                    }
                }

                editorList.add(result);
            }
        }

        // Stop as soon as the next definition term begins.
        Element next = editor.nextElementSibling();
        if (next != null && next.tagName().equals("dt")) {
            break;
        }
    }

    if (editorList.isEmpty()) {
        return null;
    }

    return editorList;
}

From source file:net.pixomania.crawler.W3C.parser.rules.editors.EditorsRule8.java

/**
 * Extracts people from the {@code blockquote} siblings that follow an
 * {@code h4} containing "Editor".
 *
 * <p>A {@code blockquote} is skipped when its preceding {@code h4} does not
 * start with "editor" / "edition editor", or ends with "version:" or "draft:".
 * A {@code blockquote} whose text contains more than two " - " separators causes
 * the whole document to be handed to {@code EditorsRule5} instead. Otherwise the
 * {@code blockquote} is split on {@code <br />}; each part is parsed with
 * {@code NameParser}, and link targets are stored as e-mail (when the
 * {@code href} contains "@") or website.
 *
 * @param url address of the specification, used in log messages and passed to {@code EditorsRule5}
 * @param doc parsed specification document
 * @return the people found, or {@code null} if no {@code blockquote} matched or nobody could be parsed
 */
@Override
public ArrayList<Person> run(String url, Document doc) {
    ArrayList<Person> editorList = new ArrayList<>();

    Elements editors = doc.select("h4:contains(Editor) ~ blockquote");
    if (editors.isEmpty()) {
        return null;
    }

    // Once set, skip stays set for the following blockquotes until one is seen
    // whose next sibling looks like an editor heading.
    boolean skip = false;
    for (Element editor : editors) {
        Element prev = editor.previousElementSibling();
        if (prev.tagName().equals("h4")) {
            String heading = prev.text().trim().toLowerCase();
            if ((!heading.startsWith("editor") && !heading.startsWith("edition editor"))
                    || heading.endsWith("version:")
                    || heading.endsWith("draft:")) {
                skip = true;
            }
        }

        if (skip) {
            Element next = editor.nextElementSibling();
            if (next != null) {
                String nextText = next.text().trim().toLowerCase();
                if (nextText.startsWith("editor") || nextText.contains("edition editor")) {
                    skip = false;
                }
            }
            // The current blockquote is skipped either way.
            continue;
        }

        // Read the text once instead of once per comparison.
        String editorText = editor.text();
        if (StringUtils.countMatches(editorText, " - ") > 2) {
            Log.log("warning", "This editor may be a list of editors separated by  - ");
            EditorsRule5 ed5 = new EditorsRule5();

            return ed5.run(url, doc);
        }

        String[] splitted = editor.html().split("<br />|<br clear=\"none\" />");

        if (splitted.length < 2) {
            // Single entry: parse the blockquote as a whole.
            String lowerText = editorText.toLowerCase();
            if (lowerText.startsWith("(in alphabetic")
                    || lowerText.startsWith("see acknowl")
                    || lowerText.startsWith("the w3")
                    || lowerText.startsWith("(see ac")
                    || lowerText.startsWith("see participants")
                    || lowerText.contains("note:")) {
                Log.log("warning", "Spec " + url + " may refer to a different section!");
                continue;
            }
            if (editorText.equals("WHATWG:") || editorText.equals("W3C:")) {
                continue;
            }
            Person result = NameParser.parse(editorText);
            if (result == null) {
                continue;
            }

            // Query the links once; the original re-ran the selector on every access.
            for (Element link : editor.select("a")) {
                String href = link.attr("href");
                if (href.isEmpty()) {
                    continue;
                }
                if (href.contains("@")) {
                    result.setEmail(href.replace("mailto:", ""));
                } else {
                    result.addWebsite(href);
                }
            }

            editorList.add(result);
        } else {
            // Several entries separated by line breaks: parse each fragment on its own.
            for (String split : splitted) {
                if (split.isEmpty()) {
                    continue;
                }
                String lowerSplit = split.toLowerCase();
                if (lowerSplit.startsWith("(in alphabetic")
                        || lowerSplit.startsWith("see acknowl")
                        || lowerSplit.startsWith("the w3")
                        || lowerSplit.startsWith("(see ac")
                        || lowerSplit.startsWith("see participants")
                        || lowerSplit.contains("note:")) {
                    Log.log("warning", "Spec " + url + " may refer to a different section!");
                    continue;
                }
                if (split.equals("WHATWG:") || split.equals("W3C:")) {
                    continue;
                }
                Document newdoc = Jsoup.parse(split.replaceAll("\n", ""));
                Person result = NameParser.parse(newdoc.text());
                if (result == null) {
                    continue;
                }

                for (Element link : newdoc.select("a")) {
                    String href = link.attr("href");
                    if (href.isEmpty()) {
                        continue;
                    }
                    if (href.contains("@")) {
                        result.setEmail(href.replace("mailto:", ""));
                    } else {
                        result.addWebsite(href);
                    }
                }

                editorList.add(result);
            }
        }

        // Stop as soon as the next h4 heading begins.
        Element next = editor.nextElementSibling();
        if (next != null && next.tagName().equals("h4")) {
            break;
        }
    }

    if (editorList.isEmpty()) {
        return null;
    }

    return editorList;
}