List of usage examples for org.jsoup.nodes Element tagName
public String tagName()
From source file:edu.ucla.cs.scai.swim.qa.ontology.dbpedia.tipicality.DbpediaCsvDownload.java
private static void download(Element e) throws MalformedURLException, IOException { for (Element c : e.children()) { String tagName = c.tag().getName(); if (tagName.equals("small")) { for (Element c1 : c.children()) { if (c1.tag().getName().equals("a") && c1.text().equalsIgnoreCase("csv")) { String href = c1.attr("href"); System.out.println("Downloading " + href); try { URL remoteFile = new URL(href); ReadableByteChannel rbc = Channels.newChannel(remoteFile.openStream()); String[] s = href.split("\\/"); FileOutputStream fos = new FileOutputStream( DBpediaOntology.DBPEDIA_CSV_FOLDER + s[s.length - 1]); fos.getChannel().transferFrom(rbc, 0, Long.MAX_VALUE); } catch (Exception ex) { ex.printStackTrace(); }// w w w . j a v a 2s . co m } } } else if (tagName.equals("ul")) { for (Element c1 : c.children()) { if (c1.tagName().equals("li")) { download(c1); } } } } }
From source file:io.github.carlomicieli.footballdb.starter.parsers.DraftParser.java
private DraftedPlayer mapToDraftedPlayer(Element e) { if (!e.tagName().equals("tr")) { throw new IllegalArgumentException("Invalid tag"); }//from ww w . j av a 2s .co m int round = TryConvert.toIntegerOrGet(e.child(0).text(), -1); int pick = TryConvert.toIntegerOrGet(e.child(1).text(), -1); String team = e.child(2).text(); String name = e.child(3).text(); String pos = e.child(4).text(); String college = e.child(27).text(); return Draft.newPick().college(college).name(name).round(round).number(pick).position(pos).team(team) .build(); }
From source file:com.webcrawler.manager.impl.ImageManagerImpl.java
@Override public List<ImageDTO> getImageData(final String url) throws IOException, IllegalArgumentException, InterruptedException, ExecutionException { if (url == null || url.equals("")) { throw new IllegalArgumentException("Set URL first"); }//from www. ja va 2s .c om Callable<List<ImageDTO>> callable = new Callable<List<ImageDTO>>() { @Override public List<ImageDTO> call() throws Exception { System.out.println("Retrieving image data from url " + url); Document document = null; Elements media = null; List<ImageDTO> images = new ArrayList<ImageDTO>(); try { document = Jsoup.connect(url).get(); media = document.select("[src]"); } catch (Exception e) { e.printStackTrace(); return images; } System.out.println("# of images: " + media.size()); for (Element src : media) { if (src.tagName().equals("img")) { ImageDTO dto = new ImageDTO(); dto.setUrlAddress(src.attr("abs:src")); dto.setFileName(getFileName(src.attr("abs:src"))); images.add(dto); } } return images; } }; Future<List<ImageDTO>> result = executorService.submit(callable); return result.get(); }
From source file:net.pixomania.crawler.W3C.parser.rules.principalAuthors.PrincipalAuthorsRule1.java
@Override public ArrayList<Person> run(String url, Document doc) { ArrayList<Person> editorList = new ArrayList<>(); Elements editors = doc.select("dt:contains(Principal Author) ~ dd"); if (editors.size() == 0) return null; boolean skip = false; for (Element editor : editors) { Element prev = editor.previousElementSibling(); if (prev.tagName().equals("dt")) { if (!prev.text().trim().toLowerCase().startsWith("principal author")) { skip = true;//from w ww .j av a 2 s.com } } if (skip) { Element next = editor.nextElementSibling(); if (next != null) { if (next.text().trim().toLowerCase().startsWith("principal author")) { skip = false; continue; } } continue; } String[] splitted = editor.html().split(","); for (String split : splitted) { if (!split.isEmpty()) { if (split.toLowerCase().startsWith("(in alphabetic") || split.toLowerCase().startsWith("see acknowl") || split.toLowerCase().startsWith("the w3") || split.toLowerCase().startsWith("(see ac") || split.toLowerCase().startsWith("see participants") || split.toLowerCase().contains("note:")) { Log.log("warning", "Spec " + url + " may refer to a different section!"); continue; } if (split.equals("WHATWG:") || split.equals("W3C:")) continue; Document newdoc = Jsoup.parse(split.replaceAll("\n", "")); Person result = NameParser.parse(newdoc.text()); if (result == null) continue; for (int i = 0; i < newdoc.select("a").size(); i++) { if (!newdoc.select("a").get(i).attr("href").isEmpty()) { if (newdoc.select("a").get(i).attr("href").contains("@")) { result.setEmail(newdoc.select("a").get(i).attr("href").replace("mailto:", "")); } else { result.addWebsite(newdoc.select("a").get(i).attr("href")); } } } editorList.add(result); } } } if (editorList.size() == 0) return null; return editorList; }
From source file:com.jimplush.goose.outputformatters.DefaultOutputFormatter.java
/** * Depricated use {@link #getFormattedText(Element)} * takes an element and turns the P tags into \n\n * // todo move this to an output formatter object instead of inline here * * @return/*w w w. jav a2s.c o m*/ */ @Deprecated public String getFormattedText() { StringBuilder sb = new StringBuilder(); Elements nodes = topNode.getAllElements(); for (Element e : nodes) { if (e.tagName().equals("p")) { String text = StringEscapeUtils.unescapeHtml(e.text()).trim(); sb.append(text); sb.append("\n\n"); } } return sb.toString(); }
From source file:net.pixomania.crawler.W3C.parser.rules.editors.EditorsRule7.java
@Override public ArrayList<Person> run(String url, Document doc) { ArrayList<Person> editorList = new ArrayList<>(); Elements editors = doc.select("dt:contains(Authors/Editors) ~ dd, dt:contains(Author/Editor) ~ dd"); if (editors.size() == 0) return null; boolean skip = false; for (Element editor : editors) { Element prev = editor.previousElementSibling(); if (prev.tagName().equals("dt")) { if (!prev.text().trim().toLowerCase().startsWith("authors/editors") && !prev.text().trim().toLowerCase().startsWith("author/editor")) { skip = true;// w w w .j a v a 2s. c om } } if (skip) { Element next = editor.nextElementSibling(); if (next != null) { if (next.text().trim().toLowerCase().startsWith("authors/editors") || next.text().trim().toLowerCase().startsWith("author/editor")) { skip = false; continue; } } continue; } if (StringUtils.countMatches(editor.text(), " - ") > 2) { Log.log("warning", url + ": This editor may be a list of editors separated by - "); EditorsRule5 ed5 = new EditorsRule5(); return ed5.run(url, doc); } String[] splitted = editor.html().split("<br />|<br clear=\"none\" />"); if (splitted.length < 2) { if (editor.text().equals("WHATWG:") || editor.text().equals("W3C:")) continue; Person result = NameParser.parse(editor.text()); if (result == null) continue; for (int i = 0; i < editor.select("a").size(); i++) { if (!editor.select("a").get(i).attr("href").isEmpty()) { if (editor.select("a").get(i).attr("href").contains("@")) { result.setEmail(editor.select("a").get(i).attr("href").replace("mailto:", "")); } else { result.addWebsite(editor.select("a").get(i).attr("href")); } } } editorList.add(result); } else { for (String split : splitted) { if (!split.isEmpty()) { if (split.equals("WHATWG:") || split.equals("W3C:")) continue; Document newdoc = Jsoup.parse(split.replaceAll("\n", "")); Person result = NameParser.parse(newdoc.text()); if (result == null) continue; for (int i = 0; i < newdoc.select("a").size(); i++) { if 
(!newdoc.select("a").get(i).attr("href").isEmpty()) { if (newdoc.select("a").get(i).attr("href").contains("@")) { result.setEmail(newdoc.select("a").get(i).attr("href").replace("mailto:", "")); } else { result.addWebsite(newdoc.select("a").get(i).attr("href")); } } } editorList.add(result); } } } Element next = editor.nextElementSibling(); if (next != null) if (next.tag().getName().equals("dt")) break; } if (editorList.size() == 0) return null; return editorList; }
From source file:web.analyzer.utils.Utils.java
/**
 * Collects every element of {@code doc} whose tag name is contained in {@code HEADING_TAG}.
 *
 * <p>Elements are visited in the order returned by {@code doc.select("*")}. The
 * {@code level} stored with each heading is a running counter: it is incremented for every
 * visited element and reset to 0 after an element that has no children.
 * NOTE(review): this counter is not necessarily the nesting depth of the heading — confirm
 * what callers expect before relying on it.
 *
 * @param doc the parsed document to scan
 * @return the headings found, in visiting order; empty if there are none
 */
public List<Heading> docHeadingsProcess(Document doc) {
    List<Heading> headingList = new ArrayList<Heading>();
    int level = 0;
    // The element list is fixed once iteration starts; reassigning it inside the loop
    // (as an earlier version did) had no effect, so the loop iterates the selection directly.
    for (Element ele : doc.select("*")) {
        level++;
        String tag = ele.tagName();
        if (HEADING_TAG.contains(tag)) {
            headingList.add(new Heading(tag, ele.html(), level));
        }
        if (ele.children().size() == 0) {
            level = 0;
        }
    }
    return headingList;
}
From source file:com.obnsoft.ptcm3.MyApplication.java
private void parseCommandHtml() { mCommands = new ArrayList<Command>(); mCategories = new ArrayList<String>(); int categoryId = -1; try {// w ww .j a v a2s .c o m InputStream in = openFileInput(FNAME_CMD_HTML); Document document = Jsoup.parse(in, "UTF-8", URL_CMD_HTML); in.close(); Element divContentArea = document.getElementById(ID_CONTENTAREA); for (Element e : divContentArea.children()) { if (e.tagName().equals(TAG_TABLE)) { if (e.className().equals("")) { mCommands.add(new Command(e, categoryId)); } } else if (e.tagName().equals(TAG_H3)) { mCategories.add(e.text()); categoryId++; } } } catch (IOException e) { e.printStackTrace(); mCommands = null; mCategories = null; } }
From source file:net.pixomania.crawler.W3C.parser.rules.editors.EditorsRule2.java
/**
 * Extracts the people listed under an "Editor" or "Edition Editor" label.
 *
 * <p>Every {@code dd} following a matching {@code dt} is considered. Skipping starts at a
 * {@code dd} directly after a {@code dt} whose text does not start with "editor" or
 * "edition editor", contains "version", or ends with "draft:". It ends after a {@code dd}
 * whose next sibling's text starts with "editor" or contains "edition editor". If an entry
 * contains more than two {@code " - "} separators, the whole document is delegated to
 * {@code EditorsRule5}. Otherwise the entry is split on {@code <br>} tags; parts that look
 * like a pointer to another section are logged and ignored, the rest is parsed by
 * {@code NameParser} and enriched with the links found in the part. Processing stops after
 * the first entry whose next sibling is a {@code dt}.
 *
 * @param url address of the spec, used in log messages and passed on to {@code EditorsRule5}
 * @param doc the parsed spec document
 * @return the parsed people, or {@code null} if no entry matched or none could be parsed
 */
@Override
public ArrayList<Person> run(String url, Document doc) {
    ArrayList<Person> editorList = new ArrayList<>();
    Elements editors = doc.select("dt:contains(Editor) ~ dd, dt:contains(Edition Editor) ~ dd");
    if (editors.size() == 0) {
        return null;
    }

    boolean skip = false;
    for (Element editor : editors) {
        Element prev = editor.previousElementSibling();
        if (prev.tagName().equals("dt")) {
            // Locale.ROOT keeps the match independent of the JVM default locale.
            String label = prev.text().trim().toLowerCase(java.util.Locale.ROOT);
            boolean editorLabel = label.startsWith("editor") || label.startsWith("edition editor");
            if (!editorLabel || label.contains("version") || label.endsWith("draft:")) {
                skip = true;
            }
        }

        if (skip) {
            Element next = editor.nextElementSibling();
            if (next != null) {
                String nextText = next.text().trim().toLowerCase(java.util.Locale.ROOT);
                if (nextText.startsWith("editor") || nextText.contains("edition editor")) {
                    skip = false;
                }
            }
            continue;
        }

        if (StringUtils.countMatches(editor.text(), " - ") > 2) {
            Log.log("warning", "This editor may be a list of editors separated by - ");
            EditorsRule5 ed5 = new EditorsRule5();
            return ed5.run(url, doc);
        }

        String[] splitted = editor.html().split("<br />|<br clear=\"none\" />");
        if (splitted.length < 2) {
            // The `continue`s in this branch move straight to the next dd and therefore
            // bypass the end-of-list check at the bottom of the loop.
            String text = editor.text();
            if (refersToOtherSection(text)) {
                Log.log("warning", "Spec " + url + " may refer to a different section!");
                continue;
            }
            if (text.equals("WHATWG:") || text.equals("W3C:")) {
                continue;
            }
            Person result = NameParser.parse(text);
            if (result == null) {
                continue;
            }
            attachLinks(result, editor);
            editorList.add(result);
        } else {
            for (String split : splitted) {
                if (split.isEmpty()) {
                    continue;
                }
                if (refersToOtherSection(split)) {
                    Log.log("warning", "Spec " + url + " may refer to a different section!");
                    continue;
                }
                if (split.equals("WHATWG:") || split.equals("W3C:")) {
                    continue;
                }
                Document newdoc = Jsoup.parse(split.replaceAll("\n", ""));
                Person result = NameParser.parse(newdoc.text());
                if (result == null) {
                    continue;
                }
                attachLinks(result, newdoc);
                editorList.add(result);
            }
        }

        Element next = editor.nextElementSibling();
        if (next != null && next.tagName().equals("dt")) {
            break;
        }
    }

    if (editorList.size() == 0) {
        return null;
    }
    return editorList;
}

/** Tells whether {@code text} matches one of the phrases that point to another section. */
private static boolean refersToOtherSection(String text) {
    String lower = text.toLowerCase(java.util.Locale.ROOT);
    return lower.startsWith("(in alphabetic")
            || lower.startsWith("see acknowl")
            || lower.startsWith("the w3")
            || lower.startsWith("(see ac")
            || lower.startsWith("see participants")
            || lower.contains("note:");
}

/** Stores every non-empty link href below {@code scope} on {@code person} as e-mail or website. */
private static void attachLinks(Person person, Element scope) {
    // The selector is evaluated once rather than on every loop access.
    for (Element link : scope.select("a")) {
        String href = link.attr("href");
        if (href.isEmpty()) {
            continue;
        }
        if (href.contains("@")) {
            person.setEmail(href.replace("mailto:", ""));
        } else {
            person.addWebsite(href);
        }
    }
}
From source file:net.pixomania.crawler.W3C.parser.rules.editors.EditorsRule8.java
/**
 * Extracts the people listed in {@code blockquote} elements that follow an "Editor"
 * {@code h4} heading.
 *
 * <p>Skipping starts at a {@code blockquote} directly after an {@code h4} whose text does
 * not start with "editor" or "edition editor", or ends with "version:" or "draft:". It ends
 * after a {@code blockquote} whose next sibling's text starts with "editor" or contains
 * "edition editor". If an entry contains more than two {@code " - "} separators, the whole
 * document is delegated to {@code EditorsRule5}. Otherwise the entry is split on
 * {@code <br>} tags; parts that look like a pointer to another section are logged and
 * ignored, the rest is parsed by {@code NameParser} and enriched with the links found in the
 * part. Processing stops after the first entry whose next sibling is an {@code h4}.
 *
 * @param url address of the spec, used in log messages and passed on to {@code EditorsRule5}
 * @param doc the parsed spec document
 * @return the parsed people, or {@code null} if no entry matched or none could be parsed
 */
@Override
public ArrayList<Person> run(String url, Document doc) {
    ArrayList<Person> editorList = new ArrayList<>();
    Elements editors = doc.select("h4:contains(Editor) ~ blockquote");
    if (editors.size() == 0) {
        return null;
    }

    boolean skip = false;
    for (Element editor : editors) {
        Element prev = editor.previousElementSibling();
        if (prev.tagName().equals("h4")) {
            // Locale.ROOT keeps the match independent of the JVM default locale.
            String label = prev.text().trim().toLowerCase(java.util.Locale.ROOT);
            boolean editorLabel = label.startsWith("editor") || label.startsWith("edition editor");
            if (!editorLabel || label.endsWith("version:") || label.endsWith("draft:")) {
                skip = true;
            }
        }

        if (skip) {
            Element next = editor.nextElementSibling();
            if (next != null) {
                String nextText = next.text().trim().toLowerCase(java.util.Locale.ROOT);
                if (nextText.startsWith("editor") || nextText.contains("edition editor")) {
                    skip = false;
                }
            }
            continue;
        }

        if (StringUtils.countMatches(editor.text(), " - ") > 2) {
            Log.log("warning", "This editor may be a list of editors separated by - ");
            EditorsRule5 ed5 = new EditorsRule5();
            return ed5.run(url, doc);
        }

        String[] splitted = editor.html().split("<br />|<br clear=\"none\" />");
        if (splitted.length < 2) {
            // The `continue`s in this branch move straight to the next blockquote and
            // therefore bypass the end-of-list check at the bottom of the loop.
            String text = editor.text();
            if (refersToOtherSection(text)) {
                Log.log("warning", "Spec " + url + " may refer to a different section!");
                continue;
            }
            if (text.equals("WHATWG:") || text.equals("W3C:")) {
                continue;
            }
            Person result = NameParser.parse(text);
            if (result == null) {
                continue;
            }
            attachLinks(result, editor);
            editorList.add(result);
        } else {
            for (String split : splitted) {
                if (split.isEmpty()) {
                    continue;
                }
                if (refersToOtherSection(split)) {
                    Log.log("warning", "Spec " + url + " may refer to a different section!");
                    continue;
                }
                if (split.equals("WHATWG:") || split.equals("W3C:")) {
                    continue;
                }
                Document newdoc = Jsoup.parse(split.replaceAll("\n", ""));
                Person result = NameParser.parse(newdoc.text());
                if (result == null) {
                    continue;
                }
                attachLinks(result, newdoc);
                editorList.add(result);
            }
        }

        Element next = editor.nextElementSibling();
        if (next != null && next.tagName().equals("h4")) {
            break;
        }
    }

    if (editorList.size() == 0) {
        return null;
    }
    return editorList;
}

/** Tells whether {@code text} matches one of the phrases that point to another section. */
private static boolean refersToOtherSection(String text) {
    String lower = text.toLowerCase(java.util.Locale.ROOT);
    return lower.startsWith("(in alphabetic")
            || lower.startsWith("see acknowl")
            || lower.startsWith("the w3")
            || lower.startsWith("(see ac")
            || lower.startsWith("see participants")
            || lower.contains("note:");
}

/** Stores every non-empty link href below {@code scope} on {@code person} as e-mail or website. */
private static void attachLinks(Person person, Element scope) {
    // The selector is evaluated once rather than on every loop access.
    for (Element link : scope.select("a")) {
        String href = link.attr("href");
        if (href.isEmpty()) {
            continue;
        }
        if (href.contains("@")) {
            person.setEmail(href.replace("mailto:", ""));
        } else {
            person.addWebsite(href);
        }
    }
}