List of usage examples for org.jsoup.nodes.Element#tagName()
public String tagName()
From source file:com.kingfong.webcrawler.util.DOMContentUtils.java
/** * This method finds all anchors below the supplied DOM * <code>node</code>, and creates appropriate {@link Outlink} * records for each (relative to the supplied <code>base</code> * URL), and adds them to the <code>outlinks</code> {@link * ArrayList}./*from w w w. j a v a 2 s . co m*/ * * <p> * * Links without inner structure (tags, text, etc) are discarded, as * are links which contain only single nested links and empty text * nodes (this is a common DOM-fixup artifact, at least with * nekohtml). */ public void getOutlinks(String html, URL url, HashSet<String> outlinks) { Document document = Jsoup.parse(html); Elements elements = document.getAllElements(); for (Element currentNode : elements) { String nodeName = currentNode.tagName(); // short nodeType = currentNode.; Elements children = currentNode.children(); nodeName = nodeName.toLowerCase(); LinkParams params = linkParams.get(nodeName); if (params != null) { // if (!shouldThrowAwayLink(currentNode, children, childLen, // params)) { // StringBuilder linkText = new StringBuilder(); // getText(linkText, currentNode, true); Attributes attrs = currentNode.attributes(); String target = null; boolean noFollow = false; boolean post = false; Iterator<Attribute> iterator = attrs.iterator(); while (iterator.hasNext()) { Attribute attr = iterator.next(); String attrName = attr.getKey(); if (params.attrName.equalsIgnoreCase(attrName)) { target = attr.getValue(); } else if ("rel".equalsIgnoreCase(attrName) && "nofollow".equalsIgnoreCase(attr.getValue())) { noFollow = true; } else if ("method".equalsIgnoreCase(attrName) && "post".equalsIgnoreCase(attr.getValue())) { post = true; } } if (StringUtils.startsWith(target, "/")) { target = url.getProtocol() + "://" + url.getHost() + target; } if (target != null && URLFilter.filt(target)) { outlinks.add(target); } // } // this should not have any children, skip them if (params.childLen == 0) continue; } } }
From source file:by.heap.remark.convert.TextCleaner.java
/**
 * Recursively replaces every {@code br} element below {@code el} with the
 * result of {@code before("\n")} followed by removal of the element.
 *
 * @param el root of the subtree to process; {@code el} itself is never replaced
 */
private void fixLineBreaks(Element el) {
    for (final Element child : el.children()) {
        boolean lineBreak = child.tagName().equals("br");
        if (!lineBreak) {
            // Not a break itself: look for breaks further down.
            fixLineBreaks(child);
            continue;
        }
        child.before("\n");
        child.remove();
    }
}
From source file:by.heap.remark.convert.TextCleaner.java
private boolean isBlock(Node n) { boolean block = false; if (n != null && n instanceof Element) { Element el = (Element) n; block = el.isBlock() || el.tagName().equals("br"); }//from w w w . j a va 2 s. co m return block; }
From source file:com.aquest.emailmarketing.web.controllers.BroadcastTemplateController.java
/** * Adds the tracking.// w w w . j a v a2 s. c o m * * @param model the model * @param urls the urls * @param principal the principal * @param id the id * @param trackingFlg the tracking flg * @param openGAflg the open g aflg * @param openPixelFlg the open pixel flg * @param trackingType the tracking type * @return the string */ @RequestMapping(value = "/bcastTempGenerateUrls", method = RequestMethod.POST) public String addTracking(Model model, Urls urls, Principal principal, @RequestParam(value = "id") int id, @RequestParam(value = "trackingFlg", required = false) boolean trackingFlg, @RequestParam(value = "openGAflg", required = false) boolean openGAflg, @RequestParam(value = "openPixelFlg", required = false) boolean openPixelFlg, @RequestParam(value = "trackingType", required = false) String trackingType) { TrackingConfig trackingConfig = new TrackingConfig(); BroadcastTemplate broadcastTemplate = broadcastTemplateService.getBroadcastTemplateById(id); String workingHtml = broadcastTemplate.getHtmlbody(); if (trackingFlg == true) { if (openGAflg == true) { workingHtml = emailTracking.addGaOpenEmailTracking(workingHtml, urls); System.out.println("GA Open: " + workingHtml); } if (openPixelFlg == true) { workingHtml = emailTracking.addPixelOpenEmailTracking(workingHtml); System.out.println("Pixel Open: " + workingHtml); } if (trackingType.equals("ga")) { workingHtml = emailTracking.addGaTrackingToUrl(workingHtml, urls); System.out.println("GA Click added: " + workingHtml); } else if (trackingType.equals("intTrack")) { workingHtml = emailTracking.addIntTrackingToUrl(workingHtml, urls); System.out.println("Internal Tracking: " + workingHtml); } else { workingHtml = emailTracking.addBothTrackingToUrl(workingHtml, urls); } } broadcastTemplate.setHtmlbody_tracking(workingHtml); System.out.println(broadcastTemplate.getHtmlbody_tracking()); String confirm = broadcastTemplateService.SaveOrUpdate(broadcastTemplate); System.out.println(confirm); 
System.out.println(trackingFlg); System.out.println(openGAflg); System.out.println(openPixelFlg); System.out.println(trackingType); if (confirm == broadcastTemplate.getB_template_name()) { trackingConfig.setBcast_template_id(broadcastTemplate.getId()); // taking care of tracking flg int tracking_flg = 0; if (trackingFlg == true) { tracking_flg = 1; } trackingConfig.setTracking_flg(tracking_flg); // taking care of openGAflg int open_ga_flg = 0; if (openGAflg == true) { open_ga_flg = 1; } trackingConfig.setOpen_ga_flg(open_ga_flg); // taking care of openPixelFlg int open_pixel_flg = 0; if (openPixelFlg == true) { open_pixel_flg = 1; } trackingConfig.setOpen_pixel_flg(open_pixel_flg); // set tracking type trackingConfig.setTracking_type(trackingType); // seting utm's trackingConfig.setUtm_campaign(urls.getUtmCampaign()); trackingConfig.setUtm_content(urls.getUtmContent()); trackingConfig.setUtm_medium(urls.getUtmMedium()); trackingConfig.setUtm_source(urls.getUtmSource()); trackingConfigService.SaveOrUpdate(trackingConfig); } // find images in html to be able to embed images in email as in-line attachments EmbeddedImage embeddedImage = new EmbeddedImage(); List<String> imgList = new ArrayList<String>(); String html = broadcastTemplate.getHtmlbody(); Document doc = Jsoup.parse(html); Elements media = doc.select("[src]"); for (Element src : media) { if (src.tagName().equals("img")) { imgList.add(src.attr("abs:src")); } } model.addAttribute("imgList", imgList); model.addAttribute("embeddedImage", embeddedImage); model.addAttribute("broadcastTemplate", broadcastTemplate); return "bcasttempembeddedimage"; }
From source file:edu.ucla.cs.scai.swim.qa.ontology.dbpedia.DBpediaOntologyOld.java
private void traverseHierarchy(Element e, DBpediaCategory category, HashMap<String, DBpediaCategory> map) { for (Element c : e.children()) { String tagName = c.tag().getName(); if (tagName.equals("a")) { String href = c.attr("href"); if (href != null && href.length() > 0) { category.setLabel(c.text()); category.setUri(CLASSES_BASE_URI + c.text()); map.put(category.getLabel(), category); System.out.println(c.text() + "\t" + CLASSES_BASE_URI + c.text()); }//from w w w . jav a 2 s . co m } else if (tagName.equals("ul")) { for (Element c1 : c.children()) { if (c1.tagName().equals("li")) { DBpediaCategory cc = new DBpediaCategory(); traverseHierarchy(c1, cc, map); cc.parents = new HashSet<>(); cc.parents.add(category); category.getSubClasses().add(cc); } } } } }
From source file:mergedoc.core.APIDocument.java
/**
 * Extracts the per-method comments from a Javadoc HTML document and stores
 * them in {@code contextTable}, keyed by method signature.
 *
 * <p>Each matching detail list item must contain a {@code pre} element with
 * the signature; items without one are skipped. The last {@code div} of the
 * item becomes the document body, and the {@code dl dt} entries are scanned
 * for parameter, return and throws sections.
 *
 * <p>NOTE(review): the three {@code dtText.contains(":")} guards below are
 * identical as written, so only the first branch can ever run and every name
 * is wrapped in angle brackets. The literals presumably carried distinct
 * (non-ASCII) section labels that were lost - confirm against the upstream
 * file before relying on this method.
 *
 * @param className name of the class the document describes
 * @param doc       parsed API document
 */
private void parseMethodComment(String className, Document doc) {
    Elements elements = doc.select("body > div.contentContainer > div.details > ul > li > ul > li > ul > li");
    for (Element element : elements) {
        Element sigElm = element.select("pre").first();
        if (sigElm == null) {
            continue;
        }
        String sigStr = sigElm.html();
        Signature sig = createSignature(className, sigStr);
        Comment comment = new Comment(sig);

        // deprecated: only taken from the first div when there are exactly two
        String depre = "";
        Elements divs = element.select("div");
        if (divs.size() == 2) {
            depre = divs.get(0).html();
        }
        if (divs.size() > 0) {
            String body = divs.last().html();
            body = formatLinkTag(className, body);
            comment.setDocumentBody(body);
        }

        Elements dtTags = element.select("dl dt");
        for (Element dtTag : dtTags) {
            String dtText = dtTag.text();

            // Parameter section: every following dd sibling is one parameter.
            if (dtText.contains(":")) {
                Element dd = dtTag;
                while (true) {
                    dd = dd.nextElementSibling();
                    if (dd == null || dd.tagName().equalsIgnoreCase("dd") == false) {
                        break;
                    }
                    // NOTE(review): first() returns null when the dd has no
                    // code element, which would throw here - confirm the input
                    // always has one.
                    String name = dd.select("code").first().text();
                    // NOTE(review): presumably meant to match only the
                    // type-parameter label; as written it is always true here.
                    if (dtText.contains(":")) {
                        name = "<" + name + ">";
                    }
                    String items = dd.html();
                    // Case-insensitive: "<code>name</code> - description".
                    Pattern p = PatternCache
                            .getPattern("(?si)<CODE>(.+?)</CODE>\\s*-\\s*(.*?)(<DD>|</DD>|</DL>|<DT>|$)");
                    Matcher m = p.matcher(items);
                    if (m.find()) {
                        String desc = formatLinkTag(className, m.group(2));
                        comment.addParam(name, desc);
                    }
                }
                continue;
            }

            // Return section (unreachable as written, see the method comment).
            if (dtText.contains(":")) {
                Element dd = dtTag.nextElementSibling();
                String str = dd.html();
                str = formatLinkTag(className, str);
                comment.addReturn(str);
                continue;
            }

            // Throws section (unreachable as written, see the method comment).
            if (dtText.contains(":")) {
                Element dd = dtTag;
                while (true) {
                    dd = dd.nextElementSibling();
                    if (dd == null || dd.tagName().equalsIgnoreCase("dd") == false) {
                        break;
                    }
                    String name = dd.select("code").first().text();
                    String items = dd.html();
                    Pattern p = PatternCache
                            .getPattern("(?si)<CODE>(.+?)</CODE>\\s*-\\s*(.*?)(<DD>|</DD>|</DL>|<DT>|$)");
                    Matcher m = p.matcher(items);
                    if (m.find()) {
                        String desc = formatLinkTag(className, m.group(2));
                        String param = name + " " + desc;
                        comment.addThrows(param);
                    }
                }
                continue;
            }
        }

        // deprecated
        parseDeprecatedTag(className, depre, comment);

        // NOTE(review): assumed to be a live call whose preceding comment text
        // was lost, not commented-out code - confirm against the upstream file.
        parseCommonTag(className, element, comment);

        contextTable.put(sig, comment);
    }
}
From source file:cn.edu.hfut.dmic.contentextractor.ContentExtractor.java
/**
 * Selects the element of the document with the highest content score.
 *
 * <p>Runs {@code clean()} and {@code computeInfo(doc.body())} first, then
 * scores every element recorded in {@code infoMap} except {@code a} elements,
 * {@code p} elements and the body itself. Only scores strictly greater than
 * zero can win.
 *
 * @return the best-scoring element
 * @throws Exception with message {@code "extraction failed"} if no candidate
 *                   scored above zero
 */
private Element getContentElement() throws Exception {
    clean();
    computeInfo(doc.body());

    Element best = null;
    double bestScore = 0;
    for (Element candidate : infoMap.keySet()) {
        boolean excluded = candidate.tagName().equals("a")
                || candidate.tagName().equals("p")
                || candidate == doc.body();
        if (!excluded) {
            double candidateScore = computeScore(candidate);
            if (candidateScore > bestScore) {
                bestScore = candidateScore;
                best = candidate;
            }
        }
    }

    if (best != null) {
        return best;
    }
    throw new Exception("extraction failed");
}
From source file:me.vertretungsplan.parser.IndiwareParser.java
/**
 * Parses one day of an Indiware substitution schedule.
 *
 * <p>The document is read through an {@code HTMLDataSource} or an
 * {@code XMLDataSource} depending on {@code html}. The date is taken from the
 * title, the last-change timestamp from the date element, header and footer
 * information become day messages, and every action element becomes one
 * {@code Substitution}. In HTML mode the meaning of each cell comes from the
 * class name of the corresponding header cell; in XML mode it is the tag name
 * of the cell itself.
 *
 * @param doc  root element of the day's document
 * @param html {@code true} for the HTML format, {@code false} for XML
 * @return the parsed day
 * @throws IOException if the title does not contain a date matching
 *                     {@code datePattern}
 */
SubstitutionScheduleDay parseIndiwareDay(Element doc, boolean html) throws IOException {
    SubstitutionScheduleDay day = new SubstitutionScheduleDay();

    DataSource ds;
    if (html) {
        ds = new HTMLDataSource(doc);
    } else {
        ds = new XMLDataSource(doc);
    }

    Matcher matcher = datePattern.matcher(ds.titel().text());
    if (!matcher.find()) {
        throw new IOException("malformed date: " + ds.titel().text());
    }
    String date = matcher.group();
    day.setDate(
            DateTimeFormat.forPattern("EEEE, dd. MMMM yyyy").withLocale(Locale.GERMAN).parseLocalDate(date));

    String lastChange = ds.datum().text();
    day.setLastChange(DateTimeFormat.forPattern("dd.MM.yyyy, HH:mm").withLocale(Locale.GERMAN)
            .parseLocalDateTime(lastChange));

    // Header information: one message per entry, title in bold.
    if (ds.kopfinfos().size() > 0) {
        for (Element kopfinfo : ds.kopfinfos()) {
            String title = html ? kopfinfo.select("th").text() : kopfinfoTitle(kopfinfo.tagName()) + ":";

            StringBuilder message = new StringBuilder();
            if (title != null && !title.isEmpty()) {
                message.append("<b>").append(title).append("</b>").append(" ");
            }
            message.append(html ? kopfinfo.select("td").text() : kopfinfo.text());

            day.addMessage(message.toString());
        }
    }

    // Footer: all lines joined into a single message.
    if (ds.fuss() != null) {
        StringBuilder message = new StringBuilder();
        boolean first = true;
        for (Element fusszeile : ds.fusszeilen()) {
            if (first) {
                first = false;
            } else {
                message.append("\n");
            }
            message.append(fusszeile.text());
        }
        day.addMessage(message.toString());
    }

    // In HTML mode the column meaning is encoded in the header cell's class.
    List<String> columnTypes = null;
    if (html) {
        columnTypes = new ArrayList<>();
        for (Element th : ((HTMLDataSource) ds).headers()) {
            columnTypes.add(th.className().replace("thplan", "").replace("thlplan", ""));
        }
    }

    for (Element aktion : ds.aktionen()) {
        Substitution substitution = new Substitution();
        String type = "Vertretung";
        String course = null;
        int i = 0;
        for (Element info : aktion.children()) {
            String value = info.text().replace("\u00a0", "");
            if (value.equals("---")) {
                // Placeholder cell: nothing to store, but the column still counts.
                i++;
                continue;
            }
            final String columnType = html ? columnTypes.get(i) : info.tagName();
            switch (columnType) {
                case "klasse":
                    Set<String> classes = new HashSet<>();
                    for (String klasse : value.split(",")) {
                        Matcher courseMatcher = coursePattern.matcher(klasse);
                        if (courseMatcher.matches()) {
                            classes.add(courseMatcher.group(1));
                            course = courseMatcher.group(2);
                        } else {
                            classes.add(klasse);
                        }
                    }
                    substitution.setClasses(classes);
                    break;
                case "stunde":
                    substitution.setLesson(value);
                    break;
                case "fach":
                    String subject = subjectAndCourse(course, value);
                    // When a separate "vfach" column exists, "fach" holds the
                    // previous subject.
                    if (columnTypes != null && columnTypes.contains("vfach")) {
                        substitution.setPreviousSubject(subject);
                    } else {
                        substitution.setSubject(subject);
                    }
                    break;
                case "vfach":
                    substitution.setSubject(subjectAndCourse(course, value));
                    // Without this break the subject text was also stored as
                    // the teacher by the "lehrer" case below.
                    break;
                case "lehrer":
                    Matcher bracesMatcher = bracesPattern.matcher(value);
                    if (bracesMatcher.matches()) {
                        value = bracesMatcher.group(1);
                    }
                    substitution.setTeacher(value);
                    break;
                case "raum":
                    // When a separate "vraum" column exists, "raum" holds the
                    // previous room.
                    if (columnTypes != null && columnTypes.contains("vraum")) {
                        substitution.setPreviousRoom(value);
                    } else {
                        substitution.setRoom(value);
                    }
                    break;
                case "vraum":
                    substitution.setRoom(value);
                    // Without this break the room text was also interpreted as
                    // the info column below.
                    break;
                case "info":
                    Matcher substitutionMatcher = substitutionPattern.matcher(value);
                    Matcher cancelMatcher = cancelPattern.matcher(value);
                    Matcher delayMatcher = delayPattern.matcher(value);
                    Matcher selfMatcher = selfPattern.matcher(value);
                    if (substitutionMatcher.matches()) {
                        substitution.setPreviousSubject(substitutionMatcher.group(1));
                        substitution.setPreviousTeacher(substitutionMatcher.group(2));
                        if (!substitutionMatcher.group(3).isEmpty()) {
                            substitution.setDesc(substitutionMatcher.group(3));
                        }
                    } else if (cancelMatcher.matches()) {
                        type = "Entfall";
                        substitution.setPreviousSubject(cancelMatcher.group(1));
                        substitution.setPreviousTeacher(cancelMatcher.group(2));
                    } else if (delayMatcher.matches()) {
                        type = "Verlegung";
                        substitution.setPreviousSubject(delayMatcher.group(1));
                        substitution.setPreviousTeacher(delayMatcher.group(2));
                        substitution.setDesc(delayMatcher.group(3));
                    } else if (selfMatcher.matches()) {
                        type = "selbst.";
                        if (!selfMatcher.group(1).isEmpty()) {
                            substitution.setDesc(selfMatcher.group(1));
                        }
                    } else if (value.equals("f\u00e4llt aus") || value.equals("Klausur")
                            || value.equals("Aufg.")) {
                        // "f\u00e4llt aus" is written with an escape so the
                        // umlaut survives any source-file encoding.
                        type = value;
                    } else {
                        substitution.setDesc(value);
                    }
                    break;
            }
            i++;
        }
        substitution.setType(type);
        substitution.setColor(colorProvider.getColor(substitution.getType()));
        // Fall back to the course name when no subject column was present.
        if (course != null && substitution.getSubject() == null) {
            substitution.setSubject(course);
        }
        day.addSubstitution(substitution);
    }

    return day;
}
From source file:com.fluidops.iwb.provider.HTMLProvider.java
@Override public void gather(List<Statement> res) throws Exception { String url = config.url;/* www .ja va 2s. c o m*/ Document doc = Jsoup.connect(url).get(); Elements links = doc.select("a[href]"); Elements media = doc.select("[src]"); Elements imports = doc.select("link[href]"); // Elements article = // doc.select("div.wrapper").select("div.box-shadow").select("div#content.cols").select("div.cl").select("div.crm").select("article").select("section.article").select("div.textblock").select("table"); Elements article = doc.getElementsByTag("tbody").select("tr"); Elements tableElem; URI nameURI = null; URI roadsURI = null; URI sideURI = null; URI totalURI = null; File file = new File("HTMLdata.txt"); PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(file))); out.println("Media"); print("\nMedia: (%d)", media.size()); for (Element el : media) { if (el.tagName().equals("img")) { print(" * %s: <%s> %sx%s (%s)", el.tagName(), el.attr("abs:src"), el.attr("width"), el.attr("height"), trim(el.attr("alt"), 20)); out.printf(" \n * %s: <%s> %sx%s (%s)", el.tagName(), el.attr("abs:src"), el.attr("width"), el.attr("height"), trim(el.attr("alt"), 20)); out.println(); } else { print(" * %s: <%s>", el.tagName(), el.attr("abs:src")); out.printf(" \n * %s: <%s>", el.tagName(), el.attr("abs:src")); out.println(); } } out.println("Imports"); print("\nImports: (%d)", imports.size()); for (Element link : imports) { print(" * %s <%s> (%s)", link.tagName(), link.attr("abs:href"), link.attr("rel")); out.printf(" * %s <%s> (%s)", link.tagName(), link.attr("abs:href"), link.attr("rel")); out.println(); } out.println("Links"); print("\nLinks: (%d)", links.size()); for (Element link : links) { print(" * a: <%s> (%s)", link.attr("abs:href"), trim(link.text(), 35)); out.printf(" * a: <%s> (%s)", link.attr("abs:href"), link.text()); out.println(); } /* * out.println("Custom text"); print("\nCustom: (%d)",customArt.size()); * for (Element custom:customArt){ * out.printf(" * a 
(%s): (%s)",custom.tagName(),custom.text()); * out.println(); } */ out.println("Article"); print("\nArticle: (%d)", article.size()); for (int i = 3; i < article.size() - 2; i++) { tableElem = article.get(i).select("td"); out.println(); if (i == 3) { nameURI = ProviderUtils.objectToUri(tableElem.get(0).text()); roadsURI = ProviderUtils.objectToUri(tableElem.get(1).text()); sideURI = ProviderUtils.objectToUri(tableElem.get(2).text()); totalURI = ProviderUtils.objectToUri(tableElem.get(3).text()); } else { res.add(ProviderUtils.createStatement(ProviderUtils.objectToUri(tableElem.get(0).text()), RDF.TYPE, nameURI)); res.add(ProviderUtils.createLiteralStatement(ProviderUtils.objectToUri(tableElem.get(0).text()), RDFS.LABEL, tableElem.get(0).text())); res.add(ProviderUtils.createLiteralStatement(ProviderUtils.objectToUri(tableElem.get(0).text()), roadsURI, tableElem.get(1).text())); res.add(ProviderUtils.createLiteralStatement(ProviderUtils.objectToUri(tableElem.get(0).text()), sideURI, tableElem.get(2).text())); res.add(ProviderUtils.createLiteralStatement(ProviderUtils.objectToUri(tableElem.get(0).text()), totalURI, tableElem.get(3).text())); for (Element el : tableElem) { out.printf("\n * (%s): (%s)", el.tagName(), el.text()); out.println(); } } out.println(); out.printf("\n * a (%s) (%d): (%s)", article.get(i).tagName(), tableElem.size(), article.get(i).text()); out.println(); } out.close(); }
From source file:cn.edu.hfut.dmic.contentextractor.ContentExtractor.java
/**
 * Determines the title of the page.
 *
 * <p>If the document has a non-empty {@code <title>}, every {@code h1}-{@code h6}
 * heading is compared with it using {@code strSim}. Each heading is scored by
 * its similarity weighted with its 1-based position, and the best one is
 * returned when its similarity is at least 0.3. Otherwise the meta title or
 * the best-ranked heading (headings with 7 to 26 CJK characters first, then
 * lower heading levels) is used. If that yields nothing, the first element
 * whose id or class starts or ends with {@code title} is tried, and finally
 * {@code getTitleByEditDistance}.
 *
 * @param contentElement the extracted content element, used by the last fallback
 * @return the title text
 * @throws Exception with message {@code "title not found"} if every strategy
 *                   fails; the underlying failure is attached as the cause
 */
protected String getTitle(final Element contentElement) throws Exception {
    final ArrayList<Element> titleList = new ArrayList<Element>();
    final ArrayList<Double> titleSim = new ArrayList<Double>();
    final String metaTitle = getText(doc.title().trim());
    if (!metaTitle.isEmpty()) {
        // Compiled once instead of once per visited node.
        final Pattern headingTag = Pattern.compile("h[1-6]");
        doc.body().traverse(new NodeVisitor() {
            @Override
            public void head(Node node, int i) {
                if (node instanceof Element) {
                    Element tag = (Element) node;
                    String tagName = tag.tagName();
                    if (headingTag.matcher(tagName).matches()) {
                        String title = tag.text().trim();
                        double sim = strSim(title, metaTitle);
                        titleSim.add(sim);
                        titleList.add(tag);
                    }
                }
            }

            @Override
            public void tail(Node node, int i) {
            }
        });

        // Later headings get a higher weight than earlier ones.
        double maxScore = 0;
        int maxIndex = -1;
        for (int i = 0; i < titleSim.size(); i++) {
            double score = (i + 1) * titleSim.get(i);
            if (score > maxScore) {
                maxScore = score;
                maxIndex = i;
            }
        }

        if (maxIndex == -1 || titleSim.get(maxIndex) < 0.3) {
            String title = getText(metaTitle);
            // NOTE(review): endsWith("") is always true, so this branch can
            // never return; the literal presumably held a truncation marker
            // that was lost - confirm against the upstream file.
            if (!title.endsWith("") && title.length() > 7) {
                return title;
            }
            // Without any heading there is nothing to rank; fall through to
            // the remaining strategies instead of failing on get(0).
            if (!titleList.isEmpty()) {
                Collections.sort(titleList, new Comparator<Element>() {
                    @Override
                    public int compare(Element o1, Element o2) {
                        int cjk1 = o1.text().replaceAll("[^\\u4e00-\\u9fa5]", "").length();
                        int cjk2 = o2.text().replaceAll("[^\\u4e00-\\u9fa5]", "").length();
                        int len1 = (cjk1 > 26 || cjk1 < 7) ? 0 : 1;
                        int len2 = (cjk2 > 26 || cjk2 < 7) ? 0 : 1;
                        if (len1 == len2) {
                            // Same length class: prefer the lower heading level.
                            return o1.tagName().charAt(1) - o2.tagName().charAt(1);
                        }
                        return len2 - len1;
                    }
                });
                return getText(titleList.get(0).text());
            }
        } else {
            return titleList.get(maxIndex).text();
        }
    }

    // Fallback: elements whose id or class starts or ends with "title".
    Elements titles = doc.body().select("*[id^=title],*[id$=title],*[class^=title],*[class$=title]");
    if (titles.size() > 0) {
        String title = titles.first().text();
        if (title.length() > 5 && title.length() < 40) {
            return title;
        }
    }

    try {
        return getTitleByEditDistance(contentElement);
    } catch (Exception ex) {
        throw new Exception("title not found", ex);
    }
}