List of usage examples for org.jsoup.nodes.Element tagName()
public String tagName()
From source file:com.aquest.emailmarketing.web.controllers.BroadcastController.java
/** * Adds the tracking./*from w w w .j a va 2s . c o m*/ * * @param model the model * @param urls the urls * @param principal the principal * @param id the id * @param trackingFlg the tracking flg * @param openGAflg the open g aflg * @param openPixelFlg the open pixel flg * @param trackingType the tracking type * @return the string */ @RequestMapping(value = "/generateUrls", method = RequestMethod.POST) public String addTracking(Model model, Urls urls, Principal principal, @RequestParam(value = "id") int id, @RequestParam(value = "trackingFlg", required = false) boolean trackingFlg, @RequestParam(value = "openGAflg", required = false) boolean openGAflg, @RequestParam(value = "openPixelFlg", required = false) boolean openPixelFlg, @RequestParam(value = "trackingType", required = false) String trackingType) { TrackingConfig trackingConfig = new TrackingConfig(); Broadcast broadcast = broadcastService.getBroadcastById(id); String workingHtml = broadcast.getHtmlbody(); if (trackingFlg == true) { if (openGAflg == true) { workingHtml = emailTracking.addGaOpenEmailTracking(workingHtml, urls); System.out.println("GA Open: " + workingHtml); } if (openPixelFlg == true) { workingHtml = emailTracking.addPixelOpenEmailTracking(workingHtml); System.out.println("Pixel Open: " + workingHtml); } if (trackingType.equals("ga")) { workingHtml = emailTracking.addGaTrackingToUrl(workingHtml, urls); System.out.println("GA Click added: " + workingHtml); } else if (trackingType.equals("intTrack")) { workingHtml = emailTracking.addIntTrackingToUrl(workingHtml, urls); System.out.println("Internal Tracking: " + workingHtml); } else { workingHtml = emailTracking.addBothTrackingToUrl(workingHtml, urls); } } broadcast.setHtmlbody_tracking(workingHtml); System.out.println(broadcast.getHtmlbody_tracking()); String confirm = broadcastService.SaveOrUpdate(broadcast); System.out.println(confirm); System.out.println(trackingFlg); System.out.println(openGAflg); System.out.println(openPixelFlg); 
System.out.println(trackingType); if (confirm == broadcast.getBroadcast_id()) { trackingConfig.setBroadcast_id(broadcast.getBroadcast_id()); // taking care of tracking flg int tracking_flg = 0; if (trackingFlg == true) { tracking_flg = 1; } trackingConfig.setTracking_flg(tracking_flg); // taking care of openGAflg int open_ga_flg = 0; if (openGAflg == true) { open_ga_flg = 1; } trackingConfig.setOpen_ga_flg(open_ga_flg); // taking care of openPixelFlg int open_pixel_flg = 0; if (openPixelFlg == true) { open_pixel_flg = 1; } trackingConfig.setOpen_pixel_flg(open_pixel_flg); // set tracking type trackingConfig.setTracking_type(trackingType); // seting utm's trackingConfig.setUtm_campaign(urls.getUtmCampaign()); trackingConfig.setUtm_content(urls.getUtmContent()); trackingConfig.setUtm_medium(urls.getUtmMedium()); trackingConfig.setUtm_source(urls.getUtmSource()); trackingConfigService.SaveOrUpdate(trackingConfig); } // find images in html to be able to embed images in email as in-line attachments EmbeddedImage embeddedImage = new EmbeddedImage(); //HashSet to avoid duplicates Set<String> imgList = new HashSet<String>(); String html = broadcast.getHtmlbody(); Document doc = Jsoup.parse(html); Elements media = doc.select("[src]"); for (Element src : media) { if (src.tagName().equals("img")) { imgList.add(src.attr("abs:src")); } } model.addAttribute("imgList", imgList); model.addAttribute("embeddedImage", embeddedImage); model.addAttribute("broadcast", broadcast); return "embeddedimage"; }
From source file:me.vertretungsplan.parser.UntisCommonParser.java
void parseMultipleMonitorDays(SubstitutionSchedule v, Document doc, JSONObject data) throws JSONException, CredentialInvalidException { if (doc.select(".mon_head").size() > 1) { for (int j = 0; j < doc.select(".mon_head").size(); j++) { Document doc2 = Document.createShell(doc.baseUri()); doc2.body().appendChild(doc.select(".mon_head").get(j).clone()); Element next = doc.select(".mon_head").get(j).nextElementSibling(); if (next != null && next.tagName().equals("center")) { doc2.body().appendChild(next.select(".mon_title").first().clone()); if (next.select("table:has(tr.list)").size() > 0) { doc2.body().appendChild(next.select("table:has(tr.list)").first()); }// w w w. ja v a 2 s . co m if (next.select("table.info").size() > 0) { doc2.body().appendChild(next.select("table.info").first()); } } else if (doc.select(".mon_title").size() - 1 >= j) { doc2.body().appendChild(doc.select(".mon_title").get(j).clone()); doc2.body().appendChild(doc.select("table:has(tr.list)").get(j).clone()); } else { continue; } SubstitutionScheduleDay day = parseMonitorDay(doc2, data); v.addDay(day); } } else if (doc.select(".mon_title").size() > 1) { for (int j = 0; j < doc.select(".mon_title").size(); j++) { Document doc2 = Document.createShell(doc.baseUri()); doc2.body().appendChild(doc.select(".mon_title").get(j).clone()); Element next = doc.select(".mon_title").get(j).nextElementSibling(); while (next != null && !next.tagName().equals("center")) { doc2.body().appendChild(next); next = doc.select(".mon_title").get(j).nextElementSibling(); } SubstitutionScheduleDay day = parseMonitorDay(doc2, data); v.addDay(day); } } else { SubstitutionScheduleDay day = parseMonitorDay(doc, data); v.addDay(day); } }
From source file:de.geeksfactory.opacclient.apis.Bibliotheca.java
/**
 * Builds a search field from a form control.
 *
 * @param name  display name for the field
 * @param hint  hint text (only applied to text fields)
 * @param input the form control; {@code <input type="text">} and {@code <select>} are supported
 * @return a text or dropdown search field, or {@code null} for any other control
 */
private SearchField createSearchField(String name, String hint, Element input) {
    final String tag = input.tagName();

    if ("select".equals(tag)) {
        DropdownSearchField dropdown = new DropdownSearchField();
        dropdown.setDisplayName(name);
        dropdown.setId(input.attr("name"));
        for (Element option : input.select("option")) {
            dropdown.addDropdownValue(option.attr("value"), option.text());
        }
        return dropdown;
    }

    if ("input".equals(tag) && "text".equals(input.attr("type"))) {
        TextSearchField textField = new TextSearchField();
        textField.setDisplayName(name);
        textField.setHint(hint);
        textField.setId(input.attr("name"));
        return textField;
    }

    return null;
}
From source file:com.jimplush.goose.ContentExtractor.java
/** * remove any divs that looks like non-content, clusters of links, or paras with no gusto * * @param node/*from www. j a v a 2s . c om*/ * @return */ private Element cleanupNode(Element node) { if (logger.isDebugEnabled()) { logger.debug("Starting cleanup Node"); } node = addSiblings(node); Elements nodes = node.children(); for (Element e : nodes) { if (e.tagName().equals("p")) { continue; } if (logger.isDebugEnabled()) { logger.debug("CLEANUP NODE: " + e.id() + " class: " + e.attr("class")); } boolean highLinkDensity = isHighLinkDensity(e); if (highLinkDensity) { if (logger.isDebugEnabled()) { logger.debug("REMOVING NODE FOR LINK DENSITY: " + e.id() + " class: " + e.attr("class")); } e.remove(); continue; } // now check for word density // grab all the paragraphs in the children and remove ones that are too small to matter Elements subParagraphs = e.getElementsByTag("p"); for (Element p : subParagraphs) { if (p.text().length() < 25) { p.remove(); } } // now that we've removed shorty paragraphs let's make sure to exclude any first paragraphs that don't have paras as // their next siblings to avoid getting img bylines // first let's remove any element that now doesn't have any p tags at all Elements subParagraphs2 = e.getElementsByTag("p"); if (subParagraphs2.size() == 0 && !e.tagName().equals("td")) { if (logger.isDebugEnabled()) { logger.debug("Removing node because it doesn't have any paragraphs"); } e.remove(); continue; } //if this node has a decent enough gravityScore we should keep it as well, might be content int topNodeScore = getScore(node); int currentNodeScore = getScore(e); float thresholdScore = (float) (topNodeScore * .08); if (logger.isDebugEnabled()) { logger.debug("topNodeScore: " + topNodeScore + " currentNodeScore: " + currentNodeScore + " threshold: " + thresholdScore); } if (currentNodeScore < thresholdScore) { if (!e.tagName().equals("td")) { if (logger.isDebugEnabled()) { logger.debug("Removing node due to low threshold score"); } 
e.remove(); } else { if (logger.isDebugEnabled()) { logger.debug("Not removing TD node"); } } continue; } } return node; }
From source file:com.jimplush.goose.ContentExtractor.java
/** * alot of times the first paragraph might be the caption under an image so we'll want to make sure if we're going to * boost a parent node that it should be connected to other paragraphs, at least for the first n paragraphs * so we'll want to make sure that the next sibling is a paragraph and has at least some substatial weight to it * * * @param node/*from w w w . j av a 2 s .c o m*/ * @return */ private boolean isOkToBoost(Element node) { int stepsAway = 0; Element sibling = node.nextElementSibling(); while (sibling != null) { if (sibling.tagName().equals("p")) { if (stepsAway >= 3) { if (logger.isDebugEnabled()) { logger.debug("Next paragraph is too far away, not boosting"); } return false; } String paraText = sibling.text(); WordStats wordStats = StopWords.getStopWordCount(paraText); if (wordStats.getStopWordCount() > 5) { if (logger.isDebugEnabled()) { logger.debug("We're gonna boost this node, seems contenty"); } return true; } } // increase how far away the next paragraph is from this node stepsAway++; sibling = sibling.nextElementSibling(); } return false; }
From source file:de.geeksfactory.opacclient.apis.Open.java
/** * Better version of JSoup's implementation of this function ({@link * org.jsoup.nodes.FormElement#formData()}). * * @param form The form to submit//from w ww. j ava 2s . com * @param submitName The name attribute of the button which is clicked to submit the form, or * null * @return A MultipartEntityBuilder containing the data of the form */ protected MultipartEntityBuilder formData(FormElement form, String submitName) { MultipartEntityBuilder data = MultipartEntityBuilder.create(); data.setLaxMode(); // iterate the form control elements and accumulate their values for (Element el : form.elements()) { if (!el.tag().isFormSubmittable()) { continue; // contents are form listable, superset of submitable } String name = el.attr("name"); if (name.length() == 0) continue; String type = el.attr("type"); if ("select".equals(el.tagName())) { Elements options = el.select("option[selected]"); boolean set = false; for (Element option : options) { data.addTextBody(name, option.val()); set = true; } if (!set) { Element option = el.select("option").first(); if (option != null) { data.addTextBody(name, option.val()); } } } else if ("checkbox".equalsIgnoreCase(type) || "radio".equalsIgnoreCase(type)) { // only add checkbox or radio if they have the checked attribute if (el.hasAttr("checked")) { data.addTextBody(name, el.val().length() > 0 ? el.val() : "on"); } } else if ("submit".equalsIgnoreCase(type) || "image".equalsIgnoreCase(type)) { if (submitName != null && el.attr("name").contains(submitName)) { data.addTextBody(name, el.val()); } } else { data.addTextBody(name, el.val()); } } return data; }
From source file:com.iorga.iraj.servlet.AgglomeratorServlet.java
private long searchAndAppendAfter(final ServletConfig config, final Element agglomerateElement, final String scriptSrc, final String pathPrefix, final String pathSuffix, final String urlAttribute, long lastModified) throws MalformedURLException, IOException, URISyntaxException { if (mode == Mode.DEVELOPMENT) { // add a watch for that directory final Path path = Paths.get(config.getServletContext().getRealPath(scriptSrc)); path.register(watchService, StandardWatchEventKinds.ENTRY_CREATE, StandardWatchEventKinds.ENTRY_DELETE); }//from w ww .j a v a2 s. c o m final Set<String> childrenPaths = config.getServletContext().getResourcePaths(scriptSrc); for (final String path : childrenPaths) { if (path.endsWith(pathSuffix)) { // add that JS final StringBuilder targetScript = new StringBuilder("<"); targetScript.append(agglomerateElement.tagName()); // copy all the origin attributes for (final Attribute attribute : agglomerateElement.attributes()) { final String key = attribute.getKey(); if (!ATTRIBUTE_NAME.equalsIgnoreCase(key) && !urlAttribute.equalsIgnoreCase(key) && !URL_ATTRIBUTE_ATTRIBUTE_NAME.equalsIgnoreCase(key)) { targetScript.append(" ").append(attribute.html()); } } // specify the src path final String childUrl = StringUtils.removeStart(path, pathPrefix); targetScript.append(" ").append(new Attribute(urlAttribute, childUrl).html()).append(" />"); agglomerateElement.after(targetScript.toString()); lastModified = Math.max( config.getServletContext().getResource(childUrl).openConnection().getLastModified(), lastModified); } else if (path.endsWith("/")) { // it's a directory, recurse search & append lastModified = Math.max(searchAndAppendAfter(config, agglomerateElement, path, pathPrefix, pathSuffix, urlAttribute, lastModified), lastModified); } } return lastModified; }
From source file:com.jimplush.goose.ContentExtractor.java
/** * adds any siblings that may have a decent score to this node * * @param node/*from www . j a va2 s.c o m*/ * @return */ private Element addSiblings(Element node) { if (logger.isDebugEnabled()) { logger.debug("Starting to add siblings"); } int baselineScoreForSiblingParagraphs = getBaselineScoreForSiblings(node); Element currentSibling = node.previousElementSibling(); while (currentSibling != null) { if (logger.isDebugEnabled()) { logger.debug("SIBLINGCHECK: " + debugNode(currentSibling)); } if (currentSibling.tagName().equals("p")) { node.child(0).before(currentSibling.outerHtml()); currentSibling = currentSibling.previousElementSibling(); continue; } // check for a paraph embedded in a containing element int insertedSiblings = 0; Elements potentialParagraphs = currentSibling.getElementsByTag("p"); if (potentialParagraphs.first() == null) { currentSibling = currentSibling.previousElementSibling(); continue; } for (Element firstParagraph : potentialParagraphs) { WordStats wordStats = StopWords.getStopWordCount(firstParagraph.text()); int paragraphScore = wordStats.getStopWordCount(); if ((float) (baselineScoreForSiblingParagraphs * .30) < paragraphScore) { if (logger.isDebugEnabled()) { logger.debug("This node looks like a good sibling, adding it"); } node.child(insertedSiblings).before("<p>" + firstParagraph.text() + "<p>"); insertedSiblings++; } } currentSibling = currentSibling.previousElementSibling(); } return node; }
From source file:de.geeksfactory.opacclient.apis.BiBer1992.java
@Override public List<SearchField> getSearchFields() throws IOException { List<SearchField> fields = new ArrayList<>(); HttpGet httpget;/* w w w . j a v a2 s . c o m*/ if (opacDir.contains("opax")) { httpget = new HttpGet(opacUrl + "/" + opacDir + "/de/qsel.html.S"); } else { httpget = new HttpGet(opacUrl + "/" + opacDir + "/de/qsel_main.S"); } HttpResponse response = http_client.execute(httpget); if (response.getStatusLine().getStatusCode() == 500) { throw new NotReachableException(response.getStatusLine().getReasonPhrase()); } String html = convertStreamToString(response.getEntity().getContent()); HttpUtils.consume(response.getEntity()); Document doc = Jsoup.parse(html); // get text fields Elements text_opts = doc.select("form select[name=REG1] option"); for (Element opt : text_opts) { TextSearchField field = new TextSearchField(); field.setId(opt.attr("value")); field.setDisplayName(opt.text()); field.setHint(""); fields.add(field); } // get media types Elements mt_opts = doc.select("form input[name~=(MT|MS)]"); if (mt_opts.size() > 0) { DropdownSearchField mtDropdown = new DropdownSearchField(); mtDropdown.setId(mt_opts.get(0).attr("name")); mtDropdown.setDisplayName("Medientyp"); for (Element opt : mt_opts) { if (!opt.val().equals("")) { String text = opt.text(); if (text.length() == 0) { // text is empty, check layouts: // Essen: <input name="MT"><img title="mediatype"> // Schaffenb: <input name="MT"><img alt="mediatype"> Element img = opt.nextElementSibling(); if (img != null && img.tagName().equals("img")) { text = img.attr("title"); if (text.equals("")) { text = img.attr("alt"); } } } if (text.length() == 0) { // text is still empty, check table layout, Example // Friedrichshafen // <td><input name="MT"></td> <td><img // title="mediatype"></td> Element td1 = opt.parent(); Element td2 = td1.nextElementSibling(); if (td2 != null) { Elements td2Children = td2.select("img[title]"); if (td2Children.size() > 0) { text = td2Children.get(0).attr("title"); } } } if 
(text.length() == 0) { // text is still empty, check images in label layout, Example // Wiedenst // <input type="radio" name="MT" id="MTYP1" value="MTYP1"> // <label for="MTYP1"><img src="http://www.wiedenest.de/bib/image/books // .png" alt="Bcher" title="Bcher"></label> Element label = opt.nextElementSibling(); if (label != null) { Elements td2Children = label.select("img[title]"); if (td2Children.size() > 0) { text = td2Children.get(0).attr("title"); } } } if (text.length() == 0) { // text is still empty: missing end tag like Offenburg text = parse_option_regex(opt); } mtDropdown.addDropdownValue(opt.val(), text); } } fields.add(mtDropdown); } // get branches Elements br_opts = doc.select("form select[name=ZW] option"); if (br_opts.size() > 0) { DropdownSearchField brDropdown = new DropdownSearchField(); brDropdown.setId(br_opts.get(0).parent().attr("name")); brDropdown.setDisplayName(br_opts.get(0).parent().parent().previousElementSibling().text() .replace("\u00a0", "").replace("?", "").trim()); for (Element opt : br_opts) { brDropdown.addDropdownValue(opt.val(), opt.text()); } fields.add(brDropdown); } return fields; }