List of usage examples for the org.jsoup.nodes.Element method parent()
@Override public final Element parent()
From source file:cn.edu.hfut.dmic.contentextractor.ContentExtractor.java
/** * ??:/*from w ww .ja v a 2 s. com*/ * 1. ??? * 2. ??????? * 3. ?? * 4. ?? ?? * 5. ? * * @return * @throws XpathSyntaxErrorException */ private String getAuthor() throws XpathSyntaxErrorException { String author = ""; if (StringUtils.isBlank(srcTime)) { author = getAuthor(doc.body().html()); return author; } Element cur = doc.body().select("*:containsOwn(" + srcTime + ")").first(); if (cur == null) { LOG.warn("?srcTime=" + srcTime); author = getAuthor(doc.body().html()); return author; } if (!noText(cur)) { String arr[] = cur.html().split(srcTime); for (String text : arr) { author = getShortText(text); if (!StringUtils.isBlank(author)) return author; } } Element parent = cur.parent(); while (parent != null && noText(parent)) { cur = parent; parent = parent.parent(); } author = getAuthor(parent.html()); if (!StringUtils.isBlank(author)) return author; Element pre = cur.previousElementSibling(); while (pre != null && noText(pre)) { pre = pre.previousElementSibling(); } if (pre != null) { author = getShortText(pre.text()); } if (!StringUtils.isBlank(author)) return author; Element next = cur.nextElementSibling(); while (next != null && noText(next)) { next = next.nextElementSibling(); } if (next != null) { author = getShortText(next.text()); } if (!StringUtils.isBlank(author)) return author; author = getShortText(parent.html().replace(srcTime, " ")); if (!StringUtils.isBlank(author)) return author; author = getAuthor(doc.body().html()); if (StringUtils.isBlank(author)) { return author_bak; } return author; }
From source file:com.jimplush.goose.ContentExtractor.java
/**
 * Collects the {@code <embed>} and {@code <object>} elements found below the
 * parent of {@code node} whose {@code src} attribute value contains
 * "youtube" or "vimeo".
 *
 * @param node the element whose parent subtree is searched
 * @return the matching elements (embeds first, then objects); empty when
 *         nothing matches or {@code node} has no parent, never {@code null}
 */
private ArrayList<Element> extractVideos(Element node) {
    ArrayList<Element> goodMovies = new ArrayList<Element>();

    // Check for a missing parent explicitly instead of relying on a caught NullPointerException.
    Element parent = (node == null) ? null : node.parent();
    if (parent == null) {
        if (logger.isDebugEnabled()) {
            logger.debug("extractVideos: node has no parent, nothing to search");
        }
        return goodMovies;
    }

    try {
        ArrayList<Element> candidates = new ArrayList<Element>();
        candidates.addAll(parent.getElementsByTag("embed"));
        candidates.addAll(parent.getElementsByTag("object"));

        if (logger.isDebugEnabled()) {
            logger.debug("extractVideos: Starting to extract videos. Found: " + candidates.size());
        }

        for (Element el : candidates) {
            for (Attribute a : el.attributes()) {
                try {
                    if (logger.isDebugEnabled()) {
                        logger.debug(a.getKey() + " : " + a.getValue());
                    }
                    boolean knownHost = a.getValue().contains("youtube") || a.getValue().contains("vimeo");
                    if (knownHost && a.getKey().equals("src")) {
                        if (logger.isDebugEnabled()) {
                            logger.debug("Found video... setting");
                            logger.debug("This page has a video!: " + a.getValue());
                        }
                        goodMovies.add(el);
                    }
                } catch (Exception e) {
                    // Best effort: a broken attribute must not abort the scan of the remaining ones.
                    logger.error(e.toString(), e);
                }
            }
        }
    } catch (Exception e) {
        // Best effort: video extraction failing should not fail the caller.
        logger.error(e.toString(), e);
    }

    if (logger.isDebugEnabled()) {
        logger.debug("extractVideos: done looking videos");
    }
    return goodMovies;
}
From source file:de.geeksfactory.opacclient.apis.BiBer1992.java
private String parse_option_regex(Element inputTag) { String optStr = inputTag.val(); String html = inputTag.parent().html(); String result = optStr;//from www.ja v a2s. c o m String regex1 = "value=\"" + optStr + "\".*?>([^<]+)"; String[] regexList = new String[] { regex1 }; for (String regex : regexList) { Pattern pattern = Pattern.compile(regex); Matcher matcher = pattern.matcher(html); if (matcher.find()) { result = matcher.group(1); result = result.replaceAll(" ", " ").trim(); break; } } return result; }
From source file:me.vertretungsplan.parser.UntisCommonParser.java
/**
 * Parses an Untis substitution schedule table
 *
 * <p>Two layouts are handled. When the "class in extra line" option is set in {@code data}, every
 * <code>td.inline_header</code> cell names a class and the rows following it (up to the next header row)
 * are read as substitutions of that class. Otherwise each matching row is read on its own and the affected
 * classes are derived from its "class" column (or from {@code defaultClass}).
 *
 * <p>Local names are German: "zeile" = row, "spalte" = column, "klasse(n)" = class(es).
 *
 * @param table        the <code>table</code> Element from the HTML document
 * @param data         {@link SubstitutionScheduleData#getData()}
 * @param day          the {@link SubstitutionScheduleDay} where the substitutions will be stored
 * @param defaultClass the class that should be set if there is no class column in the table
 * @throws JSONException              if the column configuration in {@code data} cannot be read (in the
 *                                    first layout such errors are caught and only printed)
 * @throws CredentialInvalidException declared; not thrown directly here, presumably raised by a helper
 *                                    (TODO confirm)
 */
private void parseSubstitutionScheduleTable(Element table, JSONObject data, SubstitutionScheduleDay day,
        String defaultClass) throws JSONException, CredentialInvalidException {
    if (data.optBoolean(PARAM_CLASS_IN_EXTRA_LINE) || data.optBoolean("class_in_extra_line")) { // backwards compatibility
        for (Element element : table.select("td.inline_header")) {
            String className = getClassName(element.text(), data);
            if (isValidClass(className)) {
                Element zeile = null;
                try {
                    // First row after the header row that contains this class name
                    zeile = element.parent().nextElementSibling();
                    // NOTE(review): jsoup's select() returns an empty Elements rather than null, so this
                    // branch looks unreachable - confirm the intent (probably "row has no cells").
                    if (zeile.select("td") == null) {
                        zeile = zeile.nextElementSibling();
                    }
                    int skipLines = 0;
                    // Walk the rows until the table ends or a cell with class "list inline_header" shows up
                    while (zeile != null && !zeile.select("td").attr("class").equals("list inline_header")) {
                        // Rows already merged into the previous substitution as continuation lines
                        if (skipLines > 0) {
                            skipLines--;
                            zeile = zeile.nextElementSibling();
                            continue;
                        }
                        Substitution v = new Substitution();
                        // i is the index into the configured column types
                        int i = 0;
                        for (Element spalte : zeile.select("td")) {
                            String text = spalte.text();
                            if (isEmpty(text)) {
                                i++;
                                continue;
                            }
                            int skipLinesForThisColumn = 0;
                            Element nextLine = zeile.nextElementSibling();
                            boolean continueSkippingLines = true;
                            // A following row with the same number of cells whose whole text equals the text
                            // of its cell in this column is treated as a continuation of this cell.
                            while (continueSkippingLines) {
                                if (nextLine != null && nextLine.children().size() == zeile.children().size()) {
                                    Element columnInNextLine = nextLine.child(spalte.elementSiblingIndex());
                                    if (columnInNextLine.text().replaceAll("\u00A0", "").trim()
                                            .equals(nextLine.text().replaceAll("\u00A0", "").trim())) {
                                        // Continued in the next line
                                        text += " " + columnInNextLine.text();
                                        skipLinesForThisColumn++;
                                        nextLine = nextLine.nextElementSibling();
                                    } else {
                                        continueSkippingLines = false;
                                    }
                                } else {
                                    continueSkippingLines = false;
                                }
                            }
                            // Remember the longest continuation found in any column of this row
                            if (skipLinesForThisColumn > skipLines) skipLines = skipLinesForThisColumn;
                            String type = data.getJSONArray(PARAM_COLUMNS).getString(i);
                            switch (type) {
                            case "lesson":
                                v.setLesson(text);
                                break;
                            case "subject":
                                handleSubject(v, spalte);
                                break;
                            case "previousSubject":
                                v.setPreviousSubject(text);
                                break;
                            case "type":
                                v.setType(text);
                                v.setColor(colorProvider.getColor(text));
                                break;
                            case "type-entfall":
                                // "x" marks a cancelled lesson ("Entfall"); anything else is a substitution
                                if (text.equals("x")) {
                                    v.setType("Entfall");
                                    v.setColor(colorProvider.getColor("Entfall"));
                                } else {
                                    v.setType("Vertretung");
                                    v.setColor(colorProvider.getColor("Vertretung"));
                                }
                                break;
                            case "room":
                                handleRoom(v, spalte);
                                break;
                            case "teacher":
                                handleTeacher(v, spalte, data);
                                break;
                            case "previousTeacher":
                                v.setPreviousTeachers(splitTeachers(text, data));
                                break;
                            case "desc":
                                v.setDesc(text);
                                break;
                            case "desc-type":
                                v.setDesc(text);
                                String recognizedType = recognizeType(text);
                                v.setType(recognizedType);
                                v.setColor(colorProvider.getColor(recognizedType));
                                break;
                            case "previousRoom":
                                v.setPreviousRoom(text);
                                break;
                            case "substitutionFrom":
                                v.setSubstitutionFrom(text);
                                break;
                            case "teacherTo":
                                v.setTeacherTo(text);
                                break;
                            case "ignore":
                                break;
                            case "date": // used by UntisSubstitutionParser
                                break;
                            default:
                                throw new IllegalArgumentException("Unknown column type: " + type);
                            }
                            i++;
                        }
                        autoDetectType(data, zeile, v);
                        v.getClasses().add(className);
                        // Rows without a lesson are not stored
                        if (v.getLesson() != null && !v.getLesson().equals("")) {
                            day.addSubstitution(v);
                        }
                        zeile = zeile.nextElementSibling();
                    }
                } catch (Throwable e) {
                    // Any failure inside one class section is only printed; the loop goes on with the
                    // next header cell.
                    e.printStackTrace();
                }
            }
        }
    } else {
        // Does the column configuration contain an explicit "type" column?
        boolean hasType = false;
        for (int i = 0; i < data.getJSONArray(PARAM_COLUMNS).length(); i++) {
            if (data.getJSONArray(PARAM_COLUMNS).getString(i).equals("type")) {
                hasType = true;
            }
        }
        int skipLines = 0;
        for (Element zeile : table.select("tr.list.odd:not(:has(td.inline_header)), "
                + "tr.list.even:not(:has(td.inline_header)), " + "tr:has(td[align=center]):gt(0)")) {
            // Rows already merged into the previous substitution as continuation lines
            if (skipLines > 0) {
                skipLines--;
                continue;
            }
            Substitution v = new Substitution();
            String klassen = defaultClass != null ? defaultClass : "";
            // i is the index into the configured column types
            int i = 0;
            for (Element spalte : zeile.select("td")) {
                String text = spalte.text();
                String type = data.getJSONArray(PARAM_COLUMNS).getString(i);
                // Empty cells are skipped, except in "type-entfall" columns where an empty cell is meaningful
                if (isEmpty(text) && !type.equals("type-entfall")) {
                    i++;
                    continue;
                }
                int skipLinesForThisColumn = 0;
                Element nextLine = zeile.nextElementSibling();
                boolean continueSkippingLines = true;
                // Same continuation-line detection as in the first layout
                while (continueSkippingLines) {
                    if (nextLine != null && nextLine.children().size() == zeile.children().size()) {
                        Element columnInNextLine = nextLine.child(spalte.elementSiblingIndex());
                        if (columnInNextLine.text().replaceAll("\u00A0", "").trim()
                                .equals(nextLine.text().replaceAll("\u00A0", "").trim())) {
                            // Continued in the next line
                            text += " " + columnInNextLine.text();
                            skipLinesForThisColumn++;
                            nextLine = nextLine.nextElementSibling();
                        } else {
                            continueSkippingLines = false;
                        }
                    } else {
                        continueSkippingLines = false;
                    }
                }
                if (skipLinesForThisColumn > skipLines) skipLines = skipLinesForThisColumn;
                switch (type) {
                case "lesson":
                    v.setLesson(text);
                    break;
                case "subject":
                    handleSubject(v, spalte);
                    break;
                case "previousSubject":
                    v.setPreviousSubject(text);
                    break;
                case "type":
                    v.setType(text);
                    v.setColor(colorProvider.getColor(text));
                    break;
                case "type-entfall":
                    // "x" marks a cancelled lesson; otherwise "Vertretung" is only assumed when no
                    // explicit "type" column is configured
                    if (text.equals("x")) {
                        v.setType("Entfall");
                        v.setColor(colorProvider.getColor("Entfall"));
                    } else if (!hasType) {
                        v.setType("Vertretung");
                        v.setColor(colorProvider.getColor("Vertretung"));
                    }
                    break;
                case "room":
                    handleRoom(v, spalte);
                    break;
                case "previousRoom":
                    v.setPreviousRoom(text);
                    break;
                case "desc":
                    v.setDesc(text);
                    break;
                case "desc-type":
                    v.setDesc(text);
                    String recognizedType = recognizeType(text);
                    v.setType(recognizedType);
                    v.setColor(colorProvider.getColor(recognizedType));
                    break;
                case "teacher":
                    handleTeacher(v, spalte, data);
                    break;
                case "previousTeacher":
                    v.setPreviousTeachers(splitTeachers(text, data));
                    break;
                case "substitutionFrom":
                    v.setSubstitutionFrom(text);
                    break;
                case "teacherTo":
                    v.setTeacherTo(text);
                    break;
                case "class":
                    klassen = getClassName(text, data);
                    break;
                case "ignore":
                    break;
                case "date": // used by UntisSubstitutionParser
                    break;
                default:
                    throw new IllegalArgumentException("Unknown column type: " + type);
                }
                i++;
            }

            // Rows without a lesson are not stored
            if (v.getLesson() == null || v.getLesson().equals("")) {
                continue;
            }

            autoDetectType(data, zeile, v);

            List<String> affectedClasses;

            // Detect things like "7"
            Pattern singlePattern = Pattern.compile("(\\d+)");
            Matcher singleMatcher = singlePattern.matcher(klassen);

            // Detect things like "5-12"
            Pattern rangePattern = Pattern.compile("(\\d+) ?- ?(\\d+)");
            Matcher rangeMatcher = rangePattern.matcher(klassen);

            // Extracts the leading number of a class name
            Pattern pattern2 = Pattern.compile("^(\\d+).*");

            if (rangeMatcher.matches()) {
                // A range: every known class whose leading number lies within [min, max]
                affectedClasses = new ArrayList<>();
                int min = Integer.parseInt(rangeMatcher.group(1));
                int max = Integer.parseInt(rangeMatcher.group(2));
                try {
                    for (String klasse : getAllClasses()) {
                        Matcher matcher2 = pattern2.matcher(klasse);
                        if (matcher2.matches()) {
                            int num = Integer.parseInt(matcher2.group(1));
                            if (min <= num && num <= max) affectedClasses.add(klasse);
                        }
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
            } else if (singleMatcher.matches()) {
                // A single number: every known class whose leading number equals it
                affectedClasses = new ArrayList<>();
                int grade = Integer.parseInt(singleMatcher.group(1));
                try {
                    for (String klasse : getAllClasses()) {
                        Matcher matcher2 = pattern2.matcher(klasse);
                        if (matcher2.matches() && grade == Integer.parseInt(matcher2.group(1))) {
                            affectedClasses.add(klasse);
                        }
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
            } else {
                if (data.optBoolean(PARAM_CLASSES_SEPARATED, true)
                        && data.optBoolean("classes_separated", true)) { // backwards compatibility
                    affectedClasses = Arrays.asList(klassen.split(", "));
                } else {
                    affectedClasses = new ArrayList<>();
                    try {
                        for (String klasse : getAllClasses()) { // TODO: is there a better way?
                            // Builds a pattern from the class name with ".*" appended after every
                            // character; only '?' is escaped, other characters are used as-is.
                            StringBuilder regex = new StringBuilder();
                            for (char character : klasse.toCharArray()) {
                                if (character == '?') {
                                    regex.append("\\?");
                                } else {
                                    regex.append(character);
                                }
                                regex.append(".*");
                            }
                            if (klassen.matches(regex.toString())) {
                                affectedClasses.add(klasse);
                            }
                        }
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }

            for (String klasse : affectedClasses) {
                if (isValidClass(klasse)) {
                    v.getClasses().add(klasse);
                }
            }

            if (data.optBoolean(PARAM_MERGE_WITH_DIFFERENT_TYPE, false)) {
                // Do not add a second entry that differs from an existing one only in its type; if the
                // new one is a "Vertretung", the existing entry takes over that type instead.
                boolean found = false;
                for (Substitution subst : day.getSubstitutions()) {
                    if (subst.equalsExcludingType(v)) {
                        found = true;
                        if (v.getType().equals("Vertretung")) {
                            subst.setType("Vertretung");
                            subst.setColor(colorProvider.getColor("Vertretung"));
                        }
                        break;
                    }
                }
                if (!found) {
                    day.addSubstitution(v);
                }
            } else {
                day.addSubstitution(v);
            }
        }
    }
}
From source file:org.confab.PhpBB3Parser.java
public List<Forum> parseForums(Document root, BulletinBoard parent) { Utilities.debug("parseForums"); List<Forum> ret = new ArrayList<Forum>(); // get table/*from w w w. j a va 2 s .c om*/ Elements forum_tables = root.select("ul[class=topiclist forums]"); assert !forum_tables.isEmpty() : root.html(); for (Element forum_table : forum_tables) { Elements els_li = forum_table.select("li.row"); assert !els_li.isEmpty(); for (Element el_li : els_li) { Forum new_forum = new Forum(parent); // Get the forum url Elements els_a = el_li.select("a.forumtitle"); Element el_a = els_a.first(); assert el_a != null; new_forum.url = el_a.attr("href"); assert new_forum.url != null; Utilities.debug("new_forum.url : " + new_forum.url); // Get the title text new_forum.title = el_a.text(); assert new_forum.title != null; Utilities.debug("new_forum.title : " + new_forum.title); // Check for any subforums in remaining a elements els_a.remove(els_a.first()); for (Element _el_a : els_a) { Forum sub_forum = new Forum(parent); sub_forum.url = el_a.attr("href"); assert sub_forum.url != null; sub_forum.title = el_a.text(); assert sub_forum.title != null; new_forum.subForums.add(sub_forum); Utilities.debug("added subForum: " + sub_forum.title); } // Get the description/message of this topic String el_description = el_a.parent().text(); if (el_description != null) { new_forum.description = el_description; } else { new_forum.description = ""; } Utilities.debug("new_forum.description : " + new_forum.description); Utilities.debug("new_forum.parent.url : " + new_forum.parent.url); ret.add(new_forum); Utilities.debug("-----"); } } Utilities.debug("end parseForums"); return ret; }
From source file:de.geeksfactory.opacclient.apis.Zones.java
@Override public List<SearchField> getSearchFields() throws IOException { if (!initialised) start();//from w w w. ja v a2 s . c om List<SearchField> fields = new ArrayList<>(); String html = httpGet(opac_url + "/APS_ZONES?fn=AdvancedSearch&Style=Portal3&SubStyle=&Lang=GER" + "&ResponseEncoding=utf-8", getDefaultEncoding()); Document doc = Jsoup.parse(html); // find text fields Elements txt_opts = doc.select("#formSelectTerm_1 option"); for (Element opt : txt_opts) { TextSearchField field = new TextSearchField(); field.setId(opt.attr("value")); field.setHint(""); field.setDisplayName(opt.text()); fields.add(field); } // find filters String filtersQuery = version18 ? ".inSearchLimits .floatingBox" : ".TabRechAv .limitBlock"; Elements filters = doc.select(filtersQuery); int i = 0; for (Element filter : filters) { DropdownSearchField dropdown = new DropdownSearchField(); dropdown.addDropdownValue("", "Alle"); // All dropdowns use "q.limits.limit" as URL param, but they must not have the same ID dropdown.setId("dropdown_" + i); if (version18) { dropdown.setDisplayName(filter.select("tr").get(0).text().trim()); Elements opts = filter.select("tr").get(1).select("table td:has(input)"); for (Element opt : opts) { dropdown.addDropdownValue(opt.select("input").attr("value"), opt.text().trim()); } } else { dropdown.setDisplayName(filter.parent().previousElementSibling().text().trim()); Elements opts = filter.select(".limitChoice label"); for (Element opt : opts) { dropdown.addDropdownValue(opt.attr("for"), opt.text().trim()); } } fields.add(dropdown); i++; } return fields; }
From source file:de.geeksfactory.opacclient.apis.TouchPoint.java
private void parseDropdown(Element dropdownElement, List<SearchField> fields) { Elements options = dropdownElement.select("option"); DropdownSearchField dropdown = new DropdownSearchField(); dropdown.setId(dropdownElement.attr("name")); // Some fields make no sense or are not supported in the app if (dropdown.getId().equals("numberOfHits") || dropdown.getId().equals("timeOut") || dropdown.getId().equals("rememberList")) { return;//from www . jav a 2 s . c o m } for (Element option : options) { dropdown.addDropdownValue(option.attr("value"), option.text()); } dropdown.setDisplayName(dropdownElement.parent().select("label").text()); fields.add(dropdown); }
From source file:de.geeksfactory.opacclient.apis.SISIS.java
private void parseDropdown(Element dropdownElement, List<SearchField> fields) throws JSONException { Elements options = dropdownElement.select("option"); DropdownSearchField dropdown = new DropdownSearchField(); if (dropdownElement.parent().select("input[type=hidden]").size() > 0) { dropdown.setId(dropdownElement.parent().select("input[type=hidden]").attr("value")); dropdown.setData(new JSONObject("{\"restriction\": true}")); } else {//from ww w .j ava 2s . com dropdown.setId(dropdownElement.attr("name")); dropdown.setData(new JSONObject("{\"restriction\": false}")); } for (Element option : options) { dropdown.addDropdownValue(option.attr("value"), option.text()); } dropdown.setDisplayName(dropdownElement.parent().select("label").text()); fields.add(dropdown); }
From source file:de.geeksfactory.opacclient.apis.BiBer1992.java
@Override public List<SearchField> getSearchFields() throws IOException { List<SearchField> fields = new ArrayList<>(); HttpGet httpget;/*from ww w.ja v a 2 s . co m*/ if (opacDir.contains("opax")) { httpget = new HttpGet(opacUrl + "/" + opacDir + "/de/qsel.html.S"); } else { httpget = new HttpGet(opacUrl + "/" + opacDir + "/de/qsel_main.S"); } HttpResponse response = http_client.execute(httpget); if (response.getStatusLine().getStatusCode() == 500) { throw new NotReachableException(response.getStatusLine().getReasonPhrase()); } String html = convertStreamToString(response.getEntity().getContent()); HttpUtils.consume(response.getEntity()); Document doc = Jsoup.parse(html); // get text fields Elements text_opts = doc.select("form select[name=REG1] option"); for (Element opt : text_opts) { TextSearchField field = new TextSearchField(); field.setId(opt.attr("value")); field.setDisplayName(opt.text()); field.setHint(""); fields.add(field); } // get media types Elements mt_opts = doc.select("form input[name~=(MT|MS)]"); if (mt_opts.size() > 0) { DropdownSearchField mtDropdown = new DropdownSearchField(); mtDropdown.setId(mt_opts.get(0).attr("name")); mtDropdown.setDisplayName("Medientyp"); for (Element opt : mt_opts) { if (!opt.val().equals("")) { String text = opt.text(); if (text.length() == 0) { // text is empty, check layouts: // Essen: <input name="MT"><img title="mediatype"> // Schaffenb: <input name="MT"><img alt="mediatype"> Element img = opt.nextElementSibling(); if (img != null && img.tagName().equals("img")) { text = img.attr("title"); if (text.equals("")) { text = img.attr("alt"); } } } if (text.length() == 0) { // text is still empty, check table layout, Example // Friedrichshafen // <td><input name="MT"></td> <td><img // title="mediatype"></td> Element td1 = opt.parent(); Element td2 = td1.nextElementSibling(); if (td2 != null) { Elements td2Children = td2.select("img[title]"); if (td2Children.size() > 0) { text = td2Children.get(0).attr("title"); } } } if 
(text.length() == 0) { // text is still empty, check images in label layout, Example // Wiedenst // <input type="radio" name="MT" id="MTYP1" value="MTYP1"> // <label for="MTYP1"><img src="http://www.wiedenest.de/bib/image/books // .png" alt="Bcher" title="Bcher"></label> Element label = opt.nextElementSibling(); if (label != null) { Elements td2Children = label.select("img[title]"); if (td2Children.size() > 0) { text = td2Children.get(0).attr("title"); } } } if (text.length() == 0) { // text is still empty: missing end tag like Offenburg text = parse_option_regex(opt); } mtDropdown.addDropdownValue(opt.val(), text); } } fields.add(mtDropdown); } // get branches Elements br_opts = doc.select("form select[name=ZW] option"); if (br_opts.size() > 0) { DropdownSearchField brDropdown = new DropdownSearchField(); brDropdown.setId(br_opts.get(0).parent().attr("name")); brDropdown.setDisplayName(br_opts.get(0).parent().parent().previousElementSibling().text() .replace("\u00a0", "").replace("?", "").trim()); for (Element opt : br_opts) { brDropdown.addDropdownValue(opt.val(), opt.text()); } fields.add(brDropdown); } return fields; }
From source file:com.jimplush.goose.ContentExtractor.java
/** * we're going to start looking for where the clusters of paragraphs are. We'll score a cluster based on the number of stopwords * and the number of consecutive paragraphs together, which should form the cluster of text that this node is around * also store on how high up the paragraphs are, comments are usually at the bottom and should get a lower score * * @return/*from ww w .j a v a 2s . co m*/ */ private Element calculateBestNodeBasedOnClustering(Document doc) { Element topNode = null; // grab all the paragraph elements on the page to start to inspect the likely hood of them being good peeps ArrayList<Element> nodesToCheck = getNodesToCheck(doc); double startingBoost = 1.0; int cnt = 0; int i = 0; // holds all the parents of the nodes we're checking Set<Element> parentNodes = new HashSet<Element>(); ArrayList<Element> nodesWithText = new ArrayList<Element>(); for (Element node : nodesToCheck) { String nodeText = node.text(); WordStats wordStats = StopWords.getStopWordCount(nodeText); boolean highLinkDensity = isHighLinkDensity(node); if (wordStats.getStopWordCount() > 2 && !highLinkDensity) { nodesWithText.add(node); } } int numberOfNodes = nodesWithText.size(); int negativeScoring = 0; // we shouldn't give more negatives than positives // we want to give the last 20% of nodes negative scores in case they're comments double bottomNodesForNegativeScore = (float) numberOfNodes * 0.25; if (logger.isDebugEnabled()) { logger.debug("About to inspect num of nodes with text: " + numberOfNodes); } for (Element node : nodesWithText) { // add parents and grandparents to scoring // only add boost to the middle paragraphs, top and bottom is usually jankz city // so basically what we're doing is giving boost scores to paragraphs that appear higher up in the dom // and giving lower, even negative scores to those who appear lower which could be commenty stuff float boostScore = 0; if (isOkToBoost(node)) { if (cnt >= 0) { boostScore = (float) ((1.0 / startingBoost) * 50); 
startingBoost++; } } // check for negative node values if (numberOfNodes > 15) { if ((numberOfNodes - i) <= bottomNodesForNegativeScore) { float booster = (float) bottomNodesForNegativeScore - (float) (numberOfNodes - i); boostScore = -(float) Math.pow(booster, (float) 2); // we don't want to score too highly on the negative side. float negscore = Math.abs(boostScore) + negativeScoring; if (negscore > 40) { boostScore = 5; } } } if (logger.isDebugEnabled()) { logger.debug("Location Boost Score: " + boostScore + " on interation: " + i + "' id='" + node.parent().id() + "' class='" + node.parent().attr("class")); } String nodeText = node.text(); WordStats wordStats = StopWords.getStopWordCount(nodeText); int upscore = (int) (wordStats.getStopWordCount() + boostScore); updateScore(node.parent(), upscore); updateScore(node.parent().parent(), upscore / 2); updateNodeCount(node.parent(), 1); updateNodeCount(node.parent().parent(), 1); if (!parentNodes.contains(node.parent())) { parentNodes.add(node.parent()); } if (!parentNodes.contains(node.parent().parent())) { parentNodes.add(node.parent().parent()); } cnt++; i++; } // now let's find the parent node who scored the highest int topNodeScore = 0; for (Element e : parentNodes) { if (logger.isDebugEnabled()) { logger.debug("ParentNode: score='" + e.attr("gravityScore") + "' nodeCount='" + e.attr("gravityNodes") + "' id='" + e.id() + "' class='" + e.attr("class") + "' "); } //int score = Integer.parseInt(e.attr("gravityScore")) * Integer.parseInt(e.attr("gravityNodes")); int score = getScore(e); if (score > topNodeScore) { topNode = e; topNodeScore = score; } if (topNode == null) { topNode = e; } } if (logger.isDebugEnabled()) { if (topNode == null) { logger.debug("ARTICLE NOT ABLE TO BE EXTRACTED!, WE HAZ FAILED YOU LORD VADAR"); } else { String logText; String targetText = ""; Element topPara = topNode.getElementsByTag("p").first(); if (topPara == null) { topNode.text(); } else { topPara.text(); } if (targetText.length() 
>= 51) { logText = targetText.substring(0, 50); } else { logText = targetText; } logger.debug("TOPNODE TEXT: " + logText.trim()); logger.debug("Our TOPNODE: score='" + topNode.attr("gravityScore") + "' nodeCount='" + topNode.attr("gravityNodes") + "' id='" + topNode.id() + "' class='" + topNode.attr("class") + "' "); } } return topNode; }