List of usage examples for the org.jsoup.nodes.Element method id()
public String id()
From source file:com.jimplush.goose.ContentExtractor.java
/**
 * Formats the scoring-related attributes of an element into a single debug string.
 *
 * <p>The string contains, in order, the {@code gravityScore} attribute, the
 * {@code gravityNodes} attribute, the element id and the {@code class} attribute.
 * Note that the final value (the class name) is emitted without a closing quote.
 *
 * @param e the element to describe
 * @return the debug description of {@code e}
 */
private String debugNode(Element e) {
    return "GravityScore: '" + e.attr("gravityScore")
            + "' paraNodeCount: '" + e.attr("gravityNodes")
            + "' nodeId: '" + e.id()
            + "' className: '" + e.attr("class");
}
From source file:com.jimplush.goose.ContentExtractor.java
/** * we're going to start looking for where the clusters of paragraphs are. We'll score a cluster based on the number of stopwords * and the number of consecutive paragraphs together, which should form the cluster of text that this node is around * also store on how high up the paragraphs are, comments are usually at the bottom and should get a lower score * * @return//from w w w . j ava 2 s .co m */ private Element calculateBestNodeBasedOnClustering(Document doc) { Element topNode = null; // grab all the paragraph elements on the page to start to inspect the likely hood of them being good peeps ArrayList<Element> nodesToCheck = getNodesToCheck(doc); double startingBoost = 1.0; int cnt = 0; int i = 0; // holds all the parents of the nodes we're checking Set<Element> parentNodes = new HashSet<Element>(); ArrayList<Element> nodesWithText = new ArrayList<Element>(); for (Element node : nodesToCheck) { String nodeText = node.text(); WordStats wordStats = StopWords.getStopWordCount(nodeText); boolean highLinkDensity = isHighLinkDensity(node); if (wordStats.getStopWordCount() > 2 && !highLinkDensity) { nodesWithText.add(node); } } int numberOfNodes = nodesWithText.size(); int negativeScoring = 0; // we shouldn't give more negatives than positives // we want to give the last 20% of nodes negative scores in case they're comments double bottomNodesForNegativeScore = (float) numberOfNodes * 0.25; if (logger.isDebugEnabled()) { logger.debug("About to inspect num of nodes with text: " + numberOfNodes); } for (Element node : nodesWithText) { // add parents and grandparents to scoring // only add boost to the middle paragraphs, top and bottom is usually jankz city // so basically what we're doing is giving boost scores to paragraphs that appear higher up in the dom // and giving lower, even negative scores to those who appear lower which could be commenty stuff float boostScore = 0; if (isOkToBoost(node)) { if (cnt >= 0) { boostScore = (float) ((1.0 / startingBoost) * 50); 
startingBoost++; } } // check for negative node values if (numberOfNodes > 15) { if ((numberOfNodes - i) <= bottomNodesForNegativeScore) { float booster = (float) bottomNodesForNegativeScore - (float) (numberOfNodes - i); boostScore = -(float) Math.pow(booster, (float) 2); // we don't want to score too highly on the negative side. float negscore = Math.abs(boostScore) + negativeScoring; if (negscore > 40) { boostScore = 5; } } } if (logger.isDebugEnabled()) { logger.debug("Location Boost Score: " + boostScore + " on interation: " + i + "' id='" + node.parent().id() + "' class='" + node.parent().attr("class")); } String nodeText = node.text(); WordStats wordStats = StopWords.getStopWordCount(nodeText); int upscore = (int) (wordStats.getStopWordCount() + boostScore); updateScore(node.parent(), upscore); updateScore(node.parent().parent(), upscore / 2); updateNodeCount(node.parent(), 1); updateNodeCount(node.parent().parent(), 1); if (!parentNodes.contains(node.parent())) { parentNodes.add(node.parent()); } if (!parentNodes.contains(node.parent().parent())) { parentNodes.add(node.parent().parent()); } cnt++; i++; } // now let's find the parent node who scored the highest int topNodeScore = 0; for (Element e : parentNodes) { if (logger.isDebugEnabled()) { logger.debug("ParentNode: score='" + e.attr("gravityScore") + "' nodeCount='" + e.attr("gravityNodes") + "' id='" + e.id() + "' class='" + e.attr("class") + "' "); } //int score = Integer.parseInt(e.attr("gravityScore")) * Integer.parseInt(e.attr("gravityNodes")); int score = getScore(e); if (score > topNodeScore) { topNode = e; topNodeScore = score; } if (topNode == null) { topNode = e; } } if (logger.isDebugEnabled()) { if (topNode == null) { logger.debug("ARTICLE NOT ABLE TO BE EXTRACTED!, WE HAZ FAILED YOU LORD VADAR"); } else { String logText; String targetText = ""; Element topPara = topNode.getElementsByTag("p").first(); if (topPara == null) { topNode.text(); } else { topPara.text(); } if (targetText.length() 
>= 51) { logText = targetText.substring(0, 50); } else { logText = targetText; } logger.debug("TOPNODE TEXT: " + logText.trim()); logger.debug("Our TOPNODE: score='" + topNode.attr("gravityScore") + "' nodeCount='" + topNode.attr("gravityNodes") + "' id='" + topNode.id() + "' class='" + topNode.attr("class") + "' "); } } return topNode; }
From source file:cc.metapro.openct.custom.CustomPresenter.java
/**
 * Installs a click callback on the given web view.
 *
 * <p>Clicking a password input fills it with {@code mPassword}. Clicking a text
 * input opens a {@code ClickDialog}; depending on the type reported by the dialog
 * the input is either focused or filled with {@code mUsername}. In every case the
 * element is addressed by id first and by its {@code name} attribute when the
 * by-id call returns {@code false}.
 *
 * @param webView the web view to attach the callback to
 * @param manager the fragment manager used to show the dialog
 */
@Override
public void setWebView(final InteractiveWebView webView, final FragmentManager manager) {
    webView.setUserClickCallback(new InteractiveWebView.ClickCallback() {
        @Override
        public void onClick(@NonNull final Element element) {
            if (HTMLUtils.isPasswordInput(element)) {
                boolean filledById = webView.setById(element.id(), "value", mPassword);
                if (!filledById) {
                    webView.setByName(element.attr("name"), "value", mPassword);
                }
                return;
            }

            if (!HTMLUtils.isTextInput(element)) {
                return;
            }

            ClickDialog.TypeCallback typeCallback = new ClickDialog.TypeCallback() {
                @Override
                public void onResult(String type) {
                    switch (type) {
                        case InteractiveWebView.COMMON_INPUT_FLAG: {
                            boolean focusedById = webView.focusById(element.id());
                            if (!focusedById) {
                                webView.focusByName(element.attr("name"));
                            }
                            break;
                        }
                        case InteractiveWebView.USERNAME_INPUT_FLAG: {
                            boolean filledById = webView.setById(element.id(), "value", mUsername);
                            if (!filledById) {
                                webView.setByName(element.attr("name"), "value", mUsername);
                            }
                            break;
                        }
                    }
                }
            };
            ClickDialog.newInstance(typeCallback).show(manager, "click_dialog");
        }
    });
}
From source file:org.asqatasun.ruleimplementation.AbstractMarkerPageRuleImplementation.java
/** * To sort marker elements, we extract for each of them the value of the * "id" attribute the value of the "class" attribute and the value of the * "role" attribute. If one of these three values belongs to the marker * value list set by the user, we consider that the element is characterised * and we add it to the "elementMarkerList". * * @param nodeList//from ww w . j av a2 s . c o m */ private void sortMarkerElements() { if ((CollectionUtils.isEmpty(markerList) && CollectionUtils.isEmpty(inverseMarkerList)) || selectionWithoutMarkerHandler.isEmpty()) { return; } Iterator<Element> iter = selectionWithoutMarkerHandler.get().iterator(); Element el; while (iter.hasNext()) { el = iter.next(); String id = el.id(); Collection<String> classNames = el.classNames(); String role = el.attr(ROLE_ATTR); // if the element does contain an "id" OR a "class" attribute OR // a "role" attribute AND one the values belongs to the marker list, // it is removed from the global selection and added to the // marker element selection. if (StringUtils.isNotBlank(id) || CollectionUtils.isNotEmpty(classNames) || StringUtils.isNotBlank(role)) { if (checkAttributeBelongsToMarkerList(id, classNames, role, markerList)) { selectionWithMarkerHandler.add(el); iter.remove(); } // if the element belongs to the inverse marker list, it is // removed from the global collection if (checkAttributeBelongsToMarkerList(id, classNames, role, inverseMarkerList)) { iter.remove(); } } } }
From source file:org.asqatasun.rules.accessiweb22.Aw22Rule11013.java
/**
 * Checks whether the value of the "for" attribute of a label matches, ignoring
 * case, the id of any element returned by applying
 * {@code FORM_ELEMENT_CSS_LIKE_QUERY} to the children of the given element.
 *
 * @param element the element whose children are searched
 * @param forAttributeValue the value of the "for" attribute of the label
 * @return {@code true} when a matching id is found; {@code false} otherwise,
 *         including when {@code forAttributeValue} is blank
 */
private boolean isForAttributeOfLabelEqualsToIdAttributeOfFormField(Element element, String forAttributeValue) {
    boolean matchFound = false;
    if (!StringUtils.isBlank(forAttributeValue)) {
        for (Element formField : element.children().select(FORM_ELEMENT_CSS_LIKE_QUERY)) {
            if (StringUtils.equalsIgnoreCase(forAttributeValue, formField.id())) {
                matchFound = true;
                break;
            }
        }
    }
    return matchFound;
}
From source file:org.asqatasun.rules.doc.utils.exportdomtocsv.ExportDomToCsv.java
/**
 * Exports the tests of the referential document to a csv file.
 *
 * <p>Before using it please set the FOLDER variable with the path where you
 * want to create your csv file.
 *
 * <p>One line is written per test element (id starting with "test-"), made of
 * the theme index, theme, criterion code, criterion label, test code, test
 * label and level concatenated in that order.
 *
 * @param args unused
 * @throws IOException when the csv file cannot be written
 */
public static void main(String[] args) throws IOException {
    File ref = FileUtils.getFile(FOLDER);
    JsoupFunc jsf = new JsoupFunc();
    Document doc = jsf.getDocument();
    Elements thematiques = doc.select("div.thematique");
    StringBuilder sb = new StringBuilder();

    // compiled once: the pattern does not depend on the element being processed
    Pattern digitPattern = Pattern.compile("\\d+\\.\\d+\\.\\d+\\s?\\:?\\s?");

    // keeps its previous value when the text of a test does not match the pattern
    String testLabel = "";

    for (int i = 2; i < thematiques.size(); i++) {
        String themeIndex = String.valueOf(i - 1);
        String theme = thematiques.get(i).child(0).text();
        Elements criteres = thematiques.get(i).select("h3");

        for (int j = 1; j < criteres.size(); j++) {
            Element critereElement = criteres.get(j);

            // the level is the text found between the first '[' and the first ']' of the h3 markup
            String critereH3String = critereElement.toString();
            String level = critereH3String.substring(critereH3String.indexOf("[") + 1,
                    critereH3String.indexOf("]"));

            Elements tests = critereElement.nextElementSibling().select("[id^=test-]");

            String critere = extractCodeFromId(critereElement.id(), 10);

            String[] critereArray = critereElement.text().split("] ");
            String critereLabel = critereArray[1];

            for (Element el : tests) {
                Matcher matcher = digitPattern.matcher(el.text());
                if (matcher.find()) {
                    String testLabelReplace = el.html()
                            .replace("index.php", "http://www.accessiweb.org/index.php")
                            .replace("\n", "");
                    testLabel = testLabelReplace.substring(matcher.end(), testLabelReplace.length());
                }

                String testCode = extractCodeFromId(el.id(), 12);

                sb.append(themeIndex).append(theme).append(critere).append(critereLabel)
                        .append(testCode).append(testLabel).append(level).append("\n");
            }
        }
    }
    FileUtils.writeStringToFile(ref, sb.toString());
}

/**
 * Returns the part of an id that follows its 5-character prefix, cut at
 * {@code maxEnd} characters at most and accepting ids up to two characters
 * shorter than {@code maxEnd}.
 *
 * <p>Replaces the former cascade of catch blocks on
 * {@link StringIndexOutOfBoundsException} while keeping the same outcome,
 * including the exception for ids that are too short.
 *
 * @param id the element id
 * @param maxEnd the exclusive end index used when the id is long enough
 * @return the extracted code
 * @throws StringIndexOutOfBoundsException when the id has fewer than
 *         {@code maxEnd - 2} characters
 */
private static String extractCodeFromId(String id, int maxEnd) {
    int minEnd = maxEnd - 2;
    if (id.length() < minEnd) {
        throw new StringIndexOutOfBoundsException(
                "id '" + id + "' is shorter than " + minEnd + " characters");
    }
    return id.substring(5, Math.min(id.length(), maxEnd));
}
From source file:org.asqatasun.rules.doc.utils.rga33.extractor.Rgaa3Extractor.java
/** * @param args the command line arguments * @throws java.io.IOException//from w w w . j a v a2s .co m */ public static void main(String[] args) throws IOException { // Get content from websites Document rgaa3Doc = Jsoup.parse(new URL(RGAA3_REF_URL), 10000); Document aw22Doc = Jsoup.parse(new URL(AW22_URL), 10000); extractLevelFromCriterionAndWrite(rgaa3Doc); extractRuleInfo(rgaa3Doc); // Extract rules from accessiweb 2.2 for (Element el : aw22Doc.select(TEST_SELECTOR)) { if (StringUtils.isNotBlank(el.text())) { AW22.put(extractTestFromId(el.id()), el.text()); } } // generateMysql(); compareReferentials(); // // getRuleI18nKeys(); // checkClassesExist(); // updateTestcasesWithRuleTitle(); // generateMkdoc(); // createTestcaseFiles(); }
From source file:org.asqatasun.rules.doc.utils.rga33.extractor.Rgaa3Extractor.java
private static void extractLevelFromCriterionAndWrite(Document doc) throws IOException { StringBuilder crit = new StringBuilder(); for (Element el : doc.select(CRITERION_SELECTOR)) { if (StringUtils.isNotBlank(el.id())) { crit.append(el.id().replace("crit", "Rgaa30")); crit.append("="); String content = el.html(); content = content.substring(content.indexOf("] ") + 1); content = extractRuleContent(content); crit.append(content);//from ww w .ja v a 2s . c om crit.append("\n"); String level = el.text().substring(el.text().indexOf("[") + 1, el.text().indexOf("]")); levelFromCrit.put(el.id().replaceAll("crit-", ""), level); } } if (writeCritInFile) { FileUtils.write(new File(CRITERION_I18N_FILE_PATH), crit.toString()); } }
From source file:org.asqatasun.rules.doc.utils.rga33.extractor.Rgaa3Extractor.java
private static void extractRuleInfo(Document doc) { boolean isFirst112 = false; for (Element el : doc.select(TEST_SELECTOR)) { if (StringUtils.isNotBlank(el.id())) { Rule rule = new Rule(RGAA3_REF_URL, REF_NAME); rule.ruleId = el.id();//from w ww . ja v a2 s . com rule.ruleDash = extractTestFromId(rule.ruleId); if (rule.ruleDash.equals("1-1-2")) { if (!isFirst112) { isFirst112 = true; } else { rule.ruleDash = "1-1-4"; rule.ruleId = "test-1-1-4"; } } else if (rule.ruleDash.equals("11-1-4-5")) { rule.ruleDash = "11-14-5"; } rule.setRuleRawHtml(el.html().replaceAll("href=\"", "href=\"" + RGAA3_MAIN_URL)); rule.ruleHtmlWithoutLink = extractRuleContent(rule.ruleRawHtml); rule.ruleText = el.text(); rule.level = levelFromCrit.get(rule.getCriterion()); RGAA3.put(rule.ruleDash, rule); } } }