List of usage examples for the org.jsoup.nodes.Element method attr
public String attr(String attributeKey)
From source file:org.apdplat.extractor.html.HtmlExtractor.java
/** * ????/* w ww .ja v a 2s.com*/ * @param url html? * @param htmlTemplate html?? * @param doc jsoup * @return ? */ private ExtractResult extractHtmlTemplate(String url, HtmlTemplate htmlTemplate, Document doc) { //??? ExtractResult extractResult = new ExtractResult(); extractResult.setUrl(url); extractResult.setTableName(htmlTemplate.getTableName()); List<CssPath> cssPaths = htmlTemplate.getCssPaths(); //??CSS??????? //??CSS??? for (CssPath cssPath : cssPaths) { // ??CSS PATH Elements elements = doc.select(cssPath.getCssPath()); // CSS?? for (Element element : elements) { String text = null; if (StringUtils.isBlank(cssPath.getAttr())) { //??? text = element.text(); } else { //??? text = element.attr(cssPath.getAttr()); } if (StringUtils.isNotBlank(text)) { // ???? if (cssPath.hasExtractFunction()) { //CSS??? for (ExtractFunction pf : cssPath.getExtractFunctions()) { text = ExtractFunctionExecutor.execute(text, doc, cssPath, pf.getExtractExpression()); if (text != null) { ExtractResultItem extractResultItem = new ExtractResultItem(); extractResultItem.setField(pf.getFieldName()); extractResultItem.setValue(text); extractResult.addExtractResultItem(extractResultItem); } else { ExtractFailLog extractFailLog = new ExtractFailLog(); extractFailLog.setUrl(url); extractFailLog.setUrlPattern(htmlTemplate.getUrlPattern().getUrlPattern()); extractFailLog.setTemplateName(htmlTemplate.getTemplateName()); extractFailLog.setCssPath(cssPath.getCssPath()); extractFailLog.setExtractExpression(pf.getExtractExpression()); extractFailLog.setTableName(htmlTemplate.getTableName()); extractFailLog.setFieldName(pf.getFieldName()); extractFailLog.setFieldDescription(pf.getFieldDescription()); extractResult.addExtractFailLog(extractFailLog); //?????? //? //??? return extractResult; } } } else { //CSS? 
ExtractResultItem extractResultItem = new ExtractResultItem(); extractResultItem.setField(cssPath.getFieldName()); extractResultItem.setValue(text); extractResult.addExtractResultItem(extractResultItem); } } else { //?????? ExtractFailLog extractFailLog = new ExtractFailLog(); extractFailLog.setUrl(url); extractFailLog.setUrlPattern(htmlTemplate.getUrlPattern().getUrlPattern()); extractFailLog.setTemplateName(htmlTemplate.getTemplateName()); extractFailLog.setCssPath(cssPath.getCssPath()); extractFailLog.setExtractExpression(""); extractFailLog.setTableName(htmlTemplate.getTableName()); extractFailLog.setFieldName(cssPath.getFieldName()); extractFailLog.setFieldDescription(cssPath.getFieldDescription()); extractResult.addExtractFailLog(extractFailLog); //?????? //? //??? return extractResult; } } } return extractResult; }
From source file:org.apdplat.extractor.html.impl.DefaultHtmlExtractor.java
/** * ??/*from w ww . j a v a 2 s . co m*/ * @param url URL * @param html HTML * @return ? */ @Override public List<ExtractResult> extract(String url, String html) { List<ExtractResult> extractResults = new ArrayList<>(); //?URL??? List<HtmlTemplate> htmlTemplates = extractRegular.getHtmlTemplate(url); if (htmlTemplates.isEmpty()) { LOGGER.debug("URL?" + url); return extractResults; } try { Document doc = Jsoup.parse(html); Elements metas = doc.select("meta"); String keywords = ""; String description = ""; for (Element meta : metas) { String name = meta.attr("name"); if ("keywords".equals(name)) { keywords = meta.attr("content"); } if ("description".equals(name)) { description = meta.attr("content"); } } Set<String> tableNames = new HashSet<>(); for (HtmlTemplate htmlTemplate : htmlTemplates) { if (tableNames.contains(htmlTemplate.getTableName())) { LOGGER.debug( "?tableName????UrlPattern?" + htmlTemplate.getUrlPattern().getUrlPattern()); LOGGER.debug(htmlTemplates.toString()); } tableNames.add(htmlTemplate.getTableName()); try { //??? ExtractResult extractResult = extractHtmlTemplate(url, htmlTemplate, doc); //?URL??????? if (!extractResult.getExtractFailLogs().isEmpty() || !extractResult.getExtractResultItems().isEmpty()) { extractResult.setContent(html.getBytes("utf-8")); extractResult.setEncoding("utf-8"); extractResult.setKeywords(keywords); extractResult.setDescription(description); extractResults.add(extractResult); } else { LOGGER.debug(url + " ? " + htmlTemplate.getTemplateName() + " ?"); } } catch (Exception e) { LOGGER.error("???" + htmlTemplate.getTemplateName(), e); } } } catch (Exception e) { LOGGER.error("?: " + url, e); } return extractResults; }
From source file:org.arb.extractor.DomTreeWalker.java
/**
 * Records the {@code id} and {@code arb:id} attribute values of the given
 * element and, recursively, of all its descendant elements.
 *
 * <p>Values are added to {@code elementIdSet} and {@code arbIdSet} respectively.
 * An attribute that is present but empty is recorded as an empty string.
 *
 * @param element  root of the subtree to scan
 * @param codeUnit code unit being processed; only forwarded to the recursive calls
 */
private void collectIdsOnElement(Element element, AbstractCodeUnit codeUnit) {
    if (element.hasAttr("id")) {
        elementIdSet.add(element.attr("id"));
    }
    if (element.hasAttr("arb:id")) {
        arbIdSet.add(element.attr("arb:id"));
    }
    // Fetch the child list once; the previous index loop called children() again
    // in the loop condition on every iteration.
    for (Element child : element.children()) {
        collectIdsOnElement(child, codeUnit);
    }
}
From source file:org.arb.extractor.DomTreeWalker.java
/**
 * Looks up the resource id already present on an element.
 *
 * <p>The id may be declared through the {@code arb:id} or the {@code id}
 * attribute; {@code arb:id} wins when both are present.
 *
 * @param elem the element to inspect
 * @return the resource id, or {@code null} when neither attribute is present
 */
private String getElementResourceId(Element elem) {
    // Ordered by priority: the first attribute found decides the result.
    String[] idAttributes = {"arb:id", "id"};
    for (String attributeName : idAttributes) {
        if (elem.hasAttr(attributeName)) {
            return elem.attr(attributeName);
        }
    }
    return null;
}
From source file:org.asqatasun.contentadapter.css.CSSJsoupPhlocContentAdapterImpl.java
/** * Retrieve css content and adapt it for each inline resource *///w w w .jav a 2 s. c o m private void adaptInlineCSS() { Set<Long> relatedCssIdSet = new HashSet<>(); for (Element el : inlineCssElements) { String attributeValue = el.attr("style"); if (StringUtils.isNotBlank(attributeValue)) { Resource cssResource = new CSSResourceImpl(el.nodeName() + "{" + attributeValue + "}", 0, new InlineRsrc()); StylesheetContent cssContent = getStylesheetFromInlineResource(cssResource.getResource()); adaptContent(cssContent, cssResource, getCurrentResourcePath(el.baseUri()), null); relatedCssIdSet.add(getContentDataService().saveOrUpdate(cssContent).getId()); } } getContentDataService().saveContentRelationShip(getSSP(), relatedCssIdSet); }
From source file:org.asqatasun.contentadapter.css.CSSJsoupPhlocContentAdapterImpl.java
/** * Adapt the external css. /*from www .j a v a 2 s .c o m*/ */ private void adaptExternalCss() { for (Element el : externalCssElements) { List<CSSMediaQuery> mediaList = getListOfMediaFromAttributeValue(el); String resourcePath = el.attr("abs:href"); getExternalResourceAndAdapt(resourcePath, mediaList); } Set<Long> relatedCssIdSet = new HashSet<>(); // At the end of the document we link each external css that are // already fetched and that have been encountered in the SSP to the SSP. LOGGER.debug("Found " + relatedExternalCssSet.size() + " external css in " + getSSP().getURI()); for (StylesheetContent cssContent : relatedExternalCssSet) { if (cssContent.getAdaptedContent() == null) { cssContent.setAdaptedContent(CSS_ON_ERROR); } LOGGER.debug("Create relation between " + getSSP().getURI() + " and " + cssContent.getURI()); // to avoid fatal error when persist weird sourceCode try { // the content is saved only when the id is null which means // that the content hasn't been persisted yet. Otherwise, the // save is uneeded and the id is used to create the relation // with the current SSP if (cssContent.getId() == null) { cssContent = (StylesheetContent) getContentDataService().saveOrUpdate(cssContent); } relatedCssIdSet.add(cssContent.getId()); } catch (PersistenceException | DataException pe) { adaptedContentOnError(cssContent, relatedCssIdSet); } } getContentDataService().saveContentRelationShip(getSSP(), relatedCssIdSet); }
From source file:org.asqatasun.contentadapter.css.CSSJsoupPhlocContentAdapterImpl.java
/** * Get the list of media from the media attribute content * @param mediaAttribute/*from w w w . jav a 2 s. co m*/ * @return */ private List<CSSMediaQuery> getListOfMediaFromAttributeValue(Element element) { String mediaAttribute = element.attr("media"); List<CSSMediaQuery> mediaTypeList = new ArrayList<>(); if (mediaAttribute == null || StringUtils.isBlank(mediaAttribute)) { return mediaTypeList; } else { mediaTypeList.addAll(MediaQueryTools.parseToMediaQuery(mediaAttribute, CCharset.CHARSET_UTF_8_OBJ, ECSSVersion.CSS30)); } return mediaTypeList; }
From source file:org.asqatasun.processing.ProcessRemarkServiceImpl.java
/**
 * Creates a source code remark for an element and attaches one evidence
 * element per entry of {@code evidenceElementList}.
 *
 * <p>The entry named {@code text} (case-insensitive) is filled with the
 * element's text; every other entry is treated as an attribute name and filled
 * with that attribute's value.
 *
 * @param processResult the test solution the remark is created for
 * @param element       the element the remark targets
 * @param messageCode   the message code of the remark
 * @return the remark, with its snippet and evidence elements set
 */
@Override
public SourceCodeRemark createSourceCodeRemark(TestSolution processResult, Element element,
        String messageCode) {
    SourceCodeRemark remark = processRemarkDataService.getSourceCodeRemark(
            element.nodeName(), processResult, messageCode, searchElementLineNumber(element));
    remark.setSnippet(getSnippetFromElement(element));

    for (String evidenceName : evidenceElementList) {
        EvidenceElement evidence = StringUtils.equalsIgnoreCase(evidenceName, "text")
                ? getEvidenceElement(evidenceName, element.text())
                : getEvidenceElement(evidenceName, element.attr(evidenceName));
        remark.addElement(evidence);
    }
    return remark;
}
From source file:org.asqatasun.ruleimplementation.AbstractMarkerPageRuleImplementation.java
/** * To sort marker elements, we extract for each of them the value of the * "id" attribute the value of the "class" attribute and the value of the * "role" attribute. If one of these three values belongs to the marker * value list set by the user, we consider that the element is characterised * and we add it to the "elementMarkerList". * * @param nodeList// w w w . j av a 2 s. com */ private void sortMarkerElements() { if ((CollectionUtils.isEmpty(markerList) && CollectionUtils.isEmpty(inverseMarkerList)) || selectionWithoutMarkerHandler.isEmpty()) { return; } Iterator<Element> iter = selectionWithoutMarkerHandler.get().iterator(); Element el; while (iter.hasNext()) { el = iter.next(); String id = el.id(); Collection<String> classNames = el.classNames(); String role = el.attr(ROLE_ATTR); // if the element does contain an "id" OR a "class" attribute OR // a "role" attribute AND one the values belongs to the marker list, // it is removed from the global selection and added to the // marker element selection. if (StringUtils.isNotBlank(id) || CollectionUtils.isNotEmpty(classNames) || StringUtils.isNotBlank(role)) { if (checkAttributeBelongsToMarkerList(id, classNames, role, markerList)) { selectionWithMarkerHandler.add(el); iter.remove(); } // if the element belongs to the inverse marker list, it is // removed from the global collection if (checkAttributeBelongsToMarkerList(id, classNames, role, inverseMarkerList)) { iter.remove(); } } } }
From source file:org.asqatasun.rules.accessiweb22.Aw22Rule06031.java
/** * /* w w w. j a v a2s . c om*/ * @param sspHandler * @param el * @param linkText * @return */ private TestSolution testTitleAttributeLink(SSPHandler sspHandler, Element el, String linkText) { // if the current has no title or has an empty title or has a title // content identical to the link text, returns not applicable. if (!el.hasAttr(TITLE_ATTR)) { return TestSolution.NOT_APPLICABLE; } String attrValue = el.attr(TITLE_ATTR); if (StringUtils.isBlank(attrValue)) { return TestSolution.NOT_APPLICABLE; } if (StringUtils.equalsIgnoreCase(attrValue, linkText)) { return TestSolution.NOT_APPLICABLE; } ElementHandler<Element> elHandler = new ElementHandlerImpl(el); TestSolutionHandler tsHandler = new TestSolutionHandlerImpl(); titlePertinenceElementChecker.check(sspHandler, elHandler, tsHandler); return tsHandler.getTestSolution(); }