List of usage examples for org.jsoup.nodes.Element#id()
public String id()
From source file:org.opens.tanaguru.rules.rgaa22.Rgaa22Rule03111.java
@Override protected void select(SSPHandler sspHandler, ElementHandler<Element> elementHandler) { ELEMENT_SELECTOR.selectElements(sspHandler, elementHandler); if (elementHandler.isEmpty()) { return;/*w w w.j av a2 s. c o m*/ } Elements elementsWithUniqueId = new Elements(); // From the selected form elements, only keep the one with a unique id // on the page for (Element el : elementHandler.get()) { if (StringUtils.isNotEmpty(el.id().trim()) && CssLikeSelectorBuilder.getNumberOfElements(sspHandler, CssLikeSelectorBuilder.buildSelectorFromId(el.id())) == 1) { elementsWithUniqueId.add(el); } } // add the subset to the global selection elementHandler.clean().addAll(elementsWithUniqueId); if (elementsWithUniqueId.isEmpty()) { return; } for (Element el : elementsWithUniqueId) { String labelSelector = CssLikeSelectorBuilder.buildSelectorFromElementsAndAttributeValue(LABEL_ELEMENT, FOR_ATTR, el.id()); if (CssLikeSelectorBuilder.getNumberOfElements(sspHandler, labelSelector) == 0) { this.elementsWithoutLabel.add(el); } } }
From source file:org.opens.tanaguru.rules.rgaa30.Rgaa30Rule110102.java
@Override protected void check(SSPHandler sspHandler, TestSolutionHandler testSolutionHandler) { /* If the page has no input form element, the test is not applicable */ if (inputFormMap.entrySet().isEmpty()) { testSolutionHandler.addTestSolution(TestSolution.NOT_APPLICABLE); return;// w w w. j av a 2 s . c om } for (Map.Entry<Element, ElementHandler<Element>> entry : inputFormMap.entrySet()) { /* The attribute Presence Checker */ ElementChecker attributePresenceChecker = new AttributePresenceChecker(ID_ATTR, TestSolution.PASSED, TestSolution.FAILED, null, ID_MISSING_MSG); attributePresenceChecker.check(sspHandler, entry.getValue(), testSolutionHandler); /* The attribute Emptiness Checker. Keep default value i.e failed when attribute is empty */ ElementChecker attributeEmptinessChecker = new TextEmptinessChecker( new TextAttributeOfElementBuilder(ID_ATTR), ID_MISSING_MSG, null); attributeEmptinessChecker.check(sspHandler, entry.getValue(), testSolutionHandler); /* The id unicityChecker */ ElementChecker idUnicityChecker = new IdUnicityChecker(ID_NOT_UNIQUE_MSG); idUnicityChecker.check(sspHandler, entry.getValue(), testSolutionHandler); } for (Map.Entry<Element, ElementHandler<Element>> entry : labelFormMap.entrySet()) { /* The attribute Presence Checker */ ElementChecker attributePresenceChecker = new AttributePresenceChecker(FOR_ATTR, TestSolution.PASSED, TestSolution.FAILED, null, FOR_MISSING_MSG); attributePresenceChecker.check(sspHandler, entry.getValue(), testSolutionHandler); /* The attribute Emptiness Checker. 
Keep default value i.e failed when attribute is empty */ ElementChecker attributeEmptinessChecker = new TextEmptinessChecker( new TextAttributeOfElementBuilder(FOR_ATTR), FOR_MISSING_MSG, null); attributeEmptinessChecker.check(sspHandler, entry.getValue(), testSolutionHandler); } for (Map.Entry<Element, ElementHandler<Element>> entry : inputFormMap.entrySet()) { ElementHandler<Element> inputOnError = new ElementHandlerImpl(); /* Check if each input id attribute is linked to a for attribute*/ for (Element el : entry.getValue().get()) { String id = el.id(); if (StringUtils.isNotBlank(id)) { ElementHandler<Element> linkedLabelToInputHandler = new ElementHandlerImpl(); if (entry.getKey() .select(LABEL_ELEMENT + " " + CssLikeSelectorBuilder .buildSelectorFromElementsAndAttributeValue(INPUT_ELEMENT, ID_ATTR, id)) .isEmpty()) { linkedLabelToInputHandler.addAll(entry.getKey().select(CssLikeSelectorBuilder .buildSelectorFromElementsAndAttributeValue(LABEL_ELEMENT, FOR_ATTR, id))); if (linkedLabelToInputHandler.isEmpty()) { inputOnError.add(el); } } } } ElementChecker elementPresenceChecker = new ElementPresenceChecker(TestSolution.FAILED, TestSolution.PASSED, INVALID_INPUT_MSG, null); elementPresenceChecker.check(sspHandler, inputOnError, testSolutionHandler); } for (Map.Entry<Element, ElementHandler<Element>> entry : labelFormMap.entrySet()) { ElementHandler<Element> labelOnError = new ElementHandlerImpl(); /* Check if each label for attribute is associated to an input id attribute*/ for (Element el : entry.getValue().get()) { String id = el.attr(FOR_ATTR); if (StringUtils.isNotBlank(id)) { ElementHandler<Element> linkedLabelToInputHandler = new ElementHandlerImpl(); linkedLabelToInputHandler .addAll(entry.getKey().select(CssLikeSelectorBuilder.buildSelectorFromId(id))); if (linkedLabelToInputHandler.isEmpty()) { labelOnError.add(el); } } } ElementChecker elementPresenceChecker = new ElementPresenceChecker(TestSolution.FAILED, TestSolution.PASSED, INVALID_LABEL_MSG, null); 
elementPresenceChecker.check(sspHandler, labelOnError, testSolutionHandler); } }
From source file:org.structr.web.importer.Importer.java
private DOMNode createChildNodes(final Node startNode, final DOMNode parent, final Page page, final boolean removeHashAttribute, final int depth) throws FrameworkException { DOMNode rootElement = null;/*from www. j a va2s. c o m*/ Linkable linkable = null; String instructions = null; final List<Node> children = startNode.childNodes(); for (Node node : children) { String tag = node.nodeName(); // clean tag, remove non-word characters except : and # if (tag != null) { tag = tag.replaceAll("[^a-zA-Z0-9#:.\\-_]+", ""); } final StringBuilder classString = new StringBuilder(); final String type = CaseHelper.toUpperCamelCase(tag); String comment = null; String content = null; String id = null; boolean isNewTemplateOrComponent = false; if (ignoreElementNames.contains(type)) { continue; } if (node instanceof Element) { final Element el = ((Element) node); final Set<String> classes = el.classNames(); for (String cls : classes) { classString.append(cls).append(" "); } id = el.id(); // do not download files when called from DeployCommand! if (!isDeployment) { String downloadAddressAttr = srcElements.contains(tag) ? "src" : hrefElements.contains(tag) ? 
"href" : null; if (originalUrl != null && downloadAddressAttr != null && StringUtils.isNotBlank(node.attr(downloadAddressAttr))) { String downloadAddress = node.attr(downloadAddressAttr); linkable = downloadFile(downloadAddress, originalUrl); } else { linkable = null; } } if (removeHashAttribute) { // Remove data-structr-hash attribute node.removeAttr("data-structr-hash"); } } // Data and comment nodes: Trim the text and put it into the "content" field without changes if (type.equals("#comment")) { comment = ((Comment) node).getData(); tag = ""; // Don't add content node for whitespace if (StringUtils.isBlank(comment)) { continue; } // store for later use commentSource.append(comment).append("\n"); // check if comment contains instructions if (commentHandler != null && commentHandler.containsInstructions(comment)) { if (instructions != null) { // unhandled instructions from previous iteration => empty content element createEmptyContentNode(page, parent, commentHandler, instructions); } instructions = comment; continue; } } else if (type.equals("#data")) { tag = ""; content = ((DataNode) node).getWholeData(); // Don't add content node for whitespace if (StringUtils.isBlank(content)) { continue; } } else // Text-only nodes: Trim the text and put it into the "content" field { if (type.equals("#text")) { tag = ""; if (isDeployment) { content = trimTrailingNewline(((TextNode) node).getWholeText()); if (content == null || content.length() == 0) { continue; } } else { content = trimTrailingNewline(((TextNode) node).text()); if (StringUtils.isBlank(content)) { continue; } } } } org.structr.web.entity.dom.DOMNode newNode = null; // create node if (StringUtils.isBlank(tag)) { if (page != null) { // create comment or content node if (!StringUtils.isBlank(comment)) { final PropertyKey<String> contentTypeKey = StructrApp.key(Content.class, "contentType"); newNode = (DOMNode) page.createComment(comment); newNode.setProperty(contentTypeKey, "text/html"); } else { newNode = 
(Content) page.createTextNode(content); } } } else if ("structr:template".equals(tag)) { final String src = node.attr("src"); if (src != null) { DOMNode template = null; if (DeployCommand.isUuid(src)) { template = (DOMNode) StructrApp.getInstance().nodeQuery(NodeInterface.class) .and(GraphObject.id, src).getFirst(); if (template == null) { System.out.println("##################################### template with UUID " + src + " not found, this is a known bug"); } } else if (DeployCommand.endsWithUuid(src)) { final String uuid = src.substring(src.length() - 32); template = (DOMNode) StructrApp.getInstance().nodeQuery(NodeInterface.class) .and(GraphObject.id, uuid).getFirst(); if (template == null) { System.out.println("##################################### template with UUID " + uuid + " not found, this is a known bug"); } } else { template = Importer.findSharedComponentByName(src); if (template == null) { template = Importer.findTemplateByName(src); if (template == null) { template = createNewTemplateNode(parent, node.childNodes()); isNewTemplateOrComponent = true; } } } if (template != null) { newNode = template; if (template.isSharedComponent()) { newNode = (DOMNode) template.cloneNode(false); newNode.setSharedComponent(template); newNode.setOwnerDocument(page); } else if (page != null) { newNode.setOwnerDocument(page); } } else { logger.warn("Unable to find template or shared component {}, template ignored!", src); } } else { logger.warn("Invalid template definition, missing src attribute!"); } } else if ("structr:component".equals(tag)) { final String src = node.attr("src"); if (src != null) { DOMNode component = null; if (DeployCommand.isUuid(src)) { component = app.nodeQuery(DOMNode.class).and(GraphObject.id, src).getFirst(); } else if (DeployCommand.endsWithUuid(src)) { final String uuid = src.substring(src.length() - 32); component = app.nodeQuery(DOMNode.class).and(GraphObject.id, uuid).getFirst(); } else { component = 
Importer.findSharedComponentByName(src); } if (component == null) { component = createSharedComponent(node); } isNewTemplateOrComponent = true; if (component != null) { newNode = (DOMNode) component.cloneNode(false); final String _html_src = newNode.getProperty(new StringProperty("_html_src")); if (!StringUtils.isEmpty(_html_src)) { node.attr("src", _html_src); } else { node.removeAttr("src"); } newNode.setSharedComponent(component); newNode.setOwnerDocument(page); } else { logger.warn("Unable to find shared component {} - ignored!", src); } } else { logger.warn("Invalid component definition, missing src attribute!"); } } else { if (page != null) { newNode = (org.structr.web.entity.dom.DOMElement) page.createElement(tag, true); } } if (newNode != null) { // save root element for later use if (rootElement == null && !(newNode instanceof org.structr.web.entity.dom.Comment)) { rootElement = newNode; } // set linkable if (linkable != null && newNode instanceof LinkSource) { ((LinkSource) newNode).setLinkable(linkable); } // container for bulk setProperties() final PropertyMap newNodeProperties = new PropertyMap(); final Class newNodeType = newNode.getClass(); newNodeProperties.put(AbstractNode.visibleToPublicUsers, publicVisible); newNodeProperties.put(AbstractNode.visibleToAuthenticatedUsers, authVisible); // "id" attribute: Put it into the "_html_id" field if (StringUtils.isNotBlank(id)) { newNodeProperties.put(StructrApp.key(DOMElement.class, "_html_id"), id); } if (StringUtils.isNotBlank(classString.toString())) { newNodeProperties.put(StructrApp.key(DOMElement.class, "_html_class"), StringUtils.trim(classString.toString())); } for (Attribute nodeAttr : node.attributes()) { final String key = nodeAttr.getKey(); if (!key.equals("text")) { // Don't add text attribute as _html_text because the text is already contained in the 'content' attribute final String value = nodeAttr.getValue(); if (key.startsWith("data-")) { if (key.startsWith(DATA_META_PREFIX)) { // convert 
data-structr-meta-* attributes to local camel case properties on the node, int l = DATA_META_PREFIX.length(); String upperCaseKey = WordUtils.capitalize(key.substring(l), new char[] { '-' }) .replaceAll("-", ""); String camelCaseKey = key.substring(l, l + 1).concat(upperCaseKey.substring(1)); if (value != null) { // store value using actual input converter final PropertyKey actualKey = StructrApp.getConfiguration() .getPropertyKeyForJSONName(newNodeType, camelCaseKey, false); if (actualKey != null) { final PropertyConverter converter = actualKey .inputConverter(securityContext); if (converter != null) { final Object convertedValue = converter.convert(value); newNodeProperties.put(actualKey, convertedValue); } else { newNodeProperties.put(actualKey, value); } } else { logger.warn("Unknown meta property key {}, ignoring.", camelCaseKey); } } } else if (key.startsWith(DATA_STRUCTR_PREFIX)) { // don't convert data-structr-* attributes as they are internal final PropertyKey propertyKey = StructrApp.getConfiguration() .getPropertyKeyForJSONName(newNodeType, key); if (propertyKey != null) { final PropertyConverter inputConverter = propertyKey .inputConverter(securityContext); if (value != null && inputConverter != null) { newNodeProperties.put(propertyKey, propertyKey.inputConverter(securityContext).convert(value)); } else { newNodeProperties.put(propertyKey, value); } } } else { // store data-* attributes in node final PropertyKey propertyKey = new StringProperty(key); if (value != null) { newNodeProperties.put(propertyKey, value); } } } else { boolean notBlank = StringUtils.isNotBlank(value); boolean isAnchor = notBlank && value.startsWith("#"); boolean isLocal = notBlank && !value.startsWith("http"); boolean isActive = notBlank && value.contains("${"); boolean isStructrLib = notBlank && value.startsWith("/structr/js/"); if (linkable != null && "link".equals(tag) && "href".equals(key) && isLocal && !isActive && !isDeployment) { newNodeProperties.put(new 
StringProperty(PropertyView.Html + key), "${link.path}?${link.version}"); } else if (linkable != null && ("href".equals(key) || "src".equals(key)) && isLocal && !isActive && !isAnchor && !isStructrLib && !isDeployment) { newNodeProperties.put(new StringProperty(PropertyView.Html + key), "${link.path}"); } else { if (key.startsWith("aria-")) { // use custom key newNodeProperties.put( new StringProperty( CustomHtmlAttributeProperty.CUSTOM_HTML_ATTRIBUTE_PREFIX + key), value); } else { newNodeProperties.put(new StringProperty(PropertyView.Html + key), value); } } } } } // bulk set properties on new node newNode.setProperties(securityContext, newNodeProperties); if ("script".equals(tag)) { final PropertyKey<String> typeKey = StructrApp.key(Input.class, "_html_type"); final String contentType = newNode.getProperty(typeKey); if (contentType == null) { // Set default type of script tag to "text/javascript" to ensure inline JS gets imported properly newNode.setProperty(typeKey, "text/javascript"); } else if (contentType.equals("application/schema+json")) { for (final Node scriptContentNode : node.childNodes()) { final String source = scriptContentNode.toString(); // Import schema JSON SchemaJsonImporter.importSchemaJson(source); } } else if (contentType.equals("application/x-structr-script")) { for (final Node scriptContentNode : node.childNodes()) { final String source = scriptContentNode.toString(); try { Actions.execute(securityContext, null, source, null); } catch (UnlicensedScriptException ex) { ex.log(logger); } } continue; } else if (contentType.equals("application/x-structr-javascript")) { for (final Node scriptContentNode : node.childNodes()) { final String source = scriptContentNode.toString(); try { Actions.execute(securityContext, null, source, null); } catch (UnlicensedScriptException ex) { ex.log(logger); } } continue; } } else if ("style".equals(tag)) { final PropertyKey<String> typeKey = StructrApp.key(Input.class, "_html_type"); final String contentType = 
newNode.getProperty(typeKey); if ("text/css".equals(contentType)) { // parse content of style elements and add referenced files to list of resources to be downloaded for (final Node styleContentNode : node.childNodes()) { final String source = styleContentNode.toString(); try { // Import referenced resources processCss(source, originalUrl); } catch (IOException ex) { logger.warn("Couldn't process CSS source", ex); } } } } if (instructions != null) { if (instructions.contains("@structr:content") && !(newNode instanceof Content)) { // unhandled instructions from previous iteration => empty content element createEmptyContentNode(page, parent, commentHandler, instructions); } else { // apply instructions to new DOM element if (commentHandler != null) { commentHandler.handleComment(page, newNode, instructions, true); } } instructions = null; } // allow parent to be null to prevent direct child relationship if (parent != null) { // special handling for <head> elements if (newNode instanceof Head && parent instanceof Body) { final org.w3c.dom.Node html = parent.getParentNode(); html.insertBefore(newNode, parent); } else { parent.appendChild(newNode); } } // Link new node to its parent node // linkNodes(parent, newNode, page, localIndex); // Step down and process child nodes except for newly created templates if (!isNewTemplateOrComponent) { createChildNodes(node, newNode, page, removeHashAttribute, depth + 1); } } } // reset instructions when leaving a level if (instructions != null) { createEmptyContentNode(page, parent, commentHandler, instructions); instructions = null; } return rootElement; }
From source file:org.structr.web.Importer.java
private void createChildNodes(final Node startNode, final DOMNode parent, final Page page, final boolean removeHashAttribute) throws FrameworkException { Linkable res = null;/* www . jav a2s. c o m*/ final List<Node> children = startNode.childNodes(); for (Node node : children) { String tag = node.nodeName(); // clean tag, remove non-word characters if (tag != null) { tag = tag.replaceAll("[^a-zA-Z0-9#]+", ""); } String type = CaseHelper.toUpperCamelCase(tag); String comment = null; String content = null; String id = null; StringBuilder classString = new StringBuilder(); if (ArrayUtils.contains(ignoreElementNames, type)) { continue; } if (node instanceof Element) { Element el = ((Element) node); Set<String> classes = el.classNames(); for (String cls : classes) { classString.append(cls).append(" "); } id = el.id(); String downloadAddressAttr = (ArrayUtils.contains(srcElements, tag) ? "src" : ArrayUtils.contains(hrefElements, tag) ? "href" : null); if (downloadAddressAttr != null && StringUtils.isNotBlank(node.attr(downloadAddressAttr))) { String downloadAddress = node.attr(downloadAddressAttr); res = downloadFile(downloadAddress, originalUrl); } if (removeHashAttribute) { // Remove data-structr-hash attribute node.removeAttr(DOMNode.dataHashProperty.jsonName()); } } // Data and comment nodes: Trim the text and put it into the "content" field without changes if (/*type.equals("#data") || */type.equals("#comment")) { tag = ""; comment = ((Comment) node).getData(); // Don't add content node for whitespace if (StringUtils.isBlank(comment)) { continue; } // store for later use commentSource.append(comment).append("\n"); } else if (type.equals("#data")) { tag = ""; content = ((DataNode) node).getWholeData(); // Don't add content node for whitespace if (StringUtils.isBlank(content)) { continue; } } else // Text-only nodes: Trim the text and put it into the "content" field { if (type.equals("#text")) { // type = "Content"; tag = ""; //content = ((TextNode) 
node).getWholeText(); content = ((TextNode) node).text(); // Add content node for whitespace within <p> elements only if (!("p".equals(startNode.nodeName().toLowerCase())) && StringUtils.isWhitespace(content)) { continue; } } } org.structr.web.entity.dom.DOMNode newNode; // create node if (StringUtils.isBlank(tag)) { // create comment or content node if (!StringUtils.isBlank(comment)) { newNode = (DOMNode) page.createComment(comment); newNode.setProperty(org.structr.web.entity.dom.Comment.contentType, "text/html"); } else { newNode = (Content) page.createTextNode(content); } } else { newNode = (org.structr.web.entity.dom.DOMElement) page.createElement(tag); } if (newNode != null) { newNode.setProperty(AbstractNode.visibleToPublicUsers, publicVisible); newNode.setProperty(AbstractNode.visibleToAuthenticatedUsers, authVisible); if (res != null) { newNode.setProperty(LinkSource.linkable, res); } // "id" attribute: Put it into the "_html_id" field if (StringUtils.isNotBlank(id)) { newNode.setProperty(DOMElement._id, id); } if (StringUtils.isNotBlank(classString.toString())) { newNode.setProperty(DOMElement._class, StringUtils.trim(classString.toString())); } for (Attribute nodeAttr : node.attributes()) { final String key = nodeAttr.getKey(); if (!key.equals("text")) { // Don't add text attribute as _html_text because the text is already contained in the 'content' attribute final String value = nodeAttr.getValue(); if (key.startsWith("data-")) { if (key.startsWith(DATA_META_PREFIX)) { // convert data-structr-meta-* attributes to local camel case properties on the node, int l = DATA_META_PREFIX.length(); String upperCaseKey = WordUtils.capitalize(key.substring(l), new char[] { '-' }) .replaceAll("-", ""); String camelCaseKey = key.substring(l, l + 1).concat(upperCaseKey.substring(1)); if (value != null) { if (value.equalsIgnoreCase("true")) { newNode.setProperty(new BooleanProperty(camelCaseKey), true); } else if (value.equalsIgnoreCase("false")) { 
newNode.setProperty(new BooleanProperty(camelCaseKey), false); } else { newNode.setProperty(new StringProperty(camelCaseKey), nodeAttr.getValue()); } } } else if (key.startsWith(DATA_STRUCTR_PREFIX)) { // don't convert data-structr-* attributes as they are internal PropertyKey propertyKey = config.getPropertyKeyForJSONName(newNode.getClass(), key); if (propertyKey != null) { final PropertyConverter inputConverter = propertyKey .inputConverter(securityContext); if (value != null && inputConverter != null) { newNode.setProperty(propertyKey, propertyKey.inputConverter(securityContext).convert(value)); } else { newNode.setProperty(propertyKey, value); } } } } else { boolean notBlank = StringUtils.isNotBlank(value); boolean isAnchor = notBlank && value.startsWith("#"); boolean isLocal = notBlank && !value.startsWith("http"); boolean isActive = notBlank && value.contains("${"); boolean isStructrLib = notBlank && value.startsWith("/structr/js/"); if ("link".equals(tag) && "href".equals(key) && isLocal && !isActive) { newNode.setProperty(new StringProperty(PropertyView.Html.concat(key)), "${link.path}?${link.version}"); } else if (("href".equals(key) || "src".equals(key)) && isLocal && !isActive && !isAnchor && !isStructrLib) { newNode.setProperty(new StringProperty(PropertyView.Html.concat(key)), "${link.path}"); } else { newNode.setProperty(new StringProperty(PropertyView.Html.concat(key)), value); } } } } final StringProperty typeKey = new StringProperty(PropertyView.Html.concat("type")); if ("script".equals(tag) && newNode.getProperty(typeKey) == null) { // Set default type of script tag to "text/javascript" to ensure inline JS gets imported properly newNode.setProperty(typeKey, "text/javascript"); } parent.appendChild(newNode); // Link new node to its parent node // linkNodes(parent, newNode, page, localIndex); // Step down and process child nodes createChildNodes(node, newNode, page, removeHashAttribute); } } }
From source file:org.tinymediamanager.scraper.imdb.ImdbMetadataProvider.java
@Override public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception { LOGGER.debug("getMetadata() " + options.toString()); // check if there is a md in the result if (options.getResult() != null && options.getResult().getMetadata() != null) { LOGGER.debug("IMDB: getMetadata from cache: " + options.getResult()); return options.getResult().getMetadata(); }/* ww w. ja v a2s. c o m*/ MediaMetadata md = new MediaMetadata(providerInfo.getId()); String imdbId = ""; // imdbId from searchResult if (options.getResult() != null) { imdbId = options.getResult().getIMDBId(); } // imdbid from scraper option if (!MetadataUtil.isValidImdbId(imdbId)) { imdbId = options.getImdbId(); } if (!MetadataUtil.isValidImdbId(imdbId)) { return md; } LOGGER.debug("IMDB: getMetadata(imdbId): " + imdbId); md.setId(MediaMetadata.IMDBID, imdbId); ExecutorCompletionService<Document> compSvcImdb = new ExecutorCompletionService<Document>(executor); ExecutorCompletionService<MediaMetadata> compSvcTmdb = new ExecutorCompletionService<MediaMetadata>( executor); // worker for imdb request (/combined) (everytime from akas.imdb.com) // StringBuilder sb = new StringBuilder(imdbSite.getSite()); StringBuilder sb = new StringBuilder(ImdbSiteDefinition.IMDB_COM.getSite()); sb.append("title/"); sb.append(imdbId); sb.append("/combined"); Callable<Document> worker = new ImdbWorker(sb.toString(), options.getLanguage().name(), options.getCountry().getAlpha2()); Future<Document> futureCombined = compSvcImdb.submit(worker); // worker for imdb request (/plotsummary) (from chosen site) Future<Document> futurePlotsummary = null; sb = new StringBuilder(imdbSite.getSite()); sb.append("title/"); sb.append(imdbId); sb.append("/plotsummary"); worker = new ImdbWorker(sb.toString(), options.getLanguage().name(), options.getCountry().getAlpha2()); futurePlotsummary = compSvcImdb.submit(worker); // worker for tmdb request Future<MediaMetadata> futureTmdb = null; if (options.isScrapeImdbForeignLanguage() || 
options.isScrapeCollectionInfo()) { Callable<MediaMetadata> worker2 = new TmdbWorker(imdbId, options.getLanguage(), options.getCountry()); futureTmdb = compSvcTmdb.submit(worker2); } Document doc; doc = futureCombined.get(); /* * title and year have the following structure * * <div id="tn15title"><h1>Merida - Legende der Highlands <span>(<a href="/year/2012/">2012</a>) <span class="pro-link">...</span> <span * class="title-extra">Brave <i>(original title)</i></span> </span></h1> </div> */ // parse title and year Element title = doc.getElementById("tn15title"); if (title != null) { Element element = null; // title Elements elements = title.getElementsByTag("h1"); if (elements.size() > 0) { element = elements.first(); String movieTitle = cleanString(element.ownText()); md.storeMetadata(MediaMetadata.TITLE, movieTitle); } // year elements = title.getElementsByTag("span"); if (elements.size() > 0) { element = elements.first(); String content = element.text(); // search year Pattern yearPattern = Pattern.compile("\\(([0-9]{4})|/\\)"); Matcher matcher = yearPattern.matcher(content); while (matcher.find()) { if (matcher.group(1) != null) { String movieYear = matcher.group(1); md.storeMetadata(MediaMetadata.YEAR, movieYear); break; } } } // original title elements = title.getElementsByAttributeValue("class", "title-extra"); if (elements.size() > 0) { element = elements.first(); String content = element.text(); content = content.replaceAll("\\(original title\\)", "").trim(); md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, content); } } // poster Element poster = doc.getElementById("primary-poster"); if (poster != null) { String posterUrl = poster.attr("src"); posterUrl = posterUrl.replaceAll("SX[0-9]{2,4}_", "SX400_"); posterUrl = posterUrl.replaceAll("SY[0-9]{2,4}_", "SY400_"); processMediaArt(md, MediaArtworkType.POSTER, "Poster", posterUrl); } /* * <div class="starbar-meta"> <b>7.4/10</b> <a href="ratings" class="tn15more">52,871 votes</a> » </div> */ // rating and rating 
count Element ratingElement = doc.getElementById("tn15rating"); if (ratingElement != null) { Elements elements = ratingElement.getElementsByClass("starbar-meta"); if (elements.size() > 0) { Element div = elements.get(0); // rating comes in <b> tag Elements b = div.getElementsByTag("b"); if (b.size() == 1) { String ratingAsString = b.text(); Pattern ratingPattern = Pattern.compile("([0-9]\\.[0-9])/10"); Matcher matcher = ratingPattern.matcher(ratingAsString); while (matcher.find()) { if (matcher.group(1) != null) { float rating = 0; try { rating = Float.valueOf(matcher.group(1)); } catch (Exception e) { } md.storeMetadata(MediaMetadata.RATING, rating); break; } } } // count Elements a = div.getElementsByAttributeValue("href", "ratings"); if (a.size() == 1) { String countAsString = a.text().replaceAll("[.,]|votes", "").trim(); int voteCount = 0; try { voteCount = Integer.parseInt(countAsString); } catch (Exception e) { } md.storeMetadata(MediaMetadata.VOTE_COUNT, voteCount); } } // top250 elements = ratingElement.getElementsByClass("starbar-special"); if (elements.size() > 0) { Elements a = elements.get(0).getElementsByTag("a"); if (a.size() > 0) { Element anchor = a.get(0); Pattern topPattern = Pattern.compile("Top 250: #([0-9]{1,3})"); Matcher matcher = topPattern.matcher(anchor.ownText()); while (matcher.find()) { if (matcher.group(1) != null) { int top250 = 0; try { top250 = Integer.parseInt(matcher.group(1)); } catch (Exception e) { } md.storeMetadata(MediaMetadata.TOP_250, top250); } } } } } // parse all items coming by <div class="info"> Elements elements = doc.getElementsByClass("info"); for (Element element : elements) { // only parse divs if (!"div".equals(element.tag().getName())) { continue; } // elements with h5 are the titles of the values Elements h5 = element.getElementsByTag("h5"); if (h5.size() > 0) { Element firstH5 = h5.first(); String h5Title = firstH5.text(); // release date /* * <div class="info"><h5>Release Date:</h5><div 
class="info-content">5 January 1996 (USA)<a class="tn15more inline" * href="/title/tt0114746/releaseinfo" * onclick="(new Image()).src='/rg/title-tease/releasedates/images/b.gif?link=/title/tt0114746/releaseinfo';"> See more</a> </div></div> */ if (h5Title.matches("(?i)" + ImdbSiteDefinition.IMDB_COM.getReleaseDate() + ".*")) { Elements div = element.getElementsByClass("info-content"); if (div.size() > 0) { Element releaseDateElement = div.first(); String releaseDate = cleanString(releaseDateElement.ownText().replaceAll("", "")); Pattern pattern = Pattern.compile("(.*)\\(.*\\)"); Matcher matcher = pattern.matcher(releaseDate); if (matcher.find()) { try { SimpleDateFormat sdf = new SimpleDateFormat("d MMM yyyy"); Date parsedDate = sdf.parse(matcher.group(1)); sdf = new SimpleDateFormat("dd-MM-yyyy"); md.storeMetadata(MediaMetadata.RELEASE_DATE, sdf.format(parsedDate)); } catch (Exception e) { } } } } /* * <div class="info"><h5>Tagline:</h5><div class="info-content"> (7) To Defend Us... <a class="tn15more inline" * href="/title/tt0472033/taglines" onClick= "(new Image()).src='/rg/title-tease/taglines/images/b.gif?link=/title/tt0472033/taglines';" >See * more</a> » </div></div> */ // tagline if (h5Title.matches("(?i)" + ImdbSiteDefinition.IMDB_COM.getTagline() + ".*") && !options.isScrapeImdbForeignLanguage()) { Elements div = element.getElementsByClass("info-content"); if (div.size() > 0) { Element taglineElement = div.first(); String tagline = cleanString(taglineElement.ownText().replaceAll("", "")); md.storeMetadata(MediaMetadata.TAGLINE, tagline); } } /* * <div class="info-content"><a href="/Sections/Genres/Animation/">Animation</a> | <a href="/Sections/Genres/Action/">Action</a> | <a * href="/Sections/Genres/Adventure/">Adventure</a> | <a href="/Sections/Genres/Fantasy/">Fantasy</a> | <a * href="/Sections/Genres/Mystery/">Mystery</a> | <a href="/Sections/Genres/Sci-Fi/">Sci-Fi</a> | <a * href="/Sections/Genres/Thriller/">Thriller</a> <a class="tn15more inline" 
href="/title/tt0472033/keywords" onClick= * "(new Image()).src='/rg/title-tease/keywords/images/b.gif?link=/title/tt0472033/keywords';" > See more</a> » </div> */ // genres are only scraped from akas.imdb.com if (h5Title.matches("(?i)" + imdbSite.getGenre() + "(.*)")) { Elements div = element.getElementsByClass("info-content"); if (div.size() > 0) { Elements a = div.first().getElementsByTag("a"); for (Element anchor : a) { if (anchor.attr("href").matches("/Sections/Genres/.*")) { md.addGenre(getTmmGenre(anchor.ownText())); } } } } // } /* * <div class="info"><h5>Runtime:</h5><div class="info-content">162 min | 171 min (special edition) | 178 min (extended cut)</div></div> */ // runtime // if (h5Title.matches("(?i)" + imdbSite.getRuntime() + ".*")) { if (h5Title.matches("(?i)" + ImdbSiteDefinition.IMDB_COM.getRuntime() + ".*")) { Elements div = element.getElementsByClass("info-content"); if (div.size() > 0) { Element taglineElement = div.first(); String first = taglineElement.ownText().split("\\|")[0]; String runtimeAsString = cleanString(first.replaceAll("min", "")); int runtime = 0; try { runtime = Integer.parseInt(runtimeAsString); } catch (Exception e) { // try to filter out the first number we find Pattern runtimePattern = Pattern.compile("([0-9]{2,3})"); Matcher matcher = runtimePattern.matcher(runtimeAsString); if (matcher.find()) { runtime = Integer.parseInt(matcher.group(0)); } } md.storeMetadata(MediaMetadata.RUNTIME, runtime); } } /* * <div class="info"><h5>Country:</h5><div class="info-content"><a href="/country/fr">France</a> | <a href="/country/es">Spain</a> | <a * href="/country/it">Italy</a> | <a href="/country/hu">Hungary</a></div></div> */ // country if (h5Title.matches("(?i)Country.*")) { Elements a = element.getElementsByTag("a"); String countries = ""; for (Element anchor : a) { Pattern pattern = Pattern.compile("/country/(.*)"); Matcher matcher = pattern.matcher(anchor.attr("href")); if (matcher.matches()) { String country = matcher.group(1); 
if (StringUtils.isNotEmpty(countries)) { countries += ", "; } countries += country.toUpperCase(); } } md.storeMetadata(MediaMetadata.COUNTRY, countries); } /* * <div class="info"><h5>Language:</h5><div class="info-content"><a href="/language/en">English</a> | <a href="/language/de">German</a> | <a * href="/language/fr">French</a> | <a href="/language/it">Italian</a></div> */ // Spoken languages if (h5Title.matches("(?i)Language.*")) { Elements a = element.getElementsByTag("a"); String spokenLanguages = ""; for (Element anchor : a) { Pattern pattern = Pattern.compile("/language/(.*)"); Matcher matcher = pattern.matcher(anchor.attr("href")); if (matcher.matches()) { String langu = matcher.group(1); if (StringUtils.isNotEmpty(spokenLanguages)) { spokenLanguages += ", "; } spokenLanguages += langu; } } md.storeMetadata(MediaMetadata.SPOKEN_LANGUAGES, spokenLanguages); } /* * <div class="info"><h5>Certification:</h5><div class="info-content"><a href="/search/title?certificates=us:pg">USA:PG</a> <i>(certificate * #47489)</i> | <a href="/search/title?certificates=ca:pg">Canada:PG</a> <i>(Ontario)</i> | <a * href="/search/title?certificates=au:pg">Australia:PG</a> | <a href="/search/title?certificates=in:u">India:U</a> | <a * href="/search/title?certificates=ie:pg">Ireland:PG</a> ...</div></div> */ // certification // if (h5Title.matches("(?i)" + imdbSite.getCertification() + ".*")) { if (h5Title.matches("(?i)" + ImdbSiteDefinition.IMDB_COM.getCertification() + ".*")) { Elements a = element.getElementsByTag("a"); for (Element anchor : a) { // certification for the right country if (anchor.attr("href").matches( "(?i)/search/title\\?certificates=" + options.getCountry().getAlpha2() + ".*")) { Pattern certificationPattern = Pattern.compile(".*:(.*)"); Matcher matcher = certificationPattern.matcher(anchor.ownText()); Certification certification = null; while (matcher.find()) { if (matcher.group(1) != null) { certification = Certification.getCertification(options.getCountry(), 
matcher.group(1)); } } if (certification != null) { md.addCertification(certification); break; } } } } } /* * <div id="director-info" class="info"> <h5>Director:</h5> <div class="info-content"><a href="/name/nm0000416/" onclick= * "(new Image()).src='/rg/directorlist/position-1/images/b.gif?link=name/nm0000416/';" >Terry Gilliam</a><br/> </div> </div> */ // director if ("director-info".equals(element.id())) { Elements a = element.getElementsByTag("a"); for (Element anchor : a) { if (anchor.attr("href").matches("/name/nm.*")) { MediaCastMember cm = new MediaCastMember(CastType.DIRECTOR); cm.setName(anchor.ownText()); md.addCastMember(cm); } } } } /* * <table class="cast"> <tr class="odd"><td class="hs"><a href="http://pro.imdb.com/widget/resume_redirect/" onClick= * "(new Image()).src='/rg/resume/prosystem/images/b.gif?link=http://pro.imdb.com/widget/resume_redirect/';" ><img src= * "http://i.media-imdb.com/images/SF9113d6f5b7cb1533c35313ccd181a6b1/tn15/no_photo.png" width="25" height="31" border="0"></td><td class="nm"><a * href="/name/nm0577828/" onclick= "(new Image()).src='/rg/castlist/position-1/images/b.gif?link=/name/nm0577828/';" >Joseph Melito</a></td><td * class="ddd"> ... </td><td class="char"><a href="/character/ch0003139/">Young Cole</a></td></tr> <tr class="even"><td class="hs"><a * href="/name/nm0000246/" onClick= "(new Image()).src='/rg/title-tease/tinyhead/images/b.gif?link=/name/nm0000246/';" ><img src= * "http://ia.media-imdb.com/images/M/MV5BMjA0MjMzMTE5OF5BMl5BanBnXkFtZTcwMzQ2ODE3Mw@@._V1._SY30_SX23_.jpg" width="23" height="32" * border="0"></a><br></td><td class="nm"><a href="/name/nm0000246/" onclick= * "(new Image()).src='/rg/castlist/position-2/images/b.gif?link=/name/nm0000246/';" >Bruce Willis</a></td><td class="ddd"> ... 
</td><td * class="char"><a href="/character/ch0003139/">James Cole</a></td></tr> <tr class="odd"><td class="hs"><a href="/name/nm0781218/" onClick= * "(new Image()).src='/rg/title-tease/tinyhead/images/b.gif?link=/name/nm0781218/';" ><img src= * "http://ia.media-imdb.com/images/M/MV5BODI1MTA2MjkxM15BMl5BanBnXkFtZTcwMTcwMDg2Nw@@._V1._SY30_SX23_.jpg" width="23" height="32" * border="0"></a><br></td><td class="nm"><a href="/name/nm0781218/" onclick= * "(new Image()).src='/rg/castlist/position-3/images/b.gif?link=/name/nm0781218/';" >Jon Seda</a></td><td class="ddd"> ... </td><td * class="char"><a href="/character/ch0003143/">Jose</a></td></tr>...</table> */ // cast elements = doc.getElementsByClass("cast"); if (elements.size() > 0) { Elements tr = elements.get(0).getElementsByTag("tr"); for (Element row : tr) { Elements td = row.getElementsByTag("td"); MediaCastMember cm = new MediaCastMember(); for (Element column : td) { // actor thumb if (column.hasClass("hs")) { Elements img = column.getElementsByTag("img"); if (img.size() > 0) { String thumbUrl = img.get(0).attr("src"); if (thumbUrl.contains("no_photo.png")) { cm.setImageUrl(""); } else { thumbUrl = thumbUrl.replaceAll("SX[0-9]{2,4}_", "SX400_"); thumbUrl = thumbUrl.replaceAll("SY[0-9]{2,4}_", ""); cm.setImageUrl(thumbUrl); } } } // actor name if (column.hasClass("nm")) { cm.setName(cleanString(column.text())); } // character if (column.hasClass("char")) { cm.setCharacter(cleanString(column.text())); } } if (StringUtils.isNotEmpty(cm.getName()) && StringUtils.isNotEmpty(cm.getCharacter())) { cm.setType(CastType.ACTOR); md.addCastMember(cm); } } } Element content = doc.getElementById("tn15content"); if (content != null) { elements = content.getElementsByTag("table"); for (Element table : elements) { // writers if (table.text().contains(ImdbSiteDefinition.IMDB_COM.getWriter())) { Elements anchors = table.getElementsByTag("a"); for (Element anchor : anchors) { if (anchor.attr("href").matches("/name/nm.*")) { 
MediaCastMember cm = new MediaCastMember(CastType.WRITER); cm.setName(anchor.ownText()); md.addCastMember(cm); } } } // producers if (table.text().contains(ImdbSiteDefinition.IMDB_COM.getProducers())) { Elements rows = table.getElementsByTag("tr"); for (Element row : rows) { if (row.text().contains(ImdbSiteDefinition.IMDB_COM.getProducers())) { continue; } Elements columns = row.children(); if (columns.size() == 0) { continue; } MediaCastMember cm = new MediaCastMember(CastType.PRODUCER); String name = cleanString(columns.get(0).text()); if (StringUtils.isBlank(name)) { continue; } cm.setName(name); if (columns.size() >= 3) { cm.setPart(cleanString(columns.get(2).text())); } md.addCastMember(cm); } } } } // Production companies elements = doc.getElementsByClass("blackcatheader"); for (Element blackcatheader : elements) { if (blackcatheader.ownText().equals(ImdbSiteDefinition.IMDB_COM.getProductionCompanies())) { Elements a = blackcatheader.nextElementSibling().getElementsByTag("a"); StringBuilder productionCompanies = new StringBuilder(); for (Element anchor : a) { if (StringUtils.isNotEmpty(productionCompanies)) { productionCompanies.append(", "); } productionCompanies.append(anchor.ownText()); } md.storeMetadata(MediaMetadata.PRODUCTION_COMPANY, productionCompanies.toString()); break; } } /* * plot from /plotsummary */ // build the url doc = null; doc = futurePlotsummary.get(); // imdb.com has another site structure if (imdbSite == ImdbSiteDefinition.IMDB_COM) { Elements zebraList = doc.getElementsByClass("zebraList"); if (zebraList != null && !zebraList.isEmpty()) { Elements odd = zebraList.get(0).getElementsByClass("odd"); if (odd.isEmpty()) { odd = zebraList.get(0).getElementsByClass("even"); // sometimes imdb has even } if (odd.size() > 0) { Elements p = odd.get(0).getElementsByTag("p"); if (p.size() > 0) { String plot = cleanString(p.get(0).ownText()); md.storeMetadata(MediaMetadata.PLOT, plot); } } } } else { Element wiki = doc.getElementById("swiki.2.1"); 
if (wiki != null) { String plot = cleanString(wiki.ownText()); md.storeMetadata(MediaMetadata.PLOT, plot); } } // title also from chosen site if we are not scraping akas.imdb.com if (imdbSite != ImdbSiteDefinition.IMDB_COM) { title = doc.getElementById("tn15title"); if (title != null) { Element element = null; // title elements = title.getElementsByClass("main"); if (elements.size() > 0) { element = elements.first(); String movieTitle = cleanString(element.ownText()); md.storeMetadata(MediaMetadata.TITLE, movieTitle); } } } // } // get data from tmdb? if (options.isScrapeImdbForeignLanguage() || options.isScrapeCollectionInfo()) { MediaMetadata tmdbMd = futureTmdb.get(); if (options.isScrapeImdbForeignLanguage() && tmdbMd != null && StringUtils.isNotBlank(tmdbMd.getStringValue(MediaMetadata.PLOT))) { // tmdbid md.setId(MediaMetadata.TMDBID, tmdbMd.getId(MediaMetadata.TMDBID)); // title md.storeMetadata(MediaMetadata.TITLE, tmdbMd.getStringValue(MediaMetadata.TITLE)); // original title md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, tmdbMd.getStringValue(MediaMetadata.ORIGINAL_TITLE)); // tagline md.storeMetadata(MediaMetadata.TAGLINE, tmdbMd.getStringValue(MediaMetadata.TAGLINE)); // plot md.storeMetadata(MediaMetadata.PLOT, tmdbMd.getStringValue(MediaMetadata.PLOT)); // collection info md.storeMetadata(MediaMetadata.COLLECTION_NAME, tmdbMd.getStringValue(MediaMetadata.COLLECTION_NAME)); md.storeMetadata(MediaMetadata.TMDBID_SET, tmdbMd.getIntegerValue(MediaMetadata.TMDBID_SET)); } if (options.isScrapeCollectionInfo() && tmdbMd != null) { md.storeMetadata(MediaMetadata.TMDBID_SET, tmdbMd.getIntegerValue(MediaMetadata.TMDBID_SET)); md.storeMetadata(MediaMetadata.COLLECTION_NAME, tmdbMd.getStringValue(MediaMetadata.COLLECTION_NAME)); } } // if we have still no original title, take the title if (StringUtils.isBlank(md.getStringValue(MediaMetadata.ORIGINAL_TITLE))) { md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, md.getStringValue(MediaMetadata.TITLE)); } return md; }
From source file:org.tinymediamanager.scraper.imdb.ImdbMovieParser.java
private MediaMetadata parseReleaseinfoPageAKAs(Document doc, MediaScrapeOptions options, MediaMetadata md) { // <table id="akas" class="subpage_data spEven2Col"> // <tr class="even"> // <td>(original title)</td> // <td>Intouchables</td> // </tr> // need to search all tables for correct ID, since the UNIQUE id is used multiple times - thanks for nothing :p for (Element table : doc.getElementsByTag("table")) { if (table.id().equalsIgnoreCase("akas")) { Elements rows = table.getElementsByTag("tr"); for (Element row : rows) { Element c1 = row.getElementsByTag("td").get(0); Element c2 = row.getElementsByTag("td").get(1); if (c1 != null && c1.text().toLowerCase(Locale.ROOT).contains("original title")) { md.setOriginalTitle(c2.text()); break; }//w w w. j a v a 2 s . c o m } } } // alternative; new way with table classes // <tr class="ipl-zebra-list__item aka-item"> // <td class="aka-item__name">Germany</td> // <td class="aka-item__title">Avatar - Aufbruch nach Pandora</td> // </tr> if (md.getOriginalTitle().isEmpty()) { Elements rows = doc.getElementsByClass("aka-item"); for (Element row : rows) { Element country = row.getElementsByClass("aka-item__name").first(); Element title = row.getElementsByClass("aka-item__title").first(); if (country != null && country.text().toLowerCase(Locale.ROOT).contains("original title")) { md.setOriginalTitle(title.text()); break; } } } return md; }
From source file:org.tinymediamanager.scraper.imdb.ImdbParser.java
protected MediaMetadata parsePlotsummaryPage(Document doc, MediaScrapeOptions options, MediaMetadata md) { // imdb.com has another site structure if (getImdbSite() == ImdbSiteDefinition.IMDB_COM) { // first check synopsis content // Element zebraList = doc.getElementById("plot-synopsis-content"); // if (zebraList != null) { // Elements p = zebraList.getElementsByClass("ipl-zebra-list__item"); // if (!p.isEmpty()) { // Element em = p.get(0); // if (!"no-synopsis-content".equals(em.id())) { // String plot = cleanString(em.text()); // md.setPlot(plot); // }/* ww w . j a va 2s . c o m*/ // } // } // NOPE: synopsis contains spoilers // just take first summary // <li class="ipl-zebra-list__item" id="summary-ps21700000"> // <p>text text text text </p> // <div class="author-container"> // <em>—<a href="/search/title?plot_author=author">Author Name</a></em> // </div> // </li> Element zebraList = doc.getElementById("plot-summaries-content"); if (zebraList != null) { Elements p = zebraList.getElementsByClass("ipl-zebra-list__item"); if (!p.isEmpty()) { Element em = p.get(0); // remove author Elements authors = em.getElementsByClass("author-container"); if (!authors.isEmpty()) { authors.get(0).remove(); } if (!"no-summary-content".equals(em.id())) { String plot = cleanString(em.text()); md.setPlot(plot); } } } } else { Element wiki = doc.getElementById("swiki.2.1"); if (wiki != null) { String plot = cleanString(wiki.ownText()); md.setPlot(plot); } } return md; }