List of usage examples for org.jsoup.nodes.Element.getElementsByTag
public Elements getElementsByTag(String tagName)
From source file:org.tinymediamanager.scraper.zelluloid.ZelluloidMetadataProvider.java
@Override public List<MediaSearchResult> search(MediaSearchOptions options) throws Exception { LOGGER.debug("search() " + options.toString()); List<MediaSearchResult> resultList = new ArrayList<MediaSearchResult>(); String searchUrl = ""; String searchTerm = ""; String imdb = ""; // only title search if (StringUtils.isNotEmpty(options.get(MediaSearchOptions.SearchParam.QUERY))) { searchTerm = cleanSearch(options.get(MediaSearchOptions.SearchParam.QUERY)); searchUrl = BASE_URL + "/suche/index.php3?qstring=" + URLEncoder.encode(searchTerm, "UTF-8"); LOGGER.debug("search for everything: " + searchTerm); } else if (StringUtils.isNotEmpty(options.get(MediaSearchOptions.SearchParam.TITLE))) { searchTerm = cleanSearch(options.get(MediaSearchOptions.SearchParam.TITLE)); searchUrl = BASE_URL + "/suche/index.php3?qstring=" + URLEncoder.encode(searchTerm, "UTF-8"); LOGGER.debug("search with title: " + searchTerm); } else {/*from w ww . j ava2 s . c om*/ LOGGER.debug("empty searchString"); return resultList; } searchTerm = MetadataUtil.removeNonSearchCharacters(searchTerm); Document doc = null; try { Url url = new CachedUrl(searchUrl); InputStream in = url.getInputStream(); doc = Jsoup.parse(in, PAGE_ENCODING, ""); in.close(); } catch (Exception e) { LOGGER.error("failed to search for " + searchTerm + ": " + e.getMessage()); // clear cache CachedUrl.removeCachedFileForUrl(searchUrl); } if (doc == null) { return resultList; } // only look for movie links Elements filme = doc.getElementsByAttributeValueStarting("href", "hit.php"); LOGGER.debug("found " + filme.size() + " search results"); if (filme.isEmpty()) { if (!doc.getElementsByTag("title").text().contains("Suche nach")) { // redirected to detail page MediaSearchResult msr = new MediaSearchResult(providerInfo.getId()); Elements el = doc.getElementsByAttributeValueStarting("href", "index.php3?id="); if (el.size() > 0) { msr.setId(StrgUtils.substr(el.get(0).attr("href"), "id=(\\d+)")); } 
msr.setTitle(StrgUtils.substr(doc.getElementsByTag("title").text(), "(.*?)\\|").trim()); el = doc.getElementsByAttributeValueContaining("href", "az.php3?j="); if (el.size() == 1) { msr.setYear(el.get(0).text()); } resultList.add(msr); } return resultList; } // <a // href="hit.php3?hit=d6900d7d9baf66ba77d8e59cc425da9e-movie-7614-17114331-1" // class="normLight">Avatar - Aufbruch nach Pandora</B> // <nobr>(2009)</nobr><br /><span class="smallLight" // style="color:#ccc;">Avatar</span></a> // map to merge 2 results :/ Map<String, MediaSearchResult> res = new HashMap<String, MediaSearchResult>(); for (Element a : filme) { try { String id = StrgUtils.substr(a.attr("href"), "-movie-(.*?)-"); MediaSearchResult sr = new MediaSearchResult(providerInfo.getId()); if (res.containsKey(id)) { LOGGER.debug("dupe found; merging with previous searchresult"); sr = res.get(id); } if (StringUtils.isNotEmpty(imdb)) { sr.setIMDBId(imdb); } if (StringUtils.isEmpty(sr.getId())) { sr.setId(id); } if (StringUtils.isEmpty(sr.getTitle())) { if (a.html().contains("nobr")) { sr.setTitle(a.ownText()); } else { sr.setTitle(a.text()); } } LOGGER.debug("found movie " + sr.getTitle()); if (StringUtils.isEmpty(sr.getOriginalTitle())) { sr.setOriginalTitle(a.getElementsByTag("span").text()); } if (StringUtils.isEmpty(sr.getYear())) { sr.setYear(StrgUtils.substr(a.getElementsByTag("nobr").text(), ".*(\\d{4}).*")); // any // 4 // digit } sr.setMediaType(MediaType.MOVIE); sr.setUrl(BASE_URL + "/filme/index.php3?id=" + id); // sr.setPosterUrl(BASE_URL + "/images" + StrgUtils.substr(a.toString(), // "images(.*?)\\"")); if (imdb.equals(sr.getIMDBId())) { // perfect match sr.setScore(1); } else { // compare score based on names sr.setScore(MetadataUtil.calculateScore(searchTerm, sr.getTitle())); } // populate extra args MetadataUtil.copySearchQueryToSearchResult(options, sr); res.put(id, sr); } catch (Exception e) { LOGGER.warn("error parsing movie result: " + e.getMessage()); } } for (String r : 
res.keySet()) { resultList.add(res.get(r)); } Collections.sort(resultList); Collections.reverse(resultList); return resultList; }
From source file:org.xlrnet.metadict.engines.leo.LeoEngine.java
/** * Process the content contents of a single entry node. The entry node is the root-node for a single dictionary * entry./*from w w w . j av a2 s . c o m*/ * * @param entryNode * @param resultBuilder * @param fallbackEntryType */ private void processEntryNode(@NotNull Element entryNode, @NotNull EngineQueryResultBuilder resultBuilder, @NotNull EntryType fallbackEntryType) { // Try to determine the entry type again EntryType entryType = fallbackEntryType; Element category = entryNode.getElementsByTag("category").first(); if (category != null) { entryType = resolveSectionType(category.attr("type")); if (entryType == EntryType.UNKNOWN) entryType = fallbackEntryType; } // Process each side separately Elements sideNodes = entryNode.getElementsByTag("side"); Element leftSide = sideNodes.get(0); Element rightSide = sideNodes.get(1); DictionaryObject leftObject = processSideNode(leftSide, entryType); DictionaryObject rightObject = processSideNode(rightSide, entryType); // Build the final DictionaryEntry resultBuilder.addEntry(new DictionaryEntryBuilder().setEntryType(entryType).setInputObject(leftObject) .setOutputObject(rightObject).build()); }
From source file:org.xlrnet.metadict.engines.leo.LeoEngine.java
/**
 * Processes every {@code entry} element below the given section node.
 * <p>
 * The section's name attribute is resolved to an entry type which is passed on as the fallback type for each entry.
 *
 * @param sectionNode
 *         The section node whose entries should be processed.
 * @param resultBuilder
 *         The builder that receives the processed entries.
 */
private void processSection(Element sectionNode, EngineQueryResultBuilder resultBuilder) {
    EntryType sectionEntryType = resolveSectionType(sectionNode.attr(SECTION_NAME_ATTRIBUTE));

    Elements entries = sectionNode.getElementsByTag("entry");
    for (Element entry : entries) {
        processEntryNode(entry, resultBuilder, sectionEntryType);
    }
}
From source file:org.xlrnet.metadict.engines.leo.LeoEngine.java
private DictionaryObject processSideNode(Element side, EntryType entryType) { DictionaryObjectBuilder dictionaryObjectBuilder = new DictionaryObjectBuilder(); // Extract general form: String generalForm = side.getElementsByTag("word").first().text(); // Extract language: String languageIdentifier = side.attr("lang"); if ("ch".equals(languageIdentifier)) languageIdentifier = "cn"; Language language = Language.getExistingLanguageById(languageIdentifier); // Extract representation value: String representation = side.getElementsByTag("repr").text(); if (!StringUtils.equals(generalForm, representation)) dictionaryObjectBuilder.setDescription(representation); // Test for domain specific content: String domain = extractDomainString(representation); if (StringUtils.isNotEmpty(domain)) dictionaryObjectBuilder.setDomain(domain); // Test for abbreviation String abbreviation = extractAbbreviationString(representation); if (StringUtils.isNotEmpty(abbreviation)) dictionaryObjectBuilder.setAbbreviation(abbreviation); // Process additional forms (e.g. verb tenses): processAdditionalForms(entryType, dictionaryObjectBuilder, language, representation); return dictionaryObjectBuilder.setGeneralForm(generalForm).setLanguage(language).build(); }
From source file:org.xlrnet.metadict.engines.leo.LeoEngine.java
private void processSimilarities(@Nullable Element similarityNode, @NotNull EngineQueryResultBuilder engineQueryResultBuilder) { if (similarityNode == null) { LOGGER.warn("Couldn't find similarity node"); return;//from w w w .ja v a 2s .c o m } Elements sides = similarityNode.getElementsByTag("side"); for (Element side : sides) { Language sideLanguage = Language.getExistingLanguageById(side.attr("lang")); for (Element word : side.getElementsByTag("word")) { String wordText = word.text(); engineQueryResultBuilder.addSimilarRecommendation( new DictionaryObjectBuilder().setLanguage(sideLanguage).setGeneralForm(wordText).build()); } } }
From source file:org.xlrnet.metadict.engines.nobordbok.OrdbokEngine.java
/**
 * Processes all rows of a word table except the first one and adds every entry that could be extracted to the
 * result builder.
 * <p>
 * Tables with at most one row are skipped with a warning.
 *
 * @param table
 *         The table element to process.
 * @param language
 *         The language passed on to the row processing.
 * @param resultBuilder
 *         The builder that receives the extracted entries.
 */
private void processTable(@NotNull Element table, @NotNull Language language,
        @NotNull MonolingualQueryResultBuilder resultBuilder) {
    Elements rows = table.getElementsByTag("tr");
    int rowCount = rows.size();
    if (rowCount <= 1) {
        LOGGER.warn("Word table has unexpected size {}", rows.size());
        return;
    }

    // Row 0 is skipped; iteration starts with the second row
    for (Element row : rows.subList(1, rowCount)) {
        Optional<MonolingualEntry> extracted = processTableRow(row, language);
        if (!extracted.isPresent()) {
            continue;
        }
        resultBuilder.addMonolingualEntry(extracted.get());
    }
}
From source file:org.xlrnet.metadict.engines.woxikon.WoxikonEngine.java
private void processTranslationTable(@NotNull String queryString, @NotNull Document document, @NotNull BilingualQueryResultBuilder resultBuilder, @NotNull Language sourceLanguage, @NotNull Language targetLanguage) { // Find main table (german to X) String languageIdentifier = sourceLanguage.getIdentifier().toLowerCase() + "-" + targetLanguage.getIdentifier().toLowerCase(); Element translationTable = document.getElementById("dictionary-" + languageIdentifier); // Process the main table with its entries if (translationTable != null) { // Find all relevant entries, filter them by class and process them translationTable.getElementsByTag("tr").stream().filter(e -> e.getElementsByTag("th").size() == 0) .forEach(e -> processEntry(queryString, e, resultBuilder, sourceLanguage, targetLanguage)); // Extract synonyms Elements synonymTableCandidates = document.getElementsByClass("dictionary-synonyms-table"); if (synonymTableCandidates.size() > 0) { extractBilingualSynonyms(queryString, synonymTableCandidates.get(0), resultBuilder, sourceLanguage); }//from www .j av a 2 s. com } else { LOGGER.debug("Translation table for {} -> {} with query \"{}\" is null", languageIdentifier, targetLanguage.getIdentifier(), queryString); } }
From source file:org.xlrnet.metadict.engines.woxikon.WoxikonEngine.java
@NotNull private DictionaryObject processSingleNode(@NotNull Element element, @NotNull Language language, String queryString) {/*from w ww . j a va 2 s .c o m*/ DictionaryObjectBuilder objectBuilder = ImmutableDictionaryObject.builder(); objectBuilder.setLanguage(language); // Extract entry text: String context = StringUtils.substringBefore(element.text(), element.getElementsByTag("a").first().text()); String generalForm = context + element.getElementsByTag("a").first().text(); objectBuilder.setGeneralForm(StringUtils.strip(generalForm)); // Extract description: extractDescription(element, queryString, objectBuilder); // Extract gender: extractGender(element, objectBuilder); return objectBuilder.build(); }
From source file:org.xwiki.validator.HTML5DutchWebGuidelinesValidator.java
private void validateRpd1s3AboutForms() { // Form validation Elements formElements = getElements("form"); for (Element formElement : formElements) { // Look for either a submit input or an image input with the 'alt' attribute specified. // See http://www.w3.org/TR/WCAG10-HTML-TECHS/#forms-graphical-buttons boolean hasSubmit = false; boolean hasButtonSubmit = false; for (Element input : formElement.getElementsByTag(ELEM_INPUT)) { String type = input.attr(ATTR_TYPE); if (SUBMIT.equals(type) || (IMAGE.equals(type) && !StringUtils.isEmpty(input.attr(ATTR_ALT)))) { hasSubmit = true;//ww w .j a v a 2 s . c om break; } } for (Element button : formElement.getElementsByTag("button")) { if (!button.hasAttr(ATTR_TYPE) || SUBMIT.equals(button.attr(ATTR_TYPE))) { hasButtonSubmit = true; break; } } assertTrue(Type.ERROR, "rpd1s3.formSubmit", hasSubmit || hasButtonSubmit); } }
From source file:org.xwiki.validator.HTML5DutchWebGuidelinesValidator.java
/**
 * Checks that every {@code td} of the table has a {@code headers} attribute and every {@code th} has an
 * {@code id} attribute.
 *
 * @param table Table to analyze
 * @return true if no {@code td} lacks a {@code headers} attribute and no {@code th} lacks an {@code id} attribute
 */
private boolean hasTableHeadersAndIds(Element table)
{
    // Data cells are checked first, then header cells
    for (Element dataCell : table.getElementsByTag("td")) {
        boolean referencesHeaders = dataCell.hasAttr("headers");
        if (!referencesHeaders) {
            return false;
        }
    }

    for (Element headerCell : table.getElementsByTag("th")) {
        boolean hasId = headerCell.hasAttr("id");
        if (!hasId) {
            return false;
        }
    }

    return true;
}