Example usage for org.jsoup.nodes Element getElementsByTag

List of usage examples for org.jsoup.nodes Element getElementsByTag

Introduction

On this page you can find example usages of org.jsoup.nodes.Element.getElementsByTag.

Prototype

public Elements getElementsByTag(String tagName) 

Source Link

Document

Finds elements, including and recursively under this element, with the specified tag name.

Usage

From source file:org.tinymediamanager.scraper.zelluloid.ZelluloidMetadataProvider.java

/**
 * Searches the provider for movies matching the given options.
 * <p>
 * The QUERY search parameter is preferred; if it is empty, the TITLE parameter is used. If both are empty an
 * empty list is returned. When the result page contains no "hit.php" links and its title does not contain
 * "Suche nach", the page is treated as a detail page and a single result is built from it.
 *
 * @param options the search options; only the QUERY and TITLE parameters are read here
 * @return the search results, sorted by their natural order and then reversed; never {@code null}
 * @throws Exception declared by the interface; e.g. raised when the search term cannot be URL-encoded
 */
@Override
public List<MediaSearchResult> search(MediaSearchOptions options) throws Exception {
    LOGGER.debug("search() " + options.toString());
    List<MediaSearchResult> resultList = new ArrayList<MediaSearchResult>();
    String searchUrl = "";
    String searchTerm = "";
    // NOTE(review): imdb is never assigned in this method, so the IMDB handling below only
    // takes effect if getIMDBId() returns an empty string - confirm whether that is intended
    String imdb = "";

    // only title search
    if (StringUtils.isNotEmpty(options.get(MediaSearchOptions.SearchParam.QUERY))) {
        searchTerm = cleanSearch(options.get(MediaSearchOptions.SearchParam.QUERY));
        searchUrl = BASE_URL + "/suche/index.php3?qstring=" + URLEncoder.encode(searchTerm, "UTF-8");
        LOGGER.debug("search for everything: " + searchTerm);
    } else if (StringUtils.isNotEmpty(options.get(MediaSearchOptions.SearchParam.TITLE))) {
        searchTerm = cleanSearch(options.get(MediaSearchOptions.SearchParam.TITLE));
        searchUrl = BASE_URL + "/suche/index.php3?qstring=" + URLEncoder.encode(searchTerm, "UTF-8");
        LOGGER.debug("search with title: " + searchTerm);
    } else {
        LOGGER.debug("empty searchString");
        return resultList;
    }

    searchTerm = MetadataUtil.removeNonSearchCharacters(searchTerm);

    Document doc = null;
    try {
        Url url = new CachedUrl(searchUrl);
        InputStream in = url.getInputStream();
        try {
            doc = Jsoup.parse(in, PAGE_ENCODING, "");
        } finally {
            // always release the stream, even when parsing fails
            in.close();
        }
    } catch (Exception e) {
        LOGGER.error("failed to search for " + searchTerm + ": " + e.getMessage());

        // clear cache
        CachedUrl.removeCachedFileForUrl(searchUrl);
    }

    if (doc == null) {
        return resultList;
    }

    // only look for movie links
    Elements filme = doc.getElementsByAttributeValueStarting("href", "hit.php");
    LOGGER.debug("found " + filme.size() + " search results");
    if (filme.isEmpty()) {
        if (!doc.getElementsByTag("title").text().contains("Suche nach")) {
            // redirected to detail page
            MediaSearchResult msr = new MediaSearchResult(providerInfo.getId());
            Elements el = doc.getElementsByAttributeValueStarting("href", "index.php3?id=");
            if (!el.isEmpty()) {
                msr.setId(StrgUtils.substr(el.get(0).attr("href"), "id=(\\d+)"));
            }
            msr.setTitle(StrgUtils.substr(doc.getElementsByTag("title").text(), "(.*?)\\|").trim());
            el = doc.getElementsByAttributeValueContaining("href", "az.php3?j=");
            // the year is only taken when exactly one such link exists
            if (el.size() == 1) {
                msr.setYear(el.get(0).text());
            }
            resultList.add(msr);
        }
        return resultList;
    }

    // <a
    // href="hit.php3?hit=d6900d7d9baf66ba77d8e59cc425da9e-movie-7614-17114331-1"
    // class="normLight">Avatar - Aufbruch nach Pandora</B>
    // <nobr>(2009)</nobr><br /><span class="smallLight"
    // style="color:#ccc;">Avatar</span></a>

    // map to merge 2 results :/
    Map<String, MediaSearchResult> res = new HashMap<String, MediaSearchResult>();

    for (Element a : filme) {
        try {
            String id = StrgUtils.substr(a.attr("href"), "-movie-(.*?)-");
            MediaSearchResult sr = new MediaSearchResult(providerInfo.getId());
            if (res.containsKey(id)) {
                LOGGER.debug("dupe found; merging with previous searchresult");
                sr = res.get(id);
            }

            // fields already filled by an earlier link with the same id are kept
            if (StringUtils.isNotEmpty(imdb)) {
                sr.setIMDBId(imdb);
            }
            if (StringUtils.isEmpty(sr.getId())) {
                sr.setId(id);
            }
            if (StringUtils.isEmpty(sr.getTitle())) {
                if (a.html().contains("nobr")) {
                    // year and original title live in child tags; take only the link's own text
                    sr.setTitle(a.ownText());
                } else {
                    sr.setTitle(a.text());
                }
            }
            LOGGER.debug("found movie " + sr.getTitle());
            if (StringUtils.isEmpty(sr.getOriginalTitle())) {
                sr.setOriginalTitle(a.getElementsByTag("span").text());
            }
            if (StringUtils.isEmpty(sr.getYear())) {
                // any 4 digit
                sr.setYear(StrgUtils.substr(a.getElementsByTag("nobr").text(), ".*(\\d{4}).*"));
            }
            sr.setMediaType(MediaType.MOVIE);
            sr.setUrl(BASE_URL + "/filme/index.php3?id=" + id);
            // sr.setPosterUrl(BASE_URL + "/images" + StrgUtils.substr(a.toString(),
            // "images(.*?)\\&quot"));

            if (imdb.equals(sr.getIMDBId())) {
                // perfect match
                sr.setScore(1);
            } else {
                // compare score based on names
                sr.setScore(MetadataUtil.calculateScore(searchTerm, sr.getTitle()));
            }

            // populate extra args
            MetadataUtil.copySearchQueryToSearchResult(options, sr);
            res.put(id, sr);
        } catch (Exception e) {
            // a single broken entry must not abort the whole search
            LOGGER.warn("error parsing movie result: " + e.getMessage());
        }
    }
    resultList.addAll(res.values());
    Collections.sort(resultList);
    Collections.reverse(resultList);
    return resultList;
}

From source file:org.xlrnet.metadict.engines.leo.LeoEngine.java

/**
 * Process the content contents of a single entry node. The entry node is the root-node for a single dictionary
 * entry./*from w w  w . j av a2  s .  c o m*/
 *
 * @param entryNode
 * @param resultBuilder
 * @param fallbackEntryType
 */
private void processEntryNode(@NotNull Element entryNode, @NotNull EngineQueryResultBuilder resultBuilder,
        @NotNull EntryType fallbackEntryType) {
    // Try to determine the entry type again
    EntryType entryType = fallbackEntryType;
    Element category = entryNode.getElementsByTag("category").first();
    if (category != null) {
        entryType = resolveSectionType(category.attr("type"));
        if (entryType == EntryType.UNKNOWN)
            entryType = fallbackEntryType;
    }

    // Process each side separately
    Elements sideNodes = entryNode.getElementsByTag("side");

    Element leftSide = sideNodes.get(0);
    Element rightSide = sideNodes.get(1);

    DictionaryObject leftObject = processSideNode(leftSide, entryType);
    DictionaryObject rightObject = processSideNode(rightSide, entryType);

    // Build the final DictionaryEntry
    resultBuilder.addEntry(new DictionaryEntryBuilder().setEntryType(entryType).setInputObject(leftObject)
            .setOutputObject(rightObject).build());
}

From source file:org.xlrnet.metadict.engines.leo.LeoEngine.java

/**
 * Processes every {@code entry} element found under the given section node.
 *
 * @param sectionNode   the section element; its section-name attribute determines the fallback entry type
 * @param resultBuilder the builder handed on to the per-entry processing
 */
private void processSection(Element sectionNode, EngineQueryResultBuilder resultBuilder) {
    // The section-level type is used whenever an entry cannot supply its own.
    final EntryType sectionEntryType = resolveSectionType(sectionNode.attr(SECTION_NAME_ATTRIBUTE));
    final Elements entries = sectionNode.getElementsByTag("entry");

    for (Element entry : entries) {
        processEntryNode(entry, resultBuilder, sectionEntryType);
    }
}

From source file:org.xlrnet.metadict.engines.leo.LeoEngine.java

/**
 * Builds a dictionary object from one {@code side} element of an entry.
 *
 * @param side      the side element; its first {@code word} child supplies the general form
 * @param entryType the entry type, passed on to the additional-forms processing
 * @return the dictionary object built from the side element
 */
private DictionaryObject processSideNode(Element side, EntryType entryType) {
    final DictionaryObjectBuilder builder = new DictionaryObjectBuilder();

    // General form: text of the first "word" element
    final String generalForm = side.getElementsByTag("word").first().text();

    // Language: the identifier "ch" is mapped to "cn" before the lookup
    final String rawLanguageId = side.attr("lang");
    final Language language = Language.getExistingLanguageById("ch".equals(rawLanguageId) ? "cn" : rawLanguageId);

    // Representation: only stored as description when it differs from the general form
    final String representation = side.getElementsByTag("repr").text();
    if (!StringUtils.equals(generalForm, representation)) {
        builder.setDescription(representation);
    }

    // Domain specific content
    final String domain = extractDomainString(representation);
    if (StringUtils.isNotEmpty(domain)) {
        builder.setDomain(domain);
    }

    // Abbreviation
    final String abbreviation = extractAbbreviationString(representation);
    if (StringUtils.isNotEmpty(abbreviation)) {
        builder.setAbbreviation(abbreviation);
    }

    // Additional forms (e.g. verb tenses)
    processAdditionalForms(entryType, builder, language, representation);

    return builder.setGeneralForm(generalForm).setLanguage(language).build();
}

From source file:org.xlrnet.metadict.engines.leo.LeoEngine.java

/**
 * Adds one similar-word recommendation for every {@code word} element found below the {@code side}
 * elements of the given node.
 *
 * @param similarityNode           the node holding the similarity data; a warning is logged and nothing
 *                                 is added when it is {@code null}
 * @param engineQueryResultBuilder the builder that receives the recommendations
 */
private void processSimilarities(@Nullable Element similarityNode,
        @NotNull EngineQueryResultBuilder engineQueryResultBuilder) {
    if (similarityNode == null) {
        LOGGER.warn("Couldn't find similarity node");
        return;
    }

    for (Element sideNode : similarityNode.getElementsByTag("side")) {
        // All words below one side share that side's language.
        final Language language = Language.getExistingLanguageById(sideNode.attr("lang"));
        final Elements wordNodes = sideNode.getElementsByTag("word");

        for (Element wordNode : wordNodes) {
            final DictionaryObject recommendation = new DictionaryObjectBuilder().setLanguage(language)
                    .setGeneralForm(wordNode.text()).build();
            engineQueryResultBuilder.addSimilarRecommendation(recommendation);
        }
    }
}

From source file:org.xlrnet.metadict.engines.nobordbok.OrdbokEngine.java

/**
 * Converts the rows of a word table into monolingual entries.
 * <p>
 * The first row is skipped (presumably a header row - confirm against the page markup). A table with
 * at most one row is reported with a warning and ignored.
 *
 * @param table         the table element whose {@code tr} rows are processed
 * @param language      the language passed on to the row processing
 * @param resultBuilder the builder that receives every entry a row yields
 */
private void processTable(@NotNull Element table, @NotNull Language language,
        @NotNull MonolingualQueryResultBuilder resultBuilder) {
    final Elements rows = table.getElementsByTag("tr");
    final int rowCount = rows.size();

    if (rowCount < 2) {
        LOGGER.warn("Word table has unexpected size {}", rowCount);
        return;
    }

    // Start behind the first row.
    for (Element row : rows.subList(1, rowCount)) {
        final Optional<MonolingualEntry> parsedRow = processTableRow(row, language);
        if (parsedRow.isPresent()) {
            resultBuilder.addMonolingualEntry(parsedRow.get());
        }
    }
}

From source file:org.xlrnet.metadict.engines.woxikon.WoxikonEngine.java

/**
 * Processes the translation table for the given language pair and, if that table exists, the first
 * synonym table of the document.
 *
 * @param queryString    the original query, passed on to the entry and synonym processing
 * @param document       the document to read the tables from
 * @param resultBuilder  the builder that receives the results
 * @param sourceLanguage the source language; its lower-cased identifier forms the first half of the table id
 * @param targetLanguage the target language; its lower-cased identifier forms the second half of the table id
 */
private void processTranslationTable(@NotNull String queryString, @NotNull Document document,
        @NotNull BilingualQueryResultBuilder resultBuilder, @NotNull Language sourceLanguage,
        @NotNull Language targetLanguage) {
    // The main table is addressed by an id of the form "dictionary-<source>-<target>"
    final String sourceId = sourceLanguage.getIdentifier().toLowerCase();
    final String targetId = targetLanguage.getIdentifier().toLowerCase();
    final String languageIdentifier = sourceId + "-" + targetId;

    final Element translationTable = document.getElementById("dictionary-" + languageIdentifier);
    if (translationTable == null) {
        LOGGER.debug("Translation table for {} -> {} with query \"{}\" is null", languageIdentifier,
                targetLanguage.getIdentifier(), queryString);
        return;
    }

    // Only rows without any th cell are treated as entries
    for (Element row : translationTable.getElementsByTag("tr")) {
        if (row.getElementsByTag("th").isEmpty()) {
            processEntry(queryString, row, resultBuilder, sourceLanguage, targetLanguage);
        }
    }

    // Synonyms are taken from the first matching table only
    final Elements synonymTables = document.getElementsByClass("dictionary-synonyms-table");
    if (!synonymTables.isEmpty()) {
        extractBilingualSynonyms(queryString, synonymTables.get(0), resultBuilder, sourceLanguage);
    }
}

From source file:org.xlrnet.metadict.engines.woxikon.WoxikonEngine.java

/**
 * Builds a dictionary object from a single element.
 * <p>
 * The general form is the element text up to its first link, followed by the text of that link, with
 * surrounding whitespace stripped.
 *
 * @param element     the element to read; its first {@code a} child is dereferenced without a null check
 * @param language    the language assigned to the resulting object
 * @param queryString the original query, passed on to the description extraction
 * @return the dictionary object built from the element
 */
@NotNull
private DictionaryObject processSingleNode(@NotNull Element element, @NotNull Language language,
        String queryString) {
    DictionaryObjectBuilder builder = ImmutableDictionaryObject.builder();
    builder.setLanguage(language);

    // Entry text: whatever precedes the first link, plus the link text itself
    final String linkText = element.getElementsByTag("a").first().text();
    final String prefix = StringUtils.substringBefore(element.text(), linkText);
    builder.setGeneralForm(StringUtils.strip(prefix + linkText));

    // Description
    extractDescription(element, queryString, builder);

    // Gender
    extractGender(element, builder);

    return builder.build();
}

From source file:org.xwiki.validator.HTML5DutchWebGuidelinesValidator.java

/**
 * Asserts that every form offers a way to submit it: a submit input, an image input with a non-empty
 * 'alt' attribute, or a button that either has no type attribute or is of type submit.
 * See http://www.w3.org/TR/WCAG10-HTML-TECHS/#forms-graphical-buttons
 */
private void validateRpd1s3AboutForms() {
    for (Element form : getElements("form")) {
        // Inputs: submit, or image with the 'alt' attribute specified
        boolean submitInputFound = false;
        for (Element inputElement : form.getElementsByTag(ELEM_INPUT)) {
            String inputType = inputElement.attr(ATTR_TYPE);
            boolean isSubmitInput = SUBMIT.equals(inputType);
            boolean isImageWithAlt = IMAGE.equals(inputType) && !StringUtils.isEmpty(inputElement.attr(ATTR_ALT));
            if (isSubmitInput || isImageWithAlt) {
                submitInputFound = true;
                break;
            }
        }

        // Buttons: counted when the type attribute is absent or equals submit
        boolean submitButtonFound = false;
        for (Element buttonElement : form.getElementsByTag("button")) {
            boolean typeMissing = !buttonElement.hasAttr(ATTR_TYPE);
            if (typeMissing || SUBMIT.equals(buttonElement.attr(ATTR_TYPE))) {
                submitButtonFound = true;
                break;
            }
        }

        assertTrue(Type.ERROR, "rpd1s3.formSubmit", submitInputFound || submitButtonFound);
    }
}

From source file:org.xwiki.validator.HTML5DutchWebGuidelinesValidator.java

/**
 * Checks that every data cell of a table references headers and every header cell carries an id.
 *
 * @param table Table to analyze
 * @return true if every td below the table has a "headers" attribute and every th has an "id"
 *         attribute; also true when the table contains neither
 */
private boolean hasTableHeadersAndIds(Element table) {
    // Every data cell needs a "headers" attribute
    for (Element dataCell : table.getElementsByTag("td")) {
        if (dataCell.hasAttr("headers")) {
            continue;
        }
        return false;
    }

    // Every header cell needs an "id" attribute
    for (Element headerCell : table.getElementsByTag("th")) {
        if (headerCell.hasAttr("id")) {
            continue;
        }
        return false;
    }

    return true;
}