Example usage for org.jsoup.nodes Element ownText

List of usage examples for org.jsoup.nodes Element ownText

Introduction

On this page you can find example usages of org.jsoup.nodes.Element.ownText().

Prototype

public String ownText() 

Source Link

Document

Gets the text owned by this element only; does not get the combined text of all children.

Usage

From source file:org.arb.extractor.DomTreeWalker.java

/**
 * Extract resource from an element and all its children.
 *  /*w  ww  .  j a va2s  .c  om*/
 * @param element the target element.
 * @param codeUnit used to record all found replacement.
 */
private void extractResourceOnElement(Element element, AbstractCodeUnit codeUnit) {
    String ownText = element.ownText();
    if (!ownText.isEmpty() && !hasResource(element)) {
        DomCodeReplacement replacement = new DomCodeReplacement();
        replacement.setElement(element);
        replacement.setResourceText(ownText);
        replacement.setResourceId(getElementResourceId(element));
        if (replacement.getResourceId() == null) {
            replacement.setNewId(true);
            replacement.setUseArbId(true);
        }
        codeUnit.addReplacement(replacement);
    }
    for (int i = 0; i < element.children().size(); i++) {
        extractResourceOnElement(element.child(i), codeUnit);
    }
}

From source file:org.asqatasun.rules.elementselector.CaptchaElementSelector.java

/**
 * Looks for {@code CAPTCHA_KEY} on a single element. Only the element
 * itself is examined here; its parents are not.
 *
 * @param element the element to inspect
 * @return false when the node name matches {@code HTML_ELEMENT} or
 * {@code BODY_ELEMENT} (ignoring case); otherwise true when the element's
 * own text or the value of one of its attributes contains
 * {@code CAPTCHA_KEY} (ignoring case), and false when none does
 */
private boolean parseAttributeToExtractCaptcha(Element element) {
    final String tagName = element.nodeName();
    if (tagName.equalsIgnoreCase(HTML_ELEMENT) || tagName.equalsIgnoreCase(BODY_ELEMENT)) {
        return false;
    }

    if (StringUtils.containsIgnoreCase(element.ownText(), CAPTCHA_KEY)) {
        return true;
    }

    for (Attribute attribute : element.attributes()) {
        if (StringUtils.containsIgnoreCase(attribute.getValue(), CAPTCHA_KEY)) {
            return true;
        }
    }
    return false;
}

From source file:org.asqatasun.rules.elementselector.CompositeLinkElementSelector.java

/**
 * Tells whether a link is made of a single image-like child and nothing else.
 *
 * @param linkElement the link element to test
 * @return true when the link has exactly one child element, no non-blank
 * text of its own, and that child matches
 * {@code IMAGE_LINK_CHILDREN_CSS_LIKE_QUERY}; false otherwise
 */
protected boolean isImageLink(Element linkElement) {
    // Zero children or more than one child: not an image link.
    if (linkElement.children().size() != 1) {
        return false;
    }
    // Text directly inside the link disqualifies it as well.
    if (StringUtils.isNotBlank(linkElement.ownText())) {
        return false;
    }
    return !linkElement.children().select(IMAGE_LINK_CHILDREN_CSS_LIKE_QUERY).isEmpty();
}

From source file:org.asqatasun.rules.elementselector.CompositeLinkElementSelector.java

/**
 * /*from  www.j a  v  a  2 s . c om*/
 * @param linkElement
 * @return whether the current link element is a svg link
 */
protected boolean isSvgLink(Element linkElement) {
    if (linkElement.children().isEmpty() || linkElement.children().size() > 1
            || StringUtils.isNotBlank(linkElement.ownText())) {
        return false;
    }
    return !linkElement.children().select(HtmlElementStore.SVG_ELEMENT).isEmpty();
}

From source file:org.asqatasun.rules.textbuilder.CompleteTextElementBuilder.java

/**
 * The textual content of an element can be composed with :
 * <ul>/*from  w w  w .ja v  a2  s  .c  o m*/
 *     <li> The text of the element</li>
 *     <li> The text of the alt attribute of the element</li>
 *     <li> The text of the title attribute of the element</li>
 *     <li> The text of the summary attribute of the element</li>
 *     <li> The text of the value attribute of the element</li>
 *     <li> The text of the content attribute of the element when 
 *          the value of the name attribute is "description"
 *      </li>
 * </ul>
 * 
 * @param element
 * @return the textual content of an element
 */
@Override
public String buildTextFromElement(Element element) {
    StringBuilder strb = new StringBuilder();
    if (StringUtils.isNotBlank(element.ownText())) {
        strb.append(SPACER);
        strb.append(element.ownText().trim());
    }

    strb.append(getTextualContentOfAttribute(element, AttributeStore.ALT_ATTR));
    strb.append(getTextualContentOfAttribute(element, AttributeStore.TITLE_ATTR));
    strb.append(getTextualContentOfAttribute(element, AttributeStore.SUMMARY_ATTR));
    strb.append(getTextualContentOfAttribute(element, AttributeStore.VALUE_ATTR));

    if (element.hasAttr(AttributeStore.CONTENT_ATTR) && element.hasAttr(AttributeStore.NAME_ATTR)
            && StringUtils.equalsIgnoreCase(element.attr(AttributeStore.NAME_ATTR), "description")
            && StringUtils.isNotBlank(element.attr(AttributeStore.CONTENT_ATTR))) {
        strb.append(SPACER);
        strb.append(getTextualContentOfAttribute(element, AttributeStore.CONTENT_ATTR));
    }
    return StringUtils.trim(strb.toString());
}

From source file:org.asqatasun.rules.textbuilder.OwnTextElementBuilder.java

/**
 * Builds the text of an element from its own text only.
 *
 * @param element the element to read
 * @return the own text of the element, passed through {@code StringUtils.trim}
 */
@Override
public String buildTextFromElement(Element element) {
    final String ownText = element.ownText();
    return StringUtils.trim(ownText);
}

From source file:org.norvelle.addressdiscoverer.parse.unstructured.ForwardsFlattenedDocumentIterator.java

/**
 * Renders an element as text: one {@code key: value} line per attribute,
 * followed by the own text of the element.
 *
 * @param currElement the element to render
 * @return the attribute lines followed by the own text
 */
private String extractText(Element currElement) {
    final StringBuilder text = new StringBuilder();
    for (Attribute attribute : currElement.attributes().asList()) {
        text.append(attribute.getKey());
        text.append(": ");
        text.append(attribute.getValue());
        text.append("\n");
    }
    text.append(currElement.ownText());
    return text.toString();
}

From source file:org.opens.tanaguru.rules.elementselector.CompositeLinkElementSelector.java

/**
 * Tells whether a link is made of a single image-like child and nothing else.
 *
 * @param linkElement the link element to test
 * @return true when the link has exactly one child element, no non-blank
 * text of its own, and that child matches
 * {@code IMAGE_LINK_CHILDREN_CSS_LIKE_QUERY}; false otherwise
 */
protected boolean isImageLink(Element linkElement) {
    // Anything other than exactly one child rules the link out.
    if (linkElement.children().size() != 1) {
        return false;
    }
    if (StringUtils.isNotBlank(linkElement.ownText())) {
        return false;
    }
    return !linkElement.children().select(IMAGE_LINK_CHILDREN_CSS_LIKE_QUERY).isEmpty();
}

From source file:org.structr.web.common.microformat.MicroformatParser.java

/**
 * Collects own text from the nodes reported by {@code element.traverse(...)}.
 * For every visited node that is an {@code Element} and whose class-name set
 * is empty after {@code removeEmpty} has been applied to it, the own text of
 * that element is collected. When nothing was collected that way, the own
 * text of the given element is collected instead, provided it is not blank.
 *
 * @param element the element whose content is extracted
 * @return null when nothing was collected, the single string when exactly
 * one was collected, otherwise the list of all collected strings
 */
private Object extractChildContent(final Element element) {

    final List<String> collected = new LinkedList<>();

    element.traverse(new NodeVisitor() {

        @Override
        public void head(Node node, int depth) {

            // Only element nodes contribute text.
            if (!(node instanceof Element)) {
                return;
            }

            final Element visited = (Element) node;
            final Set<String> classNames = visited.classNames();

            removeEmpty(classNames);

            if (classNames.isEmpty()) {
                collected.add(visited.ownText());
            }
        }

        @Override
        public void tail(Node node, int depth) {
            // nothing to do when leaving a node
        }
    });

    // Fallback: nothing collected during the traversal.
    if (collected.isEmpty()) {

        final String ownText = element.ownText();
        if (StringUtils.isNotBlank(ownText)) {
            collected.add(ownText);
        }
    }

    switch (collected.size()) {
        case 0:
            return null;
        case 1:
            return collected.get(0);
        default:
            return collected;
    }
}

From source file:org.tinymediamanager.scraper.imdb.ImdbMetadataProvider.java

/**
 * Scrapes the metadata of one title from IMDb, optionally complemented by TMDb.
 * <p>
 * Steps, in order:
 * <ol>
 *     <li>Return the metadata already attached to {@code options.getResult()} when there is one.</li>
 *     <li>Resolve the IMDb id, first from the search result, then from the options; return a
 *         metadata object carrying only the provider id when no valid id is found.</li>
 *     <li>Submit the {@code /combined} page request (always against {@code ImdbSiteDefinition.IMDB_COM}),
 *         the {@code /plotsummary} page request (against {@code imdbSite}) and, when foreign-language or
 *         collection scraping is requested, a TMDb request.</li>
 *     <li>Parse title, year, original title, poster, rating, vote count, top 250 rank, release date,
 *         tagline, genres, runtime, countries, spoken languages, certification, director, cast,
 *         writers, producers and production companies from the {@code /combined} page.</li>
 *     <li>Parse the plot (and, when {@code imdbSite} is not {@code IMDB_COM}, the title) from the
 *         {@code /plotsummary} page.</li>
 *     <li>Overlay the TMDb values when requested and available.</li>
 *     <li>Fall back to the title when no original title was found.</li>
 * </ol>
 *
 * @param options the scrape options (search result, IMDb id, language, country, scrape flags)
 * @return the collected metadata
 * @throws Exception declared by the signature; the {@code Future.get()} calls below are one visible source
 */
@Override
public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception {
    LOGGER.debug("getMetadata() " + options.toString());
    // check if there is a md in the result
    if (options.getResult() != null && options.getResult().getMetadata() != null) {
        LOGGER.debug("IMDB: getMetadata from cache: " + options.getResult());
        return options.getResult().getMetadata();
    }

    MediaMetadata md = new MediaMetadata(providerInfo.getId());
    String imdbId = "";

    // imdbId from searchResult
    if (options.getResult() != null) {
        imdbId = options.getResult().getIMDBId();
    }

    // imdbid from scraper option
    if (!MetadataUtil.isValidImdbId(imdbId)) {
        imdbId = options.getImdbId();
    }

    // no usable id from either source: return the metadata without any scraped values
    if (!MetadataUtil.isValidImdbId(imdbId)) {
        return md;
    }

    LOGGER.debug("IMDB: getMetadata(imdbId): " + imdbId);
    md.setId(MediaMetadata.IMDBID, imdbId);

    ExecutorCompletionService<Document> compSvcImdb = new ExecutorCompletionService<Document>(executor);
    ExecutorCompletionService<MediaMetadata> compSvcTmdb = new ExecutorCompletionService<MediaMetadata>(
            executor);

    // worker for imdb request (/combined) (everytime from akas.imdb.com)
    // StringBuilder sb = new StringBuilder(imdbSite.getSite());
    StringBuilder sb = new StringBuilder(ImdbSiteDefinition.IMDB_COM.getSite());
    sb.append("title/");
    sb.append(imdbId);
    sb.append("/combined");
    Callable<Document> worker = new ImdbWorker(sb.toString(), options.getLanguage().name(),
            options.getCountry().getAlpha2());
    Future<Document> futureCombined = compSvcImdb.submit(worker);

    // worker for imdb request (/plotsummary) (from chosen site)
    Future<Document> futurePlotsummary = null;
    sb = new StringBuilder(imdbSite.getSite());
    sb.append("title/");
    sb.append(imdbId);
    sb.append("/plotsummary");

    worker = new ImdbWorker(sb.toString(), options.getLanguage().name(), options.getCountry().getAlpha2());
    futurePlotsummary = compSvcImdb.submit(worker);

    // worker for tmdb request
    Future<MediaMetadata> futureTmdb = null;
    if (options.isScrapeImdbForeignLanguage() || options.isScrapeCollectionInfo()) {
        Callable<MediaMetadata> worker2 = new TmdbWorker(imdbId, options.getLanguage(), options.getCountry());
        futureTmdb = compSvcTmdb.submit(worker2);
    }

    // wait for the /combined page; everything up to the plot is parsed from it
    Document doc;
    doc = futureCombined.get();

    /*
     * title and year have the following structure
     * 
     * <div id="tn15title"><h1>Merida - Legende der Highlands <span>(<a href="/year/2012/">2012</a>) <span class="pro-link">...</span> <span
     * class="title-extra">Brave <i>(original title)</i></span> </span></h1> </div>
     */

    // parse title and year
    Element title = doc.getElementById("tn15title");
    if (title != null) {
        Element element = null;
        // title
        Elements elements = title.getElementsByTag("h1");
        if (elements.size() > 0) {
            element = elements.first();
            String movieTitle = cleanString(element.ownText());
            md.storeMetadata(MediaMetadata.TITLE, movieTitle);
        }

        // year
        elements = title.getElementsByTag("span");
        if (elements.size() > 0) {
            element = elements.first();
            String content = element.text();

            // search year
            // the pattern matches either "(" followed by four digits (captured in group 1) or "/)";
            // matches of the second alternative have no group 1 and are skipped by the loop
            Pattern yearPattern = Pattern.compile("\\(([0-9]{4})|/\\)");
            Matcher matcher = yearPattern.matcher(content);
            while (matcher.find()) {
                if (matcher.group(1) != null) {
                    String movieYear = matcher.group(1);
                    md.storeMetadata(MediaMetadata.YEAR, movieYear);
                    break;
                }
            }
        }

        // original title
        elements = title.getElementsByAttributeValue("class", "title-extra");
        if (elements.size() > 0) {
            element = elements.first();
            String content = element.text();
            content = content.replaceAll("\\(original title\\)", "").trim();
            md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, content);
        }
    }

    // poster
    Element poster = doc.getElementById("primary-poster");
    if (poster != null) {
        String posterUrl = poster.attr("src");
        // rewrite the SX.../SY... size tokens of the image url to 400
        posterUrl = posterUrl.replaceAll("SX[0-9]{2,4}_", "SX400_");
        posterUrl = posterUrl.replaceAll("SY[0-9]{2,4}_", "SY400_");
        processMediaArt(md, MediaArtworkType.POSTER, "Poster", posterUrl);
    }

    /*
     * <div class="starbar-meta"> <b>7.4/10</b> &nbsp;&nbsp;<a href="ratings" class="tn15more">52,871 votes</a>&nbsp;&raquo; </div>
     */

    // rating and rating count
    Element ratingElement = doc.getElementById("tn15rating");
    if (ratingElement != null) {
        Elements elements = ratingElement.getElementsByClass("starbar-meta");
        if (elements.size() > 0) {
            Element div = elements.get(0);

            // rating comes in <b> tag
            Elements b = div.getElementsByTag("b");
            if (b.size() == 1) {
                String ratingAsString = b.text();
                Pattern ratingPattern = Pattern.compile("([0-9]\\.[0-9])/10");
                Matcher matcher = ratingPattern.matcher(ratingAsString);
                while (matcher.find()) {
                    if (matcher.group(1) != null) {
                        float rating = 0;
                        try {
                            rating = Float.valueOf(matcher.group(1));
                        } catch (Exception e) {
                            // ignored: rating stays 0 when the matched text cannot be parsed
                        }
                        md.storeMetadata(MediaMetadata.RATING, rating);
                        break;
                    }
                }
            }

            // count
            Elements a = div.getElementsByAttributeValue("href", "ratings");
            if (a.size() == 1) {
                // strip thousands separators and the word "votes" before parsing
                String countAsString = a.text().replaceAll("[.,]|votes", "").trim();
                int voteCount = 0;
                try {
                    voteCount = Integer.parseInt(countAsString);
                } catch (Exception e) {
                    // ignored: voteCount stays 0 when the text cannot be parsed
                }
                md.storeMetadata(MediaMetadata.VOTE_COUNT, voteCount);
            }
        }

        // top250
        elements = ratingElement.getElementsByClass("starbar-special");
        if (elements.size() > 0) {
            Elements a = elements.get(0).getElementsByTag("a");
            if (a.size() > 0) {
                Element anchor = a.get(0);
                Pattern topPattern = Pattern.compile("Top 250: #([0-9]{1,3})");
                Matcher matcher = topPattern.matcher(anchor.ownText());
                while (matcher.find()) {
                    if (matcher.group(1) != null) {
                        int top250 = 0;
                        try {
                            top250 = Integer.parseInt(matcher.group(1));
                        } catch (Exception e) {
                            // ignored: top250 stays 0 when the rank cannot be parsed
                        }
                        md.storeMetadata(MediaMetadata.TOP_250, top250);
                    }
                }
            }
        }
    }

    // parse all items coming by <div class="info">
    Elements elements = doc.getElementsByClass("info");
    for (Element element : elements) {
        // only parse divs
        if (!"div".equals(element.tag().getName())) {
            continue;
        }

        // elements with h5 are the titles of the values
        Elements h5 = element.getElementsByTag("h5");
        if (h5.size() > 0) {
            Element firstH5 = h5.first();
            String h5Title = firstH5.text();

            // release date
            /*
             * <div class="info"><h5>Release Date:</h5><div class="info-content">5 January 1996 (USA)<a class="tn15more inline"
             * href="/title/tt0114746/releaseinfo"
             * onclick="(new Image()).src='/rg/title-tease/releasedates/images/b.gif?link=/title/tt0114746/releaseinfo';"> See more</a>&nbsp;</div></div>
             */
            if (h5Title.matches("(?i)" + ImdbSiteDefinition.IMDB_COM.getReleaseDate() + ".*")) {
                Elements div = element.getElementsByClass("info-content");
                if (div.size() > 0) {
                    Element releaseDateElement = div.first();
                    // NOTE(review): replaceAll("", "") is a no-op as written; presumably the pattern
                    // originally held a character that got lost (e.g. a non-breaking space) - confirm
                    String releaseDate = cleanString(releaseDateElement.ownText().replaceAll("", ""));
                    // group 1 is the text in front of the parenthesised part, e.g. the date before "(USA)"
                    Pattern pattern = Pattern.compile("(.*)\\(.*\\)");
                    Matcher matcher = pattern.matcher(releaseDate);
                    if (matcher.find()) {
                        try {
                            // re-format from "d MMM yyyy" to "dd-MM-yyyy"
                            SimpleDateFormat sdf = new SimpleDateFormat("d MMM yyyy");
                            Date parsedDate = sdf.parse(matcher.group(1));
                            sdf = new SimpleDateFormat("dd-MM-yyyy");
                            md.storeMetadata(MediaMetadata.RELEASE_DATE, sdf.format(parsedDate));
                        } catch (Exception e) {
                            // ignored: no release date is stored when the date cannot be parsed
                        }
                    }
                }
            }

            /*
             * <div class="info"><h5>Tagline:</h5><div class="info-content"> (7) To Defend Us... <a class="tn15more inline"
             * href="/title/tt0472033/taglines" onClick= "(new Image()).src='/rg/title-tease/taglines/images/b.gif?link=/title/tt0472033/taglines';" >See
             * more</a>&nbsp;&raquo; </div></div>
             */
            // tagline
            if (h5Title.matches("(?i)" + ImdbSiteDefinition.IMDB_COM.getTagline() + ".*")
                    && !options.isScrapeImdbForeignLanguage()) {
                Elements div = element.getElementsByClass("info-content");
                if (div.size() > 0) {
                    Element taglineElement = div.first();
                    // NOTE(review): replaceAll("", "") is a no-op as written; presumably the pattern
                    // originally held a character that got lost (e.g. a non-breaking space) - confirm
                    String tagline = cleanString(taglineElement.ownText().replaceAll("", ""));
                    md.storeMetadata(MediaMetadata.TAGLINE, tagline);
                }
            }

            /*
             * <div class="info-content"><a href="/Sections/Genres/Animation/">Animation</a> | <a href="/Sections/Genres/Action/">Action</a> | <a
             * href="/Sections/Genres/Adventure/">Adventure</a> | <a href="/Sections/Genres/Fantasy/">Fantasy</a> | <a
             * href="/Sections/Genres/Mystery/">Mystery</a> | <a href="/Sections/Genres/Sci-Fi/">Sci-Fi</a> | <a
             * href="/Sections/Genres/Thriller/">Thriller</a> <a class="tn15more inline" href="/title/tt0472033/keywords" onClick=
             * "(new Image()).src='/rg/title-tease/keywords/images/b.gif?link=/title/tt0472033/keywords';" > See more</a>&nbsp;&raquo; </div>
             */
            // genres are only scraped from akas.imdb.com
            if (h5Title.matches("(?i)" + imdbSite.getGenre() + "(.*)")) {
                Elements div = element.getElementsByClass("info-content");
                if (div.size() > 0) {
                    Elements a = div.first().getElementsByTag("a");
                    for (Element anchor : a) {
                        if (anchor.attr("href").matches("/Sections/Genres/.*")) {
                            md.addGenre(getTmmGenre(anchor.ownText()));
                        }
                    }
                }
            }
            // }

            /*
             * <div class="info"><h5>Runtime:</h5><div class="info-content">162 min | 171 min (special edition) | 178 min (extended cut)</div></div>
             */
            // runtime
            // if (h5Title.matches("(?i)" + imdbSite.getRuntime() + ".*")) {
            if (h5Title.matches("(?i)" + ImdbSiteDefinition.IMDB_COM.getRuntime() + ".*")) {
                Elements div = element.getElementsByClass("info-content");
                if (div.size() > 0) {
                    // despite its name, this variable holds the runtime element
                    Element taglineElement = div.first();
                    // only the first "|"-separated entry is used
                    String first = taglineElement.ownText().split("\\|")[0];
                    String runtimeAsString = cleanString(first.replaceAll("min", ""));
                    int runtime = 0;
                    try {
                        runtime = Integer.parseInt(runtimeAsString);
                    } catch (Exception e) {
                        // try to filter out the first number we find
                        Pattern runtimePattern = Pattern.compile("([0-9]{2,3})");
                        Matcher matcher = runtimePattern.matcher(runtimeAsString);
                        if (matcher.find()) {
                            runtime = Integer.parseInt(matcher.group(0));
                        }
                    }
                    md.storeMetadata(MediaMetadata.RUNTIME, runtime);
                }
            }

            /*
             * <div class="info"><h5>Country:</h5><div class="info-content"><a href="/country/fr">France</a> | <a href="/country/es">Spain</a> | <a
             * href="/country/it">Italy</a> | <a href="/country/hu">Hungary</a></div></div>
             */
            // country
            if (h5Title.matches("(?i)Country.*")) {
                Elements a = element.getElementsByTag("a");
                String countries = "";
                for (Element anchor : a) {
                    Pattern pattern = Pattern.compile("/country/(.*)");
                    Matcher matcher = pattern.matcher(anchor.attr("href"));
                    if (matcher.matches()) {
                        // the code taken from the href is upper-cased and joined with ", "
                        String country = matcher.group(1);
                        if (StringUtils.isNotEmpty(countries)) {
                            countries += ", ";
                        }
                        countries += country.toUpperCase();
                    }
                }
                md.storeMetadata(MediaMetadata.COUNTRY, countries);
            }

            /*
             * <div class="info"><h5>Language:</h5><div class="info-content"><a href="/language/en">English</a> | <a href="/language/de">German</a> | <a
             * href="/language/fr">French</a> | <a href="/language/it">Italian</a></div>
             */
            // Spoken languages
            if (h5Title.matches("(?i)Language.*")) {
                Elements a = element.getElementsByTag("a");
                String spokenLanguages = "";
                for (Element anchor : a) {
                    Pattern pattern = Pattern.compile("/language/(.*)");
                    Matcher matcher = pattern.matcher(anchor.attr("href"));
                    if (matcher.matches()) {
                        // the code taken from the href is joined with ", " as is
                        String langu = matcher.group(1);
                        if (StringUtils.isNotEmpty(spokenLanguages)) {
                            spokenLanguages += ", ";
                        }
                        spokenLanguages += langu;
                    }
                }
                md.storeMetadata(MediaMetadata.SPOKEN_LANGUAGES, spokenLanguages);
            }

            /*
             * <div class="info"><h5>Certification:</h5><div class="info-content"><a href="/search/title?certificates=us:pg">USA:PG</a> <i>(certificate
             * #47489)</i> | <a href="/search/title?certificates=ca:pg">Canada:PG</a> <i>(Ontario)</i> | <a
             * href="/search/title?certificates=au:pg">Australia:PG</a> | <a href="/search/title?certificates=in:u">India:U</a> | <a
             * href="/search/title?certificates=ie:pg">Ireland:PG</a> ...</div></div>
             */
            // certification
            // if (h5Title.matches("(?i)" + imdbSite.getCertification() + ".*")) {
            if (h5Title.matches("(?i)" + ImdbSiteDefinition.IMDB_COM.getCertification() + ".*")) {
                Elements a = element.getElementsByTag("a");
                for (Element anchor : a) {
                    // certification for the right country
                    if (anchor.attr("href").matches(
                            "(?i)/search/title\\?certificates=" + options.getCountry().getAlpha2() + ".*")) {
                        // group 1 is the part of the anchor text after the last ":"
                        Pattern certificationPattern = Pattern.compile(".*:(.*)");
                        Matcher matcher = certificationPattern.matcher(anchor.ownText());
                        Certification certification = null;
                        while (matcher.find()) {
                            if (matcher.group(1) != null) {
                                certification = Certification.getCertification(options.getCountry(),
                                        matcher.group(1));
                            }
                        }

                        // stop at the first anchor that yields a certification
                        if (certification != null) {
                            md.addCertification(certification);
                            break;
                        }
                    }
                }
            }
        }

        /*
         * <div id="director-info" class="info"> <h5>Director:</h5> <div class="info-content"><a href="/name/nm0000416/" onclick=
         * "(new Image()).src='/rg/directorlist/position-1/images/b.gif?link=name/nm0000416/';" >Terry Gilliam</a><br/> </div> </div>
         */
        // director
        if ("director-info".equals(element.id())) {
            Elements a = element.getElementsByTag("a");
            for (Element anchor : a) {
                if (anchor.attr("href").matches("/name/nm.*")) {
                    MediaCastMember cm = new MediaCastMember(CastType.DIRECTOR);
                    cm.setName(anchor.ownText());
                    md.addCastMember(cm);
                }
            }
        }
    }

    /*
     * <table class="cast"> <tr class="odd"><td class="hs"><a href="http://pro.imdb.com/widget/resume_redirect/" onClick=
     * "(new Image()).src='/rg/resume/prosystem/images/b.gif?link=http://pro.imdb.com/widget/resume_redirect/';" ><img src=
     * "http://i.media-imdb.com/images/SF9113d6f5b7cb1533c35313ccd181a6b1/tn15/no_photo.png" width="25" height="31" border="0"></td><td class="nm"><a
     * href="/name/nm0577828/" onclick= "(new Image()).src='/rg/castlist/position-1/images/b.gif?link=/name/nm0577828/';" >Joseph Melito</a></td><td
     * class="ddd"> ... </td><td class="char"><a href="/character/ch0003139/">Young Cole</a></td></tr> <tr class="even"><td class="hs"><a
     * href="/name/nm0000246/" onClick= "(new Image()).src='/rg/title-tease/tinyhead/images/b.gif?link=/name/nm0000246/';" ><img src=
     * "http://ia.media-imdb.com/images/M/MV5BMjA0MjMzMTE5OF5BMl5BanBnXkFtZTcwMzQ2ODE3Mw@@._V1._SY30_SX23_.jpg" width="23" height="32"
     * border="0"></a><br></td><td class="nm"><a href="/name/nm0000246/" onclick=
     * "(new Image()).src='/rg/castlist/position-2/images/b.gif?link=/name/nm0000246/';" >Bruce Willis</a></td><td class="ddd"> ... </td><td
     * class="char"><a href="/character/ch0003139/">James Cole</a></td></tr> <tr class="odd"><td class="hs"><a href="/name/nm0781218/" onClick=
     * "(new Image()).src='/rg/title-tease/tinyhead/images/b.gif?link=/name/nm0781218/';" ><img src=
     * "http://ia.media-imdb.com/images/M/MV5BODI1MTA2MjkxM15BMl5BanBnXkFtZTcwMTcwMDg2Nw@@._V1._SY30_SX23_.jpg" width="23" height="32"
     * border="0"></a><br></td><td class="nm"><a href="/name/nm0781218/" onclick=
     * "(new Image()).src='/rg/castlist/position-3/images/b.gif?link=/name/nm0781218/';" >Jon Seda</a></td><td class="ddd"> ... </td><td
     * class="char"><a href="/character/ch0003143/">Jose</a></td></tr>...</table>
     */
    // cast
    elements = doc.getElementsByClass("cast");
    if (elements.size() > 0) {
        Elements tr = elements.get(0).getElementsByTag("tr");
        for (Element row : tr) {
            Elements td = row.getElementsByTag("td");
            MediaCastMember cm = new MediaCastMember();
            for (Element column : td) {
                // actor thumb
                if (column.hasClass("hs")) {
                    Elements img = column.getElementsByTag("img");
                    if (img.size() > 0) {
                        String thumbUrl = img.get(0).attr("src");
                        if (thumbUrl.contains("no_photo.png")) {
                            cm.setImageUrl("");
                        } else {
                            // set the SX... token to 400 and drop the SY... token
                            thumbUrl = thumbUrl.replaceAll("SX[0-9]{2,4}_", "SX400_");
                            thumbUrl = thumbUrl.replaceAll("SY[0-9]{2,4}_", "");
                            cm.setImageUrl(thumbUrl);
                        }
                    }
                }
                // actor name
                if (column.hasClass("nm")) {
                    cm.setName(cleanString(column.text()));
                }
                // character
                if (column.hasClass("char")) {
                    cm.setCharacter(cleanString(column.text()));
                }
            }
            // only rows providing both a name and a character are added as actors
            if (StringUtils.isNotEmpty(cm.getName()) && StringUtils.isNotEmpty(cm.getCharacter())) {
                cm.setType(CastType.ACTOR);
                md.addCastMember(cm);
            }
        }
    }

    Element content = doc.getElementById("tn15content");
    if (content != null) {
        elements = content.getElementsByTag("table");
        for (Element table : elements) {
            // writers
            if (table.text().contains(ImdbSiteDefinition.IMDB_COM.getWriter())) {
                Elements anchors = table.getElementsByTag("a");
                for (Element anchor : anchors) {
                    if (anchor.attr("href").matches("/name/nm.*")) {
                        MediaCastMember cm = new MediaCastMember(CastType.WRITER);
                        cm.setName(anchor.ownText());
                        md.addCastMember(cm);
                    }
                }
            }

            // producers
            if (table.text().contains(ImdbSiteDefinition.IMDB_COM.getProducers())) {
                Elements rows = table.getElementsByTag("tr");
                for (Element row : rows) {
                    // skip the row that carries the producers heading text
                    if (row.text().contains(ImdbSiteDefinition.IMDB_COM.getProducers())) {
                        continue;
                    }
                    Elements columns = row.children();
                    if (columns.size() == 0) {
                        continue;
                    }
                    MediaCastMember cm = new MediaCastMember(CastType.PRODUCER);
                    // first column: name; third column (when present): part
                    String name = cleanString(columns.get(0).text());
                    if (StringUtils.isBlank(name)) {
                        continue;
                    }
                    cm.setName(name);
                    if (columns.size() >= 3) {
                        cm.setPart(cleanString(columns.get(2).text()));
                    }
                    md.addCastMember(cm);
                }
            }
        }
    }

    // Production companies
    elements = doc.getElementsByClass("blackcatheader");
    for (Element blackcatheader : elements) {
        if (blackcatheader.ownText().equals(ImdbSiteDefinition.IMDB_COM.getProductionCompanies())) {
            // NOTE(review): nextElementSibling() is dereferenced without a null check - confirm the
            // header always has a following sibling element
            Elements a = blackcatheader.nextElementSibling().getElementsByTag("a");
            StringBuilder productionCompanies = new StringBuilder();
            for (Element anchor : a) {
                if (StringUtils.isNotEmpty(productionCompanies)) {
                    productionCompanies.append(", ");
                }
                productionCompanies.append(anchor.ownText());
            }
            md.storeMetadata(MediaMetadata.PRODUCTION_COMPANY, productionCompanies.toString());
            break;
        }
    }

    /*
     * plot from /plotsummary
     */
    // build the url
    doc = null;
    doc = futurePlotsummary.get();

    // imdb.com has another site structure
    if (imdbSite == ImdbSiteDefinition.IMDB_COM) {
        Elements zebraList = doc.getElementsByClass("zebraList");
        if (zebraList != null && !zebraList.isEmpty()) {
            Elements odd = zebraList.get(0).getElementsByClass("odd");
            if (odd.isEmpty()) {
                odd = zebraList.get(0).getElementsByClass("even"); // sometimes imdb has even
            }
            if (odd.size() > 0) {
                Elements p = odd.get(0).getElementsByTag("p");
                if (p.size() > 0) {
                    String plot = cleanString(p.get(0).ownText());
                    md.storeMetadata(MediaMetadata.PLOT, plot);
                }
            }
        }
    } else {
        Element wiki = doc.getElementById("swiki.2.1");
        if (wiki != null) {
            String plot = cleanString(wiki.ownText());
            md.storeMetadata(MediaMetadata.PLOT, plot);
        }
    }

    // title also from chosen site if we are not scraping akas.imdb.com
    if (imdbSite != ImdbSiteDefinition.IMDB_COM) {
        title = doc.getElementById("tn15title");
        if (title != null) {
            Element element = null;
            // title
            elements = title.getElementsByClass("main");
            if (elements.size() > 0) {
                element = elements.first();
                String movieTitle = cleanString(element.ownText());
                md.storeMetadata(MediaMetadata.TITLE, movieTitle);
            }
        }
    }
    // }

    // get data from tmdb?
    if (options.isScrapeImdbForeignLanguage() || options.isScrapeCollectionInfo()) {
        // futureTmdb was submitted above under this same condition
        MediaMetadata tmdbMd = futureTmdb.get();
        // foreign language: the TMDb values replace the IMDb ones, but only when TMDb delivered a plot
        if (options.isScrapeImdbForeignLanguage() && tmdbMd != null
                && StringUtils.isNotBlank(tmdbMd.getStringValue(MediaMetadata.PLOT))) {
            // tmdbid
            md.setId(MediaMetadata.TMDBID, tmdbMd.getId(MediaMetadata.TMDBID));
            // title
            md.storeMetadata(MediaMetadata.TITLE, tmdbMd.getStringValue(MediaMetadata.TITLE));
            // original title
            md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, tmdbMd.getStringValue(MediaMetadata.ORIGINAL_TITLE));
            // tagline
            md.storeMetadata(MediaMetadata.TAGLINE, tmdbMd.getStringValue(MediaMetadata.TAGLINE));
            // plot
            md.storeMetadata(MediaMetadata.PLOT, tmdbMd.getStringValue(MediaMetadata.PLOT));
            // collection info
            md.storeMetadata(MediaMetadata.COLLECTION_NAME,
                    tmdbMd.getStringValue(MediaMetadata.COLLECTION_NAME));
            md.storeMetadata(MediaMetadata.TMDBID_SET, tmdbMd.getIntegerValue(MediaMetadata.TMDBID_SET));
        }
        if (options.isScrapeCollectionInfo() && tmdbMd != null) {
            md.storeMetadata(MediaMetadata.TMDBID_SET, tmdbMd.getIntegerValue(MediaMetadata.TMDBID_SET));
            md.storeMetadata(MediaMetadata.COLLECTION_NAME,
                    tmdbMd.getStringValue(MediaMetadata.COLLECTION_NAME));
        }
    }

    // if we have still no original title, take the title
    if (StringUtils.isBlank(md.getStringValue(MediaMetadata.ORIGINAL_TITLE))) {
        md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, md.getStringValue(MediaMetadata.TITLE));
    }

    return md;
}