Example usage for org.jsoup.nodes.Element.toString()

Introduction

This page collects example usages of the toString() method of org.jsoup.nodes.Element.

Prototype

public String toString()

Source Link

Documentation

Gets this node's outer HTML.

Usage

From source file:org.codeexample.anchorlinks.CVAnchorContentIndexingFilter.java

/**
 * Builds an insertion-ordered map from anchor name to the content associated with that anchor.
 *
 * <p>The search root is the whole document, or the first element matching {@code regexBodyRoot}
 * when that selector is configured. The root's outer HTML is copied into a buffer that is handed
 * to {@code getContentBetweenAnchorInWiki} for each pair of consecutive anchors; whatever is left
 * in the buffer afterwards is reduced to plain text and stored under the last anchor visited, if
 * it is not blank.
 *
 * @param rootDoc the parsed document to scan
 * @return anchor name to content; empty when no root element or no anchors are found
 * @throws IOException declared by the signature; nothing in this method body throws it directly
 */
public Map<String, String> parseAnchors(Document rootDoc) throws IOException {
    final Map<String, String> contentsByAnchor = new LinkedHashMap<String, String>();

    final Element root = (regexBodyRoot == null) ? rootDoc : rootDoc.select(regexBodyRoot).first();
    if (root == null) {
        return contentsByAnchor;
    }

    final Set<String> anchorNames = getAnchors(root);
    if (anchorNames.isEmpty()) {
        return contentsByAnchor;
    }

    // Element.toString() yields the outer HTML of the root element.
    final StringBuilder unconsumedHtml = new StringBuilder(root.toString());
    final Iterator<String> anchorIterator = anchorNames.iterator();
    String currentAnchor = anchorIterator.next();
    // NOTE(review): the length check suggests the helper consumes text from the buffer - confirm.
    while (anchorIterator.hasNext() && unconsumedHtml.length() > 0) {
        final String followingAnchor = anchorIterator.next();
        final String between = getContentBetweenAnchorInWiki(unconsumedHtml, currentAnchor, followingAnchor);
        contentsByAnchor.put(currentAnchor, between);
        currentAnchor = followingAnchor;
    }

    // The remainder of the buffer belongs to the last anchor reached.
    final String trailingText = Jsoup.parse(unconsumedHtml.toString()).text();
    if (StringUtils.isNotBlank(trailingText)) {
        contentsByAnchor.put(currentAnchor, trailingText);
    }
    return contentsByAnchor;
}

From source file:org.keionline.keionline.ArticleView.java

/**
 * Downloads the page at {@code url} and returns the HTML of its first element with class
 * {@code "node"}, prefixed with {@code CSS} and suffixed with {@code "</body>"}.
 *
 * <p>Before serialising, every {@code img} source and every {@code a} target inside that element
 * is rewritten to its absolute form, and an {@code <hr>} is inserted after the first element with
 * class {@code "submitted"} when one exists. The result is also stored in the {@code content}
 * field.
 *
 * @param url address of the article page to fetch
 * @return the assembled HTML snippet
 * @throws IOException if the page cannot be fetched, or if it contains no element with class
 *         {@code "node"}
 */
private String getContent(String url) throws IOException {
    Document doc = Jsoup.connect(url).userAgent("Mozilla").get();

    // first() returns null when nothing matches; fail with a clear message instead of an NPE.
    Element data = doc.getElementsByClass("node").first();
    if (data == null) {
        throw new IOException("No element with class \"node\" found at " + url);
    }

    // Change the links to absolute so that images and anchors keep working outside the page.
    for (Element image : data.select("img")) {
        image.attr("src", image.absUrl("src"));
    }
    for (Element link : data.select("a")) {
        link.attr("href", link.absUrl("href"));
    }

    // The separator is optional: pages without a "submitted" element are rendered without it.
    Element info = data.getElementsByClass("submitted").first();
    if (info != null) {
        info.after("<hr>");
    }

    String cont = CSS + data.toString() + "</body>";
    content = cont;
    return cont;
}

From source file:org.mar9000.space2latex.WikiPage.java

/**
 * Finds every {@code ac:image} element in the storage markup of {@code page}, builds a
 * {@link WikiImage} for it and registers it in {@code page.images} under the image key.
 *
 * <p>Two kinds of image reference are handled:
 * <ul>
 * <li>{@code ri:attachment}: the key is the {@code ri:filename} attribute. Non-thumbnail images
 * are resolved through {@code getAttachmentDownloadURL(queryURL)}; thumbnails get a download URL
 * built directly under {@code /download/attachments/}. When the attachment carries a
 * {@code ri:page} child, the referenced page is looked up by title and space key first.</li>
 * <li>{@code ri:url}: the download URL is the {@code ri:value} attribute and the key is the last
 * segment of its path.</li>
 * </ul>
 *
 * @param page the wiki page whose images are collected; its {@code images} map is modified
 * @throws MalformedURLException if an {@code ri:url} value is not a valid URL
 * @throws RuntimeException if an image has neither reference kind, or if a referenced page
 *         lookup returns no results
 */
public static void downloadWikiPageImages(WikiPage page) throws MalformedURLException {
    // "self" link of the page; used as base for attachment queries and direct download URLs.
    String pageUrl = page.json.getJSONObject(JSON_LINKS_ATTR).getString(JSON_SELF_ATTR);
    Document document = Jsoup.parseBodyFragment(page.storage);
    document.outputSettings().prettyPrint(false);
    Elements images = document.select("ac|image");
    if (images.size() > 0)
        LOGGER.info("  Download images:");
    for (Element element : images) {
        String downloadURL = null;
        String imageKey = null;
        // Attachment?
        Elements refs = element.select("ri|attachment");
        WikiImage image = new WikiImage();
        image.pageId = page.id;
        image.acImage = element.outerHtml();
        // Resolve the download URL and image key depending on the kind of reference.
        if (refs.size() > 0) { // Attachment.
            Element riAttachment = refs.get(0);
            imageKey = riAttachment.attr("ri:filename");
            Elements riPages = riAttachment.select("ri|page");
            // Thumbnails are not found with "child/attachment" URL schema.
            boolean isThumbnail = "true".equals(element.attr("ac:thumbnail"));
            String queryURL = null;
            // NOTE(review): the one-argument URLEncoder.encode is deprecated and uses the platform
            // default encoding - consider an explicit charset.
            if (!isThumbnail) {
                queryURL = pageUrl + "/child/attachment?filename=" + URLEncoder.encode(imageKey);
            } else {
                // For thumbnail we construct directly the downloadURL without queryURL.
                /* Some pages have thumbnail images for better online reading.
                 * Here we download always the attached file to embed readable images into the pdf.
                downloadURL = pageUrl.substring(0, pageUrl.indexOf("/rest/api"))
                      + "/download/thumbnails/" + page.id + "/" + URLEncoder.encode(imageKey);
                */
                downloadURL = pageUrl.substring(0, pageUrl.indexOf("/rest/api")) + "/download/attachments/"
                        + page.id + "/" + URLEncoder.encode(imageKey);
            }
            if (riPages.size() > 0) {
                // The attachment is related with another page.
                // NOTE(review): for thumbnails the downloadURL built above still uses page.id, not
                // the id of the page looked up here - confirm this is intended.
                Element riPage = riPages.get(0);
                String space = riPage.attr("ri:space-key");
                String contentTitle = riPage.attr("ri:content-title").replaceAll(" ", "%20");
                String self = page.json.getJSONObject(JSON_LINKS_ATTR).getString(JSON_SELF_ATTR);
                // Replace the last path segment of the self link with a title/space query.
                String newQueryURL = self.substring(0, self.lastIndexOf('/')) + "?title=" + contentTitle
                        + "&spaceKey=" + space;
                JSONObject jsonNewQuery = ConfluenceRESTUtils.getURLResponse(newQueryURL);
                if (jsonNewQuery.getInt(JSON_SIZE_ATTR) == 0)
                    throw new RuntimeException(
                            "Page \"" + contentTitle + "\" in space " + space + " not found.");
                // Only the first result is used.
                JSONObject jsonNewPage = (JSONObject) jsonNewQuery.getJSONArray(JSON_RESULTS_ATTR).get(0);
                image.pageId = jsonNewPage.getString(JSON_ID_ATTR);
                // Overwrite queryURL.
                String newPageUrl = jsonNewPage.getJSONObject(JSON_LINKS_ATTR).getString(JSON_SELF_ATTR);
                queryURL = newPageUrl + "/child/attachment?filename=" + URLEncoder.encode(imageKey);
            }
            // Thumbnails already have their downloadURL; everything else is resolved via the query.
            if (!isThumbnail)
                downloadURL = getAttachmentDownloadURL(queryURL);
        } else {
            refs = element.select("ri|url");
            if (refs.size() > 0) { // URL.
                downloadURL = refs.get(0).attr("ri:value");
                URL tempURL = new URL(downloadURL);
                String urlPath = tempURL.getPath();
                // The key is whatever follows the last '/' of the URL path.
                imageKey = urlPath.substring(urlPath.lastIndexOf('/') + 1);
            } else {
                throw new RuntimeException("Image format unknown: " + element.toString());
            }
        }
        // Download the image data.
        image.filename = imageKey.replace(' ', '_'); // Space are not handled by LaTeX.
        if (downloadURL != null) {
            LOGGER.info("    about to download image {}/{}", new Object[] { image.pageId, image.filename });
            image.data = IOUtils.getImageFromURL(downloadURL);
        } else {
            LOGGER.info("    NULL download URL for page/image: {}/{}",
                    new Object[] { image.pageId, image.filename });
        }
        // The map key is the original image key, not the space-free filename.
        page.images.put(imageKey, image);
    }
}

From source file:org.opens.rules.doc.utils.exportdomtocsv.ExportDomToCsv.java

/**
 * Before using it please set the FOLDER variable with the path where you
 * want to create your csv file.
 *
 * <p>Reads the document supplied by {@code JsoupFunc}, walks every {@code div.thematique} from
 * the third one on, and for each {@code h3} criterion (skipping the first) appends one line per
 * {@code [id^=test-]} element found in the criterion's next sibling. Each line is the
 * concatenation of theme index, theme, criterion code, criterion label, test code, test label
 * and level.
 *
 * <p>NOTE(review): the fields are concatenated without any delimiter - confirm whether a
 * separator was intended for the CSV output.
 *
 * @param args unused
 * @throws IOException if the output file cannot be written
 */
public static void main(String[] args) throws IOException {
    File ref = FileUtils.getFile(FOLDER);
    JsoupFunc jsf = new JsoupFunc();
    Document doc = jsf.getDocument();
    Elements thematiques = doc.select("div.thematique");
    // Compiled once; it is loop-invariant and was previously recompiled for every test element.
    Pattern digitPattern = Pattern.compile("\\d+\\.\\d+\\.\\d+\\s?\\:?\\s?");
    StringBuilder sb = new StringBuilder();
    // Deliberately declared outside the loops: when a test has no numeric prefix the label of
    // the previously processed test is reused, exactly as before.
    String testLabel = "";
    for (int i = 2; i < thematiques.size(); i++) {
        Element thematique = thematiques.get(i);
        String themeIndex = String.valueOf(i - 1);
        String theme = thematique.child(0).text();
        Elements criteres = thematique.select("h3");
        for (int j = 1; j < criteres.size(); j++) {
            Element critereElement = criteres.get(j);
            // The level is the text between the first '[' and the first ']' of the h3 outer HTML.
            String critereH3String = critereElement.toString();
            String level = critereH3String.substring(critereH3String.indexOf("[") + 1,
                    critereH3String.indexOf("]"));
            Elements tests = critereElement.nextElementSibling().select("[id^=test-]");
            String critere = idCodeFromOffsetFive(critereElement.id(), 10, 8);
            String[] critereArray = critereElement.text().split("] ");
            String critereLabel = critereArray[1];
            for (Element el : tests) {
                Matcher matcher = digitPattern.matcher(el.text());
                if (matcher.find()) {
                    String testLabelReplace = el.html()
                            .replace("index.php", "http://www.accessiweb.org/index.php").replace("\n", "");
                    // NOTE(review): the offset comes from text() but is applied to html() - confirm.
                    testLabel = testLabelReplace.substring(matcher.end(), testLabelReplace.length());
                }
                String testCode = idCodeFromOffsetFive(el.id(), 12, 10);
                sb.append(themeIndex).append(theme).append(critere).append(critereLabel).append(testCode)
                        .append(testLabel).append(level).append("\n");
            }
        }
    }
    FileUtils.writeStringToFile(ref, sb.toString());
}

/**
 * Returns {@code id.substring(5, end)} for the largest {@code end} in {@code [minEnd, maxEnd]}
 * that fits inside {@code id}.
 *
 * <p>Replaces the former ladder of nested {@code StringIndexOutOfBoundsException} handlers with
 * explicit length checks; the results are the same, including the exception thrown when even
 * {@code minEnd} does not fit.
 *
 * @param id the element id to cut
 * @param maxEnd the preferred (exclusive) end index
 * @param minEnd the smallest (exclusive) end index that is tried
 * @return the extracted code
 * @throws StringIndexOutOfBoundsException if {@code id} is shorter than {@code minEnd}
 */
private static String idCodeFromOffsetFive(String id, int maxEnd, int minEnd) {
    for (int end = maxEnd; end > minEnd; end--) {
        if (id.length() >= end) {
            return id.substring(5, end);
        }
    }
    return id.substring(5, minEnd);
}

From source file:org.tinymediamanager.scraper.aebn.AebnMetadataProvider.java

/**
 * Search for movies at aebn.net./*from   www  . ja  v  a2  s .co  m*/
 *
 */
@Override
public List<MediaSearchResult> search(MediaSearchOptions query) throws Exception {
    LOGGER.debug("AEBN: search() {}", query);
    List<MediaSearchResult> resultList = new ArrayList<MediaSearchResult>();
    Elements movies = null;
    String searchString = "";

    // Search for query
    if (StringUtils.isNotEmpty(query.get(MediaSearchOptions.SearchParam.QUERY))) {
        searchString = query.get(MediaSearchOptions.SearchParam.QUERY);
    }

    // Search
    String searchUrl = BASE_DATAURL + "/dispatcher/fts?userQuery="
            + URLEncoder.encode(cleanSearchQuery(searchString), "UTF-8")
            + "&targetSearchMode=basic&isAdvancedSearch=true&isFlushAdvancedSearchCriteria=false" + "&count="
            + SEARCH_COUNT.toString() + "&imageType=Large&sortType=Relevance";
    try {
        LOGGER.info("========= BEGIN AEBN Scraper Search for: {}", searchString);
        Url url = new Url(searchUrl);
        InputStream in = url.getInputStream();
        Document doc = Jsoup.parse(in, "UTF-8", "");
        in.close();

        // only look for movie links like
        // <a id="FTSMovieSearch_link_title_detail_30" ... </a>
        movies = doc.getElementsByAttributeValueMatching("id", "FTSMovieSearch_link_title_detail_\\d+");
        LOGGER.debug("AEBN: found {} search results", movies.size());
    } catch (Exception e) {
        LOGGER.error("AEBN: failed to search for {}: ", searchString, e);
    }

    if (movies == null || movies.isEmpty()) {
        LOGGER.debug("AEBN: no movie found");
        return resultList;
    }

    // there are search results, so fill media data structure
    HashSet<String> foundResultUrls = new HashSet<String>();
    for (Element anchor : movies) {
        try {
            String movieUrl = BASE_DATAURL + StrgUtils.substr(anchor.toString(), "href=\\\"(.*?)\\\"");
            String movieId = StrgUtils.substr(anchor.toString(), "movieId=(\\d+)");
            String movieName = StringEscapeUtils.unescapeHtml4(anchor.text());
            String posterUrl = BASE_IMGURL + "/Stream/Movie/Boxcovers/a" + movieId + "_160w.jpg";
            LOGGER.debug("AEBN: found movie {} (id{})", movieName, movieId);

            // check if it is a valid AEBN id
            if (!isValidAebnId(Integer.parseInt(movieId))) {
                LOGGER.error("AEBN: id({}) is not a valid aebn id", movieId);
            }

            MediaSearchResult sr = new MediaSearchResult(providerInfo.getId());
            sr.setId(movieId);
            sr.setIMDBId("");
            sr.setTitle(movieName);
            sr.setOriginalTitle(movieName);
            // sr.setYear not possible, no data at this point
            sr.setYear(null);
            sr.setMediaType(MediaType.MOVIE);
            sr.setUrl(movieUrl);
            sr.setPosterUrl(posterUrl);

            // compare score based on names
            float score = MetadataUtil.calculateScore(searchString, movieName);
            if (posterUrl.isEmpty() || posterUrl.contains("nopicture")) {
                LOGGER.debug("AEBN: no poster - downgrading score by 0.01");
                score = score - 0.01f;
            }
            sr.setScore(score);

            // check if result has at least a title and id
            if (StringUtils.isBlank(sr.getTitle()) || StringUtils.isBlank(sr.getId())) {
                LOGGER.warn("AEBN: no title nor id, skipping");
                continue;
            }

            // check if the movie has been already added to the search results
            if (foundResultUrls.contains(sr.getUrl())) {
                continue;
            }
            foundResultUrls.add(sr.getUrl());

            // populate extra arguments (deprecated)
            // MetadataUtil.copySearchQueryToSearchResult(query, sr);

            resultList.add(sr);
        } catch (Exception e) {
            LOGGER.warn("AEBN: error parsing search result: {}", e);
        }
    }
    Collections.sort(resultList);
    Collections.reverse(resultList);

    return resultList;
}

From source file:org.tinymediamanager.scraper.aebn.AebnMetadataProvider.java

/**
 * Get movie meta data from aebn.net.
 *
 * <p>Returns the metadata already attached to the search result when there is one. Otherwise the
 * AEBN id is taken from the previous search result or, failing that, from
 * {@code options.getId(AEBNID)}; without a valid id the method returns early. With a valid id
 * the movie detail page is downloaded and title, poster, scene thumbnails, runtime, year,
 * collection, studio, genres, certification, plot/tagline, actors and director are extracted.
 *
 * <p>Any exception raised while downloading or parsing the detail page is logged and the
 * metadata gathered up to that point is returned.
 *
 * @param options the scrape options (previous search result, ids, language, country)
 * @return the scraped metadata, possibly only partially filled
 * @throws Exception declared by the overridden method; e.g. a {@code NumberFormatException} for
 *         a non-numeric id
 */
@Override
public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception {
    LOGGER.debug("AEBN: getMetadata() {}", options);

    // check if there is already meta data present in the result
    if ((options.getResult() != null) && (options.getResult().getMediaMetadata() != null)) {
        LOGGER.debug("AEBN: return metadata from cache");
        return options.getResult().getMediaMetadata();
    }

    MediaMetadata md = new MediaMetadata(providerInfo.getId());
    Elements elements = null;
    Element element = null;
    Integer aebnId = 0;

    // get AebnId from previous search result
    if ((options.getResult() != null) && (options.getResult().getId() != null)) {
        aebnId = Integer.parseInt(options.getResult().getId());
        LOGGER.debug("AEBN: aebnId() from previous search result = {}", aebnId);
        // preset some values from search result (if there is one)
        // Use core.Utils.RemoveSortableName() if you want e.g. "Bourne Legacy, The" -> "The Bourne Legacy".
        md.storeMetadata(MediaMetadata.ORIGINAL_TITLE,
                StrgUtils.removeCommonSortableName(options.getResult().getOriginalTitle()));
        md.storeMetadata(MediaMetadata.TITLE,
                StrgUtils.removeCommonSortableName(options.getResult().getTitle()));
    }

    // or get AebnId from options
    if (!isValidAebnId(aebnId) && (options.getId(AEBNID) != null)) {
        LOGGER.debug("AEBN: aebnId() from options = {}", options.getId(AEBNID));
        aebnId = Integer.parseInt(options.getId(AEBNID));
    }

    if (!isValidAebnId(aebnId)) {
        LOGGER.warn("AEBN: no or incorrect aebnId, aborting");
        return md;
    }

    // ID
    md.setId(providerInfo.getId(), aebnId);
    LOGGER.debug("AEBN: aebnId({})", aebnId);

    // Base download url for data scraping
    String downloadUrl = BASE_DATAURL + "/dispatcher/movieDetail?movieId=" + aebnId;
    String locale = options.getLanguage().name();
    if (!StringUtils.isBlank(locale)) {
        downloadUrl = downloadUrl + "&locale=" + locale;
        LOGGER.debug("AEBN: used locale({})", locale);
    }

    // begin download and scrape
    try {
        LOGGER.debug("AEBN: download movie detail page");
        Url url = new Url(downloadUrl);
        Document document;
        // try-with-resources closes the stream even when parsing fails.
        try (InputStream in = url.getInputStream()) {
            document = Jsoup.parse(in, "UTF-8", "");
        }

        // Title
        // <h1 itemprop="name" class="md-movieTitle" >Titelname</h1>
        LOGGER.debug("AEBN: parse title");
        elements = document.getElementsByAttributeValue("class", "md-movieTitle");
        if (elements.size() > 0) {
            LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size());
            element = elements.first();
            String movieTitle = cleanString(element.text());
            LOGGER.debug("AEBN: title({})", movieTitle);
            md.storeMetadata(MediaMetadata.TITLE, movieTitle);
        }

        // Poster
        // front cover:
        // http://pic.aebn.net/Stream/Movie/Boxcovers/a66568_xlf.jpg
        String posterUrl = BASE_IMGURL + "/Stream/Movie/Boxcovers/a" + aebnId.toString() + "_xlf.jpg";
        md.storeMetadata(MediaMetadata.POSTER_URL, posterUrl);

        // Fanart/Background
        // http://pic.aebn.net/Stream/Movie/Scenes/a113324_s534541.jpg
        // <img class="sceneThumbnail" alt="Scene Thumbnail" title="Scene Thumbnail" onError="..."
        // src="http://pic.aebn.net/Stream/Movie/Scenes/a113324_s534544.jpg" onclick="..." />
        LOGGER.debug("AEBN: parse fanart / scene thumbs");
        elements = document.getElementsByAttributeValue("class", "SceneThumbnail");
        LOGGER.debug("AEBN: {} elements found", elements.size());
        int i = 1;
        for (Element anchor : elements) {
            String backgroundUrl = anchor.attr("src");
            LOGGER.debug("AEBN: backgroundUrl{}({})", i, backgroundUrl);
            md.storeMetadata("backgroundUrl" + Integer.valueOf(i).toString(), backgroundUrl);
            i++;
        }

        // Runtime
        LOGGER.debug("AEBN: parse runtime");
        elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=duration]");
        if (elements.size() > 0) {
            LOGGER.debug("AEBN: " + elements.size() + " elements found (should be one!)");
            element = elements.first();
            String movieRuntime = cleanString(element.attr("content"));
            movieRuntime = StrgUtils.substr(movieRuntime, "PT(\\d+)M");
            LOGGER.debug("AEBN: runtime({})", movieRuntime);
            md.storeMetadata(MediaMetadata.RUNTIME, movieRuntime);
        }

        // Year
        LOGGER.debug("AEBN: parse year");
        elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=datePublished]");
        if (elements.size() > 0) {
            LOGGER.debug("AEBN: " + elements.size() + " elements found (should be one!)");
            element = elements.first();
            String movieYear = cleanString(element.attr("content"));
            movieYear = StrgUtils.substr(movieYear, "(\\d+)-");
            LOGGER.debug("AEBN: year({})", movieYear);
            md.storeMetadata(MediaMetadata.YEAR, movieYear);
        }

        // Series (Collection)
        LOGGER.debug("AEBN: parse collection");
        elements = document.getElementsByAttributeValue("id", "md-details").select("[class=series]");
        if (elements.size() > 0) {
            LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size());
            element = elements.first();
            String movieCollection = cleanString(element.text());

            // Fake a TMDB_SET based on the hash value of the collection name
            int movieCollectionHash = movieCollection.hashCode();

            md.storeMetadata(MediaMetadata.COLLECTION_NAME, movieCollection);
            md.storeMetadata(MediaMetadata.TMDB_SET, movieCollectionHash);
            LOGGER.debug("AEBN: collection({}), hashcode({})", movieCollection, movieCollectionHash);
        }

        // Studio
        LOGGER.debug("AEBN: parse studio");
        elements = document.getElementsByAttributeValue("id", "md-details")
                .select("[itemprop=productionCompany]");
        if (elements.size() > 0) {
            LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size());
            String movieStudio = cleanString(elements.first().text());
            LOGGER.debug("AEBN: studio({})", movieStudio);
            md.storeMetadata(MediaMetadata.PRODUCTION_COMPANY, movieStudio);
        }

        // Genre
        LOGGER.debug("AEBN: parse genre");
        elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=genre]");
        for (Element g : elements) {
            md.addGenre(getTmmGenre(g.text()));
        }
        // add basic genre, since all genres at AEBN could be summarised
        // into this one
        md.addGenre(MediaGenres.EROTIC);

        // Certification
        // no data scrapeable---but obviously it's adult only, so simply
        // generate it
        String movieCertification = null;
        Certification certification = null;
        String country = options.getCountry().getAlpha2();
        LOGGER.debug("AEBN: generate certification for {}", country);
        // @formatter:off
        if (country.equals("DE")) {
            movieCertification = "FSK 18";
        }
        if (country.equals("US")) {
            movieCertification = "NC-17";
        }
        if (country.equals("GB")) {
            movieCertification = "R18";
        }
        if (country.equals("FR")) {
            movieCertification = "18";
        }
        if (country.equals("ES")) {
            movieCertification = "PX";
        }
        if (country.equals("JP")) {
            movieCertification = "R18+";
        }
        if (country.equals("IT")) {
            movieCertification = "V.M.18";
        }
        if (country.equals("NL")) {
            movieCertification = "16";
        }
        // @formatter:on
        certification = Certification.getCertification(options.getCountry(), movieCertification);
        if (certification != null) {
            LOGGER.debug("AEBN: certification({})", certification);
            md.addCertification(certification);
        }

        // Plot and Tagline
        LOGGER.debug("AEBN: parse plot");
        elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=about]");
        if (elements.size() > 0) {
            LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size());
            String moviePlot = cleanString(elements.first().text());
            md.storeMetadata(MediaMetadata.PLOT, moviePlot);
            // no separate tagline available, so extract the first sentence
            // from the movie plot
            String movieTagline = StrgUtils.substr(moviePlot, "^(.*?[.!?:])");
            LOGGER.debug("AEBN: tagline(" + movieTagline + ")");
            md.storeMetadata(MediaMetadata.TAGLINE, movieTagline);
        }

        // Actors
        LOGGER.debug("AEBN: parse actors");
        elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=actor]");
        LOGGER.debug("AEBN: {} actors found", elements.size());
        for (Element anchor : elements) {
            String actorid = StrgUtils.substr(anchor.toString(), "starId=(\\d+)");
            String actorname = cleanString(anchor.select("[itemprop=name]").first().text());
            String actordetailsurl = BASE_DATAURL + anchor.attr("href");
            if (!actorname.isEmpty()) {
                LOGGER.debug("AEBN: add actor id({}), name({}), details({})", actorid, actorname,
                        actordetailsurl);
                MediaCastMember cm = new MediaCastMember();
                cm.setType(MediaCastMember.CastType.ACTOR);
                cm.setName(actorname);
                if (!actorid.isEmpty()) {
                    cm.setId(actorid);
                }

                // Actor detail page; a failure here only costs the extra details, the actor is
                // still added below.
                try {
                    Url starurl = new Url(actordetailsurl);
                    Document stardocument;
                    // try-with-resources closes the stream even when parsing fails.
                    try (InputStream starurlstream = starurl.getInputStream()) {
                        stardocument = Jsoup.parse(starurlstream, "UTF-8", "");
                    }
                    Elements elements2 = stardocument.getElementsByAttributeValue("class", "StarInfo");
                    if (elements2.size() == 0) {
                        LOGGER.debug("AEBN: no additional actor details found");
                    } else {
                        // Actor image
                        String actorimage = elements2.select("[itemprop=image]").first().attr("src");
                        LOGGER.debug("AEBN: actor image({})", actorimage);
                        if (!actorimage.isEmpty()) {
                            cm.setImageUrl(actorimage);
                        }
                        // Actor 'fanart' images
                        // unsure if this is ever shown in tmm
                        elements2 = stardocument.getElementsByAttributeValue("class", "StarDetailGallery")
                                .select("a");
                        LOGGER.debug("AEBN: {} gallery images found", elements2.size());
                        for (Element thumbnail : elements2) {
                            LOGGER.debug("AEBN: add fanart image({})", thumbnail.attr("href"));
                            cm.addFanart(thumbnail.attr("href"));
                        }
                    }
                } catch (Exception e) {
                    LOGGER.error("AEBN: Error downloading {}: {}", actordetailsurl, e);
                }

                md.addCastMember(cm);
            }
        }

        // Director
        LOGGER.debug("AEBN: parse director");
        elements = document.getElementsByAttributeValue("id", "md-details").select("[itemprop=director]");
        if (elements.size() > 0) {
            LOGGER.debug("AEBN: {} elements found (should be one!)", elements.size());
            String directorid = StrgUtils.substr(elements.toString(), "directorID=(\\d+)");
            String directorname = cleanString(elements.select("[itemprop=name]").first().text());
            if (!directorname.isEmpty()) {
                MediaCastMember cm = new MediaCastMember(CastType.DIRECTOR);
                cm.setName(directorname);
                if (!directorid.isEmpty()) {
                    cm.setId(directorid);
                }
                cm.setImageUrl("");
                md.addCastMember(cm);
                LOGGER.debug("AEBN: add director id({}), name({})", directorid, directorname);
            }
        }

        // Original Title
        // if we have no original title, just copy the title
        if (StringUtils.isBlank(md.getStringValue(MediaMetadata.ORIGINAL_TITLE))) {
            md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, md.getStringValue(MediaMetadata.TITLE));
        }
    } catch (Exception e) {
        // There is no search result when the id came from options.getId(AEBNID); dereferencing
        // it here would throw from inside the handler, so fall back to the URL that was fetched.
        String failedUrl = (options.getResult() != null) ? options.getResult().getUrl() : downloadUrl;
        LOGGER.error("AEBN: Error parsing {}: {}", failedUrl, e);
    }

    return md;
}

From source file:org.tinymediamanager.scraper.ofdb.OfdbMetadataProvider.java

/**
 * Extracts cast members of the given type from the table row that follows the row containing
 * the matched heading element, and adds them to {@code md}.
 *
 * <p>The last element of {@code el} that is not an {@code option} tag is used as starting point.
 * From there the method walks up to the enclosing {@code tr} and reads every element with
 * {@code valign="middle"} in the next sibling row. Name, id, image URL and character are pulled
 * out of each element's outer HTML with regular expressions; elements without a name are
 * skipped.
 *
 * @param el candidate elements marking the cast section; may be {@code null} or empty
 * @param type the cast type assigned to every member found
 * @param md the metadata object the cast members are added to
 */
private void parseCast(Elements el, MediaCastMember.CastType type, MediaMetadata md) {
    if (el == null || el.isEmpty()) {
        return;
    }

    Element castEl = null;
    for (Element element : el) {
        if (!element.tagName().equals("option")) { // we get more, just do not take the optionbox
            castEl = element;
        }
    }
    if (castEl == null) {
        LOGGER.debug("meh, no " + type.name() + " found");
        return;
    }

    // walk up to table TR...
    while (castEl != null && !castEl.tagName().equalsIgnoreCase("tr")) {
        castEl = castEl.parent();
    }
    if (castEl == null) {
        // The element is not inside a table row; previously this ended in a NullPointerException.
        LOGGER.debug("no enclosing table row for " + type.name() + " found");
        return;
    }

    // ... and take the next table row ^^
    Element tr = castEl.nextElementSibling();
    if (tr == null) {
        return;
    }

    for (Element a : tr.getElementsByAttributeValue("valign", "middle")) {
        String act = a.toString();
        String aname = StrgUtils.substr(act, "alt=\"(.*?)\"");
        if (aname.isEmpty()) {
            continue;
        }

        MediaCastMember cm = new MediaCastMember();
        cm.setName(aname);
        String id = StrgUtils.substr(act, "id=(.*?)[^\"]\">");
        if (!id.isEmpty()) {
            cm.setId(id);
            // thumb
            // http://www.ofdb.de/thumbnail.php?cover=images%2Fperson%2F7%2F7689.jpg&size=6
            // fullsize ;) http://www.ofdb.de/images/person/7/7689.jpg
            try {
                String imgurl = URLDecoder
                        .decode(StrgUtils.substr(act, "images%2Fperson%2F(.*?)&amp;size"), "UTF-8");
                if (!imgurl.isEmpty()) {
                    imgurl = BASE_URL + "/images/person/" + imgurl;
                }
                cm.setImageUrl(imgurl);
            } catch (Exception e) {
                // Best effort: the member is still added, just without an image URL.
                LOGGER.debug("could not decode image url for " + aname, e);
            }
        }
        String arole = StrgUtils.substr(act, "\\.\\.\\. (.*?)</font>").replaceAll("<[^>]*>", "");
        cm.setCharacter(arole);
        cm.setType(type);
        md.addCastMember(cm);
    }
}

From source file:org.tinymediamanager.scraper.ofdb.OfdbMetadataProvider.java

/**
 * Searches OFDb for movies.
 *
 * <p>The lookup is tried by IMDb id first (if one is supplied); when that yields no movie links and a
 * query string is supplied, a free-text search over all categories is done instead. Every movie link
 * found is converted into a {@link MediaSearchResult}; duplicates (same URL) are dropped.
 *
 * @param options the search options; only {@code MediaType.MOVIE} is supported
 * @return the results sorted by descending score; empty if nothing was found
 * @throws UnsupportedMediaTypeException if the requested media type is not a movie
 * @throws Exception declared by the overridden method
 */
@Override
public List<MediaSearchResult> search(MediaSearchOptions options) throws Exception {
    LOGGER.debug("search() " + options.toString());

    if (options.getMediaType() != MediaType.MOVIE) {
        throw new UnsupportedMediaTypeException(options.getMediaType());
    }

    List<MediaSearchResult> resultList = new ArrayList<>();
    String searchQuery = "";
    String imdb = "";
    Elements filme = null;
    int myear = options.getYear();

    /*
     * Kat = All | Titel | Person | DTitel | OTitel | Regie | Darsteller | Song | Rolle | EAN| IMDb | Google
     * http://www.ofdb.de//view.php?page=suchergebnis &Kat=xxxxxxxxx&SText=yyyyyyyyyyy
     */
    // 1. search with imdbId
    if (StringUtils.isNotEmpty(options.getImdbId())) {
        imdb = options.getImdbId();
        LOGGER.debug("search with imdbId: " + imdb);
        try {
            filme = fetchMovieLinks(BASE_URL + "/view.php?page=suchergebnis&Kat=IMDb&SText=" + imdb);
            LOGGER.debug("found " + filme.size() + " search results");
        } catch (Exception e) {
            LOGGER.error("failed to search for imdb Id " + imdb + ": " + e.getMessage(), e);
        }
    }

    // 2. search for search string (only when the IMDb lookup produced nothing)
    if (StringUtils.isNotEmpty(options.getQuery()) && (filme == null || filme.isEmpty())) {
        searchQuery = options.getQuery();
        try {
            String query = MetadataUtil.removeNonSearchCharacters(searchQuery);
            String searchString = BASE_URL + "/view.php?page=suchergebnis&Kat=All&SText="
                    + URLEncoder.encode(cleanSearch(query), "UTF-8");
            LOGGER.debug("search for everything: " + query);
            filme = fetchMovieLinks(searchString);
            LOGGER.debug("found " + filme.size() + " search results");
        } catch (Exception e) {
            LOGGER.error("failed to search for " + searchQuery + ": " + e.getMessage(), e);
        }
    }

    if (filme == null || filme.isEmpty()) {
        LOGGER.debug("nothing found :(");
        return resultList;
    }

    // <a href="film/22523,Die-Bourne-Identitt"
    // onmouseover="Tip('<img src=&quot;images/film/22/22523.jpg&quot;
    // width=&quot;120&quot; height=&quot;170&quot;>',SHADOW,true)">Bourne
    // Identitt, Die<font size="1"> / Bourne Identity, The</font> (2002)</a>
    HashSet<String> foundResultUrls = new HashSet<>();
    for (Element a : filme) {
        try {
            // serialise the anchor once; every field below is cut out of this markup
            String html = a.toString();

            MediaSearchResult sr = new MediaSearchResult(providerInfo.getId(), MediaType.MOVIE);
            // NOTE(review): when the IMDb lookup found nothing and we fell back to the text search,
            // the results are still tagged with the requested IMDb id - confirm this is intended
            if (StringUtils.isNotEmpty(imdb)) {
                sr.setIMDBId(imdb);
            }
            sr.setId(StrgUtils.substr(html, "film\\/(\\d+),")); // OFDB ID
            sr.setTitle(StringEscapeUtils.unescapeHtml4(StrgUtils
                    .removeCommonSortableName(StrgUtils.substr(html, ".*>(.*?)(\\[.*?\\])?<font"))));
            LOGGER.debug("found movie " + sr.getTitle());
            sr.setOriginalTitle(StringEscapeUtils.unescapeHtml4(
                    StrgUtils.removeCommonSortableName(StrgUtils.substr(html, ".*> / (.*?)</font"))));
            try {
                sr.setYear(Integer.parseInt(StrgUtils.substr(html, "font> \\((.*?)\\)<\\/a")));
            } catch (Exception ignored) {
                // the year is optional; keep the result without it
            }

            // check if it has at least a title and url; the href is tested before BASE_URL is
            // prepended, because the complete URL can never be blank
            String href = StrgUtils.substr(html, "href=\\\"(.*?)\\\"");
            if (StringUtils.isBlank(sr.getTitle()) || StringUtils.isBlank(href)) {
                continue;
            }
            sr.setUrl(BASE_URL + "/" + href);

            // only set a poster when the markup really contains an image path
            String posterPath = StrgUtils.substr(html, "images(.*?)\\&quot");
            if (StringUtils.isNotEmpty(posterPath)) {
                sr.setPosterUrl(BASE_URL + "/images" + posterPath);
            }

            // OFDB may provide a link twice - skip it if it has already been added
            if (!foundResultUrls.add(sr.getUrl())) {
                continue;
            }

            if (StringUtils.isNotEmpty(imdb) && imdb.equals(sr.getIMDBId())) {
                // perfect match (guarded, so two empty ids never count as a match)
                sr.setScore(1);
            } else {
                // compare score based on names
                float score = MetadataUtil.calculateScore(searchQuery, sr.getTitle());

                if (yearDiffers(myear, sr.getYear())) {
                    float diff = (float) Math.abs(myear - sr.getYear()) / 100;
                    LOGGER.debug(
                            "parsed year does not match search result year - downgrading score by " + diff);
                    score -= diff;
                }
                sr.setScore(score);
            }
            resultList.add(sr);
        } catch (Exception e) {
            LOGGER.warn("error parsing movie result: " + e.getMessage(), e);
        }
    }
    Collections.sort(resultList);
    Collections.reverse(resultList);

    return resultList;
}

/**
 * Downloads an OFDb search result page and returns all links that point to a movie.
 *
 * @param searchUrl the complete URL of the search result page
 * @return the anchors whose href matches {@code film/<digits>,}
 * @throws Exception if the page cannot be fetched or parsed
 */
private Elements fetchMovieLinks(String searchUrl) throws Exception {
    // try-with-resources closes the stream even when parsing fails
    try (InputStream in = new Url(searchUrl).getInputStream()) {
        Document doc = Jsoup.parse(in, "UTF-8", "");
        // only look for movie links
        return doc.getElementsByAttributeValueMatching("href", "film\\/\\d+,");
    }
}

From source file:org.tinymediamanager.scraper.zelluloid.ZelluloidMetadataProvider.java

/**
 * Scrapes the metadata of one movie from zelluloid.de.
 *
 * <p>Three pages are read: the main movie page (plot, poster, year, release date, original title,
 * runtime, genres, certification, rating), the details page (cast, crew, production company) and the
 * links page (IMDb id). A failure on the details or links page is logged and that page is skipped;
 * any other failure is logged, the cached main page is removed and the exception is rethrown.
 *
 * @param options the scrape options; must carry the search result to scrape
 * @return the collected metadata
 * @throws Exception if no search result is given or the main page cannot be processed
 */
@Override
public MediaMetadata getMetadata(MediaScrapeOptions options) throws Exception {
    LOGGER.debug("getMetadata() " + options.toString());

    // we can only work further if we got a search result on zelluloid.de
    if (options.getResult() == null) {
        throw new Exception("Scrape with Zelluloid.de without prior search is not supported");
    }

    MediaMetadata md = new MediaMetadata(providerInfo.getId());
    // preset values from searchresult (if we have them)
    // NOTE(review): ORIGINAL_TITLE is stored twice; the second call (raw title) is issued after the
    // first one (sortable name removed) - confirm which of the two is intended
    md.storeMetadata(MediaMetadata.ORIGINAL_TITLE,
            Utils.removeSortableName(options.getResult().getOriginalTitle()));
    md.storeMetadata(MediaMetadata.TITLE, Utils.removeSortableName(options.getResult().getTitle()));
    md.storeMetadata(MediaMetadata.YEAR, options.getResult().getYear());
    md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, options.getResult().getOriginalTitle());

    String id;
    if (StringUtils.isEmpty(options.getResult().getId())) {
        // the former pattern "id=(.*?)" ended in a lazy group and therefore always captured ""
        id = StrgUtils.substr(options.getResult().getUrl(), "id=(\\d+)");
    } else {
        id = options.getResult().getId();
    }

    String detailurl = options.getResult().getUrl();
    if (StringUtils.isEmpty(detailurl)) {
        detailurl = BASE_URL + "/filme/index.php3?id=" + id;
    }

    try {
        LOGGER.debug("get details page");
        Document doc = loadDocument(detailurl);
        // serialised once; several values are cut out of the raw markup below
        String pageHtml = doc.toString();

        // parse plot
        String plot = doc.getElementsByAttributeValue("class", "bigtext").text();
        md.storeMetadata(MediaMetadata.PLOT, plot);
        md.storeMetadata(MediaMetadata.TAGLINE, plot.length() > 150 ? plot.substring(0, 150) : plot);

        // parse poster
        Elements posters = doc.getElementsByAttributeValueStarting("src", "/images/poster");
        if (posters.size() == 1) {
            md.storeMetadata(MediaMetadata.POSTER_URL, BASE_URL + posters.get(0).attr("src"));
        }

        // parse year
        if (StringUtils.isEmpty(md.getStringValue(MediaMetadata.YEAR))) {
            Elements years = doc.getElementsByAttributeValueContaining("href", "az.php3?j=");
            if (years.size() == 1) {
                md.storeMetadata(MediaMetadata.YEAR, years.get(0).text());
            }
        }

        // parse cinema release
        Elements releases = doc.getElementsByAttributeValueContaining("href", "?v=w");
        if (releases.size() > 0) {
            String release = releases.get(0).text();
            try {
                Date d = new SimpleDateFormat("dd.MM.yyyy").parse(release);
                md.storeMetadata(MediaMetadata.RELEASE_DATE, new SimpleDateFormat("yyyy-MM-dd").format(d));
            } catch (Exception e) {
                LOGGER.warn("cannot parse cinema release date: " + release);
            }
        }

        // parse original title
        if (StringUtils.isEmpty(md.getStringValue(MediaMetadata.ORIGINAL_TITLE))) {
            md.storeMetadata(MediaMetadata.ORIGINAL_TITLE,
                    StrgUtils.substr(pageHtml, "Originaltitel: (.*?)\\<"));
        }
        if (StringUtils.isEmpty(md.getStringValue(MediaMetadata.ORIGINAL_TITLE))) {
            md.storeMetadata(MediaMetadata.ORIGINAL_TITLE, md.getStringValue(MediaMetadata.TITLE));
        }

        // parse runtime
        String rt = StrgUtils.substr(pageHtml, "ca.&nbsp;(.*?)&nbsp;min");
        if (!rt.isEmpty()) {
            try {
                md.storeMetadata(MediaMetadata.RUNTIME, Integer.valueOf(rt));
            } catch (Exception e2) {
                LOGGER.warn("cannot convert runtime: " + rt);
            }
        }

        // parse genres
        for (Element g : doc.getElementsByAttributeValueContaining("href", "az.php3?g=")) {
            String href = g.attr("href");
            String gid = href.substring(href.lastIndexOf('=') + 1);
            md.addGenre(getTmmGenre(gid));
        }

        // parse cert
        // FSK: ab 12, $230 Mio. Budget
        String fsk = StrgUtils.substr(pageHtml, "FSK: (.*?)[,<]");
        if (!fsk.isEmpty()) {
            md.addCertification(Certification.findCertification(fsk));
        }

        // parse rating
        Elements ratings = doc.getElementsByAttributeValue("class", "ratingBarTable");
        if (ratings.size() == 2) { // get user rating
            // <div>87%</div>
            String r = ratings.get(1).getElementsByTag("div").text().replace("%", "");
            try {
                md.storeMetadata(MediaMetadata.RATING, Double.valueOf(r) / 10); // only 0-10
            } catch (Exception e2) {
                LOGGER.warn("cannot convert rating: " + r);
            }
        }

        // details page
        String detailsUrl = BASE_URL + "/filme/details.php3?id=" + id;
        Document detailsDoc = null;
        try {
            detailsDoc = loadDocument(detailsUrl);
        } catch (Exception e) {
            LOGGER.error("failed to get details: " + e.getMessage(), e);

            // clear cache
            CachedUrl.removeCachedFileForUrl(detailsUrl);
        }
        if (detailsDoc != null) {
            parseCastAndCrew(detailsDoc, md);
        }

        // get links page
        String linksUrl = BASE_URL + "/filme/links.php3?id=" + id;
        Document linksDoc = null;
        try {
            linksDoc = loadDocument(linksUrl);
        } catch (Exception e) {
            LOGGER.error("failed to get links page: " + e.getMessage(), e);

            // clear cache
            CachedUrl.removeCachedFileForUrl(linksUrl);
        }
        if (linksDoc != null) {
            parseImdbId(linksDoc, md);
        }
    } catch (Exception e) {
        LOGGER.error("Error parsing " + options.getResult().getUrl());

        // clear cache
        CachedUrl.removeCachedFileForUrl(detailurl);

        throw e;
    }

    return md;
}

/**
 * Fetches a page through {@link CachedUrl} and parses it with the page encoding of this provider.
 *
 * @param pageUrl the URL of the page to load
 * @return the parsed document
 * @throws Exception if the page cannot be fetched or parsed
 */
private Document loadDocument(String pageUrl) throws Exception {
    // try-with-resources closes the stream even when parsing fails
    try (InputStream in = new CachedUrl(pageUrl).getInputStream()) {
        return Jsoup.parse(in, PAGE_ENCODING, "");
    }
}

/**
 * Reads actors, crew members and the production company from the "ccdetails" table of the details page.
 *
 * @param doc the parsed details page
 * @param md the metadata to add the findings to
 */
private void parseCastAndCrew(Document doc, MediaMetadata md) {
    Element tab = doc.getElementById("ccdetails");
    if (tab == null) {
        // the table is missing; there is nothing to parse
        return;
    }

    // section the current data rows belong to:
    // 1 = Besetzung, 2 = Crew, 3 = Produktion, 4 = Verleih, 5 = Alternativtitel
    int header = 0;
    String lastRole = "";
    for (Element tr : tab.getElementsByTag("tr")) {
        String rowHtml = tr.toString();
        if (rowHtml.contains("dyngfx")) { // header gfx
            if (rowHtml.contains("Besetzung")) {
                header = 1;
            } else if (rowHtml.contains("Crew")) {
                header = 2;
            } else if (rowHtml.contains("Produktion")) {
                header = 3;
            } else if (rowHtml.contains("Verleih")) {
                header = 4;
            } else if (rowHtml.contains("Alternativtitel")) {
                header = 5;
            }
            continue;
        }

        // no header gfx, so data
        Elements cells = tr.getElementsByTag("td");
        if (header == 1) {
            // actors
            if (cells.size() == 2) {
                Elements link = cells.get(1).getElementsByTag("a");
                MediaCastMember mcm = new MediaCastMember();
                mcm.setCharacter(cells.get(0).text());
                mcm.setName(link.text());
                mcm.setId(StrgUtils.substr(link.attr("href"), "id=(\\d+)"));
                mcm.setType(MediaCastMember.CastType.ACTOR);
                md.addCastMember(mcm);
                // TODO: parse actor detail pages :/
            }
        } else if (header == 2) {
            // crew
            if (cells.size() == 2) {
                Elements link = cells.get(1).getElementsByTag("a");
                String crewrole = cells.get(0).html().trim();
                // a "&nbsp;" cell reuses the role of the row above; the cast type is derived
                // from that effective role so that e.g. a second director is not typed OTHER
                String role = crewrole.equals("&nbsp;") ? lastRole : crewrole;
                lastRole = role;

                MediaCastMember mcm = new MediaCastMember();
                mcm.setName(link.text());
                mcm.setPart(role);
                if (role.equals("Regie")) {
                    mcm.setType(MediaCastMember.CastType.DIRECTOR);
                } else if (role.equals("Drehbuch")) {
                    mcm.setType(MediaCastMember.CastType.WRITER);
                } else {
                    mcm.setType(MediaCastMember.CastType.OTHER);
                }
                mcm.setId(StrgUtils.substr(link.attr("href"), "id=(\\d+)"));
                md.addCastMember(mcm);
            }
        } else if (header == 3) {
            // production; rows without any cell are skipped instead of failing the whole scrape
            if (!cells.isEmpty()) {
                md.storeMetadata(MediaMetadata.PRODUCTION_COMPANY, cells.get(0).text());
            }
        }
    }
}

/**
 * Extracts the IMDb id from the first link of the links page that points to german.imdb.com.
 *
 * @param doc the parsed links page
 * @param md the metadata to store the id in
 */
private void parseImdbId(Document doc, MediaMetadata md) {
    Elements imdbLinks = doc.getElementsByAttributeValueContaining("href", "german.imdb.com");
    if (imdbLinks == null || imdbLinks.isEmpty()) {
        return;
    }

    String href = imdbLinks.get(0).attr("href");
    // seven or more digits, so that longer ids are not cut off
    String imdb = StrgUtils.substr(href, "(tt\\d{7,})");
    if (imdb.isEmpty()) {
        // links in the form ...?0123456
        String digits = StrgUtils.substr(href, "\\?(\\d+)");
        if (digits.isEmpty()) {
            // no id in the link at all; do not store a bare "tt"
            return;
        }
        imdb = "tt" + digits;
    }
    md.setId(MediaMetadata.IMDBID, imdb);
}

From source file:SeedGenerator.MainForm.java

/**
 * Counts, for a fixed set of HTML tags, in how many stored endpoint pages each distinct element
 * (compared by its outer HTML) occurs, and writes every element that occurs in more than one page
 * to the table {@code recommender_endpoints_tf}.
 *
 * <p>The table is created first on a best-effort basis. Any other failure is reported on
 * {@code System.err} and ends the calculation.
 */
private void calculateEndpointTagsTF() {
    // width of the `word` column in the table definition below
    final int maxWordLength = 45;
    // tags whose elements are counted, in the order they are visited
    final String[] tags = { "a", "Label", "span", "title", "meta", "h2", "h1", "h3", "li", "dt", "p",
            "option" };

    try {
        try (PreparedStatement createtablepstmt = con.prepareStatement(
                "CREATE TABLE `recommender_endpoints_tf` (\n" + "  `id` int(11) NOT NULL AUTO_INCREMENT,\n"
                        + "  `word` varchar(45) DEFAULT NULL,\n" + "  `endpointid` int(11) DEFAULT NULL,\n"
                        + "  `queryid` int(11) DEFAULT NULL,\n" + "  `count` int(11) DEFAULT NULL,\n"
                        + "  `idf` int(11) DEFAULT NULL,\n" + "  `idfid` int(11) DEFAULT NULL,\n"
                        + "  `totalNumberofWords` int(11) DEFAULT NULL,\n" + "  PRIMARY KEY (`id`),\n"
                        + "  KEY `word` (`word`),\n" + "  KEY `idf` (`idfid`),\n"
                        + "  KEY `endpointid` (`endpointid`)\n"
                        + ") ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;")) {
            createtablepstmt.execute();
        } catch (Exception ignored) {
            // deliberately best effort (presumably the table exists already);
            // a real database problem surfaces in the statements below
        }

        // number of pages each element occurs in
        Map<String, Integer> wordCount = new HashMap<>();

        try (PreparedStatement pstmtendpoints = con.prepareStatement(
                "SELECT * from crawler.endpoints where sourceCodeHTML is not null and source != 'spendold' and source != 'spendnew' group by domain ORDER BY id asc;");
                ResultSet rs1 = pstmtendpoints.executeQuery()) {
            while (rs1.next()) {
                org.jsoup.nodes.Document doc = Jsoup.parse(rs1.getString("sourceCodeHTML"));
                // elements already counted for this page
                Map<String, Integer> localwordCount = new HashMap<>();

                for (String tag : tags) {
                    for (Element element : doc.getElementsByTag(tag)) {
                        String word = element.toString();
                        // count every element only once per page
                        if (localwordCount.putIfAbsent(word, 1) == null) {
                            wordCount.merge(word, 1, Integer::sum);
                        }
                    }
                }
            }
        }

        // one statement for all rows instead of preparing a new one per insert
        try (PreparedStatement insertpstmt = con
                .prepareStatement("insert into recommender_endpoints_tf (word,count) values(?,?);")) {
            for (Map.Entry<String, Integer> pair : wordCount.entrySet()) {
                int count = pair.getValue();
                if (count <= 1) {
                    continue;
                }
                String word = pair.getKey();
                if (word.length() > maxWordLength) {
                    // cut to the column width so that long elements do not overflow the column
                    word = word.substring(0, maxWordLength);
                }
                insertpstmt.setString(1, word);
                insertpstmt.setInt(2, count);
                insertpstmt.executeUpdate();
            }
        }
    } catch (Exception e) {
        System.err.println(e.getMessage());
    }

}