org.schabi.newpipe.services.youtube.YoutubeVideoExtractor.java Source code

Java tutorial

Introduction

Here is the source code for org.schabi.newpipe.services.youtube.YoutubeVideoExtractor.java

Source

package org.schabi.newpipe.services.youtube;

import android.util.Log;
import android.util.Xml;

import org.json.JSONException;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.parser.Parser;
import org.mozilla.javascript.Context;
import org.mozilla.javascript.Function;
import org.mozilla.javascript.ScriptableObject;
import org.schabi.newpipe.Downloader;
import org.schabi.newpipe.services.VideoExtractor;
import org.schabi.newpipe.MediaFormat;
import org.schabi.newpipe.VideoInfo;
import org.schabi.newpipe.VideoPreviewInfo;
import org.xmlpull.v1.XmlPullParser;

import java.io.StringReader;
import java.net.URLDecoder;
import java.util.HashMap;
import java.util.Map;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Created by Christian Schabesberger on 06.08.15.
 *
 * Copyright (C) Christian Schabesberger 2015 <chris.schabesberger@mailbox.org>
 * YoutubeVideoExtractor.java is part of NewPipe.
 *
 * NewPipe is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * NewPipe is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with NewPipe.  If not, see <http://www.gnu.org/licenses/>.
 */

public class YoutubeVideoExtractor extends VideoExtractor {

    private static final String TAG = YoutubeVideoExtractor.class.toString();
    private final Document doc;
    private JSONObject jsonObj;
    private JSONObject playerArgs;

    // static values
    private static final String DECRYPTION_FUNC_NAME = "decrypt";

    // cached values
    private static volatile String decryptionCode = "";

    public YoutubeVideoExtractor(String pageUrl) {
        super(pageUrl);//most common videoInfo fields are now set in our superclass, for all services
        String pageContents = Downloader.download(cleanUrl(pageUrl));
        doc = Jsoup.parse(pageContents, pageUrl);

        //attempt to load the youtube js player JSON arguments
        try {
            String jsonString = matchGroup1("ytplayer.config\\s*=\\s*(\\{.*?\\});", pageContents);
            jsonObj = new JSONObject(jsonString);
            playerArgs = jsonObj.getJSONObject("args");

        } catch (Exception e) {//if this fails, the video is most likely not available.
            // Determining why is done later.
            videoInfo.videoAvailableStatus = VideoInfo.VIDEO_UNAVAILABLE;
            Log.d(TAG, "Could not load JSON data for Youtube video \"" + pageUrl
                    + "\". This most likely means the video is unavailable");
        }

        //----------------------------------
        // load and parse description code, if it isn't already initialised
        //----------------------------------
        if (decryptionCode.isEmpty()) {
            try {
                // The Youtube service needs to be initialized by downloading the
                // js-Youtube-player. This is done in order to get the algorithm
                // for decrypting cryptic signatures inside certain stream urls.
                JSONObject ytAssets = jsonObj.getJSONObject("assets");
                String playerUrl = ytAssets.getString("js");

                if (playerUrl.startsWith("//")) {
                    playerUrl = "https:" + playerUrl;
                }
                decryptionCode = loadDecryptionCode(playerUrl);
            } catch (Exception e) {
                Log.d(TAG, "Could not load decryption code for the Youtube service.");
                e.printStackTrace();
            }
        }
    }

    @Override
    public String getTitle() {
        try {//json player args method
            return playerArgs.getString("title");
        } catch (JSONException je) {//html <meta> method
            je.printStackTrace();
            Log.w(TAG, "failed to load title from JSON args; trying to extract it from HTML");
        }
        try { // fall through to fall-back
            return doc.select("meta[name=title]").attr("content");
        } catch (Exception e) {
            Log.e(TAG, "failed permanently to load title.");
            e.printStackTrace();
            return "";
        }
    }

    @Override
    public String getDescription() {
        try {
            return doc.select("p[id=\"eow-description\"]").first().html();
        } catch (Exception e) {//todo: add fallback method
            Log.e(TAG, "failed to load description.");
            e.printStackTrace();
            return "";
        }
    }

    @Override
    public String getUploader() {
        try {//json player args method
            return playerArgs.getString("author");
        } catch (JSONException je) {
            je.printStackTrace();
            Log.w(TAG, "failed to load uploader name from JSON args; trying to extract it from HTML");
        }
        try {//fall through to fallback HTML method
            return doc.select("div.yt-user-info").first().text();
        } catch (Exception e) {
            e.printStackTrace();
            Log.e(TAG, "failed permanently to load uploader name.");
            return "";
        }
    }

    @Override
    public int getLength() {
        try {
            return playerArgs.getInt("length_seconds");
        } catch (JSONException je) {//todo: find fallback method
            Log.e(TAG, "failed to load video duration from JSON args");
            je.printStackTrace();
            return -1;
        }
    }

    @Override
    public int getViews() {
        try {
            String viewCountString = doc.select("meta[itemprop=interactionCount]").attr("content");
            return Integer.parseInt(viewCountString);
        } catch (Exception e) {//todo: find fallback method
            Log.e(TAG, "failed to number of views");
            e.printStackTrace();
            return -1;
        }
    }

    @Override
    public String getUploadDate() {
        try {
            return doc.select("meta[itemprop=datePublished]").attr("content");
        } catch (Exception e) {//todo: add fallback method
            Log.e(TAG, "failed to get upload date.");
            e.printStackTrace();
            return "";
        }
    }

    @Override
    public String getThumbnailUrl() {
        //first attempt getting a small image version
        //in the html extracting part we try to get a thumbnail with a higher resolution
        // Try to get high resolution thumbnail if it fails use low res from the player instead
        try {
            return doc.select("link[itemprop=\"thumbnailUrl\"]").first().attr("abs:href");
        } catch (Exception e) {
            Log.w(TAG, "Could not find high res Thumbnail. Using low res instead");
            //fall through to fallback
        }
        try {
            return playerArgs.getString("thumbnail_url");
        } catch (JSONException je) {
            je.printStackTrace();
            Log.w(TAG, "failed to extract thumbnail URL from JSON args; trying to extract it from HTML");
            return "";
        }
    }

    @Override
    public String getUploaderThumbnailUrl() {
        try {
            return doc.select("a[class*=\"yt-user-photo\"]").first().select("img").first().attr("abs:data-thumb");
        } catch (Exception e) {//todo: add fallback method
            Log.e(TAG, "failed to get uploader thumbnail URL.");
            e.printStackTrace();
            return "";
        }
    }

    @Override
    public VideoInfo.AudioStream[] getAudioStreams() {
        try {
            String dashManifest = playerArgs.getString("dashmpd");
            return parseDashManifest(dashManifest, decryptionCode);

        } catch (NullPointerException e) {
            Log.e(TAG, "Could not find \"dashmpd\" upon the player args (maybe no dash manifest available).");
        } catch (Exception e) {
            e.printStackTrace();
        }
        return new VideoInfo.AudioStream[0];
    }

    @Override
    public VideoInfo.VideoStream[] getVideoStreams() {
        try {
            //------------------------------------
            // extract video stream url
            //------------------------------------
            String encoded_url_map = playerArgs.getString("url_encoded_fmt_stream_map");
            Vector<VideoInfo.VideoStream> videoStreams = new Vector<>();
            for (String url_data_str : encoded_url_map.split(",")) {
                Map<String, String> tags = new HashMap<>();
                for (String raw_tag : Parser.unescapeEntities(url_data_str, true).split("&")) {
                    String[] split_tag = raw_tag.split("=");
                    tags.put(split_tag[0], split_tag[1]);
                }

                int itag = Integer.parseInt(tags.get("itag"));
                String streamUrl = URLDecoder.decode(tags.get("url"), "UTF-8");

                // if video has a signature: decrypt it and add it to the url
                if (tags.get("s") != null) {
                    streamUrl = streamUrl + "&signature=" + decryptSignature(tags.get("s"), decryptionCode);
                }

                if (resolveFormat(itag) != -1) {
                    videoStreams.add(new VideoInfo.VideoStream(streamUrl, resolveFormat(itag),
                            resolveResolutionString(itag)));
                }
            }
            return videoStreams.toArray(new VideoInfo.VideoStream[videoStreams.size()]);

        } catch (Exception e) {
            Log.e(TAG, "Failed to get video stream");
            e.printStackTrace();
            return new VideoInfo.VideoStream[0];
        }
    }

    /**These lists only contain itag formats that are supported by the common Android Video player.
    However if you are looking for a list showing all itag formats, look at
    https://github.com/rg3/youtube-dl/issues/1687 */

    @SuppressWarnings("WeakerAccess")
    public static int resolveFormat(int itag) {
        switch (itag) {
        // video
        case 17:
            return MediaFormat.v3GPP.id;
        case 18:
            return MediaFormat.MPEG_4.id;
        case 22:
            return MediaFormat.MPEG_4.id;
        case 36:
            return MediaFormat.v3GPP.id;
        case 37:
            return MediaFormat.MPEG_4.id;
        case 38:
            return MediaFormat.MPEG_4.id;
        case 43:
            return MediaFormat.WEBM.id;
        case 44:
            return MediaFormat.WEBM.id;
        case 45:
            return MediaFormat.WEBM.id;
        case 46:
            return MediaFormat.WEBM.id;
        default:
            //Log.i(TAG, "Itag " + Integer.toString(itag) + " not known or not supported.");
            return -1;
        }
    }

    @SuppressWarnings("WeakerAccess")
    public static String resolveResolutionString(int itag) {
        switch (itag) {
        case 17:
            return "144p";
        case 18:
            return "360p";
        case 22:
            return "720p";
        case 36:
            return "240p";
        case 37:
            return "1080p";
        case 38:
            return "1080p";
        case 43:
            return "360p";
        case 44:
            return "480p";
        case 45:
            return "720p";
        case 46:
            return "1080p";
        default:
            //Log.i(TAG, "Itag " + Integer.toString(itag) + " not known or not supported.");
            return null;
        }
    }

    @SuppressWarnings("WeakerAccess")
    @Override
    public String getVideoId(String url) {
        String id;
        String pat;

        if (url.contains("youtube")) {
            pat = "youtube\\.com/watch\\?v=([\\-a-zA-Z0-9_]{11})";
        } else if (url.contains("youtu.be")) {
            pat = "youtu\\.be/([a-zA-Z0-9_-]{11})";
        } else {
            Log.e(TAG, "Error could not parse url: " + url);
            return "";
        }
        id = matchGroup1(pat, url);
        if (!id.isEmpty()) {
            Log.i(TAG, "string \"" + url + "\" matches!");
            return id;
        }
        Log.i(TAG, "string \"" + url + "\" does not match.");
        return "";
    }

    @SuppressWarnings("WeakerAccess")
    @Override
    public String getVideoUrl(String videoId) {
        return "https://www.youtube.com/watch?v=" + videoId;
    }

    /**Attempts to parse (and return) the offset to start playing the video from.
     * @return the offset (in seconds), or 0 if no timestamp is found.*/
    @Override
    public int getTimeStamp() {
        String timeStamp = matchGroup1("((#|&)t=\\d{0,3}h?\\d{0,3}m?\\d{1,3}s?)", pageUrl);

        //TODO: test this
        if (!timeStamp.isEmpty()) {
            String secondsString = matchGroup1("(\\d{1,3})s", timeStamp);
            String minutesString = matchGroup1("(\\d{1,3})m", timeStamp);
            String hoursString = matchGroup1("(\\d{1,3})h", timeStamp);

            if (secondsString.isEmpty()//if nothing was got,
                    && minutesString.isEmpty()//treat as unlabelled seconds
                    && hoursString.isEmpty())
                secondsString = matchGroup1("t=(\\d{1,3})", timeStamp);

            int seconds = (secondsString.isEmpty() ? 0 : Integer.parseInt(secondsString));
            int minutes = (minutesString.isEmpty() ? 0 : Integer.parseInt(minutesString));
            int hours = (hoursString.isEmpty() ? 0 : Integer.parseInt(hoursString));

            int ret = seconds + (60 * minutes) + (3600 * hours);//don't trust BODMAS!
            Log.d(TAG, "derived timestamp value:" + ret);
            return ret;
            //the ordering varies internationally
        } //else, return default 0
        return 0;
    }

    @Override
    public VideoInfo getVideoInfo() {
        videoInfo = super.getVideoInfo();
        //todo: replace this with a call to getVideoId, if possible
        videoInfo.id = matchGroup1("v=([0-9a-zA-Z_-]{11})", pageUrl);

        videoInfo.age_limit = 0;

        //average rating
        try {
            videoInfo.average_rating = playerArgs.getString("avg_rating");
        } catch (JSONException e) {
            e.printStackTrace();
        }

        //---------------------------------------
        // extracting information from html page
        //---------------------------------------

        // Determine what went wrong when the Video is not available
        if (videoInfo.videoAvailableStatus == VideoInfo.VIDEO_UNAVAILABLE) {
            if (doc.select("h1[id=\"unavailable-message\"]").first().text().contains("GEMA")) {
                videoInfo.videoAvailableStatus = VideoInfo.VIDEO_UNAVAILABLE_GEMA;
            }
        }

        String likesString = "";
        String dislikesString = "";
        try {
            // likes
            likesString = doc.select("button.like-button-renderer-like-button").first()
                    .select("span.yt-uix-button-content").first().text();
            videoInfo.like_count = Integer.parseInt(likesString.replaceAll("[^\\d]", ""));
            // dislikes
            dislikesString = doc.select("button.like-button-renderer-dislike-button").first()
                    .select("span.yt-uix-button-content").first().text();

            videoInfo.dislike_count = Integer.parseInt(dislikesString.replaceAll("[^\\d]", ""));
        } catch (NumberFormatException nfe) {
            Log.e(TAG, "failed to parse likesString \"" + likesString + "\" and dislikesString \"" + dislikesString
                    + "\" as integers");
        } catch (Exception e) {
            // if it fails we know that the video does not offer dislikes.
            e.printStackTrace();
            videoInfo.like_count = 0;
            videoInfo.dislike_count = 0;
        }

        // next video
        videoInfo.nextVideo = extractVideoPreviewInfo(
                doc.select("div[class=\"watch-sidebar-section\"]").first().select("li").first());

        // related videos
        Vector<VideoPreviewInfo> relatedVideos = new Vector<>();
        for (Element li : doc.select("ul[id=\"watch-related\"]").first().children()) {
            // first check if we have a playlist. If so leave them out
            if (li.select("a[class*=\"content-link\"]").first() != null) {
                relatedVideos.add(extractVideoPreviewInfo(li));
            }
        }
        //todo: replace conversion
        videoInfo.relatedVideos = relatedVideos;
        //videoInfo.relatedVideos = relatedVideos.toArray(new VideoPreviewInfo[relatedVideos.size()]);
        return videoInfo;
    }

    private VideoInfo.AudioStream[] parseDashManifest(String dashManifest, String decryptoinCode) {
        if (!dashManifest.contains("/signature/")) {
            String encryptedSig = matchGroup1("/s/([a-fA-F0-9\\.]+)", dashManifest);
            String decryptedSig;

            decryptedSig = decryptSignature(encryptedSig, decryptoinCode);
            dashManifest = dashManifest.replace("/s/" + encryptedSig, "/signature/" + decryptedSig);
        }
        String dashDoc = Downloader.download(dashManifest);
        Vector<VideoInfo.AudioStream> audioStreams = new Vector<>();
        try {
            XmlPullParser parser = Xml.newPullParser();
            parser.setInput(new StringReader(dashDoc));
            int eventType = parser.getEventType();
            String tagName = "";
            String currentMimeType = "";
            int currentBandwidth = -1;
            int currentSamplingRate = -1;
            boolean currentTagIsBaseUrl = false;
            while (eventType != XmlPullParser.END_DOCUMENT) {
                switch (eventType) {
                case XmlPullParser.START_TAG:
                    tagName = parser.getName();
                    if (tagName.equals("AdaptationSet")) {
                        currentMimeType = parser.getAttributeValue(XmlPullParser.NO_NAMESPACE, "mimeType");
                    } else if (tagName.equals("Representation") && currentMimeType.contains("audio")) {
                        currentBandwidth = Integer
                                .parseInt(parser.getAttributeValue(XmlPullParser.NO_NAMESPACE, "bandwidth"));
                        currentSamplingRate = Integer.parseInt(
                                parser.getAttributeValue(XmlPullParser.NO_NAMESPACE, "audioSamplingRate"));
                    } else if (tagName.equals("BaseURL")) {
                        currentTagIsBaseUrl = true;
                    }

                    break;
                case XmlPullParser.TEXT:
                    if (currentTagIsBaseUrl && (currentMimeType.contains("audio"))) {
                        int format = -1;
                        if (currentMimeType.equals(MediaFormat.WEBMA.mimeType)) {
                            format = MediaFormat.WEBMA.id;
                        } else if (currentMimeType.equals(MediaFormat.M4A.mimeType)) {
                            format = MediaFormat.M4A.id;
                        }
                        audioStreams.add(new VideoInfo.AudioStream(parser.getText(), format, currentBandwidth,
                                currentSamplingRate));
                    }
                case XmlPullParser.END_TAG:
                    if (tagName.equals("AdaptationSet")) {
                        currentMimeType = "";
                    } else if (tagName.equals("BaseURL")) {
                        currentTagIsBaseUrl = false;
                    }
                    break;
                default:
                }
                eventType = parser.next();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return audioStreams.toArray(new VideoInfo.AudioStream[audioStreams.size()]);
    }

    /**Provides information about links to other videos on the video page, such as related videos.
     * This is encapsulated in a VideoPreviewInfo object,
     * which is a subset of the fields in a full VideoInfo.*/
    private VideoPreviewInfo extractVideoPreviewInfo(Element li) {
        VideoPreviewInfo info = new VideoPreviewInfo();
        info.webpage_url = li.select("a.content-link").first().attr("abs:href");
        try {
            info.id = matchGroup1("v=([0-9a-zA-Z-]*)", info.webpage_url);
        } catch (Exception e) {
            e.printStackTrace();
        }

        //todo: check NullPointerException causing
        info.title = li.select("span.title").first().text();
        //this page causes the NullPointerException, after finding it by searching for "tjvg":
        //https://www.youtube.com/watch?v=Uqg0aEhLFAg
        String views = li.select("span.view-count").first().text();
        Log.i(TAG, "title:" + info.title);
        Log.i(TAG, "view count:" + views);
        try {
            info.view_count = Long.parseLong(li.select("span.view-count").first().text().replaceAll("[^\\d]", ""));
        } catch (NullPointerException e) {//related videos sometimes have no view count
            info.view_count = 0;
        }
        info.uploader = li.select("span.g-hovercard").first().text();

        info.duration = li.select("span.video-time").first().text();

        Element img = li.select("img").first();
        info.thumbnail_url = img.attr("abs:src");
        // Sometimes youtube sends links to gif files which somehow seem to not exist
        // anymore. Items with such gif also offer a secondary image source. So we are going
        // to use that if we caught such an item.
        if (info.thumbnail_url.contains(".gif")) {
            info.thumbnail_url = img.attr("data-thumb");
        }
        if (info.thumbnail_url.startsWith("//")) {
            info.thumbnail_url = "https:" + info.thumbnail_url;
        }
        return info;
    }

    private String loadDecryptionCode(String playerUrl) {
        String playerCode = Downloader.download(playerUrl);
        String decryptionFuncName = "";
        String decryptionFunc = "";
        String helperObjectName;
        String helperObject = "";
        String callerFunc = "function " + DECRYPTION_FUNC_NAME + "(a){return %%(a);}";
        String decryptionCode;

        try {
            decryptionFuncName = matchGroup1("\\.sig\\|\\|([a-zA-Z0-9$]+)\\(", playerCode);

            String functionPattern = "(var " + decryptionFuncName.replace("$", "\\$")
                    + "=function\\([a-zA-Z0-9_]*\\)\\{.+?\\})";
            decryptionFunc = matchGroup1(functionPattern, playerCode);
            decryptionFunc += ";";

            helperObjectName = matchGroup1(";([A-Za-z0-9_\\$]{2})\\...\\(", decryptionFunc);

            String helperPattern = "(var " + helperObjectName.replace("$", "\\$") + "=\\{.+?\\}\\};)";
            helperObject = matchGroup1(helperPattern, playerCode);

        } catch (Exception e) {
            e.printStackTrace();
        }

        callerFunc = callerFunc.replace("%%", decryptionFuncName);
        decryptionCode = helperObject + decryptionFunc + callerFunc;

        return decryptionCode;
    }

    private String decryptSignature(String encryptedSig, String decryptionCode) {
        Context context = Context.enter();
        context.setOptimizationLevel(-1);
        Object result = null;
        try {
            ScriptableObject scope = context.initStandardObjects();
            context.evaluateString(scope, decryptionCode, "decryptionCode", 1, null);
            Function decryptionFunc = (Function) scope.get("decrypt", scope);
            result = decryptionFunc.call(context, scope, scope, new Object[] { encryptedSig });
        } catch (Exception e) {
            e.printStackTrace();
        }
        Context.exit();
        if (result != null)
            return result.toString();
        else
            return "";
    }

    private String cleanUrl(String complexUrl) {
        return getVideoUrl(getVideoId(complexUrl));
    }

    private String matchGroup1(String pattern, String input) {
        Pattern pat = Pattern.compile(pattern);
        Matcher mat = pat.matcher(input);
        boolean foundMatch = mat.find();
        if (foundMatch) {
            return mat.group(1);
        } else {
            Log.w(TAG, "failed to find pattern \"" + pattern + "\" inside of \"" + input + "\"");
            new Exception("failed to find pattern \"" + pattern + "\"").printStackTrace();
            return "";
        }

    }
}