NewPipe/app/src/main/java/org/schabi/newpipe/youtube/YoutubeExtractor.java

511 lines
20 KiB
Java

package org.schabi.newpipe.youtube;
import android.util.Log;
import android.util.Xml;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.parser.Parser;
import org.mozilla.javascript.Context;
import org.mozilla.javascript.Function;
import org.mozilla.javascript.ScriptableObject;
import org.schabi.newpipe.Downloader;
import org.schabi.newpipe.Extractor;
import org.schabi.newpipe.MediaFormat;
import org.schabi.newpipe.VideoInfo;
import org.schabi.newpipe.VideoInfoItem;
import org.xmlpull.v1.XmlPullParser;
import java.io.StringReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Created by Christian Schabesberger on 06.08.15.
*
* Copyright (C) Christian Schabesberger 2015 <chris.schabesberger@mailbox.org>
* YoutubeExtractor.java is part of NewPipe.
*
* NewPipe is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* NewPipe is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with NewPipe. If not, see <http://www.gnu.org/licenses/>.
*/
public class YoutubeExtractor implements Extractor {
private static final String TAG = YoutubeExtractor.class.toString();
// These lists only contain itag formats that are supported by the common Android Video player.
// How ever if you are heading for a list showing all itag formats look at
// https://github.com/rg3/youtube-dl/issues/1687
public static int resolveFormat(int itag) {
switch(itag) {
// video
case 17: return MediaFormat.v3GPP.id;
case 18: return MediaFormat.MPEG_4.id;
case 22: return MediaFormat.MPEG_4.id;
case 36: return MediaFormat.v3GPP.id;
case 37: return MediaFormat.MPEG_4.id;
case 38: return MediaFormat.MPEG_4.id;
case 43: return MediaFormat.WEBM.id;
case 44: return MediaFormat.WEBM.id;
case 45: return MediaFormat.WEBM.id;
case 46: return MediaFormat.WEBM.id;
default:
//Log.i(TAG, "Itag " + Integer.toString(itag) + " not known or not supported.");
return -1;
}
}
public static String resolveResolutionString(int itag) {
switch(itag) {
case 17: return "144p";
case 18: return "360p";
case 22: return "720p";
case 36: return "240p";
case 37: return "1080p";
case 38: return "1080p";
case 43: return "360p";
case 44: return "480p";
case 45: return "720p";
case 46: return "1080p";
default:
//Log.i(TAG, "Itag " + Integer.toString(itag) + " not known or not supported.");
return null;
}
}
// static values
private static final String DECRYPTION_FUNC_NAME="decrypt";
// cached values
private static volatile String decryptionCode = "";
public void initService(String site) {
// The Youtube service needs to be initialized by downloading the
// js-Youtube-player. This is done in order to get the algorithm
// for decrypting cryptic signatures inside certain stream urls.
// Star Wars Kid is used as a dummy video, in order to download the youtube player.
//String site = Downloader.download("https://www.youtube.com/watch?v=HPPj6viIBmU");
//-------------------------------------
// extracting form player args
//-------------------------------------
try {
String jsonString = matchGroup1("ytplayer.config\\s*=\\s*(\\{.*?\\});", site);
JSONObject jsonObj = new JSONObject(jsonString);
//----------------------------------
// load an parse description code
//----------------------------------
if (decryptionCode.isEmpty()) {
JSONObject ytAssets = jsonObj.getJSONObject("assets");
String playerUrl = ytAssets.getString("js");
if (playerUrl.startsWith("//")) {
playerUrl = "https:" + playerUrl;
}
decryptionCode = loadDecryptionCode(playerUrl);
}
} catch (Exception e){
Log.d(TAG, "Could not initialize the extractor of the Youtube service.");
e.printStackTrace();
}
}
@Override
public String getVideoId(String videoUrl) {
try {
URI uri = new URI(videoUrl);
if(uri.getHost().contains("youtube")) {
String query = uri.getFragment();
if(query == null) {
query = uri.getQuery();
} else {
query = query.replace("/watch?", "");
}
String queryElements[] = query.split("&");
Map<String, String> queryArguments = new HashMap<>();
for (String e : queryElements) {
String[] s = e.split("=");
queryArguments.put(s[0], s[1]);
}
return queryArguments.get("v");
} else if(uri.getHost().contains("youtu.be")) {
// uri.getRawPath() does somehow not return the last character.
// so we do a workaround instead.
//return uri.getRawPath();
String url[] = videoUrl.split("/");
return url[url.length-1];
} else {
Log.e(TAG, "Error could not parse url: " + videoUrl);
}
} catch(Exception e) {
Log.e(TAG, "Error could not parse url: " + videoUrl);
e.printStackTrace();
return "";
}
return null;
}
@Override
public String getVideoUrl(String videoId) {
return "https://www.youtube.com/watch?v=" + videoId;
}
@Override
public VideoInfo getVideoInfo(String siteUrl) {
String site = Downloader.download(siteUrl);
VideoInfo videoInfo = new VideoInfo();
Document doc = Jsoup.parse(site, siteUrl);
videoInfo.id = matchGroup1("v=([0-9a-zA-Z]*)", siteUrl);
videoInfo.age_limit = 0;
videoInfo.webpage_url = siteUrl;
initService(site);
//-------------------------------------
// extracting form player args
//-------------------------------------
JSONObject playerArgs = null;
{
try {
String jsonString = matchGroup1("ytplayer.config\\s*=\\s*(\\{.*?\\});", site);
JSONObject jsonObj = new JSONObject(jsonString);
playerArgs = jsonObj.getJSONObject("args");
}
catch (Exception e) {
e.printStackTrace();
// If we fail in this part the video is most likely not available.
// Determining why is done later.
videoInfo.videoAvailableStatus = VideoInfo.VIDEO_UNAVAILABLE;
}
}
//-----------------------
// load and extract audio
//-----------------------
try {
String dashManifest = playerArgs.getString("dashmpd");
videoInfo.audioStreams = parseDashManifest(dashManifest, decryptionCode);
} catch (NullPointerException e) {
Log.e(TAG, "Could not find \"dashmpd\" upon the player args (maybe no dash manifest available).");
} catch (Exception e) {
e.printStackTrace();
}
try {
//--------------------------------------------
// extract general information about the video
//--------------------------------------------
videoInfo.uploader = playerArgs.getString("author");
videoInfo.title = playerArgs.getString("title");
//first attempt gating a small image version
//in the html extracting part we try to get a thumbnail with a higher resolution
videoInfo.thumbnail_url = playerArgs.getString("thumbnail_url");
videoInfo.duration = playerArgs.getInt("length_seconds");
videoInfo.average_rating = playerArgs.getString("avg_rating");
//------------------------------------
// extract video stream url
//------------------------------------
String encoded_url_map = playerArgs.getString("url_encoded_fmt_stream_map");
Vector<VideoInfo.VideoStream> videoStreams = new Vector<>();
for(String url_data_str : encoded_url_map.split(",")) {
Map<String, String> tags = new HashMap<>();
for(String raw_tag : Parser.unescapeEntities(url_data_str, true).split("&")) {
String[] split_tag = raw_tag.split("=");
tags.put(split_tag[0], split_tag[1]);
}
int itag = Integer.parseInt(tags.get("itag"));
String streamUrl = terrible_unescape_workaround_fuck(tags.get("url"));
// if video has a signature: decrypt it and add it to the url
if(tags.get("s") != null) {
streamUrl = streamUrl + "&signature=" + decryptSignature(tags.get("s"), decryptionCode);
}
if(resolveFormat(itag) != -1) {
videoStreams.add(new VideoInfo.VideoStream(
streamUrl,
resolveFormat(itag),
resolveResolutionString(itag)));
}
}
videoInfo.videoStreams =
videoStreams.toArray(new VideoInfo.VideoStream[videoStreams.size()]);
} catch (Exception e) {
e.printStackTrace();
}
//---------------------------------------
// extracting information from html page
//---------------------------------------
// Determine what went wrong when the Video is not available
if(videoInfo.videoAvailableStatus == VideoInfo.VIDEO_UNAVAILABLE) {
if(doc.select("h1[id=\"unavailable-message\"]").first().text().contains("GEMA")) {
videoInfo.videoAvailableStatus = VideoInfo.VIDEO_UNAVAILABLE_GEMA;
}
}
// Try to get high resolution thumbnail if it fails use low res from the player instead
try {
videoInfo.thumbnail_url = doc.select("link[itemprop=\"thumbnailUrl\"]").first()
.attr("abs:href");
} catch(Exception e) {
Log.i(TAG, "Could not find high res Thumbnail. Use low res instead");
}
// upload date
videoInfo.upload_date = doc.select("strong[class=\"watch-time-text\"").first()
.text();
// Extracting the date itself from header
videoInfo.upload_date = matchGroup1("([A-Za-z]{3}\\s[\\d]{1,2},\\s[\\d]{4}$)", videoInfo.upload_date);
// description
videoInfo.description = doc.select("p[id=\"eow-description\"]").first()
.html();
try {
// likes
videoInfo.like_count = doc.select("span[class=\"like-button-renderer \"]").first()
.getAllElements().select("button")
.select("span").get(0).text();
// dislikes
videoInfo.dislike_count = doc.select("span[class=\"like-button-renderer \"]").first()
.getAllElements().select("button")
.select("span").get(2).text();
} catch(Exception e) {
// if it fails we know that the video does not offer dislikes.
videoInfo.like_count = "0";
videoInfo.dislike_count = "0";
}
// uploader thumbnail
videoInfo.uploader_thumbnail_url = doc.select("a[class*=\"yt-user-photo\"]").first()
.select("img").first()
.attr("abs:data-thumb");
// view count
videoInfo.view_count = doc.select("div[class=\"watch-view-count\"]").first().text();
// Extracting the number of views from header
videoInfo.view_count = matchGroup1("([0-9,]*$)", videoInfo.view_count);
// next video
videoInfo.nextVideo = extractVideoInfoItem(doc.select("div[class=\"watch-sidebar-section\"]").first()
.select("li").first());
int i = 0;
// related videos
Vector<VideoInfoItem> relatedVideos = new Vector<>();
for(Element li : doc.select("ul[id=\"watch-related\"]").first().children()) {
// first check if we have a playlist. If so leave them out
if(li.select("a[class*=\"content-link\"]").first() != null) {
relatedVideos.add(extractVideoInfoItem(li));
i++;
}
}
videoInfo.relatedVideos = relatedVideos.toArray(new VideoInfoItem[relatedVideos.size()]);
return videoInfo;
}
private VideoInfo.AudioStream[] parseDashManifest(String dashManifest, String decryptoinCode) {
if(!dashManifest.contains("/signature/")) {
String encryptedSig = matchGroup1("/s/([a-fA-F0-9\\.]+)", dashManifest);
String decryptedSig;
decryptedSig = decryptSignature(encryptedSig, decryptoinCode);
dashManifest = dashManifest.replace("/s/" + encryptedSig, "/signature/" + decryptedSig);
}
String dashDoc = Downloader.download(dashManifest);
Vector<VideoInfo.AudioStream> audioStreams = new Vector<>();
try {
XmlPullParser parser = Xml.newPullParser();
parser.setInput(new StringReader(dashDoc));
int eventType = parser.getEventType();
String tagName = "";
String currentMimeType = "";
int currentBandwidth = -1;
int currentSamplingRate = -1;
boolean currentTagIsBaseUrl = false;
while(eventType != XmlPullParser.END_DOCUMENT) {
switch(eventType) {
case XmlPullParser.START_TAG:
tagName = parser.getName();
if(tagName.equals("AdaptationSet")) {
currentMimeType = parser.getAttributeValue(XmlPullParser.NO_NAMESPACE, "mimeType");
} else if(tagName.equals("Representation") && currentMimeType.contains("audio")) {
currentBandwidth = Integer.parseInt(
parser.getAttributeValue(XmlPullParser.NO_NAMESPACE, "bandwidth"));
currentSamplingRate = Integer.parseInt(
parser.getAttributeValue(XmlPullParser.NO_NAMESPACE, "audioSamplingRate"));
} else if(tagName.equals("BaseURL")) {
currentTagIsBaseUrl = true;
}
break;
case XmlPullParser.TEXT:
if(currentTagIsBaseUrl &&
(currentMimeType.contains("audio"))) {
int format = -1;
if(currentMimeType.equals(MediaFormat.WEBMA.mimeType)) {
format = MediaFormat.WEBMA.id;
} else if(currentMimeType.equals(MediaFormat.M4A.mimeType)) {
format = MediaFormat.M4A.id;
}
audioStreams.add(new VideoInfo.AudioStream(parser.getText(),
format, currentBandwidth, currentSamplingRate));
}
case XmlPullParser.END_TAG:
if(tagName.equals("AdaptationSet")) {
currentMimeType = "";
} else if(tagName.equals("BaseURL")) {
currentTagIsBaseUrl = false;
}
break;
default:
}
eventType = parser.next();
}
} catch(Exception e) {
e.printStackTrace();
}
return audioStreams.toArray(new VideoInfo.AudioStream[audioStreams.size()]);
}
private VideoInfoItem extractVideoInfoItem(Element li) {
VideoInfoItem info = new VideoInfoItem();
info.webpage_url = li.select("a[class*=\"content-link\"]").first()
.attr("abs:href");
try {
info.id = matchGroup1("v=([0-9a-zA-Z-]*)", info.webpage_url);
} catch (Exception e) {
e.printStackTrace();
}
info.title = li.select("span[class=\"title\"]").first().text();
info.view_count = li.select("span[class*=\"view-count\"]").first().text();
info.uploader = li.select("span[class=\"g-hovercard\"]").first().text();
info.duration = li.select("span[class=\"video-time\"]").first().text();
Element img = li.select("img").first();
info.thumbnail_url = img.attr("abs:src");
// Sometimes youtube sends links to gif files witch somehow seam to not exist
// anymore. Items with such gif also offer a secondary image source. So we are going
// to use that if we caught such an item.
if(info.thumbnail_url.contains(".gif")) {
info.thumbnail_url = img.attr("data-thumb");
}
if(info.thumbnail_url.startsWith("//")) {
info.thumbnail_url = "https:" + info.thumbnail_url;
}
return info;
}
private String terrible_unescape_workaround_fuck(String shit) {
String[] splitAtEscape = shit.split("%");
String retval = "";
retval += splitAtEscape[0];
for(int i = 1; i < splitAtEscape.length; i++) {
String escNum = splitAtEscape[i].substring(0, 2);
char c = (char) Integer.parseInt(escNum,16);
retval += c;
retval += splitAtEscape[i].substring(2);
}
return retval;
}
private String loadDecryptionCode(String playerUrl) {
String playerCode = Downloader.download(playerUrl);
String decryptionFuncName = "";
String decryptionFunc = "";
String helperObjectName;
String helperObject = "";
String callerFunc = "function " + DECRYPTION_FUNC_NAME + "(a){return %%(a);}";
String decryptionCode;
try {
decryptionFuncName = matchGroup1("\\.sig\\|\\|([a-zA-Z0-9$]+)\\(", playerCode);
String functionPattern = "(function " + decryptionFuncName.replace("$", "\\$") + "\\([a-zA-Z0-9_]*\\)\\{.+?\\})";
decryptionFunc = matchGroup1(functionPattern, playerCode);
helperObjectName = matchGroup1(";([A-Za-z0-9_\\$]{2})\\...\\(", decryptionFunc);
String helperPattern = "(var " + helperObjectName.replace("$", "\\$") + "=\\{.+?\\}\\};)function";
helperObject = matchGroup1(helperPattern, playerCode);
} catch (Exception e) {
e.printStackTrace();
}
callerFunc = callerFunc.replace("%%", decryptionFuncName);
decryptionCode = helperObject + decryptionFunc + callerFunc;
return decryptionCode;
}
private String decryptSignature(String encryptedSig, String decryptoinCode) {
Context context = Context.enter();
context.setOptimizationLevel(-1);
Object result = null;
try {
ScriptableObject scope = context.initStandardObjects();
context.evaluateString(scope, decryptoinCode, "decryptionCode", 1, null);
Function decryptionFunc = (Function) scope.get("decrypt", scope);
result = decryptionFunc.call(context, scope, scope, new Object[]{encryptedSig});
} catch (Exception e) {
e.printStackTrace();
}
Context.exit();
return result.toString();
}
private String matchGroup1(String pattern, String input) {
Pattern pat = Pattern.compile(pattern);
Matcher mat = pat.matcher(input);
boolean foundMatch = mat.find();
if(foundMatch){
return mat.group(1);
}
else {
Log.e(TAG, "failed to find pattern \""+pattern+"\"");
new Exception("failed to find pattern \""+pattern+"\"").printStackTrace();
return "";
}
}
}