Merge pull request #1087 from AudricV/yt_js-extractor-improvements-and-fixes

[YouTube] Improve and fix YoutubeJavaScriptExtractor
This commit is contained in:
Stypox 2023-08-06 12:01:00 +02:00 committed by GitHub
commit 3faaf4301c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 110 additions and 79 deletions

View File

@ -10,40 +10,62 @@ import org.schabi.newpipe.extractor.localization.Localization;
import org.schabi.newpipe.extractor.utils.Parser; import org.schabi.newpipe.extractor.utils.Parser;
import javax.annotation.Nonnull; import javax.annotation.Nonnull;
import java.util.regex.Pattern;
/** /**
* YouTube restricts streaming their media in multiple ways by requiring clients to apply a cipher * The extractor of YouTube's base JavaScript player file.
* function on parameters of requests. *
* The cipher function is sent alongside as a JavaScript function.
* <p> * <p>
* This class handles fetching the JavaScript file in order to allow other classes to extract the * YouTube restricts streaming their media in multiple ways by requiring their HTML5 clients to use
* needed functions. * a signature timestamp, and on streaming URLs a signature deobfuscation function for some
* contents and a throttling parameter deobfuscation one for all contents.
* </p>
*
* <p>
* This class handles fetching of this base JavaScript player file in order to allow other classes
* to extract the needed data.
* </p>
*
* <p>
* It will try to get the player URL from YouTube's IFrame resource first, and from a YouTube embed
* watch page as a fallback.
* </p>
*/ */
public final class YoutubeJavaScriptExtractor { public final class YoutubeJavaScriptExtractor {
private static final String HTTPS = "https:"; private static final String HTTPS = "https:";
private static final String BASE_JS_PLAYER_URL_FORMAT =
"https://www.youtube.com/s/player/%s/player_ias.vflset/en_GB/base.js";
private static final Pattern IFRAME_RES_JS_BASE_PLAYER_HASH_PATTERN = Pattern.compile(
"player\\\\/([a-z0-9]{8})\\\\/");
private static final Pattern EMBEDDED_WATCH_PAGE_JS_BASE_PLAYER_URL_PATTERN = Pattern.compile(
"\"jsUrl\":\"(/s/player/[A-Za-z0-9]+/player_ias\\.vflset/[A-Za-z_-]+/base\\.js)\"");
private static String cachedJavaScriptCode; private static String cachedJavaScriptCode;
private YoutubeJavaScriptExtractor() { private YoutubeJavaScriptExtractor() {
} }
/** /**
* Extracts the JavaScript file. The result is cached, so subsequent calls use the result of * Extracts the JavaScript file.
* previous calls.
* *
* @param videoId Does not influence the result, but a valid video id may help in the chance * <p>
* that YouTube tracks it. * The result is cached, so subsequent calls use the result of previous calls.
* @return The whole JavaScript file as a string. * </p>
* @throws ParsingException If the extraction failed. *
* @param videoId a YouTube video ID, which doesn't influence the result, but it may help in
* the chance that YouTube tracks it
* @return the whole JavaScript file as a string
* @throws ParsingException if the extraction failed
*/ */
@Nonnull @Nonnull
public static String extractJavaScriptCode(final String videoId) throws ParsingException { public static String extractJavaScriptCode(@Nonnull final String videoId)
throws ParsingException {
if (cachedJavaScriptCode == null) { if (cachedJavaScriptCode == null) {
String url; String url;
try { try {
url = YoutubeJavaScriptExtractor.extractJavaScriptUrl(); url = YoutubeJavaScriptExtractor.extractJavaScriptUrlWithIframeResource();
} catch (final Exception i) { } catch (final Exception e) {
url = YoutubeJavaScriptExtractor.extractJavaScriptUrl(videoId); url = YoutubeJavaScriptExtractor.extractJavaScriptUrlWithEmbedWatchPage(videoId);
} }
final String playerJsUrl = YoutubeJavaScriptExtractor.cleanJavaScriptUrl(url); final String playerJsUrl = YoutubeJavaScriptExtractor.cleanJavaScriptUrl(url);
cachedJavaScriptCode = YoutubeJavaScriptExtractor.downloadJavaScriptCode(playerJsUrl); cachedJavaScriptCode = YoutubeJavaScriptExtractor.downloadJavaScriptCode(playerJsUrl);
@ -53,75 +75,83 @@ public final class YoutubeJavaScriptExtractor {
} }
/** /**
* Same as {@link YoutubeJavaScriptExtractor#extractJavaScriptCode(String)} but with a constant * Reset the cached JavaScript code.
* value for videoId. *
* Possible because the videoId has no influence on the result.
* <p> * <p>
* In the off chance that YouTube tracks with which video id the request is made, it may make * It will be fetched again the next time {@link #extractJavaScriptCode(String)} is called.
* sense to pass in video ids. * </p>
*/
@Nonnull
public static String extractJavaScriptCode() throws ParsingException {
return extractJavaScriptCode("d4IGg5dqeO8");
}
/**
* Reset the JavaScript code. It will be fetched again the next time
* {@link #extractJavaScriptCode()} or {@link #extractJavaScriptCode(String)} is called.
*/ */
public static void resetJavaScriptCode() { public static void resetJavaScriptCode() {
cachedJavaScriptCode = null; cachedJavaScriptCode = null;
} }
public static String extractJavaScriptUrl() throws ParsingException { @Nonnull
static String extractJavaScriptUrlWithIframeResource() throws ParsingException {
final String iframeUrl;
final String iframeContent;
try { try {
final String iframeUrl = "https://www.youtube.com/iframe_api"; iframeUrl = "https://www.youtube.com/iframe_api";
final String iframeContent = NewPipe.getDownloader() iframeContent = NewPipe.getDownloader()
.get(iframeUrl, Localization.DEFAULT).responseBody(); .get(iframeUrl, Localization.DEFAULT)
final String hashPattern = "player\\\\\\/([a-z0-9]{8})\\\\\\/"; .responseBody();
final String hash = Parser.matchGroup1(hashPattern, iframeContent); } catch (final Exception e) {
throw new ParsingException("Could not fetch IFrame resource", e);
return String.format(
"https://www.youtube.com/s/player/%s/player_ias.vflset/en_US/base.js", hash);
} catch (final Exception ignored) {
} }
throw new ParsingException("Iframe API did not provide YouTube player js url"); try {
final String hash = Parser.matchGroup1(
IFRAME_RES_JS_BASE_PLAYER_HASH_PATTERN, iframeContent);
return String.format(BASE_JS_PLAYER_URL_FORMAT, hash);
} catch (final Parser.RegexException e) {
throw new ParsingException(
"IFrame resource didn't provide JavaScript base player's hash", e);
}
} }
public static String extractJavaScriptUrl(final String videoId) throws ParsingException { @Nonnull
static String extractJavaScriptUrlWithEmbedWatchPage(@Nonnull final String videoId)
throws ParsingException {
final String embedUrl;
final String embedPageContent;
try { try {
final String embedUrl = "https://www.youtube.com/embed/" + videoId; embedUrl = "https://www.youtube.com/embed/" + videoId;
final String embedPageContent = NewPipe.getDownloader() embedPageContent = NewPipe.getDownloader()
.get(embedUrl, Localization.DEFAULT).responseBody(); .get(embedUrl, Localization.DEFAULT)
.responseBody();
try { } catch (final Exception e) {
final String assetsPattern = "\"assets\":.+?\"js\":\\s*(\"[^\"]+\")"; throw new ParsingException("Could not fetch embedded watch page", e);
return Parser.matchGroup1(assetsPattern, embedPageContent)
.replace("\\", "").replace("\"", "");
} catch (final Parser.RegexException ex) {
// playerJsUrl is still available in the file, just somewhere else TODO
// it is ok not to find it, see how that's handled in getDeobfuscationCode()
final Document doc = Jsoup.parse(embedPageContent);
final Elements elems = doc.select("script").attr("name", "player_ias/base");
for (final Element elem : elems) {
if (elem.attr("src").contains("base.js")) {
return elem.attr("src");
}
}
}
} catch (final Exception ignored) {
} }
throw new ParsingException("Embedded info did not provide YouTube player js url"); // Parse HTML response with jsoup and look at script elements first
final Document doc = Jsoup.parse(embedPageContent);
final Elements elems = doc.select("script")
.attr("name", "player/base");
for (final Element elem : elems) {
// Script URLs should be relative and not absolute
final String playerUrl = elem.attr("src");
if (playerUrl.contains("base.js")) {
return playerUrl;
}
}
// Use regexes to match the URL in a JavaScript embedded script of the HTML page
try {
return Parser.matchGroup1(
EMBEDDED_WATCH_PAGE_JS_BASE_PLAYER_URL_PATTERN, embedPageContent);
} catch (final Parser.RegexException e) {
throw new ParsingException(
"Embedded watch page didn't provide JavaScript base player's URL", e);
}
} }
@Nonnull @Nonnull
private static String cleanJavaScriptUrl(@Nonnull final String playerJsUrl) { private static String cleanJavaScriptUrl(@Nonnull final String playerJsUrl) {
if (playerJsUrl.startsWith("//")) { if (playerJsUrl.startsWith("//")) {
// https part has to be added manually if the URL is protocol-relative
return HTTPS + playerJsUrl; return HTTPS + playerJsUrl;
} else if (playerJsUrl.startsWith("/")) { } else if (playerJsUrl.startsWith("/")) {
// sometimes https://www.youtube.com part has to be added manually // https://www.youtube.com part has to be added manually if the URL is relative to
// YouTube's domain
return HTTPS + "//www.youtube.com" + playerJsUrl; return HTTPS + "//www.youtube.com" + playerJsUrl;
} else { } else {
return playerJsUrl; return playerJsUrl;
@ -129,12 +159,15 @@ public final class YoutubeJavaScriptExtractor {
} }
@Nonnull @Nonnull
private static String downloadJavaScriptCode(final String playerJsUrl) private static String downloadJavaScriptCode(@Nonnull final String playerJsUrl)
throws ParsingException { throws ParsingException {
try { try {
return NewPipe.getDownloader().get(playerJsUrl, Localization.DEFAULT).responseBody(); return NewPipe.getDownloader()
.get(playerJsUrl, Localization.DEFAULT)
.responseBody();
} catch (final Exception e) { } catch (final Exception e) {
throw new ParsingException("Could not get player js code from url: " + playerJsUrl); throw new ParsingException(
"Could not get JavaScript base player's code from URL: " + playerJsUrl, e);
} }
} }
} }

View File

@ -814,9 +814,9 @@ public class YoutubeStreamExtractor extends StreamExtractor {
@Override @Override
public void onFetchPage(@Nonnull final Downloader downloader) public void onFetchPage(@Nonnull final Downloader downloader)
throws IOException, ExtractionException { throws IOException, ExtractionException {
initStsFromPlayerJsIfNeeded();
final String videoId = getId(); final String videoId = getId();
initStsFromPlayerJsIfNeeded(videoId);
final Localization localization = getExtractorLocalization(); final Localization localization = getExtractorLocalization();
final ContentCountry contentCountry = getExtractorContentCountry(); final ContentCountry contentCountry = getExtractorContentCountry();
html5Cpn = generateContentPlaybackNonce(); html5Cpn = generateContentPlaybackNonce();
@ -1052,8 +1052,6 @@ public class YoutubeStreamExtractor extends StreamExtractor {
@Nonnull final Localization localization, @Nonnull final Localization localization,
@Nonnull final String videoId) @Nonnull final String videoId)
throws IOException, ExtractionException { throws IOException, ExtractionException {
initStsFromPlayerJsIfNeeded();
// Because a cpn is unique to each request, we need to generate it again // Because a cpn is unique to each request, we need to generate it again
html5Cpn = generateContentPlaybackNonce(); html5Cpn = generateContentPlaybackNonce();
@ -1110,9 +1108,9 @@ public class YoutubeStreamExtractor extends StreamExtractor {
.getString("videoId")); .getString("videoId"));
} }
private static void storePlayerJs() throws ParsingException { private static void storePlayerJs(@Nonnull final String videoId) throws ParsingException {
try { try {
playerCode = YoutubeJavaScriptExtractor.extractJavaScriptCode(); playerCode = YoutubeJavaScriptExtractor.extractJavaScriptCode(videoId);
} catch (final Exception e) { } catch (final Exception e) {
throw new ParsingException("Could not store JavaScript player", e); throw new ParsingException("Could not store JavaScript player", e);
} }
@ -1177,12 +1175,13 @@ public class YoutubeStreamExtractor extends StreamExtractor {
return cachedDeobfuscationCode; return cachedDeobfuscationCode;
} }
private static void initStsFromPlayerJsIfNeeded() throws ParsingException { private static void initStsFromPlayerJsIfNeeded(@Nonnull final String videoId)
throws ParsingException {
if (!isNullOrEmpty(sts)) { if (!isNullOrEmpty(sts)) {
return; return;
} }
if (playerCode == null) { if (playerCode == null) {
storePlayerJs(); storePlayerJs(videoId);
if (playerCode == null) { if (playerCode == null) {
throw new ParsingException("playerCode is null"); throw new ParsingException("playerCode is null");
} }

View File

@ -20,21 +20,20 @@ public class YoutubeJavaScriptExtractorTest {
@Test @Test
public void testExtractJavaScriptUrlIframe() throws ParsingException { public void testExtractJavaScriptUrlIframe() throws ParsingException {
assertTrue(YoutubeJavaScriptExtractor.extractJavaScriptUrl().endsWith("base.js")); assertTrue(YoutubeJavaScriptExtractor.extractJavaScriptUrlWithIframeResource()
.endsWith("base.js"));
} }
@Test @Test
public void testExtractJavaScriptUrlEmbed() throws ParsingException { public void testExtractJavaScriptUrlEmbed() throws ParsingException {
assertTrue(YoutubeJavaScriptExtractor.extractJavaScriptUrl("d4IGg5dqeO8").endsWith("base.js")); assertTrue(YoutubeJavaScriptExtractor.extractJavaScriptUrlWithEmbedWatchPage("d4IGg5dqeO8")
.endsWith("base.js"));
} }
@Test @Test
public void testExtractJavaScript__success() throws ParsingException { public void testExtractJavaScript__success() throws ParsingException {
String playerJsCode = YoutubeJavaScriptExtractor.extractJavaScriptCode("d4IGg5dqeO8"); String playerJsCode = YoutubeJavaScriptExtractor.extractJavaScriptCode("d4IGg5dqeO8");
assertPlayerJsCode(playerJsCode); assertPlayerJsCode(playerJsCode);
playerJsCode = YoutubeJavaScriptExtractor.extractJavaScriptCode();
assertPlayerJsCode(playerJsCode);
} }
@Test @Test