Merge pull request #1032 from AudricV/yt_fix-comments-hashtags-links-extraction

[YouTube] Fix hashtags links extraction and escape HTML links
This commit is contained in:
Stypox 2023-03-01 10:47:37 +01:00 committed by GitHub
commit 19e4b216c9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 330 additions and 777 deletions

View File

@ -822,7 +822,7 @@ public final class YoutubeParsingHelper {
try { try {
final String url = "https://music.youtube.com/sw.js"; final String url = "https://music.youtube.com/sw.js";
final var headers = getOriginReferrerHeaders("https://music.youtube.com"); final var headers = getOriginReferrerHeaders(YOUTUBE_MUSIC_URL);
final String response = getDownloader().get(url, headers).responseBody(); final String response = getDownloader().get(url, headers).responseBody();
musicClientVersion = getStringResultFromRegexArray(response, musicClientVersion = getStringResultFromRegexArray(response,
INNERTUBE_CONTEXT_CLIENT_VERSION_REGEXES, 1); INNERTUBE_CONTEXT_CLIENT_VERSION_REGEXES, 1);
@ -843,18 +843,11 @@ public final class YoutubeParsingHelper {
} }
@Nullable @Nullable
public static String getUrlFromNavigationEndpoint(@Nonnull final JsonObject navigationEndpoint) public static String getUrlFromNavigationEndpoint(
throws ParsingException { @Nonnull final JsonObject navigationEndpoint) {
if (navigationEndpoint.has("webCommandMetadata")) {
// this case needs to be handled before the browseEndpoint,
// e.g. for hashtags in comments
final JsonObject metadata = navigationEndpoint.getObject("webCommandMetadata");
if (metadata.has("url")) {
return "https://www.youtube.com" + metadata.getString("url");
}
}
if (navigationEndpoint.has("urlEndpoint")) { if (navigationEndpoint.has("urlEndpoint")) {
String internUrl = navigationEndpoint.getObject("urlEndpoint").getString("url"); String internUrl = navigationEndpoint.getObject("urlEndpoint")
.getString("url");
if (internUrl.startsWith("https://www.youtube.com/redirect?")) { if (internUrl.startsWith("https://www.youtube.com/redirect?")) {
// remove https://www.youtube.com part to fall in the next if block // remove https://www.youtube.com part to fall in the next if block
internUrl = internUrl.substring(23); internUrl = internUrl.substring(23);
@ -879,7 +872,9 @@ public final class YoutubeParsingHelper {
|| internUrl.startsWith("/watch")) { || internUrl.startsWith("/watch")) {
return "https://www.youtube.com" + internUrl; return "https://www.youtube.com" + internUrl;
} }
} else if (navigationEndpoint.has("browseEndpoint")) { }
if (navigationEndpoint.has("browseEndpoint")) {
final JsonObject browseEndpoint = navigationEndpoint.getObject("browseEndpoint"); final JsonObject browseEndpoint = navigationEndpoint.getObject("browseEndpoint");
final String canonicalBaseUrl = browseEndpoint.getString("canonicalBaseUrl"); final String canonicalBaseUrl = browseEndpoint.getString("canonicalBaseUrl");
final String browseId = browseEndpoint.getString("browseId"); final String browseId = browseEndpoint.getString("browseId");
@ -892,26 +887,39 @@ public final class YoutubeParsingHelper {
if (!isNullOrEmpty(canonicalBaseUrl)) { if (!isNullOrEmpty(canonicalBaseUrl)) {
return "https://www.youtube.com" + canonicalBaseUrl; return "https://www.youtube.com" + canonicalBaseUrl;
} }
}
throw new ParsingException("canonicalBaseUrl is null and browseId is not a channel (\"" if (navigationEndpoint.has("watchEndpoint")) {
+ browseEndpoint + "\")");
} else if (navigationEndpoint.has("watchEndpoint")) {
final StringBuilder url = new StringBuilder(); final StringBuilder url = new StringBuilder();
url.append("https://www.youtube.com/watch?v=").append(navigationEndpoint url.append("https://www.youtube.com/watch?v=")
.getObject("watchEndpoint").getString(VIDEO_ID)); .append(navigationEndpoint.getObject("watchEndpoint")
.getString(VIDEO_ID));
if (navigationEndpoint.getObject("watchEndpoint").has("playlistId")) { if (navigationEndpoint.getObject("watchEndpoint").has("playlistId")) {
url.append("&list=").append(navigationEndpoint.getObject("watchEndpoint") url.append("&list=").append(navigationEndpoint.getObject("watchEndpoint")
.getString("playlistId")); .getString("playlistId"));
} }
if (navigationEndpoint.getObject("watchEndpoint").has("startTimeSeconds")) { if (navigationEndpoint.getObject("watchEndpoint").has("startTimeSeconds")) {
url.append("&t=").append(navigationEndpoint.getObject("watchEndpoint") url.append("&t=")
.append(navigationEndpoint.getObject("watchEndpoint")
.getInt("startTimeSeconds")); .getInt("startTimeSeconds"));
} }
return url.toString(); return url.toString();
} else if (navigationEndpoint.has("watchPlaylistEndpoint")) {
return "https://www.youtube.com/playlist?list="
+ navigationEndpoint.getObject("watchPlaylistEndpoint").getString("playlistId");
} }
if (navigationEndpoint.has("watchPlaylistEndpoint")) {
return "https://www.youtube.com/playlist?list="
+ navigationEndpoint.getObject("watchPlaylistEndpoint")
.getString("playlistId");
}
if (navigationEndpoint.has("commandMetadata")) {
final JsonObject metadata = navigationEndpoint.getObject("commandMetadata")
.getObject("webCommandMetadata");
if (metadata.has("url")) {
return "https://www.youtube.com" + metadata.getString("url");
}
}
return null; return null;
} }
@ -924,8 +932,7 @@ public final class YoutubeParsingHelper {
* @return text in the JSON object or {@code null} * @return text in the JSON object or {@code null}
*/ */
@Nullable @Nullable
public static String getTextFromObject(final JsonObject textObject, final boolean html) public static String getTextFromObject(final JsonObject textObject, final boolean html) {
throws ParsingException {
if (isNullOrEmpty(textObject)) { if (isNullOrEmpty(textObject)) {
return null; return null;
} }
@ -944,12 +951,12 @@ public final class YoutubeParsingHelper {
String text = run.getString("text"); String text = run.getString("text");
if (html) { if (html) {
text = Entities.escape(text);
if (run.has("navigationEndpoint")) { if (run.has("navigationEndpoint")) {
final String url = getUrlFromNavigationEndpoint(run final String url = getUrlFromNavigationEndpoint(
.getObject("navigationEndpoint")); run.getObject("navigationEndpoint"));
if (!isNullOrEmpty(url)) { if (!isNullOrEmpty(url)) {
text = "<a href=\"" + url + "\">" + text + "</a>"; text = "<a href=\"" + Entities.escape(url) + "\">" + Entities.escape(text)
+ "</a>";
} }
} }
@ -1015,11 +1022,12 @@ public final class YoutubeParsingHelper {
} }
final String content = attributedDescription.getString("content"); final String content = attributedDescription.getString("content");
final JsonArray commandRuns = attributedDescription.getArray("commandRuns");
if (content == null) { if (content == null) {
return null; return null;
} }
final JsonArray commandRuns = attributedDescription.getArray("commandRuns");
final StringBuilder textBuilder = new StringBuilder(); final StringBuilder textBuilder = new StringBuilder();
int textStart = 0; int textStart = 0;
@ -1038,12 +1046,7 @@ public final class YoutubeParsingHelper {
continue; continue;
} }
final String url; final String url = getUrlFromNavigationEndpoint(navigationEndpoint);
try {
url = getUrlFromNavigationEndpoint(navigationEndpoint);
} catch (final ParsingException e) {
continue;
}
if (url == null) { if (url == null) {
continue; continue;
@ -1062,9 +1065,9 @@ public final class YoutubeParsingHelper {
.replaceFirst("^[/•] *", ""); .replaceFirst("^[/•] *", "");
textBuilder.append("<a href=\"") textBuilder.append("<a href=\"")
.append(url) .append(Entities.escape(url))
.append("\">") .append("\">")
.append(linkText) .append(Entities.escape(linkText))
.append("</a>"); .append("</a>");
textStart = startIndex + length; textStart = startIndex + length;
@ -1081,13 +1084,12 @@ public final class YoutubeParsingHelper {
} }
@Nullable @Nullable
public static String getTextFromObject(final JsonObject textObject) throws ParsingException { public static String getTextFromObject(final JsonObject textObject) {
return getTextFromObject(textObject, false); return getTextFromObject(textObject, false);
} }
@Nullable @Nullable
public static String getUrlFromObject(final JsonObject textObject) throws ParsingException { public static String getUrlFromObject(final JsonObject textObject) {
if (isNullOrEmpty(textObject)) { if (isNullOrEmpty(textObject)) {
return null; return null;
} }
@ -1108,8 +1110,7 @@ public final class YoutubeParsingHelper {
} }
@Nullable @Nullable
public static String getTextAtKey(@Nonnull final JsonObject jsonObject, final String theKey) public static String getTextAtKey(@Nonnull final JsonObject jsonObject, final String theKey) {
throws ParsingException {
if (jsonObject.isString(theKey)) { if (jsonObject.isString(theKey)) {
return jsonObject.getString(theKey); return jsonObject.getString(theKey);
} else { } else {

View File

@ -45,13 +45,10 @@ public class YoutubeChannelInfoItemExtractor implements ChannelInfoItemExtractor
this.channelInfoItem = channelInfoItem; this.channelInfoItem = channelInfoItem;
boolean wHandle = false; boolean wHandle = false;
try { final String subscriberCountText = getTextFromObject(
final String subscriberCountText = getTextFromObject( channelInfoItem.getObject("subscriberCountText"));
channelInfoItem.getObject("subscriberCountText")); if (subscriberCountText != null) {
if (subscriberCountText != null) { wHandle = subscriberCountText.startsWith("@");
wHandle = subscriberCountText.startsWith("@");
}
} catch (final ParsingException ignored) {
} }
this.withHandle = wHandle; this.withHandle = wHandle;
} }

View File

@ -168,11 +168,7 @@ public class YoutubeStreamExtractor extends StreamExtractor {
title = playerResponse.getObject("videoDetails").getString("title"); title = playerResponse.getObject("videoDetails").getString("title");
if (isNullOrEmpty(title)) { if (isNullOrEmpty(title)) {
try { title = getTextFromObject(getVideoPrimaryInfoRenderer().getObject("title"));
title = getTextFromObject(getVideoPrimaryInfoRenderer().getObject("title"));
} catch (final ParsingException ignored) {
// Age-restricted videos cause a ParsingException here
}
if (isNullOrEmpty(title)) { if (isNullOrEmpty(title)) {
throw new ParsingException("Could not get name"); throw new ParsingException("Could not get name");
@ -285,21 +281,17 @@ public class YoutubeStreamExtractor extends StreamExtractor {
public Description getDescription() throws ParsingException { public Description getDescription() throws ParsingException {
assertPageFetched(); assertPageFetched();
// Description with more info on links // Description with more info on links
try { final String videoSecondaryInfoRendererDescription = getTextFromObject(
final String description = getTextFromObject( getVideoSecondaryInfoRenderer().getObject("description"),
getVideoSecondaryInfoRenderer().getObject("description"), true);
true); if (!isNullOrEmpty(videoSecondaryInfoRendererDescription)) {
if (!isNullOrEmpty(description)) { return new Description(videoSecondaryInfoRendererDescription, Description.HTML);
return new Description(description, Description.HTML); }
}
final String attributedDescription = getAttributedDescription( final String attributedDescription = getAttributedDescription(
getVideoSecondaryInfoRenderer().getObject("attributedDescription")); getVideoSecondaryInfoRenderer().getObject("attributedDescription"));
if (!isNullOrEmpty(attributedDescription)) { if (!isNullOrEmpty(attributedDescription)) {
return new Description(attributedDescription, Description.HTML); return new Description(attributedDescription, Description.HTML);
}
} catch (final ParsingException ignored) {
// Age-restricted videos cause a ParsingException here
} }
String description = playerResponse.getObject("videoDetails") String description = playerResponse.getObject("videoDetails")
@ -400,14 +392,8 @@ public class YoutubeStreamExtractor extends StreamExtractor {
@Override @Override
public long getViewCount() throws ParsingException { public long getViewCount() throws ParsingException {
String views = null; String views = getTextFromObject(getVideoPrimaryInfoRenderer().getObject("viewCount")
.getObject("videoViewCountRenderer").getObject("viewCount"));
try {
views = getTextFromObject(getVideoPrimaryInfoRenderer().getObject("viewCount")
.getObject("videoViewCountRenderer").getObject("viewCount"));
} catch (final ParsingException ignored) {
// Age-restricted videos cause a ParsingException here
}
if (isNullOrEmpty(views)) { if (isNullOrEmpty(views)) {
views = playerResponse.getObject("videoDetails").getString("viewCount"); views = playerResponse.getObject("videoDetails").getString("viewCount");
@ -795,7 +781,7 @@ public class YoutubeStreamExtractor extends StreamExtractor {
return getTextFromObject(playerResponse.getObject("playabilityStatus") return getTextFromObject(playerResponse.getObject("playabilityStatus")
.getObject("errorScreen").getObject("playerErrorMessageRenderer") .getObject("errorScreen").getObject("playerErrorMessageRenderer")
.getObject("reason")); .getObject("reason"));
} catch (final ParsingException | NullPointerException e) { } catch (final NullPointerException e) {
return null; // No error message return null; // No error message
} }
} }

View File

@ -183,10 +183,10 @@ public class YoutubeStreamExtractorDefaultTest {
@Override public String expectedUploaderUrl() { return "https://www.youtube.com/channel/UCsTcErHg8oDvUnTzoqsYeNw"; } @Override public String expectedUploaderUrl() { return "https://www.youtube.com/channel/UCsTcErHg8oDvUnTzoqsYeNw"; }
@Override public long expectedUploaderSubscriberCountAtLeast() { return 18_000_000; } @Override public long expectedUploaderSubscriberCountAtLeast() { return 18_000_000; }
@Override public List<String> expectedDescriptionContains() { @Override public List<String> expectedDescriptionContains() {
return Arrays.asList("https://www.youtube.com/watch?v=X7FLCHVXpsA&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34", return Arrays.asList("https://www.youtube.com/watch?v=X7FLCHVXpsA&amp;list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
"https://www.youtube.com/watch?v=Lqv6G0pDNnw&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34", "https://www.youtube.com/watch?v=Lqv6G0pDNnw&amp;list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
"https://www.youtube.com/watch?v=XxaRBPyrnBU&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34", "https://www.youtube.com/watch?v=XxaRBPyrnBU&amp;list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
"https://www.youtube.com/watch?v=U-9tUEOFKNU&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34"); "https://www.youtube.com/watch?v=U-9tUEOFKNU&amp;list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34");
} }
@Override public long expectedLength() { return 434; } @Override public long expectedLength() { return 434; }
@Override public long expectedViewCountAtLeast() { return 21229200; } @Override public long expectedViewCountAtLeast() { return 21229200; }

View File

@ -3,10 +3,10 @@
"httpMethod": "GET", "httpMethod": "GET",
"url": "https://www.youtube.com/sw.js", "url": "https://www.youtube.com/sw.js",
"headers": { "headers": {
"Origin": [ "Referer": [
"https://www.youtube.com" "https://www.youtube.com"
], ],
"Referer": [ "Origin": [
"https://www.youtube.com" "https://www.youtube.com"
], ],
"Accept-Language": [ "Accept-Language": [
@ -29,7 +29,7 @@
"https://www.youtube.com" "https://www.youtube.com"
], ],
"alt-svc": [ "alt-svc": [
"h3\u003d\":443\"; ma\u003d2592000,h3-29\u003d\":443\"; ma\u003d2592000,h3-Q050\u003d\":443\"; ma\u003d2592000,h3-Q046\u003d\":443\"; ma\u003d2592000,h3-Q043\u003d\":443\"; ma\u003d2592000,quic\u003d\":443\"; ma\u003d2592000; v\u003d\"46,43\"" "h3\u003d\":443\"; ma\u003d2592000,h3-29\u003d\":443\"; ma\u003d2592000"
], ],
"cache-control": [ "cache-control": [
"private, max-age\u003d0" "private, max-age\u003d0"
@ -41,10 +41,10 @@
"same-origin; report-to\u003d\"youtube_main\"" "same-origin; report-to\u003d\"youtube_main\""
], ],
"date": [ "date": [
"Mon, 28 Nov 2022 20:27:36 GMT" "Sun, 26 Feb 2023 17:48:54 GMT"
], ],
"expires": [ "expires": [
"Mon, 28 Nov 2022 20:27:36 GMT" "Sun, 26 Feb 2023 17:48:54 GMT"
], ],
"p3p": [ "p3p": [
"CP\u003d\"This is not a P3P policy! See http://support.google.com/accounts/answer/151657?hl\u003den-GB for more info.\"" "CP\u003d\"This is not a P3P policy! See http://support.google.com/accounts/answer/151657?hl\u003den-GB for more info.\""
@ -59,9 +59,9 @@
"ESF" "ESF"
], ],
"set-cookie": [ "set-cookie": [
"YSC\u003ddaTQ98V-voQ; Domain\u003d.youtube.com; Path\u003d/; Secure; HttpOnly; SameSite\u003dnone", "YSC\u003dYJXWRWCYVkE; Domain\u003d.youtube.com; Path\u003d/; Secure; HttpOnly; SameSite\u003dnone",
"VISITOR_INFO1_LIVE\u003d; Domain\u003d.youtube.com; Expires\u003dTue, 03-Mar-2020 20:27:36 GMT; Path\u003d/; Secure; HttpOnly; SameSite\u003dnone", "VISITOR_INFO1_LIVE\u003d; Domain\u003d.youtube.com; Expires\u003dMon, 01-Jun-2020 17:48:54 GMT; Path\u003d/; Secure; HttpOnly; SameSite\u003dnone",
"CONSENT\u003dPENDING+452; expires\u003dWed, 27-Nov-2024 20:27:36 GMT; path\u003d/; domain\u003d.youtube.com; Secure" "CONSENT\u003dPENDING+668; expires\u003dTue, 25-Feb-2025 17:48:54 GMT; path\u003d/; domain\u003d.youtube.com; Secure"
], ],
"strict-transport-security": [ "strict-transport-security": [
"max-age\u003d31536000" "max-age\u003d31536000"

View File

@ -29,7 +29,7 @@
"https://www.youtube.com" "https://www.youtube.com"
], ],
"alt-svc": [ "alt-svc": [
"h3\u003d\":443\"; ma\u003d2592000,h3-29\u003d\":443\"; ma\u003d2592000,h3-Q050\u003d\":443\"; ma\u003d2592000,h3-Q046\u003d\":443\"; ma\u003d2592000,h3-Q043\u003d\":443\"; ma\u003d2592000,quic\u003d\":443\"; ma\u003d2592000; v\u003d\"46,43\"" "h3\u003d\":443\"; ma\u003d2592000,h3-29\u003d\":443\"; ma\u003d2592000"
], ],
"cache-control": [ "cache-control": [
"private, max-age\u003d0" "private, max-age\u003d0"
@ -41,10 +41,10 @@
"same-origin; report-to\u003d\"youtube_main\"" "same-origin; report-to\u003d\"youtube_main\""
], ],
"date": [ "date": [
"Tue, 22 Nov 2022 10:40:20 GMT" "Sun, 26 Feb 2023 10:57:08 GMT"
], ],
"expires": [ "expires": [
"Tue, 22 Nov 2022 10:40:20 GMT" "Sun, 26 Feb 2023 10:57:08 GMT"
], ],
"p3p": [ "p3p": [
"CP\u003d\"This is not a P3P policy! See http://support.google.com/accounts/answer/151657?hl\u003den-GB for more info.\"" "CP\u003d\"This is not a P3P policy! See http://support.google.com/accounts/answer/151657?hl\u003den-GB for more info.\""
@ -59,9 +59,9 @@
"ESF" "ESF"
], ],
"set-cookie": [ "set-cookie": [
"YSC\u003ddIhq5C9znKU; Domain\u003d.youtube.com; Path\u003d/; Secure; HttpOnly; SameSite\u003dnone", "YSC\u003dL2wyk8wP8TA; Domain\u003d.youtube.com; Path\u003d/; Secure; HttpOnly; SameSite\u003dnone",
"VISITOR_INFO1_LIVE\u003d; Domain\u003d.youtube.com; Expires\u003dWed, 26-Feb-2020 10:40:20 GMT; Path\u003d/; Secure; HttpOnly; SameSite\u003dnone", "VISITOR_INFO1_LIVE\u003d; Domain\u003d.youtube.com; Expires\u003dMon, 01-Jun-2020 10:57:08 GMT; Path\u003d/; Secure; HttpOnly; SameSite\u003dnone",
"CONSENT\u003dPENDING+600; expires\u003dThu, 21-Nov-2024 10:40:19 GMT; path\u003d/; domain\u003d.youtube.com; Secure" "CONSENT\u003dPENDING+005; expires\u003dTue, 25-Feb-2025 10:57:08 GMT; path\u003d/; domain\u003d.youtube.com; Secure"
], ],
"strict-transport-security": [ "strict-transport-security": [
"max-age\u003d31536000" "max-age\u003d31536000"