Merge pull request #1032 from AudricV/yt_fix-comments-hashtags-links-extraction

[YouTube] Fix hashtags links extraction and escape HTML links
This commit is contained in:
Stypox 2023-03-01 10:47:37 +01:00 committed by GitHub
commit 19e4b216c9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 330 additions and 777 deletions

View File

@ -822,7 +822,7 @@ public final class YoutubeParsingHelper {
try {
final String url = "https://music.youtube.com/sw.js";
final var headers = getOriginReferrerHeaders("https://music.youtube.com");
final var headers = getOriginReferrerHeaders(YOUTUBE_MUSIC_URL);
final String response = getDownloader().get(url, headers).responseBody();
musicClientVersion = getStringResultFromRegexArray(response,
INNERTUBE_CONTEXT_CLIENT_VERSION_REGEXES, 1);
@ -843,18 +843,11 @@ public final class YoutubeParsingHelper {
}
@Nullable
public static String getUrlFromNavigationEndpoint(@Nonnull final JsonObject navigationEndpoint)
throws ParsingException {
if (navigationEndpoint.has("webCommandMetadata")) {
// this case needs to be handled before the browseEndpoint,
// e.g. for hashtags in comments
final JsonObject metadata = navigationEndpoint.getObject("webCommandMetadata");
if (metadata.has("url")) {
return "https://www.youtube.com" + metadata.getString("url");
}
}
public static String getUrlFromNavigationEndpoint(
@Nonnull final JsonObject navigationEndpoint) {
if (navigationEndpoint.has("urlEndpoint")) {
String internUrl = navigationEndpoint.getObject("urlEndpoint").getString("url");
String internUrl = navigationEndpoint.getObject("urlEndpoint")
.getString("url");
if (internUrl.startsWith("https://www.youtube.com/redirect?")) {
// remove https://www.youtube.com part to fall in the next if block
internUrl = internUrl.substring(23);
@ -879,7 +872,9 @@ public final class YoutubeParsingHelper {
|| internUrl.startsWith("/watch")) {
return "https://www.youtube.com" + internUrl;
}
} else if (navigationEndpoint.has("browseEndpoint")) {
}
if (navigationEndpoint.has("browseEndpoint")) {
final JsonObject browseEndpoint = navigationEndpoint.getObject("browseEndpoint");
final String canonicalBaseUrl = browseEndpoint.getString("canonicalBaseUrl");
final String browseId = browseEndpoint.getString("browseId");
@ -892,26 +887,39 @@ public final class YoutubeParsingHelper {
if (!isNullOrEmpty(canonicalBaseUrl)) {
return "https://www.youtube.com" + canonicalBaseUrl;
}
}
throw new ParsingException("canonicalBaseUrl is null and browseId is not a channel (\""
+ browseEndpoint + "\")");
} else if (navigationEndpoint.has("watchEndpoint")) {
if (navigationEndpoint.has("watchEndpoint")) {
final StringBuilder url = new StringBuilder();
url.append("https://www.youtube.com/watch?v=").append(navigationEndpoint
.getObject("watchEndpoint").getString(VIDEO_ID));
url.append("https://www.youtube.com/watch?v=")
.append(navigationEndpoint.getObject("watchEndpoint")
.getString(VIDEO_ID));
if (navigationEndpoint.getObject("watchEndpoint").has("playlistId")) {
url.append("&list=").append(navigationEndpoint.getObject("watchEndpoint")
.getString("playlistId"));
}
if (navigationEndpoint.getObject("watchEndpoint").has("startTimeSeconds")) {
url.append("&t=").append(navigationEndpoint.getObject("watchEndpoint")
url.append("&t=")
.append(navigationEndpoint.getObject("watchEndpoint")
.getInt("startTimeSeconds"));
}
return url.toString();
} else if (navigationEndpoint.has("watchPlaylistEndpoint")) {
return "https://www.youtube.com/playlist?list="
+ navigationEndpoint.getObject("watchPlaylistEndpoint").getString("playlistId");
}
if (navigationEndpoint.has("watchPlaylistEndpoint")) {
return "https://www.youtube.com/playlist?list="
+ navigationEndpoint.getObject("watchPlaylistEndpoint")
.getString("playlistId");
}
if (navigationEndpoint.has("commandMetadata")) {
final JsonObject metadata = navigationEndpoint.getObject("commandMetadata")
.getObject("webCommandMetadata");
if (metadata.has("url")) {
return "https://www.youtube.com" + metadata.getString("url");
}
}
return null;
}
@ -924,8 +932,7 @@ public final class YoutubeParsingHelper {
* @return text in the JSON object or {@code null}
*/
@Nullable
public static String getTextFromObject(final JsonObject textObject, final boolean html)
throws ParsingException {
public static String getTextFromObject(final JsonObject textObject, final boolean html) {
if (isNullOrEmpty(textObject)) {
return null;
}
@ -944,12 +951,12 @@ public final class YoutubeParsingHelper {
String text = run.getString("text");
if (html) {
text = Entities.escape(text);
if (run.has("navigationEndpoint")) {
final String url = getUrlFromNavigationEndpoint(run
.getObject("navigationEndpoint"));
final String url = getUrlFromNavigationEndpoint(
run.getObject("navigationEndpoint"));
if (!isNullOrEmpty(url)) {
text = "<a href=\"" + url + "\">" + text + "</a>";
text = "<a href=\"" + Entities.escape(url) + "\">" + Entities.escape(text)
+ "</a>";
}
}
@ -1015,11 +1022,12 @@ public final class YoutubeParsingHelper {
}
final String content = attributedDescription.getString("content");
final JsonArray commandRuns = attributedDescription.getArray("commandRuns");
if (content == null) {
return null;
}
final JsonArray commandRuns = attributedDescription.getArray("commandRuns");
final StringBuilder textBuilder = new StringBuilder();
int textStart = 0;
@ -1038,12 +1046,7 @@ public final class YoutubeParsingHelper {
continue;
}
final String url;
try {
url = getUrlFromNavigationEndpoint(navigationEndpoint);
} catch (final ParsingException e) {
continue;
}
final String url = getUrlFromNavigationEndpoint(navigationEndpoint);
if (url == null) {
continue;
@ -1062,9 +1065,9 @@ public final class YoutubeParsingHelper {
.replaceFirst("^[/•] *", "");
textBuilder.append("<a href=\"")
.append(url)
.append(Entities.escape(url))
.append("\">")
.append(linkText)
.append(Entities.escape(linkText))
.append("</a>");
textStart = startIndex + length;
@ -1081,13 +1084,12 @@ public final class YoutubeParsingHelper {
}
@Nullable
public static String getTextFromObject(final JsonObject textObject) throws ParsingException {
public static String getTextFromObject(final JsonObject textObject) {
return getTextFromObject(textObject, false);
}
@Nullable
public static String getUrlFromObject(final JsonObject textObject) throws ParsingException {
public static String getUrlFromObject(final JsonObject textObject) {
if (isNullOrEmpty(textObject)) {
return null;
}
@ -1108,8 +1110,7 @@ public final class YoutubeParsingHelper {
}
@Nullable
public static String getTextAtKey(@Nonnull final JsonObject jsonObject, final String theKey)
throws ParsingException {
public static String getTextAtKey(@Nonnull final JsonObject jsonObject, final String theKey) {
if (jsonObject.isString(theKey)) {
return jsonObject.getString(theKey);
} else {

View File

@ -45,13 +45,10 @@ public class YoutubeChannelInfoItemExtractor implements ChannelInfoItemExtractor
this.channelInfoItem = channelInfoItem;
boolean wHandle = false;
try {
final String subscriberCountText = getTextFromObject(
channelInfoItem.getObject("subscriberCountText"));
if (subscriberCountText != null) {
wHandle = subscriberCountText.startsWith("@");
}
} catch (final ParsingException ignored) {
final String subscriberCountText = getTextFromObject(
channelInfoItem.getObject("subscriberCountText"));
if (subscriberCountText != null) {
wHandle = subscriberCountText.startsWith("@");
}
this.withHandle = wHandle;
}

View File

@ -168,11 +168,7 @@ public class YoutubeStreamExtractor extends StreamExtractor {
title = playerResponse.getObject("videoDetails").getString("title");
if (isNullOrEmpty(title)) {
try {
title = getTextFromObject(getVideoPrimaryInfoRenderer().getObject("title"));
} catch (final ParsingException ignored) {
// Age-restricted videos cause a ParsingException here
}
title = getTextFromObject(getVideoPrimaryInfoRenderer().getObject("title"));
if (isNullOrEmpty(title)) {
throw new ParsingException("Could not get name");
@ -285,21 +281,17 @@ public class YoutubeStreamExtractor extends StreamExtractor {
public Description getDescription() throws ParsingException {
assertPageFetched();
// Description with more info on links
try {
final String description = getTextFromObject(
getVideoSecondaryInfoRenderer().getObject("description"),
true);
if (!isNullOrEmpty(description)) {
return new Description(description, Description.HTML);
}
final String videoSecondaryInfoRendererDescription = getTextFromObject(
getVideoSecondaryInfoRenderer().getObject("description"),
true);
if (!isNullOrEmpty(videoSecondaryInfoRendererDescription)) {
return new Description(videoSecondaryInfoRendererDescription, Description.HTML);
}
final String attributedDescription = getAttributedDescription(
getVideoSecondaryInfoRenderer().getObject("attributedDescription"));
if (!isNullOrEmpty(attributedDescription)) {
return new Description(attributedDescription, Description.HTML);
}
} catch (final ParsingException ignored) {
// Age-restricted videos cause a ParsingException here
final String attributedDescription = getAttributedDescription(
getVideoSecondaryInfoRenderer().getObject("attributedDescription"));
if (!isNullOrEmpty(attributedDescription)) {
return new Description(attributedDescription, Description.HTML);
}
String description = playerResponse.getObject("videoDetails")
@ -400,14 +392,8 @@ public class YoutubeStreamExtractor extends StreamExtractor {
@Override
public long getViewCount() throws ParsingException {
String views = null;
try {
views = getTextFromObject(getVideoPrimaryInfoRenderer().getObject("viewCount")
.getObject("videoViewCountRenderer").getObject("viewCount"));
} catch (final ParsingException ignored) {
// Age-restricted videos cause a ParsingException here
}
String views = getTextFromObject(getVideoPrimaryInfoRenderer().getObject("viewCount")
.getObject("videoViewCountRenderer").getObject("viewCount"));
if (isNullOrEmpty(views)) {
views = playerResponse.getObject("videoDetails").getString("viewCount");
@ -795,7 +781,7 @@ public class YoutubeStreamExtractor extends StreamExtractor {
return getTextFromObject(playerResponse.getObject("playabilityStatus")
.getObject("errorScreen").getObject("playerErrorMessageRenderer")
.getObject("reason"));
} catch (final ParsingException | NullPointerException e) {
} catch (final NullPointerException e) {
return null; // No error message
}
}

View File

@ -183,10 +183,10 @@ public class YoutubeStreamExtractorDefaultTest {
@Override public String expectedUploaderUrl() { return "https://www.youtube.com/channel/UCsTcErHg8oDvUnTzoqsYeNw"; }
@Override public long expectedUploaderSubscriberCountAtLeast() { return 18_000_000; }
@Override public List<String> expectedDescriptionContains() {
return Arrays.asList("https://www.youtube.com/watch?v=X7FLCHVXpsA&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
"https://www.youtube.com/watch?v=Lqv6G0pDNnw&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
"https://www.youtube.com/watch?v=XxaRBPyrnBU&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
"https://www.youtube.com/watch?v=U-9tUEOFKNU&list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34");
return Arrays.asList("https://www.youtube.com/watch?v=X7FLCHVXpsA&amp;list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
"https://www.youtube.com/watch?v=Lqv6G0pDNnw&amp;list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
"https://www.youtube.com/watch?v=XxaRBPyrnBU&amp;list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34",
"https://www.youtube.com/watch?v=U-9tUEOFKNU&amp;list=PL7u4lWXQ3wfI_7PgX0C-VTiwLeu0S4v34");
}
@Override public long expectedLength() { return 434; }
@Override public long expectedViewCountAtLeast() { return 21229200; }

View File

@ -3,10 +3,10 @@
"httpMethod": "GET",
"url": "https://www.youtube.com/sw.js",
"headers": {
"Origin": [
"Referer": [
"https://www.youtube.com"
],
"Referer": [
"Origin": [
"https://www.youtube.com"
],
"Accept-Language": [
@ -29,7 +29,7 @@
"https://www.youtube.com"
],
"alt-svc": [
"h3\u003d\":443\"; ma\u003d2592000,h3-29\u003d\":443\"; ma\u003d2592000,h3-Q050\u003d\":443\"; ma\u003d2592000,h3-Q046\u003d\":443\"; ma\u003d2592000,h3-Q043\u003d\":443\"; ma\u003d2592000,quic\u003d\":443\"; ma\u003d2592000; v\u003d\"46,43\""
"h3\u003d\":443\"; ma\u003d2592000,h3-29\u003d\":443\"; ma\u003d2592000"
],
"cache-control": [
"private, max-age\u003d0"
@ -41,10 +41,10 @@
"same-origin; report-to\u003d\"youtube_main\""
],
"date": [
"Mon, 28 Nov 2022 20:27:36 GMT"
"Sun, 26 Feb 2023 17:48:54 GMT"
],
"expires": [
"Mon, 28 Nov 2022 20:27:36 GMT"
"Sun, 26 Feb 2023 17:48:54 GMT"
],
"p3p": [
"CP\u003d\"This is not a P3P policy! See http://support.google.com/accounts/answer/151657?hl\u003den-GB for more info.\""
@ -59,9 +59,9 @@
"ESF"
],
"set-cookie": [
"YSC\u003ddaTQ98V-voQ; Domain\u003d.youtube.com; Path\u003d/; Secure; HttpOnly; SameSite\u003dnone",
"VISITOR_INFO1_LIVE\u003d; Domain\u003d.youtube.com; Expires\u003dTue, 03-Mar-2020 20:27:36 GMT; Path\u003d/; Secure; HttpOnly; SameSite\u003dnone",
"CONSENT\u003dPENDING+452; expires\u003dWed, 27-Nov-2024 20:27:36 GMT; path\u003d/; domain\u003d.youtube.com; Secure"
"YSC\u003dYJXWRWCYVkE; Domain\u003d.youtube.com; Path\u003d/; Secure; HttpOnly; SameSite\u003dnone",
"VISITOR_INFO1_LIVE\u003d; Domain\u003d.youtube.com; Expires\u003dMon, 01-Jun-2020 17:48:54 GMT; Path\u003d/; Secure; HttpOnly; SameSite\u003dnone",
"CONSENT\u003dPENDING+668; expires\u003dTue, 25-Feb-2025 17:48:54 GMT; path\u003d/; domain\u003d.youtube.com; Secure"
],
"strict-transport-security": [
"max-age\u003d31536000"

View File

@ -29,7 +29,7 @@
"https://www.youtube.com"
],
"alt-svc": [
"h3\u003d\":443\"; ma\u003d2592000,h3-29\u003d\":443\"; ma\u003d2592000,h3-Q050\u003d\":443\"; ma\u003d2592000,h3-Q046\u003d\":443\"; ma\u003d2592000,h3-Q043\u003d\":443\"; ma\u003d2592000,quic\u003d\":443\"; ma\u003d2592000; v\u003d\"46,43\""
"h3\u003d\":443\"; ma\u003d2592000,h3-29\u003d\":443\"; ma\u003d2592000"
],
"cache-control": [
"private, max-age\u003d0"
@ -41,10 +41,10 @@
"same-origin; report-to\u003d\"youtube_main\""
],
"date": [
"Tue, 22 Nov 2022 10:40:20 GMT"
"Sun, 26 Feb 2023 10:57:08 GMT"
],
"expires": [
"Tue, 22 Nov 2022 10:40:20 GMT"
"Sun, 26 Feb 2023 10:57:08 GMT"
],
"p3p": [
"CP\u003d\"This is not a P3P policy! See http://support.google.com/accounts/answer/151657?hl\u003den-GB for more info.\""
@ -59,9 +59,9 @@
"ESF"
],
"set-cookie": [
"YSC\u003ddIhq5C9znKU; Domain\u003d.youtube.com; Path\u003d/; Secure; HttpOnly; SameSite\u003dnone",
"VISITOR_INFO1_LIVE\u003d; Domain\u003d.youtube.com; Expires\u003dWed, 26-Feb-2020 10:40:20 GMT; Path\u003d/; Secure; HttpOnly; SameSite\u003dnone",
"CONSENT\u003dPENDING+600; expires\u003dThu, 21-Nov-2024 10:40:19 GMT; path\u003d/; domain\u003d.youtube.com; Secure"
"YSC\u003dL2wyk8wP8TA; Domain\u003d.youtube.com; Path\u003d/; Secure; HttpOnly; SameSite\u003dnone",
"VISITOR_INFO1_LIVE\u003d; Domain\u003d.youtube.com; Expires\u003dMon, 01-Jun-2020 10:57:08 GMT; Path\u003d/; Secure; HttpOnly; SameSite\u003dnone",
"CONSENT\u003dPENDING+005; expires\u003dTue, 25-Feb-2025 10:57:08 GMT; path\u003d/; domain\u003d.youtube.com; Secure"
],
"strict-transport-security": [
"max-age\u003d31536000"