NewPipeExtractor/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeParsingHelper.java

828 lines
35 KiB
Java
Raw Normal View History

package org.schabi.newpipe.extractor.services.youtube;
2017-03-01 18:47:52 +01:00
import com.grack.nanojson.JsonArray;
import com.grack.nanojson.JsonObject;
import com.grack.nanojson.JsonParser;
import com.grack.nanojson.JsonParserException;
import com.grack.nanojson.JsonWriter;
import org.schabi.newpipe.extractor.MetaInfo;
import org.schabi.newpipe.extractor.Page;
import org.schabi.newpipe.extractor.downloader.Response;
import org.schabi.newpipe.extractor.exceptions.ContentNotAvailableException;
2020-02-29 16:42:04 +01:00
import org.schabi.newpipe.extractor.exceptions.ExtractionException;
2017-03-01 18:47:52 +01:00
import org.schabi.newpipe.extractor.exceptions.ParsingException;
import org.schabi.newpipe.extractor.exceptions.ReCaptchaException;
2020-02-29 16:55:07 +01:00
import org.schabi.newpipe.extractor.localization.Localization;
import org.schabi.newpipe.extractor.stream.Description;
import org.schabi.newpipe.extractor.utils.JsonUtils;
import org.schabi.newpipe.extractor.utils.Parser;
import org.schabi.newpipe.extractor.utils.Utils;
2017-03-01 18:47:52 +01:00
2020-02-29 16:42:04 +01:00
import java.io.IOException;
2020-02-27 17:39:23 +01:00
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
2020-02-27 17:39:23 +01:00
import java.net.URLDecoder;
import java.time.LocalDate;
import java.time.OffsetDateTime;
import java.time.ZoneOffset;
import java.time.format.DateTimeParseException;
2021-01-17 18:48:16 +01:00
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
2020-02-26 15:22:59 +01:00
import static org.schabi.newpipe.extractor.NewPipe.getDownloader;
import static org.schabi.newpipe.extractor.utils.Utils.EMPTY_STRING;
import static org.schabi.newpipe.extractor.utils.Utils.HTTP;
import static org.schabi.newpipe.extractor.utils.Utils.HTTPS;
import static org.schabi.newpipe.extractor.utils.Utils.UTF_8;
import static org.schabi.newpipe.extractor.utils.Utils.isNullOrEmpty;
import static org.schabi.newpipe.extractor.utils.Utils.join;
/*
* Created by Christian Schabesberger on 02.03.16.
*
* Copyright (C) Christian Schabesberger 2016 <chris.schabesberger@mailbox.org>
* YoutubeParsingHelper.java is part of NewPipe.
*
* NewPipe is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* NewPipe is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with NewPipe. If not, see <http://www.gnu.org/licenses/>.
*/
public class YoutubeParsingHelper {
// This class only exposes static helpers, so instantiation is prevented.
private YoutubeParsingHelper() {
}
2020-02-26 15:22:59 +01:00
private static final String HARDCODED_CLIENT_VERSION = "2.20200214.04.00";
private static String clientVersion;
2020-07-26 12:00:56 +02:00
private static String key;
2020-03-20 11:05:19 +01:00
private static final String[] HARDCODED_YOUTUBE_MUSIC_KEYS = {"AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30", "67", "0.1"};
2020-03-17 11:33:39 +01:00
private static String[] youtubeMusicKeys;
private static final String FEED_BASE_CHANNEL_ID = "https://www.youtube.com/feeds/videos.xml?channel_id=";
private static final String FEED_BASE_USER = "https://www.youtube.com/feeds/videos.xml?user=";
/**
 * Checks whether the given URL points to a Google host.
 * <p>
 * A Google webcache URL is resolved to the cached URL first, so the check is applied to
 * the page that is actually referenced.
 * </p>
 *
 * @param url the URL to check
 * @return {@code true} if the host starts with {@code google.}, {@code m.google.} or
 * {@code www.google.}; {@code false} otherwise or if the URL is malformed
 */
private static boolean isGoogleURL(String url) {
    final String resolvedUrl = extractCachedUrlIfNeeded(url);
    try {
        final String host = new URL(resolvedUrl).getHost();
        // "www." has to be covered as well, otherwise regular Google search links slip through
        return host.startsWith("google.")
                || host.startsWith("m.google.")
                || host.startsWith("www.google.");
    } catch (MalformedURLException e) {
        return false;
    }
}
public static boolean isYoutubeURL(final URL url) {
final String host = url.getHost();
return host.equalsIgnoreCase("youtube.com") || host.equalsIgnoreCase("www.youtube.com")
2019-09-12 04:43:49 +02:00
|| host.equalsIgnoreCase("m.youtube.com") || host.equalsIgnoreCase("music.youtube.com");
}
/**
 * Tells whether the host of the URL is {@code www.youtube-nocookie.com} or
 * {@code youtu.be}, ignoring case.
 *
 * @param url the URL whose host is inspected
 * @return {@code true} if the host is one of these two hosts
 */
public static boolean isYoutubeServiceURL(final URL url) {
    final String hostName = url.getHost();
    if (hostName.equalsIgnoreCase("www.youtube-nocookie.com")) {
        return true;
    }
    return hostName.equalsIgnoreCase("youtu.be");
}
/**
 * Tells whether the host of the URL is {@code hooktube.com}, ignoring case.
 *
 * @param url the URL whose host is inspected
 * @return {@code true} if the host is {@code hooktube.com}
 */
public static boolean isHooktubeURL(final URL url) {
    return url.getHost().equalsIgnoreCase("hooktube.com");
}
public static boolean isInvidioURL(final URL url) {
final String host = url.getHost();
2020-07-02 21:31:05 +02:00
return host.equalsIgnoreCase("invidio.us")
|| host.equalsIgnoreCase("dev.invidio.us")
|| host.equalsIgnoreCase("www.invidio.us")
2021-01-22 19:20:22 +01:00
|| host.equalsIgnoreCase("redirect.invidious.io")
2020-07-02 21:31:05 +02:00
|| host.equalsIgnoreCase("invidious.snopyta.org")
|| host.equalsIgnoreCase("yewtu.be")
|| host.equalsIgnoreCase("tube.connect.cafe")
|| host.equalsIgnoreCase("invidious.zapashcanon.fr")
|| host.equalsIgnoreCase("invidious.kavin.rocks")
|| host.equalsIgnoreCase("invidious.tube")
|| host.equalsIgnoreCase("invidious.site")
|| host.equalsIgnoreCase("invidious.xyz")
|| host.equalsIgnoreCase("vid.mint.lgbt")
|| host.equalsIgnoreCase("invidiou.site")
2021-01-22 19:20:22 +01:00
|| host.equalsIgnoreCase("invidious.fdn.fr")
|| host.equalsIgnoreCase("invidious.048596.xyz")
|| host.equalsIgnoreCase("invidious.zee.li")
|| host.equalsIgnoreCase("vid.puffyan.us")
|| host.equalsIgnoreCase("ytprivate.com");
}
/**
2020-07-02 21:31:05 +02:00
* Parses the duration string of the video expecting ":" or "." as separators
2021-02-07 22:12:22 +01:00
*
* @return the duration in seconds
2020-07-02 21:31:05 +02:00
* @throws ParsingException when more than 3 separators are found
*/
public static int parseDurationString(final String input)
2017-03-01 18:47:52 +01:00
throws ParsingException, NumberFormatException {
// If time separator : is not detected, try . instead
2018-09-09 14:01:39 +02:00
final String[] splitInput = input.contains(":")
? input.split(":")
: input.split("\\.");
2017-03-01 18:47:52 +01:00
String days = "0";
String hours = "0";
String minutes = "0";
2018-09-09 14:01:39 +02:00
final String seconds;
2017-03-01 18:47:52 +01:00
switch (splitInput.length) {
2017-03-01 18:47:52 +01:00
case 4:
days = splitInput[0];
hours = splitInput[1];
minutes = splitInput[2];
seconds = splitInput[3];
break;
case 3:
hours = splitInput[0];
minutes = splitInput[1];
seconds = splitInput[2];
break;
case 2:
minutes = splitInput[0];
seconds = splitInput[1];
break;
case 1:
seconds = splitInput[0];
break;
default:
throw new ParsingException("Error duration string with unknown format: " + input);
}
2020-05-30 17:20:54 +02:00
return ((Integer.parseInt(Utils.removeNonDigitCharacters(days)) * 24
+ Integer.parseInt(Utils.removeNonDigitCharacters(hours))) * 60
+ Integer.parseInt(Utils.removeNonDigitCharacters(minutes))) * 60
+ Integer.parseInt(Utils.removeNonDigitCharacters(seconds));
2017-03-01 18:47:52 +01:00
}
/**
 * Builds the feed URL for a channel id or a user name.
 *
 * @param channelIdOrUser either {@code user/NAME}, {@code channel/ID} or a bare channel id
 * @return the feed URL; a {@code user/} input yields the user feed, everything else the
 * channel feed
 */
public static String getFeedUrlFrom(final String channelIdOrUser) {
    final String userPrefix = "user/";
    final String channelPrefix = "channel/";
    // Only the leading prefix is stripped: replace() would also remove later occurrences
    // of the same text inside the id or name.
    if (channelIdOrUser.startsWith(userPrefix)) {
        return FEED_BASE_USER + channelIdOrUser.substring(userPrefix.length());
    } else if (channelIdOrUser.startsWith(channelPrefix)) {
        return FEED_BASE_CHANNEL_ID + channelIdOrUser.substring(channelPrefix.length());
    } else {
        return FEED_BASE_CHANNEL_ID + channelIdOrUser;
    }
}
/**
 * Parses an upload date. The text is first read as an offset date-time; if that fails it
 * is read as a plain date, which is then placed at the start of that day in UTC.
 *
 * @param textualUploadDate the text to parse
 * @return the parsed date-time
 * @throws ParsingException if the text matches neither format
 */
public static OffsetDateTime parseDateFrom(final String textualUploadDate) throws ParsingException {
    try {
        return OffsetDateTime.parse(textualUploadDate);
    } catch (final DateTimeParseException notAnOffsetDateTime) {
        // not a full date-time, the plain date format is tried below
    }

    try {
        final LocalDate dateOnly = LocalDate.parse(textualUploadDate);
        return dateOnly.atStartOfDay().atOffset(ZoneOffset.UTC);
    } catch (final DateTimeParseException notADate) {
        throw new ParsingException("Could not parse date: \"" + textualUploadDate + "\"", notADate);
    }
}
2020-02-02 18:15:47 +01:00
/**
 * Checks if the given playlist id is a YouTube Mix (auto-generated playlist).
 * Ids from a YouTube Mix start with "RD", but ids of YouTube Music Mixes are excluded.
 *
 * @param playlistId the playlist id to check
 * @return Whether given id belongs to a YouTube Mix
 */
public static boolean isYoutubeMixId(final String playlistId) {
    if (!playlistId.startsWith("RD")) {
        return false;
    }
    return !isYoutubeMusicMixId(playlistId);
}
/**
 * Checks if the given playlist id is a YouTube Music Mix (auto-generated playlist).
 * Ids from a YouTube Music Mix start with "RDAMVM" or "RDCLAK".
 *
 * @param playlistId the playlist id to check
 * @return Whether given id belongs to a YouTube Music Mix
 */
public static boolean isYoutubeMusicMixId(final String playlistId) {
    if (playlistId.startsWith("RDAMVM")) {
        return true;
    }
    return playlistId.startsWith("RDCLAK");
}
2021-02-07 22:12:22 +01:00
/**
 * Checks if the given playlist id is a YouTube Channel Mix (auto-generated playlist).
 * Ids from a YouTube Channel Mix start with "RDCM".
 *
 * @param playlistId the playlist id to check
 * @return Whether given id belongs to a YouTube Channel Mix
 */
public static boolean isYoutubeChannelMixId(final String playlistId) {
    final String channelMixPrefix = "RDCM";
    return playlistId.startsWith(channelMixPrefix);
}
/**
 * Extracts the video id from the playlist id for Mixes, by cutting off the mix prefix.
 *
 * @param playlistId the playlist id of the mix
 * @return the part of the id following the mix prefix
 * @throws ParsingException If the playlistId is a Channel Mix or not a mix.
 */
public static String extractVideoIdFromMixId(final String playlistId) throws ParsingException {
    final int prefixLength;
    if (playlistId.startsWith("RDMM")) {
        // My Mix
        prefixLength = 4;
    } else if (isYoutubeMusicMixId(playlistId)) {
        // starts with "RDAMVM" or "RDCLAK"
        prefixLength = 6;
    } else if (isYoutubeChannelMixId(playlistId) || !isYoutubeMixId(playlistId)) {
        // Channel mixes are built as RDCM{channelId}, so the video id can't be determined;
        // anything that is not a mix at all ends up here as well
        throw new ParsingException("Video id could not be determined from mix id: " + playlistId);
    } else {
        // normal mix, starts with "RD"
        prefixLength = 2;
    }
    return playlistId.substring(prefixLength);
}
public static JsonObject getInitialData(final String html) throws ParsingException {
try {
2020-10-16 20:27:40 +02:00
try {
final String initialData = Parser.matchGroup1("window\\[\"ytInitialData\"\\]\\s*=\\s*(\\{.*?\\});", html);
return JsonParser.object().from(initialData);
} catch (Parser.RegexException e) {
final String initialData = Parser.matchGroup1("var\\s*ytInitialData\\s*=\\s*(\\{.*?\\});", html);
return JsonParser.object().from(initialData);
}
} catch (JsonParserException | Parser.RegexException e) {
throw new ParsingException("Could not get ytInitialData", e);
}
}
public static boolean isHardcodedClientVersionValid() throws IOException, ExtractionException {
final String url = "https://www.youtube.com/results?search_query=test&pbj=1";
final Map<String, List<String>> headers = new HashMap<>();
headers.put("X-YouTube-Client-Name", Collections.singletonList("1"));
headers.put("X-YouTube-Client-Version", Collections.singletonList(HARDCODED_CLIENT_VERSION));
final String response = getDownloader().get(url, headers).responseBody();
2020-02-26 15:22:59 +01:00
return response.length() > 50; // ensure to have a valid response
}
private static void extractClientVersionAndKey() throws IOException, ExtractionException {
final String url = "https://www.youtube.com/results?search_query=test";
final String html = getDownloader().get(url).responseBody();
2020-07-26 12:00:56 +02:00
final JsonObject initialData = getInitialData(html);
final JsonArray serviceTrackingParams = initialData.getObject("responseContext").getArray("serviceTrackingParams");
String shortClientVersion = null;
// try to get version from initial data first
2020-07-26 12:00:56 +02:00
for (final Object service : serviceTrackingParams) {
final JsonObject s = (JsonObject) service;
if (s.getString("service").equals("CSI")) {
2020-07-26 12:00:56 +02:00
final JsonArray params = s.getArray("params");
for (final Object param : params) {
final JsonObject p = (JsonObject) param;
final String key = p.getString("key");
if (key != null && key.equals("cver")) {
2020-07-26 12:00:56 +02:00
clientVersion = p.getString("value");
}
}
} else if (s.getString("service").equals("ECATCHER")) {
// fallback to get a shortened client version which does not contain the last two digits
2020-07-26 12:00:56 +02:00
final JsonArray params = s.getArray("params");
for (final Object param : params) {
final JsonObject p = (JsonObject) param;
final String key = p.getString("key");
if (key != null && key.equals("client.version")) {
shortClientVersion = p.getString("value");
}
}
}
}
String contextClientVersion;
2020-07-26 12:00:56 +02:00
final String[] patterns = {
"INNERTUBE_CONTEXT_CLIENT_VERSION\":\"([0-9\\.]+?)\"",
"innertube_context_client_version\":\"([0-9\\.]+?)\"",
"client.version=([0-9\\.]+)"
};
2020-07-26 12:00:56 +02:00
for (final String pattern : patterns) {
try {
contextClientVersion = Parser.matchGroup1(pattern, html);
2020-04-15 18:49:58 +02:00
if (!isNullOrEmpty(contextClientVersion)) {
2020-07-26 12:00:56 +02:00
clientVersion = contextClientVersion;
break;
2020-02-26 15:22:59 +01:00
}
2021-02-07 22:12:22 +01:00
} catch (Parser.RegexException ignored) {
}
}
if (!isNullOrEmpty(clientVersion) && !isNullOrEmpty(shortClientVersion)) {
2020-07-26 12:00:56 +02:00
clientVersion = shortClientVersion;
}
try {
key = Parser.matchGroup1("INNERTUBE_API_KEY\":\"([0-9a-zA-Z_-]+?)\"", html);
} catch (Parser.RegexException e) {
try {
key = Parser.matchGroup1("innertubeApiKey\":\"([0-9a-zA-Z_-]+?)\"", html);
2021-02-07 22:12:22 +01:00
} catch (Parser.RegexException ignored) {
}
}
2020-07-26 12:00:56 +02:00
}
/**
* Get the client version
*/
public static String getClientVersion() throws IOException, ExtractionException {
if (!isNullOrEmpty(clientVersion)) return clientVersion;
if (isHardcodedClientVersionValid()) return clientVersion = HARDCODED_CLIENT_VERSION;
extractClientVersionAndKey();
2020-07-26 12:00:56 +02:00
if (isNullOrEmpty(key)) throw new ParsingException("Could not extract client version");
return clientVersion;
}
/**
* Get the key
*/
public static String getKey() throws IOException, ExtractionException {
if (!isNullOrEmpty(key)) return key;
extractClientVersionAndKey();
2020-07-26 12:00:56 +02:00
if (isNullOrEmpty(key)) throw new ParsingException("Could not extract key");
return key;
}
2020-02-27 17:39:23 +01:00
/**
* <p>
* <b>Only use in tests.</b>
* </p>
*
* <p>
* Quick-and-dirty solution to reset global state in between test classes.
* </p>
* <p>
* This is needed for the mocks because in order to reach that state a network request has to
* be made. If the global state is not reset and the RecordingDownloader is used,
* then only the first test class has that request recorded. Meaning running the other
* tests with mocks will fail, because the mock is missing.
* </p>
*/
2021-01-17 18:48:16 +01:00
public static void resetClientVersionAndKey() {
clientVersion = null;
key = null;
}
2020-03-20 11:05:19 +01:00
public static boolean areHardcodedYoutubeMusicKeysValid() throws IOException, ReCaptchaException {
final String url = "https://music.youtube.com/youtubei/v1/search?alt=json&key=" + HARDCODED_YOUTUBE_MUSIC_KEYS[0];
// @formatter:off
byte[] json = JsonWriter.string()
.object()
.object("context")
.object("client")
.value("clientName", "WEB_REMIX")
.value("clientVersion", HARDCODED_YOUTUBE_MUSIC_KEYS[2])
.value("hl", "en")
.value("gl", "GB")
.array("experimentIds").end()
.value("experimentsToken", "")
.value("utcOffsetMinutes", 0)
.object("locationInfo").end()
.object("musicAppInfo").end()
.end()
.object("capabilities").end()
.object("request")
.array("internalExperimentFlags").end()
.object("sessionIndex").end()
.end()
.object("activePlayers").end()
.object("user")
.value("enableSafetyMode", false)
.end()
.end()
.value("query", "test")
.value("params", "Eg-KAQwIARAAGAAgACgAMABqChAEEAUQAxAKEAk%3D")
2021-02-07 22:12:22 +01:00
.end().done().getBytes(UTF_8);
2020-03-20 11:05:19 +01:00
// @formatter:on
final Map<String, List<String>> headers = new HashMap<>();
2020-03-20 11:05:19 +01:00
headers.put("X-YouTube-Client-Name", Collections.singletonList(HARDCODED_YOUTUBE_MUSIC_KEYS[1]));
headers.put("X-YouTube-Client-Version", Collections.singletonList(HARDCODED_YOUTUBE_MUSIC_KEYS[2]));
headers.put("Origin", Collections.singletonList("https://music.youtube.com"));
headers.put("Referer", Collections.singletonList("music.youtube.com"));
2020-03-20 11:05:19 +01:00
headers.put("Content-Type", Collections.singletonList("application/json"));
final String response = getDownloader().post(url, headers, json).responseBody();
2020-03-20 11:05:19 +01:00
return response.length() > 50; // ensure to have a valid response
}
2020-03-17 11:33:39 +01:00
public static String[] getYoutubeMusicKeys() throws IOException, ReCaptchaException, Parser.RegexException {
if (youtubeMusicKeys != null && youtubeMusicKeys.length == 3) return youtubeMusicKeys;
2020-03-20 11:05:19 +01:00
if (areHardcodedYoutubeMusicKeysValid()) return youtubeMusicKeys = HARDCODED_YOUTUBE_MUSIC_KEYS;
2020-03-17 11:33:39 +01:00
final String url = "https://music.youtube.com/";
final String html = getDownloader().get(url).responseBody();
2020-03-20 11:05:19 +01:00
String key;
try {
key = Parser.matchGroup1("INNERTUBE_API_KEY\":\"([0-9a-zA-Z_-]+?)\"", html);
} catch (Parser.RegexException e) {
key = Parser.matchGroup1("innertube_api_key\":\"([0-9a-zA-Z_-]+?)\"", html);
}
2020-03-17 11:33:39 +01:00
final String clientName = Parser.matchGroup1("INNERTUBE_CONTEXT_CLIENT_NAME\":([0-9]+?),", html);
2020-03-20 11:05:19 +01:00
String clientVersion;
try {
clientVersion = Parser.matchGroup1("INNERTUBE_CONTEXT_CLIENT_VERSION\":\"([0-9\\.]+?)\"", html);
} catch (Parser.RegexException e) {
try {
clientVersion = Parser.matchGroup1("INNERTUBE_CLIENT_VERSION\":\"([0-9\\.]+?)\"", html);
} catch (Parser.RegexException ee) {
clientVersion = Parser.matchGroup1("innertube_context_client_version\":\"([0-9\\.]+?)\"", html);
}
}
2020-03-17 11:33:39 +01:00
return youtubeMusicKeys = new String[]{key, clientName, clientVersion};
}
2021-02-12 22:22:11 +01:00
@Nullable
public static String getUrlFromNavigationEndpoint(JsonObject navigationEndpoint) throws ParsingException {
2020-04-16 16:08:14 +02:00
if (navigationEndpoint.has("urlEndpoint")) {
2020-02-27 17:39:23 +01:00
String internUrl = navigationEndpoint.getObject("urlEndpoint").getString("url");
2021-02-12 22:22:11 +01:00
if (internUrl.startsWith("https://www.youtube.com/redirect?")) {
// remove https://www.youtube.com part to fall in the next if block
2021-02-12 22:22:11 +01:00
internUrl = internUrl.substring(23);
}
2020-02-27 17:39:23 +01:00
if (internUrl.startsWith("/redirect?")) {
// q parameter can be the first parameter
internUrl = internUrl.substring(10);
String[] params = internUrl.split("&");
for (String param : params) {
if (param.split("=")[0].equals("q")) {
String url;
try {
2021-02-07 22:12:22 +01:00
url = URLDecoder.decode(param.split("=")[1], UTF_8);
2020-02-27 17:39:23 +01:00
} catch (UnsupportedEncodingException e) {
return null;
}
return url;
}
}
} else if (internUrl.startsWith("http")) {
return internUrl;
2021-02-12 22:22:11 +01:00
} else if (internUrl.startsWith("/channel") || internUrl.startsWith("/user") || internUrl.startsWith("/watch")) {
return "https://www.youtube.com" + internUrl;
2020-02-27 17:39:23 +01:00
}
2020-04-16 16:08:14 +02:00
} else if (navigationEndpoint.has("browseEndpoint")) {
final JsonObject browseEndpoint = navigationEndpoint.getObject("browseEndpoint");
final String canonicalBaseUrl = browseEndpoint.getString("canonicalBaseUrl");
final String browseId = browseEndpoint.getString("browseId");
// All channel ids are prefixed with UC
if (browseId != null && browseId.startsWith("UC")) {
return "https://www.youtube.com/channel/" + browseId;
}
2020-04-15 18:49:58 +02:00
if (!isNullOrEmpty(canonicalBaseUrl)) {
return "https://www.youtube.com" + canonicalBaseUrl;
}
throw new ParsingException("canonicalBaseUrl is null and browseId is not a channel (\"" + browseEndpoint + "\")");
2020-04-16 16:08:14 +02:00
} else if (navigationEndpoint.has("watchEndpoint")) {
2020-02-27 17:39:23 +01:00
StringBuilder url = new StringBuilder();
url.append("https://www.youtube.com/watch?v=").append(navigationEndpoint.getObject("watchEndpoint").getString("videoId"));
if (navigationEndpoint.getObject("watchEndpoint").has("playlistId")) {
url.append("&list=").append(navigationEndpoint.getObject("watchEndpoint")
.getString("playlistId"));
}
if (navigationEndpoint.getObject("watchEndpoint").has("startTimeSeconds")) {
url.append("&amp;t=").append(navigationEndpoint.getObject("watchEndpoint")
.getInt("startTimeSeconds"));
}
2020-02-27 17:39:23 +01:00
return url.toString();
2020-04-16 16:08:14 +02:00
} else if (navigationEndpoint.has("watchPlaylistEndpoint")) {
2020-03-17 11:33:39 +01:00
return "https://www.youtube.com/playlist?list=" +
navigationEndpoint.getObject("watchPlaylistEndpoint").getString("playlistId");
2020-02-27 17:39:23 +01:00
}
return null;
}
2020-04-20 14:27:33 +02:00
/**
* Get the text from a JSON object that has either a simpleText or a runs array.
2021-02-07 22:12:22 +01:00
*
2020-04-20 14:27:33 +02:00
* @param textObject JSON object to get the text from
* @param html whether to return HTML, by parsing the navigationEndpoint
* @return text in the JSON object or {@code null}
2020-04-20 14:27:33 +02:00
*/
@Nullable
public static String getTextFromObject(JsonObject textObject, boolean html) throws ParsingException {
if (isNullOrEmpty(textObject)) return null;
2020-02-27 17:39:23 +01:00
if (textObject.has("simpleText")) return textObject.getString("simpleText");
if (textObject.getArray("runs").isEmpty()) return null;
final StringBuilder textBuilder = new StringBuilder();
for (final Object textPart : textObject.getArray("runs")) {
2020-02-27 17:39:23 +01:00
String text = ((JsonObject) textPart).getString("text");
2020-04-16 16:08:14 +02:00
if (html && ((JsonObject) textPart).has("navigationEndpoint")) {
2020-02-27 17:39:23 +01:00
String url = getUrlFromNavigationEndpoint(((JsonObject) textPart).getObject("navigationEndpoint"));
2020-04-15 18:49:58 +02:00
if (!isNullOrEmpty(url)) {
2020-02-27 17:39:23 +01:00
textBuilder.append("<a href=\"").append(url).append("\">").append(text).append("</a>");
continue;
}
}
textBuilder.append(text);
}
String text = textBuilder.toString();
if (html) {
text = text.replaceAll("\\n", "<br>");
text = text.replaceAll(" ", " &nbsp;");
}
return text;
}
@Nullable
public static String getTextFromObject(JsonObject textObject) throws ParsingException {
2020-02-27 17:39:23 +01:00
return getTextFromObject(textObject, false);
}
/**
 * Normalizes a thumbnail URL: a leading "//" is removed, a URL starting with the
 * {@code HTTP} prefix is passed to {@link Utils#replaceHttpWithHttps(String)}, and a URL
 * starting with neither the {@code HTTP} nor the {@code HTTPS} prefix gets "https://"
 * prepended.
 *
 * @param thumbnailUrl the URL to normalize
 * @return the normalized URL
 */
public static String fixThumbnailUrl(String thumbnailUrl) {
    final String withoutLeadingSlashes = thumbnailUrl.startsWith("//")
            ? thumbnailUrl.substring(2)
            : thumbnailUrl;

    if (withoutLeadingSlashes.startsWith(HTTP)) {
        return Utils.replaceHttpWithHttps(withoutLeadingSlashes);
    }
    if (withoutLeadingSlashes.startsWith(HTTPS)) {
        return withoutLeadingSlashes;
    }
    return "https://" + withoutLeadingSlashes;
}
2020-02-29 16:42:04 +01:00
public static String getValidJsonResponseBody(final Response response)
throws ParsingException, MalformedURLException {
if (response.responseCode() == 404) {
throw new ContentNotAvailableException("Not found"
+ " (\"" + response.responseCode() + " " + response.responseMessage() + "\")");
}
final String responseBody = response.responseBody();
if (responseBody.length() < 50) { // ensure to have a valid response
2020-02-29 16:42:04 +01:00
throw new ParsingException("JSON response is too short");
}
// Check if the request was redirected to the error page.
final URL latestUrl = new URL(response.latestUrl());
if (latestUrl.getHost().equalsIgnoreCase("www.youtube.com")) {
final String path = latestUrl.getPath();
if (path.equalsIgnoreCase("/oops") || path.equalsIgnoreCase("/error")) {
throw new ContentNotAvailableException("Content unavailable");
}
}
final String responseContentType = response.getHeader("Content-Type");
if (responseContentType != null
&& responseContentType.toLowerCase().contains("text/html")) {
throw new ParsingException("Got HTML document, expected JSON response"
+ " (latest url was: \"" + response.latestUrl() + "\")");
}
return responseBody;
}
/**
 * Requests the URL with the YouTube client headers and validates the answer with
 * {@link #getValidJsonResponseBody(Response)}.
 *
 * @param url          the URL to request
 * @param localization the localization passed to the downloader
 * @return the validated response
 */
public static Response getResponse(final String url, final Localization localization)
        throws IOException, ExtractionException {
    final Map<String, List<String>> requestHeaders = new HashMap<>();
    requestHeaders.put("X-YouTube-Client-Name", Collections.singletonList("1"));
    requestHeaders.put("X-YouTube-Client-Version",
            Collections.singletonList(getClientVersion()));

    final Response answer = getDownloader().get(url, requestHeaders, localization);
    // only called for its checks, the returned body is not needed here
    getValidJsonResponseBody(answer);
    return answer;
}
/**
 * Extracts the value of a cookie from the {@code set-cookie} headers of a response.
 * If several headers contain the cookie name, the value of the last one is returned.
 *
 * @param cookieName the name of the cookie
 * @param response   the response whose headers are searched
 * @return the cookie value, or an empty string if the cookie is not present
 */
public static String extractCookieValue(final String cookieName, final Response response) {
    final List<String> cookies = response.responseHeaders().get("set-cookie");
    if (cookies == null) {
        // no set-cookie header at all
        return "";
    }

    String result = "";
    for (final String cookie : cookies) {
        final int startIndex = cookie.indexOf(cookieName);
        if (startIndex == -1) {
            continue;
        }
        final int valueStart = startIndex + cookieName.length() + "=".length();
        final int semicolonIndex = cookie.indexOf(";", startIndex);
        // a cookie without attributes has no ";": the value then runs to the end
        final int valueEnd = semicolonIndex == -1 ? cookie.length() : semicolonIndex;
        // guards against a header that contains the name but no "=value" part
        if (valueStart <= valueEnd) {
            result = cookie.substring(valueStart, valueEnd);
        }
    }
    return result;
}
/**
 * Requests the URL with the YouTube client headers and parses the validated body as a
 * JSON array.
 *
 * @param url          the URL to request
 * @param localization the localization passed to the downloader
 * @return the response body as JSON array
 */
public static JsonArray getJsonResponse(final String url, final Localization localization)
        throws IOException, ExtractionException {
    final Map<String, List<String>> requestHeaders = new HashMap<>();
    requestHeaders.put("X-YouTube-Client-Name", Collections.singletonList("1"));
    requestHeaders.put("X-YouTube-Client-Version",
            Collections.singletonList(getClientVersion()));

    final Response answer = getDownloader().get(url, requestHeaders, localization);
    final String validBody = getValidJsonResponseBody(answer);
    return JsonUtils.toJsonArray(validBody);
}
/**
 * Requests the URL of the page with the YouTube client headers and parses the validated
 * body as a JSON array. Cookies stored in the page, if any, are sent along.
 *
 * @param page         the page providing the URL and optional cookies
 * @param localization the localization passed to the downloader
 * @return the response body as JSON array
 */
public static JsonArray getJsonResponse(final Page page, final Localization localization)
        throws IOException, ExtractionException {
    final Map<String, List<String>> requestHeaders = new HashMap<>();
    final boolean hasCookies = !isNullOrEmpty(page.getCookies());
    if (hasCookies) {
        final String cookieHeader = join(";", "=", page.getCookies());
        requestHeaders.put("Cookie", Collections.singletonList(cookieHeader));
    }
    requestHeaders.put("X-YouTube-Client-Name", Collections.singletonList("1"));
    requestHeaders.put("X-YouTube-Client-Version",
            Collections.singletonList(getClientVersion()));

    final Response answer = getDownloader().get(page.getUrl(), requestHeaders, localization);
    final String validBody = getValidJsonResponseBody(answer);
    return JsonUtils.toJsonArray(validBody);
}
/**
* Shared alert detection function, multiple endpoints return the error similarly structured.
* <p>
* Will check if the object has an alert of the type "ERROR".
* </p>
*
* @param initialData the object which will be checked if an alert is present
* @throws ContentNotAvailableException if an alert is detected
*/
2020-05-03 10:28:45 +02:00
public static void defaultAlertsCheck(final JsonObject initialData) throws ParsingException {
final JsonArray alerts = initialData.getArray("alerts");
2020-04-15 18:49:58 +02:00
if (!isNullOrEmpty(alerts)) {
final JsonObject alertRenderer = alerts.getObject(0).getObject("alertRenderer");
2020-05-03 10:28:45 +02:00
final String alertText = getTextFromObject(alertRenderer.getObject("text"));
final String alertType = alertRenderer.getString("type", EMPTY_STRING);
if (alertType.equalsIgnoreCase("ERROR")) {
throw new ContentNotAvailableException("Got error: \"" + alertText + "\"");
}
}
}
/**
 * Collects the meta info found in the given contents: every info panel and every
 * clarification inside an {@code itemSectionRenderer} yields one entry.
 *
 * @param contents the array of result objects to scan
 * @return the collected meta info, possibly empty
 */
@Nonnull
public static List<MetaInfo> getMetaInfo(final JsonArray contents) throws ParsingException {
    final List<MetaInfo> collected = new ArrayList<>();
    for (final Object entry : contents) {
        final JsonObject result = (JsonObject) entry;
        if (!result.has("itemSectionRenderer")) {
            continue;
        }

        final JsonArray sectionContents =
                result.getObject("itemSectionRenderer").getArray("contents");
        for (final Object sectionEntry : sectionContents) {
            final JsonObject section = (JsonObject) sectionEntry;
            if (section.has("infoPanelContentRenderer")) {
                collected.add(getInfoPanelContent(
                        section.getObject("infoPanelContentRenderer")));
            }
            if (section.has("clarificationRenderer")) {
                collected.add(getClarificationRendererContent(
                        section.getObject("clarificationRenderer")));
            }
        }
    }
    return collected;
}
/**
 * Converts an info panel into a {@link MetaInfo}. The paragraphs are joined with
 * {@code <br>} into an HTML description; if a source endpoint is present, its URL and the
 * text of the inline source are added as link.
 *
 * @param infoPanelContentRenderer the renderer to convert
 * @return the resulting meta info
 * @throws ParsingException if the link URL or the link text cannot be obtained
 */
@Nonnull
private static MetaInfo getInfoPanelContent(final JsonObject infoPanelContentRenderer)
        throws ParsingException {
    final MetaInfo info = new MetaInfo();

    final StringBuilder html = new StringBuilder();
    for (final Object paragraph : infoPanelContentRenderer.getArray("paragraphs")) {
        if (html.length() != 0) {
            html.append("<br>");
        }
        html.append(getTextFromObject((JsonObject) paragraph));
    }
    info.setContent(new Description(html.toString(), Description.HTML));

    if (!infoPanelContentRenderer.has("sourceEndpoint")) {
        return info;
    }

    final String linkUrl = getUrlFromNavigationEndpoint(
            infoPanelContentRenderer.getObject("sourceEndpoint"));
    try {
        final String resolvedUrl = Objects.requireNonNull(extractCachedUrlIfNeeded(linkUrl));
        info.addUrl(new URL(resolvedUrl));
    } catch (final NullPointerException | MalformedURLException e) {
        throw new ParsingException("Could not get metadata info URL", e);
    }

    final String linkText = getTextFromObject(
            infoPanelContentRenderer.getObject("inlineSource"));
    if (isNullOrEmpty(linkText)) {
        throw new ParsingException("Could not get metadata info link text.");
    }
    info.addUrlText(linkText);

    return info;
}
@Nonnull
private static MetaInfo getClarificationRendererContent(final JsonObject clarificationRenderer)
throws ParsingException {
final MetaInfo metaInfo = new MetaInfo();
final String title = YoutubeParsingHelper.getTextFromObject(clarificationRenderer.getObject("contentTitle"));
final String text = YoutubeParsingHelper.getTextFromObject(clarificationRenderer.getObject("text"));
2021-02-07 22:12:22 +01:00
if (title == null || text == null) {
throw new ParsingException("Could not extract clarification renderer content");
}
metaInfo.setTitle(title);
metaInfo.setContent(new Description(text, Description.PLAIN_TEXT));
if (clarificationRenderer.has("actionButton")) {
final JsonObject actionButton = clarificationRenderer.getObject("actionButton")
.getObject("buttonRenderer");
try {
final String url = YoutubeParsingHelper.getUrlFromNavigationEndpoint(actionButton.getObject("command"));
metaInfo.addUrl(new URL(Objects.requireNonNull(extractCachedUrlIfNeeded(url))));
} catch (final NullPointerException | MalformedURLException e) {
throw new ParsingException("Could not get metadata info URL", e);
}
final String metaInfoLinkText = YoutubeParsingHelper.getTextFromObject(
actionButton.getObject("text"));
if (isNullOrEmpty(metaInfoLinkText)) {
throw new ParsingException("Could not get metadata info link text.");
}
metaInfo.addUrlText(metaInfoLinkText);
}
if (clarificationRenderer.has("secondaryEndpoint") && clarificationRenderer.has("secondarySource")) {
final String url = getUrlFromNavigationEndpoint(clarificationRenderer.getObject("secondaryEndpoint"));
// ignore Google URLs, because those point to a Google search about "Covid-19"
if (url != null && !isGoogleURL(url)) {
try {
metaInfo.addUrl(new URL(url));
final String description = getTextFromObject(clarificationRenderer.getObject("secondarySource"));
metaInfo.addUrlText(description == null ? url : description);
} catch (MalformedURLException e) {
throw new ParsingException("Could not get metadata info secondary URL", e);
}
}
}
return metaInfo;
}
/**
 * Sometimes, YouTube provides URLs which use Google's cache. They look like
 * {@code https://webcache.googleusercontent.com/search?q=cache:CACHED_URL}
 *
 * @param url the URL which might refer to the Google's webcache
 * @return the URL which is referring to the original site; the given URL itself if it is
 * not a webcache URL or contains no {@code cache:} marker; {@code null} for {@code null}
 */
public static String extractCachedUrlIfNeeded(final String url) {
    if (url == null) {
        return null;
    }
    if (url.contains("webcache.googleusercontent.com")) {
        final String marker = "cache:";
        final int markerIndex = url.indexOf(marker);
        // A webcache URL without the marker has nothing to extract; splitting and reading
        // index 1 would throw in that case, so the URL is returned unchanged instead.
        if (markerIndex != -1) {
            return url.substring(markerIndex + marker.length());
        }
    }
    return url;
}
/**
 * Checks whether one of the badges has the style {@code BADGE_STYLE_TYPE_VERIFIED} or
 * {@code BADGE_STYLE_TYPE_VERIFIED_ARTIST}.
 *
 * @param badges the badges to inspect, may be {@code null} or empty
 * @return {@code true} if such a badge is present
 */
public static boolean isVerified(final JsonArray badges) {
    if (!Utils.isNullOrEmpty(badges)) {
        for (final Object badge : badges) {
            final JsonObject badgeRenderer =
                    ((JsonObject) badge).getObject("metadataBadgeRenderer");
            final String badgeStyle = badgeRenderer.getString("style");
            if ("BADGE_STYLE_TYPE_VERIFIED".equals(badgeStyle)
                    || "BADGE_STYLE_TYPE_VERIFIED_ARTIST".equals(badgeStyle)) {
                return true;
            }
        }
    }
    return false;
}
2017-03-01 18:47:52 +01:00
}