fix: use url parser instead of regex for extracting track type

This commit is contained in:
ThetaDev 2023-03-21 16:03:26 +01:00
parent 6e5b6b76a2
commit f2c167f2dd
3 changed files with 200 additions and 150 deletions

View File

@ -43,6 +43,7 @@ import org.schabi.newpipe.extractor.exceptions.ReCaptchaException;
import org.schabi.newpipe.extractor.localization.ContentCountry;
import org.schabi.newpipe.extractor.localization.Localization;
import org.schabi.newpipe.extractor.playlist.PlaylistInfo;
import org.schabi.newpipe.extractor.stream.AudioTrackType;
import org.schabi.newpipe.extractor.stream.Description;
import org.schabi.newpipe.extractor.utils.JsonUtils;
import org.schabi.newpipe.extractor.utils.Parser;
@ -1483,6 +1484,7 @@ public final class YoutubeParsingHelper {
/**
* Create a map with the required cookie header.
*
* @return A singleton map containing the header.
*/
public static Map<String, List<String>> getCookieHeader() {
@ -1801,4 +1803,52 @@ public final class YoutubeParsingHelper {
public static boolean isConsentAccepted() {
// Plain accessor: hands back the current value of the class-level consentAccepted flag.
return consentAccepted;
}
/**
 * Pattern locating the {@code acont} entry inside the {@code xtags} query parameter of a URL.
 * <p>
 * It expects {@code &xtags=}, then any run of word characters or {@code %}, then
 * {@code acont} followed by {@code =} (or its percent-encoded form {@code %3D}). Group 1
 * captures the lowercase value, which has to be followed by {@code =}, {@code :} (either one
 * possibly percent-encoded), {@code &} or the end of the input.
 * </p>
 * NOTE(review): the {@code extractAudioTrackType} method visible next to this constant reads
 * {@code xtags} through {@code Utils.getQueryValue} instead of this pattern; confirm whether
 * the constant is still referenced anywhere.
 */
private static final Pattern AUDIO_STREAM_TYPE_REGEX =
Pattern.compile("&xtags=[\\w%]*acont(?:=|%3D)([a-z]+)(?:=|%3D|:|%3A|&|$)");
/**
 * Determine which kind of audio track a YouTube stream URL refers to.
 * <p>
 * The {@code xtags} query parameter of the URL is looked up and treated as a colon-separated
 * list of {@code key=value} entries (Example: {@code acont=original:lang=en}). The value of
 * the first {@code acont} entry decides the result.
 * </p>
 * @param streamUrl YouTube stream URL
 * @return {@link AudioTrackType#ORIGINAL}, {@link AudioTrackType#DUBBED} or
 *         {@link AudioTrackType#DESCRIPTIVE} for the values {@code original}, {@code dubbed}
 *         and {@code descriptive}; {@code null} if the URL cannot be parsed, no {@code xtags}
 *         value is available, there is no {@code acont} entry, or its value is anything else
 */
@Nullable
public static AudioTrackType extractAudioTrackType(final String streamUrl) {
    final String xtagsValue;
    try {
        final URL parsedUrl = new URL(streamUrl);
        xtagsValue = Utils.getQueryValue(parsedUrl, "xtags");
    } catch (final MalformedURLException ignored) {
        // Not a parsable URL, so there is no track type to report
        return null;
    }

    if (xtagsValue == null) {
        return null;
    }

    for (final String entry : xtagsValue.split(":")) {
        // Entries without '=' or with a key other than acont are irrelevant here
        final int separatorIndex = entry.indexOf('=');
        if (separatorIndex < 0 || !entry.substring(0, separatorIndex).equals("acont")) {
            continue;
        }

        // Only the first acont entry is taken into account
        final String trackType = entry.substring(separatorIndex + 1);
        if ("original".equals(trackType)) {
            return AudioTrackType.ORIGINAL;
        } else if ("dubbed".equals(trackType)) {
            return AudioTrackType.DUBBED;
        } else if ("descriptive".equals(trackType)) {
            return AudioTrackType.DESCRIPTIVE;
        }
        return null;
    }

    return null;
}
}

View File

@ -72,7 +72,6 @@ import org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper;
import org.schabi.newpipe.extractor.services.youtube.YoutubeThrottlingDecrypter;
import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeChannelLinkHandlerFactory;
import org.schabi.newpipe.extractor.stream.AudioStream;
import org.schabi.newpipe.extractor.stream.AudioTrackType;
import org.schabi.newpipe.extractor.stream.DeliveryMethod;
import org.schabi.newpipe.extractor.stream.Description;
import org.schabi.newpipe.extractor.stream.Frameset;
@ -100,7 +99,6 @@ import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import javax.annotation.Nonnull;
@ -812,8 +810,6 @@ public class YoutubeStreamExtractor extends StreamExtractor {
"\\bc\\s*&&\\s*d\\.set\\([^,]+\\s*,\\s*(:encodeURIComponent\\s*\\()([a-zA-Z0-9$]+)\\("
};
// Matches the literal "signatureTimestamp" followed by "=" or ":" and captures the digits
// after it as group 1.
private static final String STS_REGEX = "signatureTimestamp[=:](\\d+)";
// Matches "&xtags=" followed by word characters or "%", then "acont" and "=" (or "%3D");
// group 1 captures the lowercase value, which must be followed by "=", ":" (either one
// possibly percent-encoded), "&" or the end of the input.
private static final Pattern AUDIO_STREAM_TYPE_REGEX =
Pattern.compile("&xtags=[\\w%]*acont(?:=|%3D)([a-z]+)(?:=|%3D|:|%3A|&|$)");
@Override
public void onFetchPage(@Nonnull final Downloader downloader)
@ -1488,20 +1484,7 @@ public class YoutubeStreamExtractor extends StreamExtractor {
itagItem.setAudioLocale(LocaleCompat.forLanguageTag(
audioTrackId.substring(0, audioTrackIdLastLocaleCharacter)));
}
try {
final String atype = Parser.matchGroup1(AUDIO_STREAM_TYPE_REGEX, streamUrl);
switch (atype) {
case "original":
itagItem.setAudioTrackType(AudioTrackType.ORIGINAL);
break;
case "dubbed":
itagItem.setAudioTrackType(AudioTrackType.DUBBED);
break;
case "descriptive":
itagItem.setAudioTrackType(AudioTrackType.DESCRIPTIVE);
}
} catch (final Parser.RegexException ignored) { }
itagItem.setAudioTrackType(YoutubeParsingHelper.extractAudioTrackType(streamUrl));
}
itagItem.setAudioTrackName(formatData.getObject("audioTrack")

View File

@ -1,16 +1,20 @@
package org.schabi.newpipe.extractor.services.youtube;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.schabi.newpipe.downloader.DownloaderFactory;
import org.schabi.newpipe.extractor.NewPipe;
import org.schabi.newpipe.extractor.exceptions.ExtractionException;
import org.schabi.newpipe.extractor.exceptions.ParsingException;
import org.schabi.newpipe.extractor.stream.AudioTrackType;
import org.schabi.newpipe.extractor.utils.Parser;
import org.schabi.newpipe.extractor.utils.Utils;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import static org.junit.jupiter.api.Assertions.*;
public class YoutubeParsingHelperTest {
@ -48,4 +52,17 @@ public class YoutubeParsingHelperTest {
assertEquals("https://www.infektionsschutz.de/coronavirus-sars-cov-2.html",
YoutubeParsingHelper.extractCachedUrlIfNeeded("https://www.infektionsschutz.de/coronavirus-sars-cov-2.html"));
}
@Test
public void extractAudioTrackType() {
    // Stream URL whose xtags parameter carries acont=original (percent-encoded)
    final String urlWithOriginalTrack = "https://rr2---sn-4g5lzned.googlevideo.com/videoplayback?expire=1679429648&ei=sLsZZKrICIuR1gLSnYbgAg&ip=2001%3A638%3A102%3A26%3A1a7c%3A106b%3A6e4a%3Adc09&id=o-ALWn2ZwDxUXEZKzlsT_X9iuDjRMSi__SgRXVrVjKZEhc&itag=251&source=youtube&requiressl=yes&mh=nU&mm=31%2C29&mn=sn-4g5lzned%2Csn-4g5edndz&ms=au%2Crdu&mv=m&mvi=2&pl=40&initcwndbps=1740000&spc=H3gIhgXQzBxvKu2MOEmFaaEenC4DKdVUwudTeu3dtKwmq-Xv5g&vprv=1&xtags=acont%3Doriginal%3Alang%3Den&mime=audio%2Fwebm&ns=-lg0OQZL1LZRQO-dzE0W4E4L&gir=yes&clen=3513412&dur=303.681&lmt=1679342942566207&mt=1679407764&fvip=1&keepalive=yes&fexp=24007246&c=WEB&txp=5532434&n=gDLP5pImH9Vr7v&sparams=expire%2Cei%2Cip%2Cid%2Citag%2Csource%2Crequiressl%2Cspc%2Cvprv%2Cxtags%2Cmime%2Cns%2Cgir%2Cclen%2Cdur%2Clmt&sig=AOq0QJ8wRAIgPFQ1yX8aoc35sz2eV2-wzNIhTQeOHGCsOmIonmo776kCIFo5k6HZ5kAQ6DycRCAG8jJgk9jNyncILGPrGZMZUuuo&lsparams=mh%2Cmm%2Cmn%2Cms%2Cmv%2Cmvi%2Cpl%2Cinitcwndbps&lsig=AG3C_xAwRQIhANODPaBuc32MWI9gF3Bn1iz3byEn7EwUiXpNLuCcQqW9AiBB88Qrrz2fJCzYKg14_nnGxGQH1Uoi7i31OSrHK6_dGw%3D%3D";
    assertEquals(AudioTrackType.ORIGINAL,
            YoutubeParsingHelper.extractAudioTrackType(urlWithOriginalTrack));

    // Same kind of URL, but with acont=dubbed
    final String urlWithDubbedTrack = "https://rr2---sn-4g5lzned.googlevideo.com/videoplayback?expire=1679429648&ei=sLsZZKrICIuR1gLSnYbgAg&ip=2001%3A638%3A102%3A26%3A1a7c%3A106b%3A6e4a%3Adc09&id=o-ALWn2ZwDxUXEZKzlsT_X9iuDjRMSi__SgRXVrVjKZEhc&itag=251&source=youtube&requiressl=yes&mh=nU&mm=31%2C29&mn=sn-4g5lzned%2Csn-4g5edndz&ms=au%2Crdu&mv=m&mvi=2&pl=40&initcwndbps=1740000&spc=H3gIhgXQzBxvKu2MOEmFaaEenC4DKdVUwudTeu3dtKwmq-Xv5g&vprv=1&xtags=acont%3Ddubbed%3Alang%3Den&mime=audio%2Fwebm&ns=-lg0OQZL1LZRQO-dzE0W4E4L&gir=yes&clen=3884070&dur=303.721&lmt=1679342946044954&mt=1679407764&fvip=1&keepalive=yes&fexp=24007246&c=WEB&txp=5532434&n=gDLP5pImH9Vr7v&sparams=expire%2Cei%2Cip%2Cid%2Citag%2Csource%2Crequiressl%2Cspc%2Cvprv%2Cxtags%2Cmime%2Cns%2Cgir%2Cclen%2Cdur%2Clmt&sig=AOq0QJ8wRQIhAKEMLB8yLZJf2jXAu4P1Q8AVEciYsmjjr2syYAWZfJg6AiAfu-XI11zYpCLqljw_MCegh26pJHYyfatgfFGWfpL-6Q%3D%3D&lsparams=mh%2Cmm%2Cmn%2Cms%2Cmv%2Cmvi%2Cpl%2Cinitcwndbps&lsig=AG3C_xAwRQIhANODPaBuc32MWI9gF3Bn1iz3byEn7EwUiXpNLuCcQqW9AiBB88Qrrz2fJCzYKg14_nnGxGQH1Uoi7i31OSrHK6_dGw%3D%3D";
    assertEquals(AudioTrackType.DUBBED,
            YoutubeParsingHelper.extractAudioTrackType(urlWithDubbedTrack));

    // Same kind of URL, but with acont=descriptive
    final String urlWithDescriptiveTrack = "https://rr2---sn-4g5lzned.googlevideo.com/videoplayback?expire=1679429648&ei=sLsZZKrICIuR1gLSnYbgAg&ip=2001%3A638%3A102%3A26%3A1a7c%3A106b%3A6e4a%3Adc09&id=o-ALWn2ZwDxUXEZKzlsT_X9iuDjRMSi__SgRXVrVjKZEhc&itag=251&source=youtube&requiressl=yes&mh=nU&mm=31%2C29&mn=sn-4g5lzned%2Csn-4g5edndz&ms=au%2Crdu&mv=m&mvi=2&pl=40&initcwndbps=1740000&spc=H3gIhgXQzBxvKu2MOEmFaaEenC4DKdVUwudTeu3dtKwmq-Xv5g&vprv=1&xtags=acont%3Ddescriptive%3Alang%3Den&mime=audio%2Fwebm&ns=-lg0OQZL1LZRQO-dzE0W4E4L&gir=yes&clen=4061711&dur=303.721&lmt=1679342946800120&mt=1679407764&fvip=1&keepalive=yes&fexp=24007246&c=WEB&txp=5532434&n=gDLP5pImH9Vr7v&sparams=expire%2Cei%2Cip%2Cid%2Citag%2Csource%2Crequiressl%2Cspc%2Cvprv%2Cxtags%2Cmime%2Cns%2Cgir%2Cclen%2Cdur%2Clmt&sig=AOq0QJ8wRgIhAKFUzoNscV1hbNcPwcnQO3vOy47q69szj7BdLhFYS52pAiEA2oPhLZIZsrUQrx62iH4dHvTBlCloC3NieJw6edo7LL8%3D&lsparams=mh%2Cmm%2Cmn%2Cms%2Cmv%2Cmvi%2Cpl%2Cinitcwndbps&lsig=AG3C_xAwRQIhANODPaBuc32MWI9gF3Bn1iz3byEn7EwUiXpNLuCcQqW9AiBB88Qrrz2fJCzYKg14_nnGxGQH1Uoi7i31OSrHK6_dGw%3D%3D";
    assertEquals(AudioTrackType.DESCRIPTIVE,
            YoutubeParsingHelper.extractAudioTrackType(urlWithDescriptiveTrack));

    // Stream URL without any xtags parameter: no track type can be derived from it
    final String urlWithoutXtags = "https://rr2---sn-4g5ednz7.googlevideo.com/videoplayback?expire=1679430240&ei=AL4ZZKiXJefYx_APj_6ECA&ip=2001%3A638%3A102%3A26%3A1a7c%3A106b%3A6e4a%3Adc09&id=o-ALKVh9uHVEvurL3bZOZCEMzFod9ZmJJd6GszA6UEIuKy&itag=251&source=youtube&requiressl=yes&mh=8L&mm=31%2C26&mn=sn-4g5ednz7%2Csn-i5heen7z&ms=au%2Conr&mv=m&mvi=2&pl=40&initcwndbps=1793750&spc=H3gIhh2s06nxQJg3zEgY9pw84syUasRiagYDsQ5UHHfcu5bfTA&vprv=1&mime=audio%2Fwebm&ns=VumObYcnTZNicexX7Ek2WakL&gir=yes&clen=3711099&dur=299.201&lmt=1679334484198077&mt=1679408487&fvip=2&keepalive=yes&fexp=24007246&c=WEB&txp=3318224&n=10c-m6ZvG6C7rC&sparams=expire%2Cei%2Cip%2Cid%2Citag%2Csource%2Crequiressl%2Cspc%2Cvprv%2Cmime%2Cns%2Cgir%2Cclen%2Cdur%2Clmt&sig=AOq0QJ8wRQIhAODS0aHRBgdrHm5qwquqGC6zq3rU81W59y4BtV0Y9KStAiAPT8ykXXj_7GzAyZbLPgYKs-B1HWT-4bY0CppmZ2rReg%3D%3D&lsparams=mh%2Cmm%2Cmn%2Cms%2Cmv%2Cmvi%2Cpl%2Cinitcwndbps&lsig=AG3C_xAwRQIhAL8fS6T-V9BNqrx55mdMvve5be2gcjIY8pYfxlUMPY6pAiAgiCMbqR4eSS_HvLu9KBe6cCFZeMcSTc7vzWtL9y0xvw%3D%3D";
    assertNull(YoutubeParsingHelper.extractAudioTrackType(urlWithoutXtags));
}
}