From 3b021eacefab4a9e43660d72d6d5a49f7ddb025e Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 21 Nov 2022 00:51:45 +0000 Subject: [PATCH] [extractor/generic] Add `fragment_query` extractor arg for DASH and HLS (#5528) * `fragment_query`: passthrough any query in generic mpd/m3u8 manifest URLs to their fragments * Add support for `extra_param_to_segment_url` to DASH downloader Authored by: bashonly, pukkandan --- README.md | 3 +++ yt_dlp/downloader/dash.py | 14 +++++++++++--- yt_dlp/extractor/generic.py | 18 +++++++++++++++++- 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index f336dcb6a..fa55d130b 100644 --- a/README.md +++ b/README.md @@ -1736,6 +1736,9 @@ The following extractors use this feature: * `skip`: One or more of `webpage` (skip initial webpage download), `authcheck` (allow the download of playlists requiring authentication when no initial webpage is downloaded. This may cause unwanted behavior, see [#1122](https://github.com/yt-dlp/yt-dlp/pull/1122) for more details) * `approximate_date`: Extract approximate `upload_date` and `timestamp` in flat-playlist. This may cause date-based filters to be slightly off +#### generic +* `fragment_query`: Passthrough any query in mpd/m3u8 manifest URLs to their fragments. Does not apply to ffmpeg + #### funimation * `language`: Audio languages to extract, e.g. `funimation:language=english,japanese` * `version`: The video version to extract - `uncut` or `simulcast` diff --git a/yt_dlp/downloader/dash.py b/yt_dlp/downloader/dash.py index 8723e1068..4328d739c 100644 --- a/yt_dlp/downloader/dash.py +++ b/yt_dlp/downloader/dash.py @@ -1,8 +1,9 @@ import time +import urllib.parse from . import get_suitable_downloader from .fragment import FragmentFD -from ..utils import urljoin +from ..utils import update_url_query, urljoin class DashSegmentsFD(FragmentFD): @@ -40,7 +41,12 @@ class DashSegmentsFD(FragmentFD): self._prepare_and_start_frag_download(ctx, fmt) ctx['start'] = real_start - fragments_to_download = self._get_fragments(fmt, ctx) + extra_query = None + extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url') + if extra_param_to_segment_url: + extra_query = urllib.parse.parse_qs(extra_param_to_segment_url) + + fragments_to_download = self._get_fragments(fmt, ctx, extra_query) if real_downloader: self.to_screen( @@ -57,7 +63,7 @@ class DashSegmentsFD(FragmentFD): fragments = fragments(ctx) if callable(fragments) else fragments return [next(iter(fragments))] if self.params.get('test') else fragments - def _get_fragments(self, fmt, ctx): + def _get_fragments(self, fmt, ctx, extra_query): fragment_base_url = fmt.get('fragment_base_url') fragments = self._resolve_fragments(fmt['fragments'], ctx) @@ -70,6 +76,8 @@ class DashSegmentsFD(FragmentFD): if not fragment_url: assert fragment_base_url fragment_url = urljoin(fragment_base_url, fragment['path']) + if extra_query: + fragment_url = update_url_query(fragment_url, extra_query) yield { 'frag_index': frag_index, diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 5da77273d..2fcbc6f43 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -2189,6 +2189,13 @@ class GenericIE(InfoExtractor): self._downloader.write_debug(f'Identified {num} {name}{format_field(note, None, "; %s")}') + def _fragment_query(self, url): + if self._configuration_arg('fragment_query'): + query_string = urllib.parse.urlparse(url).query + if query_string: + return {'extra_param_to_segment_url': query_string} + return {} + def _extract_rss(self, url, video_id, doc): NS_MAP = { 'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd', @@ -2351,8 +2358,10 @@ class GenericIE(InfoExtractor): subtitles = {} if format_id.endswith('mpegurl'): formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4', headers=headers) + info_dict.update(self._fragment_query(url)) elif format_id.endswith('mpd') or format_id.endswith('dash+xml'): formats, subtitles = self._extract_mpd_formats_and_subtitles(url, video_id, headers=headers) + info_dict.update(self._fragment_query(url)) elif format_id == 'f4m': formats = self._extract_f4m_formats(url, video_id, headers=headers) else: @@ -2379,6 +2388,7 @@ class GenericIE(InfoExtractor): if first_bytes.startswith(b'#EXTM3U'): self.report_detected('M3U playlist') info_dict['formats'], info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4') + info_dict.update(self._fragment_query(url)) return info_dict # Maybe it's a direct link to a video? @@ -2429,6 +2439,7 @@ class GenericIE(InfoExtractor): doc, mpd_base_url=full_response.geturl().rpartition('/')[0], mpd_url=url) + info_dict.update(self._fragment_query(url)) self.report_detected('DASH manifest') return info_dict elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag): @@ -2541,7 +2552,10 @@ class GenericIE(InfoExtractor): m3u8_id='hls', fatal=False) formats.extend(fmts) self._merge_subtitles(subs, target=subtitles) - else: + for fmt in formats: + fmt.update(self._fragment_query(src)) + + if not formats: formats.append({ 'url': src, 'ext': (mimetype2ext(src_type) @@ -2776,8 +2790,10 @@ class GenericIE(InfoExtractor): return [self._extract_xspf_playlist(video_url, video_id)] elif ext == 'm3u8': entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(video_url, video_id, ext='mp4', headers=headers) + entry_info_dict.update(self._fragment_query(video_url)) elif ext == 'mpd': entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_mpd_formats_and_subtitles(video_url, video_id, headers=headers) + entry_info_dict.update(self._fragment_query(video_url)) elif ext == 'f4m': entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id, headers=headers) elif re.search(r'(?i)\.(?:ism|smil)/manifest', video_url) and video_url != url: