Merge pull request #1163 from AudricV/yt-fix_comments_extraction

[YouTube] Support new comments data
2024-11-29 13:31:33 +01:00 · 2024-04-10 18:19:59 +02:00 · 2024-04-10 18:19:59 +02:00 · 6c3c2e25d7
commit 6c3c2e25d7
parent e5b30ae8c3 02274d5395
8 changed files with 779 additions and 154 deletions
--- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeDescriptionHelper.java
+++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeDescriptionHelper.java
@ -0,0 +1,316 @@
+package org.schabi.newpipe.extractor.services.youtube;
+
+import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.getUrlFromNavigationEndpoint;
+import static org.schabi.newpipe.extractor.utils.Utils.isNullOrEmpty;
+
+import com.grack.nanojson.JsonObject;
+
+import org.jsoup.nodes.Entities;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Stack;
+import java.util.function.Function;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
+
+public final class YoutubeDescriptionHelper {
+
+    private YoutubeDescriptionHelper() {
+    }
+
+    private static final String LINK_CLOSE = "</a>";
+    private static final String STRIKETHROUGH_OPEN = "<s>";
+    private static final String STRIKETHROUGH_CLOSE = "</s>";
+    private static final String BOLD_OPEN = "<b>";
+    private static final String BOLD_CLOSE = "</b>";
+    private static final String ITALIC_OPEN = "<i>";
+    private static final String ITALIC_CLOSE = "</i>";
+
+    // special link chips (e.g. for YT videos, YT channels or social media accounts):
+    // (u00a0) u00a0 u00a0 [/•] u00a0 <link content> u00a0 u00a0
+    private static final Pattern LINK_CONTENT_CLEANER_REGEX
+            = Pattern.compile("(?s)^ +[/•] +(.*?) +$");
+
+    /**
+     * One endpoint (opening or closing) of a formatting interval over the description text.
+     * Can be a command run, or a style run.
+     *
+     * <p>
+     * An interval is represented by two {@code Run} instances created with the same {@code open}
+     * and {@code close} tags: one positioned where the interval starts (stored in the openers
+     * list) and one positioned where it ends (stored in the closers list). A closer is matched
+     * with its opener through {@link #sameOpen(Run)}.
+     * </p>
+     */
+    static final class Run {
+        // HTML appended to the output when the run is opened (or reopened)
+        @Nonnull final String open;
+        // HTML appended to the output when the run is closed
+        @Nonnull final String close;
+        // index in the content string at which this endpoint applies
+        final int pos;
+        // optional function applied to the already generated output between the open tag and the
+        // close tag, just before the run is closed
+        @Nullable final Function<String, String> transformContent;
+        // index in the output builder right after the open tag; stays -1 until the run is opened
+        int openPosInOutput = -1;
+
+        Run(
+                @Nonnull final String open,
+                @Nonnull final String close,
+                final int pos
+        ) {
+            this(open, close, pos, null);
+        }
+
+        Run(
+                @Nonnull final String open,
+                @Nonnull final String close,
+                final int pos,
+                @Nullable final Function<String, String> transformContent
+        ) {
+            this.open = open;
+            this.close = close;
+            this.pos = pos;
+            this.transformContent = transformContent;
+        }
+
+        // two runs belong to the same kind of interval when their open tags are equal
+        public boolean sameOpen(@Nonnull final Run other) {
+            return open.equals(other.open);
+        }
+    }
+
+    /**
+     * Parse a video description in the new "attributed" format, which contains the entire visible
+     * plaintext ({@code content}) and an array of {@code commandRuns} and {@code styleRuns}.
+     * Returns the formatted content in HTML format, and escapes the text to make sure there are no
+     * XSS attacks.
+     *
+     * <p>
+     * {@code commandRuns} include the links and their range in the text, while {@code styleRuns}
+     * include the styling to apply to various ranges in the text.
+     * </p>
+     *
+     * @param attributedDescription the JSON object of the attributed description
+     * @return the parsed description, in HTML format, as a string, or {@code null} if the given
+     *         object is null or empty or has no {@code content} string
+     */
+    @Nullable
+    public static String attributedDescriptionToHtml(
+            @Nullable final JsonObject attributedDescription
+    ) {
+        if (isNullOrEmpty(attributedDescription)) {
+            return null;
+        }
+
+        final String content = attributedDescription.getString("content");
+        if (content == null) {
+            return null;
+        }
+
+        // all run pairs must always be of length at least 1, or they should be discarded,
+        // otherwise various assumptions made in runsToHtml may fail
+        final List<Run> openers = new ArrayList<>();
+        final List<Run> closers = new ArrayList<>();
+        addAllCommandRuns(attributedDescription, openers, closers);
+        addAllStyleRuns(attributedDescription, openers, closers);
+
+        // Note that sorting this way might put closers with the same close position in the wrong
+        // order with respect to their openers, causing unnecessary closes and reopens. E.g.
+        // <b>b<i>b&i</i></b> is instead generated as <b>b<i>b&i</b></i><b></b> if the </b> is
+        // encountered before the </i>. Solving this wouldn't be difficult, thanks to stable sort,
+        // but would require additional sorting steps which would just make this slower for the
+        // general case where it's unlikely there are coincident closes.
+        Collections.sort(openers, Comparator.comparingInt(run -> run.pos));
+        Collections.sort(closers, Comparator.comparingInt(run -> run.pos));
+
+        return runsToHtml(openers, closers, content);
+    }
+
+    /**
+     * Applies the formatting specified by the intervals stored in {@code openers} and {@code
+     * closers} to {@code content} in order to obtain valid HTML even when intervals overlap. For
+     * example &lt;b&gt;b&lt;i&gt;b&amp;i&lt;/b&gt;i&lt;/i&gt; would not be valid HTML, so this
+     * function instead generates
+     * &lt;b&gt;b&lt;i&gt;b&amp;i&lt;/i&gt;&lt;/b&gt;&lt;i&gt;i&lt;/i&gt;. Any HTML
+     * special characters in {@code rawContent} are escaped to make sure there are no XSS attacks.
+     *
+     * <p>
+     * Every opener in {@code openers} must have a corresponding closer in {@code closers}. Every
+     * corresponding (opener, closer) pair must have a length of at least one (i.e. empty intervals
+     * are not allowed).
+     * </p>
+     *
+     * <p>
+     * Before processing, non-breaking spaces (U+00A0) in {@code rawContent} are replaced with
+     * regular spaces. After all runs have been applied, newlines in the generated output are
+     * replaced with {@code <br>} and each pair of consecutive spaces with a space followed by
+     * {@code &nbsp;}.
+     * </p>
+     *
+     * <p>
+     * NOTE(review): run positions are used directly as {@code substring} bounds and are not
+     * validated against the content length in this method; a position past the end of the content
+     * would make {@code substring} throw. Confirm that callers guarantee in-range positions.
+     * </p>
+     *
+     * @param openers    contains all of the places where a run begins, must have the same size as
+     *                   closers, must be ordered by {@link Run#pos}
+     * @param closers    contains all of the places where a run ends, must have the same size as
+     *                   openers, must be ordered by {@link Run#pos}
+     * @param rawContent the content to apply formatting to, and to escape to avoid XSS
+     * @return the formatted content in HTML
+     */
+    static String runsToHtml(
+            @Nonnull final List<Run> openers,
+            @Nonnull final List<Run> closers,
+            @Nonnull final String rawContent
+    ) {
+        final String content = rawContent.replace('\u00a0', ' ');
+        final Stack<Run> openRuns = new Stack<>();
+        final Stack<Run> tempStack = new Stack<>();
+        final StringBuilder textBuilder = new StringBuilder();
+        int currentTextPos = 0;
+        int openersIndex = 0;
+        int closersIndex = 0;
+
+        // openers and closers have the same length, but we will surely finish openers earlier than
+        // closers, since every opened interval needs to be closed at some point and there can't be
+        // empty intervals, hence check only closersIndex < closers.size()
+        while (closersIndex < closers.size()) {
+            final int minPos = openersIndex < openers.size()
+                    ? Math.min(closers.get(closersIndex).pos, openers.get(openersIndex).pos)
+                    : closers.get(closersIndex).pos;
+
+            // append piece of text until current index
+            textBuilder.append(Entities.escape(content.substring(currentTextPos, minPos)));
+            currentTextPos = minPos;
+
+            if (closers.get(closersIndex).pos == minPos) {
+                // even in case of position tie, first process closers
+                final Run closer = closers.get(closersIndex);
+                ++closersIndex;
+
+                // because of the assumptions, this while wouldn't need the !openRuns.empty()
+                // condition, because no run will close before being opened, but let's be sure
+                while (!openRuns.empty()) {
+                    final Run popped = openRuns.pop();
+                    if (popped.sameOpen(closer)) {
+                        // before closing the current run, if the run has a transformContent
+                        // function, use it to transform the content of the current run, based on
+                        // the openPosInOutput set when the current run was opened
+                        if (popped.transformContent != null && popped.openPosInOutput >= 0) {
+                            textBuilder.replace(popped.openPosInOutput, textBuilder.length(),
+                                    popped.transformContent.apply(
+                                            textBuilder.substring(popped.openPosInOutput)));
+                        }
+                        // close the run that we really need to close
+                        textBuilder.append(popped.close);
+                        break;
+                    }
+                    // we keep popping from openRuns, closing all of the runs we find,
+                    // until we find the run that we really need to close ...
+                    textBuilder.append(popped.close);
+                    tempStack.push(popped);
+                }
+                while (!tempStack.empty()) {
+                    // ... and then we reopen all of the runs that we didn't need to close
+                    // e.g. in <b>b<i>b&i</b>i</i>, when </b> is encountered, </i></b><i> is printed
+                    // instead, to make sure the HTML is valid, obtaining <b>b<i>b&i</i></b><i>i</i>
+                    final Run popped = tempStack.pop();
+                    textBuilder.append(popped.open);
+                    openRuns.push(popped);
+                }
+
+            } else {
+                // this will never be reached if openersIndex >= openers.size() because of the
+                // way minPos is calculated
+                final Run opener = openers.get(openersIndex);
+                textBuilder.append(opener.open);
+                opener.openPosInOutput = textBuilder.length(); // save for transforming later
+                openRuns.push(opener);
+                ++openersIndex;
+            }
+        }
+
+        // append last piece of text
+        textBuilder.append(Entities.escape(content.substring(currentTextPos)));
+
+        return textBuilder.toString()
+                .replace("\n", "<br>")
+                .replace("  ", " &nbsp;");
+    }
+
+    /**
+     * Converts the entries of the {@code commandRuns} array into link runs.
+     *
+     * <p>
+     * For every JSON object entry, an {@code <a href="...">} opener is added at
+     * {@code startIndex} and the matching closer at {@code startIndex + length}. The URL is
+     * obtained from the {@code onTap.innertubeCommand} object and HTML-escaped before being put
+     * in the {@code href} attribute. Entries with a negative or missing {@code startIndex}, a
+     * {@code length} lower than 1, or for which no URL could be obtained are skipped.
+     * </p>
+     *
+     * @param attributedDescription the JSON object containing the {@code commandRuns} array
+     * @param openers               the list to which run openers are appended
+     * @param closers               the list to which run closers are appended
+     */
+    private static void addAllCommandRuns(
+            @Nonnull final JsonObject attributedDescription,
+            @Nonnull final List<Run> openers,
+            @Nonnull final List<Run> closers
+    ) {
+        attributedDescription.getArray("commandRuns")
+                .stream()
+                .filter(JsonObject.class::isInstance)
+                .map(JsonObject.class::cast)
+                .forEach(run -> {
+                    final JsonObject navigationEndpoint = run.getObject("onTap")
+                            .getObject("innertubeCommand");
+
+                    final int startIndex = run.getInt("startIndex", -1);
+                    final int length = run.getInt("length", 0);
+                    if (startIndex < 0 || length < 1 || navigationEndpoint == null) {
+                        return;
+                    }
+
+                    final String url = getUrlFromNavigationEndpoint(navigationEndpoint);
+                    if (url == null) {
+                        return;
+                    }
+
+                    final String open = "<a href=\"" + Entities.escape(url) + "\">";
+                    final Function<String, String> transformContent = getTransformContentFun(run);
+
+                    // opener and closer share the same open tag so that they can be matched later
+                    openers.add(new Run(open, LINK_CLOSE, startIndex, transformContent));
+                    closers.add(new Run(open, LINK_CLOSE, startIndex + length, transformContent));
+                });
+    }
+
+    /**
+     * Builds the function used to rewrite the text of a link once its run is closed.
+     *
+     * <p>
+     * The accessibility label is read from {@code onTapOptions.accessibilityInfo} (with the first
+     * occurrence of " Channel Link" removed). If it is empty or starts with "YouTube: ", the
+     * returned function strips the chip decoration matched by
+     * {@code LINK_CONTENT_CLEANER_REGEX} from the link text, leaving the text unchanged when the
+     * pattern does not match. Otherwise the returned function ignores the link text and returns
+     * the accessibility label.
+     * </p>
+     *
+     * @param run a command run JSON object
+     * @return the function to apply to the link text
+     */
+    private static Function<String, String> getTransformContentFun(final JsonObject run) {
+        final String accessibilityLabel = run.getObject("onTapOptions")
+                .getObject("accessibilityInfo")
+                .getString("accessibilityLabel", "")
+                // accessibility labels are e.g. "Instagram Channel Link: instagram_profile_name"
+                .replaceFirst(" Channel Link", "");
+
+        final Function<String, String> transformContent;
+        if (accessibilityLabel.isEmpty() || accessibilityLabel.startsWith("YouTube: ")) {
+            // if there is no accessibility label, or the link points to YouTube, cleanup the link
+            // text, see LINK_CONTENT_CLEANER_REGEX's documentation for more details
+            transformContent = (content) -> {
+                final Matcher m = LINK_CONTENT_CLEANER_REGEX.matcher(content);
+                if (m.find()) {
+                    return m.group(1);
+                }
+                return content;
+            };
+        } else {
+            // if there is an accessibility label, replace the link text with it, because on the
+            // YouTube website an ambiguous link text is next to an icon explaining which service it
+            // belongs to, but since we can't add icons, we instead use the accessibility label
+            // which contains information about the service
+            transformContent = (content) -> accessibilityLabel;
+        }
+
+        return transformContent;
+    }
+
+    /**
+     * Converts the entries of the {@code styleRuns} array into style runs.
+     *
+     * <p>
+     * Entries with a negative or missing {@code startIndex}, or a {@code length} lower than 1, are
+     * skipped. A single entry can produce several runs over the same range: a strikethrough run
+     * if the {@code strikethrough} key is present, an italic run if {@code italic} is
+     * {@code true}, and a bold run if {@code weightLabel} is present and is not
+     * {@code FONT_WEIGHT_NORMAL}.
+     * </p>
+     *
+     * @param attributedDescription the JSON object containing the {@code styleRuns} array
+     * @param openers               the list to which run openers are appended
+     * @param closers               the list to which run closers are appended
+     */
+    private static void addAllStyleRuns(
+            @Nonnull final JsonObject attributedDescription,
+            @Nonnull final List<Run> openers,
+            @Nonnull final List<Run> closers
+    ) {
+        attributedDescription.getArray("styleRuns")
+                .stream()
+                .filter(JsonObject.class::isInstance)
+                .map(JsonObject.class::cast)
+                .forEach(run -> {
+                    final int start = run.getInt("startIndex", -1);
+                    final int length = run.getInt("length", 0);
+                    if (start < 0 || length < 1) {
+                        return;
+                    }
+                    final int end = start + length;
+
+                    if (run.has("strikethrough")) {
+                        openers.add(new Run(STRIKETHROUGH_OPEN, STRIKETHROUGH_CLOSE, start));
+                        closers.add(new Run(STRIKETHROUGH_OPEN, STRIKETHROUGH_CLOSE, end));
+                    }
+
+                    if (run.getBoolean("italic", false)) {
+                        openers.add(new Run(ITALIC_OPEN, ITALIC_CLOSE, start));
+                        closers.add(new Run(ITALIC_OPEN, ITALIC_CLOSE, end));
+                    }
+
+                    if (run.has("weightLabel")
+                            && !"FONT_WEIGHT_NORMAL".equals(run.getString("weightLabel"))) {
+                        openers.add(new Run(BOLD_OPEN, BOLD_CLOSE, start));
+                        closers.add(new Run(BOLD_OPEN, BOLD_CLOSE, end));
+                    }
+                });
+    }
+}
--- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeParsingHelper.java
+++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/YoutubeParsingHelper.java
@ -996,86 +996,6 @@ public final class YoutubeParsingHelper {
        return text;
    }

-    /**
-     * Parse a video description in the new "attributed" format, which contains the entire visible
-     * plaintext ({@code content}) and an array of {@code commandRuns}.
-     *
-     * <p>
-     * The {@code commandRuns} include the links and their position in the text.
-     * </p>
-     *
-     * @param attributedDescription the JSON object of the attributed description
-     * @return the parsed description, in HTML format, as a string
-     */
-    @Nullable
-    public static String getAttributedDescription(
-            @Nullable final JsonObject attributedDescription) {
-        if (isNullOrEmpty(attributedDescription)) {
-            return null;
-        }
-
-        final String content = attributedDescription.getString("content");
-        if (content == null) {
-            return null;
-        }
-
-        final JsonArray commandRuns = attributedDescription.getArray("commandRuns");
-
-        final StringBuilder textBuilder = new StringBuilder();
-        int textStart = 0;
-
-        for (final Object commandRun : commandRuns) {
-            if (!(commandRun instanceof JsonObject)) {
-                continue;
-            }
-
-            final JsonObject run = ((JsonObject) commandRun);
-            final int startIndex = run.getInt("startIndex", -1);
-            final int length = run.getInt("length");
-            final JsonObject navigationEndpoint = run.getObject("onTap")
-                    .getObject("innertubeCommand");
-
-            if (startIndex < 0 || length < 1 || navigationEndpoint == null) {
-                continue;
-            }
-
-            final String url = getUrlFromNavigationEndpoint(navigationEndpoint);
-
-            if (url == null) {
-                continue;
-            }
-
-            // Append text before the link
-            if (startIndex > textStart) {
-                textBuilder.append(content, textStart, startIndex);
-            }
-
-            // Trim and append link text
-            // Channel/Video format: 3xu00a0, (/ •), u00a0, <Name>, 2xu00a0
-            final String linkText = content.substring(startIndex, startIndex + length)
-                    .replace('\u00a0', ' ')
-                    .trim()
-                    .replaceFirst("^[/•] *", "");
-
-            textBuilder.append("<a href=\"")
-                    .append(Entities.escape(url))
-                    .append("\">")
-                    .append(Entities.escape(linkText))
-                    .append("</a>");
-
-            textStart = startIndex + length;
-        }
-
-        // Append the remaining text
-        if (textStart < content.length()) {
-            textBuilder.append(content.substring(textStart));
-        }
-
-        return textBuilder.toString()
-                .replaceAll("\\n", "<br>")
-                .replaceAll(" {2}", " &nbsp;");
-    }
-
    @Nonnull
    public static String getTextFromObjectOrThrow(final JsonObject textObject, final String error)
            throws ParsingException {
--- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsEUVMInfoItemExtractor.java
+++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsEUVMInfoItemExtractor.java
@ -0,0 +1,235 @@
+package org.schabi.newpipe.extractor.services.youtube.extractors;
+
+import com.grack.nanojson.JsonObject;
+import org.schabi.newpipe.extractor.Image;
+import org.schabi.newpipe.extractor.Page;
+import org.schabi.newpipe.extractor.comments.CommentsInfoItemExtractor;
+import org.schabi.newpipe.extractor.exceptions.ParsingException;
+import org.schabi.newpipe.extractor.localization.DateWrapper;
+import org.schabi.newpipe.extractor.localization.TimeAgoParser;
+import org.schabi.newpipe.extractor.stream.Description;
+import org.schabi.newpipe.extractor.utils.Utils;
+
+import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
+import java.util.List;
+import java.util.Objects;
+
+import static org.schabi.newpipe.extractor.services.youtube.YoutubeDescriptionHelper.attributedDescriptionToHtml;
+import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.getImagesFromThumbnailsArray;
+import static org.schabi.newpipe.extractor.utils.Utils.isNullOrEmpty;
+
+/**
+ * A {@link CommentsInfoItemExtractor} for YouTube comment data returned in a view model and entity
+ * updates.
+ *
+ * <p>
+ * Most values are read from the comment entity payload; the comment view model, the engagement
+ * toolbar state entity payload and the optional comment replies renderer provide the remaining
+ * ones.
+ * </p>
+ */
+class YoutubeCommentsEUVMInfoItemExtractor implements CommentsInfoItemExtractor {
+
+    // keys of the sub-objects of the comment entity payload read by several methods
+    private static final String AUTHOR = "author";
+    private static final String PROPERTIES = "properties";
+
+    // used as a fallback source for the comment ID and for the pinned state
+    @Nonnull
+    private final JsonObject commentViewModel;
+    // used for the replies continuation and the creator reply state; may be absent
+    @Nullable
+    private final JsonObject commentRepliesRenderer;
+    // main source of the comment data (author, properties, toolbar, avatar)
+    @Nonnull
+    private final JsonObject commentEntityPayload;
+    // used for the heart state
+    @Nonnull
+    private final JsonObject engagementToolbarStateEntityPayload;
+    @Nonnull
+    private final String videoUrl;
+    @Nonnull
+    private final TimeAgoParser timeAgoParser;
+
+    YoutubeCommentsEUVMInfoItemExtractor(
+            @Nonnull final JsonObject commentViewModel,
+            @Nullable final JsonObject commentRepliesRenderer,
+            @Nonnull final JsonObject commentEntityPayload,
+            @Nonnull final JsonObject engagementToolbarStateEntityPayload,
+            @Nonnull final String videoUrl,
+            @Nonnull final TimeAgoParser timeAgoParser) {
+        this.commentViewModel = commentViewModel;
+        this.commentRepliesRenderer = commentRepliesRenderer;
+        this.commentEntityPayload = commentEntityPayload;
+        this.engagementToolbarStateEntityPayload = engagementToolbarStateEntityPayload;
+        this.videoUrl = videoUrl;
+        this.timeAgoParser = timeAgoParser;
+    }
+
+    // the item name is the uploader name
+    @Override
+    public String getName() throws ParsingException {
+        return getUploaderName();
+    }
+
+    // the item URL is the URL of the video given at construction
+    @Override
+    public String getUrl() throws ParsingException {
+        return videoUrl;
+    }
+
+    // the item thumbnails are the uploader avatars
+    @Nonnull
+    @Override
+    public List<Image> getThumbnails() throws ParsingException {
+        return getUploaderAvatars();
+    }
+
+    // a blank textual like count is reported as 0; any failure while converting the textual count
+    // is wrapped in a ParsingException; the parsed long value is narrowed to int
+    @Override
+    public int getLikeCount() throws ParsingException {
+        final String textualLikeCount = getTextualLikeCount();
+        try {
+            if (Utils.isBlank(textualLikeCount)) {
+                return 0;
+            }
+
+            return (int) Utils.mixedNumberWordToLong(textualLikeCount);
+        } catch (final Exception e) {
+            throw new ParsingException(
+                    "Unexpected error while converting textual like count to like count", e);
+        }
+    }
+
+    // read from toolbar.likeCountNotliked of the comment entity payload
+    @Override
+    public String getTextualLikeCount() {
+        return commentEntityPayload.getObject("toolbar")
+                .getString("likeCountNotliked");
+    }
+
+    @Override
+    public Description getCommentText() throws ParsingException {
+        // The text of a comment works in the same way as an attributed video description
+        return new Description(
+                attributedDescriptionToHtml(commentEntityPayload.getObject(PROPERTIES)
+                        .getObject("content")), Description.HTML);
+    }
+
+    // read from properties.publishedTime of the comment entity payload
+    @Override
+    public String getTextualUploadDate() throws ParsingException {
+        return commentEntityPayload.getObject(PROPERTIES)
+                .getString("publishedTime");
+    }
+
+    // returns null when there is no textual upload date, otherwise the result of parsing it with
+    // the time ago parser given at construction
+    @Nullable
+    @Override
+    public DateWrapper getUploadDate() throws ParsingException {
+        final String textualPublishedTime = getTextualUploadDate();
+        if (isNullOrEmpty(textualPublishedTime)) {
+            return null;
+        }
+
+        return timeAgoParser.parse(textualPublishedTime);
+    }
+
+    // tries properties.commentId of the entity payload first, then commentId of the view model,
+    // and throws if both are missing or empty
+    @Override
+    public String getCommentId() throws ParsingException {
+        String commentId = commentEntityPayload.getObject(PROPERTIES)
+                .getString("commentId");
+        if (isNullOrEmpty(commentId)) {
+            commentId = commentViewModel.getString("commentId");
+            if (isNullOrEmpty(commentId)) {
+                throw new ParsingException("Could not get comment ID");
+            }
+        }
+        return commentId;
+    }
+
+    // the channel ID is looked up in three places of the author object, in order: channelId, the
+    // browse endpoint of channelCommand, the browse endpoint of the avatar endpoint; throws if
+    // none of them provides a non-empty value
+    @Override
+    public String getUploaderUrl() throws ParsingException {
+        final JsonObject author = commentEntityPayload.getObject(AUTHOR);
+        String channelId = author.getString("channelId");
+        if (isNullOrEmpty(channelId)) {
+            channelId = author.getObject("channelCommand")
+                    .getObject("innertubeCommand")
+                    .getObject("browseEndpoint")
+                    .getString("browseId");
+            if (isNullOrEmpty(channelId)) {
+                channelId = author.getObject("avatar")
+                        .getObject("endpoint")
+                        .getObject("innertubeCommand")
+                        .getObject("browseEndpoint")
+                        .getString("browseId");
+                if (isNullOrEmpty(channelId)) {
+                    throw new ParsingException("Could not get channel ID");
+                }
+            }
+        }
+        return "https://www.youtube.com/channel/" + channelId;
+    }
+
+    // read from author.displayName of the comment entity payload
+    @Override
+    public String getUploaderName() throws ParsingException {
+        return commentEntityPayload.getObject(AUTHOR)
+                .getString("displayName");
+    }
+
+    // built from avatar.image.sources of the comment entity payload
+    @Nonnull
+    @Override
+    public List<Image> getUploaderAvatars() throws ParsingException {
+        return getImagesFromThumbnailsArray(commentEntityPayload.getObject("avatar")
+                .getObject("image")
+                .getArray("sources"));
+    }
+
+    // true only when heartState of the engagement toolbar state payload is the hearted state
+    @Override
+    public boolean isHeartedByUploader() {
+        return "TOOLBAR_HEART_STATE_HEARTED".equals(
+                engagementToolbarStateEntityPayload.getString("heartState"));
+    }
+
+    // a comment is considered pinned when the view model has a pinnedText key
+    @Override
+    public boolean isPinned() {
+        return commentViewModel.has("pinnedText");
+    }
+
+    // verified when either the isVerified or the isArtist flag of the author is set
+    @Override
+    public boolean isUploaderVerified() throws ParsingException {
+        final JsonObject author = commentEntityPayload.getObject(AUTHOR);
+        return author.getBoolean("isVerified") || author.getBoolean("isArtist");
+    }
+
+    // NOTE(review): unlike getLikeCount, a failure of mixedNumberWordToLong is not wrapped in a
+    // ParsingException here - confirm whether this difference is intended
+    @Override
+    public int getReplyCount() throws ParsingException {
+        // As YouTube allows replies up to 750 comments, we cannot check if the count returned is a
+        // mixed number or a real number
+        // Assume it is a mixed one, as it matches how numbers of most properties are returned
+        final String replyCountString = commentEntityPayload.getObject("toolbar")
+                .getString("replyCount");
+        if (isNullOrEmpty(replyCountString)) {
+            return 0;
+        }
+        return (int) Utils.mixedNumberWordToLong(replyCountString);
+    }
+
+    // returns null when there is no replies renderer; otherwise builds a page from the token of
+    // the first continuationItemRenderer found in the renderer contents, and throws if there is
+    // no such renderer (a null token also leads to the exception, as Optional.map yields an empty
+    // Optional for a null result)
+    @Nullable
+    @Override
+    public Page getReplies() throws ParsingException {
+        if (isNullOrEmpty(commentRepliesRenderer)) {
+            return null;
+        }
+
+        final String continuation = commentRepliesRenderer.getArray("contents")
+                .stream()
+                .filter(JsonObject.class::isInstance)
+                .map(JsonObject.class::cast)
+                .map(content -> content.getObject("continuationItemRenderer", null))
+                .filter(Objects::nonNull)
+                .findFirst()
+                .map(continuationItemRenderer ->
+                                continuationItemRenderer.getObject("continuationEndpoint")
+                                        .getObject("continuationCommand")
+                                        .getString("token"))
+                .orElseThrow(() ->
+                        new ParsingException("Could not get comment replies continuation"));
+        return new Page(videoUrl, continuation);
+    }
+
+    // read from author.isCreator of the comment entity payload
+    @Override
+    public boolean isChannelOwner() {
+        return commentEntityPayload.getObject(AUTHOR)
+                .getBoolean("isCreator");
+    }
+
+    // true when a replies renderer exists and has a viewRepliesCreatorThumbnail key
+    @Override
+    public boolean hasCreatorReply() {
+        return commentRepliesRenderer != null
+                && commentRepliesRenderer.has("viewRepliesCreatorThumbnail");
+    }
+}
--- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsExtractor.java
+++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsExtractor.java
@ -13,6 +13,7 @@ import org.schabi.newpipe.extractor.exceptions.ExtractionException;
 import org.schabi.newpipe.extractor.exceptions.ParsingException;
 import org.schabi.newpipe.extractor.linkhandler.ListLinkHandler;
 import org.schabi.newpipe.extractor.localization.Localization;
+import org.schabi.newpipe.extractor.localization.TimeAgoParser;
 import org.schabi.newpipe.extractor.utils.JsonUtils;
 import org.schabi.newpipe.extractor.utils.Utils;

@ -21,7 +22,6 @@ import javax.annotation.Nullable;
 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
 import java.util.Collections;
-import java.util.List;

 import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.getJsonPostResponse;
 import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.getTextFromObject;
@ -30,6 +30,9 @@ import static org.schabi.newpipe.extractor.utils.Utils.isNullOrEmpty;

 public class YoutubeCommentsExtractor extends CommentsExtractor {

+    private static final String COMMENT_VIEW_MODEL_KEY = "commentViewModel";
+    private static final String COMMENT_RENDERER_KEY = "commentRenderer";
+
    /**
     * Whether comments are disabled on video.
     */
@ -74,8 +77,7 @@ public class YoutubeCommentsExtractor extends CommentsExtractor {
            return null;
        }

-        final String token = contents
-                .stream()
+        final String token = contents.stream()
                // Only use JsonObjects
                .filter(JsonObject.class::isInstance)
                .map(JsonObject.class::cast)
@ -120,6 +122,21 @@ public class YoutubeCommentsExtractor extends CommentsExtractor {
        }
    }

+    /**
+     * Find the mutation whose {@code entityKey} equals the given key and return its
+     * {@code payload} object.
+     *
+     * @param mutations  the array of mutations to search in; entries which are not JSON objects
+     *                   are ignored
+     * @param commentKey the entity key to look for
+     * @return the {@code payload} object of the first matching mutation
+     * @throws ParsingException if no mutation has the given entity key
+     */
+    @Nonnull
+    private JsonObject getMutationPayloadFromEntityKey(@Nonnull final JsonArray mutations,
+                                                       @Nonnull final String commentKey)
+            throws ParsingException {
+        return mutations.stream()
+                .filter(JsonObject.class::isInstance)
+                .map(JsonObject.class::cast)
+                .filter(mutation -> commentKey.equals(
+                        mutation.getString("entityKey")))
+                .findFirst()
+                .orElseThrow(() -> new ParsingException(
+                        "Could not get comment entity payload mutation"))
+                .getObject("payload");
+    }
+
    @Nonnull
    private InfoItemsPage<CommentsInfoItem> getInfoItemsPageForDisabledComments() {
        return new InfoItemsPage<>(Collections.emptyList(), null, Collections.emptyList());
@ -207,8 +224,8 @@ public class YoutubeCommentsExtractor extends CommentsExtractor {
        return new InfoItemsPage<>(collector, getNextPage(jsonObject));
    }

-    private void collectCommentsFrom(final CommentsInfoItemsCollector collector,
-                                     final JsonObject jsonObject)
+    private void collectCommentsFrom(@Nonnull final CommentsInfoItemsCollector collector,
+                                     @Nonnull final JsonObject jsonObject)
            throws ParsingException {

        final JsonArray onResponseReceivedEndpoints =
@ -233,6 +250,8 @@ public class YoutubeCommentsExtractor extends CommentsExtractor {

        final JsonArray contents;
        try {
+            // A copy of the array is needed, otherwise the continuation item is removed from the
+            // original object which is used to get the continuation
            contents = new JsonArray(JsonUtils.getArray(commentsEndpoint, path));
        } catch (final Exception e) {
            // No comments
@ -244,23 +263,80 @@ public class YoutubeCommentsExtractor extends CommentsExtractor {
            contents.remove(index);
        }

-        final String jsonKey = contents.getObject(0).has("commentThreadRenderer")
-                ? "commentThreadRenderer"
-                : "commentRenderer";
+        // The mutations array, which is returned in the comments' continuation.
+        // It contains parts of the comment data when comments are returned with a view model.
+        final JsonArray mutations = jsonObject.getObject("frameworkUpdates")
+                .getObject("entityBatchUpdate")
+                .getArray("mutations");
+        final String videoUrl = getUrl();
+        final TimeAgoParser timeAgoParser = getTimeAgoParser();

-        final List<Object> comments;
-        try {
-            comments = JsonUtils.getValues(contents, jsonKey);
-        } catch (final Exception e) {
-            throw new ParsingException("Unable to get parse youtube comments", e);
+        for (final Object o : contents) {
+            if (!(o instanceof JsonObject)) {
+                continue;
+            }
+
+            collectCommentItem(mutations, (JsonObject) o, collector, videoUrl, timeAgoParser);
        }
+    }

-        final String url = getUrl();
-        comments.stream()
-                .filter(JsonObject.class::isInstance)
-                .map(JsonObject.class::cast)
-                .map(jObj -> new YoutubeCommentsInfoItemExtractor(jObj, url, getTimeAgoParser()))
-                .forEach(collector::commit);
+    private void collectCommentItem(@Nonnull final JsonArray mutations,
+                                    @Nonnull final JsonObject content,
+                                    @Nonnull final CommentsInfoItemsCollector collector,
+                                    @Nonnull final String videoUrl,
+                                    @Nonnull final TimeAgoParser timeAgoParser)
+            throws ParsingException { // commits at most one item; unknown shapes are ignored
+        if (content.has("commentThreadRenderer")) { // thread: may hold a "replies" object
+            final JsonObject commentThreadRenderer =
+                    content.getObject("commentThreadRenderer");
+            if (commentThreadRenderer.has(COMMENT_VIEW_MODEL_KEY)) { // view model format
+                final JsonObject commentViewModel =
+                        commentThreadRenderer.getObject(COMMENT_VIEW_MODEL_KEY)
+                                .getObject(COMMENT_VIEW_MODEL_KEY); // same key, nested twice
+                collector.commit(new YoutubeCommentsEUVMInfoItemExtractor(
+                        commentViewModel,
+                        commentThreadRenderer.getObject("replies")
+                                .getObject("commentRepliesRenderer"),
+                        getMutationPayloadFromEntityKey(mutations, // matched on commentKey
+                                commentViewModel.getString("commentKey", ""))
+                                .getObject("commentEntityPayload"),
+                        getMutationPayloadFromEntityKey(mutations, // matched on toolbarStateKey
+                                commentViewModel.getString("toolbarStateKey", ""))
+                                .getObject("engagementToolbarStateEntityPayload"),
+                        videoUrl,
+                        timeAgoParser));
+            } else if (commentThreadRenderer.has("comment")) { // comment renderer format
+                collector.commit(new YoutubeCommentsInfoItemExtractor(
+                        commentThreadRenderer.getObject("comment")
+                                .getObject(COMMENT_RENDERER_KEY),
+                        commentThreadRenderer.getObject("replies")
+                                .getObject("commentRepliesRenderer"),
+                        videoUrl,
+                        timeAgoParser));
+            }
+        } else if (content.has(COMMENT_VIEW_MODEL_KEY)) { // view model outside of a thread
+            final JsonObject commentViewModel = content.getObject(COMMENT_VIEW_MODEL_KEY);
+            collector.commit(new YoutubeCommentsEUVMInfoItemExtractor(
+                    commentViewModel,
+                    null, // no commentRepliesRenderer is passed here
+                    getMutationPayloadFromEntityKey(mutations, // matched on commentKey
+                            commentViewModel.getString("commentKey", ""))
+                            .getObject("commentEntityPayload"),
+                    getMutationPayloadFromEntityKey(mutations, // matched on toolbarStateKey
+                            commentViewModel.getString("toolbarStateKey", ""))
+                            .getObject("engagementToolbarStateEntityPayload"),
+                    videoUrl,
+                    timeAgoParser));
+        } else if (content.has(COMMENT_RENDERER_KEY)) {
+            // commentRenderers are returned directly for comment replies, so there is no
+            // commentRepliesRenderer to provide.
+            // Also, YouTube only has one level of comment replies.
+            collector.commit(new YoutubeCommentsInfoItemExtractor(
+                    content.getObject(COMMENT_RENDERER_KEY),
+                    null,
+                    videoUrl,
+                    timeAgoParser));
+        }
    }

    @Override
@ -307,10 +383,11 @@ public class YoutubeCommentsExtractor extends CommentsExtractor {
            return -1;
        }

-        final JsonObject countText = ajaxJson
-                .getArray("onResponseReceivedEndpoints").getObject(0)
+        final JsonObject countText = ajaxJson.getArray("onResponseReceivedEndpoints")
+                .getObject(0)
                .getObject("reloadContinuationItemsCommand")
-                .getArray("continuationItems").getObject(0)
+                .getArray("continuationItems")
+                .getObject(0)
                .getObject("commentsHeaderRenderer")
                .getObject("countText");

--- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsInfoItemExtractor.java
+++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeCommentsInfoItemExtractor.java
@ -22,40 +22,36 @@ import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper

 public class YoutubeCommentsInfoItemExtractor implements CommentsInfoItemExtractor {

-    private final JsonObject json;
-    private JsonObject commentRenderer;
+    @Nonnull
+    private final JsonObject commentRenderer;
+    @Nullable
+    private final JsonObject commentRepliesRenderer;
+    @Nonnull
    private final String url;
+    @Nonnull
    private final TimeAgoParser timeAgoParser;

-    public YoutubeCommentsInfoItemExtractor(final JsonObject json,
-                                            final String url,
-                                            final TimeAgoParser timeAgoParser) {
-        this.json = json;
+    public YoutubeCommentsInfoItemExtractor(@Nonnull final JsonObject commentRenderer,
+                                            @Nullable final JsonObject commentRepliesRenderer,
+                                            @Nonnull final String url,
+                                            @Nonnull final TimeAgoParser timeAgoParser) {
+        this.commentRenderer = commentRenderer;
+        this.commentRepliesRenderer = commentRepliesRenderer;
        this.url = url;
        this.timeAgoParser = timeAgoParser;
    }

-    private JsonObject getCommentRenderer() throws ParsingException {
-        if (commentRenderer == null) {
-            if (json.has("comment")) {
-                commentRenderer = JsonUtils.getObject(json, "comment.commentRenderer");
-            } else {
-                commentRenderer = json;
-            }
-        }
-        return commentRenderer;
-    }
-
    @Nonnull
    private List<Image> getAuthorThumbnails() throws ParsingException {
        try {
-            return getImagesFromThumbnailsArray(JsonUtils.getArray(getCommentRenderer(),
+            return getImagesFromThumbnailsArray(JsonUtils.getArray(commentRenderer,
                    "authorThumbnail.thumbnails"));
        } catch (final Exception e) {
            throw new ParsingException("Could not get author thumbnails", e);
        }
    }

+    @Nonnull
    @Override
    public String getUrl() throws ParsingException {
        return url;
@ -70,7 +66,7 @@ public class YoutubeCommentsInfoItemExtractor implements CommentsInfoItemExtract
    @Override
    public String getName() throws ParsingException {
        try {
-            return getTextFromObject(JsonUtils.getObject(getCommentRenderer(), "authorText"));
+            return getTextFromObject(JsonUtils.getObject(commentRenderer, "authorText"));
        } catch (final Exception e) {
            return "";
        }
@ -79,7 +75,7 @@ public class YoutubeCommentsInfoItemExtractor implements CommentsInfoItemExtract
    @Override
    public String getTextualUploadDate() throws ParsingException {
        try {
-            return getTextFromObject(JsonUtils.getObject(getCommentRenderer(),
+            return getTextFromObject(JsonUtils.getObject(commentRenderer,
                    "publishedTimeText"));
        } catch (final Exception e) {
            throw new ParsingException("Could not get publishedTimeText", e);
@ -90,8 +86,7 @@ public class YoutubeCommentsInfoItemExtractor implements CommentsInfoItemExtract
    @Override
    public DateWrapper getUploadDate() throws ParsingException {
        final String textualPublishedTime = getTextualUploadDate();
-        if (timeAgoParser != null && textualPublishedTime != null
-                && !textualPublishedTime.isEmpty()) {
+        if (textualPublishedTime != null && !textualPublishedTime.isEmpty()) {
            return timeAgoParser.parse(textualPublishedTime);
        } else {
            return null;
@ -118,7 +113,7 @@ public class YoutubeCommentsInfoItemExtractor implements CommentsInfoItemExtract
        // Try first to get the exact like count by using the accessibility data
        final String likeCount;
        try {
-            likeCount = Utils.removeNonDigitCharacters(JsonUtils.getString(getCommentRenderer(),
+            likeCount = Utils.removeNonDigitCharacters(JsonUtils.getString(commentRenderer,
                    "actionButtons.commentActionButtonsRenderer.likeButton.toggleButtonRenderer"
                            + ".accessibilityData.accessibilityData.label"));
        } catch (final Exception e) {
@ -170,11 +165,11 @@ public class YoutubeCommentsInfoItemExtractor implements CommentsInfoItemExtract
         */
        try {
            // If a comment has no likes voteCount is not set
-            if (!getCommentRenderer().has("voteCount")) {
+            if (!commentRenderer.has("voteCount")) {
                return "";
            }

-            final JsonObject voteCountObj = JsonUtils.getObject(getCommentRenderer(), "voteCount");
+            final JsonObject voteCountObj = JsonUtils.getObject(commentRenderer, "voteCount");
            if (voteCountObj.isEmpty()) {
                return "";
            }
@ -188,7 +183,7 @@ public class YoutubeCommentsInfoItemExtractor implements CommentsInfoItemExtract
    @Override
    public Description getCommentText() throws ParsingException {
        try {
-            final JsonObject contentText = JsonUtils.getObject(getCommentRenderer(), "contentText");
+            final JsonObject contentText = JsonUtils.getObject(commentRenderer, "contentText");
            if (contentText.isEmpty()) {
                // completely empty comments as described in
                // https://github.com/TeamNewPipe/NewPipeExtractor/issues/380#issuecomment-668808584
@ -208,7 +203,7 @@ public class YoutubeCommentsInfoItemExtractor implements CommentsInfoItemExtract
    @Override
    public String getCommentId() throws ParsingException {
        try {
-            return JsonUtils.getString(getCommentRenderer(), "commentId");
+            return JsonUtils.getString(commentRenderer, "commentId");
        } catch (final Exception e) {
            throw new ParsingException("Could not get comment id", e);
        }
@ -221,27 +216,26 @@ public class YoutubeCommentsInfoItemExtractor implements CommentsInfoItemExtract
    }

    @Override
-    public boolean isHeartedByUploader() throws ParsingException {
-        final JsonObject commentActionButtonsRenderer = getCommentRenderer()
-                .getObject("actionButtons")
+    public boolean isHeartedByUploader() {
+        final JsonObject commentActionButtonsRenderer = commentRenderer.getObject("actionButtons")
                .getObject("commentActionButtonsRenderer");
        return commentActionButtonsRenderer.has("creatorHeart");
    }

    @Override
-    public boolean isPinned() throws ParsingException {
-        return getCommentRenderer().has("pinnedCommentBadge");
+    public boolean isPinned() {
+        return commentRenderer.has("pinnedCommentBadge");
    }

    @Override
    public boolean isUploaderVerified() throws ParsingException {
-        return getCommentRenderer().has("authorCommentBadge");
+        return commentRenderer.has("authorCommentBadge");
    }

    @Override
    public String getUploaderName() throws ParsingException {
        try {
-            return getTextFromObject(JsonUtils.getObject(getCommentRenderer(), "authorText"));
+            return getTextFromObject(JsonUtils.getObject(commentRenderer, "authorText"));
        } catch (final Exception e) {
            return "";
        }
@ -250,7 +244,7 @@ public class YoutubeCommentsInfoItemExtractor implements CommentsInfoItemExtract
    @Override
    public String getUploaderUrl() throws ParsingException {
        try {
-            return "https://www.youtube.com/channel/" + JsonUtils.getString(getCommentRenderer(),
+            return "https://www.youtube.com/channel/" + JsonUtils.getString(commentRenderer,
                    "authorEndpoint.browseEndpoint.browseId");
        } catch (final Exception e) {
            return "";
@ -258,19 +252,22 @@ public class YoutubeCommentsInfoItemExtractor implements CommentsInfoItemExtract
    }

    @Override
-    public int getReplyCount() throws ParsingException {
-        final JsonObject commentRendererJsonObject = getCommentRenderer();
-        if (commentRendererJsonObject.has("replyCount")) {
-            return commentRendererJsonObject.getInt("replyCount");
+    public int getReplyCount() {
+        if (commentRenderer.has("replyCount")) {
+            return commentRenderer.getInt("replyCount");
        }
        return UNKNOWN_REPLY_COUNT;
    }

    @Override
    public Page getReplies() {
+        if (commentRepliesRenderer == null) {
+            return null;
+        }
+
        try {
            final String id = JsonUtils.getString(
-                    JsonUtils.getArray(json, "replies.commentRepliesRenderer.contents")
+                    JsonUtils.getArray(commentRepliesRenderer, "contents")
                            .getObject(0),
                    "continuationItemRenderer.continuationEndpoint.continuationCommand.token");
            return new Page(url, id);
@ -280,20 +277,17 @@ public class YoutubeCommentsInfoItemExtractor implements CommentsInfoItemExtract
    }

    @Override
-    public boolean isChannelOwner() throws ParsingException {
-        return getCommentRenderer().getBoolean("authorIsChannelOwner");
+    public boolean isChannelOwner() {
+        return commentRenderer.getBoolean("authorIsChannelOwner");
    }

-
    @Override
-    public boolean hasCreatorReply() throws ParsingException {
-        try {
-            final JsonObject commentRepliesRenderer = JsonUtils.getObject(json,
-                    "replies.commentRepliesRenderer");
-            return commentRepliesRenderer.has("viewRepliesCreatorThumbnail");
-        } catch (final Exception e) {
+    public boolean hasCreatorReply() {
+        if (commentRepliesRenderer == null) {
            return false;
        }
+
+        return commentRepliesRenderer.has("viewRepliesCreatorThumbnail");
    }

 }
--- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeStreamExtractor.java
+++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeStreamExtractor.java
@ -22,6 +22,7 @@ package org.schabi.newpipe.extractor.services.youtube.extractors;

 import static org.schabi.newpipe.extractor.services.youtube.ItagItem.APPROX_DURATION_MS_UNKNOWN;
 import static org.schabi.newpipe.extractor.services.youtube.ItagItem.CONTENT_LENGTH_UNKNOWN;
+import static org.schabi.newpipe.extractor.services.youtube.YoutubeDescriptionHelper.attributedDescriptionToHtml;
 import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.CONTENT_CHECK_OK;
 import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.CPN;
 import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.RACY_CHECK_OK;
@ -30,7 +31,6 @@ import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper
 import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.fixThumbnailUrl;
 import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.generateContentPlaybackNonce;
 import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.generateTParameter;
-import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.getAttributedDescription;
 import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.getImagesFromThumbnailsArray;
 import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.getJsonAndroidPostResponse;
 import static org.schabi.newpipe.extractor.services.youtube.YoutubeParsingHelper.getJsonIosPostResponse;
@ -261,7 +261,7 @@ public class YoutubeStreamExtractor extends StreamExtractor {
            return new Description(videoSecondaryInfoRendererDescription, Description.HTML);
        }

-        final String attributedDescription = getAttributedDescription(
+        final String attributedDescription = attributedDescriptionToHtml(
                getVideoSecondaryInfoRenderer().getObject("attributedDescription"));
        if (!isNullOrEmpty(attributedDescription)) {
            return new Description(attributedDescription, Description.HTML);
--- a/extractor/src/main/java/org/schabi/newpipe/extractor/stream/Description.java
+++ b/extractor/src/main/java/org/schabi/newpipe/extractor/stream/Description.java
@ -3,6 +3,8 @@ package org.schabi.newpipe.extractor.stream;
 import java.io.Serializable;
 import java.util.Objects;

+import javax.annotation.Nullable;
+
 public class Description implements Serializable {

    public static final int HTML = 1;
@ -13,7 +15,7 @@ public class Description implements Serializable {
    private final String content;
    private final int type;

-    public Description(final String content, final int type) {
+    public Description(@Nullable final String content, final int type) {
        this.type = type;
        if (content == null) {
            this.content = "";
--- a/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/YoutubeDescriptionHelperTest.java
+++ b/extractor/src/test/java/org/schabi/newpipe/extractor/services/youtube/YoutubeDescriptionHelperTest.java
@ -0,0 +1,81 @@
+package org.schabi.newpipe.extractor.services.youtube;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.schabi.newpipe.extractor.services.youtube.YoutubeDescriptionHelper.runsToHtml;
+
+import org.junit.jupiter.api.Test;
+import org.schabi.newpipe.extractor.services.youtube.YoutubeDescriptionHelper.Run;
+
+import java.util.Comparator;
+import java.util.List;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+
+public class YoutubeDescriptionHelperTest { // tests for YoutubeDescriptionHelper.runsToHtml
+
+    private static void assertRunsToHtml(final String expectedHtml,
+                                         final List<Run> openers,
+                                         final List<Run> closers,
+                                         final String content) {
+        assertEquals(
+                expectedHtml,
+                runsToHtml(
+                        openers.stream() // sorted by pos, as is done for closers below
+                                .sorted(Comparator.comparingInt(run -> run.pos))
+                                .collect(Collectors.toList()),
+                        closers.stream()
+                                .sorted(Comparator.comparingInt(run -> run.pos))
+                                .collect(Collectors.toList()),
+                        content
+                )
+        );
+    }
+
+    @Test
+    public void testNoRuns() { // no runs: markup in the input is escaped, "\n" becomes <br>
+        assertRunsToHtml(
+                "abc *a* _c_ &lt;br&gt; <br> &lt;a href=\"#\"&gt;test&lt;/a&gt; &nbsp;&amp;amp;",
+                List.of(),
+                List.of(),
+                "abc *a* _c_ <br>\u00a0\n\u00a0<a href=\"#\">test</a>  &amp;"
+        );
+    }
+
+    @Test
+    public void testNormalRuns() { // runs that nest or follow each other without crossing
+        assertRunsToHtml(
+                "<A>hel<B>lo </B>nic</A>e <C>test</C>",
+                List.of(new Run("<A>", "</A>", 0), new Run("<B>", "</B>", 3),
+                        new Run("<C>", "</C>", 11)),
+                List.of(new Run("<A>", "</A>", 9), new Run("<B>", "</B>", 6),
+                        new Run("<C>", "</C>", 15)),
+                "hello nice test"
+        );
+    }
+
+    @Test
+    public void testOverlappingRuns() { // B opens inside A (2-6) but closes after it, at 8
+        assertRunsToHtml(
+                "01<A>23<B>45</B></A><B>67</B>89", // B is closed together with A, then reopened
+                List.of(new Run("<A>", "</A>", 2), new Run("<B>", "</B>", 4)),
+                List.of(new Run("<A>", "</A>", 6), new Run("<B>", "</B>", 8)),
+                "0123456789"
+        );
+    }
+
+    @Test
+    public void testTransformingRuns() { // runs that carry a content-transforming function
+        final Function<String, String> tA = content -> "whatever";
+        final Function<String, String> tD
+                = content -> Integer.parseInt(content) % 2 == 0 ? "even" : "odd";
+
+        assertRunsToHtml(
+                "0<A>whatever</A><C>4</C>5<D>odd</D>89", // B (2-3), inside A, is not emitted
+                List.of(new Run("<A>", "</A>", 1, tA), new Run("<B>", "</B>", 2),
+                        new Run("<C>", "</C>", 3), new Run("<D>", "</D>", 6, tD)),
+                List.of(new Run("<A>", "</A>", 4, tA), new Run("<B>", "</B>", 3),
+                        new Run("<C>", "</C>", 5), new Run("<D>", "</D>", 8, tD)),
+                "0123456789"
+        );
+    }
+}