Fix timestamp links in YouTube video descriptions

For some reason, in NewPipeExtractor, comments are loaded from JSON by YoutubeCommentsInfoItemExtractor as text and sent via CommentsInfoItem#getCommentText to NewPipe, where timestamps are converted to hyperlinks using Linkify (see https://github.com/TeamNewPipe/NewPipe/pull/2168). Video descriptions, on the other hand, are handled in NewPipeExtractor by scraping the watch-page HTML. There, timestamp links were previously mangled (they are now parsed properly) before being sent as HTML via YoutubeStreamExtractor#getDescription to NewPipe, where the HTML gets converted to a Spanned. The logic introduced in this commit differs from that of the above PR: it operates in the extractor and mutates the HTML DOM, rather than identifying timestamps in the text via regex.
2019-08-17 20:48:15 -07:00 · 2019-08-17 20:48:15 -07:00 · e38d906ff9
parent 430da57350
commit e38d906ff9
1 changed files with 44 additions and 2 deletions
--- a/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeStreamExtractor.java
+++ b/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeStreamExtractor.java
@ -30,6 +30,8 @@ import java.io.UnsupportedEncodingException;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;

 /*
 * Created by Christian Schabesberger on 06.08.15.
@ -162,14 +164,54 @@ public class YoutubeStreamExtractor extends StreamExtractor {
        }
    }

+    // onclick="yt.www.watch.player.seekTo(0*3600+00*60+00);return false;"
+    // :00 is NOT recognized as a timestamp in description or comments.
+    // 0:00 is recognized in both description and comments.
+    // https://www.youtube.com/watch?v=4cccfDXu1vA
+    private final static Pattern DESCRIPTION_TIMESTAMP_ONCLICK_REGEX = Pattern.compile(
+        "seekTo\\("
+            + "(?:(\\d+)\\*3600\\+)?"  // hours?
+            + "(\\d+)\\*60\\+"  // minutes
+            + "(\\d+)"  // seconds
+            + "\\)");
+
+    @SafeVarargs
+    private static <T> T coalesce(T... args) {
+        for (T arg : args) {
+            if (arg != null) return arg;
+        }
+        throw new IllegalArgumentException("all arguments to coalesce() were null");
+    }
+
    private String parseHtmlAndGetFullLinks(String descriptionHtml)
            throws MalformedURLException, UnsupportedEncodingException, ParsingException {
        final Document description = Jsoup.parse(descriptionHtml, getUrl());
        for(Element a : description.select("a")) {
            final String rawUrl = a.attr("abs:href");
            final URL redirectLink = new URL(rawUrl);
-            final String queryString = redirectLink.getQuery();
-            if(queryString != null) {
+
+            final Matcher onClickTimestamp;
+            final String queryString;
+            if ((onClickTimestamp = DESCRIPTION_TIMESTAMP_ONCLICK_REGEX.matcher(a.attr("onclick")))
+                    .find()) {
+                a.removeAttr("onclick");
+
+                String hours = coalesce(onClickTimestamp.group(1), "0");
+                String minutes = onClickTimestamp.group(2);
+                String seconds = onClickTimestamp.group(3);
+
+                int timestamp = 0;
+                timestamp += Integer.parseInt(hours) * 3600;
+                timestamp += Integer.parseInt(minutes) * 60;
+                timestamp += Integer.parseInt(seconds);
+
+                String setTimestamp = "&t=" + timestamp;
+
+                // Even after clicking https://youtu.be/...?t=6,
+                // getUrl() is https://www.youtube.com/watch?v=..., never youtu.be, never &t=.
+                a.attr("href", getUrl() + setTimestamp);
+
+            } else if((queryString = redirectLink.getQuery()) != null) {
                // if the query string is null we are not dealing with a redirect link,
                // so we don't need to override it.
                final String link =