Move stuff from extractVideoPreviewInfo() into YoutubeStreamInfoItemExtractor and partially fix search

This commit is contained in:
wb9688 2020-02-22 20:19:41 +01:00 committed by TobiGr
parent af49b3c487
commit b88188d419
4 changed files with 118 additions and 354 deletions

View File

@ -4,6 +4,7 @@ import com.grack.nanojson.JsonArray;
import com.grack.nanojson.JsonObject;
import com.grack.nanojson.JsonParser;
import com.grack.nanojson.JsonParserException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
@ -23,9 +24,10 @@ import org.schabi.newpipe.extractor.stream.StreamType;
import org.schabi.newpipe.extractor.utils.Parser;
import org.schabi.newpipe.extractor.utils.Utils;
import java.io.IOException;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.io.IOException;
@SuppressWarnings("WeakerAccess")
public class YoutubePlaylistExtractor extends PlaylistExtractor {
@ -318,7 +320,7 @@ public class YoutubePlaylistExtractor extends PlaylistExtractor {
}
@Override
public String getTextualUploadDate() throws ParsingException {
public String getTextualUploadDate() {
return "";
}

View File

@ -1,5 +1,10 @@
package org.schabi.newpipe.extractor.services.youtube.extractors;
import com.grack.nanojson.JsonArray;
import com.grack.nanojson.JsonObject;
import com.grack.nanojson.JsonParser;
import com.grack.nanojson.JsonParserException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
@ -16,12 +21,13 @@ import org.schabi.newpipe.extractor.search.SearchExtractor;
import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeParsingHelper;
import org.schabi.newpipe.extractor.utils.Parser;
import javax.annotation.Nonnull;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import javax.annotation.Nonnull;
/*
* Created by Christian Schabesberger on 22.07.2018
*
@ -45,6 +51,7 @@ import java.net.URL;
public class YoutubeSearchExtractor extends SearchExtractor {
private Document doc;
private JsonObject ytInitialData;
public YoutubeSearchExtractor(StreamingService service, SearchQueryHandler linkHandler) {
super(service, linkHandler);
@ -55,6 +62,7 @@ public class YoutubeSearchExtractor extends SearchExtractor {
final String url = getUrl();
final Response response = downloader.get(url, getExtractorLocalization());
doc = YoutubeParsingHelper.parseAndCheckPage(url, response);
ytInitialData = getInitialData();
}
@Nonnull
@ -86,6 +94,7 @@ public class YoutubeSearchExtractor extends SearchExtractor {
@Override
public InfoItemsPage<InfoItem> getPage(String pageUrl) throws IOException, ExtractionException {
// TODO: Get extracting next pages working
final String response = getDownloader().get(pageUrl, getExtractorLocalization()).responseBody();
doc = Jsoup.parse(response, pageUrl);
@ -108,37 +117,33 @@ public class YoutubeSearchExtractor extends SearchExtractor {
InfoItemsSearchCollector collector = getInfoItemSearchCollector();
collector.reset();
Element list = doc.select("ol[class=\"item-section\"]").first();
final TimeAgoParser timeAgoParser = getTimeAgoParser();
for (Element item : list.children()) {
/* First we need to determine which kind of item we are working with.
Youtube depicts five different kinds of items on its search result page. These are
regular videos, playlists, channels, two types of video suggestions, and a "no video
found" item. Since we only want videos, we need to filter out all the others.
An example for this can be seen here:
https://www.youtube.com/results?search_query=asdf&page=1
JsonArray list = ytInitialData.getObject("contents").getObject("twoColumnSearchResultsRenderer")
.getObject("primaryContents").getObject("sectionListRenderer").getArray("contents")
.getObject(0).getObject("itemSectionRenderer").getArray("contents");
We already applied a filter to the url, so we don't need to care about channels and
playlists now.
*/
Element el;
if ((el = item.select("div[class*=\"search-message\"]").first()) != null) {
throw new NothingFoundException(el.text());
// video item type
} else if ((el = item.select("div[class*=\"yt-lockup-video\"]").first()) != null) {
collector.commit(new YoutubeStreamInfoItemExtractor(el, timeAgoParser));
} else if ((el = item.select("div[class*=\"yt-lockup-channel\"]").first()) != null) {
collector.commit(new YoutubeChannelInfoItemExtractor(el));
} else if ((el = item.select("div[class*=\"yt-lockup-playlist\"]").first()) != null &&
item.select(".yt-pl-icon-mix").isEmpty()) {
collector.commit(new YoutubePlaylistInfoItemExtractor(el));
for (Object item : list) {
if (((JsonObject) item).getObject("backgroundPromoRenderer") != null) {
throw new NothingFoundException(((JsonObject) item).getObject("backgroundPromoRenderer")
.getObject("bodyText").getArray("runs").getObject(0).getString("text"));
} else if (((JsonObject) item).getObject("videoRenderer") != null) {
collector.commit(new YoutubeStreamInfoItemExtractor(((JsonObject) item).getObject("videoRenderer"), timeAgoParser));
} else if (((JsonObject) item).getObject("channelRenderer") != null) {
// collector.commit(new YoutubeChannelInfoItemExtractor(((JsonObject) item).getObject("channelRenderer")));
} else if (((JsonObject) item).getObject("playlistRenderer") != null) {
// collector.commit(new YoutubePlaylistInfoItemExtractor(((JsonObject) item).getObject("playlistRenderer")));
}
}
return collector;
}
private JsonObject getInitialData() throws ParsingException {
try {
String initialData = Parser.matchGroup1("window\\[\"ytInitialData\"\\]\\s*=\\s*(\\{.*?\\});", doc.toString());
return JsonParser.object().from(initialData);
} catch (JsonParserException | Parser.RegexException e) {
throw new ParsingException("Could not get ytInitialData", e);
}
}
}

View File

@ -4,10 +4,10 @@ import com.grack.nanojson.JsonArray;
import com.grack.nanojson.JsonObject;
import com.grack.nanojson.JsonParser;
import com.grack.nanojson.JsonParserException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.mozilla.javascript.Context;
import org.mozilla.javascript.Function;
import org.mozilla.javascript.ScriptableObject;
@ -25,24 +25,38 @@ import org.schabi.newpipe.extractor.linkhandler.LinkHandler;
import org.schabi.newpipe.extractor.localization.DateWrapper;
import org.schabi.newpipe.extractor.localization.TimeAgoParser;
import org.schabi.newpipe.extractor.services.youtube.ItagItem;
import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeChannelLinkHandlerFactory;
import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeParsingHelper;
import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeStreamLinkHandlerFactory;
import org.schabi.newpipe.extractor.stream.*;
import org.schabi.newpipe.extractor.stream.AudioStream;
import org.schabi.newpipe.extractor.stream.Description;
import org.schabi.newpipe.extractor.stream.Frameset;
import org.schabi.newpipe.extractor.stream.Stream;
import org.schabi.newpipe.extractor.stream.StreamExtractor;
import org.schabi.newpipe.extractor.stream.StreamInfoItem;
import org.schabi.newpipe.extractor.stream.StreamInfoItemsCollector;
import org.schabi.newpipe.extractor.stream.StreamType;
import org.schabi.newpipe.extractor.stream.SubtitlesStream;
import org.schabi.newpipe.extractor.stream.VideoStream;
import org.schabi.newpipe.extractor.utils.JsonUtils;
import org.schabi.newpipe.extractor.utils.Parser;
import org.schabi.newpipe.extractor.utils.Utils;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.*;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
/*
* Created by Christian Schabesberger on 06.08.15.
*
@ -663,7 +677,7 @@ public class YoutubeStreamExtractor extends StreamExtractor {
final TimeAgoParser timeAgoParser = getTimeAgoParser();
StreamInfoItemsCollector collector = new StreamInfoItemsCollector(getServiceId());
collector.commit(extractVideoPreviewInfo(videoInfo, timeAgoParser));
collector.commit(new YoutubeStreamInfoItemExtractor(videoInfo, timeAgoParser));
return collector.getItems().get(0);
} catch (Exception e) {
throw new ParsingException("Could not get next video", e);
@ -684,7 +698,7 @@ public class YoutubeStreamExtractor extends StreamExtractor {
for (Object ul : results) {
final JsonObject videoInfo = ((JsonObject) ul).getObject("compactVideoRenderer");
if (videoInfo != null) collector.commit(extractVideoPreviewInfo(videoInfo, timeAgoParser));
if (videoInfo != null) collector.commit(new YoutubeStreamInfoItemExtractor(videoInfo, timeAgoParser));
}
return collector;
} catch (Exception e) {
@ -1064,123 +1078,6 @@ public class YoutubeStreamExtractor extends StreamExtractor {
return urlAndItags;
}
/**
* Provides information about links to other videos on the video page, such as related videos.
* This is encapsulated in a StreamInfoItem object, which is a subset of the fields in a full StreamInfo.
*/
private StreamInfoItemExtractor extractVideoPreviewInfo(final JsonObject videoInfo, final TimeAgoParser timeAgoParser) {
return new YoutubeStreamInfoItemExtractor(videoInfo, timeAgoParser) {
@Override
public StreamType getStreamType() {
try {
if (videoInfo.getArray("badges").getObject(0).getObject("metadataBadgeRenderer").getString("label").equals("LIVE NOW")) {
return StreamType.LIVE_STREAM;
}
} catch (Exception ignored) {}
return StreamType.VIDEO_STREAM;
}
@Override
public boolean isAd() {
return false;
}
@Override
public String getUrl() throws ParsingException {
try {
String videoId = videoInfo.getString("videoId");
return YoutubeStreamLinkHandlerFactory.getInstance().getUrl(videoId);
} catch (Exception e) {
throw new ParsingException("Could not get url", e);
}
}
@Override
public String getName() throws ParsingException {
String name = null;
try {
name = videoInfo.getObject("title").getString("simpleText");
} catch (Exception ignored) {}
if (name != null && !name.isEmpty()) return name;
throw new ParsingException("Could not get title");
}
@Override
public long getDuration() throws ParsingException {
try {
if (getStreamType() == StreamType.LIVE_STREAM) return -1;
return YoutubeParsingHelper.parseDurationString(videoInfo.getObject("lengthText").getString("simpleText"));
} catch (Exception e) {
throw new ParsingException("Could not get duration", e);
}
}
@Override
public String getUploaderUrl() throws ParsingException {
try {
String id = videoInfo.getObject("longBylineText").getArray("runs")
.getObject(0).getObject("navigationEndpoint")
.getObject("browseEndpoint").getString("browseId");
if (id == null || id.isEmpty()) {
throw new IllegalArgumentException("is empty");
}
return YoutubeChannelLinkHandlerFactory.getInstance().getUrl(id);
} catch (Exception e) {
throw new ParsingException("Could not get uploader url");
}
}
@Nullable
@Override
public String getTextualUploadDate() {
return null;
}
@Nullable
@Override
public DateWrapper getUploadDate() {
return null;
}
@Override
public long getViewCount() throws ParsingException {
try {
String viewCount;
if (getStreamType() == StreamType.LIVE_STREAM) {
viewCount = videoInfo.getObject("viewCountText")
.getArray("runs").getObject(0).getString("text");
} else {
viewCount = videoInfo.getObject("viewCountText").getString("simpleText");
}
if (viewCount.equals("Recommended for you")) return -1;
return Long.parseLong(Utils.removeNonDigitCharacters(viewCount));
} catch (Exception e) {
throw new ParsingException("Could not get view count", e);
}
}
@Override
public String getUploaderName() throws ParsingException {
try {
return videoInfo.getObject("longBylineText").getArray("runs")
.getObject(0).getString("text");
} catch (Exception e) {
throw new ParsingException("Could not get uploader name", e);
}
}
@Override
public String getThumbnailUrl() throws ParsingException {
try {
return videoInfo.getObject("thumbnail").getArray("thumbnails")
.getObject(0).getString("url");
} catch (Exception e) {
throw new ParsingException("Could not get thumbnail url", e);
}
}
};
}
@Nonnull
@Override
public List<Frameset> getFrames() throws ExtractionException {

View File

@ -2,19 +2,17 @@ package org.schabi.newpipe.extractor.services.youtube.extractors;
import com.grack.nanojson.JsonObject;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.schabi.newpipe.extractor.exceptions.ParsingException;
import org.schabi.newpipe.extractor.localization.DateWrapper;
import org.schabi.newpipe.extractor.localization.TimeAgoParser;
import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeChannelLinkHandlerFactory;
import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeParsingHelper;
import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeStreamLinkHandlerFactory;
import org.schabi.newpipe.extractor.stream.StreamInfoItemExtractor;
import org.schabi.newpipe.extractor.stream.StreamType;
import org.schabi.newpipe.extractor.utils.Utils;
import javax.annotation.Nullable;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
/*
* Copyright (C) Christian Schabesberger 2016 <chris.schabesberger@mailbox.org>
@ -36,20 +34,10 @@ import java.util.Date;
public class YoutubeStreamInfoItemExtractor implements StreamInfoItemExtractor {
private JsonObject videoInfoItem;
private Element item;
private JsonObject videoInfo;
private final TimeAgoParser timeAgoParser;
private String cachedUploadDate;
/**
* Creates an extractor of StreamInfoItems from a YouTube page.
*
* @param item The page element
* @param timeAgoParser A parser of the textual dates or {@code null}.
*/
public YoutubeStreamInfoItemExtractor(Element item, @Nullable TimeAgoParser timeAgoParser) {
this.item = item;
public YoutubeStreamInfoItemExtractor(Element a, @Nullable TimeAgoParser timeAgoParser) {
this.timeAgoParser = timeAgoParser;
}
@ -60,251 +48,123 @@ public class YoutubeStreamInfoItemExtractor implements StreamInfoItemExtractor {
* @param timeAgoParser A parser of the textual dates or {@code null}.
*/
public YoutubeStreamInfoItemExtractor(JsonObject videoInfoItem, @Nullable TimeAgoParser timeAgoParser) {
this.videoInfoItem = videoInfoItem;
this.videoInfo = videoInfoItem;
this.timeAgoParser = timeAgoParser;
}
@Override
public StreamType getStreamType() throws ParsingException {
if (isLiveStream(item)) {
return StreamType.LIVE_STREAM;
} else {
return StreamType.VIDEO_STREAM;
}
public StreamType getStreamType() {
try {
if (videoInfo.getArray("badges").getObject(0).getObject("metadataBadgeRenderer").getString("label").equals("LIVE NOW")) {
return StreamType.LIVE_STREAM;
}
} catch (Exception ignored) {}
return StreamType.VIDEO_STREAM;
}
@Override
public boolean isAd() throws ParsingException {
return !item.select("span[class*=\"icon-not-available\"]").isEmpty()
|| !item.select("span[class*=\"yt-badge-ad\"]").isEmpty()
|| isPremiumVideo();
}
private boolean isPremiumVideo() {
Element premiumSpan = item.select("span[class=\"standalone-collection-badge-renderer-red-text\"]").first();
if (premiumSpan == null) return false;
// if this span has text it most likely says ("Free Video") so we can play this
if (premiumSpan.hasText()) return false;
return true;
public boolean isAd() {
return false;
}
@Override
public String getUrl() throws ParsingException {
try {
Element el = item.select("div[class*=\"yt-lockup-video\"]").first();
Element dl = el.select("h3").first().select("a").first();
return dl.attr("abs:href");
String videoId = videoInfo.getString("videoId");
return YoutubeStreamLinkHandlerFactory.getInstance().getUrl(videoId);
} catch (Exception e) {
throw new ParsingException("Could not get web page url for the video", e);
throw new ParsingException("Could not get url", e);
}
}
@Override
public String getName() throws ParsingException {
String name = null;
try {
Element el = item.select("div[class*=\"yt-lockup-video\"]").first();
Element dl = el.select("h3").first().select("a").first();
return dl.text();
} catch (Exception e) {
throw new ParsingException("Could not get title", e);
name = videoInfo.getObject("title").getString("simpleText");
} catch (Exception ignored) {}
if (name == null) {
try {
name = videoInfo.getObject("title").getArray("runs").getObject(0).getString("text");
} catch (Exception ignored) {}
}
if (name != null && !name.isEmpty()) return name;
throw new ParsingException("Could not get name");
}
@Override
public long getDuration() throws ParsingException {
try {
if (getStreamType() == StreamType.LIVE_STREAM) return -1;
final Element duration = item.select("span[class*=\"video-time\"]").first();
// apparently on youtube, video-time element will not show up if the video has a duration of 00:00
// see: https://www.youtube.com/results?sp=EgIQAVAU&q=asdfgf
return duration == null ? 0 : YoutubeParsingHelper.parseDurationString(duration.text());
return YoutubeParsingHelper.parseDurationString(videoInfo.getObject("lengthText").getString("simpleText"));
} catch (Exception e) {
throw new ParsingException("Could not get Duration: " + getUrl(), e);
throw new ParsingException("Could not get duration", e);
}
}
@Override
public String getUploaderName() throws ParsingException {
try {
return item.select("div[class=\"yt-lockup-byline\"]").first()
.select("a").first()
.text();
return videoInfo.getObject("longBylineText").getArray("runs")
.getObject(0).getString("text");
} catch (Exception e) {
throw new ParsingException("Could not get uploader", e);
throw new ParsingException("Could not get uploader name", e);
}
}
@Override
public String getUploaderUrl() throws ParsingException {
// this url is not always in the form "/channel/..."
// sometimes Youtube provides urls in the from "/user/..."
try {
try {
return item.select("div[class=\"yt-lockup-byline\"]").first()
.select("a").first()
.attr("abs:href");
} catch (Exception e){}
// try this if the first didn't work
return item.select("span[class=\"title\"")
.text().split(" - ")[0];
} catch (Exception e) {
System.out.println(item.html());
throw new ParsingException("Could not get uploader url", e);
}
}
@Nullable
@Override
public String getTextualUploadDate() throws ParsingException {
if (getStreamType().equals(StreamType.LIVE_STREAM)) {
return null;
}
if (cachedUploadDate != null) {
return cachedUploadDate;
}
try {
if (isVideoReminder()) {
final Calendar calendar = getDateFromReminder();
if (calendar != null) {
return cachedUploadDate = new SimpleDateFormat("yyyy-MM-dd HH:mm")
.format(calendar.getTime());
}
String id = videoInfo.getObject("longBylineText").getArray("runs")
.getObject(0).getObject("navigationEndpoint")
.getObject("browseEndpoint").getString("browseId");
if (id == null || id.isEmpty()) {
throw new IllegalArgumentException("is empty");
}
Element meta = item.select("div[class=\"yt-lockup-meta\"]").first();
if (meta == null) return "";
final Elements li = meta.select("li");
if (li.isEmpty()) return "";
return cachedUploadDate = li.first().text();
return YoutubeChannelLinkHandlerFactory.getInstance().getUrl(id);
} catch (Exception e) {
throw new ParsingException("Could not get upload date", e);
throw new ParsingException("Could not get uploader url");
}
}
@Nullable
@Override
public DateWrapper getUploadDate() throws ParsingException {
if (getStreamType().equals(StreamType.LIVE_STREAM)) {
return null;
}
public String getTextualUploadDate() {
// TODO: Get upload date in case of a videoRenderer (not available in case of a compactVideoRenderer)
return null;
}
if (isVideoReminder()) {
return new DateWrapper(getDateFromReminder());
}
String textualUploadDate = getTextualUploadDate();
if (timeAgoParser != null && textualUploadDate != null && !textualUploadDate.isEmpty()) {
return timeAgoParser.parse(textualUploadDate);
} else {
return null;
}
@Nullable
@Override
public DateWrapper getUploadDate() {
return null;
}
@Override
public long getViewCount() throws ParsingException {
String input;
final Element spanViewCount = item.select("span.view-count").first();
if (spanViewCount != null) {
input = spanViewCount.text();
} else if (getStreamType().equals(StreamType.LIVE_STREAM)) {
Element meta = item.select("ul.yt-lockup-meta-info").first();
if (meta == null) return 0;
final Elements li = meta.select("li");
if (li.isEmpty()) return 0;
input = li.first().text();
} else {
try {
Element meta = item.select("div.yt-lockup-meta").first();
if (meta == null) return -1;
// This case can happen if google releases a special video
if (meta.select("li").size() < 2) return -1;
input = meta.select("li").get(1).text();
} catch (IndexOutOfBoundsException e) {
throw new ParsingException("Could not parse yt-lockup-meta although available: " + getUrl(), e);
}
}
if (input == null) {
throw new ParsingException("Input is null");
}
try {
return Long.parseLong(Utils.removeNonDigitCharacters(input));
} catch (NumberFormatException e) {
// if this happens the video probably has no views
if (!input.isEmpty()) {
return 0;
String viewCount;
if (getStreamType() == StreamType.LIVE_STREAM) {
viewCount = videoInfo.getObject("viewCountText")
.getArray("runs").getObject(0).getString("text");
} else {
viewCount = videoInfo.getObject("viewCountText").getString("simpleText");
}
throw new ParsingException("Could not handle input: " + input, e);
if (viewCount.equals("Recommended for you")) return -1;
return Long.parseLong(Utils.removeNonDigitCharacters(viewCount));
} catch (Exception e) {
throw new ParsingException("Could not get view count", e);
}
}
@Override
public String getThumbnailUrl() throws ParsingException {
try {
String url;
Element te = item.select("div[class=\"yt-thumb video-thumb\"]").first()
.select("img").first();
url = te.attr("abs:src");
// Sometimes youtube sends links to gif files which somehow seem to not exist
// anymore. Items with such gif also offer a secondary image source. So we are going
// to use that if we've caught such an item.
if (url.contains(".gif")) {
url = te.attr("abs:data-thumb");
}
return url;
// TODO: Don't simply get the first item, but look at all thumbnails and their resolution
return videoInfo.getObject("thumbnail").getArray("thumbnails")
.getObject(0).getString("url");
} catch (Exception e) {
throw new ParsingException("Could not get thumbnail url", e);
}
}
private boolean isVideoReminder() {
return !item.select("span.yt-uix-livereminder").isEmpty();
}
private Calendar getDateFromReminder() throws ParsingException {
final Element timeFuture = item.select("span.yt-badge.localized-date").first();
if (timeFuture == null) {
throw new ParsingException("Span timeFuture is null");
}
final String timestamp = timeFuture.attr("data-timestamp");
if (!timestamp.isEmpty()) {
try {
final Calendar calendar = Calendar.getInstance();
calendar.setTime(new Date(Long.parseLong(timestamp) * 1000L));
return calendar;
} catch (Exception e) {
throw new ParsingException("Could not parse = \"" + timestamp + "\"");
}
}
throw new ParsingException("Could not parse date from reminder element: \"" + timeFuture + "\"");
}
/**
* Generic method that checks if the element contains any clues that it's a livestream item
*/
protected static boolean isLiveStream(Element item) {
return !item.select("span[class*=\"yt-badge-live\"]").isEmpty()
|| !item.select("span[class*=\"video-time-overlay-live\"]").isEmpty();
}
}