added CommentsInfo

This commit is contained in:
Ritvik Saraf 2018-09-19 04:22:23 +05:30
parent 4ca23ab5c3
commit ee239985ae
4 changed files with 135 additions and 75 deletions

View File

@ -1,12 +1,16 @@
package org.schabi.newpipe.extractor.comments;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.schabi.newpipe.extractor.ListExtractor.InfoItemsPage;
import org.schabi.newpipe.extractor.ListInfo;
import org.schabi.newpipe.extractor.NewPipe;
import org.schabi.newpipe.extractor.StreamingService;
import org.schabi.newpipe.extractor.exceptions.ExtractionException;
import org.schabi.newpipe.extractor.linkhandler.ListLinkHandler;
import org.schabi.newpipe.extractor.utils.ExtractorHelper;
public class CommentsInfo extends ListInfo<CommentsInfoItem>{
@ -19,9 +23,88 @@ public class CommentsInfo extends ListInfo<CommentsInfoItem>{
return getInfo(NewPipe.getServiceByUrl(url), url);
}
private static CommentsInfo getInfo(StreamingService serviceByUrl, String url) {
// TODO Auto-generated method stub
return null;
/**
 * Builds the comments info for {@code url} using the comments extractor of the given service.
 *
 * @param serviceByUrl the service that is asked for a comments extractor
 * @param url          the url handed to {@code getCommentsExtractor}
 * @return whatever the extractor-based {@code getInfo} overload returns for that extractor
 * @throws ExtractionException propagated from obtaining the extractor or from the delegated overload
 * @throws IOException         propagated from the delegated overload
 */
private static CommentsInfo getInfo(StreamingService serviceByUrl, String url) throws ExtractionException, IOException {
return getInfo(serviceByUrl.getCommentsExtractor(url));
}
/**
 * Builds a {@link CommentsInfo} from an extractor: fetches the extractor's page, copies its
 * name, service id and link handler, and stores the first page of comments together with the
 * paging state (has-more flag and next page url).
 *
 * @param commentsExtractor the extractor to read from; {@code null} for services which do not
 *                          have a comments extractor
 * @return the populated info, or {@code null} when {@code commentsExtractor} is {@code null}
 * @throws IOException         propagated from the extractor calls
 * @throws ExtractionException propagated from the extractor calls
 */
private static CommentsInfo getInfo(CommentsExtractor commentsExtractor) throws IOException, ExtractionException {
    if (commentsExtractor == null) {
        // services which do not have a comments extractor produce no info at all
        return null;
    }

    commentsExtractor.fetchPage();

    // The name is read first, before the service id and the link handler.
    final String extractorName = commentsExtractor.getName();
    final CommentsInfo info = new CommentsInfo(
            commentsExtractor.getServiceId(),
            commentsExtractor.getUIHandler(),
            extractorName);
    info.setCommentsExtractor(commentsExtractor);

    final InfoItemsPage<CommentsInfoItem> firstPage =
            ExtractorHelper.getItemsPageOrLogError(info, commentsExtractor);

    // Keep a private, growable copy so that further pages can be appended later.
    final List<CommentsInfoItem> collected = new ArrayList<>(firstPage.getItems());
    info.setComments(collected);
    info.setHasMoreComments(firstPage.hasNextPage());
    info.setNextCommentsPageUrl(firstPage.getNextPageUrl());

    return info;
}
/**
 * Loads the next page of comments into {@code commentsInfo}, if it reports having more.
 * <p>
 * On success the new items are appended to {@link #getComments()} and the paging state
 * (has-more flag and next page url) is updated. Failures are not thrown; they are recorded
 * on {@code commentsInfo} via {@code addError} and the info is otherwise left unchanged.
 * <p>
 * If the info carries no extractor, one is re-created from the service id and url. A
 * re-created extractor is fetched before use, mirroring what {@code getInfo} does before it
 * reads the first page.
 *
 * @param commentsInfo the info to extend; must not be {@code null}
 */
public static void loadMoreComments(CommentsInfo commentsInfo) {
    if (!commentsInfo.hasMoreComments()) {
        return;
    }

    CommentsExtractor extractor = commentsInfo.getCommentsExtractor();
    if (extractor == null) {
        try {
            extractor = NewPipe.getService(commentsInfo.getServiceId())
                    .getCommentsExtractor(commentsInfo.getUrl());
            if (extractor == null) {
                // Services without a comments extractor have nothing to page through;
                // clear the flag so callers looping on hasMoreComments() terminate.
                commentsInfo.setHasMoreComments(false);
                return;
            }
            // A fresh extractor has not fetched its page yet; do that before asking for
            // further pages, exactly as getInfo does for the initial page.
            extractor.fetchPage();
        } catch (IOException | ExtractionException e) {
            commentsInfo.addError(e);
            return;
        }
        // Only keep the extractor once it is ready, so a failed attempt is retried next time.
        commentsInfo.setCommentsExtractor(extractor);
    }

    try {
        final InfoItemsPage<CommentsInfoItem> commentsPage =
                extractor.getPage(commentsInfo.getNextCommentsPageUrl());
        commentsInfo.getComments().addAll(commentsPage.getItems());
        commentsInfo.setHasMoreComments(commentsPage.hasNextPage());
        commentsInfo.setNextCommentsPageUrl(commentsPage.getNextPageUrl());
    } catch (IOException | ExtractionException e) {
        commentsInfo.addError(e);
    }
}
// Extractor used to load further pages. Marked transient, so presumably it is dropped when
// the info is serialized; loadMoreComments re-creates it when it is null.
private transient CommentsExtractor commentsExtractor;
// Comments collected so far.
private List<CommentsInfoItem> comments;
// Whether the last loaded page reported a next page.
private boolean hasMoreComments;
// Url of the next comments page, as reported by the last loaded page.
private String nextCommentsPageUrl;
/**
 * @return the list of comments held by this info (the stored list itself, not a copy)
 */
public List<CommentsInfoItem> getComments() {
return comments;
}
/**
 * @param comments the list to store as this info's comments
 */
public void setComments(List<CommentsInfoItem> comments) {
this.comments = comments;
}
/**
 * @return the stored has-more-comments flag
 */
public boolean hasMoreComments() {
return hasMoreComments;
}
/**
 * @param hasMoreComments the new value of the has-more-comments flag
 */
public void setHasMoreComments(boolean hasMoreComments) {
this.hasMoreComments = hasMoreComments;
}
/**
 * @return the extractor attached to this info; may be {@code null} if none has been set
 */
public CommentsExtractor getCommentsExtractor() {
return commentsExtractor;
}
/**
 * @param commentsExtractor the extractor to attach to this info
 */
public void setCommentsExtractor(CommentsExtractor commentsExtractor) {
this.commentsExtractor = commentsExtractor;
}
/**
 * @return the stored url of the next comments page
 */
public String getNextCommentsPageUrl() {
return nextCommentsPageUrl;
}
/**
 * @param nextCommentsPageUrl the url to store as the next comments page
 */
public void setNextCommentsPageUrl(String nextCommentsPageUrl) {
this.nextCommentsPageUrl = nextCommentsPageUrl;
}
}

View File

@ -31,13 +31,13 @@ public class YoutubeCommentsExtractor extends CommentsExtractor {
private List<String> cookies;
private String sessionToken;
private String commentsToken;
private String title;
private InfoItemsPage<CommentsInfoItem> initPage;
private ObjectMapper mapper = new ObjectMapper();
/**
 * Creates the extractor by passing both arguments straight to the superclass constructor.
 *
 * @param service   the streaming service handed to the superclass
 * @param uiHandler the list link handler handed to the superclass
 */
public YoutubeCommentsExtractor(StreamingService service, ListLinkHandler uiHandler) {
super(service, uiHandler);
// TODO Auto-generated constructor stub
}
@Override
@ -45,12 +45,16 @@ public class YoutubeCommentsExtractor extends CommentsExtractor {
// initial page does not load any comments but is required to get session token
// and cookies
super.fetchPage();
return getPage(getNextPageUrl());
return initPage;
}
// Isn't this method redundant? You could just call getNextPageUrl() on the page returned by getInitialPage().
@Override
public String getNextPageUrl() throws IOException, ExtractionException {
return getNextPageUrl(commentsToken);
// initial page does not load any comments but is required to get session token
// and cookies
super.fetchPage();
return initPage.getNextPageUrl();
}
private String getNextPageUrl(JsonNode ajaxJson) throws IOException, ExtractionException {
@ -91,6 +95,9 @@ public class YoutubeCommentsExtractor extends CommentsExtractor {
}
private void collectCommentsFrom(CommentsInfoItemsCollector collector, JsonNode ajaxJson, String pageUrl) {
fetchTitle(ajaxJson);
List<JsonNode> comments = ajaxJson.findValues("commentRenderer");
comments.stream().forEach(c -> {
CommentsInfoItemExtractor extractor = new CommentsInfoItemExtractor() {
@ -192,19 +199,29 @@ public class YoutubeCommentsExtractor extends CommentsExtractor {
}
/**
 * Sets {@code title} once, from the {@code simpleText} child of the first
 * {@code commentTargetTitle} node found in the given JSON. Falls back to
 * {@code "Youtube Comments"} when that value cannot be found. Does nothing if the title is
 * already set.
 *
 * @param ajaxJson the JSON tree to search; a {@code null} tree yields the fallback title
 */
private void fetchTitle(JsonNode ajaxJson) {
    if (title != null) {
        return;
    }

    // Explicit null checks instead of catching a broad Exception for missing nodes.
    final JsonNode targetTitle = (ajaxJson == null) ? null : ajaxJson.findValue("commentTargetTitle");
    final JsonNode simpleText = (targetTitle == null) ? null : targetTitle.get("simpleText");

    title = (simpleText == null) ? "Youtube Comments" : simpleText.asText();
}
@Override
public void onFetchPage(Downloader downloader) throws IOException, ExtractionException {
DownloadResponse response = downloader.get(getUrl());
String responseBody = response.getResponseBody();
cookies = response.getResponseHeaders().get("Set-Cookie");
sessionToken = findValue(responseBody, "XSRF_TOKEN");
commentsToken = findValue(responseBody, "COMMENTS_TOKEN");
String commentsToken = findValue(responseBody, "COMMENTS_TOKEN");
initPage = getPage(getNextPageUrl(commentsToken));
}
@Override
public String getName() throws ParsingException {
// TODO Auto-generated method stub
return null;
return title;
}
private String makeAjaxRequest(String siteUrl) throws IOException, ReCaptchaException {

View File

@ -1,20 +1,11 @@
package org.schabi.newpipe.extractor.services.youtube.linkHandler;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.schabi.newpipe.extractor.Downloader;
import org.schabi.newpipe.extractor.NewPipe;
import org.schabi.newpipe.extractor.exceptions.FoundAdException;
import org.schabi.newpipe.extractor.exceptions.ParsingException;
import org.schabi.newpipe.extractor.exceptions.ReCaptchaException;
import org.schabi.newpipe.extractor.linkhandler.ListLinkHandlerFactory;
import org.schabi.newpipe.extractor.utils.Parser;
@ -27,6 +18,11 @@ public class YoutubeCommentsLinkHandlerFactory extends ListLinkHandlerFactory {
return instance;
}
/**
 * Builds the YouTube watch url for the given id by appending it to
 * {@code https://www.youtube.com/watch?v=}.
 *
 * @param id the id to append; it is not validated or encoded here
 * @return the resulting url string
 */
@Override
public String getUrl(String id) {
return "https://www.youtube.com/watch?v=" + id;
}
@Override
public String getId(String url) throws ParsingException, IllegalArgumentException {
if (url.isEmpty()) {
@ -44,8 +40,6 @@ public class YoutubeCommentsLinkHandlerFactory extends ListLinkHandlerFactory {
} catch (UnsupportedEncodingException uee) {
throw new ParsingException("Could not parse attribution_link", uee);
}
} else if (lowercaseUrl.contains("youtube.com/shared?ci=")) {
return getRealIdFromSharedLink(url);
} else if (url.contains("vnd.youtube")) {
id = Parser.matchGroup1(ID_PATTERN, url);
} else if (url.contains("embed")) {
@ -86,56 +80,6 @@ public class YoutubeCommentsLinkHandlerFactory extends ListLinkHandlerFactory {
}
}
/**
 * Resolves the real stream id behind a shared uri.
 * <p>
 * Shared URI's look like this:
 * <pre>
 *   * https://www.youtube.com/shared?ci=PJICrTByb3E
 *   * vnd.youtube://www.youtube.com/shared?ci=PJICrTByb3E&amp;feature=twitter-deep-link
 * </pre>
 * The shared page is downloaded and the id is read from its canonical link, or from its
 * {@code og:url} meta tag when there is no canonical link.
 *
 * @param url The shared url
 * @return the id of the stream
 * @throws ParsingException if the url is not a valid shared link, the shared page cannot be
 *                          downloaded, the page names no target url, or the resolved id
 *                          equals the shared id
 */
private String getRealIdFromSharedLink(String url) throws ParsingException {
    final URI uri;
    try {
        uri = new URI(url);
    } catch (URISyntaxException e) {
        throw new ParsingException("Invalid shared link", e);
    }

    final String sharedId = getSharedId(uri);

    final Downloader downloader = NewPipe.getDownloader();
    final String content;
    try {
        content = downloader.download("https://www.youtube.com/shared?ci=" + sharedId);
    } catch (IOException | ReCaptchaException e) {
        throw new ParsingException("Unable to resolve shared link", e);
    }

    final Document document = Jsoup.parse(content);
    final Element canonical = document.select("link[rel=\"canonical\"]").first();
    final String urlWithRealId;
    if (canonical != null) {
        urlWithRealId = canonical.attr("abs:href");
    } else {
        // first() yields null when nothing matches; report that instead of failing with an NPE.
        final Element ogUrl = document.select("meta[property=\"og:url\"]").first();
        if (ogUrl == null) {
            throw new ParsingException(
                    "Unable to resolve shared link: no canonical link or og:url found for " + sharedId);
        }
        urlWithRealId = ogUrl.attr("abs:content");
    }

    final String realId = Parser.matchGroup1(ID_PATTERN, urlWithRealId);
    if (sharedId.equals(realId)) {
        throw new ParsingException("Got same id for as shared info_id: " + sharedId);
    }
    return realId;
}
/**
 * Reads the shared id from the {@code ci=} query parameter of a shared uri.
 *
 * @param uri the uri to inspect; its path has to be exactly {@code /shared}
 * @return the first group matched by {@code "ci=" + ID_PATTERN} in the uri's query
 * @throws ParsingException if the path is not {@code /shared}, or propagated from the match
 */
private String getSharedId(URI uri) throws ParsingException {
    final String path = uri.getPath();
    if ("/shared".equals(path)) {
        return Parser.matchGroup1("ci=" + ID_PATTERN, uri.getQuery());
    }
    throw new ParsingException("Not a shared link: " + uri.toString() + " (path != " + path + ")");
}
@Override
public boolean onAcceptUrl(final String url) throws FoundAdException {
final String lowercaseUrl = url.toLowerCase();
@ -156,8 +100,8 @@ public class YoutubeCommentsLinkHandlerFactory extends ListLinkHandlerFactory {
}
}
@Override
public String getUrl(String id, List<String> contentFilter, String sortFilter) throws ParsingException {
return "https://www.youtube.com/watch?v=" + id;
}
@Override
public String getUrl(String id, List<String> contentFilter, String sortFilter) throws ParsingException {
return "https://www.youtube.com/watch?v=" + id;
}
}

View File

@ -11,6 +11,7 @@ import org.junit.Test;
import org.schabi.newpipe.Downloader;
import org.schabi.newpipe.extractor.ListExtractor.InfoItemsPage;
import org.schabi.newpipe.extractor.NewPipe;
import org.schabi.newpipe.extractor.comments.CommentsInfo;
import org.schabi.newpipe.extractor.comments.CommentsInfoItem;
import org.schabi.newpipe.extractor.exceptions.ExtractionException;
import org.schabi.newpipe.extractor.services.youtube.extractors.YoutubeCommentsExtractor;
@ -56,6 +57,21 @@ public class YoutubeCommentsExtractorTest {
assertTrue(result);
}
/**
 * Loads comments through {@link CommentsInfo} and checks that the expected name is reported
 * and that a known comment shows up, loading further pages as needed.
 * <p>
 * Paging stops as soon as a load adds no comments: {@code loadMoreComments} records failures
 * on the info instead of throwing and then leaves the has-more flag untouched, so looping on
 * that flag alone could spin forever.
 */
@Test
public void testGetCommentsFromCommentsInfo() throws IOException, ExtractionException {
    final String expectedComment = "i should really be in the top comment.lol";

    CommentsInfo commentsInfo = CommentsInfo.getInfo("https://www.youtube.com/watch?v=rrgFN3AxGfs");
    assertTrue("what the fuck am i doing with my life.wmv".equals(commentsInfo.getName()));

    boolean result = findInComments(commentsInfo.getComments(), expectedComment);
    while (!result && commentsInfo.hasMoreComments()) {
        final int loadedBefore = commentsInfo.getComments().size();
        CommentsInfo.loadMoreComments(commentsInfo);
        if (commentsInfo.getComments().size() == loadedBefore) {
            // No progress (e.g. the load failed): give up and let the assertion below fail.
            break;
        }
        result = findInComments(commentsInfo.getComments(), expectedComment);
    }

    assertTrue(result);
}
/**
 * Convenience overload: searches the items of the given page by delegating to the
 * list-based {@code findInComments}.
 *
 * @param comments the page whose items are searched
 * @param comment  the comment text passed on to the list-based overload
 * @return the result of the list-based overload
 */
private boolean findInComments(InfoItemsPage<CommentsInfoItem> comments, String comment) {
return findInComments(comments.getItems(), comment);
}