NewPipeExtractor/extractor/src/main/java/org/schabi/newpipe/extractor/services/youtube/extractors/YoutubeSearchExtractor.java

package org.schabi.newpipe.extractor.services.youtube.extractors;

import com.grack.nanojson.JsonArray;
import com.grack.nanojson.JsonObject;
import com.grack.nanojson.JsonParser;
import com.grack.nanojson.JsonParserException;

import org.jsoup.nodes.Document;
import org.schabi.newpipe.extractor.InfoItem;
import org.schabi.newpipe.extractor.StreamingService;
import org.schabi.newpipe.extractor.downloader.Downloader;
import org.schabi.newpipe.extractor.downloader.Response;
import org.schabi.newpipe.extractor.exceptions.ExtractionException;
import org.schabi.newpipe.extractor.exceptions.ParsingException;
import org.schabi.newpipe.extractor.linkhandler.SearchQueryHandler;
import org.schabi.newpipe.extractor.localization.TimeAgoParser;
import org.schabi.newpipe.extractor.search.InfoItemsSearchCollector;
import org.schabi.newpipe.extractor.search.SearchExtractor;
import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeParsingHelper;

import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.annotation.Nonnull;

/*
 * Created by Christian Schabesberger on 22.07.2018
 *
 * Copyright (C) Christian Schabesberger 2018 <chris.schabesberger@mailbox.org>
 * YoutubeSearchExtractor.java is part of NewPipe.
 *
 * NewPipe is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * NewPipe is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with NewPipe.  If not, see <http://www.gnu.org/licenses/>.
 */

/**
 * Search extractor for YouTube.
 *
 * <p>The first page of results is read from the "initial data" JSON that
 * {@link #onFetchPage(Downloader)} obtains from the search page. Further pages are fetched from
 * continuation URLs built by this class (see {@link #getPage(String)}).</p>
 *
 * <p>NOTE(review): the null handling below assumes that the nanojson accessors
 * ({@code getObject}, {@code getArray}, {@code getString}) return {@code null} for missing
 * entries, as the pre-existing null checks in this class imply — confirm against the nanojson
 * version in use.</p>
 */
public class YoutubeSearchExtractor extends SearchExtractor {

    /** Message used for every failure to obtain usable JSON for a continuation page. */
    private static final String NEXT_STREAMS_PARSE_ERROR = "Could not parse json data for next streams";

    /** Continuation responses shorter than this many characters are rejected as invalid. */
    private static final int MIN_VALID_RESPONSE_LENGTH = 50;

    private Document doc;
    private JsonObject initialData;

    public YoutubeSearchExtractor(StreamingService service, SearchQueryHandler linkHandler) {
        super(service, linkHandler);
    }

    /**
     * Downloads the search page, then stores both the parsed document and the initial data
     * extracted from the response body for later use by the other methods.
     */
    @Override
    public void onFetchPage(@Nonnull Downloader downloader) throws IOException, ExtractionException {
        final String url = getUrl();
        final Response response = downloader.get(url, getExtractorLocalization());
        doc = YoutubeParsingHelper.parseAndCheckPage(url, response);
        initialData = YoutubeParsingHelper.getInitialData(response.responseBody());
    }

    /**
     * @return the base search URL with the content country appended as the {@code gl} parameter
     */
    @Nonnull
    @Override
    public String getUrl() throws ParsingException {
        return super.getUrl() + "&gl=" + getExtractorContentCountry().getCountryCode();
    }

    /**
     * Reads the corrected query from the {@code showingResultsForRenderer} entry, if the first
     * result item is one.
     *
     * @return the text of the first run of the corrected query, or an empty string when the
     *         renderer (or any part of the path leading to the text) is absent
     */
    @Override
    public String getSearchSuggestion() {
        final JsonObject showingResultsForRenderer = child(
                objectAt(childArray(findItemSectionRenderer(), "contents"), 0),
                "showingResultsForRenderer");
        if (showingResultsForRenderer == null) {
            return "";
        }

        final JsonObject firstRun = objectAt(
                childArray(child(showingResultsForRenderer, "correctedQuery"), "runs"), 0);
        final String text = firstRun == null ? null : firstRun.getString("text");
        return text == null ? "" : text;
    }

    /**
     * Collects the result items found in the initial data.
     *
     * @return the first page of results together with the URL of the next page
     * @throws ExtractionException if the item section cannot be found in the initial data, or
     *                             (as {@code NothingFoundException}) if the page reports that
     *                             nothing was found
     */
    @Nonnull
    @Override
    public InfoItemsPage<InfoItem> getInitialPage() throws ExtractionException {
        final InfoItemsSearchCollector collector = getInfoItemSearchCollector();
        final JsonArray videos = requireItemSectionRenderer().getArray("contents");

        collectStreamsFrom(collector, videos);
        return new InfoItemsPage<>(collector, getNextPageUrl());
    }

    /**
     * @return the continuation URL derived from the initial data, or an empty string when the
     *         initial data carries no continuation
     * @throws ExtractionException if the item section cannot be found in the initial data
     */
    @Override
    public String getNextPageUrl() throws ExtractionException {
        return getNextPageUrlFrom(requireItemSectionRenderer().getArray("continuations"));
    }

    /**
     * Fetches and collects a continuation page.
     *
     * <p>The request is first made with the hardcoded client version; if that attempt fails for
     * any reason, it is repeated once with the client version obtained from the initially
     * fetched page.</p>
     *
     * @param pageUrl continuation URL, as returned by {@link #getNextPageUrl()} or by a
     *                previously returned page
     * @return the items of the requested page together with the URL of the following page
     * @throws IOException         if the retry request fails with an I/O error
     * @throws ExtractionException if {@code pageUrl} is null or empty, or if no usable
     *                             continuation data could be obtained
     */
    @Override
    public InfoItemsPage<InfoItem> getPage(String pageUrl) throws IOException, ExtractionException {
        if (pageUrl == null || pageUrl.isEmpty()) {
            throw new ExtractionException(new IllegalArgumentException("Page url is empty or null"));
        }

        final InfoItemsSearchCollector collector = getInfoItemSearchCollector();

        JsonArray ajaxJson;
        try {
            // Use the hardcoded client version first to get JSON with a structure we know
            ajaxJson = fetchContinuationJson(pageUrl, YoutubeParsingHelper.HARDCODED_CLIENT_VERSION);
        } catch (Exception e) {
            // Deliberately broad: any failure of the first attempt triggers exactly one retry.
            try {
                ajaxJson = fetchContinuationJson(pageUrl,
                        YoutubeParsingHelper.getClientVersion(initialData, doc.toString()));
            } catch (JsonParserException ignored) {
                // The failure of the first attempt is reported as the cause.
                throw new ParsingException(NEXT_STREAMS_PARSE_ERROR, e);
            }
        }

        final JsonObject itemSectionContinuation = child(child(child(objectAt(ajaxJson, 1),
                "response"), "continuationContents"), "itemSectionContinuation");
        if (itemSectionContinuation == null) {
            throw new ParsingException("Could not find itemSectionContinuation in continuation response");
        }

        collectStreamsFrom(collector, itemSectionContinuation.getArray("contents"));

        return new InfoItemsPage<>(collector,
                getNextPageUrlFrom(itemSectionContinuation.getArray("continuations")));
    }

    /**
     * Requests {@code pageUrl} with the YouTube client name/version headers and parses the
     * response body as a JSON array.
     *
     * @param pageUrl       continuation URL to request
     * @param clientVersion value sent in the {@code X-YouTube-Client-Version} header
     * @return the parsed response
     * @throws ParsingException    if the response body is shorter than
     *                             {@link #MIN_VALID_RESPONSE_LENGTH} characters
     * @throws JsonParserException if the response body is not a valid JSON array
     */
    private JsonArray fetchContinuationJson(String pageUrl, String clientVersion)
            throws IOException, ExtractionException, JsonParserException {
        final Map<String, List<String>> headers = new HashMap<>();
        headers.put("X-YouTube-Client-Name", Collections.singletonList("1"));
        headers.put("X-YouTube-Client-Version", Collections.singletonList(clientVersion));

        final String response = getDownloader().get(pageUrl, headers, getExtractorLocalization()).responseBody();
        if (response.length() < MIN_VALID_RESPONSE_LENGTH) { // ensure to have a valid response
            throw new ParsingException(NEXT_STREAMS_PARSE_ERROR);
        }
        return JsonParser.array().from(response);
    }

    /**
     * Resets the collector and commits one extractor per recognised renderer
     * ({@code videoRenderer}, {@code channelRenderer}, {@code playlistRenderer}) in
     * {@code videos}. Entries that are not JSON objects, or that contain none of the known
     * renderers, are skipped.
     *
     * @param collector collector to reset and fill
     * @param videos    result items; {@code null} is treated as an empty list
     * @throws NothingFoundException if an item is a {@code backgroundPromoRenderer}; the
     *                               message is the text of its first body run
     */
    private void collectStreamsFrom(InfoItemsSearchCollector collector, JsonArray videos) throws NothingFoundException {
        collector.reset();
        if (videos == null) {
            return;
        }

        final TimeAgoParser timeAgoParser = getTimeAgoParser();

        for (Object entry : videos) {
            if (!(entry instanceof JsonObject)) {
                continue;
            }
            final JsonObject item = (JsonObject) entry;

            final JsonObject promoRenderer = item.getObject("backgroundPromoRenderer");
            if (promoRenderer != null) {
                throw new NothingFoundException(promoRenderer.getObject("bodyText")
                        .getArray("runs").getObject(0).getString("text"));
            }

            final JsonObject videoRenderer = item.getObject("videoRenderer");
            if (videoRenderer != null) {
                collector.commit(new YoutubeStreamInfoItemExtractor(videoRenderer, timeAgoParser));
                continue;
            }

            final JsonObject channelRenderer = item.getObject("channelRenderer");
            if (channelRenderer != null) {
                collector.commit(new YoutubeChannelInfoItemExtractor(channelRenderer));
                continue;
            }

            final JsonObject playlistRenderer = item.getObject("playlistRenderer");
            if (playlistRenderer != null) {
                collector.commit(new YoutubePlaylistInfoItemExtractor(playlistRenderer));
            }
        }
    }

    /**
     * Builds the URL of the next page from a {@code continuations} array.
     *
     * @param continuations the {@code continuations} array of an item section; may be
     *                      {@code null}
     * @return the next page URL, or an empty string when the array is {@code null} or empty,
     *         has no {@code nextContinuationData}, or carries no continuation token
     */
    private String getNextPageUrlFrom(JsonArray continuations) throws ParsingException {
        final JsonObject nextContinuationData = child(objectAt(continuations, 0), "nextContinuationData");
        if (nextContinuationData == null) {
            return "";
        }

        final String continuation = nextContinuationData.getString("continuation");
        if (continuation == null || continuation.isEmpty()) {
            // Without a token the URL would be unusable (it would literally contain "null").
            return "";
        }

        final String clickTrackingParams = nextContinuationData.getString("clickTrackingParams");
        return getUrl() + "&pbj=1&ctoken=" + continuation + "&continuation=" + continuation
                + "&itct=" + clickTrackingParams;
    }

    /**
     * Walks the initial data down to the {@code itemSectionRenderer} of the first section of
     * the primary contents.
     *
     * @return the renderer, or {@code null} if any step of the path is missing
     */
    private JsonObject findItemSectionRenderer() {
        final JsonObject sectionListRenderer = child(child(child(child(initialData, "contents"),
                "twoColumnSearchResultsRenderer"), "primaryContents"), "sectionListRenderer");
        return child(objectAt(childArray(sectionListRenderer, "contents"), 0), "itemSectionRenderer");
    }

    /**
     * Same as {@link #findItemSectionRenderer()}, but fails instead of returning {@code null}.
     *
     * @throws ParsingException if the renderer cannot be found
     */
    private JsonObject requireItemSectionRenderer() throws ParsingException {
        final JsonObject itemSectionRenderer = findItemSectionRenderer();
        if (itemSectionRenderer == null) {
            throw new ParsingException("Could not find itemSectionRenderer in initial data");
        }
        return itemSectionRenderer;
    }

    /** Null-safe {@code parent.getObject(key)}: returns {@code null} when {@code parent} is. */
    private static JsonObject child(JsonObject parent, String key) {
        return parent == null ? null : parent.getObject(key);
    }

    /** Null-safe {@code parent.getArray(key)}: returns {@code null} when {@code parent} is. */
    private static JsonArray childArray(JsonObject parent, String key) {
        return parent == null ? null : parent.getArray(key);
    }

    /**
     * Null- and bounds-safe {@code array.getObject(index)}: returns {@code null} when
     * {@code array} is {@code null} or has no element at {@code index}.
     */
    private static JsonObject objectAt(JsonArray array, int index) {
        if (array == null || index < 0 || index >= array.size()) {
            return null;
        }
        return array.getObject(index);
    }
}
remove soundcloud and make first search test work 2018-05-26 19:15:45 +02:00			`package org.schabi.newpipe.extractor.services.youtube.extractors;`

Move stuff from extractVideoPreviewInfo() into YoutubeStreamInfoItemExtractor and partially fix search 2020-02-22 20:19:41 +01:00			`import com.grack.nanojson.JsonArray;`
			`import com.grack.nanojson.JsonObject;`
			`import com.grack.nanojson.JsonParser;`
			`import com.grack.nanojson.JsonParserException;`

remove soundcloud and make first search test work 2018-05-26 19:15:45 +02:00			`import org.jsoup.nodes.Document;`
			`import org.schabi.newpipe.extractor.InfoItem;`
			`import org.schabi.newpipe.extractor.StreamingService;`
Implement time ago parser and improve localization handling - Handle special cases for languages where the number is not shown - Rework the Downloader base implementation, allowing for more advanced things to be done - Separate the localization from the content country (just like YouTube let's the user choose both). 2019-04-28 22:03:16 +02:00			`import org.schabi.newpipe.extractor.downloader.Downloader;`
			`import org.schabi.newpipe.extractor.downloader.Response;`
remove soundcloud and make first search test work 2018-05-26 19:15:45 +02:00			`import org.schabi.newpipe.extractor.exceptions.ExtractionException;`
			`import org.schabi.newpipe.extractor.exceptions.ParsingException;`
Base Implementation: Parse the upload date of StreamInfoItems In the format '2 days ago' (in English) on a YouTube channel page. (Parser extensible to other pages.) 2019-10-02 07:02:01 +02:00			`import org.schabi.newpipe.extractor.linkhandler.SearchQueryHandler;`
Implement time ago parser and improve localization handling - Handle special cases for languages where the number is not shown - Rework the Downloader base implementation, allowing for more advanced things to be done - Separate the localization from the content country (just like YouTube let's the user choose both). 2019-04-28 22:03:16 +02:00			`import org.schabi.newpipe.extractor.localization.TimeAgoParser;`
remove soundcloud and make first search test work 2018-05-26 19:15:45 +02:00			`import org.schabi.newpipe.extractor.search.InfoItemsSearchCollector;`
			`import org.schabi.newpipe.extractor.search.SearchExtractor;`
[YouTube] Improve detection of reCAPTCHA pages 2019-10-29 06:00:29 +01:00			`import org.schabi.newpipe.extractor.services.youtube.linkHandler.YoutubeParsingHelper;`
remove soundcloud and make first search test work 2018-05-26 19:15:45 +02:00
			`import java.io.IOException;`
Implement pagination in YoutubeSearchExtractor 2020-02-24 18:24:36 +01:00			`import java.util.Collections;`
			`import java.util.HashMap;`
			`import java.util.List;`
			`import java.util.Map;`
remove soundcloud and make first search test work 2018-05-26 19:15:45 +02:00
Move stuff from extractVideoPreviewInfo() into YoutubeStreamInfoItemExtractor and partially fix search 2020-02-22 20:19:41 +01:00			`import javax.annotation.Nonnull;`

make less tests fail 2018-07-01 16:21:40 +02:00			`/*`
			`* Created by Christian Schabesberger on 22.07.2018`
			`*`
			`* Copyright (C) Christian Schabesberger 2018 <chris.schabesberger@mailbox.org>`
			`* YoutubeSearchExtractor.java is part of NewPipe.`
			`*`
			`* NewPipe is free software: you can redistribute it and/or modify`
			`* it under the terms of the GNU General Public License as published by`
			`* the Free Software Foundation, either version 3 of the License, or`
			`* (at your option) any later version.`
			`*`
			`* NewPipe is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`* GNU General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU General Public License`
			`* along with NewPipe. If not, see <http://www.gnu.org/licenses/>.`
			`*/`

remove soundcloud and make first search test work 2018-05-26 19:15:45 +02:00			`public class YoutubeSearchExtractor extends SearchExtractor {`

			`private Document doc;`
Move getInitialData() method to YouTubeParsingHelper Rename ytInitialData to initialData 2020-02-22 23:51:02 +01:00			`private JsonObject initialData;`
remove soundcloud and make first search test work 2018-05-26 19:15:45 +02:00
Implement time ago parser and improve localization handling - Handle special cases for languages where the number is not shown - Rework the Downloader base implementation, allowing for more advanced things to be done - Separate the localization from the content country (just like YouTube let's the user choose both). 2019-04-28 22:03:16 +02:00			`public YoutubeSearchExtractor(StreamingService service, SearchQueryHandler linkHandler) {`
			`super(service, linkHandler);`
remove soundcloud and make first search test work 2018-05-26 19:15:45 +02:00			`}`

			`@Override`
			`public void onFetchPage(@Nonnull Downloader downloader) throws IOException, ExtractionException {`
fix broken search results 2018-10-25 15:46:47 +02:00			`final String url = getUrl();`
Implement time ago parser and improve localization handling - Handle special cases for languages where the number is not shown - Rework the Downloader base implementation, allowing for more advanced things to be done - Separate the localization from the content country (just like YouTube let's the user choose both). 2019-04-28 22:03:16 +02:00			`final Response response = downloader.get(url, getExtractorLocalization());`
[YouTube] Improve detection of reCAPTCHA pages 2019-10-29 06:00:29 +01:00			`doc = YoutubeParsingHelper.parseAndCheckPage(url, response);`
Move getInitialData() method to YouTubeParsingHelper Rename ytInitialData to initialData 2020-02-22 23:51:02 +01:00			`initialData = YoutubeParsingHelper.getInitialData(response.responseBody());`
remove soundcloud and make first search test work 2018-05-26 19:15:45 +02:00			`}`

Implement time ago parser and improve localization handling - Handle special cases for languages where the number is not shown - Rework the Downloader base implementation, allowing for more advanced things to be done - Separate the localization from the content country (just like YouTube let's the user choose both). 2019-04-28 22:03:16 +02:00			`@Nonnull`
fix broken search results 2018-10-25 15:46:47 +02:00			`@Override`
			`public String getUrl() throws ParsingException {`
Implement time ago parser and improve localization handling - Handle special cases for languages where the number is not shown - Rework the Downloader base implementation, allowing for more advanced things to be done - Separate the localization from the content country (just like YouTube let's the user choose both). 2019-04-28 22:03:16 +02:00			`return super.getUrl() + "&gl=" + getExtractorContentCountry().getCountryCode();`
fix broken search results 2018-10-25 15:46:47 +02:00			`}`

remove soundcloud and make first search test work 2018-05-26 19:15:45 +02:00			`@Override`
add support for yt content country 2018-10-06 12:22:37 +02:00			`public String getSearchSuggestion() {`
Implement pagination in YoutubeSearchExtractor 2020-02-24 18:24:36 +01:00			`JsonObject showingResultsForRenderer = initialData.getObject("contents")`
			`.getObject("twoColumnSearchResultsRenderer").getObject("primaryContents")`
			`.getObject("sectionListRenderer").getArray("contents").getObject(0)`
			`.getObject("itemSectionRenderer").getArray("contents").getObject(0)`
			`.getObject("showingResultsForRenderer");`
			`if (showingResultsForRenderer == null) {`
remove soundcloud and make first search test work 2018-05-26 19:15:45 +02:00			`return "";`
Implement pagination in YoutubeSearchExtractor 2020-02-24 18:24:36 +01:00			`} else {`
			`return showingResultsForRenderer.getObject("correctedQuery").getArray("runs")`
			`.getObject(0).getString("text");`
remove soundcloud and make first search test work 2018-05-26 19:15:45 +02:00			`}`
			`}`

			`@Nonnull`
			`@Override`
add support for yt content country 2018-10-06 12:22:37 +02:00			`public InfoItemsPage<InfoItem> getInitialPage() throws ExtractionException {`
Implement pagination in YoutubeSearchExtractor 2020-02-24 18:24:36 +01:00			`InfoItemsSearchCollector collector = getInfoItemSearchCollector();`
			`JsonArray videos = initialData.getObject("contents").getObject("twoColumnSearchResultsRenderer")`
			`.getObject("primaryContents").getObject("sectionListRenderer").getArray("contents")`
			`.getObject(0).getObject("itemSectionRenderer").getArray("contents");`

			`collectStreamsFrom(collector, videos);`
			`return new InfoItemsPage<>(collector, getNextPageUrl());`
remove soundcloud and make first search test work 2018-05-26 19:15:45 +02:00			`}`

			`@Override`
add tests for searchextractor 2018-05-27 19:57:52 +02:00			`public String getNextPageUrl() throws ExtractionException {`
Implement pagination in YoutubeSearchExtractor 2020-02-24 18:24:36 +01:00			`return getNextPageUrlFrom(initialData.getObject("contents").getObject("twoColumnSearchResultsRenderer")`
			`.getObject("primaryContents").getObject("sectionListRenderer").getArray("contents")`
			`.getObject(0).getObject("itemSectionRenderer").getArray("continuations"));`
remove soundcloud and make first search test work 2018-05-26 19:15:45 +02:00			`}`

			`@Override`
			`public InfoItemsPage<InfoItem> getPage(String pageUrl) throws IOException, ExtractionException {`
Implement pagination in YoutubeSearchExtractor 2020-02-24 18:24:36 +01:00			`if (pageUrl == null \|\| pageUrl.isEmpty()) {`
			`throw new ExtractionException(new IllegalArgumentException("Page url is empty or null"));`
			`}`
remove soundcloud and make first search test work 2018-05-26 19:15:45 +02:00
Implement pagination in YoutubeSearchExtractor 2020-02-24 18:24:36 +01:00			`InfoItemsSearchCollector collector = getInfoItemSearchCollector();`
			`JsonArray ajaxJson;`
Get client version dynamically in YouTubeSearchExtractor 2020-02-24 20:02:45 +01:00
			`Map<String, List<String>> headers = new HashMap<>();`
			`headers.put("X-YouTube-Client-Name", Collections.singletonList("1"));`

Implement pagination in YoutubeSearchExtractor 2020-02-24 18:24:36 +01:00			`try {`
Get client version dynamically in YouTubeSearchExtractor 2020-02-24 20:02:45 +01:00			`// Use the hardcoded client version first to get JSON with a structure we know`
			`headers.put("X-YouTube-Client-Version",`
			`Collections.singletonList(YoutubeParsingHelper.HARDCODED_CLIENT_VERSION));`
Implement pagination in YoutubeSearchExtractor 2020-02-24 18:24:36 +01:00			`final String response = getDownloader().get(pageUrl, headers, getExtractorLocalization()).responseBody();`
Get client version dynamically in YouTubeSearchExtractor 2020-02-24 20:02:45 +01:00			`if (response.length() < 50) { // ensure to have a valid response`
			`throw new ParsingException("Could not parse json data for next streams");`
			`}`
Implement pagination in YoutubeSearchExtractor 2020-02-24 18:24:36 +01:00			`ajaxJson = JsonParser.array().from(response);`
Get client version dynamically in YouTubeSearchExtractor 2020-02-24 20:02:45 +01:00			`} catch (Exception e) {`
			`try {`
			`headers.put("X-YouTube-Client-Version",`
			`Collections.singletonList(YoutubeParsingHelper.getClientVersion(initialData, doc.toString())));`
			`final String response = getDownloader().get(pageUrl, headers, getExtractorLocalization()).responseBody();`
			`if (response.length() < 50) { // ensure to have a valid response`
			`throw new ParsingException("Could not parse json data for next streams");`
			`}`
			`ajaxJson = JsonParser.array().from(response);`
			`} catch (JsonParserException ignored) {`
			`throw new ParsingException("Could not parse json data for next streams", e);`
			`}`
Implement pagination in YoutubeSearchExtractor 2020-02-24 18:24:36 +01:00			`}`

			`JsonObject itemSectionRenderer = ajaxJson.getObject(1).getObject("response")`
			`.getObject("continuationContents").getObject("itemSectionContinuation");`
remove soundcloud and make first search test work 2018-05-26 19:15:45 +02:00
Implement pagination in YoutubeSearchExtractor 2020-02-24 18:24:36 +01:00			`collectStreamsFrom(collector, itemSectionRenderer.getArray("contents"));`
remove soundcloud and make first search test work 2018-05-26 19:15:45 +02:00
Implement pagination in YoutubeSearchExtractor 2020-02-24 18:24:36 +01:00			`return new InfoItemsPage<>(collector, getNextPageUrlFrom(itemSectionRenderer.getArray("continuations")));`
remove soundcloud and make first search test work 2018-05-26 19:15:45 +02:00			`}`

Remove useless code 2020-02-25 09:07:22 +01:00			`private void collectStreamsFrom(InfoItemsSearchCollector collector, JsonArray videos) throws NothingFoundException {`
Fix search result paging - due to the way as the InfoItemsSearchCollector are re-used, the returned item list just grows, which cause that same videos are returned. 2019-11-21 23:03:14 +01:00			`collector.reset();`
remove soundcloud and make first search test work 2018-05-26 19:15:45 +02:00
Implement time ago parser and improve localization handling - Handle special cases for languages where the number is not shown - Rework the Downloader base implementation, allowing for more advanced things to be done - Separate the localization from the content country (just like YouTube let's the user choose both). 2019-04-28 22:03:16 +02:00			`final TimeAgoParser timeAgoParser = getTimeAgoParser();`
remove soundcloud and make first search test work 2018-05-26 19:15:45 +02:00
Implement pagination in YoutubeSearchExtractor 2020-02-24 18:24:36 +01:00			`for (Object item : videos) {`
Move stuff from extractVideoPreviewInfo() into YoutubeStreamInfoItemExtractor and partially fix search 2020-02-22 20:19:41 +01:00			`if (((JsonObject) item).getObject("backgroundPromoRenderer") != null) {`
			`throw new NothingFoundException(((JsonObject) item).getObject("backgroundPromoRenderer")`
			`.getObject("bodyText").getArray("runs").getObject(0).getString("text"));`
			`} else if (((JsonObject) item).getObject("videoRenderer") != null) {`
			`collector.commit(new YoutubeStreamInfoItemExtractor(((JsonObject) item).getObject("videoRenderer"), timeAgoParser));`
			`} else if (((JsonObject) item).getObject("channelRenderer") != null) {`
Reimplement YoutubeChannelInfoItemExtractor 2020-02-23 18:27:28 +01:00			`collector.commit(new YoutubeChannelInfoItemExtractor(((JsonObject) item).getObject("channelRenderer")));`
Move stuff from extractVideoPreviewInfo() into YoutubeStreamInfoItemExtractor and partially fix search 2020-02-22 20:19:41 +01:00			`} else if (((JsonObject) item).getObject("playlistRenderer") != null) {`
Reimplement YoutubePlaylistInfoItemExtractor 2020-02-23 19:45:45 +01:00			`collector.commit(new YoutubePlaylistInfoItemExtractor(((JsonObject) item).getObject("playlistRenderer")));`
remove soundcloud and make first search test work 2018-05-26 19:15:45 +02:00			`}`
			`}`
			`}`
Move stuff from extractVideoPreviewInfo() into YoutubeStreamInfoItemExtractor and partially fix search 2020-02-22 20:19:41 +01:00
Implement pagination in YoutubeSearchExtractor 2020-02-24 18:24:36 +01:00			`private String getNextPageUrlFrom(JsonArray continuations) throws ParsingException {`
			`if (continuations == null) {`
			`return "";`
			`}`

			`JsonObject nextContinuationData = continuations.getObject(0).getObject("nextContinuationData");`
			`String continuation = nextContinuationData.getString("continuation");`
			`String clickTrackingParams = nextContinuationData.getString("clickTrackingParams");`
			`return getUrl() + "&pbj=1&ctoken=" + continuation + "&continuation=" + continuation`
			`+ "&itct=" + clickTrackingParams;`
			`}`
remove soundcloud and make first search test work 2018-05-26 19:15:45 +02:00			`}`