From 32057850594b721a6bb19606a9c4a3c8857545d5 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Tue, 13 Jul 2021 18:16:09 +0200 Subject: [PATCH 1/6] [fix] Qwant engines - implement API v3 and add 'qwant videos' The implementation uses the Qwant API (https://api.qwant.com/v3). The API is undocumented but can be reverse engineered by reading the network log of https://www.qwant.com/ queries. This implementation is used by different qwant engines in the settings.yml:: - name: qwant categories: general ... - name: qwant news categories: news ... - name: qwant images categories: images ... - name: qwant videos categories: videos ... Signed-off-by: Markus Heiser --- searx/data/engines_languages.json | 43 ++++++ searx/engines/qwant.py | 209 ++++++++++++++++++++---------- searx/settings.yml | 20 ++- 3 files changed, 200 insertions(+), 72 deletions(-) diff --git a/searx/data/engines_languages.json b/searx/data/engines_languages.json index 438bbb6c..16e1de0e 100644 --- a/searx/data/engines_languages.json +++ b/searx/data/engines_languages.json @@ -25773,6 +25773,49 @@ "zh-CN", "zh-HK" ], + "qwant videos": [ + "bg-BG", + "ca-ES", + "cs-CZ", + "da-DK", + "de-AT", + "de-CH", + "de-DE", + "el-GR", + "en-AU", + "en-CA", + "en-GB", + "en-IE", + "en-IN", + "en-MY", + "en-NZ", + "en-US", + "es-AR", + "es-CL", + "es-ES", + "es-MX", + "et-EE", + "fi-FI", + "fr-BE", + "fr-CA", + "fr-CH", + "fr-FR", + "hu-HU", + "it-CH", + "it-IT", + "ko-KR", + "nb-NO", + "nl-BE", + "nl-NL", + "pl-PL", + "pt-BR", + "pt-PT", + "ro-RO", + "sv-SE", + "th-TH", + "zh-CN", + "zh-HK" + ], "startpage": { "af": { "alias": "afrikaans" diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py index fb525843..00ecf7e8 100644 --- a/searx/engines/qwant.py +++ b/searx/engines/qwant.py @@ -1,15 +1,42 @@ # SPDX-License-Identifier: AGPL-3.0-or-later -""" - Qwant (Web, Images, News, Social) +# lint: pylint +"""Qwant (Web, News, Images, Videos) + +This engine uses the Qwant API (https://api.qwant.com/v3). 
The API is +undocumented but can be reverse engineered by reading the network log of +https://www.qwant.com/ queries. + +This implementation is used by different qwant engines in the settings.yml:: + + - name: qwant + categories: general + ... + - name: qwant news + categories: news + ... + - name: qwant images + categories: images + ... + - name: qwant videos + categories: videos + ... + """ -from datetime import datetime +from datetime import ( + datetime, + timedelta, +) from json import loads from urllib.parse import urlencode -from searx.utils import html_to_text, match_language -from searx.exceptions import SearxEngineAPIException, SearxEngineCaptchaException + +# from searx import logger +from searx.utils import match_language +from searx.exceptions import SearxEngineAPIException from searx.network import raise_for_httperror +#logger = logger.getChild('qwant') + # about about = { "website": 'https://www.qwant.com/', @@ -25,98 +52,148 @@ categories = [] paging = True supported_languages_url = about['website'] -category_to_keyword = {'general': 'web', - 'images': 'images', - 'news': 'news'} +category_to_keyword = { + 'general': 'web', + 'news': 'news', + 'images': 'images', + 'videos': 'videos', +} # search-url -url = 'https://api.qwant.com/api/search/{keyword}?count=10&offset={offset}&f=&{query}&t={keyword}&uiv=4' +url = 'https://api.qwant.com/v3/search/{keyword}?q={query}&count={count}&offset={offset}' - -# do search-request def request(query, params): - offset = (params['pageno'] - 1) * 10 + """Qwant search request""" + keyword = category_to_keyword[categories[0]] + count = 10 # web: count must be equal to 10 - if categories[0] and categories[0] in category_to_keyword: - - params['url'] = url.format(keyword=category_to_keyword[categories[0]], - query=urlencode({'q': query}), - offset=offset) + if keyword == 'images': + count = 50 + offset = (params['pageno'] - 1) * count + # count + offset must be lower than 250 + offset = min(offset, 199) else: - 
params['url'] = url.format(keyword='web', - query=urlencode({'q': query}), - offset=offset) + offset = (params['pageno'] - 1) * count + # count + offset must be lower than 50 + offset = min(offset, 40) + + params['url'] = url.format( + keyword = keyword, + query = urlencode({'q': query}), + offset = offset, + count = count, + ) # add language tag if params['language'] != 'all': - language = match_language(params['language'], supported_languages, language_aliases) - params['url'] += '&locale=' + language.replace('-', '_').lower() + language = match_language( + params['language'], + # pylint: disable=undefined-variable + supported_languages, + language_aliases, + ) + params['url'] += '&locale=' + language.replace('-', '_') - params['headers']['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0' params['raise_for_httperror'] = False return params -# get response from search-request def response(resp): + """Get response from Qwant's search request""" + + keyword = category_to_keyword[categories[0]] results = [] - # According to https://www.qwant.com/js/app.js - if resp.status_code == 429: - raise SearxEngineCaptchaException() + # load JSON result + search_results = loads(resp.text) + data = search_results.get('data', {}) + + # check for an API error + if search_results.get('status') != 'success': + msg = ",".join(data.get('message', ['unknown', ])) + raise SearxEngineAPIException('API error::' + msg) # raise for other errors raise_for_httperror(resp) - # load JSON result - search_results = loads(resp.text) - - # check for an API error - if search_results.get('status') != 'success': - raise SearxEngineAPIException('API error ' + str(search_results.get('error', ''))) + if keyword == 'web': + # The WEB query contains a list named 'mainline'. This list can contain + # different result types (e.g. 
mainline[0]['type'] returns type of the + # result items in mainline[0]['items'] + mainline = data.get('result', {}).get('items', {}).get('mainline', {}) + else: + # Queries on News, Images and Videos do not have a list named 'mainline' + # in the response. The result items are directly in the list + # result['items']. + mainline = data.get('result', {}).get('items', []) + mainline = [ + {'type' : keyword, 'items' : mainline }, + ] # return empty array if there are no results - if 'data' not in search_results: + if not mainline: return [] - data = search_results.get('data', {}) + for row in mainline: - res = data.get('result', {}) + mainline_type = row.get('type', 'web') + if mainline_type == 'ads': + # ignore adds + continue - # parse results - for result in res.get('items', {}): + mainline_items = row.get('items', []) + for item in mainline_items: - title = html_to_text(result['title']) - res_url = result['url'] - content = html_to_text(result['desc']) + title = item['title'] + res_url = item['url'] - if category_to_keyword.get(categories[0], '') == 'web': - results.append({'title': title, - 'content': content, - 'url': res_url}) + if mainline_type == 'web': + content = item['desc'] + results.append({ + 'title': title, + 'url': res_url, + 'content': content, + }) - elif category_to_keyword.get(categories[0], '') == 'images': - thumbnail_src = result['thumbnail'] - img_src = result['media'] - results.append({'template': 'images.html', - 'url': res_url, - 'title': title, - 'content': '', - 'thumbnail_src': thumbnail_src, - 'img_src': img_src}) - - elif category_to_keyword.get(categories[0], '') == 'news': - published_date = datetime.fromtimestamp(result['date'], None) - media = result.get('media', []) - if len(media) > 0: - img_src = media[0].get('pict', {}).get('url', None) - else: + elif mainline_type == 'news': + pub_date = datetime.fromtimestamp(item['date'], None) + news_media = item.get('media', []) img_src = None - results.append({'url': res_url, - 'title': 
title, - 'publishedDate': published_date, - 'content': content, - 'img_src': img_src}) + if news_media: + img_src = news_media[0].get('pict', {}).get('url', None) + results.append({ + 'title': title, + 'url': res_url, + 'publishedDate': pub_date, + 'img_src': img_src, + }) + + elif mainline_type == 'images': + thumbnail = item['thumbnail'] + img_src = item['media'] + results.append({ + 'title': title, + 'url': res_url, + 'template': 'images.html', + 'thumbnail_src': thumbnail, + 'img_src': img_src, + }) + + elif mainline_type == 'videos': + content = item['desc'] + length = timedelta(seconds=item['duration']) + pub_date = datetime.fromtimestamp(item['date']) + thumbnail = item['thumbnail'] + + results.append({ + 'title': title, + 'url': res_url, + 'content': content, + 'publishedDate': pub_date, + 'thumbnail': thumbnail, + 'template': 'videos.html', + 'length': length, + }) return results diff --git a/searx/settings.yml b/searx/settings.yml index 564d774d..591c819d 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -988,18 +988,26 @@ engines: additional_tests: rosebud: *test_rosebud - - name : qwant images - engine : qwant - shortcut : qwi - categories : images - network: qwant - - name : qwant news engine : qwant shortcut : qwn categories : news network: qwant + - name: qwant images + engine: qwant + shortcut: qwi + categories: images + disabled: True + network: qwant + + - name: qwant videos + engine: qwant + shortcut: qwv + categories: videos + disabled: True + network: qwant + # - name: library # engine: recoll # shortcut: lib From 2b69710aef92923f7c9126c0d3da73c978a326e3 Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Thu, 15 Jul 2021 20:10:37 +0200 Subject: [PATCH 2/6] [mod] improve video results of the qwant engine Signed-off-by: Markus Heiser --- searx/engines/qwant.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py index 00ecf7e8..00b30f4d 100644 --- 
a/searx/engines/qwant.py +++ b/searx/engines/qwant.py @@ -29,6 +29,7 @@ from datetime import ( ) from json import loads from urllib.parse import urlencode +from flask_babel import gettext # from searx import logger from searx.utils import match_language @@ -100,6 +101,7 @@ def request(query, params): def response(resp): """Get response from Qwant's search request""" + # pylint: disable=too-many-locals, too-many-branches, too-many-statements keyword = category_to_keyword[categories[0]] results = [] @@ -180,11 +182,28 @@ def response(resp): }) elif mainline_type == 'videos': - content = item['desc'] + # some videos do not have a description: while quant-video + # returns an empty string, such video from a quant-web query + # miss the 'desc' key. + content = item.get('desc', '') + s, c = item.get('source',''), item.get('channel','') + if content and (s or c): + content += " // " + if s: + content += "%s: %s " % (gettext("Source"), s) + if c: + content += "//" + if c: + content += " %s: %s " % (gettext("Channel"), c) length = timedelta(seconds=item['duration']) pub_date = datetime.fromtimestamp(item['date']) thumbnail = item['thumbnail'] - + # from some locations (DE and others?) the s2 link do + # response a 'Please wait ..' 
but does not deliver the thumbnail + thumbnail = thumbnail.replace( + 'https://s2.qwant.com', + 'https://s1.qwant.com', 1 + ) results.append({ 'title': title, 'url': res_url, From 7aa94b7084c0d6a1f0e877f50dbc4be96416dfa7 Mon Sep 17 00:00:00 2001 From: Alexandre Flament Date: Fri, 16 Jul 2021 15:32:12 +0200 Subject: [PATCH 3/6] [mod] qwant engine: fix typos / minor change minor modification of commit 628b5703f3aeeed117772696f83efb344d6f337e (no functional change) --- searx/engines/qwant.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py index 00b30f4d..97e46117 100644 --- a/searx/engines/qwant.py +++ b/searx/engines/qwant.py @@ -31,12 +31,10 @@ from json import loads from urllib.parse import urlencode from flask_babel import gettext -# from searx import logger from searx.utils import match_language from searx.exceptions import SearxEngineAPIException from searx.network import raise_for_httperror -#logger = logger.getChild('qwant') # about about = { @@ -182,19 +180,18 @@ def response(resp): }) elif mainline_type == 'videos': - # some videos do not have a description: while quant-video - # returns an empty string, such video from a quant-web query + # some videos do not have a description: while qwant-video + # returns an empty string, such video from a qwant-web query # miss the 'desc' key. 
- content = item.get('desc', '') - s, c = item.get('source',''), item.get('channel','') - if content and (s or c): - content += " // " + d, s, c = item.get('desc'), item.get('source'), item.get('channel') + content_parts = [] + if d: + content_parts.append(d) if s: - content += "%s: %s " % (gettext("Source"), s) - if c: - content += "//" + content_parts.append("%s: %s " % (gettext("Source"), s)) if c: - content += " %s: %s " % (gettext("Channel"), c) + content_parts.append("%s: %s " % (gettext("Channel"), c)) + content = ' // '.join(content_parts) length = timedelta(seconds=item['duration']) pub_date = datetime.fromtimestamp(item['date']) thumbnail = item['thumbnail'] From b10c1346d78c3562167acbcec70b5b3e74ffd9ef Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sat, 24 Jul 2021 13:52:36 +0200 Subject: [PATCH 4/6] [fix] qwant engine - prevent exception on date/time value is None Has been reported in [1], error messages:: Error Error: ValueError Percentage: 0 Parameters: () File name: searx/engines/qwant.py:159 Function: response Code: pub_date = datetime.fromtimestamp(item['date'], None) Error Error: TypeError Percentage: 0 Parameters: ('an integer is required (got type NoneType)',) File name: searx/engines/qwant.py:196 Function: response Code: pub_date = datetime.fromtimestamp(item['date']) Fix timedelta from seconds to milliseconds [1], error message:: Error Error: TypeError Percentage: 0 Parameters: ('unsupported type for timedelta seconds component: NoneType',) File name: searx/engines/qwant.py:195 Function: response Code: length = timedelta(seconds=item['duration']) [1] https://github.com/searxng/searxng/issues/222 Signed-off-by: Markus Heiser --- searx/engines/qwant.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py index 97e46117..deac55cf 100644 --- a/searx/engines/qwant.py +++ b/searx/engines/qwant.py @@ -156,7 +156,10 @@ def response(resp): }) elif mainline_type == 'news': - 
pub_date = datetime.fromtimestamp(item['date'], None) + + pub_date = item['date'] + if pub_date is not None: + pub_date = datetime.fromtimestamp(pub_date) news_media = item.get('media', []) img_src = None if news_media: @@ -192,8 +195,12 @@ def response(resp): if c: content_parts.append("%s: %s " % (gettext("Channel"), c)) content = ' // '.join(content_parts) - length = timedelta(seconds=item['duration']) - pub_date = datetime.fromtimestamp(item['date']) + length = item['duration'] + if length is not None: + length = timedelta(milliseconds=length) + pub_date = item['date'] + if pub_date is not None: + pub_date = datetime.fromtimestamp(pub_date) thumbnail = item['thumbnail'] # from some locations (DE and others?) the s2 link do # response a 'Please wait ..' but does not deliver the thumbnail From 263db54aa97ed47ad601ac27648e4b64946c82bd Mon Sep 17 00:00:00 2001 From: Markus Heiser Date: Sat, 24 Jul 2021 14:45:32 +0200 Subject: [PATCH 5/6] [fix] qwant engine - prevent API locale exception on lang 'all' Has been reported in [1], error message:: Error Error: searx.exceptions.SearxEngineAPIException Percentage: 0 Parameters: ('API error::locale must be a string,locale must be one of the following values: en_gb, en_ie, en_us, en_ca, en_in, en_my, en_au, en_nz, cy_gb, gd_gb, de_de, de_ch, de_at, fr_fr, br_fr, fr_be, fr_ch, fr_ca, fr_ad, fc_ca, ec_ca, co_fr, es_es, es_ar, es_cl, es_co, es_mx, es_pe, es_ad, ca_es, ca_ad, ca_fr, eu_es, eu_fr, it_it, it_ch, pt_br, pt_pt, pt_ad, nl_be, nl_nl, pl_pl, zh_hk, zh_cn, fi_fi, bg_bg, et_ee, hu_hu, da_dk, nb_no, sv_se, ko_kr, th_th, cs_cz, ro_ro, el_gr',) File name: searx/engines/qwant.py:114 Function: response Code: raise SearxEngineAPIException('API error::' + msg) [1] https://github.com/searxng/searxng/issues/222 Signed-off-by: Markus Heiser --- searx/engines/qwant.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py index deac55cf..8d03d832 100644 --- 
a/searx/engines/qwant.py +++ b/searx/engines/qwant.py @@ -84,14 +84,16 @@ def request(query, params): ) # add language tag - if params['language'] != 'all': + if params['language'] == 'all': + params['url'] += '&locale=en_us' + else: language = match_language( params['language'], # pylint: disable=undefined-variable supported_languages, language_aliases, ) - params['url'] += '&locale=' + language.replace('-', '_') + params['url'] += '&locale=' + language.replace('-', '_').lower() params['raise_for_httperror'] = False return params @@ -144,8 +146,8 @@ def response(resp): mainline_items = row.get('items', []) for item in mainline_items: - title = item['title'] - res_url = item['url'] + title = item.get('title', None) + res_url = item.get('url', None) if mainline_type == 'web': content = item['desc'] From a0fb8ebeaf260025e4b94101918efe31b254ceae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?No=C3=A9mi=20V=C3=A1nyi?= Date: Sun, 10 Oct 2021 21:13:55 +0200 Subject: [PATCH 6/6] Fix style errors in Qwant engine --- searx/engines/qwant.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py index 8d03d832..70f69483 100644 --- a/searx/engines/qwant.py +++ b/searx/engines/qwant.py @@ -61,6 +61,7 @@ category_to_keyword = { # search-url url = 'https://api.qwant.com/v3/search/{keyword}?q={query}&count={count}&offset={offset}' + def request(query, params): """Qwant search request""" keyword = category_to_keyword[categories[0]] @@ -77,10 +78,10 @@ def request(query, params): offset = min(offset, 40) params['url'] = url.format( - keyword = keyword, - query = urlencode({'q': query}), - offset = offset, - count = count, + keyword=keyword, + query=urlencode({'q': query}), + offset=offset, + count=count, ) # add language tag @@ -129,7 +130,7 @@ def response(resp): # result['items']. 
mainline = data.get('result', {}).get('items', []) mainline = [ - {'type' : keyword, 'items' : mainline }, + {'type': keyword, 'items': mainline}, ] # return empty array if there are no results @@ -217,8 +218,8 @@ def response(resp): 'publishedDate': pub_date, 'thumbnail': thumbnail, 'template': 'videos.html', - 'length': length, - }) + 'length': length, + }) return results