diff --git a/searx/engines/google.py b/searx/engines/google.py
index 8e548215..707bff8a 100644
--- a/searx/engines/google.py
+++ b/searx/engines/google.py
@@ -108,8 +108,8 @@ filter_mapping = {
 # specific xpath variables
 # ------------------------
 
-# google results are grouped into
-results_xpath = '//div[@id="search"]//div[contains(@class, "g ")]'
+# google results are grouped into
+results_xpath = '//div[contains(@class, "jtfYYd")]'
 results_xpath_mobile_ui = '//div[contains(@class, "g ")]'
 
 # google *sections* are no usual *results*, we ignore them
@@ -223,6 +223,7 @@ def request(query, params):
         'oe': "utf8",
         'start': offset,
         'filter': '0',
+        'ucbcb': 1,
         **additional_parameters,
     })
 
@@ -235,6 +236,7 @@ def request(query, params):
     params['url'] = query_url
 
     logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language'))
+    params['cookies']['CONSENT'] = "YES+"
     params['headers'].update(lang_info['headers'])
     if use_mobile_ui:
         params['headers']['Accept'] = '*/*'
diff --git a/searx/engines/google_images.py b/searx/engines/google_images.py
index 8c204b29..d9ff3f82 100644
--- a/searx/engines/google_images.py
+++ b/searx/engines/google_images.py
@@ -109,6 +109,7 @@ def request(query, params):
         **lang_info['params'],
         'ie': "utf8",
         'oe': "utf8",
+        'ucbcb': 1,
         'num': 30,
     })
 
@@ -121,6 +122,7 @@ def request(query, params):
     params['url'] = query_url
 
     logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language'))
+    params['cookies']['CONSENT'] = "YES+"
     params['headers'].update(lang_info['headers'])
     params['headers']['Accept'] = (
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py
index eb074ebc..c9b23ccc 100644
--- a/searx/engines/google_news.py
+++ b/searx/engines/google_news.py
@@ -104,6 +104,7 @@ def request(query, params):
         **lang_info['params'],
         'ie': "utf8",
         'oe': "utf8",
+        'ucbcb': 1,
         'gl': lang_info['country'],
     }) + ('&ceid=%s' % ceid)  # ceid includes a ':' character which must not be urlencoded
 
@@ -111,10 +112,12 @@ def request(query, params):
     params['url'] = query_url
 
     logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language'))
+
+    params['cookies']['CONSENT'] = "YES+"
     params['headers'].update(lang_info['headers'])
     params['headers']['Accept'] = (
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
-    )
+    )
 
     return params
 
diff --git a/searx/engines/google_play_apps.py b/searx/engines/google_play_apps.py
new file mode 100644
index 00000000..304ff60a
--- /dev/null
+++ b/searx/engines/google_play_apps.py
@@ -0,0 +1,69 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""
+ Google Play Apps
+"""
+
+from urllib.parse import urlencode
+from lxml import html
+from searx.utils import (
+    eval_xpath,
+    extract_url,
+    extract_text,
+    eval_xpath_list,
+    eval_xpath_getindex,
+)
+
+about = {
+    "website": "https://play.google.com/",
+    "wikidata_id": "Q79576",
+    "use_official_api": False,
+    "require_api_key": False,
+    "results": "HTML",
+}
+
+categories = ["files", "apps"]
+search_url = "https://play.google.com/store/search?{query}&c=apps&ucbcb=1"
+
+
+def request(query, params):
+    params["url"] = search_url.format(query=urlencode({"q": query}))
+    params['cookies']['CONSENT'] = "YES+"
+
+    return params
+
+
+def response(resp):
+    results = []
+
+    dom = html.fromstring(resp.text)
+
+    if eval_xpath(dom, '//div[@class="v6DsQb"]'):
+        return []
+
+    spot = eval_xpath_getindex(dom, '//div[@class="ipRz4"]', 0, None)
+    if spot is not None:
+        url = extract_url(eval_xpath(spot, './a[@class="Qfxief"]/@href'), search_url)
+        title = extract_text(eval_xpath(spot, './/div[@class="vWM94c"]'))
+        content = extract_text(eval_xpath(spot, './/div[@class="LbQbAe"]'))
+        img = extract_text(eval_xpath(spot, './/img[@class="T75of bzqKMd"]/@src'))
+
+        results.append({"url": url, "title": title, "content": content, "img_src": img})
+
+    more = eval_xpath_list(dom, '//c-wiz[@jsrenderer="RBsfwb"]//div[@role="listitem"]', min_len=1)
+    for result in more:
+        url = extract_url(eval_xpath(result, ".//a/@href"), search_url)
+        title = extract_text(eval_xpath(result, './/span[@class="DdYX5"]'))
+        content = extract_text(eval_xpath(result, './/span[@class="wMUdtb"]'))
+        img = extract_text(
+            eval_xpath(
+                result,
+                './/img[@class="T75of stzEZd" or @class="T75of etjhNc Q8CSx "]/@src',
+            )
+        )
+
+        results.append({"url": url, "title": title, "content": content, "img_src": img})
+
+    for suggestion in eval_xpath_list(dom, '//c-wiz[@jsrenderer="qyd4Kb"]//div[@class="ULeU3b neq64b"]'):
+        results.append({"suggestion": extract_text(eval_xpath(suggestion, './/div[@class="Epkrse "]'))})
+
+    return results
diff --git a/searx/engines/google_scholar.py b/searx/engines/google_scholar.py
index 960219aa..307380ff 100644
--- a/searx/engines/google_scholar.py
+++ b/searx/engines/google_scholar.py
@@ -85,13 +85,13 @@ def request(query, params):
     # subdomain is: scholar.google.xy
     lang_info['subdomain'] = lang_info['subdomain'].replace("www.", "scholar.")
 
-    query_url = 'https://'+ lang_info['subdomain'] + '/scholar' + "?" + urlencode({
-        'q': query,
-        **lang_info['params'],
-        'ie': "utf8",
-        'oe': "utf8",
-        'start' : offset,
-    })
+    query_url = (
+        'https://'
+        + lang_info['subdomain']
+        + '/scholar'
+        + "?"
+        + urlencode({'q': query, **lang_info['params'], 'ie': "utf8", 'oe': "utf8", 'start': offset, 'ucbcb': 1})
+    )
 
     query_url += time_range_url(params)
 
@@ -99,6 +99,7 @@ def request(query, params):
     params['url'] = query_url
 
     logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language'))
+    params['cookies']['CONSENT'] = "YES+"
     params['headers'].update(lang_info['headers'])
     params['headers']['Accept'] = (
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
diff --git a/searx/engines/google_videos.py b/searx/engines/google_videos.py
index 40c7f2b9..1c286d03 100644
--- a/searx/engines/google_videos.py
+++ b/searx/engines/google_videos.py
@@ -125,6 +125,7 @@ def request(query, params):
         'q': query,
         'tbm': "vid",
         **lang_info['params'],
+        'ucbcb': 1,
         'ie': "utf8",
         'oe': "utf8",
     })
 
@@ -138,6 +139,7 @@ def request(query, params):
     params['url'] = query_url
 
     logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language'))
+    params['cookies']['CONSENT'] = "YES+"
     params['headers'].update(lang_info['headers'])
     params['headers']['Accept'] = (
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
diff --git a/searx/engines/lingva.py b/searx/engines/lingva.py
new file mode 100644
index 00000000..bf51b705
--- /dev/null
+++ b/searx/engines/lingva.py
@@ -0,0 +1,68 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""Lingva (alternative Google Translate frontend)"""
+
+from json import loads
+
+about = {
+    "website": 'https://lingva.ml',
+    "wikidata_id": None,
+    "official_api_documentation": 'https://github.com/thedaviddelta/lingva-translate#public-apis',
+    "use_official_api": True,
+    "require_api_key": False,
+    "results": 'JSON',
+}
+
+engine_type = 'online_dictionary'
+categories = ['general']
+
+url = "https://lingva.ml"
+search_url = "{url}/api/v1/{from_lang}/{to_lang}/{query}"
+
+
+def request(_query, params):
+    params['url'] = search_url.format(
+        url=url, from_lang=params['from_lang'][1], to_lang=params['to_lang'][1], query=params['query']
+    )
+    return params
+
+
+def response(resp):
+    results = []
+
+    result = loads(resp.text)
+    info = result["info"]
+    from_to_prefix = "%s-%s " % (resp.search_params['from_lang'][1], resp.search_params['to_lang'][1])
+
+    if "typo" in info:
+        results.append({"suggestion": from_to_prefix + info["typo"]})
+
+    if 'definitions' in info:  # pylint: disable=too-many-nested-blocks
+        for definition in info['definitions']:
+            if 'list' in definition:
+                for item in definition['list']:
+                    if 'synonyms' in item:
+                        for synonym in item['synonyms']:
+                            results.append({"suggestion": from_to_prefix + synonym})
+
+    infobox = ""
+
+    for translation in info["extraTranslations"]:
+        infobox += f"<b>{translation['type']}</b>"
+
+        for word in translation["list"]:
+            infobox += f"<dl><dt>{word['word']}</dt>"
+
+            for meaning in word["meanings"]:
+                infobox += f"<dd>{meaning}</dd>"
+
+            infobox += "</dl>"
+
+    results.append(
+        {
+            'infobox': result["translation"],
+            'content': infobox,
+        }
+    )
+
+    return results
diff --git a/searx/engines/openstreetmap.py b/searx/engines/openstreetmap.py
index 541272d4..49165e09 100644
--- a/searx/engines/openstreetmap.py
+++ b/searx/engines/openstreetmap.py
@@ -30,6 +30,7 @@ about = {
 # engine dependent config
 categories = ['map']
 paging = False
+language_support = True
 
 # search-url
 base_url = 'https://nominatim.openstreetmap.org/'
@@ -141,6 +142,9 @@ def request(query, params):
     params['url'] = base_url + search_string.format(query=urlencode({'q': query}))
     params['route'] = route_re.match(query)
     params['headers']['User-Agent'] = searx_useragent()
+
+    accept_language = 'en' if params['language'] == 'all' else params['language']
+    params['headers']['Accept-Language'] = accept_language
     return params
 
 
@@ -200,7 +204,7 @@ def get_wikipedia_image(raw_value):
     return get_external_url('wikimedia_image', raw_value)
 
 
-def fetch_wikidata(nominatim_json, user_langage):
+def fetch_wikidata(nominatim_json, user_language):
     """Update nominatim_json using the result of an unique to wikidata
 
     For result in nominatim_json:
@@ -221,9 +225,10 @@ def fetch_wikidata(nominatim_json, user_langage):
                 wd_to_results.setdefault(wd_id, []).append(result)
 
     if wikidata_ids:
+        user_language = 'en' if user_language == 'all' else user_language.split('-')[0]
         wikidata_ids_str = " ".join(wikidata_ids)
         query = wikidata_image_sparql.replace('%WIKIDATA_IDS%', sparql_string_escape(wikidata_ids_str)).replace(
-            '%LANGUAGE%', sparql_string_escape(user_langage)
+            '%LANGUAGE%', sparql_string_escape(user_language)
         )
         wikidata_json = send_wikidata_query(query)
         for wd_result in wikidata_json.get('results', {}).get('bindings', {}):
@@ -238,7 +243,7 @@ def fetch_wikidata(nominatim_json, user_langage):
                 # overwrite wikipedia link
                 wikipedia_name = wd_result.get('wikipediaName', {}).get('value')
                 if wikipedia_name:
-                    result['extratags']['wikipedia'] = user_langage + ':' + wikipedia_name
+                    result['extratags']['wikipedia'] = user_language + ':' + wikipedia_name
                 # get website if not already defined
                 website = wd_result.get('website', {}).get('value')
                 if (
diff --git a/searx/engines/sjp.py b/searx/engines/sjp.py
index eff7b709..99793ddb 100644
--- a/searx/engines/sjp.py
+++ b/searx/engines/sjp.py
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
-"""Słownik Języka Polskiego (general)
+# lint: pylint
+"""Słownik Języka Polskiego
 
+Dictionary of the polish language from PWN (sjp.pwn)
 """
 
 from lxml.html import fromstring
diff --git a/searx/engines/tineye.py b/searx/engines/tineye.py
index efcfff22..d7b0d525 100644
--- a/searx/engines/tineye.py
+++ b/searx/engines/tineye.py
@@ -2,10 +2,12 @@
  Tineye - Reverse search images
 """
 
-from json import loads
 from urllib.parse import urlencode
 from datetime import datetime
 
+from flask_babel import gettext
+
+from searx import logger
 
 about = {
     "website": "https://tineye.com",
@@ -18,13 +20,29 @@ about = {
 
 categories = ['images']
 paging = True
-
 safesearch = False
 base_url = 'https://tineye.com'
 search_string = '/result_json/?page={page}&{query}'
 
+logger = logger.getChild('tineye')
+
+FORMAT_NOT_SUPPORTED = gettext(
+    "Could not read that image url. This may be due to an unsupported file"
+    " format. TinEye only supports images that are JPEG, PNG, GIF, BMP, TIFF or WebP."
+)
+"""TinEye error message"""
+
+NO_SIGNATURE_ERROR = gettext(
+    "The image is too simple to find matches. TinEye requires a basic level of"
+    " visual detail to successfully identify matches."
+)
+"""TinEye error message"""
+
+DOWNLOAD_ERROR = gettext("The image could not be downloaded.")
+"""TinEye error message"""
+
+
 def request(query, params):
     params['url'] = base_url +\
@@ -40,47 +58,147 @@ def request(query, params):
         'TE': 'trailers',
     })
 
+    query = urlencode({'url': query})
+
+    # see https://github.com/TinEye/pytineye/blob/main/pytineye/api.py
+    params['url'] = base_url + search_string.format(query=query, page=params['pageno'])
+
     return params
 
 
+def parse_tineye_match(match_json):
+    """Takes parsed JSON from the API server and turns it into a :py:obj:`dict`
+    object.
+
+    Attributes `(class Match) `__
+
+    - `image_url`, link to the result image.
+    - `domain`, domain this result was found on.
+    - `score`, a number (0 to 100) that indicates how closely the images match.
+    - `width`, image width in pixels.
+    - `height`, image height in pixels.
+    - `size`, image area in pixels.
+    - `format`, image format.
+    - `filesize`, image size in bytes.
+    - `overlay`, overlay URL.
+    - `tags`, whether this match belongs to a collection or stock domain.
+
+    - `backlinks`, a list of Backlink objects pointing to the original websites
+      and image URLs.  List items are instances of :py:obj:`dict`, (`Backlink
+      `__):
+
+      - `url`, the image URL to the image.
+      - `backlink`, the original website URL.
+      - `crawl_date`, the date the image was crawled.
+
+    """
+
+    # HINT: there exists an alternative backlink dict in the domains list / e.g.::
+    #
+    #     match_json['domains'][0]['backlinks']
+
+    backlinks = []
+    if "backlinks" in match_json:
+
+        for backlink_json in match_json["backlinks"]:
+            if not isinstance(backlink_json, dict):
+                continue
+
+            crawl_date = backlink_json.get("crawl_date")
+            if crawl_date:
+                crawl_date = datetime.fromisoformat(crawl_date[:-3])
+            else:
+                crawl_date = datetime.min
+
+            backlinks.append({
+                'url': backlink_json.get("url"),
+                'backlink': backlink_json.get("backlink"),
+                'crawl_date': crawl_date,
+                'image_name': backlink_json.get("image_name")}
+            )
+
+    return {
+        'image_url': match_json.get("image_url"),
+        'domain': match_json.get("domain"),
+        'score': match_json.get("score"),
+        'width': match_json.get("width"),
+        'height': match_json.get("height"),
+        'size': match_json.get("size"),
+        'image_format': match_json.get("format"),
+        'filesize': match_json.get("filesize"),
+        'overlay': match_json.get("overlay"),
+        'tags': match_json.get("tags"),
+        'backlinks': backlinks,
+    }
+
+
 def response(resp):
+    """Parse HTTP response from TinEye."""
     results = []
 
-    # Define wanted results
-    json_data = loads(resp.text)
-    number_of_results = json_data['num_matches']
-    for i in json_data['matches']:
-        for i in json_data['matches']:
-            image_format = i['format']
-            width = i['width']
-            height = i['height']
-            thumbnail_src = i['image_url']
-            backlink = i['domains'][0]['backlinks'][0]
+    try:
+        json_data = resp.json()
+    except Exception as exc:  # pylint: disable=broad-except
+        msg = "can't parse JSON response // %s" % exc
+        logger.error(msg)
+        json_data = {'error': msg}
 
-            url = backlink['backlink']
-            source = backlink['url']
-            title = backlink['image_name']
-            img_src = backlink['url']
+    # handle error codes from Tineye
 
-            # Get and convert published date
-            api_date = backlink['crawl_date'][:-3]
-            publishedDate = datetime.fromisoformat(api_date)
+    if resp.is_error:
+        if resp.status_code in (400, 422):
 
-            # Append results
-            results.append({
+            message = 'HTTP status: %s' % resp.status_code
+            error = json_data.get('error')
+            s_key = json_data.get('suggestions', {}).get('key', '')
+
+            if error and s_key:
+                message = "%s (%s)" % (error, s_key)
+            elif error:
+                message = error
+
+            if s_key == "Invalid image URL":
+                # test https://docs.searxng.org/_static/searxng-wordmark.svg
+                message = FORMAT_NOT_SUPPORTED
+            elif s_key == 'NO_SIGNATURE_ERROR':
+                # test https://pngimg.com/uploads/dot/dot_PNG4.png
+                message = NO_SIGNATURE_ERROR
+            elif s_key == 'Download Error':
+                # test https://notexists
+                message = DOWNLOAD_ERROR
+
+            logger.error(message)
+
+        return results
+
+    resp.raise_for_status()
+
+    # append results from matches
+    for match_json in json_data['matches']:
+
+        tineye_match = parse_tineye_match(match_json)
+        if not tineye_match['backlinks']:
+            continue
+
+        backlink = tineye_match['backlinks'][0]
+        results.append(
+            {
                 'template': 'images.html',
-                'url': url,
-                'thumbnail_src': thumbnail_src,
-                'source': source,
-                'title': title,
-                'img_src': img_src,
-                'format': image_format,
-                'widht': width,
-                'height': height,
-                'publishedDate': publishedDate,
-            })
+                'url': backlink['backlink'],
+                'thumbnail_src': tineye_match['image_url'],
+                'source': backlink['url'],
+                'title': backlink['image_name'],
+                'img_src': backlink['url'],
+                'format': tineye_match['image_format'],
+                'widht': tineye_match['width'],
+                'height': tineye_match['height'],
+                'publishedDate': backlink['crawl_date'],
+            }
+        )
 
-    # Append number of results
-    results.append({'number_of_results': number_of_results})
+    # append number of results
+    number_of_results = json_data.get('num_matches')
+    if number_of_results:
+        results.append({'number_of_results': number_of_results})
 
     return results
diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py
index c8e4cfae..60adb41c 100644
--- a/searx/engines/wikidata.py
+++ b/searx/engines/wikidata.py
@@ -64,6 +64,7 @@ WHERE
     mwapi:language "%LANGUAGE%".
     ?item wikibase:apiOutputItem mwapi:item.
   }
+  hint:Prior hint:runFirst "true".
   %WHERE%
 
@@ -92,6 +93,12 @@ WHERE {
 }
 """
 
+# see the property "dummy value" of https://www.wikidata.org/wiki/Q2013 (Wikidata)
+# hard coded here to avoid to an additional SPARQL request when the server starts
+DUMMY_ENTITY_URLS = set(
+    "http://www.wikidata.org/entity/" + wid for wid in ("Q4115189", "Q13406268", "Q15397819", "Q17339402")
+)
+
 
 # https://www.w3.org/TR/sparql11-query/#rSTRING_LITERAL1
 # https://lists.w3.org/Archives/Public/public-rdf-dawg/2011OctDec/0175.html
@@ -173,7 +180,7 @@ def response(resp):
     for result in jsonresponse.get('results', {}).get('bindings', []):
         attribute_result = {key: value['value'] for key, value in result.items()}
         entity_url = attribute_result['item']
-        if entity_url not in seen_entities:
+        if entity_url not in seen_entities and entity_url not in DUMMY_ENTITY_URLS:
             seen_entities.add(entity_url)
             results += get_results(attribute_result, attributes, language)
         else:
diff --git a/searx/engines/youtube_noapi.py b/searx/engines/youtube_noapi.py
index 68b75bc7..d9af3429 100644
--- a/searx/engines/youtube_noapi.py
+++ b/searx/engines/youtube_noapi.py
@@ -3,7 +3,6 @@
  Youtube (Videos)
 """
 
-from datetime import datetime
 from functools import reduce
 from json import loads, dumps
 from urllib.parse import quote_plus
@@ -26,7 +25,7 @@ time_range_support = True
 
 # search-url
 base_url = 'https://www.youtube.com/results'
-search_url = base_url + '?search_query={query}&page={page}'
+search_url = base_url + '?search_query={query}&page={page}&ucbcb=1'
 time_range_url = '&sp=EgII{time_range}%253D%253D'
 # the key seems to be constant
 next_page_url = 'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'
@@ -44,6 +43,7 @@ base_youtube_url = 'https://www.youtube.com/watch?v='
 
 # do search-request
 def request(query, params):
+    params['cookies']['CONSENT'] = "YES+"
     if not params['engine_data'].get('next_page_token'):
         params['url'] = search_url.format(query=quote_plus(query), page=params['pageno'])
         if params['time_range'] in time_range_dict:
@@ -57,7 +57,6 @@
         })
         params['headers']['Content-Type'] = 'application/json'
 
-    params['headers']['Cookie'] = "CONSENT=YES+cb.%s-17-p0.en+F+941;" % datetime.now().strftime("%Y%m%d")
 
     return params
diff --git a/searx/settings.yml b/searx/settings.yml
index b4f61413..99f0be23 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -787,17 +787,23 @@ engines:
     shortcut : loc
     categories : images
 
-  - name : lobste.rs
-    engine : xpath
-    search_url : https://lobste.rs/search?utf8=%E2%9C%93&q={query}&what=stories&order=relevance
-    results_xpath : //li[contains(@class, "story")]
-    url_xpath : .//a[@class="u-url"]/@href
-    title_xpath : .//a[@class="u-url"]
-    content_xpath : .//a[@class="domain"]
-    categories : it
-    shortcut : lo
-    timeout : 5.0
-    disabled: True
+  - name: lingva
+    engine: lingva
+    shortcut: lv
+    # set lingva instance in url, by default it will use the official instance
+    # url: https://lingva.ml
+
+  - name: lobste.rs
+    engine: xpath
+    search_url: https://lobste.rs/search?utf8=%E2%9C%93&q={query}&what=stories&order=relevance
+    results_xpath: //li[contains(@class, "story")]
+    url_xpath: .//a[@class="u-url"]/@href
+    title_xpath: .//a[@class="u-url"]
+    content_xpath: .//a[@class="domain"]
+    categories: it
+    shortcut: lo
+    timeout: 5.0
+    disabled: true
     about:
       website: https://lobste.rs/
       wikidata_id: Q60762874
@@ -1632,7 +1638,7 @@ engines:
       require_api_key: false
      results: HTML
 
-  - name: słownik języka polskiego
+  - name: sjp.pwn
     engine: sjp
     shortcut: sjp
     base_url: https://sjp.pwn.pl/