diff --git a/searx/engines/google.py b/searx/engines/google.py
index 8e548215..707bff8a 100644
--- a/searx/engines/google.py
+++ b/searx/engines/google.py
@@ -108,8 +108,8 @@ filter_mapping = {
# specific xpath variables
# ------------------------
-# google results are grouped into <div class="g" ../>
-results_xpath = '//div[@id="search"]//div[contains(@class, "g ")]'
+# google results are grouped into <div class="jtfYYd" ../>
+results_xpath = '//div[contains(@class, "jtfYYd")]'
results_xpath_mobile_ui = '//div[contains(@class, "g ")]'
# google *sections* are not usual *results*, we ignore them
@@ -223,6 +223,7 @@ def request(query, params):
'oe': "utf8",
'start': offset,
'filter': '0',
+ 'ucbcb': 1,
**additional_parameters,
})
@@ -235,6 +236,7 @@ def request(query, params):
params['url'] = query_url
logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language'))
+ params['cookies']['CONSENT'] = "YES+"
params['headers'].update(lang_info['headers'])
if use_mobile_ui:
params['headers']['Accept'] = '*/*'
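
Note: every Google engine touched in this patch follows the same consent-bypass pattern: a static `CONSENT=YES+` cookie plus the `ucbcb=1` URL parameter replaces the old date-stamped cookie, so Google returns results instead of redirecting to its consent page. A minimal standalone sketch of the mechanism (httpx mirrors what searx uses internally; the URL and query here are invented for illustration, not part of the patch):

    # sketch, assuming httpx (searx's HTTP client); URL and query are invented
    import httpx

    resp = httpx.get(
        "https://www.google.com/search?q=searxng&ucbcb=1",
        cookies={"CONSENT": "YES+"},  # static value, no date stamp needed
        headers={"Accept-Language": "en-US,en;q=0.9"},
        follow_redirects=True,
    )
    # without the cookie and parameter, resp.url would typically point at
    # consent.google.com instead of the result page
    print(resp.url)
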
diff --git a/searx/engines/google_images.py b/searx/engines/google_images.py
index 8c204b29..d9ff3f82 100644
--- a/searx/engines/google_images.py
+++ b/searx/engines/google_images.py
@@ -109,6 +109,7 @@ def request(query, params):
**lang_info['params'],
'ie': "utf8",
'oe': "utf8",
+ 'ucbcb': 1,
'num': 30,
})
@@ -121,6 +122,7 @@ def request(query, params):
params['url'] = query_url
logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language'))
+ params['cookies']['CONSENT'] = "YES+"
params['headers'].update(lang_info['headers'])
params['headers']['Accept'] = (
'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py
index eb074ebc..c9b23ccc 100644
--- a/searx/engines/google_news.py
+++ b/searx/engines/google_news.py
@@ -104,6 +104,7 @@ def request(query, params):
**lang_info['params'],
'ie': "utf8",
'oe': "utf8",
+ 'ucbcb': 1,
'gl': lang_info['country'],
}) + ('&ceid=%s' % ceid) # ceid includes a ':' character which must not be urlencoded
@@ -111,10 +112,12 @@ def request(query, params):
params['url'] = query_url
logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language'))
+
+ params['cookies']['CONSENT'] = "YES+"
params['headers'].update(lang_info['headers'])
params['headers']['Accept'] = (
'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
- )
+ )
return params
diff --git a/searx/engines/google_play_apps.py b/searx/engines/google_play_apps.py
new file mode 100644
index 00000000..304ff60a
--- /dev/null
+++ b/searx/engines/google_play_apps.py
@@ -0,0 +1,69 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""
+ Google Play Apps
+"""
+
+from urllib.parse import urlencode
+from lxml import html
+from searx.utils import (
+ eval_xpath,
+ extract_url,
+ extract_text,
+ eval_xpath_list,
+ eval_xpath_getindex,
+)
+
+about = {
+ "website": "https://play.google.com/",
+ "wikidata_id": "Q79576",
+ "use_official_api": False,
+ "require_api_key": False,
+ "results": "HTML",
+}
+
+categories = ["files", "apps"]
+search_url = "https://play.google.com/store/search?{query}&c=apps&ucbcb=1"
+
+
+def request(query, params):
+ params["url"] = search_url.format(query=urlencode({"q": query}))
+ params['cookies']['CONSENT'] = "YES+"
+
+ return params
+
+
+def response(resp):
+ results = []
+
+ dom = html.fromstring(resp.text)
+
+ if eval_xpath(dom, '//div[@class="v6DsQb"]'):
+ return []
+
+ spot = eval_xpath_getindex(dom, '//div[@class="ipRz4"]', 0, None)
+ if spot is not None:
+ url = extract_url(eval_xpath(spot, './a[@class="Qfxief"]/@href'), search_url)
+ title = extract_text(eval_xpath(spot, './/div[@class="vWM94c"]'))
+ content = extract_text(eval_xpath(spot, './/div[@class="LbQbAe"]'))
+ img = extract_text(eval_xpath(spot, './/img[@class="T75of bzqKMd"]/@src'))
+
+ results.append({"url": url, "title": title, "content": content, "img_src": img})
+
+ more = eval_xpath_list(dom, '//c-wiz[@jsrenderer="RBsfwb"]//div[@role="listitem"]', min_len=1)
+ for result in more:
+ url = extract_url(eval_xpath(result, ".//a/@href"), search_url)
+ title = extract_text(eval_xpath(result, './/span[@class="DdYX5"]'))
+ content = extract_text(eval_xpath(result, './/span[@class="wMUdtb"]'))
+ img = extract_text(
+ eval_xpath(
+ result,
+ './/img[@class="T75of stzEZd" or @class="T75of etjhNc Q8CSx "]/@src',
+ )
+ )
+
+ results.append({"url": url, "title": title, "content": content, "img_src": img})
+
+ for suggestion in eval_xpath_list(dom, '//c-wiz[@jsrenderer="qyd4Kb"]//div[@class="ULeU3b neq64b"]'):
+ results.append({"suggestion": extract_text(eval_xpath(suggestion, './/div[@class="Epkrse "]'))})
+
+ return results
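
Note: the new engine follows searx's online-engine contract — request() fills `url` and `cookies` into the params dict it receives. A hypothetical smoke test (the bare dict below is a minimal stand-in for the structure searx actually passes):

    # hypothetical smoke test; the params dict is a minimal stand-in
    from searx.engines import google_play_apps

    params = {"url": None, "cookies": {}, "headers": {}}
    params = google_play_apps.request("podcast player", params)
    print(params["url"])
    # -> https://play.google.com/store/search?q=podcast+player&c=apps&ucbcb=1
    print(params["cookies"])  # -> {'CONSENT': 'YES+'}
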
diff --git a/searx/engines/google_scholar.py b/searx/engines/google_scholar.py
index 960219aa..307380ff 100644
--- a/searx/engines/google_scholar.py
+++ b/searx/engines/google_scholar.py
@@ -85,13 +85,13 @@ def request(query, params):
# subdomain is: scholar.google.xy
lang_info['subdomain'] = lang_info['subdomain'].replace("www.", "scholar.")
- query_url = 'https://'+ lang_info['subdomain'] + '/scholar' + "?" + urlencode({
- 'q': query,
- **lang_info['params'],
- 'ie': "utf8",
- 'oe': "utf8",
- 'start' : offset,
- })
+ query_url = (
+ 'https://'
+ + lang_info['subdomain']
+ + '/scholar'
+ + "?"
+ + urlencode({'q': query, **lang_info['params'], 'ie': "utf8", 'oe': "utf8", 'start': offset, 'ucbcb': 1})
+ )
query_url += time_range_url(params)
@@ -99,6 +99,7 @@ def request(query, params):
params['url'] = query_url
logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language'))
+ params['cookies']['CONSENT'] = "YES+"
params['headers'].update(lang_info['headers'])
params['headers']['Accept'] = (
'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
diff --git a/searx/engines/google_videos.py b/searx/engines/google_videos.py
index 40c7f2b9..1c286d03 100644
--- a/searx/engines/google_videos.py
+++ b/searx/engines/google_videos.py
@@ -125,6 +125,7 @@ def request(query, params):
'q': query,
'tbm': "vid",
**lang_info['params'],
+ 'ucbcb': 1,
'ie': "utf8",
'oe': "utf8",
})
@@ -138,6 +139,7 @@ def request(query, params):
params['url'] = query_url
logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language'))
+ params['cookies']['CONSENT'] = "YES+"
params['headers'].update(lang_info['headers'])
params['headers']['Accept'] = (
'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
diff --git a/searx/engines/lingva.py b/searx/engines/lingva.py
new file mode 100644
index 00000000..bf51b705
--- /dev/null
+++ b/searx/engines/lingva.py
@@ -0,0 +1,68 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""Lingva (alternative Google Translate frontend)"""
+
+from json import loads
+
+about = {
+ "website": 'https://lingva.ml',
+ "wikidata_id": None,
+ "official_api_documentation": 'https://github.com/thedaviddelta/lingva-translate#public-apis',
+ "use_official_api": True,
+ "require_api_key": False,
+ "results": 'JSON',
+}
+
+engine_type = 'online_dictionary'
+categories = ['general']
+
+url = "https://lingva.ml"
+search_url = "{url}/api/v1/{from_lang}/{to_lang}/{query}"
+
+
+def request(_query, params):
+ params['url'] = search_url.format(
+ url=url, from_lang=params['from_lang'][1], to_lang=params['to_lang'][1], query=params['query']
+ )
+ return params
+
+
+def response(resp):
+ results = []
+
+ result = loads(resp.text)
+ info = result["info"]
+ from_to_prefix = "%s-%s " % (resp.search_params['from_lang'][1], resp.search_params['to_lang'][1])
+
+ if "typo" in info:
+ results.append({"suggestion": from_to_prefix + info["typo"]})
+
+ if 'definitions' in info: # pylint: disable=too-many-nested-blocks
+ for definition in info['definitions']:
+ if 'list' in definition:
+ for item in definition['list']:
+ if 'synonyms' in item:
+ for synonym in item['synonyms']:
+ results.append({"suggestion": from_to_prefix + synonym})
+
+ infobox = ""
+
+ for translation in info["extraTranslations"]:
+        infobox += f"{translation['type']}\n"
+
+        for word in translation["list"]:
+            infobox += f"- {word['word']}\n"
+
+            for meaning in word["meanings"]:
+                infobox += f"- {meaning}\n"
+
+            infobox += "\n"
+
+ results.append(
+ {
+ 'infobox': result["translation"],
+ 'content': infobox,
+ }
+ )
+
+ return results
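
Note: response() walks a fixed JSON shape from the Lingva API — `translation` at the top level, plus `info.typo`, `info.definitions[].list[].synonyms`, and `info.extraTranslations[].list[]` with `word` and `meanings`. A sample reply for reference; only the key layout is taken from the code above, all values are invented:

    # invented sample of the reply shape consumed by response()
    sample = {
        "translation": "hola",
        "info": {
            "typo": "hello",
            "definitions": [{"list": [{"synonyms": ["hi", "hey"]}]}],
            "extraTranslations": [
                {
                    "type": "interjection",
                    "list": [{"word": "hola", "meanings": ["used as a greeting"]}],
                }
            ],
        },
    }
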
diff --git a/searx/engines/openstreetmap.py b/searx/engines/openstreetmap.py
index 541272d4..49165e09 100644
--- a/searx/engines/openstreetmap.py
+++ b/searx/engines/openstreetmap.py
@@ -30,6 +30,7 @@ about = {
# engine dependent config
categories = ['map']
paging = False
+language_support = True
# search-url
base_url = 'https://nominatim.openstreetmap.org/'
@@ -141,6 +142,9 @@ def request(query, params):
params['url'] = base_url + search_string.format(query=urlencode({'q': query}))
params['route'] = route_re.match(query)
params['headers']['User-Agent'] = searx_useragent()
+
+ accept_language = 'en' if params['language'] == 'all' else params['language']
+ params['headers']['Accept-Language'] = accept_language
return params
@@ -200,7 +204,7 @@ def get_wikipedia_image(raw_value):
return get_external_url('wikimedia_image', raw_value)
-def fetch_wikidata(nominatim_json, user_langage):
+def fetch_wikidata(nominatim_json, user_language):
"""Update nominatim_json using the result of an unique to wikidata
For result in nominatim_json:
@@ -221,9 +225,10 @@ def fetch_wikidata(nominatim_json, user_langage):
wd_to_results.setdefault(wd_id, []).append(result)
if wikidata_ids:
+ user_language = 'en' if user_language == 'all' else user_language.split('-')[0]
wikidata_ids_str = " ".join(wikidata_ids)
query = wikidata_image_sparql.replace('%WIKIDATA_IDS%', sparql_string_escape(wikidata_ids_str)).replace(
- '%LANGUAGE%', sparql_string_escape(user_langage)
+ '%LANGUAGE%', sparql_string_escape(user_language)
)
wikidata_json = send_wikidata_query(query)
for wd_result in wikidata_json.get('results', {}).get('bindings', {}):
@@ -238,7 +243,7 @@ def fetch_wikidata(nominatim_json, user_langage):
# overwrite wikipedia link
wikipedia_name = wd_result.get('wikipediaName', {}).get('value')
if wikipedia_name:
- result['extratags']['wikipedia'] = user_langage + ':' + wikipedia_name
+ result['extratags']['wikipedia'] = user_language + ':' + wikipedia_name
# get website if not already defined
website = wd_result.get('website', {}).get('value')
if (
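
Note: the language handling above normalizes in two places — request() maps the special 'all' value to 'en' before it becomes the Accept-Language header, and fetch_wikidata() additionally strips the region suffix ('de-DE' -> 'de') before substitution into the SPARQL template. A compact sketch of both rules (the helper is hypothetical, not part of the patch):

    # hypothetical helper condensing the two normalizations added above
    def normalize_language(language: str, strip_region: bool = False) -> str:
        if language == 'all':
            return 'en'
        return language.split('-')[0] if strip_region else language

    assert normalize_language('all') == 'en'                       # Accept-Language
    assert normalize_language('de-DE', strip_region=True) == 'de'  # SPARQL %LANGUAGE%
    assert normalize_language('de-DE') == 'de-DE'                  # header keeps region
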
diff --git a/searx/engines/sjp.py b/searx/engines/sjp.py
index eff7b709..99793ddb 100644
--- a/searx/engines/sjp.py
+++ b/searx/engines/sjp.py
@@ -1,6 +1,8 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
-"""Słownik Języka Polskiego (general)
+# lint: pylint
+"""Słownik Języka Polskiego
+Dictionary of the Polish language from PWN (sjp.pwn)
"""
from lxml.html import fromstring
diff --git a/searx/engines/tineye.py b/searx/engines/tineye.py
index efcfff22..d7b0d525 100644
--- a/searx/engines/tineye.py
+++ b/searx/engines/tineye.py
@@ -2,10 +2,12 @@
Tineye - Reverse search images
"""
-from json import loads
from urllib.parse import urlencode
from datetime import datetime
+from flask_babel import gettext
+
+from searx import logger
about = {
"website": "https://tineye.com",
@@ -18,13 +20,29 @@ about = {
categories = ['images']
paging = True
-
safesearch = False
base_url = 'https://tineye.com'
search_string = '/result_json/?page={page}&{query}'
+logger = logger.getChild('tineye')
+
+FORMAT_NOT_SUPPORTED = gettext(
+ "Could not read that image URL. This may be due to an unsupported file"
+ " format. TinEye only supports images that are JPEG, PNG, GIF, BMP, TIFF or WebP."
+)
+"""TinEye error message"""
+
+NO_SIGNATURE_ERROR = gettext(
+ "The image is too simple to find matches. TinEye requires a basic level of"
+ " visual detail to successfully identify matches."
+)
+"""TinEye error message"""
+
+DOWNLOAD_ERROR = gettext("The image could not be downloaded.")
+"""TinEye error message"""
+
def request(query, params):
params['url'] = base_url +\
@@ -40,47 +58,147 @@ def request(query, params):
'TE': 'trailers',
})
+ query = urlencode({'url': query})
+
+ # see https://github.com/TinEye/pytineye/blob/main/pytineye/api.py
+ params['url'] = base_url + search_string.format(query=query, page=params['pageno'])
+
return params
+def parse_tineye_match(match_json):
+ """Takes parsed JSON from the API server and turns it into a :py:obj:`dict`
+ object.
+
+ Attributes `(class Match) <https://github.com/TinEye/pytineye/blob/main/pytineye/api.py>`__
+
+ - `image_url`, link to the result image.
+ - `domain`, domain this result was found on.
+ - `score`, a number (0 to 100) that indicates how closely the images match.
+ - `width`, image width in pixels.
+ - `height`, image height in pixels.
+ - `size`, image area in pixels.
+ - `format`, image format.
+ - `filesize`, image size in bytes.
+ - `overlay`, overlay URL.
+ - `tags`, whether this match belongs to a collection or stock domain.
+
+ - `backlinks`, a list of Backlink objects pointing to the original websites
+ and image URLs. List items are instances of :py:obj:`dict`, (`Backlink
+ <https://github.com/TinEye/pytineye/blob/main/pytineye/api.py>`__):
+
+ - `url`, the image URL to the image.
+ - `backlink`, the original website URL.
+ - `crawl_date`, the date the image was crawled.
+
+ """
+
+ # HINT: there exists an alternative backlink dict in the domains list / e.g.::
+ #
+ # match_json['domains'][0]['backlinks']
+
+ backlinks = []
+ if "backlinks" in match_json:
+
+ for backlink_json in match_json["backlinks"]:
+ if not isinstance(backlink_json, dict):
+ continue
+
+ crawl_date = backlink_json.get("crawl_date")
+ if crawl_date:
+ crawl_date = datetime.fromisoformat(crawl_date[:-3])
+ else:
+ crawl_date = datetime.min
+
+ backlinks.append({
+ 'url': backlink_json.get("url"),
+ 'backlink': backlink_json.get("backlink"),
+ 'crawl_date': crawl_date,
+ 'image_name': backlink_json.get("image_name")}
+ )
+
+ return {
+ 'image_url': match_json.get("image_url"),
+ 'domain': match_json.get("domain"),
+ 'score': match_json.get("score"),
+ 'width': match_json.get("width"),
+ 'height': match_json.get("height"),
+ 'size': match_json.get("size"),
+ 'image_format': match_json.get("format"),
+ 'filesize': match_json.get("filesize"),
+ 'overlay': match_json.get("overlay"),
+ 'tags': match_json.get("tags"),
+ 'backlinks': backlinks,
+ }
+
+
def response(resp):
+ """Parse HTTP response from TinEye."""
results = []
- # Define wanted results
- json_data = loads(resp.text)
- number_of_results = json_data['num_matches']
- for i in json_data['matches']:
- for i in json_data['matches']:
- image_format = i['format']
- width = i['width']
- height = i['height']
- thumbnail_src = i['image_url']
- backlink = i['domains'][0]['backlinks'][0]
+ try:
+ json_data = resp.json()
+ except Exception as exc: # pylint: disable=broad-except
+ msg = "can't parse JSON response // %s" % exc
+ logger.error(msg)
+ json_data = {'error': msg}
- url = backlink['backlink']
- source = backlink['url']
- title = backlink['image_name']
- img_src = backlink['url']
+ # handle error codes from Tineye
- # Get and convert published date
- api_date = backlink['crawl_date'][:-3]
- publishedDate = datetime.fromisoformat(api_date)
+ if resp.is_error:
+ if resp.status_code in (400, 422):
- # Append results
- results.append({
+ message = 'HTTP status: %s' % resp.status_code
+ error = json_data.get('error')
+ s_key = json_data.get('suggestions', {}).get('key', '')
+
+ if error and s_key:
+ message = "%s (%s)" % (error, s_key)
+ elif error:
+ message = error
+
+ if s_key == "Invalid image URL":
+ # test https://docs.searxng.org/_static/searxng-wordmark.svg
+ message = FORMAT_NOT_SUPPORTED
+ elif s_key == 'NO_SIGNATURE_ERROR':
+ # test https://pngimg.com/uploads/dot/dot_PNG4.png
+ message = NO_SIGNATURE_ERROR
+ elif s_key == 'Download Error':
+ # test https://notexists
+ message = DOWNLOAD_ERROR
+
+ logger.error(message)
+
+ return results
+
+ resp.raise_for_status()
+
+ # append results from matches
+ for match_json in json_data['matches']:
+
+ tineye_match = parse_tineye_match(match_json)
+ if not tineye_match['backlinks']:
+ continue
+
+ backlink = tineye_match['backlinks'][0]
+ results.append(
+ {
'template': 'images.html',
- 'url': url,
- 'thumbnail_src': thumbnail_src,
- 'source': source,
- 'title': title,
- 'img_src': img_src,
- 'format': image_format,
- 'widht': width,
- 'height': height,
- 'publishedDate': publishedDate,
- })
+ 'url': backlink['backlink'],
+ 'thumbnail_src': tineye_match['image_url'],
+ 'source': backlink['url'],
+ 'title': backlink['image_name'],
+ 'img_src': backlink['url'],
+ 'format': tineye_match['image_format'],
+ 'width': tineye_match['width'],
+ 'height': tineye_match['height'],
+ 'publishedDate': backlink['crawl_date'],
+ }
+ )
- # Append number of results
- results.append({'number_of_results': number_of_results})
+ # append number of results
+ number_of_results = json_data.get('num_matches')
+ if number_of_results:
+ results.append({'number_of_results': number_of_results})
return results
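
Note: parse_tineye_match() flattens one API match into a plain dict and normalizes the crawl dates. A worked example — the key names follow the code above, every value is invented, and the timestamp format (trimmed by crawl_date[:-3] to something datetime.fromisoformat() accepts) is an assumption about the API:

    # invented sample match; values and timestamp format are assumptions
    match = {
        "image_url": "https://img.tineye.com/result/example.jpg",
        "domain": "example.org",
        "score": 92.5,
        "width": 800,
        "height": 600,
        "format": "JPEG",
        "backlinks": [
            {
                "url": "https://example.org/images/cat.jpg",
                "backlink": "https://example.org/cats.html",
                "crawl_date": "2022-05-01T12:00:00.123456",
                "image_name": "cat.jpg",
            }
        ],
    }

    parsed = parse_tineye_match(match)
    print(parsed["backlinks"][0]["crawl_date"])  # 2022-05-01 12:00:00.123000
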
diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py
index c8e4cfae..60adb41c 100644
--- a/searx/engines/wikidata.py
+++ b/searx/engines/wikidata.py
@@ -64,6 +64,7 @@ WHERE
mwapi:language "%LANGUAGE%".
?item wikibase:apiOutputItem mwapi:item.
}
+ hint:Prior hint:runFirst "true".
%WHERE%
@@ -92,6 +93,12 @@ WHERE {
}
"""
+# see the property "dummy value" of https://www.wikidata.org/wiki/Q2013 (Wikidata)
+# hard coded here to avoid an additional SPARQL request when the server starts
+DUMMY_ENTITY_URLS = set(
+ "http://www.wikidata.org/entity/" + wid for wid in ("Q4115189", "Q13406268", "Q15397819", "Q17339402")
+)
+
# https://www.w3.org/TR/sparql11-query/#rSTRING_LITERAL1
# https://lists.w3.org/Archives/Public/public-rdf-dawg/2011OctDec/0175.html
@@ -173,7 +180,7 @@ def response(resp):
for result in jsonresponse.get('results', {}).get('bindings', []):
attribute_result = {key: value['value'] for key, value in result.items()}
entity_url = attribute_result['item']
- if entity_url not in seen_entities:
+ if entity_url not in seen_entities and entity_url not in DUMMY_ENTITY_URLS:
seen_entities.add(entity_url)
results += get_results(attribute_result, attributes, language)
else:
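
Note: two independent wikidata fixes here — `hint:Prior hint:runFirst` is a Blazegraph optimizer hint that forces the label-service block to run first, and DUMMY_ENTITY_URLS drops Wikidata's hard-coded sandbox/dummy items (Q4115189 is the Wikidata Sandbox) from the bindings before they reach get_results(). A small illustration of the new filter (the binding dict is invented):

    # invented binding; sandbox entities are skipped before get_results()
    attribute_result = {"item": "http://www.wikidata.org/entity/Q4115189"}
    entity_url = attribute_result["item"]
    print(entity_url in DUMMY_ENTITY_URLS)  # True -> binding is ignored
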
diff --git a/searx/engines/youtube_noapi.py b/searx/engines/youtube_noapi.py
index 68b75bc7..d9af3429 100644
--- a/searx/engines/youtube_noapi.py
+++ b/searx/engines/youtube_noapi.py
@@ -3,7 +3,6 @@
Youtube (Videos)
"""
-from datetime import datetime
from functools import reduce
from json import loads, dumps
from urllib.parse import quote_plus
@@ -26,7 +25,7 @@ time_range_support = True
# search-url
base_url = 'https://www.youtube.com/results'
-search_url = base_url + '?search_query={query}&page={page}'
+search_url = base_url + '?search_query={query}&page={page}&ucbcb=1'
time_range_url = '&sp=EgII{time_range}%253D%253D'
# the key seems to be constant
next_page_url = 'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'
@@ -44,6 +43,7 @@ base_youtube_url = 'https://www.youtube.com/watch?v='
# do search-request
def request(query, params):
+ params['cookies']['CONSENT'] = "YES+"
if not params['engine_data'].get('next_page_token'):
params['url'] = search_url.format(query=quote_plus(query), page=params['pageno'])
if params['time_range'] in time_range_dict:
@@ -57,7 +57,6 @@ def request(query, params):
})
params['headers']['Content-Type'] = 'application/json'
- params['headers']['Cookie'] = "CONSENT=YES+cb.%s-17-p0.en+F+941;" % datetime.now().strftime("%Y%m%d")
return params
diff --git a/searx/settings.yml b/searx/settings.yml
index b4f61413..99f0be23 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -787,17 +787,23 @@ engines:
shortcut : loc
categories : images
- - name : lobste.rs
- engine : xpath
- search_url : https://lobste.rs/search?utf8=%E2%9C%93&q={query}&what=stories&order=relevance
- results_xpath : //li[contains(@class, "story")]
- url_xpath : .//a[@class="u-url"]/@href
- title_xpath : .//a[@class="u-url"]
- content_xpath : .//a[@class="domain"]
- categories : it
- shortcut : lo
- timeout : 5.0
- disabled: True
+ - name: lingva
+ engine: lingva
+ shortcut: lv
+ # set lingva instance in url, by default it will use the official instance
+ # url: https://lingva.ml
+
+ - name: lobste.rs
+ engine: xpath
+ search_url: https://lobste.rs/search?utf8=%E2%9C%93&q={query}&what=stories&order=relevance
+ results_xpath: //li[contains(@class, "story")]
+ url_xpath: .//a[@class="u-url"]/@href
+ title_xpath: .//a[@class="u-url"]
+ content_xpath: .//a[@class="domain"]
+ categories: it
+ shortcut: lo
+ timeout: 5.0
+ disabled: true
about:
website: https://lobste.rs/
wikidata_id: Q60762874
@@ -1632,7 +1638,7 @@ engines:
require_api_key: false
results: HTML
- - name: słownik języka polskiego
+ - name: sjp.pwn
engine: sjp
shortcut: sjp
base_url: https://sjp.pwn.pl/