mirror of https://github.com/searx/searx
pick engine fixes (#3306)
* [fix] google engine: results XPath * [fix] google & youtube - set EU consent cookie This changes the previous bypass method for Google consent using ``ucbcb=1`` (6face215b8) to accept the consent using ``CONSENT=YES+``. The youtube_noapi and google have a similar API, at least for the consent[1]. Get CONSENT cookie from google request:: curl -i "https://www.google.com/search?q=time&tbm=isch" \ -A "Mozilla/5.0 (X11; Linux i686; rv:102.0) Gecko/20100101 Firefox/102.0" \ | grep -i consent ... location: https://consent.google.com/m?continue=https://www.google.com/search?q%3Dtime%26tbm%3Disch&gl=DE&m=0&pc=irp&uxe=eomtm&hl=en-US&src=1 set-cookie: CONSENT=PENDING+936; expires=Wed, 24-Jul-2024 11:26:20 GMT; path=/; domain=.google.com; Secure ... PENDING & YES [2]: Google change the way for consent about YouTube cookies agreement in EU countries. Instead of showing a popup in the website, YouTube redirects the user to a new webpage at consent.youtube.com domain ... Fix for this is to put a cookie CONSENT with YES+ value for every YouTube request [1] https://github.com/iv-org/invidious/pull/2207 [2] https://github.com/TeamNewPipe/NewPipeExtractor/issues/592 Closes: https://github.com/searxng/searxng/issues/1432 * [fix] sjp engine - convert engine name to a latin1-compliant name The engine name is not only a *name*, it is also an identifier that is used in logs, HTTP headers and more. Unicode characters in the name of an engine could cause various issues. Closes: https://github.com/searxng/searxng/issues/1544 Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> * [fix] engine tineye: handle 422 response of not supported img format Closes: https://github.com/searxng/searxng/issues/1449 Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> * bypass google consent with ucbcb=1 * [mod] Adds Lingva translate engine Add the lingva engine (which grabs data from google translate). Results from Lingva are added to the infobox results.
* openstreetmap engine: return the localized name. For example: display "Tokyo" instead of "東京都" when the language is English. * [fix] engines/openstreetmap.py typo: user_langage --> user_language Signed-off-by: Markus Heiser <markus.heiser@darmarit.de> * Wikidata engine: ignore dummy entities * Wikidata engine: minor change of the SPARQL request The engine can be slow especially when the query won't return any answer. See https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual/MWAPI#Find_articles_in_Wikipedia_speaking_about_cheese_and_see_which_Wikibase_items_they_correspond_to Co-authored-by: Léon Tiekötter <leon@tiekoetter.com> Co-authored-by: Emilien Devos <contact@emiliendevos.be> Co-authored-by: Markus Heiser <markus.heiser@darmarit.de> Co-authored-by: Emilien Devos <github@emiliendevos.be> Co-authored-by: ta <alt3753.7@gmail.com> Co-authored-by: Alexandre Flament <alex@al-f.net>
This commit is contained in:
parent
85034b49ef
commit
05fe2ee093
|
@ -108,8 +108,8 @@ filter_mapping = {
|
|||
# specific xpath variables
|
||||
# ------------------------
|
||||
|
||||
# google results are grouped into <div class="g ..." ../>
|
||||
results_xpath = '//div[@id="search"]//div[contains(@class, "g ")]'
|
||||
# google results are grouped into <div class="jtfYYd ..." ../>
|
||||
results_xpath = '//div[contains(@class, "jtfYYd")]'
|
||||
results_xpath_mobile_ui = '//div[contains(@class, "g ")]'
|
||||
|
||||
# google *sections* are no usual *results*, we ignore them
|
||||
|
@ -223,6 +223,7 @@ def request(query, params):
|
|||
'oe': "utf8",
|
||||
'start': offset,
|
||||
'filter': '0',
|
||||
'ucbcb': 1,
|
||||
**additional_parameters,
|
||||
})
|
||||
|
||||
|
@ -235,6 +236,7 @@ def request(query, params):
|
|||
params['url'] = query_url
|
||||
|
||||
logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language'))
|
||||
params['cookies']['CONSENT'] = "YES+"
|
||||
params['headers'].update(lang_info['headers'])
|
||||
if use_mobile_ui:
|
||||
params['headers']['Accept'] = '*/*'
|
||||
|
|
|
@ -109,6 +109,7 @@ def request(query, params):
|
|||
**lang_info['params'],
|
||||
'ie': "utf8",
|
||||
'oe': "utf8",
|
||||
'ucbcd': 1,
|
||||
'num': 30,
|
||||
})
|
||||
|
||||
|
@ -121,6 +122,7 @@ def request(query, params):
|
|||
params['url'] = query_url
|
||||
|
||||
logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language'))
|
||||
params['cookies']['CONSENT'] = "YES+"
|
||||
params['headers'].update(lang_info['headers'])
|
||||
params['headers']['Accept'] = (
|
||||
'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
|
||||
|
|
|
@ -104,6 +104,7 @@ def request(query, params):
|
|||
**lang_info['params'],
|
||||
'ie': "utf8",
|
||||
'oe': "utf8",
|
||||
'ucbcb': 1,
|
||||
'gl': lang_info['country'],
|
||||
}) + ('&ceid=%s' % ceid) # ceid includes a ':' character which must not be urlencoded
|
||||
|
||||
|
@ -111,10 +112,12 @@ def request(query, params):
|
|||
params['url'] = query_url
|
||||
|
||||
logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language'))
|
||||
|
||||
params['cookies']['CONSENT'] = "YES+"
|
||||
params['headers'].update(lang_info['headers'])
|
||||
params['headers']['Accept'] = (
|
||||
'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
|
||||
)
|
||||
)
|
||||
|
||||
return params
|
||||
|
||||
|
|
|
@ -0,0 +1,69 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""
|
||||
Google Play Apps
|
||||
"""
|
||||
|
||||
from urllib.parse import urlencode
|
||||
from lxml import html
|
||||
from searx.utils import (
|
||||
eval_xpath,
|
||||
extract_url,
|
||||
extract_text,
|
||||
eval_xpath_list,
|
||||
eval_xpath_getindex,
|
||||
)
|
||||
|
||||
about = {
|
||||
"website": "https://play.google.com/",
|
||||
"wikidata_id": "Q79576",
|
||||
"use_official_api": False,
|
||||
"require_api_key": False,
|
||||
"results": "HTML",
|
||||
}
|
||||
|
||||
categories = ["files", "apps"]
|
||||
search_url = "https://play.google.com/store/search?{query}&c=apps&ucbcb=1"
|
||||
|
||||
|
||||
def request(query, params):
    """Prepare the outgoing Google Play search request.

    Stores the search URL (``search_url`` with the url-encoded ``q=<query>``
    pair filled in) under ``params["url"]``, sets the ``CONSENT`` cookie to
    ``YES+`` and returns the same ``params`` dict.
    """
    encoded_query = urlencode({"q": query})
    params["url"] = search_url.format(query=encoded_query)

    # send the consent cookie along with the search request
    consent_cookie = "YES+"
    params['cookies']['CONSENT'] = consent_cookie

    return params
|
||||
|
||||
|
||||
def response(resp):
    """Turn a Google Play search page into a list of result dicts.

    Returns an empty list when the page contains a ``div.v6DsQb`` element.
    Otherwise the list holds, in this order:

    - one entry built from the first ``div.ipRz4`` element, if present,
    - one entry per ``listitem`` below ``c-wiz[jsrenderer="RBsfwb"]``,
    - one ``{"suggestion": ...}`` entry per suggestion block below
      ``c-wiz[jsrenderer="qyd4Kb"]``.

    App entries carry the keys ``url``, ``title``, ``content`` and
    ``img_src``.
    """

    def build_entry(node, href_xpath, title_xpath, content_xpath, img_xpath):
        # evaluate the four XPath expressions relative to ``node``; relative
        # links are resolved against ``search_url``
        return {
            "url": extract_url(eval_xpath(node, href_xpath), search_url),
            "title": extract_text(eval_xpath(node, title_xpath)),
            "content": extract_text(eval_xpath(node, content_xpath)),
            "img_src": extract_text(eval_xpath(node, img_xpath)),
        }

    dom = html.fromstring(resp.text)

    # NOTE(review): presumably the marker of a "no results" page -- confirm
    if eval_xpath(dom, '//div[@class="v6DsQb"]'):
        return []

    results = []

    spot = eval_xpath_getindex(dom, '//div[@class="ipRz4"]', 0, None)
    if spot is not None:
        results.append(
            build_entry(
                spot,
                './a[@class="Qfxief"]/@href',
                './/div[@class="vWM94c"]',
                './/div[@class="LbQbAe"]',
                './/img[@class="T75of bzqKMd"]/@src',
            )
        )

    list_items = eval_xpath_list(dom, '//c-wiz[@jsrenderer="RBsfwb"]//div[@role="listitem"]', min_len=1)
    for list_item in list_items:
        results.append(
            build_entry(
                list_item,
                ".//a/@href",
                './/span[@class="DdYX5"]',
                './/span[@class="wMUdtb"]',
                './/img[@class="T75of stzEZd" or @class="T75of etjhNc Q8CSx "]/@src',
            )
        )

    suggestion_nodes = eval_xpath_list(dom, '//c-wiz[@jsrenderer="qyd4Kb"]//div[@class="ULeU3b neq64b"]')
    for node in suggestion_nodes:
        suggestion_text = extract_text(eval_xpath(node, './/div[@class="Epkrse "]'))
        results.append({"suggestion": suggestion_text})

    return results
|
|
@ -85,13 +85,13 @@ def request(query, params):
|
|||
# subdomain is: scholar.google.xy
|
||||
lang_info['subdomain'] = lang_info['subdomain'].replace("www.", "scholar.")
|
||||
|
||||
query_url = 'https://'+ lang_info['subdomain'] + '/scholar' + "?" + urlencode({
|
||||
'q': query,
|
||||
**lang_info['params'],
|
||||
'ie': "utf8",
|
||||
'oe': "utf8",
|
||||
'start' : offset,
|
||||
})
|
||||
query_url = (
|
||||
'https://'
|
||||
+ lang_info['subdomain']
|
||||
+ '/scholar'
|
||||
+ "?"
|
||||
+ urlencode({'q': query, **lang_info['params'], 'ie': "utf8", 'oe': "utf8", 'start': offset, 'ucbcb': 1})
|
||||
)
|
||||
|
||||
query_url += time_range_url(params)
|
||||
|
||||
|
@ -99,6 +99,7 @@ def request(query, params):
|
|||
params['url'] = query_url
|
||||
|
||||
logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language'))
|
||||
params['cookies']['CONSENT'] = "YES+"
|
||||
params['headers'].update(lang_info['headers'])
|
||||
params['headers']['Accept'] = (
|
||||
'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
|
||||
|
|
|
@ -125,6 +125,7 @@ def request(query, params):
|
|||
'q': query,
|
||||
'tbm': "vid",
|
||||
**lang_info['params'],
|
||||
'ucbcb': 1,
|
||||
'ie': "utf8",
|
||||
'oe': "utf8",
|
||||
})
|
||||
|
@ -138,6 +139,7 @@ def request(query, params):
|
|||
params['url'] = query_url
|
||||
|
||||
logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language'))
|
||||
params['cookies']['CONSENT'] = "YES+"
|
||||
params['headers'].update(lang_info['headers'])
|
||||
params['headers']['Accept'] = (
|
||||
'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
|
||||
|
|
|
@ -0,0 +1,68 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
# lint: pylint
|
||||
"""Lingva (alternative Google Translate frontend)"""
|
||||
|
||||
from json import loads
|
||||
|
||||
about = {
|
||||
"website": 'https://lingva.ml',
|
||||
"wikidata_id": None,
|
||||
"official_api_documentation": 'https://github.com/thedaviddelta/lingva-translate#public-apis',
|
||||
"use_official_api": True,
|
||||
"require_api_key": False,
|
||||
"results": 'JSON',
|
||||
}
|
||||
|
||||
engine_type = 'online_dictionary'
|
||||
categories = ['general']
|
||||
|
||||
url = "https://lingva.ml"
|
||||
search_url = "{url}/api/v1/{from_lang}/{to_lang}/{query}"
|
||||
|
||||
|
||||
def request(_query, params):
    """Build the Lingva API URL for a translation request.

    The URL is ``search_url`` filled with the instance ``url``, the second
    item of ``params['from_lang']`` and ``params['to_lang']``, and the text
    to translate taken from ``params['query']``.  ``_query`` is not used.

    The text becomes one path segment of the URL, so it is percent-encoded
    with no "safe" characters: an unescaped ``/``, ``?``, ``#`` or ``%``
    in the text would otherwise change the path or cut the URL short.

    Returns the updated ``params`` dict.
    """
    from urllib.parse import quote  # pylint: disable=import-outside-toplevel

    params['url'] = search_url.format(
        url=url,
        from_lang=params['from_lang'][1],
        to_lang=params['to_lang'][1],
        query=quote(params['query'], safe=''),
    )
    return params
|
||||
|
||||
|
||||
def response(resp):
    """Convert a Lingva API reply into suggestions and one infobox result.

    ``resp.text`` is parsed as JSON.  The reply must contain ``translation``;
    the ``info`` object and each of its members (``typo``, ``definitions``,
    ``extraTranslations``) are treated as optional, so a reply that only
    carries the translation still yields an infobox instead of raising
    ``KeyError``.

    Returns a list with:

    - a ``{"suggestion": "<from>-<to> <text>"}`` entry for the typo
      correction and for every synonym found in the definitions,
    - a final ``{"infobox": <translation>, "content": <html>}`` entry whose
      content lists the extra translations grouped by type.
    """
    results = []

    result = loads(resp.text)
    # ``info`` may be missing or null -- fall back to an empty mapping
    info = result.get("info") or {}
    from_to_prefix = "%s-%s " % (resp.search_params['from_lang'][1], resp.search_params['to_lang'][1])

    typo = info.get("typo")
    if typo:
        results.append({"suggestion": from_to_prefix + typo})

    for definition in info.get('definitions') or []:
        for item in definition.get('list') or []:
            for synonym in item.get('synonyms') or []:
                results.append({"suggestion": from_to_prefix + synonym})

    # NOTE(review): the API values are interpolated into HTML unescaped --
    # confirm whether the template that renders the infobox escapes them.
    infobox_parts = []
    for translation in info.get("extraTranslations") or []:
        infobox_parts.append(f"<b>{translation['type']}</b>")

        for word in translation["list"]:
            infobox_parts.append(f"<dl><dt>{word['word']}</dt>")
            infobox_parts.extend(f"<dd>{meaning}</dd>" for meaning in word["meanings"])
            infobox_parts.append("</dl>")

    results.append(
        {
            'infobox': result["translation"],
            'content': "".join(infobox_parts),
        }
    )

    return results
|
|
@ -30,6 +30,7 @@ about = {
|
|||
# engine dependent config
|
||||
categories = ['map']
|
||||
paging = False
|
||||
language_support = True
|
||||
|
||||
# search-url
|
||||
base_url = 'https://nominatim.openstreetmap.org/'
|
||||
|
@ -141,6 +142,9 @@ def request(query, params):
|
|||
params['url'] = base_url + search_string.format(query=urlencode({'q': query}))
|
||||
params['route'] = route_re.match(query)
|
||||
params['headers']['User-Agent'] = searx_useragent()
|
||||
|
||||
accept_language = 'en' if params['language'] == 'all' else params['language']
|
||||
params['headers']['Accept-Language'] = accept_language
|
||||
return params
|
||||
|
||||
|
||||
|
@ -200,7 +204,7 @@ def get_wikipedia_image(raw_value):
|
|||
return get_external_url('wikimedia_image', raw_value)
|
||||
|
||||
|
||||
def fetch_wikidata(nominatim_json, user_langage):
|
||||
def fetch_wikidata(nominatim_json, user_language):
|
||||
"""Update nominatim_json using the result of an unique to wikidata
|
||||
|
||||
For result in nominatim_json:
|
||||
|
@ -221,9 +225,10 @@ def fetch_wikidata(nominatim_json, user_langage):
|
|||
wd_to_results.setdefault(wd_id, []).append(result)
|
||||
|
||||
if wikidata_ids:
|
||||
user_language = 'en' if user_language == 'all' else user_language.split('-')[0]
|
||||
wikidata_ids_str = " ".join(wikidata_ids)
|
||||
query = wikidata_image_sparql.replace('%WIKIDATA_IDS%', sparql_string_escape(wikidata_ids_str)).replace(
|
||||
'%LANGUAGE%', sparql_string_escape(user_langage)
|
||||
'%LANGUAGE%', sparql_string_escape(user_language)
|
||||
)
|
||||
wikidata_json = send_wikidata_query(query)
|
||||
for wd_result in wikidata_json.get('results', {}).get('bindings', {}):
|
||||
|
@ -238,7 +243,7 @@ def fetch_wikidata(nominatim_json, user_langage):
|
|||
# overwrite wikipedia link
|
||||
wikipedia_name = wd_result.get('wikipediaName', {}).get('value')
|
||||
if wikipedia_name:
|
||||
result['extratags']['wikipedia'] = user_langage + ':' + wikipedia_name
|
||||
result['extratags']['wikipedia'] = user_language + ':' + wikipedia_name
|
||||
# get website if not already defined
|
||||
website = wd_result.get('website', {}).get('value')
|
||||
if (
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
"""Słownik Języka Polskiego (general)
|
||||
# lint: pylint
|
||||
"""Słownik Języka Polskiego
|
||||
|
||||
Dictionary of the polish language from PWN (sjp.pwn)
|
||||
"""
|
||||
|
||||
from lxml.html import fromstring
|
||||
|
|
|
@ -2,10 +2,12 @@
|
|||
Tineye - Reverse search images
|
||||
"""
|
||||
|
||||
from json import loads
|
||||
from urllib.parse import urlencode
|
||||
|
||||
from datetime import datetime
|
||||
from flask_babel import gettext
|
||||
|
||||
from searx import logger
|
||||
|
||||
about = {
|
||||
"website": "https://tineye.com",
|
||||
|
@ -18,13 +20,29 @@ about = {
|
|||
|
||||
categories = ['images']
|
||||
paging = True
|
||||
|
||||
safesearch = False
|
||||
|
||||
|
||||
base_url = 'https://tineye.com'
|
||||
search_string = '/result_json/?page={page}&{query}'
|
||||
|
||||
logger = logger.getChild('tineye')
|
||||
|
||||
FORMAT_NOT_SUPPORTED = gettext(
|
||||
"Could not read that image url. This may be due to an unsupported file"
|
||||
" format. TinEye only supports images that are JPEG, PNG, GIF, BMP, TIFF or WebP."
|
||||
)
|
||||
"""TinEye error message"""
|
||||
|
||||
NO_SIGNATURE_ERROR = gettext(
|
||||
"The image is too simple to find matches. TinEye requires a basic level of"
|
||||
" visual detail to successfully identify matches."
|
||||
)
|
||||
"""TinEye error message"""
|
||||
|
||||
DOWNLOAD_ERROR = gettext("The image could not be downloaded.")
|
||||
"""TinEye error message"""
|
||||
|
||||
|
||||
def request(query, params):
|
||||
params['url'] = base_url +\
|
||||
|
@ -40,47 +58,147 @@ def request(query, params):
|
|||
'TE': 'trailers',
|
||||
})
|
||||
|
||||
query = urlencode({'url': query})
|
||||
|
||||
# see https://github.com/TinEye/pytineye/blob/main/pytineye/api.py
|
||||
params['url'] = base_url + search_string.format(query=query, page=params['pageno'])
|
||||
|
||||
return params
|
||||
|
||||
|
||||
def parse_tineye_match(match_json):
    """Takes parsed JSON from the API server and turns it into a :py:obj:`dict`
    object.

    Attributes `(class Match) <https://github.com/TinEye/pytineye/blob/main/pytineye/api.py>`__

    - `image_url`, link to the result image.
    - `domain`, domain this result was found on.
    - `score`, a number (0 to 100) that indicates how closely the images match.
    - `width`, image width in pixels.
    - `height`, image height in pixels.
    - `size`, image area in pixels.
    - `image_format`, image format (read from the JSON key `format`).
    - `filesize`, image size in bytes.
    - `overlay`, overlay URL.
    - `tags`, whether this match belongs to a collection or stock domain.

    - `backlinks`, a list of Backlink objects pointing to the original websites
      and image URLs. List items are instances of :py:obj:`dict`, (`Backlink
      <https://github.com/TinEye/pytineye/blob/main/pytineye/api.py>`__):

      - `url`, the image URL to the image.
      - `backlink`, the original website URL.
      - `crawl_date`, the date the image was crawled (:py:obj:`datetime`,
        ``datetime.min`` when the JSON has no crawl date).
      - `image_name`, value of the JSON key of the same name.

    Keys missing from ``match_json`` are returned as ``None``; a missing or
    ``null`` backlinks list is returned as an empty list.  Backlink items
    that are not JSON objects are skipped.
    """

    # HINT: there exists an alternative backlink dict in the domains list / e.g.::
    #
    #     match_json['domains'][0]['backlinks']

    backlinks = []

    # ``backlinks`` can be absent or null, both mean "no backlinks"
    for backlink_json in match_json.get("backlinks") or []:
        if not isinstance(backlink_json, dict):
            continue

        crawl_date = backlink_json.get("crawl_date")
        if crawl_date:
            # NOTE(review): the last three characters are cut off before
            # parsing, presumably to get a timestamp fromisoformat() accepts
            # -- confirm against a live TinEye response.
            crawl_date = datetime.fromisoformat(crawl_date[:-3])
        else:
            crawl_date = datetime.min

        backlinks.append(
            {
                'url': backlink_json.get("url"),
                'backlink': backlink_json.get("backlink"),
                'crawl_date': crawl_date,
                'image_name': backlink_json.get("image_name"),
            }
        )

    return {
        'image_url': match_json.get("image_url"),
        'domain': match_json.get("domain"),
        'score': match_json.get("score"),
        'width': match_json.get("width"),
        'height': match_json.get("height"),
        'size': match_json.get("size"),
        'image_format': match_json.get("format"),
        'filesize': match_json.get("filesize"),
        'overlay': match_json.get("overlay"),
        'tags': match_json.get("tags"),
        'backlinks': backlinks,
    }
|
||||
|
||||
|
||||
def response(resp):
|
||||
"""Parse HTTP response from TinEye."""
|
||||
results = []
|
||||
# Define wanted results
|
||||
json_data = loads(resp.text)
|
||||
number_of_results = json_data['num_matches']
|
||||
|
||||
for i in json_data['matches']:
|
||||
for i in json_data['matches']:
|
||||
image_format = i['format']
|
||||
width = i['width']
|
||||
height = i['height']
|
||||
thumbnail_src = i['image_url']
|
||||
backlink = i['domains'][0]['backlinks'][0]
|
||||
try:
|
||||
json_data = resp.json()
|
||||
except Exception as exc: # pylint: disable=broad-except
|
||||
msg = "can't parse JSON response // %s" % exc
|
||||
logger.error(msg)
|
||||
json_data = {'error': msg}
|
||||
|
||||
url = backlink['backlink']
|
||||
source = backlink['url']
|
||||
title = backlink['image_name']
|
||||
img_src = backlink['url']
|
||||
# handle error codes from Tineye
|
||||
|
||||
# Get and convert published date
|
||||
api_date = backlink['crawl_date'][:-3]
|
||||
publishedDate = datetime.fromisoformat(api_date)
|
||||
if resp.is_error:
|
||||
if resp.status_code in (400, 422):
|
||||
|
||||
# Append results
|
||||
results.append({
|
||||
message = 'HTTP status: %s' % resp.status_code
|
||||
error = json_data.get('error')
|
||||
s_key = json_data.get('suggestions', {}).get('key', '')
|
||||
|
||||
if error and s_key:
|
||||
message = "%s (%s)" % (error, s_key)
|
||||
elif error:
|
||||
message = error
|
||||
|
||||
if s_key == "Invalid image URL":
|
||||
# test https://docs.searxng.org/_static/searxng-wordmark.svg
|
||||
message = FORMAT_NOT_SUPPORTED
|
||||
elif s_key == 'NO_SIGNATURE_ERROR':
|
||||
# test https://pngimg.com/uploads/dot/dot_PNG4.png
|
||||
message = NO_SIGNATURE_ERROR
|
||||
elif s_key == 'Download Error':
|
||||
# test https://notexists
|
||||
message = DOWNLOAD_ERROR
|
||||
|
||||
logger.error(message)
|
||||
|
||||
return results
|
||||
|
||||
resp.raise_for_status()
|
||||
|
||||
# append results from matches
|
||||
for match_json in json_data['matches']:
|
||||
|
||||
tineye_match = parse_tineye_match(match_json)
|
||||
if not tineye_match['backlinks']:
|
||||
continue
|
||||
|
||||
backlink = tineye_match['backlinks'][0]
|
||||
results.append(
|
||||
{
|
||||
'template': 'images.html',
|
||||
'url': url,
|
||||
'thumbnail_src': thumbnail_src,
|
||||
'source': source,
|
||||
'title': title,
|
||||
'img_src': img_src,
|
||||
'format': image_format,
|
||||
'widht': width,
|
||||
'height': height,
|
||||
'publishedDate': publishedDate,
|
||||
})
|
||||
'url': backlink['backlink'],
|
||||
'thumbnail_src': tineye_match['image_url'],
|
||||
'source': backlink['url'],
|
||||
'title': backlink['image_name'],
|
||||
'img_src': backlink['url'],
|
||||
'format': tineye_match['image_format'],
|
||||
'widht': tineye_match['width'],
|
||||
'height': tineye_match['height'],
|
||||
'publishedDate': backlink['crawl_date'],
|
||||
}
|
||||
)
|
||||
|
||||
# Append number of results
|
||||
results.append({'number_of_results': number_of_results})
|
||||
# append number of results
|
||||
number_of_results = json_data.get('num_matches')
|
||||
if number_of_results:
|
||||
results.append({'number_of_results': number_of_results})
|
||||
|
||||
return results
|
||||
|
|
|
@ -64,6 +64,7 @@ WHERE
|
|||
mwapi:language "%LANGUAGE%".
|
||||
?item wikibase:apiOutputItem mwapi:item.
|
||||
}
|
||||
hint:Prior hint:runFirst "true".
|
||||
|
||||
%WHERE%
|
||||
|
||||
|
@ -92,6 +93,12 @@ WHERE {
|
|||
}
|
||||
"""
|
||||
|
||||
# see the property "dummy value" of https://www.wikidata.org/wiki/Q2013 (Wikidata)
|
||||
# hard coded here to avoid an additional SPARQL request when the server starts
|
||||
DUMMY_ENTITY_URLS = set(
|
||||
"http://www.wikidata.org/entity/" + wid for wid in ("Q4115189", "Q13406268", "Q15397819", "Q17339402")
|
||||
)
|
||||
|
||||
|
||||
# https://www.w3.org/TR/sparql11-query/#rSTRING_LITERAL1
|
||||
# https://lists.w3.org/Archives/Public/public-rdf-dawg/2011OctDec/0175.html
|
||||
|
@ -173,7 +180,7 @@ def response(resp):
|
|||
for result in jsonresponse.get('results', {}).get('bindings', []):
|
||||
attribute_result = {key: value['value'] for key, value in result.items()}
|
||||
entity_url = attribute_result['item']
|
||||
if entity_url not in seen_entities:
|
||||
if entity_url not in seen_entities and entity_url not in DUMMY_ENTITY_URLS:
|
||||
seen_entities.add(entity_url)
|
||||
results += get_results(attribute_result, attributes, language)
|
||||
else:
|
||||
|
|
|
@ -3,7 +3,6 @@
|
|||
Youtube (Videos)
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from functools import reduce
|
||||
from json import loads, dumps
|
||||
from urllib.parse import quote_plus
|
||||
|
@ -26,7 +25,7 @@ time_range_support = True
|
|||
|
||||
# search-url
|
||||
base_url = 'https://www.youtube.com/results'
|
||||
search_url = base_url + '?search_query={query}&page={page}'
|
||||
search_url = base_url + '?search_query={query}&page={page}&ucbcb=1'
|
||||
time_range_url = '&sp=EgII{time_range}%253D%253D'
|
||||
# the key seems to be constant
|
||||
next_page_url = 'https://www.youtube.com/youtubei/v1/search?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'
|
||||
|
@ -44,6 +43,7 @@ base_youtube_url = 'https://www.youtube.com/watch?v='
|
|||
|
||||
# do search-request
|
||||
def request(query, params):
|
||||
params['cookies']['CONSENT'] = "YES+"
|
||||
if not params['engine_data'].get('next_page_token'):
|
||||
params['url'] = search_url.format(query=quote_plus(query), page=params['pageno'])
|
||||
if params['time_range'] in time_range_dict:
|
||||
|
@ -57,7 +57,6 @@ def request(query, params):
|
|||
})
|
||||
params['headers']['Content-Type'] = 'application/json'
|
||||
|
||||
params['headers']['Cookie'] = "CONSENT=YES+cb.%s-17-p0.en+F+941;" % datetime.now().strftime("%Y%m%d")
|
||||
return params
|
||||
|
||||
|
||||
|
|
|
@ -787,17 +787,23 @@ engines:
|
|||
shortcut : loc
|
||||
categories : images
|
||||
|
||||
- name : lobste.rs
|
||||
engine : xpath
|
||||
search_url : https://lobste.rs/search?utf8=%E2%9C%93&q={query}&what=stories&order=relevance
|
||||
results_xpath : //li[contains(@class, "story")]
|
||||
url_xpath : .//a[@class="u-url"]/@href
|
||||
title_xpath : .//a[@class="u-url"]
|
||||
content_xpath : .//a[@class="domain"]
|
||||
categories : it
|
||||
shortcut : lo
|
||||
timeout : 5.0
|
||||
disabled: True
|
||||
- name: lingva
|
||||
engine: lingva
|
||||
shortcut: lv
|
||||
# set lingva instance in url, by default it will use the official instance
|
||||
# url: https://lingva.ml
|
||||
|
||||
- name: lobste.rs
|
||||
engine: xpath
|
||||
search_url: https://lobste.rs/search?utf8=%E2%9C%93&q={query}&what=stories&order=relevance
|
||||
results_xpath: //li[contains(@class, "story")]
|
||||
url_xpath: .//a[@class="u-url"]/@href
|
||||
title_xpath: .//a[@class="u-url"]
|
||||
content_xpath: .//a[@class="domain"]
|
||||
categories: it
|
||||
shortcut: lo
|
||||
timeout: 5.0
|
||||
disabled: true
|
||||
about:
|
||||
website: https://lobste.rs/
|
||||
wikidata_id: Q60762874
|
||||
|
@ -1632,7 +1638,7 @@ engines:
|
|||
require_api_key: false
|
||||
results: HTML
|
||||
|
||||
- name: słownik języka polskiego
|
||||
- name: sjp.pwn
|
||||
engine: sjp
|
||||
shortcut: sjp
|
||||
base_url: https://sjp.pwn.pl/
|
||||
|
|
Loading…
Reference in New Issue