[enh] add raise_for_httperror

check HTTP response:
* detect some common CAPTCHA challenges (no solving). In this case the engine is suspended for a long time.
* otherwise raise HTTPError as before

the check is now done in poolrequests.py (it was previously done in search.py).

update qwant, wikipedia, wikidata to use raise_for_httperror instead of raise_for_status
This commit is contained in:
Alexandre Flament 2020-12-09 21:23:20 +01:00
parent 033f39bff7
commit d703119d3a
11 changed files with 179 additions and 56 deletions

View File

@ -134,9 +134,9 @@ The function ``def request(query, params):`` always returns the ``params``
variable. Inside searx, the following parameters can be used to specify a search variable. Inside searx, the following parameters can be used to specify a search
request: request:
================== =========== ========================================================================== =================== =========== ==========================================================================
argument type information argument type information
================== =========== ========================================================================== =================== =========== ==========================================================================
url string requested url url string requested url
method string HTTP request method method string HTTP request method
headers set HTTP header information headers set HTTP header information
@ -145,8 +145,8 @@ cookies set HTTP cookies
verify boolean Performing SSL-Validity check verify boolean Performing SSL-Validity check
max_redirects int maximum redirects, hard limit max_redirects int maximum redirects, hard limit
soft_max_redirects int maximum redirects, soft limit. Record an error but don't stop the engine soft_max_redirects int maximum redirects, soft limit. Record an error but don't stop the engine
raise_for_status bool True by default: raise an exception if the HTTP code of response is >= 300 raise_for_httperror bool True by default: raise an exception if the HTTP code of response is >= 300
================== =========== ========================================================================== =================== =========== ==========================================================================
example code example code

View File

@ -281,7 +281,11 @@ def initialize_engines(engine_list):
load_engines(engine_list) load_engines(engine_list)
def engine_init(engine_name, init_fn): def engine_init(engine_name, init_fn):
try:
init_fn(get_engine_from_settings(engine_name)) init_fn(get_engine_from_settings(engine_name))
except Exception:
logger.exception('%s engine: Fail to initialize', engine_name)
else:
logger.debug('%s engine: Initialized', engine_name) logger.debug('%s engine: Initialized', engine_name)
for engine_name, engine in engines.items(): for engine_name, engine in engines.items():

View File

@ -14,6 +14,8 @@ from datetime import datetime
from json import loads from json import loads
from urllib.parse import urlencode from urllib.parse import urlencode
from searx.utils import html_to_text, match_language from searx.utils import html_to_text, match_language
from searx.exceptions import SearxEngineAPIException, SearxEngineCaptchaException
from searx.raise_for_httperror import raise_for_httperror
# engine dependent config # engine dependent config
@ -24,8 +26,7 @@ supported_languages_url = 'https://qwant.com/region'
category_to_keyword = {'general': 'web', category_to_keyword = {'general': 'web',
'images': 'images', 'images': 'images',
'news': 'news', 'news': 'news'}
'social media': 'social'}
# search-url # search-url
url = 'https://api.qwant.com/api/search/{keyword}?count=10&offset={offset}&f=&{query}&t={keyword}&uiv=4' url = 'https://api.qwant.com/api/search/{keyword}?count=10&offset={offset}&f=&{query}&t={keyword}&uiv=4'
@ -51,6 +52,7 @@ def request(query, params):
params['url'] += '&locale=' + language.replace('-', '_').lower() params['url'] += '&locale=' + language.replace('-', '_').lower()
params['headers']['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0' params['headers']['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0'
params['raise_for_httperror'] = False
return params return params
@ -58,8 +60,20 @@ def request(query, params):
def response(resp): def response(resp):
results = [] results = []
# According to https://www.qwant.com/js/app.js
if resp.status_code == 429:
raise SearxEngineCaptchaException()
# raise for other errors
raise_for_httperror(resp)
# load JSON result
search_results = loads(resp.text) search_results = loads(resp.text)
# check for an API error
if search_results.get('status') != 'success':
raise SearxEngineAPIException('API error ' + str(search_results.get('error', '')))
# return empty array if there are no results # return empty array if there are no results
if 'data' not in search_results: if 'data' not in search_results:
return [] return []
@ -90,15 +104,6 @@ def response(resp):
'thumbnail_src': thumbnail_src, 'thumbnail_src': thumbnail_src,
'img_src': img_src}) 'img_src': img_src})
elif category_to_keyword.get(categories[0], '') == 'social':
published_date = datetime.fromtimestamp(result['date'], None)
img_src = result.get('img', None)
results.append({'url': res_url,
'title': title,
'publishedDate': published_date,
'content': content,
'img_src': img_src})
elif category_to_keyword.get(categories[0], '') == 'news': elif category_to_keyword.get(categories[0], '') == 'news':
published_date = datetime.fromtimestamp(result['date'], None) published_date = datetime.fromtimestamp(result['date'], None)
media = result.get('media', []) media = result.get('media', [])

View File

@ -161,9 +161,6 @@ def request(query, params):
def response(resp): def response(resp):
results = [] results = []
if resp.status_code != 200:
logger.debug('SPARQL endpoint error %s', resp.content.decode())
resp.raise_for_status()
jsonresponse = loads(resp.content.decode()) jsonresponse = loads(resp.content.decode())
language = resp.search_params['language'].lower() language = resp.search_params['language'].lower()

View File

@ -14,6 +14,7 @@ from urllib.parse import quote
from json import loads from json import loads
from lxml.html import fromstring from lxml.html import fromstring
from searx.utils import match_language, searx_useragent from searx.utils import match_language, searx_useragent
from searx.raise_for_httperror import raise_for_httperror
# search-url # search-url
search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}' search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}'
@ -37,7 +38,7 @@ def request(query, params):
language=url_lang(params['language'])) language=url_lang(params['language']))
params['headers']['User-Agent'] = searx_useragent() params['headers']['User-Agent'] = searx_useragent()
params['raise_for_status'] = False params['raise_for_httperror'] = False
params['soft_max_redirects'] = 2 params['soft_max_redirects'] = 2
return params return params
@ -47,6 +48,7 @@ def request(query, params):
def response(resp): def response(resp):
if resp.status_code == 404: if resp.status_code == 404:
return [] return []
raise_for_httperror(resp)
results = [] results = []
api_result = loads(resp.text) api_result = loads(resp.text)

View File

@ -64,8 +64,33 @@ class SearxEngineAPIException(SearxEngineResponseException):
"""The website has returned an application error""" """The website has returned an application error"""
class SearxEngineCaptchaException(SearxEngineResponseException): class SearxEngineAccessDeniedException(SearxEngineResponseException):
"""The website has returned a CAPTCHA""" """The website is blocking the access"""
def __init__(self, suspended_time=24 * 3600, message='Access denied'):
super().__init__(message + ', suspended_time=' + str(suspended_time))
self.suspended_time = suspended_time
self.message = message
class SearxEngineCaptchaException(SearxEngineAccessDeniedException):
    """The website has answered with a CAPTCHA.

    Unless the caller passes another ``suspended_time`` (in seconds), the
    exception carries a default suspension of one day (``24 * 3600``).
    """

    def __init__(self, suspended_time=24 * 3600, message='CAPTCHA'):
        # Both values are forwarded by keyword to the access-denied base class.
        super().__init__(suspended_time=suspended_time, message=message)
class SearxEngineTooManyRequestsException(SearxEngineAccessDeniedException):
    """The website has answered with a "Too Many Requests" status code.

    Unless the caller passes another ``suspended_time`` (in seconds), the
    exception carries a default suspension of one hour (``3600``).
    """

    def __init__(self, suspended_time=3600, message='Too many request'):
        # Both values are forwarded by keyword to the access-denied base class.
        super().__init__(suspended_time=suspended_time, message=message)
class SearxEngineXPathException(SearxEngineResponseException): class SearxEngineXPathException(SearxEngineResponseException):

View File

@ -4,7 +4,8 @@ import logging
from json import JSONDecodeError from json import JSONDecodeError
from urllib.parse import urlparse from urllib.parse import urlparse
from requests.exceptions import RequestException from requests.exceptions import RequestException
from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException from searx.exceptions import (SearxXPathSyntaxException, SearxEngineXPathException, SearxEngineAPIException,
SearxEngineAccessDeniedException)
from searx import logger from searx import logger
@ -100,6 +101,10 @@ def get_messages(exc, filename) -> typing.Tuple:
return (exc.xpath_str, exc.message) return (exc.xpath_str, exc.message)
if isinstance(exc, SearxEngineXPathException): if isinstance(exc, SearxEngineXPathException):
return (exc.xpath_str, exc.message) return (exc.xpath_str, exc.message)
if isinstance(exc, SearxEngineAPIException):
return (str(exc.args[0]), )
if isinstance(exc, SearxEngineAccessDeniedException):
return (exc.message, )
return () return ()

View File

@ -7,6 +7,7 @@ import requests
from searx import settings from searx import settings
from searx import logger from searx import logger
from searx.raise_for_httperror import raise_for_httperror
logger = logger.getChild('poolrequests') logger = logger.getChild('poolrequests')
@ -156,6 +157,12 @@ def request(method, url, **kwargs):
if timeout is not None: if timeout is not None:
kwargs['timeout'] = timeout kwargs['timeout'] = timeout
# raise_for_error
check_for_httperror = True
if 'raise_for_httperror' in kwargs:
check_for_httperror = kwargs['raise_for_httperror']
del kwargs['raise_for_httperror']
# do request # do request
response = session.request(method=method, url=url, **kwargs) response = session.request(method=method, url=url, **kwargs)
@ -176,6 +183,10 @@ def request(method, url, **kwargs):
if hasattr(threadLocal, 'total_time'): if hasattr(threadLocal, 'total_time'):
threadLocal.total_time += time_after_request - time_before_request threadLocal.total_time += time_after_request - time_before_request
# raise an exception
if check_for_httperror:
raise_for_httperror(response)
return response return response

View File

@ -0,0 +1,66 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Raise an exception when an HTTP response is an error.
"""
from searx.exceptions import (SearxEngineCaptchaException, SearxEngineTooManyRequestsException,
SearxEngineAccessDeniedException)
def is_cloudflare_challenge(resp):
    """Return True when the response body looks like a Cloudflare challenge page.

    Args:
        resp: response object exposing ``status_code`` and ``text``.

    Returns:
        bool: True for a 429/503 response containing the JavaScript challenge
        markers, or a 403 response containing the CAPTCHA challenge token;
        False otherwise.
    """
    status = resp.status_code

    # 403: only the CAPTCHA challenge token is relevant
    if status == 403:
        return '__cf_chl_captcha_tk__=' in resp.text

    if status not in (429, 503):
        return False

    # 429 / 503: either the challenge token alone ...
    body = resp.text
    if '__cf_chl_jschl_tk__=' in body:
        return True

    # ... or all three markers of the challenge-platform script together
    script_markers = (
        '/cdn-cgi/challenge-platform/',
        'orchestrate/jsch/v1',
        'window._cf_chl_enter(',
    )
    return all(marker in body for marker in script_markers)
def is_cloudflare_firewall(resp):
    """Return True when the response is a Cloudflare firewall block page.

    The check matches a 403 response whose body contains the Cloudflare
    error code 1020 markup.
    """
    if resp.status_code != 403:
        return False
    return '<span class="cf-error-code">1020</span>' in resp.text
def raise_for_cloudflare_captcha(resp):
    """Raise when a response served by Cloudflare is a challenge or a firewall block.

    Nothing happens unless the ``Server`` header starts with ``cloudflare``.

    Raises:
        SearxEngineCaptchaException: a Cloudflare challenge page is detected.
        SearxEngineAccessDeniedException: a Cloudflare firewall page is detected.
    """
    server = resp.headers.get('Server', '')
    if not server.startswith('cloudflare'):
        return

    if is_cloudflare_challenge(resp):
        # https://support.cloudflare.com/hc/en-us/articles/200170136-Understanding-Cloudflare-Challenge-Passage-Captcha-
        # suspend for 15 days
        raise SearxEngineCaptchaException(message='Cloudflare CAPTCHA', suspended_time=3600 * 24 * 15)

    if is_cloudflare_firewall(resp):
        # suspend for 1 day
        raise SearxEngineAccessDeniedException(message='Cloudflare Firewall', suspended_time=3600 * 24)
def raise_for_recaptcha(resp):
    """Raise when a 503 response embeds a Google ReCAPTCHA.

    Raises:
        SearxEngineCaptchaException: the status code is 503 and the body
            references ``https://www.google.com/recaptcha/``; the suspension
            carried by the exception is 7 days.
    """
    if resp.status_code != 503:
        return
    if '"https://www.google.com/recaptcha/' in resp.text:
        raise SearxEngineCaptchaException(message='ReCAPTCHA', suspended_time=3600 * 24 * 7)
def raise_for_captcha(resp):
    """Run every known CAPTCHA / block detector against the response.

    The Cloudflare check runs first, then the ReCAPTCHA check; each one
    raises its own exception when it matches.
    """
    for detector in (raise_for_cloudflare_captcha, raise_for_recaptcha):
        detector(resp)
def raise_for_httperror(resp):
    """Raise an exception when the HTTP response is an error.

    Responses with a status code below 400 (or without a status code) are
    accepted silently.

    Args:
        resp (requests.Response): Response to check

    Raises:
        requests.HTTPError: raised by resp.raise_for_status()
        searx.exceptions.SearxEngineAccessDeniedException: raised when the HTTP status code is 402 or 403.
        searx.exceptions.SearxEngineTooManyRequestsException: raised when the HTTP status code is 429.
        searx.exceptions.SearxEngineCaptchaException: raised when a CAPTCHA challenge is detected.
    """
    status = resp.status_code
    if not status or status < 400:
        return

    # CAPTCHA / firewall detection comes first: it is more specific than the status code alone
    raise_for_captcha(resp)

    if status in (402, 403):
        raise SearxEngineAccessDeniedException(message='HTTP error ' + str(status),
                                               suspended_time=3600 * 24)
    if status == 429:
        raise SearxEngineTooManyRequestsException()

    # any other error status: let the response raise its own HTTP error
    resp.raise_for_status()

View File

@ -32,7 +32,8 @@ from searx.utils import gen_useragent
from searx.results import ResultContainer from searx.results import ResultContainer
from searx import logger from searx import logger
from searx.plugins import plugins from searx.plugins import plugins
from searx.exceptions import SearxEngineCaptchaException from searx.exceptions import (SearxEngineAccessDeniedException, SearxEngineCaptchaException,
SearxEngineTooManyRequestsException,)
from searx.metrology.error_recorder import record_exception, record_error from searx.metrology.error_recorder import record_exception, record_error
@ -131,6 +132,9 @@ def send_http_request(engine, request_params):
# soft_max_redirects # soft_max_redirects
soft_max_redirects = request_params.get('soft_max_redirects', max_redirects or 0) soft_max_redirects = request_params.get('soft_max_redirects', max_redirects or 0)
# raise_for_status
request_args['raise_for_httperror'] = request_params.get('raise_for_httperror', False)
# specific type of request (GET or POST) # specific type of request (GET or POST)
if request_params['method'] == 'GET': if request_params['method'] == 'GET':
req = requests_lib.get req = requests_lib.get
@ -142,10 +146,6 @@ def send_http_request(engine, request_params):
# send the request # send the request
response = req(request_params['url'], **request_args) response = req(request_params['url'], **request_args)
# check HTTP status
if request_params.get('raise_for_status'):
response.raise_for_status()
# check soft limit of the redirect count # check soft limit of the redirect count
if len(response.history) > soft_max_redirects: if len(response.history) > soft_max_redirects:
# unexpected redirect : record an error # unexpected redirect : record an error
@ -191,6 +191,7 @@ def search_one_http_request_safe(engine_name, query, request_params, result_cont
# suppose everything will be alright # suppose everything will be alright
requests_exception = False requests_exception = False
suspended_time = None
try: try:
# send requests and parse the results # send requests and parse the results
@ -240,6 +241,15 @@ def search_one_http_request_safe(engine_name, query, request_params, result_cont
elif (issubclass(e.__class__, SearxEngineCaptchaException)): elif (issubclass(e.__class__, SearxEngineCaptchaException)):
result_container.add_unresponsive_engine(engine_name, 'CAPTCHA required') result_container.add_unresponsive_engine(engine_name, 'CAPTCHA required')
logger.exception('engine {0} : CAPTCHA') logger.exception('engine {0} : CAPTCHA')
suspended_time = e.suspended_time # pylint: disable=no-member
elif (issubclass(e.__class__, SearxEngineTooManyRequestsException)):
result_container.add_unresponsive_engine(engine_name, 'too many requests')
logger.exception('engine {0} : Too many requests')
suspended_time = e.suspended_time # pylint: disable=no-member
elif (issubclass(e.__class__, SearxEngineAccessDeniedException)):
result_container.add_unresponsive_engine(engine_name, 'blocked')
logger.exception('engine {0} : Searx is blocked')
suspended_time = e.suspended_time # pylint: disable=no-member
else: else:
result_container.add_unresponsive_engine(engine_name, 'unexpected crash') result_container.add_unresponsive_engine(engine_name, 'unexpected crash')
# others errors # others errors
@ -248,16 +258,18 @@ def search_one_http_request_safe(engine_name, query, request_params, result_cont
if getattr(threading.current_thread(), '_timeout', False): if getattr(threading.current_thread(), '_timeout', False):
record_error(engine_name, 'Timeout') record_error(engine_name, 'Timeout')
# suspend or not the engine if there are HTTP errors # suspend the engine if there is an HTTP error
# or suspended_time is defined
with threading.RLock(): with threading.RLock():
if requests_exception: if requests_exception or suspended_time:
# update continuous_errors / suspend_end_time # update continuous_errors / suspend_end_time
engine.continuous_errors += 1 engine.continuous_errors += 1
engine.suspend_end_time = time() + min(settings['search']['max_ban_time_on_fail'], if suspended_time is None:
suspended_time = min(settings['search']['max_ban_time_on_fail'],
engine.continuous_errors * settings['search']['ban_time_on_fail']) engine.continuous_errors * settings['search']['ban_time_on_fail'])
engine.suspend_end_time = time() + suspended_time
else: else:
# no HTTP error (perhaps an engine error) # reset the suspend variables
# anyway, reset the suspend variables
engine.continuous_errors = 0 engine.continuous_errors = 0
engine.suspend_end_time = 0 engine.suspend_end_time = 0
@ -342,7 +354,7 @@ def default_request_params():
'cookies': {}, 'cookies': {},
'verify': True, 'verify': True,
'auth': None, 'auth': None,
'raise_for_status': True 'raise_for_httperror': True
} }

View File

@ -647,11 +647,6 @@ engines:
shortcut : qwn shortcut : qwn
categories : news categories : news
- name : qwant social
engine : qwant
shortcut : qws
categories : social media
# - name: library # - name: library
# engine: recoll # engine: recoll
# shortcut: lib # shortcut: lib
@ -817,12 +812,13 @@ engines:
# Or you can use the html non-stable engine, activated by default # Or you can use the html non-stable engine, activated by default
engine : youtube_noapi engine : youtube_noapi
- name : yggtorrent # tmp suspended: Cloudflare CAPTCHA
engine : yggtorrent #- name : yggtorrent
shortcut : ygg # engine : yggtorrent
url: https://www2.yggtorrent.si/ # shortcut : ygg
disabled : True # url: https://www2.yggtorrent.si/
timeout : 4.0 # disabled : True
# timeout : 4.0
- name : dailymotion - name : dailymotion
engine : dailymotion engine : dailymotion