diff --git a/docs/admin/settings.rst b/docs/admin/settings.rst
index 62221827..7cf055db 100644
--- a/docs/admin/settings.rst
+++ b/docs/admin/settings.rst
@@ -130,12 +130,14 @@ Global Settings
     request_timeout : 2.0 # default timeout in seconds, can be override by engine
     # max_request_timeout: 10.0 # the maximum timeout in seconds
     useragent_suffix : "" # informations like an email address to the administrator
-    pool_connections : 100 # Maximum number of allowable connections, or None for no limits. The default is 100.
-    pool_maxsize : 10 # Number of allowable keep-alive connections, or None to always allow. The default is 10.
-    enable_http2: True # See https://www.python-httpx.org/http2/
+    pool_connections : 100 # Number of different hosts
+    pool_maxsize : 10 # Number of simultaneous requests by host
     # uncomment below section if you want to use a proxy
     # proxies:
-    #   all://:
+    #   http:
+    #     - http://proxy1:8080
+    #     - http://proxy2:8080
+    #   https:
     #     - http://proxy1:8080
     #     - http://proxy2:8080
     # uncomment below section only if you have more than one network interface
@@ -143,7 +145,6 @@ Global Settings
     # source_ips:
     #   - 1.1.1.1
     #   - 1.1.1.2
-    #   - fe80::/126


 ``request_timeout`` :
@@ -156,46 +157,20 @@ Global Settings
   Suffix to the user-agent searx uses to send requests to others engines. If an
   engine wish to block you, a contact info here may be useful to avoid that.

-``keepalive_expiry``:
-  Number of seconds to keep a connection in the pool. By default 5.0 seconds.
-
-.. _httpx proxies: https://www.python-httpx.org/advanced/#http-proxying
+.. _requests proxies: https://requests.readthedocs.io/en/latest/user/advanced/#proxies
+.. _PySocks: https://pypi.org/project/PySocks/

 ``proxies`` :
-  Define one or more proxies you wish to use, see `httpx proxies`_.
+  Define one or more proxies you wish to use, see `requests proxies`_.
   If there are more than one proxy for one protocol (http, https),
   requests to the engines are distributed in a round-robin fashion.
+  - Proxy: `see `__.
+  - SOCKS proxies are also supported: `see `__
+

 ``source_ips`` :
   If you use multiple network interfaces, define from which IP the requests must
-  be made. Example:
-
-  * ``0.0.0.0`` any local IPv4 address.
-  * ``::`` any local IPv6 address.
-  * ``192.168.0.1``
-  * ``[ 192.168.0.1, 192.168.0.2 ]`` these two specific IP addresses
-  * ``fe80::60a2:1691:e5a2:ee1f``
-  * ``fe80::60a2:1691:e5a2:ee1f/126`` all IP addresses in this network.
-  * ``[ 192.168.0.1, fe80::/126 ]``
-
-``retries`` :
-  Number of retry in case of an HTTP error.
-  On each retry, searx uses an different proxy and source ip.
-
-``retry_on_http_error`` :
-  Retry request on some HTTP status code.
-
-  Example:
-
-  * ``true`` : on HTTP status code between 400 and 599.
-  * ``403`` : on HTTP status code 403.
-  * ``[403, 429]``: on HTTP status code 403 and 429.
-
-``enable_http2`` :
-  Enable by default. Set to ``False`` to disable HTTP/2.
-
-``max_redirects`` :
-  30 by default. Maximum redirect before it is an error.
+  be made. This parameter is ignored when ``proxies`` is set.


 ``locales:``
@@ -241,13 +216,6 @@ Engine settings
     api_key : 'apikey'
     disabled : True
     language : en_US
-    #enable_http: False
-    #enable_http2: False
-    #retries: 1
-    #retry_on_http_error: True # or 403 or [404, 429]
-    #max_connections: 100
-    #max_keepalive_connections: 10
-    #keepalive_expiry: 5.0
     #proxies:
     #  http:
     #    - http://proxy1:8080
@@ -302,12 +270,6 @@ Engine settings
 ``display_error_messages`` : default ``True``
   When an engine returns an error, the message is displayed on the user
   interface.
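
The round-robin behaviour described under ``proxies`` can be pictured with a
small sketch (the proxy URLs are the placeholder values from the example above;
it mirrors what ``get_proxy_cycles()`` and ``get_proxies()`` in
``searx/poolrequests.py`` further down in this patch do)::

    from itertools import cycle
    import requests

    # proxy lists as configured under outgoing.proxies in settings.yml
    proxy_settings = {
        'http': ['http://proxy1:8080', 'http://proxy2:8080'],
        'https': ['http://proxy1:8080', 'http://proxy2:8080'],
    }

    # one cycle per protocol: every request takes the next proxy in the list
    proxy_cycles = {protocol: cycle(urls) for protocol, urls in proxy_settings.items()}

    def next_proxies():
        return {protocol: next(c) for protocol, c in proxy_cycles.items()}

    # two consecutive requests are sent through proxy1 and proxy2 respectively
    requests.get('https://example.org', proxies=next_proxies(), timeout=2.0)
    requests.get('https://example.org', proxies=next_proxies(), timeout=2.0)
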
-``network``: optional - Use the network configuration from another engine. - In addition, there are two default networks: - * ``ipv4`` set ``local_addresses`` to ``0.0.0.0`` (use only IPv4 local addresses) - * ``ipv6`` set ``local_addresses`` to ``::`` (use only IPv6 local addresses) - .. note:: A few more options are possible, but they are pretty specific to some diff --git a/manage b/manage index 6ec2291b..d2087941 100755 --- a/manage +++ b/manage @@ -107,8 +107,7 @@ fi export DOCS_BUILD buildenv() { - SEARX_DEBUG=1 pyenv.cmd python utils/build_env.py 2>&1 \ - | prefix_stdout "${_Blue}BUILDENV${_creset} " + SEARX_DEBUG=1 pyenv.cmd python utils/build_env.py 2>&1 return "${PIPESTATUS[0]}" } diff --git a/requirements-dev.txt b/requirements-dev.txt index 63c19f4b..388120c6 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -17,4 +17,3 @@ sphinx-tabs==3.2.0 sphinxcontrib-programoutput==0.17 sphinx-autobuild==2021.3.14 linuxdoc==20211220 -aiounittest==1.4.1 diff --git a/requirements.txt b/requirements.txt index f18751c2..8d7382be 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,16 +1,13 @@ -certifi==2022.5.18.1 +Brotli==1.0.9 babel==2.9.1 +certifi==2022.5.18.1 flask-babel==2.0.0 flask==2.1.1 jinja2==3.1.2 +langdetect==1.0.9 lxml==4.9.0 pygments==2.8.0 python-dateutil==2.8.2 pyyaml==6.0 -httpx[http2]==0.23.0 -Brotli==1.0.9 -uvloop==0.16.0; python_version >= '3.7' -uvloop==0.14.0; python_version < '3.7' -httpx-socks[asyncio]==0.7.4 -langdetect==1.0.9 +requests[socks]==2.28.1 setproctitle==1.2.2 diff --git a/searx/autocomplete.py b/searx/autocomplete.py index a636807b..761c86cc 100644 --- a/searx/autocomplete.py +++ b/searx/autocomplete.py @@ -20,12 +20,10 @@ from lxml import etree from json import loads from urllib.parse import urlencode -from httpx import HTTPError - +from requests import RequestException from searx import settings -from searx.data import ENGINES_LANGUAGES -from searx.network import get as http_get +from searx.poolrequests import get as http_get from searx.exceptions import SearxEngineResponseException @@ -154,5 +152,5 @@ def search_autocomplete(backend_name, query, lang): try: return backend(query, lang) - except (HTTPError, SearxEngineResponseException): + except (RequestException, SearxEngineResponseException): return [] diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py index 635c98d8..79bdfbc0 100644 --- a/searx/engines/__init__.py +++ b/searx/engines/__init__.py @@ -27,7 +27,7 @@ from searx import settings from searx import logger from searx.data import ENGINES_LANGUAGES from searx.exceptions import SearxEngineResponseException -from searx.network import get, initialize as initialize_network, set_context_network_name +from searx.poolrequests import get, get_proxy_cycles from searx.utils import load_module, match_language, get_engine_from_settings, gen_useragent @@ -89,6 +89,8 @@ def load_engine(engine_data): engine.categories = [] else: engine.categories = list(map(str.strip, param_value.split(','))) + elif param_name == 'proxies': + engine.proxies = get_proxy_cycles(param_value) else: setattr(engine, param_name, param_value) @@ -283,3 +285,24 @@ def load_engines(engine_list): if engine is not None: engines[engine.name] = engine return engines + + +def initialize_engines(engine_list): + load_engines(engine_list) + + def engine_init(engine_name, init_fn): + try: + init_fn(get_engine_from_settings(engine_name)) + except SearxEngineResponseException as exc: + logger.warn('%s engine: Fail to initialize // %s', engine_name, exc) + except 
Exception: + logger.exception('%s engine: Fail to initialize', engine_name) + else: + logger.debug('%s engine: Initialized', engine_name) + + for engine_name, engine in engines.items(): + if hasattr(engine, 'init'): + init_fn = getattr(engine, 'init') + if init_fn: + logger.debug('%s engine: Starting background initialization', engine_name) + threading.Thread(target=engine_init, args=(engine_name, init_fn)).start() diff --git a/searx/engines/dictzone.py b/searx/engines/dictzone.py index eaa8b6ab..2483c080 100644 --- a/searx/engines/dictzone.py +++ b/searx/engines/dictzone.py @@ -52,7 +52,7 @@ def response(resp): to_results.append(to_result.text_content()) results.append({ - 'url': urljoin(str(resp.url), '?%d' % k), + 'url': urljoin(resp.url, '?%d' % k), 'title': from_result.text_content(), 'content': '; '.join(to_results) }) diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index 783b0db2..883d3152 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -1,18 +1,24 @@ # SPDX-License-Identifier: AGPL-3.0-or-later # lint: pylint -""" - DuckDuckGo (Web) +"""DuckDuckGo Lite """ from json import loads -from urllib.parse import urlencode -from searx.utils import match_language, HTMLTextExtractor -import re -from searx.network import get + +from lxml.html import fromstring + +from searx.utils import ( + dict_subset, + eval_xpath, + eval_xpath_getindex, + extract_text, + match_language, +) +from searx.poolrequests import get # about about = { - "website": 'https://duckduckgo.com/', + "website": 'https://lite.duckduckgo.com/lite', "wikidata_id": 'Q12805', "official_api_documentation": 'https://duckduckgo.com/api', "use_official_api": False, @@ -21,13 +27,11 @@ about = { } # engine dependent config -categories = ['general'] +categories = ['general', 'web'] paging = True -supported_languages_url = 'https://duckduckgo.com/util/u172.js' -number_of_results = 10 +supported_languages_url = 'https://duckduckgo.com/util/u588.js' time_range_support = True -safesearch = True -VQD_REGEX = r"vqd='(\d+-\d+-\d+)'" + language_aliases = { 'ar-SA': 'ar-XA', 'es-419': 'es-XL', @@ -35,16 +39,14 @@ language_aliases = { 'ko': 'kr-KR', 'sl-SI': 'sl-SL', 'zh-TW': 'tzh-TW', - 'zh-HK': 'tzh-HK' + 'zh-HK': 'tzh-HK', } +time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'} + # search-url -url = 'https://links.duckduckgo.com/d.js?' 
-url_ping = 'https://duckduckgo.com/t/sl_h' -time_range_dict = {'day': 'd', - 'week': 'w', - 'month': 'm', - 'year': 'y'} +url = 'https://lite.duckduckgo.com/lite' +url_ping = 'https://duckduckgo.com/t/sl_l' # match query's language to a region code that duckduckgo will accept @@ -59,103 +61,111 @@ def get_region_code(lang, lang_list=None): return lang_parts[1].lower() + '-' + lang_parts[0].lower() -def get_vqd(query, headers): - resp = get(f"https://duckduckgo.com/?q={query}&ia=web", headers=headers) - resp = re.findall(VQD_REGEX, resp.text) - return resp[0] - - def request(query, params): - params['method'] = 'GET' + params['url'] = url + params['method'] = 'POST' - vqd = get_vqd(query, params['headers']) - dl, ct = match_language(params['language'], supported_languages, language_aliases, 'wt-WT').split('-') - query_dict = { - 'q': query, - 't': 'D', - 'l': params['language'], - 'kl': f'{ct}-{dl}', - 's': (params['pageno'] - 1) * number_of_results, - 'dl': dl, - 'ct': ct, - 'ss_mkt': get_region_code(params['language'], supported_languages), - 'df': params['time_range'], - 'vqd': vqd, - 'ex': -2, - 'sp': '1', - 'bpa': '1', - 'biaexp': 'b', - 'msvrtexp': 'b' - } - if params['safesearch'] == 2: # STRICT - del query_dict['t'] - query_dict['p'] = 1 - query_dict.update({ - 'videxp': 'a', - 'nadse': 'b', - 'eclsexp': 'a', - 'stiaexp': 'a', - 'tjsexp': 'b', - 'related': 'b', - 'msnexp': 'a' - }) - elif params['safesearch'] == 1: # MODERATE - query_dict['ex'] = -1 - query_dict.update({ - 'nadse': 'b', - 'eclsexp': 'b', - 'tjsexp': 'b' - }) - else: # OFF - query_dict['ex'] = -2 - query_dict.update({ - 'nadse': 'b', - 'eclsexp': 'b', - 'tjsexp': 'b' - }) + params['data']['q'] = query - params['allow_redirects'] = False - params['data'] = query_dict - params['cookies']['kl'] = params['data']['kl'] + # The API is not documented, so we do some reverse engineering and emulate + # what https://lite.duckduckgo.com/lite/ does when you press "next Page" + # link again and again .. 
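# A compact way to read the offset arithmetic that follows (illustrative sketch
# only; 'lite_paging_offset' is not a name used by this engine):
def lite_paging_offset(pageno):
    # page 1: no offset, page 2: offset 30, page N > 2: offset 30 + (N - 2) * 50
    if pageno <= 1:
        return None
    if pageno == 2:
        return 30
    return 30 + (pageno - 2) * 50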
+ + params['headers']['Content-Type'] = 'application/x-www-form-urlencoded' + + # initial page does not have an offset + if params['pageno'] == 2: + # second page does have an offset of 30 + offset = (params['pageno'] - 1) * 30 + params['data']['s'] = offset + params['data']['dc'] = offset + 1 + + elif params['pageno'] > 2: + # third and following pages do have an offset of 30 + n*50 + offset = 30 + (params['pageno'] - 2) * 50 + params['data']['s'] = offset + params['data']['dc'] = offset + 1 + + # initial page does not have additional data in the input form + if params['pageno'] > 1: + # request the second page (and more pages) needs 'o' and 'api' arguments + params['data']['o'] = 'json' + params['data']['api'] = 'd.js' + + # initial page does not have additional data in the input form + if params['pageno'] > 2: + # request the third page (and more pages) some more arguments + params['data']['nextParams'] = '' + params['data']['v'] = '' + params['data']['vqd'] = '' + + region_code = get_region_code(params['language'], supported_languages) + if region_code: + params['data']['kl'] = region_code + params['cookies']['kl'] = region_code + + params['data']['df'] = '' if params['time_range'] in time_range_dict: params['data']['df'] = time_range_dict[params['time_range']] params['cookies']['df'] = time_range_dict[params['time_range']] - params['url'] = url + urlencode(params['data']) + return params # get response from search-request def response(resp): + + headers_ping = dict_subset(resp.request.headers, ['User-Agent', 'Accept-Encoding', 'Accept', 'Cookie']) + get(url_ping, headers=headers_ping) + if resp.status_code == 303: return [] - # parse the response results = [] + doc = fromstring(resp.text) - data = re.findall(r"DDG\.pageLayout\.load\('d',(\[.+\])\);DDG\.duckbar\.load\('images'", str(resp.text)) - try: - search_data = loads(data[0].replace('/\t/g', ' ')) - except IndexError: - return + result_table = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table') + if not len(result_table) >= 3: + # no more results + return [] + result_table = result_table[2] - if len(search_data) == 1 and ('n' not in search_data[0]): - only_result = search_data[0] - if ((only_result.get('da') is not None and only_result.get('t') == 'EOF') or - only_result.get('a') is not None or only_result.get('d') == 'google.com search'): - return + tr_rows = eval_xpath(result_table, './/tr') - for search_result in search_data: - if 'n' in search_result: + # In the last is the form of the 'previous/next page' links + tr_rows = tr_rows[:-1] + + len_tr_rows = len(tr_rows) + offset = 0 + + while len_tr_rows >= offset + 4: + + # assemble table rows we need to scrap + tr_title = tr_rows[offset] + tr_content = tr_rows[offset + 1] + offset += 4 + + # ignore sponsored Adds + if tr_content.get('class') == 'result-sponsored': continue - title = HTMLTextExtractor() - title.feed(search_result.get('t')) - content = HTMLTextExtractor() - content.feed(search_result.get('a')) - results.append({'title': title.get_text(), - 'content': content.get_text(), - 'url': search_result.get('u')}) + a_tag = eval_xpath_getindex(tr_title, './/td//a[@class="result-link"]', 0, None) + if a_tag is None: + continue + + td_content = eval_xpath_getindex(tr_content, './/td[@class="result-snippet"]', 0, None) + if td_content is None: + continue + + results.append( + { + 'title': a_tag.text_content(), + 'content': extract_text(td_content), + 'url': a_tag.get('href'), + } + ) + return results @@ -165,7 +175,7 @@ def _fetch_supported_languages(resp): # 
response is a js file with regions as an embedded object response_page = resp.text response_page = response_page[response_page.find('regions:{') + 8:] - response_page = response_page[:response_page.find('}') + 1] + response_page = response_page[: response_page.find('}') + 1] regions_json = loads(response_page) supported_languages = map((lambda x: x[3:] + '-' + x[:2].upper()), regions_json.keys()) diff --git a/searx/engines/duckduckgo_images.py b/searx/engines/duckduckgo_images.py index 0daaf41e..305eb1ca 100644 --- a/searx/engines/duckduckgo_images.py +++ b/searx/engines/duckduckgo_images.py @@ -8,7 +8,7 @@ from urllib.parse import urlencode from searx.exceptions import SearxEngineAPIException from searx.engines.duckduckgo import get_region_code from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url # NOQA # pylint: disable=unused-import -from searx.network import get +from searx.poolrequests import get # about about = { diff --git a/searx/engines/elasticsearch.py b/searx/engines/elasticsearch.py index db84a5c1..da7f9807 100644 --- a/searx/engines/elasticsearch.py +++ b/searx/engines/elasticsearch.py @@ -4,6 +4,7 @@ """ from json import loads, dumps +from requests.auth import HTTPBasicAuth from searx.exceptions import SearxEngineAPIException @@ -31,7 +32,7 @@ def request(query, params): return params if username and password: - params['auth'] = (username, password) + params['auth'] = HTTPBasicAuth(username, password) params['url'] = search_url params['method'] = 'GET' diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py index c3181408..11d3ba75 100644 --- a/searx/engines/gigablast.py +++ b/searx/engines/gigablast.py @@ -7,8 +7,8 @@ import re from json import loads, JSONDecodeError from urllib.parse import urlencode -from searx.network import get from searx.exceptions import SearxEngineResponseException +from searx.poolrequests import get # about about = { diff --git a/searx/engines/google.py b/searx/engines/google.py index 96365daf..8e548215 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -10,7 +10,7 @@ Definitions`_. 
# pylint: disable=invalid-name, missing-function-docstring, too-many-branches -from urllib.parse import urlencode +from urllib.parse import urlencode, urlparse from lxml import html from searx import logger from searx.utils import match_language, extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex @@ -194,7 +194,8 @@ def get_lang_info(params, lang_list, custom_aliases, supported_any_language): return ret_val def detect_google_sorry(resp): - if resp.url.host == 'sorry.google.com' or resp.url.path.startswith('/sorry'): + resp_url = urlparse(resp.url) + if resp_url.netloc == 'sorry.google.com' or resp_url.path.startswith('/sorry'): raise SearxEngineCaptchaException() diff --git a/searx/engines/pubmed.py b/searx/engines/pubmed.py index 5d88d398..da02f91c 100644 --- a/searx/engines/pubmed.py +++ b/searx/engines/pubmed.py @@ -7,7 +7,7 @@ from flask_babel import gettext from lxml import etree from datetime import datetime from urllib.parse import urlencode -from searx.network import get +from searx.poolrequests import get # about about = { diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py index 55c355ef..cee2dfa9 100644 --- a/searx/engines/qwant.py +++ b/searx/engines/qwant.py @@ -33,7 +33,7 @@ from flask_babel import gettext from searx.utils import match_language from searx.exceptions import SearxEngineAPIException -from searx.network import raise_for_httperror +from searx.raise_for_httperror import raise_for_httperror # about @@ -86,15 +86,14 @@ def request(query, params): # add language tag if params['language'] == 'all': - params['url'] += '&locale=en_us' + params['url'] += '&locale=en_US' else: language = match_language( params['language'], - # pylint: disable=undefined-variable supported_languages, language_aliases, ) - params['url'] += '&locale=' + language.replace('-', '_').lower() + params['url'] += '&locale=' + language.replace('-', '_') params['raise_for_httperror'] = False return params @@ -113,7 +112,14 @@ def response(resp): # check for an API error if search_results.get('status') != 'success': - msg = ",".join(data.get('message', ['unknown', ])) + msg = ",".join( + data.get( + 'message', + [ + 'unknown', + ], + ) + ) raise SearxEngineAPIException('API error::' + msg) # raise for other errors @@ -155,11 +161,13 @@ def response(resp): if mainline_type == 'web': content = item['desc'] - results.append({ - 'title': title, - 'url': res_url, - 'content': content, - }) + results.append( + { + 'title': title, + 'url': res_url, + 'content': content, + } + ) elif mainline_type == 'news': @@ -170,23 +178,27 @@ def response(resp): img_src = None if news_media: img_src = news_media[0].get('pict', {}).get('url', None) - results.append({ - 'title': title, - 'url': res_url, - 'publishedDate': pub_date, - 'img_src': img_src, - }) + results.append( + { + 'title': title, + 'url': res_url, + 'publishedDate': pub_date, + 'img_src': img_src, + } + ) elif mainline_type == 'images': thumbnail = item['thumbnail'] img_src = item['media'] - results.append({ - 'title': title, - 'url': res_url, - 'template': 'images.html', - 'thumbnail_src': thumbnail, - 'img_src': img_src, - }) + results.append( + { + 'title': title, + 'url': res_url, + 'template': 'images.html', + 'thumbnail_src': thumbnail, + 'img_src': img_src, + } + ) elif mainline_type == 'videos': # some videos do not have a description: while qwant-video @@ -210,19 +222,18 @@ def response(resp): thumbnail = item['thumbnail'] # from some locations (DE and others?) the s2 link do # response a 'Please wait ..' 
but does not deliver the thumbnail - thumbnail = thumbnail.replace( - 'https://s2.qwant.com', - 'https://s1.qwant.com', 1 + thumbnail = thumbnail.replace('https://s2.qwant.com', 'https://s1.qwant.com', 1) + results.append( + { + 'title': title, + 'url': res_url, + 'content': content, + 'publishedDate': pub_date, + 'thumbnail': thumbnail, + 'template': 'videos.html', + 'length': length, + } ) - results.append({ - 'title': title, - 'url': res_url, - 'content': content, - 'publishedDate': pub_date, - 'thumbnail': thumbnail, - 'template': 'videos.html', - 'length': length, - }) return results @@ -232,7 +243,7 @@ def _fetch_supported_languages(resp): # list of regions is embedded in page as a js object response_text = resp.text response_text = response_text[response_text.find('INITIAL_PROPS'):] - response_text = response_text[response_text.find('{'):response_text.find('')] + response_text = response_text[response_text.find('{'): response_text.find('')] regions_json = loads(response_text) diff --git a/searx/engines/seznam.py b/searx/engines/seznam.py index 9cd50dfc..776281f6 100644 --- a/searx/engines/seznam.py +++ b/searx/engines/seznam.py @@ -3,9 +3,9 @@ Seznam """ -from urllib.parse import urlencode +from urllib.parse import urlencode, urlparse from lxml import html -from searx.network import get +from searx.poolrequests import get from searx.exceptions import SearxEngineAccessDeniedException from searx.utils import ( extract_text, @@ -46,7 +46,8 @@ def request(query, params): def response(resp): - if resp.url.path.startswith('/verify'): + resp_url = urlparse(resp.url) + if resp_url.path.startswith('/verify'): raise SearxEngineAccessDeniedException() results = [] diff --git a/searx/engines/sjp.py b/searx/engines/sjp.py index e26a42a5..eff7b709 100644 --- a/searx/engines/sjp.py +++ b/searx/engines/sjp.py @@ -6,7 +6,7 @@ from lxml.html import fromstring from searx import logger from searx.utils import extract_text -from searx.network import raise_for_httperror +from searx.raise_for_httperror import raise_for_httperror logger = logger.getChild('sjp engine') diff --git a/searx/engines/soundcloud.py b/searx/engines/soundcloud.py index a6f92385..b3e3383b 100644 --- a/searx/engines/soundcloud.py +++ b/searx/engines/soundcloud.py @@ -9,7 +9,7 @@ from lxml import html from dateutil import parser from urllib.parse import quote_plus, urlencode from searx import logger -from searx.network import get as http_get +from searx.poolrequests import get as http_get # about about = { diff --git a/searx/engines/spotify.py b/searx/engines/spotify.py index 6816fe67..0ad8bfe3 100644 --- a/searx/engines/spotify.py +++ b/searx/engines/spotify.py @@ -5,10 +5,9 @@ from json import loads from urllib.parse import urlencode +import requests import base64 -from searx.network import post as http_post - # about about = { "website": 'https://www.spotify.com', @@ -39,7 +38,7 @@ def request(query, params): params['url'] = search_url.format(query=urlencode({'q': query}), offset=offset) - r = http_post( + r = requests.post( 'https://accounts.spotify.com/api/token', data={'grant_type': 'client_credentials'}, headers={'Authorization': 'Basic ' + base64.b64encode( diff --git a/searx/engines/stackoverflow.py b/searx/engines/stackoverflow.py new file mode 100644 index 00000000..91eaa68e --- /dev/null +++ b/searx/engines/stackoverflow.py @@ -0,0 +1,65 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Stackoverflow (IT) +""" + +from urllib.parse import urlencode, urljoin, urlparse +from lxml import html +from searx.utils import 
extract_text +from searx.exceptions import SearxEngineCaptchaException + +# about +about = { + "website": 'https://stackoverflow.com/', + "wikidata_id": 'Q549037', + "official_api_documentation": 'https://api.stackexchange.com/docs', + "use_official_api": False, + "require_api_key": False, + "results": 'HTML', +} + +# engine dependent config +categories = ['it'] +paging = True + +# search-url +url = 'https://stackoverflow.com/' +search_url = url + 'search?{query}&page={pageno}' + +# specific xpath variables +results_xpath = '//div[contains(@class,"question-summary")]' +link_xpath = './/div[@class="result-link"]//a|.//div[@class="summary"]//h3//a' +content_xpath = './/div[@class="excerpt"]' + + +# do search-request +def request(query, params): + params['url'] = search_url.format(query=urlencode({'q': query}), pageno=params['pageno']) + + return params + + +# get response from search-request +def response(resp): + resp_url = urlparse(resp.url) + if resp_url.path.startswith('/nocaptcha'): + raise SearxEngineCaptchaException() + + results = [] + + dom = html.fromstring(resp.text) + + # parse results + for result in dom.xpath(results_xpath): + link = result.xpath(link_xpath)[0] + href = urljoin(url, link.attrib.get('href')) + title = extract_text(link) + content = extract_text(result.xpath(content_xpath)) + + # append result + results.append({'url': href, + 'title': title, + 'content': content}) + + # return results + return results diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index 247b49e3..513f508e 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -17,7 +17,7 @@ from babel import Locale from babel.localedata import locale_identifiers from searx import logger -from searx.network import get +from searx.poolrequests import get from searx.utils import extract_text, eval_xpath, match_language from searx.exceptions import ( SearxEngineResponseException, diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py index ddcce908..c8e4cfae 100644 --- a/searx/engines/wikidata.py +++ b/searx/engines/wikidata.py @@ -12,7 +12,7 @@ from babel.dates import format_datetime, format_date, format_time, get_datetime_ from searx import logger from searx.data import WIKIDATA_UNITS -from searx.network import post, get +from searx.poolrequests import post, get from searx.utils import match_language, searx_useragent, get_string_replaces_function from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url # NOQA # pylint: disable=unused-import diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py index 5e34db9a..3ad8748f 100644 --- a/searx/engines/wikipedia.py +++ b/searx/engines/wikipedia.py @@ -7,7 +7,7 @@ from urllib.parse import quote from json import loads from lxml.html import fromstring from searx.utils import match_language, searx_useragent -from searx.network import raise_for_httperror +from searx.raise_for_httperror import raise_for_httperror # about about = { diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py index 1f2cfa4e..8e427d57 100644 --- a/searx/engines/wolframalpha_noapi.py +++ b/searx/engines/wolframalpha_noapi.py @@ -7,7 +7,7 @@ from json import loads from time import time from urllib.parse import urlencode -from searx.network import get as http_get +from searx.poolrequests import get as http_get # about about = { diff --git a/searx/engines/wordnik.py 
b/searx/engines/wordnik.py index 4bfeb407..3abe9efa 100644 --- a/searx/engines/wordnik.py +++ b/searx/engines/wordnik.py @@ -6,7 +6,7 @@ from lxml.html import fromstring from searx import logger from searx.utils import extract_text -from searx.network import raise_for_httperror +from searx.raise_for_httperror import raise_for_httperror logger = logger.getChild('Wordnik engine') diff --git a/searx/engines/yacy.py b/searx/engines/yacy.py index fbd99c47..c194ca45 100644 --- a/searx/engines/yacy.py +++ b/searx/engines/yacy.py @@ -7,7 +7,7 @@ from json import loads from dateutil import parser from urllib.parse import urlencode -from httpx import DigestAuth +from requests.auth import HTTPDigestAuth from searx.utils import html_to_text @@ -56,7 +56,7 @@ def request(query, params): search_type=search_type) if http_digest_auth_user and http_digest_auth_pass: - params['auth'] = DigestAuth(http_digest_auth_user, http_digest_auth_pass) + params['auth'] = HTTPDigestAuth(http_digest_auth_user, http_digest_auth_pass) # add language tag if specified if params['language'] != 'all': diff --git a/searx/engines/yggtorrent.py b/searx/engines/yggtorrent.py index 79ebebd3..d59b66c0 100644 --- a/searx/engines/yggtorrent.py +++ b/searx/engines/yggtorrent.py @@ -8,7 +8,7 @@ from operator import itemgetter from datetime import datetime from urllib.parse import quote from searx.utils import extract_text, get_torrent_size -from searx.network import get as http_get +from searx.poolrequests import get as http_get # about about = { @@ -39,7 +39,7 @@ cookies = dict() def init(engine_settings=None): global cookies # pylint: disable=global-variable-not-assigned # initial cookies - resp = http_get(url, follow_redirects=False) + resp = http_get(url) if resp.ok: for r in resp.history: cookies.update(r.cookies) diff --git a/searx/metrology/error_recorder.py b/searx/metrology/error_recorder.py index 167d1c8a..f533e4e8 100644 --- a/searx/metrology/error_recorder.py +++ b/searx/metrology/error_recorder.py @@ -3,7 +3,7 @@ import inspect import logging from json import JSONDecodeError from urllib.parse import urlparse -from httpx import HTTPError, HTTPStatusError +from requests.exceptions import RequestException from searx.exceptions import (SearxXPathSyntaxException, SearxEngineXPathException, SearxEngineAPIException, SearxEngineAccessDeniedException) from searx import logger @@ -60,28 +60,28 @@ def get_trace(traces): return traces[-1] -def get_hostname(exc: HTTPError) -> typing.Optional[None]: +def get_hostname(exc: RequestException) -> typing.Optional[None]: url = exc.request.url if url is None and exc.response is not None: url = exc.response.url return urlparse(url).netloc -def get_request_exception_messages(exc: HTTPError)\ +def get_request_exception_messages(exc: RequestException)\ -> typing.Tuple[typing.Optional[str], typing.Optional[str], typing.Optional[str]]: url = None status_code = None reason = None hostname = None - if hasattr(exc, 'request') and exc.request is not None: + if exc.request is not None: url = exc.request.url - if url is None and hasattr(exc, 'response') and exc.respones is not None: + if url is None and exc.response is not None: url = exc.response.url if url is not None: - hostname = url.host - if isinstance(exc, HTTPStatusError): + hostname = str(urlparse(url).netloc) + if exc.response is not None: status_code = str(exc.response.status_code) - reason = exc.response.reason_phrase + reason = exc.response.reason return (status_code, reason, hostname) @@ -92,7 +92,7 @@ def get_messages(exc, filename) -> 
typing.Tuple: return (str(exc), ) if isinstance(exc, ValueError) and 'lxml' in filename: return (str(exc), ) - if isinstance(exc, HTTPError): + if isinstance(exc, RequestException): return get_request_exception_messages(exc) if isinstance(exc, SearxXPathSyntaxException): return (exc.xpath_str, exc.message) diff --git a/searx/network/__init__.py b/searx/network/__init__.py deleted file mode 100644 index 93427d91..00000000 --- a/searx/network/__init__.py +++ /dev/null @@ -1,188 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later - -import asyncio -import threading -import concurrent.futures -from time import time -from queue import SimpleQueue -from types import MethodType - -import httpx -import h2.exceptions - -from .network import get_network, initialize, check_network_configuration -from .client import get_loop -from .raise_for_httperror import raise_for_httperror - - -THREADLOCAL = threading.local() - - -def reset_time_for_thread(): - THREADLOCAL.total_time = 0 - - -def get_time_for_thread(): - return THREADLOCAL.total_time - - -def set_timeout_for_thread(timeout, start_time=None): - THREADLOCAL.timeout = timeout - THREADLOCAL.start_time = start_time - - -def set_context_network_name(network_name): - THREADLOCAL.network = get_network(network_name) - - -def get_context_network(): - try: - return THREADLOCAL.network - except AttributeError: - return get_network() - - -def request(method, url, **kwargs): - """same as requests/requests/api.py request(...)""" - time_before_request = time() - - # timeout (httpx) - if 'timeout' in kwargs: - timeout = kwargs['timeout'] - else: - timeout = getattr(THREADLOCAL, 'timeout', None) - if timeout is not None: - kwargs['timeout'] = timeout - - # 2 minutes timeout for the requests without timeout - timeout = timeout or 120 - - # ajdust actual timeout - timeout += 0.2 # overhead - start_time = getattr(THREADLOCAL, 'start_time', time_before_request) - if start_time: - timeout -= time() - start_time - - # raise_for_error - check_for_httperror = True - if 'raise_for_httperror' in kwargs: - check_for_httperror = kwargs['raise_for_httperror'] - del kwargs['raise_for_httperror'] - - # requests compatibility - if isinstance(url, bytes): - url = url.decode() - - # network - network = get_context_network() - - # do request - future = asyncio.run_coroutine_threadsafe(network.request(method, url, **kwargs), get_loop()) - try: - response = future.result(timeout) - except concurrent.futures.TimeoutError as e: - raise httpx.TimeoutException('Timeout', request=None) from e - - # requests compatibility - # see also https://www.python-httpx.org/compatibility/#checking-for-4xx5xx-responses - response.ok = not response.is_error - - # update total_time. 
- # See get_time_for_thread() and reset_time_for_thread() - if hasattr(THREADLOCAL, 'total_time'): - time_after_request = time() - THREADLOCAL.total_time += time_after_request - time_before_request - - # raise an exception - if check_for_httperror: - raise_for_httperror(response) - - return response - - -def get(url, **kwargs): - kwargs.setdefault('follow_redirects', True) - return request('get', url, **kwargs) - - -def options(url, **kwargs): - kwargs.setdefault('follow_redirects', True) - return request('options', url, **kwargs) - - -def head(url, **kwargs): - kwargs.setdefault('follow_redirects', False) - return request('head', url, **kwargs) - - -def post(url, data=None, **kwargs): - return request('post', url, data=data, **kwargs) - - -def put(url, data=None, **kwargs): - return request('put', url, data=data, **kwargs) - - -def patch(url, data=None, **kwargs): - return request('patch', url, data=data, **kwargs) - - -def delete(url, **kwargs): - return request('delete', url, **kwargs) - - -async def stream_chunk_to_queue(network, q, method, url, **kwargs): - try: - async with await network.stream(method, url, **kwargs) as response: - q.put(response) - # aiter_raw: access the raw bytes on the response without applying any HTTP content decoding - # https://www.python-httpx.org/quickstart/#streaming-responses - async for chunk in response.aiter_bytes(65536): - if len(chunk) > 0: - q.put(chunk) - except httpx.ResponseClosed as e: - # the response was closed - pass - except (httpx.HTTPError, OSError, h2.exceptions.ProtocolError) as e: - q.put(e) - finally: - q.put(None) - - -def _close_response_method(self): - asyncio.run_coroutine_threadsafe( - self.aclose(), - get_loop() - ) - - -def stream(method, url, **kwargs): - """Replace httpx.stream. - - Usage: - stream = poolrequests.stream(...) - response = next(stream) - for chunk in stream: - ... - - httpx.Client.stream requires to write the httpx.HTTPTransport version of the - the httpx.AsyncHTTPTransport declared above. 
- """ - q = SimpleQueue() - future = asyncio.run_coroutine_threadsafe(stream_chunk_to_queue(get_network(), q, method, url, **kwargs), - get_loop()) - # yield response - response = q.get() - if isinstance(response, Exception): - raise response - response.close = MethodType(_close_response_method, response) - yield response - - # yield chunks - chunk_or_exception = q.get() - while chunk_or_exception is not None: - if isinstance(chunk_or_exception, Exception): - raise chunk_or_exception - yield chunk_or_exception - chunk_or_exception = q.get() - future.result() diff --git a/searx/network/client.py b/searx/network/client.py deleted file mode 100644 index 6cd3576f..00000000 --- a/searx/network/client.py +++ /dev/null @@ -1,167 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later - -import asyncio -import logging -import threading -import uvloop - -import httpx -from httpx_socks import AsyncProxyTransport -from python_socks import ( - parse_proxy_url, - ProxyConnectionError, - ProxyTimeoutError, - ProxyError -) -import python_socks._errors - -from searx import logger - - -logger = logger.getChild('searx.http.client') -LOOP = None -SSLCONTEXTS = {} -TRANSPORT_KWARGS = { - 'trust_env': False, -} - - -def get_sslcontexts(proxy_url=None, cert=None, verify=True, trust_env=True, http2=False): - global SSLCONTEXTS - key = (proxy_url, cert, verify, trust_env, http2) - if key not in SSLCONTEXTS: - SSLCONTEXTS[key] = httpx.create_ssl_context(cert, verify, trust_env, http2) - return SSLCONTEXTS[key] - - -class AsyncHTTPTransportNoHttp(httpx.AsyncHTTPTransport): - """Block HTTP request""" - - async def handle_async_request(self, request): - raise httpx.UnsupportedProtocol('HTTP protocol is disabled') - - -class AsyncProxyTransportFixed(AsyncProxyTransport): - """Fix httpx_socks.AsyncProxyTransport - - Map python_socks exceptions to httpx.ProxyError exceptions - """ - - async def handle_async_request(self, request): - try: - return await super().handle_async_request(request) - except ProxyConnectionError as e: - raise httpx.ProxyError("ProxyConnectionError: " + e.strerror, request=request) from e - except ProxyTimeoutError as e: - raise httpx.ProxyError("ProxyTimeoutError: " + e.args[0], request=request) from e - except ProxyError as e: - raise httpx.ProxyError("ProxyError: " + e.args[0], request=request) from e - - -def get_transport_for_socks_proxy(verify, http2, local_address, proxy_url, limit, retries): - # support socks5h (requests compatibility): - # https://requests.readthedocs.io/en/master/user/advanced/#socks - # socks5:// hostname is resolved on client side - # socks5h:// hostname is resolved on proxy side - rdns = False - socks5h = 'socks5h://' - if proxy_url.startswith(socks5h): - proxy_url = 'socks5://' + proxy_url[len(socks5h):] - rdns = True - - proxy_type, proxy_host, proxy_port, proxy_username, proxy_password = parse_proxy_url(proxy_url) - verify = get_sslcontexts(proxy_url, None, True, False, http2) if verify is True else verify - return AsyncProxyTransportFixed( - proxy_type=proxy_type, - proxy_host=proxy_host, - proxy_port=proxy_port, - username=proxy_username, - password=proxy_password, - rdns=rdns, - loop=get_loop(), - verify=verify, - http2=http2, - local_address=local_address, - limits=limit, - retries=retries, - **TRANSPORT_KWARGS, - ) - - -def get_transport(verify, http2, local_address, proxy_url, limit, retries): - verify = get_sslcontexts(None, None, True, False, http2) if verify is True else verify - return httpx.AsyncHTTPTransport( - # pylint: disable=protected-access - 
verify=verify, - http2=http2, - limits=limit, - proxy=httpx._config.Proxy(proxy_url) if proxy_url else None, - local_address=local_address, - retries=retries, - **TRANSPORT_KWARGS, - ) - - -def iter_proxies(proxies): - # https://www.python-httpx.org/compatibility/#proxy-keys - if isinstance(proxies, str): - yield 'all://', proxies - elif isinstance(proxies, dict): - for pattern, proxy_url in proxies.items(): - yield pattern, proxy_url - - -def new_client(enable_http, verify, enable_http2, - max_connections, max_keepalive_connections, keepalive_expiry, - proxies, local_address, retries, max_redirects, hook_log_response): - limit = httpx.Limits(max_connections=max_connections, - max_keepalive_connections=max_keepalive_connections, - keepalive_expiry=keepalive_expiry) - # See https://www.python-httpx.org/advanced/#routing - mounts = {} - for pattern, proxy_url in iter_proxies(proxies): - if not enable_http and (pattern == 'http' or pattern.startswith('http://')): - continue - if proxy_url.startswith('socks4://') \ - or proxy_url.startswith('socks5://') \ - or proxy_url.startswith('socks5h://'): - mounts[pattern] = get_transport_for_socks_proxy(verify, enable_http2, local_address, proxy_url, limit, - retries) - else: - mounts[pattern] = get_transport(verify, enable_http2, local_address, proxy_url, limit, retries) - - if not enable_http: - mounts['http://'] = AsyncHTTPTransportNoHttp() - - transport = get_transport(verify, enable_http2, local_address, None, limit, retries) - event_hooks = None - if hook_log_response: - event_hooks = {'response': [hook_log_response]} - return httpx.AsyncClient(transport=transport, mounts=mounts, max_redirects=max_redirects, event_hooks=event_hooks) - - -def get_loop(): - global LOOP - return LOOP - - -def init(): - # log - for logger_name in ('hpack.hpack', 'hpack.table'): - logging.getLogger(logger_name).setLevel(logging.WARNING) - - # loop - def loop_thread(): - global LOOP - LOOP = asyncio.new_event_loop() - LOOP.run_forever() - - th = threading.Thread( - target=loop_thread, - name='asyncio_loop', - daemon=True, - ) - th.start() - - -init() diff --git a/searx/network/network.py b/searx/network/network.py deleted file mode 100644 index 17c98a8d..00000000 --- a/searx/network/network.py +++ /dev/null @@ -1,402 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -# pylint: disable=global-statement -# pylint: disable=missing-module-docstring, missing-class-docstring - -import atexit -import asyncio -import ipaddress -from itertools import cycle -from typing import Dict - -import httpx - -from searx import logger, searx_debug -from .client import new_client, get_loop, AsyncHTTPTransportNoHttp - - -logger = logger.getChild('network') -DEFAULT_NAME = '__DEFAULT__' -NETWORKS: Dict[str, 'Network'] = {} -# requests compatibility when reading proxy settings from settings.yml -PROXY_PATTERN_MAPPING = { - 'http': 'http://', - 'https': 'https://', - 'socks4': 'socks4://', - 'socks5': 'socks5://', - 'socks5h': 'socks5h://', - 'http:': 'http://', - 'https:': 'https://', - 'socks4:': 'socks4://', - 'socks5:': 'socks5://', - 'socks5h:': 'socks5h://', -} - -ADDRESS_MAPPING = {'ipv4': '0.0.0.0', 'ipv6': '::'} - - -class Network: - - __slots__ = ( - 'enable_http', - 'verify', - 'enable_http2', - 'max_connections', - 'max_keepalive_connections', - 'keepalive_expiry', - 'local_addresses', - 'proxies', - 'using_tor_proxy', - 'max_redirects', - 'retries', - 'retry_on_http_error', - '_local_addresses_cycle', - '_proxies_cycle', - '_clients', - '_logger', - ) - - 
_TOR_CHECK_RESULT = {} - - def __init__( - # pylint: disable=too-many-arguments - self, - enable_http=True, - verify=True, - enable_http2=False, - max_connections=None, - max_keepalive_connections=None, - keepalive_expiry=None, - proxies=None, - using_tor_proxy=False, - local_addresses=None, - retries=0, - retry_on_http_error=None, - max_redirects=30, - logger_name=None, - ): - - self.enable_http = enable_http - self.verify = verify - self.enable_http2 = enable_http2 - self.max_connections = max_connections - self.max_keepalive_connections = max_keepalive_connections - self.keepalive_expiry = keepalive_expiry - self.proxies = proxies - self.using_tor_proxy = using_tor_proxy - self.local_addresses = local_addresses - self.retries = retries - self.retry_on_http_error = retry_on_http_error - self.max_redirects = max_redirects - self._local_addresses_cycle = self.get_ipaddress_cycle() - self._proxies_cycle = self.get_proxy_cycles() - self._clients = {} - self._logger = logger.getChild(logger_name) if logger_name else logger - self.check_parameters() - - def check_parameters(self): - for address in self.iter_ipaddresses(): - if '/' in address: - ipaddress.ip_network(address, False) - else: - ipaddress.ip_address(address) - - if self.proxies is not None and not isinstance(self.proxies, (str, dict)): - raise ValueError('proxies type has to be str, dict or None') - - def iter_ipaddresses(self): - local_addresses = self.local_addresses - if not local_addresses: - return - if isinstance(local_addresses, str): - local_addresses = [local_addresses] - for address in local_addresses: - yield address - - def get_ipaddress_cycle(self): - while True: - count = 0 - for address in self.iter_ipaddresses(): - if '/' in address: - for a in ipaddress.ip_network(address, False).hosts(): - yield str(a) - count += 1 - else: - a = ipaddress.ip_address(address) - yield str(a) - count += 1 - if count == 0: - yield None - - def iter_proxies(self): - if not self.proxies: - return - # https://www.python-httpx.org/compatibility/#proxy-keys - if isinstance(self.proxies, str): - yield 'all://', [self.proxies] - else: - for pattern, proxy_url in self.proxies.items(): - pattern = PROXY_PATTERN_MAPPING.get(pattern, pattern) - if isinstance(proxy_url, str): - proxy_url = [proxy_url] - yield pattern, proxy_url - - def get_proxy_cycles(self): - proxy_settings = {} - for pattern, proxy_urls in self.iter_proxies(): - proxy_settings[pattern] = cycle(proxy_urls) - while True: - # pylint: disable=stop-iteration-return - yield tuple((pattern, next(proxy_url_cycle)) for pattern, proxy_url_cycle in proxy_settings.items()) - - async def log_response(self, response: httpx.Response): - request = response.request - status = f"{response.status_code} {response.reason_phrase}" - response_line = f"{response.http_version} {status}" - content_type = response.headers.get("Content-Type") - content_type = f' ({content_type})' if content_type else '' - self._logger.debug(f'HTTP Request: {request.method} {request.url} "{response_line}"{content_type}') - - @staticmethod - async def check_tor_proxy(client: httpx.AsyncClient, proxies) -> bool: - if proxies in Network._TOR_CHECK_RESULT: - return Network._TOR_CHECK_RESULT[proxies] - - result = True - # ignore client._transport because it is not used with all:// - for transport in client._mounts.values(): # pylint: disable=protected-access - if isinstance(transport, AsyncHTTPTransportNoHttp): - continue - if getattr(transport, '_pool') and getattr(transport._pool, '_rdns', False): - continue - return False - 
response = await client.get("https://check.torproject.org/api/ip", timeout=10) - if not response.json()["IsTor"]: - result = False - Network._TOR_CHECK_RESULT[proxies] = result - return result - - async def get_client(self, verify=None, max_redirects=None): - verify = self.verify if verify is None else verify - max_redirects = self.max_redirects if max_redirects is None else max_redirects - local_address = next(self._local_addresses_cycle) - proxies = next(self._proxies_cycle) # is a tuple so it can be part of the key - key = (verify, max_redirects, local_address, proxies) - hook_log_response = self.log_response if searx_debug else None - if key not in self._clients or self._clients[key].is_closed: - client = new_client( - self.enable_http, - verify, - self.enable_http2, - self.max_connections, - self.max_keepalive_connections, - self.keepalive_expiry, - dict(proxies), - local_address, - 0, - max_redirects, - hook_log_response, - ) - if self.using_tor_proxy and not await self.check_tor_proxy(client, proxies): - await client.aclose() - raise httpx.ProxyError('Network configuration problem: not using Tor') - self._clients[key] = client - return self._clients[key] - - async def aclose(self): - async def close_client(client): - try: - await client.aclose() - except httpx.HTTPError: - pass - - await asyncio.gather(*[close_client(client) for client in self._clients.values()], return_exceptions=False) - - @staticmethod - def extract_kwargs_clients(kwargs): - kwargs_clients = {} - if 'verify' in kwargs: - kwargs_clients['verify'] = kwargs.pop('verify') - if 'max_redirects' in kwargs: - kwargs_clients['max_redirects'] = kwargs.pop('max_redirects') - if 'allow_redirects' in kwargs: - # see https://github.com/encode/httpx/pull/1808 - kwargs['follow_redirects'] = kwargs.pop('allow_redirects') - return kwargs_clients - - def is_valid_response(self, response): - # pylint: disable=too-many-boolean-expressions - if ( - (self.retry_on_http_error is True and 400 <= response.status_code <= 599) - or (isinstance(self.retry_on_http_error, list) and response.status_code in self.retry_on_http_error) - or (isinstance(self.retry_on_http_error, int) and response.status_code == self.retry_on_http_error) - ): - return False - return True - - async def call_client(self, stream, method, url, **kwargs): - retries = self.retries - was_disconnected = False - kwargs_clients = Network.extract_kwargs_clients(kwargs) - while retries >= 0: # pragma: no cover - client = await self.get_client(**kwargs_clients) - try: - if stream: - response = client.stream(method, url, **kwargs) - else: - response = await client.request(method, url, **kwargs) - if self.is_valid_response(response) or retries <= 0: - return response - except httpx.RemoteProtocolError as e: - if not was_disconnected: - # the server has closed the connection: - # try again without decreasing the retries variable & with a new HTTP client - was_disconnected = True - await client.aclose() - self._logger.warning('httpx.RemoteProtocolError: the server has disconnected, retrying') - continue - if retries <= 0: - raise e - except (httpx.RequestError, httpx.HTTPStatusError) as e: - if retries <= 0: - raise e - retries -= 1 - - async def request(self, method, url, **kwargs): - return await self.call_client(False, method, url, **kwargs) - - async def stream(self, method, url, **kwargs): - return await self.call_client(True, method, url, **kwargs) - - @classmethod - async def aclose_all(cls): - await asyncio.gather(*[network.aclose() for network in NETWORKS.values()], 
return_exceptions=False) - - -def get_network(name=None): - return NETWORKS.get(name or DEFAULT_NAME) - - -def check_network_configuration(): - async def check(): - exception_count = 0 - for network in NETWORKS.values(): - if network.using_tor_proxy: - try: - await network.get_client() - except Exception: # pylint: disable=broad-except - network._logger.exception('Error') # pylint: disable=protected-access - exception_count += 1 - return exception_count - - future = asyncio.run_coroutine_threadsafe(check(), get_loop()) - exception_count = future.result() - if exception_count > 0: - raise RuntimeError("Invalid network configuration") - - -def initialize(settings_engines=None, settings_outgoing=None): - # pylint: disable=import-outside-toplevel) - from searx.engines import engines - from searx import settings - - # pylint: enable=import-outside-toplevel) - - settings_engines = settings_engines or settings['engines'] - settings_outgoing = settings_outgoing or settings['outgoing'] - - # default parameters for AsyncHTTPTransport - # see https://github.com/encode/httpx/blob/e05a5372eb6172287458b37447c30f650047e1b8/httpx/_transports/default.py#L108-L121 # nopep8 - default_params = { - 'enable_http': False, - 'verify': True, - 'enable_http2': settings_outgoing.get('enable_http2', True), - 'max_connections': settings_outgoing.get('pool_connections', 100), - 'max_keepalive_connections': settings_outgoing.get('pool_maxsize', 10), - 'keepalive_expiry': settings_outgoing.get('keepalive_expiry', 5.0), - 'local_addresses': settings_outgoing.get('source_ips', []), - 'using_tor_proxy': settings_outgoing.get('using_tor_proxy', False), - 'proxies': settings_outgoing.get('proxies', None), - 'max_redirects': settings_outgoing.get('max_redirects', 30), - 'retries': settings_outgoing.get('retries', 0), - 'retry_on_http_error': None, - } - - def new_network(params, logger_name=None): - nonlocal default_params - result = {} - result.update(default_params) - result.update(params) - if logger_name: - result['logger_name'] = logger_name - return Network(**result) - - def iter_networks(): - nonlocal settings_engines - for engine_spec in settings_engines: - engine_name = engine_spec['name'] - engine = engines.get(engine_name) - if engine is None: - continue - network = getattr(engine, 'network', None) - yield engine_name, engine, network - - if NETWORKS: - done() - NETWORKS.clear() - NETWORKS[DEFAULT_NAME] = new_network({}, logger_name='default') - NETWORKS['ipv4'] = new_network({'local_addresses': '0.0.0.0'}, logger_name='ipv4') - NETWORKS['ipv6'] = new_network({'local_addresses': '::'}, logger_name='ipv6') - - # define networks from outgoing.networks - for network_name, network in settings_outgoing.get('networks', {}).items(): - NETWORKS[network_name] = new_network(network, logger_name=network_name) - - # define networks from engines.[i].network (except references) - for engine_name, engine, network in iter_networks(): - if network is None: - network = {} - for attribute_name, attribute_value in default_params.items(): - if hasattr(engine, attribute_name): - network[attribute_name] = getattr(engine, attribute_name) - else: - network[attribute_name] = attribute_value - NETWORKS[engine_name] = new_network(network, logger_name=engine_name) - elif isinstance(network, dict): - NETWORKS[engine_name] = new_network(network, logger_name=engine_name) - - # define networks from engines.[i].network (references) - for engine_name, engine, network in iter_networks(): - if isinstance(network, str): - NETWORKS[engine_name] = 
NETWORKS[network] - - # the /image_proxy endpoint has a dedicated network. - # same parameters than the default network, but HTTP/2 is disabled. - # It decreases the CPU load average, and the total time is more or less the same - if 'image_proxy' not in NETWORKS: - image_proxy_params = default_params.copy() - image_proxy_params['enable_http2'] = False - NETWORKS['image_proxy'] = new_network(image_proxy_params, logger_name='image_proxy') - - -@atexit.register -def done(): - """Close all HTTP client - - Avoid a warning at exit - see https://github.com/encode/httpx/blob/1a6e254f72d9fd5694a1c10a28927e193ab4f76b/httpx/_client.py#L1785 - - Note: since Network.aclose has to be async, it is not possible to call this method on Network.__del__ - So Network.aclose is called here using atexit.register - """ - try: - loop = get_loop() - if loop: - future = asyncio.run_coroutine_threadsafe(Network.aclose_all(), loop) - # wait 3 seconds to close the HTTP clients - future.result(3) - finally: - NETWORKS.clear() - - -NETWORKS[DEFAULT_NAME] = Network() diff --git a/searx/poolrequests.py b/searx/poolrequests.py new file mode 100644 index 00000000..ab327251 --- /dev/null +++ b/searx/poolrequests.py @@ -0,0 +1,235 @@ +import sys +from time import time +from itertools import cycle +from threading import local + +import requests + +from searx import settings +from searx import logger +from searx.raise_for_httperror import raise_for_httperror + + +logger = logger.getChild('poolrequests') + + +try: + import ssl + if ssl.OPENSSL_VERSION_INFO[0:3] < (1, 0, 2): + # https://github.com/certifi/python-certifi#1024-bit-root-certificates + logger.critical('You are using an old openssl version({0}), please upgrade above 1.0.2!' + .format(ssl.OPENSSL_VERSION)) + sys.exit(1) +except ImportError: + ssl = None +if not getattr(ssl, "HAS_SNI", False): + try: + import OpenSSL # pylint: disable=unused-import + except ImportError: + logger.critical("ssl doesn't support SNI and the pyopenssl module is not installed.\n" + "Some HTTPS connections will fail") + sys.exit(1) + + +class HTTPAdapterWithConnParams(requests.adapters.HTTPAdapter): + + def __init__(self, pool_connections=requests.adapters.DEFAULT_POOLSIZE, + pool_maxsize=requests.adapters.DEFAULT_POOLSIZE, + max_retries=requests.adapters.DEFAULT_RETRIES, + pool_block=requests.adapters.DEFAULT_POOLBLOCK, + **conn_params): + if max_retries == requests.adapters.DEFAULT_RETRIES: + self.max_retries = requests.adapters.Retry(0, read=False) + else: + self.max_retries = requests.adapters.Retry.from_int(max_retries) + self.config = {} + self.proxy_manager = {} + + super().__init__() + + self._pool_connections = pool_connections + self._pool_maxsize = pool_maxsize + self._pool_block = pool_block + self._conn_params = conn_params + + self.init_poolmanager(pool_connections, pool_maxsize, block=pool_block, **conn_params) + + def __setstate__(self, state): + # Can't handle by adding 'proxy_manager' to self.__attrs__ because + # because self.poolmanager uses a lambda function, which isn't pickleable. 
+ self.proxy_manager = {} + self.config = {} + + for attr, value in state.items(): + setattr(self, attr, value) + + self.init_poolmanager(self._pool_connections, self._pool_maxsize, + block=self._pool_block, **self._conn_params) + + +threadLocal = local() +connect = settings['outgoing'].get('pool_connections', 100) # Magic number kept from previous code +maxsize = settings['outgoing'].get('pool_maxsize', requests.adapters.DEFAULT_POOLSIZE) # Picked from constructor +if settings['outgoing'].get('source_ips'): + http_adapters = cycle(HTTPAdapterWithConnParams(pool_connections=connect, pool_maxsize=maxsize, + source_address=(source_ip, 0)) + for source_ip in settings['outgoing']['source_ips']) + https_adapters = cycle(HTTPAdapterWithConnParams(pool_connections=connect, pool_maxsize=maxsize, + source_address=(source_ip, 0)) + for source_ip in settings['outgoing']['source_ips']) +else: + http_adapters = cycle((HTTPAdapterWithConnParams(pool_connections=connect, pool_maxsize=maxsize), )) + https_adapters = cycle((HTTPAdapterWithConnParams(pool_connections=connect, pool_maxsize=maxsize), )) + + +class SessionSinglePool(requests.Session): + + def __init__(self): + super().__init__() + + # reuse the same adapters + self.adapters.clear() + + https_adapter = threadLocal.__dict__.setdefault('https_adapter', next(https_adapters)) + self.mount('https://', https_adapter) + if get_enable_http_protocol(): + http_adapter = threadLocal.__dict__.setdefault('http_adapter', next(http_adapters)) + self.mount('http://', http_adapter) + + def close(self): + """Call super, but clear adapters since there are managed globaly""" + self.adapters.clear() + super().close() + + +def set_timeout_for_thread(timeout, start_time=None): + threadLocal.timeout = timeout + threadLocal.start_time = start_time + + +def set_enable_http_protocol(enable_http): + threadLocal.enable_http = enable_http + + +def get_enable_http_protocol(): + try: + return threadLocal.enable_http + except AttributeError: + return False + + +def reset_time_for_thread(): + threadLocal.total_time = 0 + + +def get_time_for_thread(): + return threadLocal.total_time + + +def get_proxy_cycles(proxy_settings): + if not proxy_settings: + return None + # Backwards compatibility for single proxy in settings.yml + for protocol, proxy in proxy_settings.items(): + if isinstance(proxy, str): + proxy_settings[protocol] = [proxy] + + for protocol in proxy_settings: + proxy_settings[protocol] = cycle(proxy_settings[protocol]) + return proxy_settings + + +GLOBAL_PROXY_CYCLES = get_proxy_cycles(settings['outgoing'].get('proxies')) + + +def get_proxies(proxy_cycles): + if proxy_cycles: + return {protocol: next(proxy_cycle) for protocol, proxy_cycle in proxy_cycles.items()} + return None + + +def get_global_proxies(): + return get_proxies(GLOBAL_PROXY_CYCLES) + + +def request(method, url, **kwargs): + """same as requests/requests/api.py request(...)""" + time_before_request = time() + + # session start + session = SessionSinglePool() + + # proxies + if not kwargs.get('proxies'): + kwargs['proxies'] = get_global_proxies() + + # timeout + if 'timeout' in kwargs: + timeout = kwargs['timeout'] + else: + timeout = getattr(threadLocal, 'timeout', None) + if timeout is not None: + kwargs['timeout'] = timeout + + # raise_for_error + check_for_httperror = True + if 'raise_for_httperror' in kwargs: + check_for_httperror = kwargs['raise_for_httperror'] + del kwargs['raise_for_httperror'] + + # do request + response = session.request(method=method, url=url, **kwargs) + + time_after_request 
= time() + + # is there a timeout for this engine ? + if timeout is not None: + timeout_overhead = 0.2 # seconds + # start_time = when the user request started + start_time = getattr(threadLocal, 'start_time', time_before_request) + search_duration = time_after_request - start_time + if search_duration > timeout + timeout_overhead: + raise requests.exceptions.Timeout(response=response) + + # session end + session.close() + + if hasattr(threadLocal, 'total_time'): + threadLocal.total_time += time_after_request - time_before_request + + # raise an exception + if check_for_httperror: + raise_for_httperror(response) + + return response + + +def get(url, **kwargs): + kwargs.setdefault('allow_redirects', True) + return request('get', url, **kwargs) + + +def options(url, **kwargs): + kwargs.setdefault('allow_redirects', True) + return request('options', url, **kwargs) + + +def head(url, **kwargs): + kwargs.setdefault('allow_redirects', False) + return request('head', url, **kwargs) + + +def post(url, data=None, **kwargs): + return request('post', url, data=data, **kwargs) + + +def put(url, data=None, **kwargs): + return request('put', url, data=data, **kwargs) + + +def patch(url, data=None, **kwargs): + return request('patch', url, data=data, **kwargs) + + +def delete(url, **kwargs): + return request('delete', url, **kwargs) diff --git a/searx/network/raise_for_httperror.py b/searx/raise_for_httperror.py similarity index 100% rename from searx/network/raise_for_httperror.py rename to searx/raise_for_httperror.py diff --git a/searx/search/__init__.py b/searx/search/__init__.py index 93043e40..45d1cb1e 100644 --- a/searx/search/__init__.py +++ b/searx/search/__init__.py @@ -25,13 +25,11 @@ from _thread import start_new_thread from searx import settings from searx.answerers import ask from searx.external_bang import get_bang_url -from searx.engines import load_engines from searx.results import ResultContainer from searx import logger from searx.plugins import plugins from searx.search.models import EngineRef, SearchQuery -from searx.search.processors import PROCESSORS, initialize as initialize_processors -from searx.network import check_network_configuration, initialize as initialize_network +from searx.search.processors import processors, initialize as initialize_processors from searx.search.checker import initialize as initialize_checker @@ -49,14 +47,9 @@ else: sys.exit(1) -def initialize(settings_engines=None, enable_checker=False, check_network=False): +def initialize(settings_engines=None, enable_checker=False): settings_engines = settings_engines or settings['engines'] - load_engines(settings_engines) - initialize_network(settings_engines, settings['outgoing']) - if check_network: - check_network_configuration() initialize_processors(settings_engines) - if enable_checker: initialize_checker() @@ -111,7 +104,7 @@ class Search: # start search-reqest for all selected engines for engineref in self.search_query.engineref_list: - processor = PROCESSORS[engineref.name] + processor = processors[engineref.name] # set default request parameters request_params = processor.get_params(self.search_query, engineref.category) @@ -154,7 +147,7 @@ class Search: for engine_name, query, request_params in requests: th = threading.Thread( - target=PROCESSORS[engine_name].search, + target=processors[engine_name].search, args=(query, request_params, self.result_container, self.start_time, self.actual_timeout), name=search_id, ) diff --git a/searx/search/checker/background.py b/searx/search/checker/background.py 
index 276426fa..c3292d9a 100644 --- a/searx/search/checker/background.py +++ b/searx/search/checker/background.py @@ -9,7 +9,7 @@ import signal from searx import logger, settings, searx_debug from searx.exceptions import SearxSettingsException -from searx.search.processors import PROCESSORS +from searx.search.processors import processors from searx.search.checker import Checker from searx.shared import schedule, storage @@ -55,7 +55,7 @@ def run(): 'status': 'ok', 'engines': {} } - for name, processor in PROCESSORS.items(): + for name, processor in processors.items(): logger.debug('Checking %s engine', name) checker = Checker(processor) checker.run() diff --git a/searx/search/checker/impl.py b/searx/search/checker/impl.py index 6cc89704..ad45440e 100644 --- a/searx/search/checker/impl.py +++ b/searx/search/checker/impl.py @@ -11,9 +11,9 @@ from urllib.parse import urlparse import re from langdetect import detect_langs from langdetect.lang_detect_exception import LangDetectException -import httpx +import requests.exceptions -from searx import network, logger +from searx import poolrequests, logger from searx.results import ResultContainer from searx.search.models import SearchQuery, EngineRef from searx.search.processors import EngineProcessor @@ -75,8 +75,8 @@ def _is_url_image(image_url): while retry > 0: a = time() try: - network.set_timeout_for_thread(10.0, time()) - r = network.get(image_url, timeout=10.0, follow_redirects=True, headers={ + poolrequests.set_timeout_for_thread(10.0, time()) + r = poolrequests.get(image_url, timeout=10.0, allow_redirects=True, headers={ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US;q=0.5,en;q=0.3', @@ -90,10 +90,10 @@ def _is_url_image(image_url): if r.headers["content-type"].startswith('image/'): return True return False - except httpx.TimeoutException: + except requests.exceptions.Timeout: logger.error('Timeout for %s: %i', image_url, int(time() - a)) retry -= 1 - except httpx.HTTPError: + except requests.exceptions.RequestException: logger.exception('Exception for %s', image_url) return False diff --git a/searx/search/processors/__init__.py b/searx/search/processors/__init__.py index c2f6df17..4cae3cd0 100644 --- a/searx/search/processors/__init__.py +++ b/searx/search/processors/__init__.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: AGPL-3.0-or-later -import threading - from .online import OnlineProcessor from .offline import OfflineProcessor from .online_dictionary import OnlineDictionaryProcessor @@ -12,9 +10,9 @@ import searx.engines as engines __all__ = ['EngineProcessor', 'OfflineProcessor', 'OnlineProcessor', - 'OnlineDictionaryProcessor', 'OnlineCurrencyProcessor', 'PROCESSORS'] + 'OnlineDictionaryProcessor', 'OnlineCurrencyProcessor', 'processors'] logger = logger.getChild('search.processors') -PROCESSORS = {} +processors = {} def get_processor_class(engine_type): @@ -29,27 +27,15 @@ def get_processor(engine, engine_name): processor_class = get_processor_class(engine_type) if processor_class: return processor_class(engine, engine_name) - return None - - -def initialize_processor(processor): - """Initialize one processor - Call the init function of the engine - """ - if processor.has_initialize_function: - t = threading.Thread(target=processor.initialize, daemon=True) - t.start() + else: + return None def initialize(engine_list): - """Initialize all engines and store a processor for 
each engine in :py:obj:`PROCESSORS`.""" - for engine_data in engine_list: - engine_name = engine_data['name'] - engine = engines.engines.get(engine_name) - if engine: - processor = get_processor(engine, engine_name) - initialize_processor(processor) - if processor is None: - engine.logger.error('Error get processor for engine %s', engine_name) - else: - PROCESSORS[engine_name] = processor + engines.initialize_engines(engine_list) + for engine_name, engine in engines.engines.items(): + processor = get_processor(engine, engine_name) + if processor is None: + logger.error('Error get processor for engine %s', engine_name) + else: + processors[engine_name] = processor diff --git a/searx/search/processors/abstract.py b/searx/search/processors/abstract.py index e676b1f5..26dab069 100644 --- a/searx/search/processors/abstract.py +++ b/searx/search/processors/abstract.py @@ -2,32 +2,17 @@ from abc import abstractmethod, ABC from searx import logger -from searx.engines import engines -from searx.utils import get_engine_from_settings logger = logger.getChild('searx.search.processor') class EngineProcessor(ABC): + def __init__(self, engine, engine_name): self.engine = engine self.engine_name = engine_name - def initialize(self): - try: - self.engine.init(get_engine_from_settings(self.engine_name)) - except SearxEngineResponseException as exc: - logger.warn('Fail to initialize %s // %s', self.engine_name, exc) - except Exception: # pylint: disable=broad-except - logger.exception('Fail to initialize %s', self.engine_name) - else: - logger.debug('Initialized %s', self.engine_name) - - @property - def has_initialize_function(self): - return hasattr(self.engine, 'init') - def get_params(self, search_query, engine_category): # if paging is not supported, skip if search_query.pageno > 1 and not self.engine.paging: diff --git a/searx/search/processors/online.py b/searx/search/processors/online.py index 59471d14..dde34786 100644 --- a/searx/search/processors/online.py +++ b/searx/search/processors/online.py @@ -1,12 +1,12 @@ # SPDX-License-Identifier: AGPL-3.0-or-later +from urllib.parse import urlparse from time import time import threading -import asyncio -import httpx +import requests.exceptions -import searx.network +import searx.poolrequests as poolrequests from searx.engines import settings from searx import logger from searx.utils import gen_useragent @@ -64,6 +64,10 @@ class OnlineProcessor(EngineProcessor): auth=params['auth'] ) + # setting engine based proxies + if hasattr(self.engine, 'proxies'): + request_args['proxies'] = poolrequests.get_proxies(self.engine.proxies) + # max_redirects max_redirects = params.get('max_redirects') if max_redirects: @@ -82,9 +86,9 @@ class OnlineProcessor(EngineProcessor): # specific type of request (GET or POST) if params['method'] == 'GET': - req = searx.network.get + req = poolrequests.get else: - req = searx.network.post + req = poolrequests.post request_args['data'] = params['data'] @@ -96,8 +100,8 @@ class OnlineProcessor(EngineProcessor): # unexpected redirect : record an error # but the engine might still return valid results. 
status_code = str(response.status_code or '') - reason = response.reason_phrase or '' - hostname = response.url.host + reason = response.reason or '' + hostname = str(urlparse(response.url or '').netloc) record_error(self.engine_name, '{} redirects, maximum: {}'.format(len(response.history), soft_max_redirects), (status_code, reason, hostname)) @@ -125,14 +129,14 @@ class OnlineProcessor(EngineProcessor): def search(self, query, params, result_container, start_time, timeout_limit): # set timeout for all HTTP requests - searx.network.set_timeout_for_thread(timeout_limit, start_time=start_time) + poolrequests.set_timeout_for_thread(timeout_limit, start_time=start_time) # reset the HTTP total time - searx.network.reset_time_for_thread() - # set the network - searx.network.set_context_network_name(self.engine_name) + poolrequests.reset_time_for_thread() + # enable HTTP only if explicitly enabled + poolrequests.set_enable_http_protocol(self.engine.enable_http) # suppose everything will be alright - http_exception = False + requests_exception = False suspended_time = None try: @@ -146,7 +150,7 @@ class OnlineProcessor(EngineProcessor): # update engine time when there is no exception engine_time = time() - start_time - page_load_time = searx.network.get_time_for_thread() + page_load_time = poolrequests.get_time_for_thread() result_container.add_timing(self.engine_name, engine_time, page_load_time) with threading.RLock(): self.engine.stats['engine_time'] += engine_time @@ -159,27 +163,27 @@ class OnlineProcessor(EngineProcessor): # Timing engine_time = time() - start_time - page_load_time = searx.network.get_time_for_thread() + page_load_time = poolrequests.get_time_for_thread() result_container.add_timing(self.engine_name, engine_time, page_load_time) # Record the errors with threading.RLock(): self.engine.stats['errors'] += 1 - if (issubclass(e.__class__, (httpx.TimeoutException, asyncio.TimeoutError))): + if (issubclass(e.__class__, requests.exceptions.Timeout)): result_container.add_unresponsive_engine(self.engine_name, 'HTTP timeout') # requests timeout (connect or read) logger.error("engine {0} : HTTP requests timeout" "(search duration : {1} s, timeout: {2} s) : {3}" .format(self.engine_name, engine_time, timeout_limit, e.__class__.__name__)) - http_exception = True - elif (issubclass(e.__class__, (httpx.HTTPError, httpx.StreamError))): + requests_exception = True + elif (issubclass(e.__class__, requests.exceptions.RequestException)): result_container.add_unresponsive_engine(self.engine_name, 'HTTP error') # other requests exception logger.exception("engine {0} : requests exception" "(search duration : {1} s, timeout: {2} s) : {3}" .format(self.engine_name, engine_time, timeout_limit, e)) - http_exception = True + requests_exception = True elif (issubclass(e.__class__, SearxEngineCaptchaException)): result_container.add_unresponsive_engine(self.engine_name, 'CAPTCHA required') logger.exception('engine {0} : CAPTCHA'.format(self.engine_name)) @@ -203,7 +207,7 @@ class OnlineProcessor(EngineProcessor): # suspend the engine if there is an HTTP error # or suspended_time is defined with threading.RLock(): - if http_exception or suspended_time: + if requests_exception or suspended_time: # update continuous_errors / suspend_end_time self.engine.continuous_errors += 1 if suspended_time is None: diff --git a/searx/settings.yml b/searx/settings.yml index 5560eb7e..b4f61413 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -69,17 +69,19 @@ ui: # key : !!binary "your_morty_proxy_key" 
outgoing: # communication with search engines - request_timeout : 3.0 # default timeout in seconds, can be override by engine + request_timeout : 2.0 # default timeout in seconds, can be override by engine # max_request_timeout: 10.0 # the maximum timeout in seconds useragent_suffix : "" # suffix of searx_useragent, could contain informations like an email address to the administrator - pool_connections : 100 # The maximum number of concurrent connections that may be established. - pool_maxsize : 20 # Allow the connection pool to maintain keep-alive connections below this point. - enable_http2: True # See https://www.python-httpx.org/http2/ + pool_connections : 100 # Number of different hosts + pool_maxsize : 10 # Number of simultaneous requests by host # uncomment below section if you want to use a proxy # see https://2.python-requests.org/en/latest/user/advanced/#proxies # SOCKS proxies are also supported: see https://2.python-requests.org/en/latest/user/advanced/#socks # proxies: -# all://: +# http: +# - http://proxy1:8080 +# - http://proxy2:8080 +# https: # - http://proxy1:8080 # - http://proxy2:8080 # using_tor_proxy : True @@ -89,7 +91,6 @@ outgoing: # communication with search engines # source_ips: # - 1.1.1.1 # - 1.1.1.2 -# - fe80::/126 # External plugin configuration # See https://searx.github.io/searx/dev/plugins.html for more details @@ -1026,18 +1027,16 @@ engines: additional_tests: rosebud: *test_rosebud + - name : qwant images + engine : qwant + shortcut : qwi + disabled: True + categories : images + - name : qwant news engine : qwant shortcut : qwn categories : news - network: qwant - - - name: qwant images - engine: qwant - shortcut: qwi - categories: images - disabled: True - network: qwant - name: qwant videos engine: qwant diff --git a/searx/testing.py b/searx/testing.py index 3375bed6..c31595a7 100644 --- a/searx/testing.py +++ b/searx/testing.py @@ -10,8 +10,8 @@ import traceback from os.path import dirname, join, abspath, realpath +from unittest import TestCase from splinter import Browser -import aiounittest class SearxTestLayer: @@ -82,7 +82,7 @@ def run_robot_tests(tests): test(browser) -class SearxTestCase(aiounittest.AsyncTestCase): +class SearxTestCase(TestCase): """Base test case for non-robot tests.""" layer = SearxTestLayer diff --git a/searx/utils.py b/searx/utils.py index c46739cb..c60edf3b 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -45,7 +45,7 @@ def searx_useragent(): """Return the searx User Agent""" return 'searx/{searx_version} {suffix}'.format( searx_version=VERSION_STRING, - suffix=settings['outgoing'].get('useragent_suffix', '')).strip() + suffix=settings['outgoing'].get('useragent_suffix', '')) def gen_useragent(os=None): diff --git a/searx/webapp.py b/searx/webapp.py index 85c4ed7e..1b5e854f 100755 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -26,26 +26,12 @@ if __name__ == '__main__': from os.path import realpath, dirname sys.path.append(realpath(dirname(realpath(__file__)) + '/../')) -# set Unix thread name -try: - import setproctitle -except ImportError: - pass -else: - import threading - old_thread_init = threading.Thread.__init__ - - def new_thread_init(self, *args, **kwargs): - old_thread_init(self, *args, **kwargs) - setproctitle.setthreadtitle(self._name) - threading.Thread.__init__ = new_thread_init - import hashlib import hmac import json import os -import httpx +import requests from searx import logger logger = logger.getChild('webapp') @@ -94,7 +80,7 @@ from searx.plugins import plugins from searx.plugins.oa_doi_rewrite 
import get_doi_resolver
 from searx.preferences import Preferences, ValidationException, LANGUAGE_CODES
 from searx.answerers import answerers
-from searx.network import stream as http_stream, set_context_network_name
+from searx.poolrequests import get_global_proxies
 from searx.answerers import ask
 from searx.metrology.error_recorder import errors_per_engines
 from searx.settings_loader import get_default_settings_path
@@ -153,7 +139,7 @@ werkzeug_reloader = flask_run_development or (searx_debug and __name__ == "__mai
 # initialize the engines except on the first run of the werkzeug server.
 if not werkzeug_reloader\
         or (werkzeug_reloader and os.environ.get("WERKZEUG_RUN_MAIN") == "true"):
-    search_initialize(enable_checker=True, check_network=True)
+    search_initialize(enable_checker=True)
 
 babel = Babel(app)
 
@@ -921,84 +907,57 @@ def _is_selected_language_supported(engine, preferences):
 
 @app.route('/image_proxy', methods=['GET'])
 def image_proxy():
-    # pylint: disable=too-many-return-statements, too-many-branches
-
-    url = request.args.get('url')
+    url = request.args.get('url').encode()
     if not url:
         return '', 400
 
-    h = new_hmac(settings['server']['secret_key'], url.encode())
+    h = new_hmac(settings['server']['secret_key'], url)
     if h != request.args.get('h'):
         return '', 400
 
-    maximum_size = 5 * 1024 * 1024
-    forward_resp = False
-    resp = None
-    try:
-        request_headers = {
-            'User-Agent': gen_useragent(),
-            'Accept': 'image/webp,*/*',
-            'Accept-Encoding': 'gzip, deflate',
-            'Sec-GPC': '1',
-            'DNT': '1',
-        }
-        set_context_network_name('image_proxy')
-        stream = http_stream(
-            method='GET',
-            url=url,
-            headers=request_headers,
-            timeout=settings['outgoing']['request_timeout'],
-            follow_redirects=True,
-            max_redirects=20)
+    headers = {
+        'User-Agent': gen_useragent(),
+        'Accept': 'image/webp,*/*',
+        'Accept-Encoding': 'gzip, deflate',
+        'Sec-GPC': '1',
+        'DNT': '1',
+    }
+    headers.update(dict_subset(request.headers, {'If-Modified-Since', 'If-None-Match'}))
 
-        resp = next(stream)
-        content_length = resp.headers.get('Content-Length')
-        if content_length and content_length.isdigit() and int(content_length) > maximum_size:
-            return 'Max size', 400
+    resp = requests.get(url,
+                        stream=True,
+                        timeout=settings['outgoing']['request_timeout'],
+                        headers=headers,
+                        proxies=get_global_proxies())
 
-        if resp.status_code != 200:
-            logger.debug('image-proxy: wrong response code: {0}'.format(resp.status_code))
-            if resp.status_code >= 400:
-                return '', resp.status_code
-            return '', 400
+    if resp.status_code == 304:
+        return '', resp.status_code
 
-        if not resp.headers.get('Content-Type', '').startswith('image/'):
-            logger.debug('image-proxy: wrong content-type: %s', resp.headers.get('Content-Type', ''))
-            return '', 400
-
-        forward_resp = True
-    except httpx.HTTPError:
-        logger.exception('HTTP error')
+    if resp.status_code != 200:
+        logger.debug('image-proxy: wrong response code: {0}'.format(resp.status_code))
+        if resp.status_code >= 400:
+            return '', resp.status_code
         return '', 400
-    finally:
-        if resp and not forward_resp:
-            # the code is about to return an HTTP 400 error to the browser
-            # we make sure to close the response between searxng and the HTTP server
-            try:
-                resp.close()
-            except httpx.HTTPError:
-                logger.exception('HTTP error on closing')
-
-    try:
-        headers = dict_subset(
-            resp.headers,
-            {'Content-Type', 'Content-Encoding', 'Content-Length', 'Length'}
-        )
-
-        def forward_chunk():
-            total_length = 0
-            for chunk in stream:
-                total_length += len(chunk)
-                if total_length > maximum_size:
-                    break
-                yield chunk
-
-        
return Response(forward_chunk(), mimetype=resp.headers['Content-Type'], headers=headers) - except httpx.HTTPError: + if not resp.headers.get('content-type', '').startswith('image/'): + logger.debug('image-proxy: wrong content-type: {0}'.format(resp.headers.get('content-type'))) return '', 400 + img = b'' + chunk_counter = 0 + + for chunk in resp.iter_content(1024 * 1024): + chunk_counter += 1 + if chunk_counter > 5: + return '', 502 # Bad gateway - file is too big (>5M) + img += chunk + + headers = dict_subset(resp.headers, {'Content-Length', 'Length', 'Date', 'Last-Modified', 'Expires', 'Etag'}) + + return Response(img, mimetype=resp.headers['content-type'], headers=headers) + @app.route('/stats', methods=['GET']) def stats(): diff --git a/searx_extra/update/update_engine_descriptions.py b/searx_extra/update/update_engine_descriptions.py index cf9007da..109fdbfa 100755 --- a/searx_extra/update/update_engine_descriptions.py +++ b/searx_extra/update/update_engine_descriptions.py @@ -10,7 +10,7 @@ from searx.engines.wikidata import send_wikidata_query from searx.utils import extract_text import searx import searx.search -import searx.network +import searx.poolrequests SPARQL_WIKIPEDIA_ARTICLE = """ SELECT DISTINCT ?item ?name @@ -59,7 +59,7 @@ def get_wikipedia_summary(language, pageid): search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}' url = search_url.format(title=quote(pageid), language=language) try: - response = searx.network.get(url) + response = searx.poolrequests.get(url) response.raise_for_status() api_result = json.loads(response.text) return api_result.get('extract') @@ -89,7 +89,7 @@ def get_website_description(url, lang1, lang2=None): lang_list.append(lang2) headers['Accept-Language'] = f'{",".join(lang_list)};q=0.8' try: - response = searx.network.get(url, headers=headers, timeout=10) + response = searx.poolrequests.get(url, headers=headers, timeout=10) response.raise_for_status() except Exception: return (None, None) diff --git a/searx_extra/update/update_external_bangs.py b/searx_extra/update/update_external_bangs.py index cf5f93dc..b8849643 100755 --- a/searx_extra/update/update_external_bangs.py +++ b/searx_extra/update/update_external_bangs.py @@ -17,7 +17,7 @@ import json import re from os.path import join -import httpx +import requests from searx import searx_dir # pylint: disable=E0401 C0413 @@ -30,7 +30,7 @@ HTTP_COLON = 'http:' def get_bang_url(): - response = httpx.get(URL_BV1) + response = requests.get(URL_BV1) response.raise_for_status() r = RE_BANG_VERSION.findall(response.text) @@ -38,7 +38,7 @@ def get_bang_url(): def fetch_ddg_bangs(url): - response = httpx.get(url) + response = requests.get(url) response.raise_for_status() return json.loads(response.content.decode()) diff --git a/searx_extra/update/update_osm_keys_tags.py b/searx_extra/update/update_osm_keys_tags.py index f803d0c3..18ed4c06 100755 --- a/searx_extra/update/update_osm_keys_tags.py +++ b/searx_extra/update/update_osm_keys_tags.py @@ -45,7 +45,7 @@ import collections from pathlib import Path from searx import searx_dir -from searx.network import set_timeout_for_thread +from searx.poolrequests import set_timeout_for_thread from searx.engines.wikidata import send_wikidata_query from searx.languages import language_codes from searx.engines.openstreetmap import get_key_rank, VALUE_TO_LINK diff --git a/tests/unit/network/__init__.py b/tests/unit/network/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/unit/network/test_network.py 
b/tests/unit/network/test_network.py deleted file mode 100644 index 3f723452..00000000 --- a/tests/unit/network/test_network.py +++ /dev/null @@ -1,241 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later - -from mock import patch - -import httpx - -from searx.network.network import Network, NETWORKS, initialize -from searx.testing import SearxTestCase - - -class TestNetwork(SearxTestCase): - - def setUp(self): - initialize() - - def test_simple(self): - network = Network() - - self.assertEqual(next(network._local_addresses_cycle), None) - self.assertEqual(next(network._proxies_cycle), ()) - - def test_ipaddress_cycle(self): - network = NETWORKS['ipv6'] - self.assertEqual(next(network._local_addresses_cycle), '::') - self.assertEqual(next(network._local_addresses_cycle), '::') - - network = NETWORKS['ipv4'] - self.assertEqual(next(network._local_addresses_cycle), '0.0.0.0') - self.assertEqual(next(network._local_addresses_cycle), '0.0.0.0') - - network = Network(local_addresses=['192.168.0.1', '192.168.0.2']) - self.assertEqual(next(network._local_addresses_cycle), '192.168.0.1') - self.assertEqual(next(network._local_addresses_cycle), '192.168.0.2') - self.assertEqual(next(network._local_addresses_cycle), '192.168.0.1') - - network = Network(local_addresses=['192.168.0.0/30']) - self.assertEqual(next(network._local_addresses_cycle), '192.168.0.1') - self.assertEqual(next(network._local_addresses_cycle), '192.168.0.2') - self.assertEqual(next(network._local_addresses_cycle), '192.168.0.1') - self.assertEqual(next(network._local_addresses_cycle), '192.168.0.2') - - network = Network(local_addresses=['fe80::/10']) - self.assertEqual(next(network._local_addresses_cycle), 'fe80::1') - self.assertEqual(next(network._local_addresses_cycle), 'fe80::2') - self.assertEqual(next(network._local_addresses_cycle), 'fe80::3') - - with self.assertRaises(ValueError): - Network(local_addresses=['not_an_ip_address']) - - def test_proxy_cycles(self): - network = Network(proxies='http://localhost:1337') - self.assertEqual(next(network._proxies_cycle), (('all://', 'http://localhost:1337'),)) - - network = Network(proxies={ - 'https': 'http://localhost:1337', - 'http': 'http://localhost:1338' - }) - self.assertEqual(next(network._proxies_cycle), - (('https://', 'http://localhost:1337'), ('http://', 'http://localhost:1338'))) - self.assertEqual(next(network._proxies_cycle), - (('https://', 'http://localhost:1337'), ('http://', 'http://localhost:1338'))) - - network = Network(proxies={ - 'https': ['http://localhost:1337', 'http://localhost:1339'], - 'http': 'http://localhost:1338' - }) - self.assertEqual(next(network._proxies_cycle), - (('https://', 'http://localhost:1337'), ('http://', 'http://localhost:1338'))) - self.assertEqual(next(network._proxies_cycle), - (('https://', 'http://localhost:1339'), ('http://', 'http://localhost:1338'))) - - with self.assertRaises(ValueError): - Network(proxies=1) - - def test_get_kwargs_clients(self): - kwargs = { - 'verify': True, - 'max_redirects': 5, - 'timeout': 2, - 'allow_redirects': True, - } - kwargs_client = Network.extract_kwargs_clients(kwargs) - - self.assertEqual(len(kwargs_client), 2) - self.assertEqual(len(kwargs), 2) - - self.assertEqual(kwargs['timeout'], 2) - self.assertEqual(kwargs['follow_redirects'], True) - - self.assertTrue(kwargs_client['verify']) - self.assertEqual(kwargs_client['max_redirects'], 5) - - async def test_get_client(self): - network = Network(verify=True) - client1 = await network.get_client() - client2 = await 
network.get_client(verify=True) - client3 = await network.get_client(max_redirects=10) - client4 = await network.get_client(verify=True) - client5 = await network.get_client(verify=False) - client6 = await network.get_client(max_redirects=10) - - self.assertEqual(client1, client2) - self.assertEqual(client1, client4) - self.assertNotEqual(client1, client3) - self.assertNotEqual(client1, client5) - self.assertEqual(client3, client6) - - await network.aclose() - - async def test_aclose(self): - network = Network(verify=True) - await network.get_client() - await network.aclose() - - async def test_request(self): - a_text = 'Lorem Ipsum' - response = httpx.Response(status_code=200, text=a_text) - with patch.object(httpx.AsyncClient, 'request', return_value=response): - network = Network(enable_http=True) - response = await network.request('GET', 'https://example.com/') - self.assertEqual(response.text, a_text) - await network.aclose() - - -class TestNetworkRequestRetries(SearxTestCase): - - TEXT = 'Lorem Ipsum' - - @classmethod - def get_response_404_then_200(cls): - first = True - - async def get_response(*args, **kwargs): - nonlocal first - if first: - first = False - return httpx.Response(status_code=403, text=TestNetworkRequestRetries.TEXT) - return httpx.Response(status_code=200, text=TestNetworkRequestRetries.TEXT) - return get_response - - async def test_retries_ok(self): - with patch.object(httpx.AsyncClient, 'request', new=TestNetworkRequestRetries.get_response_404_then_200()): - network = Network(enable_http=True, retries=1, retry_on_http_error=403) - response = await network.request('GET', 'https://example.com/') - self.assertEqual(response.text, TestNetworkRequestRetries.TEXT) - await network.aclose() - - async def test_retries_fail_int(self): - with patch.object(httpx.AsyncClient, 'request', new=TestNetworkRequestRetries.get_response_404_then_200()): - network = Network(enable_http=True, retries=0, retry_on_http_error=403) - response = await network.request('GET', 'https://example.com/') - self.assertEqual(response.status_code, 403) - await network.aclose() - - async def test_retries_fail_list(self): - with patch.object(httpx.AsyncClient, 'request', new=TestNetworkRequestRetries.get_response_404_then_200()): - network = Network(enable_http=True, retries=0, retry_on_http_error=[403, 429]) - response = await network.request('GET', 'https://example.com/') - self.assertEqual(response.status_code, 403) - await network.aclose() - - async def test_retries_fail_bool(self): - with patch.object(httpx.AsyncClient, 'request', new=TestNetworkRequestRetries.get_response_404_then_200()): - network = Network(enable_http=True, retries=0, retry_on_http_error=True) - response = await network.request('GET', 'https://example.com/') - self.assertEqual(response.status_code, 403) - await network.aclose() - - async def test_retries_exception_then_200(self): - request_count = 0 - - async def get_response(*args, **kwargs): - nonlocal request_count - request_count += 1 - if request_count < 3: - raise httpx.RequestError('fake exception', request=None) - return httpx.Response(status_code=200, text=TestNetworkRequestRetries.TEXT) - - with patch.object(httpx.AsyncClient, 'request', new=get_response): - network = Network(enable_http=True, retries=2) - response = await network.request('GET', 'https://example.com/') - self.assertEqual(response.status_code, 200) - self.assertEqual(response.text, TestNetworkRequestRetries.TEXT) - await network.aclose() - - async def test_retries_exception(self): - async def 
get_response(*args, **kwargs): - raise httpx.RequestError('fake exception', request=None) - - with patch.object(httpx.AsyncClient, 'request', new=get_response): - network = Network(enable_http=True, retries=0) - with self.assertRaises(httpx.RequestError): - await network.request('GET', 'https://example.com/') - await network.aclose() - - -class TestNetworkStreamRetries(SearxTestCase): - - TEXT = 'Lorem Ipsum' - - @classmethod - def get_response_exception_then_200(cls): - first = True - - def stream(*args, **kwargs): - nonlocal first - if first: - first = False - raise httpx.RequestError('fake exception', request=None) - return httpx.Response(status_code=200, text=TestNetworkStreamRetries.TEXT) - return stream - - async def test_retries_ok(self): - with patch.object(httpx.AsyncClient, 'stream', new=TestNetworkStreamRetries.get_response_exception_then_200()): - network = Network(enable_http=True, retries=1, retry_on_http_error=403) - response = await network.stream('GET', 'https://example.com/') - self.assertEqual(response.text, TestNetworkStreamRetries.TEXT) - await network.aclose() - - async def test_retries_fail(self): - with patch.object(httpx.AsyncClient, 'stream', new=TestNetworkStreamRetries.get_response_exception_then_200()): - network = Network(enable_http=True, retries=0, retry_on_http_error=403) - with self.assertRaises(httpx.RequestError): - await network.stream('GET', 'https://example.com/') - await network.aclose() - - async def test_retries_exception(self): - first = True - - def stream(*args, **kwargs): - nonlocal first - if first: - first = False - return httpx.Response(status_code=403, text=TestNetworkRequestRetries.TEXT) - return httpx.Response(status_code=200, text=TestNetworkRequestRetries.TEXT) - - with patch.object(httpx.AsyncClient, 'stream', new=stream): - network = Network(enable_http=True, retries=0, retry_on_http_error=403) - response = await network.stream('GET', 'https://example.com/') - self.assertEqual(response.status_code, 403) - await network.aclose() diff --git a/tests/unit/test_engines_init.py b/tests/unit/test_engines_init.py index d2aee41f..c75637f2 100644 --- a/tests/unit/test_engines_init.py +++ b/tests/unit/test_engines_init.py @@ -23,7 +23,7 @@ class TestEnginesInit(SearxTestCase): engine_list = [{'engine': 'dummy', 'name': 'engine1', 'shortcut': 'e1', 'categories': 'general'}, {'engine': 'dummy', 'name': 'engine2', 'shortcut': 'e2', 'categories': 'onions'}] - engines.load_engines(engine_list) + engines.initialize_engines(engine_list) self.assertEqual(len(engines.engines), 1) self.assertIn('engine1', engines.engines) self.assertNotIn('onions', engines.categories) @@ -35,7 +35,7 @@ class TestEnginesInit(SearxTestCase): 'timeout': 20.0, 'onion_url': 'http://engine1.onion'}, {'engine': 'dummy', 'name': 'engine2', 'shortcut': 'e2', 'categories': 'onions'}] - engines.load_engines(engine_list) + engines.initialize_engines(engine_list) self.assertEqual(len(engines.engines), 2) self.assertIn('engine1', engines.engines) self.assertIn('engine2', engines.engines) diff --git a/tests/unit/test_poolrequests.py b/tests/unit/test_poolrequests.py new file mode 100644 index 00000000..b22685fd --- /dev/null +++ b/tests/unit/test_poolrequests.py @@ -0,0 +1,89 @@ +from unittest.mock import patch +from requests.models import Response + +from searx.testing import SearxTestCase + +import searx.poolrequests +from searx.poolrequests import get_proxy_cycles, get_proxies + + +CONFIG = {'http': ['http://localhost:9090', 'http://localhost:9092'], + 'https': 
['http://localhost:9091', 'http://localhost:9093']} + + +class TestProxy(SearxTestCase): + + def test_noconfig(self): + cycles = get_proxy_cycles(None) + self.assertIsNone(cycles) + + cycles = get_proxy_cycles(False) + self.assertIsNone(cycles) + + def test_oldconfig(self): + config = { + 'http': 'http://localhost:9090', + 'https': 'http://localhost:9091', + } + cycles = get_proxy_cycles(config) + self.assertEqual(next(cycles['http']), 'http://localhost:9090') + self.assertEqual(next(cycles['http']), 'http://localhost:9090') + self.assertEqual(next(cycles['https']), 'http://localhost:9091') + self.assertEqual(next(cycles['https']), 'http://localhost:9091') + + def test_one_proxy(self): + config = { + 'http': ['http://localhost:9090'], + 'https': ['http://localhost:9091'], + } + cycles = get_proxy_cycles(config) + self.assertEqual(next(cycles['http']), 'http://localhost:9090') + self.assertEqual(next(cycles['http']), 'http://localhost:9090') + self.assertEqual(next(cycles['https']), 'http://localhost:9091') + self.assertEqual(next(cycles['https']), 'http://localhost:9091') + + def test_multiple_proxies(self): + cycles = get_proxy_cycles(CONFIG) + self.assertEqual(next(cycles['http']), 'http://localhost:9090') + self.assertEqual(next(cycles['http']), 'http://localhost:9092') + self.assertEqual(next(cycles['http']), 'http://localhost:9090') + self.assertEqual(next(cycles['https']), 'http://localhost:9091') + self.assertEqual(next(cycles['https']), 'http://localhost:9093') + self.assertEqual(next(cycles['https']), 'http://localhost:9091') + + def test_getproxies_none(self): + self.assertIsNone(get_proxies(None)) + + def test_getproxies_config(self): + cycles = get_proxy_cycles(CONFIG) + self.assertEqual(get_proxies(cycles), { + 'http': 'http://localhost:9090', + 'https': 'http://localhost:9091' + }) + self.assertEqual(get_proxies(cycles), { + 'http': 'http://localhost:9092', + 'https': 'http://localhost:9093' + }) + + @patch('searx.poolrequests.get_global_proxies') + def test_request(self, mock_get_global_proxies): + method = 'GET' + url = 'http://localhost' + custom_proxies = { + 'https': 'http://localhost:1080' + } + global_proxies = { + 'http': 'http://localhost:9092', + 'https': 'http://localhost:9093' + } + mock_get_global_proxies.return_value = global_proxies + + # check the global proxies usage + with patch.object(searx.poolrequests.SessionSinglePool, 'request', return_value=Response()) as mock_method: + searx.poolrequests.request(method, url) + mock_method.assert_called_once_with(method=method, url=url, proxies=global_proxies) + + # check if the proxies parameter overrides the global proxies + with patch.object(searx.poolrequests.SessionSinglePool, 'request', return_value=Response()) as mock_method: + searx.poolrequests.request(method, url, proxies=custom_proxies) + mock_method.assert_called_once_with(method=method, url=url, proxies=custom_proxies) diff --git a/tests/unit/test_query.py b/tests/unit/test_query.py index 83be9ef3..7a79ce24 100644 --- a/tests/unit/test_query.py +++ b/tests/unit/test_query.py @@ -1,9 +1,11 @@ -from searx import settings -from searx.engines import load_engines +from mock import patch + from searx.search import initialize from searx.query import RawTextQuery from searx.testing import SearxTestCase +import searx.engines + TEST_ENGINES = [ { @@ -279,6 +281,10 @@ class TestBang(SearxTestCase): self.assertEqual(query.getQuery(), '!dum the query') def test_bang_autocomplete_empty(self): - load_engines(settings['engines']) - query = RawTextQuery('the query !', []) 
- self.assertEqual(query.autocomplete_list, ['!images', '!wikipedia', '!osm']) + with patch.object(searx.engines, 'initialize_engines', searx.engines.load_engines): + initialize() + query = RawTextQuery('the query !', []) + self.assertEqual(query.autocomplete_list, ['!images', '!wikipedia', '!osm']) + + query = RawTextQuery('the query ?', ['osm']) + self.assertEqual(query.autocomplete_list, ['?images', '?wikipedia']) diff --git a/tests/unit/test_webapp.py b/tests/unit/test_webapp.py index f865ef03..9c598a16 100644 --- a/tests/unit/test_webapp.py +++ b/tests/unit/test_webapp.py @@ -3,20 +3,16 @@ import json from urllib.parse import ParseResult from mock import Mock - -import searx.search.processors from searx.testing import SearxTestCase from searx.search import Search +import searx.engines class ViewsTestCase(SearxTestCase): def setUp(self): # skip init function (no external HTTP request) - def dummy(*args, **kwargs): - pass - - self.setattr4test(searx.search.processors, 'initialize_processor', dummy) + self.setattr4test(searx.engines, 'initialize_engines', searx.engines.load_engines) from searx import webapp # pylint disable=import-outside-toplevel
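
A minimal, self-contained sketch of the round-robin proxy rotation that the new searx/poolrequests.py module implements with itertools.cycle (get_proxy_cycles / get_proxies); it is an illustration only, not part of the patch, and the proxy URLs are the placeholder ones from settings.yml:

# sketch only -- mirrors the behaviour of searx.poolrequests proxy cycling
from itertools import cycle

def get_proxy_cycles(proxy_settings):
    # turn each protocol's entry (single string or list of URLs) into an endless cycle
    if not proxy_settings:
        return None
    return {protocol: cycle([proxy] if isinstance(proxy, str) else proxy)
            for protocol, proxy in proxy_settings.items()}

def get_proxies(proxy_cycles):
    # draw the next proxy per protocol for one outgoing request
    if proxy_cycles:
        return {protocol: next(proxy_cycle) for protocol, proxy_cycle in proxy_cycles.items()}
    return None

cycles = get_proxy_cycles({'http': ['http://proxy1:8080', 'http://proxy2:8080'],
                           'https': 'http://proxy1:8080'})
print(get_proxies(cycles))  # {'http': 'http://proxy1:8080', 'https': 'http://proxy1:8080'}
print(get_proxies(cycles))  # {'http': 'http://proxy2:8080', 'https': 'http://proxy1:8080'}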