diff --git a/searx/network/__init__.py b/searx/network/__init__.py
index 3794fcca..93427d91 100644
--- a/searx/network/__init__.py
+++ b/searx/network/__init__.py
@@ -5,6 +5,7 @@ import threading
 import concurrent.futures
 from time import time
 from queue import SimpleQueue
+from types import MethodType
 
 import httpx
 import h2.exceptions
@@ -134,15 +135,27 @@ async def stream_chunk_to_queue(network, q, method, url, **kwargs):
     try:
         async with await network.stream(method, url, **kwargs) as response:
             q.put(response)
-            async for chunk in response.aiter_bytes(65536):
+            # aiter_raw: access the raw bytes on the response without applying any HTTP content decoding
+            # https://www.python-httpx.org/quickstart/#streaming-responses
+            async for chunk in response.aiter_raw(65536):
                 if len(chunk) > 0:
                     q.put(chunk)
+    except httpx.ResponseClosed as e:
+        # the response was closed
+        pass
     except (httpx.HTTPError, OSError, h2.exceptions.ProtocolError) as e:
         q.put(e)
     finally:
         q.put(None)
 
 
+def _close_response_method(self):
+    asyncio.run_coroutine_threadsafe(
+        self.aclose(),
+        get_loop()
+    )
+
+
 def stream(method, url, **kwargs):
     """Replace httpx.stream.
 
@@ -158,10 +171,18 @@ def stream(method, url, **kwargs):
     q = SimpleQueue()
     future = asyncio.run_coroutine_threadsafe(stream_chunk_to_queue(get_network(), q, method, url, **kwargs),
                                               get_loop())
+    # yield response
+    response = q.get()
+    if isinstance(response, Exception):
+        raise response
+    response.close = MethodType(_close_response_method, response)
+    yield response
+
+    # yield chunks
     chunk_or_exception = q.get()
     while chunk_or_exception is not None:
         if isinstance(chunk_or_exception, Exception):
             raise chunk_or_exception
         yield chunk_or_exception
         chunk_or_exception = q.get()
-    return future.result()
+    future.result()
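The reworked stream() above is now a generator: the first item it yields is the httpx response object, with a thread-safe close() attached via MethodType, and the following items are the raw body chunks; any exception queued by stream_chunk_to_queue is re-raised in the consuming thread. A minimal consumption sketch, not part of the patch — fetch_small_image is a hypothetical helper, and only the searx.network API shown in this diff is assumed:

    # Sketch: consume the generator contract introduced by stream() above.
    # http_stream is searx.network.stream as modified in this diff.
    from searx.network import stream as http_stream

    def fetch_small_image(url, maximum_size=5 * 1024 * 1024):
        chunks = http_stream(method='GET', url=url)
        response = next(chunks)        # first item: the httpx response, close() patched in
        try:
            if response.status_code != 200:
                return None
            body = b''
            for chunk in chunks:       # remaining items: raw, undecoded byte chunks
                body += chunk
                if len(body) > maximum_size:
                    return None
            return body
        finally:
            response.close()           # schedules response.aclose() on the network's event loop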
diff --git a/searx/network/network.py b/searx/network/network.py
index 25399ddc..1537cd0c 100644
--- a/searx/network/network.py
+++ b/searx/network/network.py
@@ -326,6 +326,14 @@ def initialize(settings_engines=None, settings_outgoing=None):
         if isinstance(network, str):
             NETWORKS[engine_name] = NETWORKS[network]
 
+    # the /image_proxy endpoint has a dedicated network.
+    # same parameters as the default network, but HTTP/2 is disabled.
+    # It decreases the CPU load average, and the total time is more or less the same
+    if 'image_proxy' not in NETWORKS:
+        image_proxy_params = default_params.copy()
+        image_proxy_params['enable_http2'] = False
+        NETWORKS['image_proxy'] = new_network(image_proxy_params)
+
 
 @atexit.register
 def done():
diff --git a/searx/utils.py b/searx/utils.py
index 9aea9bb0..c46739cb 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -272,11 +272,7 @@ def dict_subset(d, properties):
     >>> >> dict_subset({'A': 'a', 'B': 'b', 'C': 'c'}, ['A', 'D'])
     {'A': 'a'}
     """
-    result = {}
-    for k in properties:
-        if k in d:
-            result[k] = d[k]
-    return result
+    return {k: d[k] for k in properties if k in d}
 
 
 def get_torrent_size(filesize, filesize_multiplier):
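The network.py hunk above registers a dedicated 'image_proxy' network that copies the default parameters but disables HTTP/2 to lower the CPU load. A request opts into it by selecting that network name for the current context before calling the streaming helper, which is what the /image_proxy changes below do. A rough sketch of that selection — proxy_request is a hypothetical helper; set_context_network_name and stream are assumed to behave as used in this diff:

    # Sketch: route one request through the dedicated HTTP/1.1 'image_proxy' network.
    from searx.network import set_context_network_name, stream as http_stream

    def proxy_request(url):
        # select NETWORKS['image_proxy'] (enable_http2=False) for the current context
        set_context_network_name('image_proxy')
        chunks = http_stream(method='GET', url=url)
        response = next(chunks)   # response object first, then the raw body chunks
        return response, chunks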
diff --git a/searx/webapp.py b/searx/webapp.py
index 20431dcb..85c4ed7e 100755
--- a/searx/webapp.py
+++ b/searx/webapp.py
@@ -94,7 +94,7 @@ from searx.plugins import plugins
 from searx.plugins.oa_doi_rewrite import get_doi_resolver
 from searx.preferences import Preferences, ValidationException, LANGUAGE_CODES
 from searx.answerers import answerers
-from searx.network import stream as http_stream
+from searx.network import stream as http_stream, set_context_network_name
 from searx.answerers import ask
 from searx.metrology.error_recorder import errors_per_engines
 from searx.settings_loader import get_default_settings_path
@@ -921,6 +921,8 @@ def _is_selected_language_supported(engine, preferences):
 
 @app.route('/image_proxy', methods=['GET'])
 def image_proxy():
+    # pylint: disable=too-many-return-statements, too-many-branches
+
     url = request.args.get('url')
 
     if not url:
@@ -932,14 +934,21 @@ def image_proxy():
         return '', 400
 
     maximum_size = 5 * 1024 * 1024
-
+    forward_resp = False
+    resp = None
     try:
-        headers = dict_subset(request.headers, {'If-Modified-Since', 'If-None-Match'})
-        headers['User-Agent'] = gen_useragent()
+        request_headers = {
+            'User-Agent': gen_useragent(),
+            'Accept': 'image/webp,*/*',
+            'Accept-Encoding': 'gzip, deflate',
+            'Sec-GPC': '1',
+            'DNT': '1',
+        }
+        set_context_network_name('image_proxy')
         stream = http_stream(
             method='GET',
             url=url,
-            headers=headers,
+            headers=request_headers,
             timeout=settings['outgoing']['request_timeout'],
             follow_redirects=True,
             max_redirects=20)
@@ -949,25 +958,37 @@ def image_proxy():
         if content_length and content_length.isdigit() and int(content_length) > maximum_size:
             return 'Max size', 400
 
-        if resp.status_code == 304:
-            return '', resp.status_code
-
         if resp.status_code != 200:
             logger.debug('image-proxy: wrong response code: {0}'.format(resp.status_code))
             if resp.status_code >= 400:
                 return '', resp.status_code
             return '', 400
 
-        if not resp.headers.get('content-type', '').startswith('image/'):
-            logger.debug('image-proxy: wrong content-type: {0}'.format(resp.headers.get('content-type')))
+        if not resp.headers.get('Content-Type', '').startswith('image/'):
+            logger.debug('image-proxy: wrong content-type: %s', resp.headers.get('Content-Type', ''))
             return '', 400
 
-        headers = dict_subset(resp.headers, {'Content-Length', 'Length', 'Date', 'Last-Modified', 'Expires', 'Etag'})
+        forward_resp = True
+    except httpx.HTTPError:
+        logger.exception('HTTP error')
+        return '', 400
+    finally:
+        if resp and not forward_resp:
+            # the code is about to return an HTTP 400 error to the browser
+            # we make sure to close the response between searxng and the HTTP server
+            try:
+                resp.close()
+            except httpx.HTTPError:
+                logger.exception('HTTP error on closing')
 
-        total_length = 0
+    try:
+        headers = dict_subset(
+            resp.headers,
+            {'Content-Type', 'Content-Encoding', 'Content-Length', 'Length'}
+        )
 
         def forward_chunk():
-            nonlocal total_length
+            total_length = 0
             for chunk in stream:
                 total_length += len(chunk)
                 if total_length > maximum_size:
@@ -1148,6 +1169,13 @@ def run():
     )
 
 
+def patch_application(app):
+    # serve pages with HTTP/1.1
+    WSGIRequestHandler.protocol_version = "HTTP/{}".format(settings['server']['http_protocol_version'])
+    # patch app to handle non root url-s behind proxy & wsgi
+    app.wsgi_app = ReverseProxyPathFix(ProxyFix(app.wsgi_app))
+
+
 class ReverseProxyPathFix:
     '''Wrap the application in this middleware and configure the
     front-end server to add these headers, to let you quietly bind