Pick image_proxy changes from searxng (#2965)

* [mod] /image_proxy: don't decompress images

* [fix] image_proxy: always close the httpx response

previously, when the content type was not an image or some other error occurred,
the httpx response was not closed

* [mod] /image_proxy: use HTTP/1 instead of HTTP/2

httpx: HTTP/2 is slow when a lot of data is downloaded.
https://github.com/dalf/pyhttp-benchmark

also, the usage of HTTP/1 decreases the load average

* [mod] searx.utils.dict_subset: rewrite with comprehension

Co-authored-by: Alexandre Flament <alex@al-f.net>
This commit is contained in:
Noémi Ványi 2022-01-22 13:49:00 +01:00 committed by GitHub
parent ad7e00ad03
commit ea38fea711
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 72 additions and 19 deletions

View File

@ -5,6 +5,7 @@ import threading
import concurrent.futures import concurrent.futures
from time import time from time import time
from queue import SimpleQueue from queue import SimpleQueue
from types import MethodType
import httpx import httpx
import h2.exceptions import h2.exceptions
@ -134,15 +135,27 @@ async def stream_chunk_to_queue(network, q, method, url, **kwargs):
try: try:
async with await network.stream(method, url, **kwargs) as response: async with await network.stream(method, url, **kwargs) as response:
q.put(response) q.put(response)
# aiter_raw: access the raw bytes on the response without applying any HTTP content decoding
# https://www.python-httpx.org/quickstart/#streaming-responses
async for chunk in response.aiter_bytes(65536): async for chunk in response.aiter_bytes(65536):
if len(chunk) > 0: if len(chunk) > 0:
q.put(chunk) q.put(chunk)
except httpx.ResponseClosed as e:
# the response was closed
pass
except (httpx.HTTPError, OSError, h2.exceptions.ProtocolError) as e: except (httpx.HTTPError, OSError, h2.exceptions.ProtocolError) as e:
q.put(e) q.put(e)
finally: finally:
q.put(None) q.put(None)
def _close_response_method(self):
asyncio.run_coroutine_threadsafe(
self.aclose(),
get_loop()
)
def stream(method, url, **kwargs): def stream(method, url, **kwargs):
"""Replace httpx.stream. """Replace httpx.stream.
@ -158,10 +171,18 @@ def stream(method, url, **kwargs):
q = SimpleQueue() q = SimpleQueue()
future = asyncio.run_coroutine_threadsafe(stream_chunk_to_queue(get_network(), q, method, url, **kwargs), future = asyncio.run_coroutine_threadsafe(stream_chunk_to_queue(get_network(), q, method, url, **kwargs),
get_loop()) get_loop())
# yield response
response = q.get()
if isinstance(response, Exception):
raise response
response.close = MethodType(_close_response_method, response)
yield response
# yield chunks
chunk_or_exception = q.get() chunk_or_exception = q.get()
while chunk_or_exception is not None: while chunk_or_exception is not None:
if isinstance(chunk_or_exception, Exception): if isinstance(chunk_or_exception, Exception):
raise chunk_or_exception raise chunk_or_exception
yield chunk_or_exception yield chunk_or_exception
chunk_or_exception = q.get() chunk_or_exception = q.get()
return future.result() future.result()

View File

@ -326,6 +326,14 @@ def initialize(settings_engines=None, settings_outgoing=None):
if isinstance(network, str): if isinstance(network, str):
NETWORKS[engine_name] = NETWORKS[network] NETWORKS[engine_name] = NETWORKS[network]
# the /image_proxy endpoint has a dedicated network.
# same parameters as the default network, but HTTP/2 is disabled.
# It decreases the CPU load average, and the total time is more or less the same
if 'image_proxy' not in NETWORKS:
image_proxy_params = default_params.copy()
image_proxy_params['enable_http2'] = False
NETWORKS['image_proxy'] = new_network(image_proxy_params)
@atexit.register @atexit.register
def done(): def done():

View File

@ -272,11 +272,7 @@ def dict_subset(d, properties):
>>> >> dict_subset({'A': 'a', 'B': 'b', 'C': 'c'}, ['A', 'D']) >>> >> dict_subset({'A': 'a', 'B': 'b', 'C': 'c'}, ['A', 'D'])
{'A': 'a'} {'A': 'a'}
""" """
result = {} return {k: d[k] for k in properties if k in d}
for k in properties:
if k in d:
result[k] = d[k]
return result
def get_torrent_size(filesize, filesize_multiplier): def get_torrent_size(filesize, filesize_multiplier):

View File

@ -94,7 +94,7 @@ from searx.plugins import plugins
from searx.plugins.oa_doi_rewrite import get_doi_resolver from searx.plugins.oa_doi_rewrite import get_doi_resolver
from searx.preferences import Preferences, ValidationException, LANGUAGE_CODES from searx.preferences import Preferences, ValidationException, LANGUAGE_CODES
from searx.answerers import answerers from searx.answerers import answerers
from searx.network import stream as http_stream from searx.network import stream as http_stream, set_context_network_name
from searx.answerers import ask from searx.answerers import ask
from searx.metrology.error_recorder import errors_per_engines from searx.metrology.error_recorder import errors_per_engines
from searx.settings_loader import get_default_settings_path from searx.settings_loader import get_default_settings_path
@ -921,6 +921,8 @@ def _is_selected_language_supported(engine, preferences):
@app.route('/image_proxy', methods=['GET']) @app.route('/image_proxy', methods=['GET'])
def image_proxy(): def image_proxy():
# pylint: disable=too-many-return-statements, too-many-branches
url = request.args.get('url') url = request.args.get('url')
if not url: if not url:
@ -932,14 +934,21 @@ def image_proxy():
return '', 400 return '', 400
maximum_size = 5 * 1024 * 1024 maximum_size = 5 * 1024 * 1024
forward_resp = False
resp = None
try: try:
headers = dict_subset(request.headers, {'If-Modified-Since', 'If-None-Match'}) request_headers = {
headers['User-Agent'] = gen_useragent() 'User-Agent': gen_useragent(),
'Accept': 'image/webp,*/*',
'Accept-Encoding': 'gzip, deflate',
'Sec-GPC': '1',
'DNT': '1',
}
set_context_network_name('image_proxy')
stream = http_stream( stream = http_stream(
method='GET', method='GET',
url=url, url=url,
headers=headers, headers=request_headers,
timeout=settings['outgoing']['request_timeout'], timeout=settings['outgoing']['request_timeout'],
follow_redirects=True, follow_redirects=True,
max_redirects=20) max_redirects=20)
@ -949,25 +958,37 @@ def image_proxy():
if content_length and content_length.isdigit() and int(content_length) > maximum_size: if content_length and content_length.isdigit() and int(content_length) > maximum_size:
return 'Max size', 400 return 'Max size', 400
if resp.status_code == 304:
return '', resp.status_code
if resp.status_code != 200: if resp.status_code != 200:
logger.debug('image-proxy: wrong response code: {0}'.format(resp.status_code)) logger.debug('image-proxy: wrong response code: {0}'.format(resp.status_code))
if resp.status_code >= 400: if resp.status_code >= 400:
return '', resp.status_code return '', resp.status_code
return '', 400 return '', 400
if not resp.headers.get('content-type', '').startswith('image/'): if not resp.headers.get('Content-Type', '').startswith('image/'):
logger.debug('image-proxy: wrong content-type: {0}'.format(resp.headers.get('content-type'))) logger.debug('image-proxy: wrong content-type: %s', resp.headers.get('Content-Type', ''))
return '', 400 return '', 400
headers = dict_subset(resp.headers, {'Content-Length', 'Length', 'Date', 'Last-Modified', 'Expires', 'Etag'}) forward_resp = True
except httpx.HTTPError:
logger.exception('HTTP error')
return '', 400
finally:
if resp and not forward_resp:
# the code is about to return an HTTP 400 error to the browser
# we make sure to close the response between searxng and the HTTP server
try:
resp.close()
except httpx.HTTPError:
logger.exception('HTTP error on closing')
total_length = 0 try:
headers = dict_subset(
resp.headers,
{'Content-Type', 'Content-Encoding', 'Content-Length', 'Length'}
)
def forward_chunk(): def forward_chunk():
nonlocal total_length total_length = 0
for chunk in stream: for chunk in stream:
total_length += len(chunk) total_length += len(chunk)
if total_length > maximum_size: if total_length > maximum_size:
@ -1148,6 +1169,13 @@ def run():
) )
def patch_application(app):
# serve pages with HTTP/1.1
WSGIRequestHandler.protocol_version = "HTTP/{}".format(settings['server']['http_protocol_version'])
# patch app to handle non root url-s behind proxy & wsgi
app.wsgi_app = ReverseProxyPathFix(ProxyFix(app.wsgi_app))
class ReverseProxyPathFix: class ReverseProxyPathFix:
'''Wrap the application in this middleware and configure the '''Wrap the application in this middleware and configure the
front-end server to add these headers, to let you quietly bind front-end server to add these headers, to let you quietly bind