Add scheme to img_src and thumbnail_src if missing from URL

Closes #3092
This commit is contained in:
Noémi Ványi 2022-01-22 11:59:21 +01:00
parent 148090df12
commit fd9d6b58d5
2 changed files with 19 additions and 3 deletions

View File

@ -6,6 +6,7 @@ from urllib.parse import urlparse, unquote
from searx import logger
from searx.engines import engines
from searx.metrology.error_recorder import record_error
from searx.utils import add_scheme_to_url
from searx import settings
@ -240,10 +241,15 @@ class ResultContainer:
result['parsed_url'] = urlparse(result['url'])
# if the result has no scheme, use http as default
if not result['parsed_url'].scheme:
result['parsed_url'] = result['parsed_url']._replace(scheme="http")
if not result['parsed_url'].scheme or result['parsed_url'].scheme == '':
result['parsed_url'] = result['parsed_url']._replace(scheme='http')
result['url'] = result['parsed_url'].geturl()
if 'thumbnail_src' in result:
result['thumbnail_src'] = add_scheme_to_url(result['thumbnail_src'])
if 'img_src' in result:
result['img_src'] = add_scheme_to_url(result['img_src'])
result['engines'] = set([result['engine']])
# strip multiple spaces and carriage returns from content

View File

@ -7,7 +7,7 @@ from numbers import Number
from os.path import splitext, join
from random import choice
from html.parser import HTMLParser
from urllib.parse import urljoin, urlparse
from urllib.parse import urljoin, urlparse, urlunparse
from lxml import html
from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult
@ -214,6 +214,16 @@ def normalize_url(url, base_url):
return url
def add_scheme_to_url(url, scheme="https"):
    """Return ``url`` with ``scheme`` added when it has no scheme of its own.

    Protocol-relative URLs such as ``//example.org/a.png`` become
    ``https://example.org/a.png``.  URLs that already carry a scheme are
    returned unchanged, and so are empty values (``''`` or ``None``): there
    is nothing to complete, and without this guard an empty string would be
    turned into the bogus, truthy URL ``https://``.

    :param url: URL to complete.
    :param scheme: scheme to use when ``url`` has none (default ``https``).
    :returns: the URL with a scheme, or ``url`` itself when left untouched.
    """
    # Nothing to complete; keep falsy values falsy for callers/templates.
    if not url:
        return url
    parsed = urlparse(url)
    if parsed.scheme == '':
        return urlunparse(parsed._replace(scheme=scheme))
    return url
def extract_url(xpath_results, base_url):
"""Extract and normalize URL from lxml Element