mirror of
https://github.com/searx/searx
synced 2024-11-29 05:30:10 +01:00
Add scheme to img_src and thumbnail_src if missing from URL
Closes #3092
This commit is contained in:
parent
148090df12
commit
fd9d6b58d5
@ -6,6 +6,7 @@ from urllib.parse import urlparse, unquote
|
||||
from searx import logger
|
||||
from searx.engines import engines
|
||||
from searx.metrology.error_recorder import record_error
|
||||
from searx.utils import add_scheme_to_url
|
||||
from searx import settings
|
||||
|
||||
|
||||
@ -240,10 +241,15 @@ class ResultContainer:
|
||||
result['parsed_url'] = urlparse(result['url'])
|
||||
|
||||
# if the result has no scheme, use http as default
|
||||
if not result['parsed_url'].scheme:
|
||||
result['parsed_url'] = result['parsed_url']._replace(scheme="http")
|
||||
if not result['parsed_url'].scheme or result['parsed_url'].scheme == '':
|
||||
result['parsed_url'] = result['parsed_url']._replace(scheme='http')
|
||||
result['url'] = result['parsed_url'].geturl()
|
||||
|
||||
if 'thumbnail_src' in result:
|
||||
result['thumbnail_src'] = add_scheme_to_url(result['thumbnail_src'])
|
||||
if 'img_src' in result:
|
||||
result['img_src'] = add_scheme_to_url(result['img_src'])
|
||||
|
||||
result['engines'] = set([result['engine']])
|
||||
|
||||
# strip multiple spaces and carriage returns from content
|
||||
|
@ -7,7 +7,7 @@ from numbers import Number
|
||||
from os.path import splitext, join
|
||||
from random import choice
|
||||
from html.parser import HTMLParser
|
||||
from urllib.parse import urljoin, urlparse
|
||||
from urllib.parse import urljoin, urlparse, urlunparse
|
||||
|
||||
from lxml import html
|
||||
from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult
|
||||
@ -214,6 +214,16 @@ def normalize_url(url, base_url):
|
||||
return url
|
||||
|
||||
|
||||
def add_scheme_to_url(url, scheme="https"):
    """Add a scheme to ``url`` if it does not already have one.

    Args:
        url: The URL to complete. Protocol-relative URLs
            (``//host/path``) and host-first URLs (``host/path``) are
            both supported.
        scheme: Scheme to use when ``url`` has none. Defaults to ``"https"``.

    Returns:
        ``url`` with ``scheme`` applied when the URL has no scheme and a
        host can be identified. The input is returned unchanged when it
        is empty, is not a ``str``, already carries a scheme, or contains
        no host (root-relative paths such as ``/static/a.png``,
        query-only or fragment-only references).

    Examples:
        >>> add_scheme_to_url('//example.com/a.png')
        'https://example.com/a.png'
        >>> add_scheme_to_url('example.com/a.png', scheme='http')
        'http://example.com/a.png'
        >>> add_scheme_to_url('')
        ''
    """
    # An empty value must stay falsy: turning '' into 'https://' would
    # make "is there an image?" checks succeed on a URL that points nowhere.
    if not isinstance(url, str) or not url:
        return url

    parsed = urlparse(url)
    if parsed.scheme:
        return url

    if not parsed.netloc:
        # urlparse only recognises a host after a leading '//'. Without
        # it, 'example.com/a.png' is parsed as a bare path, and adding a
        # scheme to a host-less result yields a malformed URL. Re-parse
        # with the marker so the first segment is treated as the host.
        try:
            parsed = urlparse('//' + url)
        except ValueError:
            # The first segment is not a valid host (e.g. an unbalanced
            # '['); leave the value untouched rather than raise.
            return url
        if not parsed.netloc:
            # Still no host: a scheme cannot be meaningfully attached.
            return url

    return urlunparse(parsed._replace(scheme=scheme))
|
||||
|
||||
|
||||
def extract_url(xpath_results, base_url):
|
||||
"""Extract and normalize URL from lxml Element
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user