Remove httpx and use requests instead (#3305)
## What does this PR do?

This PR prepares for removing `httpx`, and reverts back to `requests`.

## Why is this change important?

`httpx` hasn't proven itself to be faster or better than `requests`. On the other hand it has caused issues on Windows.

=============================================
Please update your environment to use requests instead of httpx.
=============================================
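As a rough, hedged illustration (not a file from this PR) of the shape of the change applied across the engines: the httpx-flavoured wrapper imports and exception types give way to requests-flavoured ones. The URL below is only an example.

```python
# Hedged sketch of the typical call-site change in this PR.
# before:  from searx.network import get;        from httpx import HTTPError
# after:   from searx.poolrequests import get;   from requests import RequestException
import requests

def fetch_text(url: str) -> str:
    try:
        resp = requests.get(url, timeout=2.0)
        resp.raise_for_status()
        return resp.text
    except requests.RequestException:
        # with httpx this handler used to catch httpx.HTTPError
        return ''

print(len(fetch_text('https://example.org/')))
```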
parent 210e59c68c
commit 85034b49ef
@@ -130,12 +130,14 @@ Global Settings
  request_timeout : 2.0 # default timeout in seconds, can be override by engine
  # max_request_timeout: 10.0 # the maximum timeout in seconds
  useragent_suffix : "" # informations like an email address to the administrator
  pool_connections : 100 # Maximum number of allowable connections, or None for no limits. The default is 100.
  pool_maxsize : 10 # Number of allowable keep-alive connections, or None to always allow. The default is 10.
  enable_http2: True # See https://www.python-httpx.org/http2/
  pool_connections : 100 # Number of different hosts
  pool_maxsize : 10 # Number of simultaneous requests by host
  # uncomment below section if you want to use a proxy
  # proxies:
  #   all://:
  #   http:
  #     - http://proxy1:8080
  #     - http://proxy2:8080
  #   https:
  #     - http://proxy1:8080
  #     - http://proxy2:8080
  # uncomment below section only if you have more than one network interface
@@ -143,7 +145,6 @@ Global Settings
  #  source_ips:
  #    - 1.1.1.1
  #    - 1.1.1.2
  #    - fe80::/126


``request_timeout`` :
@@ -156,46 +157,20 @@ Global Settings
  Suffix to the user-agent searx uses to send requests to others engines. If an
  engine wish to block you, a contact info here may be useful to avoid that.

``keepalive_expiry``:
  Number of seconds to keep a connection in the pool. By default 5.0 seconds.

.. _httpx proxies: https://www.python-httpx.org/advanced/#http-proxying
.. _requests proxies: https://requests.readthedocs.io/en/latest/user/advanced/#proxies
.. _PySocks: https://pypi.org/project/PySocks/

``proxies`` :
  Define one or more proxies you wish to use, see `httpx proxies`_.
  Define one or more proxies you wish to use, see `requests proxies`_.
  If there are more than one proxy for one protocol (http, https),
  requests to the engines are distributed in a round-robin fashion.

  - Proxy: `see <https://2.python-requests.org/en/latest/user/advanced/#proxies>`__.
  - SOCKS proxies are also supported: `see <https://2.python-requests.org/en/latest/user/advanced/#socks>`__

``source_ips`` :
  If you use multiple network interfaces, define from which IP the requests must
  be made. Example:

  * ``0.0.0.0`` any local IPv4 address.
  * ``::`` any local IPv6 address.
  * ``192.168.0.1``
  * ``[ 192.168.0.1, 192.168.0.2 ]`` these two specific IP addresses
  * ``fe80::60a2:1691:e5a2:ee1f``
  * ``fe80::60a2:1691:e5a2:ee1f/126`` all IP addresses in this network.
  * ``[ 192.168.0.1, fe80::/126 ]``

``retries`` :
  Number of retry in case of an HTTP error.
  On each retry, searx uses an different proxy and source ip.

``retry_on_http_error`` :
  Retry request on some HTTP status code.

  Example:

  * ``true`` : on HTTP status code between 400 and 599.
  * ``403`` : on HTTP status code 403.
  * ``[403, 429]``: on HTTP status code 403 and 429.

``enable_http2`` :
  Enable by default. Set to ``False`` to disable HTTP/2.

``max_redirects`` :
  30 by default. Maximum redirect before it is an error.
  be made. This parameter is ignored when ``proxies`` is set.


``locales:``
@@ -241,13 +216,6 @@ Engine settings
    api_key : 'apikey'
    disabled : True
    language : en_US
    #enable_http: False
    #enable_http2: False
    #retries: 1
    #retry_on_http_error: True # or 403 or [404, 429]
    #max_connections: 100
    #max_keepalive_connections: 10
    #keepalive_expiry: 5.0
    #proxies:
    #  http:
    #    - http://proxy1:8080
@@ -302,12 +270,6 @@ Engine settings
``display_error_messages`` : default ``True``
  When an engine returns an error, the message is displayed on the user interface.

``network``: optional
  Use the network configuration from another engine.
  In addition, there are two default networks:
  * ``ipv4`` set ``local_addresses`` to ``0.0.0.0`` (use only IPv4 local addresses)
  * ``ipv6`` set ``local_addresses`` to ``::`` (use only IPv6 local addresses)

.. note::

   A few more options are possible, but they are pretty specific to some
manage (3 changed lines)

@@ -107,8 +107,7 @@ fi
export DOCS_BUILD

buildenv() {
    SEARX_DEBUG=1 pyenv.cmd python utils/build_env.py 2>&1 \
        | prefix_stdout "${_Blue}BUILDENV${_creset} "
    SEARX_DEBUG=1 pyenv.cmd python utils/build_env.py 2>&1
    return "${PIPESTATUS[0]}"
}
@@ -17,4 +17,3 @@ sphinx-tabs==3.2.0
sphinxcontrib-programoutput==0.17
sphinx-autobuild==2021.3.14
linuxdoc==20211220
aiounittest==1.4.1
@@ -1,16 +1,13 @@
certifi==2022.5.18.1
Brotli==1.0.9
babel==2.9.1
certifi==2022.5.18.1
flask-babel==2.0.0
flask==2.1.1
jinja2==3.1.2
langdetect==1.0.9
lxml==4.9.0
pygments==2.8.0
python-dateutil==2.8.2
pyyaml==6.0
httpx[http2]==0.23.0
Brotli==1.0.9
uvloop==0.16.0; python_version >= '3.7'
uvloop==0.14.0; python_version < '3.7'
httpx-socks[asyncio]==0.7.4
langdetect==1.0.9
requests[socks]==2.28.1
setproctitle==1.2.2
@@ -20,12 +20,10 @@ from lxml import etree
from json import loads
from urllib.parse import urlencode

from httpx import HTTPError

from requests import RequestException

from searx import settings
from searx.data import ENGINES_LANGUAGES
from searx.network import get as http_get
from searx.poolrequests import get as http_get
from searx.exceptions import SearxEngineResponseException


@@ -154,5 +152,5 @@ def search_autocomplete(backend_name, query, lang):

    try:
        return backend(query, lang)
    except (HTTPError, SearxEngineResponseException):
    except (RequestException, SearxEngineResponseException):
        return []
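Since the autocomplete backends now go through `requests`, network failures surface as `requests.RequestException`; a small hedged sketch of the same pattern outside searx (URL and function name are only illustrative):

```python
# Hedged sketch of the exception-handling change above: requests signals
# network failures through requests.RequestException instead of httpx.HTTPError.
import requests

def safe_suggestions(url: str) -> list:
    try:
        return requests.get(url, timeout=2.0).json()
    except (requests.RequestException, ValueError):
        # ValueError covers a non-JSON body; network errors land in RequestException
        return []
```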
@@ -27,7 +27,7 @@ from searx import settings
from searx import logger
from searx.data import ENGINES_LANGUAGES
from searx.exceptions import SearxEngineResponseException
from searx.network import get, initialize as initialize_network, set_context_network_name
from searx.poolrequests import get, get_proxy_cycles
from searx.utils import load_module, match_language, get_engine_from_settings, gen_useragent


@@ -89,6 +89,8 @@ def load_engine(engine_data):
            engine.categories = []
        else:
            engine.categories = list(map(str.strip, param_value.split(',')))
    elif param_name == 'proxies':
        engine.proxies = get_proxy_cycles(param_value)
    else:
        setattr(engine, param_name, param_value)

@@ -283,3 +285,24 @@ def load_engines(engine_list):
        if engine is not None:
            engines[engine.name] = engine
    return engines


def initialize_engines(engine_list):
    load_engines(engine_list)

    def engine_init(engine_name, init_fn):
        try:
            init_fn(get_engine_from_settings(engine_name))
        except SearxEngineResponseException as exc:
            logger.warn('%s engine: Fail to initialize // %s', engine_name, exc)
        except Exception:
            logger.exception('%s engine: Fail to initialize', engine_name)
        else:
            logger.debug('%s engine: Initialized', engine_name)

    for engine_name, engine in engines.items():
        if hasattr(engine, 'init'):
            init_fn = getattr(engine, 'init')
            if init_fn:
                logger.debug('%s engine: Starting background initialization', engine_name)
                threading.Thread(target=engine_init, args=(engine_name, init_fn)).start()
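The `get_proxy_cycles` import above ties into the round-robin proxy behaviour described in the settings documentation earlier. A hedged, self-contained sketch of that idea (not the searx implementation; the proxy URLs are the placeholders from the docs):

```python
# Hedged sketch of round-robin proxy selection: each protocol gets an endless
# cycle over its configured proxies, so successive requests rotate through them.
from itertools import cycle

proxies_config = {
    'http': ['http://proxy1:8080', 'http://proxy2:8080'],
    'https': ['http://proxy1:8080', 'http://proxy2:8080'],
}
proxy_cycles = {scheme: cycle(urls) for scheme, urls in proxies_config.items()}

def next_proxies() -> dict:
    # a per-request proxies mapping in the shape requests expects
    return {scheme: next(it) for scheme, it in proxy_cycles.items()}

print(next_proxies())
print(next_proxies())  # rotated to the second proxy of each list
```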
@@ -52,7 +52,7 @@ def response(resp):
            to_results.append(to_result.text_content())

        results.append({
            'url': urljoin(str(resp.url), '?%d' % k),
            'url': urljoin(resp.url, '?%d' % k),
            'title': from_result.text_content(),
            'content': '; '.join(to_results)
        })
@@ -1,18 +1,24 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""
DuckDuckGo (Web)
"""DuckDuckGo Lite
"""

from json import loads
from urllib.parse import urlencode
from searx.utils import match_language, HTMLTextExtractor
import re
from searx.network import get

from lxml.html import fromstring

from searx.utils import (
    dict_subset,
    eval_xpath,
    eval_xpath_getindex,
    extract_text,
    match_language,
)
from searx.poolrequests import get

# about
about = {
    "website": 'https://duckduckgo.com/',
    "website": 'https://lite.duckduckgo.com/lite',
    "wikidata_id": 'Q12805',
    "official_api_documentation": 'https://duckduckgo.com/api',
    "use_official_api": False,
@@ -21,13 +27,11 @@ about = {
}

# engine dependent config
categories = ['general']
categories = ['general', 'web']
paging = True
supported_languages_url = 'https://duckduckgo.com/util/u172.js'
number_of_results = 10
supported_languages_url = 'https://duckduckgo.com/util/u588.js'
time_range_support = True
safesearch = True
VQD_REGEX = r"vqd='(\d+-\d+-\d+)'"

language_aliases = {
    'ar-SA': 'ar-XA',
    'es-419': 'es-XL',
@@ -35,16 +39,14 @@ language_aliases = {
    'ko': 'kr-KR',
    'sl-SI': 'sl-SL',
    'zh-TW': 'tzh-TW',
    'zh-HK': 'tzh-HK'
    'zh-HK': 'tzh-HK',
}

time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}

# search-url
url = 'https://links.duckduckgo.com/d.js?'
url_ping = 'https://duckduckgo.com/t/sl_h'
time_range_dict = {'day': 'd',
                   'week': 'w',
                   'month': 'm',
                   'year': 'y'}
url = 'https://lite.duckduckgo.com/lite'
url_ping = 'https://duckduckgo.com/t/sl_l'


# match query's language to a region code that duckduckgo will accept
@@ -59,103 +61,111 @@ def get_region_code(lang, lang_list=None):
        return lang_parts[1].lower() + '-' + lang_parts[0].lower()


def get_vqd(query, headers):
    resp = get(f"https://duckduckgo.com/?q={query}&ia=web", headers=headers)
    resp = re.findall(VQD_REGEX, resp.text)
    return resp[0]


def request(query, params):

    params['method'] = 'GET'
    params['url'] = url
    params['method'] = 'POST'

    vqd = get_vqd(query, params['headers'])
    dl, ct = match_language(params['language'], supported_languages, language_aliases, 'wt-WT').split('-')
    query_dict = {
        'q': query,
        't': 'D',
        'l': params['language'],
        'kl': f'{ct}-{dl}',
        's': (params['pageno'] - 1) * number_of_results,
        'dl': dl,
        'ct': ct,
        'ss_mkt': get_region_code(params['language'], supported_languages),
        'df': params['time_range'],
        'vqd': vqd,
        'ex': -2,
        'sp': '1',
        'bpa': '1',
        'biaexp': 'b',
        'msvrtexp': 'b'
    }
    if params['safesearch'] == 2:  # STRICT
        del query_dict['t']
        query_dict['p'] = 1
        query_dict.update({
            'videxp': 'a',
            'nadse': 'b',
            'eclsexp': 'a',
            'stiaexp': 'a',
            'tjsexp': 'b',
            'related': 'b',
            'msnexp': 'a'
        })
    elif params['safesearch'] == 1:  # MODERATE
        query_dict['ex'] = -1
        query_dict.update({
            'nadse': 'b',
            'eclsexp': 'b',
            'tjsexp': 'b'
        })
    else:  # OFF
        query_dict['ex'] = -2
        query_dict.update({
            'nadse': 'b',
            'eclsexp': 'b',
            'tjsexp': 'b'
        })
    params['data']['q'] = query

    params['allow_redirects'] = False
    params['data'] = query_dict
    params['cookies']['kl'] = params['data']['kl']
    # The API is not documented, so we do some reverse engineering and emulate
    # what https://lite.duckduckgo.com/lite/ does when you press "next Page"
    # link again and again ..

    params['headers']['Content-Type'] = 'application/x-www-form-urlencoded'

    # initial page does not have an offset
    if params['pageno'] == 2:
        # second page does have an offset of 30
        offset = (params['pageno'] - 1) * 30
        params['data']['s'] = offset
        params['data']['dc'] = offset + 1

    elif params['pageno'] > 2:
        # third and following pages do have an offset of 30 + n*50
        offset = 30 + (params['pageno'] - 2) * 50
        params['data']['s'] = offset
        params['data']['dc'] = offset + 1

    # initial page does not have additional data in the input form
    if params['pageno'] > 1:
        # request the second page (and more pages) needs 'o' and 'api' arguments
        params['data']['o'] = 'json'
        params['data']['api'] = 'd.js'

    # initial page does not have additional data in the input form
    if params['pageno'] > 2:
        # request the third page (and more pages) some more arguments
        params['data']['nextParams'] = ''
        params['data']['v'] = ''
        params['data']['vqd'] = ''

    region_code = get_region_code(params['language'], supported_languages)
    if region_code:
        params['data']['kl'] = region_code
        params['cookies']['kl'] = region_code

    params['data']['df'] = ''
    if params['time_range'] in time_range_dict:
        params['data']['df'] = time_range_dict[params['time_range']]
        params['cookies']['df'] = time_range_dict[params['time_range']]
    params['url'] = url + urlencode(params['data'])

    return params


# get response from search-request
def response(resp):

    headers_ping = dict_subset(resp.request.headers, ['User-Agent', 'Accept-Encoding', 'Accept', 'Cookie'])
    get(url_ping, headers=headers_ping)

    if resp.status_code == 303:
        return []

    # parse the response
    results = []
    doc = fromstring(resp.text)

    data = re.findall(r"DDG\.pageLayout\.load\('d',(\[.+\])\);DDG\.duckbar\.load\('images'", str(resp.text))
    try:
        search_data = loads(data[0].replace('/\t/g', ' '))
    except IndexError:
        return
    result_table = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table')
    if not len(result_table) >= 3:
        # no more results
        return []
    result_table = result_table[2]

    if len(search_data) == 1 and ('n' not in search_data[0]):
        only_result = search_data[0]
        if ((only_result.get('da') is not None and only_result.get('t') == 'EOF') or
                only_result.get('a') is not None or only_result.get('d') == 'google.com search'):
            return
    tr_rows = eval_xpath(result_table, './/tr')

    for search_result in search_data:
        if 'n' in search_result:
    # In the last <tr> is the form of the 'previous/next page' links
    tr_rows = tr_rows[:-1]

    len_tr_rows = len(tr_rows)
    offset = 0

    while len_tr_rows >= offset + 4:

        # assemble table rows we need to scrap
        tr_title = tr_rows[offset]
        tr_content = tr_rows[offset + 1]
        offset += 4

        # ignore sponsored Adds <tr class="result-sponsored">
        if tr_content.get('class') == 'result-sponsored':
            continue
        title = HTMLTextExtractor()
        title.feed(search_result.get('t'))
        content = HTMLTextExtractor()
        content.feed(search_result.get('a'))

        results.append({'title': title.get_text(),
                        'content': content.get_text(),
                        'url': search_result.get('u')})
        a_tag = eval_xpath_getindex(tr_title, './/td//a[@class="result-link"]', 0, None)
        if a_tag is None:
            continue

        td_content = eval_xpath_getindex(tr_content, './/td[@class="result-snippet"]', 0, None)
        if td_content is None:
            continue

        results.append(
            {
                'title': a_tag.text_content(),
                'content': extract_text(td_content),
                'url': a_tag.get('href'),
            }
        )

    return results


@@ -165,7 +175,7 @@ def _fetch_supported_languages(resp):
    # response is a js file with regions as an embedded object
    response_page = resp.text
    response_page = response_page[response_page.find('regions:{') + 8:]
    response_page = response_page[:response_page.find('}') + 1]
    response_page = response_page[: response_page.find('}') + 1]

    regions_json = loads(response_page)
    supported_languages = map((lambda x: x[3:] + '-' + x[:2].upper()), regions_json.keys())
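The paging rules encoded in the `request()` function above can be summarised in a small, self-contained helper (a hedged sketch, not part of the engine itself):

```python
# Hedged sketch of the DuckDuckGo Lite paging offsets shown above:
# page 1 sends no offset, page 2 starts at 30, page n > 2 at 30 + (n - 2) * 50.
def lite_offset(pageno: int) -> int:
    if pageno <= 1:
        return 0
    if pageno == 2:
        return 30
    return 30 + (pageno - 2) * 50

assert [lite_offset(p) for p in (1, 2, 3, 4)] == [0, 30, 80, 130]
```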
@@ -8,7 +8,7 @@ from urllib.parse import urlencode
from searx.exceptions import SearxEngineAPIException
from searx.engines.duckduckgo import get_region_code
from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url  # NOQA # pylint: disable=unused-import
from searx.network import get
from searx.poolrequests import get

# about
about = {
@@ -4,6 +4,7 @@
"""

from json import loads, dumps
from requests.auth import HTTPBasicAuth
from searx.exceptions import SearxEngineAPIException


@@ -31,7 +32,7 @@ def request(query, params):
        return params

    if username and password:
        params['auth'] = (username, password)
        params['auth'] = HTTPBasicAuth(username, password)

    params['url'] = search_url
    params['method'] = 'GET'
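For reference, both forms shown above are valid with `requests`: a plain tuple and an explicit auth object behave the same for HTTP basic authentication. A hedged sketch with a placeholder URL and credentials:

```python
# Hedged sketch: requests accepts either a (user, password) tuple or an
# explicit HTTPBasicAuth instance for HTTP basic authentication.
import requests
from requests.auth import HTTPBasicAuth

resp_a = requests.get('https://example.org/', auth=('user', 'secret'))
resp_b = requests.get('https://example.org/', auth=HTTPBasicAuth('user', 'secret'))
print(resp_a.status_code, resp_b.status_code)
```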
@@ -7,8 +7,8 @@
import re
from json import loads, JSONDecodeError
from urllib.parse import urlencode
from searx.network import get
from searx.exceptions import SearxEngineResponseException
from searx.poolrequests import get

# about
about = {
@@ -10,7 +10,7 @@ Definitions`_.

# pylint: disable=invalid-name, missing-function-docstring, too-many-branches

from urllib.parse import urlencode
from urllib.parse import urlencode, urlparse
from lxml import html
from searx import logger
from searx.utils import match_language, extract_text, eval_xpath, eval_xpath_list, eval_xpath_getindex
@@ -194,7 +194,8 @@ def get_lang_info(params, lang_list, custom_aliases, supported_any_language):
    return ret_val

def detect_google_sorry(resp):
    if resp.url.host == 'sorry.google.com' or resp.url.path.startswith('/sorry'):
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path.startswith('/sorry'):
        raise SearxEngineCaptchaException()
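The change above exists because `requests` exposes `resp.url` as a plain string, whereas `httpx` returned a URL object with `.host` and `.path`; a hedged sketch of the string-based check in isolation:

```python
# Hedged sketch: with requests, resp.url is a string, so host/path checks
# go through urllib.parse.urlparse instead of httpx's URL attributes.
from urllib.parse import urlparse

def is_google_sorry(url: str) -> bool:
    parts = urlparse(url)
    return parts.netloc == 'sorry.google.com' or parts.path.startswith('/sorry')

assert is_google_sorry('https://sorry.google.com/index')
assert is_google_sorry('https://www.google.com/sorry/index?continue=x')
```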
@@ -7,7 +7,7 @@ from flask_babel import gettext
from lxml import etree
from datetime import datetime
from urllib.parse import urlencode
from searx.network import get
from searx.poolrequests import get

# about
about = {
@@ -33,7 +33,7 @@ from flask_babel import gettext

from searx.utils import match_language
from searx.exceptions import SearxEngineAPIException
from searx.network import raise_for_httperror
from searx.raise_for_httperror import raise_for_httperror


# about
@@ -86,15 +86,14 @@ def request(query, params):

    # add language tag
    if params['language'] == 'all':
        params['url'] += '&locale=en_us'
        params['url'] += '&locale=en_US'
    else:
        language = match_language(
            params['language'],
            # pylint: disable=undefined-variable
            supported_languages,
            language_aliases,
        )
        params['url'] += '&locale=' + language.replace('-', '_').lower()
        params['url'] += '&locale=' + language.replace('-', '_')

    params['raise_for_httperror'] = False
    return params
@@ -113,7 +112,14 @@ def response(resp):

    # check for an API error
    if search_results.get('status') != 'success':
        msg = ",".join(data.get('message', ['unknown', ]))
        msg = ",".join(
            data.get(
                'message',
                [
                    'unknown',
                ],
            )
        )
        raise SearxEngineAPIException('API error::' + msg)

    # raise for other errors
@@ -155,11 +161,13 @@ def response(resp):

        if mainline_type == 'web':
            content = item['desc']
            results.append({
                'title': title,
                'url': res_url,
                'content': content,
            })
            results.append(
                {
                    'title': title,
                    'url': res_url,
                    'content': content,
                }
            )

        elif mainline_type == 'news':

@@ -170,23 +178,27 @@ def response(resp):
            img_src = None
            if news_media:
                img_src = news_media[0].get('pict', {}).get('url', None)
            results.append({
                'title': title,
                'url': res_url,
                'publishedDate': pub_date,
                'img_src': img_src,
            })
            results.append(
                {
                    'title': title,
                    'url': res_url,
                    'publishedDate': pub_date,
                    'img_src': img_src,
                }
            )

        elif mainline_type == 'images':
            thumbnail = item['thumbnail']
            img_src = item['media']
            results.append({
                'title': title,
                'url': res_url,
                'template': 'images.html',
                'thumbnail_src': thumbnail,
                'img_src': img_src,
            })
            results.append(
                {
                    'title': title,
                    'url': res_url,
                    'template': 'images.html',
                    'thumbnail_src': thumbnail,
                    'img_src': img_src,
                }
            )

        elif mainline_type == 'videos':
            # some videos do not have a description: while qwant-video
@@ -210,19 +222,18 @@ def response(resp):
            thumbnail = item['thumbnail']
            # from some locations (DE and others?) the s2 link do
            # response a 'Please wait ..' but does not deliver the thumbnail
            thumbnail = thumbnail.replace(
                'https://s2.qwant.com',
                'https://s1.qwant.com', 1
            thumbnail = thumbnail.replace('https://s2.qwant.com', 'https://s1.qwant.com', 1)
            results.append(
                {
                    'title': title,
                    'url': res_url,
                    'content': content,
                    'publishedDate': pub_date,
                    'thumbnail': thumbnail,
                    'template': 'videos.html',
                    'length': length,
                }
            )
            results.append({
                'title': title,
                'url': res_url,
                'content': content,
                'publishedDate': pub_date,
                'thumbnail': thumbnail,
                'template': 'videos.html',
                'length': length,
            })

    return results

@@ -232,7 +243,7 @@ def _fetch_supported_languages(resp):
    # list of regions is embedded in page as a js object
    response_text = resp.text
    response_text = response_text[response_text.find('INITIAL_PROPS'):]
    response_text = response_text[response_text.find('{'):response_text.find('</script>')]
    response_text = response_text[response_text.find('{'): response_text.find('</script>')]

    regions_json = loads(response_text)
@@ -3,9 +3,9 @@
Seznam
"""

from urllib.parse import urlencode
from urllib.parse import urlencode, urlparse
from lxml import html
from searx.network import get
from searx.poolrequests import get
from searx.exceptions import SearxEngineAccessDeniedException
from searx.utils import (
    extract_text,
@@ -46,7 +46,8 @@ def request(query, params):


def response(resp):
    if resp.url.path.startswith('/verify'):
    resp_url = urlparse(resp.url)
    if resp_url.path.startswith('/verify'):
        raise SearxEngineAccessDeniedException()

    results = []
@@ -6,7 +6,7 @@
from lxml.html import fromstring
from searx import logger
from searx.utils import extract_text
from searx.network import raise_for_httperror
from searx.raise_for_httperror import raise_for_httperror

logger = logger.getChild('sjp engine')
@@ -9,7 +9,7 @@ from lxml import html
from dateutil import parser
from urllib.parse import quote_plus, urlencode
from searx import logger
from searx.network import get as http_get
from searx.poolrequests import get as http_get

# about
about = {
@@ -5,10 +5,9 @@

from json import loads
from urllib.parse import urlencode
import requests
import base64

from searx.network import post as http_post

# about
about = {
    "website": 'https://www.spotify.com',
@@ -39,7 +38,7 @@ def request(query, params):

    params['url'] = search_url.format(query=urlencode({'q': query}), offset=offset)

    r = http_post(
    r = requests.post(
        'https://accounts.spotify.com/api/token',
        data={'grant_type': 'client_credentials'},
        headers={'Authorization': 'Basic ' + base64.b64encode(
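The token request above is cut off in this view; as a hedged, self-contained sketch of the same client-credentials flow with `requests` (function name is hypothetical, and the standard OAuth2 response shape with an `access_token` field is assumed):

```python
# Hedged sketch of a Spotify client-credentials token request via requests.
import base64
import requests

def get_spotify_token(client_id: str, client_secret: str) -> str:
    creds = base64.b64encode(f'{client_id}:{client_secret}'.encode()).decode()
    resp = requests.post(
        'https://accounts.spotify.com/api/token',
        data={'grant_type': 'client_credentials'},
        headers={'Authorization': 'Basic ' + creds},
        timeout=10,
    )
    resp.raise_for_status()
    return resp.json()['access_token']  # assumed standard OAuth2 response field
```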
searx/engines/stackoverflow.py (new file, 65 lines)

@@ -0,0 +1,65 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Stackoverflow (IT)
"""

from urllib.parse import urlencode, urljoin, urlparse
from lxml import html
from searx.utils import extract_text
from searx.exceptions import SearxEngineCaptchaException

# about
about = {
    "website": 'https://stackoverflow.com/',
    "wikidata_id": 'Q549037',
    "official_api_documentation": 'https://api.stackexchange.com/docs',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

# engine dependent config
categories = ['it']
paging = True

# search-url
url = 'https://stackoverflow.com/'
search_url = url + 'search?{query}&page={pageno}'

# specific xpath variables
results_xpath = '//div[contains(@class,"question-summary")]'
link_xpath = './/div[@class="result-link"]//a|.//div[@class="summary"]//h3//a'
content_xpath = './/div[@class="excerpt"]'


# do search-request
def request(query, params):
    params['url'] = search_url.format(query=urlencode({'q': query}), pageno=params['pageno'])

    return params


# get response from search-request
def response(resp):
    resp_url = urlparse(resp.url)
    if resp_url.path.startswith('/nocaptcha'):
        raise SearxEngineCaptchaException()

    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        link = result.xpath(link_xpath)[0]
        href = urljoin(url, link.attrib.get('href'))
        title = extract_text(link)
        content = extract_text(result.xpath(content_xpath))

        # append result
        results.append({'url': href,
                        'title': title,
                        'content': content})

    # return results
    return results
@@ -17,7 +17,7 @@ from babel import Locale
from babel.localedata import locale_identifiers

from searx import logger
from searx.network import get
from searx.poolrequests import get
from searx.utils import extract_text, eval_xpath, match_language
from searx.exceptions import (
    SearxEngineResponseException,
@@ -12,7 +12,7 @@ from babel.dates import format_datetime, format_date, format_time, get_datetime_

from searx import logger
from searx.data import WIKIDATA_UNITS
from searx.network import post, get
from searx.poolrequests import post, get
from searx.utils import match_language, searx_useragent, get_string_replaces_function
from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url  # NOQA # pylint: disable=unused-import
@@ -7,7 +7,7 @@ from urllib.parse import quote
from json import loads
from lxml.html import fromstring
from searx.utils import match_language, searx_useragent
from searx.network import raise_for_httperror
from searx.raise_for_httperror import raise_for_httperror

# about
about = {
@@ -7,7 +7,7 @@ from json import loads
from time import time
from urllib.parse import urlencode

from searx.network import get as http_get
from searx.poolrequests import get as http_get

# about
about = {
@@ -6,7 +6,7 @@
from lxml.html import fromstring
from searx import logger
from searx.utils import extract_text
from searx.network import raise_for_httperror
from searx.raise_for_httperror import raise_for_httperror

logger = logger.getChild('Wordnik engine')
@@ -7,7 +7,7 @@ from json import loads
from dateutil import parser
from urllib.parse import urlencode

from httpx import DigestAuth
from requests.auth import HTTPDigestAuth

from searx.utils import html_to_text

@@ -56,7 +56,7 @@ def request(query, params):
                                          search_type=search_type)

    if http_digest_auth_user and http_digest_auth_pass:
        params['auth'] = DigestAuth(http_digest_auth_user, http_digest_auth_pass)
        params['auth'] = HTTPDigestAuth(http_digest_auth_user, http_digest_auth_pass)

    # add language tag if specified
    if params['language'] != 'all':
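The digest-auth swap above is direct: `requests` ships `HTTPDigestAuth` as the counterpart of httpx's `DigestAuth`. A hedged sketch with placeholder credentials and URL:

```python
# Hedged sketch: HTTP digest authentication with requests.
import requests
from requests.auth import HTTPDigestAuth

resp = requests.get('https://example.org/search', auth=HTTPDigestAuth('user', 'secret'))
print(resp.status_code)
```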
@@ -8,7 +8,7 @@ from operator import itemgetter
from datetime import datetime
from urllib.parse import quote
from searx.utils import extract_text, get_torrent_size
from searx.network import get as http_get
from searx.poolrequests import get as http_get

# about
about = {
@@ -39,7 +39,7 @@ cookies = dict()
def init(engine_settings=None):
    global cookies  # pylint: disable=global-variable-not-assigned
    # initial cookies
    resp = http_get(url, follow_redirects=False)
    resp = http_get(url)
    if resp.ok:
        for r in resp.history:
            cookies.update(r.cookies)
@@ -3,7 +3,7 @@ import inspect
import logging
from json import JSONDecodeError
from urllib.parse import urlparse
from httpx import HTTPError, HTTPStatusError
from requests.exceptions import RequestException
from searx.exceptions import (SearxXPathSyntaxException, SearxEngineXPathException, SearxEngineAPIException,
                              SearxEngineAccessDeniedException)
from searx import logger
@@ -60,28 +60,28 @@ def get_trace(traces):
    return traces[-1]


def get_hostname(exc: HTTPError) -> typing.Optional[None]:
def get_hostname(exc: RequestException) -> typing.Optional[None]:
    url = exc.request.url
    if url is None and exc.response is not None:
        url = exc.response.url
    return urlparse(url).netloc


def get_request_exception_messages(exc: HTTPError)\
def get_request_exception_messages(exc: RequestException)\
        -> typing.Tuple[typing.Optional[str], typing.Optional[str], typing.Optional[str]]:
    url = None
    status_code = None
    reason = None
    hostname = None
    if hasattr(exc, 'request') and exc.request is not None:
    if exc.request is not None:
        url = exc.request.url
    if url is None and hasattr(exc, 'response') and exc.respones is not None:
    if url is None and exc.response is not None:
        url = exc.response.url
    if url is not None:
        hostname = url.host
        hostname = str(urlparse(url).netloc)
    if isinstance(exc, HTTPStatusError):
    if exc.response is not None:
        status_code = str(exc.response.status_code)
        reason = exc.response.reason_phrase
        reason = exc.response.reason
    return (status_code, reason, hostname)


@@ -92,7 +92,7 @@ def get_messages(exc, filename) -> typing.Tuple:
        return (str(exc), )
    if isinstance(exc, ValueError) and 'lxml' in filename:
        return (str(exc), )
    if isinstance(exc, HTTPError):
    if isinstance(exc, RequestException):
        return get_request_exception_messages(exc)
    if isinstance(exc, SearxXPathSyntaxException):
        return (exc.xpath_str, exc.message)
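The idea behind `get_request_exception_messages` above carries over to plain `requests`: the exception object holds the request and, when one was received, the response, from which host, status code and reason can be recovered for error reporting. A hedged, standalone sketch (function name is hypothetical):

```python
# Hedged sketch: extracting reporting details from a requests exception.
from urllib.parse import urlparse
import requests

def describe(exc: requests.RequestException):
    url = exc.request.url if exc.request is not None else None
    if url is None and exc.response is not None:
        url = exc.response.url
    hostname = urlparse(url).netloc if url else None
    status_code = str(exc.response.status_code) if exc.response is not None else None
    reason = exc.response.reason if exc.response is not None else None
    return status_code, reason, hostname
```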
@@ -1,188 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later

import asyncio
import threading
import concurrent.futures
from time import time
from queue import SimpleQueue
from types import MethodType

import httpx
import h2.exceptions

from .network import get_network, initialize, check_network_configuration
from .client import get_loop
from .raise_for_httperror import raise_for_httperror


THREADLOCAL = threading.local()


def reset_time_for_thread():
    THREADLOCAL.total_time = 0


def get_time_for_thread():
    return THREADLOCAL.total_time


def set_timeout_for_thread(timeout, start_time=None):
    THREADLOCAL.timeout = timeout
    THREADLOCAL.start_time = start_time


def set_context_network_name(network_name):
    THREADLOCAL.network = get_network(network_name)


def get_context_network():
    try:
        return THREADLOCAL.network
    except AttributeError:
        return get_network()


def request(method, url, **kwargs):
    """same as requests/requests/api.py request(...)"""
    time_before_request = time()

    # timeout (httpx)
    if 'timeout' in kwargs:
        timeout = kwargs['timeout']
    else:
        timeout = getattr(THREADLOCAL, 'timeout', None)
        if timeout is not None:
            kwargs['timeout'] = timeout

    # 2 minutes timeout for the requests without timeout
    timeout = timeout or 120

    # ajdust actual timeout
    timeout += 0.2  # overhead
    start_time = getattr(THREADLOCAL, 'start_time', time_before_request)
    if start_time:
        timeout -= time() - start_time

    # raise_for_error
    check_for_httperror = True
    if 'raise_for_httperror' in kwargs:
        check_for_httperror = kwargs['raise_for_httperror']
        del kwargs['raise_for_httperror']

    # requests compatibility
    if isinstance(url, bytes):
        url = url.decode()

    # network
    network = get_context_network()

    # do request
    future = asyncio.run_coroutine_threadsafe(network.request(method, url, **kwargs), get_loop())
    try:
        response = future.result(timeout)
    except concurrent.futures.TimeoutError as e:
        raise httpx.TimeoutException('Timeout', request=None) from e

    # requests compatibility
    # see also https://www.python-httpx.org/compatibility/#checking-for-4xx5xx-responses
    response.ok = not response.is_error

    # update total_time.
    # See get_time_for_thread() and reset_time_for_thread()
    if hasattr(THREADLOCAL, 'total_time'):
        time_after_request = time()
        THREADLOCAL.total_time += time_after_request - time_before_request

    # raise an exception
    if check_for_httperror:
        raise_for_httperror(response)

    return response


def get(url, **kwargs):
    kwargs.setdefault('follow_redirects', True)
    return request('get', url, **kwargs)


def options(url, **kwargs):
    kwargs.setdefault('follow_redirects', True)
    return request('options', url, **kwargs)


def head(url, **kwargs):
    kwargs.setdefault('follow_redirects', False)
    return request('head', url, **kwargs)


def post(url, data=None, **kwargs):
    return request('post', url, data=data, **kwargs)


def put(url, data=None, **kwargs):
    return request('put', url, data=data, **kwargs)


def patch(url, data=None, **kwargs):
    return request('patch', url, data=data, **kwargs)


def delete(url, **kwargs):
    return request('delete', url, **kwargs)


async def stream_chunk_to_queue(network, q, method, url, **kwargs):
    try:
        async with await network.stream(method, url, **kwargs) as response:
            q.put(response)
            # aiter_raw: access the raw bytes on the response without applying any HTTP content decoding
            # https://www.python-httpx.org/quickstart/#streaming-responses
            async for chunk in response.aiter_bytes(65536):
                if len(chunk) > 0:
                    q.put(chunk)
    except httpx.ResponseClosed as e:
        # the response was closed
        pass
    except (httpx.HTTPError, OSError, h2.exceptions.ProtocolError) as e:
        q.put(e)
    finally:
        q.put(None)


def _close_response_method(self):
    asyncio.run_coroutine_threadsafe(
        self.aclose(),
        get_loop()
    )


def stream(method, url, **kwargs):
    """Replace httpx.stream.

    Usage:
    stream = poolrequests.stream(...)
    response = next(stream)
    for chunk in stream:
        ...

    httpx.Client.stream requires to write the httpx.HTTPTransport version of the
    the httpx.AsyncHTTPTransport declared above.
    """
    q = SimpleQueue()
    future = asyncio.run_coroutine_threadsafe(stream_chunk_to_queue(get_network(), q, method, url, **kwargs),
                                              get_loop())
    # yield response
    response = q.get()
    if isinstance(response, Exception):
        raise response
    response.close = MethodType(_close_response_method, response)
    yield response

    # yield chunks
    chunk_or_exception = q.get()
    while chunk_or_exception is not None:
        if isinstance(chunk_or_exception, Exception):
            raise chunk_or_exception
        yield chunk_or_exception
        chunk_or_exception = q.get()
    future.result()
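Before it was deleted, the wrapper above adjusted each outgoing request's timeout by the time the calling thread had already spent on the search. A hedged, standalone sketch of that bookkeeping (names are hypothetical, constants taken from the code above):

```python
# Hedged sketch of the per-thread timeout bookkeeping in the removed module:
# fall back to 120 s, add a small overhead allowance, then subtract the time
# already spent since the search started.
from time import time
from typing import Optional

def remaining_timeout(requested: Optional[float], start_time: float) -> float:
    timeout = (requested or 120) + 0.2  # overhead allowance
    return timeout - (time() - start_time)

start = time()
# ... other work happens here ...
print(remaining_timeout(2.0, start))
```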
@@ -1,167 +0,0 @@
# SPDX-License-Identifier: AGPL-3.0-or-later

import asyncio
import logging
import threading
import uvloop

import httpx
from httpx_socks import AsyncProxyTransport
from python_socks import (
    parse_proxy_url,
    ProxyConnectionError,
    ProxyTimeoutError,
    ProxyError
)
import python_socks._errors

from searx import logger


logger = logger.getChild('searx.http.client')
LOOP = None
SSLCONTEXTS = {}
TRANSPORT_KWARGS = {
    'trust_env': False,
}


def get_sslcontexts(proxy_url=None, cert=None, verify=True, trust_env=True, http2=False):
    global SSLCONTEXTS
    key = (proxy_url, cert, verify, trust_env, http2)
    if key not in SSLCONTEXTS:
        SSLCONTEXTS[key] = httpx.create_ssl_context(cert, verify, trust_env, http2)
    return SSLCONTEXTS[key]


class AsyncHTTPTransportNoHttp(httpx.AsyncHTTPTransport):
    """Block HTTP request"""

    async def handle_async_request(self, request):
        raise httpx.UnsupportedProtocol('HTTP protocol is disabled')


class AsyncProxyTransportFixed(AsyncProxyTransport):
    """Fix httpx_socks.AsyncProxyTransport

    Map python_socks exceptions to httpx.ProxyError exceptions
    """