[enh] checker: background check

See settings.yml for the options
SIGUSR1 signal starts the checker.
The result is available at /stats/checker
This commit is contained in:
Alexandre Flament 2021-01-05 11:24:39 +01:00
parent 6e2872f436
commit 3a9f513521
9 changed files with 255 additions and 97 deletions

View File

@ -28,7 +28,9 @@ from searx.external_bang import get_bang_url
from searx.results import ResultContainer
from searx import logger
from searx.plugins import plugins
from searx.search.models import EngineRef, SearchQuery
from searx.search.processors import processors, initialize as initialize_processors
from searx.search.checker import initialize as initialize_checker
logger = logger.getChild('search')
@ -45,75 +47,11 @@ else:
sys.exit(1)
def initialize(settings_engines=None):
def initialize(settings_engines=None, enable_checker=False):
settings_engines = settings_engines or settings['engines']
initialize_processors(settings_engines)
class EngineRef:
__slots__ = 'name', 'category'
def __init__(self, name: str, category: str):
self.name = name
self.category = category
def __repr__(self):
return "EngineRef({!r}, {!r})".format(self.name, self.category)
def __eq__(self, other):
return self.name == other.name and self.category == other.category
def __hash__(self):
return hash((self.name, self.category))
class SearchQuery:
"""container for all the search parameters (query, language, etc...)"""
__slots__ = 'query', 'engineref_list', 'lang', 'safesearch', 'pageno', 'time_range',\
'timeout_limit', 'external_bang'
def __init__(self,
query: str,
engineref_list: typing.List[EngineRef],
lang: str='all',
safesearch: int=0,
pageno: int=1,
time_range: typing.Optional[str]=None,
timeout_limit: typing.Optional[float]=None,
external_bang: typing.Optional[str]=None):
self.query = query
self.engineref_list = engineref_list
self.lang = lang
self.safesearch = safesearch
self.pageno = pageno
self.time_range = time_range
self.timeout_limit = timeout_limit
self.external_bang = external_bang
@property
def categories(self):
return list(set(map(lambda engineref: engineref.category, self.engineref_list)))
def __repr__(self):
return "SearchQuery({!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r})".\
format(self.query, self.engineref_list, self.lang, self.safesearch,
self.pageno, self.time_range, self.timeout_limit, self.external_bang)
def __eq__(self, other):
return self.query == other.query\
and self.engineref_list == other.engineref_list\
and self.lang == other.lang\
and self.safesearch == other.safesearch\
and self.pageno == other.pageno\
and self.time_range == other.time_range\
and self.timeout_limit == other.timeout_limit\
and self.external_bang == other.external_bang
def __hash__(self):
return hash((self.query, tuple(self.engineref_list), self.lang, self.safesearch, self.pageno, self.time_range,
self.timeout_limit, self.external_bang))
if enable_checker:
initialize_checker()
class Search:

View File

@ -1 +1,4 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
from .impl import Checker
from .background import initialize, get_result

View File

@ -1,9 +1,13 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
import sys
import os
import argparse
import searx.search
import searx.search.processors
import searx.search.checker
from searx.search import processors
from searx.engines import engine_shortcuts
if sys.stdout.isatty() and os.environ.get('TERM') not in ['dumb', 'unknown']:
@ -18,20 +22,24 @@ else:
BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = "", "", "", "", "", "", "", ""
def iter_processor():
if len(sys.argv) > 1:
for name, processor in searx.search.processors.items():
if name in sys.argv:
def iter_processor(engine_name_list):
if len(engine_name_list) > 0:
for name in engine_name_list:
name = engine_shortcuts.get(name, name)
processor = processors.get(name)
if processor is not None:
yield name, processor
else:
print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, RED, ' Not found ', RESET_SEQ)
else:
for name, processor in searx.search.processors.items():
yield name, processor
def main():
def run(engine_name_list):
searx.search.initialize()
broken_urls = []
for name, processor in iter_processor():
for name, processor in iter_processor(engine_name_list):
if sys.stdout.isatty():
print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, WHITE, ' Checking', RESET_SEQ)
checker = searx.search.checker.Checker(processor)
@ -48,5 +56,13 @@ def main():
print('Error fetching', url)
def main():
    """Command line entry point: parse the engine names and check them.

    The positional arguments are engine names or shortcuts; when none is
    given the list is empty and is passed to ``run`` as such.
    """
    arg_parser = argparse.ArgumentParser(description='Check searx engines.')
    arg_parser.add_argument(
        'engine_name_list',
        metavar='engine name',
        type=str,
        nargs='*',
        help='engines name or shortcut list. Empty for all engines.',
    )
    run(arg_parser.parse_args().engine_name_list)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,106 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
import json
import random
import time
import threading
import os
import signal
from searx import logger, settings, searx_debug
from searx.exceptions import SearxSettingsException
from searx.search.processors import processors
from searx.search.checker import Checker
from searx.shared import schedule, storage
CHECKER_RESULT = 'CHECKER_RESULT'
running = threading.Lock()
def _get_interval(every, error_msg):
if isinstance(every, int):
every = (every, every)
if not isinstance(every, (tuple, list))\
or len(every) != 2\
or not isinstance(every[0], int)\
or not isinstance(every[1], int):
raise SearxSettingsException(error_msg, None)
return every
def _get_every():
    """Return the ``checker.scheduling.every`` setting as a two-item interval.

    Falls back to ``(300, 1800)`` when the setting is missing.
    """
    scheduling = settings.get('checker', {}).get('scheduling', {})
    configured = scheduling.get('every', (300, 1800))
    return _get_interval(configured, 'checker.scheduling.every is not a int or list')
def get_result():
    """Return the last stored checker result, or ``None`` when nothing is stored.

    The result is read from the shared storage under the ``CHECKER_RESULT``
    key and decoded from JSON.
    """
    # use the module constant rather than repeating the key as a literal
    serialized_result = storage.get_str(CHECKER_RESULT)
    if serialized_result is None:
        return None
    return json.loads(serialized_result)
def run():
    """Check every registered engine processor and store the outcome.

    Returns immediately when another run is already in progress (the
    module-level ``running`` lock is held). Otherwise each processor is
    checked in turn and the per-engine results are stored as JSON under the
    ``CHECKER_RESULT`` key:

    * ``{'status': True}`` when all tests of the engine passed
    * ``{'status': False, 'errors': ...}`` otherwise

    Any exception is logged through the module logger and then re-raised.
    """
    # non-blocking acquire: skip this run instead of queueing behind another one
    if not running.acquire(blocking=False):
        return
    try:
        logger.info('Starting checker')
        result = {}
        for name, processor in processors.items():
            logger.debug('Checking %s engine', name)
            checker = Checker(processor)
            checker.run()
            # NOTE(review): `succesfull` (sic) presumably matches the attribute
            # name defined by the checker implementation -- do not rename here.
            if checker.test_results.succesfull:
                result[name] = {'status': True}
            else:
                result[name] = {'status': False, 'errors': checker.test_results.errors}
        # use the module constant rather than repeating the key as a literal
        storage.set_str(CHECKER_RESULT, json.dumps(result))
        logger.info('Check done')
    except Exception:
        # this function runs in background threads: make the failure visible
        # in the application log before propagating it unchanged
        logger.exception('Error while running the checker')
        raise
    finally:
        running.release()
def _run_with_delay():
    """Sleep for a random delay, then run the checker.

    The delay is drawn between 0 and the width of the ``every`` interval
    (upper bound minus lower bound).
    """
    interval = _get_every()
    jitter = random.randint(0, interval[1] - interval[0])
    logger.debug('Start checker in %i seconds', jitter)
    time.sleep(jitter)
    run()
def _start_scheduling():
    """Register the periodic delayed run, then run the checker once right away.

    The lower bound of the ``every`` interval is the value passed to
    ``schedule``.
    """
    interval = _get_every()
    lower_bound = interval[0]
    schedule(lower_bound, _run_with_delay)
    run()
def _signal_handler(signum, frame):
    """Signal handler: start a checker run in a separate daemon thread.

    Both handler arguments are ignored.
    """
    worker = threading.Thread(target=run, daemon=True)
    worker.start()
def initialize():
    """Install the manual trigger and, when enabled, schedule the checker.

    * Where the platform provides SIGUSR1, the signal is bound to a handler
      that starts a checker run.
    * Nothing is scheduled in debug mode unless ``checker.off_when_debug``
      is set to a false value.
    * Nothing is scheduled when ``checker.scheduling`` is missing or falsy.
    * Otherwise a daemon timer starts the scheduling after a random delay
      picked in the ``checker.scheduling.start_after`` interval
      (default ``(300, 1800)``).

    :raises SearxSettingsException: when ``start_after`` is neither an int
        nor a 2-item list/tuple of ints
    """
    # SIGUSR1 is not defined on every platform (e.g. Windows): only install
    # the manual trigger where the signal exists instead of failing at startup.
    if hasattr(signal, 'SIGUSR1'):
        logger.info('Send SIGUSR1 signal to pid %i to start the checker', os.getpid())
        signal.signal(signal.SIGUSR1, _signal_handler)

    # special case when debug is activated
    if searx_debug and settings.get('checker', {}).get('off_when_debug', True):
        logger.info('debug mode: checker is disabled')
        return

    # a missing or falsy checker.scheduling setting disables the scheduler
    scheduling = settings.get('checker', {}).get('scheduling', None)
    if not scheduling:
        logger.info('Checker scheduler is disabled')
        return

    # pick a random delay in the start_after interval before the first run
    start_after = scheduling.get('start_after', (300, 1800))
    start_after = _get_interval(start_after, 'checker.scheduling.start_after is not a int or list')
    delay = random.randint(start_after[0], start_after[1])
    logger.info('Start checker in %i seconds', delay)
    t = threading.Timer(delay, _start_scheduling)
    # daemon timer: a pending checker start does not block interpreter exit
    t.daemon = True
    t.start()

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
import typing
import types
import functools
@ -11,7 +13,7 @@ import requests.exceptions
from searx import poolrequests, logger
from searx.results import ResultContainer
from searx.search import SearchQuery, EngineRef
from searx.search.models import SearchQuery, EngineRef
from searx.search.processors import EngineProcessor
@ -240,18 +242,24 @@ class ResultContainerTests:
self._check_infoboxes(self.result_container.infoboxes)
def has_infobox(self):
    """Record the error 'No infobox' when the ResultContainer holds no infobox."""
    infobox_count = len(self.result_container.infoboxes)
    if infobox_count != 0:
        return
    self._record_error('No infobox')
def has_answer(self):
    """Record the error 'No answer' when the ResultContainer holds no answer."""
    answer_count = len(self.result_container.answers)
    if answer_count != 0:
        return
    self._record_error('No answer')
def has_language(self, lang):
    """Check that at least one title or content of the results is written in `lang`.

    The languages are detected using pycld3, so the check may not be accurate.
    An error '<lang> not found' is recorded when `lang` was not detected.
    """
    if lang in self.languages:
        return
    self._record_error(lang + ' not found')
def not_empty(self):
"""Check the ResultContainer has at least one answer or infobox or result"""
result_types = set()
results = self.result_container.get_ordered_results()
if len(results) > 0:
@ -267,6 +275,7 @@ class ResultContainerTests:
self._record_error('No result')
def one_title_contains(self, title: str):
"""Check one of the titles contains `title` (case insensitive comparison)"""
title = title.lower()
for result in self.result_container.get_ordered_results():
if title in result['title'].lower():
@ -287,6 +296,7 @@ class CheckerTests:
self.result_container_tests_list = result_container_tests_list
def unique_results(self):
"""Check the results of each ResultContainer are unique"""
urls_list = [rct.result_urls for rct in self.result_container_tests_list]
if len(urls_list[0]) > 0:
# results on the first page

69
searx/search/models.py Normal file
View File

@ -0,0 +1,69 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
import typing
class EngineRef:
    """Pair of an engine name and a category.

    Instances compare equal when both fields are equal, and are hashable
    consistently with that equality.
    """

    __slots__ = 'name', 'category'

    def __init__(self, name: str, category: str):
        self.name = name
        self.category = category

    def __repr__(self):
        return "EngineRef({!r}, {!r})".format(self.name, self.category)

    def __eq__(self, other):
        # Returning NotImplemented lets Python fall back to its default
        # comparison, so comparing with an unrelated object (None, str, ...)
        # yields False instead of raising AttributeError.
        if not isinstance(other, EngineRef):
            return NotImplemented
        return self.name == other.name and self.category == other.category

    def __hash__(self):
        return hash((self.name, self.category))
class SearchQuery:
    """container for all the search parameters (query, language, etc...)

    Instances compare equal when all the parameters are equal, and are
    hashable consistently with that equality.
    """

    __slots__ = ('query', 'engineref_list', 'lang', 'safesearch', 'pageno', 'time_range',
                 'timeout_limit', 'external_bang')

    def __init__(self,
                 query: str,
                 # forward reference: EngineRef is resolved lazily by type checkers
                 engineref_list: typing.List['EngineRef'],
                 lang: str = 'all',
                 safesearch: int = 0,
                 pageno: int = 1,
                 time_range: typing.Optional[str] = None,
                 timeout_limit: typing.Optional[float] = None,
                 external_bang: typing.Optional[str] = None):
        self.query = query
        self.engineref_list = engineref_list
        self.lang = lang
        self.safesearch = safesearch
        self.pageno = pageno
        self.time_range = time_range
        self.timeout_limit = timeout_limit
        self.external_bang = external_bang

    @property
    def categories(self):
        """List of the distinct categories of the engine references (unordered)."""
        return list({engineref.category for engineref in self.engineref_list})

    def __repr__(self):
        return "SearchQuery({!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r})".format(
            self.query, self.engineref_list, self.lang, self.safesearch,
            self.pageno, self.time_range, self.timeout_limit, self.external_bang)

    def __eq__(self, other):
        # Returning NotImplemented lets Python fall back to its default
        # comparison, so comparing with an unrelated object (None, str, ...)
        # yields False instead of raising AttributeError.
        if not isinstance(other, SearchQuery):
            return NotImplemented
        return (self.query == other.query
                and self.engineref_list == other.engineref_list
                and self.lang == other.lang
                and self.safesearch == other.safesearch
                and self.pageno == other.pageno
                and self.time_range == other.time_range
                and self.timeout_limit == other.timeout_limit
                and self.external_bang == other.external_bang)

    def __hash__(self):
        # the engine list is converted to a tuple because lists are not hashable
        return hash((self.query, tuple(self.engineref_list), self.lang, self.safesearch, self.pageno,
                     self.time_range, self.timeout_limit, self.external_bang))

View File

@ -102,24 +102,33 @@ outgoing: # communication with search engines
# - "HTTPS rewrite"
# - ...
additional_tests:
rosebud: &test_rosebud
matrix:
query: rosebud
lang: en
result_container:
- not_empty
- ['one_title_contains', 'citizen kane']
test:
- unique_results
tests:
infobox: &tests_infobox
infobox:
matrix:
query: ["linux", "new york", "bbc"]
result_container:
- has_infobox
checker:
# disable checker when in debug mode
off_when_debug: True
# scheduling: each value is either an int or a [min, max] interval, in seconds
# use "scheduling: False" to disable scheduling
scheduling:
start_after: [300, 1800] # delay to start the first run of the checker
every: [86400, 90000] # how often the checker runs
# additional tests: only for the YAML anchors (see the engines section)
additional_tests:
rosebud: &test_rosebud
matrix:
query: rosebud
lang: en
result_container:
- not_empty
- ['one_title_contains', 'citizen kane']
test:
- unique_results
# tests: only for the YAML anchors (see the engines section)
tests:
infobox: &tests_infobox
infobox:
matrix:
query: ["linux", "new york", "bbc"]
result_container:
- has_infobox
engines:
- name: apk mirror

View File

@ -71,7 +71,8 @@ from searx.webadapter import get_search_query_from_webapp, get_selected_categori
from searx.utils import html_to_text, gen_useragent, dict_subset, match_language
from searx.version import VERSION_STRING
from searx.languages import language_codes as languages
from searx.search import SearchWithPlugins, initialize
from searx.search import SearchWithPlugins, initialize as search_initialize
from searx.search.checker import get_result as checker_get_result
from searx.query import RawTextQuery
from searx.autocomplete import searx_bang, backends as autocomplete_backends
from searx.plugins import plugins
@ -81,7 +82,6 @@ from searx.answerers import answerers
from searx.poolrequests import get_global_proxies
from searx.metrology.error_recorder import errors_per_engines
# serve pages with HTTP/1.1
from werkzeug.serving import WSGIRequestHandler
WSGIRequestHandler.protocol_version = "HTTP/{}".format(settings['server'].get('http_protocol_version', '1.0'))
@ -136,7 +136,7 @@ werkzeug_reloader = flask_run_development or (searx_debug and __name__ == "__mai
# initialize the engines except on the first run of the werkzeug server.
if not werkzeug_reloader\
or (werkzeug_reloader and os.environ.get("WERKZEUG_RUN_MAIN") == "true"):
initialize()
search_initialize(enable_checker=True)
babel = Babel(app)
@ -977,6 +977,12 @@ def stats_errors():
return jsonify(result)
@app.route('/stats/checker', methods=['GET'])
def stats_checker():
    """Serve the value returned by the checker's ``get_result`` as JSON."""
    return jsonify(checker_get_result())
@app.route('/robots.txt', methods=['GET'])
def robots():
return Response("""User-agent: *

View File

@ -49,7 +49,8 @@ setup(
},
entry_points={
'console_scripts': [
'searx-run = searx.webapp:run'
'searx-run = searx.webapp:run',
'searx-checker = searx.search.checker.__main__:main'
]
},
package_data={