[enh] checker: background check

See settings.yml for the options
SIGUSR1 signal starts the checker.
The result is available at /stats/checker
This commit is contained in:
Alexandre Flament 2021-01-05 11:24:39 +01:00
parent 6e2872f436
commit 3a9f513521
9 changed files with 255 additions and 97 deletions

View File

@ -28,7 +28,9 @@ from searx.external_bang import get_bang_url
from searx.results import ResultContainer from searx.results import ResultContainer
from searx import logger from searx import logger
from searx.plugins import plugins from searx.plugins import plugins
from searx.search.models import EngineRef, SearchQuery
from searx.search.processors import processors, initialize as initialize_processors from searx.search.processors import processors, initialize as initialize_processors
from searx.search.checker import initialize as initialize_checker
logger = logger.getChild('search') logger = logger.getChild('search')
@ -45,75 +47,11 @@ else:
sys.exit(1) sys.exit(1)
def initialize(settings_engines=None): def initialize(settings_engines=None, enable_checker=False):
settings_engines = settings_engines or settings['engines'] settings_engines = settings_engines or settings['engines']
initialize_processors(settings_engines) initialize_processors(settings_engines)
if enable_checker:
initialize_checker()
class EngineRef:
    """Pair of an engine name and a category.

    Instances compare equal, and hash equal, when both the name and the
    category match.
    """

    __slots__ = 'name', 'category'

    def __init__(self, name: str, category: str):
        self.name = name
        self.category = category

    def __repr__(self):
        return "EngineRef({!r}, {!r})".format(self.name, self.category)

    def __eq__(self, other):
        # Defer to the other operand instead of raising AttributeError when
        # compared with an unrelated object (None, str, ...).
        if not isinstance(other, EngineRef):
            return NotImplemented
        return self.name == other.name and self.category == other.category

    def __hash__(self):
        return hash((self.name, self.category))
class SearchQuery:
    """container for all the search parameters (query, language, etc...)

    Two instances compare equal, and hash equal, when all their parameters
    are equal.
    """

    __slots__ = ('query', 'engineref_list', 'lang', 'safesearch', 'pageno', 'time_range',
                 'timeout_limit', 'external_bang')

    def __init__(self,
                 query: str,
                 engineref_list: typing.List['EngineRef'],
                 lang: str = 'all',
                 safesearch: int = 0,
                 pageno: int = 1,
                 time_range: typing.Optional[str] = None,
                 timeout_limit: typing.Optional[float] = None,
                 external_bang: typing.Optional[str] = None):
        self.query = query
        self.engineref_list = engineref_list
        self.lang = lang
        self.safesearch = safesearch
        self.pageno = pageno
        self.time_range = time_range
        self.timeout_limit = timeout_limit
        self.external_bang = external_bang

    @property
    def categories(self):
        """List of the distinct categories found in ``engineref_list`` (order not guaranteed)."""
        return list({engineref.category for engineref in self.engineref_list})

    def __repr__(self):
        return "SearchQuery({!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r})".\
            format(self.query, self.engineref_list, self.lang, self.safesearch,
                   self.pageno, self.time_range, self.timeout_limit, self.external_bang)

    def __eq__(self, other):
        # Defer to the other operand instead of raising AttributeError when
        # compared with an unrelated object (None, str, ...).
        if not isinstance(other, SearchQuery):
            return NotImplemented
        return self.query == other.query\
            and self.engineref_list == other.engineref_list\
            and self.lang == other.lang\
            and self.safesearch == other.safesearch\
            and self.pageno == other.pageno\
            and self.time_range == other.time_range\
            and self.timeout_limit == other.timeout_limit\
            and self.external_bang == other.external_bang

    def __hash__(self):
        # engineref_list is a list (unhashable): hash an immutable copy of it
        return hash((self.query, tuple(self.engineref_list), self.lang, self.safesearch, self.pageno, self.time_range,
                     self.timeout_limit, self.external_bang))
class Search: class Search:

View File

@ -1 +1,4 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
from .impl import Checker from .impl import Checker
from .background import initialize, get_result

View File

@ -1,9 +1,13 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
import sys import sys
import os import os
import argparse
import searx.search import searx.search
import searx.search.processors
import searx.search.checker import searx.search.checker
from searx.search import processors
from searx.engines import engine_shortcuts
if sys.stdout.isatty() and os.environ.get('TERM') not in ['dumb', 'unknown']: if sys.stdout.isatty() and os.environ.get('TERM') not in ['dumb', 'unknown']:
@ -18,20 +22,24 @@ else:
BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = "", "", "", "", "", "", "", "" BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = "", "", "", "", "", "", "", ""
def iter_processor(): def iter_processor(engine_name_list):
if len(sys.argv) > 1: if len(engine_name_list) > 0:
for name, processor in searx.search.processors.items(): for name in engine_name_list:
if name in sys.argv: name = engine_shortcuts.get(name, name)
processor = processors.get(name)
if processor is not None:
yield name, processor yield name, processor
else:
print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, RED, ' Not found ', RESET_SEQ)
else: else:
for name, processor in searx.search.processors.items(): for name, processor in searx.search.processors.items():
yield name, processor yield name, processor
def main(): def run(engine_name_list):
searx.search.initialize() searx.search.initialize()
broken_urls = [] broken_urls = []
for name, processor in iter_processor(): for name, processor in iter_processor(engine_name_list):
if sys.stdout.isatty(): if sys.stdout.isatty():
print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, WHITE, ' Checking', RESET_SEQ) print(BOLD_SEQ, 'Engine ', '%-30s' % name, RESET_SEQ, WHITE, ' Checking', RESET_SEQ)
checker = searx.search.checker.Checker(processor) checker = searx.search.checker.Checker(processor)
@ -48,5 +56,13 @@ def main():
print('Error fetching', url) print('Error fetching', url)
def main():
    """Command line entry point: parse the engine names and run the checker on them."""
    arg_parser = argparse.ArgumentParser(description='Check searx engines.')
    arg_parser.add_argument(
        'engine_name_list',
        metavar='engine name',
        type=str,
        nargs='*',
        help='engines name or shortcut list. Empty for all engines.',
    )
    cli_args = arg_parser.parse_args()
    # an empty list means "check every engine" (see the help text above)
    run(cli_args.engine_name_list)
if __name__ == '__main__': if __name__ == '__main__':
main() main()

View File

@ -0,0 +1,106 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
import json
import random
import time
import threading
import os
import signal
from searx import logger, settings, searx_debug
from searx.exceptions import SearxSettingsException
from searx.search.processors import processors
from searx.search.checker import Checker
from searx.shared import schedule, storage
# storage key of the serialized checker result
# (same value as the key used by get_result() and run())
CHECKER_RESULT = 'CHECKER_RESULT'
# held for the whole duration of run(); run() returns immediately when it is already held
running = threading.Lock()
def _get_interval(every, error_msg):
if isinstance(every, int):
every = (every, every)
if not isinstance(every, (tuple, list))\
or len(every) != 2\
or not isinstance(every[0], int)\
or not isinstance(every[1], int):
raise SearxSettingsException(error_msg, None)
return every
def _get_every():
    """Return the validated ``checker.scheduling.every`` setting.

    Falls back to ``(300, 1800)`` when the setting is missing.
    """
    checker_settings = settings.get('checker', {})
    scheduling = checker_settings.get('scheduling', {})
    configured = scheduling.get('every', (300, 1800))
    return _get_interval(configured, 'checker.scheduling.every is not a int or list')
def get_result():
    """Return the last stored checker result.

    :returns: the object decoded from the JSON kept under ``CHECKER_RESULT``
        in the shared storage, or ``None`` when nothing is stored.
    """
    serialized_result = storage.get_str(CHECKER_RESULT)
    if serialized_result is None:
        return None
    return json.loads(serialized_result)
def run():
    """Check every engine processor and store the outcome in the shared storage.

    The stored value is a JSON object mapping each engine name to
    ``{'status': True}`` or ``{'status': False, 'errors': ...}``.
    The call is a no-op when another check is already running.
    """
    # non-blocking acquire: do not queue behind a check that is in progress
    if not running.acquire(blocking=False):
        return
    try:
        logger.info('Starting checker')
        result = {}
        for name, processor in processors.items():
            logger.debug('Checking %s engine', name)
            checker = Checker(processor)
            checker.run()
            # attribute name spelled as exposed by the checker test results
            if checker.test_results.succesfull:
                result[name] = {'status': True}
            else:
                result[name] = {'status': False, 'errors': checker.test_results.errors}
        storage.set_str(CHECKER_RESULT, json.dumps(result))
        logger.info('Check done')
    except Exception:
        # run() is started from threads and scheduler callbacks: leave a trace
        # in the log before the exception propagates
        logger.exception('Error while running the checker')
        raise
    finally:
        running.release()
def _run_with_delay():
    """Sleep a random number of seconds, then run the checker.

    The delay is drawn between 0 and the width of the
    ``checker.scheduling.every`` interval.
    """
    lower, upper = _get_every()
    delay = random.randint(0, upper - lower)
    logger.debug('Start checker in %i seconds', delay)
    time.sleep(delay)
    run()
def _start_scheduling():
    """Register the delayed checker with the scheduler, then run a first check now."""
    # the lower bound of checker.scheduling.every is the value given to schedule()
    period = _get_every()[0]
    schedule(period, _run_with_delay)
    run()
def _signal_handler(signum, frame):
    """Signal handler: start the checker in a daemon thread and return at once."""
    threading.Thread(target=run, daemon=True).start()
def initialize():
    """Install the SIGUSR1 trigger and, when enabled, arm the first scheduled check.

    Nothing is scheduled when debug mode is on and ``checker.off_when_debug``
    is true (the default), or when ``checker.scheduling`` is missing or false.

    :raises SearxSettingsException: when ``checker.scheduling.every`` or
        ``checker.scheduling.start_after`` is not a valid interval.
    """
    # SIGUSR1 is not defined on every platform (e.g. Windows)
    if hasattr(signal, 'SIGUSR1'):
        logger.info('Send SIGUSR1 signal to pid %i to start the checker', os.getpid())
        signal.signal(signal.SIGUSR1, _signal_handler)

    # special case when debug is activated
    if searx_debug and settings.get('checker', {}).get('off_when_debug', True):
        logger.info('debug mode: checker is disabled')
        return

    scheduling = settings.get('checker', {}).get('scheduling', None)
    if not scheduling:
        logger.info('Checker scheduler is disabled')
        return

    # check the value of checker.scheduling.every now: a bad value is reported
    # at startup instead of later, inside the timer thread
    _get_every()

    start_after = scheduling.get('start_after', (300, 1800))
    start_after = _get_interval(start_after, 'checker.scheduling.start_after is not a int or list')
    delay = random.randint(start_after[0], start_after[1])
    logger.info('Start checker in %i seconds', delay)
    # daemon timer: a pending first run does not keep the process alive
    t = threading.Timer(delay, _start_scheduling)
    t.daemon = True
    t.start()

View File

@ -1,3 +1,5 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
import typing import typing
import types import types
import functools import functools
@ -11,7 +13,7 @@ import requests.exceptions
from searx import poolrequests, logger from searx import poolrequests, logger
from searx.results import ResultContainer from searx.results import ResultContainer
from searx.search import SearchQuery, EngineRef from searx.search.models import SearchQuery, EngineRef
from searx.search.processors import EngineProcessor from searx.search.processors import EngineProcessor
@ -240,18 +242,24 @@ class ResultContainerTests:
self._check_infoboxes(self.result_container.infoboxes) self._check_infoboxes(self.result_container.infoboxes)
def has_infobox(self): def has_infobox(self):
"""Check the ResultContainer has at least one infobox"""
if len(self.result_container.infoboxes) == 0: if len(self.result_container.infoboxes) == 0:
self._record_error('No infobox') self._record_error('No infobox')
def has_answer(self): def has_answer(self):
"""Check the ResultContainer has at least one answer"""
if len(self.result_container.answers) == 0: if len(self.result_container.answers) == 0:
self._record_error('No answer') self._record_error('No answer')
def has_language(self, lang): def has_language(self, lang):
"""Check at least one title or content of the results is written in the `lang`.
Detected using pycld3, may not be accurate"""
if lang not in self.languages: if lang not in self.languages:
self._record_error(lang + ' not found') self._record_error(lang + ' not found')
def not_empty(self): def not_empty(self):
"""Check the ResultContainer has at least one answer or infobox or result"""
result_types = set() result_types = set()
results = self.result_container.get_ordered_results() results = self.result_container.get_ordered_results()
if len(results) > 0: if len(results) > 0:
@ -267,6 +275,7 @@ class ResultContainerTests:
self._record_error('No result') self._record_error('No result')
def one_title_contains(self, title: str): def one_title_contains(self, title: str):
"""Check one of the title contains `title` (case insensitive comparaison)"""
title = title.lower() title = title.lower()
for result in self.result_container.get_ordered_results(): for result in self.result_container.get_ordered_results():
if title in result['title'].lower(): if title in result['title'].lower():
@ -287,6 +296,7 @@ class CheckerTests:
self.result_container_tests_list = result_container_tests_list self.result_container_tests_list = result_container_tests_list
def unique_results(self): def unique_results(self):
"""Check the results of each ResultContain is unique"""
urls_list = [rct.result_urls for rct in self.result_container_tests_list] urls_list = [rct.result_urls for rct in self.result_container_tests_list]
if len(urls_list[0]) > 0: if len(urls_list[0]) > 0:
# results on the first page # results on the first page

69
searx/search/models.py Normal file
View File

@ -0,0 +1,69 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
import typing
class EngineRef:
    """Pair of an engine name and a category.

    Instances compare equal, and hash equal, when both the name and the
    category match.
    """

    __slots__ = 'name', 'category'

    def __init__(self, name: str, category: str):
        self.name = name
        self.category = category

    def __repr__(self):
        return "EngineRef({!r}, {!r})".format(self.name, self.category)

    def __eq__(self, other):
        # Defer to the other operand instead of raising AttributeError when
        # compared with an unrelated object (None, str, ...).
        if not isinstance(other, EngineRef):
            return NotImplemented
        return self.name == other.name and self.category == other.category

    def __hash__(self):
        return hash((self.name, self.category))
class SearchQuery:
    """container for all the search parameters (query, language, etc...)

    Two instances compare equal, and hash equal, when all their parameters
    are equal.
    """

    __slots__ = ('query', 'engineref_list', 'lang', 'safesearch', 'pageno', 'time_range',
                 'timeout_limit', 'external_bang')

    def __init__(self,
                 query: str,
                 engineref_list: typing.List['EngineRef'],
                 lang: str = 'all',
                 safesearch: int = 0,
                 pageno: int = 1,
                 time_range: typing.Optional[str] = None,
                 timeout_limit: typing.Optional[float] = None,
                 external_bang: typing.Optional[str] = None):
        self.query = query
        self.engineref_list = engineref_list
        self.lang = lang
        self.safesearch = safesearch
        self.pageno = pageno
        self.time_range = time_range
        self.timeout_limit = timeout_limit
        self.external_bang = external_bang

    @property
    def categories(self):
        """List of the distinct categories found in ``engineref_list`` (order not guaranteed)."""
        return list({engineref.category for engineref in self.engineref_list})

    def __repr__(self):
        return "SearchQuery({!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r}, {!r})".\
            format(self.query, self.engineref_list, self.lang, self.safesearch,
                   self.pageno, self.time_range, self.timeout_limit, self.external_bang)

    def __eq__(self, other):
        # Defer to the other operand instead of raising AttributeError when
        # compared with an unrelated object (None, str, ...).
        if not isinstance(other, SearchQuery):
            return NotImplemented
        return self.query == other.query\
            and self.engineref_list == other.engineref_list\
            and self.lang == other.lang\
            and self.safesearch == other.safesearch\
            and self.pageno == other.pageno\
            and self.time_range == other.time_range\
            and self.timeout_limit == other.timeout_limit\
            and self.external_bang == other.external_bang

    def __hash__(self):
        # engineref_list is a list (unhashable): hash an immutable copy of it
        return hash((self.query, tuple(self.engineref_list), self.lang, self.safesearch, self.pageno, self.time_range,
                     self.timeout_limit, self.external_bang))

View File

@ -102,24 +102,33 @@ outgoing: # communication with search engines
# - "HTTPS rewrite" # - "HTTPS rewrite"
# - ... # - ...
additional_tests: checker:
rosebud: &test_rosebud # disable checker when in debug mode
matrix: off_when_debug: True
query: rosebud # scheduling: interval or int
lang: en # use "scheduling: False" to disable scheduling
result_container: scheduling:
- not_empty start_after: [300, 1800] # delay to start the first run of the checker
- [one_title_contains', 'citizen kane'] every: [86400, 90000] # how often the checker runs
test: # additional tests: only for the YAML anchors (see the engines section)
- unique_results additional_tests:
rosebud: &test_rosebud
tests: matrix:
infobox: &tests_infobox query: rosebud
infobox: lang: en
matrix: result_container:
query: ["linux", "new york", "bbc"] - not_empty
result_container: - ['one_title_contains', 'citizen kane']
- has_infobox test:
- unique_results
# tests: only for the YAML anchors (see the engines section)
tests:
infobox: &tests_infobox
infobox:
matrix:
query: ["linux", "new york", "bbc"]
result_container:
- has_infobox
engines: engines:
- name: apk mirror - name: apk mirror

View File

@ -71,7 +71,8 @@ from searx.webadapter import get_search_query_from_webapp, get_selected_categori
from searx.utils import html_to_text, gen_useragent, dict_subset, match_language from searx.utils import html_to_text, gen_useragent, dict_subset, match_language
from searx.version import VERSION_STRING from searx.version import VERSION_STRING
from searx.languages import language_codes as languages from searx.languages import language_codes as languages
from searx.search import SearchWithPlugins, initialize from searx.search import SearchWithPlugins, initialize as search_initialize
from searx.search.checker import get_result as checker_get_result
from searx.query import RawTextQuery from searx.query import RawTextQuery
from searx.autocomplete import searx_bang, backends as autocomplete_backends from searx.autocomplete import searx_bang, backends as autocomplete_backends
from searx.plugins import plugins from searx.plugins import plugins
@ -81,7 +82,6 @@ from searx.answerers import answerers
from searx.poolrequests import get_global_proxies from searx.poolrequests import get_global_proxies
from searx.metrology.error_recorder import errors_per_engines from searx.metrology.error_recorder import errors_per_engines
# serve pages with HTTP/1.1 # serve pages with HTTP/1.1
from werkzeug.serving import WSGIRequestHandler from werkzeug.serving import WSGIRequestHandler
WSGIRequestHandler.protocol_version = "HTTP/{}".format(settings['server'].get('http_protocol_version', '1.0')) WSGIRequestHandler.protocol_version = "HTTP/{}".format(settings['server'].get('http_protocol_version', '1.0'))
@ -136,7 +136,7 @@ werkzeug_reloader = flask_run_development or (searx_debug and __name__ == "__mai
# initialize the engines except on the first run of the werkzeug server. # initialize the engines except on the first run of the werkzeug server.
if not werkzeug_reloader\ if not werkzeug_reloader\
or (werkzeug_reloader and os.environ.get("WERKZEUG_RUN_MAIN") == "true"): or (werkzeug_reloader and os.environ.get("WERKZEUG_RUN_MAIN") == "true"):
initialize() search_initialize(enable_checker=True)
babel = Babel(app) babel = Babel(app)
@ -977,6 +977,12 @@ def stats_errors():
return jsonify(result) return jsonify(result)
@app.route('/stats/checker', methods=['GET'])
def stats_checker():
    """Serve the checker result, as returned by ``checker_get_result()``, as JSON."""
    return jsonify(checker_get_result())
@app.route('/robots.txt', methods=['GET']) @app.route('/robots.txt', methods=['GET'])
def robots(): def robots():
return Response("""User-agent: * return Response("""User-agent: *

View File

@ -49,7 +49,8 @@ setup(
}, },
entry_points={ entry_points={
'console_scripts': [ 'console_scripts': [
'searx-run = searx.webapp:run' 'searx-run = searx.webapp:run',
'searx-checker = searx.search.checker.__main__:main'
] ]
}, },
package_data={ package_data={