From 8c3454fd1be4b578db5693ff1e939af238da6ac8 Mon Sep 17 00:00:00 2001 From: Finn <34682885+0xhtml@users.noreply.github.com> Date: Mon, 15 Nov 2021 20:31:22 +0100 Subject: [PATCH] [enh] Improve ranking based on language (#3053) Add configurable setting to rank search results higher when part of the domain (e.g. 'en' in 'en.wikipedia.org' or 'de' in 'beispiel.de') matches the selected search language. Does not apply to e.g. 'be' in 'youtube.com'. Closes #206 --- searx/results.py | 16 ++++++++++++---- searx/search/__init__.py | 2 +- searx/settings.yml | 1 + tests/unit/test_results.py | 8 ++++---- 4 files changed, 18 insertions(+), 9 deletions(-) diff --git a/searx/results.py b/searx/results.py index b3b87411..3acf1045 100644 --- a/searx/results.py +++ b/searx/results.py @@ -6,6 +6,7 @@ from urllib.parse import urlparse, unquote from searx import logger from searx.engines import engines from searx.metrology.error_recorder import record_error +from searx import settings CONTENT_LEN_IGNORED_CHARS_REGEX = re.compile(r'[,;:!?\./\\\\ ()-_]', re.M | re.U) @@ -129,13 +130,18 @@ def merge_two_infoboxes(infobox1, infobox2): infobox1['content'] = content2 -def result_score(result): +def result_score(result, language): weight = 1.0 for result_engine in result['engines']: if hasattr(engines[result_engine], 'weight'): weight *= float(engines[result_engine].weight) + if settings['search']['prefer_configured_language']: + domain_parts = result['parsed_url'].netloc.split('.') + if language in domain_parts: + weight *= 1.1 + occurences = len(result['positions']) return sum((occurences * weight) / position for position in result['positions']) @@ -145,9 +151,10 @@ class ResultContainer: """docstring for ResultContainer""" __slots__ = '_merged_results', 'infoboxes', 'suggestions', 'answers', 'corrections', '_number_of_results',\ - '_ordered', 'paging', 'unresponsive_engines', 'timings', 'redirect_url', 'engine_data' + '_ordered', 'paging', 'unresponsive_engines', 'timings', 
'redirect_url', 'engine_data',\ + '_language' - def __init__(self): + def __init__(self, language): super().__init__() self._merged_results = [] self.infoboxes = [] @@ -161,6 +168,7 @@ class ResultContainer: self.unresponsive_engines = set() self.timings = [] self.redirect_url = None + self._language = language.lower().split('-')[0] def extend(self, engine_name, results): standard_result_count = 0 @@ -299,7 +307,7 @@ class ResultContainer: def order_results(self): for result in self._merged_results: - score = result_score(result) + score = result_score(result, self._language) result['score'] = score with RLock(): for result_engine in result['engines']: diff --git a/searx/search/__init__.py b/searx/search/__init__.py index a3c70866..f2f774bb 100644 --- a/searx/search/__init__.py +++ b/searx/search/__init__.py @@ -66,7 +66,7 @@ class Search: # init vars super().__init__() self.search_query = search_query - self.result_container = ResultContainer() + self.result_container = ResultContainer(search_query.lang) self.start_time = None self.actual_timeout = None diff --git a/searx/settings.yml b/searx/settings.yml index 591c819d..175a8656 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -19,6 +19,7 @@ search: default_lang : "" # Default search language - leave blank to detect from browser information or use codes from 'languages.py' ban_time_on_fail : 5 # ban time in seconds after engine errors max_ban_time_on_fail : 120 # max ban time in seconds after engine errors + prefer_configured_language: False # increase weight of results in configured language in ranking server: port : 8888 diff --git a/tests/unit/test_results.py b/tests/unit/test_results.py index 274b5b37..a1d9e673 100644 --- a/tests/unit/test_results.py +++ b/tests/unit/test_results.py @@ -20,22 +20,22 @@ def fake_result(url='https://aa.bb/cc?dd=ee#ff', class ResultContainerTestCase(SearxTestCase): def test_empty(self): - c = ResultContainer() + c = ResultContainer("en-US") 
self.assertEqual(c.get_ordered_results(), []) def test_one_result(self): - c = ResultContainer() + c = ResultContainer("en-US") c.extend('wikipedia', [fake_result()]) self.assertEqual(c.results_length(), 1) def test_one_suggestion(self): - c = ResultContainer() + c = ResultContainer("en-US") c.extend('wikipedia', [fake_result(suggestion=True)]) self.assertEqual(len(c.suggestions), 1) self.assertEqual(c.results_length(), 0) def test_result_merge(self): - c = ResultContainer() + c = ResultContainer("en-US") c.extend('wikipedia', [fake_result()]) c.extend('wikidata', [fake_result(), fake_result(url='https://example.com/')]) self.assertEqual(c.results_length(), 2)