[enh] Improve ranking based on language (#3053)

Add a configurable setting that ranks search results higher when a dot-separated part of the domain (e.g. 'en' in 'en.wikipedia.org' or 'de' in 'beispiel.de') matches the selected search language. Only whole domain labels are compared, so this does not apply to e.g. 'be' in 'youtube.com'. Closes #206
2024-12-22 09:12:05 +01:00 · 2021-11-15 20:31:22 +01:00 · 2021-11-15 20:31:22 +01:00 · 8c3454fd1b
commit 8c3454fd1b
parent a880920dc7
4 changed files with 18 additions and 9 deletions
--- a/searx/results.py
+++ b/searx/results.py
@ -6,6 +6,7 @@ from urllib.parse import urlparse, unquote
 from searx import logger
 from searx.engines import engines
 from searx.metrology.error_recorder import record_error
+from searx import settings


 CONTENT_LEN_IGNORED_CHARS_REGEX = re.compile(r'[,;:!?\./\\\\ ()-_]', re.M | re.U)
@ -129,13 +130,18 @@ def merge_two_infoboxes(infobox1, infobox2):
            infobox1['content'] = content2


-def result_score(result):
+def result_score(result, language):
    weight = 1.0

    for result_engine in result['engines']:
        if hasattr(engines[result_engine], 'weight'):
            weight *= float(engines[result_engine].weight)

+    if settings['search']['prefer_configured_language']:
+        domain_parts = result['parsed_url'].netloc.split('.')
+        if language in domain_parts:
+            weight *= 1.1
+
    occurences = len(result['positions'])

    return sum((occurences * weight) / position for position in result['positions'])
@ -145,9 +151,10 @@ class ResultContainer:
    """docstring for ResultContainer"""

    __slots__ = '_merged_results', 'infoboxes', 'suggestions', 'answers', 'corrections', '_number_of_results',\
-                '_ordered', 'paging', 'unresponsive_engines', 'timings', 'redirect_url', 'engine_data'
+                '_ordered', 'paging', 'unresponsive_engines', 'timings', 'redirect_url', 'engine_data',\
+                '_language'

-    def __init__(self):
+    def __init__(self, language):
        super().__init__()
        self._merged_results = []
        self.infoboxes = []
@ -161,6 +168,7 @@ class ResultContainer:
        self.unresponsive_engines = set()
        self.timings = []
        self.redirect_url = None
+        self._language = language.lower().split('-')[0]

    def extend(self, engine_name, results):
        standard_result_count = 0
@ -299,7 +307,7 @@ class ResultContainer:

    def order_results(self):
        for result in self._merged_results:
-            score = result_score(result)
+            score = result_score(result, self._language)
            result['score'] = score
            with RLock():
                for result_engine in result['engines']:
--- a/searx/search/init.py
+++ b/searx/search/init.py
@ -66,7 +66,7 @@ class Search:
        # init vars
        super().__init__()
        self.search_query = search_query
-        self.result_container = ResultContainer()
+        self.result_container = ResultContainer(search_query.lang)
        self.start_time = None
        self.actual_timeout = None

--- a/searx/settings.yml
+++ b/searx/settings.yml
@ -19,6 +19,7 @@ search:
    default_lang : "" # Default search language - leave blank to detect from browser information or use codes from 'languages.py'
    ban_time_on_fail : 5 # ban time in seconds after engine errors
    max_ban_time_on_fail : 120 # max ban time in seconds after engine errors
+    prefer_configured_language: False # increase weight of results in configured language in ranking

 server:
    port : 8888
--- a/tests/unit/test_results.py
+++ b/tests/unit/test_results.py
@ -20,22 +20,22 @@ def fake_result(url='https://aa.bb/cc?dd=ee#ff',
 class ResultContainerTestCase(SearxTestCase):

    def test_empty(self):
-        c = ResultContainer()
+        c = ResultContainer("en-US")
        self.assertEqual(c.get_ordered_results(), [])

    def test_one_result(self):
-        c = ResultContainer()
+        c = ResultContainer("en-US")
        c.extend('wikipedia', [fake_result()])
        self.assertEqual(c.results_length(), 1)

    def test_one_suggestion(self):
-        c = ResultContainer()
+        c = ResultContainer("en-US")
        c.extend('wikipedia', [fake_result(suggestion=True)])
        self.assertEqual(len(c.suggestions), 1)
        self.assertEqual(c.results_length(), 0)

    def test_result_merge(self):
-        c = ResultContainer()
+        c = ResultContainer("en-US")
        c.extend('wikipedia', [fake_result()])
        c.extend('wikidata', [fake_result(), fake_result(url='https://example.com/')])
        self.assertEqual(c.results_length(), 2)