
[enh] Improve ranking based on language (#3053)

Add configurable setting to rank search results higher when part of the
domain (e.g. 'en' in 'en.wikipedia.org' or 'de' in 'beispiel.de')
matches the selected search language. Does not apply to e.g. 'be' in
'youtube.com'.

Closes #206
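
In essence, the new check splits the result's host name on dots and looks for the selected language code among the whole labels, which is why 'be' inside 'youtube.com' does not count. A minimal sketch of that heuristic (the function name is illustrative, not part of the patch):

    from urllib.parse import urlparse

    def matches_language(url, language):
        # Split the host into its dot-separated labels and look for the language code.
        return language in urlparse(url).netloc.split('.')

    matches_language('https://en.wikipedia.org/wiki/Example', 'en')  # True
    matches_language('https://beispiel.de/', 'de')                   # True
    matches_language('https://youtube.com/', 'be')                   # False: 'be' is only a substring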
Finn authored 2021-11-15 20:31:22 +01:00, committed by GitHub
parent a880920dc7
commit 8c3454fd1b
4 changed files with 18 additions and 9 deletions

View File

@@ -6,6 +6,7 @@ from urllib.parse import urlparse, unquote
 from searx import logger
 from searx.engines import engines
 from searx.metrology.error_recorder import record_error
+from searx import settings
 CONTENT_LEN_IGNORED_CHARS_REGEX = re.compile(r'[,;:!?\./\\\\ ()-_]', re.M | re.U)
@@ -129,13 +130,18 @@ def merge_two_infoboxes(infobox1, infobox2):
             infobox1['content'] = content2
-def result_score(result):
+def result_score(result, language):
     weight = 1.0
     for result_engine in result['engines']:
         if hasattr(engines[result_engine], 'weight'):
             weight *= float(engines[result_engine].weight)
+    if settings['search']['prefer_configured_language']:
+        domain_parts = result['parsed_url'].netloc.split('.')
+        if language in domain_parts:
+            weight *= 1.1
     occurences = len(result['positions'])
     return sum((occurences * weight) / position for position in result['positions'])
@@ -145,9 +151,10 @@ class ResultContainer:
     """docstring for ResultContainer"""
     __slots__ = '_merged_results', 'infoboxes', 'suggestions', 'answers', 'corrections', '_number_of_results',\
-        '_ordered', 'paging', 'unresponsive_engines', 'timings', 'redirect_url', 'engine_data'
+        '_ordered', 'paging', 'unresponsive_engines', 'timings', 'redirect_url', 'engine_data',\
+        '_language'
-    def __init__(self):
+    def __init__(self, language):
         super().__init__()
         self._merged_results = []
         self.infoboxes = []
@@ -161,6 +168,7 @@ class ResultContainer:
         self.unresponsive_engines = set()
         self.timings = []
         self.redirect_url = None
+        self._language = language.lower().split('-')[0]
     def extend(self, engine_name, results):
         standard_result_count = 0
@@ -299,7 +307,7 @@ class ResultContainer:
     def order_results(self):
         for result in self._merged_results:
-            score = result_score(result)
+            score = result_score(result, self._language)
             result['score'] = score
             with RLock():
                 for result_engine in result['engines']:
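
Put together, the updated scoring multiplies the engine weight by 1.1 when a domain label matches the query language, then sums the position-weighted contributions of the merged result. A standalone sketch of the formula (illustrative names, not the patch's code):

    def score(positions, engine_weight=1.0, language_matches_domain=False):
        # Mirrors result_score(): optional 1.1 boost, then the position-based sum.
        weight = engine_weight
        if language_matches_domain:
            weight *= 1.1
        occurences = len(positions)
        return sum((occurences * weight) / position for position in positions)

    # A result merged from two engines at positions 1 and 3:
    #   without the boost: (2 * 1.0) / 1 + (2 * 1.0) / 3 ≈ 2.67
    #   with the boost:    (2 * 1.1) / 1 + (2 * 1.1) / 3 ≈ 2.93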

View File

@@ -66,7 +66,7 @@ class Search:
         # init vars
         super().__init__()
         self.search_query = search_query
-        self.result_container = ResultContainer()
+        self.result_container = ResultContainer(search_query.lang)
         self.start_time = None
         self.actual_timeout = None
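
The value handed in here is the raw query language (e.g. "en-US" or "de"); ResultContainer.__init__ keeps only the primary subtag, so the later comparison against domain labels uses the bare code. A quick illustration of that normalization:

    language = "en-US"
    # Same expression as in ResultContainer.__init__: lowercase, drop the region part.
    print(language.lower().split('-')[0])  # -> 'en'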

View File

@@ -19,6 +19,7 @@ search:
     default_lang : "" # Default search language - leave blank to detect from browser information or use codes from 'languages.py'
     ban_time_on_fail : 5 # ban time in seconds after engine errors
     max_ban_time_on_fail : 120 # max ban time in seconds after engine errors
+    prefer_configured_language: False # increase weight of results in configured language in ranking
 server:
     port : 8888

View File

@@ -20,22 +20,22 @@ def fake_result(url='https://aa.bb/cc?dd=ee#ff',
 class ResultContainerTestCase(SearxTestCase):
     def test_empty(self):
-        c = ResultContainer()
+        c = ResultContainer("en-US")
         self.assertEqual(c.get_ordered_results(), [])
     def test_one_result(self):
-        c = ResultContainer()
+        c = ResultContainer("en-US")
         c.extend('wikipedia', [fake_result()])
         self.assertEqual(c.results_length(), 1)
     def test_one_suggestion(self):
-        c = ResultContainer()
+        c = ResultContainer("en-US")
         c.extend('wikipedia', [fake_result(suggestion=True)])
         self.assertEqual(len(c.suggestions), 1)
         self.assertEqual(c.results_length(), 0)
     def test_result_merge(self):
-        c = ResultContainer()
+        c = ResultContainer("en-US")
         c.extend('wikipedia', [fake_result()])
         c.extend('wikidata', [fake_result(), fake_result(url='https://example.com/')])
         self.assertEqual(c.results_length(), 2)
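
The commit itself only adapts the existing tests to the new constructor; a hypothetical test along the following lines could exercise the boost directly, assuming the setting is toggled on for the test and 'wikipedia' is among the loaded engines in the test environment (names and details are illustrative, not part of the commit):

    from urllib.parse import urlparse
    from searx import settings
    from searx.results import result_score

    def test_prefer_configured_language(self):
        settings['search']['prefer_configured_language'] = True
        try:
            result = {'engines': ['wikipedia'],
                      'parsed_url': urlparse('https://de.wikipedia.org/wiki/Haus'),
                      'positions': [1]}
            # 'de' is a domain label, so the score for 'de' gets the 1.1 boost.
            self.assertGreater(result_score(result, 'de'), result_score(result, 'en'))
        finally:
            settings['search']['prefer_configured_language'] = False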