fix utils/fetch_languages to work with new languages

This commit is contained in:
Marc Abonce Seguin 2020-09-14 00:07:45 -07:00 committed by Alexandre Flament
parent 41800835f9
commit c86504b47a
4 changed files with 2912 additions and 1054 deletions

View File

@ -239,7 +239,8 @@ test.sh:
test.pep8: pyenvinstall
@echo "TEST pep8"
$(Q)$(PY_ENV_ACT); pep8 --exclude='searx/static, searx/engines/gigablast.py' --max-line-length=120 --ignore "E402,W503" searx tests
$(Q)$(PY_ENV_ACT); pep8 --exclude='searx/static, searx/languages.py, searx/engines/gigablast.py' \
--max-line-length=120 --ignore "E402,W503" searx tests
test.unit: pyenvinstall
@echo "TEST tests/unit"

File diff suppressed because it is too large Load Diff

View File

@ -1,75 +1,96 @@
# -*- coding: utf-8 -*-
# list of language codes
# this file is generated automatically by utils/update_search_languages.py
language_codes = (
("af-NA", "Afrikaans", "", "Afrikaans"),
("ar-SA", "العربية", "", "Arabic"),
("be-BY", "Беларуская", "", "Belarusian"),
("bg-BG", "Български", "", "Bulgarian"),
("ca-AD", "Català", "", "Catalan"),
("cs-CZ", "Čeština", "", "Czech"),
("da-DK", "Dansk", "", "Danish"),
("de", "Deutsch", "", "German"),
("de-AT", "Deutsch", "Österreich", "German"),
("de-CH", "Deutsch", "Schweiz", "German"),
("de-DE", "Deutsch", "Deutschland", "German"),
("el-GR", "Ελληνικά", "", "Greek"),
("en", "English", "", "English"),
("en-AU", "English", "Australia", "English"),
("en-CA", "English", "Canada", "English"),
("en-GB", "English", "United Kingdom", "English"),
("en-IE", "English", "Ireland", "English"),
("en-IN", "English", "India", "English"),
("en-NZ", "English", "New Zealand", "English"),
("en-PH", "English", "Philippines", "English"),
("en-SG", "English", "Singapore", "English"),
("en-US", "English", "United States", "English"),
("es", "Español", "", "Spanish"),
("es-AR", "Español", "Argentina", "Spanish"),
("es-CL", "Español", "Chile", "Spanish"),
("es-ES", "Español", "España", "Spanish"),
("es-MX", "Español", "México", "Spanish"),
("et-EE", "Eesti", "", "Estonian"),
("fa-IR", "فارسی", "", "Persian"),
("fi-FI", "Suomi", "", "Finnish"),
("fr", "Français", "", "French"),
("fr-BE", "Français", "Belgique", "French"),
("fr-CA", "Français", "Canada", "French"),
("fr-CH", "Français", "Suisse", "French"),
("fr-FR", "Français", "France", "French"),
("he-IL", "עברית", "", "Hebrew"),
("hr-HR", "Hrvatski", "", "Croatian"),
("hu-HU", "Magyar", "", "Hungarian"),
("hy-AM", "Հայերեն", "", "Armenian"),
("id-ID", "Indonesia", "", "Indonesian"),
("is-IS", "Íslenska", "", "Icelandic"),
("it-IT", "Italiano", "", "Italian"),
("ja-JP", "日本語", "", "Japanese"),
("ko-KR", "한국어", "", "Korean"),
("lt-LT", "Lietuvių", "", "Lithuanian"),
("lv-LV", "Latviešu", "", "Latvian"),
("ms-MY", "Melayu", "", "Malay"),
("nb-NO", "Norsk Bokmål", "", "Norwegian Bokmål"),
("nl", "Nederlands", "", "Dutch"),
("nl-BE", "Nederlands", "België", "Dutch"),
("nl-NL", "Nederlands", "Nederland", "Dutch"),
("pl-PL", "Polski", "", "Polish"),
("pt", "Português", "", "Portuguese"),
("pt-BR", "Português", "Brasil", "Portuguese"),
("pt-PT", "Português", "Portugal", "Portuguese"),
("ro-RO", "Română", "", "Romanian"),
("ru-RU", "Русский", "", "Russian"),
("sk-SK", "Slovenčina", "", "Slovak"),
("sl-SI", "Slovenščina", "", "Slovenian"),
("sr-RS", "Srpski", "", "Serbian"),
("sv-SE", "Svenska", "", "Swedish"),
("sw-KE", "Kiswahili", "", "Swahili"),
("th-TH", "ไทย", "", "Thai"),
("tr-TR", "Türkçe", "", "Turkish"),
("uk-UA", "Українська", "", "Ukrainian"),
("vi-VN", "Tiếng Việt", "", "Vietnamese"),
("zh", "中文", "", "Chinese"),
("zh-CN", "中文", "中国", "Chinese"),
("zh-TW", "中文", "台灣", "Chinese")
)
# this file is generated automatically by utils/fetch_languages.py
language_codes = \
( ('af-ZA', 'Afrikaans', '', 'Afrikaans'),
('am-ET', 'አማርኛ', '', 'Amharic'),
('ar-EG', 'العربية', '', 'Arabic'),
('az-AZ', 'Azərbaycan', '', 'Azerbaijani'),
('be-BY', 'Беларуская', '', 'Belarusian'),
('bg-BG', 'Български', '', 'Bulgarian'),
('bn-BD', 'বাংলা', '', 'Bangla'),
('bs-BA', 'Bosanski', '', 'Bosnian'),
('ca-ES', 'Català', '', 'Catalan'),
('cs-CZ', 'Čeština', '', 'Czech'),
('da-DK', 'Dansk', '', 'Danish'),
('de', 'Deutsch', '', 'German'),
('de-AT', 'Deutsch', 'Österreich', 'German'),
('de-CH', 'Deutsch', 'Schweiz', 'German'),
('de-DE', 'Deutsch', 'Deutschland', 'German'),
('el-GR', 'Ελληνικά', '', 'Greek'),
('en', 'English', '', 'English'),
('en-AU', 'English', 'Australia', 'English'),
('en-CA', 'English', 'Canada', 'English'),
('en-GB', 'English', 'United Kingdom', 'English'),
('en-IE', 'English', 'Ireland', 'English'),
('en-IN', 'English', 'India', 'English'),
('en-NZ', 'English', 'New Zealand', 'English'),
('en-PH', 'English', 'Philippines', 'English'),
('en-SG', 'English', 'Singapore', 'English'),
('en-US', 'English', 'United States', 'English'),
('es', 'Español', '', 'Spanish'),
('es-AR', 'Español', 'Argentina', 'Spanish'),
('es-CL', 'Español', 'Chile', 'Spanish'),
('es-ES', 'Español', 'España', 'Spanish'),
('es-MX', 'Español', 'México', 'Spanish'),
('et-EE', 'Eesti', '', 'Estonian'),
('fa-IR', 'فارسی', '', 'Persian'),
('fi-FI', 'Suomi', '', 'Finnish'),
('fo-FO', 'Føroyskt', '', 'Faroese'),
('fr', 'Français', '', 'French'),
('fr-BE', 'Français', 'Belgique', 'French'),
('fr-CA', 'Français', 'Canada', 'French'),
('fr-CH', 'Français', 'Suisse', 'French'),
('fr-FR', 'Français', 'France', 'French'),
('he-IL', 'עברית', '', 'Hebrew'),
('hr-HR', 'Hrvatski', '', 'Croatian'),
('hu-HU', 'Magyar', '', 'Hungarian'),
('hy-AM', 'Հայերեն', '', 'Armenian'),
('id-ID', 'Indonesia', '', 'Indonesian'),
('is-IS', 'Íslenska', '', 'Icelandic'),
('it-IT', 'Italiano', '', 'Italian'),
('ja-JP', '日本語', '', 'Japanese'),
('ka-GE', 'ქართული', '', 'Georgian'),
('kk-KZ', 'Қазақ Тілі', '', 'Kazakh'),
('km-KH', 'ខ្មែរ', '', 'Khmer'),
('ko-KR', '한국어', '', 'Korean'),
('ky-KG', 'Кыргызча', '', 'Kyrgyz'),
('lo-LA', 'ລາວ', '', 'Lao'),
('lt-LT', 'Lietuvių', '', 'Lithuanian'),
('lv-LV', 'Latviešu', '', 'Latvian'),
('mk-MK', 'Македонски', '', 'Macedonian'),
('mn-MN', 'Монгол', '', 'Mongolian'),
('ms-MY', 'Melayu', '', 'Malay'),
('mt-MT', 'Malti', '', 'Maltese'),
('nb-NO', 'Norsk Bokmål', '', 'Norwegian Bokmål'),
('ne-NP', 'नेपाली', '', 'Nepali'),
('nl', 'Nederlands', '', 'Dutch'),
('nl-BE', 'Nederlands', 'België', 'Dutch'),
('nl-NL', 'Nederlands', 'Nederland', 'Dutch'),
('pl-PL', 'Polski', '', 'Polish'),
('pt', 'Português', '', 'Portuguese'),
('pt-BR', 'Português', 'Brasil', 'Portuguese'),
('pt-PT', 'Português', 'Portugal', 'Portuguese'),
('ro-RO', 'Română', '', 'Romanian'),
('ru-RU', 'Русский', '', 'Russian'),
('rw-RW', 'Kinyarwanda', '', 'Kinyarwanda'),
('si-LK', 'සිංහල', '', 'Sinhala'),
('sk-SK', 'Slovenčina', '', 'Slovak'),
('sl-SI', 'Slovenščina', '', 'Slovenian'),
('so-SO', 'Soomaali', '', 'Somali'),
('sq-AL', 'Shqip', '', 'Albanian'),
('sr-RS', 'Srpski', '', 'Serbian'),
('sv-SE', 'Svenska', '', 'Swedish'),
('sw-TZ', 'Kiswahili', '', 'Swahili'),
('tg-TJ', 'Тоҷикӣ', '', 'Tajik'),
('th-TH', 'ไทย', '', 'Thai'),
('ti-ET', 'ትግርኛ', '', 'Tigrinya'),
('tk-TM', 'Türkmen Dili', '', 'Turkmen'),
('tr-TR', 'Türkçe', '', 'Turkish'),
('uk-UA', 'Українська', '', 'Ukrainian'),
('ur-PK', 'اردو', '', 'Urdu'),
('uz-UZ', 'OZbek', '', 'Uzbek'),
('vi-VN', 'Tiếng Việt', '', 'Vietnamese'),
('zh', '中文', '', 'Chinese'),
('zh-CN', '中文', '中国', 'Chinese'),
('zh-TW', '中文', '台灣', 'Chinese'))

View File

@ -6,7 +6,7 @@
# are written in current directory to avoid overwriting in case something goes wrong.
import json
import io
from pprint import pformat
from sys import path
from babel import Locale, UnknownLocaleError
from babel.languages import get_global
@ -23,7 +23,7 @@ languages_file = 'languages.py'
# Fetchs supported languages for each engine and writes json file with those.
def fetch_supported_languages():
engines_languages = {}
engines_languages = dict()
names = list(engines)
names.sort()
@ -51,19 +51,9 @@ def get_locale(lang_code):
return None
# Append engine_name to list of engines that support locale.
def add_engine_counter(lang_code, engine_name, languages):
if lang_code in languages:
if 'counter' not in languages[lang_code]:
languages[lang_code]['counter'] = [engine_name]
elif engine_name not in languages[lang_code]['counter']:
languages[lang_code]['counter'].append(engine_name)
# Join all language lists.
# TODO: Add language names from engine's language list if name not known by babel.
def join_language_lists(engines_languages):
language_list = {}
language_list = dict()
for engine_name in engines_languages:
for lang_code in engines_languages[engine_name]:
@ -76,32 +66,51 @@ def join_language_lists(engines_languages):
# ensure that lang_code uses standard language and country codes
if locale and locale.territory:
lang_code = locale.language + '-' + locale.territory
lang_code = "{lang}-{country}".format(lang=locale.language, country=locale.territory)
short_code = lang_code.split('-')[0]
# add locale if it's not in list
if lang_code not in language_list:
# add language without country if not in list
if short_code not in language_list:
if locale:
language_list[lang_code] = {'name': locale.get_language_name().title(),
'english_name': locale.english_name,
'country': locale.get_territory_name() or ''}
# also add language without country
if locale.language not in language_list:
language_list[locale.language] = {'name': locale.get_language_name().title(),
'english_name': locale.english_name}
# get language's data from babel's Locale object
language_name = locale.get_language_name().title()
english_name = locale.english_name.split(' (')[0]
elif short_code in engines_languages['wikipedia']:
# get language's data from wikipedia if not known by babel
language_name = engines_languages['wikipedia'][short_code]['name']
english_name = engines_languages['wikipedia'][short_code]['english_name']
else:
language_list[lang_code] = {}
language_name = None
english_name = None
# add language to list
language_list[short_code] = {'name': language_name,
'english_name': english_name,
'counter': set(),
'countries': dict()}
# add language with country if not in list
if lang_code != short_code and lang_code not in language_list[short_code]['countries']:
country_name = ''
if locale:
# get country name from babel's Locale object
country_name = locale.get_territory_name()
language_list[short_code]['countries'][lang_code] = {'country_name': country_name,
'counter': set()}
# count engine for both language_country combination and language alone
add_engine_counter(lang_code, engine_name, language_list)
add_engine_counter(lang_code.split('-')[0], engine_name, language_list)
language_list[short_code]['counter'].add(engine_name)
if lang_code != short_code:
language_list[short_code]['countries'][lang_code]['counter'].add(engine_name)
return language_list
# Filter language list so it only includes the most supported languages and countries.
# Filter language list so it only includes the most supported languages and countries
def filter_language_list(all_languages):
min_supported_engines = 10
min_engines_per_lang = 15
min_engines_per_country = 10
main_engines = [engine_name for engine_name in engines.keys()
if 'general' in engines[engine_name].categories and
engines[engine_name].supported_languages and
@ -110,82 +119,84 @@ def filter_language_list(all_languages):
# filter list to include only languages supported by most engines or all default general engines
filtered_languages = {code: lang for code, lang
in all_languages.items()
if (len(lang.get('counter', [])) >= min_supported_engines or
all(main_engine in lang.get('counter', [])
if (len(lang['counter']) >= min_engines_per_lang or
all(main_engine in lang['counter']
for main_engine in main_engines))}
return filtered_languages
def _copy_lang_data(lang, country_name=None):
new_dict = dict()
new_dict['name'] = all_languages[lang]['name']
new_dict['english_name'] = all_languages[lang]['english_name']
if country_name:
new_dict['country_name'] = country_name
return new_dict
def _country_count(i):
return len(countries[sorted_countries[i]]['counter'])
# Add country codes to languages without one and filter out language codes.
def assign_country_codes(filtered_languages, all_languages):
sorted_languages = sorted(all_languages,
key=lambda lang: len(all_languages[lang].get('counter', [])),
reverse=True)
previous_lang = None
previous_code = None
countries = 0
for current_code in sorted(filtered_languages):
current_lang = current_code.split('-')[0]
# for each language get country codes supported by most engines or at least one country code
filtered_languages_with_countries = dict()
for lang, lang_data in filtered_languages.items():
countries = lang_data['countries']
filtered_countries = dict()
# count country codes per language
if current_lang == previous_lang:
countries += 1
# get language's country codes with enough supported engines
for lang_country, country_data in countries.items():
if len(country_data['counter']) >= min_engines_per_country:
filtered_countries[lang_country] = _copy_lang_data(lang, country_data['country_name'])
else:
if previous_lang is not None:
# if language has no single country code
if countries == 0:
# try to get country code with most supported engines
for l in sorted_languages:
l_parts = l.split('-')
if len(l_parts) == 2 and l_parts[0] == previous_lang:
filtered_languages[l] = all_languages[l]
filtered_languages[l]['country'] = ''
countries = 1
break
# add language without countries too if there's more than one country to choose from
if len(filtered_countries) > 1:
filtered_countries[lang] = _copy_lang_data(lang)
elif len(filtered_countries) == 1:
# if there's only one country per language, it's not necessary to show country name
lang_country = next(iter(filtered_countries))
filtered_countries[lang_country]['country_name'] = None
if countries == 0:
# get most likely country code from babel
subtags = get_global('likely_subtags').get(previous_lang)
if subtags:
subtag_parts = subtags.split('_')
new_code = subtag_parts[0] + '-' + subtag_parts[-1]
filtered_languages[new_code] = all_languages[previous_lang]
countries = 1
# if no country has enough engines try to get most likely country code from babel
if not filtered_countries:
lang_country = None
subtags = get_global('likely_subtags').get(lang)
if subtags:
country_code = subtags.split('_')[-1]
if len(country_code) == 2:
lang_country = "{lang}-{country}".format(lang=lang, country=country_code)
if countries == 1:
# remove countryless version of language if there's only one country
del filtered_languages[previous_lang]
if previous_code in filtered_languages:
filtered_languages[previous_code]['country'] = ''
if lang_country:
filtered_countries[lang_country] = _copy_lang_data(lang)
else:
filtered_countries[lang] = _copy_lang_data(lang)
countries = 0
previous_lang = current_lang
filtered_languages_with_countries.update(filtered_countries)
previous_code = current_code
return filtered_languages_with_countries
# Write languages.py.
def write_languages_file(languages):
new_file = open(languages_file, 'wb')
file_content = '# -*- coding: utf-8 -*-\n'\
+ '# list of language codes\n'\
+ '# this file is generated automatically by utils/update_search_languages.py\n'\
+ '\nlanguage_codes = ('
for code in sorted(languages):
if 'name' in languages[code]:
file_content += '\n ("' + code + '"'\
+ ', "' + languages[code]['name'].split(' (')[0] + '"'\
+ ', "' + languages[code].get('country', '') + '"'\
+ ', "' + languages[code].get('english_name', '').split(' (')[0] + '"),'
else:
print('ignore ',languages[code])
# remove last comma
file_content = file_content[:-1]
file_content += '\n)\n'
new_file.write(file_content.encode())
new_file.close()
file_headers = (
"# -*- coding: utf-8 -*-",
"# list of language codes",
"# this file is generated automatically by utils/fetch_languages.py",
"language_codes ="
)
language_codes = tuple([
(
code,
languages[code]['name'].split(' (')[0],
languages[code].get('country_name') or '',
languages[code].get('english_name') or ''
) for code in sorted(languages)
])
with open(languages_file, 'w') as new_file:
file_content = "{file_headers} \\\n{language_codes}".format(
file_headers='\n'.join(file_headers),
language_codes=pformat(language_codes, indent=4)
)
new_file.write(file_content)
new_file.close()
if __name__ == "__main__":
@ -193,5 +204,4 @@ if __name__ == "__main__":
engines_languages = fetch_supported_languages()
all_languages = join_language_lists(engines_languages)
filtered_languages = filter_language_list(all_languages)
assign_country_codes(filtered_languages, all_languages)
write_languages_file(filtered_languages)