From 57e618edf7a72cb65620e5664941fb11130d2145 Mon Sep 17 00:00:00 2001 From: Luc Didry Date: Thu, 9 Jul 2015 23:05:45 +0200 Subject: [PATCH 01/53] Add Image on opensearch --- searx/templates/courgette/opensearch.xml | 1 + searx/templates/default/opensearch.xml | 1 + searx/templates/oscar/opensearch.xml | 1 + 3 files changed, 3 insertions(+) diff --git a/searx/templates/courgette/opensearch.xml b/searx/templates/courgette/opensearch.xml index f39283f9..ff9eac55 100644 --- a/searx/templates/courgette/opensearch.xml +++ b/searx/templates/courgette/opensearch.xml @@ -3,6 +3,7 @@ searx Search searx UTF-8 + {{ host }}{{ url_for('static', filename='img/favicon.png') | replace("/", "", 1) }} searx metasearch {% if opensearch_method == 'get' %} diff --git a/searx/templates/default/opensearch.xml b/searx/templates/default/opensearch.xml index f39283f9..ff9eac55 100644 --- a/searx/templates/default/opensearch.xml +++ b/searx/templates/default/opensearch.xml @@ -3,6 +3,7 @@ searx Search searx UTF-8 + {{ host }}{{ url_for('static', filename='img/favicon.png') | replace("/", "", 1) }} searx metasearch {% if opensearch_method == 'get' %} diff --git a/searx/templates/oscar/opensearch.xml b/searx/templates/oscar/opensearch.xml index f39283f9..ff9eac55 100644 --- a/searx/templates/oscar/opensearch.xml +++ b/searx/templates/oscar/opensearch.xml @@ -3,6 +3,7 @@ searx Search searx UTF-8 + {{ host }}{{ url_for('static', filename='img/favicon.png') | replace("/", "", 1) }} searx metasearch {% if opensearch_method == 'get' %} From d0830d4edf8a9ee794d5897afd813c88f0ea720b Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Sun, 2 Aug 2015 19:03:55 +0200 Subject: [PATCH 02/53] [enh] add settings option to set listening address - closes #397 --- searx/settings.yml | 1 + searx/settings_robot.yml | 1 + searx/webapp.py | 3 ++- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/searx/settings.yml b/searx/settings.yml index 03d89536..31bde02a 100644 --- a/searx/settings.yml +++ 
b/searx/settings.yml @@ -1,5 +1,6 @@ server: port : 8888 + bind_address : "127.0.0.1" # address to listen on secret_key : "ultrasecretkey" # change this! debug : False # Debug mode, only for development request_timeout : 2.0 # seconds diff --git a/searx/settings_robot.yml b/searx/settings_robot.yml index c6fe2282..36907e6e 100644 --- a/searx/settings_robot.yml +++ b/searx/settings_robot.yml @@ -1,5 +1,6 @@ server: port : 11111 + bind_address : 127.0.0.1 secret_key : "ultrasecretkey" # change this! debug : False request_timeout : 3.0 # seconds diff --git a/searx/webapp.py b/searx/webapp.py index fb7157b4..d45d01ec 100644 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -777,7 +777,8 @@ def run(): app.run( debug=settings['server']['debug'], use_debugger=settings['server']['debug'], - port=settings['server']['port'] + port=settings['server']['port'], + host=settings['server']['bind_address'] ) From 1fcf066a8188b28eb644ea304a131d40b1b341eb Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Sun, 2 Aug 2015 19:38:27 +0200 Subject: [PATCH 03/53] [mod] change settings file structure according to #314 --- searx/autocomplete.py | 2 +- searx/engines/__init__.py | 2 +- searx/poolrequests.py | 8 ++++---- searx/settings.yml | 26 +++++++++++++++----------- searx/settings_robot.yml | 17 ++++++++++++----- searx/utils.py | 3 ++- searx/webapp.py | 22 +++++++++++----------- 7 files changed, 46 insertions(+), 34 deletions(-) diff --git a/searx/autocomplete.py b/searx/autocomplete.py index 1a324b8a..264d0cc1 100644 --- a/searx/autocomplete.py +++ b/searx/autocomplete.py @@ -29,7 +29,7 @@ from searx.poolrequests import get as http_get def get(*args, **kwargs): if 'timeout' not in kwargs: - kwargs['timeout'] = settings['server']['request_timeout'] + kwargs['timeout'] = settings['outgoing']['request_timeout'] return http_get(*args, **kwargs) diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py index 42e1f08b..447138d3 100644 --- a/searx/engines/__init__.py +++ 
b/searx/engines/__init__.py @@ -75,7 +75,7 @@ def load_engine(engine_data): engine.safesearch = False if not hasattr(engine, 'timeout'): - engine.timeout = settings['server']['request_timeout'] + engine.timeout = settings['outgoing']['request_timeout'] if not hasattr(engine, 'shortcut'): engine.shortcut = '' diff --git a/searx/poolrequests.py b/searx/poolrequests.py index e2a75766..c44bdc7e 100644 --- a/searx/poolrequests.py +++ b/searx/poolrequests.py @@ -39,11 +39,11 @@ class HTTPAdapterWithConnParams(requests.adapters.HTTPAdapter): block=self._pool_block, **self._conn_params) -if settings.get('source_ips'): +if settings['outgoing'].get('source_ips'): http_adapters = cycle(HTTPAdapterWithConnParams(pool_connections=100, source_address=(source_ip, 0)) - for source_ip in settings['source_ips']) + for source_ip in settings['outgoing']['source_ips']) https_adapters = cycle(HTTPAdapterWithConnParams(pool_connections=100, source_address=(source_ip, 0)) - for source_ip in settings['source_ips']) + for source_ip in settings['outgoing']['source_ips']) else: http_adapters = cycle((HTTPAdapterWithConnParams(pool_connections=100), )) https_adapters = cycle((HTTPAdapterWithConnParams(pool_connections=100), )) @@ -69,7 +69,7 @@ def request(method, url, **kwargs): """same as requests/requests/api.py request(...) except it use SessionSinglePool and force proxies""" global settings session = SessionSinglePool() - kwargs['proxies'] = settings.get('outgoing_proxies', None) + kwargs['proxies'] = settings['outgoing'].get('proxies', None) response = session.request(method=method, url=url, **kwargs) session.close() return response diff --git a/searx/settings.yml b/searx/settings.yml index 31bde02a..de018dcb 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -1,28 +1,32 @@ +general: + debug : False # Debug mode, only for development + server: port : 8888 bind_address : "127.0.0.1" # address to listen on secret_key : "ultrasecretkey" # change this! 
- debug : False # Debug mode, only for development - request_timeout : 2.0 # seconds base_url : False # Set custom base_url. Possible values: False or "https://your.custom.host/location/" + image_proxy : False # Proxying image results through searx + +ui: themes_path : "" # Custom ui themes path - leave it blank if you didn't change default_theme : oscar # ui theme - useragent_suffix : "" # suffix of searx_useragent, could contain informations like an email address to the administrator - image_proxy : False # Proxying image results through searx default_locale : "" # Default interface locale - leave blank to detect from browser information or use codes from the 'locales' config section +outgoing: # communication with search engines + request_timeout : 2.0 # seconds + useragent_suffix : "" # suffix of searx_useragent, could contain informations like an email address to the administrator # uncomment below section if you want to use a proxy # see http://docs.python-requests.org/en/latest/user/advanced/#proxies # SOCKS proxies are not supported : see https://github.com/kennethreitz/requests/pull/478 -#outgoing_proxies : -# http : http://127.0.0.1:8080 -# https: http://127.0.0.1:8080 - +# proxies : +# http : http://127.0.0.1:8080 +# https: http://127.0.0.1:8080 # uncomment below section only if you have more than one network interface # which can be the source of outgoing search requests -#source_ips: -# - 1.1.1.1 -# - 1.1.1.2 +# source_ips: +# - 1.1.1.1 +# - 1.1.1.2 engines: - name : wikipedia diff --git a/searx/settings_robot.yml b/searx/settings_robot.yml index 36907e6e..3ca474d6 100644 --- a/searx/settings_robot.yml +++ b/searx/settings_robot.yml @@ -1,14 +1,21 @@ +general: + debug : False + server: port : 11111 bind_address : 127.0.0.1 secret_key : "ultrasecretkey" # change this! 
- debug : False - request_timeout : 3.0 # seconds - base_url: False + base_url : False + image_proxy : False + +ui: themes_path : "" default_theme : default - https_rewrite : True - image_proxy : False + default_locale : "" + +outgoing: + request_timeout : 1.0 # seconds + useragent_suffix : "" engines: - name : general_dummy diff --git a/searx/utils.py b/searx/utils.py index c9784159..cc31726b 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -26,6 +26,7 @@ ua_versions = ('33.0', ua_os = ('Windows NT 6.3; WOW64', 'X11; Linux x86_64', 'X11; Linux x86') + ua = "Mozilla/5.0 ({os}; rv:{version}) Gecko/20100101 Firefox/{version}" blocked_tags = ('script', @@ -40,7 +41,7 @@ def gen_useragent(): def searx_useragent(): return 'searx/{searx_version} {suffix}'.format( searx_version=VERSION_STRING, - suffix=settings['server'].get('useragent_suffix', '')) + suffix=settings['outgoing'].get('useragent_suffix', '')) def highlight_content(content, query): diff --git a/searx/webapp.py b/searx/webapp.py index d45d01ec..778956cc 100644 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -77,11 +77,11 @@ except ImportError: static_path, templates_path, themes =\ - get_themes(settings['themes_path'] - if settings.get('themes_path') + get_themes(settings['ui']['themes_path'] + if settings['ui']['themes_path'] else searx_dir) -default_theme = settings['server'].get('default_theme', 'default') +default_theme = settings['ui']['default_theme'] static_files = get_static_files(searx_dir) @@ -121,15 +121,15 @@ _category_names = (gettext('files'), gettext('news'), gettext('map')) -outgoing_proxies = settings.get('outgoing_proxies', None) +outgoing_proxies = settings['outgoing'].get('proxies', None) @babel.localeselector def get_locale(): locale = request.accept_languages.best_match(settings['locales'].keys()) - if settings['server'].get('default_locale'): - locale = settings['server']['default_locale'] + if settings['ui'].get('default_locale'): + locale = settings['ui']['default_locale'] if 
request.cookies.get('locale', '') in settings['locales']: locale = request.cookies.get('locale', '') @@ -640,12 +640,12 @@ def preferences(): stats[e.name] = {'time': None, 'warn_timeout': False, 'warn_time': False} - if e.timeout > settings['server']['request_timeout']: + if e.timeout > settings['outgoing']['request_timeout']: stats[e.name]['warn_timeout'] = True for engine_stat in get_engines_stats()[0][1]: stats[engine_stat.get('name')]['time'] = round(engine_stat.get('avg'), 3) - if engine_stat.get('avg') > settings['server']['request_timeout']: + if engine_stat.get('avg') > settings['outgoing']['request_timeout']: stats[engine_stat.get('name')]['warn_time'] = True # end of stats @@ -683,7 +683,7 @@ def image_proxy(): resp = requests.get(url, stream=True, - timeout=settings['server'].get('request_timeout', 2), + timeout=settings['outgoing']['request_timeout'], headers=headers, proxies=outgoing_proxies) @@ -775,8 +775,8 @@ def clear_cookies(): def run(): app.run( - debug=settings['server']['debug'], - use_debugger=settings['server']['debug'], + debug=settings['general']['debug'], + use_debugger=settings['general']['debug'], port=settings['server']['port'], host=settings['server']['bind_address'] ) From c1d9cfd9ae0cc78b91ee5cc339266c26c09e0e4c Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Sun, 2 Aug 2015 19:59:54 +0200 Subject: [PATCH 04/53] [enh] default settings option to autocomplete backend - #396 --- searx/settings.yml | 4 ++++ searx/settings_robot.yml | 4 ++++ searx/webapp.py | 4 ++-- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/searx/settings.yml b/searx/settings.yml index de018dcb..ffc3044a 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -1,6 +1,10 @@ general: debug : False # Debug mode, only for development +search: + safe_search : 0 # Filter results. 
0: None, 1: Moderate, 2: Strict + autocomplete : "" # Existing autocomplete backends: "dbpedia", "duckduckgo", "google", "startpage", "wikipedia" - leave blank to turn it off by default + server: port : 8888 bind_address : "127.0.0.1" # address to listen on diff --git a/searx/settings_robot.yml b/searx/settings_robot.yml index 3ca474d6..f14443cf 100644 --- a/searx/settings_robot.yml +++ b/searx/settings_robot.yml @@ -1,6 +1,10 @@ general: debug : False +search: + safe_search : 0 + autocomplete : 0 + server: port : 11111 bind_address : 127.0.0.1 diff --git a/searx/webapp.py b/searx/webapp.py index 778956cc..4292b32d 100644 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -263,7 +263,7 @@ def image_proxify(url): def render(template_name, override_theme=None, **kwargs): blocked_engines = get_blocked_engines(engines, request.cookies) - autocomplete = request.cookies.get('autocomplete') + autocomplete = request.cookies.get('autocomplete', settings['search']['autocomplete']) if autocomplete not in autocomplete_backends: autocomplete = None @@ -491,7 +491,7 @@ def autocompleter(): return '', 400 # run autocompleter - completer = autocomplete_backends.get(request.cookies.get('autocomplete')) + completer = autocomplete_backends.get(request.cookies.get('autocomplete', settings['search']['autocomplete'])) # parse searx specific autocompleter results like !bang raw_results = searx_bang(query) From 43cd8e0c4129571a263429173c7a9fe7092e1dec Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Sun, 2 Aug 2015 20:29:19 +0200 Subject: [PATCH 05/53] [enh] default settings option to safe_search - #396 --- searx/search.py | 5 +++-- searx/webapp.py | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/searx/search.py b/searx/search.py index bb440352..5054ce79 100644 --- a/searx/search.py +++ b/searx/search.py @@ -23,6 +23,7 @@ from operator import itemgetter from Queue import Queue from time import time from urlparse import urlparse, unquote +from searx import settings from 
searx.engines import ( categories, engines ) @@ -480,9 +481,9 @@ class Search(object): try: # 0 = None, 1 = Moderate, 2 = Strict - request_params['safesearch'] = int(request.cookies.get('safesearch', 1)) + request_params['safesearch'] = int(request.cookies.get('safesearch')) except ValueError: - request_params['safesearch'] = 1 + request_params['safesearch'] = settings['search']['safe_search'] # update request parameters dependent on # search-engine (contained in engines folder) diff --git a/searx/webapp.py b/searx/webapp.py index 4292b32d..06f24662 100644 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -312,7 +312,7 @@ def render(template_name, override_theme=None, **kwargs): kwargs['method'] = request.cookies.get('method', 'POST') - kwargs['safesearch'] = request.cookies.get('safesearch', '1') + kwargs['safesearch'] = request.cookies.get('safesearch', str(settings['search']['safe_search'])) # override url_for function in templates kwargs['url_for'] = url_for_theme @@ -542,7 +542,7 @@ def preferences(): locale = None autocomplete = '' method = 'POST' - safesearch = '1' + safesearch = settings['search']['safe_search'] for pd_name, pd in request.form.items(): if pd_name.startswith('category_'): category = pd_name[9:] From 7c9f931baff35e8a8986810971fa1949f18c19ad Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Sun, 2 Aug 2015 20:41:44 +0200 Subject: [PATCH 06/53] [fix] type error --- searx/search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/searx/search.py b/searx/search.py index 5054ce79..6288a46e 100644 --- a/searx/search.py +++ b/searx/search.py @@ -482,7 +482,7 @@ class Search(object): try: # 0 = None, 1 = Moderate, 2 = Strict request_params['safesearch'] = int(request.cookies.get('safesearch')) - except ValueError: + except Exception: request_params['safesearch'] = settings['search']['safe_search'] # update request parameters dependent on From 3a8eafcc6b19b4b47b10534fbc683e4e3fbc064d Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Wed, 
12 Aug 2015 15:49:48 +0200 Subject: [PATCH 07/53] [fix] cookie parameter type --- searx/webapp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/searx/webapp.py b/searx/webapp.py index 06f24662..7f1621a6 100644 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -624,7 +624,7 @@ def preferences(): resp.set_cookie('method', method, max_age=cookie_max_age) - resp.set_cookie('safesearch', safesearch, max_age=cookie_max_age) + resp.set_cookie('safesearch', str(safesearch), max_age=cookie_max_age) resp.set_cookie('image_proxy', image_proxy, max_age=cookie_max_age) From 9cec9770be27cf4fc47d1caa2bccc59d911c20c2 Mon Sep 17 00:00:00 2001 From: Thomas Pointhuber Date: Mon, 24 Aug 2015 11:03:06 +0200 Subject: [PATCH 08/53] [fix] show debug output when enabled --- searx/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/searx/__init__.py b/searx/__init__.py index 2d545a80..ea21e8f1 100644 --- a/searx/__init__.py +++ b/searx/__init__.py @@ -40,7 +40,7 @@ else: with open(settings_path) as settings_yaml: settings = load(settings_yaml) -if settings.get('server', {}).get('debug'): +if settings.get('general', {}).get('debug'): logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.WARNING) From 5bffa9ca33f8ec98baebca13ee9b16262bfe4e8d Mon Sep 17 00:00:00 2001 From: Thomas Pointhuber Date: Mon, 24 Aug 2015 11:18:58 +0200 Subject: [PATCH 09/53] [fix] rewrite scheme to http if there is no one, FIX #390 --- searx/search.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/searx/search.py b/searx/search.py index 6288a46e..1bf05f7f 100644 --- a/searx/search.py +++ b/searx/search.py @@ -206,6 +206,10 @@ def score_results(results): # if there is no duplicate found, append result else: res['score'] = score + # if the result has no scheme, use http as default + if res['parsed_url'].scheme == '': + res['parsed_url'] = res['parsed_url']._replace(scheme="http") + results.append(res) results = sorted(results, 
key=itemgetter('score'), reverse=True) From 23b9095cbf2d31a1495ee3d63a55bd81548cd367 Mon Sep 17 00:00:00 2001 From: Thomas Pointhuber Date: Mon, 24 Aug 2015 11:28:55 +0200 Subject: [PATCH 10/53] [fix] improve result handling of startpage engine --- searx/engines/startpage.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index 9d5b4bef..08e4f7a5 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -66,7 +66,11 @@ def response(resp): url = link.attrib.get('href') # block google-ad url's - if re.match("^http(s|)://www.google.[a-z]+/aclk.*$", url): + if re.match("^http(s|)://(www\.)?google\.[a-z]+/aclk.*$", url): + continue + + # block startpage search url's + if re.match("^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url): continue title = escape(extract_text(link)) From 996c96fffff328497c2ba305c61e064256c84188 Mon Sep 17 00:00:00 2001 From: Thomas Pointhuber Date: Mon, 24 Aug 2015 11:31:30 +0200 Subject: [PATCH 11/53] [fix] block ixquick search url's --- searx/engines/startpage.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index 08e4f7a5..7d58f7f0 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -73,6 +73,10 @@ def response(resp): if re.match("^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url): continue + # block ixquick search url's + if re.match("^http(s|)://(www\.)?ixquick\.com/do/search\?.*$", url): + continue + title = escape(extract_text(link)) if result.xpath('./p[@class="desc"]'): From 28493d41a327128762c6286a625d219e4b0b4e2e Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Tue, 25 Aug 2015 13:12:51 +0200 Subject: [PATCH 12/53] [fix] handle missing url in twitter results --- searx/engines/twitter.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/searx/engines/twitter.py b/searx/engines/twitter.py index a0ee18a4..36efac18 
100644 --- a/searx/engines/twitter.py +++ b/searx/engines/twitter.py @@ -55,10 +55,14 @@ def response(resp): # parse results for tweet in dom.xpath(results_xpath): - link = tweet.xpath(link_xpath)[0] + try: + link = tweet.xpath(link_xpath)[0] + content = extract_text(tweet.xpath(content_xpath)[0]) + except Exception: + continue + url = urljoin(base_url, link.attrib.get('href')) title = extract_text(tweet.xpath(title_xpath)) - content = extract_text(tweet.xpath(content_xpath)[0]) pubdate = tweet.xpath(timestamp_xpath) if len(pubdate) > 0: From 42ac2c5b56607a8d061063e77d39b2d467ad9f9c Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Tue, 25 Aug 2015 22:02:18 +0200 Subject: [PATCH 13/53] [fix] check empty engine language attribute - fixes subtitleseeker --- searx/search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/searx/search.py b/searx/search.py index 1bf05f7f..ca24581a 100644 --- a/searx/search.py +++ b/searx/search.py @@ -478,7 +478,7 @@ class Search(object): request_params['started'] = time() request_params['pageno'] = self.pageno - if hasattr(engine, 'language'): + if hasattr(engine, 'language') and engine.language: request_params['language'] = engine.language else: request_params['language'] = self.lang From 3f31e1ce6bcaea595a6e773c4ff729cf7a9e31e1 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Tue, 25 Aug 2015 22:56:40 +0200 Subject: [PATCH 14/53] [fix] piratebay tld according to wikipedia --- searx/engines/piratebay.py | 2 +- searx/tests/engines/test_piratebay.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/searx/engines/piratebay.py b/searx/engines/piratebay.py index ab0dfd44..55446b41 100644 --- a/searx/engines/piratebay.py +++ b/searx/engines/piratebay.py @@ -20,7 +20,7 @@ categories = ['videos', 'music', 'files'] paging = True # search-url -url = 'https://thepiratebay.am/' +url = 'https://thepiratebay.se/' search_url = url + 'search/{search_term}/{pageno}/99/{search_type}' # piratebay specific 
type-definitions diff --git a/searx/tests/engines/test_piratebay.py b/searx/tests/engines/test_piratebay.py index 6ebbcf71..5699380b 100644 --- a/searx/tests/engines/test_piratebay.py +++ b/searx/tests/engines/test_piratebay.py @@ -15,7 +15,7 @@ class TestPiratebayEngine(SearxTestCase): params = piratebay.request(query, dicto) self.assertIn('url', params) self.assertIn(query, params['url']) - self.assertIn('piratebay.am', params['url']) + self.assertIn('piratebay.se', params['url']) self.assertIn('0', params['url']) dicto['category'] = 'music' @@ -99,7 +99,7 @@ class TestPiratebayEngine(SearxTestCase): self.assertEqual(type(results), list) self.assertEqual(len(results), 2) self.assertEqual(results[0]['title'], 'This is the title') - self.assertEqual(results[0]['url'], 'https://thepiratebay.am/this.is.the.link') + self.assertEqual(results[0]['url'], 'https://thepiratebay.se/this.is.the.link') self.assertEqual(results[0]['content'], 'This is the content and should be OK') self.assertEqual(results[0]['seed'], 13) self.assertEqual(results[0]['leech'], 334) @@ -149,7 +149,7 @@ class TestPiratebayEngine(SearxTestCase): self.assertEqual(type(results), list) self.assertEqual(len(results), 1) self.assertEqual(results[0]['title'], 'This is the title') - self.assertEqual(results[0]['url'], 'https://thepiratebay.am/this.is.the.link') + self.assertEqual(results[0]['url'], 'https://thepiratebay.se/this.is.the.link') self.assertEqual(results[0]['content'], 'This is the content and should be OK') self.assertEqual(results[0]['seed'], 0) self.assertEqual(results[0]['leech'], 0) From b9c8039d743376ab134adb3da146519c5353c36c Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Tue, 25 Aug 2015 22:56:45 +0200 Subject: [PATCH 15/53] [mod] disable searchcode SSL verification (unable to get local issuer) --- searx/engines/searchcode_code.py | 5 +++++ searx/engines/searchcode_doc.py | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/searx/engines/searchcode_code.py 
b/searx/engines/searchcode_code.py index 21d9c4ac..bd5eb71d 100644 --- a/searx/engines/searchcode_code.py +++ b/searx/engines/searchcode_code.py @@ -34,6 +34,11 @@ def request(query, params): params['url'] = search_url.format(query=urlencode({'q': query}), pageno=params['pageno']-1) + # Disable SSL verification + # error: (60) SSL certificate problem: unable to get local issuer + # certificate + params['verify'] = False + return params diff --git a/searx/engines/searchcode_doc.py b/searx/engines/searchcode_doc.py index 582b98d7..9453f31a 100644 --- a/searx/engines/searchcode_doc.py +++ b/searx/engines/searchcode_doc.py @@ -27,6 +27,11 @@ def request(query, params): params['url'] = search_url.format(query=urlencode({'q': query}), pageno=params['pageno']-1) + # Disable SSL verification + # error: (60) SSL certificate problem: unable to get local issuer + # certificate + params['verify'] = False + return params From 604f32f67276a34a3ead265ff89d3bb807902b26 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Fri, 28 Aug 2015 14:51:32 +0200 Subject: [PATCH 16/53] [fix] bing unicode encode error - fixes #408 --- searx/engines/bing.py | 2 +- searx/engines/bing_images.py | 2 +- searx/engines/bing_news.py | 2 +- searx/tests/engines/test_bing.py | 8 ++++---- searx/tests/engines/test_bing_images.py | 10 +++++----- searx/tests/engines/test_bing_news.py | 12 ++++++------ 6 files changed, 18 insertions(+), 18 deletions(-) diff --git a/searx/engines/bing.py b/searx/engines/bing.py index c72e6aef..171606cf 100644 --- a/searx/engines/bing.py +++ b/searx/engines/bing.py @@ -52,7 +52,7 @@ def request(query, params): def response(resp): results = [] - dom = html.fromstring(resp.content) + dom = html.fromstring(resp.text) # parse results for result in dom.xpath('//div[@class="sa_cc"]'): diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py index 839b8e5b..06850dfe 100644 --- a/searx/engines/bing_images.py +++ b/searx/engines/bing_images.py @@ -63,7 +63,7 @@ def 
request(query, params): def response(resp): results = [] - dom = html.fromstring(resp.content) + dom = html.fromstring(resp.text) # init regex for yaml-parsing p = re.compile('({|,)([a-z]+):(")') diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py index a2397c48..943bf882 100644 --- a/searx/engines/bing_news.py +++ b/searx/engines/bing_news.py @@ -68,7 +68,7 @@ def request(query, params): def response(resp): results = [] - rss = etree.fromstring(resp.content) + rss = etree.fromstring(resp.text) ns = rss.nsmap diff --git a/searx/tests/engines/test_bing.py b/searx/tests/engines/test_bing.py index 52a049f0..bce22144 100644 --- a/searx/tests/engines/test_bing.py +++ b/searx/tests/engines/test_bing.py @@ -29,10 +29,10 @@ class TestBingEngine(SearxTestCase): self.assertRaises(AttributeError, bing.response, '') self.assertRaises(AttributeError, bing.response, '[]') - response = mock.Mock(content='') + response = mock.Mock(text='') self.assertEqual(bing.response(response), []) - response = mock.Mock(content='') + response = mock.Mock(text='') self.assertEqual(bing.response(response), []) html = """ @@ -54,7 +54,7 @@ class TestBingEngine(SearxTestCase): """ - response = mock.Mock(content=html) + response = mock.Mock(text=html) results = bing.response(response) self.assertEqual(type(results), list) self.assertEqual(len(results), 1) @@ -81,7 +81,7 @@ class TestBingEngine(SearxTestCase): """ - response = mock.Mock(content=html) + response = mock.Mock(text=html) results = bing.response(response) self.assertEqual(type(results), list) self.assertEqual(len(results), 1) diff --git a/searx/tests/engines/test_bing_images.py b/searx/tests/engines/test_bing_images.py index f869da79..f42dff7e 100644 --- a/searx/tests/engines/test_bing_images.py +++ b/searx/tests/engines/test_bing_images.py @@ -31,10 +31,10 @@ class TestBingImagesEngine(SearxTestCase): self.assertRaises(AttributeError, bing_images.response, '') self.assertRaises(AttributeError, bing_images.response, 
'[]') - response = mock.Mock(content='') + response = mock.Mock(text='') self.assertEqual(bing_images.response(response), []) - response = mock.Mock(content='') + response = mock.Mock(text='') self.assertEqual(bing_images.response(response), []) html = """ @@ -52,7 +52,7 @@ oh:"238",tft:"0",oi:"http://www.image.url/Images/Test%2 """ html = html.replace('\r\n', '').replace('\n', '').replace('\r', '') - response = mock.Mock(content=html) + response = mock.Mock(text=html) results = bing_images.response(response) self.assertEqual(type(results), list) self.assertEqual(len(results), 1) @@ -75,7 +75,7 @@ oh:"238",tft:"0",oi:"http://www.image.url/Images/Test%2 style="height:144px;" width="178" height="144"/> """ - response = mock.Mock(content=html) + response = mock.Mock(text=html) results = bing_images.response(response) self.assertEqual(type(results), list) self.assertEqual(len(results), 0) @@ -263,7 +263,7 @@ oh:"238",tft:"0",oi:"http://www.image.url/Images/Test%2 """ html = html.replace('\r\n', '').replace('\n', '').replace('\r', '') - response = mock.Mock(content=html) + response = mock.Mock(text=html) results = bing_images.response(response) self.assertEqual(type(results), list) self.assertEqual(len(results), 10) diff --git a/searx/tests/engines/test_bing_news.py b/searx/tests/engines/test_bing_news.py index a64d59b7..c6c40265 100644 --- a/searx/tests/engines/test_bing_news.py +++ b/searx/tests/engines/test_bing_news.py @@ -28,10 +28,10 @@ class TestBingNewsEngine(SearxTestCase): self.assertRaises(AttributeError, bing_news.response, '') self.assertRaises(AttributeError, bing_news.response, '[]') - response = mock.Mock(content='') + response = mock.Mock(text='') self.assertEqual(bing_news.response(response), []) - response = mock.Mock(content='') + response = mock.Mock(text='') self.assertEqual(bing_news.response(response), []) html = """ @@ -66,7 +66,7 @@ class TestBingNewsEngine(SearxTestCase): """ # noqa - response = mock.Mock(content=html) + response = 
mock.Mock(text=html) results = bing_news.response(response) self.assertEqual(type(results), list) self.assertEqual(len(results), 2) @@ -105,7 +105,7 @@ class TestBingNewsEngine(SearxTestCase): """ # noqa - response = mock.Mock(content=html) + response = mock.Mock(text=html) results = bing_news.response(response) self.assertEqual(type(results), list) self.assertEqual(len(results), 1) @@ -128,11 +128,11 @@ class TestBingNewsEngine(SearxTestCase): """ # noqa - response = mock.Mock(content=html) + response = mock.Mock(text=html) results = bing_news.response(response) self.assertEqual(type(results), list) self.assertEqual(len(results), 0) html = """gabarge""" - response = mock.Mock(content=html) + response = mock.Mock(text=html) self.assertRaises(lxml.etree.XMLSyntaxError, bing_news.response, response) From 8c4d9b79d5727aa803b88e8181f7edabe46b40c4 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Tue, 1 Sep 2015 15:14:02 +0200 Subject: [PATCH 17/53] [fix] engine selection from url --- searx/search.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/searx/search.py b/searx/search.py index ca24581a..f36552a4 100644 --- a/searx/search.py +++ b/searx/search.py @@ -391,11 +391,11 @@ class Search(object): load_default_categories = True for pd_name, pd in self.request_data.items(): if pd_name == 'categories': - self.categories.extend(categ.strip() for categ in pd.split(',') if categ in categories) + self.categories.extend(categ for categ in map(unicode.strip, pd.split(',')) if categ in categories) elif pd_name == 'engines': pd_engines = [{'category': engines[engine].categories[0], 'name': engine} - for engine in map(str.strip, pd.split(',')) if engine in engines] + for engine in map(unicode.strip, pd.split(',')) if engine in engines] if pd_engines: self.engines.extend(pd_engines) load_default_categories = False From f094188780c41d665d08c7aa159968a81ca135ed Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Tue, 1 Sep 2015 15:21:17 +0200 Subject: [PATCH 18/53] 
[fix] display categories of the selected engines --- searx/search.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/searx/search.py b/searx/search.py index f36552a4..89917124 100644 --- a/searx/search.py +++ b/searx/search.py @@ -414,6 +414,9 @@ class Search(object): self.categories.remove(category) if not load_default_categories: + if not self.categories: + self.categories = list(set(engine['category'] + for engine in self.engines)) return # if no category is specified for this search, From 78a69e4c982d08a0fb49f1347d7f9db3b15d464f Mon Sep 17 00:00:00 2001 From: Emmanuel Benazera Date: Tue, 1 Sep 2015 16:47:56 +0200 Subject: [PATCH 19/53] ddg encoding of URLs appears to be broken, revealed when trying to pickle the results to disk --- searx/engines/duckduckgo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index 4ac2099a..f18f3b44 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -72,7 +72,7 @@ def response(resp): # append result results.append({'title': title, 'content': content, - 'url': res_url}) + 'url': res_url.encode('utf8')}) # return results return results From ae5bf6e49f621b835dfc3bf89b876e7281d25217 Mon Sep 17 00:00:00 2001 From: Thomas Pointhuber Date: Tue, 1 Sep 2015 19:43:32 +0200 Subject: [PATCH 20/53] [enh] Migrating Travis CI from legacy to container-based infrastructure This patch should speed up testing with Travis CI using caching and container-based infrastructure. 
source: - http://docs.travis-ci.com/user/migrating-from-legacy/ - http://docs.travis-ci.com/user/caching/ --- .travis.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.travis.yml b/.travis.yml index 44340600..7e0d80f8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,3 +1,7 @@ +sudo: false +cache: + - pip + - npm language: python python: - "2.7" From e9c33572e90c9e7017ea47ae4ccc26aec869f19b Mon Sep 17 00:00:00 2001 From: Thomas Pointhuber Date: Tue, 1 Sep 2015 20:02:08 +0200 Subject: [PATCH 21/53] [fix] also cache "/.cache/pip" --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index 7e0d80f8..0edd71c1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,6 +2,8 @@ sudo: false cache: - pip - npm + directories: + - $HOME/.cache/pip language: python python: - "2.7" From cf932ee6a0c86575ac78ee3c4206c4f39bfa8305 Mon Sep 17 00:00:00 2001 From: Thomas Pointhuber Date: Tue, 1 Sep 2015 20:03:32 +0200 Subject: [PATCH 22/53] [fix] yml --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 0edd71c1..be668351 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,7 +2,7 @@ sudo: false cache: - pip - npm - directories: + - directories: - $HOME/.cache/pip language: python python: From d5931874ac8a7762c3a961f866e65a19671f9d54 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Wed, 2 Sep 2015 22:57:10 +0200 Subject: [PATCH 23/53] [fix] escape format string.. 
--- searx/translations/es/LC_MESSAGES/messages.mo | Bin 7137 -> 7138 bytes searx/translations/es/LC_MESSAGES/messages.po | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/searx/translations/es/LC_MESSAGES/messages.mo b/searx/translations/es/LC_MESSAGES/messages.mo index e07d5195dd5fe95d5228d35693e1f36e1c7e31be..3a4a788d21f15a5e8e5a9cdcfc6f18b8a24fdc75 100644 GIT binary patch delta 580 zcmXZY&npCB9LMpeX2)1RcUSFL9B2)3m{@5TX;I^6k z1!_xsaa%{LTpTwC2M6zuozv@?Z_j+5&-2uSkDz>Dhs(y8AZv_+$JmBv*n#IbjyIUV z3XWhEeQaO~hur4=0`~E}fXZ3JByM0Q?%^<=U<=;4Cj4JKFi>zA2e5_;{zetCdyUCp z23yg?cJ$H4S@dubOSp-<_=FbbbH?=GD5}^T>cYib*c_A?sNpK=##=ar7pNOlP>o;X z?;5H|9ToJ2D%?O7ws-~=PN8!1sQ)KXc{8YDOJTey;vj<+)Kl-Eg7;Ae9bpXqwZ6M5C$XLWGcXFfg+jkdR=JG(>V@H!vYFNDMaQ z`~{LG(#2M5iN)9$EC%n7yXp1ZH_v^Z&+~kE@7}|{?JpQ(Jja*>9%2WcU>Ba^7+#@; zH7sBqi`c?m%w@v;B^=~?7L~J#NnFQn+(j3UFpf7Fg7;WBRE4nAQF^I2p1v4AQzjk<6p>xY9S25Pv1y74BK@eFl?8mjSY z^xZ%eX`+I@P=#Bl!fiZ*3a3!HdDQ Date: Mon, 7 Sep 2015 19:22:01 +0200 Subject: [PATCH 24/53] [enh] test utils.prettify_url --- searx/tests/test_utils.py | 10 ++++++++++ searx/utils.py | 7 ++++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/searx/tests/test_utils.py b/searx/tests/test_utils.py index abe411c2..04480791 100644 --- a/searx/tests/test_utils.py +++ b/searx/tests/test_utils.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- import mock from searx.testing import SearxTestCase from searx import utils @@ -51,6 +52,15 @@ class TestUtils(SearxTestCase): self.assertIsNotNone(utils.html_to_text(html)) self.assertEqual(utils.html_to_text(html), "Test text") + def test_prettify_url(self): + data = (('https://searx.me/', 'https://searx.me/'), + (u'https://searx.me/ű', u'https://searx.me/ű'), + ('https://searx.me/' + (100 * 'a'), 'https://searx.me/[...]aaaaaaaaaaaaaaaaa'), + (u'https://searx.me/' + (100 * u'ű'), u'https://searx.me/[...]űűűűűűűűűűűűűűűűű')) + + for test_url, expected in data: + self.assertEqual(utils.prettify_url(test_url, max_length=32), expected) 
+ class TestHTMLTextExtractor(SearxTestCase): diff --git a/searx/utils.py b/searx/utils.py index cc31726b..3651cc38 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -222,9 +222,10 @@ def dict_subset(d, properties): return result -def prettify_url(url): - if len(url) > 74: - return u'{0}[...]{1}'.format(url[:35], url[-35:]) +def prettify_url(url, max_length=74): + if len(url) > max_length: + chunk_len = max_length / 2 + 1 + return u'{0}[...]{1}'.format(url[:chunk_len], url[-chunk_len:]) else: return url From 362c849797e2e6f0e232642c23744c47a75cdfd4 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Mon, 7 Sep 2015 22:39:33 +0200 Subject: [PATCH 25/53] [fix][mod] wikidata date handling refactor - fixes #387 --- searx/engines/wikidata.py | 34 ++++++++++++++++++++++++++-------- searx/utils.py | 26 +++++++++----------------- 2 files changed, 35 insertions(+), 25 deletions(-) diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py index 43f72761..fc840d47 100644 --- a/searx/engines/wikidata.py +++ b/searx/engines/wikidata.py @@ -1,8 +1,15 @@ import json -from urllib import urlencode + +from searx import logger from searx.poolrequests import get from searx.utils import format_date_by_locale +from datetime import datetime +from dateutil.parser import parse as dateutil_parse +from urllib import urlencode + + +logger = logger.getChild('wikidata') result_count = 1 wikidata_host = 'https://www.wikidata.org' wikidata_api = wikidata_host + '/w/api.php' @@ -164,14 +171,12 @@ def getDetail(jsonresponse, wikidata_id, language, locale): if postal_code is not None: attributes.append({'label': 'Postal code(s)', 'value': postal_code}) - date_of_birth = get_time(claims, 'P569', None) + date_of_birth = get_time(claims, 'P569', locale, None) if date_of_birth is not None: - date_of_birth = format_date_by_locale(date_of_birth[8:], locale) attributes.append({'label': 'Date of birth', 'value': date_of_birth}) - date_of_death = get_time(claims, 'P570', None) + date_of_death = 
get_time(claims, 'P570', locale, None) if date_of_death is not None: - date_of_death = format_date_by_locale(date_of_death[8:], locale) attributes.append({'label': 'Date of death', 'value': date_of_death}) if len(attributes) == 0 and len(urls) == 2 and len(description) == 0: @@ -229,7 +234,7 @@ def get_string(claims, propertyName, defaultValue=None): return result[0] -def get_time(claims, propertyName, defaultValue=None): +def get_time(claims, propertyName, locale, defaultValue=None): propValue = claims.get(propertyName, {}) if len(propValue) == 0: return defaultValue @@ -244,9 +249,22 @@ def get_time(claims, propertyName, defaultValue=None): result.append(value.get('time', '')) if len(result) == 0: - return defaultValue + date_string = defaultValue else: - return ', '.join(result) + date_string = ', '.join(result) + + try: + parsed_date = datetime.strptime(date_string, "+%Y-%m-%dT%H:%M:%SZ") + except: + if date_string.startswith('-'): + return date_string.split('T')[0] + try: + parsed_date = dateutil_parse(date_string, fuzzy=False, default=False) + except: + logger.debug('could not parse date %s', date_string) + return date_string.split('T')[0] + + return format_date_by_locale(parsed_date, locale) def get_geolink(claims, propertyName, defaultValue=''): diff --git a/searx/utils.py b/searx/utils.py index 3651cc38..b8561b84 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -1,11 +1,10 @@ # import htmlentitydefs -import locale -import dateutil.parser import cStringIO import csv import os import re +from babel.dates import format_date from codecs import getincrementalencoder from HTMLParser import HTMLParser from random import choice @@ -195,23 +194,16 @@ def get_result_templates(base_path): return result_templates -def format_date_by_locale(date_string, locale_string): +def format_date_by_locale(date, locale_string): # strftime works only on dates after 1900 - parsed_date = dateutil.parser.parse(date_string) - if parsed_date.year <= 1900: - return 
parsed_date.isoformat().split('T')[0] - orig_locale = locale.getlocale()[0] - try: - locale.setlocale(locale.LC_ALL, locale_string) - except: - logger.warning('cannot set locale: {0}'.format(locale_string)) - formatted_date = parsed_date.strftime(locale.nl_langinfo(locale.D_FMT)) - try: - locale.setlocale(locale.LC_ALL, orig_locale) - except: - logger.warning('cannot set original locale: {0}'.format(orig_locale)) - return formatted_date + if date.year <= 1900: + return date.isoformat().split('T')[0] + + if locale_string == 'all': + locale_string = settings['ui']['default_locale'] or 'en_US' + + return format_date(date, locale=locale_string) def dict_subset(d, properties): From 4184cece4a0b2d04b45105e755492bfee5fa1a12 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Mon, 7 Sep 2015 23:13:04 +0200 Subject: [PATCH 26/53] [fix] duckduckgo unicode url - #419 --- searx/engines/duckduckgo.py | 2 +- searx/tests/engines/test_duckduckgo.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index f18f3b44..4ac2099a 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -72,7 +72,7 @@ def response(resp): # append result results.append({'title': title, 'content': content, - 'url': res_url.encode('utf8')}) + 'url': res_url}) # return results return results diff --git a/searx/tests/engines/test_duckduckgo.py b/searx/tests/engines/test_duckduckgo.py index 6f085cbc..14cd9cd8 100644 --- a/searx/tests/engines/test_duckduckgo.py +++ b/searx/tests/engines/test_duckduckgo.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- from collections import defaultdict import mock from searx.engines import duckduckgo @@ -30,7 +31,7 @@ class TestDuckduckgoEngine(SearxTestCase): response = mock.Mock(text='') self.assertEqual(duckduckgo.response(response), []) - html = """ + html = u"""