[mod] ddg engine mods

2024-11-10 21:09:02 +01:00 · 2014-03-21 16:33:17 +01:00 · 2014-03-21 16:33:17 +01:00 · 3854703d95
commit 3854703d95
parent ce08abe223
1 changed file with 45 additions and 13 deletions
--- a/searx/engines/duckduckgo.py
+++ b/searx/engines/duckduckgo.py
@@ -1,29 +1,61 @@
-from json import loads
 from urllib import urlencode
+from lxml.html import fromstring
 from searx.utils import html_to_text

-url = 'https://duckduckgo.com/'
-search_url = url + 'd.js?{query}&p=1&s={offset}'
+url = 'https://duckduckgo.com/html?{query}&s={offset}'
 locale = 'us-en'

-paging = True
-
-
 def request(query, params):
    offset = (params['pageno'] - 1) * 30
    q = urlencode({'q': query,
                   'l': locale})
-    params['url'] = search_url.format(query=q, offset=offset)
+    params['url'] = url.format(query=q, offset=offset)
    return params


 def response(resp):
+    result_xpath = '//div[@class="results_links results_links_deep web-result"]'
+    url_xpath = './/a[@class="large"]/@href'
+    title_xpath = './/a[@class="large"]//text()'
+    content_xpath = './/div[@class="snippet"]//text()'
    results = []
-    search_res = loads(resp.text[resp.text.find('[{'):-2])[:-1]
-    for r in search_res:
-        if not r.get('t'):
+
+    doc = fromstring(resp.text)
+
+    for r in doc.xpath(result_xpath):
+        res_url = r.xpath(url_xpath)[-1]
+        if not res_url:
            continue
-        results.append({'title': r['t'],
-                       'content': html_to_text(r['a']),
-                       'url': r['u']})
+        title = html_to_text(''.join(r.xpath(title_xpath)))
+        content = html_to_text(''.join(r.xpath(content_xpath)))
+        results.append({'title': title,
+                        'content': content,
+                        'url': res_url})
+
    return results
+
+
+#from json import loads
+#search_url = url + 'd.js?{query}&p=1&s={offset}'
+#
+#paging = True
+#
+#
+#def request(query, params):
+#    offset = (params['pageno'] - 1) * 30
+#    q = urlencode({'q': query,
+#                   'l': locale})
+#    params['url'] = search_url.format(query=q, offset=offset)
+#    return params
+#
+#
+#def response(resp):
+#    results = []
+#    search_res = loads(resp.text[resp.text.find('[{'):-2])[:-1]
+#    for r in search_res:
+#        if not r.get('t'):
+#            continue
+#        results.append({'title': r['t'],
+#                       'content': html_to_text(r['a']),
+#                       'url': r['u']})
+#    return results