[enh] remove HTML tags from result content

This commit is contained in:
asciimoo 2013-11-09 18:39:20 +01:00
parent 14a53e3430
commit 17bf00ee42
4 changed files with 7 additions and 6 deletions

View File

@ -1,5 +1,6 @@
from json import loads from json import loads
from urllib import urlencode from urllib import urlencode
from searx.utils import html_to_text
url = 'https://duckduckgo.com/' url = 'https://duckduckgo.com/'
search_url = url + 'd.js?{query}&l=us-en&p=1&s=0' search_url = url + 'd.js?{query}&l=us-en&p=1&s=0'
@ -16,7 +17,7 @@ def response(resp):
if not r.get('t'): if not r.get('t'):
continue continue
results.append({'title': r['t'] results.append({'title': r['t']
,'content': r['a'] ,'content': html_to_text(r['a'])
,'url': r['u'] ,'url': r['u']
}) })
return results return results

View File

@ -1,4 +1,4 @@
from urllib import quote from urllib import urlencode
from lxml import html from lxml import html
from urlparse import urlparse from urlparse import urlparse
from cgi import escape from cgi import escape
@ -8,7 +8,7 @@ search_url = base_url+'do/search'
def request(query, params): def request(query, params):
global search_url global search_url
query = quote(query.replace(' ', '+'), safe='+') query = urlencode({'q': query})[2:]
params['url'] = search_url params['url'] = search_url
params['method'] = 'POST' params['method'] = 'POST'
params['data'] = {'query': query} params['data'] = {'query': query}

View File

@ -1,6 +1,7 @@
from urlparse import urljoin from urlparse import urljoin
from urllib import urlencode from urllib import urlencode
from lxml import html from lxml import html
from cgi import escape
categories = ['social media'] categories = ['social media']
@ -21,6 +22,6 @@ def response(resp):
link = tweet.xpath('.//small[@class="time"]//a')[0] link = tweet.xpath('.//small[@class="time"]//a')[0]
url = urljoin(base_url, link.attrib.get('href')) url = urljoin(base_url, link.attrib.get('href'))
title = ''.join(tweet.xpath('.//span[@class="username js-action-profile-name"]//text()')) title = ''.join(tweet.xpath('.//span[@class="username js-action-profile-name"]//text()'))
content = ''.join(map(html.tostring, tweet.xpath('.//p[@class="js-tweet-text tweet-text"]//*'))) content = escape(''.join(tweet.xpath('.//p[@class="js-tweet-text tweet-text"]//text()')))
results.append({'url': url, 'title': title, 'content': content}) results.append({'url': url, 'title': title, 'content': content})
return results return results

View File

@ -46,12 +46,11 @@ def request(query, params):
def response(resp): def response(resp):
results = [] results = []
dom = html.fromstring(resp.text) dom = html.fromstring(resp.text)
query = resp.search_params['query']
if results_xpath: if results_xpath:
for result in dom.xpath(results_xpath): for result in dom.xpath(results_xpath):
url = extract_url(result.xpath(url_xpath)) url = extract_url(result.xpath(url_xpath))
title = ' '.join(result.xpath(title_xpath)) title = ' '.join(result.xpath(title_xpath))
content = escape(' '.join(result.xpath(content_xpath))).replace(query, '<b>{0}</b>'.format(query)) content = escape(' '.join(result.xpath(content_xpath)))
results.append({'url': url, 'title': title, 'content': content}) results.append({'url': url, 'title': title, 'content': content})
else: else:
for content, url, title in zip(dom.xpath(content_xpath), map(extract_url, dom.xpath(url_xpath)), dom.xpath(title_xpath)): for content, url, title in zip(dom.xpath(content_xpath), map(extract_url, dom.xpath(url_xpath)), dom.xpath(title_xpath)):