mirror of https://github.com/searx/searx
commit
678b87f9d5
@ -0,0 +1,83 @@ |
||||
# INA (Videos) |
||||
# |
||||
# @website https://www.ina.fr/ |
||||
# @provide-api no |
||||
# |
||||
# @using-api no |
||||
# @results HTML (using search portal) |
||||
# @stable no (HTML can change) |
||||
# @parse url, title, content, publishedDate, thumbnail |
||||
# |
||||
# @todo set content-parameter with correct data |
||||
# @todo embedded (needs some md5 from video page) |
||||
|
||||
from json import loads |
||||
from urllib import urlencode |
||||
from lxml import html |
||||
from HTMLParser import HTMLParser |
||||
from searx.engines.xpath import extract_text |
||||
from dateutil import parser |
||||
|
||||
# engine dependent config
categories = ['videos']
paging = True
page_size = 48  # hits requested per page; sent as the ``hf`` URL parameter

# search-url
# ``{ps}``, ``{start}`` and ``{query}`` are filled in by ``str.format``.
base_url = 'https://www.ina.fr'
search_url = (base_url
              + '/layout/set/ajax/recherche/result'
              '?autopromote=&hf={ps}&b={start}&type=Video&r=&{query}')

# specific xpath variables
# ``results_xpath`` selects one node per hit; the remaining expressions are
# relative and get evaluated against each of those nodes.
results_xpath = '//div[contains(@class,"search-results--list")]/div[@class="media"]'
url_xpath = './/a/@href'
title_xpath = './/h3[@class="h3--title media-heading"]'
thumbnail_xpath = './/img/@src'
publishedDate_xpath = './/span[@class="broadcast"]'
content_xpath = './/p[@class="media-body__summary"]'
||||
|
||||
|
||||
# do search-request |
||||
def request(query, params):
    """Store the INA search URL for *query* in ``params['url']``.

    The result offset is ``params['pageno'] * page_size`` and the query is
    URL-encoded as the ``q`` parameter.  Returns the same *params* mapping.

    NOTE(review): ``pageno`` is used as-is; if callers number pages from 1,
    the first ``page_size`` hits are never requested -- confirm.
    """
    offset = params['pageno'] * page_size
    encoded_query = urlencode({'q': query})

    params['url'] = search_url.format(ps=page_size,
                                      start=offset,
                                      query=encoded_query)
    return params
||||
|
||||
|
||||
# get response from search-request |
||||
def _parse_broadcast_date(text):
    """Turn a ``DD/MM/YYYY`` broadcast date into a datetime.

    Returns None when *text* does not consist of three '/'-separated fields
    or when dateutil rejects the rebuilt date.
    """
    fields = text.split('/')
    if len(fields) != 3:
        return None
    day, month, year = fields
    try:
        # force ISO date to avoid wrong parsing (day/month swapped)
        return parser.parse("%s-%s-%s" % (year, month, day))
    except (ValueError, OverflowError):
        return None


def _text_or_empty(nodes):
    """Return ``extract_text(nodes)``, or '' when the node list is empty."""
    # Do not depend on how extract_text treats an empty list.
    return extract_text(nodes) if nodes else ''


def response(resp):
    """Build the list of video results from INA's search answer.

    ``resp.text`` must hold JSON.  The hits are read from the HTML fragment
    stored under the ``content`` key of the decoded object; an answer that
    is not an object, lacks that key, or has empty ``content`` yields ``[]``.

    Each result dict carries ``url``, ``title``, ``content``, ``template``
    and ``thumbnail`` ('' when the hit has no image).  ``publishedDate`` is
    added only when the broadcast date is present and parseable.  A hit
    without a link is skipped, so a single malformed entry does not discard
    the whole page.

    Raises AttributeError when *resp* has no ``text`` attribute.
    """
    # we get html in a JSON container...
    payload = loads(resp.text)
    if not isinstance(payload, dict) or "content" not in payload:
        return []

    markup = payload["content"]
    if not markup or not markup.strip():
        # lxml refuses to build a tree from an empty document
        return []

    dom = html.fromstring(markup)
    unescaper = HTMLParser()
    results = []

    # parse results
    for result in dom.xpath(results_xpath):
        links = result.xpath(url_xpath)
        if not links:
            # nothing to point the user at
            continue

        thumbnails = result.xpath(thumbnail_xpath)
        thumbnail = extract_text(thumbnails[0]) if thumbnails else ''
        # site-relative image paths need the host prepended
        if thumbnail.startswith('/'):
            thumbnail = base_url + thumbnail

        item = {'url': base_url + links[0],
                'title': unescaper.unescape(_text_or_empty(result.xpath(title_xpath))),
                'content': _text_or_empty(result.xpath(content_xpath)),
                'template': 'videos.html',
                'thumbnail': thumbnail}

        dates = result.xpath(publishedDate_xpath)
        published = _parse_broadcast_date(extract_text(dates[0])) if dates else None
        if published is not None:
            # NOTE(review): key is left out when the date is unknown;
            # presumably consumers treat it as optional -- confirm.
            item['publishedDate'] = published

        # append result
        results.append(item)

    # return results
    return results
@ -0,0 +1,64 @@ |
||||
from collections import defaultdict |
||||
import mock |
||||
from searx.engines import ina |
||||
from searx.testing import SearxTestCase |
||||
|
||||
|
||||
class TestInaEngine(SearxTestCase):
    """Unit tests for the ``request`` and ``response`` functions of the INA engine."""

    def test_request(self):
        """The generated URL points at ina.fr and contains the query."""
        query = 'test_query'
        dicto = defaultdict(dict)
        dicto['pageno'] = 0
        params = ina.request(query, dicto)
        self.assertIn('url', params)
        self.assertIn(query, params['url'])
        self.assertIn('ina.fr', params['url'])

    def test_response(self):
        """Bad input raises, answers without hits give [], a real hit is parsed."""
        # objects without a ``text`` attribute are rejected
        for bad_response in (None, [], '', '[]'):
            self.assertRaises(AttributeError, ina.response, bad_response)

        # JSON without a "content" entry yields no results
        response = mock.Mock(text='{}')
        self.assertEqual(ina.response(response), [])

        response = mock.Mock(text='{"data": []}')
        self.assertEqual(ina.response(response), [])

        # one hit in the JSON/HTML shape this engine scrapes; the fixture
        # text is whitespace-sensitive, so its lines stay at column 0
        payload = """
{"content":"\\t<div class=\\"container\\">\\n\\t\\n\
<!-- DEBUT CONTENU PRINCIPAL -->\\n<div class=\\"row\\">\\n\
<div class=\\"search-results--list\\"><div class=\\"media\\">\\n\
\\t\\t\\t\\t<a class=\\"media-left media-video premium xiti_click_action\\" \
data-xiti-params=\\"recherche_v4::resultats_conference_de_presse_du_general_de_gaulle::N\\" \
href=\\"\/video\/CAF89035682\/conference-de-presse-du-general-de-gaulle-video.html\\">\\n\
<img src=\\"https:\/\/www.ina.fr\/images_v2\/140x105\/CAF89035682.jpeg\\" \
alt=\\"Conf\\u00e9rence de presse du G\\u00e9n\\u00e9ral de Gaulle \\">\\n\
\\t\\t\\t\\t\\t<\/a>\\n\
\\t\\t\\t\\t\\t<div class=\\"media-body\\">\\n\\t\\t\\t\\t\\t\\t<h3 class=\\"h3--title media-heading\\">\\n\
\\t\\t\\t\\t\\t\\t\\t<a class=\\"xiti_click_action\\" \
data-xiti-params=\\"recherche_v4::resultats_conference_de_presse_du_general_de_gaulle::N\\" \
href=\\"\/video\/CAF89035682\/conference-de-presse-du-general-de-gaulle-video.html\\">\
Conf\\u00e9rence de presse du G\\u00e9n\\u00e9ral de Gaulle <\/a>\\n\
<\/h3>\\n\
<div class=\\"media-body__info\\">\\n<span class=\\"broadcast\\">27\/11\/1967<\/span>\\n\
<span class=\\"views\\">29321 vues<\/span>\\n\
<span class=\\"duration\\">01h 33m 07s<\/span>\\n\
<\/div>\\n\
<p class=\\"media-body__summary\\">VERSION INTEGRALE DE LA CONFERENCE DE PRESSE DU GENERAL DE GAULLE . \
- PA le Pr\\u00e9sident DE GAULLE : il ouvre les bras et s'assied. DP journalis...<\/p>\\n\
<\/div>\\n<\/div><!-- \/.media -->\\n"
}
"""
        response = mock.Mock(text=payload)
        results = ina.response(response)
        self.assertIsInstance(results, list)
        self.assertEqual(len(results), 1)

        hit = results[0]
        self.assertEqual(hit['title'], u'Conf\xe9rence de presse du G\xe9n\xe9ral de Gaulle')
        self.assertEqual(hit['url'],
                         'https://www.ina.fr/video/CAF89035682/conference-de-presse-du-general-de-gaulle-video.html')
        self.assertEqual(hit['content'],
                         u"VERSION INTEGRALE DE LA CONFERENCE DE PRESSE DU GENERAL DE GAULLE ."
                         u" - PA le Pr\u00e9sident DE GAULLE : il ouvre les bras et s'assied. DP journalis...")
        # absolute image URLs must be passed through untouched
        self.assertEqual(hit['thumbnail'],
                         'https://www.ina.fr/images_v2/140x105/CAF89035682.jpeg')
        # "27/11/1967" is day/month/year
        published = hit['publishedDate']
        self.assertEqual((published.year, published.month, published.day), (1967, 11, 27))
Loading…
Reference in new issue