Merge pull request #2640 from return42/fix-yahoo-news

[fix] rewrite Yahoo-News engine
commit ccf5ac9801
Alexandre Flament, 2021-03-08 19:03:41 +01:00 (committed by GitHub)
2 changed files with 78 additions and 70 deletions

Makefile

@@ -196,6 +196,7 @@ PYLINT_FILES=\
 	searx/engines/google_images.py \
 	searx/engines/mediathekviewweb.py \
 	searx/engines/google_scholar.py \
+	searx/engines/yahoo_news.py \
 	searx_extra/update/update_external_bangs.py
 
 test.pylint: pyenvinstall

searx/engines/yahoo_news.py

@@ -1,16 +1,35 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
-"""
- Yahoo (News)
+"""Yahoo (News)
+
+Yahoo News is "English only" and do not offer localized nor language queries.
+
 """
 
+# pylint: disable=invalid-name, missing-function-docstring
 import re
-from datetime import datetime, timedelta
 from urllib.parse import urlencode
+from datetime import datetime, timedelta
+
+from dateutil import parser
 from lxml import html
-from searx.engines.yahoo import parse_url, language_aliases
-from searx.engines.yahoo import _fetch_supported_languages, supported_languages_url  # NOQA # pylint: disable=unused-import
-from dateutil import parser
-from searx.utils import extract_text, extract_url, match_language
+
+from searx import logger
+from searx.utils import (
+    eval_xpath_list,
+    eval_xpath_getindex,
+    extract_text,
+)
+
+from searx.engines.yahoo import parse_url
+
+# pylint: disable=unused-import
+from searx.engines.yahoo import (
+    _fetch_supported_languages,
+    supported_languages_url,
+)
+# pylint: enable=unused-import
+
+logger = logger.getChild('yahoo_news engine')
 
 # about
 about = {
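The import block above pulls in eval_xpath_list and eval_xpath_getindex from searx.utils; the rewritten response() below leans on them to tolerate missing nodes instead of raising IndexError the way the old result.xpath(...)[0] pattern could. A minimal sketch of the behavior the engine relies on (an illustration, not the actual searx.utils implementation):

from lxml import html

def eval_xpath_list(element, xpath_spec):
    # evaluate an XPath expression that is expected to return a node list
    result = element.xpath(xpath_spec)
    if not isinstance(result, list):
        raise ValueError('XPath result is not a list: %r' % result)
    return result

def eval_xpath_getindex(element, xpath_spec, index, default):
    # return the index-th match, or the default when nothing matches
    result = element.xpath(xpath_spec)
    if -len(result) <= index < len(result):
        return result[index]
    return default

doc = html.fromstring('<div><h4><a href="https://example.org">title</a></h4></div>')
print(eval_xpath_getindex(doc, './/h4/a/@href', 0, None))     # 'https://example.org'
print(eval_xpath_getindex(doc, './/img/@data-src', 0, None))  # None, so the engine can skip the field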
@@ -22,90 +41,78 @@ about = {
     "results": 'HTML',
 }
 
-# engine dependent config
-categories = ['news']
+language_support = False
+time_range_support = False
+safesearch = False
 paging = True
+categories = ['news']
 
 # search-url
-search_url = 'https://news.search.yahoo.com/search?{query}&b={offset}&{lang}=uh3_news_web_gs_1&pz=10&xargs=0&vl=lang_{lang}'  # noqa
+search_url = (
+    'https://news.search.yahoo.com/search'
+    '?{query}&b={offset}'
+    )
 
-# specific xpath variables
-results_xpath = '//ol[contains(@class,"searchCenterMiddle")]//li'
-url_xpath = './/h3/a/@href'
-title_xpath = './/h3/a'
-content_xpath = './/div[@class="compText"]'
-publishedDate_xpath = './/span[contains(@class,"tri")]'
-suggestion_xpath = '//div[contains(@class,"VerALSOTRY")]//a'
+AGO_RE = re.compile(r'([0-9]+)\s*(year|month|week|day|minute|hour)')
+AGO_TIMEDELTA = {
+    'minute': timedelta(minutes=1),
+    'hour': timedelta(hours=1),
+    'day': timedelta(days=1),
+    'week': timedelta(days=7),
+    'month': timedelta(days=30),
+    'year': timedelta(days=365),
+}
 
-# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1
 
-    if params['language'] == 'all':
-        language = 'en'
-    else:
-        language = match_language(params['language'], supported_languages, language_aliases).split('-')[0]
-
-    params['url'] = search_url.format(offset=offset,
-                                      query=urlencode({'p': query}),
-                                      lang=language)
-
-    # TODO required?
-    params['cookies']['sB'] = '"v=1&vm=p&fl=1&vl=lang_{lang}&sh=1&pn=10&rw=new'\
-        .format(lang=language)
-
+    params['url'] = search_url.format(
+        offset = offset,
+        query = urlencode({'p': query})
+    )
+    logger.debug("query_url --> %s", params['url'])
     return params
 
-
-def sanitize_url(url):
-    if ".yahoo.com/" in url:
-        return re.sub("\\;\\_ylt\\=.+$", "", url)
-    else:
-        return url
-
-
-# get response from search-request
 def response(resp):
     results = []
-
     dom = html.fromstring(resp.text)
 
     # parse results
-    for result in dom.xpath(results_xpath):
-        urls = result.xpath(url_xpath)
-        if len(urls) != 1:
+    for result in eval_xpath_list(dom, '//ol[contains(@class,"searchCenterMiddle")]//li'):
+
+        url = eval_xpath_getindex(result, './/h4/a/@href', 0, None)
+        if url is None:
             continue
-        url = sanitize_url(parse_url(extract_url(urls, search_url)))
-        title = extract_text(result.xpath(title_xpath)[0])
-        content = extract_text(result.xpath(content_xpath)[0])
+        url = parse_url(url)
 
-        # parse publishedDate
-        publishedDate = extract_text(result.xpath(publishedDate_xpath)[0])
+        title = extract_text(result.xpath('.//h4/a'))
+        content = extract_text(result.xpath('.//p'))
+        img_src = eval_xpath_getindex(result, './/img/@data-src', 0, None)
 
-        # still useful ?
-        if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
-            publishedDate = datetime.now() - timedelta(minutes=int(re.match(r'\d+', publishedDate).group()))
-        elif re.match("^[0-9]+ days? ago$", publishedDate):
-            publishedDate = datetime.now() - timedelta(days=int(re.match(r'\d+', publishedDate).group()))
-        elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate):
-            timeNumbers = re.findall(r'\d+', publishedDate)
-            publishedDate = datetime.now()\
-                - timedelta(hours=int(timeNumbers[0]))\
-                - timedelta(minutes=int(timeNumbers[1]))
+        item = {
+            'url': url,
+            'title': title,
+            'content': content,
+            'img_src' : img_src
+        }
+
+        pub_date = extract_text(result.xpath('.//span[contains(@class,"s-time")]'))
+        ago = AGO_RE.search(pub_date)
+        if ago:
+            number = int(ago.group(1))
+            delta = AGO_TIMEDELTA[ago.group(2)]
+            pub_date = datetime.now() - delta * number
         else:
             try:
-                publishedDate = parser.parse(publishedDate)
-            except:
-                publishedDate = datetime.now()
+                pub_date = parser.parse(pub_date)
+            except parser.ParserError:
+                pub_date = None
 
-        if publishedDate.year == 1900:
-            publishedDate = publishedDate.replace(year=datetime.now().year)
+        if pub_date is not None:
+            item['publishedDate'] = pub_date
+        results.append(item)
 
-        # append result
-        results.append({'url': url,
-                        'title': title,
-                        'content': content,
-                        'publishedDate': publishedDate})
+    for suggestion in eval_xpath_list(dom, '//div[contains(@class,"AlsoTry")]//td'):
+        results.append({'suggestion': extract_text(suggestion)})
 
-    # return results
     return results
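The old chain of re.match branches for relative "N units ago" strings is replaced by one regex plus a timedelta lookup table. A quick standalone check of that logic, with AGO_RE and AGO_TIMEDELTA copied from the diff above (the sample strings are illustrative, not captured from Yahoo):

import re
from datetime import datetime, timedelta
from dateutil import parser

AGO_RE = re.compile(r'([0-9]+)\s*(year|month|week|day|minute|hour)')
AGO_TIMEDELTA = {
    'minute': timedelta(minutes=1),
    'hour': timedelta(hours=1),
    'day': timedelta(days=1),
    'week': timedelta(days=7),
    'month': timedelta(days=30),
    'year': timedelta(days=365),
}

for pub_date in ('12 minutes ago', '3 days ago', '2 weeks ago', 'March 3, 2021'):
    ago = AGO_RE.search(pub_date)
    if ago:
        # '3 days ago' -> now minus 3 * one-day timedelta
        number = int(ago.group(1))
        delta = AGO_TIMEDELTA[ago.group(2)]
        print(pub_date, '->', datetime.now() - delta * number)
    else:
        # absolute dates fall through to dateutil, as in response()
        print(pub_date, '->', parser.parse(pub_date))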