✨Shevon・Kuan✨ 2022-07-14 14:55:05 +03:00 committed by GitHub
commit 5f45b875db
3 changed files with 163 additions and 1 deletion

requirements.txt

@@ -1,5 +1,6 @@
 certifi==2022.5.18.1
 babel==2.9.1
+beautifulsoup4==4.10.0
 flask-babel==2.0.0
 flask==2.1.1
 jinja2==3.1.2
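
The new dependency backs the engine below, which parses Baidu's result page with BeautifulSoup. A minimal sketch of the pattern the engine relies on (assuming the lxml parser, which searx already uses elsewhere; the markup here is a made-up stand-in for a Baidu result card):

    from bs4 import BeautifulSoup

    html = '<div id="content_left"><div class="c-container"><h3><a href="https://example.org">t</a></h3></div></div>'
    root = BeautifulSoup(html, "lxml")
    for div in root.find("div", id="content_left").contents:
        print(div.get("class"), div.h3.a['href'])  # ['c-container'] https://example.org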

searx/engines/baidu.py Normal file

@@ -0,0 +1,157 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""
Baidu
"""

from urllib.parse import urlencode

from bs4 import BeautifulSoup
from bs4.element import Tag

from searx.exceptions import SearxEngineException

# about
about = {
    "website": 'https://www.baidu.com',
    "wikidata_id": 'Q14772',
    "use_official_api": False,
    "require_api_key": False,
    "results": 'HTML',
}

# engine dependent config
categories = ['general']
paging = True

# search-url
baidu_host_url = "https://www.baidu.com"
baidu_search_url = "https://www.baidu.com/s?ie=utf-8&tn=baidu&{query}"

ABSTRACT_MAX_LENGTH = 500
# do search-request
def request(query, params):
    offset = (params['pageno'] - 1) * 10
    params['url'] = baidu_search_url.format(query=urlencode({
        'wd': query,
        'pn': offset,
    }))

    # browser-like headers; Baidu may block or alter the page for
    # unrecognized clients
    params['headers']['Accept'] = (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,image/apng,*/*;q=0.8"
    )
    params['headers']['Accept-Language'] = "zh-CN,zh;q=0.9"
    params['headers']['Content-Type'] = "application/x-www-form-urlencoded"
    params['headers']['User-Agent'] = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        " (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36"
    )
    params['headers']['Accept-Encoding'] = "gzip, deflate"
    params['headers']['Referer'] = "https://www.baidu.com/"

    return params
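
# Example: for page 2 of the query "hello", request() builds
#   https://www.baidu.com/s?ie=utf-8&tn=baidu&wd=hello&pn=10
# since Baidu's `pn` parameter is a zero-based result offset, ten hits per page.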
# get response from search-request
def response(resp):
    results = []

    try:
        resp.encoding = "utf-8"
        root = BeautifulSoup(resp.text, "lxml")
        div_contents = root.find("div", id="content_left")

        for div in div_contents.contents:
            # skip bare text nodes between the result containers
            if not isinstance(div, Tag):
                continue

            class_list = div.get("class", [])
            if not class_list:
                continue
            if "c-container" not in class_list:
                continue

            title = ''
            url = ''
            abstract = ''

            if "xpath-log" in class_list:
                if div.h3:
                    title = div.h3.text.strip()
                    url = div.h3.a['href'].strip()
                else:
                    title = div.text.strip().split("\n", 1)[0]
                    if div.a:
                        url = div.a['href'].strip()

                if div.find("div", class_="c-abstract"):
                    abstract = div.find("div", class_="c-abstract").text.strip()
                elif div.div:
                    abstract = div.div.text.strip()
                else:
                    abstract = div.text.strip().split("\n", 1)[1].strip()

            elif "result-op" in class_list:
                if div.h3:
                    title = div.h3.text.strip()
                    url = div.h3.a['href'].strip()
                else:
                    title = div.text.strip().split("\n", 1)[0]
                    if div.a:
                        url = div.a['href'].strip()

                if div.find("div", class_="c-abstract"):
                    abstract = div.find("div", class_="c-abstract").text.strip()
                elif div.div:
                    abstract = div.div.text.strip()
                else:
                    abstract = div.text.strip().split("\n", 1)[1].strip()

            else:
                if div.get("tpl", "") != "se_com_default":
                    if div.get("tpl", "") == "se_st_com_abstract":
                        if len(div.contents) >= 1:
                            title = div.h3.text.strip()
                            if div.find("div", class_="c-abstract"):
                                abstract = div.find("div", class_="c-abstract").text.strip()
                            elif div.div:
                                abstract = div.div.text.strip()
                            else:
                                abstract = div.text.strip()
                    else:
                        if len(div.contents) >= 2:
                            if div.h3:
                                title = div.h3.text.strip()
                                url = div.h3.a['href'].strip()
                            else:
                                # no h3 here, so fall back to the first anchor
                                title = div.contents[0].text.strip()
                                if div.a:
                                    url = div.a['href'].strip()
                            if div.find("div", class_="c-abstract"):
                                abstract = div.find("div", class_="c-abstract").text.strip()
                            elif div.div:
                                abstract = div.div.text.strip()
                            else:
                                abstract = div.text.strip()
                else:
                    if div.h3:
                        title = div.h3.text.strip()
                        url = div.h3.a['href'].strip()
                    else:
                        # no h3 here, so fall back to the first anchor
                        title = div.contents[0].text.strip()
                        if div.a:
                            url = div.a['href'].strip()
                    if div.find("div", class_="c-abstract"):
                        abstract = div.find("div", class_="c-abstract").text.strip()
                    elif div.div:
                        abstract = div.div.text.strip()
                    else:
                        abstract = div.text.strip()

            if ABSTRACT_MAX_LENGTH and len(abstract) > ABSTRACT_MAX_LENGTH:
                abstract = abstract[:ABSTRACT_MAX_LENGTH]

            # append result
            results.append({'url': url, 'title': title, 'content': abstract})
    except Exception as e:
        raise SearxEngineException() from e

    # return results
    return results
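
A quick way to exercise the parser offline is to feed response() a stub built from a saved results page. This is only a sketch: baidu.html is a hypothetical saved Baidu results page, and the stub carries just the two attributes (text, encoding) that response() touches, standing in for the HTTP response object searx normally passes:

    # hypothetical offline harness for response()
    class StubResponse:
        def __init__(self, text):
            self.encoding = "utf-8"
            self.text = text

    with open("baidu.html", encoding="utf-8") as f:
        resp = StubResponse(f.read())

    for result in response(resp):
        print(result['title'], result['url'])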

searx/settings.yml

@@ -398,7 +398,11 @@ engines:
     timeout : 3.0
     weight : 2
     tests: *tests_infobox
+  - name: baidu
+    engine : baidu
+    shortcut : bd
+    timeout : 2.0
+    disabled : True
   - name : duckduckgo
     engine : duckduckgo
     shortcut : ddg