From 095f43ed25bc80339a66f1f27aa85c6e3a0a4b19 Mon Sep 17 00:00:00 2001
From: shevonkuan
Date: Thu, 10 Mar 2022 17:14:14 +0800
Subject: [PATCH] Add a new engine: Baidu

---
 requirements.txt       |   1 +
 searx/engines/baidu.py | 100 ++++++++++++++++++++++++++++++++++++++++
 searx/settings.yml     |   6 +-
 3 files changed, 106 insertions(+), 1 deletion(-)
 create mode 100644 searx/engines/baidu.py

diff --git a/requirements.txt b/requirements.txt
index b843d892..547aa953 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
 certifi==2021.10.8
 babel==2.9.1
+beautifulsoup4==4.10.0
 flask-babel==2.0.0
 flask==2.0.2
 jinja2==3.0.3
diff --git a/searx/engines/baidu.py b/searx/engines/baidu.py
new file mode 100644
index 00000000..4dd89ec0
--- /dev/null
+++ b/searx/engines/baidu.py
@@ -0,0 +1,100 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""
+ Baidu (general web search)
+"""
+
+from urllib.parse import urlencode
+
+from bs4 import BeautifulSoup
+
+from searx.exceptions import SearxEngineException
+
+# about
+about = {
+    "website": 'https://www.baidu.com',
+    "wikidata_id": 'Q14772',
+    "use_official_api": False,
+    "require_api_key": False,
+    "results": 'HTML',
+}
+
+# engine dependent config
+categories = ['general']
+
+# search-url
+baidu_search_url = "https://www.baidu.com/s?ie=utf-8&tn=baidu&{query}"
+
+# result snippets longer than this are truncated
+ABSTRACT_MAX_LENGTH = 500
+
+
+def request(query, params):
+    """Build the Baidu search request.
+
+    Baidu paginates with the ``pn`` offset (10 results per page).  The
+    desktop-browser headers are required: without them Baidu serves a
+    mobile / anti-bot page that response() cannot parse.
+    """
+    offset = (params['pageno'] - 1) * 10
+    params['url'] = baidu_search_url.format(
+        query=urlencode({'wd': query, 'pn': offset})
+    )
+    params['headers'].update({
+        'Accept': ('text/html,application/xhtml+xml,'
+                   'application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'),
+        'Accept-Language': 'zh-CN,zh;q=0.9',
+        'Content-Type': 'application/x-www-form-urlencoded',
+        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
+                       'AppleWebKit/537.36 (KHTML, like Gecko) '
+                       'Chrome/70.0.3538.102 Safari/537.36'),
+        'Accept-Encoding': 'gzip, deflate',
+        'Referer': 'https://www.baidu.com/',
+    })
+    return params
+
+
+def _extract_title_url(div):
+    """Return (title, url) for one result container; '' when missing."""
+    if div.h3:
+        title = div.h3.text.strip()
+        url = div.h3.a['href'].strip() if div.h3.a else ''
+    else:
+        title = div.text.strip().split("\n", 1)[0]
+        url = div.a['href'].strip() if div.a else ''
+    return title, url
+
+
+def _extract_abstract(div):
+    """Return the snippet text for one result container."""
+    abstract_div = div.find("div", class_="c-abstract")
+    if abstract_div:
+        return abstract_div.text.strip()
+    if div.div:
+        return div.div.text.strip()
+    # fall back to everything after the title line
+    parts = div.text.strip().split("\n", 1)
+    return parts[1].strip() if len(parts) > 1 else ''
+
+
+def response(resp):
+    """Parse the result items out of Baidu's HTML result page."""
+    results = []
+    try:
+        resp.encoding = "utf-8"
+        root = BeautifulSoup(resp.text, "lxml")
+        content_left = root.find("div", id="content_left")
+        if content_left is None:
+            return results
+        # iterate only over element children (skips NavigableStrings)
+        for div in content_left.find_all("div", recursive=False):
+            if "c-container" not in div.get("class", []):
+                continue
+            title, url = _extract_title_url(div)
+            abstract = _extract_abstract(div)
+            if ABSTRACT_MAX_LENGTH and len(abstract) > ABSTRACT_MAX_LENGTH:
+                abstract = abstract[:ABSTRACT_MAX_LENGTH]
+            results.append({'url': url, 'title': title, 'content': abstract})
+    except Exception as e:
+        # keep the original traceback attached for debugging
+        raise SearxEngineException() from e
+    return results
diff --git a/searx/settings.yml b/searx/settings.yml
index f4fd30fa..ff3fa548 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -398,7 +398,11 @@ engines:
     timeout : 3.0
     weight : 2
     tests: *tests_infobox
-
+  - name : baidu
+    engine : baidu
+    shortcut : bd
+    timeout : 2.0
+    disabled : True
   - name : duckduckgo
     engine : duckduckgo
     shortcut : ddg