mirror of
https://github.com/searx/searx
synced 2025-01-19 15:30:02 +01:00
[fix] highlighting only html
This commit is contained in:
parent
04c408389d
commit
7b4ec5c5e9
@ -25,7 +25,6 @@ from urlparse import urlparse
|
||||
from searx import settings
|
||||
import ConfigParser
|
||||
import sys
|
||||
import re
|
||||
from datetime import datetime
|
||||
|
||||
engine_dir = dirname(realpath(__file__))
|
||||
@ -106,31 +105,6 @@ def make_callback(engine_name, results, suggestions, callback, params):
|
||||
results[engine_name] = cb_res
|
||||
return process_callback
|
||||
|
||||
def highlight_content(content, query):
    """Wrap occurrences of ``query`` inside ``content`` with ``<b>`` tags.

    The whole query is highlighted when it occurs in the content
    (case-insensitively); otherwise each whitespace-separated chunk of the
    query is highlighted on its own.

    :param content: plain-text snippet to highlight.
    :param query: search query, either a UTF-8 byte string or decoded text.
    :returns: ``None`` when ``content`` is empty, the unchanged content when
        it contains ``<`` or when the query is blank, otherwise the content
        with matches wrapped in ``<b>...</b>``.
    """
    if not content:
        return None

    # ignoring html contents
    # TODO better html content detection
    if content.find('<') != -1:
        return content

    # Only byte strings need decoding; already-decoded text is used as is.
    if isinstance(query, bytes):
        query = query.decode('utf-8')

    # A blank query would yield the pattern "()", which matches the empty
    # string at every position and litters the content with "<b></b>".
    if not query.strip():
        return content

    if content.lower().find(query.lower()) > -1:
        # The full query occurs in the content: highlight it as a phrase.
        query_regex = u'({0})'.format(re.escape(query))
    else:
        regex_parts = []
        for chunk in query.split():
            if len(chunk) == 1:
                # Single characters only count when delimited by non-word
                # characters, so they do not match inside other words.
                regex_parts.append(u'\\W+{0}\\W+'.format(re.escape(chunk)))
            else:
                regex_parts.append(re.escape(chunk))
        query_regex = u'({0})'.format(u'|'.join(regex_parts))

    return re.sub(query_regex, u'<b>\\1</b>', content, flags=re.I | re.U)
|
||||
|
||||
def score_results(results):
|
||||
flat_res = filter(None, chain.from_iterable(izip_longest(*results.values())))
|
||||
flat_len = len(flat_res)
|
||||
@ -218,8 +192,6 @@ def search(query, request, selected_engines):
|
||||
results = score_results(results)
|
||||
|
||||
for result in results:
|
||||
if 'content' in result:
|
||||
result['content'] = highlight_content(result['content'], query)
|
||||
for res_engine in result['engines']:
|
||||
engines[result['engine']].stats['score_count'] += result['score']
|
||||
|
||||
|
@ -3,6 +3,32 @@ from HTMLParser import HTMLParser
|
||||
import csv
|
||||
import codecs
|
||||
import cStringIO
|
||||
import re
|
||||
|
||||
def highlight_content(content, query):
    """Wrap occurrences of ``query`` inside ``content`` with ``<b>`` tags.

    The whole query is highlighted when it occurs in the content
    (case-insensitively); otherwise each whitespace-separated chunk of the
    query is highlighted on its own.

    :param content: plain-text snippet to highlight.
    :param query: search query, either a UTF-8 byte string or decoded text.
    :returns: ``None`` when ``content`` is empty, the unchanged content when
        it contains ``<`` or when the query is blank, otherwise the content
        with matches wrapped in ``<b>...</b>``.
    """
    if not content:
        return None

    # ignoring html contents
    # TODO better html content detection
    if content.find('<') != -1:
        return content

    # Only byte strings need decoding; already-decoded text is used as is.
    if isinstance(query, bytes):
        query = query.decode('utf-8')

    # A blank query would yield the pattern "()", which matches the empty
    # string at every position and litters the content with "<b></b>".
    if not query.strip():
        return content

    if content.lower().find(query.lower()) > -1:
        # The full query occurs in the content: highlight it as a phrase.
        query_regex = u'({0})'.format(re.escape(query))
    else:
        regex_parts = []
        for chunk in query.split():
            if len(chunk) == 1:
                # Single characters only count when delimited by non-word
                # characters, so they do not match inside other words.
                regex_parts.append(u'\\W+{0}\\W+'.format(re.escape(chunk)))
            else:
                regex_parts.append(re.escape(chunk))
        query_regex = u'({0})'.format(u'|'.join(regex_parts))

    return re.sub(query_regex, u'<b>\\1</b>', content, flags=re.I | re.U)
|
||||
|
||||
class HTMLTextExtractor(HTMLParser):
|
||||
def __init__(self):
|
||||
|
@ -29,6 +29,7 @@ import json
|
||||
import cStringIO
|
||||
from searx.utils import UnicodeWriter
|
||||
from flask import send_from_directory
|
||||
from searx.utils import highlight_content, html_to_text
|
||||
|
||||
|
||||
|
||||
@ -104,6 +105,14 @@ def index():
|
||||
results, suggestions = search(query, request, selected_engines)
|
||||
|
||||
for result in results:
|
||||
if request_data.get('format', 'html') == 'html':
|
||||
if 'content' in result:
|
||||
result['content'] = highlight_content(result['content'], query)
|
||||
result['title'] = highlight_content(result['title'], query)
|
||||
else:
|
||||
if 'content' in result:
|
||||
result['content'] = html_to_text(result['content']).strip()
|
||||
result['title'] = html_to_text(result['title']).strip()
|
||||
if len(result['url']) > 74:
|
||||
result['pretty_url'] = result['url'][:35] + '[..]' + result['url'][-35:]
|
||||
else:
|
||||
|
Loading…
x
Reference in New Issue
Block a user