mirror of
https://github.com/searxng/searxng.git
synced 2024-11-23 03:11:00 +00:00
[fix] highlighting only html
This commit is contained in:
parent
04c408389d
commit
7b4ec5c5e9
3 changed files with 35 additions and 28 deletions
|
@ -25,7 +25,6 @@ from urlparse import urlparse
|
||||||
from searx import settings
|
from searx import settings
|
||||||
import ConfigParser
|
import ConfigParser
|
||||||
import sys
|
import sys
|
||||||
import re
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
engine_dir = dirname(realpath(__file__))
|
engine_dir = dirname(realpath(__file__))
|
||||||
|
@ -106,31 +105,6 @@ def make_callback(engine_name, results, suggestions, callback, params):
|
||||||
results[engine_name] = cb_res
|
results[engine_name] = cb_res
|
||||||
return process_callback
|
return process_callback
|
||||||
|
|
||||||
def highlight_content(content, query):
    """Emphasise matches of *query* inside plain-text result content.

    Empty content yields None; content that looks like HTML (contains a
    '<') is passed through untouched.
    """
    if not content:
        return None
    # ignoring html contents
    # TODO better html content detection
    if '<' in content:
        return content

    query = query.decode('utf-8')
    if query.lower() in content.lower():
        # whole query present verbatim -> highlight it as one phrase
        pattern = u'({0})'.format(re.escape(query))
    else:
        # highlight the individual query words instead; single-character
        # words only count when framed by non-word characters
        fragments = [
            u'\\W+{0}\\W+'.format(re.escape(word)) if len(word) == 1
            else re.escape(word)
            for word in query.split()
        ]
        pattern = u'({0})'.format(u'|'.join(fragments))
    return re.sub(pattern, '<b>\\1</b>', content, flags=re.I | re.U)
|
|
||||||
def score_results(results):
|
def score_results(results):
|
||||||
flat_res = filter(None, chain.from_iterable(izip_longest(*results.values())))
|
flat_res = filter(None, chain.from_iterable(izip_longest(*results.values())))
|
||||||
flat_len = len(flat_res)
|
flat_len = len(flat_res)
|
||||||
|
@ -218,8 +192,6 @@ def search(query, request, selected_engines):
|
||||||
results = score_results(results)
|
results = score_results(results)
|
||||||
|
|
||||||
for result in results:
|
for result in results:
|
||||||
if 'content' in result:
|
|
||||||
result['content'] = highlight_content(result['content'], query)
|
|
||||||
for res_engine in result['engines']:
|
for res_engine in result['engines']:
|
||||||
engines[result['engine']].stats['score_count'] += result['score']
|
engines[result['engine']].stats['score_count'] += result['score']
|
||||||
|
|
||||||
|
|
|
@ -3,6 +3,32 @@ from HTMLParser import HTMLParser
|
||||||
import csv
|
import csv
|
||||||
import codecs
|
import codecs
|
||||||
import cStringIO
|
import cStringIO
|
||||||
|
import re
|
||||||
|
|
||||||
|
def highlight_content(content, query):
    """Wrap the parts of *content* matching *query* in ``<b>`` tags.

    Returns None for empty content.  Content that appears to contain
    HTML markup (a bare ``<`` is the heuristic) is returned unchanged
    so we never inject tags into existing markup.
    """
    if not content:
        return None
    # ignoring html contents
    # TODO better html content detection
    if content.find('<') != -1:
        return content

    # On Python 2 the query arrives as a UTF-8 byte string; decode it so
    # lower()/regex matching operate on unicode.  Python 3 str has no
    # .decode(), so guard the call to stay compatible with both.
    if hasattr(query, 'decode'):
        query = query.decode('utf-8')

    if content.lower().find(query.lower()) > -1:
        # The whole query occurs verbatim: highlight it as one phrase.
        query_regex = u'({0})'.format(re.escape(query))
        content = re.sub(query_regex, '<b>\\1</b>', content,
                         flags=re.I | re.U)
    else:
        # Highlight each word of the query separately.  Single-character
        # words must be framed by non-word characters, so we do not bold
        # stray letters inside other words.  Escaped backslashes avoid
        # the invalid "\W" escape-sequence warning in non-raw strings.
        regex_parts = []
        for chunk in query.split():
            if len(chunk) == 1:
                regex_parts.append(u'\\W+{0}\\W+'.format(re.escape(chunk)))
            else:
                regex_parts.append(u'{0}'.format(re.escape(chunk)))
        query_regex = u'({0})'.format('|'.join(regex_parts))
        content = re.sub(query_regex, '<b>\\1</b>', content,
                         flags=re.I | re.U)

    return content
|
||||||
|
|
||||||
class HTMLTextExtractor(HTMLParser):
|
class HTMLTextExtractor(HTMLParser):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
|
|
@ -29,6 +29,7 @@ import json
|
||||||
import cStringIO
|
import cStringIO
|
||||||
from searx.utils import UnicodeWriter
|
from searx.utils import UnicodeWriter
|
||||||
from flask import send_from_directory
|
from flask import send_from_directory
|
||||||
|
from searx.utils import highlight_content, html_to_text
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -104,6 +105,14 @@ def index():
|
||||||
results, suggestions = search(query, request, selected_engines)
|
results, suggestions = search(query, request, selected_engines)
|
||||||
|
|
||||||
for result in results:
|
for result in results:
|
||||||
|
if request_data.get('format', 'html') == 'html':
|
||||||
|
if 'content' in result:
|
||||||
|
result['content'] = highlight_content(result['content'], query)
|
||||||
|
result['title'] = highlight_content(result['title'], query)
|
||||||
|
else:
|
||||||
|
if 'content' in result:
|
||||||
|
result['content'] = html_to_text(result['content']).strip()
|
||||||
|
result['title'] = html_to_text(result['title']).strip()
|
||||||
if len(result['url']) > 74:
|
if len(result['url']) > 74:
|
||||||
result['pretty_url'] = result['url'][:35] + '[..]' + result['url'][-35:]
|
result['pretty_url'] = result['url'][:35] + '[..]' + result['url'][-35:]
|
||||||
else:
|
else:
|
||||||
|
|
Loading…
Reference in a new issue