From 7b4ec5c5e9a89fc1bc3b3fc8dfad26450530a2da Mon Sep 17 00:00:00 2001 From: asciimoo Date: Fri, 10 Jan 2014 23:38:08 +0100 Subject: [PATCH] [fix] highlighting only html --- searx/engines/__init__.py | 28 ---------------------------- searx/utils.py | 26 ++++++++++++++++++++++++++ searx/webapp.py | 9 +++++++++ 3 files changed, 35 insertions(+), 28 deletions(-) diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py index e011737cf..60fb5cf55 100644 --- a/searx/engines/__init__.py +++ b/searx/engines/__init__.py @@ -25,7 +25,6 @@ from urlparse import urlparse from searx import settings import ConfigParser import sys -import re from datetime import datetime engine_dir = dirname(realpath(__file__)) @@ -106,31 +105,6 @@ def make_callback(engine_name, results, suggestions, callback, params): results[engine_name] = cb_res return process_callback -def highlight_content(content, query): - - if not content: - return None - # ignoring html contents - # TODO better html content detection - if content.find('<') != -1: - return content - - query = query.decode('utf-8') - if content.lower().find(query.lower()) > -1: - query_regex = u'({0})'.format(re.escape(query)) - content = re.sub(query_regex, '\\1', content, flags=re.I | re.U) - else: - regex_parts = [] - for chunk in query.split(): - if len(chunk) == 1: - regex_parts.append(u'\W+{0}\W+'.format(re.escape(chunk))) - else: - regex_parts.append(u'{0}'.format(re.escape(chunk))) - query_regex = u'({0})'.format('|'.join(regex_parts)) - content = re.sub(query_regex, '\\1', content, flags=re.I | re.U) - - return content - def score_results(results): flat_res = filter(None, chain.from_iterable(izip_longest(*results.values()))) flat_len = len(flat_res) @@ -218,8 +192,6 @@ def search(query, request, selected_engines): results = score_results(results) for result in results: - if 'content' in result: - result['content'] = highlight_content(result['content'], query) for res_engine in result['engines']: engines[result['engine']].stats['score_count'] += result['score'] diff --git a/searx/utils.py b/searx/utils.py index 670499805..53300181f 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -3,6 +3,32 @@ from HTMLParser import HTMLParser import csv import codecs import cStringIO +import re + +def highlight_content(content, query): + + if not content: + return None + # ignoring html contents + # TODO better html content detection + if content.find('<') != -1: + return content + + query = query.decode('utf-8') + if content.lower().find(query.lower()) > -1: + query_regex = u'({0})'.format(re.escape(query)) + content = re.sub(query_regex, '\\1', content, flags=re.I | re.U) + else: + regex_parts = [] + for chunk in query.split(): + if len(chunk) == 1: + regex_parts.append(u'\W+{0}\W+'.format(re.escape(chunk))) + else: + regex_parts.append(u'{0}'.format(re.escape(chunk))) + query_regex = u'({0})'.format('|'.join(regex_parts)) + content = re.sub(query_regex, '\\1', content, flags=re.I | re.U) + + return content class HTMLTextExtractor(HTMLParser): def __init__(self): diff --git a/searx/webapp.py b/searx/webapp.py index 9905bce37..606e109b9 100644 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -29,6 +29,7 @@ import json import cStringIO from searx.utils import UnicodeWriter from flask import send_from_directory +from searx.utils import highlight_content, html_to_text @@ -104,6 +105,14 @@ def index(): results, suggestions = search(query, request, selected_engines) for result in results: + if request_data.get('format', 'html') == 'html': + if 'content' in result: + result['content'] = highlight_content(result['content'], query) + result['title'] = highlight_content(result['title'], query) + else: + if 'content' in result: + result['content'] = html_to_text(result['content']).strip() + result['title'] = html_to_text(result['title']).strip() if len(result['url']) > 74: result['pretty_url'] = result['url'][:35] + '[..]' + result['url'][-35:] else: