mirror of
https://github.com/searxng/searxng.git
synced 2024-11-27 05:11:03 +00:00
[fix] google engine - div classes have been renamed in HTML result
Since 1 October 2020, Google has changed the 'class' attributes of the HTML result page. Fix the XPath expressions and ignore <div class="g" ../> sections which do not match the title's XPath expression. Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
This commit is contained in:
parent
fd5fe36984
commit
8162d7aff4
1 changed files with 11 additions and 6 deletions
|
@ -116,12 +116,12 @@ g_section_with_header = './g-section-with-header'
|
||||||
# the title is a h3 tag relative to the result group
|
# the title is a h3 tag relative to the result group
|
||||||
title_xpath = './/h3[1]'
|
title_xpath = './/h3[1]'
|
||||||
|
|
||||||
# in the result group there is <div class="r" ../> it's first child is a <a
|
# in the result group there is <div class="yuRUbf" ../> its first child is a <a
|
||||||
# href=...> (on some results, the <a> is the first "descendant", not ""child")
|
# href=...>
|
||||||
href_xpath = './/div[@class="r"]//a/@href'
|
href_xpath = './/div[@class="yuRUbf"]//a/@href'
|
||||||
|
|
||||||
# in the result group there is <div class="s" ../> containing he *content*
|
# in the result group there is <div class="IsZvec" ../> containing the *content*
|
||||||
content_xpath = './/div[@class="s"]'
|
content_xpath = './/div[@class="IsZvec"]'
|
||||||
|
|
||||||
# Suggestions are links placed in a *card-section*, we extract only the text
|
# Suggestions are links placed in a *card-section*, we extract only the text
|
||||||
# from the links not the links itself.
|
# from the links not the links itself.
|
||||||
|
@ -249,7 +249,12 @@ def response(resp):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
try:
|
try:
|
||||||
title = extract_text(eval_xpath(result, title_xpath)[0])
|
title_tag = eval_xpath(result, title_xpath)
|
||||||
|
if not title_tag:
|
||||||
|
# this is not one of the common google results *sections*
|
||||||
|
logger.debug('ingoring <div class="g" ../> section: missing title')
|
||||||
|
continue
|
||||||
|
title = extract_text(title_tag[0])
|
||||||
url = eval_xpath(result, href_xpath)[0]
|
url = eval_xpath(result, href_xpath)[0]
|
||||||
content = extract_text_from_dom(result, content_xpath)
|
content = extract_text_from_dom(result, content_xpath)
|
||||||
results.append({
|
results.append({
|
||||||
|
|
Loading…
Reference in a new issue