[fix] engine - Crossref

Crossref was broken on the result types journal-issue and component. The old
code made many implicit assumptions and broke during parsing. Now the
assumptions are more explicit and have been checked against the API.
This commit is contained in:
jazzzooo 2023-09-13 16:21:10 +00:00 committed by Markus Heiser
parent ed6a5a01bb
commit 74600c028d

View file

@ -1,60 +1,64 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""Semantic Scholar (Science)
"""
# pylint: disable=use-dict-literal
"""CrossRef"""
from urllib.parse import urlencode
from searx.utils import html_to_text
from datetime import datetime
# Engine metadata: project website, Wikidata ID, API documentation and the
# format of the results returned by the API.
about = {
    "website": "https://www.crossref.org/",
    "wikidata_id": "Q5188229",
    "official_api_documentation": "https://api.crossref.org",
    "use_official_api": False,
    "require_api_key": False,
    "results": "JSON",
}

# Categories this engine is listed under.
categories = ["science", "scientific publications"]

# The engine supports paging; request() turns the page number into an offset.
paging = True

# Endpoint queried by request().
search_url = "https://api.crossref.org/works"
def request(query, params):
    """Set ``params["url"]`` to the Crossref works search URL for *query*.

    The result offset is computed from ``params["pageno"]`` in steps of 20,
    so page 1 maps to offset 0, page 2 to offset 20, and so on.
    NOTE(review): the step of 20 presumably matches the API's default page
    size, since no explicit row count is sent -- confirm against the API docs.

    :param query: search terms, URL-encoded into the ``query`` argument.
    :param params: request parameters; must contain ``"pageno"``.
    :returns: the same *params* mapping, with ``"url"`` filled in.
    :raises KeyError: if *params* has no ``"pageno"`` entry.
    """
    args = {"query": query, "offset": 20 * (params["pageno"] - 1)}
    params["url"] = search_url + "?" + urlencode(args)
    return params
def response(resp):
    """Convert a Crossref works JSON response into a list of paper results.

    Records of type ``component`` are skipped. Every other item under
    ``message.items`` becomes one result dict for the ``paper.html`` template.

    :param resp: response object whose ``json()`` method returns the decoded
        payload, a mapping with the records under ``["message"]["items"]``.
    :returns: list of result dicts with the keys ``template``, ``content``,
        ``doi``, ``pages``, ``publisher``, ``tags``, ``type``, ``url``,
        ``volume``, ``title``, ``authors`` and ``isbn``; ``journal`` is set
        for everything except book chapters, and ``publishedDate`` only when
        the record carries a usable publication date.
    :raises KeyError: if a record has no ``type``, or a ``book-chapter``
        record lacks ``title`` or ``container-title``.
    """
    results = []
    for record in resp.json()["message"]["items"]:
        if record["type"] == "component":
            # These seem to be files published along with papers. Not something you'd search for
            continue

        result = {
            "template": "paper.html",
            "content": record.get("abstract", ""),
            "doi": record.get("DOI"),
            "pages": record.get("page"),
            "publisher": record.get("publisher"),
            "tags": record.get("subject"),
            "type": record.get("type"),
            "url": record.get("URL"),
            "volume": record.get("volume"),
        }

        if record["type"] == "book-chapter":
            # Show the book title first; append the chapter title only when it differs.
            result["title"] = record["container-title"][0]
            if record["title"][0].lower().strip() != result["title"].lower().strip():
                result["title"] += f" ({record['title'][0]})"
        else:
            # Records without a title of their own fall back to the container title;
            # in that case there is no separate journal name to report.
            result["title"] = record["title"][0] if "title" in record else record.get("container-title", [None])[0]
            result["journal"] = record.get("container-title", [None])[0] if "title" in record else None

        # Prefer the primary resource URL over the generic record URL when present.
        if "resource" in record and "primary" in record["resource"] and "URL" in record["resource"]["primary"]:
            result["url"] = record["resource"]["primary"]["URL"]

        date_parts = (record.get("published") or {}).get("date-parts")
        if date_parts and date_parts[0] and None not in date_parts[0]:
            # date-parts may hold fewer than three items, so pad month/day with 1.
            # Pad first, then truncate the padded list to three items; slicing only
            # the padding would let a full date spill into the hour/minute arguments.
            result["publishedDate"] = datetime(*(date_parts[0] + [1, 1])[:3])

        result["authors"] = [a.get("given", "") + " " + a.get("family", "") for a in record.get("author", [])]
        result["isbn"] = record.get("isbn") or [i["value"] for i in record.get("isbn-type", [])]

        # NOTE: record["link"] is deliberately not exposed as "pdf_url": those links
        # are not reliably PDFs, even when the URL ends with ".pdf".
        results.append(result)

    return results