Sanitise description from Open Library

This commit is contained in:
Joeri de Ruiter 2023-08-02 19:30:40 +02:00
parent f4a4b59a14
commit ae5c27f3bb
3 changed files with 7 additions and 6 deletions

View file

@@ -10,6 +10,7 @@ from .abstract_connector import AbstractConnector, Mapping, JsonDict
from .abstract_connector import get_data, infer_physical_format, unique_physical_format
from .connector_manager import ConnectorException, create_edition_task
from .openlibrary_languages import languages
from ..utils.sanitizer import clean
class Connector(AbstractConnector):
@@ -237,10 +238,10 @@ def ignore_edition(edition_data: JsonDict) -> bool:
return True
def get_description(description_blob: Union[JsonDict, str]) -> Optional[str]:
def get_description(description_blob: Union[JsonDict, str]) -> str:
"""descriptions can be a string or a dict"""
if isinstance(description_blob, dict):
description = markdown(description_blob.get("value"))
description = markdown(description_blob.get("value", ""))
else:
description = markdown(description_blob)
@@ -249,10 +250,10 @@ def get_description(description_blob: Union[JsonDict, str]) -> Optional[str]:
and description.endswith("</p>")
and description.count("<p>") == 1
):
# If there is just one <p> tag around the text remove it
# If there is just one <p> tag and it is around the text remove it
return description[len("<p>") : -len("</p>")].strip()
return description
return clean(description)
def get_openlibrary_key(key: str) -> str:

View file

@@ -14,7 +14,7 @@ from bookwyrm.connectors.openlibrary import get_languages, get_description
from bookwyrm.connectors.openlibrary import pick_default_edition, get_openlibrary_key
from bookwyrm.connectors.connector_manager import ConnectorException
# pylint: disable=too-many-public-methods
class Openlibrary(TestCase):
"""test loading data from openlibrary.org"""

View file

@@ -2,7 +2,7 @@
import bleach
def clean(input_text):
def clean(input_text: str) -> str:
"""Run through "bleach" """
return bleach.clean(
input_text,