diff --git a/bookwyrm/isbn/isbn.py b/bookwyrm/isbn/isbn.py index e07d2100d..4cc7f47dd 100644 --- a/bookwyrm/isbn/isbn.py +++ b/bookwyrm/isbn/isbn.py @@ -1,11 +1,20 @@ """ Use the range message from isbn-international to hyphenate ISBNs """ import os +from typing import Optional from xml.etree import ElementTree +from xml.etree.ElementTree import Element + import requests from bookwyrm import settings +def _get_rules(element: Element) -> list[Element]: + if (rules_el := element.find("Rules")) is not None: + return rules_el.findall("Rule") + return [] + + class IsbnHyphenator: """Class to manage the range message xml file and use it to hyphenate ISBNs""" @@ -15,58 +24,94 @@ class IsbnHyphenator: ) __element_tree = None - def update_range_message(self): + def update_range_message(self) -> None: """Download the range message xml file and save it locally""" response = requests.get(self.__range_message_url) with open(self.__range_file_path, "w", encoding="utf-8") as file: file.write(response.text) self.__element_tree = None - def hyphenate(self, isbn_13): + def hyphenate(self, isbn_13: Optional[str]) -> Optional[str]: """hyphenate the given ISBN-13 number using the range message""" if isbn_13 is None: return None + if self.__element_tree is None: self.__element_tree = ElementTree.parse(self.__range_file_path) + gs1_prefix = isbn_13[:3] reg_group = self.__find_reg_group(isbn_13, gs1_prefix) if reg_group is None: return isbn_13 # failed to hyphenate + registrant = self.__find_registrant(isbn_13, gs1_prefix, reg_group) if registrant is None: return isbn_13 # failed to hyphenate + publication = isbn_13[len(gs1_prefix) + len(reg_group) + len(registrant) : -1] check_digit = isbn_13[-1:] return "-".join((gs1_prefix, reg_group, registrant, publication, check_digit)) - def __find_reg_group(self, isbn_13, gs1_prefix): - for ean_ucc_el in self.__element_tree.find("EAN.UCCPrefixes").findall( - "EAN.UCC" - ): - if ean_ucc_el.find("Prefix").text == gs1_prefix: - for rule_el in ean_ucc_el.find("Rules").findall("Rule"): - length = int(rule_el.find("Length").text) + def __find_reg_group(self, isbn_13: str, gs1_prefix: str) -> Optional[str]: + if self.__element_tree is None: + self.__element_tree = ElementTree.parse(self.__range_file_path) + + ucc_prefixes_el = self.__element_tree.find("EAN.UCCPrefixes") + if ucc_prefixes_el is None: + return None + + for ean_ucc_el in ucc_prefixes_el.findall("EAN.UCC"): + if ( + prefix_el := ean_ucc_el.find("Prefix") + ) is not None and prefix_el.text == gs1_prefix: + for rule_el in _get_rules(ean_ucc_el): + length_el = rule_el.find("Length") + if length_el is None: + continue + length = int(text) if (text := length_el.text) else 0 if length == 0: continue - reg_grp_range = [ - int(x[:length]) for x in rule_el.find("Range").text.split("-") - ] + + range_el = rule_el.find("Range") + if range_el is None or range_el.text is None: + continue + + reg_grp_range = [int(x[:length]) for x in range_el.text.split("-")] reg_group = isbn_13[len(gs1_prefix) : len(gs1_prefix) + length] if reg_grp_range[0] <= int(reg_group) <= reg_grp_range[1]: return reg_group return None return None - def __find_registrant(self, isbn_13, gs1_prefix, reg_group): + def __find_registrant( + self, isbn_13: str, gs1_prefix: str, reg_group: str + ) -> Optional[str]: from_ind = len(gs1_prefix) + len(reg_group) - for group_el in self.__element_tree.find("RegistrationGroups").findall("Group"): - if group_el.find("Prefix").text == "-".join((gs1_prefix, reg_group)): - for rule_el in group_el.find("Rules").findall("Rule"): - length = int(rule_el.find("Length").text) + + if self.__element_tree is None: + self.__element_tree = ElementTree.parse(self.__range_file_path) + + reg_groups_el = self.__element_tree.find("RegistrationGroups") + if reg_groups_el is None: + return None + + for group_el in reg_groups_el.findall("Group"): + if ( + prefix_el := group_el.find("Prefix") + ) is not None and prefix_el.text == "-".join((gs1_prefix, reg_group)): + for rule_el in _get_rules(group_el): + length_el = rule_el.find("Length") + if length_el is None: + continue + length = int(text) if (text := length_el.text) else 0 if length == 0: continue + + range_el = rule_el.find("Range") + if range_el is None or range_el.text is None: + continue registrant_range = [ - int(x[:length]) for x in rule_el.find("Range").text.split("-") + int(x[:length]) for x in range_el.text.split("-") ] registrant = isbn_13[from_ind : from_ind + length] if registrant_range[0] <= int(registrant) <= registrant_range[1]: diff --git a/bookwyrm/settings.py b/bookwyrm/settings.py index 5c562ba26..9a4c9b5a4 100644 --- a/bookwyrm/settings.py +++ b/bookwyrm/settings.py @@ -1,5 +1,7 @@ """ bookwyrm settings and configuration """ import os +from typing import AnyStr + from environs import Env import requests @@ -37,7 +39,7 @@ EMAIL_SENDER_DOMAIN = env("EMAIL_SENDER_DOMAIN", DOMAIN) EMAIL_SENDER = f"{EMAIL_SENDER_NAME}@{EMAIL_SENDER_DOMAIN}" # Build paths inside the project like this: os.path.join(BASE_DIR, ...) -BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +BASE_DIR: AnyStr = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) LOCALE_PATHS = [ os.path.join(BASE_DIR, "locale"), ] diff --git a/bookwyrm/tests/test_isbn.py b/bookwyrm/tests/test_isbn.py new file mode 100644 index 000000000..b528e9210 --- /dev/null +++ b/bookwyrm/tests/test_isbn.py @@ -0,0 +1,31 @@ +""" test ISBN hyphenator for books """ +from django.test import TestCase + +from bookwyrm.isbn.isbn import hyphenator_singleton as hyphenator + + +class TestISBN(TestCase): + """isbn hyphenator""" + + def test_isbn_hyphenation(self): + """different isbn hyphenations""" + # nothing + self.assertEqual(hyphenator.hyphenate(None), None) + # 978-0 (English language) 3700000-6389999 + self.assertEqual(hyphenator.hyphenate("9780439554930"), "978-0-439-55493-0") + # 978-2 (French language) 0000000-1999999 + self.assertEqual(hyphenator.hyphenate("9782070100927"), "978-2-07-010092-7") + # 978-3 (German language) 2000000-6999999 + self.assertEqual(hyphenator.hyphenate("9783518188125"), "978-3-518-18812-5") + # 978-4 (Japan) 0000000-1999999 + self.assertEqual(hyphenator.hyphenate("9784101050454"), "978-4-10-105045-4") + # 978-626 (Taiwan) 9500000-9999999 + self.assertEqual(hyphenator.hyphenate("9786269533251"), "978-626-95332-5-1") + # 979-8 (United States) 4000000-8499999 + self.assertEqual(hyphenator.hyphenate("9798627974040"), "979-8-6279-7404-0") + # 978-626 (Taiwan) 8000000-9499999 (unassigned) + self.assertEqual(hyphenator.hyphenate("9786268533251"), "9786268533251") + # 978 range 6600000-6999999 (unassigned) + self.assertEqual(hyphenator.hyphenate("9786769533251"), "9786769533251") + # 979-8 (United States) 2300000-3499999 (unassigned) + self.assertEqual(hyphenator.hyphenate("9798311111111"), "9798311111111") diff --git a/mypy.ini b/mypy.ini index fe181e365..a039ccb33 100644 --- a/mypy.ini +++ b/mypy.ini @@ -16,6 +16,9 @@ ignore_errors = False [mypy-bookwyrm.importers.*] ignore_errors = False +[mypy-bookwyrm.isbn.*] +ignore_errors = False + [mypy-celerywyrm.*] ignore_errors = False