From d87e1b1567da7d6990bc586289246974abdfaa06 Mon Sep 17 00:00:00 2001 From: Hugh Rundle Date: Fri, 29 Oct 2021 21:00:35 +1100 Subject: [PATCH] fix encoding The OCLC server claims that the XML payload is encoded as latin1 (ISO-8859-1). This causes Requests to incorrectly decode things as latin1, when actually everything is (thank goodness) UTF-8. We can fix it by just telling Requests that it is really UTF-8. With thanks to Tex Texin, creator of http://i18nqa.com/debug/utf8-debug.html --- bookwyrm/utils/isni.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bookwyrm/utils/isni.py b/bookwyrm/utils/isni.py index 2fd87bebc..c905eb1c2 100644 --- a/bookwyrm/utils/isni.py +++ b/bookwyrm/utils/isni.py @@ -1,5 +1,6 @@ import requests import xml.etree.ElementTree as ET +from xml.etree.ElementTree import XMLParser # get data base_string = "http://isni.oclc.org/sru/?query=pica.na+%3D+%22" @@ -15,8 +16,11 @@ def find_authors_by_name(names): names = url_stringify(names) query = base_string + names + suffix_string r = requests.get(query) - # parse xml + # the OCLC ISNI server asserts the payload is encoded + # in latin1, but we know better + r.encoding = "utf-8" payload = r.text + # parse xml root = ET.fromstring(payload) # build list of possible authors