mirror of
https://github.com/bookwyrm-social/bookwyrm.git
synced 2025-01-10 17:25:35 +00:00
fix encoding
The OCLC server claims that the XML payload is encoded as latin1 (ISO-8859-1). This causes Requests to incorrectly decode the response as latin1, when actually everything is (thank goodness) UTF-8. We can fix it by just telling Requests that it is really UTF-8. With thanks to Tex Texin, creator of http://i18nqa.com/debug/utf8-debug.html
This commit is contained in:
parent
f2e4865adf
commit
d87e1b1567
1 changed files with 5 additions and 1 deletions
|
@ -1,5 +1,6 @@
|
||||||
import requests
|
import requests
|
||||||
import xml.etree.ElementTree as ET
|
import xml.etree.ElementTree as ET
|
||||||
|
from xml.etree.ElementTree import XMLParser
|
||||||
|
|
||||||
# get data
|
# get data
|
||||||
base_string = "http://isni.oclc.org/sru/?query=pica.na+%3D+%22"
|
base_string = "http://isni.oclc.org/sru/?query=pica.na+%3D+%22"
|
||||||
|
@ -15,8 +16,11 @@ def find_authors_by_name(names):
|
||||||
names = url_stringify(names)
|
names = url_stringify(names)
|
||||||
query = base_string + names + suffix_string
|
query = base_string + names + suffix_string
|
||||||
r = requests.get(query)
|
r = requests.get(query)
|
||||||
# parse xml
|
# the OCLC ISNI server asserts the payload is encoded
|
||||||
|
# in latin1, but we know better
|
||||||
|
r.encoding = "utf-8"
|
||||||
payload = r.text
|
payload = r.text
|
||||||
|
# parse xml
|
||||||
root = ET.fromstring(payload)
|
root = ET.fromstring(payload)
|
||||||
|
|
||||||
# build list of possible authors
|
# build list of possible authors
|
||||||
|
|
Loading…
Reference in a new issue