2021-10-29 10:14:32 +00:00
|
|
|
"""ISNI author checking utilities"""
|
2021-10-29 05:12:31 +00:00
|
|
|
import xml.etree.ElementTree as ET
|
2021-10-29 10:14:32 +00:00
|
|
|
import requests
|
2021-10-29 05:12:31 +00:00
|
|
|
|
2021-10-31 09:48:47 +00:00
|
|
|
|
2021-10-29 05:12:31 +00:00
|
|
|
def url_stringify(string):
    """Percent-encode a search term for use in a URL query string

    Spaces still become ``+``. Every other character that is not safe inside
    a query component (``&``, ``"``, ``#``, ``+``, non-ASCII letters, ...) is
    percent-encoded as UTF-8, so it can no longer break the query.
    """
    # function-scope import keeps this block self-contained
    from urllib.parse import quote_plus

    return quote_plus(string)
|
|
|
|
|
2021-10-31 09:48:47 +00:00
|
|
|
|
2021-10-31 23:20:19 +00:00
|
|
|
def request_isni_data(search_index, search_term, max_records=5):
    """Request data from the ISNI API

    Builds a ``searchRetrieve`` query of the form
    ``search_index = "search_term"`` against the OCLC ISNI SRU endpoint,
    asking for at most ``max_records`` records, and returns the response
    body as text.

    Raises whatever ``requests.get`` raises, including
    ``requests.exceptions.Timeout`` when the server does not answer in time.
    """
    search_string = url_stringify(search_term)
    query_parts = [
        "http://isni.oclc.org/sru/?query=",
        search_index,
        "+%3D+%22",
        search_string,
        "%22&version=1.1&operation=searchRetrieve&recordSchema=isni-b",
        "&maximumRecords=",
        str(max_records),
        "&startRecord=1&recordPacking=xml&sortKeys=RLV%2Cpica%2C0%2C%2C",
    ]
    query_url = "".join(query_parts)
    # requests applies no timeout by default; without one a stalled
    # ISNI server would block the caller indefinitely
    result = requests.get(query_url, timeout=15)
    # the OCLC ISNI server asserts the payload is encoded
    # in latin1, but we know better
    result.encoding = "utf-8"
    return result.text
|
|
|
|
|
2021-10-31 09:48:47 +00:00
|
|
|
|
2021-10-31 06:58:15 +00:00
|
|
|
def make_name_string(element):
    """create a string of form 'personal_name surname'

    Uses the first ``forename`` and ``surname`` found below ``element``.
    A missing or empty part is left out, so the result may be just the
    forename, just the surname, or ``""`` when neither is available.
    """
    # NOTE: this will often be incorrect, many naming systems
    # list "surname" before personal name
    forename = element.find(".//forename")
    surname = element.find(".//surname")

    # keep only the parts that exist and carry text, so a record with a
    # forename but no surname (or an empty tag) no longer raises
    parts = [
        part.text for part in (forename, surname) if part is not None and part.text
    ]
    return " ".join(parts)
|
|
|
|
|
2021-10-31 09:48:47 +00:00
|
|
|
|
2021-10-31 06:58:15 +00:00
|
|
|
def get_other_identifier(element, code):
    """Look up one of an author's other identifiers in their ISNI record

    Walks the ``otherIdentifierOfIdentity`` sections below ``element`` and
    returns the ``identifier`` text of the first section whose ``type`` text
    equals ``code`` and which has an ``identifier`` element. Returns ``""``
    when no section qualifies.
    """
    for section in element.findall(".//otherIdentifierOfIdentity"):
        kind = section.find(".//type")
        if kind is None or kind.text != code:
            continue

        identifier = section.find(".//identifier")
        if identifier is None:
            continue

        return identifier.text

    return ""
|
2021-10-31 09:48:47 +00:00
|
|
|
|
2021-10-31 06:58:15 +00:00
|
|
|
|
|
|
|
def get_external_information_uri(element, match_string):
    """Get URLs associated with an author from their ISNI record

    Returns the text of the first ``URI`` found in an
    ``externalInformation`` section below ``element`` that contains
    ``match_string`` (compared case-insensitively), or ``""`` when no URI
    matches.
    """
    needle = match_string.lower()
    for source in element.findall(".//externalInformation"):
        uri = source.find(".//URI")
        # skip sections without a URI or with an empty one
        if uri is None or not uri.text:
            continue
        # str.find() returns -1 (never None) on a miss, so the previous
        # "is not None" test accepted every URI; test containment instead
        if needle in uri.text.lower():
            return uri.text
    return ""
|
2021-10-31 06:58:15 +00:00
|
|
|
|
2021-10-31 09:48:47 +00:00
|
|
|
|
2021-10-31 06:58:15 +00:00
|
|
|
def find_authors_by_name(name_string):
    """Query the ISNI database for possible author matches by name

    Returns a list of dicts, one for each ``responseRecord`` that contains
    a forename. Each dict has the keys ``isni``, ``uri`` and ``name``, plus
    ``bio`` when the record has a ``nameTitle`` element.
    """
    payload = request_isni_data("pica.na", name_string)
    # parse xml
    root = ET.fromstring(payload)
    # build list of possible authors
    possible_authors = []
    for element in root.iter("responseRecord"):
        # the parent of the first forename holds the name parts
        personal_name = element.find(".//forename/..")
        # compare with None explicitly: an Element's truth value reflects
        # its child count and truth-testing elements is deprecated
        if personal_name is None:
            continue

        author = {}
        author["isni"] = element.find(".//isniUnformatted").text
        author["uri"] = element.find(".//isniURI").text
        author["name"] = make_name_string(personal_name)

        bio = element.find(".//nameTitle")
        if bio is not None:
            author["bio"] = bio.text

        possible_authors.append(author)

    return possible_authors
|
2021-10-31 06:58:15 +00:00
|
|
|
|
2021-10-31 09:48:47 +00:00
|
|
|
|
2021-10-31 06:58:15 +00:00
|
|
|
def get_author_isni_data(isni):
    """Collect the details for a new author record from their ISNI

    Returns a dict with the keys ``isni``, ``name``, ``viaf_id``,
    ``wikipedia_link``, ``bio`` and ``aliases``. ``bio`` is ``""`` when the
    record has no ``nameTitle``; ``aliases`` is a de-duplicated list in no
    particular order.

    NOTE: when the response holds no ``responseRecord`` the lookups below
    are made on ``None`` and raise ``AttributeError``.
    """
    root = ET.fromstring(request_isni_data("pica.isn", isni))

    # a lookup by ISNI should yield exactly one responseRecord,
    # but take the first one just in case
    record = root.find(".//responseRecord")
    name_element = record.find(".//forename/..")
    title = record.find(".//nameTitle")

    author = {
        "isni": isni,
        "name": make_name_string(name_element),
        "viaf_id": get_other_identifier(record, "viaf"),
        "wikipedia_link": get_external_information_uri(record, "Wikipedia"),
        "bio": "" if title is None else title.text,
    }

    # gather every name variant, then drop the duplicates
    variant_names = [
        make_name_string(variant)
        for variant in record.findall(".//personalNameVariant")
    ]
    author["aliases"] = list(set(variant_names))

    return author
|
2021-11-01 00:34:32 +00:00
|
|
|
|
2021-11-01 00:39:37 +00:00
|
|
|
|
2021-11-01 00:34:32 +00:00
|
|
|
def build_author_dict(match_value):
    """Turn a match value into a dict of basic author details

    A value starting with ``isni_match_`` carries an ISNI and is resolved
    through ``get_author_isni_data``; any other value is taken to be the
    author's name and returned as ``{"name": match_value}``.
    """
    marker = "isni_match_"

    # plain name string: nothing to look up
    if not match_value.startswith(marker):
        return {"name": match_value}

    # strip the marker to recover the ISNI and fetch its data
    return get_author_isni_data(match_value.replace(marker, ""))
|
2021-11-01 08:50:49 +00:00
|
|
|
|
|
|
|
|
|
|
|
def augment_author_metadata(author, isni):
    """Update any missing author fields from ISNI data

    ``author`` must expose ``viaf_id``, ``wikipedia_link``, ``bio`` and
    ``aliases`` attributes and a ``save()`` method. Fields that already
    hold a value are kept; empty ones are filled from the ISNI record.
    Aliases from both sources are merged and de-duplicated (order is not
    preserved), then the author is saved.
    """
    isni_data = get_author_isni_data(isni)

    # truthiness instead of len(...) == 0, so an unset (None) field is
    # treated as missing rather than raising TypeError
    author.viaf_id = author.viaf_id or isni_data["viaf_id"]
    author.wikipedia_link = author.wikipedia_link or isni_data["wikipedia_link"]
    author.bio = author.bio or isni_data["bio"]

    # merge the ISNI aliases with any the author already has
    aliases = set(isni_data["aliases"])
    aliases.update(author.aliases or [])
    author.aliases = list(aliases)

    author.save()
|