bookwyrm/bookwyrm/utils/isni.py
2023-08-21 14:00:09 +02:00

228 lines
7.7 KiB
Python

"""ISNI author checking utilities"""
import xml.etree.ElementTree as ET
from typing import Union, Optional
import requests
from bookwyrm import activitypub, models
def get_element_text(element: Optional[ET.Element]) -> str:
    """Return the element's text, or an empty string when there is none

    Both a missing element (``None``) and an element whose ``text``
    attribute is ``None`` produce ``""``.
    """
    text = None if element is None else element.text
    return "" if text is None else text
def request_isni_data(search_index: str, search_term: str, max_records: int = 5) -> str:
    """Send a searchRetrieve query to the ISNI API and return the response body

    ``search_index`` and ``search_term`` are combined into a query of the
    form ``index="term"``; ``max_records`` is passed as ``maximumRecords``.
    The request uses a 15 second timeout and the body is returned as text.
    """
    query_params: dict[str, Union[str, int]] = {
        "query": f'{search_index}="{search_term}"',
        "version": "1.1",
        "operation": "searchRetrieve",
        "recordSchema": "isni-b",
        "maximumRecords": max_records,
        "startRecord": "1",
        "recordPacking": "xml",
        "sortKeys": "RLV,pica,0,,",
    }
    response = requests.get(
        "http://isni.oclc.org/sru/",
        params=query_params,
        timeout=15,
    )
    # The OCLC ISNI server declares the payload as latin1, but it is
    # really UTF-8, so override the encoding before decoding the body.
    response.encoding = "utf-8"
    return response.text
def make_name_string(element: ET.Element) -> str:
    """Build a string of the form 'forename surname' from a name element

    The first ``forename`` and ``surname`` descendants of ``element`` are
    used. A part that is missing or has no text is left out, so the
    result never has a leading or trailing space.
    """
    # NOTE: this will often be incorrect, many naming systems
    # list "surname" before personal name
    name_parts = (
        element.findtext(".//forename", default=""),
        element.findtext(".//surname", default=""),
    )
    # joining only the non-empty parts adds the space only when both exist
    return " ".join(part for part in name_parts if part)
def _find_coded_value(
    sections: list[ET.Element], code_path: str, value_path: str, code: str
) -> Optional[str]:
    """Return the value text of the first section whose code text matches

    A section matches when the element at ``code_path`` has text equal to
    ``code`` ignoring case, and the element at ``value_path`` exists and
    has text. Returns ``None`` when no section matches.
    """
    wanted = code.lower()
    for section in sections:
        code_element = section.find(code_path)
        if code_element is None or code_element.text is None:
            continue
        if code_element.text.lower() != wanted:
            continue
        value_element = section.find(value_path)
        if value_element is not None and value_element.text is not None:
            return value_element.text
    return None


def get_other_identifier(element: ET.Element, code: str) -> str:
    """Get other identifiers associated with an author from their ISNI record

    ``code`` names the identifier type (for example ``"viaf"``) and is
    compared case-insensitively in both lookups, so ``"viaf"`` and
    ``"VIAF"`` find the same entry.

    Returns the identifier text, or ``""`` when the record has none.
    """
    identifier = _find_coded_value(
        element.findall(".//otherIdentifierOfIdentity"),
        ".//type",
        ".//identifier",
        code,
    )
    if identifier is not None:
        return identifier

    # if we can't find it in otherIdentifierOfIdentity,
    # try sources
    source_identifier = _find_coded_value(
        element.findall(".//sources"),
        ".//codeOfSource",
        ".//sourceIdentifier",
        code,
    )
    return "" if source_identifier is None else source_identifier
def get_external_information_uri(element: ET.Element, match_string: str) -> str:
    """Get a URL associated with an author from their ISNI record

    Scans the ``externalInformation`` sections and returns the ``URI``
    text of the first one whose ``information`` text equals
    ``match_string`` ignoring case. Sections lacking either piece of text
    are skipped. Returns ``""`` when nothing matches.
    """
    for source in element.findall(".//externalInformation"):
        uri = source.find(".//URI")
        if uri is None or uri.text is None:
            continue
        information = source.find(".//information")
        if information is None or information.text is None:
            continue
        if information.text.lower() == match_string.lower():
            return uri.text
    return ""
def _pick_title_for_bio(record: ET.Element) -> str:
    """Choose a work title from an ISNI record to use as an author bio

    Returns the first usable title with every "@" removed, or ``""`` when
    the record has no usable title.
    """
    candidates: list[Optional[ET.Element]] = []
    # prefer title records from LoC+ coop, Australia, Ireland, or Singapore
    # in that order; for each one check the source attribute, then subsource
    for source in ("LCNACO", "NLA", "N6I", "NLB"):
        for attribute in ("source", "subsource"):
            for parent in record.findall(
                f'.//titleOfWork[@{attribute}="{source}"]'
            ):
                candidates.append(parent.find(".//title"))
    # otherwise just grab the first title listing
    candidates.append(record.find(".//title"))

    for title in candidates:
        if title is None or title.text is None:
            continue
        # some of the "titles" in ISNI are a little ...iffy
        # @ is used by ISNI/OCLC to index the starting point ignoring stop words
        # (e.g. "The @Government of no one")
        cleaned = title.text.replace("@", "")
        # skip titles that are empty or purely numeric once "@" is removed
        if cleaned and not cleaned.isnumeric():
            return cleaned
    return ""


def find_authors_by_name(
    name_string: str, description: bool = False
) -> list[activitypub.Author]:
    """Query the ISNI database for possible author matches by name

    Each ``responseRecord`` in the search result that contains a forename
    is looked up again by its ``isniUnformatted`` value to build an author.
    Records without a forename, or whose lookup returns nothing, are
    skipped.

    When ``description`` is truthy, each author's ``bio`` is replaced by a
    work title taken from the search record (``""`` if none is usable).
    """
    payload = request_isni_data("pica.na", name_string)
    # parse xml
    root = ET.fromstring(payload)

    # build list of possible authors
    possible_authors = []
    for element in root.iter("responseRecord"):
        # Compare against None explicitly: the truth value of an Element
        # reflects its number of children and testing it is deprecated.
        if element.find(".//forename/..") is None:
            continue

        author = get_author_from_isni(
            get_element_text(element.find(".//isniUnformatted"))
        )
        if author is None:
            continue

        if description:
            author.bio = _pick_title_for_bio(element)

        possible_authors.append(author)

    return possible_authors
def get_author_from_isni(isni: str) -> Optional[activitypub.Author]:
    """Find data to populate a new author record from their ISNI

    Requests the record for ``isni`` and builds an ``activitypub.Author``
    from its first ``responseRecord``. Returns ``None`` when the response
    contains no ``responseRecord``.
    """
    # parse xml
    root = ET.fromstring(request_isni_data("pica.isn", isni))

    # there should only be a single responseRecord
    # but let's use the first one just in case
    record = root.find(".//responseRecord")
    if record is None:
        return None

    # the parent of the first forename holds both parts of the name
    name_holder = record.find(".//forename/..")
    name = "" if name_holder is None else make_name_string(name_holder)

    # use a set to dedupe aliases in ISNI
    unique_aliases = {
        make_name_string(variant)
        for variant in record.findall(".//personalNameVariant")
    }

    return activitypub.Author(
        id=get_element_text(record.find(".//isniURI")),
        name=name,
        isni=isni,
        viafId=get_other_identifier(record, "viaf"),
        # aliases needs to be list not set
        aliases=list(unique_aliases),
        bio=get_element_text(record.find(".//nameTitle")),
        wikipediaLink=get_external_information_uri(record, "Wikipedia"),
    )
def build_author_from_isni(match_value: str) -> dict[str, activitypub.Author]:
    """Build basic author class object from ISNI URL

    Returns ``{"author": <author>}`` when ``match_value`` is an
    ``https://isni.org/isni/`` URL and the lookup succeeds, otherwise an
    empty dict.
    """
    isni_url_prefix = "https://isni.org/isni/"
    if not match_value.startswith(isni_url_prefix):
        # not an isni value, so it's a name string
        return {}

    # strip the URL prefix to get the bare isni value, then get the data
    author = get_author_from_isni(match_value.replace(isni_url_prefix, ""))
    return {} if author is None else {"author": author}
def augment_author_metadata(author: models.Author, isni: str) -> None:
    """Update any missing author fields from ISNI data

    Does nothing when no ISNI record is found. Otherwise the ISNI data is
    applied to ``author`` with ``overwrite=False``, the aliases from both
    sides are merged, and the author is saved.
    """
    isni_author = get_author_from_isni(isni)
    if isni_author is None:
        return

    isni_author.to_model(model=models.Author, instance=author, overwrite=False)

    # we DO want to overwrite aliases because we're adding them to the
    # existing aliases and ISNI will usually have more.
    # We need to dedupe because ISNI records often have lots of dupe aliases
    merged_aliases = {*isni_author.aliases, *author.aliases}
    author.aliases = list(merged_aliases)
    author.save()