bookwyrm/bookwyrm/utils/isni.py

"""ISNI author checking utilities"""
import xml.etree.ElementTree as ET
from typing import Union, Optional

import requests

from bookwyrm import activitypub, models


def get_element_text(element: Optional[ET.Element]) -> str:
    """Return the text of an element, or an empty string if there is none

    The empty string is returned both when ``element`` itself is None and
    when the element exists but its ``text`` attribute is None.
    """
    if element is None:
        return ""
    text = element.text
    return "" if text is None else text


def request_isni_data(search_index: str, search_term: str, max_records: int = 5) -> str:
    """Fetch raw search results from the ISNI SRU endpoint

    Sends a ``searchRetrieve`` request whose query is
    ``search_index="search_term"`` and asks for at most ``max_records``
    records. The body of the response is returned as text; no status
    checking or parsing is done here.
    """

    endpoint = "http://isni.oclc.org/sru/"
    query_params: dict[str, Union[str, int]] = dict(
        query=f'{search_index}="{search_term}"',
        version="1.1",
        operation="searchRetrieve",
        recordSchema="isni-b",
        maximumRecords=max_records,
        startRecord="1",
        recordPacking="xml",
        sortKeys="RLV,pica,0,,",
    )
    response = requests.get(endpoint, params=query_params, timeout=15)
    # the server declares latin1 for the payload, but the content is
    # really utf-8, so override the encoding before decoding the body
    response.encoding = "utf-8"
    return response.text


def make_name_string(element: ET.Element) -> str:
    """Build a name of the form 'forename surname' from an element

    The first ``forename`` and ``surname`` descendants of ``element`` are
    used. A missing or empty part is left out, so the result may be just
    one of the two, or an empty string when neither is present.
    """

    # NOTE: this will often be incorrect, many naming systems
    # list "surname" before personal name
    name_parts = (
        get_element_text(element.find(".//forename")),
        get_element_text(element.find(".//surname")),
    )

    # joining only the non-empty parts avoids a stray leading/trailing space
    return " ".join(part for part in name_parts if part)


def get_other_identifier(element: ET.Element, code: str) -> str:
    """Get other identifiers associated with an author from their ISNI record

    The ``otherIdentifierOfIdentity`` sections of the record are searched
    first; if none of them yields a value, the ``sources`` sections are
    tried. In both places ``code`` is compared case-insensitively, so
    ``"viaf"`` matches a section labelled ``VIAF``.

    Returns the text of the first matching identifier, or an empty string
    if no section with that code carries a non-empty identifier.
    """

    wanted = code.lower()

    # (section path, path of the code label, path of the identifier value),
    # listed in order of preference
    lookups = (
        (".//otherIdentifierOfIdentity", ".//type", ".//identifier"),
        (".//sources", ".//codeOfSource", ".//sourceIdentifier"),
    )

    for section_path, label_path, value_path in lookups:
        for section in element.findall(section_path):
            label = section.find(label_path)
            if label is None or label.text is None:
                continue
            if label.text.lower() != wanted:
                continue
            value = section.find(value_path)
            # a matching section without a usable value is skipped so that
            # a later section (or the fallback lookup) can still answer
            if value is not None and value.text is not None:
                return value.text

    return ""


def get_external_information_uri(element: ET.Element, match_string: str) -> str:
    """Get the URL of a named external resource from an ISNI record

    Each ``externalInformation`` section is inspected in document order.
    The URI of the first section whose ``information`` label equals
    ``match_string`` (ignoring case) is returned. Sections lacking either
    a label or a URI are ignored. Returns "" when nothing matches.
    """

    for entry in element.iterfind(".//externalInformation"):
        label = entry.find(".//information")
        link = entry.find(".//URI")
        if link is None or link.text is None:
            continue
        if label is None or label.text is None:
            continue
        if label.text.lower() == match_string.lower():
            return link.text
    return ""


def _find_descriptive_title(element: ET.Element) -> str:
    """Pick a work title from an ISNI record to use as a short description

    Returns the first usable title with any "@" removed, or "" if the
    record has no usable title.
    """

    candidates = []
    # prefer title records from LoC+ coop, Australia, Ireland, or Singapore
    # in that order; for each of them look at @source before @subsource
    for source in ("LCNACO", "NLA", "N6I", "NLB"):
        for attribute in ("source", "subsource"):
            path = f'.//titleOfWork[@{attribute}="{source}"]'
            candidates.extend(
                parent.find(".//title") for parent in element.iterfind(path)
            )
    # otherwise just grab the first title listing
    candidates.append(element.find(".//title"))

    # some of the "titles" in ISNI are a little ...iffy
    # @ is used by ISNI/OCLC to index the starting point ignoring stop words
    # (e.g. "The @Government of no one")
    for title in candidates:
        if title is None or title.text is None:
            continue
        cleaned = title.text.replace("@", "")
        # purely numeric "titles" are not useful as a description
        if not cleaned.isnumeric():
            return cleaned
    return ""


def find_authors_by_name(
    name_string: str, description: bool = False
) -> list[activitypub.Author]:
    """Query the ISNI database for possible author matches by name

    Searches ISNI for ``name_string`` and, for every returned record that
    has a forename and an ISNI, fetches the full author data with
    ``get_author_from_isni``. Records for which no author can be built
    are left out.

    When ``description`` is true, each author's ``bio`` is replaced with
    the title of one of their works taken from the search record (or with
    an empty string when the record has no usable title).
    """

    payload = request_isni_data("pica.na", name_string)
    # parse xml
    root = ET.fromstring(payload)
    # build list of possible authors
    possible_authors = []
    for element in root.iter("responseRecord"):
        # records that have no forename anywhere are skipped. Compare with
        # None explicitly: truth-testing an Element is deprecated.
        if element.find(".//forename/..") is None:
            continue

        # without an ISNI there is nothing to look the author up by
        isni = get_element_text(element.find(".//isniUnformatted"))
        if not isni:
            continue

        author = get_author_from_isni(isni)
        if author is None:
            continue

        if description:
            author.bio = _find_descriptive_title(element)

        possible_authors.append(author)

    return possible_authors


def get_author_from_isni(isni: str) -> Optional[activitypub.Author]:
    """Find data to populate a new author record from their ISNI

    Requests the ISNI record for ``isni`` and builds an
    ``activitypub.Author`` from it: name, VIAF id, name variants as
    aliases, the ``nameTitle`` text as bio and the Wikipedia link. Fields
    missing from the record are set to an empty string (or an empty list
    for aliases).

    Returns None when the response contains no ``responseRecord``.
    """

    payload = request_isni_data("pica.isn", isni)
    # parse xml
    root = ET.fromstring(payload)
    # there should only be a single responseRecord
    # but let's use the first one just in case
    element = root.find(".//responseRecord")
    if element is None:
        return None

    # the element that directly contains the first forename holds the name
    name_parent = element.find(".//forename/..")
    name = make_name_string(name_parent) if name_parent is not None else ""

    # ISNI records repeat name variants a lot. dict.fromkeys dedupes them
    # while keeping first-seen order, so the result is stable between runs;
    # variants that produce an empty name are dropped.
    variant_names = (
        make_name_string(entry)
        for entry in element.iterfind(".//personalNameVariant")
    )
    aliases = list(dict.fromkeys(alias for alias in variant_names if alias))

    return activitypub.Author(
        id=get_element_text(element.find(".//isniURI")),
        name=name,
        isni=isni,
        viafId=get_other_identifier(element, "viaf"),
        aliases=aliases,
        bio=get_element_text(element.find(".//nameTitle")),
        wikipediaLink=get_external_information_uri(element, "Wikipedia"),
    )


def build_author_from_isni(match_value: str) -> dict[str, activitypub.Author]:
    """Build basic author class object from ISNI URL

    Returns ``{"author": <Author>}`` when ``match_value`` is an
    ``https://isni.org/isni/`` URL for which author data could be found,
    and an empty dict in every other case.
    """

    isni_url_prefix = "https://isni.org/isni/"

    # anything that is not an ISNI URL is treated as a plain name string
    if not match_value.startswith(isni_url_prefix):
        return {}

    found = get_author_from_isni(match_value.replace(isni_url_prefix, ""))
    return {} if found is None else {"author": found}


def augment_author_metadata(author: models.Author, isni: str) -> None:
    """Update any missing author fields from ISNI data

    Looks up ``isni``; if no ISNI author is found, ``author`` is left
    untouched. Otherwise the ISNI data is applied to ``author`` without
    overwriting existing values, the aliases of both are merged, and the
    author is saved.
    """

    isni_author = get_author_from_isni(isni)
    if isni_author is None:
        return

    isni_author.to_model(model=models.Author, instance=author, overwrite=False)

    # we DO want to overwrite aliases because we're adding them to the
    # existing aliases and ISNI will usually have more.
    # We need to dedupe because ISNI records often have lots of dupe aliases.
    # dict.fromkeys keeps first-seen order, so aliases already on the author
    # stay where they are and new ones are appended, instead of the whole
    # list being reshuffled arbitrarily on every call.
    author.aliases = list(dict.fromkeys([*author.aliases, *isni_author.aliases]))
    author.save()