moviewyrm/bookwyrm/utils/isni.py

"""ISNI author checking utilities"""
import xml.etree.ElementTree as ET
from urllib.parse import quote_plus

import requests


def url_stringify(string):
    """Encode a string for use as a URL query value

    Spaces become ``+`` and every other reserved or non-ASCII character is
    percent-encoded (as UTF-8), so characters such as ``&``, ``=`` or ``"``
    in a search term cannot break out of the query parameter they are
    placed in.

    Args:
        string: the raw text to encode.

    Returns:
        The encoded text.
    """

    return quote_plus(string)


def request_isni_data(search_index, search_term, max_records=5):
    """Request data from the ISNI API

    Builds an SRU ``searchRetrieve`` query for isni.oclc.org and returns the
    body of the response.

    Args:
        search_index: name of the index to search; this module passes
            "pica.na" for name searches and "pica.isn" for ISNI lookups.
        search_term: the value to look for in that index.
        max_records: upper bound on the number of records requested.

    Returns:
        The response body as text, decoded as UTF-8.

    Raises:
        requests.exceptions.RequestException: if the request fails or does
            not complete within the timeout.
    """

    search_string = url_stringify(search_term)
    query_parts = [
        "http://isni.oclc.org/sru/?query=",
        search_index,
        "+%3D+%22",
        search_string,
        "%22&version=1.1&operation=searchRetrieve&recordSchema=isni-b",
        "&maximumRecords=",
        str(max_records),
        "&startRecord=1&recordPacking=xml&sortKeys=RLV%2Cpica%2C0%2C%2C",
    ]
    query_url = "".join(query_parts)
    # requests never times out by default; without an explicit limit a
    # stalled ISNI server would block the caller indefinitely
    result = requests.get(query_url, timeout=15)
    # the OCLC ISNI server asserts the payload is encoded
    # in latin1, but we know better
    result.encoding = "utf-8"
    return result.text


def make_name_string(element):
    """create a string of form 'personal_name surname'

    Looks up the first ``forename`` and ``surname`` descendants of
    ``element`` and joins whichever of them carry text with a single space.

    Args:
        element: an ElementTree element containing the name parts.

    Returns:
        "forename surname" when both parts have text, the single available
        part when only one does, and "" when neither does.
    """

    # NOTE: this will often be incorrect, many naming systems
    # list "surname" before personal name
    forename = element.find(".//forename")
    surname = element.find(".//surname")
    # either node may be absent, or present but empty (``.text`` is None);
    # skip those instead of failing on ``None.text`` or joining ``None``
    parts = [
        node.text for node in (forename, surname) if node is not None and node.text
    ]
    return " ".join(parts)


def get_other_identifier(element, code):
    """Look up one of the author's non-ISNI identifiers in an ISNI record

    Walks every ``otherIdentifierOfIdentity`` section under ``element`` and
    returns the ``identifier`` text of the first section whose ``type`` text
    equals ``code`` and which also has an ``identifier`` node. Returns ""
    when no section qualifies.
    """

    for section in element.findall(".//otherIdentifierOfIdentity"):
        kind = section.find(".//type")
        if kind is None or kind.text != code:
            # wrong kind of identifier (or no type given at all)
            continue
        identifier = section.find(".//identifier")
        if identifier is None:
            # right type but nothing to return; keep looking
            continue
        return identifier.text
    return ""


def get_external_information_uri(element, match_string):
    """Get URLs associated with an author from their ISNI record

    Scans the ``URI`` of every ``externalInformation`` section under
    ``element`` and returns the first one that contains ``match_string``.

    Args:
        element: the ElementTree element of the ISNI record.
        match_string: text the URI has to contain, compared
            case-insensitively (e.g. "Wikipedia").

    Returns:
        The text of the first matching URI, or "" if none matches.
    """

    # compare in lower case: callers pass labels such as "Wikipedia"
    # while host names in URIs are normally written in lower case
    needle = match_string.lower()
    for source in element.findall(".//externalInformation"):
        uri = source.find(".//URI")
        if uri is None or not uri.text:
            continue
        # str.find() returns -1 (never None) when there is no match, so
        # use a real containment test to decide whether this URI qualifies
        if needle in uri.text.lower():
            return uri.text
    return ""


def find_authors_by_name(name_string):
    """Query the ISNI database for possible author matches by name

    Args:
        name_string: the author name to search for.

    Returns:
        A list with one dict per usable ``responseRecord``. Each dict has
        the keys "isni", "uri" and "name", plus "bio" when the record has a
        ``nameTitle`` node. Records without a personal name (no ``forename``
        node) or without an ``isniUnformatted`` node are left out.

    Raises:
        xml.etree.ElementTree.ParseError: if the response is not valid XML.
    """

    payload = request_isni_data("pica.na", name_string)
    # parse xml
    root = ET.fromstring(payload)
    # build list of possible authors
    possible_authors = []
    for element in root.iter("responseRecord"):
        # the parent of the first forename node holds the personal name
        personal_name = element.find(".//forename/..")
        # compare against None explicitly: the truth value of an Element
        # reflects whether it has children, not whether it was found
        if personal_name is None:
            continue

        isni = element.find(".//isniUnformatted")
        if isni is None:
            # a suggestion without an ISNI cannot be selected later,
            # and must not abort the whole search
            continue

        uri = element.find(".//isniURI")
        bio = element.find(".//nameTitle")

        author = {
            "isni": isni.text,
            "uri": uri.text if uri is not None else "",
            "name": make_name_string(personal_name),
        }
        if bio is not None:
            author["bio"] = bio.text
        possible_authors.append(author)

    return possible_authors


def get_author_isni_data(isni):
    """Find data to populate a new author record from their ISNI

    Args:
        isni: the ISNI to look up.

    Returns:
        A dict with the keys "isni", "name", "viaf_id", "wikipedia_link",
        "bio" and "aliases". "aliases" is a list of unique name variants in
        the order they appear in the record.

    Raises:
        xml.etree.ElementTree.ParseError: if the response is not valid XML.
        AttributeError: if the response holds no ``responseRecord``, or the
            record has no ``forename`` node to build a name from.
    """

    payload = request_isni_data("pica.isn", isni)
    # parse xml
    root = ET.fromstring(payload)
    # there should only be a single responseRecord
    # but let's use the first one just in case
    element = root.find(".//responseRecord")
    # the parent of the first forename node holds the personal name
    personal_name = element.find(".//forename/..")
    bio = element.find(".//nameTitle")
    variants = [
        make_name_string(entry)
        for entry in element.findall(".//personalNameVariant")
    ]
    return {
        "isni": isni,
        "name": make_name_string(personal_name),
        "viaf_id": get_other_identifier(element, "viaf"),
        "wikipedia_link": get_external_information_uri(element, "Wikipedia"),
        "bio": bio.text if bio is not None else "",
        # dedupe aliases; dict keys keep first-seen order, so the result
        # is the same on every run (a set would order them arbitrarily)
        "aliases": list(dict.fromkeys(variants)),
    }


def build_author_dict(match_value):
    """Turn a selected match value into a dict of basic author details

    A value starting with "isni_match_" carries an ISNI, and the details
    are fetched via ``get_author_isni_data``. Any other value is taken to
    be the author's name and is returned as ``{"name": match_value}``.
    """

    marker = "isni_match_"
    if not match_value.startswith(marker):
        # plain name string, nothing to look up
        return {"name": match_value}

    # what remains after removing the marker is the ISNI itself
    return get_author_isni_data(match_value.replace(marker, ""))


def augment_author_metadata(author, isni):
    """Update any missing author fields from ISNI data

    Fills ``viaf_id``, ``wikipedia_link`` and ``bio`` from the ISNI record
    when the author has no value for them, adds the record's aliases to the
    author's own, and saves the author.

    Args:
        author: the author object to update; must provide the attributes
            named above, ``aliases`` and a ``save()`` method.
        isni: the ISNI whose record supplies the missing data.
    """
    isni_data = get_author_isni_data(isni)
    # test truthiness, not len(): an unset field may be None as well as ""
    if not author.viaf_id:
        author.viaf_id = isni_data["viaf_id"]
    if not author.wikipedia_link:
        author.wikipedia_link = isni_data["wikipedia_link"]
    if not author.bio:
        author.bio = isni_data["bio"]
    # keep the author's own aliases first, then append the new ones from
    # ISNI; dict keys drop duplicates while preserving that order
    combined = list(author.aliases or []) + list(isni_data["aliases"])
    author.aliases = list(dict.fromkeys(combined))
    author.save()
code formatting 2021-10-29 10:14:32 +00:00			`"""ISNI author checking utilities"""`
isni author lookup utility 2021-10-29 05:12:31 +00:00			`import xml.etree.ElementTree as ET`
code formatting 2021-10-29 10:14:32 +00:00			`import requests`
isni author lookup utility 2021-10-29 05:12:31 +00:00
hide isni authors if isni in local db Check the authors suggested from the local DB for a matching ISNI when pulling authors from ISNI. i.e. we do not want to suggest the same author twice when we know it is the same author. 2021-10-31 09:48:47 +00:00
isni author lookup utility 2021-10-29 05:12:31 +00:00			`def url_stringify(string):`
code formatting 2021-10-29 10:14:32 +00:00			`"""replace spaces for url encoding"""`
populate new authors with isni data If a user selects an author pulled from the ISNI service when editing a book, use any relevant data from ISNI to populate the new author record. This includes - bio - aliases - isni - wikipedia url - viaf 2021-10-31 06:58:15 +00:00
			`# TODO: this is very lazy and incomplete`
isni author lookup utility 2021-10-29 05:12:31 +00:00			`return string.replace(" ", "+")`

hide isni authors if isni in local db Check the authors suggested from the local DB for a matching ISNI when pulling authors from ISNI. i.e. we do not want to suggest the same author twice when we know it is the same author. 2021-10-31 09:48:47 +00:00
make pylint happy 2021-10-31 23:20:19 +00:00			`def request_isni_data(search_index, search_term, max_records=5):`
populate new authors with isni data If a user selects an author pulled from the ISNI service when editing a book, use any relevant data from ISNI to populate the new author record. This includes - bio - aliases - isni - wikipedia url - viaf 2021-10-31 06:58:15 +00:00			`"""Request data from the ISNI API"""`
isni author lookup utility 2021-10-29 05:12:31 +00:00
populate new authors with isni data If a user selects an author pulled from the ISNI service when editing a book, use any relevant data from ISNI to populate the new author record. This includes - bio - aliases - isni - wikipedia url - viaf 2021-10-31 06:58:15 +00:00			`search_string = url_stringify(search_term)`
			`query_parts = [`
hide isni authors if isni in local db Check the authors suggested from the local DB for a matching ISNI when pulling authors from ISNI. i.e. we do not want to suggest the same author twice when we know it is the same author. 2021-10-31 09:48:47 +00:00			`"http://isni.oclc.org/sru/?query=",`
populate new authors with isni data If a user selects an author pulled from the ISNI service when editing a book, use any relevant data from ISNI to populate the new author record. This includes - bio - aliases - isni - wikipedia url - viaf 2021-10-31 06:58:15 +00:00			`search_index,`
			`"+%3D+%22",`
			`search_string,`
			`"%22&version=1.1&operation=searchRetrieve&recordSchema=isni-b",`
			`"&maximumRecords=",`
make pylint happy 2021-10-31 23:20:19 +00:00			`str(max_records),`
hide isni authors if isni in local db Check the authors suggested from the local DB for a matching ISNI when pulling authors from ISNI. i.e. we do not want to suggest the same author twice when we know it is the same author. 2021-10-31 09:48:47 +00:00			`"&startRecord=1&recordPacking=xml&sortKeys=RLV%2Cpica%2C0%2C%2C",`
			`]`
populate new authors with isni data If a user selects an author pulled from the ISNI service when editing a book, use any relevant data from ISNI to populate the new author record. This includes - bio - aliases - isni - wikipedia url - viaf 2021-10-31 06:58:15 +00:00			`query_url = "".join(query_parts)`
			`result = requests.get(query_url)`
fix encoding The OCLC server claims that the xml payload is encoded as latin1 (ISO-8859-1). This causes Requests to incorrectly encode things as latin1, when actually everything is (thank goodness) UTF-8. We can fix it by just telling Requests that it is really UTF-8 With thanks to Tex Texin, creator of http://i18nqa.com/debug/utf8-debug.html 2021-10-29 10:00:35 +00:00			`# the OCLC ISNI server asserts the payload is encoded`
			`# in latin1, but we know better`
code formatting 2021-10-29 10:14:32 +00:00			`result.encoding = "utf-8"`
populate new authors with isni data If a user selects an author pulled from the ISNI service when editing a book, use any relevant data from ISNI to populate the new author record. This includes - bio - aliases - isni - wikipedia url - viaf 2021-10-31 06:58:15 +00:00			`return result.text`

hide isni authors if isni in local db Check the authors suggested from the local DB for a matching ISNI when pulling authors from ISNI. i.e. we do not want to suggest the same author twice when we know it is the same author. 2021-10-31 09:48:47 +00:00
populate new authors with isni data If a user selects an author pulled from the ISNI service when editing a book, use any relevant data from ISNI to populate the new author record. This includes - bio - aliases - isni - wikipedia url - viaf 2021-10-31 06:58:15 +00:00			`def make_name_string(element):`
			`"""create a string of form 'personal_name surname'"""`

hide isni authors if isni in local db Check the authors suggested from the local DB for a matching ISNI when pulling authors from ISNI. i.e. we do not want to suggest the same author twice when we know it is the same author. 2021-10-31 09:48:47 +00:00			`# NOTE: this will often be incorrect, many naming systems`
populate new authors with isni data If a user selects an author pulled from the ISNI service when editing a book, use any relevant data from ISNI to populate the new author record. This includes - bio - aliases - isni - wikipedia url - viaf 2021-10-31 06:58:15 +00:00			`# list "surname" before personal name`
			`forename = element.find(".//forename")`
			`surname = element.find(".//surname")`
			`if forename is not None:`
hide isni authors if isni in local db Check the authors suggested from the local DB for a matching ISNI when pulling authors from ISNI. i.e. we do not want to suggest the same author twice when we know it is the same author. 2021-10-31 09:48:47 +00:00			`return "".join([forename.text, " ", surname.text])`
populate new authors with isni data If a user selects an author pulled from the ISNI service when editing a book, use any relevant data from ISNI to populate the new author record. This includes - bio - aliases - isni - wikipedia url - viaf 2021-10-31 06:58:15 +00:00			`return surname.text`

hide isni authors if isni in local db Check the authors suggested from the local DB for a matching ISNI when pulling authors from ISNI. i.e. we do not want to suggest the same author twice when we know it is the same author. 2021-10-31 09:48:47 +00:00
populate new authors with isni data If a user selects an author pulled from the ISNI service when editing a book, use any relevant data from ISNI to populate the new author record. This includes - bio - aliases - isni - wikipedia url - viaf 2021-10-31 06:58:15 +00:00			`def get_other_identifier(element, code):`
			`"""Get other identifiers associated with an author from their ISNI record"""`

			`identifiers = element.findall(".//otherIdentifierOfIdentity")`
			`for section_head in identifiers:`
			`if (`
hide isni authors if isni in local db Check the authors suggested from the local DB for a matching ISNI when pulling authors from ISNI. i.e. we do not want to suggest the same author twice when we know it is the same author. 2021-10-31 09:48:47 +00:00			`section_head.find(".//type") is not None`
populate new authors with isni data If a user selects an author pulled from the ISNI service when editing a book, use any relevant data from ISNI to populate the new author record. This includes - bio - aliases - isni - wikipedia url - viaf 2021-10-31 06:58:15 +00:00			`and section_head.find(".//type").text == code`
hide isni authors if isni in local db Check the authors suggested from the local DB for a matching ISNI when pulling authors from ISNI. i.e. we do not want to suggest the same author twice when we know it is the same author. 2021-10-31 09:48:47 +00:00			`and section_head.find(".//identifier") is not None`
populate new authors with isni data If a user selects an author pulled from the ISNI service when editing a book, use any relevant data from ISNI to populate the new author record. This includes - bio - aliases - isni - wikipedia url - viaf 2021-10-31 06:58:15 +00:00			`):`
			`return section_head.find(".//identifier").text`
make pylint happy 2021-10-31 23:20:19 +00:00			`return ""`
hide isni authors if isni in local db Check the authors suggested from the local DB for a matching ISNI when pulling authors from ISNI. i.e. we do not want to suggest the same author twice when we know it is the same author. 2021-10-31 09:48:47 +00:00
populate new authors with isni data If a user selects an author pulled from the ISNI service when editing a book, use any relevant data from ISNI to populate the new author record. This includes - bio - aliases - isni - wikipedia url - viaf 2021-10-31 06:58:15 +00:00
			`def get_external_information_uri(element, match_string):`
			`"""Get URLs associated with an author from their ISNI record"""`

			`sources = element.findall(".//externalInformation")`
			`for source in sources:`
			`uri = source.find(".//URI")`
hide isni authors if isni in local db Check the authors suggested from the local DB for a matching ISNI when pulling authors from ISNI. i.e. we do not want to suggest the same author twice when we know it is the same author. 2021-10-31 09:48:47 +00:00			`if uri is not None and uri.text.find(match_string) is not None:`
			`return uri.text`
make pylint happy 2021-10-31 23:20:19 +00:00			`return ""`
populate new authors with isni data If a user selects an author pulled from the ISNI service when editing a book, use any relevant data from ISNI to populate the new author record. This includes - bio - aliases - isni - wikipedia url - viaf 2021-10-31 06:58:15 +00:00
hide isni authors if isni in local db Check the authors suggested from the local DB for a matching ISNI when pulling authors from ISNI. i.e. we do not want to suggest the same author twice when we know it is the same author. 2021-10-31 09:48:47 +00:00
populate new authors with isni data If a user selects an author pulled from the ISNI service when editing a book, use any relevant data from ISNI to populate the new author record. This includes - bio - aliases - isni - wikipedia url - viaf 2021-10-31 06:58:15 +00:00			`def find_authors_by_name(name_string):`
			`"""Query the ISNI database for possible author matches by name"""`

			`payload = request_isni_data("pica.na", name_string)`
fix encoding The OCLC server claims that the xml payload is encoded as latin1 (ISO-8859-1). This causes Requests to incorrectly encode things as latin1, when actually everything is (thank goodness) UTF-8. We can fix it by just telling Requests that it is really UTF-8 With thanks to Tex Texin, creator of http://i18nqa.com/debug/utf8-debug.html 2021-10-29 10:00:35 +00:00			`# parse xml`
isni author lookup utility 2021-10-29 05:12:31 +00:00			`root = ET.fromstring(payload)`
			`# build list of possible authors`
			`possible_authors = []`
code formatting 2021-10-29 10:14:32 +00:00			`for element in root.iter("responseRecord"):`
isni author lookup utility 2021-10-29 05:12:31 +00:00
code formatting 2021-10-29 10:14:32 +00:00			`personal_name = element.find(".//forename/..")`
populate new authors with isni data If a user selects an author pulled from the ISNI service when editing a book, use any relevant data from ISNI to populate the new author record. This includes - bio - aliases - isni - wikipedia url - viaf 2021-10-31 06:58:15 +00:00			`bio = element.find(".//nameTitle")`

			`if not personal_name:`
			`continue`
isni author lookup utility 2021-10-29 05:12:31 +00:00
populate new authors with isni data If a user selects an author pulled from the ISNI service when editing a book, use any relevant data from ISNI to populate the new author record. This includes - bio - aliases - isni - wikipedia url - viaf 2021-10-31 06:58:15 +00:00			`author = {}`
			`author["isni"] = element.find(".//isniUnformatted").text`
			`author["uri"] = element.find(".//isniURI").text`
			`author["name"] = make_name_string(personal_name)`
			`if bio is not None:`
			`author["bio"] = bio.text`
			`possible_authors.append(author)`
isni author lookup utility 2021-10-29 05:12:31 +00:00
			`return possible_authors`
populate new authors with isni data If a user selects an author pulled from the ISNI service when editing a book, use any relevant data from ISNI to populate the new author record. This includes - bio - aliases - isni - wikipedia url - viaf 2021-10-31 06:58:15 +00:00
hide isni authors if isni in local db Check the authors suggested from the local DB for a matching ISNI when pulling authors from ISNI. i.e. we do not want to suggest the same author twice when we know it is the same author. 2021-10-31 09:48:47 +00:00
populate new authors with isni data If a user selects an author pulled from the ISNI service when editing a book, use any relevant data from ISNI to populate the new author record. This includes - bio - aliases - isni - wikipedia url - viaf 2021-10-31 06:58:15 +00:00			`def get_author_isni_data(isni):`
make pylint happy 2021-10-31 23:20:19 +00:00			`"""Find data to populate a new author record from their ISNI"""`
populate new authors with isni data If a user selects an author pulled from the ISNI service when editing a book, use any relevant data from ISNI to populate the new author record. This includes - bio - aliases - isni - wikipedia url - viaf 2021-10-31 06:58:15 +00:00
			`payload = request_isni_data("pica.isn", isni)`
			`# parse xml`
			`root = ET.fromstring(payload)`
			`# there should only be a single responseRecord`
			`# but let's use the first one just in case`
			`element = root.find(".//responseRecord")`
			`personal_name = element.find(".//forename/..")`
			`bio = element.find(".//nameTitle")`
			`author = {}`
			`author["isni"] = isni`
			`author["name"] = make_name_string(personal_name)`
			`author["viaf_id"] = get_other_identifier(element, "viaf")`
			`author["wikipedia_link"] = get_external_information_uri(element, "Wikipedia")`
			`author["bio"] = bio.text if bio is not None else ""`
hide isni authors if isni in local db Check the authors suggested from the local DB for a matching ISNI when pulling authors from ISNI. i.e. we do not want to suggest the same author twice when we know it is the same author. 2021-10-31 09:48:47 +00:00			`author["aliases"] = []`
populate new authors with isni data If a user selects an author pulled from the ISNI service when editing a book, use any relevant data from ISNI to populate the new author record. This includes - bio - aliases - isni - wikipedia url - viaf 2021-10-31 06:58:15 +00:00			`aliases = element.findall(".//personalNameVariant")`
			`for entry in aliases:`
hide isni authors if isni in local db Check the authors suggested from the local DB for a matching ISNI when pulling authors from ISNI. i.e. we do not want to suggest the same author twice when we know it is the same author. 2021-10-31 09:48:47 +00:00			`author["aliases"].append(make_name_string(entry))`
Update existing authors when isni data available When an existing author is selected as a new author when editing a book, if they have an ISNI ID recorded we check the record and augment the local database record from the ISNI data. Also dedupes author aliases for this feature and when adding a completely new author. 2021-11-01 08:50:49 +00:00			`# dedupe aliases`
			`author["aliases"] = list(set(author["aliases"]))`
populate new authors with isni data If a user selects an author pulled from the ISNI service when editing a book, use any relevant data from ISNI to populate the new author record. This includes - bio - aliases - isni - wikipedia url - viaf 2021-10-31 06:58:15 +00:00			`return author`
select correct isni record when adding authors The original implementation of this was so, so broken. Now it's not. 2021-11-01 00:34:32 +00:00
code cleanup 2021-11-01 00:39:37 +00:00
select correct isni record when adding authors The original implementation of this was so, so broken. Now it's not. 2021-11-01 00:34:32 +00:00			`def build_author_dict(match_value):`
lint code 2021-11-01 00:58:08 +00:00			`"""Build dict with basic author details from ISNI or author name"""`
select correct isni record when adding authors The original implementation of this was so, so broken. Now it's not. 2021-11-01 00:34:32 +00:00
			`# if it is an isni value get the data`
			`if match_value.startswith("isni_match_"):`
			`isni = match_value.replace("isni_match_", "")`
			`return get_author_isni_data(isni)`
			`# otherwise it's a name string`
code cleanup 2021-11-01 00:39:37 +00:00			`return {"name": match_value}`
Update existing authors when isni data available When an existing author is selected as a new author when editing a book, if they have an ISNI ID recorded we check the record and augment the local database record from the ISNI data. Also dedupes author aliases for this feature and when adding a completely new author. 2021-11-01 08:50:49 +00:00

			`def augment_author_metadata(author, isni):`
			`"""Update any missing author fields from ISNI data"""`
			`isni_data = get_author_isni_data(isni)`
			`author.viaf_id = (`
			`isni_data["viaf_id"] if len(author.viaf_id) == 0 else author.viaf_id`
			`)`
			`author.wikipedia_link = (`
			`isni_data["wikipedia_link"]`
			`if len(author.wikipedia_link) == 0`
			`else author.wikipedia_link`
			`)`
			`author.bio = isni_data["bio"] if len(author.bio) == 0 else author.bio`
			`aliases = set(isni_data["aliases"])`
			`for x in author.aliases:`
			`aliases.add(x)`
			`author.aliases = list(aliases)`
			`author.save()`