Merge pull request #1581 from hughrun/isni-poc

Query ISNI database when adding authors
2025-04-16 15:24:05 +00:00 · 2021-11-23 14:30:49 -08:00 · 2021-11-23 14:30:49 -08:00 · a24fb5cd11
commit a24fb5cd11
parent 73abd2d342 d8e83ffe82
6 changed files with 305 additions and 16 deletions
--- a/bookwyrm/models/author.py
+++ b/bookwyrm/models/author.py
@ -27,7 +27,7 @@ class Author(BookDataModel):
    # idk probably other keys would be useful here?
    born = fields.DateTimeField(blank=True, null=True)
    died = fields.DateTimeField(blank=True, null=True)
-    name = fields.CharField(max_length=255, deduplication_field=True)
+    name = fields.CharField(max_length=255)
    aliases = fields.ArrayField(
        models.CharField(max_length=255), blank=True, default=list
    )
--- a/bookwyrm/templates/author/author.html
+++ b/bookwyrm/templates/author/author.html
@ -2,6 +2,7 @@
 {% load i18n %}
 {% load markdown %}
 {% load humanize %}
+{% load utilities %}

 {% block title %}{{ author.name }}{% endblock %}

@ -25,7 +26,7 @@
 <div class="block columns content" itemscope itemtype="https://schema.org/Person">
    <meta itemprop="name" content="{{ author.name }}">

-    {% if author.aliases or author.born or author.died or author.wikipedia_link or author.openlibrary_key or author.inventaire_id %}
+    {% if author.aliases or author.born or author.died or author.wikipedia_link or author.openlibrary_key or author.inventaire_id or author.isni %}
    <div class="column is-two-fifths">
        <div class="box py-2">
            <dl>
@ -63,6 +64,14 @@
            </p>
            {% endif %}

+            {% if author.isni %}
+            <p class="my-1">
+                <a itemprop="sameAs" href="https://isni.org/isni/{{ author.isni|remove_spaces }}" rel="noopener" target="_blank">
+                    {% trans "View ISNI record" %}
+                </a>
+            </p>
+            {% endif %}
+
            {% if author.openlibrary_key %}
            <p class="my-1">
                <a itemprop="sameAs" href="https://openlibrary.org/authors/{{ author.openlibrary_key }}" target="_blank" rel="noopener">
--- a/bookwyrm/templates/book/edit/edit_book.html
+++ b/bookwyrm/templates/book/edit/edit_book.html
@ -1,6 +1,7 @@
 {% extends 'layout.html' %}
 {% load i18n %}
 {% load humanize %}
+{% load utilities %}

 {% block title %}{% if book %}{% blocktrans with book_title=book.title %}Edit "{{ book_title }}"{% endblocktrans %}{% else %}{% trans "Add Book" %}{% endif %}{% endblock %}

@ -52,19 +53,29 @@
                    {% for author in author_matches %}
                    <fieldset>
                        <legend class="title is-5 mb-1">
-                            {% blocktrans with name=author.name %}Is "{{ name }}" an existing author?{% endblocktrans %}
+                            {% blocktrans with name=author.name %}Is "{{ name }}" one of these authors?{% endblocktrans %}
                        </legend>
                        {% with forloop.counter0 as counter %}
                        {% for match in author.matches %}
-                        <label class="label mb-2">
+                        <label class="label">
                            <input type="radio" name="author_match-{{ counter }}" value="{{ match.id }}" required>
                            {{ match.name }}
                        </label>
-                        <p class="help">
-                            <a href="{{ match.local_path }}" target="_blank">{% blocktrans with book_title=match.book_set.first.title %}Author of <em>{{ book_title }}</em>{% endblocktrans %}</a>
+                        <p class="help ml-5 mb-2">
+                            {% with book_title=match.book_set.first.title alt_title=match.bio %}
+                            {% if book_title %}
+                            <a href="{{ match.local_path }}" target="_blank">{% trans "Author of " %}<em>{{ book_title }}</em></a>
+                            {% else %} 
+                            <a href="{{ match.id }}" target="_blank">{% if alt_title %}{% trans "Author of " %}<em>{{ alt_title }}</em>{% else %} {% trans "Find more information at isni.org" %}{% endif %}</a>
+                            {% endif %}
+                            {% endwith %}
                        </p>
+                        <p class="help ml-5">
+                            {{ author.existing_isnis|get_isni_bio:match }}
+                        </p>
+                        {{ author.existing_isnis|get_isni:match }}
                        {% endfor %}
-                        <label class="label">
+                        <label class="label mt-2">
                            <input type="radio" name="author_match-{{ counter }}" value="{{ author.name }}" required> {% trans "This is a new author" %}
                        </label>
                        {% endwith %}
--- a/bookwyrm/templatetags/utilities.py
+++ b/bookwyrm/templatetags/utilities.py
@ -1,8 +1,11 @@
 """ template filters for really common utilities """
 import os
+import re
 from uuid import uuid4
 from django import template
+from django.utils.safestring import mark_safe
 from django.utils.translation import gettext_lazy as _
+from django.template.defaultfilters import stringfilter
 from django.templatetags.static import static


@ -66,3 +69,39 @@ def get_book_cover_thumbnail(book, size="medium", ext="jpg"):
        return cover_thumbnail.url
    except OSError:
        return static("images/no_cover.jpg")
+
+
+@register.filter(name="get_isni_bio")
+def get_isni_bio(existing, author):
+    """Returns the isni bio string if an existing author has an isni listed
+
+    existing: the ISNI records that duplicate authors already in the database
+    author: one candidate match; compared on the digits of its ``isni``
+
+    Returns safe HTML of the form "Author of <em>title</em>", or "" when no
+    record with a usable bio shares the author's ISNI.
+    """
+    # imported here so the filter does not rely on the module's import list
+    # pylint: disable=import-outside-toplevel
+    from django.utils.html import format_html
+
+    auth_isni = re.sub(r"\D", "", str(author.isni))
+    # an author without an ISNI normalises to "" and must never match
+    if not auth_isni:
+        return ""
+    for value in existing:
+        bio = getattr(value, "bio", None)
+        # a bio of None (no usable title found) is skipped rather than rendered
+        if bio and auth_isni == re.sub(r"\D", "", str(value.isni)):
+            # the bio text comes from the ISNI response, so it is escaped
+            # by format_html instead of being marked safe as-is
+            return format_html("Author of <em>{}</em>", bio)
+
+    return ""
+
+
+# pylint: disable=unused-argument
+@register.filter(name="get_isni", needs_autoescape=True)
+def get_isni(existing, author, autoescape=True):
+    """Returns the isni ID if an existing author has an ISNI listing
+
+    existing: the ISNI records that duplicate authors already in the database
+    author: one candidate match; compared on the digits of its ``isni``
+    autoescape: accepted because the filter is registered with
+        needs_autoescape; the values are always escaped
+
+    Returns a hidden form input named "isni-for-<author id>" holding the
+    ISNI, or "" when none of the records share the author's ISNI.
+    """
+    # imported here so the filter does not rely on the module's import list
+    # pylint: disable=import-outside-toplevel
+    from django.utils.html import format_html
+
+    auth_isni = re.sub(r"\D", "", str(author.isni))
+    # an author without an ISNI normalises to "" and must never match
+    if not auth_isni:
+        return ""
+    for value in existing:
+        if hasattr(value, "isni") and auth_isni == re.sub(r"\D", "", str(value.isni)):
+            # both values end up inside HTML attributes, so they are escaped
+            # by format_html instead of being marked safe as-is
+            return format_html(
+                '<input type="text" name="isni-for-{}" value="{}" hidden>',
+                author.id,
+                value.isni,
+            )
+    return ""
+
+
+@register.filter(name="remove_spaces")
+@stringfilter
+def remove_spaces(arg):
+    """Strips every whitespace character out of the value passed in"""
+    whitespace = re.compile(r"\s")
+    return whitespace.sub("", str(arg))
--- a/bookwyrm/utils/isni.py
+++ b/bookwyrm/utils/isni.py
@ -0,0 +1,183 @@
+"""ISNI author checking utilities"""
+import xml.etree.ElementTree as ET
+import requests
+
+from bookwyrm import activitypub, models
+
+
+def request_isni_data(search_index, search_term, max_records=5):
+    """Request data from the ISNI API
+
+    search_index: the index to query; this module uses "pica.na" for
+        names and "pica.isn" for ISNI numbers
+    search_term: the value to look up, sent as a quoted term
+    max_records: sent as the "maximumRecords" parameter
+
+    Returns the response body as text. The HTTP status is not checked here.
+    """
+
+    # the term is wrapped in double quotes, so a quote or backslash inside it
+    # is backslash-escaped to stop it from ending the term early
+    escaped_term = str(search_term).replace("\\", "\\\\").replace('"', '\\"')
+    search_string = f'{search_index}="{escaped_term}"'
+    query_params = {
+        "query": search_string,
+        "version": "1.1",
+        "operation": "searchRetrieve",
+        "recordSchema": "isni-b",
+        "maximumRecords": max_records,
+        "startRecord": "1",
+        "recordPacking": "xml",
+        "sortKeys": "RLV,pica,0,,",
+    }
+    # NOTE(review): this is plain http; confirm the endpoint is served over
+    # https and switch the scheme if so
+    result = requests.get("http://isni.oclc.org/sru/", params=query_params, timeout=10)
+    # the OCLC ISNI server asserts the payload is encoded
+    # in latin1, but we know better
+    result.encoding = "utf-8"
+    return result.text
+
+
+def make_name_string(element):
+    """create a string of form 'personal_name surname'
+
+    element: an element with ``forename`` and/or ``surname`` descendants
+
+    Returns the parts that are present joined by a single space, or "" when
+    the element has neither.
+    """
+
+    # NOTE: this will often be incorrect; many naming systems
+    # list "surname" before personal name
+    forename = element.find(".//forename")
+    surname = element.find(".//surname")
+    # either part can be missing or empty, so only join what is really there
+    parts = [
+        part.text for part in (forename, surname) if part is not None and part.text
+    ]
+    return " ".join(parts)
+
+
+def get_other_identifier(element, code):
+    """Get other identifiers associated with an author from their ISNI record
+
+    element: the element for an ISNI record
+    code: the identifier type to look for, e.g. "viaf"
+
+    Returns the identifier text, or "" when the record holds no usable one.
+    """
+
+    for section_head in element.findall(".//otherIdentifierOfIdentity"):
+        id_type = section_head.find(".//type")
+        identifier = section_head.find(".//identifier")
+        if (
+            id_type is not None
+            and id_type.text == code
+            and identifier is not None
+            and identifier.text
+        ):
+            return identifier.text
+
+    # if we can't find it in otherIdentifierOfIdentity,
+    # try sources
+    wanted = code.lower()
+    for source in element.findall(".//sources"):
+        code_of_source = source.find(".//codeOfSource")
+        source_identifier = source.find(".//sourceIdentifier")
+        # either child can be missing or empty; skip those entries
+        # instead of failing on them
+        if (
+            code_of_source is not None
+            and code_of_source.text
+            and code_of_source.text.lower() == wanted
+            and source_identifier is not None
+            and source_identifier.text
+        ):
+            return source_identifier.text
+
+    return ""
+
+
+def get_external_information_uri(element, match_string):
+    """Get URLs associated with an author from their ISNI record
+
+    element: the element for an ISNI record
+    match_string: the ``information`` label to look for, compared
+        case-insensitively (e.g. "Wikipedia")
+
+    Returns the text of the first matching ``URI``, or "" when there is none.
+    """
+
+    wanted = match_string.lower()
+    for source in element.findall(".//externalInformation"):
+        information = source.find(".//information")
+        uri = source.find(".//URI")
+        # empty elements have no text; treat them the same as missing ones
+        if (
+            uri is not None
+            and uri.text
+            and information is not None
+            and information.text
+            and information.text.lower() == wanted
+        ):
+            return uri.text
+    return ""
+
+
+def find_authors_by_name(name_string, description=False):
+    """Query the ISNI database for possible author matches by name
+
+    name_string: the name to search for
+    description: when truthy, each author's ``bio`` is replaced with the
+        title of one of their works (or None if no usable title is found)
+
+    Returns a list of author data objects, one per usable record.
+    """
+
+    payload = request_isni_data("pica.na", name_string)
+    # parse xml
+    root = ET.fromstring(payload)
+    # build list of possible authors
+    possible_authors = []
+    for element in root.iter("responseRecord"):
+        # records without a forename are skipped; compare against None
+        # because the truth value of an element depends on its child count
+        if element.find(".//forename/..") is None:
+            continue
+
+        # without an ISNI there is nothing to look the author up by
+        isni_element = element.find(".//isniUnformatted")
+        if isni_element is None or not isni_element.text:
+            continue
+
+        author = get_author_from_isni(isni_element.text)
+
+        if description:
+            candidates = []
+            # prefer title records from LoC+ coop, Australia, Ireland, or Singapore
+            # in that order
+            for source in ["LCNACO", "NLA", "N6I", "NLB"]:
+                for parent in element.findall(f'.//titleOfWork/[@source="{source}"]'):
+                    candidates.append(parent.find(".//title"))
+                for parent in element.findall(f'.//titleOfWork[@subsource="{source}"]'):
+                    candidates.append(parent.find(".//title"))
+            # otherwise just grab the first title listing
+            candidates.append(element.find(".//title"))
+
+            # some of the "titles" in ISNI are a little ...iffy
+            # '@' is used by ISNI/OCLC to index the starting point ignoring stop words
+            # (e.g. "The @Government of no one")
+            # a lookup that found nothing yields None, and an empty element
+            # has no text, so both are dropped before cleaning up
+            titles = [
+                title.text.replace("@", "")
+                for title in candidates
+                if title is not None and title.text
+            ]
+            usable_titles = [title for title in titles if not title.isnumeric()]
+            author.bio = usable_titles[0] if usable_titles else None
+
+        possible_authors.append(author)
+
+    return possible_authors
+
+
+def get_author_from_isni(isni):
+    """Build an author data object from the ISNI record for ``isni``"""
+
+    # fetch the record for this identifier and parse the xml
+    root = ET.fromstring(request_isni_data("pica.isn", isni))
+    # a single responseRecord is expected; take the first one regardless
+    record = root.find(".//responseRecord")
+
+    name = make_name_string(record.find(".//forename/.."))
+    viaf = get_other_identifier(record, "viaf")
+
+    # ISNI repeats name variants, so gather them in a set to dedupe
+    name_variants = set()
+    for variant in record.findall(".//personalNameVariant"):
+        name_variants.add(make_name_string(variant))
+
+    title_element = record.find(".//nameTitle")
+    bio = "" if title_element is None else title_element.text
+    wikipedia = get_external_information_uri(record, "Wikipedia")
+
+    return activitypub.Author(
+        id=record.find(".//isniURI").text,
+        name=name,
+        isni=isni,
+        viafId=viaf,
+        # aliases needs to be list not set
+        aliases=list(name_variants),
+        bio=bio,
+        wikipediaLink=wikipedia,
+    )
+
+
+def build_author_from_isni(match_value):
+    """Look up an author when the submitted match value is an ISNI URL
+
+    Returns {"author": <author data>} for values that start with
+    "https://isni.org/isni/" and an empty dict for anything else.
+    """
+
+    isni_url_prefix = "https://isni.org/isni/"
+    if not match_value.startswith(isni_url_prefix):
+        # not an ISNI URL, so it's a name string
+        return {}
+    # what remains once the URL part is dropped is the ISNI itself
+    isni = match_value.replace(isni_url_prefix, "")
+    return {"author": get_author_from_isni(isni)}
+
+
+def augment_author_metadata(author, isni):
+    """Fill in an author's missing fields from their ISNI record, then save"""
+
+    isni_author = get_author_from_isni(isni)
+    isni_author.to_model(model=models.Author, instance=author, overwrite=False)
+
+    # aliases are the exception to overwrite=False: the ISNI aliases are
+    # merged with the ones the author already has, and since ISNI records
+    # often repeat aliases the merge goes through a set to dedupe
+    merged_aliases = set(isni_author.aliases)
+    merged_aliases.update(author.aliases)
+    author.aliases = list(merged_aliases)
+    author.save()
--- a/bookwyrm/views/books/edit_book.py
+++ b/bookwyrm/views/books/edit_book.py
@ -1,4 +1,5 @@
 """ the good stuff! the books! """
+from re import sub
 from dateutil.parser import parse as dateparse
 from django.contrib.auth.decorators import login_required, permission_required
 from django.contrib.postgres.search import SearchRank, SearchVector
@ -11,10 +12,16 @@ from django.utils.decorators import method_decorator
 from django.views import View

 from bookwyrm import book_search, forms, models
+
+# from bookwyrm.activitypub.base_activity import ActivityObject
+from bookwyrm.utils.isni import (
+    find_authors_by_name,
+    build_author_from_isni,
+    augment_author_metadata,
+)
 from bookwyrm.views.helpers import get_edition
 from .books import set_cover_from_url

-
 # pylint: disable=no-self-use
@method_decorator(login_required, name="dispatch")
@method_decorator(
@ -33,6 +40,7 @@ class EditBook(View):
        data = {"book": book, "form": forms.EditionForm(instance=book)}
        return TemplateResponse(request, "book/edit/edit_book.html", data)

+    # pylint: disable=too-many-locals
    def post(self, request, book_id=None):
        """edit a book cool"""
        # returns None if no match is found
@ -48,6 +56,7 @@ class EditBook(View):
        if add_author:
            data["add_author"] = add_author
            data["author_matches"] = []
+            data["isni_matches"] = []
            for author in add_author.split(","):
                if not author:
                    continue
@ -56,15 +65,35 @@ class EditBook(View):
                    "aliases", weight="B"
                )

+                author_matches = (
+                    models.Author.objects.annotate(search=vector)
+                    .annotate(rank=SearchRank(vector, author))
+                    .filter(rank__gt=0.4)
+                    .order_by("-rank")[:5]
+                )
+
+                isni_authors = find_authors_by_name(
+                    author, description=True
+                )  # find matches from ISNI API
+
+                # dedupe isni authors we already have in the DB
+                exists = [
+                    i
+                    for i in isni_authors
+                    for a in author_matches
+                    if sub(r"\D", "", str(i.isni)) == sub(r"\D", "", str(a.isni))
+                ]
+
+                # pylint: disable=cell-var-from-loop
+                matches = list(filter(lambda x: x not in exists, isni_authors))
+                # combine existing and isni authors
+                matches.extend(author_matches)
+
                data["author_matches"].append(
                    {
                        "name": author.strip(),
-                        "matches": (
-                            models.Author.objects.annotate(search=vector)
-                            .annotate(rank=SearchRank(vector, author))
-                            .filter(rank__gt=0.4)
-                            .order_by("-rank")[:5]
-                        ),
+                        "matches": matches,
+                        "existing_isnis": exists,
                    }
                )

@ -122,6 +151,8 @@ class EditBook(View):
 class ConfirmEditBook(View):
    """confirm edits to a book"""

+    # pylint: disable=too-many-locals
+    # pylint: disable=too-many-branches
    def post(self, request, book_id=None):
        """edit a book cool"""
        # returns None if no match is found
@ -147,9 +178,25 @@ class ConfirmEditBook(View):
                    author = get_object_or_404(
                        models.Author, id=request.POST[f"author_match-{i}"]
                    )
+                    # update author metadata if the ISNI record is more complete
+                    isni = request.POST.get(f"isni-for-{match}", None)
+                    if isni is not None:
+                        augment_author_metadata(author, isni)
                except ValueError:
-                    # otherwise it's a name
-                    author = models.Author.objects.create(name=match)
+                    # otherwise it's a new author
+                    isni_match = request.POST.get(f"author_match-{i}")
+                    author_object = build_author_from_isni(isni_match)
+                    # with author data class from isni id
+                    if "author" in author_object:
+                        skeleton = models.Author.objects.create(
+                            name=author_object["author"].name
+                        )
+                        author = author_object["author"].to_model(
+                            model=models.Author, overwrite=True, instance=skeleton
+                        )
+                    else:
+                        # or it's just a name
+                        author = models.Author.objects.create(name=match)
                book.authors.add(author)

            # create work, if needed