mirror of
https://github.com/bookwyrm-social/bookwyrm.git
synced 2024-11-27 20:11:14 +00:00
Merge pull request #1581 from hughrun/isni-poc
Query ISNI database when adding authors
This commit is contained in:
commit
a24fb5cd11
6 changed files with 305 additions and 16 deletions
|
@ -27,7 +27,7 @@ class Author(BookDataModel):
|
|||
# idk probably other keys would be useful here?
|
||||
born = fields.DateTimeField(blank=True, null=True)
|
||||
died = fields.DateTimeField(blank=True, null=True)
|
||||
name = fields.CharField(max_length=255, deduplication_field=True)
|
||||
name = fields.CharField(max_length=255)
|
||||
aliases = fields.ArrayField(
|
||||
models.CharField(max_length=255), blank=True, default=list
|
||||
)
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
{% load i18n %}
|
||||
{% load markdown %}
|
||||
{% load humanize %}
|
||||
{% load utilities %}
|
||||
|
||||
{% block title %}{{ author.name }}{% endblock %}
|
||||
|
||||
|
@ -25,7 +26,7 @@
|
|||
<div class="block columns content" itemscope itemtype="https://schema.org/Person">
|
||||
<meta itemprop="name" content="{{ author.name }}">
|
||||
|
||||
{% if author.aliases or author.born or author.died or author.wikipedia_link or author.openlibrary_key or author.inventaire_id %}
|
||||
{% if author.aliases or author.born or author.died or author.wikipedia_link or author.openlibrary_key or author.inventaire_id or author.isni %}
|
||||
<div class="column is-two-fifths">
|
||||
<div class="box py-2">
|
||||
<dl>
|
||||
|
@ -63,6 +64,14 @@
|
|||
</p>
|
||||
{% endif %}
|
||||
|
||||
{% if author.isni %}
|
||||
<p class="my-1">
|
||||
<a itemprop="sameAs" href="https://isni.org/isni/{{ author.isni|remove_spaces }}" rel="noopener" target="_blank">
|
||||
{% trans "View ISNI record" %}
|
||||
</a>
|
||||
</p>
|
||||
{% endif %}
|
||||
|
||||
{% if author.openlibrary_key %}
|
||||
<p class="my-1">
|
||||
<a itemprop="sameAs" href="https://openlibrary.org/authors/{{ author.openlibrary_key }}" target="_blank" rel="noopener">
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
{% extends 'layout.html' %}
|
||||
{% load i18n %}
|
||||
{% load humanize %}
|
||||
{% load utilities %}
|
||||
|
||||
{% block title %}{% if book %}{% blocktrans with book_title=book.title %}Edit "{{ book_title }}"{% endblocktrans %}{% else %}{% trans "Add Book" %}{% endif %}{% endblock %}
|
||||
|
||||
|
@ -52,19 +53,29 @@
|
|||
{% for author in author_matches %}
|
||||
<fieldset>
|
||||
<legend class="title is-5 mb-1">
|
||||
{% blocktrans with name=author.name %}Is "{{ name }}" an existing author?{% endblocktrans %}
|
||||
{% blocktrans with name=author.name %}Is "{{ name }}" one of these authors?{% endblocktrans %}
|
||||
</legend>
|
||||
{% with forloop.counter0 as counter %}
|
||||
{% for match in author.matches %}
|
||||
<label class="label mb-2">
|
||||
<label class="label">
|
||||
<input type="radio" name="author_match-{{ counter }}" value="{{ match.id }}" required>
|
||||
{{ match.name }}
|
||||
</label>
|
||||
<p class="help">
|
||||
<a href="{{ match.local_path }}" target="_blank">{% blocktrans with book_title=match.book_set.first.title %}Author of <em>{{ book_title }}</em>{% endblocktrans %}</a>
|
||||
<p class="help ml-5 mb-2">
|
||||
{% with book_title=match.book_set.first.title alt_title=match.bio %}
|
||||
{% if book_title %}
|
||||
<a href="{{ match.local_path }}" target="_blank">{% trans "Author of " %}<em>{{ book_title }}</em></a>
|
||||
{% else %}
|
||||
<a href="{{ match.id }}" target="_blank">{% if alt_title %}{% trans "Author of " %}<em>{{ alt_title }}</em>{% else %} {% trans "Find more information at isni.org" %}{% endif %}</a>
|
||||
{% endif %}
|
||||
{% endwith %}
|
||||
</p>
|
||||
<p class="help ml-5">
|
||||
{{ author.existing_isnis|get_isni_bio:match }}
|
||||
</p>
|
||||
{{ author.existing_isnis|get_isni:match }}
|
||||
{% endfor %}
|
||||
<label class="label">
|
||||
<label class="label mt-2">
|
||||
<input type="radio" name="author_match-{{ counter }}" value="{{ author.name }}" required> {% trans "This is a new author" %}
|
||||
</label>
|
||||
{% endwith %}
|
||||
|
|
|
@ -1,8 +1,11 @@
|
|||
""" template filters for really common utilities """
|
||||
import os
import re
from uuid import uuid4

from django import template
from django.template.defaultfilters import stringfilter
from django.templatetags.static import static
from django.utils.html import escape
from django.utils.safestring import mark_safe
from django.utils.translation import gettext_lazy as _
|
||||
|
||||
|
||||
|
@ -66,3 +69,39 @@ def get_book_cover_thumbnail(book, size="medium", ext="jpg"):
|
|||
return cover_thumbnail.url
|
||||
except OSError:
|
||||
return static("images/no_cover.jpg")
|
||||
|
||||
|
||||
@register.filter(name="get_isni_bio")
def get_isni_bio(existing, author):
    """Returns the isni bio string if an existing author has an isni listed

    ``existing`` is the collection of ISNI author records to search and
    ``author`` is the author being displayed. ISNIs are compared on their
    digits only, so spacing and other formatting differences are ignored.

    Returns a safe HTML snippet of the form ``Author of <em>title</em>`` for
    the first record with the same ISNI and a non-empty bio, otherwise an
    empty string.
    """
    auth_isni = re.sub(r"\D", "", str(author.isni))
    # an author without any ISNI digits cannot match an ISNI record
    if not auth_isni:
        return ""

    for value in existing or ():
        bio = getattr(value, "bio", None)
        # skip records with no usable bio instead of rendering "None"
        if not bio:
            continue
        if auth_isni == re.sub(r"\D", "", str(getattr(value, "isni", ""))):
            label = _("Author of ")
            # the bio text comes from an external service, so it has to be
            # escaped before the surrounding markup is marked as safe
            return mark_safe(f"{label}<em>{escape(bio)}</em>")

    return ""
|
||||
|
||||
|
||||
# pylint: disable=unused-argument
@register.filter(name="get_isni", needs_autoescape=True)
def get_isni(existing, author, autoescape=True):
    """Returns the isni ID if an existing author has an ISNI listing

    ``existing`` is the collection of ISNI author records to search and
    ``author`` is the author being displayed. ISNIs are compared on their
    digits only.

    Returns a hidden ``<input>`` named ``isni-for-<author.id>`` carrying the
    matching record's ISNI, or an empty string when nothing matches.
    ``autoescape`` is accepted because the filter is registered with
    ``needs_autoescape=True``; interpolated values are always escaped.
    """
    auth_isni = re.sub(r"\D", "", str(author.isni))
    # an author without any ISNI digits cannot match an ISNI record
    if not auth_isni:
        return ""

    for value in existing or ():
        isni = getattr(value, "isni", None)
        if isni and auth_isni == re.sub(r"\D", "", str(isni)):
            # both values end up inside HTML attributes, so escape them
            # before marking the whole element as safe
            return mark_safe(
                f'<input type="text" name="isni-for-{escape(author.id)}" '
                f'value="{escape(isni)}" hidden>'
            )
    return ""
|
||||
|
||||
|
||||
@register.filter(name="remove_spaces")
@stringfilter
def remove_spaces(arg):
    """Return the argument with every whitespace character stripped out"""
    text = str(arg)
    # \s covers all whitespace (tabs, newlines, ...), not just plain spaces
    without_whitespace = re.sub(r"\s", "", text)
    return without_whitespace
|
||||
|
|
183
bookwyrm/utils/isni.py
Normal file
183
bookwyrm/utils/isni.py
Normal file
|
@ -0,0 +1,183 @@
|
|||
"""ISNI author checking utilities"""
|
||||
import xml.etree.ElementTree as ET
|
||||
import requests
|
||||
|
||||
from bookwyrm import activitypub, models
|
||||
|
||||
|
||||
def request_isni_data(search_index, search_term, max_records=5):
    """Request data from the ISNI API

    Builds a ``searchRetrieve`` query of the form ``index="term"`` and sends
    it to the ISNI SRU endpoint with a 10 second timeout.

    Args:
        search_index: name of the index to search (e.g. "pica.na").
        search_term: value to look for in that index.
        max_records: upper bound on the number of records requested.

    Returns:
        The response body as text, decoded as UTF-8.
    """

    # the term is placed inside a double-quoted CQL string, so backslashes
    # and double quotes in it must be escaped or the query is malformed
    escaped_term = str(search_term).replace("\\", "\\\\").replace('"', '\\"')
    search_string = f'{search_index}="{escaped_term}"'
    query_params = {
        "query": search_string,
        "version": "1.1",
        "operation": "searchRetrieve",
        "recordSchema": "isni-b",
        "maximumRecords": max_records,
        "startRecord": "1",
        "recordPacking": "xml",
        "sortKeys": "RLV,pica,0,,",
    }
    # NOTE(review): this goes over plain http - confirm whether the endpoint
    # is reachable via https before switching
    result = requests.get("http://isni.oclc.org/sru/", params=query_params, timeout=10)
    # the OCLC ISNI server asserts the payload is encoded
    # in latin1, but we know better
    result.encoding = "utf-8"
    return result.text
|
||||
|
||||
|
||||
def make_name_string(element):
    """create a string of form 'personal_name surname'

    Looks up the first ``forename`` and ``surname`` descendants of
    ``element`` and joins whichever of them have text with a single space.
    Returns an empty string when ``element`` is None or neither part has
    any text.
    """

    # NOTE: this will often be incorrect, many naming systems
    # list "surname" before personal name
    if element is None:
        return ""

    name_parts = []
    for tag in ("forename", "surname"):
        node = element.find(f".//{tag}")
        # either part may be absent or empty in a record
        if node is not None and node.text:
            name_parts.append(node.text)
    return " ".join(name_parts)
|
||||
|
||||
|
||||
def get_other_identifier(element, code):
    """Get other identifiers associated with an author from their ISNI record

    First looks for an ``otherIdentifierOfIdentity`` section whose ``type``
    equals ``code`` exactly; if none yields an identifier, falls back to a
    ``sources`` section whose ``codeOfSource`` matches ``code`` ignoring
    case. Sections with a missing or empty identifier are skipped.

    Returns the identifier text, or an empty string when nothing is found.
    """

    for section_head in element.findall(".//otherIdentifierOfIdentity"):
        id_type = section_head.find(".//type")
        identifier = section_head.find(".//identifier")
        if (
            id_type is not None
            and id_type.text == code
            and identifier is not None
            and identifier.text
        ):
            return identifier.text

    # if we can't find it in otherIdentifierOfIdentity,
    # try sources
    wanted = code.lower()
    for source in element.findall(".//sources"):
        code_of_source = source.find(".//codeOfSource")
        source_identifier = source.find(".//sourceIdentifier")
        if (
            code_of_source is not None
            and code_of_source.text
            and code_of_source.text.lower() == wanted
            and source_identifier is not None
            and source_identifier.text
        ):
            return source_identifier.text

    return ""
|
||||
|
||||
|
||||
def get_external_information_uri(element, match_string):
    """Get URLs associated with an author from their ISNI record

    Scans the ``externalInformation`` sections of ``element`` and returns
    the ``URI`` text of the first one whose ``information`` text equals
    ``match_string`` ignoring case. Sections with a missing or empty
    ``information`` or ``URI`` are skipped.

    Returns an empty string when no section matches.
    """

    wanted = match_string.lower()
    for source in element.findall(".//externalInformation"):
        information = source.find(".//information")
        uri = source.find(".//URI")
        if (
            uri is not None
            and uri.text
            and information is not None
            and information.text
            and information.text.lower() == wanted
        ):
            return uri.text
    return ""
|
||||
|
||||
|
||||
def _first_usable_title(element):
    """Pick a title of a work from an ISNI record, or None if there is none"""

    titles = []
    # prefer title records from LoC+ coop, Australia, Ireland, or Singapore
    # in that order
    for source in ["LCNACO", "NLA", "N6I", "NLB"]:
        for parent in element.findall(f'.//titleOfWork/[@source="{source}"]'):
            titles.append(parent.find(".//title"))
        for parent in element.findall(f'.//titleOfWork[@subsource="{source}"]'):
            titles.append(parent.find(".//title"))
    # otherwise just grab the first title listing
    titles.append(element.find(".//title"))

    for title in titles:
        # any of the lookups above may have found nothing
        if title is None or not title.text:
            continue
        # some of the "titles" in ISNI are a little ...iffy
        # '@' is used by ISNI/OCLC to index the starting point ignoring stop words
        # (e.g. "The @Government of no one")
        text = title.text.replace("@", "")
        if not text.isnumeric():
            return text
    return None


def find_authors_by_name(name_string, description=False):
    """Query the ISNI database for possible author matches by name

    Args:
        name_string: the author name to search for.
        description: when truthy, each author's ``bio`` is replaced with the
            title of one of their works (or None when no usable title is
            found in the record).

    Returns:
        A list with one author object per personal-name record in the
        response; records without a forename or without an ISNI are skipped.
    """

    payload = request_isni_data("pica.na", name_string)
    # parse xml
    # NOTE(review): the payload comes from an external server and is parsed
    # with the stdlib parser - confirm this is acceptable for untrusted XML
    root = ET.fromstring(payload)
    # build list of possible authors
    possible_authors = []
    for element in root.iter("responseRecord"):
        # compare against None: the truth value of an Element depends on
        # whether it has children, and testing it is deprecated
        if element.find(".//forename/..") is None:
            continue

        isni_element = element.find(".//isniUnformatted")
        if isni_element is None or not isni_element.text:
            continue

        author = get_author_from_isni(isni_element.text)

        if description:
            author.bio = _first_usable_title(element)

        possible_authors.append(author)

    return possible_authors
|
||||
|
||||
|
||||
def get_author_from_isni(isni):
    """Look up an ISNI and build an activitypub Author from its record"""

    response_xml = request_isni_data("pica.isn", isni)
    tree = ET.fromstring(response_xml)
    # a lookup by ISNI is expected to yield a single responseRecord;
    # whatever comes back, only the first one is used
    record = tree.find(".//responseRecord")

    display_name = make_name_string(record.find(".//forename/.."))
    viaf_id = get_other_identifier(record, "viaf")
    # ISNI lists the same variant many times: collect through a set to drop
    # the duplicates, then hand over a list
    alias_names = list(
        {
            make_name_string(variant)
            for variant in record.findall(".//personalNameVariant")
        }
    )
    name_title = record.find(".//nameTitle")
    wikipedia_url = get_external_information_uri(record, "Wikipedia")

    return activitypub.Author(
        id=record.find(".//isniURI").text,
        name=display_name,
        isni=isni,
        viafId=viaf_id,
        aliases=alias_names,
        bio="" if name_title is None else name_title.text,
        wikipediaLink=wikipedia_url,
    )
|
||||
|
||||
|
||||
def build_author_from_isni(match_value):
    """Return {"author": <ISNI author>} for an ISNI URL, else an empty dict"""

    isni_url_prefix = "https://isni.org/isni/"
    if not match_value.startswith(isni_url_prefix):
        # not an ISNI URL, so the value is a plain name string
        return {}

    # what is left after dropping the URL prefix is the ISNI itself
    isni_value = match_value.replace(isni_url_prefix, "")
    isni_author = get_author_from_isni(isni_value)
    return {"author": isni_author}
|
||||
|
||||
|
||||
def augment_author_metadata(author, isni):
    """Fill in an author's missing fields from their ISNI record and save"""

    isni_record = get_author_from_isni(isni)
    # overwrite=False: fields that already hold a value are left alone
    isni_record.to_model(model=models.Author, instance=author, overwrite=False)

    # aliases are the exception: the ISNI aliases are merged with the
    # existing ones, since ISNI will usually have more. Merging through a
    # set removes the many duplicates ISNI records tend to contain
    merged_aliases = set(isni_record.aliases)
    merged_aliases.update(author.aliases)
    author.aliases = list(merged_aliases)
    author.save()
|
|
@ -1,4 +1,5 @@
|
|||
""" the good stuff! the books! """
|
||||
from re import sub
|
||||
from dateutil.parser import parse as dateparse
|
||||
from django.contrib.auth.decorators import login_required, permission_required
|
||||
from django.contrib.postgres.search import SearchRank, SearchVector
|
||||
|
@ -11,10 +12,16 @@ from django.utils.decorators import method_decorator
|
|||
from django.views import View
|
||||
|
||||
from bookwyrm import book_search, forms, models
|
||||
|
||||
# from bookwyrm.activitypub.base_activity import ActivityObject
|
||||
from bookwyrm.utils.isni import (
|
||||
find_authors_by_name,
|
||||
build_author_from_isni,
|
||||
augment_author_metadata,
|
||||
)
|
||||
from bookwyrm.views.helpers import get_edition
|
||||
from .books import set_cover_from_url
|
||||
|
||||
|
||||
# pylint: disable=no-self-use
|
||||
@method_decorator(login_required, name="dispatch")
|
||||
@method_decorator(
|
||||
|
@ -33,6 +40,7 @@ class EditBook(View):
|
|||
data = {"book": book, "form": forms.EditionForm(instance=book)}
|
||||
return TemplateResponse(request, "book/edit/edit_book.html", data)
|
||||
|
||||
# pylint: disable=too-many-locals
|
||||
def post(self, request, book_id=None):
|
||||
"""edit a book cool"""
|
||||
# returns None if no match is found
|
||||
|
@ -48,6 +56,7 @@ class EditBook(View):
|
|||
if add_author:
|
||||
data["add_author"] = add_author
|
||||
data["author_matches"] = []
|
||||
data["isni_matches"] = []
|
||||
for author in add_author.split(","):
|
||||
if not author:
|
||||
continue
|
||||
|
@ -56,15 +65,35 @@ class EditBook(View):
|
|||
"aliases", weight="B"
|
||||
)
|
||||
|
||||
author_matches = (
|
||||
models.Author.objects.annotate(search=vector)
|
||||
.annotate(rank=SearchRank(vector, author))
|
||||
.filter(rank__gt=0.4)
|
||||
.order_by("-rank")[:5]
|
||||
)
|
||||
|
||||
isni_authors = find_authors_by_name(
|
||||
author, description=True
|
||||
) # find matches from ISNI API
|
||||
|
||||
# dedupe isni authors we already have in the DB
|
||||
exists = [
|
||||
i
|
||||
for i in isni_authors
|
||||
for a in author_matches
|
||||
if sub(r"\D", "", str(i.isni)) == sub(r"\D", "", str(a.isni))
|
||||
]
|
||||
|
||||
# pylint: disable=cell-var-from-loop
|
||||
matches = list(filter(lambda x: x not in exists, isni_authors))
|
||||
# combine existing and isni authors
|
||||
matches.extend(author_matches)
|
||||
|
||||
data["author_matches"].append(
|
||||
{
|
||||
"name": author.strip(),
|
||||
"matches": (
|
||||
models.Author.objects.annotate(search=vector)
|
||||
.annotate(rank=SearchRank(vector, author))
|
||||
.filter(rank__gt=0.4)
|
||||
.order_by("-rank")[:5]
|
||||
),
|
||||
"matches": matches,
|
||||
"existing_isnis": exists,
|
||||
}
|
||||
)
|
||||
|
||||
|
@ -122,6 +151,8 @@ class EditBook(View):
|
|||
class ConfirmEditBook(View):
|
||||
"""confirm edits to a book"""
|
||||
|
||||
# pylint: disable=too-many-locals
|
||||
# pylint: disable=too-many-branches
|
||||
def post(self, request, book_id=None):
|
||||
"""edit a book cool"""
|
||||
# returns None if no match is found
|
||||
|
@ -147,9 +178,25 @@ class ConfirmEditBook(View):
|
|||
author = get_object_or_404(
|
||||
models.Author, id=request.POST[f"author_match-{i}"]
|
||||
)
|
||||
# update author metadata if the ISNI record is more complete
|
||||
isni = request.POST.get(f"isni-for-{match}", None)
|
||||
if isni is not None:
|
||||
augment_author_metadata(author, isni)
|
||||
except ValueError:
|
||||
# otherwise it's a name
|
||||
author = models.Author.objects.create(name=match)
|
||||
# otherwise it's a new author
|
||||
isni_match = request.POST.get(f"author_match-{i}")
|
||||
author_object = build_author_from_isni(isni_match)
|
||||
# with author data class from isni id
|
||||
if "author" in author_object:
|
||||
skeleton = models.Author.objects.create(
|
||||
name=author_object["author"].name
|
||||
)
|
||||
author = author_object["author"].to_model(
|
||||
model=models.Author, overwrite=True, instance=skeleton
|
||||
)
|
||||
else:
|
||||
# or it's just a name
|
||||
author = models.Author.objects.create(name=match)
|
||||
book.authors.add(author)
|
||||
|
||||
# create work, if needed
|
||||
|
|
Loading…
Reference in a new issue