Merge pull request #3325 from Minnozz/author-search-vector

Rework author search
This commit is contained in:
Mouse Reeve 2024-03-25 14:41:25 -07:00 committed by GitHub
commit 7192449b21
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 304 additions and 42 deletions

View file

@ -0,0 +1,41 @@
# Generated by Django 3.2.25 on 2024-03-20 15:15
import django.contrib.postgres.indexes
from django.db import migrations
import pgtrigger.compiler
import pgtrigger.migrations
class Migration(migrations.Migration):
    """Index ``author.search_vector`` and install the trigger that fills it in."""

    dependencies = [
        ("bookwyrm", "0196_merge_pr3134_into_main"),
    ]

    operations = [
        # GIN index over the author's search_vector column.
        migrations.AddIndex(
            model_name="author",
            index=django.contrib.postgres.indexes.GinIndex(
                fields=["search_vector"], name="bookwyrm_au_search__b050a8_gin"
            ),
        ),
        # Row trigger on bookwyrm_author, run BEFORE INSERT or UPDATE OF
        # name / aliases / search_vector. It assigns new.search_vector as the
        # concatenation of:
        #   - the name, parsed with the 'simple' configuration, weight 'A'
        #   - the aliases joined by spaces (empty string when that is NULL),
        #     parsed with the 'simple' configuration, weight 'B'
        pgtrigger.migrations.AddTrigger(
            model_name="author",
            trigger=pgtrigger.compiler.Trigger(
                name="update_search_vector_on_author_edit",
                sql=pgtrigger.compiler.UpsertTriggerSql(
                    func="new.search_vector := setweight(to_tsvector('simple', new.name), 'A') || setweight(to_tsvector('simple', coalesce(array_to_string(new.aliases, ' '), '')), 'B');RETURN NEW;",
                    hash="b97919016236d74d0ade51a0769a173ea269da64",
                    operation='INSERT OR UPDATE OF "name", "aliases", "search_vector"',
                    pgid="pgtrigger_update_search_vector_on_author_edit_c61cb",
                    table="bookwyrm_author",
                    when="BEFORE",
                ),
            ),
        ),
        migrations.RunSQL(
            # Calculate search vector for all Authors.
            # The statement only writes NULL; "search_vector" is one of the
            # columns listed in the trigger's operation above, so the update
            # is what makes the trigger run for rows that already exist.
            # reverse_sql is the same statement as sql.
            sql="UPDATE bookwyrm_author SET search_vector = NULL;",
            reverse_sql="UPDATE bookwyrm_author SET search_vector = NULL;",
        ),
    ]

View file

@ -0,0 +1,57 @@
# Generated by Django 3.2.25 on 2024-03-20 15:52
from django.db import migrations
import pgtrigger.compiler
import pgtrigger.migrations
class Migration(migrations.Migration):
    """Replace the author/book search triggers so book vectors include author aliases."""

    dependencies = [
        ("bookwyrm", "0197_author_search_vector"),
    ]

    operations = [
        # Drop the two triggers that are re-created below (the author one
        # comes back under a different name).
        pgtrigger.migrations.RemoveTrigger(
            model_name="author",
            name="reset_search_vector_on_author_edit",
        ),
        pgtrigger.migrations.RemoveTrigger(
            model_name="book",
            name="update_search_vector_on_book_edit",
        ),
        # Row trigger on bookwyrm_author, run AFTER UPDATE OF name / aliases.
        # It sets search_vector = '' on every bookwyrm_book row linked to the
        # edited author through bookwyrm_book_authors. "search_vector" is in
        # the book trigger's operation list below, so that write is what
        # causes the book vector to be rebuilt.
        pgtrigger.migrations.AddTrigger(
            model_name="author",
            trigger=pgtrigger.compiler.Trigger(
                name="reset_book_search_vector_on_author_edit",
                sql=pgtrigger.compiler.UpsertTriggerSql(
                    func="WITH updated_books AS (SELECT book_id FROM bookwyrm_book_authors WHERE author_id = new.id ) UPDATE bookwyrm_book SET search_vector = '' FROM updated_books WHERE id = updated_books.book_id;RETURN NEW;",
                    hash="68422c0f29879c5802b82159dde45297eff53e73",
                    operation='UPDATE OF "name", "aliases"',
                    pgid="pgtrigger_reset_book_search_vector_on_author_edit_a50c7",
                    table="bookwyrm_author",
                    when="AFTER",
                ),
            ),
        ),
        # Row trigger on bookwyrm_book, run BEFORE INSERT or UPDATE OF
        # title / subtitle / series / search_vector. new.search_vector becomes
        # the concatenation of:
        #   - title, weight 'A': parsed as 'english', or as 'simple' when the
        #     'english' parse comes back empty
        #   - subtitle, weight 'B', parsed as 'english'
        #   - name and aliases of every author joined to this book, weight 'C',
        #     parsed as 'simple'
        #   - series, weight 'D', parsed as 'english'
        pgtrigger.migrations.AddTrigger(
            model_name="book",
            trigger=pgtrigger.compiler.Trigger(
                name="update_search_vector_on_book_edit",
                sql=pgtrigger.compiler.UpsertTriggerSql(
                    func="WITH author_names AS (SELECT array_to_string(bookwyrm_author.name || bookwyrm_author.aliases, ' ') AS name_and_aliases FROM bookwyrm_author LEFT JOIN bookwyrm_book_authors ON bookwyrm_author.id = bookwyrm_book_authors.author_id WHERE bookwyrm_book_authors.book_id = new.id ) SELECT setweight(coalesce(nullif(to_tsvector('english', new.title), ''), to_tsvector('simple', new.title)), 'A') || setweight(to_tsvector('english', coalesce(new.subtitle, '')), 'B') || (SELECT setweight(to_tsvector('simple', coalesce(array_to_string(array_agg(name_and_aliases), ' '), '')), 'C') FROM author_names) || setweight(to_tsvector('english', coalesce(new.series, '')), 'D') INTO new.search_vector;RETURN NEW;",
                    hash="9324f5ca76a6f5e63931881d62d11da11f595b2c",
                    operation='INSERT OR UPDATE OF "title", "subtitle", "series", "search_vector"',
                    pgid="pgtrigger_update_search_vector_on_book_edit_bec58",
                    table="bookwyrm_book",
                    when="BEFORE",
                ),
            ),
        ),
        migrations.RunSQL(
            # Recalculate search vector for all Books because it now includes
            # Author aliases.
            # The statement only writes NULL; the rebuild relies on the book
            # trigger above firing on the search_vector update.
            # reverse_sql is the same statement as sql.
            sql="UPDATE bookwyrm_book SET search_vector = NULL;",
            reverse_sql="UPDATE bookwyrm_book SET search_vector = NULL;",
        ),
    ]

View file

@ -3,6 +3,7 @@ import re
from typing import Tuple, Any
from django.db import models
from django.contrib.postgres.indexes import GinIndex
import pgtrigger
from bookwyrm import activitypub
@ -71,11 +72,29 @@ class Author(BookDataModel):
class Meta:
"""sets up indexes and triggers"""
# pylint: disable=line-too-long
indexes = (GinIndex(fields=["search_vector"]),)
triggers = [
pgtrigger.Trigger(
name="reset_search_vector_on_author_edit",
name="update_search_vector_on_author_edit",
when=pgtrigger.Before,
operation=pgtrigger.Insert
| pgtrigger.UpdateOf("name", "aliases", "search_vector"),
func=format_trigger(
"""new.search_vector :=
-- author name, with priority A
setweight(to_tsvector('simple', new.name), 'A') ||
-- author aliases, with priority B
setweight(to_tsvector('simple', coalesce(array_to_string(new.aliases, ' '), '')), 'B');
RETURN new;
"""
),
),
pgtrigger.Trigger(
name="reset_book_search_vector_on_author_edit",
when=pgtrigger.After,
operation=pgtrigger.UpdateOf("name"),
operation=pgtrigger.UpdateOf("name", "aliases"),
func=format_trigger(
"""WITH updated_books AS (
SELECT book_id
@ -89,7 +108,7 @@ class Author(BookDataModel):
RETURN new;
"""
),
)
),
]
activity_serializer = activitypub.Author

View file

@ -246,24 +246,34 @@ class Book(BookDataModel):
operation=pgtrigger.Insert
| pgtrigger.UpdateOf("title", "subtitle", "series", "search_vector"),
func=format_trigger(
"""new.search_vector :=
-- title, with priority A (parse in English, default to simple if empty)
setweight(COALESCE(nullif(
to_tsvector('english', new.title), ''),
to_tsvector('simple', new.title)), 'A') ||
-- subtitle, with priority B (always in English?)
setweight(to_tsvector('english', COALESCE(new.subtitle, '')), 'B') ||
-- list of authors, with priority C (TODO: add aliases?, bookwyrm-social#3063)
(SELECT setweight(to_tsvector('simple', COALESCE(array_to_string(ARRAY_AGG(bookwyrm_author.name), ' '), '')), 'C')
FROM bookwyrm_author
LEFT JOIN bookwyrm_book_authors
ON bookwyrm_author.id = bookwyrm_book_authors.author_id
WHERE bookwyrm_book_authors.book_id = new.id
) ||
--- last: series name, with lowest priority
setweight(to_tsvector('english', COALESCE(new.series, '')), 'D');
RETURN new;
"""
"""
WITH author_names AS (
SELECT array_to_string(bookwyrm_author.name || bookwyrm_author.aliases, ' ') AS name_and_aliases
FROM bookwyrm_author
LEFT JOIN bookwyrm_book_authors
ON bookwyrm_author.id = bookwyrm_book_authors.author_id
WHERE bookwyrm_book_authors.book_id = new.id
)
SELECT
-- title, with priority A (parse in English, default to simple if empty)
setweight(COALESCE(nullif(
to_tsvector('english', new.title), ''),
to_tsvector('simple', new.title)), 'A') ||
-- subtitle, with priority B (always in English?)
setweight(to_tsvector('english', COALESCE(new.subtitle, '')), 'B') ||
-- list of authors names and aliases (with priority C)
(SELECT setweight(to_tsvector('simple', COALESCE(array_to_string(ARRAY_AGG(name_and_aliases), ' '), '')), 'C')
FROM author_names
) ||
--- last: series name, with lowest priority
setweight(to_tsvector('english', COALESCE(new.series, '')), 'D')
INTO new.search_vector;
RETURN new;
"""
),
)
]

View file

@ -0,0 +1,87 @@
""" test searching for authors """
from django.test import TestCase
from django.contrib.postgres.search import SearchRank, SearchQuery
from django.db.models import F
from bookwyrm import models
class AuthorSearch(TestCase):
    """full-text lookup of authors by name and by alias"""

    @classmethod
    def setUpTestData(cls):
        """two authors: "Alice" is one author's name and the other's alias"""
        author_manager = models.Author.objects
        cls.bob = author_manager.create(name="Bob", aliases=["Robertus", "Alice"])
        cls.alice = author_manager.create(name="Alice")

    def test_search(self):
        """a name carried by one author only returns just that author"""
        matches = list(self._search("Bob"))

        self.assertEqual(len(matches), 1)
        self.assertEqual(matches[0], self.bob)

    def test_alias_priority(self):
        """alias hits are returned too, but a name hit is ranked first"""
        matches = list(self._search("Alice"))

        self.assertEqual(len(matches), 2)
        self.assertEqual(matches[0], self.alice)

    def _search_first(self, query):
        """shortcut for ``_search`` that returns only the top-ranked author"""
        return self._search(query, return_first=True)

    @staticmethod
    def _search(query, *, return_first=False):
        """rank authors whose search_vector matches ``query``

        Returns the ordered queryset, or only its first row (``None`` when
        nothing matches) if ``return_first`` is set.
        """
        min_confidence = 0
        search_query = SearchQuery(query, config="simple")

        matching = models.Author.objects.filter(search_vector=search_query)
        ranked = matching.annotate(rank=SearchRank(F("search_vector"), search_query))
        ordered = ranked.filter(rank__gt=min_confidence).order_by("-rank")

        return ordered.first() if return_first else ordered
class SearchVectorTest(TestCase):
    """search_vector values as read back from the database for authors"""

    def test_search_vector_simple(self):
        """a bare name yields a single weight-A lexeme"""
        self._assert_vector(self._create_author("Mary"), "'mary':1A")

    def test_search_vector_aliases(self):
        """aliases come after the name and carry weight B"""
        mary = self._create_author("Mary", aliases=["Maria", "Example"])
        self._assert_vector(mary, "'example':3B 'maria':2B 'mary':1A")

    def test_search_vector_parse_author(self):
        """names and aliases are stored without stemming or stop-word removal"""
        writer = self._create_author("Writes", aliases=["Reads"])
        self._assert_vector(writer, "'reads':2B 'writes':1A")

    def test_search_vector_on_update(self):
        """renaming an author replaces the stored vector"""
        person = self._create_author("Mary")
        self._assert_vector(person, "'mary':1A")

        person.name = "Example"
        person.save(broadcast=False)
        person.refresh_from_db()

        self._assert_vector(person, "'example':1A")

    def _assert_vector(self, author, expected):
        """compare an author's search_vector with its expected text form"""
        self.assertEqual(author.search_vector, expected)

    @staticmethod
    def _create_author(name, /, *, aliases=None):
        """create an author, then reload it so search_vector holds the stored value"""
        alias_list = aliases or []
        created = models.Author.objects.create(name=name, aliases=alias_list)
        created.refresh_from_db()
        return created

View file

@ -14,6 +14,13 @@ class BookSearch(TestCase):
@classmethod
def setUpTestData(self): # pylint: disable=bad-classmethod-argument
"""we need basic test data and mocks"""
self.first_author = models.Author.objects.create(
name="Author One", aliases=["The First"]
)
self.second_author = models.Author.objects.create(
name="Author Two", aliases=["The Second"]
)
self.work = models.Work.objects.create(title="Example Work")
self.first_edition = models.Edition.objects.create(
@ -23,6 +30,8 @@ class BookSearch(TestCase):
physical_format="Paperback",
published_date=datetime.datetime(2019, 4, 9, 0, 0, tzinfo=timezone.utc),
)
self.first_edition.authors.add(self.first_author)
self.second_edition = models.Edition.objects.create(
title="Another Edition",
parent_work=self.work,
@ -30,19 +39,34 @@ class BookSearch(TestCase):
openlibrary_key="hello",
pages=150,
)
self.second_edition.authors.add(self.first_author)
self.second_edition.authors.add(self.second_author)
self.third_edition = models.Edition.objects.create(
title="Another Edition with annoying ISBN",
parent_work=self.work,
isbn_10="022222222X",
)
self.third_edition.authors.add(self.first_author)
self.third_edition.authors.add(self.second_author)
def test_search(self):
"""search for a book in the db"""
# title/author
# title
results = book_search.search("Example")
self.assertEqual(len(results), 1)
self.assertEqual(results[0], self.first_edition)
# author
results = book_search.search("One")
self.assertEqual(len(results), 1)
self.assertEqual(results[0], self.first_edition)
# author alias
results = book_search.search("First")
self.assertEqual(len(results), 1)
self.assertEqual(results[0], self.first_edition)
# isbn
results = book_search.search("0000000000")
self.assertEqual(len(results), 1)
@ -155,8 +179,17 @@ class SearchVectorTest(TestCase):
"""search vector with subtitle and series"""
# for a book like this we call `to_tsvector("Book Long Mary Bunch")`, hence the
# indexes in the search vector. (priority "D" is the default, and never shown.)
book = self._create_book("Book", "Mary", subtitle="Long", series="Bunch")
self.assertEqual(book.search_vector, "'book':1A 'bunch':4 'long':2B 'mary':3C")
book = self._create_book(
"Book",
"Mary",
subtitle="Long",
series="Bunch",
author_alias=["Maria", "Mary Ann"],
)
self.assertEqual(
book.search_vector,
"'ann':6C 'book':1A 'bunch':7 'long':2B 'maria':4C 'mary':3C,5C",
)
def test_search_vector_parse_book(self):
"""book parts are parsed in english"""
@ -170,8 +203,8 @@ class SearchVectorTest(TestCase):
def test_search_vector_parse_author(self):
"""author name is not stem'd or affected by stop words"""
book = self._create_book("Writing", "Writes")
self.assertEqual(book.search_vector, "'write':1A 'writes':2C")
book = self._create_book("Writing", "Writes", author_alias=["Reads"])
self.assertEqual(book.search_vector, "'reads':3C 'write':1A 'writes':2C")
book = self._create_book("She Is Writing", "She Writes")
self.assertEqual(book.search_vector, "'she':4C 'write':3A 'writes':5C")
@ -218,6 +251,13 @@ class SearchVectorTest(TestCase):
book.refresh_from_db()
self.assertEqual(book.search_vector, "'goodby':3A 'jeremy':4C 'long':2A")
author.aliases = ["Example"]
author.save(broadcast=False)
book.refresh_from_db()
self.assertEqual(
book.search_vector, "'example':5C 'goodby':3A 'jeremy':4C 'long':2A"
)
def test_search_vector_on_author_delete(self):
"""update search when an author is deleted"""
book = self._create_book("The Long Goodbye", "The Rays")
@ -274,7 +314,7 @@ class SearchVectorUpdates(TestCase):
def setUp(self):
"""we need basic test data and mocks"""
self.work = models.Work.objects.create(title="This Work")
self.author = models.Author.objects.create(name="Name")
self.author = models.Author.objects.create(name="Name", aliases=["Alias"])
self.edition = models.Edition.objects.create(
title="First Edition of Work",
subtitle="Some Extra Words Are Good",
@ -363,13 +403,18 @@ class SearchVectorUpdates(TestCase):
def test_search_after_updated_author_name(self):
    """book is found under the author's new name and alias, not the old ones"""
    # Before the edit: only the original name and alias lead to the edition.
    self.assertEqual(self.edition, self._search_first("Name"))
    self.assertEqual(self.edition, self._search_first("Alias"))
    self.assertFalse(self._search("Identifier"))
    self.assertFalse(self._search("Another"))

    self.author.name = "Identifier"
    self.author.aliases = ["Another"]
    self.author.save(broadcast=False)

    # After the edit: the old terms must no longer match. The alias check
    # uses "Alias", the exact term asserted findable above; a different
    # string here would not show that the old alias was dropped.
    self.assertFalse(self._search("Name"))
    self.assertFalse(self._search("Alias"))
    self.assertEqual(self.edition, self._search_first("Identifier"))
    self.assertEqual(self.edition, self._search_first("Another"))
    # Title terms are unaffected by the author edit.
    self.assertEqual(self.edition, self._search_first("Work"))
def _search_first(self, query):

View file

@ -2,8 +2,9 @@
import re
from django.contrib.postgres.search import TrigramSimilarity
from django.contrib.postgres.search import TrigramSimilarity, SearchRank, SearchQuery
from django.core.paginator import Paginator
from django.db.models import F
from django.db.models.functions import Greatest
from django.http import JsonResponse
from django.template.response import TemplateResponse
@ -94,26 +95,28 @@ def book_search(request):
def author_search(request):
    """search for an author

    Reads the ``q`` and ``page`` GET parameters, matches ``q`` against
    ``Author.search_vector`` with the 'simple' text-search configuration,
    orders the hits by descending rank, and renders one page of them with
    the ``search/author.html`` template.
    """
    # Default to "" so a request without ``q`` does not call .strip() on None.
    query = request.GET.get("q", "").strip()
    search_query = SearchQuery(query, config="simple")
    min_confidence = 0

    # Rank-based full-text match; this replaces the earlier trigram-similarity
    # query on the author's name.
    results = (
        models.Author.objects.filter(search_vector=search_query)
        .annotate(rank=SearchRank(F("search_vector"), search_query))
        .filter(rank__gt=min_confidence)
        .order_by("-rank")
    )

    paginated = Paginator(results, PAGE_LENGTH)
    page = paginated.get_page(request.GET.get("page"))

    data = {
        "type": "author",
        "query": query,
        "results": page,
        "page_range": paginated.get_elided_page_range(
            page.number, on_each_side=2, on_ends=1
        ),
    }
    return TemplateResponse(request, "search/author.html", data)