Merge pull request #3325 from Minnozz/author-search-vector

Rework author search
This commit is contained in:
Mouse Reeve 2024-03-25 14:41:25 -07:00 committed by GitHub
commit 7192449b21
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 304 additions and 42 deletions

View file

@ -0,0 +1,41 @@
# Generated by Django 3.2.25 on 2024-03-20 15:15
import django.contrib.postgres.indexes
from django.db import migrations
import pgtrigger.compiler
import pgtrigger.migrations
class Migration(migrations.Migration):
    """Add a GIN index and a populating trigger for the author search_vector."""

    # Applied on top of the 0196 merge migration.
    dependencies = [
        ("bookwyrm", "0196_merge_pr3134_into_main"),
    ]

    operations = [
        # GIN index over the author search_vector column.
        migrations.AddIndex(
            model_name="author",
            index=django.contrib.postgres.indexes.GinIndex(
                fields=["search_vector"], name="bookwyrm_au_search__b050a8_gin"
            ),
        ),
        # BEFORE trigger on bookwyrm_author, fired on INSERT and on UPDATE OF
        # name, aliases or search_vector. It overwrites new.search_vector with
        # the name (weight A) and the space-joined aliases (weight B), both
        # parsed with the 'simple' text search configuration.
        pgtrigger.migrations.AddTrigger(
            model_name="author",
            trigger=pgtrigger.compiler.Trigger(
                name="update_search_vector_on_author_edit",
                sql=pgtrigger.compiler.UpsertTriggerSql(
                    func="new.search_vector := setweight(to_tsvector('simple', new.name), 'A') || setweight(to_tsvector('simple', coalesce(array_to_string(new.aliases, ' '), '')), 'B');RETURN NEW;",
                    hash="b97919016236d74d0ade51a0769a173ea269da64",
                    operation='INSERT OR UPDATE OF "name", "aliases", "search_vector"',
                    pgid="pgtrigger_update_search_vector_on_author_edit_c61cb",
                    table="bookwyrm_author",
                    when="BEFORE",
                ),
            ),
        ),
        migrations.RunSQL(
            # Calculate search vector for all Authors.
            # The statement itself writes NULL; because the trigger added above
            # listens for UPDATE OF search_vector and assigns new.search_vector
            # before the row is stored, this write is what makes existing rows
            # get their vector computed.
            # NOTE(review): reverse_sql is the same statement. If the trigger is
            # still installed when this migration is unapplied, it would
            # recompute the vectors rather than clear them -- confirm intended.
            sql="UPDATE bookwyrm_author SET search_vector = NULL;",
            reverse_sql="UPDATE bookwyrm_author SET search_vector = NULL;",
        ),
    ]

View file

@ -0,0 +1,57 @@
# Generated by Django 3.2.25 on 2024-03-20 15:52
from django.db import migrations
import pgtrigger.compiler
import pgtrigger.migrations
class Migration(migrations.Migration):
    """Rebuild the author/book triggers so book search vectors include aliases."""

    # Applied on top of the migration that added the author search vector.
    dependencies = [
        ("bookwyrm", "0197_author_search_vector"),
    ]

    operations = [
        # Drop the previous author-side reset trigger; a trigger with a new
        # name (reset_book_search_vector_on_author_edit) is added below.
        pgtrigger.migrations.RemoveTrigger(
            model_name="author",
            name="reset_search_vector_on_author_edit",
        ),
        # Drop the book trigger so it can be re-created below under the same
        # name with a func that also reads author aliases.
        pgtrigger.migrations.RemoveTrigger(
            model_name="book",
            name="update_search_vector_on_book_edit",
        ),
        # AFTER trigger on bookwyrm_author, fired on UPDATE OF name or aliases.
        # It sets search_vector = '' on every book linked to the edited author
        # through bookwyrm_book_authors. That write is an UPDATE OF
        # search_vector on bookwyrm_book, which is one of the operations the
        # book trigger below listens for.
        pgtrigger.migrations.AddTrigger(
            model_name="author",
            trigger=pgtrigger.compiler.Trigger(
                name="reset_book_search_vector_on_author_edit",
                sql=pgtrigger.compiler.UpsertTriggerSql(
                    func="WITH updated_books AS (SELECT book_id FROM bookwyrm_book_authors WHERE author_id = new.id ) UPDATE bookwyrm_book SET search_vector = '' FROM updated_books WHERE id = updated_books.book_id;RETURN NEW;",
                    hash="68422c0f29879c5802b82159dde45297eff53e73",
                    operation='UPDATE OF "name", "aliases"',
                    pgid="pgtrigger_reset_book_search_vector_on_author_edit_a50c7",
                    table="bookwyrm_author",
                    when="AFTER",
                ),
            ),
        ),
        # BEFORE trigger on bookwyrm_book, fired on INSERT and on UPDATE OF
        # title, subtitle, series or search_vector. It assembles
        # new.search_vector from:
        #   A: title, parsed as 'english', falling back to 'simple' when the
        #      english parse yields an empty vector
        #   B: subtitle, parsed as 'english'
        #   C: names and aliases of the book's authors, parsed as 'simple'
        #   D: series, parsed as 'english'
        pgtrigger.migrations.AddTrigger(
            model_name="book",
            trigger=pgtrigger.compiler.Trigger(
                name="update_search_vector_on_book_edit",
                sql=pgtrigger.compiler.UpsertTriggerSql(
                    func="WITH author_names AS (SELECT array_to_string(bookwyrm_author.name || bookwyrm_author.aliases, ' ') AS name_and_aliases FROM bookwyrm_author LEFT JOIN bookwyrm_book_authors ON bookwyrm_author.id = bookwyrm_book_authors.author_id WHERE bookwyrm_book_authors.book_id = new.id ) SELECT setweight(coalesce(nullif(to_tsvector('english', new.title), ''), to_tsvector('simple', new.title)), 'A') || setweight(to_tsvector('english', coalesce(new.subtitle, '')), 'B') || (SELECT setweight(to_tsvector('simple', coalesce(array_to_string(array_agg(name_and_aliases), ' '), '')), 'C') FROM author_names) || setweight(to_tsvector('english', coalesce(new.series, '')), 'D') INTO new.search_vector;RETURN NEW;",
                    hash="9324f5ca76a6f5e63931881d62d11da11f595b2c",
                    operation='INSERT OR UPDATE OF "title", "subtitle", "series", "search_vector"',
                    pgid="pgtrigger_update_search_vector_on_book_edit_bec58",
                    table="bookwyrm_book",
                    when="BEFORE",
                ),
            ),
        ),
        migrations.RunSQL(
            # Recalculate search vector for all Books because it now includes
            # Author aliases.
            # The statement writes NULL; the book trigger above listens for
            # UPDATE OF search_vector and fills in new.search_vector before the
            # row is stored.
            sql="UPDATE bookwyrm_book SET search_vector = NULL;",
            reverse_sql="UPDATE bookwyrm_book SET search_vector = NULL;",
        ),
    ]

View file

@ -3,6 +3,7 @@ import re
from typing import Tuple, Any from typing import Tuple, Any
from django.db import models from django.db import models
from django.contrib.postgres.indexes import GinIndex
import pgtrigger import pgtrigger
from bookwyrm import activitypub from bookwyrm import activitypub
@ -71,11 +72,29 @@ class Author(BookDataModel):
class Meta: class Meta:
"""sets up indexes and triggers""" """sets up indexes and triggers"""
# pylint: disable=line-too-long
indexes = (GinIndex(fields=["search_vector"]),)
triggers = [ triggers = [
pgtrigger.Trigger( pgtrigger.Trigger(
name="reset_search_vector_on_author_edit", name="update_search_vector_on_author_edit",
when=pgtrigger.Before,
operation=pgtrigger.Insert
| pgtrigger.UpdateOf("name", "aliases", "search_vector"),
func=format_trigger(
"""new.search_vector :=
-- author name, with priority A
setweight(to_tsvector('simple', new.name), 'A') ||
-- author aliases, with priority B
setweight(to_tsvector('simple', coalesce(array_to_string(new.aliases, ' '), '')), 'B');
RETURN new;
"""
),
),
pgtrigger.Trigger(
name="reset_book_search_vector_on_author_edit",
when=pgtrigger.After, when=pgtrigger.After,
operation=pgtrigger.UpdateOf("name"), operation=pgtrigger.UpdateOf("name", "aliases"),
func=format_trigger( func=format_trigger(
"""WITH updated_books AS ( """WITH updated_books AS (
SELECT book_id SELECT book_id
@ -89,7 +108,7 @@ class Author(BookDataModel):
RETURN new; RETURN new;
""" """
), ),
) ),
] ]
activity_serializer = activitypub.Author activity_serializer = activitypub.Author

View file

@ -246,22 +246,32 @@ class Book(BookDataModel):
operation=pgtrigger.Insert operation=pgtrigger.Insert
| pgtrigger.UpdateOf("title", "subtitle", "series", "search_vector"), | pgtrigger.UpdateOf("title", "subtitle", "series", "search_vector"),
func=format_trigger( func=format_trigger(
"""new.search_vector := """
-- title, with priority A (parse in English, default to simple if empty) WITH author_names AS (
setweight(COALESCE(nullif( SELECT array_to_string(bookwyrm_author.name || bookwyrm_author.aliases, ' ') AS name_and_aliases
to_tsvector('english', new.title), ''),
to_tsvector('simple', new.title)), 'A') ||
-- subtitle, with priority B (always in English?)
setweight(to_tsvector('english', COALESCE(new.subtitle, '')), 'B') ||
-- list of authors, with priority C (TODO: add aliases?, bookwyrm-social#3063)
(SELECT setweight(to_tsvector('simple', COALESCE(array_to_string(ARRAY_AGG(bookwyrm_author.name), ' '), '')), 'C')
FROM bookwyrm_author FROM bookwyrm_author
LEFT JOIN bookwyrm_book_authors LEFT JOIN bookwyrm_book_authors
ON bookwyrm_author.id = bookwyrm_book_authors.author_id ON bookwyrm_author.id = bookwyrm_book_authors.author_id
WHERE bookwyrm_book_authors.book_id = new.id WHERE bookwyrm_book_authors.book_id = new.id
)
SELECT
-- title, with priority A (parse in English, default to simple if empty)
setweight(COALESCE(nullif(
to_tsvector('english', new.title), ''),
to_tsvector('simple', new.title)), 'A') ||
-- subtitle, with priority B (always in English?)
setweight(to_tsvector('english', COALESCE(new.subtitle, '')), 'B') ||
-- list of authors names and aliases (with priority C)
(SELECT setweight(to_tsvector('simple', COALESCE(array_to_string(ARRAY_AGG(name_and_aliases), ' '), '')), 'C')
FROM author_names
) || ) ||
--- last: series name, with lowest priority --- last: series name, with lowest priority
setweight(to_tsvector('english', COALESCE(new.series, '')), 'D'); setweight(to_tsvector('english', COALESCE(new.series, '')), 'D')
INTO new.search_vector;
RETURN new; RETURN new;
""" """
), ),

View file

@ -0,0 +1,87 @@
""" test searching for authors """
from django.test import TestCase
from django.contrib.postgres.search import SearchRank, SearchQuery
from django.db.models import F
from bookwyrm import models
class AuthorSearch(TestCase):
    """full-text author search over names and aliases"""

    @classmethod
    def setUpTestData(cls):
        """two authors, where one lists the other's name among its aliases"""
        cls.bob = models.Author.objects.create(
            name="Bob",
            aliases=["Robertus", "Alice"],
        )
        cls.alice = models.Author.objects.create(name="Alice")

    def test_search(self):
        """a name carried by a single author returns only that author"""
        found = list(self._search("Bob"))
        self.assertEqual(len(found), 1)
        self.assertEqual(found[0], self.bob)

    def test_alias_priority(self):
        """an alias match is returned, but ranked below a name match"""
        found = list(self._search("Alice"))
        self.assertEqual(len(found), 2)
        self.assertEqual(found[0], self.alice)

    def _search_first(self, query):
        """best-ranked result of `_search` for `query`"""
        return self._search(query, return_first=True)

    @staticmethod
    def _search(query, *, return_first=False):
        """authors whose search vector matches `query`, highest rank first

        With `return_first` set, only the top result is returned instead of
        the whole queryset.
        """
        min_confidence = 0
        tsquery = SearchQuery(query, config="simple")
        ranking = SearchRank(F("search_vector"), tsquery)

        matches = models.Author.objects.filter(search_vector=tsquery)
        matches = matches.annotate(rank=ranking)
        matches = matches.filter(rank__gt=min_confidence)
        matches = matches.order_by("-rank")

        return matches.first() if return_first else matches
class SearchVectorTest(TestCase):
    """the stored author search_vector has the expected lexemes and weights"""

    def test_search_vector_simple(self):
        """a name on its own becomes a single weight-A lexeme"""
        mary = self._create_author("Mary")
        self.assertEqual(mary.search_vector, "'mary':1A")

    def test_search_vector_aliases(self):
        """aliases follow the name and carry weight B"""
        mary = self._create_author("Mary", aliases=["Maria", "Example"])
        self.assertEqual(mary.search_vector, "'example':3B 'maria':2B 'mary':1A")

    def test_search_vector_parse_author(self):
        """names and aliases are kept verbatim: no stemming, no stop words"""
        writer = self._create_author("Writes", aliases=["Reads"])
        self.assertEqual(writer.search_vector, "'reads':2B 'writes':1A")

    def test_search_vector_on_update(self):
        """renaming an author replaces the stored vector"""
        renamed = self._create_author("Mary")
        self.assertEqual(renamed.search_vector, "'mary':1A")

        renamed.name = "Example"
        renamed.save(broadcast=False)
        renamed.refresh_from_db()

        self.assertEqual(renamed.search_vector, "'example':1A")

    @staticmethod
    def _create_author(name, /, *, aliases=None):
        """create an author and reload it so database-side values are visible"""
        created = models.Author.objects.create(
            name=name, aliases=aliases if aliases else []
        )
        created.refresh_from_db()
        return created

View file

@ -14,6 +14,13 @@ class BookSearch(TestCase):
@classmethod @classmethod
def setUpTestData(self): # pylint: disable=bad-classmethod-argument def setUpTestData(self): # pylint: disable=bad-classmethod-argument
"""we need basic test data and mocks""" """we need basic test data and mocks"""
self.first_author = models.Author.objects.create(
name="Author One", aliases=["The First"]
)
self.second_author = models.Author.objects.create(
name="Author Two", aliases=["The Second"]
)
self.work = models.Work.objects.create(title="Example Work") self.work = models.Work.objects.create(title="Example Work")
self.first_edition = models.Edition.objects.create( self.first_edition = models.Edition.objects.create(
@ -23,6 +30,8 @@ class BookSearch(TestCase):
physical_format="Paperback", physical_format="Paperback",
published_date=datetime.datetime(2019, 4, 9, 0, 0, tzinfo=timezone.utc), published_date=datetime.datetime(2019, 4, 9, 0, 0, tzinfo=timezone.utc),
) )
self.first_edition.authors.add(self.first_author)
self.second_edition = models.Edition.objects.create( self.second_edition = models.Edition.objects.create(
title="Another Edition", title="Another Edition",
parent_work=self.work, parent_work=self.work,
@ -30,19 +39,34 @@ class BookSearch(TestCase):
openlibrary_key="hello", openlibrary_key="hello",
pages=150, pages=150,
) )
self.second_edition.authors.add(self.first_author)
self.second_edition.authors.add(self.second_author)
self.third_edition = models.Edition.objects.create( self.third_edition = models.Edition.objects.create(
title="Another Edition with annoying ISBN", title="Another Edition with annoying ISBN",
parent_work=self.work, parent_work=self.work,
isbn_10="022222222X", isbn_10="022222222X",
) )
self.third_edition.authors.add(self.first_author)
self.third_edition.authors.add(self.second_author)
def test_search(self): def test_search(self):
"""search for a book in the db""" """search for a book in the db"""
# title/author # title
results = book_search.search("Example") results = book_search.search("Example")
self.assertEqual(len(results), 1) self.assertEqual(len(results), 1)
self.assertEqual(results[0], self.first_edition) self.assertEqual(results[0], self.first_edition)
# author
results = book_search.search("One")
self.assertEqual(len(results), 1)
self.assertEqual(results[0], self.first_edition)
# author alias
results = book_search.search("First")
self.assertEqual(len(results), 1)
self.assertEqual(results[0], self.first_edition)
# isbn # isbn
results = book_search.search("0000000000") results = book_search.search("0000000000")
self.assertEqual(len(results), 1) self.assertEqual(len(results), 1)
@ -155,8 +179,17 @@ class SearchVectorTest(TestCase):
"""search vector with subtitle and series""" """search vector with subtitle and series"""
# for a book like this we call `to_tsvector("Book Long Mary Bunch")`, hence the # for a book like this we call `to_tsvector("Book Long Mary Bunch")`, hence the
# indexes in the search vector. (priority "D" is the default, and never shown.) # indexes in the search vector. (priority "D" is the default, and never shown.)
book = self._create_book("Book", "Mary", subtitle="Long", series="Bunch") book = self._create_book(
self.assertEqual(book.search_vector, "'book':1A 'bunch':4 'long':2B 'mary':3C") "Book",
"Mary",
subtitle="Long",
series="Bunch",
author_alias=["Maria", "Mary Ann"],
)
self.assertEqual(
book.search_vector,
"'ann':6C 'book':1A 'bunch':7 'long':2B 'maria':4C 'mary':3C,5C",
)
def test_search_vector_parse_book(self): def test_search_vector_parse_book(self):
"""book parts are parsed in english""" """book parts are parsed in english"""
@ -170,8 +203,8 @@ class SearchVectorTest(TestCase):
def test_search_vector_parse_author(self): def test_search_vector_parse_author(self):
"""author name is not stem'd or affected by stop words""" """author name is not stem'd or affected by stop words"""
book = self._create_book("Writing", "Writes") book = self._create_book("Writing", "Writes", author_alias=["Reads"])
self.assertEqual(book.search_vector, "'write':1A 'writes':2C") self.assertEqual(book.search_vector, "'reads':3C 'write':1A 'writes':2C")
book = self._create_book("She Is Writing", "She Writes") book = self._create_book("She Is Writing", "She Writes")
self.assertEqual(book.search_vector, "'she':4C 'write':3A 'writes':5C") self.assertEqual(book.search_vector, "'she':4C 'write':3A 'writes':5C")
@ -218,6 +251,13 @@ class SearchVectorTest(TestCase):
book.refresh_from_db() book.refresh_from_db()
self.assertEqual(book.search_vector, "'goodby':3A 'jeremy':4C 'long':2A") self.assertEqual(book.search_vector, "'goodby':3A 'jeremy':4C 'long':2A")
author.aliases = ["Example"]
author.save(broadcast=False)
book.refresh_from_db()
self.assertEqual(
book.search_vector, "'example':5C 'goodby':3A 'jeremy':4C 'long':2A"
)
def test_search_vector_on_author_delete(self): def test_search_vector_on_author_delete(self):
"""update search when an author is deleted""" """update search when an author is deleted"""
book = self._create_book("The Long Goodbye", "The Rays") book = self._create_book("The Long Goodbye", "The Rays")
@ -274,7 +314,7 @@ class SearchVectorUpdates(TestCase):
def setUp(self): def setUp(self):
"""we need basic test data and mocks""" """we need basic test data and mocks"""
self.work = models.Work.objects.create(title="This Work") self.work = models.Work.objects.create(title="This Work")
self.author = models.Author.objects.create(name="Name") self.author = models.Author.objects.create(name="Name", aliases=["Alias"])
self.edition = models.Edition.objects.create( self.edition = models.Edition.objects.create(
title="First Edition of Work", title="First Edition of Work",
subtitle="Some Extra Words Are Good", subtitle="Some Extra Words Are Good",
@ -363,13 +403,18 @@ class SearchVectorUpdates(TestCase):
def test_search_after_updated_author_name(self):
    """book found under new author name and alias, not under the old ones"""
    # before the edit: findable by the current name and alias only
    self.assertEqual(self.edition, self._search_first("Name"))
    self.assertEqual(self.edition, self._search_first("Alias"))
    self.assertFalse(self._search("Identifier"))
    self.assertFalse(self._search("Another"))

    self.author.name = "Identifier"
    self.author.aliases = ["Another"]
    self.author.save(broadcast=False)

    # after the edit: the old name and the old alias no longer match. The
    # old alias is "Alias" (the term asserted findable above), so that exact
    # term is the one that has to stop matching.
    self.assertFalse(self._search("Name"))
    self.assertFalse(self._search("Alias"))
    self.assertEqual(self.edition, self._search_first("Identifier"))
    self.assertEqual(self.edition, self._search_first("Another"))
    # the title is untouched by the author edit and must still match
    self.assertEqual(self.edition, self._search_first("Work"))
def _search_first(self, query): def _search_first(self, query):

View file

@ -2,8 +2,9 @@
import re import re
from django.contrib.postgres.search import TrigramSimilarity from django.contrib.postgres.search import TrigramSimilarity, SearchRank, SearchQuery
from django.core.paginator import Paginator from django.core.paginator import Paginator
from django.db.models import F
from django.db.models.functions import Greatest from django.db.models.functions import Greatest
from django.http import JsonResponse from django.http import JsonResponse
from django.template.response import TemplateResponse from django.template.response import TemplateResponse
@ -94,26 +95,28 @@ def book_search(request):
def author_search(request):
    """search for an author

    Matches the ``q`` GET parameter against the author search vector using
    the 'simple' text search configuration, orders the hits by rank and
    renders one page of them with ``search/author.html``. The ``page`` GET
    parameter selects the page.
    """
    # default to an empty string so a request without ``q`` yields an empty
    # result page instead of failing on ``None.strip()``
    query = request.GET.get("q", "").strip()
    search_query = SearchQuery(query, config="simple")
    min_confidence = 0

    results = (
        models.Author.objects.filter(search_vector=search_query)
        .annotate(rank=SearchRank(F("search_vector"), search_query))
        .filter(rank__gt=min_confidence)
        # "id" breaks rank ties so that consecutive pages neither repeat nor
        # drop authors that share the same rank
        .order_by("-rank", "id")
    )

    paginated = Paginator(results, PAGE_LENGTH)
    page = paginated.get_page(request.GET.get("page"))

    data = {
        "type": "author",
        "query": query,
        "results": page,
        "page_range": paginated.get_elided_page_range(
            page.number, on_each_side=2, on_ends=1
        ),
    }
    return TemplateResponse(request, "search/author.html", data)