Merge pull request #3325 from Minnozz/author-search-vector

Rework author search
2025-02-01 20:02:21 +00:00 · 2024-03-25 14:41:25 -07:00 · 2024-03-25 14:41:25 -07:00 · 7192449b21
commit 7192449b21
parent f11c80162a b8995bd4b1
7 changed files with 304 additions and 42 deletions
--- a/bookwyrm/migrations/0197_author_search_vector.py
+++ b/bookwyrm/migrations/0197_author_search_vector.py
@ -0,0 +1,41 @@
+# Generated by Django 3.2.25 on 2024-03-20 15:15
+
+import django.contrib.postgres.indexes
+from django.db import migrations
+import pgtrigger.compiler
+import pgtrigger.migrations
+
+
+class Migration(migrations.Migration):
+    """Index Author.search_vector and keep it filled in by a database trigger."""
+
+    dependencies = [
+        ("bookwyrm", "0196_merge_pr3134_into_main"),
+    ]
+
+    operations = [
+        # GIN index over the column that the trigger below populates.
+        migrations.AddIndex(
+            model_name="author",
+            index=django.contrib.postgres.indexes.GinIndex(
+                fields=["search_vector"], name="bookwyrm_au_search__b050a8_gin"
+            ),
+        ),
+        # BEFORE INSERT / UPDATE OF name, aliases, search_vector: rebuild the
+        # vector from the name (weight A) and the space-joined aliases
+        # (weight B), both parsed with the 'simple' configuration.
+        pgtrigger.migrations.AddTrigger(
+            model_name="author",
+            trigger=pgtrigger.compiler.Trigger(
+                name="update_search_vector_on_author_edit",
+                sql=pgtrigger.compiler.UpsertTriggerSql(
+                    func="new.search_vector := setweight(to_tsvector('simple', new.name), 'A') || setweight(to_tsvector('simple', coalesce(array_to_string(new.aliases, ' '), '')), 'B');RETURN NEW;",
+                    hash="b97919016236d74d0ade51a0769a173ea269da64",
+                    operation='INSERT OR UPDATE OF "name", "aliases", "search_vector"',
+                    pgid="pgtrigger_update_search_vector_on_author_edit_c61cb",
+                    table="bookwyrm_author",
+                    when="BEFORE",
+                ),
+            ),
+        ),
+        migrations.RunSQL(
+            # Calculate search vector for all Authors.
+            # The NULL written here is never stored: updating search_vector
+            # fires the BEFORE trigger added above, which overwrites the value
+            # with a freshly computed vector for every row.
+            # The reverse statement is the same UPDATE.
+            sql="UPDATE bookwyrm_author SET search_vector = NULL;",
+            reverse_sql="UPDATE bookwyrm_author SET search_vector = NULL;",
+        ),
+    ]
--- a/bookwyrm/migrations/0198_book_search_vector_author_aliases.py
+++ b/bookwyrm/migrations/0198_book_search_vector_author_aliases.py
@ -0,0 +1,57 @@
+# Generated by Django 3.2.25 on 2024-03-20 15:52
+
+from django.db import migrations
+import pgtrigger.compiler
+import pgtrigger.migrations
+
+
+class Migration(migrations.Migration):
+    """Add author aliases to Book.search_vector and rename the author-side trigger."""
+
+    dependencies = [
+        ("bookwyrm", "0197_author_search_vector"),
+    ]
+
+    operations = [
+        # Drop the old author trigger; it is re-added below under a new name
+        # and now also reacts to changes of "aliases".
+        pgtrigger.migrations.RemoveTrigger(
+            model_name="author",
+            name="reset_search_vector_on_author_edit",
+        ),
+        # Drop the book trigger so it can be recreated with the new function.
+        pgtrigger.migrations.RemoveTrigger(
+            model_name="book",
+            name="update_search_vector_on_book_edit",
+        ),
+        # AFTER UPDATE OF name, aliases on an author: write '' into the
+        # search_vector of every book linked to that author. That update in
+        # turn fires the book trigger below, which recomputes the vector.
+        pgtrigger.migrations.AddTrigger(
+            model_name="author",
+            trigger=pgtrigger.compiler.Trigger(
+                name="reset_book_search_vector_on_author_edit",
+                sql=pgtrigger.compiler.UpsertTriggerSql(
+                    func="WITH updated_books AS (SELECT book_id FROM bookwyrm_book_authors WHERE author_id = new.id ) UPDATE bookwyrm_book SET search_vector = '' FROM updated_books WHERE id = updated_books.book_id;RETURN NEW;",
+                    hash="68422c0f29879c5802b82159dde45297eff53e73",
+                    operation='UPDATE OF "name", "aliases"',
+                    pgid="pgtrigger_reset_book_search_vector_on_author_edit_a50c7",
+                    table="bookwyrm_author",
+                    when="AFTER",
+                ),
+            ),
+        ),
+        # BEFORE INSERT / UPDATE OF title, subtitle, series, search_vector on a
+        # book: title (A), subtitle (B), names and aliases of all linked
+        # authors (C), series (D).
+        pgtrigger.migrations.AddTrigger(
+            model_name="book",
+            trigger=pgtrigger.compiler.Trigger(
+                name="update_search_vector_on_book_edit",
+                sql=pgtrigger.compiler.UpsertTriggerSql(
+                    func="WITH author_names AS (SELECT array_to_string(bookwyrm_author.name || bookwyrm_author.aliases, ' ') AS name_and_aliases FROM bookwyrm_author LEFT JOIN bookwyrm_book_authors ON bookwyrm_author.id = bookwyrm_book_authors.author_id WHERE bookwyrm_book_authors.book_id = new.id ) SELECT setweight(coalesce(nullif(to_tsvector('english', new.title), ''), to_tsvector('simple', new.title)), 'A') || setweight(to_tsvector('english', coalesce(new.subtitle, '')), 'B') || (SELECT setweight(to_tsvector('simple', coalesce(array_to_string(array_agg(name_and_aliases), ' '), '')), 'C') FROM author_names) || setweight(to_tsvector('english', coalesce(new.series, '')), 'D') INTO new.search_vector;RETURN NEW;",
+                    hash="9324f5ca76a6f5e63931881d62d11da11f595b2c",
+                    operation='INSERT OR UPDATE OF "title", "subtitle", "series", "search_vector"',
+                    pgid="pgtrigger_update_search_vector_on_book_edit_bec58",
+                    table="bookwyrm_book",
+                    when="BEFORE",
+                ),
+            ),
+        ),
+        migrations.RunSQL(
+            # Recalculate search vector for all Books because it now includes
+            # Author aliases.
+            # The NULL is only a way to touch the column: updating
+            # search_vector fires the book trigger added above, which replaces
+            # the value with the recomputed vector.
+            sql="UPDATE bookwyrm_book SET search_vector = NULL;",
+            reverse_sql="UPDATE bookwyrm_book SET search_vector = NULL;",
+        ),
+    ]
--- a/bookwyrm/models/author.py
+++ b/bookwyrm/models/author.py
@ -3,6 +3,7 @@ import re
 from typing import Tuple, Any

 from django.db import models
+from django.contrib.postgres.indexes import GinIndex
 import pgtrigger

 from bookwyrm import activitypub
@ -71,11 +72,29 @@ class Author(BookDataModel):
    class Meta:
        """sets up indexes and triggers"""

+        # pylint: disable=line-too-long
+
+        indexes = (GinIndex(fields=["search_vector"]),)
        triggers = [
            pgtrigger.Trigger(
-                name="reset_search_vector_on_author_edit",
+                name="update_search_vector_on_author_edit",
+                when=pgtrigger.Before,
+                operation=pgtrigger.Insert
+                | pgtrigger.UpdateOf("name", "aliases", "search_vector"),
+                func=format_trigger(
+                    """new.search_vector :=
+                    -- author name, with priority A
+                    setweight(to_tsvector('simple', new.name), 'A') ||
+                    -- author aliases, with priority B
+                    setweight(to_tsvector('simple', coalesce(array_to_string(new.aliases, ' '), '')), 'B');
+                    RETURN new;
+                """
+                ),
+            ),
+            pgtrigger.Trigger(
+                name="reset_book_search_vector_on_author_edit",
                when=pgtrigger.After,
-                operation=pgtrigger.UpdateOf("name"),
+                operation=pgtrigger.UpdateOf("name", "aliases"),
                func=format_trigger(
                    """WITH updated_books AS (
                         SELECT book_id
@ -89,7 +108,7 @@ class Author(BookDataModel):
                    RETURN new;
                """
                ),
-            )
+            ),
        ]

    activity_serializer = activitypub.Author
--- a/bookwyrm/models/book.py
+++ b/bookwyrm/models/book.py
@ -246,24 +246,34 @@ class Book(BookDataModel):
                operation=pgtrigger.Insert
                | pgtrigger.UpdateOf("title", "subtitle", "series", "search_vector"),
                func=format_trigger(
-                    """new.search_vector :=
-                         -- title, with priority A (parse in English, default to simple if empty)
-                         setweight(COALESCE(nullif(
-                                       to_tsvector('english', new.title), ''),
-                                       to_tsvector('simple', new.title)), 'A') ||
-                         -- subtitle, with priority B (always in English?)
-                         setweight(to_tsvector('english', COALESCE(new.subtitle, '')), 'B') ||
-                         -- list of authors, with priority C (TODO: add aliases?, bookwyrm-social#3063)
-                         (SELECT setweight(to_tsvector('simple', COALESCE(array_to_string(ARRAY_AGG(bookwyrm_author.name), ' '), '')), 'C')
-                           FROM bookwyrm_author
-                           LEFT JOIN bookwyrm_book_authors
-                               ON bookwyrm_author.id = bookwyrm_book_authors.author_id
-                           WHERE bookwyrm_book_authors.book_id = new.id
-                         ) ||
-                         --- last: series name, with lowest priority
-                         setweight(to_tsvector('english', COALESCE(new.series, '')), 'D');
-                       RETURN new;
-                """
+                    """
+                    WITH author_names AS (
+                        SELECT array_to_string(bookwyrm_author.name || bookwyrm_author.aliases, ' ') AS name_and_aliases
+                            FROM bookwyrm_author
+                        LEFT JOIN bookwyrm_book_authors
+                            ON bookwyrm_author.id = bookwyrm_book_authors.author_id
+                        WHERE bookwyrm_book_authors.book_id = new.id
+                    )
+                    SELECT
+                        -- title, with priority A (parse in English, default to simple if empty)
+                        setweight(COALESCE(nullif(
+                            to_tsvector('english', new.title), ''),
+                            to_tsvector('simple', new.title)), 'A') ||
+
+                        -- subtitle, with priority B (always in English?)
+                        setweight(to_tsvector('english', COALESCE(new.subtitle, '')), 'B') ||
+
+                        -- list of authors names and aliases (with priority C)
+                        (SELECT setweight(to_tsvector('simple', COALESCE(array_to_string(ARRAY_AGG(name_and_aliases), ' '), '')), 'C')
+                            FROM author_names
+                        ) ||
+
+                        --- last: series name, with lowest priority
+                        setweight(to_tsvector('english', COALESCE(new.series, '')), 'D')
+
+                        INTO new.search_vector;
+                    RETURN new;
+                    """
                ),
            )
        ]
--- a/bookwyrm/tests/test_author_search.py
+++ b/bookwyrm/tests/test_author_search.py
@ -0,0 +1,87 @@
+""" test searching for authors """
+from django.test import TestCase
+
+from django.contrib.postgres.search import SearchRank, SearchQuery
+from django.db.models import F
+
+from bookwyrm import models
+
+
+class AuthorSearch(TestCase):
+    """full-text search over author names and aliases"""
+
+    @classmethod
+    def setUpTestData(cls):
+        """create two authors; "Alice" is an alias of bob and the name of alice"""
+        cls.bob = models.Author.objects.create(
+            name="Bob", aliases=["Robertus", "Alice"]
+        )
+        cls.alice = models.Author.objects.create(name="Alice")
+
+    def test_search(self):
+        """search for an author in the db"""
+        results = self._search("Bob")
+        self.assertEqual(len(results), 1)
+        self.assertEqual(results[0], self.bob)
+
+    def test_alias_priority(self):
+        """aliases should be included, with lower priority than name"""
+        # both authors match "Alice": alice by name, bob by alias;
+        # the name match is expected to be ranked first
+        results = self._search("Alice")
+        self.assertEqual(len(results), 2)
+        self.assertEqual(results[0], self.alice)
+
+    def _search_first(self, query):
+        """wrapper around _search that returns only the top-ranked author"""
+        return self._search(query, return_first=True)
+
+    @staticmethod
+    def _search(query, *, return_first=False):
+        """author search

+        Matches `query` (parsed with the "simple" config) against
+        Author.search_vector and orders the matches by descending rank.
+        Returns the queryset, or only its first element (None when there is
+        no match) if `return_first` is true.
+        """
+        search_query = SearchQuery(query, config="simple")
+        # only keep results with a strictly positive rank
+        min_confidence = 0
+
+        results = (
+            models.Author.objects.filter(search_vector=search_query)
+            .annotate(rank=SearchRank(F("search_vector"), search_query))
+            .filter(rank__gt=min_confidence)
+            .order_by("-rank")
+        )
+        if return_first:
+            return results.first()
+        return results
+
+
+class SearchVectorTest(TestCase):
+    """check search_vector is computed correctly"""
+
+    def test_search_vector_simple(self):
+        """simplest search vector"""
+        # the name alone: one lexeme at position 1 with weight A
+        author = self._create_author("Mary")
+        self.assertEqual(author.search_vector, "'mary':1A")
+
+    def test_search_vector_aliases(self):
+        """author aliases should be included with lower priority"""
+        # aliases get weight B and their positions continue after the name
+        author = self._create_author("Mary", aliases=["Maria", "Example"])
+        self.assertEqual(author.search_vector, "'example':3B 'maria':2B 'mary':1A")
+
+    def test_search_vector_parse_author(self):
+        """author name and alias is not stem'd or affected by stop words"""
+        # "Writes" / "Reads" are expected to stay as-is (lowercased only)
+        author = self._create_author("Writes", aliases=["Reads"])
+        self.assertEqual(author.search_vector, "'reads':2B 'writes':1A")
+
+    def test_search_vector_on_update(self):
+        """make sure that search_vector is being set correctly on edit"""
+        author = self._create_author("Mary")
+        self.assertEqual(author.search_vector, "'mary':1A")
+
+        author.name = "Example"
+        author.save(broadcast=False)
+        # reload to see the value the database computed for the new name
+        author.refresh_from_db()
+        self.assertEqual(author.search_vector, "'example':1A")
+
+    @staticmethod
+    def _create_author(name, /, *, aliases=None):
+        """quickly create an author and reload it from the database"""
+        author = models.Author.objects.create(name=name, aliases=aliases or [])
+        # search_vector is filled in by the database, not by the Python
+        # object, so it has to be read back before it can be asserted on
+        author.refresh_from_db()
+        return author
--- a/bookwyrm/tests/test_book_search.py
+++ b/bookwyrm/tests/test_book_search.py
@ -14,6 +14,13 @@ class BookSearch(TestCase):
    @classmethod
    def setUpTestData(self):  # pylint: disable=bad-classmethod-argument
        """we need basic test data and mocks"""
+        self.first_author = models.Author.objects.create(
+            name="Author One", aliases=["The First"]
+        )
+        self.second_author = models.Author.objects.create(
+            name="Author Two", aliases=["The Second"]
+        )
+
        self.work = models.Work.objects.create(title="Example Work")

        self.first_edition = models.Edition.objects.create(
@ -23,6 +30,8 @@ class BookSearch(TestCase):
            physical_format="Paperback",
            published_date=datetime.datetime(2019, 4, 9, 0, 0, tzinfo=timezone.utc),
        )
+        self.first_edition.authors.add(self.first_author)
+
        self.second_edition = models.Edition.objects.create(
            title="Another Edition",
            parent_work=self.work,
@ -30,19 +39,34 @@ class BookSearch(TestCase):
            openlibrary_key="hello",
            pages=150,
        )
+        self.second_edition.authors.add(self.first_author)
+        self.second_edition.authors.add(self.second_author)
+
        self.third_edition = models.Edition.objects.create(
            title="Another Edition with annoying ISBN",
            parent_work=self.work,
            isbn_10="022222222X",
        )
+        self.third_edition.authors.add(self.first_author)
+        self.third_edition.authors.add(self.second_author)

    def test_search(self):
        """search for a book in the db"""
-        # title/author
+        # title
        results = book_search.search("Example")
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0], self.first_edition)

+        # author
+        results = book_search.search("One")
+        self.assertEqual(len(results), 1)
+        self.assertEqual(results[0], self.first_edition)
+
+        # author alias
+        results = book_search.search("First")
+        self.assertEqual(len(results), 1)
+        self.assertEqual(results[0], self.first_edition)
+
        # isbn
        results = book_search.search("0000000000")
        self.assertEqual(len(results), 1)
@ -155,8 +179,17 @@ class SearchVectorTest(TestCase):
        """search vector with subtitle and series"""
        # for a book like this we call `to_tsvector("Book Long Mary Bunch")`, hence the
        # indexes in the search vector. (priority "D" is the default, and never shown.)
-        book = self._create_book("Book", "Mary", subtitle="Long", series="Bunch")
-        self.assertEqual(book.search_vector, "'book':1A 'bunch':4 'long':2B 'mary':3C")
+        book = self._create_book(
+            "Book",
+            "Mary",
+            subtitle="Long",
+            series="Bunch",
+            author_alias=["Maria", "Mary Ann"],
+        )
+        self.assertEqual(
+            book.search_vector,
+            "'ann':6C 'book':1A 'bunch':7 'long':2B 'maria':4C 'mary':3C,5C",
+        )

    def test_search_vector_parse_book(self):
        """book parts are parsed in english"""
@ -170,8 +203,8 @@ class SearchVectorTest(TestCase):

    def test_search_vector_parse_author(self):
        """author name is not stem'd or affected by stop words"""
-        book = self._create_book("Writing", "Writes")
-        self.assertEqual(book.search_vector, "'write':1A 'writes':2C")
+        book = self._create_book("Writing", "Writes", author_alias=["Reads"])
+        self.assertEqual(book.search_vector, "'reads':3C 'write':1A 'writes':2C")

        book = self._create_book("She Is Writing", "She Writes")
        self.assertEqual(book.search_vector, "'she':4C 'write':3A 'writes':5C")
@ -218,6 +251,13 @@ class SearchVectorTest(TestCase):
        book.refresh_from_db()
        self.assertEqual(book.search_vector, "'goodby':3A 'jeremy':4C 'long':2A")

+        author.aliases = ["Example"]
+        author.save(broadcast=False)
+        book.refresh_from_db()
+        self.assertEqual(
+            book.search_vector, "'example':5C 'goodby':3A 'jeremy':4C 'long':2A"
+        )
+
    def test_search_vector_on_author_delete(self):
        """update search when an author is deleted"""
        book = self._create_book("The Long Goodbye", "The Rays")
@ -274,7 +314,7 @@ class SearchVectorUpdates(TestCase):
    def setUp(self):
        """we need basic test data and mocks"""
        self.work = models.Work.objects.create(title="This Work")
-        self.author = models.Author.objects.create(name="Name")
+        self.author = models.Author.objects.create(name="Name", aliases=["Alias"])
        self.edition = models.Edition.objects.create(
            title="First Edition of Work",
            subtitle="Some Extra Words Are Good",
@ -363,13 +403,18 @@ class SearchVectorUpdates(TestCase):
    def test_search_after_updated_author_name(self):
        """book found under new author name"""
+        # before the edit: found by the current name and alias, not by the new ones
        self.assertEqual(self.edition, self._search_first("Name"))
+        self.assertEqual(self.edition, self._search_first("Alias"))
        self.assertFalse(self._search("Identifier"))
+        self.assertFalse(self._search("Another"))

        self.author.name = "Identifier"
+        self.author.aliases = ["Another"]
        self.author.save(broadcast=False)

+        # after the edit: the old name and the old alias no longer match
+        # (search for the exact alias that matched above, "Alias") ...
        self.assertFalse(self._search("Name"))
+        self.assertFalse(self._search("Alias"))
+        # ... while the new name, the new alias and the title do
        self.assertEqual(self.edition, self._search_first("Identifier"))
+        self.assertEqual(self.edition, self._search_first("Another"))
        self.assertEqual(self.edition, self._search_first("Work"))

    def _search_first(self, query):
--- a/bookwyrm/views/search.py
+++ b/bookwyrm/views/search.py
@ -2,8 +2,9 @@

 import re

-from django.contrib.postgres.search import TrigramSimilarity
+from django.contrib.postgres.search import TrigramSimilarity, SearchRank, SearchQuery
 from django.core.paginator import Paginator
+from django.db.models import F
 from django.db.models.functions import Greatest
 from django.http import JsonResponse
 from django.template.response import TemplateResponse
@ -94,26 +95,28 @@ def book_search(request):

 def author_search(request):
    """search for an author"""
-    query = request.GET.get("q")
-    query = query.strip()
-    data = {"type": "author", "query": query}
+    # Default to "" so a request without a "q" parameter yields an empty
+    # result page instead of failing on None.strip().
+    query = request.GET.get("q", "").strip()
+    # Author.search_vector is built with the 'simple' configuration, so the
+    # query is parsed the same way.
+    search_query = SearchQuery(query, config="simple")
+    # only keep results with a strictly positive rank
+    min_confidence = 0

    results = (
-        models.Author.objects.annotate(
-            similarity=TrigramSimilarity("name", query),
-        )
-        .filter(
-            similarity__gt=0.1,
-        )
-        .order_by("-similarity")
+        models.Author.objects.filter(search_vector=search_query)
+        .annotate(rank=SearchRank(F("search_vector"), search_query))
+        .filter(rank__gt=min_confidence)
+        .order_by("-rank")
    )

    paginated = Paginator(results, PAGE_LENGTH)
    page = paginated.get_page(request.GET.get("page"))
-    data["results"] = page
-    data["page_range"] = paginated.get_elided_page_range(
-        page.number, on_each_side=2, on_ends=1
-    )
+
+    # template context: the current page of authors plus an elided page range
+    data = {
+        "type": "author",
+        "query": query,
+        "results": page,
+        "page_range": paginated.get_elided_page_range(
+            page.number, on_each_side=2, on_ends=1
+        ),
+    }
    return TemplateResponse(request, "search/author.html", data)