mirror of
https://github.com/bookwyrm-social/bookwyrm.git
synced 2024-11-25 11:01:12 +00:00
Merge pull request #3325 from Minnozz/author-search-vector
Rework author search
This commit is contained in:
commit
7192449b21
7 changed files with 304 additions and 42 deletions
41
bookwyrm/migrations/0197_author_search_vector.py
Normal file
41
bookwyrm/migrations/0197_author_search_vector.py
Normal file
|
@ -0,0 +1,41 @@
|
|||
# Generated by Django 3.2.25 on 2024-03-20 15:15
|
||||
|
||||
import django.contrib.postgres.indexes
|
||||
from django.db import migrations
|
||||
import pgtrigger.compiler
|
||||
import pgtrigger.migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """index the author search vector and maintain it with a trigger"""

    dependencies = [("bookwyrm", "0196_merge_pr3134_into_main")]

    operations = [
        # GIN index over Author.search_vector.
        migrations.AddIndex(
            model_name="author",
            index=django.contrib.postgres.indexes.GinIndex(
                name="bookwyrm_au_search__b050a8_gin",
                fields=["search_vector"],
            ),
        ),
        # BEFORE trigger that rebuilds search_vector from the author's name
        # (weight A) and aliases (weight B) on insert, and on any update that
        # touches name, aliases or search_vector itself.
        pgtrigger.migrations.AddTrigger(
            model_name="author",
            trigger=pgtrigger.compiler.Trigger(
                name="update_search_vector_on_author_edit",
                sql=pgtrigger.compiler.UpsertTriggerSql(
                    # Adjacent literals are concatenated into one statement;
                    # the resulting text is the generated SQL, unchanged.
                    func=(
                        "new.search_vector := "
                        "setweight(to_tsvector('simple', new.name), 'A') || "
                        "setweight(to_tsvector('simple', coalesce(array_to_string(new.aliases, ' '), '')), 'B');"
                        "RETURN NEW;"
                    ),
                    hash="b97919016236d74d0ade51a0769a173ea269da64",
                    operation='INSERT OR UPDATE OF "name", "aliases", "search_vector"',
                    pgid="pgtrigger_update_search_vector_on_author_edit_c61cb",
                    table="bookwyrm_author",
                    when="BEFORE",
                ),
            ),
        ),
        # Calculate search vector for all Authors: writing to search_vector is
        # one of the operations the trigger above listens for, so this UPDATE
        # makes it recompute the column for every row.
        migrations.RunSQL(
            sql="UPDATE bookwyrm_author SET search_vector = NULL;",
            reverse_sql="UPDATE bookwyrm_author SET search_vector = NULL;",
        ),
    ]
|
|
@ -0,0 +1,57 @@
|
|||
# Generated by Django 3.2.25 on 2024-03-20 15:52
|
||||
|
||||
from django.db import migrations
|
||||
import pgtrigger.compiler
|
||||
import pgtrigger.migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """include author aliases in the book search vector"""

    dependencies = [("bookwyrm", "0197_author_search_vector")]

    operations = [
        # Drop the triggers that are re-created below under new definitions.
        pgtrigger.migrations.RemoveTrigger(
            model_name="author",
            name="reset_search_vector_on_author_edit",
        ),
        pgtrigger.migrations.RemoveTrigger(
            model_name="book",
            name="update_search_vector_on_book_edit",
        ),
        # AFTER an author's name or aliases change, blank the search_vector of
        # every book linked to that author. Writing to search_vector is one of
        # the operations the book trigger below listens for, so those books
        # get their vector rebuilt.
        pgtrigger.migrations.AddTrigger(
            model_name="author",
            trigger=pgtrigger.compiler.Trigger(
                name="reset_book_search_vector_on_author_edit",
                sql=pgtrigger.compiler.UpsertTriggerSql(
                    # Adjacent literals are concatenated into one statement;
                    # the resulting text is the generated SQL, unchanged.
                    func=(
                        "WITH updated_books AS ("
                        "SELECT book_id FROM bookwyrm_book_authors WHERE author_id = new.id "
                        ") "
                        "UPDATE bookwyrm_book SET search_vector = '' "
                        "FROM updated_books WHERE id = updated_books.book_id;"
                        "RETURN NEW;"
                    ),
                    hash="68422c0f29879c5802b82159dde45297eff53e73",
                    operation='UPDATE OF "name", "aliases"',
                    pgid="pgtrigger_reset_book_search_vector_on_author_edit_a50c7",
                    table="bookwyrm_author",
                    when="AFTER",
                ),
            ),
        ),
        # BEFORE trigger on books: title (A), subtitle (B), names and aliases
        # of the book's authors (C) and series (D) make up the search vector.
        pgtrigger.migrations.AddTrigger(
            model_name="book",
            trigger=pgtrigger.compiler.Trigger(
                name="update_search_vector_on_book_edit",
                sql=pgtrigger.compiler.UpsertTriggerSql(
                    func=(
                        "WITH author_names AS ("
                        "SELECT array_to_string(bookwyrm_author.name || bookwyrm_author.aliases, ' ') AS name_and_aliases "
                        "FROM bookwyrm_author "
                        "LEFT JOIN bookwyrm_book_authors "
                        "ON bookwyrm_author.id = bookwyrm_book_authors.author_id "
                        "WHERE bookwyrm_book_authors.book_id = new.id "
                        ") "
                        "SELECT "
                        "setweight(coalesce(nullif(to_tsvector('english', new.title), ''), to_tsvector('simple', new.title)), 'A') || "
                        "setweight(to_tsvector('english', coalesce(new.subtitle, '')), 'B') || "
                        "(SELECT setweight(to_tsvector('simple', coalesce(array_to_string(array_agg(name_and_aliases), ' '), '')), 'C') "
                        "FROM author_names) || "
                        "setweight(to_tsvector('english', coalesce(new.series, '')), 'D') "
                        "INTO new.search_vector;"
                        "RETURN NEW;"
                    ),
                    hash="9324f5ca76a6f5e63931881d62d11da11f595b2c",
                    operation='INSERT OR UPDATE OF "title", "subtitle", "series", "search_vector"',
                    pgid="pgtrigger_update_search_vector_on_book_edit_bec58",
                    table="bookwyrm_book",
                    when="BEFORE",
                ),
            ),
        ),
        # Recalculate search vector for all Books because it now includes
        # Author aliases.
        migrations.RunSQL(
            sql="UPDATE bookwyrm_book SET search_vector = NULL;",
            reverse_sql="UPDATE bookwyrm_book SET search_vector = NULL;",
        ),
    ]
|
|
@ -3,6 +3,7 @@ import re
|
|||
from typing import Tuple, Any
|
||||
|
||||
from django.db import models
|
||||
from django.contrib.postgres.indexes import GinIndex
|
||||
import pgtrigger
|
||||
|
||||
from bookwyrm import activitypub
|
||||
|
@ -71,11 +72,29 @@ class Author(BookDataModel):
|
|||
class Meta:
|
||||
"""sets up indexes and triggers"""
|
||||
|
||||
# pylint: disable=line-too-long
|
||||
|
||||
indexes = (GinIndex(fields=["search_vector"]),)
|
||||
triggers = [
|
||||
pgtrigger.Trigger(
|
||||
name="reset_search_vector_on_author_edit",
|
||||
name="update_search_vector_on_author_edit",
|
||||
when=pgtrigger.Before,
|
||||
operation=pgtrigger.Insert
|
||||
| pgtrigger.UpdateOf("name", "aliases", "search_vector"),
|
||||
func=format_trigger(
|
||||
"""new.search_vector :=
|
||||
-- author name, with priority A
|
||||
setweight(to_tsvector('simple', new.name), 'A') ||
|
||||
-- author aliases, with priority B
|
||||
setweight(to_tsvector('simple', coalesce(array_to_string(new.aliases, ' '), '')), 'B');
|
||||
RETURN new;
|
||||
"""
|
||||
),
|
||||
),
|
||||
pgtrigger.Trigger(
|
||||
name="reset_book_search_vector_on_author_edit",
|
||||
when=pgtrigger.After,
|
||||
operation=pgtrigger.UpdateOf("name"),
|
||||
operation=pgtrigger.UpdateOf("name", "aliases"),
|
||||
func=format_trigger(
|
||||
"""WITH updated_books AS (
|
||||
SELECT book_id
|
||||
|
@ -89,7 +108,7 @@ class Author(BookDataModel):
|
|||
RETURN new;
|
||||
"""
|
||||
),
|
||||
)
|
||||
),
|
||||
]
|
||||
|
||||
activity_serializer = activitypub.Author
|
||||
|
|
|
@ -246,22 +246,32 @@ class Book(BookDataModel):
|
|||
operation=pgtrigger.Insert
|
||||
| pgtrigger.UpdateOf("title", "subtitle", "series", "search_vector"),
|
||||
func=format_trigger(
|
||||
"""new.search_vector :=
|
||||
-- title, with priority A (parse in English, default to simple if empty)
|
||||
setweight(COALESCE(nullif(
|
||||
to_tsvector('english', new.title), ''),
|
||||
to_tsvector('simple', new.title)), 'A') ||
|
||||
-- subtitle, with priority B (always in English?)
|
||||
setweight(to_tsvector('english', COALESCE(new.subtitle, '')), 'B') ||
|
||||
-- list of authors, with priority C (TODO: add aliases?, bookwyrm-social#3063)
|
||||
(SELECT setweight(to_tsvector('simple', COALESCE(array_to_string(ARRAY_AGG(bookwyrm_author.name), ' '), '')), 'C')
|
||||
"""
|
||||
WITH author_names AS (
|
||||
SELECT array_to_string(bookwyrm_author.name || bookwyrm_author.aliases, ' ') AS name_and_aliases
|
||||
FROM bookwyrm_author
|
||||
LEFT JOIN bookwyrm_book_authors
|
||||
ON bookwyrm_author.id = bookwyrm_book_authors.author_id
|
||||
WHERE bookwyrm_book_authors.book_id = new.id
|
||||
)
|
||||
SELECT
|
||||
-- title, with priority A (parse in English, default to simple if empty)
|
||||
setweight(COALESCE(nullif(
|
||||
to_tsvector('english', new.title), ''),
|
||||
to_tsvector('simple', new.title)), 'A') ||
|
||||
|
||||
-- subtitle, with priority B (always in English?)
|
||||
setweight(to_tsvector('english', COALESCE(new.subtitle, '')), 'B') ||
|
||||
|
||||
-- list of authors names and aliases (with priority C)
|
||||
(SELECT setweight(to_tsvector('simple', COALESCE(array_to_string(ARRAY_AGG(name_and_aliases), ' '), '')), 'C')
|
||||
FROM author_names
|
||||
) ||
|
||||
|
||||
--- last: series name, with lowest priority
|
||||
setweight(to_tsvector('english', COALESCE(new.series, '')), 'D');
|
||||
setweight(to_tsvector('english', COALESCE(new.series, '')), 'D')
|
||||
|
||||
INTO new.search_vector;
|
||||
RETURN new;
|
||||
"""
|
||||
),
|
||||
|
|
87
bookwyrm/tests/test_author_search.py
Normal file
87
bookwyrm/tests/test_author_search.py
Normal file
|
@ -0,0 +1,87 @@
|
|||
""" test searching for authors """
|
||||
from django.test import TestCase
|
||||
|
||||
from django.contrib.postgres.search import SearchRank, SearchQuery
|
||||
from django.db.models import F
|
||||
|
||||
from bookwyrm import models
|
||||
|
||||
|
||||
class AuthorSearch(TestCase):
    """look for some authors"""

    @classmethod
    def setUpTestData(cls):
        """create two authors; "Alice" is one author's name and the other's alias"""
        cls.bob = models.Author.objects.create(
            name="Bob", aliases=["Robertus", "Alice"]
        )
        cls.alice = models.Author.objects.create(name="Alice")

    def test_search(self):
        """search for an author in the db"""
        results = self._search("Bob")
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0], self.bob)

    def test_alias_priority(self):
        """aliases should be included, with lower priority than name"""
        results = self._search("Alice")
        self.assertEqual(len(results), 2)
        # the author actually named Alice outranks the one with that alias
        self.assertEqual(results[0], self.alice)
        self.assertEqual(results[1], self.bob)

    def _search_first(self, query):
        """return only the highest ranked author for the query"""
        return self._search(query, return_first=True)

    @staticmethod
    def _search(query, *, return_first=False):
        """author search

        Filters authors whose search_vector matches `query` (parsed with the
        "simple" configuration) and orders them by descending rank. Returns
        the queryset, or its first element when `return_first` is true.
        """
        search_query = SearchQuery(query, config="simple")
        min_confidence = 0

        results = (
            models.Author.objects.filter(search_vector=search_query)
            .annotate(rank=SearchRank(F("search_vector"), search_query))
            .filter(rank__gt=min_confidence)
            .order_by("-rank")
        )
        if return_first:
            return results.first()
        return results
|
||||
|
||||
|
||||
class SearchVectorTest(TestCase):
    """verify the search_vector stored for authors"""

    def test_search_vector_simple(self):
        """a name alone yields a single weight-A lexeme"""
        mary = self._create_author("Mary")
        self.assertEqual(mary.search_vector, "'mary':1A")

    def test_search_vector_aliases(self):
        """aliases are part of the vector, at weight B"""
        mary = self._create_author("Mary", aliases=["Maria", "Example"])
        expected = "'example':3B 'maria':2B 'mary':1A"
        self.assertEqual(mary.search_vector, expected)

    def test_search_vector_parse_author(self):
        """names and aliases are kept verbatim: no stemming, no stop words"""
        writer = self._create_author("Writes", aliases=["Reads"])
        expected = "'reads':2B 'writes':1A"
        self.assertEqual(writer.search_vector, expected)

    def test_search_vector_on_update(self):
        """renaming an author rewrites the stored vector"""
        subject = self._create_author("Mary")
        self.assertEqual(subject.search_vector, "'mary':1A")

        subject.name = "Example"
        subject.save(broadcast=False)
        # the vector is written by the database, so reload before checking
        subject.refresh_from_db()
        self.assertEqual(subject.search_vector, "'example':1A")

    @staticmethod
    def _create_author(name, /, *, aliases=None):
        """create an author and reload it so search_vector is populated"""
        alias_list = aliases or []
        created = models.Author.objects.create(name=name, aliases=alias_list)
        created.refresh_from_db()
        return created
|
|
@ -14,6 +14,13 @@ class BookSearch(TestCase):
|
|||
@classmethod
|
||||
def setUpTestData(self): # pylint: disable=bad-classmethod-argument
|
||||
"""we need basic test data and mocks"""
|
||||
self.first_author = models.Author.objects.create(
|
||||
name="Author One", aliases=["The First"]
|
||||
)
|
||||
self.second_author = models.Author.objects.create(
|
||||
name="Author Two", aliases=["The Second"]
|
||||
)
|
||||
|
||||
self.work = models.Work.objects.create(title="Example Work")
|
||||
|
||||
self.first_edition = models.Edition.objects.create(
|
||||
|
@ -23,6 +30,8 @@ class BookSearch(TestCase):
|
|||
physical_format="Paperback",
|
||||
published_date=datetime.datetime(2019, 4, 9, 0, 0, tzinfo=timezone.utc),
|
||||
)
|
||||
self.first_edition.authors.add(self.first_author)
|
||||
|
||||
self.second_edition = models.Edition.objects.create(
|
||||
title="Another Edition",
|
||||
parent_work=self.work,
|
||||
|
@ -30,19 +39,34 @@ class BookSearch(TestCase):
|
|||
openlibrary_key="hello",
|
||||
pages=150,
|
||||
)
|
||||
self.second_edition.authors.add(self.first_author)
|
||||
self.second_edition.authors.add(self.second_author)
|
||||
|
||||
self.third_edition = models.Edition.objects.create(
|
||||
title="Another Edition with annoying ISBN",
|
||||
parent_work=self.work,
|
||||
isbn_10="022222222X",
|
||||
)
|
||||
self.third_edition.authors.add(self.first_author)
|
||||
self.third_edition.authors.add(self.second_author)
|
||||
|
||||
def test_search(self):
|
||||
"""search for a book in the db"""
|
||||
# title/author
|
||||
# title
|
||||
results = book_search.search("Example")
|
||||
self.assertEqual(len(results), 1)
|
||||
self.assertEqual(results[0], self.first_edition)
|
||||
|
||||
# author
|
||||
results = book_search.search("One")
|
||||
self.assertEqual(len(results), 1)
|
||||
self.assertEqual(results[0], self.first_edition)
|
||||
|
||||
# author alias
|
||||
results = book_search.search("First")
|
||||
self.assertEqual(len(results), 1)
|
||||
self.assertEqual(results[0], self.first_edition)
|
||||
|
||||
# isbn
|
||||
results = book_search.search("0000000000")
|
||||
self.assertEqual(len(results), 1)
|
||||
|
@ -155,8 +179,17 @@ class SearchVectorTest(TestCase):
|
|||
"""search vector with subtitle and series"""
|
||||
# for a book like this we call `to_tsvector("Book Long Mary Bunch")`, hence the
|
||||
# indexes in the search vector. (priority "D" is the default, and never shown.)
|
||||
book = self._create_book("Book", "Mary", subtitle="Long", series="Bunch")
|
||||
self.assertEqual(book.search_vector, "'book':1A 'bunch':4 'long':2B 'mary':3C")
|
||||
book = self._create_book(
|
||||
"Book",
|
||||
"Mary",
|
||||
subtitle="Long",
|
||||
series="Bunch",
|
||||
author_alias=["Maria", "Mary Ann"],
|
||||
)
|
||||
self.assertEqual(
|
||||
book.search_vector,
|
||||
"'ann':6C 'book':1A 'bunch':7 'long':2B 'maria':4C 'mary':3C,5C",
|
||||
)
|
||||
|
||||
def test_search_vector_parse_book(self):
|
||||
"""book parts are parsed in english"""
|
||||
|
@ -170,8 +203,8 @@ class SearchVectorTest(TestCase):
|
|||
|
||||
def test_search_vector_parse_author(self):
|
||||
"""author name is not stem'd or affected by stop words"""
|
||||
book = self._create_book("Writing", "Writes")
|
||||
self.assertEqual(book.search_vector, "'write':1A 'writes':2C")
|
||||
book = self._create_book("Writing", "Writes", author_alias=["Reads"])
|
||||
self.assertEqual(book.search_vector, "'reads':3C 'write':1A 'writes':2C")
|
||||
|
||||
book = self._create_book("She Is Writing", "She Writes")
|
||||
self.assertEqual(book.search_vector, "'she':4C 'write':3A 'writes':5C")
|
||||
|
@ -218,6 +251,13 @@ class SearchVectorTest(TestCase):
|
|||
book.refresh_from_db()
|
||||
self.assertEqual(book.search_vector, "'goodby':3A 'jeremy':4C 'long':2A")
|
||||
|
||||
author.aliases = ["Example"]
|
||||
author.save(broadcast=False)
|
||||
book.refresh_from_db()
|
||||
self.assertEqual(
|
||||
book.search_vector, "'example':5C 'goodby':3A 'jeremy':4C 'long':2A"
|
||||
)
|
||||
|
||||
def test_search_vector_on_author_delete(self):
|
||||
"""update search when an author is deleted"""
|
||||
book = self._create_book("The Long Goodbye", "The Rays")
|
||||
|
@ -274,7 +314,7 @@ class SearchVectorUpdates(TestCase):
|
|||
def setUp(self):
|
||||
"""we need basic test data and mocks"""
|
||||
self.work = models.Work.objects.create(title="This Work")
|
||||
self.author = models.Author.objects.create(name="Name")
|
||||
self.author = models.Author.objects.create(name="Name", aliases=["Alias"])
|
||||
self.edition = models.Edition.objects.create(
|
||||
title="First Edition of Work",
|
||||
subtitle="Some Extra Words Are Good",
|
||||
|
@ -363,13 +403,18 @@ class SearchVectorUpdates(TestCase):
|
|||
def test_search_after_updated_author_name(self):
    """book found under new author name and alias, not under the old ones"""
    # before the edit: reachable through the current name and alias only
    self.assertEqual(self.edition, self._search_first("Name"))
    self.assertEqual(self.edition, self._search_first("Alias"))
    self.assertFalse(self._search("Identifier"))
    self.assertFalse(self._search("Another"))

    self.author.name = "Identifier"
    self.author.aliases = ["Another"]
    self.author.save(broadcast=False)

    # after the edit: the old name and the old alias ("Alias", the same term
    # asserted above) must no longer match
    self.assertFalse(self._search("Name"))
    self.assertFalse(self._search("Alias"))
    self.assertEqual(self.edition, self._search_first("Identifier"))
    self.assertEqual(self.edition, self._search_first("Another"))
    # the title is unaffected by the author change
    self.assertEqual(self.edition, self._search_first("Work"))
|
||||
|
||||
def _search_first(self, query):
|
||||
|
|
|
@ -2,8 +2,9 @@
|
|||
|
||||
import re
|
||||
|
||||
from django.contrib.postgres.search import TrigramSimilarity
|
||||
from django.contrib.postgres.search import TrigramSimilarity, SearchRank, SearchQuery
|
||||
from django.core.paginator import Paginator
|
||||
from django.db.models import F
|
||||
from django.db.models.functions import Greatest
|
||||
from django.http import JsonResponse
|
||||
from django.template.response import TemplateResponse
|
||||
|
@ -94,26 +95,28 @@ def book_search(request):
|
|||
|
||||
def author_search(request):
    """search for an author

    Matches the `q` GET parameter against Author.search_vector using the
    "simple" text search configuration, orders the matches by descending
    rank, and renders one page of them with search/author.html.
    A missing `q` is treated as an empty query.
    """
    # `q` may be absent from the query string; fall back to "" instead of
    # calling .strip() on None
    query = (request.GET.get("q") or "").strip()
    search_query = SearchQuery(query, config="simple")
    min_confidence = 0

    results = (
        models.Author.objects.filter(search_vector=search_query)
        .annotate(rank=SearchRank(F("search_vector"), search_query))
        .filter(rank__gt=min_confidence)
        .order_by("-rank")
    )

    paginated = Paginator(results, PAGE_LENGTH)
    page = paginated.get_page(request.GET.get("page"))

    data = {
        "type": "author",
        "query": query,
        "results": page,
        "page_range": paginated.get_elided_page_range(
            page.number, on_each_side=2, on_ends=1
        ),
    }
    return TemplateResponse(request, "search/author.html", data)
|
||||
|
||||
|
||||
|
|
Loading…
Reference in a new issue