Merge pull request #3134 from dato/trigger_migrations

Support trigger migrations
This commit is contained in:
Bart Schuurmans 2024-03-20 12:11:34 +01:00 committed by GitHub
commit 762786839c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 430 additions and 83 deletions

View file

@ -0,0 +1,16 @@
# Generated by Django 3.2.20 on 2023-11-24 17:11
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
("bookwyrm", "0188_theme_loads"),
]
operations = [
migrations.RemoveIndex(
model_name="author",
name="bookwyrm_au_search__b050a8_gin",
),
]

View file

@ -0,0 +1,76 @@
# Generated by Django 3.2.20 on 2023-11-25 00:47
from importlib import import_module
import re
from django.db import migrations
import pgtrigger.compiler
import pgtrigger.migrations
trigger_migration = import_module("bookwyrm.migrations.0077_auto_20210623_2155")
# it's _very_ convenient for development that this migration be reversible
search_vector_trigger = trigger_migration.Migration.operations[4]
author_search_vector_trigger = trigger_migration.Migration.operations[5]
assert re.search(r"\bCREATE TRIGGER search_vector_trigger\b", search_vector_trigger.sql)
assert re.search(
r"\bCREATE TRIGGER author_search_vector_trigger\b",
author_search_vector_trigger.sql,
)
class Migration(migrations.Migration):
dependencies = [
("bookwyrm", "0190_book_search_updates"),
]
operations = [
pgtrigger.migrations.AddTrigger(
model_name="book",
trigger=pgtrigger.compiler.Trigger(
name="update_search_vector_on_book_edit",
sql=pgtrigger.compiler.UpsertTriggerSql(
func="new.search_vector := setweight(coalesce(nullif(to_tsvector('english', new.title), ''), to_tsvector('simple', new.title)), 'A') || setweight(to_tsvector('english', coalesce(new.subtitle, '')), 'B') || (SELECT setweight(to_tsvector('simple', coalesce(array_to_string(array_agg(bookwyrm_author.name), ' '), '')), 'C') FROM bookwyrm_author LEFT JOIN bookwyrm_book_authors ON bookwyrm_author.id = bookwyrm_book_authors.author_id WHERE bookwyrm_book_authors.book_id = new.id ) || setweight(to_tsvector('english', coalesce(new.series, '')), 'D');RETURN NEW;",
hash="77d6399497c0a89b0bf09d296e33c396da63705c",
operation='INSERT OR UPDATE OF "title", "subtitle", "series", "search_vector"',
pgid="pgtrigger_update_search_vector_on_book_edit_bec58",
table="bookwyrm_book",
when="BEFORE",
),
),
),
pgtrigger.migrations.AddTrigger(
model_name="author",
trigger=pgtrigger.compiler.Trigger(
name="reset_search_vector_on_author_edit",
sql=pgtrigger.compiler.UpsertTriggerSql(
func="WITH updated_books AS (SELECT book_id FROM bookwyrm_book_authors WHERE author_id = new.id ) UPDATE bookwyrm_book SET search_vector = '' FROM updated_books WHERE id = updated_books.book_id;RETURN NEW;",
hash="e7bbf08711ff3724c58f4d92fb7a082ffb3d7826",
operation='UPDATE OF "name"',
pgid="pgtrigger_reset_search_vector_on_author_edit_a447c",
table="bookwyrm_author",
when="AFTER",
),
),
),
migrations.RunSQL(
sql="""DROP TRIGGER IF EXISTS search_vector_trigger ON bookwyrm_book;
DROP FUNCTION IF EXISTS book_trigger;
""",
reverse_sql=search_vector_trigger.sql,
),
migrations.RunSQL(
sql="""DROP TRIGGER IF EXISTS author_search_vector_trigger ON bookwyrm_author;
DROP FUNCTION IF EXISTS author_trigger;
""",
reverse_sql=author_search_vector_trigger.sql,
),
migrations.RunSQL(
# Recalculate book search vector for any missed author name changes
# due to bug in JOIN in the old trigger.
sql="UPDATE bookwyrm_book SET search_vector = NULL;",
reverse_sql=migrations.RunSQL.noop,
),
]

View file

@ -0,0 +1,13 @@
# Generated by Django 3.2.23 on 2024-03-18 00:48
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
("bookwyrm", "0191_migrate_search_vec_triggers_to_pgtriggers"),
("bookwyrm", "0195_alter_user_preferred_language"),
]
operations = []

View file

@ -2,11 +2,12 @@
import re import re
from typing import Tuple, Any from typing import Tuple, Any
from django.contrib.postgres.indexes import GinIndex
from django.db import models from django.db import models
import pgtrigger
from bookwyrm import activitypub from bookwyrm import activitypub
from bookwyrm.settings import DOMAIN from bookwyrm.settings import DOMAIN
from bookwyrm.utils.db import format_trigger
from .book import BookDataModel from .book import BookDataModel
from . import fields from . import fields
@ -67,9 +68,28 @@ class Author(BookDataModel):
"""editions and works both use "book" instead of model_name""" """editions and works both use "book" instead of model_name"""
return f"https://{DOMAIN}/author/{self.id}" return f"https://{DOMAIN}/author/{self.id}"
activity_serializer = activitypub.Author
class Meta: class Meta:
"""sets up postgres GIN index field""" """sets up indexes and triggers"""
indexes = (GinIndex(fields=["search_vector"]),) triggers = [
pgtrigger.Trigger(
name="reset_search_vector_on_author_edit",
when=pgtrigger.After,
operation=pgtrigger.UpdateOf("name"),
func=format_trigger(
"""WITH updated_books AS (
SELECT book_id
FROM bookwyrm_book_authors
WHERE author_id = new.id
)
UPDATE bookwyrm_book
SET search_vector = ''
FROM updated_books
WHERE id = updated_books.book_id;
RETURN new;
"""
),
)
]
activity_serializer = activitypub.Author

View file

@ -13,6 +13,7 @@ from django.utils.translation import gettext_lazy as _
from model_utils import FieldTracker from model_utils import FieldTracker
from model_utils.managers import InheritanceManager from model_utils.managers import InheritanceManager
from imagekit.models import ImageSpecField from imagekit.models import ImageSpecField
import pgtrigger
from bookwyrm import activitypub from bookwyrm import activitypub
from bookwyrm.isbn.isbn import hyphenator_singleton as hyphenator from bookwyrm.isbn.isbn import hyphenator_singleton as hyphenator
@ -24,6 +25,7 @@ from bookwyrm.settings import (
ENABLE_PREVIEW_IMAGES, ENABLE_PREVIEW_IMAGES,
ENABLE_THUMBNAIL_GENERATION, ENABLE_THUMBNAIL_GENERATION,
) )
from bookwyrm.utils.db import format_trigger
from .activitypub_mixin import OrderedCollectionPageMixin, ObjectMixin from .activitypub_mixin import OrderedCollectionPageMixin, ObjectMixin
from .base_model import BookWyrmModel from .base_model import BookWyrmModel
@ -232,9 +234,39 @@ class Book(BookDataModel):
) )
class Meta: class Meta:
"""sets up postgres GIN index field""" """set up indexes and triggers"""
# pylint: disable=line-too-long
indexes = (GinIndex(fields=["search_vector"]),) indexes = (GinIndex(fields=["search_vector"]),)
triggers = [
pgtrigger.Trigger(
name="update_search_vector_on_book_edit",
when=pgtrigger.Before,
operation=pgtrigger.Insert
| pgtrigger.UpdateOf("title", "subtitle", "series", "search_vector"),
func=format_trigger(
"""new.search_vector :=
-- title, with priority A (parse in English, default to simple if empty)
setweight(COALESCE(nullif(
to_tsvector('english', new.title), ''),
to_tsvector('simple', new.title)), 'A') ||
-- subtitle, with priority B (always in English?)
setweight(to_tsvector('english', COALESCE(new.subtitle, '')), 'B') ||
-- list of authors, with priority C (TODO: add aliases?, bookwyrm-social#3063)
(SELECT setweight(to_tsvector('simple', COALESCE(array_to_string(ARRAY_AGG(bookwyrm_author.name), ' '), '')), 'C')
FROM bookwyrm_author
LEFT JOIN bookwyrm_book_authors
ON bookwyrm_author.id = bookwyrm_book_authors.author_id
WHERE bookwyrm_book_authors.book_id = new.id
) ||
--- last: series name, with lowest priority
setweight(to_tsvector('english', COALESCE(new.series, '')), 'D');
RETURN new;
"""
),
)
]
class Work(OrderedCollectionPageMixin, Book): class Work(OrderedCollectionPageMixin, Book):

View file

@ -108,6 +108,7 @@ INSTALLED_APPS = [
"celery", "celery",
"django_celery_beat", "django_celery_beat",
"imagekit", "imagekit",
"pgtrigger",
"storages", "storages",
] ]

View file

@ -1,5 +1,6 @@
""" test searching for books """ """ test searching for books """
import datetime import datetime
from django.db import connection
from django.test import TestCase from django.test import TestCase
from django.utils import timezone from django.utils import timezone
@ -140,3 +141,244 @@ class BookSearch(TestCase):
# there's really not much to test here, it's just a dataclass # there's really not much to test here, it's just a dataclass
self.assertEqual(result.confidence, 1) self.assertEqual(result.confidence, 1)
self.assertEqual(result.title, "Title") self.assertEqual(result.title, "Title")
class SearchVectorTest(TestCase):
"""check search_vector is computed correctly"""
def test_search_vector_simple(self):
"""simplest search vector"""
book = self._create_book("Book", "Mary")
self.assertEqual(book.search_vector, "'book':1A 'mary':2C") # A > C (priority)
def test_search_vector_all_parts(self):
"""search vector with subtitle and series"""
# for a book like this we call `to_tsvector("Book Long Mary Bunch")`, hence the
# indexes in the search vector. (priority "D" is the default, and never shown.)
book = self._create_book("Book", "Mary", subtitle="Long", series="Bunch")
self.assertEqual(book.search_vector, "'book':1A 'bunch':4 'long':2B 'mary':3C")
def test_search_vector_parse_book(self):
"""book parts are parsed in english"""
# FIXME: at some point this should stop being the default.
book = self._create_book(
"Edition", "Editor", series="Castle", subtitle="Writing"
)
self.assertEqual(
book.search_vector, "'castl':4 'edit':1A 'editor':3C 'write':2B"
)
def test_search_vector_parse_author(self):
"""author name is not stem'd or affected by stop words"""
book = self._create_book("Writing", "Writes")
self.assertEqual(book.search_vector, "'write':1A 'writes':2C")
book = self._create_book("She Is Writing", "She Writes")
self.assertEqual(book.search_vector, "'she':4C 'write':3A 'writes':5C")
def test_search_vector_parse_title_empty(self):
"""empty parse in English retried as simple title"""
book = self._create_book("Here We", "John")
self.assertEqual(book.search_vector, "'here':1A 'john':3C 'we':2A")
book = self._create_book("Hear We Come", "John")
self.assertEqual(book.search_vector, "'come':3A 'hear':1A 'john':4C")
book = self._create_book("there there", "the")
self.assertEqual(book.search_vector, "'the':3C 'there':1A,2A")
def test_search_vector_no_author(self):
"""book with no authors gets processed normally"""
book = self._create_book("Book", None, series="Bunch")
self.assertEqual(book.search_vector, "'book':1A 'bunch':2")
book = self._create_book("there there", None)
self.assertEqual(book.search_vector, "'there':1A,2A")
# n.b.: the following originally from test_posgres.py
def test_search_vector_on_update(self):
"""make sure that search_vector is being set correctly on edit"""
book = self._create_book("The Long Goodbye", None)
self.assertEqual(book.search_vector, "'goodby':3A 'long':2A")
book.title = "The Even Longer Goodbye"
book.save(broadcast=False)
book.refresh_from_db()
self.assertEqual(book.search_vector, "'even':2A 'goodby':4A 'longer':3A")
def test_search_vector_on_author_update(self):
"""update search when an author name changes"""
book = self._create_book("The Long Goodbye", "The Rays")
self.assertEqual(book.search_vector, "'goodby':3A 'long':2A 'rays':5C 'the':4C")
author = models.Author.objects.get(name="The Rays")
author.name = "Jeremy"
author.save(broadcast=False)
book.refresh_from_db()
self.assertEqual(book.search_vector, "'goodby':3A 'jeremy':4C 'long':2A")
def test_search_vector_on_author_delete(self):
"""update search when an author is deleted"""
book = self._create_book("The Long Goodbye", "The Rays")
self.assertEqual(book.search_vector, "'goodby':3A 'long':2A 'rays':5C 'the':4C")
author = models.Author.objects.get(name="The Rays")
book.authors.remove(author)
book.refresh_from_db()
self.assertEqual(book.search_vector, "'goodby':3A 'long':2A")
def test_search_vector_fields(self):
"""language field irrelevant for search_vector"""
author = models.Author.objects.create(name="The Rays")
book = models.Edition.objects.create(
title="The Long Goodbye",
subtitle="wow cool",
series="series name",
languages=["irrelevant"],
)
book.authors.add(author)
book.refresh_from_db()
self.assertEqual(
book.search_vector,
# pylint: disable-next=line-too-long
"'cool':5B 'goodby':3A 'long':2A 'name':9 'rays':7C 'seri':8 'the':6C 'wow':4B",
)
@staticmethod
def _create_book(
title, author_name, /, *, subtitle="", series="", author_alias=None
):
"""quickly create a book"""
work = models.Work.objects.create(title="work")
edition = models.Edition.objects.create(
title=title,
series=series or None,
subtitle=subtitle or None,
isbn_10="0000000000",
parent_work=work,
)
if author_name is not None:
author = models.Author.objects.create(
name=author_name, aliases=author_alias or []
)
edition.authors.add(author)
edition.save(broadcast=False)
edition.refresh_from_db()
return edition
class SearchVectorUpdates(TestCase):
"""look for books as they change""" # functional tests of the above
def setUp(self):
"""we need basic test data and mocks"""
self.work = models.Work.objects.create(title="This Work")
self.author = models.Author.objects.create(name="Name")
self.edition = models.Edition.objects.create(
title="First Edition of Work",
subtitle="Some Extra Words Are Good",
series="A Fabulous Sequence of Items",
parent_work=self.work,
isbn_10="0000000000",
)
self.edition.authors.add(self.author)
self.edition.save(broadcast=False)
@classmethod
def setUpTestData(cls):
"""create conditions that trigger known old bugs"""
with connection.cursor() as cursor:
cursor.execute(
"""
ALTER SEQUENCE bookwyrm_author_id_seq RESTART WITH 20;
ALTER SEQUENCE bookwyrm_book_authors_id_seq RESTART WITH 300;
"""
)
def test_search_after_changed_metadata(self):
"""book found after updating metadata"""
self.assertEqual(self.edition, self._search_first("First")) # title
self.assertEqual(self.edition, self._search_first("Good")) # subtitle
self.assertEqual(self.edition, self._search_first("Sequence")) # series
self.edition.title = "Second Title of Work"
self.edition.subtitle = "Fewer Words Is Better"
self.edition.series = "A Wondrous Bunch"
self.edition.save(broadcast=False)
self.assertEqual(self.edition, self._search_first("Second")) # title new
self.assertEqual(self.edition, self._search_first("Fewer")) # subtitle new
self.assertEqual(self.edition, self._search_first("Wondrous")) # series new
self.assertFalse(self._search_first("First")) # title old
self.assertFalse(self._search_first("Good")) # subtitle old
self.assertFalse(self._search_first("Sequence")) # series old
def test_search_after_author_remove(self):
"""book not found via removed author"""
self.assertEqual(self.edition, self._search_first("Name"))
self.edition.authors.set([])
self.edition.save(broadcast=False)
self.assertFalse(self._search("Name"))
self.assertEqual(self.edition, self._search_first("Edition"))
def test_search_after_author_add(self):
"""book found by newly-added author"""
new_author = models.Author.objects.create(name="Mozilla")
self.assertFalse(self._search("Mozilla"))
self.edition.authors.add(new_author)
self.edition.save(broadcast=False)
self.assertEqual(self.edition, self._search_first("Mozilla"))
self.assertEqual(self.edition, self._search_first("Name"))
def test_search_after_author_add_remove_sql(self):
"""add/remove author through SQL to ensure execution of book_authors trigger"""
# Tests calling edition.save(), above, pass even if the trigger in
# bookwyrm_book_authors is removed (probably because they trigger the one
# in bookwyrm_book directly). Here we make sure to exercise the former.
new_author = models.Author.objects.create(name="Mozilla")
with connection.cursor() as cursor:
cursor.execute(
"DELETE FROM bookwyrm_book_authors WHERE book_id = %s",
[self.edition.id],
)
self.assertFalse(self._search("Name"))
self.assertFalse(self._search("Mozilla"))
with connection.cursor() as cursor:
cursor.execute(
"INSERT INTO bookwyrm_book_authors (book_id,author_id) VALUES (%s,%s)",
[self.edition.id, new_author.id],
)
self.assertFalse(self._search("Name"))
self.assertEqual(self.edition, self._search_first("Mozilla"))
def test_search_after_updated_author_name(self):
"""book found under new author name"""
self.assertEqual(self.edition, self._search_first("Name"))
self.assertFalse(self._search("Identifier"))
self.author.name = "Identifier"
self.author.save(broadcast=False)
self.assertFalse(self._search("Name"))
self.assertEqual(self.edition, self._search_first("Identifier"))
self.assertEqual(self.edition, self._search_first("Work"))
def _search_first(self, query):
"""wrapper around search_title_author"""
return self._search(query, return_first=True)
@staticmethod
def _search(query, *, return_first=False):
"""wrapper around search_title_author"""
return book_search.search_title_author(
query, min_confidence=0, return_first=return_first
)

View file

@ -1,77 +0,0 @@
""" django configuration of postgres """
from unittest.mock import patch
from django.test import TestCase
from bookwyrm import models
@patch("bookwyrm.models.activitypub_mixin.broadcast_task.apply_async")
class PostgresTriggers(TestCase):
"""special migrations, fancy stuff ya know"""
def test_search_vector_on_create(self, _):
"""make sure that search_vector is being set correctly on create"""
book = models.Edition.objects.create(title="The Long Goodbye")
book.refresh_from_db()
self.assertEqual(book.search_vector, "'goodby':3A 'long':2A")
def test_search_vector_on_update(self, _):
"""make sure that search_vector is being set correctly on edit"""
book = models.Edition.objects.create(title="The Long Goodbye")
book.title = "The Even Longer Goodbye"
book.save(broadcast=False)
book.refresh_from_db()
self.assertEqual(book.search_vector, "'even':2A 'goodby':4A 'longer':3A")
def test_search_vector_fields(self, _):
"""use multiple fields to create search vector"""
author = models.Author.objects.create(name="The Rays")
book = models.Edition.objects.create(
title="The Long Goodbye",
subtitle="wow cool",
series="series name",
languages=["irrelevant"],
)
book.authors.add(author)
book.refresh_from_db()
# pylint: disable=line-too-long
self.assertEqual(
book.search_vector,
"'cool':5B 'goodby':3A 'long':2A 'name':9 'rays':7C 'seri':8 'the':6C 'wow':4B",
)
def test_search_vector_on_author_update(self, _):
"""update search when an author name changes"""
author = models.Author.objects.create(name="The Rays")
book = models.Edition.objects.create(
title="The Long Goodbye",
)
book.authors.add(author)
author.name = "Jeremy"
author.save(broadcast=False)
book.refresh_from_db()
self.assertEqual(book.search_vector, "'goodby':3A 'jeremy':4C 'long':2A")
def test_search_vector_on_author_delete(self, _):
"""update search when an author name changes"""
author = models.Author.objects.create(name="Jeremy")
book = models.Edition.objects.create(
title="The Long Goodbye",
)
book.authors.add(author)
book.refresh_from_db()
self.assertEqual(book.search_vector, "'goodby':3A 'jeremy':4C 'long':2A")
book.authors.remove(author)
book.refresh_from_db()
self.assertEqual(book.search_vector, "'goodby':3A 'long':2A")
def test_search_vector_stop_word_fallback(self, _):
"""use a fallback when removing stop words leads to an empty vector"""
book = models.Edition.objects.create(
title="there there",
)
book.refresh_from_db()
self.assertEqual(book.search_vector, "'there':1A,2A")

23
bookwyrm/utils/db.py Normal file
View file

@ -0,0 +1,23 @@
""" Database utilities """
from typing import cast
import sqlparse # type: ignore
def format_trigger(sql: str) -> str:
"""format SQL trigger before storing
we remove whitespace and use consistent casing so as to avoid migrations
due to formatting changes.
"""
return cast(
str,
sqlparse.format(
sql,
strip_comments=True,
strip_whitespace=True,
use_space_around_operators=True,
keyword_case="upper",
identifier_case="lower",
),
)

View file

@ -10,6 +10,7 @@ django-compressor==4.4
django-csp==3.7 django-csp==3.7
django-imagekit==4.1.0 django-imagekit==4.1.0
django-model-utils==4.3.1 django-model-utils==4.3.1
django-pgtrigger==4.11.0
django-redis==5.2.0 django-redis==5.2.0
django-sass-processor==1.2.2 django-sass-processor==1.2.2
django-storages==1.13.2 django-storages==1.13.2