diff --git a/.github/workflows/django-tests.yml b/.github/workflows/django-tests.yml index b5b319f53..c11b7c408 100644 --- a/.github/workflows/django-tests.yml +++ b/.github/workflows/django-tests.yml @@ -20,7 +20,7 @@ jobs: services: postgres: - image: postgres:10 + image: postgres:12 env: POSTGRES_USER: postgres POSTGRES_PASSWORD: hunter2 @@ -66,4 +66,4 @@ jobs: EMAIL_USE_TLS: true ENABLE_PREVIEW_IMAGES: true run: | - python manage.py test + pytest diff --git a/bookwyrm/connectors/self_connector.py b/bookwyrm/connectors/self_connector.py index 930b7cb3d..8d5a7614e 100644 --- a/bookwyrm/connectors/self_connector.py +++ b/bookwyrm/connectors/self_connector.py @@ -2,7 +2,7 @@ from functools import reduce import operator -from django.contrib.postgres.search import SearchRank, SearchVector +from django.contrib.postgres.search import SearchRank, SearchQuery from django.db.models import OuterRef, Subquery, F, Q from bookwyrm import models @@ -13,7 +13,7 @@ class Connector(AbstractConnector): """instantiate a connector""" # pylint: disable=arguments-differ - def search(self, query, min_confidence=0.1, raw=False, filters=None): + def search(self, query, min_confidence=0, raw=False, filters=None): """search your local database""" filters = filters or [] if not query: @@ -141,16 +141,11 @@ def search_identifiers(query, *filters): def search_title_author(query, min_confidence, *filters): """searches for title and author""" - vector = ( - SearchVector("title", weight="A") - + SearchVector("subtitle", weight="B") - + SearchVector("authors__name", weight="C") - + SearchVector("series", weight="D") - ) - + query = SearchQuery(query, config="simple") | SearchQuery(query, config="english") results = ( - models.Edition.objects.annotate(rank=SearchRank(vector, query)) - .filter(*filters, rank__gt=min_confidence) + models.Edition.objects.filter(*filters, search_vector=query) + .annotate(rank=SearchRank(F("search_vector"), query)) + .filter(rank__gt=min_confidence) .order_by("-rank") ) diff --git a/bookwyrm/forms.py b/bookwyrm/forms.py index cb55d229e..57a94e3cd 100644 --- a/bookwyrm/forms.py +++ b/bookwyrm/forms.py @@ -183,6 +183,7 @@ class EditionForm(CustomForm): "parent_work", "shelves", "connector", + "search_vector", ] @@ -194,6 +195,7 @@ class AuthorForm(CustomForm): "origin_id", "created_date", "updated_date", + "search_vector", ] diff --git a/bookwyrm/migrations/0077_auto_20210623_2155.py b/bookwyrm/migrations/0077_auto_20210623_2155.py new file mode 100644 index 000000000..a73c43825 --- /dev/null +++ b/bookwyrm/migrations/0077_auto_20210623_2155.py @@ -0,0 +1,126 @@ +# Generated by Django 3.2.4 on 2021-06-23 21:55 + +import django.contrib.postgres.indexes +import django.contrib.postgres.search +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ("bookwyrm", "0076_preview_images"), + ] + + operations = [ + migrations.AddField( + model_name="author", + name="search_vector", + field=django.contrib.postgres.search.SearchVectorField(null=True), + ), + migrations.AddField( + model_name="book", + name="search_vector", + field=django.contrib.postgres.search.SearchVectorField(null=True), + ), + migrations.AddIndex( + model_name="author", + index=django.contrib.postgres.indexes.GinIndex( + fields=["search_vector"], name="bookwyrm_au_search__b050a8_gin" + ), + ), + migrations.AddIndex( + model_name="book", + index=django.contrib.postgres.indexes.GinIndex( + fields=["search_vector"], name="bookwyrm_bo_search__51beb3_gin" + ), + ), + migrations.RunSQL( + sql=""" + CREATE FUNCTION book_trigger() RETURNS trigger AS $$ + begin + new.search_vector := + coalesce( + NULLIF(setweight(to_tsvector('english', coalesce(new.title, '')), 'A'), ''), + setweight(to_tsvector('simple', coalesce(new.title, '')), 'A') + ) || + setweight(to_tsvector('english', coalesce(new.subtitle, '')), 'B') || + (SELECT setweight(to_tsvector('simple', coalesce(array_to_string(array_agg(bookwyrm_author.name), ' '), '')), 'C') + FROM bookwyrm_book + LEFT OUTER JOIN bookwyrm_book_authors + ON bookwyrm_book.id = bookwyrm_book_authors.book_id + LEFT OUTER JOIN bookwyrm_author + ON bookwyrm_book_authors.author_id = bookwyrm_author.id + WHERE bookwyrm_book.id = new.id + ) || + setweight(to_tsvector('english', coalesce(new.series, '')), 'D'); + return new; + end + $$ LANGUAGE plpgsql; + + CREATE TRIGGER search_vector_trigger + BEFORE INSERT OR UPDATE OF title, subtitle, series, search_vector + ON bookwyrm_book + FOR EACH ROW EXECUTE FUNCTION book_trigger(); + + UPDATE bookwyrm_book SET search_vector = NULL; + """, + reverse_sql=""" + DROP TRIGGER IF EXISTS search_vector_trigger + ON bookwyrm_book; + DROP FUNCTION IF EXISTS book_trigger; + """, + ), + # when an author is edited + migrations.RunSQL( + sql=""" + CREATE FUNCTION author_trigger() RETURNS trigger AS $$ + begin + WITH book AS ( + SELECT bookwyrm_book.id as row_id + FROM bookwyrm_author + LEFT OUTER JOIN bookwyrm_book_authors + ON bookwyrm_book_authors.id = new.id + LEFT OUTER JOIN bookwyrm_book + ON bookwyrm_book.id = bookwyrm_book_authors.book_id + ) + UPDATE bookwyrm_book SET search_vector = '' + FROM book + WHERE id = book.row_id; + return new; + end + $$ LANGUAGE plpgsql; + + CREATE TRIGGER author_search_vector_trigger + AFTER UPDATE OF name + ON bookwyrm_author + FOR EACH ROW EXECUTE FUNCTION author_trigger(); + """, + reverse_sql=""" + DROP TRIGGER IF EXISTS author_search_vector_trigger + ON bookwyrm_author; + DROP FUNCTION IF EXISTS author_trigger; + """, + ), + # when an author is added to or removed from a book + migrations.RunSQL( + sql=""" + CREATE FUNCTION book_authors_trigger() RETURNS trigger AS $$ + begin + UPDATE bookwyrm_book SET search_vector = '' + WHERE id = coalesce(new.book_id, old.book_id); + return new; + end + $$ LANGUAGE plpgsql; + + CREATE TRIGGER book_authors_search_vector_trigger + AFTER INSERT OR DELETE + ON bookwyrm_book_authors + FOR EACH ROW EXECUTE FUNCTION book_authors_trigger(); + """, + reverse_sql=""" + DROP TRIGGER IF EXISTS book_authors_search_vector_trigger + ON bookwyrm_book_authors; + DROP FUNCTION IF EXISTS book_authors_trigger; + """, + ), + ] diff --git a/bookwyrm/models/author.py b/bookwyrm/models/author.py index c4e26c5ab..6da80b176 100644 --- a/bookwyrm/models/author.py +++ b/bookwyrm/models/author.py @@ -1,4 +1,5 @@ """ database schema for info about authors """ +from django.contrib.postgres.indexes import GinIndex from django.db import models from bookwyrm import activitypub @@ -37,3 +38,8 @@ class Author(BookDataModel): return "https://%s/author/%s" % (DOMAIN, self.id) activity_serializer = activitypub.Author + + class Meta: + """sets up postgres GIN index field""" + + indexes = (GinIndex(fields=["search_vector"]),) diff --git a/bookwyrm/models/book.py b/bookwyrm/models/book.py index d79ce206d..a6aa5de2d 100644 --- a/bookwyrm/models/book.py +++ b/bookwyrm/models/book.py @@ -1,6 +1,8 @@ """ database schema for books and shelves """ import re +from django.contrib.postgres.search import SearchVectorField +from django.contrib.postgres.indexes import GinIndex from django.db import models from django.dispatch import receiver from model_utils import FieldTracker @@ -34,6 +36,7 @@ class BookDataModel(ObjectMixin, BookWyrmModel): bnf_id = fields.CharField( # Bibliothèque nationale de France max_length=255, blank=True, null=True, deduplication_field=True ) + search_vector = SearchVectorField(null=True) last_edited_by = fields.ForeignKey( "User", @@ -142,6 +145,11 @@ class Book(BookDataModel): self.title, ) + class Meta: + """sets up postgres GIN index field""" + + indexes = (GinIndex(fields=["search_vector"]),) + class Work(OrderedCollectionPageMixin, Book): """a work (an abstract concept of a book that manifests in an edition)""" diff --git a/bookwyrm/tests/connectors/test_self_connector.py b/bookwyrm/tests/connectors/test_self_connector.py index 4f2173085..02c0c9a4f 100644 --- a/bookwyrm/tests/connectors/test_self_connector.py +++ b/bookwyrm/tests/connectors/test_self_connector.py @@ -43,68 +43,69 @@ class SelfConnector(TestCase): self.assertEqual(result.year, 1980) self.assertEqual(result.connector, self.connector) - def test_search_rank(self): + @patch("bookwyrm.preview_images.generate_edition_preview_image_task.delay") + def test_search_rank(self, _): """prioritize certain results""" author = models.Author.objects.create(name="Anonymous") - with patch("bookwyrm.preview_images.generate_edition_preview_image_task.delay"): - edition = models.Edition.objects.create( - title="Edition of Example Work", - published_date=datetime.datetime(1980, 5, 10, tzinfo=timezone.utc), - parent_work=models.Work.objects.create(title=""), - ) - # author text is rank C - edition.authors.add(author) + edition = models.Edition.objects.create( + title="Edition of Example Work", + published_date=datetime.datetime(1980, 5, 10, tzinfo=timezone.utc), + parent_work=models.Work.objects.create(title=""), + ) + # author text is rank B + edition.authors.add(author) - # series is rank D - models.Edition.objects.create( - title="Another Edition", - series="Anonymous", - parent_work=models.Work.objects.create(title=""), - ) - # subtitle is rank B - models.Edition.objects.create( - title="More Editions", - subtitle="The Anonymous Edition", - parent_work=models.Work.objects.create(title=""), - ) - # title is rank A - models.Edition.objects.create(title="Anonymous") - # doesn't rank in this search - edition = models.Edition.objects.create( - title="An Edition", parent_work=models.Work.objects.create(title="") - ) + # series is rank D + models.Edition.objects.create( + title="Another Edition", + series="Anonymous", + parent_work=models.Work.objects.create(title=""), + ) + # subtitle is rank B + models.Edition.objects.create( + title="More Editions", + subtitle="The Anonymous Edition", + parent_work=models.Work.objects.create(title=""), + ) + # title is rank A + models.Edition.objects.create(title="Anonymous") + # doesn't rank in this search + models.Edition.objects.create( + title="An Edition", parent_work=models.Work.objects.create(title="") + ) - results = self.connector.search("Anonymous") - self.assertEqual(len(results), 3) + results = self.connector.search("Anonymous") + self.assertEqual(len(results), 4) self.assertEqual(results[0].title, "Anonymous") self.assertEqual(results[1].title, "More Editions") self.assertEqual(results[2].title, "Edition of Example Work") + self.assertEqual(results[3].title, "Another Edition") - def test_search_multiple_editions(self): + @patch("bookwyrm.preview_images.generate_edition_preview_image_task.delay") + def test_search_multiple_editions(self, _): """it should get rid of duplicate editions for the same work""" - with patch("bookwyrm.preview_images.generate_edition_preview_image_task.delay"): - work = models.Work.objects.create(title="Work Title") - edition_1 = models.Edition.objects.create( - title="Edition 1 Title", parent_work=work - ) - edition_2 = models.Edition.objects.create( - title="Edition 2 Title", - parent_work=work, - edition_rank=20, # that's default babey - ) - edition_3 = models.Edition.objects.create(title="Fish", parent_work=work) + work = models.Work.objects.create(title="Work Title") + edition_1 = models.Edition.objects.create( + title="Edition 1 Title", parent_work=work + ) + edition_2 = models.Edition.objects.create( + title="Edition 2 Title", + parent_work=work, + isbn_13="123456789", # this is now the defualt edition + ) + edition_3 = models.Edition.objects.create(title="Fish", parent_work=work) - # pick the best edition - results = self.connector.search("Edition 1 Title") - self.assertEqual(len(results), 1) - self.assertEqual(results[0].key, edition_1.remote_id) + # pick the best edition + results = self.connector.search("Edition 1 Title") + self.assertEqual(len(results), 1) + self.assertEqual(results[0].key, edition_1.remote_id) - # pick the default edition when no match is best - results = self.connector.search("Edition Title") - self.assertEqual(len(results), 1) - self.assertEqual(results[0].key, edition_2.remote_id) + # pick the default edition when no match is best + results = self.connector.search("Edition Title") + self.assertEqual(len(results), 1) + self.assertEqual(results[0].key, edition_2.remote_id) - # only matches one edition, so no deduplication takes place - results = self.connector.search("Fish") - self.assertEqual(len(results), 1) - self.assertEqual(results[0].key, edition_3.remote_id) + # only matches one edition, so no deduplication takes place + results = self.connector.search("Fish") + self.assertEqual(len(results), 1) + self.assertEqual(results[0].key, edition_3.remote_id) diff --git a/bookwyrm/tests/models/test_book_model.py b/bookwyrm/tests/models/test_book_model.py index df61514c1..9408220e0 100644 --- a/bookwyrm/tests/models/test_book_model.py +++ b/bookwyrm/tests/models/test_book_model.py @@ -1,8 +1,9 @@ """ testing models """ +from unittest.mock import patch + from dateutil.parser import parse from django.test import TestCase from django.utils import timezone -from unittest.mock import patch from bookwyrm import models, settings from bookwyrm.models.book import isbn_10_to_13, isbn_13_to_10 diff --git a/bookwyrm/tests/test_postgres.py b/bookwyrm/tests/test_postgres.py new file mode 100644 index 000000000..98385d34b --- /dev/null +++ b/bookwyrm/tests/test_postgres.py @@ -0,0 +1,77 @@ +""" django configuration of postgres """ +from unittest.mock import patch +from django.test import TestCase + +from bookwyrm import models + + +@patch("bookwyrm.models.activitypub_mixin.broadcast_task.delay") +@patch("bookwyrm.preview_images.generate_edition_preview_image_task.delay") +class PostgresTriggers(TestCase): + """special migrations, fancy stuff ya know""" + + def test_search_vector_on_create(self, *_): + """make sure that search_vector is being set correctly on create""" + book = models.Edition.objects.create(title="The Long Goodbye") + book.refresh_from_db() + self.assertEqual(book.search_vector, "'goodby':3A 'long':2A") + + def test_search_vector_on_update(self, *_): + """make sure that search_vector is being set correctly on edit""" + book = models.Edition.objects.create(title="The Long Goodbye") + book.title = "The Even Longer Goodbye" + book.save(broadcast=False) + book.refresh_from_db() + self.assertEqual(book.search_vector, "'even':2A 'goodby':4A 'longer':3A") + + def test_search_vector_fields(self, *_): + """use multiple fields to create search vector""" + author = models.Author.objects.create(name="The Rays") + book = models.Edition.objects.create( + title="The Long Goodbye", + subtitle="wow cool", + series="series name", + languages=["irrelevent"], + ) + book.authors.add(author) + book.refresh_from_db() + self.assertEqual( + book.search_vector, + "'cool':5B 'goodby':3A 'long':2A 'name':9 'rays':7C 'seri':8 'the':6C 'wow':4B", + ) + + def test_seach_vector_on_author_update(self, *_): + """update search when an author name changes""" + author = models.Author.objects.create(name="The Rays") + book = models.Edition.objects.create( + title="The Long Goodbye", + ) + book.authors.add(author) + author.name = "Jeremy" + author.save(broadcast=False) + book.refresh_from_db() + + self.assertEqual(book.search_vector, "'goodby':3A 'jeremy':4C 'long':2A") + + def test_seach_vector_on_author_delete(self, *_): + """update search when an author name changes""" + author = models.Author.objects.create(name="Jeremy") + book = models.Edition.objects.create( + title="The Long Goodbye", + ) + + book.authors.add(author) + book.refresh_from_db() + self.assertEqual(book.search_vector, "'goodby':3A 'jeremy':4C 'long':2A") + + book.authors.remove(author) + book.refresh_from_db() + self.assertEqual(book.search_vector, "'goodby':3A 'long':2A") + + def test_search_vector_stop_word_fallback(self, *_): + """use a fallback when removing stop words leads to an empty vector""" + book = models.Edition.objects.create( + title="there there", + ) + book.refresh_from_db() + self.assertEqual(book.search_vector, "'there':1A,2A") diff --git a/bookwyrm/views/search.py b/bookwyrm/views/search.py index 274a3bc2e..d15fc6a87 100644 --- a/bookwyrm/views/search.py +++ b/bookwyrm/views/search.py @@ -23,7 +23,7 @@ class Search(View): def get(self, request): """that search bar up top""" query = request.GET.get("q") - min_confidence = request.GET.get("min_confidence", 0.1) + min_confidence = request.GET.get("min_confidence", 0) search_type = request.GET.get("type") search_remote = ( request.GET.get("remote", False) and request.user.is_authenticated