Merge pull request #1199 from bookwyrm-social/search-index

More efficient search index
This commit is contained in:
Mouse Reeve 2021-06-27 08:03:14 -07:00 committed by GitHub
commit 556ffa4194
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 284 additions and 68 deletions

View file

@ -20,7 +20,7 @@ jobs:
services:
postgres:
image: postgres:10
image: postgres:12
env:
POSTGRES_USER: postgres
POSTGRES_PASSWORD: hunter2
@ -66,4 +66,4 @@ jobs:
EMAIL_USE_TLS: true
ENABLE_PREVIEW_IMAGES: true
run: |
python manage.py test
pytest

View file

@ -2,7 +2,7 @@
from functools import reduce
import operator
from django.contrib.postgres.search import SearchRank, SearchVector
from django.contrib.postgres.search import SearchRank, SearchQuery
from django.db.models import OuterRef, Subquery, F, Q
from bookwyrm import models
@ -13,7 +13,7 @@ class Connector(AbstractConnector):
"""instantiate a connector"""
# pylint: disable=arguments-differ
def search(self, query, min_confidence=0.1, raw=False, filters=None):
def search(self, query, min_confidence=0, raw=False, filters=None):
"""search your local database"""
filters = filters or []
if not query:
@ -141,16 +141,11 @@ def search_identifiers(query, *filters):
def search_title_author(query, min_confidence, *filters):
"""searches for title and author"""
vector = (
SearchVector("title", weight="A")
+ SearchVector("subtitle", weight="B")
+ SearchVector("authors__name", weight="C")
+ SearchVector("series", weight="D")
)
query = SearchQuery(query, config="simple") | SearchQuery(query, config="english")
results = (
models.Edition.objects.annotate(rank=SearchRank(vector, query))
.filter(*filters, rank__gt=min_confidence)
models.Edition.objects.filter(*filters, search_vector=query)
.annotate(rank=SearchRank(F("search_vector"), query))
.filter(rank__gt=min_confidence)
.order_by("-rank")
)

View file

@ -183,6 +183,7 @@ class EditionForm(CustomForm):
"parent_work",
"shelves",
"connector",
"search_vector",
]
@ -194,6 +195,7 @@ class AuthorForm(CustomForm):
"origin_id",
"created_date",
"updated_date",
"search_vector",
]

View file

@ -0,0 +1,126 @@
# Generated by Django 3.2.4 on 2021-06-23 21:55
import django.contrib.postgres.indexes
import django.contrib.postgres.search
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
("bookwyrm", "0076_preview_images"),
]
operations = [
migrations.AddField(
model_name="author",
name="search_vector",
field=django.contrib.postgres.search.SearchVectorField(null=True),
),
migrations.AddField(
model_name="book",
name="search_vector",
field=django.contrib.postgres.search.SearchVectorField(null=True),
),
migrations.AddIndex(
model_name="author",
index=django.contrib.postgres.indexes.GinIndex(
fields=["search_vector"], name="bookwyrm_au_search__b050a8_gin"
),
),
migrations.AddIndex(
model_name="book",
index=django.contrib.postgres.indexes.GinIndex(
fields=["search_vector"], name="bookwyrm_bo_search__51beb3_gin"
),
),
migrations.RunSQL(
sql="""
CREATE FUNCTION book_trigger() RETURNS trigger AS $$
begin
new.search_vector :=
coalesce(
NULLIF(setweight(to_tsvector('english', coalesce(new.title, '')), 'A'), ''),
setweight(to_tsvector('simple', coalesce(new.title, '')), 'A')
) ||
setweight(to_tsvector('english', coalesce(new.subtitle, '')), 'B') ||
(SELECT setweight(to_tsvector('simple', coalesce(array_to_string(array_agg(bookwyrm_author.name), ' '), '')), 'C')
FROM bookwyrm_book
LEFT OUTER JOIN bookwyrm_book_authors
ON bookwyrm_book.id = bookwyrm_book_authors.book_id
LEFT OUTER JOIN bookwyrm_author
ON bookwyrm_book_authors.author_id = bookwyrm_author.id
WHERE bookwyrm_book.id = new.id
) ||
setweight(to_tsvector('english', coalesce(new.series, '')), 'D');
return new;
end
$$ LANGUAGE plpgsql;
CREATE TRIGGER search_vector_trigger
BEFORE INSERT OR UPDATE OF title, subtitle, series, search_vector
ON bookwyrm_book
FOR EACH ROW EXECUTE FUNCTION book_trigger();
UPDATE bookwyrm_book SET search_vector = NULL;
""",
reverse_sql="""
DROP TRIGGER IF EXISTS search_vector_trigger
ON bookwyrm_book;
DROP FUNCTION IF EXISTS book_trigger;
""",
),
# when an author is edited
migrations.RunSQL(
sql="""
CREATE FUNCTION author_trigger() RETURNS trigger AS $$
begin
WITH book AS (
SELECT bookwyrm_book.id as row_id
FROM bookwyrm_author
LEFT OUTER JOIN bookwyrm_book_authors
ON bookwyrm_book_authors.id = new.id
LEFT OUTER JOIN bookwyrm_book
ON bookwyrm_book.id = bookwyrm_book_authors.book_id
)
UPDATE bookwyrm_book SET search_vector = ''
FROM book
WHERE id = book.row_id;
return new;
end
$$ LANGUAGE plpgsql;
CREATE TRIGGER author_search_vector_trigger
AFTER UPDATE OF name
ON bookwyrm_author
FOR EACH ROW EXECUTE FUNCTION author_trigger();
""",
reverse_sql="""
DROP TRIGGER IF EXISTS author_search_vector_trigger
ON bookwyrm_author;
DROP FUNCTION IF EXISTS author_trigger;
""",
),
# when an author is added to or removed from a book
migrations.RunSQL(
sql="""
CREATE FUNCTION book_authors_trigger() RETURNS trigger AS $$
begin
UPDATE bookwyrm_book SET search_vector = ''
WHERE id = coalesce(new.book_id, old.book_id);
return new;
end
$$ LANGUAGE plpgsql;
CREATE TRIGGER book_authors_search_vector_trigger
AFTER INSERT OR DELETE
ON bookwyrm_book_authors
FOR EACH ROW EXECUTE FUNCTION book_authors_trigger();
""",
reverse_sql="""
DROP TRIGGER IF EXISTS book_authors_search_vector_trigger
ON bookwyrm_book_authors;
DROP FUNCTION IF EXISTS book_authors_trigger;
""",
),
]

View file

@ -1,4 +1,5 @@
""" database schema for info about authors """
from django.contrib.postgres.indexes import GinIndex
from django.db import models
from bookwyrm import activitypub
@ -37,3 +38,8 @@ class Author(BookDataModel):
return "https://%s/author/%s" % (DOMAIN, self.id)
activity_serializer = activitypub.Author
class Meta:
"""sets up postgres GIN index field"""
indexes = (GinIndex(fields=["search_vector"]),)

View file

@ -1,6 +1,8 @@
""" database schema for books and shelves """
import re
from django.contrib.postgres.search import SearchVectorField
from django.contrib.postgres.indexes import GinIndex
from django.db import models
from django.dispatch import receiver
from model_utils import FieldTracker
@ -34,6 +36,7 @@ class BookDataModel(ObjectMixin, BookWyrmModel):
bnf_id = fields.CharField( # Bibliothèque nationale de France
max_length=255, blank=True, null=True, deduplication_field=True
)
search_vector = SearchVectorField(null=True)
last_edited_by = fields.ForeignKey(
"User",
@ -142,6 +145,11 @@ class Book(BookDataModel):
self.title,
)
class Meta:
"""sets up postgres GIN index field"""
indexes = (GinIndex(fields=["search_vector"]),)
class Work(OrderedCollectionPageMixin, Book):
"""a work (an abstract concept of a book that manifests in an edition)"""

View file

@ -43,68 +43,69 @@ class SelfConnector(TestCase):
self.assertEqual(result.year, 1980)
self.assertEqual(result.connector, self.connector)
def test_search_rank(self):
@patch("bookwyrm.preview_images.generate_edition_preview_image_task.delay")
def test_search_rank(self, _):
"""prioritize certain results"""
author = models.Author.objects.create(name="Anonymous")
with patch("bookwyrm.preview_images.generate_edition_preview_image_task.delay"):
edition = models.Edition.objects.create(
title="Edition of Example Work",
published_date=datetime.datetime(1980, 5, 10, tzinfo=timezone.utc),
parent_work=models.Work.objects.create(title=""),
)
# author text is rank C
edition.authors.add(author)
edition = models.Edition.objects.create(
title="Edition of Example Work",
published_date=datetime.datetime(1980, 5, 10, tzinfo=timezone.utc),
parent_work=models.Work.objects.create(title=""),
)
# author text is rank B
edition.authors.add(author)
# series is rank D
models.Edition.objects.create(
title="Another Edition",
series="Anonymous",
parent_work=models.Work.objects.create(title=""),
)
# subtitle is rank B
models.Edition.objects.create(
title="More Editions",
subtitle="The Anonymous Edition",
parent_work=models.Work.objects.create(title=""),
)
# title is rank A
models.Edition.objects.create(title="Anonymous")
# doesn't rank in this search
edition = models.Edition.objects.create(
title="An Edition", parent_work=models.Work.objects.create(title="")
)
# series is rank D
models.Edition.objects.create(
title="Another Edition",
series="Anonymous",
parent_work=models.Work.objects.create(title=""),
)
# subtitle is rank B
models.Edition.objects.create(
title="More Editions",
subtitle="The Anonymous Edition",
parent_work=models.Work.objects.create(title=""),
)
# title is rank A
models.Edition.objects.create(title="Anonymous")
# doesn't rank in this search
models.Edition.objects.create(
title="An Edition", parent_work=models.Work.objects.create(title="")
)
results = self.connector.search("Anonymous")
self.assertEqual(len(results), 3)
results = self.connector.search("Anonymous")
self.assertEqual(len(results), 4)
self.assertEqual(results[0].title, "Anonymous")
self.assertEqual(results[1].title, "More Editions")
self.assertEqual(results[2].title, "Edition of Example Work")
self.assertEqual(results[3].title, "Another Edition")
def test_search_multiple_editions(self):
@patch("bookwyrm.preview_images.generate_edition_preview_image_task.delay")
def test_search_multiple_editions(self, _):
"""it should get rid of duplicate editions for the same work"""
with patch("bookwyrm.preview_images.generate_edition_preview_image_task.delay"):
work = models.Work.objects.create(title="Work Title")
edition_1 = models.Edition.objects.create(
title="Edition 1 Title", parent_work=work
)
edition_2 = models.Edition.objects.create(
title="Edition 2 Title",
parent_work=work,
edition_rank=20, # that's default babey
)
edition_3 = models.Edition.objects.create(title="Fish", parent_work=work)
work = models.Work.objects.create(title="Work Title")
edition_1 = models.Edition.objects.create(
title="Edition 1 Title", parent_work=work
)
edition_2 = models.Edition.objects.create(
title="Edition 2 Title",
parent_work=work,
isbn_13="123456789", # this is now the defualt edition
)
edition_3 = models.Edition.objects.create(title="Fish", parent_work=work)
# pick the best edition
results = self.connector.search("Edition 1 Title")
self.assertEqual(len(results), 1)
self.assertEqual(results[0].key, edition_1.remote_id)
# pick the best edition
results = self.connector.search("Edition 1 Title")
self.assertEqual(len(results), 1)
self.assertEqual(results[0].key, edition_1.remote_id)
# pick the default edition when no match is best
results = self.connector.search("Edition Title")
self.assertEqual(len(results), 1)
self.assertEqual(results[0].key, edition_2.remote_id)
# pick the default edition when no match is best
results = self.connector.search("Edition Title")
self.assertEqual(len(results), 1)
self.assertEqual(results[0].key, edition_2.remote_id)
# only matches one edition, so no deduplication takes place
results = self.connector.search("Fish")
self.assertEqual(len(results), 1)
self.assertEqual(results[0].key, edition_3.remote_id)
# only matches one edition, so no deduplication takes place
results = self.connector.search("Fish")
self.assertEqual(len(results), 1)
self.assertEqual(results[0].key, edition_3.remote_id)

View file

@ -1,8 +1,9 @@
""" testing models """
from unittest.mock import patch
from dateutil.parser import parse
from django.test import TestCase
from django.utils import timezone
from unittest.mock import patch
from bookwyrm import models, settings
from bookwyrm.models.book import isbn_10_to_13, isbn_13_to_10

View file

@ -0,0 +1,77 @@
""" django configuration of postgres """
from unittest.mock import patch
from django.test import TestCase
from bookwyrm import models
@patch("bookwyrm.models.activitypub_mixin.broadcast_task.delay")
@patch("bookwyrm.preview_images.generate_edition_preview_image_task.delay")
class PostgresTriggers(TestCase):
"""special migrations, fancy stuff ya know"""
def test_search_vector_on_create(self, *_):
"""make sure that search_vector is being set correctly on create"""
book = models.Edition.objects.create(title="The Long Goodbye")
book.refresh_from_db()
self.assertEqual(book.search_vector, "'goodby':3A 'long':2A")
def test_search_vector_on_update(self, *_):
"""make sure that search_vector is being set correctly on edit"""
book = models.Edition.objects.create(title="The Long Goodbye")
book.title = "The Even Longer Goodbye"
book.save(broadcast=False)
book.refresh_from_db()
self.assertEqual(book.search_vector, "'even':2A 'goodby':4A 'longer':3A")
def test_search_vector_fields(self, *_):
"""use multiple fields to create search vector"""
author = models.Author.objects.create(name="The Rays")
book = models.Edition.objects.create(
title="The Long Goodbye",
subtitle="wow cool",
series="series name",
languages=["irrelevent"],
)
book.authors.add(author)
book.refresh_from_db()
self.assertEqual(
book.search_vector,
"'cool':5B 'goodby':3A 'long':2A 'name':9 'rays':7C 'seri':8 'the':6C 'wow':4B",
)
def test_seach_vector_on_author_update(self, *_):
"""update search when an author name changes"""
author = models.Author.objects.create(name="The Rays")
book = models.Edition.objects.create(
title="The Long Goodbye",
)
book.authors.add(author)
author.name = "Jeremy"
author.save(broadcast=False)
book.refresh_from_db()
self.assertEqual(book.search_vector, "'goodby':3A 'jeremy':4C 'long':2A")
def test_seach_vector_on_author_delete(self, *_):
"""update search when an author name changes"""
author = models.Author.objects.create(name="Jeremy")
book = models.Edition.objects.create(
title="The Long Goodbye",
)
book.authors.add(author)
book.refresh_from_db()
self.assertEqual(book.search_vector, "'goodby':3A 'jeremy':4C 'long':2A")
book.authors.remove(author)
book.refresh_from_db()
self.assertEqual(book.search_vector, "'goodby':3A 'long':2A")
def test_search_vector_stop_word_fallback(self, *_):
"""use a fallback when removing stop words leads to an empty vector"""
book = models.Edition.objects.create(
title="there there",
)
book.refresh_from_db()
self.assertEqual(book.search_vector, "'there':1A,2A")

View file

@ -23,7 +23,7 @@ class Search(View):
def get(self, request):
"""that search bar up top"""
query = request.GET.get("q")
min_confidence = request.GET.get("min_confidence", 0.1)
min_confidence = request.GET.get("min_confidence", 0)
search_type = request.GET.get("type")
search_remote = (
request.GET.get("remote", False) and request.user.is_authenticated