forked from mirrors/bookwyrm
Merge pull request #1199 from bookwyrm-social/search-index
More efficient search index
This commit is contained in:
commit
556ffa4194
10 changed files with 284 additions and 68 deletions
4
.github/workflows/django-tests.yml
vendored
4
.github/workflows/django-tests.yml
vendored
|
@ -20,7 +20,7 @@ jobs:
|
|||
|
||||
services:
|
||||
postgres:
|
||||
image: postgres:10
|
||||
image: postgres:12
|
||||
env:
|
||||
POSTGRES_USER: postgres
|
||||
POSTGRES_PASSWORD: hunter2
|
||||
|
@ -66,4 +66,4 @@ jobs:
|
|||
EMAIL_USE_TLS: true
|
||||
ENABLE_PREVIEW_IMAGES: true
|
||||
run: |
|
||||
python manage.py test
|
||||
pytest
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
from functools import reduce
|
||||
import operator
|
||||
|
||||
from django.contrib.postgres.search import SearchRank, SearchVector
|
||||
from django.contrib.postgres.search import SearchRank, SearchQuery
|
||||
from django.db.models import OuterRef, Subquery, F, Q
|
||||
|
||||
from bookwyrm import models
|
||||
|
@ -13,7 +13,7 @@ class Connector(AbstractConnector):
|
|||
"""instantiate a connector"""
|
||||
|
||||
# pylint: disable=arguments-differ
|
||||
def search(self, query, min_confidence=0.1, raw=False, filters=None):
|
||||
def search(self, query, min_confidence=0, raw=False, filters=None):
|
||||
"""search your local database"""
|
||||
filters = filters or []
|
||||
if not query:
|
||||
|
@ -141,16 +141,11 @@ def search_identifiers(query, *filters):
|
|||
|
||||
def search_title_author(query, min_confidence, *filters):
|
||||
"""searches for title and author"""
|
||||
vector = (
|
||||
SearchVector("title", weight="A")
|
||||
+ SearchVector("subtitle", weight="B")
|
||||
+ SearchVector("authors__name", weight="C")
|
||||
+ SearchVector("series", weight="D")
|
||||
)
|
||||
|
||||
query = SearchQuery(query, config="simple") | SearchQuery(query, config="english")
|
||||
results = (
|
||||
models.Edition.objects.annotate(rank=SearchRank(vector, query))
|
||||
.filter(*filters, rank__gt=min_confidence)
|
||||
models.Edition.objects.filter(*filters, search_vector=query)
|
||||
.annotate(rank=SearchRank(F("search_vector"), query))
|
||||
.filter(rank__gt=min_confidence)
|
||||
.order_by("-rank")
|
||||
)
|
||||
|
||||
|
|
|
@ -183,6 +183,7 @@ class EditionForm(CustomForm):
|
|||
"parent_work",
|
||||
"shelves",
|
||||
"connector",
|
||||
"search_vector",
|
||||
]
|
||||
|
||||
|
||||
|
@ -194,6 +195,7 @@ class AuthorForm(CustomForm):
|
|||
"origin_id",
|
||||
"created_date",
|
||||
"updated_date",
|
||||
"search_vector",
|
||||
]
|
||||
|
||||
|
||||
|
|
126
bookwyrm/migrations/0077_auto_20210623_2155.py
Normal file
126
bookwyrm/migrations/0077_auto_20210623_2155.py
Normal file
|
@ -0,0 +1,126 @@
|
|||
# Generated by Django 3.2.4 on 2021-06-23 21:55
|
||||
|
||||
import django.contrib.postgres.indexes
|
||||
import django.contrib.postgres.search
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
("bookwyrm", "0076_preview_images"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AddField(
|
||||
model_name="author",
|
||||
name="search_vector",
|
||||
field=django.contrib.postgres.search.SearchVectorField(null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name="book",
|
||||
name="search_vector",
|
||||
field=django.contrib.postgres.search.SearchVectorField(null=True),
|
||||
),
|
||||
migrations.AddIndex(
|
||||
model_name="author",
|
||||
index=django.contrib.postgres.indexes.GinIndex(
|
||||
fields=["search_vector"], name="bookwyrm_au_search__b050a8_gin"
|
||||
),
|
||||
),
|
||||
migrations.AddIndex(
|
||||
model_name="book",
|
||||
index=django.contrib.postgres.indexes.GinIndex(
|
||||
fields=["search_vector"], name="bookwyrm_bo_search__51beb3_gin"
|
||||
),
|
||||
),
|
||||
migrations.RunSQL(
|
||||
sql="""
|
||||
CREATE FUNCTION book_trigger() RETURNS trigger AS $$
|
||||
begin
|
||||
new.search_vector :=
|
||||
coalesce(
|
||||
NULLIF(setweight(to_tsvector('english', coalesce(new.title, '')), 'A'), ''),
|
||||
setweight(to_tsvector('simple', coalesce(new.title, '')), 'A')
|
||||
) ||
|
||||
setweight(to_tsvector('english', coalesce(new.subtitle, '')), 'B') ||
|
||||
(SELECT setweight(to_tsvector('simple', coalesce(array_to_string(array_agg(bookwyrm_author.name), ' '), '')), 'C')
|
||||
FROM bookwyrm_book
|
||||
LEFT OUTER JOIN bookwyrm_book_authors
|
||||
ON bookwyrm_book.id = bookwyrm_book_authors.book_id
|
||||
LEFT OUTER JOIN bookwyrm_author
|
||||
ON bookwyrm_book_authors.author_id = bookwyrm_author.id
|
||||
WHERE bookwyrm_book.id = new.id
|
||||
) ||
|
||||
setweight(to_tsvector('english', coalesce(new.series, '')), 'D');
|
||||
return new;
|
||||
end
|
||||
$$ LANGUAGE plpgsql;
|
||||
|
||||
CREATE TRIGGER search_vector_trigger
|
||||
BEFORE INSERT OR UPDATE OF title, subtitle, series, search_vector
|
||||
ON bookwyrm_book
|
||||
FOR EACH ROW EXECUTE FUNCTION book_trigger();
|
||||
|
||||
UPDATE bookwyrm_book SET search_vector = NULL;
|
||||
""",
|
||||
reverse_sql="""
|
||||
DROP TRIGGER IF EXISTS search_vector_trigger
|
||||
ON bookwyrm_book;
|
||||
DROP FUNCTION IF EXISTS book_trigger;
|
||||
""",
|
||||
),
|
||||
# when an author is edited
|
||||
migrations.RunSQL(
|
||||
sql="""
|
||||
CREATE FUNCTION author_trigger() RETURNS trigger AS $$
|
||||
begin
|
||||
WITH book AS (
|
||||
SELECT bookwyrm_book.id as row_id
|
||||
FROM bookwyrm_author
|
||||
LEFT OUTER JOIN bookwyrm_book_authors
|
||||
ON bookwyrm_book_authors.id = new.id
|
||||
LEFT OUTER JOIN bookwyrm_book
|
||||
ON bookwyrm_book.id = bookwyrm_book_authors.book_id
|
||||
)
|
||||
UPDATE bookwyrm_book SET search_vector = ''
|
||||
FROM book
|
||||
WHERE id = book.row_id;
|
||||
return new;
|
||||
end
|
||||
$$ LANGUAGE plpgsql;
|
||||
|
||||
CREATE TRIGGER author_search_vector_trigger
|
||||
AFTER UPDATE OF name
|
||||
ON bookwyrm_author
|
||||
FOR EACH ROW EXECUTE FUNCTION author_trigger();
|
||||
""",
|
||||
reverse_sql="""
|
||||
DROP TRIGGER IF EXISTS author_search_vector_trigger
|
||||
ON bookwyrm_author;
|
||||
DROP FUNCTION IF EXISTS author_trigger;
|
||||
""",
|
||||
),
|
||||
# when an author is added to or removed from a book
|
||||
migrations.RunSQL(
|
||||
sql="""
|
||||
CREATE FUNCTION book_authors_trigger() RETURNS trigger AS $$
|
||||
begin
|
||||
UPDATE bookwyrm_book SET search_vector = ''
|
||||
WHERE id = coalesce(new.book_id, old.book_id);
|
||||
return new;
|
||||
end
|
||||
$$ LANGUAGE plpgsql;
|
||||
|
||||
CREATE TRIGGER book_authors_search_vector_trigger
|
||||
AFTER INSERT OR DELETE
|
||||
ON bookwyrm_book_authors
|
||||
FOR EACH ROW EXECUTE FUNCTION book_authors_trigger();
|
||||
""",
|
||||
reverse_sql="""
|
||||
DROP TRIGGER IF EXISTS book_authors_search_vector_trigger
|
||||
ON bookwyrm_book_authors;
|
||||
DROP FUNCTION IF EXISTS book_authors_trigger;
|
||||
""",
|
||||
),
|
||||
]
|
|
@ -1,4 +1,5 @@
|
|||
""" database schema for info about authors """
|
||||
from django.contrib.postgres.indexes import GinIndex
|
||||
from django.db import models
|
||||
|
||||
from bookwyrm import activitypub
|
||||
|
@ -37,3 +38,8 @@ class Author(BookDataModel):
|
|||
return "https://%s/author/%s" % (DOMAIN, self.id)
|
||||
|
||||
activity_serializer = activitypub.Author
|
||||
|
||||
class Meta:
|
||||
"""sets up postgres GIN index field"""
|
||||
|
||||
indexes = (GinIndex(fields=["search_vector"]),)
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
""" database schema for books and shelves """
|
||||
import re
|
||||
|
||||
from django.contrib.postgres.search import SearchVectorField
|
||||
from django.contrib.postgres.indexes import GinIndex
|
||||
from django.db import models
|
||||
from django.dispatch import receiver
|
||||
from model_utils import FieldTracker
|
||||
|
@ -34,6 +36,7 @@ class BookDataModel(ObjectMixin, BookWyrmModel):
|
|||
bnf_id = fields.CharField( # Bibliothèque nationale de France
|
||||
max_length=255, blank=True, null=True, deduplication_field=True
|
||||
)
|
||||
search_vector = SearchVectorField(null=True)
|
||||
|
||||
last_edited_by = fields.ForeignKey(
|
||||
"User",
|
||||
|
@ -142,6 +145,11 @@ class Book(BookDataModel):
|
|||
self.title,
|
||||
)
|
||||
|
||||
class Meta:
|
||||
"""sets up postgres GIN index field"""
|
||||
|
||||
indexes = (GinIndex(fields=["search_vector"]),)
|
||||
|
||||
|
||||
class Work(OrderedCollectionPageMixin, Book):
|
||||
"""a work (an abstract concept of a book that manifests in an edition)"""
|
||||
|
|
|
@ -43,68 +43,69 @@ class SelfConnector(TestCase):
|
|||
self.assertEqual(result.year, 1980)
|
||||
self.assertEqual(result.connector, self.connector)
|
||||
|
||||
def test_search_rank(self):
|
||||
@patch("bookwyrm.preview_images.generate_edition_preview_image_task.delay")
|
||||
def test_search_rank(self, _):
|
||||
"""prioritize certain results"""
|
||||
author = models.Author.objects.create(name="Anonymous")
|
||||
with patch("bookwyrm.preview_images.generate_edition_preview_image_task.delay"):
|
||||
edition = models.Edition.objects.create(
|
||||
title="Edition of Example Work",
|
||||
published_date=datetime.datetime(1980, 5, 10, tzinfo=timezone.utc),
|
||||
parent_work=models.Work.objects.create(title=""),
|
||||
)
|
||||
# author text is rank C
|
||||
edition.authors.add(author)
|
||||
edition = models.Edition.objects.create(
|
||||
title="Edition of Example Work",
|
||||
published_date=datetime.datetime(1980, 5, 10, tzinfo=timezone.utc),
|
||||
parent_work=models.Work.objects.create(title=""),
|
||||
)
|
||||
# author text is rank B
|
||||
edition.authors.add(author)
|
||||
|
||||
# series is rank D
|
||||
models.Edition.objects.create(
|
||||
title="Another Edition",
|
||||
series="Anonymous",
|
||||
parent_work=models.Work.objects.create(title=""),
|
||||
)
|
||||
# subtitle is rank B
|
||||
models.Edition.objects.create(
|
||||
title="More Editions",
|
||||
subtitle="The Anonymous Edition",
|
||||
parent_work=models.Work.objects.create(title=""),
|
||||
)
|
||||
# title is rank A
|
||||
models.Edition.objects.create(title="Anonymous")
|
||||
# doesn't rank in this search
|
||||
edition = models.Edition.objects.create(
|
||||
title="An Edition", parent_work=models.Work.objects.create(title="")
|
||||
)
|
||||
# series is rank D
|
||||
models.Edition.objects.create(
|
||||
title="Another Edition",
|
||||
series="Anonymous",
|
||||
parent_work=models.Work.objects.create(title=""),
|
||||
)
|
||||
# subtitle is rank B
|
||||
models.Edition.objects.create(
|
||||
title="More Editions",
|
||||
subtitle="The Anonymous Edition",
|
||||
parent_work=models.Work.objects.create(title=""),
|
||||
)
|
||||
# title is rank A
|
||||
models.Edition.objects.create(title="Anonymous")
|
||||
# doesn't rank in this search
|
||||
models.Edition.objects.create(
|
||||
title="An Edition", parent_work=models.Work.objects.create(title="")
|
||||
)
|
||||
|
||||
results = self.connector.search("Anonymous")
|
||||
self.assertEqual(len(results), 3)
|
||||
results = self.connector.search("Anonymous")
|
||||
self.assertEqual(len(results), 4)
|
||||
self.assertEqual(results[0].title, "Anonymous")
|
||||
self.assertEqual(results[1].title, "More Editions")
|
||||
self.assertEqual(results[2].title, "Edition of Example Work")
|
||||
self.assertEqual(results[3].title, "Another Edition")
|
||||
|
||||
def test_search_multiple_editions(self):
|
||||
@patch("bookwyrm.preview_images.generate_edition_preview_image_task.delay")
|
||||
def test_search_multiple_editions(self, _):
|
||||
"""it should get rid of duplicate editions for the same work"""
|
||||
with patch("bookwyrm.preview_images.generate_edition_preview_image_task.delay"):
|
||||
work = models.Work.objects.create(title="Work Title")
|
||||
edition_1 = models.Edition.objects.create(
|
||||
title="Edition 1 Title", parent_work=work
|
||||
)
|
||||
edition_2 = models.Edition.objects.create(
|
||||
title="Edition 2 Title",
|
||||
parent_work=work,
|
||||
edition_rank=20, # that's default babey
|
||||
)
|
||||
edition_3 = models.Edition.objects.create(title="Fish", parent_work=work)
|
||||
work = models.Work.objects.create(title="Work Title")
|
||||
edition_1 = models.Edition.objects.create(
|
||||
title="Edition 1 Title", parent_work=work
|
||||
)
|
||||
edition_2 = models.Edition.objects.create(
|
||||
title="Edition 2 Title",
|
||||
parent_work=work,
|
||||
isbn_13="123456789", # this is now the defualt edition
|
||||
)
|
||||
edition_3 = models.Edition.objects.create(title="Fish", parent_work=work)
|
||||
|
||||
# pick the best edition
|
||||
results = self.connector.search("Edition 1 Title")
|
||||
self.assertEqual(len(results), 1)
|
||||
self.assertEqual(results[0].key, edition_1.remote_id)
|
||||
# pick the best edition
|
||||
results = self.connector.search("Edition 1 Title")
|
||||
self.assertEqual(len(results), 1)
|
||||
self.assertEqual(results[0].key, edition_1.remote_id)
|
||||
|
||||
# pick the default edition when no match is best
|
||||
results = self.connector.search("Edition Title")
|
||||
self.assertEqual(len(results), 1)
|
||||
self.assertEqual(results[0].key, edition_2.remote_id)
|
||||
# pick the default edition when no match is best
|
||||
results = self.connector.search("Edition Title")
|
||||
self.assertEqual(len(results), 1)
|
||||
self.assertEqual(results[0].key, edition_2.remote_id)
|
||||
|
||||
# only matches one edition, so no deduplication takes place
|
||||
results = self.connector.search("Fish")
|
||||
self.assertEqual(len(results), 1)
|
||||
self.assertEqual(results[0].key, edition_3.remote_id)
|
||||
# only matches one edition, so no deduplication takes place
|
||||
results = self.connector.search("Fish")
|
||||
self.assertEqual(len(results), 1)
|
||||
self.assertEqual(results[0].key, edition_3.remote_id)
|
||||
|
|
|
@ -1,8 +1,9 @@
|
|||
""" testing models """
|
||||
from unittest.mock import patch
|
||||
|
||||
from dateutil.parser import parse
|
||||
from django.test import TestCase
|
||||
from django.utils import timezone
|
||||
from unittest.mock import patch
|
||||
|
||||
from bookwyrm import models, settings
|
||||
from bookwyrm.models.book import isbn_10_to_13, isbn_13_to_10
|
||||
|
|
77
bookwyrm/tests/test_postgres.py
Normal file
77
bookwyrm/tests/test_postgres.py
Normal file
|
@ -0,0 +1,77 @@
|
|||
""" django configuration of postgres """
|
||||
from unittest.mock import patch
|
||||
from django.test import TestCase
|
||||
|
||||
from bookwyrm import models
|
||||
|
||||
|
||||
@patch("bookwyrm.models.activitypub_mixin.broadcast_task.delay")
|
||||
@patch("bookwyrm.preview_images.generate_edition_preview_image_task.delay")
|
||||
class PostgresTriggers(TestCase):
|
||||
"""special migrations, fancy stuff ya know"""
|
||||
|
||||
def test_search_vector_on_create(self, *_):
|
||||
"""make sure that search_vector is being set correctly on create"""
|
||||
book = models.Edition.objects.create(title="The Long Goodbye")
|
||||
book.refresh_from_db()
|
||||
self.assertEqual(book.search_vector, "'goodby':3A 'long':2A")
|
||||
|
||||
def test_search_vector_on_update(self, *_):
|
||||
"""make sure that search_vector is being set correctly on edit"""
|
||||
book = models.Edition.objects.create(title="The Long Goodbye")
|
||||
book.title = "The Even Longer Goodbye"
|
||||
book.save(broadcast=False)
|
||||
book.refresh_from_db()
|
||||
self.assertEqual(book.search_vector, "'even':2A 'goodby':4A 'longer':3A")
|
||||
|
||||
def test_search_vector_fields(self, *_):
|
||||
"""use multiple fields to create search vector"""
|
||||
author = models.Author.objects.create(name="The Rays")
|
||||
book = models.Edition.objects.create(
|
||||
title="The Long Goodbye",
|
||||
subtitle="wow cool",
|
||||
series="series name",
|
||||
languages=["irrelevent"],
|
||||
)
|
||||
book.authors.add(author)
|
||||
book.refresh_from_db()
|
||||
self.assertEqual(
|
||||
book.search_vector,
|
||||
"'cool':5B 'goodby':3A 'long':2A 'name':9 'rays':7C 'seri':8 'the':6C 'wow':4B",
|
||||
)
|
||||
|
||||
def test_seach_vector_on_author_update(self, *_):
|
||||
"""update search when an author name changes"""
|
||||
author = models.Author.objects.create(name="The Rays")
|
||||
book = models.Edition.objects.create(
|
||||
title="The Long Goodbye",
|
||||
)
|
||||
book.authors.add(author)
|
||||
author.name = "Jeremy"
|
||||
author.save(broadcast=False)
|
||||
book.refresh_from_db()
|
||||
|
||||
self.assertEqual(book.search_vector, "'goodby':3A 'jeremy':4C 'long':2A")
|
||||
|
||||
def test_seach_vector_on_author_delete(self, *_):
|
||||
"""update search when an author name changes"""
|
||||
author = models.Author.objects.create(name="Jeremy")
|
||||
book = models.Edition.objects.create(
|
||||
title="The Long Goodbye",
|
||||
)
|
||||
|
||||
book.authors.add(author)
|
||||
book.refresh_from_db()
|
||||
self.assertEqual(book.search_vector, "'goodby':3A 'jeremy':4C 'long':2A")
|
||||
|
||||
book.authors.remove(author)
|
||||
book.refresh_from_db()
|
||||
self.assertEqual(book.search_vector, "'goodby':3A 'long':2A")
|
||||
|
||||
def test_search_vector_stop_word_fallback(self, *_):
|
||||
"""use a fallback when removing stop words leads to an empty vector"""
|
||||
book = models.Edition.objects.create(
|
||||
title="there there",
|
||||
)
|
||||
book.refresh_from_db()
|
||||
self.assertEqual(book.search_vector, "'there':1A,2A")
|
|
@ -23,7 +23,7 @@ class Search(View):
|
|||
def get(self, request):
|
||||
"""that search bar up top"""
|
||||
query = request.GET.get("q")
|
||||
min_confidence = request.GET.get("min_confidence", 0.1)
|
||||
min_confidence = request.GET.get("min_confidence", 0)
|
||||
search_type = request.GET.get("type")
|
||||
search_remote = (
|
||||
request.GET.get("remote", False) and request.user.is_authenticated
|
||||
|
|
Loading…
Reference in a new issue