moviewyrm/bookwyrm/connectors/self_connector.py

""" using a bookwyrm instance as a source of book data """
from functools import reduce
import operator

from django.contrib.postgres.search import SearchRank, SearchVector
from django.db.models import Count, F, Q

from bookwyrm import models
from .abstract_connector import AbstractConnector, SearchResult


class Connector(AbstractConnector):
    """ connector that answers searches from this instance's own database """

    # upper bound on the number of results a single search returns
    _max_results = 10

    # pylint: disable=arguments-differ
    def search(self, query, min_confidence=0.1, raw=False):
        """search your local database

        Unique identifiers are tried first; the title/author text search is
        only used when no identifier matches.

        :param query: the search text; a falsy value returns [] immediately
        :param min_confidence: rank a text match has to exceed to be kept
        :param raw: when true, return the matched objects unformatted
        :return: a list of at most ``_max_results`` entries; formatted
            entries are sorted by confidence, highest first
        """
        if not query:
            return []
        # first, try searching unique identifiers
        results = search_identifiers(query)
        if not results:
            # then try searching title/author
            results = search_title_author(query, min_confidence)
        search_results = self._collect_results(
            results, raw, self.format_search_result
        )
        if not raw:
            # raw results are left in the order the lookup produced them
            search_results.sort(key=lambda r: r.confidence, reverse=True)
        return search_results

    def isbn_search(self, query, raw=False):
        """search your local database for an isbn

        :param query: value compared against both ``isbn_10`` and ``isbn_13``;
            a falsy value returns [] immediately
        :param raw: when true, return the matched objects unformatted
        :return: a list of at most ``_max_results`` entries
        """
        if not query:
            return []

        results = models.Edition.objects.filter(
            Q(isbn_10=query) | Q(isbn_13=query)
        ).distinct()

        # when there are multiple editions of the same work, pick the default.
        # it would be odd for this to happen.
        results = results.filter(parent_work__default_edition__id=F("id")) or results

        return self._collect_results(results, raw, self.format_isbn_search_result)

    def _collect_results(self, results, raw, formatter):
        """gather up to ``_max_results`` entries from an iterable of matches

        Iteration stops as soon as the limit is reached, so a lazy iterable
        is not consumed further than necessary.

        :param results: iterable of matched objects
        :param raw: when true, entries are kept as-is instead of formatted
        :param formatter: callable turning a match into a search result
        :return: a new list holding the collected entries
        """
        collected = []
        for result in results:
            collected.append(result if raw else formatter(result))
            if len(collected) >= self._max_results:
                break
        return collected

    def format_search_result(self, search_result):
        """build a SearchResult from a matched object

        The year is None when the match has no published date, and the
        confidence falls back to 1 when the match carries no ``rank``
        attribute.
        """
        published = search_result.published_date
        return SearchResult(
            title=search_result.title,
            key=search_result.remote_id,
            author=search_result.author_text,
            year=published.year if published else None,
            connector=self,
            confidence=getattr(search_result, "rank", 1),
        )

    def format_isbn_search_result(self, search_result):
        """ isbn matches are formatted exactly like any other search result """
        return self.format_search_result(search_result)

    def is_work_data(self, data):
        """ no-op for this connector; returns None """

    def get_edition_from_work_data(self, data):
        """ no-op for this connector; returns None """

    def get_work_from_edition_data(self, data):
        """ no-op for this connector; returns None """

    def get_authors_from_data(self, data):
        """ this connector never extracts authors; always returns None """
        return None

    def parse_isbn_search_data(self, data):
        """ it's already in the right format, don't even worry about it """
        return data

    def parse_search_data(self, data):
        """ it's already in the right format, don't even worry about it """
        return data

    def expand_book_data(self, book):
        """ no-op for this connector; returns None """


def search_identifiers(query):
    """look for editions where any deduplication field equals the query

    The fields compared are the ones on ``Edition`` that carry a truthy
    ``deduplication_field`` attribute. Matches that are the default edition
    of their work are preferred; when there are none, every match is
    returned.
    """
    identifier_names = (
        field.name
        for field in models.Edition._meta.get_fields()
        if getattr(field, "deduplication_field", False)
    )
    # OR together one exact-match condition per identifier field
    any_identifier = reduce(
        operator.or_, (Q(**{name: query}) for name in identifier_names)
    )
    matches = models.Edition.objects.filter(any_identifier).distinct()

    # when there are multiple editions of the same work, pick the default.
    # it would be odd for this to happen.
    default_matches = matches.filter(parent_work__default_edition__id=F("id"))
    if default_matches:
        return default_matches
    return matches


def search_title_author(query, min_confidence):
    """searches for title and author

    Runs a full-text search over title, subtitle, author names and series
    (weighted A to D in that order) and yields one edition per matching
    work, best-ranked work first.

    :param query: the search text handed to ``SearchRank``
    :param min_confidence: only editions ranked above this value are kept
    :return: a generator of editions, each annotated with its ``rank``
    """
    vector = (
        SearchVector("title", weight="A")
        + SearchVector("subtitle", weight="B")
        + SearchVector("authors__name", weight="C")
        + SearchVector("series", weight="D")
    )

    results = (
        models.Edition.objects.annotate(search=vector)
        .annotate(rank=SearchRank(vector, query))
        .filter(rank__gt=min_confidence)
        .order_by("-rank")
    )

    # when there are multiple editions of the same work, pick the closest
    editions_of_work = (
        results.values("parent_work")
        .annotate(Count("parent_work"))
        .values_list("parent_work")
    )

    # dict.fromkeys removes duplicate works like set() would, but keeps the
    # "-rank" order of the queryset. Callers that stop after the first few
    # items therefore receive the best matches, not an arbitrary subset.
    for work_id in dict.fromkeys(editions_of_work):
        editions = results.filter(parent_work=work_id)
        # each lookup is a database query, so fetch each candidate only once
        best_match = editions.first()
        if best_match is None:
            # nothing left for this work (e.g. removed since the first query)
            continue
        default = editions.filter(parent_work__default_edition=F("id")).first()
        # if multiple books have the top rank, pick the default edition
        if default is not None and default.rank == best_match.rank:
            yield default
        else:
            yield best_match
Runs black 2021-03-08 16:49:10 +00:00			`""" using a bookwyrm instance as a source of book data """`
Separate search for unique identifiers out from text search 2021-01-02 23:48:59 +00:00			`from functools import reduce`
			`import operator`

Sort by rank in local db full text search plus tests 2020-05-12 20:03:46 +00:00			`from django.contrib.postgres.search import SearchRank, SearchVector`
Separate search for unique identifiers out from text search 2021-01-02 23:48:59 +00:00			`from django.db.models import Count, F, Q`
More connectors more problems 2020-03-28 19:55:53 +00:00
Updates migrations To get the app working again I ran resetdb, let it crash in initdb, then ran the migration, then re-ran initdb 2020-09-21 15:10:37 +00:00			`from bookwyrm import models`
Adds fulltext search of postgres 2020-04-29 17:57:20 +00:00			`from .abstract_connector import AbstractConnector, SearchResult`
More connectors more problems 2020-03-28 19:55:53 +00:00

			`class Connector(AbstractConnector):`
Runs black 2021-03-08 16:49:10 +00:00			`""" instantiate a connector """`

Search for books to add to lists 2021-01-31 19:11:26 +00:00			`# pylint: disable=arguments-differ`
			`def search(self, query, min_confidence=0.1, raw=False):`
Runs black 2021-03-08 16:49:10 +00:00			`""" search your local database """`
Don't match arbitrary results on empty isbn queries 2021-01-07 17:26:05 +00:00			`if not query:`
			`return []`
Separate search for unique identifiers out from text search 2021-01-02 23:48:59 +00:00			`# first, try searching unqiue identifiers`
			`results = search_identifiers(query)`
			`if not results:`
			`# then try searching title/author`
			`results = search_title_author(query, min_confidence)`
Adds fulltext search of postgres 2020-04-29 17:57:20 +00:00			`search_results = []`
Separate search for unique identifiers out from text search 2021-01-02 23:48:59 +00:00			`for result in results:`
Search for books to add to lists 2021-01-31 19:11:26 +00:00			`if raw:`
			`search_results.append(result)`
			`else:`
			`search_results.append(self.format_search_result(result))`
Return best matching edition instead of default in search 2021-01-02 23:15:25 +00:00			`if len(search_results) >= 10:`
			`break`
Search for books to add to lists 2021-01-31 19:11:26 +00:00			`if not raw:`
			`search_results.sort(key=lambda r: r.confidence, reverse=True)`
Adds fulltext search of postgres 2020-04-29 17:57:20 +00:00			`return search_results`
More connectors more problems 2020-03-28 19:55:53 +00:00
isbn search 2021-03-01 20:09:21 +00:00			`def isbn_search(self, query, raw=False):`
Runs black 2021-03-08 16:49:10 +00:00			`""" search your local database """`
isbn search 2021-03-01 20:09:21 +00:00			`if not query:`
			`return []`

Runs black 2021-03-08 16:49:10 +00:00			`filters = [{f: query} for f in ["isbn_10", "isbn_13"]]`
isbn search 2021-03-01 20:09:21 +00:00			`results = models.Edition.objects.filter(`
			`reduce(operator.or_, (Q(**f) for f in filters))`
			`).distinct()`

			`# when there are multiple editions of the same work, pick the default.`
			`# it would be odd for this to happen.`
Runs black 2021-03-08 16:49:10 +00:00			`results = results.filter(parent_work__default_edition__id=F("id")) or results`
isbn search 2021-03-01 20:09:21 +00:00
			`search_results = []`
			`for result in results:`
			`if raw:`
			`search_results.append(result)`
			`else:`
			`search_results.append(self.format_search_result(result))`
			`if len(search_results) >= 10:`
			`break`
			`return search_results`

Fixes linter issues 2020-09-21 17:25:26 +00:00			`def format_search_result(self, search_result):`
Expand matching books on keys like isbn 2020-05-04 04:00:25 +00:00			`return SearchResult(`
Stop assuming every book is Hamlet 2020-10-29 22:29:23 +00:00			`title=search_result.title,`
fixes import matching with local books 2020-11-13 17:47:35 +00:00			`key=search_result.remote_id,`
Stop assuming every book is Hamlet 2020-10-29 22:29:23 +00:00			`author=search_result.author_text,`
Runs black 2021-03-08 16:49:10 +00:00			`year=search_result.published_date.year`
			`if search_result.published_date`
			`else None,`
Send connector with search result also fix typo in get_work_from_edition_data function 2020-12-27 22:27:18 +00:00			`connector=self,`
Runs black 2021-03-08 16:49:10 +00:00			`confidence=search_result.rank if hasattr(search_result, "rank") else 1,`
Expand matching books on keys like isbn 2020-05-04 04:00:25 +00:00			`)`

isbn search 2021-03-01 20:09:21 +00:00			`def format_isbn_search_result(self, search_result):`
			`return SearchResult(`
			`title=search_result.title,`
			`key=search_result.remote_id,`
			`author=search_result.author_text,`
Runs black 2021-03-08 16:49:10 +00:00			`year=search_result.published_date.year`
			`if search_result.published_date`
			`else None,`
isbn search 2021-03-01 20:09:21 +00:00			`connector=self,`
Runs black 2021-03-08 16:49:10 +00:00			`confidence=search_result.rank if hasattr(search_result, "rank") else 1,`
isbn search 2021-03-01 20:09:21 +00:00			`)`

Refactors get_or_create_book 2020-05-10 19:56:59 +00:00			`def is_work_data(self, data):`
			`pass`
More connectors more problems 2020-03-28 19:55:53 +00:00
Refactors get_or_create_book 2020-05-10 19:56:59 +00:00			`def get_edition_from_work_data(self, data):`
			`pass`
More connectors more problems 2020-03-28 19:55:53 +00:00
Send connector with search result also fix typo in get_work_from_edition_data function 2020-12-27 22:27:18 +00:00			`def get_work_from_edition_data(self, data):`
Refactors get_or_create_book 2020-05-10 19:56:59 +00:00			`pass`
Tidy up self connector 2020-05-09 20:36:10 +00:00
			`def get_authors_from_data(self, data):`
			`return None`

isbn search 2021-03-01 20:09:21 +00:00			`def parse_isbn_search_data(self, data):`
Runs black 2021-03-08 16:49:10 +00:00			`""" it's already in the right format, don't even worry about it """`
isbn search 2021-03-01 20:09:21 +00:00			`return data`

Refactors get_or_create_book 2020-05-10 19:56:59 +00:00			`def parse_search_data(self, data):`
Runs black 2021-03-08 16:49:10 +00:00			`""" it's already in the right format, don't even worry about it """`
Refactors get_or_create_book 2020-05-10 19:56:59 +00:00			`return data`
Adds fulltext search of postgres 2020-04-29 17:57:20 +00:00
			`def expand_book_data(self, book):`
			`pass`
Separate search for unique identifiers out from text search 2021-01-02 23:48:59 +00:00

			`def search_identifiers(query):`
Runs black 2021-03-08 16:49:10 +00:00			`""" tries remote_id, isbn; defined as dedupe fields on the model """`
			`filters = [`
			`{f.name: query}`
			`for f in models.Edition._meta.get_fields()`
			`if hasattr(f, "deduplication_field") and f.deduplication_field`
			`]`
Separate search for unique identifiers out from text search 2021-01-02 23:48:59 +00:00			`results = models.Edition.objects.filter(`
			`reduce(operator.or_, (Q(**f) for f in filters))`
			`).distinct()`

			`# when there are multiple editions of the same work, pick the default.`
			`# it would be odd for this to happen.`
Runs black 2021-03-08 16:49:10 +00:00			`return results.filter(parent_work__default_edition__id=F("id")) or results`
Separate search for unique identifiers out from text search 2021-01-02 23:48:59 +00:00

			`def search_title_author(query, min_confidence):`
Runs black 2021-03-08 16:49:10 +00:00			`""" searches for title and author """`
			`vector = (`
			`SearchVector("title", weight="A")`
			`+ SearchVector("subtitle", weight="B")`
			`+ SearchVector("authors__name", weight="C")`
			`+ SearchVector("series", weight="D")`
			`)`

			`results = (`
			`models.Edition.objects.annotate(search=vector)`
			`.annotate(rank=SearchRank(vector, query))`
			`.filter(rank__gt=min_confidence)`
			`.order_by("-rank")`
			`)`
Separate search for unique identifiers out from text search 2021-01-02 23:48:59 +00:00
			`# when there are multiple editions of the same work, pick the closest`
Runs black 2021-03-08 16:49:10 +00:00			`editions_of_work = (`
			`results.values("parent_work")`
			`.annotate(Count("parent_work"))`
			`.values_list("parent_work")`
			`)`
Separate search for unique identifiers out from text search 2021-01-02 23:48:59 +00:00
			`for work_id in set(editions_of_work):`
			`editions = results.filter(parent_work=work_id)`
Runs black 2021-03-08 16:49:10 +00:00			`default = editions.filter(parent_work__default_edition=F("id"))`
Separate search for unique identifiers out from text search 2021-01-02 23:48:59 +00:00			`default_rank = default.first().rank if default.exists() else 0`
			`# if mutliple books have the top rank, pick the default edition`
			`if default_rank == editions.first().rank:`
			`yield default.first()`
			`else:`
			`yield editions.first()`