Merge pull request #450 from mouse-reeve/openlibrary-deduplication

Openlibrary tests and cleanup
This commit is contained in:
Mouse Reeve 2020-12-31 13:50:07 -08:00 committed by GitHub
commit 8ca7c45d3b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 328 additions and 119 deletions

View file

@ -68,7 +68,7 @@ class Connector(AbstractConnector):
key = data['key'] key = data['key']
except KeyError: except KeyError:
raise ConnectorException('Invalid book data') raise ConnectorException('Invalid book data')
return '%s/%s' % (self.books_url, key) return '%s%s' % (self.books_url, key)
def is_work_data(self, data): def is_work_data(self, data):
@ -80,7 +80,7 @@ class Connector(AbstractConnector):
key = data['key'] key = data['key']
except KeyError: except KeyError:
raise ConnectorException('Invalid book data') raise ConnectorException('Invalid book data')
url = '%s/%s/editions' % (self.books_url, key) url = '%s%s/editions' % (self.books_url, key)
data = get_data(url) data = get_data(url)
return pick_default_edition(data['entries']) return pick_default_edition(data['entries'])
@ -90,7 +90,7 @@ class Connector(AbstractConnector):
key = data['works'][0]['key'] key = data['works'][0]['key']
except (IndexError, KeyError): except (IndexError, KeyError):
raise ConnectorException('No work found for edition') raise ConnectorException('No work found for edition')
url = '%s/%s' % (self.books_url, key) url = '%s%s' % (self.books_url, key)
return get_data(url) return get_data(url)
@ -100,7 +100,7 @@ class Connector(AbstractConnector):
author_blob = author_blob.get('author', author_blob) author_blob = author_blob.get('author', author_blob)
# this id is "/authors/OL1234567A" # this id is "/authors/OL1234567A"
author_id = author_blob['key'] author_id = author_blob['key']
url = '%s/%s.json' % (self.base_url, author_id) url = '%s%s' % (self.base_url, author_id)
yield self.get_or_create_author(url) yield self.get_or_create_author(url)
@ -130,7 +130,7 @@ class Connector(AbstractConnector):
def load_edition_data(self, olkey): def load_edition_data(self, olkey):
''' query openlibrary for editions of a work ''' ''' query openlibrary for editions of a work '''
url = '%s/works/%s/editions.json' % (self.books_url, olkey) url = '%s/works/%s/editions' % (self.books_url, olkey)
return get_data(url) return get_data(url)
@ -150,7 +150,7 @@ def get_description(description_blob):
''' descriptions can be a string or a dict ''' ''' descriptions can be a string or a dict '''
if isinstance(description_blob, dict): if isinstance(description_blob, dict):
return description_blob.get('value') return description_blob.get('value')
return description_blob return description_blob
def get_openlibrary_key(key): def get_openlibrary_key(key):

View file

@ -109,7 +109,7 @@ class ActivitypubMixin:
not field.deduplication_field: not field.deduplication_field:
continue continue
value = data.get(field.activitypub_field) value = data.get(field.get_activitypub_field())
if not value: if not value:
continue continue
filters.append({field.name: value}) filters.append({field.name: value})

View file

@ -4,16 +4,14 @@ import responses
from bookwyrm import models from bookwyrm import models
from bookwyrm.connectors import abstract_connector from bookwyrm.connectors import abstract_connector
from bookwyrm.connectors.abstract_connector import Mapping, SearchResult from bookwyrm.connectors.abstract_connector import Mapping
from bookwyrm.connectors.openlibrary import Connector from bookwyrm.settings import DOMAIN
class AbstractConnector(TestCase): class AbstractConnector(TestCase):
''' generic code for connecting to outside data sources ''' ''' generic code for connecting to outside data sources '''
def setUp(self): def setUp(self):
''' we need an example connector ''' ''' we need an example connector '''
self.book = models.Edition.objects.create(title='Example Edition')
self.connector_info = models.Connector.objects.create( self.connector_info = models.Connector.objects.create(
identifier='example.com', identifier='example.com',
connector_file='openlibrary', connector_file='openlibrary',
@ -22,107 +20,92 @@ class AbstractConnector(TestCase):
covers_url='https://example.com/covers', covers_url='https://example.com/covers',
search_url='https://example.com/search?q=', search_url='https://example.com/search?q=',
) )
self.connector = Connector('example.com') work_data = {
'id': 'abc1',
self.data = { 'title': 'Test work',
'title': 'Unused title', 'type': 'work',
'ASIN': 'A00BLAH', 'openlibraryKey': 'OL1234W',
'isbn_10': '1234567890',
'isbn_13': 'blahhh',
'blah': 'bip',
'format': 'hardcover',
'series': ['one', 'two'],
} }
self.connector.key_mappings = [ self.work_data = work_data
Mapping('isbn_10'), edition_data = {
Mapping('isbn_13'), 'id': 'abc2',
Mapping('lccn'), 'title': 'Test edition',
Mapping('asin'), 'type': 'edition',
] 'openlibraryKey': 'OL1234M',
}
self.edition_data = edition_data
class TestConnector(abstract_connector.AbstractConnector):
def test_abstract_minimal_connector_init(self):
''' barebones connector for search with defaults '''
class TestConnector(abstract_connector.AbstractMinimalConnector):
''' nothing added here ''' ''' nothing added here '''
def format_search_result(): def format_search_result(self, search_result):
pass return search_result
def get_or_create_book():
pass
def parse_search_data():
pass
connector = TestConnector('example.com')
self.assertEqual(connector.connector, self.connector_info)
self.assertEqual(connector.base_url, 'https://example.com')
self.assertEqual(connector.books_url, 'https://example.com/books')
self.assertEqual(connector.covers_url, 'https://example.com/covers')
self.assertEqual(connector.search_url, 'https://example.com/search?q=')
self.assertIsNone(connector.name),
self.assertEqual(connector.identifier, 'example.com'),
self.assertIsNone(connector.max_query_count)
self.assertFalse(connector.local)
@responses.activate
def test_abstract_minimal_connector_search(self):
''' makes an http request to the outside service '''
class TestConnector(abstract_connector.AbstractMinimalConnector):
''' nothing added here '''
def format_search_result(self, data):
return data
def get_or_create_book(self, data):
pass
def parse_search_data(self, data): def parse_search_data(self, data):
return data return data
connector = TestConnector('example.com') def is_work_data(self, data):
return data['type'] == 'work'
def get_edition_from_work_data(self, data):
return edition_data
def get_work_from_edition_data(self, data):
return work_data
def get_authors_from_data(self, data):
return []
def expand_book_data(self, book):
pass
self.connector = TestConnector('example.com')
self.connector.book_mappings = [
Mapping('id'),
Mapping('title'),
Mapping('openlibraryKey'),
]
self.book = models.Edition.objects.create(
title='Test Book', remote_id='https://example.com/book/1234',
openlibrary_key='OL1234M')
def test_abstract_connector_init(self):
''' barebones connector for search with defaults '''
self.assertIsInstance(self.connector.book_mappings, list)
def test_is_available(self):
''' this isn't used.... '''
self.assertTrue(self.connector.is_available())
self.connector.max_query_count = 1
self.connector.connector.query_count = 2
self.assertFalse(self.connector.is_available())
def test_get_or_create_book_existing(self):
''' find an existing book by remote/origin id '''
self.assertEqual(models.Book.objects.count(), 1)
self.assertEqual(
self.book.remote_id, 'https://%s/book/%d' % (DOMAIN, self.book.id))
self.assertEqual(
self.book.origin_id, 'https://example.com/book/1234')
# dedupe by origin id
result = self.connector.get_or_create_book(
'https://example.com/book/1234')
self.assertEqual(models.Book.objects.count(), 1)
self.assertEqual(result, self.book)
# dedupe by remote id
result = self.connector.get_or_create_book(
'https://%s/book/%d' % (DOMAIN, self.book.id))
self.assertEqual(models.Book.objects.count(), 1)
self.assertEqual(result, self.book)
@responses.activate
def test_get_or_create_book_deduped(self):
''' load remote data and deduplicate '''
responses.add( responses.add(
responses.GET, responses.GET,
'https://example.com/search?q=a%20book%20title', 'https://example.com/book/abcd',
json=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l'], json=self.edition_data
status=200)
results = connector.search('a book title')
self.assertEqual(len(results), 10)
self.assertEqual(results[0], 'a')
self.assertEqual(results[1], 'b')
self.assertEqual(results[2], 'c')
def test_search_result(self):
''' a class that stores info about a search result '''
result = SearchResult(
title='Title',
key='https://example.com/book/1',
author='Author Name',
year='1850',
connector=self.connector,
) )
# there's really not much to test here, it's just a dataclass result = self.connector.get_or_create_book(
self.assertEqual(result.confidence, 1) 'https://example.com/book/abcd')
self.assertEqual(result.title, 'Title') self.assertEqual(result, self.book)
self.assertEqual(models.Edition.objects.count(), 1)
self.assertEqual(models.Edition.objects.count(), 1)
def test_create_mapping(self):
''' maps remote fields for book data to bookwyrm activitypub fields '''
mapping = Mapping('isbn')
self.assertEqual(mapping.local_field, 'isbn')
self.assertEqual(mapping.remote_field, 'isbn')
self.assertEqual(mapping.formatter('bb'), 'bb')
def test_create_mapping_with_remote(self):
''' the remote field is different than the local field '''
mapping = Mapping('isbn', remote_field='isbn13')
self.assertEqual(mapping.local_field, 'isbn')
self.assertEqual(mapping.remote_field, 'isbn13')
self.assertEqual(mapping.formatter('bb'), 'bb')
def test_create_mapping_with_formatter(self):
''' a function is provided to modify the data '''
formatter = lambda x: 'aa' + x
mapping = Mapping('isbn', formatter=formatter)
self.assertEqual(mapping.local_field, 'isbn')
self.assertEqual(mapping.remote_field, 'isbn')
self.assertEqual(mapping.formatter, formatter)
self.assertEqual(mapping.formatter('bb'), 'aabb')

View file

@ -0,0 +1,100 @@
''' testing book data connectors '''
from django.test import TestCase
import responses
from bookwyrm import models
from bookwyrm.connectors import abstract_connector
from bookwyrm.connectors.abstract_connector import Mapping, SearchResult
class AbstractConnector(TestCase):
''' generic code for connecting to outside data sources '''
def setUp(self):
''' we need an example connector '''
self.connector_info = models.Connector.objects.create(
identifier='example.com',
connector_file='openlibrary',
base_url='https://example.com',
books_url='https://example.com/books',
covers_url='https://example.com/covers',
search_url='https://example.com/search?q=',
)
class TestConnector(abstract_connector.AbstractMinimalConnector):
''' nothing added here '''
def format_search_result(self, search_result):
return search_result
def get_or_create_book(self, remote_id):
pass
def parse_search_data(self, data):
return data
self.test_connector = TestConnector('example.com')
def test_abstract_minimal_connector_init(self):
''' barebones connector for search with defaults '''
connector = self.test_connector
self.assertEqual(connector.connector, self.connector_info)
self.assertEqual(connector.base_url, 'https://example.com')
self.assertEqual(connector.books_url, 'https://example.com/books')
self.assertEqual(connector.covers_url, 'https://example.com/covers')
self.assertEqual(connector.search_url, 'https://example.com/search?q=')
self.assertIsNone(connector.name)
self.assertEqual(connector.identifier, 'example.com')
self.assertIsNone(connector.max_query_count)
self.assertFalse(connector.local)
@responses.activate
def test_search(self):
''' makes an http request to the outside service '''
responses.add(
responses.GET,
'https://example.com/search?q=a%20book%20title',
json=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l'],
status=200)
results = self.test_connector.search('a book title')
self.assertEqual(len(results), 10)
self.assertEqual(results[0], 'a')
self.assertEqual(results[1], 'b')
self.assertEqual(results[2], 'c')
def test_search_result(self):
''' a class that stores info about a search result '''
result = SearchResult(
title='Title',
key='https://example.com/book/1',
author='Author Name',
year='1850',
connector=self.test_connector,
)
# there's really not much to test here, it's just a dataclass
self.assertEqual(result.confidence, 1)
self.assertEqual(result.title, 'Title')
def test_create_mapping(self):
''' maps remote fields for book data to bookwyrm activitypub fields '''
mapping = Mapping('isbn')
self.assertEqual(mapping.local_field, 'isbn')
self.assertEqual(mapping.remote_field, 'isbn')
self.assertEqual(mapping.formatter('bb'), 'bb')
def test_create_mapping_with_remote(self):
''' the remote field is different than the local field '''
mapping = Mapping('isbn', remote_field='isbn13')
self.assertEqual(mapping.local_field, 'isbn')
self.assertEqual(mapping.remote_field, 'isbn13')
self.assertEqual(mapping.formatter('bb'), 'bb')
def test_create_mapping_with_formatter(self):
''' a function is provided to modify the data '''
formatter = lambda x: 'aa' + x
mapping = Mapping('isbn', formatter=formatter)
self.assertEqual(mapping.local_field, 'isbn')
self.assertEqual(mapping.remote_field, 'isbn')
self.assertEqual(mapping.formatter, formatter)
self.assertEqual(mapping.formatter('bb'), 'aabb')

View file

@ -1,9 +1,10 @@
''' testing book data connectors ''' ''' testing book data connectors '''
import json import json
import pathlib import pathlib
from dateutil import parser from unittest.mock import patch
from django.test import TestCase from django.test import TestCase
import pytz import responses
from bookwyrm import models from bookwyrm import models
from bookwyrm.connectors.openlibrary import Connector from bookwyrm.connectors.openlibrary import Connector
@ -11,10 +12,13 @@ from bookwyrm.connectors.openlibrary import get_languages, get_description
from bookwyrm.connectors.openlibrary import pick_default_edition, \ from bookwyrm.connectors.openlibrary import pick_default_edition, \
get_openlibrary_key get_openlibrary_key
from bookwyrm.connectors.abstract_connector import SearchResult from bookwyrm.connectors.abstract_connector import SearchResult
from bookwyrm.connectors.abstract_connector import ConnectorException
class Openlibrary(TestCase): class Openlibrary(TestCase):
''' test loading data from openlibrary.org '''
def setUp(self): def setUp(self):
''' creates the connector we'll use '''
models.Connector.objects.create( models.Connector.objects.create(
identifier='openlibrary.org', identifier='openlibrary.org',
name='OpenLibrary', name='OpenLibrary',
@ -37,19 +41,85 @@ class Openlibrary(TestCase):
self.edition_list_data = json.loads(edition_list_file.read_bytes()) self.edition_list_data = json.loads(edition_list_file.read_bytes())
def test_get_remote_id_from_data(self):
''' format the remote id from the data '''
data = {'key': '/work/OL1234W'}
result = self.connector.get_remote_id_from_data(data)
self.assertEqual(result, 'https://openlibrary.org/work/OL1234W')
# error handlding
with self.assertRaises(ConnectorException):
self.connector.get_remote_id_from_data({})
def test_is_work_data(self): def test_is_work_data(self):
''' detect if the loaded json is a work '''
self.assertEqual(self.connector.is_work_data(self.work_data), True) self.assertEqual(self.connector.is_work_data(self.work_data), True)
self.assertEqual(self.connector.is_work_data(self.edition_data), False) self.assertEqual(self.connector.is_work_data(self.edition_data), False)
def test_pick_default_edition(self): @responses.activate
edition = pick_default_edition(self.edition_list_data['entries']) def test_get_edition_from_work_data(self):
self.assertEqual(edition['key'], '/books/OL9788823M') ''' loads a list of editions '''
data = {'key': '/work/OL1234W'}
responses.add(
responses.GET,
'https://openlibrary.org/work/OL1234W/editions',
json={'entries': []},
status=200)
with patch('bookwyrm.connectors.openlibrary.pick_default_edition') \
as pick_edition:
pick_edition.return_value = 'hi'
result = self.connector.get_edition_from_work_data(data)
self.assertEqual(result, 'hi')
@responses.activate
def test_get_work_from_edition_data(self):
''' loads a list of editions '''
data = {'works': [{'key': '/work/OL1234W'}]}
responses.add(
responses.GET,
'https://openlibrary.org/work/OL1234W',
json={'hi': 'there'},
status=200)
result = self.connector.get_work_from_edition_data(data)
self.assertEqual(result, {'hi': 'there'})
@responses.activate
def test_get_authors_from_data(self):
''' find authors in data '''
responses.add(
responses.GET,
'https://openlibrary.org/authors/OL382982A',
json={'hi': 'there'},
status=200)
results = self.connector.get_authors_from_data(self.work_data)
for result in results:
self.assertIsInstance(result, models.Author)
def test_get_cover_url(self):
''' formats a url that should contain the cover image '''
blob = ['image']
result = self.connector.get_cover_url(blob)
self.assertEqual(
result, 'https://covers.openlibrary.org/b/id/image-M.jpg')
def test_parse_search_result(self):
''' extract the results from the search json response '''
datafile = pathlib.Path(__file__).parent.joinpath(
'../data/ol_search.json')
search_data = json.loads(datafile.read_bytes())
result = self.connector.parse_search_data(search_data)
self.assertIsInstance(result, list)
self.assertEqual(len(result), 2)
def test_format_search_result(self): def test_format_search_result(self):
''' translate json from openlibrary into SearchResult ''' ''' translate json from openlibrary into SearchResult '''
datafile = pathlib.Path(__file__).parent.joinpath('../data/ol_search.json') datafile = pathlib.Path(__file__).parent.joinpath(
'../data/ol_search.json')
search_data = json.loads(datafile.read_bytes()) search_data = json.loads(datafile.read_bytes())
results = self.connector.parse_search_data(search_data) results = self.connector.parse_search_data(search_data)
self.assertIsInstance(results, list) self.assertIsInstance(results, list)
@ -57,22 +127,66 @@ class Openlibrary(TestCase):
result = self.connector.format_search_result(results[0]) result = self.connector.format_search_result(results[0])
self.assertIsInstance(result, SearchResult) self.assertIsInstance(result, SearchResult)
self.assertEqual(result.title, 'This Is How You Lose the Time War') self.assertEqual(result.title, 'This Is How You Lose the Time War')
self.assertEqual(result.key, 'https://openlibrary.org/works/OL20639540W') self.assertEqual(
result.key, 'https://openlibrary.org/works/OL20639540W')
self.assertEqual(result.author, 'Amal El-Mohtar, Max Gladstone') self.assertEqual(result.author, 'Amal El-Mohtar, Max Gladstone')
self.assertEqual(result.year, 2019) self.assertEqual(result.year, 2019)
self.assertEqual(result.connector, self.connector)
@responses.activate
def test_load_edition_data(self):
''' format url from key and make request '''
key = 'OL1234W'
responses.add(
responses.GET,
'https://openlibrary.org/works/OL1234W/editions',
json={'hi': 'there'}
)
result = self.connector.load_edition_data(key)
self.assertEqual(result, {'hi': 'there'})
@responses.activate
def test_expand_book_data(self):
''' given a book, get more editions '''
work = models.Work.objects.create(
title='Test Work', openlibrary_key='OL1234W')
edition = models.Edition.objects.create(
title='Test Edition', parent_work=work)
responses.add(
responses.GET,
'https://openlibrary.org/works/OL1234W/editions',
json={'entries': []},
)
with patch(
'bookwyrm.connectors.abstract_connector.AbstractConnector.' \
'create_edition_from_data'):
self.connector.expand_book_data(edition)
self.connector.expand_book_data(work)
def test_get_description(self): def test_get_description(self):
''' should do some cleanup on the description data '''
description = get_description(self.work_data['description']) description = get_description(self.work_data['description'])
expected = 'First in the Old Kingdom/Abhorsen series.' expected = 'First in the Old Kingdom/Abhorsen series.'
self.assertEqual(description, expected) self.assertEqual(description, expected)
def test_get_openlibrary_key(self):
''' extracts the uuid '''
key = get_openlibrary_key('/books/OL27320736M')
self.assertEqual(key, 'OL27320736M')
def test_get_languages(self): def test_get_languages(self):
''' looks up languages from a list '''
languages = get_languages(self.edition_data['languages']) languages = get_languages(self.edition_data['languages'])
self.assertEqual(languages, ['English']) self.assertEqual(languages, ['English'])
def test_get_ol_key(self): def test_pick_default_edition(self):
key = get_openlibrary_key('/books/OL27320736M') ''' detect if the loaded json is an edition '''
self.assertEqual(key, 'OL27320736M') edition = pick_default_edition(self.edition_list_data['entries'])
self.assertEqual(edition['key'], '/books/OL9788823M')

View file

@ -200,3 +200,15 @@ class BaseModel(TestCase):
# test subclass match # test subclass match
result = models.Status.find_existing_by_remote_id( result = models.Status.find_existing_by_remote_id(
'https://comment.net') 'https://comment.net')
def test_find_existing(self):
''' match a blob of data to a model '''
book = models.Edition.objects.create(
title='Test edition',
openlibrary_key='OL1234',
)
result = models.Edition.find_existing(
{'openlibraryKey': 'OL1234'})
self.assertEqual(result, book)

View file

@ -154,12 +154,12 @@ class ImportJob(TestCase):
status=200) status=200)
responses.add( responses.add(
responses.GET, responses.GET,
'https://openlibrary.org//works/OL15832982W', 'https://openlibrary.org/works/OL15832982W',
json=bookdata, json=bookdata,
status=200) status=200)
responses.add( responses.add(
responses.GET, responses.GET,
'https://openlibrary.org//authors/OL382982A.json', 'https://openlibrary.org/authors/OL382982A',
json={'name': 'test author'}, json={'name': 'test author'},
status=200) status=200)