bookwyrm/fedireads/connectors/openlibrary.py

179 lines
5.5 KiB
Python
Raw Normal View History

2020-03-07 20:22:28 +00:00
''' openlibrary data connector '''
2020-03-28 04:28:52 +00:00
from datetime import datetime
2020-03-07 20:22:28 +00:00
from django.core.exceptions import ObjectDoesNotExist
from django.core.files.base import ContentFile
import re
import requests
from fedireads import models
from .abstract_connector import AbstractConnector, SearchResult
class Connector(AbstractConnector):
    ''' connector that loads book and author data from OpenLibrary

    Reads ``self.url`` and ``self.covers_url``; these are not assigned in
    this class, so presumably AbstractConnector sets them -- TODO confirm.
    '''

    def __init__(self, identifier):
        super().__init__(identifier)

    def search(self, query):
        ''' query openlibrary search

        Returns a list of at most five SearchResult objects built from the
        ``docs`` entries of the search response. Raises
        ``requests.HTTPError`` when the server answers with an error status.
        '''
        resp = requests.get('%s/search.json' % self.url, params={'q': query})
        # raise_for_status is a no-op for successful responses
        resp.raise_for_status()
        data = resp.json()

        results = []
        # a response without any 'docs' simply yields no results
        for doc in (data.get('docs') or [])[:5]:
            # the key is a path like "/works/OL1234W"; keep the last segment
            key = doc['key'].split('/')[-1]
            author = doc.get('author_name') or ['Unknown']
            results.append(SearchResult(
                doc.get('title'),
                key,
                author[0],
                doc.get('first_publish_year'),
                doc
            ))
        return results

    def get_or_create_book(self, olkey):
        ''' pull up a book record by whatever means possible

        ``olkey`` must look like ``OL<digits>W`` (a work) or ``OL<digits>M``
        (an edition); anything else raises ``ValueError``. An existing
        database record is returned as-is, otherwise the data is fetched
        from openlibrary, stored, and the new record is returned. Raises
        ``requests.HTTPError`` when a remote request fails.
        '''
        if re.match(r'^OL\d+W$', olkey):
            model = models.Work
        elif re.match(r'^OL\d+M$', olkey):
            model = models.Edition
        else:
            raise ValueError('Invalid OpenLibrary ID')

        try:
            return model.objects.get(openlibrary_key=olkey)
        except ObjectDoesNotExist:
            # no book was found, so we start creating a new one
            book = model(openlibrary_key=olkey)

        # load the book json from openlibrary.org
        response = requests.get('%s/works/%s.json' % (self.url, olkey))
        response.raise_for_status()
        data = response.json()

        def noop(value):
            return value

        def first_item(values):
            # identifier lists may be empty; don't crash on those
            return values[0] if values else None

        def parse_date(value):
            # get_date signals an unparseable date with a falsy value;
            # store None so the attribute never holds a non-date
            return get_date(value) or None

        # openlibrary field name -> (local attribute name, value formatter)
        mappings = {
            'publish_date': ('published_date', parse_date),
            'first_publish_date': ('first_published_date', parse_date),
            'description': ('description', get_description),
            # NOTE(review): isbn_13 looks like a list in openlibrary data
            # but is stored unchanged -- confirm against the model field
            'isbn_13': ('isbn', noop),
            'oclc_numbers': ('oclc_number', first_item),
            'lccn': ('lccn', first_item),
        }

        # great, we can update our book.
        # NOTE(review): every remote key that matches an attribute on the
        # book is copied over, not just the mapped ones
        for (key, value) in data.items():
            attr, formatter = mappings.get(key, (key, noop))
            if self.has_attr(book, attr):
                setattr(book, attr, formatter(value))

        if 'goodreads' in data.get('identifiers', {}):
            book.goodreads_key = data['identifiers']['goodreads']

        # the book has to be saved before authors can be attached to it
        book.save()

        # this book sure as heck better be an edition
        if data.get('works'):
            key = data['works'][0]['key'].split('/')[-1]
            book.parent_work = self.get_or_create_book(key)
            # persist the link now; otherwise it only reaches the database
            # when a cover happens to be saved further down
            book.save()

        # we also need to know the author and get the cover
        for author_blob in data.get('authors', []):
            # entries are either {"author": {"key": ...}} or {"key": ...}
            author_blob = author_blob.get('author', author_blob)
            # this id is "/authors/OL1234567A" and we want just "OL1234567A"
            author_id = author_blob['key'].split('/')[-1]
            book.authors.add(self.get_or_create_author(author_id))

        if data.get('covers'):
            book.cover.save(*self.get_cover(data['covers'][0]), save=True)

        return book

    def get_or_create_author(self, olkey):
        ''' load that author

        ``olkey`` must look like ``OL<digits>A``, otherwise ``ValueError``
        is raised. Returns the stored author when there is one; if not, the
        author is fetched from openlibrary, saved and returned. Raises
        ``requests.HTTPError`` when the remote request fails.
        '''
        if not re.match(r'^OL\d+A$', olkey):
            raise ValueError('Invalid OpenLibrary author ID')
        try:
            return models.Author.objects.get(openlibrary_key=olkey)
        except ObjectDoesNotExist:
            pass

        response = requests.get('%s/authors/%s.json' % (self.url, olkey))
        response.raise_for_status()
        data = response.json()

        author = models.Author(openlibrary_key=olkey)
        bio = data.get('bio')
        if bio:
            # the bio is either a plain string or a dict with a 'value'
            if isinstance(bio, dict):
                bio = bio.get('value')
            author.bio = bio

        name = data['name']
        author.name = name
        # TODO this is making some BOLD assumption: the last
        # space-separated token is the last name, the rest the first name
        first_name, _, last_name = name.rpartition(' ')
        author.last_name = last_name
        author.first_name = first_name
        author.save()

        return author

    def get_cover(self, cover_id):
        ''' ask openlibrary for the cover

        Returns ``[image_name, image_content]`` where the content is a
        ContentFile wrapping the downloaded bytes. Raises
        ``requests.HTTPError`` when the download fails.
        '''
        # TODO: get medium and small versions
        image_name = '%s-M.jpg' % cover_id
        url = '%s/b/id/%s' % (self.covers_url, image_name)
        response = requests.get(url)
        response.raise_for_status()
        image_content = ContentFile(response.content)
        return [image_name, image_content]

    def update_book(self, book_obj):
        ''' not implemented yet: does nothing '''
2020-03-28 04:28:52 +00:00
def get_date(date_string):
    ''' helper function to try to interpret dates

    Tries a handful of known formats in turn and returns the first
    successful parse as a naive ``datetime``. Returns ``None`` when the
    input is not a string or matches none of the formats, so the result
    can be stored directly as an empty date value.
    '''
    # remote data may hold null or a non-string here; strptime would raise
    # TypeError on those instead of the ValueError handled below
    if not isinstance(date_string, str):
        return None

    date_string = date_string.strip()
    formats = (
        '%B %d, %Y',
        '%B %Y',
        '%Y-%m-%d',
        '%Y',
    )
    for date_format in formats:
        try:
            return datetime.strptime(date_string, date_format)
        except ValueError:
            # not this format, try the next one
            continue
    return None
def get_description(description_blob):
    ''' unwrap a description that is either plain or a dict

    A dict yields its 'value' entry (None when that key is absent);
    anything else is handed back untouched.
    '''
    if not isinstance(description_blob, dict):
        return description_blob
    return description_blob.get('value')