moviewyrm/fedireads/connectors/openlibrary.py
2020-03-28 16:32:18 -07:00

178 lines
5.6 KiB
Python

''' openlibrary data connector '''
from datetime import datetime
from django.core.exceptions import ObjectDoesNotExist
from django.core.files.base import ContentFile
import re
import requests
from fedireads import models
from .abstract_connector import AbstractConnector, SearchResult, \
update_from_mappings
class Connector(AbstractConnector):
''' instantiate a connector for OL '''
def __init__(self, identifier):
super().__init__(identifier)
def search(self, query):
''' query openlibrary search '''
resp = requests.get(
'%s%s' % (self.search_url, query),
headers={
'Accept': 'application/json; charset=utf-8',
},
)
if not resp.ok:
resp.raise_for_status()
data = resp.json()
results = []
for doc in data['docs'][:5]:
key = doc['key']
key = key.split('/')[-1]
author = doc.get('author_name') or ['Unknown']
results.append(SearchResult(
doc.get('title'),
key,
author[0],
doc.get('first_publish_year'),
doc
))
return results
def get_or_create_book(self, olkey):
''' pull up a book record by whatever means possible '''
if re.match(r'^OL\d+W$', olkey):
model = models.Work
elif re.match(r'^OL\d+M$', olkey):
model = models.Edition
else:
raise ValueError('Invalid OpenLibrary ID')
try:
book = model.objects.get(openlibrary_key=olkey)
return book
except ObjectDoesNotExist:
# no book was found, so we start creating a new one
book = model(openlibrary_key=olkey)
self.update_book(book)
def update_book(self, book):
''' query openlibrary for data on a book '''
olkey = book.openlibrary_key
# load the book json from openlibrary.org
response = requests.get('%s/works/%s.json' % (self.url, olkey))
if not response.ok:
response.raise_for_status()
data = response.json()
# great, we can update our book.
mappings = {
'publish_date': ('published_date', get_date),
'first_publish_date': ('first_published_date', get_date),
'description': ('description', get_description),
'isbn_13': ('isbn', None),
'oclc_numbers': ('oclc_number', lambda a: a[0]),
'lccn': ('lccn', lambda a: a[0]),
}
book = update_from_mappings(book, data, mappings)
if 'identifiers' in data:
if 'goodreads' in data['identifiers']:
book.goodreads_key = data['identifiers']['goodreads']
if not book.source_url:
book.source_url = response.url
if not book.connector:
book.connector = self.connector
book.save()
# this book sure as heck better be an edition
if data.get('works'):
key = data.get('works')[0]['key']
key = key.split('/')[-1]
work = self.get_or_create_book(key)
book.parent_work = work
# we also need to know the author get the cover
for author_blob in data.get('authors', []):
# this id is "/authors/OL1234567A" and we want just "OL1234567A"
author_blob = author_blob.get('author', author_blob)
author_id = author_blob['key']
author_id = author_id.split('/')[-1]
book.authors.add(self.get_or_create_author(author_id))
if book.sync_cover and data.get('covers') and len(data['covers']):
book.cover.save(*self.get_cover(data['covers'][0]), save=True)
return book
def get_or_create_author(self, olkey):
''' load that author '''
if not re.match(r'^OL\d+A$', olkey):
raise ValueError('Invalid OpenLibrary author ID')
try:
return models.Author.objects.get(openlibrary_key=olkey)
except ObjectDoesNotExist:
pass
response = requests.get('%s/authors/%s.json' % (self.url, olkey))
if not response.ok:
response.raise_for_status()
data = response.json()
author = models.Author(openlibrary_key=olkey)
mappings = {
'birth_date': ('born', get_date),
'death_date': ('died', get_date),
'bio': ('bio', get_description),
}
author = update_from_mappings(author, data, mappings)
# TODO this is making some BOLD assumption
name = data['name']
author.last_name = name.split(' ')[-1]
author.first_name = ' '.join(name.split(' ')[:-1])
author.save()
return author
def get_cover(self, cover_id):
''' ask openlibrary for the cover '''
# TODO: get medium and small versions
image_name = '%s-M.jpg' % cover_id
url = '%s/b/id/%s' % (self.covers_url, image_name)
response = requests.get(url)
if not response.ok:
response.raise_for_status()
image_content = ContentFile(response.content)
return [image_name, image_content]
def get_date(date_string):
''' helper function to try to interpret dates '''
formats = [
'%B %Y',
'%Y',
]
for date_format in formats:
try:
return datetime.strptime(date_string, date_format)
except ValueError:
pass
return None
def get_description(description_blob):
''' descriptions can be a string or a dict '''
if isinstance(description_blob, dict):
return description_blob.get('value')
return description_blob