Infer format in openlibrary import

This commit is contained in:
Mouse Reeve 2021-09-29 12:21:19 -07:00
parent 47706b5353
commit 123b23728f
4 changed files with 63 additions and 53 deletions

View file

@ -9,6 +9,7 @@ from requests.exceptions import RequestException
from bookwyrm import activitypub, models, settings
from .connector_manager import load_more_data, ConnectorException
from .format_mappings import format_mappings
logger = logging.getLogger(__name__)
@ -312,3 +313,16 @@ class Mapping:
return self.formatter(value)
except: # pylint: disable=bare-except
return None
def infer_physical_format(format_text):
""" try to figure out what the standardized format is from the free value """
format_text = format_text.lower()
if format_text in format_mappings:
# try a direct match
return format_mappings[format_text]
else:
# failing that, try substring
matches = [v for k, v in format_mappings.items() if k in format_text]
if not matches:
return None
return matches[0]

View file

@ -0,0 +1,43 @@
""" comparing a free text format to the standardized one """
format_mappings = {
"paperback": "Paperback",
"soft": "Paperback",
"pamphlet": "Paperback",
"peperback": "Paperback",
"tapa blanda": "Paperback",
"turtleback": "Paperback",
"pocket": "Paperback",
"spiral": "Paperback",
"ring": "Paperback",
"平装": "Paperback",
"简装": "Paperback",
"hardcover": "Hardcover",
"hardcocer": "Hardcover",
"hardover": "Hardcover",
"hardback": "Hardcover",
"library": "Hardcover",
"tapa dura": "Hardcover",
"leather": "Hardcover",
"clothbound": "Hardcover",
"精装": "Hardcover",
"ebook": "EBook",
"e-book": "EBook",
"digital": "EBook",
"computer file": "EBook",
"epub": "EBook",
"online": "EBook",
"pdf": "EBook",
"elektronische": "EBook",
"electronic": "EBook",
"audiobook": "AudiobookFormat",
"audio": "AudiobookFormat",
"cd": "AudiobookFormat",
"dvd": "AudiobookFormat",
"mp3": "AudiobookFormat",
"cassette": "AudiobookFormat",
"kindle": "AudiobookFormat",
"talking": "AudiobookFormat",
"sound": "AudiobookFormat",
"comic": "GraphicNovel",
"graphic": "GraphicNovel",
}

View file

@ -3,7 +3,7 @@ import re
from bookwyrm import models
from .abstract_connector import AbstractConnector, SearchResult, Mapping
from .abstract_connector import get_data
from .abstract_connector import get_data, infer_physical_format
from .connector_manager import ConnectorException
from .openlibrary_languages import languages
@ -43,7 +43,8 @@ class Connector(AbstractConnector):
),
Mapping("publishedDate", remote_field="publish_date"),
Mapping("pages", remote_field="number_of_pages"),
Mapping("physicalFormat", remote_field="physical_format"),
Mapping("physicalFormat", remote_field="physical_format", formatter=infer_physical_format),
Mapping("physicalFormatDetail", remote_field="physical_format"),
Mapping("publishers"),
]

View file

@ -2,6 +2,7 @@
from django.db import migrations
import bookwyrm
from bookwyrm.connectors.abstract_connector import infer_physical_format
def infer_format(app_registry, schema_editor):
@ -13,58 +14,9 @@ def infer_format(app_registry, schema_editor):
.objects.using(db_alias)
.filter(physical_format_detail__isnull=False)
)
mappings = {
"paperback": "Paperback",
"soft": "Paperback",
"pamphlet": "Paperback",
"peperback": "Paperback",
"tapa blanda": "Paperback",
"turtleback": "Paperback",
"pocket": "Paperback",
"spiral": "Paperback",
"ring": "Paperback",
"平装": "Paperback",
"简装": "Paperback",
"hardcover": "Hardcover",
"hardcocer": "Hardcover",
"hardover": "Hardcover",
"hardback": "Hardcover",
"library": "Hardcover",
"tapa dura": "Hardcover",
"leather": "Hardcover",
"clothbound": "Hardcover",
"精装": "Hardcover",
"ebook": "EBook",
"e-book": "EBook",
"digital": "EBook",
"computer file": "EBook",
"epub": "EBook",
"online": "EBook",
"pdf": "EBook",
"elektronische": "EBook",
"electronic": "EBook",
"audiobook": "AudiobookFormat",
"audio": "AudiobookFormat",
"cd": "AudiobookFormat",
"dvd": "AudiobookFormat",
"mp3": "AudiobookFormat",
"cassette": "AudiobookFormat",
"kindle": "AudiobookFormat",
"talking": "AudiobookFormat",
"sound": "AudiobookFormat",
"comic": "GraphicNovel",
"graphic": "GraphicNovel",
}
for edition in editions:
free_format = edition.physical_format_detail.lower()
if free_format in mappings:
edition.physical_format = mappings[free_format]
edition.save()
else:
matches = [v for k, v in mappings.items() if k in free_format]
if not matches:
continue
edition.physical_format = matches[0]
edition.physical_format = infer_physical_format(free_format)
edition.save()