forked from mirrors/bookwyrm
Infer format in openlibrary import
This commit is contained in:
parent
47706b5353
commit
123b23728f
4 changed files with 63 additions and 53 deletions
|
@ -9,6 +9,7 @@ from requests.exceptions import RequestException
|
|||
|
||||
from bookwyrm import activitypub, models, settings
|
||||
from .connector_manager import load_more_data, ConnectorException
|
||||
from .format_mappings import format_mappings
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
@ -312,3 +313,16 @@ class Mapping:
|
|||
return self.formatter(value)
|
||||
except: # pylint: disable=bare-except
|
||||
return None
|
||||
|
||||
def infer_physical_format(format_text):
    """Map a free-text format description onto a standardized format name.

    The text is lowercased and looked up in ``format_mappings``: first as an
    exact key, then by checking whether any key occurs as a substring of the
    text. When several keys occur in the text, the one that comes first in
    ``format_mappings`` wins.

    Args:
        format_text: free-text format description, e.g. "Mass Market Paperback".

    Returns:
        The matching value from ``format_mappings``, or ``None`` when
        ``format_text`` is empty, is not a string, or contains no known keyword.
    """
    # missing or non-text values cannot be matched; bail out before .lower()
    if not format_text or not isinstance(format_text, str):
        return None

    format_text = format_text.lower()

    # try a direct match
    if format_text in format_mappings:
        return format_mappings[format_text]

    # failing that, try substring; stop at the first keyword that is found
    return next(
        (value for key, value in format_mappings.items() if key in format_text),
        None,
    )
|
||||
|
|
43
bookwyrm/connectors/format_mappings.py
Normal file
43
bookwyrm/connectors/format_mappings.py
Normal file
|
@ -0,0 +1,43 @@
|
|||
""" comparing a free text format to the standardized one """
|
||||
# Keywords found in free-text format descriptions, grouped by the standardized
# format each one maps to. Entries such as "peperback" or "hardcocer" look like
# misspellings -- presumably kept on purpose to match typos in imported data.
# NOTE(review): "kindle" is listed under AudiobookFormat -- confirm intended.
_KEYWORDS_BY_FORMAT = (
    (
        "Paperback",
        (
            "paperback",
            "soft",
            "pamphlet",
            "peperback",
            "tapa blanda",
            "turtleback",
            "pocket",
            "spiral",
            "ring",
            "平装",
            "简装",
        ),
    ),
    (
        "Hardcover",
        (
            "hardcover",
            "hardcocer",
            "hardover",
            "hardback",
            "library",
            "tapa dura",
            "leather",
            "clothbound",
            "精装",
        ),
    ),
    (
        "EBook",
        (
            "ebook",
            "e-book",
            "digital",
            "computer file",
            "epub",
            "online",
            "pdf",
            "elektronische",
            "electronic",
        ),
    ),
    (
        "AudiobookFormat",
        (
            "audiobook",
            "audio",
            "cd",
            "dvd",
            "mp3",
            "cassette",
            "kindle",
            "talking",
            "sound",
        ),
    ),
    (
        "GraphicNovel",
        (
            "comic",
            "graphic",
        ),
    ),
)

# Flat lookup of keyword -> standardized format. Group order and keyword order
# are kept as the dict's insertion order, so code that takes the first matching
# entry sees the keywords in the same sequence as before.
format_mappings = {
    keyword: standard_format
    for standard_format, keywords in _KEYWORDS_BY_FORMAT
    for keyword in keywords
}
|
|
@ -3,7 +3,7 @@ import re
|
|||
|
||||
from bookwyrm import models
|
||||
from .abstract_connector import AbstractConnector, SearchResult, Mapping
|
||||
from .abstract_connector import get_data
|
||||
from .abstract_connector import get_data, infer_physical_format
|
||||
from .connector_manager import ConnectorException
|
||||
from .openlibrary_languages import languages
|
||||
|
||||
|
@ -43,7 +43,8 @@ class Connector(AbstractConnector):
|
|||
),
|
||||
Mapping("publishedDate", remote_field="publish_date"),
|
||||
Mapping("pages", remote_field="number_of_pages"),
|
||||
Mapping("physicalFormat", remote_field="physical_format"),
|
||||
Mapping("physicalFormat", remote_field="physical_format", formatter=infer_physical_format),
|
||||
Mapping("physicalFormatDetail", remote_field="physical_format"),
|
||||
Mapping("publishers"),
|
||||
]
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
|
||||
from django.db import migrations
|
||||
import bookwyrm
|
||||
from bookwyrm.connectors.abstract_connector import infer_physical_format
|
||||
|
||||
|
||||
def infer_format(app_registry, schema_editor):
|
||||
|
@ -13,59 +14,10 @@ def infer_format(app_registry, schema_editor):
|
|||
.objects.using(db_alias)
|
||||
.filter(physical_format_detail__isnull=False)
|
||||
)
|
||||
mappings = {
|
||||
"paperback": "Paperback",
|
||||
"soft": "Paperback",
|
||||
"pamphlet": "Paperback",
|
||||
"peperback": "Paperback",
|
||||
"tapa blanda": "Paperback",
|
||||
"turtleback": "Paperback",
|
||||
"pocket": "Paperback",
|
||||
"spiral": "Paperback",
|
||||
"ring": "Paperback",
|
||||
"平装": "Paperback",
|
||||
"简装": "Paperback",
|
||||
"hardcover": "Hardcover",
|
||||
"hardcocer": "Hardcover",
|
||||
"hardover": "Hardcover",
|
||||
"hardback": "Hardcover",
|
||||
"library": "Hardcover",
|
||||
"tapa dura": "Hardcover",
|
||||
"leather": "Hardcover",
|
||||
"clothbound": "Hardcover",
|
||||
"精装": "Hardcover",
|
||||
"ebook": "EBook",
|
||||
"e-book": "EBook",
|
||||
"digital": "EBook",
|
||||
"computer file": "EBook",
|
||||
"epub": "EBook",
|
||||
"online": "EBook",
|
||||
"pdf": "EBook",
|
||||
"elektronische": "EBook",
|
||||
"electronic": "EBook",
|
||||
"audiobook": "AudiobookFormat",
|
||||
"audio": "AudiobookFormat",
|
||||
"cd": "AudiobookFormat",
|
||||
"dvd": "AudiobookFormat",
|
||||
"mp3": "AudiobookFormat",
|
||||
"cassette": "AudiobookFormat",
|
||||
"kindle": "AudiobookFormat",
|
||||
"talking": "AudiobookFormat",
|
||||
"sound": "AudiobookFormat",
|
||||
"comic": "GraphicNovel",
|
||||
"graphic": "GraphicNovel",
|
||||
}
|
||||
for edition in editions:
|
||||
free_format = edition.physical_format_detail.lower()
|
||||
if free_format in mappings:
|
||||
edition.physical_format = mappings[free_format]
|
||||
edition.save()
|
||||
else:
|
||||
matches = [v for k, v in mappings.items() if k in free_format]
|
||||
if not matches:
|
||||
continue
|
||||
edition.physical_format = matches[0]
|
||||
edition.save()
|
||||
edition.physical_format = infer_physical_format(free_format)
|
||||
edition.save()
|
||||
|
||||
|
||||
def reverse(app_registry, schema_editor):
|
||||
|
|
Loading…
Reference in a new issue