Skip to content
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
CHANGES
0.10.38 (June 12, 2026)
- New field `book_wikipedia_url(s)` has been added to the DublinCore dispatcher.
- Two functions `add_book_wikipedia_url` and `remove_book_wikipedia_url` have been added to the DublinCoreObject.
- Wikipedia url dispatcher testing added to `test_load_from_json`.
- Add/remove testing can be found in `test_book_wikipedia_urls_add_and_remove`.
- Plus `test_book_wikipedia_url_format`, which covers stripping whitespace from client input.
0.10.37 (June 9, 2026)
- Upgraded all the packages inside of `Pipfile.lock` to most recent versions.
- Fixed syntax issues in `Models.py` preventing sqlalchemy upgrade.
Expand Down
57 changes: 57 additions & 0 deletions libgutenberg/DublinCore.py
Original file line number Diff line number Diff line change
Expand Up @@ -523,6 +523,25 @@ def handle_dc_languages(dc, text):
pass


# Matches an http(s) URL whose *host* is wikipedia.org or one of its
# subdomains (en.wikipedia.org, en.m.wikipedia.org, ...).
#
# The host is anchored directly after the scheme, and a negative lookahead
# rejects hosts that merely start with "wikipedia.org" (for example
# wikipedia.org.example.com). Without this, any URL that only mentions
# "wikipedia.org" in its path or query string would be classified as a
# Wikipedia link.
WIKI_URL_RE = re.compile(
    r'https?://(?:[\w-]+\.)*wikipedia\.org(?![\w-]|\.[\w-])[^\s]*')
Comment thread
zachjesus marked this conversation as resolved.
Outdated
# Default label that format_book_wikipedia_url() puts in front of a bare
# Wikipedia URL.
WIKIPEDIA_URL_PREFIX = 'Wikipedia page about this book: '
Comment thread
zachjesus marked this conversation as resolved.


def extract_wikipedia_url(text):
    """Return the first Wikipedia URL found in *text*, or None if there is none.

    A falsy *text* (None, empty string) is searched as the empty string and
    therefore yields None.
    """
    found = WIKI_URL_RE.search(text or '')
    if found is None:
        return None
    return found.group(0)


def format_book_wikipedia_url(url_or_text):
    """Return the note text to store for a Wikipedia URL or note.

    * Falsy input gives ''.
    * Input that consists of exactly one Wikipedia URL and nothing else is
      returned with WIKIPEDIA_URL_PREFIX in front of it.
    * Anything else (including text that already embeds a URL) is returned
      unchanged.
    """
    if not url_or_text:
        return ''
    bare_url = extract_wikipedia_url(url_or_text)
    is_bare = bool(bare_url) and url_or_text == bare_url
    if not is_bare:
        return url_or_text
    return f"{WIKIPEDIA_URL_PREFIX}{bare_url}"
Comment thread
zachjesus marked this conversation as resolved.


class GutenbergDublinCore(DublinCore):
""" Parse from PG files. """

Expand All @@ -533,8 +552,30 @@ def __init__(self):
self._project_gutenberg_id = None
self.request_key = ''
self.scan_urls = set()
self.book_wikipedia_urls = []


def add_book_wikipedia_url(self, url_or_text):
    """Append a Wikipedia note to ``self.book_wikipedia_urls``.

    The input is stripped of surrounding whitespace; empty input is ignored.
    A bare URL is stored with the default prefix (see
    format_book_wikipedia_url). If the note's URL is already carried by an
    existing entry, nothing is added.
    """
    cleaned = (url_or_text or '').strip()
    if not cleaned:
        return
    note = format_book_wikipedia_url(cleaned)
    note_url = extract_wikipedia_url(note)
    if note_url:
        # De-duplicate on the URL, not on the full note text.
        for existing in self.book_wikipedia_urls:
            if extract_wikipedia_url(existing) == note_url:
                return
    self.book_wikipedia_urls.append(note)
Comment thread
zachjesus marked this conversation as resolved.
Outdated


def remove_book_wikipedia_url(self, url_or_text):
    """Remove the first entry of ``self.book_wikipedia_urls`` matching the input.

    The input is stripped of surrounding whitespace; empty input is ignored.
    An entry matches when its text equals the input, or when the input
    contains a Wikipedia URL and the entry carries that same URL. Only the
    first matching entry is deleted.
    """
    target = (url_or_text or '').strip()
    if not target:
        return
    target_url = extract_wikipedia_url(target)

    def matches(entry):
        if entry == target:
            return True
        return bool(target_url) and extract_wikipedia_url(entry) == target_url

    for position, entry in enumerate(self.book_wikipedia_urls):
        if matches(entry):
            del self.book_wikipedia_urls[position]
            return


@property
def project_gutenberg_id(self):
Expand Down Expand Up @@ -794,6 +835,18 @@ def handle_scan_urls(self, key, value):
self.scan_urls.add(scan_url)


def handle_book_wikipedia_urls(self, key, value):
    """Register one or more Wikipedia URLs/notes coming from the dispatcher.

    *value* may be a single string or a list of strings; each string is
    passed to add_book_wikipedia_url(). Any other *value* is logged and
    ignored. *key* is not used by this handler.
    """
    if isinstance(value, str):
        value = [value]
    elif not isinstance(value, list):
        error('%s is not a valid wikipedia url', value)
        return
    for url in value:
        if not url:
            # Empty / None items are silently ignored by
            # add_book_wikipedia_url(); skip them here as well.
            continue
        if not isinstance(url, str):
            # add_book_wikipedia_url() calls .strip() on its argument, so a
            # stray non-string item (number, dict, nested list) would raise
            # AttributeError and abort the load. Report it and carry on
            # with the remaining items.
            error('%s is not a valid wikipedia url', url)
            continue
        self.add_book_wikipedia_url(url)
Comment thread
zachjesus marked this conversation as resolved.
Outdated


def handle_pubinfo(self, key, value):
if key == 'publisher':
self.pubinfo.publisher = value
Expand Down Expand Up @@ -928,6 +981,7 @@ def dispatch(self, key, val):
'alt_title': store,
'creator_role': handle_creators,
'scans_archive_url': handle_scan_urls,
'book_wikipedia_url': handle_book_wikipedia_urls,
'credit': store,
'publisher': handle_pubinfo,
'publisher_country': handle_pubinfo,
Expand All @@ -951,6 +1005,9 @@ def dispatch(self, key, val):
'created': 'source_publication_years',
'produced by': 'credit',
'publisher_place': 'place',
'book_wikipedia_urls': 'book_wikipedia_url',
'wikipedia_urls': 'book_wikipedia_url',
'wikipedia_url': 'book_wikipedia_url',
}

for role in list(self.inverse_role_map.keys()):
Expand Down
34 changes: 31 additions & 3 deletions libgutenberg/DublinCoreMapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from sqlalchemy.exc import DBAPIError

from . import DublinCore
from .DublinCore import extract_wikipedia_url
from . import GutenbergGlobals as gg
from . import GutenbergDatabase
from . import GutenbergFiles
Expand Down Expand Up @@ -179,7 +180,10 @@ def parse260(s):
elif marc.code == '260':
(self.pubinfo.place, self.pubinfo.publisher, self.pubinfo.years) = parse260(marc.text)
elif marc.code == '500':
self.notes = marc.text
if extract_wikipedia_url(marc.text):
self.add_book_wikipedia_url(marc.text)
else:
self.notes = marc.text
elif marc.code == '505':
self.contents = marc.text
elif marc.code == '508':
Expand Down Expand Up @@ -215,7 +219,7 @@ def parse260(s):
# categories(text, audiobook, etc)
if book.categories:
self.dcmitypes = [struct(id=cat.dcmitype[0], description=cat.dcmitype[1])
for cat in book.categories]
for cat in book.categories]
else:
self.dcmitypes = [struct(id='Text', description='Text')]

Expand Down Expand Up @@ -293,7 +297,7 @@ def register_coverpage(self, id_, url, code=901):
try:
session.begin_nested()
session.add(Attribute(fk_books=id_, fk_attriblist=code,
text=gg.archive2files(id_, url)))
text=gg.archive2files(id_, url)))
session.commit()

except IntegrityError: # Duplicate key
Expand Down Expand Up @@ -383,6 +387,8 @@ def save(self, updatemode=0):
if self.request_key:
self.add_attribute(self.book, self.request_key, marc=905)

self._update_book_wikipedia_urls()

self.book.updatemode = 1 # prevent non-cataloguer changes

session.commit()
Expand Down Expand Up @@ -480,6 +486,7 @@ def add_title(self, book, title, marc=245, subtitle=None):
title = title.replace(' *_ *', '\n')
self.add_attribute(book, title, nonfiling=nonfiling, marc=marc)


def add_attribute(self, book, attr, nonfiling=0, marc=0):
if not attr:
return
Expand All @@ -505,6 +512,27 @@ def add_attribute(self, book, attr, nonfiling=0, marc=0):
book.attributes.append(Attribute(
fk_attriblist=marc, nonfiling=nonfiling, text=attr))


def _update_book_wikipedia_urls(self):
    """Sync MARC 500 wiki rows to book_wikipedia_urls (matched by URL).

    Attributes with code 500 whose text contains a Wikipedia URL are compared
    with ``self.book_wikipedia_urls``: rows whose URL is no longer wanted are
    removed, and wanted URLs that have no row yet are appended as new
    Attribute(fk_attriblist=500) rows. A row whose URL is still wanted is
    left untouched, even if its note text differs. Code-500 attributes
    without a Wikipedia URL are never touched. Does nothing when
    ``self.book`` is falsy.
    """
    if not self.book:
        return

    # URL -> note text for every entry that actually carries a URL.
    pending = {}
    for note in self.book_wikipedia_urls:
        note_url = extract_wikipedia_url(note)
        if note_url:
            pending[note_url] = note

    # Iterate over a copy: rows may be removed from the collection below.
    for attribute in list(self.book.attributes):
        if attribute.fk_attriblist != 500:
            continue
        stored_url = extract_wikipedia_url(attribute.text)
        if not stored_url:
            continue
        if stored_url in pending:
            # Already stored; nothing left to add for this URL.
            pending.pop(stored_url)
        else:
            self.book.attributes.remove(attribute)

    for note in pending.values():
        self.book.attributes.append(Attribute(fk_attriblist=500, text=note))

def delete(self):
""" only delete the book! """
session = self.get_my_session()
Expand Down
4 changes: 4 additions & 0 deletions libgutenberg/tests/99999.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@
"https://archive.org/details/in.ernet.dli.2013.179137",
"https://babel.hathitrust.org/cgi/pt?id=uieg.30152019845839"
],
"BOOK_WIKIPEDIA_URLS": [
"https://en.wikipedia.org/wiki/Test_Book",
"https://en.wikipedia.org/wiki/Another_Book"
],
"CREDIT": "Roger Frank and Sue Clark.",
"LANGUAGE": "English",
"PUBLISHER": "Frank A. Munsey Company",
Expand Down
104 changes: 102 additions & 2 deletions libgutenberg/tests/test_dc.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from libgutenberg.CommonOptions import Options
from libgutenberg import GutenbergDatabase, GutenbergDatabaseDublinCore, DummyConnectionPool
from libgutenberg import DBUtils, DublinCoreMapping
from libgutenberg.DublinCore import GutenbergDublinCore, WIKIPEDIA_URL_PREFIX, extract_wikipedia_url
from libgutenberg.Logger import debug, warning
from libgutenberg.Models import Attribute, Book

Expand Down Expand Up @@ -51,6 +52,12 @@ def test_orm_metadata(self):
dc = DublinCoreMapping.DublinCoreObject()
self.metadata_test2(dc)

def test_book_wikipedia_urls_load(self):
    """Book 2701 loaded from the database exposes its Moby-Dick Wikipedia URL."""
    expected = "https://en.wikipedia.org/wiki/Moby-Dick"
    dc = DublinCoreMapping.DublinCoreObject()
    dc.load_from_database(2701)  # Moby Dick
    found = list(map(extract_wikipedia_url, dc.book_wikipedia_urls))
    self.assertIn(expected, found)

def metadata_test1(self, dc):
dc.load_from_database(self.ebook)
self.assertEqual(dc.project_gutenberg_id, 20050)
Expand Down Expand Up @@ -224,7 +231,7 @@ def get_cover(ebook, dc):
dc = GutenbergDatabaseDublinCore.GutenbergDatabaseDublinCore(self.dummypool)
dc.register_coverpage(ebook, 'new_cover')
# does nothing to avoid violates foreign key constraint
self.assertEqual(get_cover(ebook, dc), None)
self.assertEqual(get_cover(ebook, dc), None)

def tearDown(self):
pass
Expand Down Expand Up @@ -297,6 +304,10 @@ def test_load_from_json(self):
self.assertEqual(set_subtitle, "a true story : second line")
self.assertEqual(len(dc.authors), 2)
self.assertEqual(len(dc.scan_urls), 2)
self.assertEqual(
dc.book_wikipedia_urls,
[f"{WIKIPEDIA_URL_PREFIX}https://en.wikipedia.org/wiki/Test_Book",
f"{WIKIPEDIA_URL_PREFIX}https://en.wikipedia.org/wiki/Another_Book"])
self.assertEqual(dc.pubinfo.first_year, '1920')
self.assertEqual(dc.credit, 'Roger Frank and Sue Clark.')
dc.add_credit('Sue Frank and Roger Clark.\n')
Expand All @@ -312,6 +323,10 @@ def test_load_from_json(self):
dc.load_from_database(99999)
self.assertEqual(set_title, dc.title)
self.assertEqual(set_subtitle, dc.subtitle)
self.assertEqual(
dc.book_wikipedia_urls,
[f"{WIKIPEDIA_URL_PREFIX}https://en.wikipedia.org/wiki/Test_Book",
f"{WIKIPEDIA_URL_PREFIX}https://en.wikipedia.org/wiki/Another_Book"])
marc260 = dc.session.query(Attribute).filter_by(book=dc.book, fk_attriblist=260).first().text
self.assertTrue('1920' in marc260)
self.assertEqual(
Expand All @@ -327,6 +342,11 @@ def test_load_from_json(self):
len(dc.session.query(Attribute).filter_by(book=dc.book,
fk_attriblist=904).all()),
2)
self.assertEqual(
len(dc.session.query(Attribute).filter_by(book=dc.book,
fk_attriblist=500).filter(
Attribute.text.like('%wikipedia.org%')).all()),
2)
self.assertEqual(
dc.session.query(Attribute).filter_by(book=dc.book, fk_attriblist=905).first().text,
'20210623194947brand')
Expand All @@ -337,8 +357,88 @@ def test_load_from_json(self):
dc.session.flush()
self.assertFalse(DBUtils.ebook_exists(99999))

def test_book_wikipedia_url_format(self):
    """Check prefixing, de-duplication, removal and whitespace stripping."""
    bare_url = "https://en.wikipedia.org/wiki/Moby-Dick"
    with_prefix = f"{WIKIPEDIA_URL_PREFIX}{bare_url}"
    with_custom_label = f"See also: {bare_url}"

    # A bare URL is stored with the default prefix.
    dc = GutenbergDublinCore()
    dc.add_book_wikipedia_url(bare_url)
    self.assertEqual(dc.book_wikipedia_urls, [with_prefix])

    # Adding the same URL again, already prefixed, does not duplicate it.
    dc.add_book_wikipedia_url(with_prefix)
    self.assertEqual(dc.book_wikipedia_urls, [with_prefix])

    # Text that already embeds a URL is kept as-is.
    dc2 = GutenbergDublinCore()
    dc2.add_book_wikipedia_url(with_custom_label)
    self.assertEqual(dc2.book_wikipedia_urls, [with_custom_label])

    # Removal by bare URL.
    dc.add_book_wikipedia_url(bare_url)
    dc.remove_book_wikipedia_url(bare_url)
    self.assertEqual(dc.book_wikipedia_urls, [])

    # Removal by prefixed text.
    dc.add_book_wikipedia_url(bare_url)
    dc.remove_book_wikipedia_url(with_prefix)
    self.assertEqual(dc.book_wikipedia_urls, [])

    # Surrounding whitespace is stripped on both add and remove.
    padded = f"  {bare_url}  "
    dc.add_book_wikipedia_url(padded)
    self.assertEqual(dc.book_wikipedia_urls, [with_prefix])
    dc.remove_book_wikipedia_url(padded)
    self.assertEqual(dc.book_wikipedia_urls, [])

    # Whitespace is stripped from custom text as well.
    dc.add_book_wikipedia_url(f"See also: {bare_url}  ")
    self.assertEqual(dc.book_wikipedia_urls, [f"See also: {bare_url}"])

def test_book_wikipedia_urls_add_and_remove(self):
    """Round-trip Wikipedia notes through save/load while removing them one by one."""
    ebook = 99998  # fake test id

    def reload():
        # Fresh object so every assertion sees what was actually persisted.
        fresh = DublinCoreMapping.DublinCoreObject()
        fresh.load_from_database(ebook)
        return fresh

    dc = DublinCoreMapping.DublinCoreObject()
    dc.load_or_create_book(ebook)
    dc.rights = 'Public Domain in the USA.'

    url = "https://en.wikipedia.org/wiki/Moby-Dick"
    url2 = "https://en.wikipedia.org/wiki/Test_Book"
    url3 = "https://en.wikipedia.org/wiki/Herman_Melville"
    text1 = f"{WIKIPEDIA_URL_PREFIX}{url}"
    text2 = f"{WIKIPEDIA_URL_PREFIX}{url2}"
    custom_text = f"Wikipedia page about this author: {url3}"

    # Add a bare URL, an already-prefixed note and a custom-labelled note.
    for entry in (url, text2, custom_text):
        dc.add_book_wikipedia_url(entry)
    dc.save(updatemode=0)

    dc2 = reload()
    self.assertEqual(dc2.book_wikipedia_urls, [text1, text2, custom_text])

    # Remove by bare URL.
    dc2.remove_book_wikipedia_url(url)
    dc2.save(updatemode=1)

    dc3 = reload()
    self.assertEqual(dc3.book_wikipedia_urls, [text2, custom_text])

    # Remove by custom text.
    dc3.remove_book_wikipedia_url(custom_text)
    dc3.save(updatemode=1)

    dc4 = reload()
    self.assertEqual(dc4.book_wikipedia_urls, [text2])

    # Remove by prefixed text; nothing should be left.
    dc4.remove_book_wikipedia_url(text2)
    dc4.save(updatemode=1)

    dc5 = reload()
    self.assertEqual(dc5.book_wikipedia_urls, [])

def tearDown(self):
    """Remove the test author and the fake books 99999 and 99998."""
    session = DBUtils.check_session(None)
    DBUtils.remove_author('Lorem Ipsum Jr.', session=session)
    # Delete and commit each fake book separately, in the original order.
    for fake_id in (99999, 99998):
        session.query(Book).filter(Book.pk == fake_id).delete()
        session.commit()
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# libgutenberg setup.py
#

__version__ = '0.10.37'
__version__ = '0.10.38'

from setuptools import setup

Expand Down