Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
CHANGES
0.11.1 (June 12, 2026)
- New field `book_wikipedia_url(s)` has been added to the DublinCore dispatcher.
- Two methods `add_wikipedia_url` and `remove_wikipedia_url` have been added to the DublinCoreObject.
- Wikipedia url dispatcher testing added to `test_load_from_json`.
- Add/remove testing can be found in `test_wikipedia_urls_add_and_remove`.
- Plus `test_wikipedia_url_format`, which covers stripping whitespace from client input.

0.11.0 (June *, 2026)
- Now using SQLAlchemy 2.0
- Upgraded all the packages inside of `Pipfile.lock` to most recent versions compatible with py3.9.
Expand Down
93 changes: 93 additions & 0 deletions libgutenberg/DublinCore.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import textwrap
import unicodedata
from gettext import gettext as _
from urllib.parse import unquote

import six
import lxml
Expand Down Expand Up @@ -523,6 +524,50 @@ def handle_dc_languages(dc, text):
pass


# Finds a Wikipedia article URL anywhere inside a string.
#   group(1): the 2-3 letter language subdomain, e.g. "en"
#   group(2): the page title exactly as written in the URL
# The title class accepts "%" so percent-encoded titles
# (e.g. "Les_Mis%C3%A9rables") are captured whole instead of being cut off
# at the first escape; check_wikipedia_url() unquotes the title afterwards.
# It also accepts "'", ",", ";" and "~", which occur unescaped in titles.
WIKIMATCH = re.compile(
    r"(?ix)https?://([a-z]{2,3})\.wikipedia\.org/wiki/"
    r"([/!@^*$a-z0-9_\(\)\-.:%',;~]+)")
# A URL is rejected when its percent-decoded page title contains any of these.
AVOID_WIKI = ["simple.", "File:", "/Category:", "Category:", "(disambiguation)"]
# Language subdomains whose links are never accepted.
UNTRUSTED_WIKI_LANGS = {'sco'}
# Text put in front of a bare URL when it is stored.
WIKIPEDIA_URL_PREFIX = 'Wikipedia page about this book: '
Comment thread
zachjesus marked this conversation as resolved.


def check_wikipedia_url(text):
    """Return (lang, page_title) if text contains a valid wiki URL, else None.

    The first WIKIMATCH hit in ``text`` is examined. It is rejected when its
    language subdomain is listed in UNTRUSTED_WIKI_LANGS or when the
    percent-decoded page title contains one of the AVOID_WIKI substrings.
    ``lang`` and ``page_title`` are returned exactly as they appear in the
    URL (no case folding, no decoding).
    """
    # A non-string value (e.g. a number inside a JSON list) cannot hold a
    # URL; report "no URL" instead of letting the regex raise TypeError.
    if not text or not isinstance(text, str):
        return None
    match = WIKIMATCH.search(text)
    if match is None:
        return None
    lang, page_title = match.group(1), match.group(2)
    # WIKIMATCH is case-insensitive, so fold the subdomain before the lookup;
    # otherwise "SCO.wikipedia.org" would slip past the untrusted list.
    if lang.lower() in UNTRUSTED_WIKI_LANGS:
        return None
    # Decode once, then test every unwanted pattern against the result.
    decoded_title = unquote(page_title)
    if any(pattern in decoded_title for pattern in AVOID_WIKI):
        return None
    return (lang, page_title)


def wikipedia_url(lang, page_title):
    """Build the https article URL for ``page_title`` on the ``lang`` wiki."""
    template = "https://{}.wikipedia.org/wiki/{}"
    return template.format(lang, page_title)


def extract_wikipedia_url(text):
    """Return the rebuilt URL of the wiki link found in ``text``, or None.

    None is returned whenever check_wikipedia_url() does not accept ``text``.
    """
    parts = check_wikipedia_url(text)
    if not parts:
        return None
    lang, page_title = parts
    return wikipedia_url(lang, page_title)


def format_wikipedia_url(url_or_text):
    """Return the text to store for a wiki URL or a note containing one.

    Empty input gives ''. Input that is nothing but the rebuilt URL
    (surrounding whitespace ignored) gets WIKIPEDIA_URL_PREFIX put in front.
    Anything else, including input without an accepted URL, is returned
    unchanged.
    """
    if not url_or_text:
        return ''
    parts = check_wikipedia_url(url_or_text)
    if parts:
        canonical = wikipedia_url(*parts)
        # only a bare URL receives the default prefix
        if canonical == url_or_text.strip():
            return f"{WIKIPEDIA_URL_PREFIX}{canonical}"
    return url_or_text
Comment thread
zachjesus marked this conversation as resolved.


class GutenbergDublinCore(DublinCore):
""" Parse from PG files. """

Expand All @@ -533,8 +578,37 @@ def __init__(self):
self._project_gutenberg_id = None
self.request_key = ''
self.scan_urls = set()
self.wikipedia_urls = []


def add_wikipedia_url(self, url_or_text):
    """Append a wiki URL, or a note containing one, to self.wikipedia_urls.

    Blank input is ignored. Input without an accepted URL is logged through
    error() and ignored. Nothing is added when an entry for the same
    (lang, page_title) is already stored. A bare URL is stored with
    WIKIPEDIA_URL_PREFIX in front; other text is stored stripped.
    """
    entry = (url_or_text or '').strip()
    if not entry:
        return
    parts = check_wikipedia_url(entry)
    if not parts:
        error('%s is not a valid wikipedia url', entry)
        return
    # skip pages that are already recorded, whatever text surrounds them
    for existing in self.wikipedia_urls:
        if check_wikipedia_url(existing) == parts:
            return
    canonical = wikipedia_url(*parts)
    self.wikipedia_urls.append(
        f"{WIKIPEDIA_URL_PREFIX}{canonical}" if entry == canonical else entry)


def remove_wikipedia_url(self, url_or_text):
    """Delete the first stored entry matching ``url_or_text``.

    An entry matches when its text equals the stripped input, or when both
    contain an accepted URL with the same (lang, page_title). Blank input
    and input that matches nothing leave self.wikipedia_urls untouched.
    """
    target = (url_or_text or '').strip()
    if not target:
        return
    parts = check_wikipedia_url(target)
    for position, stored in enumerate(self.wikipedia_urls):
        same_text = stored == target
        if same_text or (parts and check_wikipedia_url(stored) == parts):
            # returning right after the delete keeps the iteration safe
            del self.wikipedia_urls[position]
            return


@property
def project_gutenberg_id(self):
Expand Down Expand Up @@ -794,6 +868,21 @@ def handle_scan_urls(self, key, value):
self.scan_urls.add(scan_url)


def handle_wikipedia_urls(self, key, value):
    """Dispatcher handler: record one wiki URL or a list of them.

    ``value`` may be a single string or a list. Any other type is logged
    through error() and dropped. Each item that check_wikipedia_url()
    rejects is logged and skipped; the rest go to add_wikipedia_url().
    ``key`` is not used.
    """
    if isinstance(value, str):
        candidates = [value]
    elif isinstance(value, list):
        candidates = value
    else:
        error('%s is not a valid wikipedia url', value)
        return
    for candidate in candidates:
        if check_wikipedia_url(candidate):
            self.add_wikipedia_url(candidate)
        else:
            error('%s is not a valid wikipedia url', candidate)


def handle_pubinfo(self, key, value):
if key == 'publisher':
self.pubinfo.publisher = value
Expand Down Expand Up @@ -928,6 +1017,7 @@ def dispatch(self, key, val):
'alt_title': store,
'creator_role': handle_creators,
'scans_archive_url': handle_scan_urls,
'wikipedia_url': handle_wikipedia_urls,
'credit': store,
'publisher': handle_pubinfo,
'publisher_country': handle_pubinfo,
Expand All @@ -951,6 +1041,9 @@ def dispatch(self, key, val):
'created': 'source_publication_years',
'produced by': 'credit',
'publisher_place': 'place',
'wikipedia_urls': 'wikipedia_url',
'book_wikipedia_url': 'wikipedia_url',
'book_wikipedia_urls': 'wikipedia_url',
}

for role in list(self.inverse_role_map.keys()):
Expand Down
34 changes: 31 additions & 3 deletions libgutenberg/DublinCoreMapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from sqlalchemy.exc import DBAPIError

from . import DublinCore
from .DublinCore import check_wikipedia_url
from . import GutenbergGlobals as gg
from . import GutenbergDatabase
from . import GutenbergFiles
Expand Down Expand Up @@ -179,7 +180,10 @@ def parse260(s):
elif marc.code == '260':
(self.pubinfo.place, self.pubinfo.publisher, self.pubinfo.years) = parse260(marc.text)
elif marc.code == '500':
self.notes = marc.text
if check_wikipedia_url(marc.text):
self.add_wikipedia_url(marc.text)
else:
self.notes = marc.text
elif marc.code == '505':
self.contents = marc.text
elif marc.code == '508':
Expand Down Expand Up @@ -215,7 +219,7 @@ def parse260(s):
# categories(text, audiobook, etc)
if book.categories:
self.dcmitypes = [struct(id=cat.dcmitype[0], description=cat.dcmitype[1])
for cat in book.categories]
for cat in book.categories]
else:
self.dcmitypes = [struct(id='Text', description='Text')]

Expand Down Expand Up @@ -293,7 +297,7 @@ def register_coverpage(self, id_, url, code=901):
try:
session.begin_nested()
session.add(Attribute(fk_books=id_, fk_attriblist=code,
text=gg.archive2files(id_, url)))
text=gg.archive2files(id_, url)))
session.commit()

except IntegrityError: # Duplicate key
Expand Down Expand Up @@ -383,6 +387,8 @@ def save(self, updatemode=0):
if self.request_key:
self.add_attribute(self.book, self.request_key, marc=905)

self._update_wikipedia_urls()

self.book.updatemode = 1 # prevent non-cataloguer changes

session.commit()
Expand Down Expand Up @@ -480,6 +486,7 @@ def add_title(self, book, title, marc=245, subtitle=None):
title = title.replace(' *_ *', '\n')
self.add_attribute(book, title, nonfiling=nonfiling, marc=marc)


def add_attribute(self, book, attr, nonfiling=0, marc=0):
if not attr:
return
Expand All @@ -505,6 +512,27 @@ def add_attribute(self, book, attr, nonfiling=0, marc=0):
book.attributes.append(Attribute(
fk_attriblist=marc, nonfiling=nonfiling, text=attr))


def _update_wikipedia_urls(self):
    """Sync MARC 500 wiki rows to wikipedia_urls (matched by lang and title).

    Does nothing without self.book. Attributes with fk_attriblist 500 whose
    text holds an accepted wiki URL are kept when the same page is listed in
    self.wikipedia_urls and removed otherwise. Listed pages that have no
    such attribute yet are appended as new 500 attributes.
    """
    if not self.book:
        return
    # (lang, page_title) -> text; entries still present after the scan
    # below are the ones missing from the book
    pending = {}
    for text in self.wikipedia_urls:
        key = check_wikipedia_url(text)
        if key:
            pending[key] = text
    # iterate over a snapshot because rows may be removed during the scan
    for attribute in list(self.book.attributes):
        if attribute.fk_attriblist != 500:
            continue
        key = check_wikipedia_url(attribute.text)
        if not key:
            continue
        if key in pending:
            pending.pop(key)
        else:
            self.book.attributes.remove(attribute)
    for missing_text in pending.values():
        self.book.attributes.append(
            Attribute(fk_attriblist=500, text=missing_text))

def delete(self):
""" only delete the book! """
session = self.get_my_session()
Expand Down
4 changes: 4 additions & 0 deletions libgutenberg/tests/99999.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@
"https://archive.org/details/in.ernet.dli.2013.179137",
"https://babel.hathitrust.org/cgi/pt?id=uieg.30152019845839"
],
"BOOK_WIKIPEDIA_URLS": [
"https://en.wikipedia.org/wiki/Test_Book",
"https://en.wikipedia.org/wiki/Another_Book"
],
"CREDIT": "Roger Frank and Sue Clark.",
"LANGUAGE": "English",
"PUBLISHER": "Frank A. Munsey Company",
Expand Down
104 changes: 102 additions & 2 deletions libgutenberg/tests/test_dc.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from libgutenberg.CommonOptions import Options
from libgutenberg import GutenbergDatabase, GutenbergDatabaseDublinCore, DummyConnectionPool
from libgutenberg import DBUtils, DublinCoreMapping
from libgutenberg.DublinCore import GutenbergDublinCore, WIKIPEDIA_URL_PREFIX, extract_wikipedia_url
from libgutenberg.Logger import debug, warning
from libgutenberg.Models import Attribute, Book

Expand Down Expand Up @@ -51,6 +52,12 @@ def test_orm_metadata(self):
dc = DublinCoreMapping.DublinCoreObject()
self.metadata_test2(dc)

def test_wikipedia_urls_load(self):
    """Book 2701 loaded from the database exposes its Moby-Dick wiki URL."""
    moby_dick = DublinCoreMapping.DublinCoreObject()
    moby_dick.load_from_database(2701)  # Moby Dick
    extracted = list(map(extract_wikipedia_url, moby_dick.wikipedia_urls))
    self.assertIn("https://en.wikipedia.org/wiki/Moby-Dick", extracted)

def metadata_test1(self, dc):
dc.load_from_database(self.ebook)
self.assertEqual(dc.project_gutenberg_id, 20050)
Expand Down Expand Up @@ -224,7 +231,7 @@ def get_cover(ebook, dc):
dc = GutenbergDatabaseDublinCore.GutenbergDatabaseDublinCore(self.dummypool)
dc.register_coverpage(ebook, 'new_cover')
# does nothing to avoid violates foreign key constraint
self.assertEqual(get_cover(ebook, dc), None)
self.assertEqual(get_cover(ebook, dc), None)

def tearDown(self):
pass
Expand Down Expand Up @@ -297,6 +304,10 @@ def test_load_from_json(self):
self.assertEqual(set_subtitle, "a true story : second line")
self.assertEqual(len(dc.authors), 2)
self.assertEqual(len(dc.scan_urls), 2)
self.assertEqual(
dc.wikipedia_urls,
[f"{WIKIPEDIA_URL_PREFIX}https://en.wikipedia.org/wiki/Test_Book",
f"{WIKIPEDIA_URL_PREFIX}https://en.wikipedia.org/wiki/Another_Book"])
self.assertEqual(dc.pubinfo.first_year, '1920')
self.assertEqual(dc.credit, 'Roger Frank and Sue Clark.')
dc.add_credit('Sue Frank and Roger Clark.\n')
Expand All @@ -312,6 +323,10 @@ def test_load_from_json(self):
dc.load_from_database(99999)
self.assertEqual(set_title, dc.title)
self.assertEqual(set_subtitle, dc.subtitle)
self.assertEqual(
dc.wikipedia_urls,
[f"{WIKIPEDIA_URL_PREFIX}https://en.wikipedia.org/wiki/Test_Book",
f"{WIKIPEDIA_URL_PREFIX}https://en.wikipedia.org/wiki/Another_Book"])
marc260 = dc.session.query(Attribute).filter_by(book=dc.book, fk_attriblist=260).first().text
self.assertTrue('1920' in marc260)
self.assertEqual(
Expand All @@ -327,6 +342,11 @@ def test_load_from_json(self):
len(dc.session.query(Attribute).filter_by(book=dc.book,
fk_attriblist=904).all()),
2)
self.assertEqual(
len(dc.session.query(Attribute).filter_by(book=dc.book,
fk_attriblist=500).filter(
Attribute.text.like('%wikipedia.org%')).all()),
2)
self.assertEqual(
dc.session.query(Attribute).filter_by(book=dc.book, fk_attriblist=905).first().text,
'20210623194947brand')
Expand All @@ -337,8 +357,88 @@ def test_load_from_json(self):
dc.session.flush()
self.assertFalse(DBUtils.ebook_exists(99999))

def test_wikipedia_url_format(self):
    """Exercise prefixing, de-duplication, removal and whitespace stripping."""
    plain_url = "https://en.wikipedia.org/wiki/Moby-Dick"
    with_prefix = f"{WIKIPEDIA_URL_PREFIX}{plain_url}"
    with_custom_text = f"See also: {plain_url}"

    # a bare URL is stored with the default prefix
    record = GutenbergDublinCore()
    record.add_wikipedia_url(plain_url)
    self.assertEqual(record.wikipedia_urls, [with_prefix])

    # the same page, already prefixed, is not stored twice
    record.add_wikipedia_url(with_prefix)
    self.assertEqual(record.wikipedia_urls, [with_prefix])

    # text that already surrounds the URL is kept as given
    other_record = GutenbergDublinCore()
    other_record.add_wikipedia_url(with_custom_text)
    self.assertEqual(other_record.wikipedia_urls, [with_custom_text])

    # removal by bare URL
    record.add_wikipedia_url(plain_url)
    record.remove_wikipedia_url(plain_url)
    self.assertEqual(record.wikipedia_urls, [])

    # removal by prefixed text
    record.add_wikipedia_url(plain_url)
    record.remove_wikipedia_url(with_prefix)
    self.assertEqual(record.wikipedia_urls, [])

    # surrounding whitespace is ignored when adding and when removing
    padded_url = f" {plain_url} "
    record.add_wikipedia_url(padded_url)
    self.assertEqual(record.wikipedia_urls, [with_prefix])
    record.remove_wikipedia_url(padded_url)
    self.assertEqual(record.wikipedia_urls, [])

    # custom text is stripped but otherwise untouched
    record.add_wikipedia_url(f"See also: {plain_url} ")
    self.assertEqual(record.wikipedia_urls, [f"See also: {plain_url}"])

def test_wikipedia_urls_add_and_remove(self):
    """Round-trip wiki URLs through save() and load_from_database()."""
    writer = DublinCoreMapping.DublinCoreObject()

    ebook = 99998  # fake test id

    def load_fresh():
        # read the book back through a brand-new object
        fresh = DublinCoreMapping.DublinCoreObject()
        fresh.load_from_database(ebook)
        return fresh

    writer.load_or_create_book(ebook)
    writer.rights = 'Public Domain in the USA.'

    moby_url = "https://en.wikipedia.org/wiki/Moby-Dick"
    test_book_url = "https://en.wikipedia.org/wiki/Test_Book"
    melville_url = "https://en.wikipedia.org/wiki/Herman_Melville"
    moby_text = f"{WIKIPEDIA_URL_PREFIX}{moby_url}"
    test_book_text = f"{WIKIPEDIA_URL_PREFIX}{test_book_url}"
    author_text = f"Wikipedia page about this author: {melville_url}"

    # store a bare URL, a prefixed URL and a custom note
    writer.add_wikipedia_url(moby_url)
    writer.add_wikipedia_url(test_book_text)
    writer.add_wikipedia_url(author_text)
    writer.save(updatemode=0)

    after_add = load_fresh()
    self.assertEqual(after_add.wikipedia_urls,
                     [moby_text, test_book_text, author_text])

    # remove by bare URL
    after_add.remove_wikipedia_url(moby_url)
    after_add.save(updatemode=1)

    after_first_removal = load_fresh()
    self.assertEqual(after_first_removal.wikipedia_urls,
                     [test_book_text, author_text])

    # remove by custom text
    after_first_removal.remove_wikipedia_url(author_text)
    after_first_removal.save(updatemode=1)

    after_second_removal = load_fresh()
    self.assertEqual(after_second_removal.wikipedia_urls, [test_book_text])

    # remove by prefixed text
    after_second_removal.remove_wikipedia_url(test_book_text)
    after_second_removal.save(updatemode=1)

    after_last_removal = load_fresh()
    self.assertEqual(after_last_removal.wikipedia_urls, [])

def tearDown(self):
    """Delete the test author and books 99999 and 99998, committing each."""
    session = DBUtils.check_session(None)
    DBUtils.remove_author('Lorem Ipsum Jr.', session=session)
    for fake_id in (99999, 99998):
        session.query(Book).filter(Book.pk == fake_id).delete()
        session.commit()
Loading