diff --git a/CHANGES b/CHANGES index b725de1..80f6511 100644 --- a/CHANGES +++ b/CHANGES @@ -1,4 +1,11 @@ CHANGES +0.11.1 (June 12, 2026) +- New field `book_wikipedia_url(s)` has been added to the DublinCore dispatcher. +- Two functions `add_wikipedia_url` and `remove_wikipedia_url` have been added to the DublinCoreObject. +- Wikipedia url dispatcher testing added to `test_load_from_json`. +- Add/remove testing can be found in `test_wikipedia_urls_add_and_remove` + - Plus `test_wikipedia_url_format` which covers stripping whitespace from client input. + 0.11.0 (June *, 2026) - Now using SQLAlchemy 2.0 - Upgraded all the packages inside of `Pipfile.lock` to most recent versions compatible with py3.9. diff --git a/libgutenberg/DublinCore.py b/libgutenberg/DublinCore.py index 984c441..5b60415 100644 --- a/libgutenberg/DublinCore.py +++ b/libgutenberg/DublinCore.py @@ -21,6 +21,7 @@ import textwrap import unicodedata from gettext import gettext as _ +from urllib.parse import unquote import six import lxml @@ -523,6 +524,50 @@ def handle_dc_languages(dc, text): pass +WIKIMATCH = re.compile( + r"(?ix)https?://([a-z]{2,3})\.wikipedia\.org/wiki/([/!@i^*$a-z0-9_\(\)\-.:]+)") +AVOID_WIKI = ["simple.", "File:", "/Category:", "Category:", "(disambiguation)"] +UNTRUSTED_WIKI_LANGS = {'sco'} +WIKIPEDIA_URL_PREFIX = 'Wikipedia page about this book: ' + + +def check_wikipedia_url(text): + """Return (lang, page_title) if text contains a valid wiki URL, else None.""" + if not text: + return None + match = WIKIMATCH.search(text) + if not match: + return None + lang, page_title = match.group(1), match.group(2) + if lang in UNTRUSTED_WIKI_LANGS: + return None + if any(pattern in unquote(page_title) for pattern in AVOID_WIKI): + return None + return (lang, page_title) + + +def wikipedia_url(lang, page_title): + return f"https://{lang}.wikipedia.org/wiki/{page_title}" + + +def extract_wikipedia_url(text): + checked = check_wikipedia_url(text) + return
wikipedia_url(*checked) if checked else None + + +def format_wikipedia_url(url_or_text): + """Bare URL gets default prefix; text with a URL already in it is kept as-is.""" + if not url_or_text: + return '' + checked = check_wikipedia_url(url_or_text) + if not checked: + return url_or_text + url = wikipedia_url(*checked) + if url_or_text.strip() == url: + return f"{WIKIPEDIA_URL_PREFIX}{url}" + return url_or_text + + class GutenbergDublinCore(DublinCore): """ Parse from PG files. """ @@ -533,8 +578,37 @@ def __init__(self): self._project_gutenberg_id = None self.request_key = '' self.scan_urls = set() + self.wikipedia_urls = [] + def add_wikipedia_url(self, url_or_text): + url_or_text = (url_or_text or '').strip() + if not url_or_text: + return + checked = check_wikipedia_url(url_or_text) + if not checked: + error('%s is not a valid wikipedia url', url_or_text) + return + if any(check_wikipedia_url(text) == checked for text in self.wikipedia_urls): + return + url = wikipedia_url(*checked) + if url_or_text == url: + text = f"{WIKIPEDIA_URL_PREFIX}{url}" + else: + text = url_or_text + self.wikipedia_urls.append(text) + + + def remove_wikipedia_url(self, url_or_text): + url_or_text = (url_or_text or '').strip() + if not url_or_text: + return + checked = check_wikipedia_url(url_or_text) + for i, text in enumerate(self.wikipedia_urls): + if text == url_or_text or (checked and check_wikipedia_url(text) == checked): + del self.wikipedia_urls[i] + return + @property def project_gutenberg_id(self): @@ -794,6 +868,21 @@ def handle_scan_urls(self, key, value): self.scan_urls.add(scan_url) + def handle_wikipedia_urls(self, key, value): + if isinstance(value, str): + value = [value] + elif isinstance(value, list): + pass + else: + error('%s is not a valid wikipedia url', value) + return + for url in value: + if not check_wikipedia_url(url): + error('%s is not a valid wikipedia url', url) + continue + self.add_wikipedia_url(url) + + def handle_pubinfo(self, key, value): if key == 
'publisher': self.pubinfo.publisher = value @@ -928,6 +1017,7 @@ def dispatch(self, key, val): 'alt_title': store, 'creator_role': handle_creators, 'scans_archive_url': handle_scan_urls, + 'wikipedia_url': handle_wikipedia_urls, 'credit': store, 'publisher': handle_pubinfo, 'publisher_country': handle_pubinfo, @@ -951,6 +1041,9 @@ def dispatch(self, key, val): 'created': 'source_publication_years', 'produced by': 'credit', 'publisher_place': 'place', + 'wikipedia_urls': 'wikipedia_url', + 'book_wikipedia_url': 'wikipedia_url', + 'book_wikipedia_urls': 'wikipedia_url', } for role in list(self.inverse_role_map.keys()): diff --git a/libgutenberg/DublinCoreMapping.py b/libgutenberg/DublinCoreMapping.py index 86c5074..f250873 100644 --- a/libgutenberg/DublinCoreMapping.py +++ b/libgutenberg/DublinCoreMapping.py @@ -23,6 +23,7 @@ from sqlalchemy.exc import DBAPIError from . import DublinCore +from .DublinCore import check_wikipedia_url from . import GutenbergGlobals as gg from . import GutenbergDatabase from . 
import GutenbergFiles @@ -179,7 +180,10 @@ def parse260(s): elif marc.code == '260': (self.pubinfo.place, self.pubinfo.publisher, self.pubinfo.years) = parse260(marc.text) elif marc.code == '500': - self.notes = marc.text + if check_wikipedia_url(marc.text): + self.add_wikipedia_url(marc.text) + else: + self.notes = marc.text elif marc.code == '505': self.contents = marc.text elif marc.code == '508': @@ -215,7 +219,7 @@ def parse260(s): # categories(text, audiobook, etc) if book.categories: self.dcmitypes = [struct(id=cat.dcmitype[0], description=cat.dcmitype[1]) - for cat in book.categories] + for cat in book.categories] else: self.dcmitypes = [struct(id='Text', description='Text')] @@ -293,7 +297,7 @@ def register_coverpage(self, id_, url, code=901): try: session.begin_nested() session.add(Attribute(fk_books=id_, fk_attriblist=code, - text=gg.archive2files(id_, url))) + text=gg.archive2files(id_, url))) session.commit() except IntegrityError: # Duplicate key @@ -383,6 +387,8 @@ def save(self, updatemode=0): if self.request_key: self.add_attribute(self.book, self.request_key, marc=905) + self._update_wikipedia_urls() + self.book.updatemode = 1 # prevent non-cataloguer changes session.commit() @@ -480,6 +486,7 @@ def add_title(self, book, title, marc=245, subtitle=None): title = title.replace(' *_ *', '\n') self.add_attribute(book, title, nonfiling=nonfiling, marc=marc) + def add_attribute(self, book, attr, nonfiling=0, marc=0): if not attr: return @@ -505,6 +512,27 @@ def add_attribute(self, book, attr, nonfiling=0, marc=0): book.attributes.append(Attribute( fk_attriblist=marc, nonfiling=nonfiling, text=attr)) + + def _update_wikipedia_urls(self): + """Sync MARC 500 wiki rows to wikipedia_urls (matched by lang and title).""" + if not self.book: + return + wanted = {check_wikipedia_url(text): text + for text in self.wikipedia_urls + if check_wikipedia_url(text)} + for att in list(self.book.attributes): + if att.fk_attriblist != 500: + continue + checked = 
check_wikipedia_url(att.text) + if not checked: + continue + if checked in wanted: + del wanted[checked] + else: + self.book.attributes.remove(att) + for text in wanted.values(): + self.book.attributes.append(Attribute(fk_attriblist=500, text=text)) + def delete(self): """ only delete the book! """ session = self.get_my_session() diff --git a/libgutenberg/tests/99999.json b/libgutenberg/tests/99999.json index 291ebfb..8be6238 100644 --- a/libgutenberg/tests/99999.json +++ b/libgutenberg/tests/99999.json @@ -26,6 +26,10 @@ "https://archive.org/details/in.ernet.dli.2013.179137", "https://babel.hathitrust.org/cgi/pt?id=uieg.30152019845839" ], + "BOOK_WIKIPEDIA_URLS": [ + "https://en.wikipedia.org/wiki/Test_Book", + "https://en.wikipedia.org/wiki/Another_Book" + ], "CREDIT": "Roger Frank and Sue Clark.", "LANGUAGE": "English", "PUBLISHER": "Frank A. Munsey Company", diff --git a/libgutenberg/tests/test_dc.py b/libgutenberg/tests/test_dc.py index de91afa..31a7f37 100755 --- a/libgutenberg/tests/test_dc.py +++ b/libgutenberg/tests/test_dc.py @@ -9,6 +9,7 @@ from libgutenberg.CommonOptions import Options from libgutenberg import GutenbergDatabase, GutenbergDatabaseDublinCore, DummyConnectionPool from libgutenberg import DBUtils, DublinCoreMapping +from libgutenberg.DublinCore import GutenbergDublinCore, WIKIPEDIA_URL_PREFIX, extract_wikipedia_url from libgutenberg.Logger import debug, warning from libgutenberg.Models import Attribute, Book @@ -51,6 +52,12 @@ def test_orm_metadata(self): dc = DublinCoreMapping.DublinCoreObject() self.metadata_test2(dc) + def test_wikipedia_urls_load(self): + dc = DublinCoreMapping.DublinCoreObject() + dc.load_from_database(2701) # Moby Dick + urls = [extract_wikipedia_url(text) for text in dc.wikipedia_urls] + self.assertIn("https://en.wikipedia.org/wiki/Moby-Dick", urls) + def metadata_test1(self, dc): dc.load_from_database(self.ebook) self.assertEqual(dc.project_gutenberg_id, 20050) @@ -224,7 +231,7 @@ def get_cover(ebook, dc): dc = 
GutenbergDatabaseDublinCore.GutenbergDatabaseDublinCore(self.dummypool) dc.register_coverpage(ebook, 'new_cover') # does nothing to avoid violates foreign key constraint - self.assertEqual(get_cover(ebook, dc), None) + self.assertEqual(get_cover(ebook, dc), None) def tearDown(self): pass @@ -297,6 +304,10 @@ def test_load_from_json(self): self.assertEqual(set_subtitle, "a true story : second line") self.assertEqual(len(dc.authors), 2) self.assertEqual(len(dc.scan_urls), 2) + self.assertEqual( + dc.wikipedia_urls, + [f"{WIKIPEDIA_URL_PREFIX}https://en.wikipedia.org/wiki/Test_Book", + f"{WIKIPEDIA_URL_PREFIX}https://en.wikipedia.org/wiki/Another_Book"]) self.assertEqual(dc.pubinfo.first_year, '1920') self.assertEqual(dc.credit, 'Roger Frank and Sue Clark.') dc.add_credit('Sue Frank and Roger Clark.\n') @@ -312,6 +323,10 @@ def test_load_from_json(self): dc.load_from_database(99999) self.assertEqual(set_title, dc.title) self.assertEqual(set_subtitle, dc.subtitle) + self.assertEqual( + dc.wikipedia_urls, + [f"{WIKIPEDIA_URL_PREFIX}https://en.wikipedia.org/wiki/Test_Book", + f"{WIKIPEDIA_URL_PREFIX}https://en.wikipedia.org/wiki/Another_Book"]) marc260 = dc.session.query(Attribute).filter_by(book=dc.book, fk_attriblist=260).first().text self.assertTrue('1920' in marc260) self.assertEqual( @@ -327,6 +342,11 @@ def test_load_from_json(self): len(dc.session.query(Attribute).filter_by(book=dc.book, fk_attriblist=904).all()), 2) + self.assertEqual( + len(dc.session.query(Attribute).filter_by(book=dc.book, + fk_attriblist=500).filter( + Attribute.text.like('%wikipedia.org%')).all()), + 2) self.assertEqual( dc.session.query(Attribute).filter_by(book=dc.book, fk_attriblist=905).first().text, '20210623194947brand') @@ -337,8 +357,88 @@ def test_load_from_json(self): dc.session.flush() self.assertFalse(DBUtils.ebook_exists(99999)) + def test_wikipedia_url_format(self): + bare = "https://en.wikipedia.org/wiki/Moby-Dick" + prefixed = f"{WIKIPEDIA_URL_PREFIX}{bare}" + custom = f"See 
also: {bare}" + + dc = GutenbergDublinCore() + dc.add_wikipedia_url(bare) + self.assertEqual(dc.wikipedia_urls, [prefixed]) + + dc.add_wikipedia_url(prefixed) + self.assertEqual(dc.wikipedia_urls, [prefixed]) + + dc2 = GutenbergDublinCore() + dc2.add_wikipedia_url(custom) + self.assertEqual(dc2.wikipedia_urls, [custom]) + + dc.add_wikipedia_url(bare) + dc.remove_wikipedia_url(bare) + self.assertEqual(dc.wikipedia_urls, []) + + dc.add_wikipedia_url(bare) + dc.remove_wikipedia_url(prefixed) + self.assertEqual(dc.wikipedia_urls, []) + + dc.add_wikipedia_url(f" {bare} ") + self.assertEqual(dc.wikipedia_urls, [prefixed]) + dc.remove_wikipedia_url(f" {bare} ") + self.assertEqual(dc.wikipedia_urls, []) + + dc.add_wikipedia_url(f"See also: {bare} ") + self.assertEqual(dc.wikipedia_urls, [f"See also: {bare}"]) + + def test_wikipedia_urls_add_and_remove(self): + dc = DublinCoreMapping.DublinCoreObject() + + ebook = 99998 # fake test id + + dc.load_or_create_book(ebook) + dc.rights = 'Public Domain in the USA.' 
+ + url = "https://en.wikipedia.org/wiki/Moby-Dick" + url2 = "https://en.wikipedia.org/wiki/Test_Book" + url3 = "https://en.wikipedia.org/wiki/Herman_Melville" + text1 = f"{WIKIPEDIA_URL_PREFIX}{url}" + text2 = f"{WIKIPEDIA_URL_PREFIX}{url2}" + custom_text = f"Wikipedia page about this author: {url3}" + + dc.add_wikipedia_url(url) + dc.add_wikipedia_url(text2) + dc.add_wikipedia_url(custom_text) + dc.save(updatemode=0) + + dc2 = DublinCoreMapping.DublinCoreObject() + dc2.load_from_database(ebook) + + self.assertEqual(dc2.wikipedia_urls, [text1, text2, custom_text]) + + dc2.remove_wikipedia_url(url) + dc2.save(updatemode=1) + + dc3 = DublinCoreMapping.DublinCoreObject() + dc3.load_from_database(ebook) + + self.assertEqual(dc3.wikipedia_urls, [text2, custom_text]) + + dc3.remove_wikipedia_url(custom_text) + dc3.save(updatemode=1) + + dc4 = DublinCoreMapping.DublinCoreObject() + dc4.load_from_database(ebook) + self.assertEqual(dc4.wikipedia_urls, [text2]) + + dc4.remove_wikipedia_url(text2) + dc4.save(updatemode=1) + + dc5 = DublinCoreMapping.DublinCoreObject() + dc5.load_from_database(ebook) + self.assertEqual(dc5.wikipedia_urls, []) + def tearDown(self): session = DBUtils.check_session(None) DBUtils.remove_author('Lorem Ipsum Jr.', session=session) session.query(Book).filter(Book.pk == 99999).delete() - session.commit() + session.query(Book).filter(Book.pk == 99998).delete() + session.commit() \ No newline at end of file diff --git a/setup.py b/setup.py index 50b4d2c..2a5d2af 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,8 @@ # libgutenberg setup.py # -__version__ = '0.11.0' +__version__ = '0.11.1' + from setuptools import setup