diff --git a/scraper/pyproject.toml b/scraper/pyproject.toml index 28b5e54..924be5d 100644 --- a/scraper/pyproject.toml +++ b/scraper/pyproject.toml @@ -11,7 +11,7 @@ dependencies = [ "yt-dlp", # youtube-dl should be updated as frequently as possible "jinja2==3.1.4", # use zimscraperlib pinned version once content rewriting functions have been released - "zimscraperlib @ git+https://github.com/openzim/python-scraperlib@main", + "zimscraperlib @ git+https://github.com/openzim/python-scraperlib@small_changes", "requests==2.32.3", "types-requests==2.32.0.20240914", "kiwixstorage==0.9.0", @@ -22,6 +22,9 @@ dependencies = [ "types-beautifulsoup4==4.12.0.20240907", "lxml==5.3.0", "tinycss2==1.3.0", + "pif==0.8.2", + "backoff==2.2.1", + "joblib==1.4.2", ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] diff --git a/scraper/src/mindtouch2zim/asset.py b/scraper/src/mindtouch2zim/asset.py new file mode 100644 index 0000000..f3aa881 --- /dev/null +++ b/scraper/src/mindtouch2zim/asset.py @@ -0,0 +1,247 @@ +from io import BytesIO +from typing import NamedTuple + +import backoff +from kiwixstorage import KiwixStorage, NotFoundError +from pif import get_public_ip +from PIL import Image +from requests import HTTPError +from requests.exceptions import RequestException +from zimscraperlib.download import stream_file +from zimscraperlib.image.optimization import optimize_webp +from zimscraperlib.image.presets import WebpMedium +from zimscraperlib.rewriting.url_rewriting import HttpUrl, ZimPath +from zimscraperlib.zim import Creator + +from mindtouch2zim.constants import logger, web_session +from mindtouch2zim.utils import add_item_for, backoff_hdlr + +SUPPORTED_IMAGE_MIME_TYPES = { + "image/jpeg", + "image/png", + "image/gif", + "image/bmp", + "image/tiff", + "image/webp", + "image/x-portable-pixmap", + "image/x-portable-graymap", + "image/x-portable-bitmap", + "image/x-portable-anymap", + "image/vnd.microsoft.icon", + "image/vnd.ms-dds", + "application/postscript", # for EPS files +} + +WEBP_OPTIONS = WebpMedium().options + + +class HeaderData(NamedTuple): + ident: str # ~version~ of the URL data to use for comparisons + content_type: str | None + + +class AssetDetails(NamedTuple): + urls: set[HttpUrl] + always_fetch_online: bool + + +class AssetProcessor: + + def __init__(self, s3_url_with_credentials: str | None) -> None: + self.s3_url_with_credentials = s3_url_with_credentials + self._setup_s3() + + def process_asset( + self, + asset_path: ZimPath, + asset_details: AssetDetails, + creator: Creator, + ): + logger.debug(f"Processing asset for {asset_path}") + self._process_asset_internal( + asset_path=asset_path, asset_details=asset_details, creator=creator + ) + + @backoff.on_exception( + backoff.expo, + RequestException, + max_time=16, + on_backoff=backoff_hdlr, + ) + def _process_asset_internal( + self, + asset_path: ZimPath, + asset_details: AssetDetails, + creator: Creator, + ): + for asset_url in asset_details.urls: + try: + asset_content = self.get_asset_content( + asset_path=asset_path, + asset_url=asset_url, + always_fetch_online=asset_details.always_fetch_online, + ) + logger.debug( + f"Adding {asset_url.value} to {asset_path.value} in the ZIM" + ) + add_item_for( + creator=creator, + path="content/" + asset_path.value, + content=asset_content.getvalue(), + ) + break # file found and added + except HTTPError as exc: + # would make more sense to be a warning, but this is just too + # verbose, at least on geo.libretexts.org many assets are just + # missing + logger.debug(f"Ignoring {asset_path.value} due to {exc}") + + def _get_header_data_for(self, url: HttpUrl) -> HeaderData: + """Get details from headers for a given url + + - get response headers with GET and streaming (retrieveing only 1 byte) + - we do not HEAD because it is not possible to follow redirects directly + with a HEAD request, and this method is not always implemented / might lie + - extract HeaderData from these response headers and return it + """ + _, headers = stream_file( + url=url.value, + byte_stream=BytesIO(), + block_size=1, + only_first_block=True, + ) + + content_type = headers.get("Content-Type", None) + + for header in ("ETag", "Last-Modified", "Content-Length"): + if header := headers.get(header): + return HeaderData(ident=header, content_type=content_type) + + return HeaderData(ident="-1", content_type=content_type) + + def _get_image_content( + self, asset_path: ZimPath, asset_url: HttpUrl, header_data: HeaderData + ) -> BytesIO: + """Get image content for a given url + + - download from S3 cache if configured and available + - otherwise: + - download from online + - convert to webp + - optimize webp + - upload to S3 cache if configured + """ + meta = {"ident": header_data.ident, "version": str(WebpMedium.VERSION) + ".r"} + s3_key = f"medium/{asset_path.value}" + + if self.s3_url_with_credentials: + if s3_data := self._download_from_s3_cache(s3_key=s3_key, meta=meta): + logger.debug("Fetching directly from S3 cache") + return s3_data # found in cache + + logger.debug("Fetching from online") + unoptimized = self._download_from_online(asset_url=asset_url) + + logger.debug("Optimizing") + optimized = BytesIO() + with Image.open(unoptimized) as img: + img.save(optimized, format="WEBP") + del unoptimized + + optimize_webp( + src=optimized, + quality=WEBP_OPTIONS.get("quality"), # pyright: ignore[reportArgumentType] + method=WEBP_OPTIONS.get("method"), # pyright: ignore[reportArgumentType] + lossless=WEBP_OPTIONS.get( + "lossless" + ), # pyright: ignore[reportArgumentType] + ) + + if self.s3_url_with_credentials: + # upload optimized to S3 + logger.debug("Uploading to S3") + self._upload_to_s3_cache( + s3_key=s3_key, meta=meta, asset_content=BytesIO(optimized.getvalue()) + ) + + return optimized + + def _download_from_s3_cache( + self, s3_key: str, meta: dict[str, str] + ) -> BytesIO | None: + if not self.s3_storage: + raise Exception("s3 storage must be set") + try: + asset_content = BytesIO() + self.s3_storage.download_matching_fileobj( # pyright: ignore[reportUnknownMemberType] + s3_key, asset_content, meta=meta + ) + return asset_content + except NotFoundError: + return None + except Exception as exc: + raise Exception(f"Failed to download {s3_key} from S3 cache") from exc + + def _upload_to_s3_cache( + self, s3_key: str, meta: dict[str, str], asset_content: BytesIO + ): + if not self.s3_storage: + raise Exception("s3 storage must be set") + try: + self.s3_storage.upload_fileobj( # pyright: ignore[reportUnknownMemberType] + key=s3_key, fileobj=asset_content, meta=meta + ) + except Exception as exc: + raise Exception(f"Failed to upload {s3_key} to S3 cache") from exc + + def _download_from_online(self, asset_url: HttpUrl) -> BytesIO: + """Download whole content from online server with retry from scraperlib""" + + asset_content = BytesIO() + stream_file( + asset_url.value, + byte_stream=asset_content, + session=web_session, + ) + return asset_content + + def get_asset_content( + self, asset_path: ZimPath, asset_url: HttpUrl, *, always_fetch_online: bool + ) -> BytesIO: + """Download of a given asset, optimize if needed, or download from S3 cache""" + + if not always_fetch_online: + header_data = self._get_header_data_for(asset_url) + if header_data.content_type: + mime_type = header_data.content_type.split(";")[0].strip() + if mime_type in SUPPORTED_IMAGE_MIME_TYPES: + return self._get_image_content( + asset_path=asset_path, + asset_url=asset_url, + header_data=header_data, + ) + else: + logger.debug(f"Not optimizing, unsupported mime type: {mime_type}") + + return self._download_from_online(asset_url=asset_url) + + def _setup_s3(self): + if not self.s3_url_with_credentials: + return + logger.info("testing S3 Optimization Cache credentials") + self.s3_storage = KiwixStorage(self.s3_url_with_credentials) + if not self.s3_storage.check_credentials( # pyright: ignore[reportUnknownMemberType] + list_buckets=True, bucket=True, write=True, read=True, failsafe=True + ): + logger.error("S3 cache connection error testing permissions.") + logger.error( + f" Server: {self.s3_storage.url.netloc}" # pyright: ignore[reportUnknownMemberType] + ) + logger.error( + f" Bucket: {self.s3_storage.bucket_name}" # pyright: ignore[reportUnknownMemberType] + ) + logger.error( + f" Key ID: {self.s3_storage.params.get('keyid')}" # pyright: ignore[reportUnknownMemberType] + ) + logger.error(f" Public IP: {get_public_ip()}") + raise Exception("Invalid S3 credentials") diff --git a/scraper/src/mindtouch2zim/constants.py b/scraper/src/mindtouch2zim/constants.py index 958e2db..ee42963 100644 --- a/scraper/src/mindtouch2zim/constants.py +++ b/scraper/src/mindtouch2zim/constants.py @@ -2,9 +2,7 @@ import pathlib from zimscraperlib.download import get_session -from zimscraperlib.logging import ( - getLogger, -) +from zimscraperlib.logging import DEFAULT_FORMAT_WITH_THREADS, getLogger from mindtouch2zim.__about__ import __version__ @@ -18,6 +16,6 @@ HTTP_TIMEOUT_NORMAL_SECONDS = 15 HTTP_TIMEOUT_LONG_SECONDS = 30 -logger = getLogger(NAME, level=logging.DEBUG) +logger = getLogger(NAME, level=logging.DEBUG, log_format=DEFAULT_FORMAT_WITH_THREADS) web_session = get_session() diff --git a/scraper/src/mindtouch2zim/entrypoint.py b/scraper/src/mindtouch2zim/entrypoint.py index 208d203..9b08801 100644 --- a/scraper/src/mindtouch2zim/entrypoint.py +++ b/scraper/src/mindtouch2zim/entrypoint.py @@ -8,7 +8,7 @@ MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH, RECOMMENDED_MAX_TITLE_LENGTH, ) -from zimscraperlib.zim.filesystem import validate_zimfile_creatable +from zimscraperlib.zim.filesystem import validate_folder_writable from mindtouch2zim.client import MindtouchClient from mindtouch2zim.constants import ( @@ -218,17 +218,31 @@ def main(tmpdir: str) -> None: dest="illustration_url", ) + parser.add_argument( + "--optimization-cache", + help="URL with credentials to S3 for using as optimization cache", + dest="s3_url_with_credentials", + ) + + parser.add_argument( + "--assets-workers", + type=int, + help=("Number of parallel workers for asset processing (default: 10)"), + default=10, + dest="assets_workers", + ) + args = parser.parse_args() logger.setLevel(level=logging.DEBUG if args.debug else logging.INFO) output_folder = Path(args.output_folder) output_folder.mkdir(exist_ok=True) - validate_zimfile_creatable(output_folder, "test.txt") + validate_folder_writable(output_folder) tmp_folder = Path(args.tmp_folder) tmp_folder.mkdir(exist_ok=True) - validate_zimfile_creatable(tmp_folder, "test.txt") + validate_folder_writable(tmp_folder) library_url = str(args.library_url).rstrip("/") @@ -253,6 +267,8 @@ def main(tmpdir: str) -> None: stats_file=Path(args.stats_filename) if args.stats_filename else None, overwrite_existing_zim=args.overwrite, illustration_url=args.illustration_url, + s3_url_with_credentials=args.s3_url_with_credentials, + assets_workers=args.assets_workers, ).run() except SystemExit: logger.error("Generation failed, exiting") diff --git a/scraper/src/mindtouch2zim/processor.py b/scraper/src/mindtouch2zim/processor.py index 90c9c24..bb52927 100644 --- a/scraper/src/mindtouch2zim/processor.py +++ b/scraper/src/mindtouch2zim/processor.py @@ -6,12 +6,13 @@ from io import BytesIO from pathlib import Path +import backoff +from joblib import Parallel, delayed from pydantic import BaseModel +from requests import RequestException from requests.exceptions import HTTPError from schedule import every, run_pending -from zimscraperlib.download import ( - stream_file, # pyright: ignore[reportUnknownVariableType] -) +from zimscraperlib.download import stream_file from zimscraperlib.image import convert_image, resize_image from zimscraperlib.image.conversion import convert_svg2png from zimscraperlib.image.probing import format_for @@ -24,9 +25,10 @@ ZimPath, ) from zimscraperlib.zim import Creator -from zimscraperlib.zim.filesystem import validate_zimfile_creatable +from zimscraperlib.zim.filesystem import validate_file_creatable from zimscraperlib.zim.indexing import IndexData +from mindtouch2zim.asset import AssetDetails, AssetProcessor from mindtouch2zim.client import ( LibraryPage, LibraryPageId, @@ -50,6 +52,7 @@ PageModel, SharedModel, ) +from mindtouch2zim.utils import add_item_for, backoff_hdlr from mindtouch2zim.zimconfig import ZimConfig @@ -121,42 +124,6 @@ def is_selected( return [page for page in page_tree.pages.values() if page.id in selected_ids] -def add_item_for( - creator: Creator, - path: str, - title: str | None = None, - *, - fpath: Path | None = None, - content: bytes | str | None = None, - mimetype: str | None = None, - is_front: bool | None = None, - should_compress: bool | None = None, - delete_fpath: bool | None = False, - duplicate_ok: bool | None = None, - index_data: IndexData | None = None, - auto_index: bool = True, -): - """ - Boilerplate to avoid repeating pyright ignore - - To be removed, once upstream issue is solved, see - https://github.com/openzim/libretexts/issues/26 - """ - creator.add_item_for( # pyright: ignore[reportUnknownMemberType] - path=path, - title=title, - fpath=fpath, - content=content, - mimetype=mimetype, - is_front=is_front, - should_compress=should_compress, - delete_fpath=delete_fpath, - duplicate_ok=duplicate_ok, - index_data=index_data, - auto_index=auto_index, - ) - - class Processor: """Generates ZIMs based on the user's configuration.""" @@ -169,6 +136,8 @@ def __init__( zimui_dist: Path, stats_file: Path | None, illustration_url: str | None, + s3_url_with_credentials: str | None, + assets_workers: int, *, overwrite_existing_zim: bool, ) -> None: @@ -191,6 +160,12 @@ def __init__( self.stats_file = stats_file self.overwrite_existing_zim = overwrite_existing_zim self.illustration_url = illustration_url + self.asset_processor = AssetProcessor( + s3_url_with_credentials=s3_url_with_credentials + ) + self.asset_executor = Parallel( + n_jobs=assets_workers, return_as="generator_unordered", backend="threading" + ) self.stats_items_done = 0 # we add 1 more items to process so that progress is not 100% at the beginning @@ -228,7 +203,7 @@ def run(self) -> Path: logger.error(f" {zim_path} already exists, aborting.") raise SystemExit(2) - validate_zimfile_creatable(self.output_folder, zim_file_name) + validate_file_creatable(self.output_folder, zim_file_name) logger.info(f" Writing to: {zim_path}") @@ -334,7 +309,7 @@ def run(self) -> Path: add_item_for(creator, "content/logo.png", content=welcome_image.getvalue()) del welcome_image - self.items_to_download: dict[ZimPath, set[HttpUrl]] = {} + self.items_to_download: dict[ZimPath, AssetDetails] = {} self._process_css( css_location=home.screen_css_url, target_filename="screen.css", @@ -416,31 +391,25 @@ def run(self) -> Path: logger.info(f" Retrieving {len(self.items_to_download)} assets...") self.stats_items_total += len(self.items_to_download) - for asset_path, asset_urls in self.items_to_download.items(): - self.stats_items_done += 1 - run_pending() - for asset_url in asset_urls: - try: - asset_content = BytesIO() - stream_file( - asset_url.value, - byte_stream=asset_content, - session=web_session, - ) - logger.debug( - f"Adding {asset_url.value} to {asset_path.value} in the ZIM" - ) - add_item_for( - creator, - "content/" + asset_path.value, - content=asset_content.getvalue(), - ) - break # file found and added - except HTTPError as exc: - # would make more sense to be a warning, but this is just too - # verbose, at least on geo.libretexts.org many assets are just - # missing - logger.debug(f"Ignoring {asset_path.value} due to {exc}") + + try: + res = self.asset_executor( + delayed(self.asset_processor.process_asset)( + asset_path, asset_details, creator + ) + for asset_path, asset_details in self.items_to_download.items() + ) + for _ in res: + self.stats_items_done += 1 + run_pending() + except Exception as exc: + logger.error( + "Exception occured during assets processing, aborting ZIM creation", + exc_info=exc, + ) + creator.can_finish = False + + logger.info(f"ZIM creation completed, ZIM is at {zim_path}") # same reason than self.stats_items_done = 1 at the beginning, we need to add # a final item to complete the progress @@ -479,11 +448,19 @@ def _process_css( # to use last URL encountered. for path, urls in url_rewriter.items_to_download.items(): if path in self.items_to_download: - self.items_to_download[path].update(urls) + self.items_to_download[path].urls.update(urls) else: - self.items_to_download[path] = urls + self.items_to_download[path] = AssetDetails( + urls=urls, always_fetch_online=True + ) add_item_for(creator, f"content/{target_filename}", content=result) + @backoff.on_exception( + backoff.expo, + RequestException, + max_time=16, + on_backoff=backoff_hdlr, + ) def _process_page( self, creator: Creator, page: LibraryPage, existing_zim_paths: set[ZimPath] ): @@ -506,9 +483,11 @@ def _process_page( rewriten = rewriter.rewrite(page_content.html_body) for path, urls in url_rewriter.items_to_download.items(): if path in self.items_to_download: - self.items_to_download[path].update(urls) + self.items_to_download[path].urls.update(urls) else: - self.items_to_download[path] = urls + self.items_to_download[path] = AssetDetails( + urls=urls, always_fetch_online=False + ) add_item_for( creator, f"content/page_content_{page.id}.json", @@ -579,7 +558,7 @@ def _fetch_favicon_from_illustration(self, illustration: BytesIO) -> BytesIO: """Return a converted version of the illustration into favicon""" favicon = BytesIO() convert_image(illustration, favicon, fmt="ICO") - logger.debug("Resizing ZIM illustration") + logger.debug("Resizing ZIM favicon") resize_image( src=favicon, width=32, diff --git a/scraper/src/mindtouch2zim/utils.py b/scraper/src/mindtouch2zim/utils.py index 594e6a1..bdd124f 100644 --- a/scraper/src/mindtouch2zim/utils.py +++ b/scraper/src/mindtouch2zim/utils.py @@ -1,6 +1,12 @@ from pathlib import Path +from typing import Any from urllib.parse import urlparse +from zimscraperlib.zim import Creator +from zimscraperlib.zim.indexing import IndexData + +from mindtouch2zim.constants import logger + def get_asset_path_from_url(online_url: str, already_used_paths: list[Path]) -> Path: """Computes the path where one should store its asset based on its online URL @@ -52,3 +58,47 @@ def is_better_srcset_descriptor( if current_best_descriptor[-1:] != new_descriptor[-1:]: return False return int(new_descriptor[:-1]) > int(current_best_descriptor[:-1]) + + +def add_item_for( + creator: Creator, + path: str, + title: str | None = None, + *, + fpath: Path | None = None, + content: bytes | str | None = None, + mimetype: str | None = None, + is_front: bool | None = None, + should_compress: bool | None = None, + delete_fpath: bool | None = False, + duplicate_ok: bool | None = None, + index_data: IndexData | None = None, + auto_index: bool = True, +): + """ + Boilerplate to avoid repeating pyright ignore + + To be removed, once upstream issue is solved, see + https://github.com/openzim/libretexts/issues/26 + """ + creator.add_item_for( # pyright: ignore[reportUnknownMemberType] + path=path, + title=title, + fpath=fpath, + content=content, + mimetype=mimetype, + is_front=is_front, + should_compress=should_compress, + delete_fpath=delete_fpath, + duplicate_ok=duplicate_ok, + index_data=index_data, + auto_index=auto_index, + ) + + +def backoff_hdlr(details: Any): + """Default backoff handler to log something when backoff occurs""" + logger.warning( + "Request error, starting backoff of {wait:0.1f} seconds after {tries} " + "tries".format(**details) + ) diff --git a/zimui/package.json b/zimui/package.json index a9d2c2b..0341aaa 100644 --- a/zimui/package.json +++ b/zimui/package.json @@ -23,7 +23,8 @@ "vite-plugin-vuetify": "^2.0.4", "vue": "^3.4.29", "vue-router": "^4.3.3", - "vuetify": "^3.7.2" + "vuetify": "^3.7.2", + "webp-hero": "^0.0.2" }, "devDependencies": { "@mdi/font": "^7.4.47", diff --git a/zimui/src/stores/main.ts b/zimui/src/stores/main.ts index 4f0ae51..a17655b 100644 --- a/zimui/src/stores/main.ts +++ b/zimui/src/stores/main.ts @@ -2,6 +2,7 @@ import { defineStore } from 'pinia' import axios, { AxiosError } from 'axios' import type { PageContent, Shared, SharedPage } from '@/types/shared' import mathjaxService from '@/services/mathjax' +import { WebpMachine, detectWebpSupport } from 'webp-hero' export type RootState = { shared: Shared | null @@ -53,22 +54,32 @@ export const useMainStore = defineStore('main', { this.errorMessage = '' this.errorDetails = '' - return axios.get(`./content/page_content_${page.id}.json`).then( - (response) => { - this.isLoading = false - this.pageContent = response.data as PageContent - mathjaxService.removeMathJax() - mathjaxService.addMathJax(mathjaxService.frontFromTitle(page.title)) - }, - (error) => { - this.isLoading = false - this.shared = null - this.errorMessage = `Failed to load page content for page ${page.id}` - if (error instanceof AxiosError) { - this.handleAxiosError(error) + return axios + .get(`./content/page_content_${page.id}.json`) + .then( + (response) => { + this.isLoading = false + this.pageContent = response.data as PageContent + mathjaxService.removeMathJax() + mathjaxService.addMathJax(mathjaxService.frontFromTitle(page.title)) + }, + (error) => { + this.isLoading = false + this.shared = null + this.errorMessage = `Failed to load page content for page ${page.id}` + if (error instanceof AxiosError) { + this.handleAxiosError(error) + } } - } - ) + ) + .then(detectWebpSupport) + .then((supported) => { + if (!supported) { + console.log('Polyfilling WebP') + const webpMachine = new WebpMachine() + webpMachine.polyfillDocument() + } + }) }, checkResponseObject(response: unknown, msg: string = '') { if (response === null || typeof response !== 'object') { diff --git a/zimui/yarn.lock b/zimui/yarn.lock index 66756d0..69f2c88 100644 --- a/zimui/yarn.lock +++ b/zimui/yarn.lock @@ -4811,6 +4811,11 @@ webidl-conversions@^7.0.0: resolved "https://registry.yarnpkg.com/webidl-conversions/-/webidl-conversions-7.0.0.tgz#256b4e1882be7debbf01d05f0aa2039778ea080a" integrity sha512-VwddBukDzu71offAQR975unBIGqfKZpM+8ZX6ySk8nYhVoo5CYaZyzt3YBvYtRtO+aoGlqxPg/B87NGVZ/fu6g== +webp-hero@^0.0.2: + version "0.0.2" + resolved "https://registry.yarnpkg.com/webp-hero/-/webp-hero-0.0.2.tgz#7adf20435ecfca73c764a3ad532a4304f15db52d" + integrity sha512-XDN8k2DZerXAawblkGKbcRXAz3WjY6Id5fTmrsOvblzFg5jELfoDCOxRDHD3zIGJo3OPEjLRsVS6Kzl36HxjqA== + whatwg-encoding@^3.1.1: version "3.1.1" resolved "https://registry.yarnpkg.com/whatwg-encoding/-/whatwg-encoding-3.1.1.tgz#d0f4ef769905d426e1688f3e34381a99b60b76e5"