docs: Update HttpCrawler docs about mimetype handling (#3356)

janbuchar · web-flow · commit 7a087ae2b956 · 2026-01-20T15:01:22.000+01:00
diff --git a/packages/cheerio-crawler/src/internals/cheerio-crawler.ts b/packages/cheerio-crawler/src/internals/cheerio-crawler.ts
@@ -120,8 +120,8 @@ export type CheerioRequestHandler<
  * ]
  * ```
  *
- * By default, `CheerioCrawler` only processes web pages with the `text/html`
- * and `application/xhtml+xml` MIME content types (as reported by the `Content-Type` HTTP header),
+ * By default, `CheerioCrawler` only processes web pages with the `text/html`, `application/xhtml+xml`, `text/xml`, `application/xml`,
+ * and `application/json` MIME content types (as reported by the `Content-Type` HTTP header),
  * and skips pages with other content types. If you want the crawler to process other content types,
  * use the {@apilink CheerioCrawlerOptions.additionalMimeTypes} constructor option.
  * Beware that the parsing behavior differs for HTML, XML, JSON and other types of content.
diff --git a/packages/http-crawler/src/internals/http-crawler.ts b/packages/http-crawler/src/internals/http-crawler.ts
@@ -137,7 +137,8 @@ export interface HttpCrawlerOptions<Context extends InternalHttpCrawlingContext
 
     /**
      * An array of [MIME types](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Complete_list_of_MIME_types)
-     * you want the crawler to load and process. By default, only `text/html` and `application/xhtml+xml` MIME types are supported.
+     * you want the crawler to load and process in addition to the defaults. By default, only `text/html`,
+     * `application/xhtml+xml`, `text/xml`, `application/xml`, and `application/json` MIME types are supported.
      */
     additionalMimeTypes?: string[];
 
@@ -291,8 +292,8 @@ export type HttpRequestHandler<
  * ]
  * ```
  *
- * By default, this crawler only processes web pages with the `text/html`
- * and `application/xhtml+xml` MIME content types (as reported by the `Content-Type` HTTP header),
+ * By default, this crawler only processes web pages with the `text/html`, `application/xhtml+xml`, `text/xml`, `application/xml`,
+ * and `application/json` MIME content types (as reported by the `Content-Type` HTTP header),
  * and skips pages with other content types. If you want the crawler to process other content types,
  * use the {@apilink HttpCrawlerOptions.additionalMimeTypes} constructor option.
  * Beware that the parsing behavior differs for HTML, XML, JSON and other types of content.
diff --git a/packages/jsdom-crawler/src/internals/jsdom-crawler.ts b/packages/jsdom-crawler/src/internals/jsdom-crawler.ts
@@ -139,8 +139,8 @@ export type JSDOMRequestHandler<
  * ]
  * ```
  *
- * By default, `JSDOMCrawler` only processes web pages with the `text/html`
- * and `application/xhtml+xml` MIME content types (as reported by the `Content-Type` HTTP header),
+ * By default, `JSDOMCrawler` only processes web pages with the `text/html`, `application/xhtml+xml`, `text/xml`, `application/xml`,
+ * and `application/json` MIME content types (as reported by the `Content-Type` HTTP header),
  * and skips pages with other content types. If you want the crawler to process other content types,
  * use the {@apilink JSDOMCrawlerOptions.additionalMimeTypes} constructor option.
  * Beware that the parsing behavior differs for HTML, XML, JSON and other types of content.
diff --git a/packages/linkedom-crawler/src/internals/linkedom-crawler.ts b/packages/linkedom-crawler/src/internals/linkedom-crawler.ts
@@ -131,8 +131,8 @@ export type LinkeDOMRequestHandler<
  * ]
  * ```
  *
- * By default, `LinkeDOMCrawler` only processes web pages with the `text/html`
- * and `application/xhtml+xml` MIME content types (as reported by the `Content-Type` HTTP header),
+ * By default, `LinkeDOMCrawler` only processes web pages with the `text/html`, `application/xhtml+xml`, `text/xml`, `application/xml`,
+ * and `application/json` MIME content types (as reported by the `Content-Type` HTTP header),
  * and skips pages with other content types. If you want the crawler to process other content types,
  * use the {@apilink LinkeDOMCrawlerOptions.additionalMimeTypes} constructor option.
  * Beware that the parsing behavior differs for HTML, XML, JSON and other types of content.