Skip to content
19 changes: 15 additions & 4 deletions packages/basic-crawler/src/internals/basic-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@ import { cryptoRandomObjectId } from '@apify/utilities';

import { createSendRequest } from './send-request.js';

import { z } from 'zod';

export interface BasicCrawlingContext<UserData extends Dictionary = Dictionary> extends CrawlingContext<UserData> {}

/**
Expand Down Expand Up @@ -134,6 +136,7 @@ export interface BasicCrawlerOptions<
Context extends CrawlingContext = CrawlingContext,
ContextExtension = Dictionary<never>,
ExtendedContext extends Context = Context & ContextExtension,
State extends StatisticState = StatisticState,
> {
/**
* User-provided function that performs the logic of the crawler. It is called for each URL to crawl.
Expand Down Expand Up @@ -384,7 +387,12 @@ export interface BasicCrawlerOptions<
* Customize the way statistics collecting works, such as logging interval or
* whether to output them to the Key-Value store.
*/
statisticsOptions?: StatisticsOptions;
statisticsOptions?: StatisticsOptions<State>;

/**
* Optional custom Zod schema for extending crawler statistics.
*/
statisticsStateSchema?: z.ZodType<State, any, any>;

/**
* HTTP client implementation for the `sendRequest` context helper and for plain HTTP crawling.
Expand Down Expand Up @@ -532,6 +540,7 @@ export class BasicCrawler<
Context extends CrawlingContext = CrawlingContext,
ContextExtension = Dictionary<never>,
ExtendedContext extends Context = Context & ContextExtension,
State extends StatisticState = StatisticState,
> {
protected static readonly CRAWLEE_STATE_KEY = 'CRAWLEE_STATE';

Expand All @@ -544,7 +553,7 @@ export class BasicCrawler<
/**
* A reference to the underlying {@apilink Statistics} class that collects and logs run statistics for requests.
*/
readonly stats: Statistics;
readonly stats: Statistics<State>;

/**
* A reference to the underlying {@apilink RequestList} class that manages the crawler's {@apilink Request|requests}.
Expand Down Expand Up @@ -712,6 +721,7 @@ export class BasicCrawler<
experiments: ow.optional.object,

statisticsOptions: ow.optional.object,
statisticsStateSchema: ow.optional.object,

id: ow.optional.string,
};
Expand All @@ -720,7 +730,7 @@ export class BasicCrawler<
* All `BasicCrawler` parameters are passed via an options object.
*/
constructor(
options: BasicCrawlerOptions<Context, ContextExtension, ExtendedContext> &
options: BasicCrawlerOptions<Context, ContextExtension, ExtendedContext, State> &
RequireContextPipeline<CrawlingContext, Context> = {} as any, // cast because the constructor logic handles missing `contextPipelineBuilder` - the type is just for DX
) {
ow(options, 'BasicCrawlerOptions', ow.object.exactShape(BasicCrawler.optionsShape));
Expand Down Expand Up @@ -859,7 +869,8 @@ export class BasicCrawler<
this.maxCrawlDepth = maxCrawlDepth;
this.sameDomainDelayMillis = sameDomainDelaySecs * 1000;
this.maxSessionRotations = maxSessionRotations;
this.stats = new Statistics({
this.stats = new Statistics<State>({
stateSchema: options.statisticsStateSchema,
logMessage: `${this.constructor.name} request statistics:`,
log: this.log,
...(this.hasExplicitId ? { id: this.crawlerId } : {}),
Expand Down
3 changes: 2 additions & 1 deletion packages/core/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@
"tldts": "^7.0.6",
"tough-cookie": "^6.0.0",
"tslib": "^2.8.1",
"type-fest": "^4.41.0"
"type-fest": "^4.41.0",
"zod": "^4.3.6"
}
}
86 changes: 46 additions & 40 deletions packages/core/src/crawlers/statistics.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import type { CrawleeLogger } from '../log.js';
import { serviceLocator } from '../service_locator.js';
import { KeyValueStore } from '../storages/key_value_store.js';
import { ErrorTracker } from './error_tracker.js';
import { z } from 'zod';

/**
* @ignore
Expand Down Expand Up @@ -54,7 +55,7 @@ export interface PersistenceOptions {
*
* @category Crawlers
*/
export class Statistics {
export class Statistics<State extends StatisticState = StatisticState> {
private static id = 0;

/**
Expand All @@ -75,7 +76,7 @@ export class Statistics {
/**
* Current statistic state used for doing calculations on {@apilink Statistics.calculate} calls
*/
state!: StatisticState;
state!: State;

/**
* Contains the current retries histogram. Index 0 means 0 retries, index 2, 2 retries, and so on
Expand All @@ -93,6 +94,7 @@ export class Statistics {
private logInterval: unknown;
private _events?: EventManager;
private persistenceOptions: PersistenceOptions;
private stateSchema: z.ZodType<State>;

private get events(): EventManager {
if (!this._events) {
Expand All @@ -104,7 +106,7 @@ export class Statistics {
/**
* @internal
*/
constructor(options: StatisticsOptions = {}) {
constructor(options: StatisticsOptions<State> = {}) {
ow(
options,
ow.object.exactShape({
Expand All @@ -115,6 +117,7 @@ export class Statistics {
persistenceOptions: ow.optional.object,
saveErrorSnapshots: ow.optional.boolean,
id: ow.optional.any(ow.number, ow.string),
stateSchema: ow.optional.object,
}),
);

Expand All @@ -141,6 +144,8 @@ export class Statistics {
this.listener = this.persistState.bind(this);
this.persistenceOptions = persistenceOptions;

this.stateSchema = options.stateSchema ?? (StatisticStateSchema as unknown as z.ZodType<State>);

// initialize by "resetting"
this.reset();
}
Expand All @@ -152,24 +157,10 @@ export class Statistics {
this.errorTracker.reset();
this.errorTrackerRetry.reset();

this.state = {
requestsFinished: 0,
requestsFailed: 0,
requestsRetries: 0,
requestsFailedPerMinute: 0,
requestsFinishedPerMinute: 0,
requestMinDurationMillis: Infinity,
requestMaxDurationMillis: 0,
requestTotalFailedDurationMillis: 0,
requestTotalFinishedDurationMillis: 0,
crawlerStartedAt: null,
crawlerFinishedAt: null,
statsPersistedAt: null,
crawlerRuntimeMillis: 0,
requestsWithStatusCode: {},
this.state = this.stateSchema.parse({
errors: this.errorTracker.result,
retryErrors: this.errorTrackerRetry.result,
};
});

this.requestRetryHistogram.length = 0;
this.requestsInProgress.clear();
Expand Down Expand Up @@ -419,7 +410,7 @@ export class Statistics {
statsId: this.id,
statsPersistedAt: new Date().toISOString(),
...this.calculate(),
};
} as unknown as StatisticPersistedState;

Reflect.deleteProperty(result, 'requestsWithStatusCode');
Reflect.deleteProperty(result, 'errors');
Expand All @@ -436,7 +427,7 @@ export class Statistics {
/**
* Configuration for the {@apilink Statistics} instance used by the crawler
*/
export interface StatisticsOptions {
export interface StatisticsOptions<State extends StatisticState = StatisticState> {
/**
* Interval in seconds to log the current statistics
* @default 60
Expand Down Expand Up @@ -481,6 +472,11 @@ export interface StatisticsOptions {
* if crawler creation order changes.
*/
id?: string;

/**
* Optional custom Zod schema for extending crawler statistics.
*/
stateSchema?: z.ZodType<State, any, any>;
}

/**
Expand All @@ -498,23 +494,33 @@ export interface StatisticPersistedState extends Omit<StatisticState, 'statsPers
}

/**
* Contains the statistics state
* Contains the Zod statistics state schema
*/
export interface StatisticState {
requestsFinished: number;
requestsFailed: number;
requestsRetries: number;
requestsFailedPerMinute: number;
requestsFinishedPerMinute: number;
requestMinDurationMillis: number;
requestMaxDurationMillis: number;
requestTotalFailedDurationMillis: number;
requestTotalFinishedDurationMillis: number;
crawlerStartedAt: Date | string | null;
crawlerFinishedAt: Date | string | null;
crawlerRuntimeMillis: number;
statsPersistedAt: Date | string | null;
errors: Record<string, unknown>;
retryErrors: Record<string, unknown>;
requestsWithStatusCode: Record<string, number>;
}

/**
 * Zod schema for the default crawler statistics state.
 *
 * `Statistics.reset()` builds a fresh state by parsing a partial object through
 * this schema, so every field carries a `.default()`. Extend the schema
 * (`StatisticStateSchema.extend({ ... })`) to track custom metrics.
 */
export const StatisticStateSchema = z.object({
    requestsFinished: z.number().default(0),
    requestsFailed: z.number().default(0),
    requestsRetries: z.number().default(0),
    requestsFailedPerMinute: z.number().default(0),
    requestsFinishedPerMinute: z.number().default(0),
    // NOTE(review): Zod 4's z.number() rejects non-finite parsed input, and
    // Infinity serializes to `null` via JSON.stringify — confirm that persisted
    // states containing this field round-trip. The `.default()` itself
    // short-circuits parsing, so reset() is unaffected.
    requestMinDurationMillis: z.number().default(Infinity),
    requestMaxDurationMillis: z.number().default(0),
    requestTotalFailedDurationMillis: z.number().default(0),
    requestTotalFinishedDurationMillis: z.number().default(0),

    // Persisted states store dates as ISO strings; coerce them back to Date.
    crawlerStartedAt: z.coerce.date().nullable().default(null),
    crawlerFinishedAt: z.coerce.date().nullable().default(null),
    crawlerRuntimeMillis: z.number().default(0),

    statsPersistedAt: z.coerce.date().nullable().default(null),

    // z.custom (not z.record) so parsing passes the input through by reference:
    // reset() assigns `errorTracker.result` here and a copying schema would
    // sever that link. Typed generics avoid leaking `any` into StatisticState.
    // Factory defaults are essential — a plain `.default({})` would reuse ONE
    // shared object across every parse, leaking counts between instances and
    // across reset() calls.
    errors: z.custom<Record<string, unknown>>().default(() => ({})),
    retryErrors: z.custom<Record<string, unknown>>().default(() => ({})),
    requestsWithStatusCode: z.custom<Record<string, number>>().default(() => ({})),
});

/**
 * Statistics state interface inferred from {@apilink StatisticStateSchema} above.
 */
export type StatisticState = z.infer<typeof StatisticStateSchema>;
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import { isDeepStrictEqual } from 'node:util';

import { BasicCrawler } from '@crawlee/basic';
import type { BasicCrawlerOptions, BrowserHook, LoadedRequest, Request } from '@crawlee/browser';
import { extractUrlsFromPage } from '@crawlee/browser';
import { browserCrawlerEnqueueLinks, extractUrlsFromPage } from '@crawlee/browser';
import type { CheerioCrawlingContext } from '@crawlee/cheerio';
import { CheerioCrawler } from '@crawlee/cheerio';
import type {
Expand All @@ -25,6 +25,7 @@ import {
Router,
serviceLocator,
Statistics,
StatisticStateSchema,
withCheckedStorageAccess,
} from '@crawlee/core';
import type { BatchAddRequestsResult, Dictionary } from '@crawlee/types';
Expand All @@ -40,6 +41,8 @@ import type { PlaywrightCrawlingContext, PlaywrightGotoOptions } from './playwri
import { PlaywrightCrawler } from './playwright-crawler.js';
import { type RenderingType, RenderingTypePredictor } from './utils/rendering-type-prediction.js';

import { z } from 'zod';

type Result<TResult> =
| { result: TResult; ok: true; logs?: LogProxyCall[] }
| { error: unknown; ok: false; logs?: LogProxyCall[] };
Expand All @@ -60,7 +63,22 @@ class AdaptivePlaywrightCrawlerStatistics extends Statistics {
override state: AdaptivePlaywrightCrawlerStatisticState = null as any; // this needs to be assigned for a valid override, but the initialization is done by a reset() call from the parent constructor

constructor(options: StatisticsOptions = {}) {
    // Counters specific to the adaptive crawler, layered on top of the default
    // statistics state. Keys must match what the track*() methods increment:
    // the original `trackHttpOnlyRequestHandlerRuns` key mixed the method-name
    // prefix into the counter name, leaving the real `httpOnlyRequestHandlerRuns`
    // counter without a default (incrementing `undefined` yields NaN).
    const baseAdaptiveSchema = StatisticStateSchema.extend({
        httpOnlyRequestHandlerRuns: z.number().default(0),
        browserRequestHandlerRuns: z.number().default(0),
        renderingTypeMispredictions: z.number().default(0),
    });

    // Layer a user-provided schema on top, if any.
    // NOTE(review): merge() requires a ZodObject — the cast hides that
    // a plain ZodType in options.stateSchema would fail at runtime; confirm
    // callers only pass object schemas.
    const finalSchema = options.stateSchema
        ? baseAdaptiveSchema.merge(options.stateSchema as any)
        : baseAdaptiveSchema;

    super({
        ...options,
        stateSchema: finalSchema as any,
    });

    // Defensive: the parent constructor already calls reset() with the final
    // schema; this extra reset is idempotent right after construction.
    this.reset();
}

Expand Down Expand Up @@ -391,6 +409,7 @@ export class AdaptivePlaywrightCrawler<
this.stats = new AdaptivePlaywrightCrawlerStatistics({
logMessage: `${this.log.getOptions().prefix} request statistics:`,
...statisticsOptions,
stateSchema: options.statisticsStateSchema,
});

this.preventDirectStorageAccess = preventDirectStorageAccess;
Expand Down
3 changes: 2 additions & 1 deletion packages/types/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
},
"dependencies": {
"tough-cookie": "^6.0.0",
"tslib": "^2.8.1"
"tslib": "^2.8.1",
"zod": "^4.3.6"
}
}
51 changes: 49 additions & 2 deletions test/core/crawlers/adaptive_playwright_crawler.test.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import type { Server } from 'node:http';
import type { AddressInfo } from 'node:net';

import { Configuration, type Dictionary, EventType, KeyValueStore, serviceLocator } from '@crawlee/core';
import { Configuration, StatisticStateSchema, type Dictionary, EventType, KeyValueStore, serviceLocator } from '@crawlee/core';
import type {
AdaptivePlaywrightCrawlerContext,
AdaptivePlaywrightCrawlerOptions,
Expand All @@ -14,6 +14,8 @@ import express from 'express';
import { startExpressAppPromise } from 'test/shared/_helper.js';
import { MemoryStorageEmulator } from 'test/shared/MemoryStorageEmulator.js';

import { z } from 'zod';

describe('AdaptivePlaywrightCrawler', () => {
// Set up an express server that will serve test pages
const HOSTNAME = '127.0.0.1';
Expand Down Expand Up @@ -538,5 +540,50 @@ describe('AdaptivePlaywrightCrawler', () => {

// This should not throw since we've persisted valid state
await expect(newPredictor.initialize()).resolves.not.toThrow();
});
}),
test('preserves both custom user metrics and internal adaptive metrics', async () => {
    // User-supplied schema extending the default statistics state.
    const customSchema = StatisticStateSchema.extend({
        customAdaptiveMetric: z.number().default(0),
    });

    // TypeScript view of the extended state for the handlers below.
    type CustomState = z.infer<typeof customSchema>;

    // Shared by both handlers — exactly one of them runs for the single
    // request, so each expectation below fires exactly once.
    const recordMetrics = () => {
        // Bump the user-defined metric...
        (crawler.stats.state as CustomState).customAdaptiveMetric += 42;
        // ...and simulate the adaptive crawler's internal tracking.
        (crawler.stats as any).trackRenderingTypeMisprediction();
    };

    const crawler = new AdaptivePlaywrightCrawler({
        statisticsStateSchema: customSchema as any,
        maxRequestsPerCrawl: 1,

        // Isolate the test data
        configuration: new Configuration({
            purgeOnStart: true,
        }),

        requestHandler: async () => recordMetrics(),
        failedRequestHandler: async () => recordMetrics(),
    });

    await crawler.run([`http://${HOSTNAME}:${port}/static`]);

    const finalState = crawler.stats.state as any;

    // Both the custom metric and the internal adaptive counter survived.
    expect(finalState.customAdaptiveMetric).toBe(42);
    expect(finalState.renderingTypeMispredictions).toBe(1);
}, 30000);
});
Loading