Skip to content
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,4 @@ test/e2e/**/packages

# Local vitest config overrides
vitest.config.local.mts
.worktrees/
40 changes: 40 additions & 0 deletions packages/basic-crawler/src/internals/basic-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ import {
GotScrapingHttpClient,
KeyValueStore,
mergeCookies,
Monitor,
type MonitorOptions,
NonRetryableError,
purgeDefaultStorages,
RequestListAdapter,
Expand Down Expand Up @@ -405,6 +407,25 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
* Defaults to a new instance of {@apilink GotScrapingHttpClient}
*/
httpClient?: BaseHttpClient;

/**
* Enables monitor mode: a compact real-time status block printed to `process.stderr` during the crawl.
*
* In interactive terminals (TTY), the block overwrites itself in-place.
* In non-TTY environments (CI, piped output), plain lines are printed instead.
*
* @default false
* @example
* ```ts
* const crawler = new BasicCrawler({ monitor: true });
* ```
*/
monitor?: boolean;

/**
* Options for the monitor display. Only used when `monitor` is `true`.
*/
monitorOptions?: MonitorOptions;
}

/**
Expand Down Expand Up @@ -574,6 +595,8 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
protected retryOnBlocked: boolean;
protected respectRobotsTxtFile: boolean | { userAgent?: string };
protected onSkippedRequest?: SkippedRequestCallback;
protected monitorEnabled: boolean;
protected monitorOptions: MonitorOptions;
private _closeEvents?: boolean;
private loggedPerRun = new Set<string>();
private experiments: CrawlerExperiments;
Expand Down Expand Up @@ -612,6 +635,8 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
respectRobotsTxtFile: ow.optional.any(ow.boolean, ow.object),
onSkippedRequest: ow.optional.function,
httpClient: ow.optional.object,
monitor: ow.optional.boolean,
monitorOptions: ow.optional.object,

// AutoscaledPool shorthands
minConcurrency: ow.optional.number,
Expand Down Expand Up @@ -679,6 +704,8 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext

statisticsOptions,
httpClient,
monitor: monitorEnabled = false,
monitorOptions = {},
} = options;

if (requestManager !== undefined) {
Expand All @@ -696,6 +723,8 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext

this.httpClient = httpClient ?? new GotScrapingHttpClient();
this.log = log;
this.monitorEnabled = monitorEnabled;
this.monitorOptions = monitorOptions;
this.statusMessageLoggingInterval = statusMessageLoggingInterval;
this.statusMessageCallback = statusMessageCallback as StatusMessageCallback;
this.events = config.getEventManager();
Expand Down Expand Up @@ -937,6 +966,10 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
};

const log = async () => {
// When monitor mode is active, it owns the display — skip the periodic log to avoid
// interleaving plain log lines with ANSI cursor-movement sequences.
if (this.monitorEnabled) return;

const { mode: operationMode, failedDelta } = getOperationMode();
let message: string;

Expand Down Expand Up @@ -1038,9 +1071,16 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext

let stats = {} as FinalStatistics;

const monitor = this.monitorEnabled
? new Monitor(this.stats, this.autoscaledPool, this.monitorOptions, () => this.requestManager?.getTotalCount())
: null;

monitor?.start();

try {
await this.autoscaledPool!.run();
} finally {
monitor?.stop();
await this.teardown();
await this.stats.stopCapturing();

Expand Down
1 change: 1 addition & 0 deletions packages/core/src/crawlers/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ export * from './crawler_utils';
export * from './statistics';
export * from './error_tracker';
export * from './error_snapshotter';
export * from './monitor';
160 changes: 160 additions & 0 deletions packages/core/src/crawlers/monitor.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
import os from 'node:os';

import type { AutoscaledPool } from '../autoscaling/autoscaled_pool';
import type { Statistics } from './statistics';

export interface MonitorOptions {
    /**
     * How often to refresh the monitor display, in seconds.
     *
     * The value is multiplied by 1000 and handed straight to `setInterval` without
     * validation — presumably callers pass a sane positive number (TODO confirm
     * whether zero/negative values should be rejected).
     * @default 5
     */
    intervalSecs?: number;
}

// Number of lines produced by `Monitor.buildLines()`. Used for the ANSI cursor-up
// sequences that overwrite/clear the block in TTY mode, so it MUST stay in sync
// with the length of the array returned from `buildLines()`.
const MONITOR_LINE_COUNT = 5;

/**
 * Left-pads the decimal representation of `n` with zeroes until it is at least
 * `width` characters long (used for `HH:MM:SS` components).
 */
function padStart(n: number, width = 2): string {
    let text = `${n}`;
    while (text.length < width) {
        text = `0${text}`;
    }
    return text;
}

/**
 * Formats a millisecond duration as a zero-padded `HH:MM:SS` string.
 * Hours are not capped, so durations over a day render as e.g. `25:00:00`.
 */
function formatDuration(ms: number): string {
    const totalSecs = Math.floor(ms / 1000);
    const components = [
        Math.floor(totalSecs / 3600), // hours
        Math.floor((totalSecs % 3600) / 60), // minutes
        totalSecs % 60, // seconds
    ];
    return components.map((part) => String(part).padStart(2, '0')).join(':');
}

/**
 * Formats a byte count as a human-readable string using binary units.
 *
 * GB values keep one decimal place; MB/KB are rounded to whole numbers.
 * Values below 1 KiB are shown in plain bytes — previously they fell through to
 * the KB branch and rendered misleadingly as "0 KB" (or "1 KB" for >= 512 B).
 */
function formatBytes(bytes: number): string {
    if (bytes >= 1024 ** 3) return `${(bytes / 1024 ** 3).toFixed(1)} GB`;
    if (bytes >= 1024 ** 2) return `${(bytes / 1024 ** 2).toFixed(0)} MB`;
    if (bytes >= 1024) return `${(bytes / 1024).toFixed(0)} KB`;
    return `${bytes.toFixed(0)} B`;
}

/**
 * Renders a compact real-time status block to `process.stderr` during a crawl.
 *
 * Enable via the `monitor` option on `BasicCrawler`:
 * ```ts
 * const crawler = new BasicCrawler({ monitor: true, ... });
 * ```
 *
 * In TTY mode the block overwrites itself in-place. In non-TTY mode (CI, pipes)
 * it prints plain lines so the output remains readable in logs.
 */
export class Monitor {
    /** Handle of the periodic re-render timer; `undefined` while the monitor is stopped. */
    private intervalId?: ReturnType<typeof setInterval>;
    /** Refresh period in milliseconds, derived from `MonitorOptions.intervalSecs` (default 5 s). */
    private readonly intervalMs: number;
    /** True once at least one frame has been written, i.e. there is a block on screen to overwrite/clear. */
    private rendered = false;

    /**
     * @param stats Statistics instance providing the `state` counters and `calculate()` rates.
     * @param autoscaledPool Optional pool whose concurrency figures are displayed.
     * @param options Display options, see {@apilink MonitorOptions}.
     * @param totalRequests Optional getter for the (possibly approximate) total request count.
     */
    constructor(
        private readonly stats: Statistics,
        private readonly autoscaledPool?: AutoscaledPool,
        options: MonitorOptions = {},
        private readonly totalRequests?: () => number | undefined,
    ) {
        this.intervalMs = (options.intervalSecs ?? 5) * 1000;
    }

    /** Starts the periodic display. Renders an initial frame immediately, then repeats on each interval. */
    start(): void {
        // Guard against double-start: previously a second call overwrote `intervalId`,
        // leaking the first timer — it could never be cleared and kept rendering forever.
        this.stop();
        this.render(); // render immediately so short crawls always show output
        this.intervalId = setInterval(() => this.render(), this.intervalMs);
        this.intervalId.unref(); // don't prevent process exit if the event loop would otherwise be empty
    }

    /** Stops the periodic display and clears the last rendered block from the terminal. Idempotent. */
    stop(): void {
        if (this.intervalId !== undefined) {
            clearInterval(this.intervalId);
            this.intervalId = undefined;
        }
        if (this.rendered && process.stderr.isTTY) {
            // Move up MONITOR_LINE_COUNT lines and clear each one
            for (let i = 0; i < MONITOR_LINE_COUNT; i++) {
                process.stderr.write('\x1b[1A\x1b[2K');
            }
            this.rendered = false;
        }
    }

    /** Builds and returns the status block as an array of lines. Exposed for testing. */
    buildLines(): string[] {
        const { state } = this.stats;
        const calculated = this.stats.calculate();

        const startedAt = state.crawlerStartedAt ? new Date(state.crawlerStartedAt) : new Date();
        const now = new Date();
        const elapsed = now.getTime() - startedAt.getTime();

        const finished = state.requestsFinished;
        const failed = state.requestsFailed;
        const total = this.totalRequests?.();
        // getTotalCount() on RequestManagerTandem may be an approximate sum
        // of the underlying RequestList + RequestQueue. The plan treats this as a best-effort
        // estimate: progress % and ETA are shown when total > 0, hidden when total === 0.
        // This matches the existing behaviour in PR #2692 and is acceptable for a "monitor mode"
        // display (non-authoritative progress indicator). No special-casing per request-source mode.
        const speed = calculated.requestsFinishedPerMinute;

        const progressStr = total != null && total > 0
            ? `${finished}/${total} (${((finished / total) * 100).toFixed(1)}%)`
            : total === 0
                ? `${finished}/0 (N/A%)`
                : `${finished}/? (?%)`;

        const failedPct = finished + failed > 0
            ? ` | Failed: ${failed} (${((failed / (finished + failed)) * 100).toFixed(1)}%)`
            : '';

        let etaStr = 'N/A';
        if (total != null && total > 0 && speed > 0) {
            // Use Math.max to guard against negative remaining (e.g. when total is an approximate count)
            const remaining = Math.max(0, total - finished);
            const etaMs = (remaining / speed) * 60 * 1000;
            etaStr = `~${formatDuration(etaMs)}`;
        }

        const memInfo = process.memoryUsage();
        const totalMem = os.totalmem();
        const usedMem = totalMem - os.freemem();
        const cpus = os.cpus();
        const cpuLoad = os.loadavg()[0];
        // os.loadavg() always returns [0,0,0] on Windows — show N/A to avoid misleading output.
        const cpuPct = process.platform === 'win32'
            ? 'N/A'
            : cpus.length > 0 ? Math.min(100, (cpuLoad / cpus.length) * 100).toFixed(0) : '?';

        const concurrency = this.autoscaledPool
            ? `${this.autoscaledPool.currentConcurrency}/${this.autoscaledPool.maxConcurrency} (desired: ${this.autoscaledPool.desiredConcurrency})`
            : 'N/A';

        return [
            `\u23F1 Start: ${startedAt.toLocaleTimeString()} | Running for ${formatDuration(elapsed)}`,
            `\uD83D\uDCCA Progress: ${progressStr}${failedPct} | Speed: ${speed} req/min`,
            `\u23F3 ETA: ${etaStr}`,
            `\uD83D\uDCBB CPU: ${cpuPct}% | Mem: ${formatBytes(memInfo.rss)} process / ${formatBytes(usedMem)} / ${formatBytes(totalMem)} total`,
            `\uD83D\uDD00 Concurrency: ${concurrency}`,
        ];
    }

    /** Writes one frame to stderr, overwriting the previous frame when attached to a TTY. */
    private render(): void {
        const lines = this.buildLines();

        // NOTE(review): a line longer than the terminal width will soft-wrap, which breaks
        // the cursor-up math below — consider truncating to process.stderr.columns if this
        // shows up in practice (TODO confirm).
        if (process.stderr.isTTY && this.rendered) {
            // Move cursor up to overwrite previous block
            process.stderr.write(`\x1b[${MONITOR_LINE_COUNT}A`);
        }

        for (const line of lines) {
            if (process.stderr.isTTY) {
                // Clear line then write
                process.stderr.write(`\x1b[2K${line}\n`);
            } else {
                process.stderr.write(`${line}\n`);
            }
        }

        this.rendered = true;
    }
}
47 changes: 47 additions & 0 deletions test/core/crawlers/basic_crawler.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2039,4 +2039,51 @@ describe('BasicCrawler', () => {
expect(crawlerB.requestQueue?.config).toBe(configB);
});
});

describe('monitor option', () => {
    test('crawler runs successfully with monitor: true', async () => {
        // Record every URL the handler receives so we can verify the crawl completed.
        const visitedUrls: string[] = [];

        const crawler = new BasicCrawler({
            monitor: true,
            requestHandler: ({ request }) => {
                visitedUrls.push(request.url);
            },
        });

        await crawler.run([{ url: `http://${HOSTNAME}:${port}` }]);
        expect(visitedUrls).toHaveLength(1);
    });

    test('crawler runs successfully with monitor: false (default)', async () => {
        // Same crawl as above but without the monitor option — the default path.
        const visitedUrls: string[] = [];

        const crawler = new BasicCrawler({
            requestHandler: ({ request }) => {
                visitedUrls.push(request.url);
            },
        });

        await crawler.run([{ url: `http://${HOSTNAME}:${port}` }]);
        expect(visitedUrls).toHaveLength(1);
    });

    test('monitor: true does not suppress request errors — failedRequestHandler still fires', async () => {
        let failureCount = 0;

        const crawler = new BasicCrawler({
            monitor: true,
            maxRequestRetries: 0,
            requestHandler: () => {
                throw new Error('forced failure');
            },
            failedRequestHandler: () => {
                failureCount += 1;
            },
        });

        await crawler.run([{ url: `http://${HOSTNAME}:${port}` }]);
        expect(failureCount).toBe(1);
    });
});
});
Loading