Skip to content

Commit 565fc34

Browse files
authored
feat: Add a counter of in-flight rendering type detections (#3355)
1 parent 63d753d commit 565fc34

1 file changed

Lines changed: 75 additions & 55 deletions

File tree

packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts

Lines changed: 75 additions & 55 deletions
Original file line number | Diff line number | Diff line change
@@ -270,6 +270,7 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler {
270270
private resultComparator: NonNullable<AdaptivePlaywrightCrawlerOptions['resultComparator']>;
271271
private preventDirectStorageAccess: boolean;
272272
declare readonly stats: AdaptivePlaywrightCrawlerStatistics;
273+
private inFlightRenderingTypeDetections = 0;
273274

274275
/**
275276
* Default {@apilink Router} instance that will be used if we don't specify any {@apilink AdaptivePlaywrightCrawlerOptions.requestHandler|`requestHandler`}.
@@ -325,6 +326,13 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler {
325326
this.preventDirectStorageAccess = preventDirectStorageAccess;
326327
}
327328

329+
/**
330+
* Returns the number of rendering type detections currently in progress: the counter is incremented when a request is selected for rendering type detection and decremented once handling of that request ends, whether it completes or throws.
331+
*/
332+
get inFlightRenderingTypeDetectionCount(): number {
333+
return this.inFlightRenderingTypeDetections;
334+
}
335+
328336
protected override async _init(): Promise<void> {
329337
await this.renderingTypePredictor.initialize();
330338
return await super._init();
@@ -334,77 +342,89 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler {
334342
const renderingTypePrediction = this.renderingTypePredictor.predict(crawlingContext.request);
335343
const shouldDetectRenderingType = Math.random() < renderingTypePrediction.detectionProbabilityRecommendation;
336344

337-
if (!shouldDetectRenderingType) {
338-
crawlingContext.log.debug(
339-
`Predicted rendering type ${renderingTypePrediction.renderingType} for ${crawlingContext.request.url}`,
340-
);
345+
if (shouldDetectRenderingType) {
346+
this.inFlightRenderingTypeDetections++;
341347
}
342348

343-
if (renderingTypePrediction.renderingType === 'static' && !shouldDetectRenderingType) {
344-
crawlingContext.log.debug(`Running HTTP-only request handler for ${crawlingContext.request.url}`);
345-
this.stats.trackHttpOnlyRequestHandlerRun();
349+
try {
350+
if (!shouldDetectRenderingType) {
351+
crawlingContext.log.debug(
352+
`Predicted rendering type ${renderingTypePrediction.renderingType} for ${crawlingContext.request.url}`,
353+
);
354+
}
346355

347-
const plainHTTPRun = await this.runRequestHandlerWithPlainHTTP(crawlingContext);
356+
if (renderingTypePrediction.renderingType === 'static' && !shouldDetectRenderingType) {
357+
crawlingContext.log.debug(`Running HTTP-only request handler for ${crawlingContext.request.url}`);
358+
this.stats.trackHttpOnlyRequestHandlerRun();
348359

349-
if (plainHTTPRun.ok && this.resultChecker(plainHTTPRun.result)) {
350-
crawlingContext.log.debug(`HTTP-only request handler succeeded for ${crawlingContext.request.url}`);
351-
plainHTTPRun.logs?.forEach(([log, method, ...args]) => log[method](...(args as [any, any])));
352-
await this.commitResult(crawlingContext, plainHTTPRun.result);
353-
return;
354-
}
355-
if (!plainHTTPRun.ok) {
356-
crawlingContext.log.exception(
357-
plainHTTPRun.error as Error,
358-
`HTTP-only request handler failed for ${crawlingContext.request.url}`,
359-
);
360-
} else {
361-
crawlingContext.log.warning(
362-
`HTTP-only request handler returned a suspicious result for ${crawlingContext.request.url}`,
363-
);
364-
this.stats.trackRenderingTypeMisprediction();
360+
const plainHTTPRun = await this.runRequestHandlerWithPlainHTTP(crawlingContext);
361+
362+
if (plainHTTPRun.ok && this.resultChecker(plainHTTPRun.result)) {
363+
crawlingContext.log.debug(`HTTP-only request handler succeeded for ${crawlingContext.request.url}`);
364+
plainHTTPRun.logs?.forEach(([log, method, ...args]) => log[method](...(args as [any, any])));
365+
await this.commitResult(crawlingContext, plainHTTPRun.result);
366+
return;
367+
}
368+
if (!plainHTTPRun.ok) {
369+
crawlingContext.log.exception(
370+
plainHTTPRun.error as Error,
371+
`HTTP-only request handler failed for ${crawlingContext.request.url}`,
372+
);
373+
} else {
374+
crawlingContext.log.warning(
375+
`HTTP-only request handler returned a suspicious result for ${crawlingContext.request.url}`,
376+
);
377+
this.stats.trackRenderingTypeMisprediction();
378+
}
365379
}
366-
}
367380

368-
crawlingContext.log.debug(`Running browser request handler for ${crawlingContext.request.url}`);
369-
this.stats.trackBrowserRequestHandlerRun();
381+
crawlingContext.log.debug(`Running browser request handler for ${crawlingContext.request.url}`);
382+
this.stats.trackBrowserRequestHandlerRun();
370383

371-
// Run the request handler in a browser. The copy of the crawler state is kept so that we can perform
372-
// a rendering type detection if necessary. Without this measure, the HTTP request handler would run
373-
// under different conditions, which could change its behavior. Changes done to the crawler state by
374-
// the HTTP request handler will not be committed to the actual storage.
375-
const { result: browserRun, initialStateCopy } = await this.runRequestHandlerInBrowser(crawlingContext);
384+
// Run the request handler in a browser. The copy of the crawler state is kept so that we can perform
385+
// a rendering type detection if necessary. Without this measure, the HTTP request handler would run
386+
// under different conditions, which could change its behavior. Changes done to the crawler state by
387+
// the HTTP request handler will not be committed to the actual storage.
388+
const { result: browserRun, initialStateCopy } = await this.runRequestHandlerInBrowser(crawlingContext);
376389

377-
if (!browserRun.ok) {
378-
throw browserRun.error;
379-
}
390+
if (!browserRun.ok) {
391+
throw browserRun.error;
392+
}
380393

381-
await this.commitResult(crawlingContext, browserRun.result);
394+
await this.commitResult(crawlingContext, browserRun.result);
382395

383-
if (shouldDetectRenderingType) {
384-
crawlingContext.log.debug(`Detecting rendering type for ${crawlingContext.request.url}`);
385-
const plainHTTPRun = await this.runRequestHandlerWithPlainHTTP(crawlingContext, initialStateCopy);
396+
if (shouldDetectRenderingType) {
397+
crawlingContext.log.debug(`Detecting rendering type for ${crawlingContext.request.url}`);
398+
const plainHTTPRun = await this.runRequestHandlerWithPlainHTTP(crawlingContext, initialStateCopy);
386399

387-
const detectionResult: RenderingType | undefined = (() => {
388-
if (!plainHTTPRun.ok) {
389-
return 'clientOnly';
390-
}
400+
const detectionResult: RenderingType | undefined = (() => {
401+
if (!plainHTTPRun.ok) {
402+
return 'clientOnly';
403+
}
391404

392-
const comparisonResult = this.resultComparator(plainHTTPRun.result, browserRun.result);
393-
if (comparisonResult === true || comparisonResult === 'equal') {
394-
return 'static';
395-
}
405+
const comparisonResult = this.resultComparator(plainHTTPRun.result, browserRun.result);
406+
if (comparisonResult === true || comparisonResult === 'equal') {
407+
return 'static';
408+
}
396409

397-
if (comparisonResult === false || comparisonResult === 'different') {
398-
return 'clientOnly';
399-
}
410+
if (comparisonResult === false || comparisonResult === 'different') {
411+
return 'clientOnly';
412+
}
400413

401-
return undefined;
402-
})();
414+
return undefined;
415+
})();
403416

404-
crawlingContext.log.debug(`Detected rendering type ${detectionResult} for ${crawlingContext.request.url}`);
417+
crawlingContext.log.debug(
418+
`Detected rendering type ${detectionResult} for ${crawlingContext.request.url}`,
419+
);
405420

406-
if (detectionResult !== undefined) {
407-
this.renderingTypePredictor.storeResult(crawlingContext.request, detectionResult);
421+
if (detectionResult !== undefined) {
422+
this.renderingTypePredictor.storeResult(crawlingContext.request, detectionResult);
423+
}
424+
}
425+
} finally {
426+
if (shouldDetectRenderingType) {
427+
this.inFlightRenderingTypeDetections--;
408428
}
409429
}
410430
}

0 commit comments

Comments
 (0)