@@ -270,6 +270,7 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler {
270270 private resultComparator : NonNullable < AdaptivePlaywrightCrawlerOptions [ 'resultComparator' ] > ;
271271 private preventDirectStorageAccess : boolean ;
272272 declare readonly stats : AdaptivePlaywrightCrawlerStatistics ;
/**
 * Counter of rendering type detections that have started but not yet finished.
 * It is incremented when a request is selected for rendering type detection and
 * decremented in a `finally` block, so it is also restored when processing throws.
 */
273+ private inFlightRenderingTypeDetections = 0 ;
273274
274275 /**
275276 * Default {@apilink Router} instance that will be used if we don't specify any {@apilink AdaptivePlaywrightCrawlerOptions.requestHandler|`requestHandler`}.
@@ -325,6 +326,13 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler {
325326 this . preventDirectStorageAccess = preventDirectStorageAccess ;
326327 }
327328
/**
 * Number of rendering type detections that are running at this moment.
 *
 * Read-only view of the internal in-flight detection counter.
 */
get inFlightRenderingTypeDetectionCount(): number {
    const { inFlightRenderingTypeDetections: activeDetections } = this;
    return activeDetections;
}
335+
328336 protected override async _init ( ) : Promise < void > {
329337 await this . renderingTypePredictor . initialize ( ) ;
330338 return await super . _init ( ) ;
@@ -334,77 +342,89 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler {
334342 const renderingTypePrediction = this . renderingTypePredictor . predict ( crawlingContext . request ) ;
335343 const shouldDetectRenderingType = Math . random ( ) < renderingTypePrediction . detectionProbabilityRecommendation ;
336344
337- if ( ! shouldDetectRenderingType ) {
338- crawlingContext . log . debug (
339- `Predicted rendering type ${ renderingTypePrediction . renderingType } for ${ crawlingContext . request . url } ` ,
340- ) ;
345+ if ( shouldDetectRenderingType ) {
346+ this . inFlightRenderingTypeDetections ++ ;
341347 }
342348
343- if ( renderingTypePrediction . renderingType === 'static' && ! shouldDetectRenderingType ) {
344- crawlingContext . log . debug ( `Running HTTP-only request handler for ${ crawlingContext . request . url } ` ) ;
345- this . stats . trackHttpOnlyRequestHandlerRun ( ) ;
349+ try {
350+ if ( ! shouldDetectRenderingType ) {
351+ crawlingContext . log . debug (
352+ `Predicted rendering type ${ renderingTypePrediction . renderingType } for ${ crawlingContext . request . url } ` ,
353+ ) ;
354+ }
346355
347- const plainHTTPRun = await this . runRequestHandlerWithPlainHTTP ( crawlingContext ) ;
356+ if ( renderingTypePrediction . renderingType === 'static' && ! shouldDetectRenderingType ) {
357+ crawlingContext . log . debug ( `Running HTTP-only request handler for ${ crawlingContext . request . url } ` ) ;
358+ this . stats . trackHttpOnlyRequestHandlerRun ( ) ;
348359
349- if ( plainHTTPRun . ok && this . resultChecker ( plainHTTPRun . result ) ) {
350- crawlingContext . log . debug ( `HTTP-only request handler succeeded for ${ crawlingContext . request . url } ` ) ;
351- plainHTTPRun . logs ?. forEach ( ( [ log , method , ...args ] ) => log [ method ] ( ...( args as [ any , any ] ) ) ) ;
352- await this . commitResult ( crawlingContext , plainHTTPRun . result ) ;
353- return ;
354- }
355- if ( ! plainHTTPRun . ok ) {
356- crawlingContext . log . exception (
357- plainHTTPRun . error as Error ,
358- `HTTP-only request handler failed for ${ crawlingContext . request . url } ` ,
359- ) ;
360- } else {
361- crawlingContext . log . warning (
362- `HTTP-only request handler returned a suspicious result for ${ crawlingContext . request . url } ` ,
363- ) ;
364- this . stats . trackRenderingTypeMisprediction ( ) ;
360+ const plainHTTPRun = await this . runRequestHandlerWithPlainHTTP ( crawlingContext ) ;
361+
362+ if ( plainHTTPRun . ok && this . resultChecker ( plainHTTPRun . result ) ) {
363+ crawlingContext . log . debug ( `HTTP-only request handler succeeded for ${ crawlingContext . request . url } ` ) ;
364+ plainHTTPRun . logs ?. forEach ( ( [ log , method , ...args ] ) => log [ method ] ( ...( args as [ any , any ] ) ) ) ;
365+ await this . commitResult ( crawlingContext , plainHTTPRun . result ) ;
366+ return ;
367+ }
368+ if ( ! plainHTTPRun . ok ) {
369+ crawlingContext . log . exception (
370+ plainHTTPRun . error as Error ,
371+ `HTTP-only request handler failed for ${ crawlingContext . request . url } ` ,
372+ ) ;
373+ } else {
374+ crawlingContext . log . warning (
375+ `HTTP-only request handler returned a suspicious result for ${ crawlingContext . request . url } ` ,
376+ ) ;
377+ this . stats . trackRenderingTypeMisprediction ( ) ;
378+ }
365379 }
366- }
367380
368- crawlingContext . log . debug ( `Running browser request handler for ${ crawlingContext . request . url } ` ) ;
369- this . stats . trackBrowserRequestHandlerRun ( ) ;
381+ crawlingContext . log . debug ( `Running browser request handler for ${ crawlingContext . request . url } ` ) ;
382+ this . stats . trackBrowserRequestHandlerRun ( ) ;
370383
371- // Run the request handler in a browser. The copy of the crawler state is kept so that we can perform
372- // a rendering type detection if necessary. Without this measure, the HTTP request handler would run
373- // under different conditions, which could change its behavior. Changes done to the crawler state by
374- // the HTTP request handler will not be committed to the actual storage.
375- const { result : browserRun , initialStateCopy } = await this . runRequestHandlerInBrowser ( crawlingContext ) ;
384+ // Run the request handler in a browser. The copy of the crawler state is kept so that we can perform
385+ // a rendering type detection if necessary. Without this measure, the HTTP request handler would run
386+ // under different conditions, which could change its behavior. Changes done to the crawler state by
387+ // the HTTP request handler will not be committed to the actual storage.
388+ const { result : browserRun , initialStateCopy } = await this . runRequestHandlerInBrowser ( crawlingContext ) ;
376389
377- if ( ! browserRun . ok ) {
378- throw browserRun . error ;
379- }
390+ if ( ! browserRun . ok ) {
391+ throw browserRun . error ;
392+ }
380393
381- await this . commitResult ( crawlingContext , browserRun . result ) ;
394+ await this . commitResult ( crawlingContext , browserRun . result ) ;
382395
383- if ( shouldDetectRenderingType ) {
384- crawlingContext . log . debug ( `Detecting rendering type for ${ crawlingContext . request . url } ` ) ;
385- const plainHTTPRun = await this . runRequestHandlerWithPlainHTTP ( crawlingContext , initialStateCopy ) ;
396+ if ( shouldDetectRenderingType ) {
397+ crawlingContext . log . debug ( `Detecting rendering type for ${ crawlingContext . request . url } ` ) ;
398+ const plainHTTPRun = await this . runRequestHandlerWithPlainHTTP ( crawlingContext , initialStateCopy ) ;
386399
387- const detectionResult : RenderingType | undefined = ( ( ) => {
388- if ( ! plainHTTPRun . ok ) {
389- return 'clientOnly' ;
390- }
400+ const detectionResult : RenderingType | undefined = ( ( ) => {
401+ if ( ! plainHTTPRun . ok ) {
402+ return 'clientOnly' ;
403+ }
391404
392- const comparisonResult = this . resultComparator ( plainHTTPRun . result , browserRun . result ) ;
393- if ( comparisonResult === true || comparisonResult === 'equal' ) {
394- return 'static' ;
395- }
405+ const comparisonResult = this . resultComparator ( plainHTTPRun . result , browserRun . result ) ;
406+ if ( comparisonResult === true || comparisonResult === 'equal' ) {
407+ return 'static' ;
408+ }
396409
397- if ( comparisonResult === false || comparisonResult === 'different' ) {
398- return 'clientOnly' ;
399- }
410+ if ( comparisonResult === false || comparisonResult === 'different' ) {
411+ return 'clientOnly' ;
412+ }
400413
401- return undefined ;
402- } ) ( ) ;
414+ return undefined ;
415+ } ) ( ) ;
403416
404- crawlingContext . log . debug ( `Detected rendering type ${ detectionResult } for ${ crawlingContext . request . url } ` ) ;
417+ crawlingContext . log . debug (
418+ `Detected rendering type ${ detectionResult } for ${ crawlingContext . request . url } ` ,
419+ ) ;
405420
406- if ( detectionResult !== undefined ) {
407- this . renderingTypePredictor . storeResult ( crawlingContext . request , detectionResult ) ;
421+ if ( detectionResult !== undefined ) {
422+ this . renderingTypePredictor . storeResult ( crawlingContext . request , detectionResult ) ;
423+ }
424+ }
425+ } finally {
426+ if ( shouldDetectRenderingType ) {
427+ this . inFlightRenderingTypeDetections -- ;
408428 }
409429 }
410430 }
0 commit comments