Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions docs/upgrading/upgrading_v4.md
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,37 @@ const count = await sessionPool.usableSessionsCount();
const state = await sessionPool.getState();
```

## Custom `createSessionFunction` receives merged session options

`SessionPool` now merges its pool-wide `sessionOptions` (including the pool-scoped logger) with per-call overrides before invoking `createSessionFunction`. Custom implementations no longer need to spread `pool.sessionOptions` themselves to inherit pool defaults.

**Before:**
```typescript
new SessionPool({
sessionOptions: { maxUsageCount: 5 },
createSessionFunction: async (pool, opts) =>
new Session({
...pool.sessionOptions, // had to be spread manually for the logger / pool defaults to apply
...opts?.sessionOptions,
sessionPool: pool,
}),
});
```
Comment thread
barjin marked this conversation as resolved.

**After:**
```typescript
new SessionPool({
sessionOptions: { maxUsageCount: 5 },
createSessionFunction: async (pool, opts) =>
new Session({
...opts?.sessionOptions, // already merged with pool-wide defaults
sessionPool: pool,
}),
});
```

If you were already spreading `pool.sessionOptions`, the change is harmless — the pool defaults simply appear twice in the spread chain, and the later (per-call) spread wins, exactly as before.

## `retireOnBlockedStatusCodes` is removed from `Session`

`Session.retireOnBlockedStatusCodes` is removed. Blocked status code handling is now internal to the crawler. Configure blocked status codes via the `blockedStatusCodes` crawler option (moved from `sessionPoolOptions`).
Expand Down
30 changes: 22 additions & 8 deletions packages/basic-crawler/src/internals/basic-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ import type {
RequestTransform,
RouterHandler,
RouterRoutes,
Session,
SkippedRequestCallback,
Source,
StatisticsOptions,
Expand Down Expand Up @@ -59,6 +58,7 @@ import {
Router,
ServiceLocator,
serviceLocator,
Session,
SessionError,
SessionPool,
Statistics,
Expand Down Expand Up @@ -873,7 +873,26 @@ export class BasicCrawler<
...statisticsOptions,
});

this.sessionPool = sessionPool ?? new SessionPool();
if (sessionPool && proxyConfiguration) {
this.log.warning(
'Both `sessionPool` and `proxyConfiguration` were provided to the crawler. ' +
'The `proxyConfiguration` is ignored - sessions from the supplied pool keep whatever ' +
'`proxyInfo` they were created with. Configure proxies on the pool instead, ' +
'e.g. via `addSession({ proxyInfo })` or a custom `createSessionFunction`.',
);
Comment thread
barjin marked this conversation as resolved.
}

this.sessionPool =
sessionPool ??
new SessionPool({
createSessionFunction: async (pool, opts) =>
new Session({
...opts?.sessionOptions,
proxyInfo:
opts?.sessionOptions?.proxyInfo ?? (await this.proxyConfiguration?.newProxyInfo()),
sessionPool: pool,
}),
Comment thread
barjin marked this conversation as resolved.
});
this.sessionPool.setMaxListeners(20);

this.ownsSessionPool = !sessionPool;
Expand Down Expand Up @@ -1116,12 +1135,7 @@ export class BasicCrawler<
return existingSession;
}

return await this.sessionPool!.newSession({
proxyInfo: await this.proxyConfiguration?.newProxyInfo({
request: request ?? undefined,
}),
maxUsageCount: 1,
});
return await this.sessionPool!.getSession();
},
Comment thread
barjin marked this conversation as resolved.
this.internalTimeoutMillis,
`Fetching session timed out after ${this.internalTimeoutMillis / 1e3} seconds.`,
Expand Down
25 changes: 16 additions & 9 deletions packages/core/src/session_pool/session_pool.ts
Original file line number Diff line number Diff line change
Expand Up @@ -179,11 +179,11 @@ export class SessionPool extends EventEmitter {
this.maxPoolSize = maxPoolSize;
this.createSessionFunction = createSessionFunction || this._defaultCreateSessionFunction;

// Session configuration
// Session configuration. The pool-scoped logger is merged into per-call sessionOptions inside
// `_invokeCreateSessionFunction`, so every Session inherits it without custom createSessionFunctions
// having to know about it.
this.sessionOptions = {
...sessionOptions,
// the log needs to propagate to createSessionFunction as in "new Session({ ...sessionPool.sessionOptions })"
// and can't go inside _defaultCreateSessionFunction
log: this.log,
};

Expand Down Expand Up @@ -264,8 +264,7 @@ export class SessionPool extends EventEmitter {
this._removeRetiredSessions();
}

const newSession =
options instanceof Session ? options : await this.createSessionFunction(this, { sessionOptions: options });
const newSession = options instanceof Session ? options : await this._invokeCreateSessionFunction(options);
this.log.debug(`Adding new Session - ${newSession.id}`);

this._addSession(newSession);
Expand All @@ -280,7 +279,7 @@ export class SessionPool extends EventEmitter {
async newSession(sessionOptions?: SessionOptions): Promise<Session> {
await this.ensureInitialized();

const newSession = await this.createSessionFunction(this, { sessionOptions });
const newSession = await this._invokeCreateSessionFunction(sessionOptions);
this._addSession(newSession);

return newSession;
Expand Down Expand Up @@ -446,18 +445,26 @@ export class SessionPool extends EventEmitter {
const { sessionOptions = {} } = options;

return new Session({
...this.sessionOptions,
...sessionOptions,
sessionPool,
});
}

/**
* Invokes `createSessionFunction` with `sessionOptions` already merged from pool-wide defaults and
* the supplied per-call overrides, so custom implementations don't need to spread `pool.sessionOptions` themselves.
*/
private async _invokeCreateSessionFunction(perCallOptions?: SessionOptions): Promise<Session> {
const sessionOptions = { ...this.sessionOptions, ...perCallOptions };
return this.createSessionFunction(this, { sessionOptions });
}

/**
* Creates new session and adds it to the pool.
* @returns Newly created `Session` instance.
*/
protected async _createSession(): Promise<Session> {
const newSession = await this.createSessionFunction(this);
const newSession = await this._invokeCreateSessionFunction();
this._addSession(newSession);
this.log.debug(`Created new Session - ${newSession.id}`);

Expand Down Expand Up @@ -498,7 +505,7 @@ export class SessionPool extends EventEmitter {
sessionObject.sessionPool = this;
sessionObject.createdAt = new Date(sessionObject.createdAt as string);
sessionObject.expiresAt = new Date(sessionObject.expiresAt as string);
const recreatedSession = await this.createSessionFunction(this, { sessionOptions: sessionObject });
const recreatedSession = await this._invokeCreateSessionFunction(sessionObject);

if (recreatedSession.isUsable()) {
this._addSession(recreatedSession);
Expand Down
65 changes: 65 additions & 0 deletions test/core/crawlers/basic_crawler.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import http from 'node:http';
import type { AddressInfo } from 'node:net';

import type { EnqueueLinksOptions, ErrorHandler, RequestHandler, RequestOptions, Source } from '@crawlee/basic';
import type { Session } from '@crawlee/basic';
import {
BasicCrawler,
Configuration,
Expand All @@ -12,13 +13,15 @@ import {
KeyValueStore,
MissingRouteError,
NonRetryableError,
ProxyConfiguration,
Request,
RequestList,
RequestQueue,
serviceLocator,
SessionPool,
} from '@crawlee/basic';
import { RequestState } from '@crawlee/core';
import type { ProxyInfo } from '@crawlee/types';
import type { Dictionary } from '@crawlee/utils';
import { RobotsTxtFile, sleep } from '@crawlee/utils';
import express from 'express';
Expand Down Expand Up @@ -1602,6 +1605,68 @@ describe('BasicCrawler', () => {
});
});

describe('proxyConfiguration', () => {
it('assigns a proxyInfo from the proxyConfiguration to each Session and exposes it on the context', async () => {
const proxyUrls = [0, 1, 2].map((n) => `http://proxy.example.com:${1000 + n}`);
const proxyConfiguration = new ProxyConfiguration({ proxyUrls });

const sessions: Session[] = [];
const proxyInfos: (ProxyInfo | undefined)[] = [];

const crawler = new BasicCrawler({
proxyConfiguration,
requestHandler: async ({ session, proxyInfo }) => {
sessions.push(session);
proxyInfos.push(proxyInfo);
},
Comment thread
barjin marked this conversation as resolved.
});

await crawler.run([
{ url: 'https://example.com/a' },
{ url: 'https://example.com/b' },
{ url: 'https://example.com/c' },
]);

expect(sessions).toHaveLength(3);
for (let i = 0; i < sessions.length; i++) {
const proxyInfo = proxyInfos[i];
expect(proxyInfo).toBeDefined();
expect(proxyUrls).toContain(proxyInfo!.url);
expect(sessions[i].proxyInfo).toBe(proxyInfo);
}
});

it('reuses the same Session across multiple requests when the pool is restricted', async () => {
const sessions: Session[] = [];
const proxyInfos: (ProxyInfo | undefined)[] = [];

const crawler = new BasicCrawler({
sessionPool: new SessionPool({ maxPoolSize: 1 }),
requestHandler: async ({ session, proxyInfo }) => {
sessions.push(session);
proxyInfos.push(proxyInfo);
},
});

await crawler.run([
{ url: 'https://example.com/a' },
{ url: 'https://example.com/b' },
{ url: 'https://example.com/c' },
]);

expect(sessions).toHaveLength(3);
const firstId = sessions[0].id;
for (const session of sessions) {
expect(session.id).toBe(firstId);
expect(session.proxyInfo).toBe(sessions[0].proxyInfo);
}
for (const proxyInfo of proxyInfos) {
expect(proxyInfo).toBe(sessions[0].proxyInfo);
}
expect(sessions[0].usageCount).toBe(3);
});
});

test('extendContext', async () => {
const url = 'https://example.com';
const requestHandlerImplementation = vi.fn();
Expand Down