Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 21 additions & 8 deletions packages/basic-crawler/src/internals/basic-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ import type {
RequestTransform,
RouterHandler,
RouterRoutes,
Session,
SkippedRequestCallback,
Source,
StatisticsOptions,
Expand Down Expand Up @@ -59,6 +58,7 @@ import {
Router,
ServiceLocator,
serviceLocator,
Session,
SessionError,
SessionPool,
Statistics,
Expand Down Expand Up @@ -873,7 +873,25 @@ export class BasicCrawler<
...statisticsOptions,
});

this.sessionPool = sessionPool ?? new SessionPool();
if (sessionPool && proxyConfiguration) {
this.log.warning(
'Both `sessionPool` and `proxyConfiguration` were provided to the crawler. ' +
'The `proxyConfiguration` is ignored - sessions from the supplied pool keep whatever ' +
'`proxyInfo` they were created with. Configure proxies on the pool instead, ' +
'e.g. via `addSession({ proxyInfo })` or a custom `createSessionFunction`.',
);
Comment thread
barjin marked this conversation as resolved.
}

this.sessionPool =
sessionPool ??
new SessionPool({
createSessionFunction: async (pool, opts) =>
new Session({
proxyInfo: await this.proxyConfiguration?.newProxyInfo(),
Comment thread
barjin marked this conversation as resolved.
Outdated
Comment thread
barjin marked this conversation as resolved.
Outdated
...opts?.sessionOptions,
sessionPool: pool,
}),
Comment thread
barjin marked this conversation as resolved.
});
this.sessionPool.setMaxListeners(20);

this.ownsSessionPool = !sessionPool;
Expand Down Expand Up @@ -1116,12 +1134,7 @@ export class BasicCrawler<
return existingSession;
}

return await this.sessionPool!.newSession({
proxyInfo: await this.proxyConfiguration?.newProxyInfo({
request: request ?? undefined,
}),
maxUsageCount: 1,
});
return await this.sessionPool!.getSession();
},
Comment thread
barjin marked this conversation as resolved.
this.internalTimeoutMillis,
`Fetching session timed out after ${this.internalTimeoutMillis / 1e3} seconds.`,
Expand Down
35 changes: 35 additions & 0 deletions test/core/crawlers/basic_crawler.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import http from 'node:http';
import type { AddressInfo } from 'node:net';

import type { EnqueueLinksOptions, ErrorHandler, RequestHandler, RequestOptions, Source } from '@crawlee/basic';
import type { Session } from '@crawlee/basic';
import {
BasicCrawler,
Configuration,
Expand All @@ -12,13 +13,15 @@ import {
KeyValueStore,
MissingRouteError,
NonRetryableError,
ProxyConfiguration,
Request,
RequestList,
RequestQueue,
serviceLocator,
SessionPool,
} from '@crawlee/basic';
import { RequestState } from '@crawlee/core';
import type { ProxyInfo } from '@crawlee/types';
import type { Dictionary } from '@crawlee/utils';
import { RobotsTxtFile, sleep } from '@crawlee/utils';
import express from 'express';
Expand Down Expand Up @@ -1602,6 +1605,38 @@ describe('BasicCrawler', () => {
});
});

// Test group for a BasicCrawler that is given a `proxyConfiguration` and no explicit `sessionPool`.
describe('proxyConfiguration', () => {
it('assigns a proxyInfo from the proxyConfiguration to each Session and exposes it on the context', async () => {
// Three proxy URLs on ports 1000, 1001 and 1002 of the same placeholder host.
const proxyUrls = [0, 1, 2].map((n) => `http://proxy.example.com:${1000 + n}`);
const proxyConfiguration = new ProxyConfiguration({ proxyUrls });

// Both arrays are appended to in the same handler call, so index i in each
// refers to the same request handler invocation.
const sessions: Session[] = [];
const proxyInfos: (ProxyInfo | undefined)[] = [];

const crawler = new BasicCrawler({
proxyConfiguration,
// The handler only records what the crawling context exposes; it performs no requests itself.
requestHandler: async ({ session, proxyInfo }) => {
sessions.push(session);
proxyInfos.push(proxyInfo);
},
// NOTE(review): the next two lines look like review-UI residue from a rendered diff rather
// than test code - confirm against the real file before relying on this block.
Comment thread
barjin marked this conversation as resolved.
});

// Three start URLs, so the handler is expected to be invoked three times.
await crawler.run([
{ url: 'https://example.com/a' },
{ url: 'https://example.com/b' },
{ url: 'https://example.com/c' },
]);

expect(sessions).toHaveLength(3);
for (let i = 0; i < sessions.length; i++) {
const proxyInfo = proxyInfos[i];
// Every invocation must have received a proxyInfo...
expect(proxyInfo).toBeDefined();
// ...whose URL is one of the configured proxy URLs (no particular one is required per request)...
expect(proxyUrls).toContain(proxyInfo!.url);
// ...and the session recorded for that invocation must hold that very same proxyInfo object (`toBe`, not `toEqual`).
expect(sessions[i].proxyInfo).toBe(proxyInfo);
}
});
});

test('extendContext', async () => {
const url = 'https://example.com';
const requestHandlerImplementation = vi.fn();
Expand Down