diff --git a/package-lock.json b/package-lock.json index 02b619d3..f53da72e 100644 --- a/package-lock.json +++ b/package-lock.json @@ -12,6 +12,8 @@ "@ai-sdk/openai": "^1.1.9", "@mendable/firecrawl-js": "^1.16.0", "ai": "^4.1.17", + "axios": "^1.7.9", + "cheerio": "^1.0.0", "js-tiktoken": "^1.0.17", "lodash-es": "^4.17.21", "p-limit": "^6.2.0", @@ -850,6 +852,12 @@ ], "license": "MIT" }, + "node_modules/boolbase": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz", + "integrity": "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==", + "license": "ISC" + }, "node_modules/chalk": { "version": "5.4.1", "resolved": "https://registry.npmjs.org/chalk/-/chalk-5.4.1.tgz", @@ -862,6 +870,48 @@ "url": "https://github.com/chalk/chalk?sponsor=1" } }, + "node_modules/cheerio": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/cheerio/-/cheerio-1.0.0.tgz", + "integrity": "sha512-quS9HgjQpdaXOvsZz82Oz7uxtXiy6UIsIQcpBj7HRw2M63Skasm9qlDocAM7jNuaxdhpPU7c4kJN+gA5MCu4ww==", + "license": "MIT", + "dependencies": { + "cheerio-select": "^2.1.0", + "dom-serializer": "^2.0.0", + "domhandler": "^5.0.3", + "domutils": "^3.1.0", + "encoding-sniffer": "^0.2.0", + "htmlparser2": "^9.1.0", + "parse5": "^7.1.2", + "parse5-htmlparser2-tree-adapter": "^7.0.0", + "parse5-parser-stream": "^7.1.2", + "undici": "^6.19.5", + "whatwg-mimetype": "^4.0.0" + }, + "engines": { + "node": ">=18.17" + }, + "funding": { + "url": "https://github.com/cheeriojs/cheerio?sponsor=1" + } + }, + "node_modules/cheerio-select": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/cheerio-select/-/cheerio-select-2.1.0.tgz", + "integrity": "sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g==", + "license": "BSD-2-Clause", + "dependencies": { + "boolbase": "^1.0.0", + "css-select": "^5.1.0", + "css-what": "^6.1.0", + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3", + "domutils": "^3.0.1" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, "node_modules/combined-stream": { "version": "1.0.8", "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", @@ -874,6 +924,34 @@ "node": ">= 0.8" } }, + "node_modules/css-select": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/css-select/-/css-select-5.1.0.tgz", + "integrity": "sha512-nwoRF1rvRRnnCqqY7updORDsuqKzqYJ28+oSMaJMMgOauh3fvwHqMS7EZpIPqK8GL+g9mKxF1vP/ZjSeNjEVHg==", + "license": "BSD-2-Clause", + "dependencies": { + "boolbase": "^1.0.0", + "css-what": "^6.1.0", + "domhandler": "^5.0.2", + "domutils": "^3.0.1", + "nth-check": "^2.0.1" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, + "node_modules/css-what": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/css-what/-/css-what-6.1.0.tgz", + "integrity": "sha512-HTUrgRJ7r4dsZKU6GjmpfRK1O76h97Z8MfS1G0FozR+oF2kG6Vfe8JE6zwrkbxigziPHinCJ+gCPjA9EaBDtRw==", + "license": "BSD-2-Clause", + "engines": { + "node": ">= 6" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, "node_modules/debug": { "version": "4.4.0", "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.0.tgz", @@ -916,6 +994,86 @@ "integrity": "sha512-IayShXAgj/QMXgB0IWmKx+rOPuGMhqm5w6jvFxmVenXKIzRqTAAsbBPT3kWQeGANj3jGgvcvv4yK6SxqYmikgw==", "license": "Apache-2.0" }, + "node_modules/dom-serializer": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-2.0.0.tgz", + "integrity": "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==", + "license": "MIT", + "dependencies": { + "domelementtype": "^2.3.0", + "domhandler": "^5.0.2", + "entities": "^4.2.0" + }, + "funding": { + "url": "https://github.com/cheeriojs/dom-serializer?sponsor=1" + } + }, + "node_modules/domelementtype": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-2.3.0.tgz", + "integrity": "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/fb55" + } + ], + "license": "BSD-2-Clause" + }, + "node_modules/domhandler": { + "version": "5.0.3", + "resolved": "https://registry.npmjs.org/domhandler/-/domhandler-5.0.3.tgz", + "integrity": "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==", + "license": "BSD-2-Clause", + "dependencies": { + "domelementtype": "^2.3.0" + }, + "engines": { + "node": ">= 4" + }, + "funding": { + "url": "https://github.com/fb55/domhandler?sponsor=1" + } + }, + "node_modules/domutils": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/domutils/-/domutils-3.2.2.tgz", + "integrity": "sha512-6kZKyUajlDuqlHKVX1w7gyslj9MPIXzIFiz/rGu35uC1wMi+kMhQwGhl4lt9unC9Vb9INnY9Z3/ZA3+FhASLaw==", + "license": "BSD-2-Clause", + "dependencies": { + "dom-serializer": "^2.0.0", + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3" + }, + "funding": { + "url": "https://github.com/fb55/domutils?sponsor=1" + } + }, + "node_modules/encoding-sniffer": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/encoding-sniffer/-/encoding-sniffer-0.2.0.tgz", + "integrity": "sha512-ju7Wq1kg04I3HtiYIOrUrdfdDvkyO9s5XM8QAj/bN61Yo/Vb4vgJxy5vi4Yxk01gWHbrofpPtpxM8bKger9jhg==", + "license": "MIT", + "dependencies": { + "iconv-lite": "^0.6.3", + "whatwg-encoding": "^3.1.1" + }, + "funding": { + "url": "https://github.com/fb55/encoding-sniffer?sponsor=1" + } + }, + "node_modules/entities": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz", + "integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=0.12" + }, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + } + }, "node_modules/esbuild": { "version": "0.23.1", "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.23.1.tgz", @@ -1037,6 +1195,37 @@ "node": ">=4" } }, + "node_modules/htmlparser2": { + "version": "9.1.0", + "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-9.1.0.tgz", + "integrity": "sha512-5zfg6mHUoaer/97TxnGpxmbR7zJtPwIYFMZ/H5ucTlPZhKvtum05yiPK3Mgai3a0DyVxv7qYqoweaEd2nrYQzQ==", + "funding": [ + "https://github.com/fb55/htmlparser2?sponsor=1", + { + "type": "github", + "url": "https://github.com/sponsors/fb55" + } + ], + "license": "MIT", + "dependencies": { + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3", + "domutils": "^3.1.0", + "entities": "^4.5.0" + } + }, + "node_modules/iconv-lite": { + "version": "0.6.3", + "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz", + "integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==", + "license": "MIT", + "dependencies": { + "safer-buffer": ">= 2.1.2 < 3.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/isows": { "version": "1.0.6", "resolved": "https://registry.npmjs.org/isows/-/isows-1.0.6.tgz", @@ -1156,6 +1345,18 @@ "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1" } }, + "node_modules/nth-check": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/nth-check/-/nth-check-2.1.1.tgz", + "integrity": "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==", + "license": "BSD-2-Clause", + "dependencies": { + "boolbase": "^1.0.0" + }, + "funding": { + "url": "https://github.com/fb55/nth-check?sponsor=1" + } + }, "node_modules/p-limit": { "version": "6.2.0", "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-6.2.0.tgz", @@ -1171,6 +1372,43 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/parse5": { + "version": "7.2.1", + "resolved": "https://registry.npmjs.org/parse5/-/parse5-7.2.1.tgz", + "integrity": "sha512-BuBYQYlv1ckiPdQi/ohiivi9Sagc9JG+Ozs0r7b/0iK3sKmrb0b9FdWdBbOdx6hBCM/F9Ir82ofnBhtZOjCRPQ==", + "license": "MIT", + "dependencies": { + "entities": "^4.5.0" + }, + "funding": { + "url": "https://github.com/inikulin/parse5?sponsor=1" + } + }, + "node_modules/parse5-htmlparser2-tree-adapter": { + "version": "7.1.0", + "resolved": "https://registry.npmjs.org/parse5-htmlparser2-tree-adapter/-/parse5-htmlparser2-tree-adapter-7.1.0.tgz", + "integrity": "sha512-ruw5xyKs6lrpo9x9rCZqZZnIUntICjQAd0Wsmp396Ul9lN/h+ifgVV1x1gZHi8euej6wTfpqX8j+BFQxF0NS/g==", + "license": "MIT", + "dependencies": { + "domhandler": "^5.0.3", + "parse5": "^7.0.0" + }, + "funding": { + "url": "https://github.com/inikulin/parse5?sponsor=1" + } + }, + "node_modules/parse5-parser-stream": { + "version": "7.1.2", + "resolved": "https://registry.npmjs.org/parse5-parser-stream/-/parse5-parser-stream-7.1.2.tgz", + "integrity": "sha512-JyeQc9iwFLn5TbvvqACIF/VXG6abODeB3Fwmv/TGdLk2LfbWkaySGY72at4+Ty7EkPZj854u4CrICqNk2qIbow==", + "license": "MIT", + "dependencies": { + "parse5": "^7.0.0" + }, + "funding": { + "url": "https://github.com/inikulin/parse5?sponsor=1" + } + }, "node_modules/picocolors": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz", @@ -1220,6 +1458,12 @@ "url": "https://github.com/privatenumber/resolve-pkg-maps?sponsor=1" } }, + "node_modules/safer-buffer": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", + "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==", + "license": "MIT" + }, "node_modules/secure-json-parse": { "version": "2.7.0", "resolved": "https://registry.npmjs.org/secure-json-parse/-/secure-json-parse-2.7.0.tgz", @@ -1304,6 +1548,15 @@ "integrity": "sha512-dFSOFBKV6uwaloBCCUhxlD3Pr/P1a/tJdcmPrTXCHlEFD3faj0mztjcGn6VBAhQ0/Bdy8K3VWrrqwbt/ffsYsg==", "license": "MIT" }, + "node_modules/undici": { + "version": "6.21.1", + "resolved": "https://registry.npmjs.org/undici/-/undici-6.21.1.tgz", + "integrity": "sha512-q/1rj5D0/zayJB2FraXdaWxbhWiNKDvu8naDT2dl1yTlvJp4BLtOcp2a5BvgGNQpYYJzau7tf1WgKv3b+7mqpQ==", + "license": "MIT", + "engines": { + "node": ">=18.17" + } + }, "node_modules/undici-types": { "version": "6.20.0", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.20.0.tgz", @@ -1320,6 +1573,27 @@ "react": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0" } }, + "node_modules/whatwg-encoding": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-3.1.1.tgz", + "integrity": "sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==", + "license": "MIT", + "dependencies": { + "iconv-lite": "0.6.3" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/whatwg-mimetype": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-4.0.0.tgz", + "integrity": "sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg==", + "license": "MIT", + "engines": { + "node": ">=18" + } + }, "node_modules/ws": { "version": "8.18.0", "resolved": "https://registry.npmjs.org/ws/-/ws-8.18.0.tgz", diff --git a/package.json b/package.json index 456ec7c4..c7bea428 100644 --- a/package.json +++ b/package.json @@ -24,6 +24,8 @@ "@ai-sdk/openai": "^1.1.9", "@mendable/firecrawl-js": "^1.16.0", "ai": "^4.1.17", + "axios": "^1.7.9", + "cheerio": "^1.0.0", "js-tiktoken": "^1.0.17", "lodash-es": "^4.17.21", "p-limit": "^6.2.0", diff --git a/src/deep-research.ts b/src/deep-research.ts index 1505c4af..0c1a032a 100644 --- a/src/deep-research.ts +++ b/src/deep-research.ts @@ -1,4 +1,5 @@ import FirecrawlApp, { SearchResponse } from '@mendable/firecrawl-js'; +import search from './duckduckgo-search'; import { generateObject } from 'ai'; import { compact } from 'lodash-es'; import pLimit from 'p-limit'; @@ -206,12 +207,24 @@ export async function deepResearch({ serpQueries.map(serpQuery => limit(async () => { try { - const result = await firecrawl.search(serpQuery.query, { + let result = await firecrawl.search(serpQuery.query, { timeout: 15000, limit: 5, scrapeOptions: { formats: ['markdown'] }, }); + // If the result is empty, use the DuckDuckGo search as a fallback + if (result.data.length === 0) { + log( + `No results found for ${serpQuery.query}, falling back to DuckDuckGo search`, + ); + result = await search(serpQuery.query, { + timeout: 15000, + limit: 5, + scrapeOptions: { formats: ['markdown'] }, + }); + } + // Collect URLs from this search const newUrls = compact(result.data.map(item => item.url)); const newBreadth = Math.ceil(breadth / 2); diff --git a/src/duckduckgo-search.ts b/src/duckduckgo-search.ts new file mode 100644 index 00000000..c13a4af3 --- /dev/null +++ b/src/duckduckgo-search.ts @@ -0,0 +1,81 @@ +import axios from 'axios'; +import * as cheerio from 'cheerio'; +import { SearchResponse, FirecrawlDocument } from '@mendable/firecrawl-js'; + +async function search(query: string, params: { + timeout?: number; + limit?: number; + scrapeOptions?: { formats: string[] }; +} = {}): Promise { + const { timeout = 15000, limit = 5 } = params; + const baseUrl = 'https://duckduckgo.com/html/'; + + try { + const response = await axios.get(baseUrl, { + params: { + q: query, + }, + headers: { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8', + }, + timeout, + }); + + const $ = cheerio.load(response.data); + const results: FirecrawlDocument[] = []; + + // Find all search results with class 'result' + $('.result').each((i, element) => { + if (i >= limit) return false; + + const $element = $(element); + const title = $element.find('.result__title').text().trim(); + const url = $element.find('.result__url').text().trim(); + const snippet = $element.find('.result__snippet').text().trim(); + + // Get the actual URL from the href attribute + const linkHref = $element.find('.result__a').attr('href'); + let actualUrl = ''; + if (linkHref) { + // Extract the actual URL from DuckDuckGo's redirect URL + const match = linkHref.match(/uddg=([^&]+)/); + if (match && match[1]) { + actualUrl = decodeURIComponent(match[1]); + } + } + + const document: FirecrawlDocument = { + url: actualUrl || url, + title, + markdown: `# ${title}\n\n${snippet}`, + actions: undefined as never + }; + + results.push(document); + }); + + return { + success: true, + data: results + }; + + } catch (error) { + console.error('Search error:', error); + if (axios.isAxiosError(error) && error.code === 'ECONNABORTED') { + return { + success: false, + data: [], + error: 'Timeout' + }; + } + + return { + success: false, + data: [], + error: error instanceof Error ? error.message : 'Unknown error occurred' + }; + } +} + +export default search; \ No newline at end of file