Skip to content

Commit 2812d2b

Browse files
authored
fix(community): validate redirects in RecursiveUrlLoader (#10116)
1 parent 0050c91 commit 2812d2b

File tree

3 files changed

+134
-6
lines changed

3 files changed

+134
-6
lines changed

.changeset/ssrf-redirect-bypass.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"@langchain/community": patch
3+
---
4+
5+
Validate redirects in RecursiveUrlLoader to prevent SSRF bypasses.

libs/langchain-community/src/document_loaders/tests/recursive_url.test.ts

Lines changed: 100 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,105 @@
1-
import { test, describe, expect } from "@jest/globals";
1+
import {
2+
test,
3+
describe,
4+
expect,
5+
jest,
6+
beforeEach,
7+
afterEach,
8+
} from "@jest/globals";
29
import { RecursiveUrlLoader } from "../web/recursive_url.js";
310

11+
const _originalFetch = globalThis.fetch;

/**
 * Redirect handling tests for RecursiveUrlLoader.
 *
 * `globalThis.fetch` is replaced with a jest mock in every test and restored
 * afterwards, so no real network request is issued by the mocked calls.
 */
describe("RecursiveUrlLoader - Redirect SSRF Protection", () => {
  afterEach(() => {
    globalThis.fetch = _originalFetch;
  });

  /** Normalizes the first argument of a recorded fetch call to a URL string. */
  const toUrlString = (input: Parameters<typeof fetch>[0]): string => {
    if (typeof input === "string") return input;
    if (input instanceof URL) return input.href;
    return input.url;
  };

  /**
   * Loads https://example.com/ while every request is answered with a 302
   * pointing at `location`.
   *
   * @returns the loaded documents and the list of URLs that were requested.
   */
  const loadWithRedirectTo = async (location: string) => {
    // A fresh Response per call so that no Response object is shared between
    // redirect hops.
    const fetchMock = jest.fn<typeof fetch>().mockImplementation(
      async () =>
        new Response(null, {
          status: 302,
          headers: { Location: location },
        })
    );
    globalThis.fetch = fetchMock;

    const loader = new RecursiveUrlLoader("https://example.com/", {
      maxDepth: 0,
    });
    const docs = await loader.load();
    const requestedUrls = fetchMock.mock.calls.map(([input]) =>
      toUrlString(input)
    );
    return { docs, requestedUrls };
  };

  /**
   * Asserts that a redirect to `location` yields no documents AND that the
   * redirect target itself was never requested. The second assertion matters:
   * because the mock redirects forever, an empty result alone would also be
   * produced by simply running into the redirect limit.
   */
  const expectRedirectBlocked = async (location: string) => {
    const { docs, requestedUrls } = await loadWithRedirectTo(location);
    expect(docs).toHaveLength(0);
    expect(requestedUrls).not.toContain(location);
  };

  test("blocks redirects to private IPs (localhost)", async () => {
    await expectRedirectBlocked("http://127.0.0.1/admin");
  });

  test("blocks redirects to cloud metadata IPs", async () => {
    await expectRedirectBlocked("http://169.254.169.254/latest/meta-data/");
  });

  test("blocks redirects to private network ranges", async () => {
    await expectRedirectBlocked("http://192.168.1.1/internal");
  });

  test("follows safe redirects", async () => {
    const fetchMock = jest
      .fn<typeof fetch>()
      .mockResolvedValueOnce(
        new Response(null, {
          status: 301,
          headers: { Location: "https://www.example.com/" },
        })
      )
      .mockResolvedValueOnce(
        new Response("<html><body>Hello</body></html>", {
          status: 200,
          headers: { "Content-Type": "text/html" },
        })
      );
    globalThis.fetch = fetchMock;

    const loader = new RecursiveUrlLoader("https://example.com/", {
      maxDepth: 0,
    });
    const docs = await loader.load();

    expect(docs).toHaveLength(1);
    expect(docs[0].pageContent).toContain("Hello");
    // The second request must target the redirect location and must keep
    // automatic redirect following disabled so each hop can be validated.
    expect(fetchMock).toHaveBeenNthCalledWith(
      2,
      "https://www.example.com/",
      expect.objectContaining({ redirect: "manual" })
    );
  });

  // load() swallows the "Too many redirects" error and resolves with no
  // documents, so the observable outcome is an empty result, not a throw.
  test("returns no documents on too many redirects", async () => {
    const { docs, requestedUrls } = await loadWithRedirectTo(
      "https://example.com/loop"
    );
    expect(docs).toHaveLength(0);
    // Unlike the blocked cases above, the (safe) redirect target is followed.
    expect(requestedUrls).toContain("https://example.com/loop");
  });
});
102+
4103
describe("RecursiveUrlLoader - URL Origin Validation", () => {
5104
describe("preventOutside origin checking", () => {
6105
test("allows URLs with same origin", async () => {

libs/langchain-community/src/document_loaders/web/recursive_url.ts

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@ import {
1010
const virtualConsole = new VirtualConsole();
1111
virtualConsole.on("error", () => {});
1212

13+
/**
 * Maximum number of redirects followed for a single request. The redirect
 * loop in this file iterates `0..MAX_REDIRECTS` inclusive (at most
 * `MAX_REDIRECTS + 1` fetches) and then fails with "Too many redirects".
 */
const MAX_REDIRECTS = 10;
14+
/**
 * HTTP status codes that the manual redirect loop in this file treats as a
 * redirect: the `Location` header is read and the loop continues with the
 * resolved URL. Any other status is returned to the caller as-is.
 */
const REDIRECT_CODES = new Set([301, 302, 303, 307, 308]);
15+
1316
export interface RecursiveUrlLoaderOptions {
1417
excludeDirs?: string[];
1518
extractor?: (text: string) => string;
@@ -59,9 +62,32 @@ export class RecursiveUrlLoader
5962
options: { timeout: number } & RequestInit
6063
): Promise<Response> {
6164
const { timeout, ...rest } = options;
62-
return this.caller.call(() =>
63-
fetch(resource, { ...rest, signal: AbortSignal.timeout(timeout) })
64-
);
65+
let currentUrl = resource;
66+
67+
// Follow redirects by hand so that every hop (the initial URL and each
// redirect target) is validated before it is requested. At most
// MAX_REDIRECTS redirects are followed, i.e. MAX_REDIRECTS + 1 requests.
for (let hop = 0; hop <= MAX_REDIRECTS; hop++) {
  // The validation result must be awaited: if validateSafeUrl returns a
  // promise, an un-awaited rejection would neither stop the request below
  // nor reach the caller's try/catch.
  // NOTE(review): assumes validateSafeUrl may be async (a former call site
  // in this file awaited it); awaiting is harmless if it is synchronous.
  await validateSafeUrl(currentUrl, { allowHttp: true });

  const response = await this.caller.call(() =>
    fetch(currentUrl, {
      ...rest,
      // Placed after `...rest` so callers cannot re-enable automatic
      // redirect following, which would skip per-hop validation.
      redirect: "manual",
      signal: AbortSignal.timeout(timeout),
    })
  );

  if (!REDIRECT_CODES.has(response.status)) {
    return response;
  }

  const location = response.headers.get("location");
  if (!location) {
    throw new Error("Redirect response missing Location header");
  }
  // Location may be relative; resolve it against the URL that redirected.
  currentUrl = new URL(location, currentUrl).href;
}
89+
90+
throw new Error(`Too many redirects (max ${MAX_REDIRECTS})`);
6591
}
6692

6793
private getChildLinks(html: string, baseUrl: string): Array<string> {
@@ -143,7 +169,6 @@ export class RecursiveUrlLoader
143169
private async getUrlAsDoc(url: string): Promise<Document | null> {
144170
let res;
145171
try {
146-
validateSafeUrl(url, { allowHttp: true });
147172
res = await this.fetchWithTimeout(url, { timeout: this.timeout });
148173
res = await res.text();
149174
} catch {
@@ -171,7 +196,6 @@ export class RecursiveUrlLoader
171196

172197
let res;
173198
try {
174-
await validateSafeUrl(url, { allowHttp: true });
175199
res = await this.fetchWithTimeout(url, { timeout: this.timeout });
176200
res = await res.text();
177201
} catch {

0 commit comments

Comments
 (0)