Skip to content

Commit 49d41c4

Browse files
committed
fix(reference): replace hand-rolled HTML stripping with turndown
1 parent: 95aba16 · commit: 49d41c4

File tree

3 files changed

+30
-23
lines changed

3 files changed

+30
-23
lines changed

pnpm-lock.yaml

Lines changed: 23 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

references/ai-chat/package.json

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,7 @@
2424
"react": "^19.0.0",
2525
"react-dom": "^19.0.0",
2626
"streamdown": "^2.3.0",
27+
"turndown": "^7.2.2",
2728
"zod": "3.25.76"
2829
},
2930
"devDependencies": {
@@ -32,6 +33,7 @@
3233
"@types/node": "^22",
3334
"@types/react": "^19",
3435
"@types/react-dom": "^19",
36+
"@types/turndown": "^5.0.6",
3537
"tailwindcss": "^4",
3638
"prisma": "^7.4.2",
3739
"trigger.dev": "workspace:*",

references/ai-chat/src/trigger/chat.ts

Lines changed: 5 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -12,8 +12,11 @@ import { PrismaClient } from "../../lib/generated/prisma/client";
1212
const adapter = new PrismaPg({ connectionString: process.env.DATABASE_URL! });
1313
const prisma = new PrismaClient({ adapter });
1414

15+
import TurndownService from "turndown";
1516
import { DEFAULT_MODEL, REASONING_MODELS } from "@/lib/models";
1617

18+
const turndown = new TurndownService();
19+
1720
const MODELS: Record<string, () => LanguageModel> = {
1821
"gpt-4o-mini": () => openai("gpt-4o-mini"),
1922
"gpt-4o": () => openai("gpt-4o"),
@@ -98,20 +101,8 @@ const webFetch = tool({
98101
let text = await response.text();
99102
const contentType = response.headers.get("content-type") ?? "";
100103

101-
// Strip HTML to plain text for readability
102104
if (contentType.includes("html")) {
103-
text = text
104-
.replace(/<script[\s\S]*?<\/script>/gi, "")
105-
.replace(/<style[\s\S]*?<\/style>/gi, "")
106-
.replace(/<[^>]+>/g, " ")
107-
.replace(/&nbsp;/g, " ")
108-
.replace(/&amp;/g, "&")
109-
.replace(/&lt;/g, "<")
110-
.replace(/&gt;/g, ">")
111-
.replace(/&quot;/g, '"')
112-
.replace(/&#39;/g, "'")
113-
.replace(/\s+/g, " ")
114-
.trim();
105+
text = turndown.turndown(text);
115106
}
116107

117108
return {
@@ -204,16 +195,7 @@ export const deepResearch = schemaTask({
204195
const contentType = response.headers.get("content-type") ?? "";
205196

206197
if (contentType.includes("html")) {
207-
text = text
208-
.replace(/<script[\s\S]*?<\/script>/gi, "")
209-
.replace(/<style[\s\S]*?<\/style>/gi, "")
210-
.replace(/<[^>]+>/g, " ")
211-
.replace(/&nbsp;/g, " ")
212-
.replace(/&amp;/g, "&")
213-
.replace(/&lt;/g, "<")
214-
.replace(/&gt;/g, ">")
215-
.replace(/\s+/g, " ")
216-
.trim();
198+
text = turndown.turndown(text);
217199
}
218200

219201
results.push({

0 commit comments

Comments
 (0)