diff --git a/apps/docs/app/api/search/route.ts b/apps/docs/app/api/search/route.ts
index be205cd553..b777ae890f 100644
--- a/apps/docs/app/api/search/route.ts
+++ b/apps/docs/app/api/search/route.ts
@@ -1,16 +1,185 @@
-import { createFromSource } from 'fumadocs-core/search/server'
-import { source } from '@/lib/source'
-
-export const revalidate = 3600 // Revalidate every hour
-
-export const { GET } = createFromSource(source, {
-  localeMap: {
-    en: { language: 'english' },
-    es: { language: 'spanish' },
-    fr: { language: 'french' },
-    de: { language: 'german' },
-    // ja and zh are not supported by the stemmer library, so we'll skip language config for them
-    ja: {},
-    zh: {},
-  },
-})
+import { sql } from 'drizzle-orm'
+import { type NextRequest, NextResponse } from 'next/server'
+import { db, docsEmbeddings } from '@/lib/db'
+import { generateSearchEmbedding } from '@/lib/embeddings'
+
+export const runtime = 'nodejs'
+export const revalidate = 0
+
+const DEFAULT_LIMIT = 10
+const MAX_LIMIT = 50
+const SIMILARITY_THRESHOLD = 0.6
+
+/**
+ * PostgreSQL text search configuration per docs locale.
+ * A Map is used instead of an object literal so that a request-supplied locale
+ * such as `constructor` can never resolve to an inherited prototype member.
+ */
+const TS_CONFIG_BY_LOCALE = new Map<string, string>([
+  ['en', 'english'],
+  ['es', 'spanish'],
+  ['fr', 'french'],
+  ['de', 'german'],
+  ['ja', 'simple'], // PostgreSQL doesn't have Japanese support, use simple
+  ['zh', 'simple'], // PostgreSQL doesn't have Chinese support, use simple
+])
+
+/** Row shape returned by both the vector and the keyword query. */
+interface SearchRow {
+  chunkId: string
+  chunkText: string
+  sourceDocument: string
+  sourceLink: string
+  headerText: string
+  headerLevel: number
+  similarity: number
+  searchType: string
+}
+
+/**
+ * Parse the `limit` query parameter.
+ * Missing, non-numeric and non-positive values fall back to DEFAULT_LIMIT; the
+ * result is capped at MAX_LIMIT so it is always a valid SQL LIMIT operand.
+ */
+function parseLimit(raw: string | null): number {
+  const parsed = Number.parseInt(raw ?? '', 10)
+  if (!Number.isFinite(parsed) || parsed < 1) {
+    return DEFAULT_LIMIT
+  }
+  return Math.min(parsed, MAX_LIMIT)
+}
+
+/**
+ * Interleave two ranked lists (first[0], second[0], first[1], ...) keeping only
+ * the first occurrence of each chunkId.
+ */
+function interleaveUnique<T extends { chunkId: string }>(first: T[], second: T[]): T[] {
+  const seenIds = new Set<string>()
+  const merged: T[] = []
+
+  for (let i = 0; i < Math.max(first.length, second.length); i++) {
+    for (const candidate of [first[i], second[i]]) {
+      if (candidate !== undefined && !seenIds.has(candidate.chunkId)) {
+        seenIds.add(candidate.chunkId)
+        merged.push(candidate)
+      }
+    }
+  }
+
+  return merged
+}
+
+/**
+ * Embedding-based search, ordered by the pgvector `<=>` distance to the query.
+ * Any failure (embedding API or database) is logged and reported as an empty
+ * list so that the caller can still serve keyword matches.
+ */
+async function searchByVector(query: string, candidateLimit: number): Promise<SearchRow[]> {
+  try {
+    // Serialize once; the same literal is bound in SELECT, WHERE and ORDER BY.
+    const queryVector = JSON.stringify(await generateSearchEmbedding(query))
+
+    return await db
+      .select({
+        chunkId: docsEmbeddings.chunkId,
+        chunkText: docsEmbeddings.chunkText,
+        sourceDocument: docsEmbeddings.sourceDocument,
+        sourceLink: docsEmbeddings.sourceLink,
+        headerText: docsEmbeddings.headerText,
+        headerLevel: docsEmbeddings.headerLevel,
+        similarity: sql<number>`1 - (${docsEmbeddings.embedding} <=> ${queryVector}::vector)`,
+        searchType: sql<string>`'vector'`,
+      })
+      .from(docsEmbeddings)
+      .where(
+        sql`1 - (${docsEmbeddings.embedding} <=> ${queryVector}::vector) >= ${SIMILARITY_THRESHOLD}`
+      )
+      .orderBy(sql`${docsEmbeddings.embedding} <=> ${queryVector}::vector`)
+      .limit(candidateLimit)
+  } catch (error) {
+    console.error('Vector search failed, falling back to keyword search:', error)
+    return []
+  }
+}
+
+/** Full-text search over the precomputed tsvector column, ranked by ts_rank. */
+async function searchByKeyword(
+  query: string,
+  tsConfig: string,
+  candidateLimit: number
+): Promise<SearchRow[]> {
+  return db
+    .select({
+      chunkId: docsEmbeddings.chunkId,
+      chunkText: docsEmbeddings.chunkText,
+      sourceDocument: docsEmbeddings.sourceDocument,
+      sourceLink: docsEmbeddings.sourceLink,
+      headerText: docsEmbeddings.headerText,
+      headerLevel: docsEmbeddings.headerLevel,
+      similarity: sql<number>`ts_rank(${docsEmbeddings.chunkTextTsv}, plainto_tsquery(${tsConfig}, ${query}))`,
+      searchType: sql<string>`'keyword'`,
+    })
+    .from(docsEmbeddings)
+    .where(sql`${docsEmbeddings.chunkTextTsv} @@ plainto_tsquery(${tsConfig}, ${query})`)
+    .orderBy(
+      sql`ts_rank(${docsEmbeddings.chunkTextTsv}, plainto_tsquery(${tsConfig}, ${query})) DESC`
+    )
+    .limit(candidateLimit)
+}
+
+/**
+ * Hybrid search API endpoint
+ * - English: Vector embeddings + keyword search
+ * - Other languages: Keyword search only
+ *
+ * Query parameters: `query` (or `q`), `locale` (default `en`) and `limit`
+ * (default 10). Always responds with a JSON array; unexpected errors are logged
+ * and yield an empty array.
+ */
+export async function GET(request: NextRequest) {
+  try {
+    const searchParams = request.nextUrl.searchParams
+    const query = (searchParams.get('query') || searchParams.get('q') || '').trim()
+    const locale = searchParams.get('locale') || 'en'
+    const limit = parseLimit(searchParams.get('limit'))
+
+    if (query.length === 0) {
+      return NextResponse.json([])
+    }
+
+    // Fetch more candidates than requested; merging below drops duplicate chunks.
+    const candidateLimit = limit * 3
+    const tsConfig = TS_CONFIG_BY_LOCALE.get(locale) ?? 'simple'
+
+    // The two searches are independent, so run them concurrently.
+    const [vectorResults, keywordResults] = await Promise.all([
+      locale === 'en' ? searchByVector(query, candidateLimit) : Promise.resolve<SearchRow[]>([]),
+      searchByKeyword(query, tsConfig, candidateLimit),
+    ])
+
+    const searchResults = interleaveUnique(vectorResults, keywordResults)
+      .slice(0, limit)
+      .map((result) => {
+        // Strip only a trailing `.mdx` extension, not the first `.mdx` anywhere.
+        const documentPath = result.sourceDocument.replace(/\.mdx$/, '')
+        const breadcrumbs = documentPath
+          .split('/')
+          .filter((part) => part.length > 0)
+          .map((part) => part.charAt(0).toUpperCase() + part.slice(1))
+
+        return {
+          id: result.chunkId,
+          type: 'page' as const,
+          url: result.sourceLink,
+          content: result.headerText || documentPath,
+          breadcrumbs,
+        }
+      })
+
+    return NextResponse.json(searchResults)
+  } catch (error) {
+    console.error('Semantic search error:', error)
+
+    return NextResponse.json([])
+  }
+}
diff --git a/apps/docs/lib/db.ts b/apps/docs/lib/db.ts
new file mode 100644
index 0000000000..9ecca9431f
--- /dev/null
+++ b/apps/docs/lib/db.ts
@@ -0,0 +1,4 @@
+import { db } from '@sim/db'
+import { docsEmbeddings } from '@sim/db/schema'
+
+export { db, docsEmbeddings }
diff --git a/apps/docs/lib/embeddings.ts b/apps/docs/lib/embeddings.ts
new file mode 100644
index 0000000000..c41a3f1989
--- /dev/null
+++ b/apps/docs/lib/embeddings.ts
@@ -0,0 +1,53 @@
+const OPENAI_EMBEDDINGS_URL = 'https://api.openai.com/v1/embeddings'
+const EMBEDDING_MODEL = 'text-embedding-3-small'
+const REQUEST_TIMEOUT_MS = 10_000
+
+/**
+ * Generate embeddings for search queries using OpenAI API
+ *
+ * @param query - Search text to embed.
+ * @returns The embedding vector of the first entry in the API response.
+ * @throws If OPENAI_API_KEY is unset, the request fails or times out, or the
+ *   response does not contain a numeric embedding array.
+ */
+export async function generateSearchEmbedding(query: string): Promise<number[]> {
+  const apiKey = process.env.OPENAI_API_KEY
+
+  if (!apiKey) {
+    throw new Error('OPENAI_API_KEY environment variable is required')
+  }
+
+  const response = await fetch(OPENAI_EMBEDDINGS_URL, {
+    method: 'POST',
+    headers: {
+      Authorization: `Bearer ${apiKey}`,
+      'Content-Type': 'application/json',
+    },
+    body: JSON.stringify({
+      input: query,
+      model: EMBEDDING_MODEL,
+      encoding_format: 'float',
+    }),
+    // Search is interactive: abort instead of hanging on a stalled upstream call.
+    signal: AbortSignal.timeout(REQUEST_TIMEOUT_MS),
+  })
+
+  if (!response.ok) {
+    const errorText = await response.text()
+    throw new Error(`OpenAI API failed: ${response.status} ${response.statusText} - ${errorText}`)
+  }
+
+  const data = await response.json()
+
+  if (!data?.data || !Array.isArray(data.data) || data.data.length === 0) {
+    throw new Error('OpenAI API returned invalid response structure: missing or empty data array')
+  }
+
+  const embedding: unknown = data.data[0]?.embedding
+
+  if (!Array.isArray(embedding) || !embedding.every((value) => typeof value === 'number')) {
+    throw new Error('OpenAI API returned invalid response structure: missing or invalid embedding')
+  }
+
+  return embedding
+}
diff --git a/apps/docs/package.json b/apps/docs/package.json
index a589e671ed..59b2610630 100644
--- a/apps/docs/package.json
+++ b/apps/docs/package.json
@@ -11,16 +11,19 @@
     "type-check": "tsc --noEmit"
   },
   "dependencies": {
+    "@sim/db": "workspace:*",
     "@tabler/icons-react": "^3.31.0",
     "@vercel/og": "^0.6.5",
     "class-variance-authority": "^0.7.1",
     "clsx": "^2.1.1",
+    "drizzle-orm": "^0.44.5",
     "fumadocs-core": "16.2.3",
     "fumadocs-mdx": "14.1.0",
     "fumadocs-ui": "16.2.3",
     "lucide-react": "^0.511.0",
     "next": "16.1.0-canary.21",
     "next-themes": "^0.4.6",
+    "postgres": "^3.4.5",
     "react": "19.2.1",
     "react-dom": "19.2.1",
     "tailwind-merge": "^3.0.2"
diff --git a/bun.lock b/bun.lock
index 8b813ec712..d67baa9fee 100644
--- a/bun.lock
+++ b/bun.lock
@@ -44,16 +44,19 @@
       "name": "docs",
       "version": "0.0.0",
       "dependencies": {
+        "@sim/db": "workspace:*",
         "@tabler/icons-react": "^3.31.0",
         "@vercel/og": "^0.6.5",
         "class-variance-authority": "^0.7.1",
         "clsx": "^2.1.1",
+        "drizzle-orm": "^0.44.5",
         "fumadocs-core": "16.2.3",
         "fumadocs-mdx": "14.1.0",
         "fumadocs-ui": "16.2.3",
         "lucide-react": "^0.511.0",
         "next": "16.1.0-canary.21",
         "next-themes": "^0.4.6",
+        "postgres": "^3.4.5",
         "react": "19.2.1",
         "react-dom": "19.2.1",
         "tailwind-merge": "^3.0.2",