diff --git a/Makefile.cbm b/Makefile.cbm index b3bb4a8c..de52aa18 100644 --- a/Makefile.cbm +++ b/Makefile.cbm @@ -139,7 +139,7 @@ PREPROCESSOR_SRC = $(CBM_DIR)/preprocessor.cpp SQLITE_WRITER_SRC = $(CBM_DIR)/sqlite_writer.c # Store module (new) -STORE_SRCS = src/store/store.c +STORE_SRCS = src/store/store.c src/store/cross_repo.c # Cypher module (new) CYPHER_SRCS = src/cypher/cypher.c @@ -180,7 +180,8 @@ PIPELINE_SRCS = \ src/pipeline/pass_compile_commands.c \ src/pipeline/pass_infrascan.c \ src/pipeline/pass_k8s.c \ - src/pipeline/httplink.c + src/pipeline/httplink.c \ + src/pipeline/embedding.c # Traces module (new) TRACES_SRCS = src/traces/traces.c @@ -217,8 +218,8 @@ MIMALLOC_CFLAGS_TEST = -std=c11 -g -O1 -w \ # sqlite3 (vendored amalgamation — compiled ourselves for ASan instrumentation) SQLITE3_SRC = vendored/sqlite3/sqlite3.c -SQLITE3_CFLAGS = -std=c11 -O2 -w -DSQLITE_DQS=0 -DSQLITE_THREADSAFE=1 -SQLITE3_CFLAGS_TEST = -std=c11 -g -O1 -w -DSQLITE_DQS=0 -DSQLITE_THREADSAFE=1 +SQLITE3_CFLAGS = -std=c11 -O2 -w -DSQLITE_DQS=0 -DSQLITE_THREADSAFE=1 -DSQLITE_ENABLE_FTS5 +SQLITE3_CFLAGS_TEST = -std=c11 -g -O1 -w -DSQLITE_DQS=0 -DSQLITE_THREADSAFE=1 -DSQLITE_ENABLE_FTS5 # TRE regex (vendored, Windows only — POSIX uses system ) TRE_SRC = vendored/tre/tre_all.c diff --git a/internal/cbm/extract_calls.c b/internal/cbm/extract_calls.c index 87bfd005..d9c38c9c 100644 --- a/internal/cbm/extract_calls.c +++ b/internal/cbm/extract_calls.c @@ -344,4 +344,101 @@ void handle_calls(CBMExtractCtx *ctx, TSNode node, const CBMLangSpec *spec, Walk } } } + + // C# delegate/event patterns + if (ctx->language == CBM_LANG_CSHARP) { + // Fix 1: event += MethodName (bare method reference subscription) + // Creates a CALLS edge from the subscribing method to the handler method. + // e.g. 
_socket.OnConnected += SocketOnConnected; + if (strcmp(kind, "assignment_expression") == 0) { + TSNode op = ts_node_child_by_field_name(node, "operator", 8); + if (!ts_node_is_null(op)) { + char *op_text = cbm_node_text(ctx->arena, op, ctx->source); + if (op_text && strcmp(op_text, "+=") == 0) { + TSNode right = ts_node_child_by_field_name(node, "right", 5); + if (!ts_node_is_null(right)) { + const char *rk = ts_node_type(right); + if (strcmp(rk, "identifier") == 0 || + strcmp(rk, "member_access_expression") == 0) { + char *callee = cbm_node_text(ctx->arena, right, ctx->source); + if (callee && callee[0] && !cbm_is_keyword(callee, ctx->language)) { + CBMCall call; + call.callee_name = callee; + call.enclosing_func_qn = state->enclosing_func_qn; + cbm_calls_push(&ctx->result->calls, ctx->arena, call); + } + } + } + } + } + } + + // Fix 2: delegate?.Invoke() → resolve to receiver (delegate) name. + // C# delegates are invoked via .Invoke() or ?.Invoke() — the callee name + // "Invoke" resolves to nothing. Instead, extract the receiver (delegate property) + // name, which is more likely to match a registered symbol. + // e.g. OnConnected?.Invoke(this, e) → creates CALLS edge to "OnConnected" + // + // C# tree-sitter AST for "OnConnected?.Invoke(this, e)": + // invocation_expression + // function: conditional_access_expression + // expression: identifier "OnConnected" ← receiver + // member_binding_expression + // name: identifier "Invoke" ← method + // arguments: argument_list + if (cbm_kind_in_set(node, spec->call_node_types)) { + TSNode func_node2 = ts_node_child_by_field_name(node, "function", 8); + if (!ts_node_is_null(func_node2)) { + const char *fk2 = ts_node_type(func_node2); + bool is_invoke = false; + TSNode receiver2 = {0}; // NOLINT + + if (strcmp(fk2, "conditional_access_expression") == 0) { + // ?. 
access: look for member_binding_expression child + uint32_t ncc = ts_node_named_child_count(func_node2); + for (uint32_t ci = 0; ci < ncc; ci++) { + TSNode child = ts_node_named_child(func_node2, ci); + const char *ck = ts_node_type(child); + if (strcmp(ck, "member_binding_expression") == 0) { + TSNode name_n = ts_node_child_by_field_name(child, "name", 4); + if (!ts_node_is_null(name_n)) { + char *nm = cbm_node_text(ctx->arena, name_n, ctx->source); + if (nm && strcmp(nm, "Invoke") == 0) { + is_invoke = true; + } + } + } + if (strcmp(ck, "identifier") == 0 || + strcmp(ck, "member_access_expression") == 0) { + receiver2 = child; + } + } + } else if (strcmp(fk2, "member_access_expression") == 0) { + // Dot access: obj.Invoke(...) + TSNode name_n = ts_node_child_by_field_name(func_node2, "name", 4); + if (!ts_node_is_null(name_n)) { + char *nm = cbm_node_text(ctx->arena, name_n, ctx->source); + if (nm && strcmp(nm, "Invoke") == 0) { + is_invoke = true; + TSNode expr = ts_node_child_by_field_name(func_node2, + "expression", 10); + if (!ts_node_is_null(expr)) { + receiver2 = expr; + } + } + } + } + + if (is_invoke && !ts_node_is_null(receiver2)) { + char *recv = cbm_node_text(ctx->arena, receiver2, ctx->source); + if (recv && recv[0] && !cbm_is_keyword(recv, ctx->language)) { + CBMCall call; + call.callee_name = recv; + call.enclosing_func_qn = state->enclosing_func_qn; + cbm_calls_push(&ctx->result->calls, ctx->arena, call); + } + } + } + } + } } diff --git a/internal/cbm/extract_defs.c b/internal/cbm/extract_defs.c index 754a98f7..51c43978 100644 --- a/internal/cbm/extract_defs.c +++ b/internal/cbm/extract_defs.c @@ -5,6 +5,7 @@ #include "tree_sitter/api.h" // TSNode, ts_node_* #include // uint32_t #include +#include /* strcasecmp */ #include // Field name lengths for ts_node_child_by_field_name() calls. 
@@ -565,10 +566,58 @@ static const char **extract_base_classes(CBMArena *a, TSNode node, const char *s } } } - // C/C++ specific: handle base_class_clause (contains access specifiers + type names) + // C# specific: handle base_list node (contains base types separated by commas) { uint32_t count = ts_node_child_count(node); for (uint32_t i = 0; i < count; i++) { + TSNode child = ts_node_child(node, i); + if (strcmp(ts_node_type(child), "base_list") == 0) { + const char *bases[16]; + int base_count = 0; + uint32_t bnc = ts_node_named_child_count(child); + for (uint32_t bi = 0; bi < bnc && base_count < MAX_BASES_MINUS_1; bi++) { + TSNode bc = ts_node_named_child(child, bi); + const char *bk = ts_node_type(bc); + // C# base types can be: identifier, generic_name, qualified_name, + // or wrapped in a simple_base_type / primary_constructor_base_type + char *text = NULL; + if (strcmp(bk, "identifier") == 0 || strcmp(bk, "generic_name") == 0 || + strcmp(bk, "qualified_name") == 0) { + text = cbm_node_text(a, bc, source); + } else { + // For wrapper nodes (simple_base_type etc.), extract the first + // named child which should be the type identifier + TSNode inner = ts_node_named_child(bc, 0); + if (!ts_node_is_null(inner)) { + text = cbm_node_text(a, inner, source); + } + } + if (text && text[0]) { + // Strip generic args for resolution: "List" → "List" + char *angle = strchr(text, '<'); + if (angle) *angle = '\0'; + bases[base_count++] = text; + } + } + if (base_count > 0) { + const char **result = + (const char **)cbm_arena_alloc(a, (base_count + 1) * sizeof(const char *)); + if (result) { + for (int j = 0; j < base_count; j++) { + result[j] = bases[j]; + } + result[base_count] = NULL; + return result; + } + } + } + } + } + + // C/C++ specific: handle base_class_clause (contains access specifiers + type names) + { + uint32_t count2 = ts_node_child_count(node); + for (uint32_t i = 0; i < count2; i++) { TSNode child = ts_node_child(node, i); if (strcmp(ts_node_type(child), 
"base_class_clause") == 0) { // Extract type identifiers from base_class_clause, skipping access specifiers @@ -1136,11 +1185,82 @@ static void extract_func_def(CBMExtractCtx *ctx, TSNode node, const CBMLangSpec } } - // main is always an entry point - if (strcmp(name, "main") == 0) { + // main/Main is always an entry point (case-insensitive for C#/Java) + if (strcasecmp(name, "main") == 0) { def.is_entry_point = true; } + // C/C++ entry point detection: WinMain, DllMain, GTest, MFC + if ((ctx->language == CBM_LANG_C || ctx->language == CBM_LANG_CPP) && !def.is_entry_point) { + if (strcmp(name, "WinMain") == 0 || strcmp(name, "wWinMain") == 0 || + strcmp(name, "DllMain") == 0 || strcmp(name, "wmain") == 0 || + strcmp(name, "_tmain") == 0 || strcmp(name, "InitInstance") == 0 || + strcmp(name, "OnInitDialog") == 0) { + def.is_entry_point = true; + } + } + + // C# entry point detection: Windows Service lifecycle, ASP.NET controllers + if (ctx->language == CBM_LANG_CSHARP && !def.is_entry_point) { + // Windows Service lifecycle entry points + if (strcmp(name, "OnStart") == 0 || strcmp(name, "OnStartImpl") == 0 || + strcmp(name, "OnStop") == 0 || strcmp(name, "OnStopImpl") == 0 || + strcmp(name, "Run") == 0 || strcmp(name, "Execute") == 0 || + strcmp(name, "Configure") == 0 || strcmp(name, "ConfigureServices") == 0) { + def.is_entry_point = true; + } + // ASP.NET controller decorators: [HttpGet], [HttpPost], [Route], etc. 
+ if (!def.is_entry_point && def.decorators) { + for (const char **d = def.decorators; *d; d++) { + if (strstr(*d, "HttpGet") || strstr(*d, "HttpPost") || + strstr(*d, "HttpPut") || strstr(*d, "HttpDelete") || + strstr(*d, "HttpPatch") || strstr(*d, "Route") || + strstr(*d, "ApiController") || strstr(*d, "Authorize")) { + def.is_entry_point = true; + break; + } + } + } + // Test entry points: [TestMethod], [Fact], [Test], [SetUp] + if (!def.is_entry_point && def.decorators) { + for (const char **d = def.decorators; *d; d++) { + if (strstr(*d, "TestMethod") || strstr(*d, "Fact") || + strstr(*d, "Test") || strstr(*d, "SetUp") || + strstr(*d, "TestInitialize")) { + def.is_entry_point = true; + break; + } + } + } + } + + // Java entry point detection: Spring Boot, Vert.x, JAX-RS, JUnit + if (ctx->language == CBM_LANG_JAVA && !def.is_entry_point) { + // Vert.x lifecycle and common server patterns + if (strcmp(name, "start") == 0 || strcmp(name, "configure") == 0 || + strcmp(name, "init") == 0 || strcmp(name, "run") == 0 || + strcmp(name, "handle") == 0) { + def.is_entry_point = true; + } + // Spring/JAX-RS/JUnit decorators + if (!def.is_entry_point && def.decorators) { + for (const char **d = def.decorators; *d; d++) { + if (strstr(*d, "RequestMapping") || strstr(*d, "GetMapping") || + strstr(*d, "PostMapping") || strstr(*d, "PutMapping") || + strstr(*d, "DeleteMapping") || strstr(*d, "PatchMapping") || + strstr(*d, "Endpoint") || strstr(*d, "EventHandler") || + strstr(*d, "Scheduled") || strstr(*d, "Bean") || + strstr(*d, "Override") || strstr(*d, "Test") || + strstr(*d, "GET") || strstr(*d, "POST") || + strstr(*d, "PUT") || strstr(*d, "DELETE") || + strstr(*d, "Path") || strstr(*d, "Consumes")) { + def.is_entry_point = true; + break; + } + } + } + } + cbm_defs_push(&ctx->result->defs, a, def); } @@ -1610,6 +1730,68 @@ static void push_method_def(CBMExtractCtx *ctx, TSNode child, const char *class_ def.complexity = cbm_count_branching(child, 
spec->branching_node_types); } + // Entry point detection for class methods (same rules as extract_func_def) + // Case-insensitive "main" check + if (strcasecmp(name, "main") == 0) { + def.is_entry_point = true; + } + + // C/C++ entry point detection: WinMain, DllMain, GTest, MFC + if ((ctx->language == CBM_LANG_C || ctx->language == CBM_LANG_CPP) && !def.is_entry_point) { + if (strcmp(name, "WinMain") == 0 || strcmp(name, "wWinMain") == 0 || + strcmp(name, "DllMain") == 0 || strcmp(name, "wmain") == 0 || + strcmp(name, "_tmain") == 0 || strcmp(name, "InitInstance") == 0 || + strcmp(name, "OnInitDialog") == 0) { + def.is_entry_point = true; + } + } + + // C# entry point detection: Windows Service lifecycle, ASP.NET controllers + if (ctx->language == CBM_LANG_CSHARP && !def.is_entry_point) { + if (strcmp(name, "OnStart") == 0 || strcmp(name, "OnStartImpl") == 0 || + strcmp(name, "OnStop") == 0 || strcmp(name, "OnStopImpl") == 0 || + strcmp(name, "Run") == 0 || strcmp(name, "Execute") == 0 || + strcmp(name, "Configure") == 0 || strcmp(name, "ConfigureServices") == 0) { + def.is_entry_point = true; + } + if (!def.is_entry_point && def.decorators) { + for (const char **d = def.decorators; *d; d++) { + if (strstr(*d, "HttpGet") || strstr(*d, "HttpPost") || + strstr(*d, "HttpPut") || strstr(*d, "HttpDelete") || + strstr(*d, "HttpPatch") || strstr(*d, "Route") || + strstr(*d, "ApiController") || strstr(*d, "Authorize")) { + def.is_entry_point = true; + break; + } + } + } + } + + // Java entry point detection + if (ctx->language == CBM_LANG_JAVA && !def.is_entry_point) { + if (strcmp(name, "start") == 0 || strcmp(name, "configure") == 0 || + strcmp(name, "init") == 0 || strcmp(name, "run") == 0 || + strcmp(name, "handle") == 0) { + def.is_entry_point = true; + } + if (!def.is_entry_point && def.decorators) { + for (const char **d = def.decorators; *d; d++) { + if (strstr(*d, "RequestMapping") || strstr(*d, "GetMapping") || + strstr(*d, "PostMapping") || strstr(*d, 
"PutMapping") || + strstr(*d, "DeleteMapping") || strstr(*d, "PatchMapping") || + strstr(*d, "Endpoint") || strstr(*d, "EventHandler") || + strstr(*d, "Scheduled") || strstr(*d, "Bean") || + strstr(*d, "Override") || strstr(*d, "Test") || + strstr(*d, "GET") || strstr(*d, "POST") || + strstr(*d, "PUT") || strstr(*d, "DELETE") || + strstr(*d, "Path") || strstr(*d, "Consumes")) { + def.is_entry_point = true; + break; + } + } + } + } + cbm_defs_push(&ctx->result->defs, a, def); } @@ -1648,6 +1830,61 @@ static void extract_class_methods(CBMExtractCtx *ctx, TSNode class_node, const c continue; } + /* C#/Java property extraction: property_declaration, auto_property_declaration. + * Creates a "Property" node with parent_class set for DEFINES_METHOD edge. */ + const char *child_type = ts_node_type(child); + if (child_type && + (strcmp(child_type, "property_declaration") == 0 || + strcmp(child_type, "indexer_declaration") == 0 || + strcmp(child_type, "event_declaration") == 0 || + strcmp(child_type, "event_field_declaration") == 0)) { + TSNode name_node = ts_node_child_by_field_name(child, "name", 4); + if (ts_node_is_null(name_node)) { + /* indexer_declaration doesn't have a 'name' field, use "this" */ + if (strcmp(child_type, "indexer_declaration") == 0) { + CBMDefinition pdef; + memset(&pdef, 0, sizeof(pdef)); + pdef.name = cbm_arena_strdup(ctx->arena, "this[]"); + pdef.qualified_name = cbm_arena_sprintf(ctx->arena, "%s.this[]", class_qn); + pdef.label = "Property"; + pdef.file_path = ctx->rel_path; + pdef.parent_class = class_qn; + pdef.start_line = ts_node_start_point(child).row + 1; + pdef.end_line = ts_node_end_point(child).row + 1; + pdef.lines = (int)(pdef.end_line - pdef.start_line + 1); + TSNode type_node = ts_node_child_by_field_name(child, "type", 4); + if (!ts_node_is_null(type_node)) { + pdef.return_type = cbm_node_text(ctx->arena, type_node, ctx->source); + } + cbm_defs_push(&ctx->result->defs, ctx->arena, pdef); + } + continue; + } + char *pname = 
cbm_node_text(ctx->arena, name_node, ctx->source); + if (pname && pname[0]) { + CBMDefinition pdef; + memset(&pdef, 0, sizeof(pdef)); + pdef.name = pname; + pdef.qualified_name = cbm_arena_sprintf(ctx->arena, "%s.%s", class_qn, pname); + pdef.label = "Property"; + pdef.file_path = ctx->rel_path; + pdef.parent_class = class_qn; + pdef.start_line = ts_node_start_point(child).row + 1; + pdef.end_line = ts_node_end_point(child).row + 1; + pdef.lines = (int)(pdef.end_line - pdef.start_line + 1); + pdef.is_exported = cbm_is_exported(pname, ctx->language); + /* Extract type */ + TSNode type_node = ts_node_child_by_field_name(child, "type", 4); + if (!ts_node_is_null(type_node)) { + pdef.return_type = cbm_node_text(ctx->arena, type_node, ctx->source); + } + pdef.decorators = extract_decorators(ctx->arena, child, ctx->source, + ctx->language, spec); + cbm_defs_push(&ctx->result->defs, ctx->arena, pdef); + } + continue; + } + if (!cbm_kind_in_set(child, spec->function_node_types)) { continue; } diff --git a/internal/cbm/extract_imports.c b/internal/cbm/extract_imports.c index 87f8021b..103f4e06 100644 --- a/internal/cbm/extract_imports.c +++ b/internal/cbm/extract_imports.c @@ -340,6 +340,88 @@ static void walk_es_imports(CBMExtractCtx *ctx, TSNode node) { return; } + /* CommonJS: const X = require("Y"), const { A, B } = require("Y") + * Tree-sitter structure: variable_declarator → name + value(call_expression) + * We detect require() calls inside lexical_declaration/variable_declaration. 
*/ + if (strcmp(kind, "variable_declarator") == 0 || strcmp(kind, "assignment_expression") == 0) { + TSNode value = ts_node_child_by_field_name(node, "value", 5); + if (ts_node_is_null(value)) { + value = ts_node_child_by_field_name(node, "right", 5); + } + if (!ts_node_is_null(value) && strcmp(ts_node_type(value), "call_expression") == 0) { + TSNode func = ts_node_child_by_field_name(value, "function", 8); + if (!ts_node_is_null(func) && strcmp(ts_node_type(func), "identifier") == 0) { + char *fname = cbm_node_text(a, func, ctx->source); + if (fname && strcmp(fname, "require") == 0) { + /* Extract the require() argument */ + TSNode args = ts_node_child_by_field_name(value, "arguments", 9); + if (!ts_node_is_null(args) && ts_node_named_child_count(args) > 0) { + TSNode arg0 = ts_node_named_child(args, 0); + const char *at = ts_node_type(arg0); + if (strcmp(at, "string") == 0 || strcmp(at, "string_literal") == 0 || + strcmp(at, "template_string") == 0) { + char *path = strip_quotes(a, cbm_node_text(a, arg0, ctx->source)); + if (path && path[0]) { + /* Get the variable name(s) being assigned */ + TSNode lhs = ts_node_child_by_field_name(node, "name", 4); + if (ts_node_is_null(lhs)) { + lhs = ts_node_child_by_field_name(node, "left", 4); + } + if (!ts_node_is_null(lhs)) { + const char *lk = ts_node_type(lhs); + if (strcmp(lk, "identifier") == 0) { + char *name = cbm_node_text(a, lhs, ctx->source); + CBMImport imp = {.local_name = name, .module_path = path}; + cbm_imports_push(&ctx->result->imports, a, imp); + } else if (strcmp(lk, "object_pattern") == 0) { + /* Destructured: const { A, B } = require("Y") */ + uint32_t nc = ts_node_named_child_count(lhs); + for (uint32_t k = 0; k < nc; k++) { + TSNode prop = ts_node_named_child(lhs, k); + const char *pk = ts_node_type(prop); + if (strcmp(pk, "shorthand_property_identifier_pattern") == 0 || + strcmp(pk, "shorthand_property_identifier") == 0 || + strcmp(pk, "identifier") == 0) { + char *name = cbm_node_text(a, prop, 
ctx->source); + CBMImport imp = {.local_name = name, .module_path = path}; + cbm_imports_push(&ctx->result->imports, a, imp); + } else if (strcmp(pk, "pair_pattern") == 0 || + strcmp(pk, "pair") == 0) { + TSNode val = ts_node_child_by_field_name(prop, "value", 5); + if (!ts_node_is_null(val)) { + char *name = cbm_node_text(a, val, ctx->source); + CBMImport imp = {.local_name = name, .module_path = path}; + cbm_imports_push(&ctx->result->imports, a, imp); + } + } + } + } else if (strcmp(lk, "array_pattern") == 0) { + /* Array destructured: const [A, B] = require("Y") */ + uint32_t nc = ts_node_named_child_count(lhs); + for (uint32_t k = 0; k < nc; k++) { + TSNode elem = ts_node_named_child(lhs, k); + if (strcmp(ts_node_type(elem), "identifier") == 0) { + char *name = cbm_node_text(a, elem, ctx->source); + CBMImport imp = {.local_name = name, .module_path = path}; + cbm_imports_push(&ctx->result->imports, a, imp); + } + } + } + } else { + /* Fallback: use last path segment as name */ + CBMImport imp = {.local_name = path_last(a, path), + .module_path = path}; + cbm_imports_push(&ctx->result->imports, a, imp); + } + } + } + } + } + } + } + /* Don't return — let it recurse to catch nested requires */ + } + recurse:; uint32_t count = ts_node_child_count(node); for (uint32_t i = 0; i < count; i++) { diff --git a/internal/cbm/extract_unified.c b/internal/cbm/extract_unified.c index f4cfb3cd..69029c29 100644 --- a/internal/cbm/extract_unified.c +++ b/internal/cbm/extract_unified.c @@ -79,6 +79,65 @@ static const char *compute_func_qn(CBMExtractCtx *ctx, TSNode node, const CBMLan } } + /* C/C++/CUDA/GLSL: function_definition has no "name" field. + * Name is buried in declarator chain: function_definition → declarator → + * function_declarator → declarator → identifier. Walk the chain. 
*/ + if ((ctx->language == CBM_LANG_C || ctx->language == CBM_LANG_CPP || + ctx->language == CBM_LANG_CUDA || ctx->language == CBM_LANG_GLSL)) { + const char *nk = ts_node_type(node); + bool is_func_def = (strcmp(nk, "function_definition") == 0); + /* Template declarations wrap the function_definition */ + TSNode inner_func = node; + if (strcmp(nk, "template_declaration") == 0) { + for (uint32_t i = 0; i < ts_node_named_child_count(node); i++) { + TSNode ch = ts_node_named_child(node, i); + if (strcmp(ts_node_type(ch), "function_definition") == 0) { + inner_func = ch; + is_func_def = true; + break; + } + } + } + if (is_func_def) { + TSNode decl = ts_node_child_by_field_name(inner_func, "declarator", 10); + for (int depth = 0; depth < 8 && !ts_node_is_null(decl); depth++) { + const char *dk = ts_node_type(decl); + if (strcmp(dk, "identifier") == 0 || strcmp(dk, "field_identifier") == 0) { + char *name = cbm_node_text(ctx->arena, decl, ctx->source); + if (name && name[0]) { + if (state->enclosing_class_qn) { + return cbm_arena_sprintf(ctx->arena, "%s.%s", + state->enclosing_class_qn, name); + } + return cbm_fqn_compute(ctx->arena, ctx->project, ctx->rel_path, name); + } + return NULL; + } + if (strcmp(dk, "qualified_identifier") == 0 || + strcmp(dk, "scoped_identifier") == 0) { + TSNode id = cbm_find_child_by_kind(decl, "identifier"); + if (ts_node_is_null(id)) + id = cbm_find_child_by_kind(decl, "field_identifier"); + if (!ts_node_is_null(id)) { + char *name = cbm_node_text(ctx->arena, id, ctx->source); + if (name && name[0]) { + return cbm_fqn_compute(ctx->arena, ctx->project, + ctx->rel_path, name); + } + } + return NULL; + } + /* Unwrap: function_declarator → inner declarator */ + TSNode inner = ts_node_child_by_field_name(decl, "declarator", 10); + if (ts_node_is_null(inner) && ts_node_named_child_count(decl) > 0) { + inner = ts_node_named_child(decl, 0); + } + decl = inner; + } + return NULL; /* couldn't resolve C/C++ function name */ + } + } + TSNode name_node 
= ts_node_child_by_field_name(node, "name", 4); // Arrow function: name from parent variable_declarator @@ -153,9 +212,32 @@ void cbm_extract_unified(CBMExtractCtx *ctx) { // 4. Push scope markers for boundary nodes if (spec->function_node_types && cbm_kind_in_set(node, spec->function_node_types)) { - const char *fqn = compute_func_qn(ctx, node, spec, &state); - if (fqn) { - push_scope(&state, SCOPE_FUNC, depth, fqn); + // Fix 3: C# lambda_expression inside += assignment should NOT create + // a new scope boundary. Calls inside the lambda body should be attributed + // to the outer method that subscribes the event handler, not to an + // anonymous lambda. This matches the semantic intent: the subscribing + // method IS responsible for what runs when the event fires. + bool skip_scope = false; + if (ctx->language == CBM_LANG_CSHARP && + strcmp(ts_node_type(node), "lambda_expression") == 0) { + TSNode parent = ts_node_parent(node); + if (!ts_node_is_null(parent) && + strcmp(ts_node_type(parent), "assignment_expression") == 0) { + TSNode op = ts_node_child_by_field_name(parent, "operator", 8); + if (!ts_node_is_null(op)) { + char *op_text = cbm_node_text(ctx->arena, op, ctx->source); + if (op_text && (strcmp(op_text, "+=") == 0 || + strcmp(op_text, "-=") == 0)) { + skip_scope = true; + } + } + } + } + if (!skip_scope) { + const char *fqn = compute_func_qn(ctx, node, spec, &state); + if (fqn) { + push_scope(&state, SCOPE_FUNC, depth, fqn); + } } } else if (spec->class_node_types && cbm_kind_in_set(node, spec->class_node_types)) { const char *cqn = compute_class_qn(ctx, node); diff --git a/internal/cbm/helpers.c b/internal/cbm/helpers.c index 0b4147b5..d1abcb77 100644 --- a/internal/cbm/helpers.c +++ b/internal/cbm/helpers.c @@ -444,6 +444,34 @@ static const char *func_node_name(CBMArena *a, TSNode func_node, const char *sou } } + /* C/C++/CUDA/GLSL: function_definition has no "name" field. 
+ * Name is inside declarator chain: function_definition → declarator → + * function_declarator → declarator → identifier. */ + if ((lang == CBM_LANG_C || lang == CBM_LANG_CPP || + lang == CBM_LANG_CUDA || lang == CBM_LANG_GLSL) && + strcmp(ts_node_type(func_node), "function_definition") == 0) { + TSNode decl = ts_node_child_by_field_name(func_node, "declarator", 10); + for (int depth = 0; depth < 8 && !ts_node_is_null(decl); depth++) { + const char *dk = ts_node_type(decl); + if (strcmp(dk, "identifier") == 0 || strcmp(dk, "field_identifier") == 0) { + return cbm_node_text(a, decl, source); + } + if (strcmp(dk, "qualified_identifier") == 0 || + strcmp(dk, "scoped_identifier") == 0) { + TSNode id = cbm_find_child_by_kind(decl, "identifier"); + if (ts_node_is_null(id)) + id = cbm_find_child_by_kind(decl, "field_identifier"); + if (!ts_node_is_null(id)) return cbm_node_text(a, id, source); + return NULL; + } + TSNode inner = ts_node_child_by_field_name(decl, "declarator", 10); + if (ts_node_is_null(inner) && ts_node_named_child_count(decl) > 0) + inner = ts_node_named_child(decl, 0); + decl = inner; + } + return NULL; + } + TSNode name_node = ts_node_child_by_field_name(func_node, "name", 4); if (!ts_node_is_null(name_node)) { return cbm_node_text(a, name_node, source); diff --git a/internal/cbm/lang_specs.c b/internal/cbm/lang_specs.c index 426db947..428c9cd3 100644 --- a/internal/cbm/lang_specs.c +++ b/internal/cbm/lang_specs.c @@ -114,6 +114,7 @@ static const char *js_class_types[] = {"class_declaration", "class", NULL}; static const char *js_module_types[] = {"program", NULL}; static const char *js_call_types[] = {"call_expression", NULL}; static const char *js_import_types[] = {"import_statement", "lexical_declaration", + "variable_declaration", "expression_statement", "export_statement", NULL}; static const char *js_branch_types[] = {"if_statement", "for_statement", "for_in_statement", "while_statement", "switch_statement", "case_clause", diff --git 
a/src/cypher/cypher.c b/src/cypher/cypher.c index 64985cbc..aa97b6a0 100644 --- a/src/cypher/cypher.c +++ b/src/cypher/cypher.c @@ -631,6 +631,39 @@ static void expr_free(cbm_expr_t *e) { // NOLINTNEXTLINE(bugprone-multi-level-implicit-pointer-conversion) free(e->cond.in_values); } + if (e->type == EXPR_NOT_EXISTS) { + if (e->sub_pattern) { + /* Free pattern nodes and rels */ + for (int i = 0; i < e->sub_pattern->node_count; i++) { + free((void *)e->sub_pattern->nodes[i].variable); + free((void *)e->sub_pattern->nodes[i].label); + } + for (int i = 0; i < e->sub_pattern->rel_count; i++) { + free((void *)e->sub_pattern->rels[i].variable); + for (int t = 0; t < e->sub_pattern->rels[i].type_count; t++) { + free((void *)e->sub_pattern->rels[i].types[t]); + } + free(e->sub_pattern->rels[i].types); + free((void *)e->sub_pattern->rels[i].direction); + } + free(e->sub_pattern->nodes); + free(e->sub_pattern->rels); + free(e->sub_pattern); + } + if (e->sub_where) { + cbm_where_clause_t *sw = (cbm_where_clause_t *)e->sub_where; + if (sw->root) expr_free(sw->root); + for (int i = 0; i < sw->count; i++) { + free((void *)sw->conditions[i].variable); + free((void *)sw->conditions[i].property); + free((void *)sw->conditions[i].op); + free((void *)sw->conditions[i].value); + } + free(sw->conditions); + free((void *)sw->op); + free(sw); + } + } expr_free(e->left); expr_free(e->right); free(e); @@ -695,6 +728,8 @@ static const char *unsupported_clause_error(cbm_token_type_t type) { /* Forward declarations for recursive descent */ static cbm_expr_t *parse_or_expr(parser_t *p); +static int parse_match_pattern(parser_t *p, cbm_pattern_t *pat); +static int parse_where(parser_t *p, cbm_where_clause_t **out); /* Parse a single condition: var.prop OP value | var.prop IS [NOT] NULL | var.prop IN [...] 
*/ static cbm_expr_t *parse_condition_expr(parser_t *p) { @@ -833,9 +868,40 @@ static cbm_expr_t *parse_atom_expr(parser_t *p) { return parse_condition_expr(p); } -/* NOT: NOT atom | atom */ +/* NOT: NOT EXISTS { MATCH ... WHERE ... } | NOT atom | atom */ static cbm_expr_t *parse_not_expr(parser_t *p) { if (match(p, TOK_NOT)) { + /* NOT EXISTS { MATCH (pattern) WHERE ... } — correlated subquery */ + if (check(p, TOK_EXISTS)) { + advance(p); /* consume EXISTS */ + if (!expect(p, TOK_LBRACE)) return NULL; + + cbm_expr_t *e = calloc(1, sizeof(cbm_expr_t)); + e->type = EXPR_NOT_EXISTS; + + /* Parse inner MATCH pattern */ + if (!expect(p, TOK_MATCH)) { free(e); return NULL; } + e->sub_pattern = calloc(1, sizeof(cbm_pattern_t)); + if (parse_match_pattern(p, e->sub_pattern) < 0) { + free(e->sub_pattern); + free(e); + return NULL; + } + + /* Optional inner WHERE */ + cbm_where_clause_t *inner_where = NULL; + parse_where(p, &inner_where); + e->sub_where = inner_where; + + if (!expect(p, TOK_RBRACE)) { + /* Cleanup on parse failure */ + free(e->sub_pattern); + free(e->sub_where); + free(e); + return NULL; + } + return e; + } cbm_expr_t *child = parse_not_expr(p); return child ? expr_not(child) : NULL; } @@ -1052,6 +1118,10 @@ static int parse_return_or_with(parser_t *p, cbm_return_clause_t **out, bool is_ cbm_token_type_t ft = peek(p)->type; advance(p); expect(p, TOK_LPAREN); + /* Check for DISTINCT inside aggregate: count(DISTINCT ...) 
*/ + if (match(p, TOK_DISTINCT)) { + item.distinct_arg = true; + } if (match(p, TOK_STAR)) { item.variable = heap_strdup("*"); } else { @@ -1561,6 +1631,9 @@ typedef struct { } binding_t; /* Get node property by name */ +/* Forward declaration — full implementation below */ +static const char *json_extract_prop(const char *json, const char *key, char *buf, size_t buf_sz); + static const char *node_prop(const cbm_node_t *n, const char *prop) { if (!n || !prop) { return ""; @@ -1588,6 +1661,24 @@ static const char *node_prop(const cbm_node_t *n, const char *prop) { snprintf(buf, sizeof(buf), "%d", n->end_line); return buf; } + if (strcmp(prop, "file") == 0) { + return n->file_path ? n->file_path : ""; + } + if (strcmp(prop, "id") == 0) { + static char buf[32]; + snprintf(buf, sizeof(buf), "%lld", (long long)n->id); + return buf; + } + /* Fall through to JSON properties for unknown fields. + * This enables queries like WHERE n.is_entry_point = true + * or WHERE n.confidence > 0.5 on properties stored in properties_json. 
*/ + if (n->properties_json) { + static char json_buf[1024]; + const char *val = json_extract_prop(n->properties_json, prop, json_buf, sizeof(json_buf)); + if (val && val[0]) { + return val; + } + } return ""; } @@ -1763,6 +1854,16 @@ static void binding_set(binding_t *b, const char *var, const cbm_node_t *node) { b->var_count++; } +/* Forward declarations for NOT EXISTS subquery evaluation */ +static void scan_pattern_nodes(cbm_store_t *store, const char *project, int max_rows, + cbm_node_pattern_t *first, cbm_node_t **out_nodes, + int *out_count); +static void expand_pattern_rels(cbm_store_t *store, cbm_pattern_t *pat, binding_t **bindings, + int *bind_count, const int *bind_cap, const char **var_name, + bool is_optional); +static bool eval_where(const cbm_where_clause_t *w, binding_t *b, cbm_store_t *store, + const char *project, int max_rows); + /* Evaluate a WHERE condition against a binding */ static bool eval_condition(const cbm_condition_t *c, binding_t *b) { const char *actual; @@ -1855,8 +1956,10 @@ static bool eval_condition(const cbm_condition_t *c, binding_t *b) { return (int)(c->negated ? !result : result); } -/* Recursive expression tree evaluator */ -static bool eval_expr(const cbm_expr_t *e, binding_t *b) { +/* Recursive expression tree evaluator. + * store is needed for EXPR_NOT_EXISTS (correlated subquery expansion). 
*/ +static bool eval_expr(const cbm_expr_t *e, binding_t *b, cbm_store_t *store, + const char *project, int max_rows) { if (!e) { return true; } @@ -1864,24 +1967,176 @@ static bool eval_expr(const cbm_expr_t *e, binding_t *b) { case EXPR_CONDITION: return eval_condition(&e->cond, b); case EXPR_AND: - return (eval_expr(e->left, b) && eval_expr(e->right, b)) != 0; + return (eval_expr(e->left, b, store, project, max_rows) && + eval_expr(e->right, b, store, project, max_rows)) != 0; case EXPR_OR: - return (eval_expr(e->left, b) || eval_expr(e->right, b)) != 0; + return (eval_expr(e->left, b, store, project, max_rows) || + eval_expr(e->right, b, store, project, max_rows)) != 0; case EXPR_NOT: - return (!eval_expr(e->left, b)) != 0; + return (!eval_expr(e->left, b, store, project, max_rows)) != 0; case EXPR_XOR: - return eval_expr(e->left, b) != eval_expr(e->right, b); + return eval_expr(e->left, b, store, project, max_rows) != + eval_expr(e->right, b, store, project, max_rows); + case EXPR_NOT_EXISTS: { + if (!e->sub_pattern || !store) return true; + cbm_pattern_t *sp = e->sub_pattern; + + /* OPTIMIZATION: For the common pattern + * MATCH (n:Function) WHERE NOT EXISTS { MATCH (caller)-[e]->(n) WHERE e.type = 'CALLS' } + * we detect when the inner pattern's TARGET variable is already bound from + * the outer scope. Instead of scanning all possible callers, we directly + * query edges TO the bound node — O(1) per node instead of O(N). 
*/ + if (sp->rel_count == 1 && sp->node_count == 2) { + const char *start_var = sp->nodes[0].variable; + const char *end_var = sp->nodes[1].variable; + cbm_rel_pattern_t *rel = &sp->rels[0]; + + /* Check which end is bound from outer scope */ + cbm_node_t *bound_node = NULL; + bool bound_is_target = false; + if (end_var && binding_get(b, end_var)) { + bound_node = binding_get(b, end_var); + bound_is_target = true; + } else if (start_var && binding_get(b, start_var)) { + bound_node = binding_get(b, start_var); + } + + if (bound_node && bound_node->id > 0) { + /* Fast path: query edges directly to/from the bound node */ + cbm_edge_t *edges = NULL; + int edge_count = 0; + bool found_match = false; + + for (int ti = 0; ti < rel->type_count && !found_match; ti++) { + const char *edge_type = rel->types[ti]; + if (bound_is_target) { + /* bound node is the target: look for edges incoming TO it */ + cbm_store_find_edges_by_target_type(store, bound_node->id, + edge_type, &edges, &edge_count); + } else { + /* bound node is the source: look for edges outgoing FROM it */ + cbm_store_find_edges_by_source_type(store, bound_node->id, + edge_type, &edges, &edge_count); + } + /* Apply inner WHERE filter if present */ + cbm_where_clause_t *inner_w = (cbm_where_clause_t *)e->sub_where; + if (edge_count > 0 && inner_w) { + /* Build a temporary binding with the edge to check WHERE conditions */ + for (int ei = 0; ei < edge_count && !found_match; ei++) { + binding_t tmp = *b; /* shallow copy of outer binding */ + const char *edge_var = rel->variable; + if (edge_var) { + binding_set_edge(&tmp, edge_var, &edges[ei]); + } + if (eval_where(inner_w, &tmp, store, project, max_rows)) { + found_match = true; + } + } + } else if (edge_count > 0) { + found_match = true; + } + /* Free edges */ + for (int ei = 0; ei < edge_count; ei++) { + free((void *)edges[ei].project); + free((void *)edges[ei].type); + free((void *)edges[ei].properties_json); + } + free(edges); + edges = NULL; + edge_count = 0; + 
} + + if (rel->type_count == 0 && !found_match) { + /* No type filter — check ANY edge */ + cbm_edge_t *all_edges = NULL; + int all_count = 0; + if (bound_is_target) { + cbm_store_find_edges_by_target_type(store, bound_node->id, + NULL, &all_edges, &all_count); + } else { + cbm_store_find_edges_by_source_type(store, bound_node->id, + NULL, &all_edges, &all_count); + } + if (all_count > 0) found_match = true; + for (int ei = 0; ei < all_count; ei++) { + free((void *)all_edges[ei].project); + free((void *)all_edges[ei].type); + free((void *)all_edges[ei].properties_json); + } + free(all_edges); + } + + return !found_match; + } + } + + /* SLOW PATH: Full subquery expansion for complex patterns. + * Used when no variable is bound from outer scope, or multi-hop patterns. */ + const char *start_var = sp->nodes[0].variable; + cbm_node_t *scanned = NULL; + int scan_count = 0; + cbm_node_t *outer_node = start_var ? binding_get(b, start_var) : NULL; + + if (outer_node) { + scanned = calloc(1, sizeof(cbm_node_t)); + scanned[0] = *outer_node; + scanned[0].name = outer_node->name ? heap_strdup(outer_node->name) : NULL; + scanned[0].label = outer_node->label ? heap_strdup(outer_node->label) : NULL; + scanned[0].file_path = outer_node->file_path ? heap_strdup(outer_node->file_path) : NULL; + scanned[0].project = outer_node->project ? heap_strdup(outer_node->project) : NULL; + scanned[0].qualified_name = outer_node->qualified_name ? heap_strdup(outer_node->qualified_name) : NULL; + scan_count = 1; + } else { + scan_pattern_nodes(store, project, max_rows, &sp->nodes[0], + &scanned, &scan_count); + } + + if (scan_count == 0) { + free(scanned); + return true; + } + + const char *var = start_var ? start_var : "_ne"; + int sub_cap = scan_count > 4 ? 
scan_count : 4; + binding_t *sub_bindings = calloc(sub_cap, sizeof(binding_t)); + int sub_count = 0; + for (int i = 0; i < scan_count && sub_count < sub_cap; i++) { + binding_set(&sub_bindings[sub_count], var, &scanned[i]); + sub_count++; + } + free(scanned); + + if (sub_count > 0 && sp->rel_count > 0) { + expand_pattern_rels(store, sp, &sub_bindings, &sub_count, &sub_cap, + &var, false); + } + + bool any_match = false; + cbm_where_clause_t *inner_w = (cbm_where_clause_t *)e->sub_where; + for (int i = 0; i < sub_count && !any_match; i++) { + bool pass = inner_w ? eval_where(inner_w, &sub_bindings[i], store, project, max_rows) : true; + if (pass) any_match = true; + } + for (int i = 0; i < sub_count; i++) { + for (int v = 0; v < sub_bindings[i].var_count; v++) { + node_fields_free(&sub_bindings[i].var_nodes[v]); + } + } + free(sub_bindings); + return !any_match; + } } return true; } /* Evaluate WHERE clause — uses expression tree if available, falls back to legacy */ -static bool eval_where(const cbm_where_clause_t *w, binding_t *b) { +static bool eval_where(const cbm_where_clause_t *w, binding_t *b, cbm_store_t *store, + const char *project, int max_rows) { if (!w) { return true; } if (w->root) { - return eval_expr(w->root, b); + return eval_expr(w->root, b, store, project, max_rows); } /* Legacy flat evaluation */ @@ -2021,7 +2276,7 @@ static const char *eval_case_expr(const cbm_case_expr_t *k, binding_t *b) { return ""; } for (int i = 0; i < k->branch_count; i++) { - if (eval_expr(k->branches[i].when_expr, b)) { + if (eval_expr(k->branches[i].when_expr, b, NULL, NULL, 0)) { return k->branches[i].then_val ? 
k->branches[i].then_val : ""; } } @@ -2404,9 +2659,9 @@ static int execute_single(cbm_store_t *store, cbm_query_t *q, const char *projec bool pass = true; if (q->where && pat0->rel_count > 0) { /* With expression tree, evaluate full tree — unbound vars pass through */ - pass = eval_where(q->where, &b); + pass = eval_where(q->where, &b, store, project, max_rows); } else if (q->where && pat0->rel_count == 0) { - pass = eval_where(q->where, &b); + pass = eval_where(q->where, &b, store, project, max_rows); } if (pass) { @@ -2507,7 +2762,7 @@ static int execute_single(cbm_store_t *store, cbm_query_t *q, const char *projec if (q->where && (pat0->rel_count > 0 || q->pattern_count > 1)) { int kept = 0; for (int i = 0; i < bind_count; i++) { - if (eval_where(q->where, &bindings[i])) { + if (eval_where(q->where, &bindings[i], store, project, max_rows)) { if (kept != i) { bindings[kept] = bindings[i]; } @@ -2547,6 +2802,10 @@ static int execute_single(cbm_store_t *store, cbm_query_t *q, const char *projec double *sums; int *counts; double *mins, *maxs; + /* For count(DISTINCT ...): per-column arrays of seen values */ + const char ***distinct_seen; /* [col][seen_idx] */ + int *distinct_seen_count; /* count per column */ + int *distinct_seen_cap; /* capacity per column */ } with_agg_t; int agg_cap = 256; with_agg_t *aggs = calloc(agg_cap, sizeof(with_agg_t)); @@ -2585,6 +2844,9 @@ static int execute_single(cbm_store_t *store, cbm_query_t *q, const char *projec aggs[found].counts = calloc(wc->count, sizeof(int)); aggs[found].mins = malloc(wc->count * sizeof(double)); aggs[found].maxs = malloc(wc->count * sizeof(double)); + aggs[found].distinct_seen = calloc(wc->count, sizeof(const char **)); + aggs[found].distinct_seen_count = calloc(wc->count, sizeof(int)); + aggs[found].distinct_seen_cap = calloc(wc->count, sizeof(int)); for (int ci = 0; ci < wc->count; ci++) { aggs[found].mins[ci] = 1e308; aggs[found].maxs[ci] = -1e308; @@ -2603,9 +2865,34 @@ static int 
execute_single(cbm_store_t *store, cbm_query_t *q, const char *projec if (!wc->items[ci].func) { continue; } - aggs[found].counts[ci]++; const char *raw = binding_get_virtual(&bindings[bi], wc->items[ci].variable, wc->items[ci].property); + /* count(DISTINCT ...): only count if value not already seen */ + if (wc->items[ci].distinct_arg && strcmp(wc->items[ci].func, "COUNT") == 0) { + bool already = false; + for (int di = 0; di < aggs[found].distinct_seen_count[ci]; di++) { + if (aggs[found].distinct_seen[ci][di] && + strcmp(aggs[found].distinct_seen[ci][di], raw) == 0) { + already = true; + break; + } + } + if (!already) { + /* Track the value */ + if (aggs[found].distinct_seen_count[ci] >= aggs[found].distinct_seen_cap[ci]) { + int newcap = aggs[found].distinct_seen_cap[ci] < 16 ? 16 : + aggs[found].distinct_seen_cap[ci] * 2; + aggs[found].distinct_seen[ci] = safe_realloc( + aggs[found].distinct_seen[ci], newcap * sizeof(const char *)); + aggs[found].distinct_seen_cap[ci] = newcap; + } + aggs[found].distinct_seen[ci][aggs[found].distinct_seen_count[ci]++] = + heap_strdup(raw); + aggs[found].counts[ci]++; + } + } else { + aggs[found].counts[ci]++; + } double dv = strtod(raw, NULL); aggs[found].sums[ci] += dv; if (dv < aggs[found].mins[ci]) { @@ -2682,6 +2969,17 @@ static int execute_single(cbm_store_t *store, cbm_query_t *q, const char *projec free(aggs[a].counts); free(aggs[a].mins); free(aggs[a].maxs); + if (aggs[a].distinct_seen) { + for (int ci = 0; ci < wc->count; ci++) { + for (int di = 0; di < aggs[a].distinct_seen_count[ci]; di++) { + free((void *)aggs[a].distinct_seen[ci][di]); + } + free(aggs[a].distinct_seen[ci]); + } + free(aggs[a].distinct_seen); + free(aggs[a].distinct_seen_count); + free(aggs[a].distinct_seen_cap); + } } free(aggs); } else { @@ -2772,7 +3070,7 @@ static int execute_single(cbm_store_t *store, cbm_query_t *q, const char *projec if (q->post_with_where) { int kept = 0; for (int i = 0; i < bind_count; i++) { - if 
(eval_where(q->post_with_where, &bindings[i])) { + if (eval_where(q->post_with_where, &bindings[i], store, project, max_rows)) { if (kept != i) { bindings[kept] = bindings[i]; } diff --git a/src/cypher/cypher.h b/src/cypher/cypher.h index dedf4c82..8c53a175 100644 --- a/src/cypher/cypher.h +++ b/src/cypher/cypher.h @@ -199,7 +199,8 @@ typedef enum { EXPR_AND, EXPR_OR, EXPR_NOT, - EXPR_XOR + EXPR_XOR, + EXPR_NOT_EXISTS /* NOT EXISTS { MATCH ... WHERE ... } */ } cbm_expr_type_t; typedef struct cbm_expr cbm_expr_t; @@ -208,6 +209,9 @@ struct cbm_expr { cbm_condition_t cond; /* leaf (EXPR_CONDITION only) */ cbm_expr_t *left; /* AND/OR/XOR left; NOT child */ cbm_expr_t *right; /* AND/OR/XOR right; NULL for NOT */ + /* NOT EXISTS subquery (EXPR_NOT_EXISTS only) */ + cbm_pattern_t *sub_pattern; /* inner MATCH pattern */ + void *sub_where; /* cbm_where_clause_t* — void to avoid circular dep */ }; typedef struct { @@ -238,6 +242,7 @@ typedef struct { const char *func; /* "COUNT", "SUM", "AVG", "MIN", "MAX", "COLLECT", "toLower", "toUpper", "toString" or NULL */ cbm_case_expr_t *kase; /* CASE expression (NULL if not CASE) */ + bool distinct_arg; /* true when func is count(DISTINCT ...) */ } cbm_return_item_t; typedef struct { diff --git a/src/mcp/mcp.c b/src/mcp/mcp.c index 3530acc3..eac30e68 100644 --- a/src/mcp/mcp.c +++ b/src/mcp/mcp.c @@ -9,8 +9,11 @@ #include "mcp/mcp.h" #include "store/store.h" +#include #include "cypher/cypher.h" #include "pipeline/pipeline.h" +#include "pipeline/embedding.h" +#include "store/cross_repo.h" #include "cli/cli.h" #include "watcher/watcher.h" #include "foundation/mem.h" @@ -235,13 +238,25 @@ static const tool_def_t TOOLS[] = { {"search_graph", "Search the code knowledge graph for functions, classes, routes, and variables. Use INSTEAD " "OF grep/glob when finding code definitions, implementations, or relationships. 
Returns " - "precise results in one call.", - "{\"type\":\"object\",\"properties\":{\"project\":{\"type\":\"string\"},\"label\":{\"type\":" - "\"string\"},\"name_pattern\":{\"type\":\"string\"},\"qn_pattern\":{\"type\":\"string\"}," - "\"file_pattern\":{\"type\":\"string\"},\"relationship\":{\"type\":\"string\"},\"min_degree\":" - "{\"type\":\"integer\"},\"max_degree\":{\"type\":\"integer\"},\"exclude_entry_points\":{" - "\"type\":\"boolean\"},\"include_connected\":{\"type\":\"boolean\"},\"limit\":{\"type\":" - "\"integer\",\"description\":\"Max results. Default: " + "precise results in one call. Two modes: (1) query='search terms' for BM25 ranked full-text " + "search with structural boosting (recommended for discovery and conceptual search), " + "(2) name_pattern='regex' for exact pattern matching.", + "{\"type\":\"object\",\"properties\":{\"project\":{\"type\":\"string\"}," + "\"query\":{\"type\":\"string\",\"description\":\"Natural language or keyword search using " + "BM25 full-text ranking. Searches function names, class names, qualified names, and file " + "paths. Results ranked by relevance with structural boosting (Functions/Methods +10, " + "Routes +8, Classes +5, high-fan-in +3). Filters out noise nodes (File/Folder/Module/" + "Variable). Example: 'session management' or 'error handling'. When provided, name_pattern " + "is ignored.\"}," + "\"label\":{\"type\":\"string\"},\"name_pattern\":{\"type\":\"string\"}," + "\"qn_pattern\":{\"type\":\"string\"}," + "\"file_pattern\":{\"type\":\"string\"},\"relationship\":{\"type\":\"string\"}," + "\"min_degree\":{\"type\":\"integer\"},\"max_degree\":{\"type\":\"integer\"}," + "\"exclude_entry_points\":{\"type\":\"boolean\"},\"include_connected\":{\"type\":" + "\"boolean\"}," + "\"sort_by\":{\"type\":\"string\",\"description\":\"Sort by: relevance (default with " + "query), name, file_path\"}," + "\"limit\":{\"type\":\"integer\",\"description\":\"Max results. 
Default: " "unlimited\"},\"offset\":{\"type\":\"integer\",\"default\":0}},\"required\":[\"project\"]}"}, {"query_graph", @@ -281,6 +296,39 @@ static const tool_def_t TOOLS[] = { "{\"type\":\"object\",\"properties\":{\"project\":{\"type\":\"string\"},\"aspects\":{\"type\":" "\"array\",\"items\":{\"type\":\"string\"}}},\"required\":[\"project\"]}"}, + {"list_processes", + "List discovered execution flows (processes). Each process is a named path from an entry " + "point through the call graph to a terminal node that crosses a community boundary. " + "Processes are auto-detected during indexing using BFS from entry points + Louvain " + "community detection. Returns up to 300 processes ordered by step count.", + "{\"type\":\"object\",\"properties\":{\"project\":{\"type\":\"string\"}},\"required\":[\"project\"]}"}, + + {"get_process_steps", + "Get the ordered step list for a specific execution flow. Returns each function " + "in the flow with file_path, qualified_name, and step number. Use after list_processes " + "to drill into a specific flow for step-by-step debugging.", + "{\"type\":\"object\",\"properties\":{\"project\":{\"type\":\"string\"}," + "\"process_id\":{\"type\":\"number\",\"description\":\"Process ID from list_processes\"}}" + ",\"required\":[\"project\",\"process_id\"]}"}, + + {"get_impact", + "Analyze blast radius of changing a symbol. Returns all upstream callers grouped by " + "depth (d=1 WILL BREAK, d=2 LIKELY AFFECTED), affected processes, risk assessment " + "(LOW/MEDIUM/HIGH/CRITICAL), and affected modules. 
Use before modifying shared code.", + "{\"type\":\"object\",\"properties\":{\"project\":{\"type\":\"string\"}," + "\"target\":{\"type\":\"string\",\"description\":\"Function or class name to analyze\"}," + "\"direction\":{\"type\":\"string\",\"enum\":[\"upstream\",\"downstream\"],\"default\":\"upstream\"}," + "\"max_depth\":{\"type\":\"number\",\"default\":3}}" + ",\"required\":[\"project\",\"target\"]}"}, + + {"get_channels", + "Find message channels (Socket.IO events, EventEmitter signals) across projects. " + "Shows which functions emit and listen on each channel, enabling cross-service " + "message flow tracing. Auto-detects patterns during indexing. " + "Query by channel name (partial match) and/or project.", + "{\"type\":\"object\",\"properties\":{\"project\":{\"type\":\"string\"}," + "\"channel\":{\"type\":\"string\",\"description\":\"Channel name filter (partial match)\"}}}"}, + {"search_code", "Graph-augmented code search. Finds text patterns via grep, then enriches results with " "the knowledge graph: deduplicates matches into containing functions, ranks by structural " @@ -324,6 +372,32 @@ static const tool_def_t TOOLS[] = { "{\"type\":\"object\",\"properties\":{\"traces\":{\"type\":\"array\",\"items\":{\"type\":" "\"object\"}},\"project\":{\"type\":" "\"string\"}},\"required\":[\"traces\",\"project\"]}"}, + + {"generate_embeddings", + "Generate semantic embeddings for code symbols via external embedding server. " + "Requires CBM_EMBEDDING_URL environment variable (e.g., http://localhost:11434/v1 for Ollama). 
" + "Embeddings enable hybrid BM25+vector search in search_graph, bridging the gap between " + "keyword queries and conceptual code discovery.", + "{\"type\":\"object\",\"properties\":{\"project\":{\"type\":\"string\"}," + "\"force\":{\"type\":\"boolean\",\"default\":false,\"description\":" + "\"Re-generate all embeddings even if they already exist\"}},\"required\":[\"project\"]}"}, + + {"build_cross_repo_index", + "Build unified cross-repo index for cross-repository search, channel matching, and flow tracing. " + "Scans all indexed project databases and builds a _cross_repo.db with node stubs, channels, " + "and embeddings from all repos. Enables search_graph with project='*' for cross-repo search, " + "and trace_cross_repo for cross-service message flow tracing. Auto-rebuilds after each " + "index_repository call, but can be triggered manually to refresh.", + "{\"type\":\"object\",\"properties\":{}}"}, + + {"trace_cross_repo", + "Trace message/event channels across repositories. Shows which services produce and consume " + "a specific channel, with file-level and function-level detail. Requires build_cross_repo_index " + "to have been run at least once.", + "{\"type\":\"object\",\"properties\":{" + "\"channel\":{\"type\":\"string\",\"description\":\"Channel name to trace (partial match). 
" + "Omit to list all cross-repo channels.\"}," + "\"repo\":{\"type\":\"string\",\"description\":\"Filter to channels involving a specific repo.\"}}}"}, }; static const int TOOL_COUNT = sizeof(TOOLS) / sizeof(TOOLS[0]); @@ -926,8 +1000,21 @@ static char *handle_get_graph_schema(cbm_mcp_server_t *srv, const char *args) { return result; } +/* Forward declarations — defined in cross-repo handler section */ +static char *handle_cross_repo_search(cbm_mcp_server_t *srv, const char *args); +static char *derive_short_project(const char *full_project); +static void add_trace_steps(yyjson_mut_doc *doc, yyjson_mut_val *parent, + const char *key, cbm_cross_trace_step_t *steps, int count); + static char *handle_search_graph(cbm_mcp_server_t *srv, const char *args) { char *project = cbm_mcp_get_string_arg(args, "project"); + + /* Cross-repo search: project="*" dispatches to unified _cross_repo.db */ + if (project && strcmp(project, "*") == 0) { + free(project); + return handle_cross_repo_search(srv, args); + } + cbm_store_t *store = resolve_store(srv, project); REQUIRE_STORE(store, project); @@ -940,6 +1027,8 @@ static char *handle_search_graph(cbm_mcp_server_t *srv, const char *args) { char *label = cbm_mcp_get_string_arg(args, "label"); char *name_pattern = cbm_mcp_get_string_arg(args, "name_pattern"); char *file_pattern = cbm_mcp_get_string_arg(args, "file_pattern"); + char *query = cbm_mcp_get_string_arg(args, "query"); + char *sort_by = cbm_mcp_get_string_arg(args, "sort_by"); int limit = cbm_mcp_get_int_arg(args, "limit", 500000); int offset = cbm_mcp_get_int_arg(args, "offset", 0); int min_degree = cbm_mcp_get_int_arg(args, "min_degree", -1); @@ -950,6 +1039,8 @@ static char *handle_search_graph(cbm_mcp_server_t *srv, const char *args) { .label = label, .name_pattern = name_pattern, .file_pattern = file_pattern, + .query = query, + .sort_by = sort_by, .limit = limit, .offset = offset, .min_degree = min_degree, @@ -959,11 +1050,75 @@ static char 
*handle_search_graph(cbm_mcp_server_t *srv, const char *args) { cbm_search_output_t out = {0}; cbm_store_search(store, ¶ms, &out); + /* ── Hybrid search: if query is provided and embeddings exist, run + * vector search and merge with BM25 results using RRF (k=60). + * This enables semantic search: "institution name update" finds + * updateCloudClient even though the keywords don't overlap. ── */ + cbm_rrf_result_t *rrf_results = NULL; + int rrf_count = 0; + bool used_hybrid = false; + + if (query && query[0] && cbm_embedding_is_configured()) { + int emb_count = cbm_store_count_embeddings(store, project); + if (emb_count > 0) { + cbm_embedding_config_t cfg = cbm_embedding_get_config(); + + /* Embed the query text */ + float *query_vec = cbm_embedding_embed_text(&cfg, query); + if (query_vec) { + /* Run vector search */ + cbm_vector_result_t *vec_results = NULL; + int vec_count = 0; + cbm_store_vector_search(store, project, query_vec, cfg.dims, + 50, &vec_results, &vec_count); + + if (vec_count > 0) { + /* Collect BM25 node IDs in ranked order */ + int64_t *bm25_ids = malloc((size_t)out.count * sizeof(int64_t)); + if (bm25_ids) { + for (int i = 0; i < out.count; i++) { + bm25_ids[i] = out.results[i].node.id; + } + + /* RRF merge */ + cbm_embedding_rrf_merge(bm25_ids, out.count, + vec_results, vec_count, + &rrf_results, &rrf_count); + used_hybrid = true; + free(bm25_ids); + } + } + + cbm_store_free_vector_results(vec_results, vec_count); + free(query_vec); + } + } + } + yyjson_mut_doc *doc = yyjson_mut_doc_new(NULL); yyjson_mut_val *root = yyjson_mut_obj(doc); yyjson_mut_doc_set_root(doc, root); - yyjson_mut_obj_add_int(doc, root, "total", out.total); + yyjson_mut_obj_add_int(doc, root, "total", + used_hybrid ? rrf_count : out.total); + if (used_hybrid) { + yyjson_mut_obj_add_str(doc, root, "search_mode", "hybrid_bm25_vector"); + } + + /* For each result, look up which execution flows it participates in. 
+ * This enables process-grouped search results similar to GitNexus's + * flow-aware query output. Uses a single prepared statement. */ + sqlite3_stmt *proc_stmt = NULL; + { + const char *psql = + "SELECT DISTINCT p.id, p.label, p.step_count FROM process_steps ps " + "JOIN processes p ON p.id = ps.process_id AND p.project = ?2 " + "WHERE ps.node_id = ?1 LIMIT 5"; + sqlite3_prepare_v2(cbm_store_get_db(store), psql, -1, &proc_stmt, NULL); + if (proc_stmt) { + sqlite3_bind_text(proc_stmt, 2, project, -1, SQLITE_STATIC); + } + } yyjson_mut_val *results = yyjson_mut_arr(doc); for (int i = 0; i < out.count; i++) { @@ -977,19 +1132,105 @@ static char *handle_search_graph(cbm_mcp_server_t *srv, const char *args) { sr->node.file_path ? sr->node.file_path : ""); yyjson_mut_obj_add_int(doc, item, "in_degree", sr->in_degree); yyjson_mut_obj_add_int(doc, item, "out_degree", sr->out_degree); + + /* Process participation */ + if (proc_stmt && sr->node.id > 0) { + sqlite3_reset(proc_stmt); + sqlite3_bind_int64(proc_stmt, 1, sr->node.id); + + yyjson_mut_val *proc_arr = yyjson_mut_arr(doc); + while (sqlite3_step(proc_stmt) == SQLITE_ROW) { + yyjson_mut_val *pobj = yyjson_mut_obj(doc); + yyjson_mut_obj_add_int(doc, pobj, "id", sqlite3_column_int64(proc_stmt, 0)); + const char *plabel = (const char *)sqlite3_column_text(proc_stmt, 1); + yyjson_mut_obj_add_strcpy(doc, pobj, "label", plabel ? plabel : ""); + yyjson_mut_obj_add_int(doc, pobj, "step_count", sqlite3_column_int(proc_stmt, 2)); + yyjson_mut_arr_add_val(proc_arr, pobj); + } + yyjson_mut_obj_add_val(doc, item, "processes", proc_arr); + } + yyjson_mut_arr_add_val(results, item); } + yyjson_mut_obj_add_val(doc, root, "results", results); - yyjson_mut_obj_add_bool(doc, root, "has_more", out.total > offset + out.count); + yyjson_mut_obj_add_bool(doc, root, "has_more", + used_hybrid ? false : (out.total > offset + out.count)); + + /* If hybrid search found vector-only results (not in BM25), add them. 
+ * These are semantically relevant results that keyword search missed — + * the whole point of vector search. */ + if (used_hybrid && rrf_results) { + yyjson_mut_val *vec_only = yyjson_mut_arr(doc); + int vec_only_count = 0; + + for (int i = 0; i < rrf_count && vec_only_count < 20; i++) { + if (rrf_results[i].bm25_rank < 0) { + /* This result was found ONLY by vector search */ + cbm_node_t vnode = {0}; + if (cbm_store_find_node_by_id(store, rrf_results[i].node_id, + &vnode) == CBM_STORE_OK) { + yyjson_mut_val *vitem = yyjson_mut_obj(doc); + yyjson_mut_obj_add_strcpy(doc, vitem, "name", + vnode.name ? vnode.name : ""); + yyjson_mut_obj_add_strcpy(doc, vitem, "qualified_name", + vnode.qualified_name ? vnode.qualified_name : ""); + yyjson_mut_obj_add_strcpy(doc, vitem, "label", + vnode.label ? vnode.label : ""); + yyjson_mut_obj_add_strcpy(doc, vitem, "file_path", + vnode.file_path ? vnode.file_path : ""); + yyjson_mut_obj_add_real(doc, vitem, "similarity", + rrf_results[i].similarity); + yyjson_mut_obj_add_real(doc, vitem, "rrf_score", + rrf_results[i].rrf_score); + + /* Process participation for vector-only results too */ + if (proc_stmt) { + sqlite3_reset(proc_stmt); + sqlite3_bind_int64(proc_stmt, 1, rrf_results[i].node_id); + yyjson_mut_val *vproc_arr = yyjson_mut_arr(doc); + while (sqlite3_step(proc_stmt) == SQLITE_ROW) { + yyjson_mut_val *vpobj = yyjson_mut_obj(doc); + yyjson_mut_obj_add_int(doc, vpobj, "id", + sqlite3_column_int64(proc_stmt, 0)); + const char *vplabel = + (const char *)sqlite3_column_text(proc_stmt, 1); + yyjson_mut_obj_add_strcpy(doc, vpobj, "label", + vplabel ? 
vplabel : ""); + yyjson_mut_obj_add_int(doc, vpobj, "step_count", + sqlite3_column_int(proc_stmt, 2)); + yyjson_mut_arr_add_val(vproc_arr, vpobj); + } + yyjson_mut_obj_add_val(doc, vitem, "processes", vproc_arr); + } + + yyjson_mut_arr_add_val(vec_only, vitem); + vec_only_count++; + cbm_node_free_fields(&vnode); + } + } + } + + if (vec_only_count > 0) { + yyjson_mut_obj_add_val(doc, root, "semantic_results", vec_only); + yyjson_mut_obj_add_int(doc, root, "semantic_result_count", vec_only_count); + } + } + + if (proc_stmt) sqlite3_finalize(proc_stmt); + // Note: proc_stmt finalize moved here to be AFTER vector-only result processing char *json = yy_doc_to_str(doc); yyjson_mut_doc_free(doc); cbm_store_search_free(&out); + free(rrf_results); free(project); free(label); free(name_pattern); free(file_pattern); + free(query); + free(sort_by); char *result = cbm_mcp_text_result(json, false); free(json); @@ -1152,6 +1393,504 @@ static char *handle_delete_project(cbm_mcp_server_t *srv, const char *args) { return result; } +static char *handle_get_process_steps(cbm_mcp_server_t *srv, const char *args) { + char *project = cbm_mcp_get_string_arg(args, "project"); + int64_t process_id = (int64_t)cbm_mcp_get_int_arg(args, "process_id", 0); + cbm_store_t *store = resolve_store(srv, project); + REQUIRE_STORE(store, project); + + cbm_process_step_t *steps = NULL; + int count = 0; + cbm_store_get_process_steps(store, process_id, &steps, &count); + + yyjson_mut_doc *doc = yyjson_mut_doc_new(NULL); + yyjson_mut_val *root = yyjson_mut_obj(doc); + yyjson_mut_doc_set_root(doc, root); + yyjson_mut_obj_add_int(doc, root, "total_steps", count); + + yyjson_mut_val *arr = yyjson_mut_arr(doc); + for (int i = 0; i < count; i++) { + yyjson_mut_val *item = yyjson_mut_obj(doc); + yyjson_mut_obj_add_int(doc, item, "step", steps[i].step); + yyjson_mut_obj_add_strcpy(doc, item, "name", steps[i].name ? 
steps[i].name : ""); + yyjson_mut_obj_add_strcpy(doc, item, "qualified_name", + steps[i].qualified_name ? steps[i].qualified_name : ""); + yyjson_mut_obj_add_strcpy(doc, item, "file_path", + steps[i].file_path ? steps[i].file_path : ""); + yyjson_mut_arr_add_val(arr, item); + } + yyjson_mut_obj_add_val(doc, root, "steps", arr); + + char *json = yy_doc_to_str(doc); + yyjson_mut_doc_free(doc); + cbm_store_free_process_steps(steps, count); + free(project); + char *result = cbm_mcp_text_result(json, false); + free(json); + return result; +} + +static char *handle_get_impact(cbm_mcp_server_t *srv, const char *args) { + char *project = cbm_mcp_get_string_arg(args, "project"); + char *target = cbm_mcp_get_string_arg(args, "target"); + char *direction = cbm_mcp_get_string_arg(args, "direction"); + int max_depth = cbm_mcp_get_int_arg(args, "max_depth", 3); + bool cross_repo = cbm_mcp_get_bool_arg(args, "cross_repo"); + cbm_store_t *store = resolve_store(srv, project); + REQUIRE_STORE(store, project); + + if (!direction) direction = heap_strdup("upstream"); + bool is_upstream = strcmp(direction, "upstream") == 0; + const char *bfs_dir = is_upstream ? "inbound" : "outbound"; + + /* Find target node */ + cbm_node_t *nodes = NULL; + int node_count = 0; + cbm_store_find_nodes_by_name(store, project, target, &nodes, &node_count); + + yyjson_mut_doc *doc = yyjson_mut_doc_new(NULL); + yyjson_mut_val *root = yyjson_mut_obj(doc); + yyjson_mut_doc_set_root(doc, root); + + if (node_count == 0) { + yyjson_mut_obj_add_strcpy(doc, root, "error", "symbol not found"); + char *json = yy_doc_to_str(doc); + yyjson_mut_doc_free(doc); + free(target); free(project); free(direction); + char *r = cbm_mcp_text_result(json, true); + free(json); + return r; + } + + /* Pick best node: prefer Class over Constructor when both share the same name. + * This mirrors the disambiguation logic in trace_call_path so that impact + * analysis on a class name (e.g. 
"UserService") resolves to the Class node + * and then fans out through DEFINES_METHOD to all its methods. Previously + * this picked the Constructor/Method first, which has 0 callers. */ + int best = 0; + bool has_class = false; + int class_idx = -1; + for (int i = 0; i < node_count; i++) { + const char *lbl = nodes[i].label; + if (lbl && (strcmp(lbl, "Class") == 0 || strcmp(lbl, "Interface") == 0)) { + has_class = true; + if (class_idx < 0) class_idx = i; + } + } + /* Look for a non-constructor Function/Method (skip if same name as Class) */ + bool found_callable = false; + for (int i = 0; i < node_count; i++) { + const char *lbl = nodes[i].label; + if (lbl && (strcmp(lbl, "Function") == 0 || strcmp(lbl, "Method") == 0)) { + if (has_class) continue; /* skip constructor */ + best = i; + found_callable = true; + break; + } + } + if (!found_callable && class_idx >= 0) { + best = class_idx; + } + + /* Resolve start IDs: if target is a Class/Interface, expand through + * DEFINES_METHOD edges to get all method node IDs for BFS. */ + int64_t *start_ids = NULL; + int start_id_count = 0; + bool is_class_like = false; + const char *best_label = nodes[best].label; + if (best_label && + (strcmp(best_label, "Class") == 0 || strcmp(best_label, "Interface") == 0)) { + is_class_like = true; + } + + if (is_class_like) { + cbm_edge_t *dm_edges = NULL; + int dm_count = 0; + cbm_store_find_edges_by_source_type(store, nodes[best].id, "DEFINES_METHOD", + &dm_edges, &dm_count); + if (dm_count > 0) { + /* For impact we use all methods (unlike trace which caps at 5) */ + int use_count = dm_count > 30 ? 
30 : dm_count; + start_ids = malloc((size_t)use_count * sizeof(int64_t)); + for (int i = 0; i < use_count; i++) { + start_ids[i] = dm_edges[i].target_id; + } + start_id_count = use_count; + } + for (int i = 0; i < dm_count; i++) { + free((void *)dm_edges[i].project); + free((void *)dm_edges[i].type); + free((void *)dm_edges[i].properties_json); + } + free(dm_edges); + if (start_id_count == 0) { + start_ids = malloc(sizeof(int64_t)); + start_ids[0] = nodes[best].id; + start_id_count = 1; + } + } else { + start_ids = malloc(sizeof(int64_t)); + start_ids[0] = nodes[best].id; + start_id_count = 1; + } + + yyjson_mut_obj_add_strcpy(doc, root, "target", target); + yyjson_mut_obj_add_strcpy(doc, root, "direction", direction); + yyjson_mut_obj_add_strcpy(doc, root, "file_path", + nodes[best].file_path ? nodes[best].file_path : ""); + yyjson_mut_obj_add_int(doc, root, "line", nodes[best].start_line); + + /* BFS from each start ID and merge results. For classes this fans out + * through all methods, giving a true blast radius instead of 0. 
*/ + const char *call_types[] = {"CALLS", "HTTP_CALLS", "ASYNC_CALLS", "USAGE"}; + cbm_traverse_result_t tr = {0}; + + if (start_id_count == 1) { + cbm_store_bfs(store, start_ids[0], bfs_dir, call_types, 4, max_depth, 200, &tr); + } else { + /* Multi-method BFS: run from each method, collect unique visited nodes */ + cbm_traverse_result_t *subs = calloc((size_t)start_id_count, sizeof(*subs)); + int total_visited = 0; + for (int s = 0; s < start_id_count; s++) { + cbm_store_bfs(store, start_ids[s], bfs_dir, call_types, 4, max_depth, + 200, &subs[s]); + total_visited += subs[s].visited_count; + } + /* Merge into tr: allocate worst-case, then dedup by node id */ + if (total_visited > 0) { + tr.visited = malloc((size_t)total_visited * sizeof(cbm_node_hop_t)); + tr.visited_count = 0; + for (int s = 0; s < start_id_count; s++) { + for (int v = 0; v < subs[s].visited_count; v++) { + int64_t vid = subs[s].visited[v].node.id; + /* Check for duplicate (same node already in tr) */ + bool dup = false; + for (int e = 0; e < tr.visited_count; e++) { + if (tr.visited[e].node.id == vid) { + /* Keep the one with smaller hop (closer = more impacted) */ + if (subs[s].visited[v].hop < tr.visited[e].hop) + tr.visited[e].hop = subs[s].visited[v].hop; + dup = true; + break; + } + } + if (!dup && tr.visited_count < total_visited) { + tr.visited[tr.visited_count] = subs[s].visited[v]; + tr.visited_count++; + } + } + } + } + /* Free sub-traversals (but NOT their visited[].node fields — we moved them) */ + for (int s = 0; s < start_id_count; s++) { + free(subs[s].edges); + } + free(subs); + } + + /* Group by depth */ + yyjson_mut_val *d1_arr = yyjson_mut_arr(doc); + yyjson_mut_val *d2_arr = yyjson_mut_arr(doc); + yyjson_mut_val *d3_arr = yyjson_mut_arr(doc); + int depth_counts[10] = {0}; + int total_affected = 0; + + for (int i = 0; i < tr.visited_count; i++) { + int h = tr.visited[i].hop; + if (h >= 1 && h <= max_depth) { + if (h < 10) depth_counts[h]++; + total_affected++; + + cbm_node_t 
*vn = &tr.visited[i].node; + yyjson_mut_val *item = yyjson_mut_obj(doc); + yyjson_mut_obj_add_str(doc, item, "name", vn->name ? vn->name : ""); + yyjson_mut_obj_add_str(doc, item, "file_path", vn->file_path ? vn->file_path : ""); + yyjson_mut_obj_add_str(doc, item, "label", vn->label ? vn->label : ""); + yyjson_mut_obj_add_int(doc, item, "line", vn->start_line); + + if (h == 1) yyjson_mut_arr_add_val(d1_arr, item); + else if (h == 2) yyjson_mut_arr_add_val(d2_arr, item); + else yyjson_mut_arr_add_val(d3_arr, item); + } + } + yyjson_mut_val *by_depth = yyjson_mut_obj(doc); + yyjson_mut_obj_add_val(doc, by_depth, "d1_will_break", d1_arr); + yyjson_mut_obj_add_val(doc, by_depth, "d2_likely_affected", d2_arr); + yyjson_mut_obj_add_val(doc, by_depth, "d3_may_need_testing", d3_arr); + yyjson_mut_obj_add_val(doc, root, "by_depth", by_depth); + + /* Risk assessment */ + const char *risk; + if (depth_counts[1] >= 20) risk = "CRITICAL"; + else if (depth_counts[1] >= 10) risk = "HIGH"; + else if (depth_counts[1] >= 3) risk = "MEDIUM"; + else risk = "LOW"; + + yyjson_mut_obj_add_str(doc, root, "risk", risk); + yyjson_mut_obj_add_int(doc, root, "total_affected", total_affected); + yyjson_mut_obj_add_int(doc, root, "direct_callers", depth_counts[1]); + + /* Summary labels per depth */ + yyjson_mut_val *summary = yyjson_mut_obj(doc); + char d1_label[64]; snprintf(d1_label, sizeof(d1_label), "%d WILL BREAK", depth_counts[1]); + char d2_label[64]; snprintf(d2_label, sizeof(d2_label), "%d LIKELY AFFECTED", depth_counts[2]); + char d3_label[64]; snprintf(d3_label, sizeof(d3_label), "%d MAY NEED TESTING", depth_counts[3]); + yyjson_mut_obj_add_strcpy(doc, summary, "d1", d1_label); + yyjson_mut_obj_add_strcpy(doc, summary, "d2", d2_label); + yyjson_mut_obj_add_strcpy(doc, summary, "d3", d3_label); + yyjson_mut_obj_add_val(doc, root, "summary", summary); + + /* Affected processes — match by checking if any BFS-visited node name + * appears in the process label, OR if the target name 
itself appears. + * This catches processes that flow through the target's methods. */ + { + cbm_process_info_t *procs = NULL; + int pcount = 0; + cbm_store_list_processes(store, project, &procs, &pcount); + yyjson_mut_val *paff = yyjson_mut_arr(doc); + int pc = 0; + for (int pi = 0; pi < pcount && pc < 20; pi++) { + if (!procs[pi].label) continue; + bool match = false; + /* Check target name */ + if (target && strstr(procs[pi].label, target)) match = true; + /* Check BFS-visited node names (d=1 callers are most likely) */ + if (!match) { + for (int v = 0; v < tr.visited_count && !match; v++) { + if (tr.visited[v].hop == 1 && tr.visited[v].node.name && + strstr(procs[pi].label, tr.visited[v].node.name)) { + match = true; + } + } + } + if (match) { + yyjson_mut_val *pitem = yyjson_mut_obj(doc); + yyjson_mut_obj_add_strcpy(doc, pitem, "label", procs[pi].label); + yyjson_mut_obj_add_int(doc, pitem, "step_count", procs[pi].step_count); + yyjson_mut_arr_add_val(paff, pitem); + pc++; + } + } + yyjson_mut_obj_add_val(doc, root, "affected_processes", paff); + cbm_store_free_processes(procs, pcount); + } + + /* ── Cross-repo impact: check if d=1 nodes emit channels to other repos ── */ + if (cross_repo && tr.visited_count > 0) { + cbm_cross_repo_t *cr = cbm_cross_repo_open(); + if (cr) { + yyjson_mut_val *xr_arr = yyjson_mut_arr(doc); + int xr_count = 0; + + /* For each d=1 visited node, check cross_channels for emitters */ + for (int vi = 0; vi < tr.visited_count && xr_count < 10; vi++) { + if (tr.visited[vi].hop != 1) continue; /* only d=1 */ + const char *vname = tr.visited[vi].node.name; + if (!vname) continue; + + /* Query cross_channels for this function as an emitter */ + cbm_cross_channel_match_t *xmatches = NULL; + int xmatch_count = 0; + /* Use the function name to filter — imprecise but functional */ + cbm_cross_repo_match_channels(cr, NULL, &xmatches, &xmatch_count); + + for (int xi = 0; xi < xmatch_count && xr_count < 10; xi++) { + /* Match: emitter function 
name matches d=1 visited node */ + if (!xmatches[xi].emit_function) continue; + if (strcmp(xmatches[xi].emit_function, vname) != 0) continue; + /* And the emitter project matches our project */ + if (!xmatches[xi].emit_project || !project) continue; + if (strcmp(xmatches[xi].emit_project, project) != 0) continue; + + /* Found a cross-repo channel triggered by this d=1 node */ + yyjson_mut_val *xr_item = yyjson_mut_obj(doc); + yyjson_mut_obj_add_strcpy(doc, xr_item, "channel", + xmatches[xi].channel_name ? xmatches[xi].channel_name : ""); + yyjson_mut_obj_add_strcpy(doc, xr_item, "transport", + xmatches[xi].transport ? xmatches[xi].transport : ""); + yyjson_mut_obj_add_strcpy(doc, xr_item, "triggered_by", vname); + yyjson_mut_obj_add_int(doc, xr_item, "triggered_by_depth", 1); + + char *lp_short = derive_short_project( + xmatches[xi].listen_project ? xmatches[xi].listen_project : ""); + yyjson_mut_obj_add_strcpy(doc, xr_item, "consumer_repo", + lp_short ? lp_short : ""); + yyjson_mut_obj_add_strcpy(doc, xr_item, "consumer_project_id", + xmatches[xi].listen_project ? xmatches[xi].listen_project : ""); + free(lp_short); + + yyjson_mut_obj_add_strcpy(doc, xr_item, "listener_function", + xmatches[xi].listen_function ? xmatches[xi].listen_function : ""); + yyjson_mut_obj_add_strcpy(doc, xr_item, "listener_file", + xmatches[xi].listen_file ? 
xmatches[xi].listen_file : ""); + + /* Trace downstream in consumer repo */ + char db_path_buf[2048]; + project_db_path(xmatches[xi].listen_project, db_path_buf, sizeof(db_path_buf)); + cbm_cross_trace_step_t *ds_steps = NULL; + int ds_count = 0; + cbm_cross_repo_trace_in_project(db_path_buf, + xmatches[xi].listen_function, xmatches[xi].listen_file, + xmatches[xi].channel_name, "outbound", 2, &ds_steps, &ds_count); + yyjson_mut_obj_add_int(doc, xr_item, "downstream_affected", ds_count); + if (ds_count > 0) { + add_trace_steps(doc, xr_item, "downstream", ds_steps, ds_count); + } + cbm_cross_trace_free(ds_steps, ds_count); + + yyjson_mut_arr_add_val(xr_arr, xr_item); + xr_count++; + } + cbm_cross_channel_free(xmatches, xmatch_count); + } + + if (xr_count > 0) { + yyjson_mut_obj_add_val(doc, root, "cross_repo_impacts", xr_arr); + yyjson_mut_obj_add_int(doc, root, "cross_repo_impact_count", xr_count); + } + cbm_cross_repo_close(cr); + } + } + + char *json = yy_doc_to_str(doc); + yyjson_mut_doc_free(doc); + cbm_store_traverse_free(&tr); + cbm_store_free_nodes(nodes, node_count); + free(start_ids); + free(target); free(project); free(direction); + char *result = cbm_mcp_text_result(json, false); + free(json); + return result; +} +
+/* Tool handler: list channel records, optionally filtered by channel name.
+ *
+ * Arguments read from `args`:
+ *   "project" - project to query. When absent or empty, every "*.db" file in
+ *               the cache directory is opened and its channels are merged
+ *               into a single result.
+ *   "channel" - passed through to cbm_store_find_channels() as the filter
+ *               (may be NULL).
+ *
+ * Returns the value of cbm_mcp_text_result() for a JSON document of the form
+ *   {"total": N, "channels": [{channel, direction, transport, project,
+ *                              file, function}, ...]}
+ *
+ * NOTE(review): REQUIRE_STORE is defined outside this view; presumably it
+ * returns an error result early when the store is missing - confirm. */
+static char *handle_get_channels(cbm_mcp_server_t *srv, const char *args) {
+    char *project = cbm_mcp_get_string_arg(args, "project");
+    bool all_projects = !project || project[0] == '\0';
+    cbm_store_t *store = NULL;
+
+    /* Resolve the store before fetching any other argument, so that an early
+     * exit inside REQUIRE_STORE cannot strand a second heap string. */
+    if (!all_projects) {
+        store = resolve_store(srv, project);
+        REQUIRE_STORE(store, project);
+    }
+
+    char *channel = cbm_mcp_get_string_arg(args, "channel");
+    cbm_channel_info_t *channels = NULL;
+    int count = 0;
+
+    if (all_projects) {
+        /* Cross-repo query: walk every indexed project database. */
+        char dir_path[1024];
+        cache_dir(dir_path, sizeof(dir_path));
+        cbm_dir_t *d = cbm_opendir(dir_path);
+        if (d) {
+            cbm_dirent_t *entry;
+            while ((entry = cbm_readdir(d)) != NULL) {
+                const char *n = entry->name;
+                size_t len = strlen(n);
+                if (len < 4 || strcmp(n + len - 3, ".db") != 0) continue;
+                if (strncmp(n, "tmp-", 4) == 0 || strncmp(n, "_", 1) == 0) continue;
+
+                /* Project name is the file name without ".db". Skip names
+                 * that would be truncated: a clipped name would query the
+                 * wrong project. */
+                char proj_name[512];
+                if (len - 3 >= sizeof(proj_name)) continue;
+                snprintf(proj_name, sizeof(proj_name), "%.*s", (int)(len - 3), n);
+
+                /* Same reasoning for the database path. */
+                char db_path[2048];
+                int plen = snprintf(db_path, sizeof(db_path), "%s/%s", dir_path, n);
+                if (plen < 0 || (size_t)plen >= sizeof(db_path)) continue;
+
+                cbm_store_t *ps = cbm_store_open_path_query(db_path);
+                if (!ps) continue;
+
+                cbm_channel_info_t *proj_ch = NULL;
+                int proj_count = 0;
+                cbm_store_find_channels(ps, proj_name, channel, &proj_ch, &proj_count);
+
+                if (proj_count > 0) {
+                    /* Append by shallow copy; the per-record fields are now
+                     * owned by channels[]. Sizes are computed in size_t. */
+                    channels = safe_realloc(
+                        channels, ((size_t)count + (size_t)proj_count) * sizeof(*channels));
+                    memcpy(channels + count, proj_ch, (size_t)proj_count * sizeof(*channels));
+                    count += proj_count;
+                }
+                /* Release only the array shell. Done unconditionally so an
+                 * allocated-but-empty result is not leaked; free(NULL) is a
+                 * no-op. */
+                free(proj_ch);
+                cbm_store_close(ps);
+            }
+            cbm_closedir(d);
+        }
+    } else {
+        cbm_store_find_channels(store, project, channel, &channels, &count);
+    }
+
+    yyjson_mut_doc *doc = yyjson_mut_doc_new(NULL);
+    yyjson_mut_val *root = yyjson_mut_obj(doc);
+    yyjson_mut_doc_set_root(doc, root);
+
+    yyjson_mut_obj_add_int(doc, root, "total", count);
+
+    /* One flat entry per channel record, in the order they were collected. */
+    yyjson_mut_val *arr = yyjson_mut_arr(doc);
+    for (int i = 0; i < count; i++) {
+        yyjson_mut_val *item = yyjson_mut_obj(doc);
+        yyjson_mut_obj_add_strcpy(doc, item, "channel",
+            channels[i].channel_name ? channels[i].channel_name : "");
+        yyjson_mut_obj_add_strcpy(doc, item, "direction",
+            channels[i].direction ? channels[i].direction : "");
+        yyjson_mut_obj_add_strcpy(doc, item, "transport",
+            channels[i].transport ? channels[i].transport : "");
+        yyjson_mut_obj_add_strcpy(doc, item, "project",
+            channels[i].project ? channels[i].project : "");
+        yyjson_mut_obj_add_strcpy(doc, item, "file",
+            channels[i].file_path ? channels[i].file_path : "");
+        yyjson_mut_obj_add_strcpy(doc, item, "function",
+            channels[i].function_name ? channels[i].function_name : "");
+        yyjson_mut_arr_add_val(arr, item);
+    }
+    yyjson_mut_obj_add_val(doc, root, "channels", arr);
+
+    /* Strings were copied into the document (strcpy variants), so the
+     * channel records can be released once serialization is done. */
+    char *json = yy_doc_to_str(doc);
+    yyjson_mut_doc_free(doc);
+    cbm_store_free_channels(channels, count);
+    free(project);
+    free(channel);
+
+    char *result = cbm_mcp_text_result(json, false);
+    free(json);
+    return result;
+}
+
+/* Tool handler: list the processes recorded for one project.
+ *
+ * Reads "project" from `args`, resolves its store, and returns the value of
+ * cbm_mcp_text_result() for a JSON document of the form
+ *   {"total": N, "processes": [{id, label, process_type, step_count,
+ *                               entry_point_id, terminal_id}, ...]}
+ *
+ * NOTE(review): REQUIRE_STORE is defined outside this view; presumably it
+ * returns an error result early when the store is missing - confirm. */
+static char *handle_list_processes(cbm_mcp_server_t *srv, const char *args) {
+    char *project = cbm_mcp_get_string_arg(args, "project");
+    cbm_store_t *store = resolve_store(srv, project);
+    REQUIRE_STORE(store, project);
+
+    cbm_process_info_t *procs = NULL;
+    int count = 0;
+    cbm_store_list_processes(store, project, &procs, &count);
+
+    yyjson_mut_doc *doc = yyjson_mut_doc_new(NULL);
+    yyjson_mut_val *root = yyjson_mut_obj(doc);
+    yyjson_mut_doc_set_root(doc, root);
+
+    yyjson_mut_obj_add_int(doc, root, "total", count);
+
+    yyjson_mut_val *arr = yyjson_mut_arr(doc);
+    for (int i = 0; i < count; i++) {
+        yyjson_mut_val *item = yyjson_mut_obj(doc);
+        yyjson_mut_obj_add_int(doc, item, "id", procs[i].id);
+        yyjson_mut_obj_add_strcpy(doc, item, "label", procs[i].label ? procs[i].label : "");
+        yyjson_mut_obj_add_strcpy(doc, item, "process_type",
+            procs[i].process_type ? procs[i].process_type : "");
+        yyjson_mut_obj_add_int(doc, item, "step_count", procs[i].step_count);
+        yyjson_mut_obj_add_int(doc, item, "entry_point_id", procs[i].entry_point_id);
+        yyjson_mut_obj_add_int(doc, item, "terminal_id", procs[i].terminal_id);
+        yyjson_mut_arr_add_val(arr, item);
+    }
+    yyjson_mut_obj_add_val(doc, root, "processes", arr);
+
+    /* Strings were copied into the document, so procs can be freed here. */
+    char *json = yy_doc_to_str(doc);
+    yyjson_mut_doc_free(doc);
+    cbm_store_free_processes(procs, count);
+    free(project);
+
+    char *result = cbm_mcp_text_result(json, false);
+    free(json);
+    return result;
+}
+ static char *handle_get_architecture(cbm_mcp_server_t *srv, const char *args) { char *project = cbm_mcp_get_string_arg(args, "project"); cbm_store_t *store = resolve_store(srv, project); @@ -1169,6 +1908,12 @@ static char *handle_get_architecture(cbm_mcp_server_t *srv, const char *args) { int node_count = cbm_store_count_nodes(store, project); int edge_count = cbm_store_count_edges(store, project); + /* Call the full architecture analysis */ + cbm_architecture_info_t arch = {0}; + const char *all_aspects[] = {"languages", "hotspots", "routes", "entry_points", + "packages", "clusters", "layers", "boundaries"}; + cbm_store_get_architecture(store, project, all_aspects, 8, &arch); + yyjson_mut_doc *doc = yyjson_mut_doc_new(NULL); yyjson_mut_val *root = yyjson_mut_obj(doc); yyjson_mut_doc_set_root(doc, root); @@ -1199,6 +1944,105 @@ static char *handle_get_architecture(cbm_mcp_server_t *srv, const char *args) { } yyjson_mut_obj_add_val(doc, root, "edge_types", types); + /* Languages */ + if (arch.language_count > 0) { + yyjson_mut_val *langs = yyjson_mut_arr(doc); + for (int i = 0; i < arch.language_count; i++) { + yyjson_mut_val *item = yyjson_mut_obj(doc); + yyjson_mut_obj_add_strcpy(doc, item, "language", + arch.languages[i].language ?
arch.languages[i].language : ""); + yyjson_mut_obj_add_int(doc, item, "files", arch.languages[i].file_count); + yyjson_mut_arr_add_val(langs, item); + } + yyjson_mut_obj_add_val(doc, root, "languages", langs); + } + + /* Hotspots (high fan-in functions) */ + if (arch.hotspot_count > 0) { + yyjson_mut_val *spots = yyjson_mut_arr(doc); + for (int i = 0; i < arch.hotspot_count; i++) { + yyjson_mut_val *item = yyjson_mut_obj(doc); + yyjson_mut_obj_add_strcpy(doc, item, "name", + arch.hotspots[i].name ? arch.hotspots[i].name : ""); + yyjson_mut_obj_add_strcpy( + doc, item, "qualified_name", + arch.hotspots[i].qualified_name ? arch.hotspots[i].qualified_name : ""); + yyjson_mut_obj_add_int(doc, item, "fan_in", arch.hotspots[i].fan_in); + yyjson_mut_arr_add_val(spots, item); + } + yyjson_mut_obj_add_val(doc, root, "hotspots", spots); + } + + /* Routes */ + if (arch.route_count > 0) { + yyjson_mut_val *routes_arr = yyjson_mut_arr(doc); + for (int i = 0; i < arch.route_count; i++) { + yyjson_mut_val *item = yyjson_mut_obj(doc); + yyjson_mut_obj_add_strcpy(doc, item, "method", + arch.routes[i].method ? arch.routes[i].method : ""); + yyjson_mut_obj_add_strcpy(doc, item, "path", + arch.routes[i].path ? arch.routes[i].path : ""); + yyjson_mut_obj_add_strcpy(doc, item, "handler", + arch.routes[i].handler ? arch.routes[i].handler : ""); + yyjson_mut_arr_add_val(routes_arr, item); + } + yyjson_mut_obj_add_val(doc, root, "routes", routes_arr); + } + + /* Entry points */ + if (arch.entry_point_count > 0) { + yyjson_mut_val *eps = yyjson_mut_arr(doc); + for (int i = 0; i < arch.entry_point_count; i++) { + yyjson_mut_val *item = yyjson_mut_obj(doc); + yyjson_mut_obj_add_strcpy(doc, item, "name", + arch.entry_points[i].name ? arch.entry_points[i].name : ""); + yyjson_mut_obj_add_strcpy( + doc, item, "qualified_name", + arch.entry_points[i].qualified_name ? arch.entry_points[i].qualified_name : ""); + yyjson_mut_obj_add_strcpy(doc, item, "file", + arch.entry_points[i].file ? 
arch.entry_points[i].file : ""); + yyjson_mut_arr_add_val(eps, item); + } + yyjson_mut_obj_add_val(doc, root, "entry_points", eps); + } + + /* Packages */ + if (arch.package_count > 0) { + yyjson_mut_val *pkgs = yyjson_mut_arr(doc); + for (int i = 0; i < arch.package_count; i++) { + yyjson_mut_val *item = yyjson_mut_obj(doc); + yyjson_mut_obj_add_strcpy(doc, item, "name", + arch.packages[i].name ? arch.packages[i].name : ""); + yyjson_mut_obj_add_int(doc, item, "node_count", arch.packages[i].node_count); + yyjson_mut_obj_add_int(doc, item, "fan_in", arch.packages[i].fan_in); + yyjson_mut_obj_add_int(doc, item, "fan_out", arch.packages[i].fan_out); + yyjson_mut_arr_add_val(pkgs, item); + } + yyjson_mut_obj_add_val(doc, root, "packages", pkgs); + } + + /* Clusters */ + if (arch.cluster_count > 0) { + yyjson_mut_val *cls = yyjson_mut_arr(doc); + for (int i = 0; i < arch.cluster_count; i++) { + yyjson_mut_val *item = yyjson_mut_obj(doc); + yyjson_mut_obj_add_int(doc, item, "id", arch.clusters[i].id); + yyjson_mut_obj_add_strcpy(doc, item, "label", + arch.clusters[i].label ? 
arch.clusters[i].label : ""); + yyjson_mut_obj_add_int(doc, item, "members", arch.clusters[i].members); + yyjson_mut_obj_add_real(doc, item, "cohesion", arch.clusters[i].cohesion); + if (arch.clusters[i].top_node_count > 0) { + yyjson_mut_val *tn = yyjson_mut_arr(doc); + for (int j = 0; j < arch.clusters[i].top_node_count; j++) { + yyjson_mut_arr_add_strcpy(doc, tn, arch.clusters[i].top_nodes[j]); + } + yyjson_mut_obj_add_val(doc, item, "top_nodes", tn); + } + yyjson_mut_arr_add_val(cls, item); + } + yyjson_mut_obj_add_val(doc, root, "clusters", cls); + } + /* Relationship patterns */ if (schema.rel_pattern_count > 0) { yyjson_mut_val *pats = yyjson_mut_arr(doc); @@ -1210,6 +2054,7 @@ static char *handle_get_architecture(cbm_mcp_server_t *srv, const char *args) { char *json = yy_doc_to_str(doc); yyjson_mut_doc_free(doc); + cbm_store_architecture_free(&arch); cbm_store_schema_free(&schema); free(project); @@ -1258,13 +2103,149 @@ static char *handle_trace_call_path(cbm_mcp_server_t *srv, const char *args) { cbm_store_find_nodes_by_name(store, project, func_name, &nodes, &node_count); if (node_count == 0) { - free(func_name); - free(project); - free(direction); + /* Fuzzy fallback: try substring match when exact name not found. + * This handles cases like searching for "RecordingSession" when only + * "ContinuousRecordingSessionDataGen" exists. 
*/ + cbm_search_params_t fuzzy = {0}; + char pattern[512]; + snprintf(pattern, sizeof(pattern), ".*%s.*", func_name); + fuzzy.project = project; + fuzzy.name_pattern = pattern; + fuzzy.limit = 10; + cbm_search_output_t fuzzy_results = {0}; + cbm_store_search(store, &fuzzy, &fuzzy_results); + + if (fuzzy_results.count > 0) { + /* Return fuzzy matches as suggestions */ + yyjson_mut_doc *fdoc = yyjson_mut_doc_new(NULL); + yyjson_mut_val *froot = yyjson_mut_obj(fdoc); + yyjson_mut_doc_set_root(fdoc, froot); + yyjson_mut_obj_add_str(fdoc, froot, "status", "not_found_exact"); + char msg[512]; + snprintf(msg, sizeof(msg), + "No exact match for '%s'. Found %d partial matches — " + "use one of these exact names:", func_name, fuzzy_results.count); + yyjson_mut_obj_add_strcpy(fdoc, froot, "message", msg); + yyjson_mut_val *suggestions = yyjson_mut_arr(fdoc); + for (int i = 0; i < fuzzy_results.count; i++) { + yyjson_mut_val *si = yyjson_mut_obj(fdoc); + yyjson_mut_obj_add_strcpy(fdoc, si, "name", + fuzzy_results.results[i].node.name ? fuzzy_results.results[i].node.name : ""); + yyjson_mut_obj_add_strcpy(fdoc, si, "label", + fuzzy_results.results[i].node.label ? fuzzy_results.results[i].node.label : ""); + yyjson_mut_obj_add_strcpy(fdoc, si, "file_path", + fuzzy_results.results[i].node.file_path ? 
fuzzy_results.results[i].node.file_path : ""); + yyjson_mut_obj_add_int(fdoc, si, "line", fuzzy_results.results[i].node.start_line); + yyjson_mut_arr_add_val(suggestions, si); + } + yyjson_mut_obj_add_val(fdoc, froot, "suggestions", suggestions); + char *fjson = yy_doc_to_str(fdoc); + yyjson_mut_doc_free(fdoc); + cbm_store_search_free(&fuzzy_results); + free(func_name); free(project); free(direction); + cbm_store_free_nodes(nodes, 0); + char *result = cbm_mcp_text_result(fjson, false); + free(fjson); + return result; + } + cbm_store_search_free(&fuzzy_results); + free(func_name); free(project); free(direction); cbm_store_free_nodes(nodes, 0); return cbm_mcp_text_result("{\"error\":\"function not found\"}", true); } + /* Pick the best node for tracing. Strategy: + * 1. Prefer Function/Method nodes that are NOT constructors (same name as a + * Class in the result set — constructors rarely have interesting CALLS). + * 2. If only Class/Interface nodes match, resolve through DEFINES_METHOD. */ + int best_idx = 0; + bool has_class = false; + int class_idx = -1; + for (int i = 0; i < node_count; i++) { + const char *lbl = nodes[i].label; + if (lbl && (strcmp(lbl, "Class") == 0 || strcmp(lbl, "Interface") == 0)) { + has_class = true; + if (class_idx < 0) class_idx = i; + } + } + /* Look for a non-constructor Function/Method */ + bool found_callable = false; + for (int i = 0; i < node_count; i++) { + const char *lbl = nodes[i].label; + if (lbl && (strcmp(lbl, "Function") == 0 || strcmp(lbl, "Method") == 0)) { + /* Skip if this is a constructor (same name as a Class in results) */ + if (has_class) continue; + best_idx = i; + found_callable = true; + break; + } + } + /* If no non-constructor callable was found but we have a Class, use the Class */ + if (!found_callable && class_idx >= 0) { + best_idx = class_idx; + } + + /* Track disambiguation info — added to the main doc after creation */ + int callable_count = 0; + for (int i = 0; i < node_count; i++) { + const char *lbl = 
nodes[i].label; + if (lbl && strcmp(lbl, "File") != 0 && strcmp(lbl, "Folder") != 0 && + strcmp(lbl, "Module") != 0 && strcmp(lbl, "Variable") != 0 && + strcmp(lbl, "Section") != 0 && strcmp(lbl, "Project") != 0) { + callable_count++; + } + } + + /* Determine if the selected node is a Class or Interface. If so, we need to + * resolve through DEFINES_METHOD edges to find the actual callable methods, + * then run BFS from each method and merge results. */ + bool is_class_like = false; + const char *best_label = nodes[best_idx].label; + if (best_label && + (strcmp(best_label, "Class") == 0 || strcmp(best_label, "Interface") == 0)) { + is_class_like = true; + } + + /* Collect BFS start IDs: either the single node, or all methods of the class */ + int64_t *start_ids = NULL; + int start_id_count = 0; + + if (is_class_like) { + /* Find all DEFINES_METHOD targets of this class */ + cbm_edge_t *dm_edges = NULL; + int dm_count = 0; + cbm_store_find_edges_by_source_type(store, nodes[best_idx].id, "DEFINES_METHOD", + &dm_edges, &dm_count); + if (dm_count > 0) { + /* Cap at 5 methods to prevent excessive BFS calls (each method + * spawns ~6 BFS queries across edge type categories) */ + int use_count = dm_count > 5 ? 
5 : dm_count; + start_ids = malloc((size_t)use_count * sizeof(int64_t)); + for (int i = 0; i < use_count; i++) { + start_ids[i] = dm_edges[i].target_id; + } + start_id_count = use_count; + } + /* Free edge data */ + for (int i = 0; i < dm_count; i++) { + free((void *)dm_edges[i].project); + free((void *)dm_edges[i].type); + free((void *)dm_edges[i].properties_json); + } + free(dm_edges); + + /* If no methods found, fall back to the class node itself */ + if (start_id_count == 0) { + start_ids = malloc(sizeof(int64_t)); + start_ids[0] = nodes[best_idx].id; + start_id_count = 1; + } + } else { + start_ids = malloc(sizeof(int64_t)); + start_ids[0] = nodes[best_idx].id; + start_id_count = 1; + } + yyjson_mut_doc *doc = yyjson_mut_doc_new(NULL); yyjson_mut_val *root = yyjson_mut_obj(doc); yyjson_mut_doc_set_root(doc, root); @@ -1272,68 +2253,341 @@ static char *handle_trace_call_path(cbm_mcp_server_t *srv, const char *args) { yyjson_mut_obj_add_str(doc, root, "function", func_name); yyjson_mut_obj_add_str(doc, root, "direction", direction); - const char *edge_types[] = {"CALLS"}; - int edge_type_count = 1; + /* Add matched node info */ + yyjson_mut_obj_add_strcpy(doc, root, "matched_file", + nodes[best_idx].file_path ? nodes[best_idx].file_path : ""); + yyjson_mut_obj_add_strcpy(doc, root, "matched_label", + nodes[best_idx].label ? nodes[best_idx].label : ""); + yyjson_mut_obj_add_int(doc, root, "matched_line", nodes[best_idx].start_line); + + /* Disambiguation: list all callable candidates when multiple match */ + if (callable_count > 1) { + yyjson_mut_val *cands = yyjson_mut_arr(doc); + for (int i = 0; i < node_count; i++) { + const char *lbl = nodes[i].label; + if (lbl && strcmp(lbl, "File") != 0 && strcmp(lbl, "Folder") != 0 && + strcmp(lbl, "Module") != 0 && strcmp(lbl, "Variable") != 0 && + strcmp(lbl, "Section") != 0 && strcmp(lbl, "Project") != 0) { + yyjson_mut_val *ci = yyjson_mut_obj(doc); + yyjson_mut_obj_add_strcpy(doc, ci, "name", + nodes[i].name ? 
nodes[i].name : ""); + yyjson_mut_obj_add_strcpy(doc, ci, "label", + nodes[i].label ? nodes[i].label : ""); + yyjson_mut_obj_add_strcpy(doc, ci, "file_path", + nodes[i].file_path ? nodes[i].file_path : ""); + yyjson_mut_obj_add_int(doc, ci, "line", nodes[i].start_line); + yyjson_mut_arr_add_val(cands, ci); + } + } + yyjson_mut_obj_add_val(doc, root, "candidates", cands); + } + + /* Check if the node has any edges at all. If not, return basic info only. + * This avoids BFS crashes on nodes with 0 edges (e.g. Type nodes, empty Classes). */ + { + int in_deg = 0; + int out_deg = 0; + cbm_store_node_degree(store, nodes[best_idx].id, &in_deg, &out_deg); + if (in_deg == 0 && out_deg == 0 && !is_class_like) { + /* No CALLS edges and not a Class — return basic info. + * Class/Interface nodes skip this check because they have + * DEFINES_METHOD and INHERITS edges that aren't counted by + * cbm_store_node_degree (which only counts CALLS). */ + char *json = yy_doc_to_str(doc); + yyjson_mut_doc_free(doc); + free(start_ids); + cbm_store_free_nodes(nodes, node_count); + free(func_name); free(project); free(direction); + char *result = cbm_mcp_text_result(json, false); + free(json); + return result; + } + } + + /* ── Categorized edge query: like GitNexus context() ── + * Instead of flat BFS, query each edge type separately and return + * categorized results: incoming.calls, incoming.imports, incoming.extends, + * outgoing.calls, outgoing.has_method, outgoing.has_property. + * This gives investigation-grade output where a QA engineer can see + * exactly which functions CALL this vs which files IMPORT it. */ + + /* Helper: query edges for specific types and build JSON array. + * Uses strcpy variants since nodes are freed per-query. */ + #define EDGE_QUERY_MAX 30 - /* Run BFS for each requested direction. - * IMPORTANT: yyjson_mut_obj_add_str borrows pointers — we must keep - * traversal results alive until after yy_doc_to_str serialization. 
*/ // NOLINTNEXTLINE(readability-implicit-bool-conversion) bool do_outbound = strcmp(direction, "outbound") == 0 || strcmp(direction, "both") == 0; // NOLINTNEXTLINE(readability-implicit-bool-conversion) bool do_inbound = strcmp(direction, "inbound") == 0 || strcmp(direction, "both") == 0; - cbm_traverse_result_t tr_out = {0}; - cbm_traverse_result_t tr_in = {0}; + /* Collect all traversal results for lifetime management */ + #define MAX_TR 64 + cbm_traverse_result_t *all_tr = calloc(MAX_TR, sizeof(cbm_traverse_result_t)); + int tr_count = 0; - if (do_outbound) { - cbm_store_bfs(store, nodes[0].id, "outbound", edge_types, edge_type_count, depth, 100, - &tr_out); + if (do_inbound) { + yyjson_mut_val *incoming = yyjson_mut_obj(doc); + + /* Incoming CALLS (direct callers — hop 1 only for clean results). + * For Classes: also include USAGE and DEFINES edges which capture + * file-level references like `new MyClass()` and `import MyClass`. + * Query both the class node AND its methods as BFS roots. 
*/ + { + const char *call_types[] = {"CALLS", "HTTP_CALLS", "ASYNC_CALLS", "USAGE", "RAISES"}; + /* Always include the original node (Class or Function) */ + if (tr_count < MAX_TR) { + cbm_store_bfs(store, nodes[best_idx].id, "inbound", call_types, 5, 1, + EDGE_QUERY_MAX, &all_tr[tr_count]); + tr_count++; + } + /* Also include methods for class resolution */ + for (int s = 0; s < start_id_count && tr_count < MAX_TR; s++) { + if (start_ids[s] == nodes[best_idx].id) continue; /* already queried */ + cbm_store_bfs(store, start_ids[s], "inbound", call_types, 5, 1, EDGE_QUERY_MAX, + &all_tr[tr_count]); + if (all_tr[tr_count].visited_count > 0) { + tr_count++; + } + } + } + /* Build calls array from all BFS results */ + yyjson_mut_val *calls_arr = yyjson_mut_arr(doc); + for (int t = 0; t < tr_count; t++) { + for (int i = 0; i < all_tr[t].visited_count; i++) { + cbm_node_t *vn = &all_tr[t].visited[i].node; + yyjson_mut_val *item = yyjson_mut_obj(doc); + yyjson_mut_obj_add_str(doc, item, "name", vn->name ? vn->name : ""); + yyjson_mut_obj_add_str(doc, item, "file_path", vn->file_path ? vn->file_path : ""); + yyjson_mut_obj_add_int(doc, item, "line", vn->start_line); + yyjson_mut_arr_add_val(calls_arr, item); + } + } + yyjson_mut_obj_add_val(doc, incoming, "calls", calls_arr); + + /* Incoming IMPORTS */ + { + int saved_tr = tr_count; + for (int s = 0; s < start_id_count && tr_count < MAX_TR; s++) { + const char *imp_types[] = {"IMPORTS"}; + cbm_store_bfs(store, start_ids[s], "inbound", imp_types, 1, 1, EDGE_QUERY_MAX, + &all_tr[tr_count]); + tr_count++; + } + yyjson_mut_val *imp_arr = yyjson_mut_arr(doc); + for (int t = saved_tr; t < tr_count; t++) { + for (int i = 0; i < all_tr[t].visited_count; i++) { + cbm_node_t *vn = &all_tr[t].visited[i].node; + yyjson_mut_val *item = yyjson_mut_obj(doc); + yyjson_mut_obj_add_str(doc, item, "name", vn->name ? vn->name : ""); + yyjson_mut_obj_add_str(doc, item, "file_path", vn->file_path ? 
vn->file_path : ""); + yyjson_mut_arr_add_val(imp_arr, item); + } + } + yyjson_mut_obj_add_val(doc, incoming, "imports", imp_arr); + } - yyjson_mut_val *callees = yyjson_mut_arr(doc); - for (int i = 0; i < tr_out.visited_count; i++) { - yyjson_mut_val *item = yyjson_mut_obj(doc); - yyjson_mut_obj_add_str(doc, item, "name", - tr_out.visited[i].node.name ? tr_out.visited[i].node.name : ""); - yyjson_mut_obj_add_str( - doc, item, "qualified_name", - tr_out.visited[i].node.qualified_name ? tr_out.visited[i].node.qualified_name : ""); - yyjson_mut_obj_add_int(doc, item, "hop", tr_out.visited[i].hop); - yyjson_mut_arr_add_val(callees, item); + /* Incoming INHERITS (who extends this) */ + { + int saved_tr = tr_count; + for (int s = 0; s < start_id_count && tr_count < MAX_TR; s++) { + const char *inh_types[] = {"INHERITS"}; + cbm_store_bfs(store, start_ids[s], "inbound", inh_types, 1, 1, EDGE_QUERY_MAX, + &all_tr[tr_count]); + tr_count++; + } + yyjson_mut_val *inh_arr = yyjson_mut_arr(doc); + for (int t = saved_tr; t < tr_count; t++) { + for (int i = 0; i < all_tr[t].visited_count; i++) { + cbm_node_t *vn = &all_tr[t].visited[i].node; + yyjson_mut_val *item = yyjson_mut_obj(doc); + yyjson_mut_obj_add_str(doc, item, "name", vn->name ? vn->name : ""); + yyjson_mut_obj_add_str(doc, item, "file_path", vn->file_path ? vn->file_path : ""); + yyjson_mut_arr_add_val(inh_arr, item); + } + } + yyjson_mut_obj_add_val(doc, incoming, "extends", inh_arr); + } + + yyjson_mut_obj_add_val(doc, root, "incoming", incoming); + + /* Also include deeper BFS (hop 2+) as a separate "transitive_callers" field + * for users who need it — but only on CALLS, capped at 50. 
*/ + if (depth > 1) { + int saved_tr2 = tr_count; + for (int s = 0; s < start_id_count && tr_count < MAX_TR; s++) { + const char *call_types[] = {"CALLS", "HTTP_CALLS", "ASYNC_CALLS"}; + cbm_store_bfs(store, start_ids[s], "inbound", call_types, 3, depth, 50, + &all_tr[tr_count]); + tr_count++; + } + yyjson_mut_val *trans_arr = yyjson_mut_arr(doc); + for (int t = saved_tr2; t < tr_count; t++) { + for (int i = 0; i < all_tr[t].visited_count; i++) { + if (all_tr[t].visited[i].hop <= 1) continue; /* skip hop 1, already shown */ + cbm_node_t *vn = &all_tr[t].visited[i].node; + yyjson_mut_val *item = yyjson_mut_obj(doc); + yyjson_mut_obj_add_str(doc, item, "name", vn->name ? vn->name : ""); + yyjson_mut_obj_add_str(doc, item, "file_path", vn->file_path ? vn->file_path : ""); + yyjson_mut_obj_add_int(doc, item, "hop", all_tr[t].visited[i].hop); + yyjson_mut_arr_add_val(trans_arr, item); + } + } + yyjson_mut_obj_add_val(doc, root, "transitive_callers", trans_arr); } - yyjson_mut_obj_add_val(doc, root, "callees", callees); } - if (do_inbound) { - cbm_store_bfs(store, nodes[0].id, "inbound", edge_types, edge_type_count, depth, 100, - &tr_in); + if (do_outbound) { + yyjson_mut_val *outgoing = yyjson_mut_obj(doc); + + /* Outgoing CALLS */ + { + int saved_tr = tr_count; + for (int s = 0; s < start_id_count && tr_count < MAX_TR; s++) { + const char *call_types[] = {"CALLS", "HTTP_CALLS", "ASYNC_CALLS"}; + cbm_store_bfs(store, start_ids[s], "outbound", call_types, 3, 1, EDGE_QUERY_MAX, + &all_tr[tr_count]); + tr_count++; + } + yyjson_mut_val *calls_arr = yyjson_mut_arr(doc); + for (int t = saved_tr; t < tr_count; t++) { + for (int i = 0; i < all_tr[t].visited_count; i++) { + cbm_node_t *vn = &all_tr[t].visited[i].node; + yyjson_mut_val *item = yyjson_mut_obj(doc); + yyjson_mut_obj_add_str(doc, item, "name", vn->name ? vn->name : ""); + yyjson_mut_obj_add_str(doc, item, "file_path", vn->file_path ? 
vn->file_path : ""); + yyjson_mut_obj_add_int(doc, item, "line", vn->start_line); + yyjson_mut_arr_add_val(calls_arr, item); + } + } + yyjson_mut_obj_add_val(doc, outgoing, "calls", calls_arr); + } - yyjson_mut_val *callers = yyjson_mut_arr(doc); - for (int i = 0; i < tr_in.visited_count; i++) { - yyjson_mut_val *item = yyjson_mut_obj(doc); - yyjson_mut_obj_add_str(doc, item, "name", - tr_in.visited[i].node.name ? tr_in.visited[i].node.name : ""); - yyjson_mut_obj_add_str( - doc, item, "qualified_name", - tr_in.visited[i].node.qualified_name ? tr_in.visited[i].node.qualified_name : ""); - yyjson_mut_obj_add_int(doc, item, "hop", tr_in.visited[i].hop); - yyjson_mut_arr_add_val(callers, item); + /* Outgoing DEFINES_METHOD (for Classes). + * Use the original Class node ID, not start_ids (which are method IDs). + * DEFINES_METHOD edges go FROM the Class TO its Methods. */ + { + int saved_tr = tr_count; + if (is_class_like && tr_count < MAX_TR) { + const char *dm_types[] = {"DEFINES_METHOD"}; + cbm_store_bfs(store, nodes[best_idx].id, "outbound", dm_types, 1, 1, 30, + &all_tr[tr_count]); + tr_count++; + } + yyjson_mut_val *methods_arr = yyjson_mut_arr(doc); + for (int t = saved_tr; t < tr_count; t++) { + for (int i = 0; i < all_tr[t].visited_count; i++) { + cbm_node_t *vn = &all_tr[t].visited[i].node; + yyjson_mut_val *item = yyjson_mut_obj(doc); + yyjson_mut_obj_add_str(doc, item, "name", vn->name ? vn->name : ""); + yyjson_mut_obj_add_str(doc, item, "file_path", vn->file_path ? vn->file_path : ""); + yyjson_mut_obj_add_int(doc, item, "line", vn->start_line); + yyjson_mut_arr_add_val(methods_arr, item); + } + } + yyjson_mut_obj_add_val(doc, outgoing, "has_method", methods_arr); + } + + /* Outgoing HAS_PROPERTY (for Classes — class properties). 
*/ + { + int saved_tr = tr_count; + if (is_class_like && tr_count < MAX_TR) { + const char *hp_types[] = {"HAS_PROPERTY"}; + cbm_store_bfs(store, nodes[best_idx].id, "outbound", hp_types, 1, 1, 30, + &all_tr[tr_count]); + tr_count++; + } + yyjson_mut_val *props_arr = yyjson_mut_arr(doc); + for (int t = saved_tr; t < tr_count; t++) { + for (int i = 0; i < all_tr[t].visited_count; i++) { + cbm_node_t *vn = &all_tr[t].visited[i].node; + yyjson_mut_val *item = yyjson_mut_obj(doc); + yyjson_mut_obj_add_str(doc, item, "name", vn->name ? vn->name : ""); + yyjson_mut_obj_add_str(doc, item, "file_path", vn->file_path ? vn->file_path : ""); + yyjson_mut_obj_add_int(doc, item, "line", vn->start_line); + yyjson_mut_arr_add_val(props_arr, item); + } + } + yyjson_mut_obj_add_val(doc, outgoing, "has_property", props_arr); } - yyjson_mut_obj_add_val(doc, root, "callers", callers); + + /* Outgoing INHERITS (what this extends) */ + { + int saved_tr = tr_count; + const char *inh_types[] = {"INHERITS"}; + cbm_store_bfs(store, nodes[best_idx].id, "outbound", inh_types, 1, 1, 10, + &all_tr[tr_count]); + tr_count++; + yyjson_mut_val *ext_arr = yyjson_mut_arr(doc); + for (int t = saved_tr; t < tr_count; t++) { + for (int i = 0; i < all_tr[t].visited_count; i++) { + cbm_node_t *vn = &all_tr[t].visited[i].node; + yyjson_mut_val *item = yyjson_mut_obj(doc); + yyjson_mut_obj_add_str(doc, item, "name", vn->name ? vn->name : ""); + yyjson_mut_obj_add_str(doc, item, "file_path", vn->file_path ? 
vn->file_path : ""); + yyjson_mut_arr_add_val(ext_arr, item); + } + } + yyjson_mut_obj_add_val(doc, outgoing, "extends", ext_arr); + } + + yyjson_mut_obj_add_val(doc, root, "outgoing", outgoing); + } + + /* Process participation */ + { + cbm_process_info_t *procs = NULL; + int pcount = 0; + cbm_store_list_processes(store, project, &procs, &pcount); + if (pcount > 0) { + yyjson_mut_val *flows = yyjson_mut_arr(doc); + int flow_count = 0; + for (int pi = 0; pi < pcount && flow_count < 20; pi++) { + bool participates = false; + if (procs[pi].entry_point_id == nodes[best_idx].id || + procs[pi].terminal_id == nodes[best_idx].id) { + participates = true; + } + if (!participates) { + for (int si = 0; si < start_id_count; si++) { + if (procs[pi].entry_point_id == start_ids[si] || + procs[pi].terminal_id == start_ids[si]) { + participates = true; + break; + } + } + } + if (!participates && func_name && procs[pi].label) { + if (strstr(procs[pi].label, func_name) != NULL) { + participates = true; + } + } + if (participates) { + yyjson_mut_val *fi = yyjson_mut_obj(doc); + yyjson_mut_obj_add_strcpy(doc, fi, "label", + procs[pi].label ? 
procs[pi].label : ""); + yyjson_mut_obj_add_int(doc, fi, "step_count", procs[pi].step_count); + yyjson_mut_arr_add_val(flows, fi); + flow_count++; + } + } + if (flow_count > 0) yyjson_mut_obj_add_val(doc, root, "processes", flows); + } + cbm_store_free_processes(procs, pcount); } /* Serialize BEFORE freeing traversal results (yyjson borrows strings) */ char *json = yy_doc_to_str(doc); yyjson_mut_doc_free(doc); - /* Now safe to free traversal data */ - if (do_outbound) { - cbm_store_traverse_free(&tr_out); - } - if (do_inbound) { - cbm_store_traverse_free(&tr_in); + /* Now safe to free all traversal data */ + for (int t = 0; t < tr_count; t++) { + cbm_store_traverse_free(&all_tr[t]); } + free(all_tr); + #undef EDGE_QUERY_MAX + #undef MAX_TR + free(start_ids); cbm_store_free_nodes(nodes, node_count); free(func_name); free(project); @@ -2502,7 +3756,10 @@ static char *handle_detect_changes(cbm_mcp_server_t *srv, const char *args) { continue; } - yyjson_mut_arr_add_str(doc, changed, line); + /* Use strcpy variants: line is a stack buffer reused each iteration, + * and node strings are freed by cbm_store_free_nodes below. + * yyjson_mut_*_add_str only borrows pointers — strcpy makes copies. */ + yyjson_mut_arr_add_strcpy(doc, changed, line); file_count++; /* Find symbols defined in this file */ @@ -2514,9 +3771,9 @@ static char *handle_detect_changes(cbm_mcp_server_t *srv, const char *args) { if (nodes[i].label && strcmp(nodes[i].label, "File") != 0 && strcmp(nodes[i].label, "Folder") != 0 && strcmp(nodes[i].label, "Project") != 0) { yyjson_mut_val *item = yyjson_mut_obj(doc); - yyjson_mut_obj_add_str(doc, item, "name", nodes[i].name ? nodes[i].name : ""); - yyjson_mut_obj_add_str(doc, item, "label", nodes[i].label); - yyjson_mut_obj_add_str(doc, item, "file", line); + yyjson_mut_obj_add_strcpy(doc, item, "name", nodes[i].name ? 
nodes[i].name : ""); + yyjson_mut_obj_add_strcpy(doc, item, "label", nodes[i].label); + yyjson_mut_obj_add_strcpy(doc, item, "file", line); yyjson_mut_arr_add_val(impacted, item); } } @@ -2672,6 +3929,295 @@ static char *handle_ingest_traces(cbm_mcp_server_t *srv, const char *args) { return result; } +/* ── generate_embeddings ─────────────────────────────────────── */ + +static char *handle_generate_embeddings(cbm_mcp_server_t *srv, const char *args) { + char *project = cbm_mcp_get_string_arg(args, "project"); + cbm_store_t *store = resolve_store(srv, project); + REQUIRE_STORE(store, project); + + if (!cbm_embedding_is_configured()) { + free(project); + return cbm_mcp_text_result( + "{\"error\":\"CBM_EMBEDDING_URL not set. " + "Set CBM_EMBEDDING_URL to an OpenAI-compatible /v1/embeddings endpoint " + "(e.g., http://localhost:11434/v1 for Ollama).\"}", true); + } + + bool force = cbm_mcp_get_bool_arg(args, "force"); + int existing = cbm_store_count_embeddings(store, project); + + int generated = cbm_embedding_generate_for_project(store, project, force); + + yyjson_mut_doc *doc = yyjson_mut_doc_new(NULL); + yyjson_mut_val *root = yyjson_mut_obj(doc); + yyjson_mut_doc_set_root(doc, root); + + yyjson_mut_obj_add_str(doc, root, "status", generated >= 0 ? "success" : "error"); + yyjson_mut_obj_add_int(doc, root, "generated", generated >= 0 ? generated : 0); + yyjson_mut_obj_add_int(doc, root, "existing_before", existing); + yyjson_mut_obj_add_int(doc, root, "total_embeddings", + cbm_store_count_embeddings(store, project)); + + cbm_embedding_config_t cfg = cbm_embedding_get_config(); + yyjson_mut_obj_add_str(doc, root, "model", cfg.model ? cfg.model : ""); + yyjson_mut_obj_add_int(doc, root, "dimensions", cfg.dims); + yyjson_mut_obj_add_str(doc, root, "url", cfg.url ? cfg.url : ""); + + if (generated >= 0) { + yyjson_mut_obj_add_str(doc, root, "hint", + "Embeddings generated. 
/*
 * Produce a short, display-friendly project name from a full project id.
 *
 * Everything after the first occurrence of "Projects-" is kept, e.g.
 *   "mnt-c-Users-Name-Projects-repo-name" -> "repo-name".
 * When the marker is absent the full id is used unchanged.
 *
 * full_project must be a non-NULL, NUL-terminated string.
 * Returns the result of heap_strdup(); callers in this file release it
 * with free().
 */
static char *derive_short_project(const char *full_project) {
    static const char prefix[] = "Projects-";

    const char *hit = strstr(full_project, prefix);
    const char *short_name = hit ? hit + (sizeof prefix - 1) : full_project;

    return heap_strdup(short_name);
}
"hybrid_bm25_vector" : "bm25"); + yyjson_mut_obj_add_bool(doc, root, "cross_repo", true); + + yyjson_mut_val *results = yyjson_mut_arr(doc); + for (int i = 0; i < out.count; i++) { + cbm_cross_search_result_t *r = &out.results[i]; + yyjson_mut_val *item = yyjson_mut_obj(doc); + + /* Short project name for display */ + char *short_proj = derive_short_project(r->project ? r->project : ""); + yyjson_mut_obj_add_strcpy(doc, item, "project", short_proj ? short_proj : ""); + /* Full project ID for follow-up calls */ + yyjson_mut_obj_add_strcpy(doc, item, "project_id", r->project ? r->project : ""); + free(short_proj); + + yyjson_mut_obj_add_strcpy(doc, item, "name", r->name ? r->name : ""); + yyjson_mut_obj_add_strcpy(doc, item, "qualified_name", + r->qualified_name ? r->qualified_name : ""); + yyjson_mut_obj_add_strcpy(doc, item, "label", r->label ? r->label : ""); + yyjson_mut_obj_add_strcpy(doc, item, "file_path", r->file_path ? r->file_path : ""); + yyjson_mut_obj_add_real(doc, item, "score", r->score); + if (r->similarity > 0) + yyjson_mut_obj_add_real(doc, item, "similarity", r->similarity); + + yyjson_mut_arr_add_val(results, item); + } + yyjson_mut_obj_add_val(doc, root, "results", results); + + char *json = yy_doc_to_str(doc); + yyjson_mut_doc_free(doc); + cbm_cross_search_free(&out); + cbm_cross_repo_close(cr); + free(query); + free(query_vec); + + char *result = cbm_mcp_text_result(json, false); + free(json); + return result; +} + +/* ── build_cross_repo_index ──────────────────────────────────── */ + +static char *handle_build_cross_repo_index(cbm_mcp_server_t *srv, const char *args) { + (void)srv; (void)args; + + cbm_cross_repo_stats_t stats = cbm_cross_repo_build(); + + yyjson_mut_doc *doc = yyjson_mut_doc_new(NULL); + yyjson_mut_val *root = yyjson_mut_obj(doc); + yyjson_mut_doc_set_root(doc, root); + + yyjson_mut_obj_add_str(doc, root, "status", + stats.repos_scanned >= 0 ? 
"success" : "error"); + yyjson_mut_obj_add_int(doc, root, "repos_scanned", stats.repos_scanned); + yyjson_mut_obj_add_int(doc, root, "nodes_copied", stats.nodes_copied); + yyjson_mut_obj_add_int(doc, root, "channels_copied", stats.channels_copied); + yyjson_mut_obj_add_int(doc, root, "embeddings_copied", stats.embeddings_copied); + yyjson_mut_obj_add_int(doc, root, "cross_repo_channel_matches", stats.cross_repo_matches); + yyjson_mut_obj_add_real(doc, root, "build_time_ms", stats.build_time_ms); + + char *json = yy_doc_to_str(doc); + yyjson_mut_doc_free(doc); + + char *result = cbm_mcp_text_result(json, stats.repos_scanned < 0); + free(json); + return result; +} + +/* ── trace_cross_repo ────────────────────────────────────────── */ + +/* Add trace steps as a JSON array to a parent object */ +static void add_trace_steps(yyjson_mut_doc *doc, yyjson_mut_val *parent, + const char *key, cbm_cross_trace_step_t *steps, int count) { + yyjson_mut_val *arr = yyjson_mut_arr(doc); + for (int i = 0; i < count; i++) { + yyjson_mut_val *step = yyjson_mut_obj(doc); + yyjson_mut_obj_add_strcpy(doc, step, "name", steps[i].name ? steps[i].name : ""); + yyjson_mut_obj_add_strcpy(doc, step, "label", steps[i].label ? steps[i].label : ""); + yyjson_mut_obj_add_strcpy(doc, step, "file_path", + steps[i].file_path ? steps[i].file_path : ""); + yyjson_mut_obj_add_int(doc, step, "depth", steps[i].depth); + yyjson_mut_arr_add_val(arr, step); + } + yyjson_mut_obj_add_val(doc, parent, key, arr); +} + +static char *handle_trace_cross_repo(cbm_mcp_server_t *srv, const char *args) { + (void)srv; + char *channel = cbm_mcp_get_string_arg(args, "channel"); + bool trace_calls = (channel && channel[0]); /* only trace call chains when channel filter given */ + + cbm_cross_repo_t *cr = cbm_cross_repo_open(); + if (!cr) { + free(channel); + return cbm_mcp_text_result( + "{\"error\":\"Cross-repo index not built. 
Run build_cross_repo_index first.\"}", true); + } + + /* Get cross-repo info */ + cbm_cross_repo_info_t info = {0}; + cbm_cross_repo_get_info(cr, &info); + + /* Get channel matches */ + cbm_cross_channel_match_t *matches = NULL; + int match_count = 0; + cbm_cross_repo_match_channels(cr, channel, &matches, &match_count); + + yyjson_mut_doc *doc = yyjson_mut_doc_new(NULL); + yyjson_mut_val *root = yyjson_mut_obj(doc); + yyjson_mut_doc_set_root(doc, root); + + yyjson_mut_obj_add_int(doc, root, "total_repos", info.total_repos); + yyjson_mut_obj_add_int(doc, root, "total_cross_repo_channels", info.cross_repo_channel_count); + yyjson_mut_obj_add_int(doc, root, "matches", match_count); + yyjson_mut_obj_add_bool(doc, root, "call_chains_included", trace_calls); + if (info.built_at) + yyjson_mut_obj_add_strcpy(doc, root, "built_at", info.built_at); + + yyjson_mut_val *arr = yyjson_mut_arr(doc); + for (int i = 0; i < match_count; i++) { + cbm_cross_channel_match_t *m = &matches[i]; + yyjson_mut_val *item = yyjson_mut_obj(doc); + yyjson_mut_obj_add_strcpy(doc, item, "channel", m->channel_name ? m->channel_name : ""); + yyjson_mut_obj_add_strcpy(doc, item, "transport", m->transport ? m->transport : ""); + + /* Emitter side */ + yyjson_mut_val *emit = yyjson_mut_obj(doc); + char *ep_short = derive_short_project(m->emit_project ? m->emit_project : ""); + yyjson_mut_obj_add_strcpy(doc, emit, "project", ep_short ? ep_short : ""); + free(ep_short); + yyjson_mut_obj_add_strcpy(doc, emit, "file", m->emit_file ? m->emit_file : ""); + yyjson_mut_obj_add_strcpy(doc, emit, "function", m->emit_function ? 
m->emit_function : ""); + + /* Trace upstream callers of the emitter (what triggers the emit) */ + if (trace_calls && m->emit_project) { + char db_path[2048]; + project_db_path(m->emit_project, db_path, sizeof(db_path)); + cbm_cross_trace_step_t *steps = NULL; + int step_count = 0; + cbm_cross_repo_trace_in_project(db_path, m->emit_function, + m->emit_file, m->channel_name, + "inbound", 2, &steps, &step_count); + if (step_count > 0) { + add_trace_steps(doc, emit, "upstream", steps, step_count); + } + cbm_cross_trace_free(steps, step_count); + } + yyjson_mut_obj_add_val(doc, item, "emitter", emit); + + /* Listener side */ + yyjson_mut_val *listen = yyjson_mut_obj(doc); + char *lp_short = derive_short_project(m->listen_project ? m->listen_project : ""); + yyjson_mut_obj_add_strcpy(doc, listen, "project", lp_short ? lp_short : ""); + free(lp_short); + yyjson_mut_obj_add_strcpy(doc, listen, "file", m->listen_file ? m->listen_file : ""); + yyjson_mut_obj_add_strcpy(doc, listen, "function", m->listen_function ? 
m->listen_function : ""); + + /* Trace downstream callees of the listener (what the listener triggers) */ + if (trace_calls && m->listen_project) { + char db_path[2048]; + project_db_path(m->listen_project, db_path, sizeof(db_path)); + cbm_cross_trace_step_t *steps = NULL; + int step_count = 0; + cbm_cross_repo_trace_in_project(db_path, m->listen_function, + m->listen_file, m->channel_name, + "outbound", 2, &steps, &step_count); + if (step_count > 0) { + add_trace_steps(doc, listen, "downstream", steps, step_count); + } + cbm_cross_trace_free(steps, step_count); + } + yyjson_mut_obj_add_val(doc, item, "listener", listen); + + yyjson_mut_arr_add_val(arr, item); + } + yyjson_mut_obj_add_val(doc, root, "channel_flows", arr); + + char *json = yy_doc_to_str(doc); + yyjson_mut_doc_free(doc); + + cbm_cross_channel_free(matches, match_count); + cbm_cross_repo_info_free(&info); + cbm_cross_repo_close(cr); + free(channel); + + char *result = cbm_mcp_text_result(json, false); + free(json); + return result; +} + /* ── Tool dispatch ────────────────────────────────────────────── */ // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) @@ -2704,6 +4250,18 @@ char *cbm_mcp_handle_tool(cbm_mcp_server_t *srv, const char *tool_name, const ch if (strcmp(tool_name, "get_architecture") == 0) { return handle_get_architecture(srv, args_json); } + if (strcmp(tool_name, "list_processes") == 0) { + return handle_list_processes(srv, args_json); + } + if (strcmp(tool_name, "get_channels") == 0) { + return handle_get_channels(srv, args_json); + } + if (strcmp(tool_name, "get_process_steps") == 0) { + return handle_get_process_steps(srv, args_json); + } + if (strcmp(tool_name, "get_impact") == 0) { + return handle_get_impact(srv, args_json); + } /* Pipeline-dependent tools */ if (strcmp(tool_name, "index_repository") == 0) { @@ -2724,6 +4282,15 @@ char *cbm_mcp_handle_tool(cbm_mcp_server_t *srv, const char *tool_name, const ch if (strcmp(tool_name, "ingest_traces") == 0) { return 
/*
 * Thread-local int-to-string scratch space for log key/value pairs.
 *
 * Log calls in this file format several integers in one argument list, e.g.
 *   cbm_log_error(..., "expected", itoa_buf(a), "got", itoa_buf(b));
 * With a single shared buffer both arguments alias the same storage and the
 * log shows one value twice.  A small ring of slots keeps up to
 * ITOA_SLOT_COUNT results alive at the same time on each thread.
 */
enum { ITOA_SLOT_COUNT = 8, ITOA_SLOT_SIZE = 32 };

static _Thread_local char itoa_slots[ITOA_SLOT_COUNT][ITOA_SLOT_SIZE];
static _Thread_local unsigned itoa_next_slot;

/*
 * Format val as decimal text.
 *
 * Returns a pointer to thread-local storage that stays valid until
 * ITOA_SLOT_COUNT further calls have been made on the same thread.
 * The caller must not free it.
 */
static const char *itoa_buf(int val) {
    char *slot = itoa_slots[itoa_next_slot];

    itoa_next_slot = (itoa_next_slot + 1U) % ITOA_SLOT_COUNT;
    snprintf(slot, ITOA_SLOT_SIZE, "%d", val);
    return slot;
}
atoi(batch_str) : 32; + if (cfg.batch_size <= 0) cfg.batch_size = 32; + + cfg.timeout_ms = 30000; + return cfg; +} + +bool cbm_embedding_is_configured(void) { + const char *url = getenv("CBM_EMBEDDING_URL"); + return url && url[0]; +} + +/* ── HTTP embedding client (Mongoose synchronous) ──────────────── */ + +/* State for the synchronous HTTP request. */ +typedef struct { + bool done; + bool error; + char *response_body; + int response_len; + const char *url; /* original URL for building the request */ + const char *content_type; + const char *body; + bool request_sent; +} http_state_t; + +static void http_handler(struct mg_connection *c, int ev, void *ev_data) { + http_state_t *state = (http_state_t *)c->fn_data; + + if (ev == MG_EV_CONNECT) { + /* Connection established — send the HTTP request */ + struct mg_str host = mg_url_host(state->url); + mg_printf(c, + "POST %s HTTP/1.1\r\n" + "Host: %.*s\r\n" + "Content-Type: %s\r\n" + "Content-Length: %d\r\n" + "\r\n" + "%s", + mg_url_uri(state->url), + (int)host.len, host.buf, + state->content_type, + (int)strlen(state->body), + state->body); + state->request_sent = true; + } else if (ev == MG_EV_HTTP_MSG) { + struct mg_http_message *hm = (struct mg_http_message *)ev_data; + state->response_body = malloc((size_t)hm->body.len + 1); + if (state->response_body) { + memcpy(state->response_body, hm->body.buf, hm->body.len); + state->response_body[hm->body.len] = '\0'; + state->response_len = (int)hm->body.len; + } + state->done = true; + c->is_draining = 1; + } else if (ev == MG_EV_ERROR) { + state->error = true; + state->done = true; + c->is_draining = 1; + } +} + +/* Synchronous HTTP POST. Returns allocated response body or NULL on error. + * Caller must free the returned string. 
*/ +static char *http_post_sync(const char *url, const char *content_type, + const char *body, int timeout_ms) { + struct mg_mgr mgr; + mg_mgr_init(&mgr); + + http_state_t state = {0}; + state.url = url; + state.content_type = content_type; + state.body = body; + + struct mg_connection *c = mg_http_connect(&mgr, url, http_handler, &state); + if (!c) { + mg_mgr_free(&mgr); + return NULL; + } + + /* Poll until done or timeout */ + int elapsed = 0; + while (!state.done && elapsed < timeout_ms) { + mg_mgr_poll(&mgr, 50); + elapsed += 50; + } + + mg_mgr_free(&mgr); + + if (state.error || !state.done) { + free(state.response_body); + return NULL; + } + return state.response_body; +} + +/* ── Embedding API calls ───────────────────────────────────────── */ + +/* Build the JSON request body for /v1/embeddings. + * {"model": "...", "input": ["text1", "text2", ...]} */ +static char *build_embedding_request(const char *model, const char **texts, int count) { + yyjson_mut_doc *doc = yyjson_mut_doc_new(NULL); + yyjson_mut_val *root = yyjson_mut_obj(doc); + yyjson_mut_doc_set_root(doc, root); + + yyjson_mut_obj_add_str(doc, root, "model", model); + + yyjson_mut_val *input = yyjson_mut_arr(doc); + for (int i = 0; i < count; i++) { + yyjson_mut_arr_add_str(doc, input, texts[i]); + } + yyjson_mut_obj_add_val(doc, root, "input", input); + + char *json = yyjson_mut_write(doc, 0, NULL); + yyjson_mut_doc_free(doc); + return json; +} + +/* Parse the JSON response from /v1/embeddings. + * Returns allocated float[count * dims] or NULL on error. 
*/ +static float *parse_embedding_response(const char *json, int expected_count, + int expected_dims) { + yyjson_doc *doc = yyjson_read(json, strlen(json), 0); + if (!doc) return NULL; + + yyjson_val *root = yyjson_doc_get_root(doc); + yyjson_val *data = yyjson_obj_get(root, "data"); + if (!data || !yyjson_is_arr(data)) { + yyjson_doc_free(doc); + return NULL; + } + + int arr_len = (int)yyjson_arr_size(data); + if (arr_len != expected_count) { + cbm_log_error("embedding.parse", "msg", "count_mismatch", + "expected", itoa_buf(expected_count), + "got", itoa_buf(arr_len)); + yyjson_doc_free(doc); + return NULL; + } + + float *result = calloc((size_t)(expected_count * expected_dims), sizeof(float)); + if (!result) { + yyjson_doc_free(doc); + return NULL; + } + + size_t idx, max; + yyjson_val *item; + yyjson_arr_foreach(data, idx, max, item) { + /* Each item: {"embedding": [...], "index": N} */ + yyjson_val *emb = yyjson_obj_get(item, "embedding"); + yyjson_val *index_val = yyjson_obj_get(item, "index"); + if (!emb || !yyjson_is_arr(emb)) continue; + + int emb_idx = index_val ? (int)yyjson_get_int(index_val) : (int)idx; + if (emb_idx < 0 || emb_idx >= expected_count) continue; + + int emb_dims = (int)yyjson_arr_size(emb); + if (emb_dims != expected_dims) { + /* Dimension mismatch — first occurrence, log and bail */ + if (idx == 0) { + cbm_log_error("embedding.parse", "msg", "dims_mismatch", + "expected", itoa_buf(expected_dims), + "got", itoa_buf(emb_dims)); + } + /* Still try to copy what we can */ + emb_dims = emb_dims < expected_dims ? 
emb_dims : expected_dims; + } + + float *dest = &result[emb_idx * expected_dims]; + size_t ei, emax; + yyjson_val *val; + int d = 0; + yyjson_arr_foreach(emb, ei, emax, val) { + if (d >= expected_dims) break; + dest[d++] = (float)yyjson_get_real(val); + } + } + + yyjson_doc_free(doc); + return result; +} + +float *cbm_embedding_embed_text(const cbm_embedding_config_t *cfg, const char *text) { + if (!cfg || !cfg->url || !text) return NULL; + const char *texts[] = {text}; + return cbm_embedding_embed_batch(cfg, texts, 1); +} + +float *cbm_embedding_embed_batch(const cbm_embedding_config_t *cfg, + const char **texts, int count) { + if (!cfg || !cfg->url || !texts || count <= 0) return NULL; + + /* Build URL: base_url + "/embeddings" */ + char url[1024]; + snprintf(url, sizeof(url), "%s/embeddings", cfg->url); + + /* Build JSON request */ + char *request_json = build_embedding_request(cfg->model, texts, count); + if (!request_json) return NULL; + + /* HTTP POST */ + char *response = http_post_sync(url, "application/json", + request_json, cfg->timeout_ms); + free(request_json); + + if (!response) { + cbm_log_error("embedding.http", "msg", "request_failed", "url", url); + return NULL; + } + + /* Parse response */ + float *embeddings = parse_embedding_response(response, count, cfg->dims); + free(response); + + return embeddings; +} + +/* ── Text generation ───────────────────────────────────────────── */ + +char *cbm_embedding_node_text(const cbm_node_t *node) { + if (!node || !node->name) return NULL; + + /* Extract directory from file_path */ + char dir[256] = ""; + if (node->file_path) { + const char *last_slash = strrchr(node->file_path, '/'); + if (last_slash && last_slash > node->file_path) { + int dlen = (int)(last_slash - node->file_path); + if (dlen >= (int)sizeof(dir)) dlen = (int)sizeof(dir) - 1; + memcpy(dir, node->file_path, (size_t)dlen); + dir[dlen] = '\0'; + } + } + + /* Extract filename from file_path */ + const char *filename = node->file_path; + if 
(filename) { + const char *last_slash = strrchr(filename, '/'); + if (last_slash) filename = last_slash + 1; + } + + /* Extract code snippet from properties JSON (first 500 chars) */ + char snippet[512] = ""; + if (node->properties_json && node->properties_json[0] != '{') { + /* properties_json IS the code sometimes */ + } else if (node->properties_json) { + yyjson_doc *pdoc = yyjson_read(node->properties_json, + strlen(node->properties_json), 0); + if (pdoc) { + yyjson_val *proot = yyjson_doc_get_root(pdoc); + yyjson_val *code = yyjson_obj_get(proot, "code"); + if (!code) code = yyjson_obj_get(proot, "content"); + if (!code) code = yyjson_obj_get(proot, "signature"); + if (code && yyjson_is_str(code)) { + const char *s = yyjson_get_str(code); + if (s) { + int slen = (int)strlen(s); + if (slen > 500) slen = 500; + memcpy(snippet, s, (size_t)slen); + snippet[slen] = '\0'; + } + } + yyjson_doc_free(pdoc); + } + } + + /* Build: "Label: name\nFile: filename\nDirectory: dir\n\nsnippet" */ + int buf_size = 2048; + char *buf = malloc((size_t)buf_size); + if (!buf) return NULL; + + int len = snprintf(buf, (size_t)buf_size, + "%s: %s\nFile: %s\nDirectory: %s", + node->label ? node->label : "Symbol", + node->name, + filename ? filename : "", + dir[0] ? 
dir : ""); + + if (snippet[0]) { + len += snprintf(buf + len, (size_t)(buf_size - len), "\n\n%s", snippet); + } + + return buf; +} + +/* ── RRF merge ─────────────────────────────────────────────────── */ + +int cbm_embedding_rrf_merge(const int64_t *bm25_ids, int bm25_count, + const cbm_vector_result_t *vec_results, int vec_count, + cbm_rrf_result_t **out, int *out_count) { + if (!out || !out_count) return CBM_STORE_ERR; + *out = NULL; + *out_count = 0; + + /* Estimate max unique results */ + int max_results = bm25_count + vec_count; + if (max_results == 0) return CBM_STORE_OK; + + cbm_rrf_result_t *results = calloc((size_t)max_results, sizeof(cbm_rrf_result_t)); + if (!results) return CBM_STORE_ERR; + + int count = 0; + + /* Add BM25 results with RRF scores */ + for (int i = 0; i < bm25_count; i++) { + double rrf_score = 1.0 / (CBM_RRF_K + i); + /* Check if already in results (shouldn't happen for BM25) */ + results[count].node_id = bm25_ids[i]; + results[count].rrf_score = rrf_score; + results[count].bm25_rank = i; + results[count].vec_rank = -1; + results[count].similarity = 0; + count++; + } + + /* Add vector results, merging with existing BM25 results */ + for (int i = 0; i < vec_count; i++) { + double rrf_score = 1.0 / (CBM_RRF_K + i); + int64_t nid = vec_results[i].node_id; + + /* Check if this node_id already exists from BM25 */ + bool found = false; + for (int j = 0; j < count; j++) { + if (results[j].node_id == nid) { + results[j].rrf_score += rrf_score; + results[j].vec_rank = i; + results[j].similarity = vec_results[i].similarity; + found = true; + break; + } + } + + if (!found) { + results[count].node_id = nid; + results[count].rrf_score = rrf_score; + results[count].bm25_rank = -1; + results[count].vec_rank = i; + results[count].similarity = vec_results[i].similarity; + count++; + } + } + + /* Sort by RRF score descending */ + for (int i = 0; i < count - 1; i++) { + for (int j = i + 1; j < count; j++) { + if (results[j].rrf_score > 
results[i].rrf_score) { + cbm_rrf_result_t tmp = results[i]; + results[i] = results[j]; + results[j] = tmp; + } + } + } + + *out = results; + *out_count = count; + return CBM_STORE_OK; +} + +/* ── Pipeline integration ──────────────────────────────────────── */ + +int cbm_embedding_generate_for_project(cbm_store_t *s, const char *project, bool force) { + if (!s || !project) return -1; + + cbm_embedding_config_t cfg = cbm_embedding_get_config(); + if (!cfg.url) { + cbm_log_info("embedding.skip", "reason", "not_configured"); + return 0; + } + + /* Query embeddable nodes */ + const char *sql = force + ? "SELECT id, project, label, name, qualified_name, file_path, " + "start_line, end_line, properties FROM nodes " + "WHERE project = ?1 " + "AND label IN ('Function','Method','Class','Interface','Route')" + : "SELECT id, project, label, name, qualified_name, file_path, " + "start_line, end_line, properties FROM nodes " + "WHERE project = ?1 " + "AND label IN ('Function','Method','Class','Interface','Route') " + "AND id NOT IN (SELECT node_id FROM embeddings WHERE project = ?1)"; + + sqlite3_stmt *stmt = NULL; + struct sqlite3 *db = cbm_store_get_db(s); + if (!db) return -1; + if (sqlite3_prepare_v2(db, sql, -1, &stmt, NULL) != SQLITE_OK) return -1; + sqlite3_bind_text(stmt, 1, project, -1, SQLITE_STATIC); + + /* Collect nodes into batches */ + int total_embedded = 0; + int batch_cap = cfg.batch_size; + int64_t *batch_ids = malloc((size_t)batch_cap * sizeof(int64_t)); + const char **batch_texts = malloc((size_t)batch_cap * sizeof(char *)); + int batch_count = 0; + + while (sqlite3_step(stmt) == SQLITE_ROW) { + cbm_node_t node = {0}; + node.id = sqlite3_column_int64(stmt, 0); + node.project = (const char *)sqlite3_column_text(stmt, 1); + node.label = (const char *)sqlite3_column_text(stmt, 2); + node.name = (const char *)sqlite3_column_text(stmt, 3); + node.qualified_name = (const char *)sqlite3_column_text(stmt, 4); + node.file_path = (const char 
*)sqlite3_column_text(stmt, 5); + node.start_line = sqlite3_column_int(stmt, 6); + node.end_line = sqlite3_column_int(stmt, 7); + node.properties_json = (const char *)sqlite3_column_text(stmt, 8); + + char *text = cbm_embedding_node_text(&node); + if (!text) continue; + + batch_ids[batch_count] = node.id; + batch_texts[batch_count] = text; + batch_count++; + + /* Flush batch when full */ + if (batch_count >= batch_cap) { + float *embeddings = cbm_embedding_embed_batch(&cfg, batch_texts, batch_count); + if (embeddings) { + cbm_store_upsert_embedding_batch(s, batch_ids, project, + embeddings, cfg.dims, batch_count); + total_embedded += batch_count; + free(embeddings); + } else { + cbm_log_error("embedding.batch", "msg", "failed", + "batch_size", itoa_buf(batch_count)); + } + + /* Free batch texts */ + for (int i = 0; i < batch_count; i++) { + free((void *)batch_texts[i]); + } + batch_count = 0; + } + } + + /* Flush remaining batch */ + if (batch_count > 0) { + float *embeddings = cbm_embedding_embed_batch(&cfg, batch_texts, batch_count); + if (embeddings) { + cbm_store_upsert_embedding_batch(s, batch_ids, project, + embeddings, cfg.dims, batch_count); + total_embedded += batch_count; + free(embeddings); + } + for (int i = 0; i < batch_count; i++) { + free((void *)batch_texts[i]); + } + } + + sqlite3_finalize(stmt); + free(batch_ids); + free(batch_texts); + + cbm_log_info("embedding.done", "project", project, + "embedded", itoa_buf(total_embedded)); + return total_embedded; +} diff --git a/src/pipeline/embedding.h b/src/pipeline/embedding.h new file mode 100644 index 00000000..5eb21790 --- /dev/null +++ b/src/pipeline/embedding.h @@ -0,0 +1,81 @@ +/* + * embedding.h — Semantic embedding generation and hybrid search. + * + * Generates embeddings via HTTP POST to an OpenAI-compatible /v1/embeddings + * endpoint (Ollama, llamafile, OpenAI, etc.). 
Configuration via env vars: + * CBM_EMBEDDING_URL — Base URL (e.g., http://localhost:11434/v1) + * CBM_EMBEDDING_MODEL — Model name (e.g., nomic-embed-text) + * CBM_EMBEDDING_DIMS — Expected vector dimensions (default: 768) + * + * When CBM_EMBEDDING_URL is not set, all embedding functions are no-ops. + */ +#ifndef CBM_EMBEDDING_H +#define CBM_EMBEDDING_H + +#include "store/store.h" +#include + +/* ── Configuration ──────────────────────────────────────────────── */ + +typedef struct { + const char *url; /* CBM_EMBEDDING_URL (NULL = disabled) */ + const char *model; /* CBM_EMBEDDING_MODEL */ + int dims; /* CBM_EMBEDDING_DIMS (default 768) */ + int batch_size; /* texts per HTTP request (default 32) */ + int timeout_ms; /* HTTP timeout (default 30000) */ +} cbm_embedding_config_t; + +/* Read config from environment variables. Returns config with url=NULL if disabled. */ +cbm_embedding_config_t cbm_embedding_get_config(void); + +/* Check if embedding is configured (CBM_EMBEDDING_URL is set). */ +bool cbm_embedding_is_configured(void); + +/* ── Embedding generation ──────────────────────────────────────── */ + +/* Embed a single text string. Returns allocated float[dims] or NULL on error. + * Caller must free the returned array. */ +float *cbm_embedding_embed_text(const cbm_embedding_config_t *cfg, const char *text); + +/* Embed multiple texts in a single HTTP request. + * Returns allocated float[count * dims] or NULL on error. + * Caller must free the returned array. */ +float *cbm_embedding_embed_batch(const cbm_embedding_config_t *cfg, + const char **texts, int count); + +/* ── Text generation (node → embeddable text) ──────────────────── */ + +/* Generate embeddable text for a node: "Label: name\nFile: path\nDir: dir\n\n" + * Returns allocated string. Caller must free. */ +char *cbm_embedding_node_text(const cbm_node_t *node); + +/* ── Hybrid search (BM25 + vector + RRF merge) ────────────────── */ + +/* RRF constant (from IR literature). 
*/ +#define CBM_RRF_K 60 + +/* Merged search result with combined RRF score. */ +typedef struct { + int64_t node_id; + double rrf_score; /* combined RRF score (higher = better) */ + double bm25_rank; /* rank in BM25 results (-1 if not found by BM25) */ + double vec_rank; /* rank in vector results (-1 if not found by vector) */ + double similarity; /* cosine similarity (0 if not found by vector) */ +} cbm_rrf_result_t; + +/* Merge BM25 search results with vector search results using RRF (k=60). + * bm25_ids: node IDs from BM25 search, in ranked order (best first). + * vec_results: vector search results from cbm_store_vector_search. + * Returns allocated array sorted by combined RRF score. Caller frees. */ +int cbm_embedding_rrf_merge(const int64_t *bm25_ids, int bm25_count, + const cbm_vector_result_t *vec_results, int vec_count, + cbm_rrf_result_t **out, int *out_count); + +/* ── Pipeline integration ──────────────────────────────────────── */ + +/* Generate embeddings for all embeddable nodes in a project. + * Skips nodes that already have embeddings unless force=true. + * Returns number of embeddings generated, or -1 on error. */ +int cbm_embedding_generate_for_project(cbm_store_t *s, const char *project, bool force); + +#endif /* CBM_EMBEDDING_H */ diff --git a/src/pipeline/fqn.c b/src/pipeline/fqn.c index 0936c78c..fb860730 100644 --- a/src/pipeline/fqn.c +++ b/src/pipeline/fqn.c @@ -158,6 +158,91 @@ char *cbm_pipeline_fqn_folder(const char *project, const char *rel_dir) { return result; } +/** + * Resolve an import module_path relative to the importing file's directory. + * + * For relative paths (starting with ./ or ../), resolves against the importer's + * directory. For bare module specifiers (no ./ prefix), returns a copy unchanged. 
+ * + * Examples (importer_rel_path="src/routes/api.js"): + * "./controllers/auth" → "src/routes/controllers/auth" + * "../utils/helpers" → "src/utils/helpers" + * "lodash" → "lodash" (bare module, unchanged) + * "@hapi/hapi" → "@hapi/hapi" (scoped package, unchanged) + * + * Returns: heap-allocated resolved path. Caller must free(). + */ +char *cbm_pipeline_resolve_import_path(const char *importer_rel_path, const char *module_path) { + if (!module_path || !module_path[0]) { + return strdup(""); + } + + /* Bare module specifier — no relative path resolution needed */ + if (module_path[0] != '.') { + return strdup(module_path); + } + + /* Get the importing file's directory */ + char *importer_dir = strdup(importer_rel_path ? importer_rel_path : ""); + cbm_normalize_path_sep(importer_dir); + char *last_slash = strrchr(importer_dir, '/'); + if (last_slash) { + *(last_slash + 1) = '\0'; /* keep trailing slash */ + } else { + importer_dir[0] = '\0'; /* file is at root */ + } + + /* Concatenate: importer_dir + module_path */ + size_t dir_len = strlen(importer_dir); + size_t mod_len = strlen(module_path); + char *combined = malloc(dir_len + mod_len + 2); + snprintf(combined, dir_len + mod_len + 2, "%s%s", importer_dir, module_path); + free(importer_dir); + + /* Normalize: resolve . and .. segments */ + cbm_normalize_path_sep(combined); + const char *segments[256]; + int seg_count = 0; + + char *tok = combined; + while (tok && *tok) { + char *slash = strchr(tok, '/'); + if (slash) *slash = '\0'; + + if (strcmp(tok, ".") == 0) { + /* skip */ + } else if (strcmp(tok, "..") == 0) { + if (seg_count > 0) seg_count--; /* pop parent */ + } else if (tok[0] != '\0') { + if (seg_count < 255) { + segments[seg_count++] = tok; + } + } + + tok = slash ? 
slash + 1 : NULL; + } + + /* Rebuild path */ + if (seg_count == 0) { + free(combined); + return strdup(""); + } + + size_t total = 0; + for (int i = 0; i < seg_count; i++) { + total += strlen(segments[i]) + 1; + } + char *result = malloc(total + 1); + result[0] = '\0'; + for (int i = 0; i < seg_count; i++) { + if (i > 0) strcat(result, "/"); + strcat(result, segments[i]); + } + + free(combined); + return result; +} + char *cbm_project_name_from_path(const char *abs_path) { if (!abs_path || !abs_path[0]) { return strdup("root"); diff --git a/src/pipeline/httplink.c b/src/pipeline/httplink.c index 7d72c1c5..ff3bc975 100644 --- a/src/pipeline/httplink.c +++ b/src/pipeline/httplink.c @@ -362,7 +362,6 @@ static int count_segments(const char *path) { return count; } -/* Jaccard similarity of path segments (intersection/union) */ static double segment_jaccard(const char *norm_call, const char *norm_route) { /* Split into segments */ char a[1024]; @@ -1379,6 +1378,193 @@ int cbm_extract_express_routes(const char *name, const char *qn, const char *sou return count; } +/* ── Route extraction: Hapi.js ─────────────────────────────────── */ + +/* Extract a quoted string value after a colon, e.g. method: 'GET' → "GET". + * Returns the number of chars consumed from `src` (0 on failure). 
*/ +static int hapi_extract_string_value(const char *src, char *out, int outsz) { + const char *p = src; + /* Skip whitespace after colon */ + while (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r') p++; + char quote = *p; + if (quote != '\'' && quote != '"' && quote != '`') return 0; + p++; + const char *start = p; + while (*p && *p != quote) p++; + if (*p != quote) return 0; + int len = (int)(p - start); + if (len >= outsz) len = outsz - 1; + memcpy(out, start, (size_t)len); + out[len] = '\0'; + return (int)(p + 1 - src); +} + +// NOLINTNEXTLINE(bugprone-easily-swappable-parameters) +int cbm_extract_hapi_routes(const char *name, const char *qn, const char *source, + cbm_route_handler_t *out, int max_out) { + if (!source || !*source) { + return 0; + } + + int count = 0; + const char *p = source; + + /* Scan for object literals containing method: and path: properties. + * Hapi pattern: + * { method: 'GET', path: '/api/users', handler: ... } + * or: + * { method: 'POST', path: '/api/users', handler: UsersController.create } + * + * We look for "method:" followed by a string value, then scan nearby for + * "path:" followed by a string value (or vice versa). 
*/ + while (*p && count < max_out) { + /* Find next "method:" or "method :" */ + const char *mkey = strstr(p, "method"); + if (!mkey) break; + + /* Verify it looks like a property key (preceded by space/newline/comma/brace) */ + if (mkey > source) { + char before = *(mkey - 1); + if (before != ' ' && before != '\t' && before != '\n' && before != '\r' && + before != ',' && before != '{') { + p = mkey + 6; + continue; + } + } + + const char *after_method = mkey + 6; + /* Skip optional whitespace and colon */ + while (*after_method == ' ' || *after_method == '\t') after_method++; + if (*after_method != ':') { + p = after_method; + continue; + } + after_method++; /* skip ':' */ + + char method_val[16] = {0}; + int consumed = hapi_extract_string_value(after_method, method_val, sizeof(method_val)); + if (consumed == 0) { + p = after_method; + continue; + } + + /* Uppercase the method */ + for (int j = 0; method_val[j]; j++) { + method_val[j] = (char)toupper((unsigned char)method_val[j]); + } + + /* Validate it's a real HTTP method */ + if (strcmp(method_val, "GET") != 0 && strcmp(method_val, "POST") != 0 && + strcmp(method_val, "PUT") != 0 && strcmp(method_val, "DELETE") != 0 && + strcmp(method_val, "PATCH") != 0 && strcmp(method_val, "OPTIONS") != 0 && + strcmp(method_val, "HEAD") != 0 && strcmp(method_val, "*") != 0) { + p = after_method + consumed; + continue; + } + + /* Search for "path:" within the same object literal — look forward from the + * method: position. Both method: and path: are in the same {...} block, + * typically within 300 chars of each other. Also search a small window + * backward in case path: comes before method: in the object. */ + const char *search_start = (mkey - 300 > source) ? 
mkey - 300 : source; + const char *search_end_limit = mkey + 500; + char path_val[256] = {0}; + bool found_path = false; + + /* Find the enclosing '{' to scope the search to this object literal */ + const char *obj_start = mkey; + int brace_depth = 0; + while (obj_start > source) { + obj_start--; + if (*obj_start == '{') { + if (brace_depth == 0) break; + brace_depth--; + } else if (*obj_start == '}') { + brace_depth++; + } + } + if (*obj_start == '{') { + search_start = obj_start; + } + + const char *pkey = search_start; + while ((pkey = strstr(pkey, "path")) != NULL && pkey < search_end_limit) { + /* Verify it looks like a property key */ + if (pkey > source) { + char pb = *(pkey - 1); + if (pb != ' ' && pb != '\t' && pb != '\n' && pb != '\r' && + pb != ',' && pb != '{') { + pkey += 4; + continue; + } + } + const char *after_path = pkey + 4; + while (*after_path == ' ' || *after_path == '\t') after_path++; + if (*after_path != ':') { + pkey += 4; + continue; + } + after_path++; + int pc = hapi_extract_string_value(after_path, path_val, sizeof(path_val)); + if (pc > 0 && path_val[0] == '/') { + found_path = true; + break; + } + pkey += 4; + } + + if (found_path) { + /* Optionally extract handler reference — scope to same object */ + char handler_val[256] = {0}; + const char *hkey = strstr(obj_start, "handler"); + while (hkey && hkey < search_end_limit) { + /* Verify property key */ + if (hkey > source) { + char hb = *(hkey - 1); + if (hb != ' ' && hb != '\t' && hb != '\n' && hb != '\r' && + hb != ',' && hb != '{') { + hkey = strstr(hkey + 7, "handler"); + continue; + } + } + const char *after_handler = hkey + 7; + while (*after_handler == ' ' || *after_handler == '\t') after_handler++; + if (*after_handler == ':') { + after_handler++; + while (*after_handler == ' ' || *after_handler == '\t') after_handler++; + /* Handler can be identifier.identifier or just identifier */ + const char *hs = after_handler; + while (*after_handler && *after_handler != ',' && 
*after_handler != '\n' && + *after_handler != '}' && *after_handler != ' ') { + after_handler++; + } + int hlen = (int)(after_handler - hs); + if (hlen > 0 && hlen < (int)sizeof(handler_val)) { + memcpy(handler_val, hs, (size_t)hlen); + handler_val[hlen] = '\0'; + } + } + break; + } + + cbm_route_handler_t *r = &out[count]; + memset(r, 0, sizeof(*r)); + strncpy(r->method, method_val, sizeof(r->method) - 1); + strncpy(r->path, path_val, sizeof(r->path) - 1); + strncpy(r->function_name, name ? name : "", sizeof(r->function_name) - 1); + strncpy(r->qualified_name, qn ? qn : "", sizeof(r->qualified_name) - 1); + if (handler_val[0]) { + strncpy(r->handler_ref, handler_val, sizeof(r->handler_ref) - 1); + } + count++; + } + + p = after_method + consumed; + } + + return count; +} + /* ── Route extraction: Laravel ─────────────────────────────────── */ // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) @@ -1720,3 +1906,347 @@ int cbm_httplink_all_exclude_paths(const cbm_httplink_config_t *cfg, const char return count; } + +/* ── Channel extraction: Socket.IO / EventEmitter ────────────────── */ + +typedef struct cbm_channel_match { + char channel[256]; + char direction[8]; /* "emit" or "listen" */ + char transport[32]; /* "socketio", "eventemitter" */ +} cbm_channel_match_t; + +int cbm_extract_channels(const char *source, cbm_channel_match_t *out, int max_out) { + if (!source || !*source) return 0; + + cbm_regex_t re; + if (cbm_regcomp(&re, + "([a-zA-Z_][a-zA-Z0-9_]*)\\.(" + "emit|on|once|addListener|removeListener" + ")\\([[:space:]]*['\"`]([^'\"`]{1,128})['\"`]", + CBM_REG_EXTENDED) != 0) { + return 0; + } + + static const char *channel_receivers[] = { + "socket", "io", "client", "server", "connection", + "emitter", "eventEmitter", "eventBus", "bus", "pubsub", + "producer", "consumer", "channel", "broker", + "nsp", "namespace", "this", NULL + }; + + int count = 0; + const char *p = source; + cbm_regmatch_t match[4]; + + while (count < max_out && cbm_regexec(&re, p, 
4, match, 0) == 0) { + int rlen = match[1].rm_eo - match[1].rm_so; + char receiver[64]; + if (rlen >= (int)sizeof(receiver)) rlen = (int)sizeof(receiver) - 1; + memcpy(receiver, p + match[1].rm_so, (size_t)rlen); + receiver[rlen] = '\0'; + + bool is_channel = false; + for (int i = 0; channel_receivers[i]; i++) { + if (strcasecmp(receiver, channel_receivers[i]) == 0) { + is_channel = true; + break; + } + } + + if (is_channel) { + int mlen = match[2].rm_eo - match[2].rm_so; + char method[32]; + if (mlen >= (int)sizeof(method)) mlen = (int)sizeof(method) - 1; + memcpy(method, p + match[2].rm_so, (size_t)mlen); + method[mlen] = '\0'; + + int clen = match[3].rm_eo - match[3].rm_so; + if (clen >= (int)sizeof(out[count].channel)) + clen = (int)sizeof(out[count].channel) - 1; + memcpy(out[count].channel, p + match[3].rm_so, (size_t)clen); + out[count].channel[clen] = '\0'; + + const char *ch = out[count].channel; + if (strcmp(ch, "error") != 0 && strcmp(ch, "close") != 0 && + strcmp(ch, "end") != 0 && strcmp(ch, "data") != 0 && + strcmp(ch, "connect") != 0 && strcmp(ch, "disconnect") != 0 && + strcmp(ch, "connection") != 0 && strcmp(ch, "message") != 0 && + strcmp(ch, "open") != 0 && strcmp(ch, "drain") != 0 && + strcmp(ch, "finish") != 0 && strcmp(ch, "pipe") != 0 && + strcmp(ch, "unpipe") != 0 && strcmp(ch, "readable") != 0 && + strcmp(ch, "resume") != 0 && strcmp(ch, "pause") != 0) { + if (strcmp(method, "emit") == 0) { + strncpy(out[count].direction, "emit", sizeof(out[count].direction) - 1); + } else { + strncpy(out[count].direction, "listen", sizeof(out[count].direction) - 1); + } + if (strcasecmp(receiver, "socket") == 0 || strcasecmp(receiver, "io") == 0 || + strcasecmp(receiver, "nsp") == 0 || strcasecmp(receiver, "namespace") == 0) { + strncpy(out[count].transport, "socketio", sizeof(out[count].transport) - 1); + } else { + strncpy(out[count].transport, "eventemitter", sizeof(out[count].transport) - 1); + } + count++; + } + } + p += match[0].rm_eo; + } + + 
cbm_regfree(&re); + return count; +} + +/* ── JS/TS channel extraction: constant resolution pass ─────────── */ + +/* Second pass for JS/TS: resolves .emit(CONSTANT) and .on(CONSTANT) where + * the channel name is a JS constant instead of a string literal. + * Pattern: socket.on(SOME_CONSTANT, handler) / this.emit(EVENT_NAME, data) + * Resolves via: const SOME_CONSTANT = 'ActualChannelName'; */ +int cbm_extract_js_channels_constants(const char *source, cbm_channel_match_t *out, int max_out) { + if (!source || !*source) return 0; + + /* Pass 1: collect const NAME = 'value' and const NAME = "value" mappings */ + typedef struct { char name[128]; char value[256]; } js_const_t; + js_const_t consts[256]; + int nconsts = 0; + + cbm_regex_t const_re; + if (cbm_regcomp(&const_re, + "const[[:space:]]+([A-Z_][A-Z0-9_]*)[[:space:]]*=[[:space:]]*['\"]([^'\"]{1,128})['\"]", + CBM_REG_EXTENDED) != 0) { + return 0; + } + + const char *p = source; + cbm_regmatch_t cm[3]; + while (nconsts < 256 && cbm_regexec(&const_re, p, 3, cm, 0) == 0) { + int nlen = cm[1].rm_eo - cm[1].rm_so; + int vlen = cm[2].rm_eo - cm[2].rm_so; + if (nlen < (int)sizeof(consts[0].name) && vlen < (int)sizeof(consts[0].value)) { + memcpy(consts[nconsts].name, p + cm[1].rm_so, (size_t)nlen); + consts[nconsts].name[nlen] = '\0'; + memcpy(consts[nconsts].value, p + cm[2].rm_so, (size_t)vlen); + consts[nconsts].value[vlen] = '\0'; + nconsts++; + } + p += cm[0].rm_eo; + } + cbm_regfree(&const_re); + + if (nconsts == 0) return 0; + + /* Pass 2: find .emit(CONSTANT) and .on(CONSTANT) with bare identifiers */ + static const char *channel_receivers[] = { + "socket", "io", "client", "server", "connection", + "emitter", "eventEmitter", "eventBus", "this", + "socketIoEventEmitter", "socketServer", "nsp", NULL + }; + + /* Match both receiver.on(CONSTANT) and chained .on(CONSTANT) patterns. + * The chained pattern starts with optional whitespace + dot. 
*/ + cbm_regex_t call_re; + if (cbm_regcomp(&call_re, + "([a-zA-Z_][a-zA-Z0-9_]*)?\\.(" + "emit|on|once|addListener|onRequest|respond" + ")\\([[:space:]]*([A-Z_][A-Z0-9_]*)", + CBM_REG_EXTENDED) != 0) { + return 0; + } + + int count = 0; + p = source; + cbm_regmatch_t mm[4]; + while (count < max_out && cbm_regexec(&call_re, p, 4, mm, 0) == 0) { + int rlen = mm[1].rm_eo - mm[1].rm_so; + char receiver[64]; + bool is_chained = (rlen <= 0); /* method chaining: no receiver captured */ + if (rlen > 0) { + if (rlen >= (int)sizeof(receiver)) rlen = (int)sizeof(receiver) - 1; + memcpy(receiver, p + mm[1].rm_so, (size_t)rlen); + receiver[rlen] = '\0'; + } else { + receiver[0] = '\0'; + } + + bool is_channel = is_chained; /* chained .on() assumed to be on socket object */ + if (!is_chained) { + for (int i = 0; channel_receivers[i]; i++) { + if (strcasecmp(receiver, channel_receivers[i]) == 0) { + is_channel = true; + break; + } + } + } + + if (is_channel) { + int mlen = mm[2].rm_eo - mm[2].rm_so; + char method[32]; + if (mlen >= (int)sizeof(method)) mlen = (int)sizeof(method) - 1; + memcpy(method, p + mm[2].rm_so, (size_t)mlen); + method[mlen] = '\0'; + + int clen = mm[3].rm_eo - mm[3].rm_so; + char constant_name[128]; + if (clen >= (int)sizeof(constant_name)) clen = (int)sizeof(constant_name) - 1; + memcpy(constant_name, p + mm[3].rm_so, (size_t)clen); + constant_name[clen] = '\0'; + + /* Resolve constant to string value */ + const char *resolved = NULL; + for (int c = 0; c < nconsts; c++) { + if (strcmp(consts[c].name, constant_name) == 0) { + resolved = consts[c].value; + break; + } + } + + if (resolved) { + strncpy(out[count].channel, resolved, sizeof(out[count].channel) - 1); + out[count].channel[sizeof(out[count].channel) - 1] = '\0'; + } else { + /* Unresolved constant — use the constant name as channel name */ + strncpy(out[count].channel, constant_name, sizeof(out[count].channel) - 1); + out[count].channel[sizeof(out[count].channel) - 1] = '\0'; + } + + /* Skip 
generic events */ + const char *ch = out[count].channel; + if (strcmp(ch, "error") != 0 && strcmp(ch, "close") != 0 && + strcmp(ch, "end") != 0 && strcmp(ch, "data") != 0 && + strcmp(ch, "connect") != 0 && strcmp(ch, "disconnect") != 0 && + strcmp(ch, "connection") != 0 && strcmp(ch, "message") != 0) { + if (strcmp(method, "emit") == 0 || strcmp(method, "respond") == 0) { + strncpy(out[count].direction, "emit", sizeof(out[count].direction) - 1); + } else { + strncpy(out[count].direction, "listen", sizeof(out[count].direction) - 1); + } + if (strcasecmp(receiver, "socket") == 0 || strcasecmp(receiver, "io") == 0 || + strcasecmp(receiver, "nsp") == 0 || strcasecmp(receiver, "socketServer") == 0) { + strncpy(out[count].transport, "socketio", sizeof(out[count].transport) - 1); + } else { + strncpy(out[count].transport, "eventemitter", sizeof(out[count].transport) - 1); + } + count++; + } + } + p += mm[0].rm_eo; + } + cbm_regfree(&call_re); + return count; +} + +/* ── C# channel extraction: Socket.IO with constant resolution ─── */ + +/* Extract channels from C# source that uses constant names for event strings. + * Pattern: _socket.Emit(CONSTANT_NAME, data) / _socket.OnRequest(CONSTANT_NAME, ...) 
+ * Resolves constants via: const string CONSTANT_NAME = "ActualChannelName"; */ +int cbm_extract_csharp_channels(const char *source, cbm_channel_match_t *out, int max_out) { + if (!source || !*source) return 0; + + /* Pass 1: Collect const string mappings: name → value */ + typedef struct { char name[128]; char value[256]; } const_map_t; + const_map_t cmap[128]; + int cmap_count = 0; + + cbm_regex_t re_const; + if (cbm_regcomp(&re_const, + "const[[:space:]]+string[[:space:]]+([A-Z_][A-Z_0-9]*)[[:space:]]*=[[:space:]]*\"([^\"]{1,128})\"", + CBM_REG_EXTENDED) == 0) { + const char *p = source; + cbm_regmatch_t cm[3]; + while (cmap_count < 128 && cbm_regexec(&re_const, p, 3, cm, 0) == 0) { + int nlen = cm[1].rm_eo - cm[1].rm_so; + int vlen = cm[2].rm_eo - cm[2].rm_so; + if (nlen > 0 && nlen < 128 && vlen > 0 && vlen < 256) { + memcpy(cmap[cmap_count].name, p + cm[1].rm_so, (size_t)nlen); + cmap[cmap_count].name[nlen] = '\0'; + memcpy(cmap[cmap_count].value, p + cm[2].rm_so, (size_t)vlen); + cmap[cmap_count].value[vlen] = '\0'; + cmap_count++; + } + p += cm[0].rm_eo; + } + cbm_regfree(&re_const); + } + + /* Pass 2: Find .Emit( and .OnRequest patterns */ + int count = 0; + + /* Pattern: .Emit(IDENTIFIER or .OnRequest<...>(IDENTIFIER */ + cbm_regex_t re_emit; + if (cbm_regcomp(&re_emit, + "\\.(Emit|OnRequest)[^(]*\\([[:space:]]*([A-Z_][A-Z_0-9]*)", + CBM_REG_EXTENDED) == 0) { + const char *p = source; + cbm_regmatch_t em[3]; + while (count < max_out && cbm_regexec(&re_emit, p, 3, em, 0) == 0) { + int mlen = em[1].rm_eo - em[1].rm_so; + char method[16]; + if (mlen >= (int)sizeof(method)) mlen = (int)sizeof(method) - 1; + memcpy(method, p + em[1].rm_so, (size_t)mlen); + method[mlen] = '\0'; + + int ilen = em[2].rm_eo - em[2].rm_so; + char ident[128]; + if (ilen >= (int)sizeof(ident)) ilen = (int)sizeof(ident) - 1; + memcpy(ident, p + em[2].rm_so, (size_t)ilen); + ident[ilen] = '\0'; + + /* Resolve constant to string value */ + const char *resolved = NULL; + for (int i = 0; 
i < cmap_count; i++) { + if (strcmp(cmap[i].name, ident) == 0) { + resolved = cmap[i].value; + break; + } + } + + if (resolved) { + strncpy(out[count].channel, resolved, sizeof(out[count].channel) - 1); + out[count].channel[sizeof(out[count].channel) - 1] = '\0'; + + if (strcmp(method, "Emit") == 0) { + strncpy(out[count].direction, "emit", sizeof(out[count].direction) - 1); + } else { + strncpy(out[count].direction, "listen", sizeof(out[count].direction) - 1); + } + strncpy(out[count].transport, "socketio", sizeof(out[count].transport) - 1); + count++; + } + p += em[0].rm_eo; + } + cbm_regfree(&re_emit); + } + + /* Also match direct string literal patterns: .Emit("ChannelName" */ + cbm_regex_t re_literal; + if (cbm_regcomp(&re_literal, + "\\.(Emit|On|OnRequest)[^(]*\\([[:space:]]*\"([^\"]{1,128})\"", + CBM_REG_EXTENDED) == 0) { + const char *p = source; + cbm_regmatch_t lm[3]; + while (count < max_out && cbm_regexec(&re_literal, p, 3, lm, 0) == 0) { + int mlen = lm[1].rm_eo - lm[1].rm_so; + char method[16]; + if (mlen >= (int)sizeof(method)) mlen = (int)sizeof(method) - 1; + memcpy(method, p + lm[1].rm_so, (size_t)mlen); + method[mlen] = '\0'; + + int clen = lm[2].rm_eo - lm[2].rm_so; + strncpy(out[count].channel, p + lm[2].rm_so, (size_t)(clen < 255 ? clen : 255)); + out[count].channel[clen < 255 ? 
clen : 255] = '\0'; + + if (strcmp(method, "Emit") == 0) { + strncpy(out[count].direction, "emit", sizeof(out[count].direction) - 1); + } else { + strncpy(out[count].direction, "listen", sizeof(out[count].direction) - 1); + } + strncpy(out[count].transport, "socketio", sizeof(out[count].transport) - 1); + count++; + p += lm[0].rm_eo; + } + cbm_regfree(&re_literal); + } + + return count; +} diff --git a/src/pipeline/httplink.h b/src/pipeline/httplink.h index c0cd275a..b14fbe3c 100644 --- a/src/pipeline/httplink.h +++ b/src/pipeline/httplink.h @@ -113,6 +113,10 @@ int cbm_extract_ktor_routes(const char *name, const char *qn, const char *source int cbm_extract_express_routes(const char *name, const char *qn, const char *source, cbm_route_handler_t *out, int max_out); +/* Hapi.js object-literal routes: { method: 'GET', path: '/api/...', handler: ... } */ +int cbm_extract_hapi_routes(const char *name, const char *qn, const char *source, + cbm_route_handler_t *out, int max_out); + /* Extract PHP Laravel routes from source. * Returns count. 
*/ int cbm_extract_laravel_routes(const char *name, const char *qn, const char *source, diff --git a/src/pipeline/pass_definitions.c b/src/pipeline/pass_definitions.c index a19175a8..e1514783 100644 --- a/src/pipeline/pass_definitions.c +++ b/src/pipeline/pass_definitions.c @@ -271,6 +271,13 @@ int cbm_pipeline_pass_definitions(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t cbm_gbuf_insert_edge(ctx->gbuf, parent->id, node_id, "DEFINES_METHOD", "{}"); } } + /* HAS_PROPERTY edge: Class → Property */ + if (def->parent_class && def->label && strcmp(def->label, "Property") == 0) { + const cbm_gbuf_node_t *parent = cbm_gbuf_find_by_qn(ctx->gbuf, def->parent_class); + if (parent && node_id > 0) { + cbm_gbuf_insert_edge(ctx->gbuf, parent->id, node_id, "HAS_PROPERTY", "{}"); + } + } total_defs++; } @@ -281,28 +288,76 @@ int cbm_pipeline_pass_definitions(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t total_imports += result->imports.count; /* Store per-file import map for later use by pass_calls. - * For each import, create an IMPORTS edge: File → imported module. */ - for (int j = 0; j < result->imports.count; j++) { - CBMImport *imp = &result->imports.items[j]; - if (!imp->module_path) { - continue; - } - - /* Find or create the target module node */ - char *target_qn = cbm_pipeline_fqn_module(ctx->project_name, imp->module_path); - const cbm_gbuf_node_t *target = cbm_gbuf_find_by_qn(ctx->gbuf, target_qn); - + * For each import, create an IMPORTS edge: File → imported module. + * Resolve relative paths (./ ../) and probe common extensions. 
*/ + { char *file_qn = cbm_pipeline_fqn_compute(ctx->project_name, rel, "__file__"); const cbm_gbuf_node_t *source_node = cbm_gbuf_find_by_qn(ctx->gbuf, file_qn); + free(file_qn); + + for (int j = 0; j < result->imports.count && source_node; j++) { + CBMImport *imp = &result->imports.items[j]; + if (!imp->module_path) { + continue; + } - if (source_node && target) { - char imp_props[256]; - snprintf(imp_props, sizeof(imp_props), "{\"local_name\":\"%s\"}", - imp->local_name ? imp->local_name : ""); - cbm_gbuf_insert_edge(ctx->gbuf, source_node->id, target->id, "IMPORTS", imp_props); + /* Resolve relative paths against importing file's directory */ + char *resolved = cbm_pipeline_resolve_import_path(rel, imp->module_path); + char *target_qn = cbm_pipeline_fqn_module(ctx->project_name, resolved); + const cbm_gbuf_node_t *target = cbm_gbuf_find_by_qn(ctx->gbuf, target_qn); + + /* Probe common extensions */ + if (!target) { + static const char *exts[] = { + ".js", ".ts", ".tsx", ".jsx", ".mjs", ".mts", + ".css", ".scss", ".json", NULL + }; + for (int e = 0; !target && exts[e]; e++) { + char buf[2048]; + snprintf(buf, sizeof(buf), "%s%s", resolved, exts[e]); + free(target_qn); + target_qn = cbm_pipeline_fqn_module(ctx->project_name, buf); + target = cbm_gbuf_find_by_qn(ctx->gbuf, target_qn); + } + } + + /* Probe /index variants */ + if (!target) { + static const char *idx[] = { + "/index.js", "/index.ts", "/index.tsx", "/index.jsx", + "/index.mjs", "/index", NULL + }; + for (int e = 0; !target && idx[e]; e++) { + char buf[2048]; + snprintf(buf, sizeof(buf), "%s%s", resolved, idx[e]); + free(target_qn); + target_qn = cbm_pipeline_fqn_module(ctx->project_name, buf); + target = cbm_gbuf_find_by_qn(ctx->gbuf, target_qn); + } + } + + /* C/C++ include: try .h, .hpp */ + if (!target) { + static const char *hdr[] = {".h", ".hpp", ".hh", NULL}; + for (int e = 0; !target && hdr[e]; e++) { + char buf[2048]; + snprintf(buf, sizeof(buf), "%s%s", resolved, hdr[e]); + free(target_qn); 
+ target_qn = cbm_pipeline_fqn_module(ctx->project_name, buf); + target = cbm_gbuf_find_by_qn(ctx->gbuf, target_qn); + } + } + + if (target) { + char imp_props[256]; + snprintf(imp_props, sizeof(imp_props), "{\"local_name\":\"%s\"}", + imp->local_name ? imp->local_name : ""); + cbm_gbuf_insert_edge(ctx->gbuf, source_node->id, target->id, "IMPORTS", + imp_props); + } + free(target_qn); + free(resolved); } - free(target_qn); - free(file_qn); } /* Cache or free the extraction result */ diff --git a/src/pipeline/pass_httplinks.c b/src/pipeline/pass_httplinks.c index 7ecdda71..7f8e966c 100644 --- a/src/pipeline/pass_httplinks.c +++ b/src/pipeline/pass_httplinks.c @@ -277,6 +277,9 @@ static int discover_node_routes(const cbm_gbuf_node_t *n, const cbm_pipeline_ctx nr = cbm_extract_express_routes(n->name, n->qualified_name, source, out + total, max_out - total); total += nr; + nr = cbm_extract_hapi_routes(n->name, n->qualified_name, source, out + total, + max_out - total); + total += nr; } if (has_suffix(fp, ".php")) { nr = cbm_extract_laravel_routes(n->name, n->qualified_name, source, out + total, @@ -323,6 +326,8 @@ static int discover_module_routes(const cbm_gbuf_node_t *mod, const cbm_pipeline if (is_js) { total += cbm_extract_express_routes(mod->name, mod->qualified_name, source, out + total, max_out - total); + total += cbm_extract_hapi_routes(mod->name, mod->qualified_name, source, out + total, + max_out - total); } free(source); return total; @@ -881,6 +886,64 @@ static int insert_route_nodes(cbm_pipeline_ctx_t *ctx, cbm_route_handler_t *rout for (int i = 0; i < route_count; i++) { cbm_route_handler_t *rh = &routes[i]; + /* Reject obviously invalid route paths. + * Vendored/minified JS files (e.g. tsc.js, typescript.js) inside non-JS + * repos can produce false positives where JS operators/keywords get + * matched as route paths by the Express extractor. 
*/ + { + const char *p = rh->path; + /* Skip empty paths */ + if (!p || !*p) continue; + + /* Reject paths that are JS operators or keywords — not valid URL routes */ + static const char *const invalid_paths[] = { + "!", "+", "++", "-", "--", ":", "~", "void", "null", "true", + "false", "throw", "this", "typeof", "delete", "new", "return", + "undefined", "NaN", "Infinity", "var", "let", "const", + "function", "class", "if", "else", "for", "while", "do", + "switch", "case", "break", "continue", "try", "catch", + "finally", "with", "in", "of", "yield", "await", "async", + "super", "import", "export", "default", "extends", "static", + "_this", "self", "__proto__", "arguments", "range", + NULL + }; + bool rejected = false; + /* Work with a trimmed copy for comparison */ + char trimmed[256]; + /* Trim leading whitespace */ + while (*p == ' ' || *p == '\t') p++; + strncpy(trimmed, p, sizeof(trimmed) - 1); + trimmed[sizeof(trimmed) - 1] = '\0'; + /* Trim trailing whitespace */ + size_t tlen = strlen(trimmed); + while (tlen > 0 && (trimmed[tlen - 1] == ' ' || trimmed[tlen - 1] == '\t' || + trimmed[tlen - 1] == '\n' || trimmed[tlen - 1] == '\r')) { + trimmed[--tlen] = '\0'; + } + for (int k = 0; invalid_paths[k]; k++) { + if (strcmp(trimmed, invalid_paths[k]) == 0) { + rejected = true; + break; + } + } + if (rejected) continue; + + /* Reject single-character non-slash paths (e.g. "*", "?", "#") */ + if (p[0] && !p[1] && p[0] != '/') continue; + + /* Reject paths that contain no alphanumeric or slash characters. + * Valid routes like "/api/v1" always have at least one alnum. */ + bool has_alnum_or_slash = false; + for (const char *c = p; *c; c++) { + if ((*c >= 'a' && *c <= 'z') || (*c >= 'A' && *c <= 'Z') || + (*c >= '0' && *c <= '9') || *c == '/') { + has_alnum_or_slash = true; + break; + } + } + if (!has_alnum_or_slash) continue; + } + /* Build Route QN and name */ char normal_method[16]; snprintf(normal_method, sizeof(normal_method), "%s", rh->method[0] ? 
rh->method : "ANY"); @@ -1339,6 +1402,45 @@ int cbm_pipeline_pass_httplinks(cbm_pipeline_ctx_t *ctx) { cbm_log_info("httplink.routes", "count", itoa_hl(route_count)); + /* ── Phase 1b: Deduplicate routes by (method, path) ──────── */ + /* Three sources of route duplication: + * 1. Module-level extraction (empty QN) re-discovers routes already found + * at function level (non-empty QN) for the same (method, path). + * 2. Both extract_express_routes and extract_hapi_routes may match the + * same route patterns in the same function body. + * Strategy: for each (method, path) group, keep the entry with the best + * qualified_name (non-empty wins over empty; longer wins over shorter). */ + { + int deduped = 0; + for (int i = 0; i < route_count; i++) { + cbm_route_handler_t *a = &routes[i]; + /* Check if a better or equal entry already exists */ + bool dominated = false; + for (int j = 0; j < deduped; j++) { + cbm_route_handler_t *b = &routes[j]; + if (strcmp(a->method, b->method) == 0 && + strcmp(a->path, b->path) == 0) { + /* Same route — keep whichever has a better QN */ + if (a->qualified_name[0] && !b->qualified_name[0]) { + /* a is better — replace b */ + *b = *a; + } + /* else b is better or equal — drop a */ + dominated = true; + break; + } + } + if (!dominated) { + routes[deduped++] = *a; + } + } + if (deduped < route_count) { + cbm_log_info("httplink.dedup", "before", itoa_hl(route_count), + "after", itoa_hl(deduped)); + route_count = deduped; + } + } + /* ── Phase 2: Resolve cross-file prefixes (serial) ────────── */ resolve_cross_file_group_prefixes(ctx, routes, route_count); resolve_fastapi_prefixes(ctx, routes, route_count); diff --git a/src/pipeline/pass_parallel.c b/src/pipeline/pass_parallel.c index 3193c1c7..b14b249b 100644 --- a/src/pipeline/pass_parallel.c +++ b/src/pipeline/pass_parallel.c @@ -572,6 +572,9 @@ static void prescan_routes(const char *source, int source_len, const CBMFileResu nr = cbm_extract_express_routes(def->name, 
def->qualified_name, func_src, routes + total, 16 - total); total += nr; + nr = cbm_extract_hapi_routes(def->name, def->qualified_name, func_src, + routes + total, 16 - total); + total += nr; nr = cbm_extract_laravel_routes(def->name, def->qualified_name, func_src, routes + total, 16 - total); total += nr; @@ -608,6 +611,8 @@ static void prescan_routes(const char *source, int source_len, const CBMFileResu if (is_js) { total += cbm_extract_express_routes(basename, "", source, mod_routes + total, 16 - total); + total += cbm_extract_hapi_routes(basename, "", source, mod_routes + total, + 16 - total); } for (int r = 0; r < total; r++) { prescan_add_route(ps, &mod_routes[r]); @@ -938,22 +943,75 @@ int cbm_build_registry_from_cache(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t "{}"); } } + /* HAS_PROPERTY edge: Class → Property */ + if (def->parent_class && def->label && strcmp(def->label, "Property") == 0) { + const cbm_gbuf_node_t *parent = cbm_gbuf_find_by_qn(ctx->gbuf, def->parent_class); + if (parent && def_node) { + cbm_gbuf_insert_edge(ctx->gbuf, parent->id, def_node->id, "HAS_PROPERTY", + "{}"); + } + } } - /* IMPORTS edges */ - for (int j = 0; j < result->imports.count; j++) { + /* IMPORTS edges — resolve relative paths and probe extensions */ + char *file_qn = cbm_pipeline_fqn_compute(ctx->project_name, rel, "__file__"); + const cbm_gbuf_node_t *source_node = cbm_gbuf_find_by_qn(ctx->gbuf, file_qn); + free(file_qn); + + for (int j = 0; j < result->imports.count && source_node; j++) { CBMImport *imp = &result->imports.items[j]; if (!imp->module_path) { continue; } - char *target_qn = cbm_pipeline_fqn_module(ctx->project_name, imp->module_path); + /* Resolve relative paths (./ ../) against importing file's directory */ + char *resolved = cbm_pipeline_resolve_import_path(rel, imp->module_path); + char *target_qn = cbm_pipeline_fqn_module(ctx->project_name, resolved); const cbm_gbuf_node_t *target = cbm_gbuf_find_by_qn(ctx->gbuf, target_qn); - char *file_qn = 
cbm_pipeline_fqn_compute(ctx->project_name, rel, "__file__"); - const cbm_gbuf_node_t *source_node = cbm_gbuf_find_by_qn(ctx->gbuf, file_qn); + /* Probe common extensions if no exact match: .js, .ts, .tsx, .jsx, .mjs */ + if (!target) { + static const char *exts[] = { + ".js", ".ts", ".tsx", ".jsx", ".mjs", ".mts", + ".css", ".scss", ".json", NULL + }; + for (int e = 0; !target && exts[e]; e++) { + char buf[2048]; + snprintf(buf, sizeof(buf), "%s%s", resolved, exts[e]); + free(target_qn); + target_qn = cbm_pipeline_fqn_module(ctx->project_name, buf); + target = cbm_gbuf_find_by_qn(ctx->gbuf, target_qn); + } + } + + /* Probe /index variants (directory imports) */ + if (!target) { + static const char *idx[] = { + "/index.js", "/index.ts", "/index.tsx", "/index.jsx", + "/index.mjs", "/index", NULL + }; + for (int e = 0; !target && idx[e]; e++) { + char buf[2048]; + snprintf(buf, sizeof(buf), "%s%s", resolved, idx[e]); + free(target_qn); + target_qn = cbm_pipeline_fqn_module(ctx->project_name, buf); + target = cbm_gbuf_find_by_qn(ctx->gbuf, target_qn); + } + } - if (source_node && target) { + /* C/C++ include: try .h, .hpp variants */ + if (!target && (resolved[0] != '.' || resolved[1] == '.')) { + static const char *hdr[] = {".h", ".hpp", ".hh", NULL}; + for (int e = 0; !target && hdr[e]; e++) { + char buf[2048]; + snprintf(buf, sizeof(buf), "%s%s", resolved, hdr[e]); + free(target_qn); + target_qn = cbm_pipeline_fqn_module(ctx->project_name, buf); + target = cbm_gbuf_find_by_qn(ctx->gbuf, target_qn); + } + } + + if (target) { char imp_props[256]; snprintf(imp_props, sizeof(imp_props), "{\"local_name\":\"%s\"}", imp->local_name ? 
imp->local_name : ""); @@ -961,7 +1019,7 @@ int cbm_build_registry_from_cache(cbm_pipeline_ctx_t *ctx, const cbm_file_info_t imports_edges++; } free(target_qn); - free(file_qn); + free(resolved); } } diff --git a/src/pipeline/pipeline.c b/src/pipeline/pipeline.c index 66f47eac..6876b2e5 100644 --- a/src/pipeline/pipeline.c +++ b/src/pipeline/pipeline.c @@ -12,8 +12,9 @@ */ #include "pipeline/pipeline.h" #include "pipeline/pipeline_internal.h" -// NOLINTNEXTLINE(misc-include-cleaner) — worker_pool.h included for interface contract #include "pipeline/worker_pool.h" +#include "pipeline/embedding.h" +#include "store/cross_repo.h" #include "graph_buffer/graph_buffer.h" #include "store/store.h" #include "discover/discover.h" @@ -818,6 +819,62 @@ int cbm_pipeline_run(cbm_pipeline_t *p) { } cbm_store_close(hash_store); cbm_log_info("pass.timing", "pass", "persist_hashes", "files", itoa_buf(file_count)); + + /* Backfill FTS5 index: the direct B-tree dump bypasses SQLite triggers, + * so the FTS5 table is empty after indexing. Populate it in bulk now. + * cbm_camel_split(name) splits camelCase into individual tokens so + * "updateCloudClient" becomes searchable as "update", "Cloud", "Client". */ + cbm_store_t *fts_store = cbm_store_open_path(db_path); + if (fts_store) { + /* Contentless FTS5 (content='') — use plain INSERT, not INSERT OR REPLACE. + * Clear first to handle reindex scenarios, then bulk insert. 
*/ + cbm_store_exec(fts_store, + "DELETE FROM nodes_fts;"); + cbm_store_exec(fts_store, + "INSERT INTO nodes_fts(rowid, name, qualified_name, label, file_path) " + "SELECT id, cbm_camel_split(name), qualified_name, label, file_path FROM nodes;"); + cbm_store_close(fts_store); + } + + /* ── Process detection: discover execution flows from entry points ── */ + { + cbm_store_t *proc_store = cbm_store_open_path(db_path); + if (proc_store) { + int nprocs = cbm_store_detect_processes(proc_store, p->project_name, 300); + cbm_log_info("pass.done", "pass", "processes", + "detected", itoa_buf(nprocs)); + cbm_store_close(proc_store); + } + } + + /* ── Channel detection: scan source for emit/on patterns ── */ + { + cbm_store_t *ch_store = cbm_store_open_path(db_path); + if (ch_store) { + int nch = cbm_store_detect_channels(ch_store, p->project_name, p->repo_path); + cbm_log_info("pass.done", "pass", "channels", + "detected", itoa_buf(nch)); + cbm_store_close(ch_store); + } + } + + /* ── Embedding generation: semantic vectors for hybrid search ── */ + if (cbm_embedding_is_configured()) { + cbm_store_t *emb_store = cbm_store_open_path(db_path); + if (emb_store) { + int nemb = cbm_embedding_generate_for_project( + emb_store, p->project_name, false /* don't force — skip existing */); + if (nemb > 0) { + cbm_log_info("pass.done", "pass", "embeddings", + "generated", itoa_buf(nemb)); + } else if (nemb == 0) { + cbm_log_info("pass.skip", "pass", "embeddings", + "reason", "all_exist"); + } + /* nemb < 0 means error — already logged inside */ + cbm_store_close(emb_store); + } + } } } @@ -825,6 +882,16 @@ int cbm_pipeline_run(cbm_pipeline_t *p) { itoa_buf(cbm_gbuf_edge_count(p->gbuf)), "elapsed_ms", itoa_buf((int)elapsed_ms(t0))); + /* ── Auto-rebuild cross-repo index after indexing ── */ + { + cbm_cross_repo_stats_t cr_stats = cbm_cross_repo_build(); + if (cr_stats.repos_scanned > 0) { + cbm_log_info("pass.done", "pass", "cross_repo_index", + "repos", itoa_buf(cr_stats.repos_scanned), 
+ "cross_channels", itoa_buf(cr_stats.cross_repo_matches)); + } + } + cleanup: /* Free prescan if not already freed */ if (ctx.prescan_cache) { diff --git a/src/pipeline/pipeline.h b/src/pipeline/pipeline.h index 203f4374..58850e7c 100644 --- a/src/pipeline/pipeline.h +++ b/src/pipeline/pipeline.h @@ -82,6 +82,10 @@ char *cbm_pipeline_fqn_module(const char *project, const char *rel_path); /* Folder QN: project.dir.parts. Caller must free(). */ char *cbm_pipeline_fqn_folder(const char *project, const char *rel_dir); +/* Resolve an import module_path relative to the importing file's directory. + * Handles ./ and ../ resolution. Bare modules returned unchanged. Caller must free(). */ +char *cbm_pipeline_resolve_import_path(const char *importer_rel_path, const char *module_path); + /* Derive project name from an absolute path. * Replaces / and : with -, collapses --, trims leading -. * Caller must free() the returned string. */ diff --git a/src/store/cross_repo.c b/src/store/cross_repo.c new file mode 100644 index 00000000..73916b3c --- /dev/null +++ b/src/store/cross_repo.c @@ -0,0 +1,879 @@ +/* + * cross_repo.c — Cross-repository index: build, search, channel matching. + * + * Scans all per-project .db files to build a unified _cross_repo.db with: + * - cross_channels: all channel emit/listen from every repo + * - cross_nodes: Function/Method/Class/Interface/Route stubs from all repos + * - cross_nodes_fts: BM25 FTS5 index with camelCase splitting + * - cross_embeddings: semantic vectors copied from per-project DBs + * + * The cross-repo DB is a standard SQLite file — no ATTACH needed. + * Built by scanning each project DB via cbm_store_open_path_query(). 
+ */ + +#include "store/cross_repo.h" +#include "store/store.h" +#include "foundation/log.h" +#include "foundation/platform.h" +#include "foundation/compat.h" +#include "foundation/compat_fs.h" + +#include +#include +#include +#include +#include +#include + +/* ── Helpers ────────────────────────────────────────────────────── */ + +static _Thread_local char _itoa[32]; +static const char *itoa_cr(int v) { snprintf(_itoa, sizeof(_itoa), "%d", v); return _itoa; } + +static const char *get_cross_repo_path(void) { + static char path[1024]; + const char *home = getenv("HOME"); + if (!home) home = getenv("USERPROFILE"); + if (!home) return NULL; + snprintf(path, sizeof(path), "%s/.cache/codebase-memory-mcp/_cross_repo.db", home); + return path; +} + +static const char *get_cache_dir(void) { + static char dir[1024]; + const char *home = getenv("HOME"); + if (!home) home = getenv("USERPROFILE"); + if (!home) return NULL; + snprintf(dir, sizeof(dir), "%s/.cache/codebase-memory-mcp", home); + return dir; +} + +/* CamelCase splitter — same as store.c. Duplicated to keep cross_repo self-contained. */ +static void sqlite_camel_split_cr(sqlite3_context *ctx, int argc, sqlite3_value **argv) { + (void)argc; + const char *input = (const char *)sqlite3_value_text(argv[0]); + if (!input || !input[0]) { + sqlite3_result_text(ctx, input ? 
input : "", -1, SQLITE_TRANSIENT); + return; + } + char buf[2048]; + int len = snprintf(buf, sizeof(buf), "%s ", input); + for (int i = 0; input[i] && len < (int)sizeof(buf) - 2; i++) { + if (i > 0) { + bool split = false; + if (input[i] >= 'A' && input[i] <= 'Z' && + input[i - 1] >= 'a' && input[i - 1] <= 'z') split = true; + if (input[i] >= 'A' && input[i] <= 'Z' && + input[i - 1] >= 'A' && input[i - 1] <= 'Z' && + input[i + 1] >= 'a' && input[i + 1] <= 'z') split = true; + if (split) buf[len++] = ' '; + } + buf[len++] = input[i]; + } + buf[len] = '\0'; + sqlite3_result_text(ctx, buf, len, SQLITE_TRANSIENT); +} + +/* Cosine similarity — same as store.c. */ +static void sqlite_cosine_sim_cr(sqlite3_context *ctx, int argc, sqlite3_value **argv) { + (void)argc; + if (sqlite3_value_type(argv[0]) != SQLITE_BLOB || + sqlite3_value_type(argv[1]) != SQLITE_BLOB) { + sqlite3_result_null(ctx); return; + } + const float *a = (const float *)sqlite3_value_blob(argv[0]); + const float *b = (const float *)sqlite3_value_blob(argv[1]); + int a_bytes = sqlite3_value_bytes(argv[0]); + int b_bytes = sqlite3_value_bytes(argv[1]); + if (a_bytes != b_bytes || a_bytes == 0 || (a_bytes % (int)sizeof(float)) != 0) { + sqlite3_result_null(ctx); return; + } + int dims = a_bytes / (int)sizeof(float); + float dot = 0.0f, na = 0.0f, nb = 0.0f; + for (int i = 0; i < dims; i++) { + dot += a[i] * b[i]; na += a[i] * a[i]; nb += b[i] * b[i]; + } + if (na == 0.0f || nb == 0.0f) { sqlite3_result_double(ctx, 0.0); return; } + sqlite3_result_double(ctx, (double)dot / (sqrt((double)na) * sqrt((double)nb))); +} + +/* ── Cross-Repo Handle ──────────────────────────────────────────── */ + +struct cbm_cross_repo { + sqlite3 *db; +}; + +cbm_cross_repo_t *cbm_cross_repo_open(void) { + const char *path = get_cross_repo_path(); + if (!path) return NULL; + + sqlite3 *db = NULL; + if (sqlite3_open_v2(path, &db, SQLITE_OPEN_READONLY | SQLITE_OPEN_NOMUTEX, NULL) != SQLITE_OK) { + if (db) sqlite3_close(db); + return 
NULL; + } + + /* Register custom functions */ + sqlite3_create_function(db, "cbm_camel_split", 1, SQLITE_UTF8 | SQLITE_DETERMINISTIC, + NULL, sqlite_camel_split_cr, NULL, NULL); + sqlite3_create_function(db, "cbm_cosine_sim", 2, SQLITE_UTF8 | SQLITE_DETERMINISTIC, + NULL, sqlite_cosine_sim_cr, NULL, NULL); + + cbm_cross_repo_t *cr = calloc(1, sizeof(cbm_cross_repo_t)); + if (!cr) { sqlite3_close(db); return NULL; } + cr->db = db; + return cr; +} + +void cbm_cross_repo_close(cbm_cross_repo_t *cr) { + if (!cr) return; + if (cr->db) sqlite3_close(cr->db); + free(cr); +} + +/* ── Build ──────────────────────────────────────────────────────── */ + +static const char *CROSS_SCHEMA = + "CREATE TABLE IF NOT EXISTS cross_channels (" + " id INTEGER PRIMARY KEY AUTOINCREMENT," + " channel_name TEXT NOT NULL," + " transport TEXT NOT NULL," + " direction TEXT NOT NULL," + " project TEXT NOT NULL," + " file_path TEXT NOT NULL DEFAULT ''," + " function_name TEXT NOT NULL DEFAULT ''," + " node_id INTEGER NOT NULL DEFAULT 0" + ");" + "CREATE INDEX IF NOT EXISTS idx_xch_name ON cross_channels(channel_name);" + "CREATE INDEX IF NOT EXISTS idx_xch_project ON cross_channels(project);" + "CREATE TABLE IF NOT EXISTS cross_nodes (" + " id INTEGER PRIMARY KEY AUTOINCREMENT," + " project TEXT NOT NULL," + " orig_id INTEGER NOT NULL," + " label TEXT NOT NULL," + " name TEXT NOT NULL," + " qualified_name TEXT NOT NULL," + " file_path TEXT NOT NULL DEFAULT ''" + ");" + "CREATE INDEX IF NOT EXISTS idx_xn_project ON cross_nodes(project);" + "CREATE INDEX IF NOT EXISTS idx_xn_name ON cross_nodes(name);" + "CREATE INDEX IF NOT EXISTS idx_xn_proj_orig ON cross_nodes(project, orig_id);" + "CREATE TABLE IF NOT EXISTS cross_embeddings (" + " node_id INTEGER PRIMARY KEY," + " project TEXT NOT NULL," + " embedding BLOB NOT NULL," + " dimensions INTEGER NOT NULL" + ");" + "CREATE INDEX IF NOT EXISTS idx_xe_project ON cross_embeddings(project);" + "CREATE TABLE IF NOT EXISTS cross_meta (" + " key TEXT 
PRIMARY KEY," + " value TEXT NOT NULL" + ");"; + +static const char *CROSS_FTS = + "CREATE VIRTUAL TABLE IF NOT EXISTS cross_nodes_fts USING fts5(" + "name, qualified_name, label, file_path, project," + "content=''," + "tokenize='unicode61 remove_diacritics 2'" + ");"; + +cbm_cross_repo_stats_t cbm_cross_repo_build(void) { + cbm_cross_repo_stats_t stats = {0}; + struct timespec t0; + clock_gettime(CLOCK_MONOTONIC, &t0); + + const char *db_path = get_cross_repo_path(); + const char *cache_dir = get_cache_dir(); + if (!db_path || !cache_dir) { + stats.repos_scanned = -1; + return stats; + } + + /* Delete old cross-repo DB and create fresh */ + remove(db_path); + + sqlite3 *db = NULL; + if (sqlite3_open_v2(db_path, &db, + SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE | SQLITE_OPEN_NOMUTEX, + NULL) != SQLITE_OK) { + if (db) sqlite3_close(db); + stats.repos_scanned = -1; + return stats; + } + + /* Register custom functions for FTS5 */ + sqlite3_create_function(db, "cbm_camel_split", 1, SQLITE_UTF8 | SQLITE_DETERMINISTIC, + NULL, sqlite_camel_split_cr, NULL, NULL); + + /* Pragmas for fast bulk write */ + sqlite3_exec(db, "PRAGMA journal_mode=WAL; PRAGMA synchronous=OFF; " + "PRAGMA cache_size=-32000;", NULL, NULL, NULL); + + /* Create schema */ + char *err = NULL; + sqlite3_exec(db, CROSS_SCHEMA, NULL, NULL, &err); + if (err) { sqlite3_free(err); err = NULL; } + sqlite3_exec(db, CROSS_FTS, NULL, NULL, &err); + if (err) { sqlite3_free(err); err = NULL; } + + sqlite3_exec(db, "BEGIN TRANSACTION", NULL, NULL, NULL); + + /* Scan all project DBs in cache directory */ + cbm_dir_t *dir = cbm_opendir(cache_dir); + if (!dir) { + sqlite3_close(db); + stats.repos_scanned = -1; + return stats; + } + + /* Prepared statements for inserting into cross-repo DB */ + sqlite3_stmt *ins_ch = NULL; + sqlite3_prepare_v2(db, + "INSERT INTO cross_channels(channel_name, transport, direction, project, " + "file_path, function_name, node_id) VALUES(?1,?2,?3,?4,?5,?6,?7)", + -1, &ins_ch, NULL); + + 
sqlite3_stmt *ins_node = NULL; + sqlite3_prepare_v2(db, + "INSERT INTO cross_nodes(project, orig_id, label, name, qualified_name, file_path) " + "VALUES(?1,?2,?3,?4,?5,?6)", + -1, &ins_node, NULL); + + sqlite3_stmt *ins_emb = NULL; + sqlite3_prepare_v2(db, + "INSERT INTO cross_embeddings(node_id, project, embedding, dimensions) " + "VALUES(?1,?2,?3,?4)", + -1, &ins_emb, NULL); + + cbm_dirent_t *dent; + while ((dent = cbm_readdir(dir)) != NULL) { + const char *entry = dent->name; + /* Skip non-.db files, _cross_repo.db, _config.db */ + size_t elen = strlen(entry); + if (elen < 4 || strcmp(entry + elen - 3, ".db") != 0) continue; + if (strstr(entry, "_cross_repo") || strstr(entry, "_config")) continue; + if (strstr(entry, "-wal") || strstr(entry, "-shm")) continue; + + char proj_db_path[2048]; + snprintf(proj_db_path, sizeof(proj_db_path), "%s/%s", cache_dir, entry); + + /* Derive project name from filename (remove .db suffix) */ + char project_name[512]; + snprintf(project_name, sizeof(project_name), "%.*s", (int)(elen - 3), entry); + + /* Open project DB read-only */ + sqlite3 *pdb = NULL; + if (sqlite3_open_v2(proj_db_path, &pdb, + SQLITE_OPEN_READONLY | SQLITE_OPEN_NOMUTEX, NULL) != SQLITE_OK) { + if (pdb) sqlite3_close(pdb); + continue; + } + + stats.repos_scanned++; + + /* Copy channels */ + { + sqlite3_stmt *sel = NULL; + if (sqlite3_prepare_v2(pdb, + "SELECT channel_name, transport, direction, project, file_path, " + "function_name, node_id FROM channels", -1, &sel, NULL) == SQLITE_OK) { + while (sqlite3_step(sel) == SQLITE_ROW) { + sqlite3_reset(ins_ch); + for (int c = 0; c < 7; c++) { + if (sqlite3_column_type(sel, c) == SQLITE_INTEGER) + sqlite3_bind_int64(ins_ch, c + 1, sqlite3_column_int64(sel, c)); + else + sqlite3_bind_text(ins_ch, c + 1, + (const char *)sqlite3_column_text(sel, c), -1, SQLITE_TRANSIENT); + } + sqlite3_step(ins_ch); + stats.channels_copied++; + } + sqlite3_finalize(sel); + } + } + + /* Copy embeddable nodes */ + { + sqlite3_stmt *sel = 
NULL; + if (sqlite3_prepare_v2(pdb, + "SELECT id, label, name, qualified_name, file_path FROM nodes " + "WHERE label IN ('Function','Method','Class','Interface','Route')", + -1, &sel, NULL) == SQLITE_OK) { + while (sqlite3_step(sel) == SQLITE_ROW) { + sqlite3_reset(ins_node); + sqlite3_bind_text(ins_node, 1, project_name, -1, SQLITE_TRANSIENT); + sqlite3_bind_int64(ins_node, 2, sqlite3_column_int64(sel, 0)); /* orig_id */ + sqlite3_bind_text(ins_node, 3, + (const char *)sqlite3_column_text(sel, 1), -1, SQLITE_TRANSIENT); + sqlite3_bind_text(ins_node, 4, + (const char *)sqlite3_column_text(sel, 2), -1, SQLITE_TRANSIENT); + sqlite3_bind_text(ins_node, 5, + (const char *)sqlite3_column_text(sel, 3), -1, SQLITE_TRANSIENT); + sqlite3_bind_text(ins_node, 6, + (const char *)sqlite3_column_text(sel, 4), -1, SQLITE_TRANSIENT); + sqlite3_step(ins_node); + stats.nodes_copied++; + } + sqlite3_finalize(sel); + } + } + + /* Copy embeddings — join with cross_nodes via a single efficient query. + * First ensure we have the index on (project, orig_id) for the join. */ + { + sqlite3_stmt *sel = NULL; + /* Use the per-project DB to read embeddings, then look up cross_nodes.id + * via a prepared statement (reuse for all rows in this project). 
*/ + sqlite3_stmt *lu_emb = NULL; + sqlite3_prepare_v2(db, + "SELECT id FROM cross_nodes WHERE project=?1 AND orig_id=?2", + -1, &lu_emb, NULL); + + if (lu_emb && sqlite3_prepare_v2(pdb, + "SELECT node_id, embedding, dimensions FROM embeddings", + -1, &sel, NULL) == SQLITE_OK) { + while (sqlite3_step(sel) == SQLITE_ROW) { + int64_t orig_id = sqlite3_column_int64(sel, 0); + sqlite3_reset(lu_emb); + sqlite3_bind_text(lu_emb, 1, project_name, -1, SQLITE_TRANSIENT); + sqlite3_bind_int64(lu_emb, 2, orig_id); + if (sqlite3_step(lu_emb) == SQLITE_ROW) { + int64_t cross_id = sqlite3_column_int64(lu_emb, 0); + sqlite3_reset(ins_emb); + sqlite3_bind_int64(ins_emb, 1, cross_id); + sqlite3_bind_text(ins_emb, 2, project_name, -1, SQLITE_TRANSIENT); + sqlite3_bind_blob(ins_emb, 3, + sqlite3_column_blob(sel, 1), + sqlite3_column_bytes(sel, 1), SQLITE_TRANSIENT); + sqlite3_bind_int(ins_emb, 4, sqlite3_column_int(sel, 2)); + sqlite3_step(ins_emb); + stats.embeddings_copied++; + } + } + sqlite3_finalize(sel); + } + if (lu_emb) sqlite3_finalize(lu_emb); + } + + sqlite3_close(pdb); + } + cbm_closedir(dir); + + if (ins_ch) sqlite3_finalize(ins_ch); + if (ins_node) sqlite3_finalize(ins_node); + if (ins_emb) sqlite3_finalize(ins_emb); + + /* Build FTS5 index with camelCase splitting */ + sqlite3_exec(db, "DELETE FROM cross_nodes_fts", NULL, NULL, NULL); + sqlite3_exec(db, + "INSERT INTO cross_nodes_fts(rowid, name, qualified_name, label, file_path, project) " + "SELECT id, cbm_camel_split(name), qualified_name, label, file_path, project " + "FROM cross_nodes", + NULL, NULL, NULL); + + /* Count cross-repo channel matches */ + { + sqlite3_stmt *cnt = NULL; + if (sqlite3_prepare_v2(db, + "SELECT COUNT(DISTINCT e.channel_name) FROM cross_channels e " + "JOIN cross_channels l ON e.channel_name = l.channel_name " + "WHERE e.direction = 'emit' AND l.direction = 'listen' " + "AND e.project != l.project", + -1, &cnt, NULL) == SQLITE_OK) { + if (sqlite3_step(cnt) == SQLITE_ROW) { + 
stats.cross_repo_matches = sqlite3_column_int(cnt, 0); + } + sqlite3_finalize(cnt); + } + } + + /* Store metadata */ + { + time_t now = time(NULL); + char ts[64]; + strftime(ts, sizeof(ts), "%Y-%m-%dT%H:%M:%SZ", gmtime(&now)); + sqlite3_stmt *meta = NULL; + sqlite3_prepare_v2(db, + "INSERT OR REPLACE INTO cross_meta(key, value) VALUES(?1, ?2)", + -1, &meta, NULL); + if (meta) { + sqlite3_bind_text(meta, 1, "built_at", -1, SQLITE_STATIC); + sqlite3_bind_text(meta, 2, ts, -1, SQLITE_TRANSIENT); + sqlite3_step(meta); + sqlite3_reset(meta); + char buf[32]; + snprintf(buf, sizeof(buf), "%d", stats.repos_scanned); + sqlite3_bind_text(meta, 1, "repos", -1, SQLITE_STATIC); + sqlite3_bind_text(meta, 2, buf, -1, SQLITE_TRANSIENT); + sqlite3_step(meta); + sqlite3_finalize(meta); + } + } + + sqlite3_exec(db, "COMMIT", NULL, NULL, NULL); + sqlite3_exec(db, "PRAGMA synchronous=NORMAL", NULL, NULL, NULL); + sqlite3_close(db); + + struct timespec t1; + clock_gettime(CLOCK_MONOTONIC, &t1); + stats.build_time_ms = (double)(t1.tv_sec - t0.tv_sec) * 1000.0 + + (double)(t1.tv_nsec - t0.tv_nsec) / 1000000.0; + + cbm_log_info("cross_repo.build", "repos", itoa_cr(stats.repos_scanned), + "nodes", itoa_cr(stats.nodes_copied), + "channels", itoa_cr(stats.channels_copied), + "embeddings", itoa_cr(stats.embeddings_copied), + "cross_matches", itoa_cr(stats.cross_repo_matches)); + + return stats; +} + +/* ── Cross-Repo Search ──────────────────────────────────────────── */ + +static char *heap_dup(const char *s) { + if (!s) return NULL; + size_t len = strlen(s); + char *d = malloc(len + 1); + if (d) { memcpy(d, s, len + 1); } + return d; +} + +int cbm_cross_repo_search(cbm_cross_repo_t *cr, const char *query, + const float *query_vec, int dims, + int limit, cbm_cross_search_output_t *out) { + if (!cr || !cr->db || !query || !out) return CBM_STORE_ERR; + memset(out, 0, sizeof(*out)); + if (limit <= 0) limit = 50; + + /* Tokenize query for FTS5: split on whitespace, join with OR */ + char 
fts_query[1024]; + { + char tmp[1024]; + snprintf(tmp, sizeof(tmp), "%s", query); + int fq_len = 0; + char *tok = strtok(tmp, " \t\n"); + while (tok && fq_len < (int)sizeof(fts_query) - 20) { + if (fq_len > 0) fq_len += snprintf(fts_query + fq_len, + sizeof(fts_query) - (size_t)fq_len, " OR "); + fq_len += snprintf(fts_query + fq_len, + sizeof(fts_query) - (size_t)fq_len, "%s", tok); + tok = strtok(NULL, " \t\n"); + } + fts_query[fq_len] = '\0'; + } + + /* BM25 search */ + int bm25_cap = limit * 2; + int64_t *bm25_ids = calloc((size_t)bm25_cap, sizeof(int64_t)); + int bm25_count = 0; + + { + sqlite3_stmt *stmt = NULL; + const char *sql = + "SELECT cn.id, cn.project, cn.orig_id, cn.label, cn.name, " + "cn.qualified_name, cn.file_path, " + "(bm25(cross_nodes_fts) " + " - CASE WHEN cn.label IN ('Function','Method') THEN 10.0 " + " WHEN cn.label IN ('Class','Interface') THEN 5.0 " + " WHEN cn.label = 'Route' THEN 8.0 " + " ELSE 0.0 END) AS rank " + "FROM cross_nodes_fts f " + "JOIN cross_nodes cn ON cn.id = f.rowid " + "WHERE cross_nodes_fts MATCH ?1 " + "ORDER BY rank LIMIT ?2"; + if (sqlite3_prepare_v2(cr->db, sql, -1, &stmt, NULL) == SQLITE_OK) { + sqlite3_bind_text(stmt, 1, fts_query, -1, SQLITE_TRANSIENT); + sqlite3_bind_int(stmt, 2, bm25_cap); + while (sqlite3_step(stmt) == SQLITE_ROW && bm25_count < bm25_cap) { + bm25_ids[bm25_count++] = sqlite3_column_int64(stmt, 0); + } + sqlite3_finalize(stmt); + } + } + + /* Vector search (if query_vec provided and embeddings exist) */ + int vec_cap = limit; + int64_t *vec_ids = NULL; + double *vec_sims = NULL; + int vec_count = 0; + + if (query_vec && dims > 0) { + int emb_count = 0; + { + sqlite3_stmt *cnt = NULL; + sqlite3_prepare_v2(cr->db, "SELECT COUNT(*) FROM cross_embeddings", -1, &cnt, NULL); + if (cnt && sqlite3_step(cnt) == SQLITE_ROW) emb_count = sqlite3_column_int(cnt, 0); + if (cnt) sqlite3_finalize(cnt); + } + if (emb_count > 0) { + vec_ids = calloc((size_t)vec_cap, sizeof(int64_t)); + vec_sims = 
calloc((size_t)vec_cap, sizeof(double)); + + sqlite3_stmt *stmt = NULL; + const char *sql = + "SELECT ce.node_id, cbm_cosine_sim(?1, ce.embedding) AS sim " + "FROM cross_embeddings ce " + "WHERE sim > 0.3 " + "ORDER BY sim DESC LIMIT ?2"; + if (sqlite3_prepare_v2(cr->db, sql, -1, &stmt, NULL) == SQLITE_OK) { + sqlite3_bind_blob(stmt, 1, query_vec, dims * (int)sizeof(float), SQLITE_STATIC); + sqlite3_bind_int(stmt, 2, vec_cap); + while (sqlite3_step(stmt) == SQLITE_ROW && vec_count < vec_cap) { + vec_ids[vec_count] = sqlite3_column_int64(stmt, 0); + vec_sims[vec_count] = sqlite3_column_double(stmt, 1); + vec_count++; + } + sqlite3_finalize(stmt); + } + out->used_vector = (vec_count > 0); + } + } + + /* RRF merge (k=60) */ + int merge_cap = bm25_count + vec_count; + if (merge_cap == 0) { + free(bm25_ids); free(vec_ids); free(vec_sims); + return CBM_STORE_OK; + } + + typedef struct { int64_t id; double score; double sim; } rrf_entry_t; + rrf_entry_t *merged = calloc((size_t)merge_cap, sizeof(rrf_entry_t)); + int merge_count = 0; + + for (int i = 0; i < bm25_count; i++) { + merged[merge_count].id = bm25_ids[i]; + merged[merge_count].score = 1.0 / (60 + i); + merged[merge_count].sim = 0; + merge_count++; + } + for (int i = 0; i < vec_count; i++) { + bool found = false; + for (int j = 0; j < merge_count; j++) { + if (merged[j].id == vec_ids[i]) { + merged[j].score += 1.0 / (60 + i); + merged[j].sim = vec_sims[i]; + found = true; + break; + } + } + if (!found && merge_count < merge_cap) { + merged[merge_count].id = vec_ids[i]; + merged[merge_count].score = 1.0 / (60 + i); + merged[merge_count].sim = vec_sims[i]; + merge_count++; + } + } + + /* Sort by RRF score descending */ + for (int i = 0; i < merge_count - 1; i++) { + for (int j = i + 1; j < merge_count; j++) { + if (merged[j].score > merged[i].score) { + rrf_entry_t tmp = merged[i]; merged[i] = merged[j]; merged[j] = tmp; + } + } + } + + /* Build output — look up node details from cross_nodes */ + int result_count = 
merge_count < limit ? merge_count : limit; + out->results = calloc((size_t)result_count, sizeof(cbm_cross_search_result_t)); + out->total = merge_count; + + sqlite3_stmt *lu = NULL; + sqlite3_prepare_v2(cr->db, + "SELECT project, orig_id, label, name, qualified_name, file_path " + "FROM cross_nodes WHERE id = ?1", -1, &lu, NULL); + + for (int i = 0; i < result_count && lu; i++) { + sqlite3_reset(lu); + sqlite3_bind_int64(lu, 1, merged[i].id); + if (sqlite3_step(lu) == SQLITE_ROW) { + cbm_cross_search_result_t *r = &out->results[out->count]; + r->project = heap_dup((const char *)sqlite3_column_text(lu, 0)); + r->orig_id = sqlite3_column_int64(lu, 1); + r->label = heap_dup((const char *)sqlite3_column_text(lu, 2)); + r->name = heap_dup((const char *)sqlite3_column_text(lu, 3)); + r->qualified_name = heap_dup((const char *)sqlite3_column_text(lu, 4)); + r->file_path = heap_dup((const char *)sqlite3_column_text(lu, 5)); + r->score = merged[i].score; + r->similarity = merged[i].sim; + out->count++; + } + } + if (lu) sqlite3_finalize(lu); + + free(bm25_ids); free(vec_ids); free(vec_sims); free(merged); + return CBM_STORE_OK; +} + +void cbm_cross_search_free(cbm_cross_search_output_t *out) { + if (!out || !out->results) return; + for (int i = 0; i < out->count; i++) { + free((void *)out->results[i].project); + free((void *)out->results[i].label); + free((void *)out->results[i].name); + free((void *)out->results[i].qualified_name); + free((void *)out->results[i].file_path); + } + free(out->results); + memset(out, 0, sizeof(*out)); +} + +/* ── Cross-Repo Channel Matching ────────────────────────────────── */ + +int cbm_cross_repo_match_channels(cbm_cross_repo_t *cr, const char *channel_filter, + cbm_cross_channel_match_t **out, int *count) { + if (!cr || !cr->db || !out || !count) return CBM_STORE_ERR; + *out = NULL; + *count = 0; + + const char *sql = + "SELECT e.channel_name, e.transport, " + "e.project, e.file_path, e.function_name, " + "l.project, l.file_path, 
l.function_name " + "FROM cross_channels e " + "JOIN cross_channels l ON e.channel_name = l.channel_name " + "WHERE e.direction = 'emit' AND l.direction = 'listen' " + "AND e.project != l.project " + "%s " + "ORDER BY e.channel_name LIMIT 200"; + + char full_sql[2048]; + if (channel_filter && channel_filter[0]) { + char filter_clause[256]; + snprintf(filter_clause, sizeof(filter_clause), + "AND e.channel_name LIKE '%%%s%%'", channel_filter); + snprintf(full_sql, sizeof(full_sql), sql, filter_clause); + } else { + snprintf(full_sql, sizeof(full_sql), sql, ""); + } + + sqlite3_stmt *stmt = NULL; + if (sqlite3_prepare_v2(cr->db, full_sql, -1, &stmt, NULL) != SQLITE_OK) { + return CBM_STORE_ERR; + } + + int cap = 200; + cbm_cross_channel_match_t *matches = calloc((size_t)cap, sizeof(cbm_cross_channel_match_t)); + int n = 0; + + while (sqlite3_step(stmt) == SQLITE_ROW && n < cap) { + cbm_cross_channel_match_t *m = &matches[n]; + m->channel_name = heap_dup((const char *)sqlite3_column_text(stmt, 0)); + m->transport = heap_dup((const char *)sqlite3_column_text(stmt, 1)); + m->emit_project = heap_dup((const char *)sqlite3_column_text(stmt, 2)); + m->emit_file = heap_dup((const char *)sqlite3_column_text(stmt, 3)); + m->emit_function = heap_dup((const char *)sqlite3_column_text(stmt, 4)); + m->listen_project = heap_dup((const char *)sqlite3_column_text(stmt, 5)); + m->listen_file = heap_dup((const char *)sqlite3_column_text(stmt, 6)); + m->listen_function = heap_dup((const char *)sqlite3_column_text(stmt, 7)); + n++; + } + sqlite3_finalize(stmt); + + *out = matches; + *count = n; + return CBM_STORE_OK; +} + +void cbm_cross_channel_free(cbm_cross_channel_match_t *matches, int count) { + if (!matches) return; + for (int i = 0; i < count; i++) { + free((void *)matches[i].channel_name); + free((void *)matches[i].transport); + free((void *)matches[i].emit_project); + free((void *)matches[i].emit_file); + free((void *)matches[i].emit_function); + free((void 
*)matches[i].listen_project); + free((void *)matches[i].listen_file); + free((void *)matches[i].listen_function); + } + free(matches); +} + +/* ── Cross-Repo Stats ───────────────────────────────────────────── */ + +int cbm_cross_repo_get_info(cbm_cross_repo_t *cr, cbm_cross_repo_info_t *out) { + if (!cr || !cr->db || !out) return CBM_STORE_ERR; + memset(out, 0, sizeof(*out)); + + sqlite3_stmt *s = NULL; + sqlite3_prepare_v2(cr->db, "SELECT COUNT(DISTINCT project) FROM cross_nodes", -1, &s, NULL); + if (s && sqlite3_step(s) == SQLITE_ROW) out->total_repos = sqlite3_column_int(s, 0); + if (s) sqlite3_finalize(s); + + sqlite3_prepare_v2(cr->db, "SELECT COUNT(*) FROM cross_nodes", -1, &s, NULL); + if (s && sqlite3_step(s) == SQLITE_ROW) out->total_nodes = sqlite3_column_int(s, 0); + if (s) sqlite3_finalize(s); + + sqlite3_prepare_v2(cr->db, "SELECT COUNT(*) FROM cross_channels", -1, &s, NULL); + if (s && sqlite3_step(s) == SQLITE_ROW) out->total_channels = sqlite3_column_int(s, 0); + if (s) sqlite3_finalize(s); + + sqlite3_prepare_v2(cr->db, "SELECT COUNT(*) FROM cross_embeddings", -1, &s, NULL); + if (s && sqlite3_step(s) == SQLITE_ROW) out->total_embeddings = sqlite3_column_int(s, 0); + if (s) sqlite3_finalize(s); + + /* Cross-repo channel count */ + sqlite3_prepare_v2(cr->db, + "SELECT COUNT(DISTINCT e.channel_name) FROM cross_channels e " + "JOIN cross_channels l ON e.channel_name = l.channel_name " + "WHERE e.direction = 'emit' AND l.direction = 'listen' " + "AND e.project != l.project", -1, &s, NULL); + if (s && sqlite3_step(s) == SQLITE_ROW) out->cross_repo_channel_count = sqlite3_column_int(s, 0); + if (s) sqlite3_finalize(s); + + sqlite3_prepare_v2(cr->db, + "SELECT value FROM cross_meta WHERE key = 'built_at'", -1, &s, NULL); + if (s && sqlite3_step(s) == SQLITE_ROW) + out->built_at = heap_dup((const char *)sqlite3_column_text(s, 0)); + if (s) sqlite3_finalize(s); + + return CBM_STORE_OK; +} + +void cbm_cross_repo_info_free(cbm_cross_repo_info_t *info) { + 
if (!info) return; + free((void *)info->built_at); + info->built_at = NULL; +} + +/* ── Cross-Repo Trace Helper ────────────────────────────────────── */ + +int cbm_cross_repo_trace_in_project( + const char *project_db_path, + const char *function_name, + const char *file_path_hint, + const char *channel_name, + const char *direction, + int max_depth, + cbm_cross_trace_step_t **out, int *out_count) { + + if (!project_db_path || !function_name || !direction || !out || !out_count) { + return CBM_STORE_ERR; + } + *out = NULL; + *out_count = 0; + if (max_depth <= 0) max_depth = 2; + + /* Open project DB read-only */ + cbm_store_t *store = cbm_store_open_path_query(project_db_path); + if (!store) return CBM_STORE_ERR; + + struct sqlite3 *db = cbm_store_get_db(store); + if (!db) { cbm_store_close(store); return CBM_STORE_ERR; } + + int64_t start_id = 0; + + /* Resolve start node — handle special cases */ + if (strcmp(function_name, "(file-level)") == 0 && file_path_hint) { + /* File-level listener: find the actual handler function via channels table */ + if (channel_name) { + sqlite3_stmt *s = NULL; + sqlite3_prepare_v2(db, + "SELECT DISTINCT c.node_id FROM channels c " + "WHERE c.file_path = ?1 AND c.channel_name = ?2 AND c.node_id > 0 " + "LIMIT 1", -1, &s, NULL); + if (s) { + sqlite3_bind_text(s, 1, file_path_hint, -1, SQLITE_STATIC); + sqlite3_bind_text(s, 2, channel_name, -1, SQLITE_STATIC); + if (sqlite3_step(s) == SQLITE_ROW) { + start_id = sqlite3_column_int64(s, 0); + } + sqlite3_finalize(s); + } + } + /* Fallback: first Function/Method in the file */ + if (start_id == 0) { + sqlite3_stmt *s = NULL; + sqlite3_prepare_v2(db, + "SELECT id FROM nodes WHERE file_path = ?1 " + "AND label IN ('Function','Method') ORDER BY start_line LIMIT 1", + -1, &s, NULL); + if (s) { + sqlite3_bind_text(s, 1, file_path_hint, -1, SQLITE_STATIC); + if (sqlite3_step(s) == SQLITE_ROW) { + start_id = sqlite3_column_int64(s, 0); + } + sqlite3_finalize(s); + } + } + } else { + /* Normal 
case: find by name, optionally filtered by file_path */ + const char *sql = file_path_hint + ? "SELECT id, label FROM nodes WHERE name = ?1 AND file_path = ?2 " + "AND label IN ('Function','Method','Class') LIMIT 1" + : "SELECT id, label FROM nodes WHERE name = ?1 " + "AND label IN ('Function','Method','Class') LIMIT 1"; + sqlite3_stmt *s = NULL; + sqlite3_prepare_v2(db, sql, -1, &s, NULL); + if (s) { + sqlite3_bind_text(s, 1, function_name, -1, SQLITE_STATIC); + if (file_path_hint) + sqlite3_bind_text(s, 2, file_path_hint, -1, SQLITE_STATIC); + if (sqlite3_step(s) == SQLITE_ROW) { + start_id = sqlite3_column_int64(s, 0); + const char *label = (const char *)sqlite3_column_text(s, 1); + /* If it's a Class, resolve through DEFINES_METHOD → use first method */ + if (label && strcmp(label, "Class") == 0) { + int64_t class_id = start_id; + sqlite3_stmt *m = NULL; + sqlite3_prepare_v2(db, + "SELECT target_id FROM edges WHERE source_id = ?1 " + "AND type = 'DEFINES_METHOD' LIMIT 1", -1, &m, NULL); + if (m) { + sqlite3_bind_int64(m, 1, class_id); + if (sqlite3_step(m) == SQLITE_ROW) { + start_id = sqlite3_column_int64(m, 0); + } + sqlite3_finalize(m); + } + } + } + sqlite3_finalize(s); + } + } + + if (start_id == 0) { + cbm_store_close(store); + return CBM_STORE_OK; /* no results, not an error */ + } + + /* Run BFS */ + const char *edge_types[] = {"CALLS"}; + cbm_traverse_result_t trav = {0}; + cbm_store_bfs(store, start_id, direction, edge_types, 1, + max_depth, 20, &trav); + + /* Convert to output format */ + int cap = trav.visited_count; + if (cap > 0) { + cbm_cross_trace_step_t *steps = calloc((size_t)cap, sizeof(cbm_cross_trace_step_t)); + int count = 0; + for (int i = 0; i < trav.visited_count && count < cap; i++) { + cbm_node_hop_t *h = &trav.visited[i]; + if (h->node.id == start_id) continue; /* skip the start node itself */ + steps[count].name = heap_dup(h->node.name); + steps[count].label = heap_dup(h->node.label); + steps[count].file_path = 
heap_dup(h->node.file_path); + steps[count].depth = h->hop; + count++; + } + *out = steps; + *out_count = count; + } + + cbm_store_traverse_free(&trav); + cbm_store_close(store); + return CBM_STORE_OK; +} + +void cbm_cross_trace_free(cbm_cross_trace_step_t *steps, int count) { + if (!steps) return; + for (int i = 0; i < count; i++) { + free((void *)steps[i].name); + free((void *)steps[i].label); + free((void *)steps[i].file_path); + } + free(steps); +} diff --git a/src/store/cross_repo.h b/src/store/cross_repo.h new file mode 100644 index 00000000..421fdeb3 --- /dev/null +++ b/src/store/cross_repo.h @@ -0,0 +1,137 @@ +/* + * cross_repo.h — Cross-repository discovery, search, and flow tracing. + * + * Builds a unified _cross_repo.db by scanning all per-project databases. + * Enables: cross-repo channel matching, cross-repo BM25+vector search, + * cross-repo flow tracing, and cross-repo impact analysis. + * + * The cross-repo DB is read-only (built by cbm_cross_repo_build) and + * does NOT use ATTACH — it copies data into a separate SQLite file, + * preserving per-project security isolation. + */ +#ifndef CBM_CROSS_REPO_H +#define CBM_CROSS_REPO_H + +#include "store/store.h" +#include + +/* ── Build ──────────────────────────────────────────────────────── */ + +typedef struct { + int repos_scanned; + int channels_copied; + int nodes_copied; + int embeddings_copied; + int cross_repo_matches; /* channels with emit in A + listen in B */ + double build_time_ms; +} cbm_cross_repo_stats_t; + +/* Build (or rebuild) the cross-repo index by scanning all project DBs. + * Writes to ~/.cache/codebase-memory-mcp/_cross_repo.db. + * Returns stats on success, or sets stats.repos_scanned=-1 on error. */ +cbm_cross_repo_stats_t cbm_cross_repo_build(void); + +/* ── Query ──────────────────────────────────────────────────────── */ + +/* Opaque handle for the cross-repo DB (separate from per-project stores). 
*/ +typedef struct cbm_cross_repo cbm_cross_repo_t; + +/* Open the cross-repo DB for querying. Returns NULL if not built yet. */ +cbm_cross_repo_t *cbm_cross_repo_open(void); + +/* Close and free. NULL-safe. */ +void cbm_cross_repo_close(cbm_cross_repo_t *cr); + +/* ── Cross-Repo Search ──────────────────────────────────────────── */ + +typedef struct { + const char *project; /* short project name */ + int64_t orig_id; /* node ID in the project's own DB */ + const char *label; + const char *name; + const char *qualified_name; + const char *file_path; + double score; /* BM25 or RRF score */ + double similarity; /* cosine similarity (0 if BM25-only) */ +} cbm_cross_search_result_t; + +typedef struct { + cbm_cross_search_result_t *results; + int count; + int total; + bool used_vector; /* true if hybrid BM25+vector was used */ +} cbm_cross_search_output_t; + +/* Search across all repos. Uses BM25 FTS5 + optional vector search + RRF merge. + * query_vec may be NULL (BM25-only). Caller frees output with _free(). */ +int cbm_cross_repo_search(cbm_cross_repo_t *cr, const char *query, + const float *query_vec, int dims, + int limit, cbm_cross_search_output_t *out); + +void cbm_cross_search_free(cbm_cross_search_output_t *out); + +/* ── Cross-Repo Channel Matching ────────────────────────────────── */ + +typedef struct { + const char *channel_name; + const char *transport; + /* Emitter side */ + const char *emit_project; + const char *emit_file; + const char *emit_function; + /* Listener side */ + const char *listen_project; + const char *listen_file; + const char *listen_function; +} cbm_cross_channel_match_t; + +/* Find cross-repo channel matches: channels where emit is in one repo + * and listen is in another. Optional channel_name filter (partial match). + * Returns allocated array. Caller frees with _free(). 
*/ +int cbm_cross_repo_match_channels(cbm_cross_repo_t *cr, const char *channel_filter, + cbm_cross_channel_match_t **out, int *count); + +void cbm_cross_channel_free(cbm_cross_channel_match_t *matches, int count); + +/* ── Cross-Repo Stats ───────────────────────────────────────────── */ + +typedef struct { + int total_repos; + int total_nodes; + int total_channels; + int total_embeddings; + int cross_repo_channel_count; + const char *built_at; /* ISO timestamp */ +} cbm_cross_repo_info_t; + +/* Get stats about the cross-repo index. */ +int cbm_cross_repo_get_info(cbm_cross_repo_t *cr, cbm_cross_repo_info_t *out); + +void cbm_cross_repo_info_free(cbm_cross_repo_info_t *info); + +/* ── Cross-Repo Trace Helper ────────────────────────────────────── */ + +typedef struct { + const char *name; + const char *label; + const char *file_path; + int depth; +} cbm_cross_trace_step_t; + +/* Trace callers (inbound) or callees (outbound) from a function in a project DB. + * Opens the project DB read-only, resolves the function, runs BFS, closes DB. + * Handles Class→Method resolution and (file-level) listener fallback. + * channel_name is optional — used for file-level listener resolution. + * Returns allocated array. Caller frees with cbm_cross_trace_free(). 
*/ +int cbm_cross_repo_trace_in_project( + const char *project_db_path, + const char *function_name, + const char *file_path_hint, + const char *channel_name, /* optional: for resolving (file-level) listeners */ + const char *direction, /* "inbound" or "outbound" */ + int max_depth, + cbm_cross_trace_step_t **out, int *out_count); + +void cbm_cross_trace_free(cbm_cross_trace_step_t *steps, int count); + +#endif /* CBM_CROSS_REPO_H */ diff --git a/src/store/store.c b/src/store/store.c index 88aa7078..b5799753 100644 --- a/src/store/store.c +++ b/src/store/store.c @@ -12,6 +12,7 @@ #include "foundation/platform.h" #include "foundation/compat.h" #include "foundation/compat_regex.h" +#include #include #include @@ -191,13 +192,50 @@ static int init_schema(cbm_store_t *s) { " properties TEXT DEFAULT '{}'," " UNIQUE(source_id, target_id, type)" ");" - "CREATE TABLE IF NOT EXISTS project_summaries (" + "CREATE TABLE IF NOT EXISTS processes (" + " id INTEGER PRIMARY KEY AUTOINCREMENT," + " project TEXT NOT NULL REFERENCES projects(name) ON DELETE CASCADE," + " label TEXT NOT NULL," + " process_type TEXT NOT NULL DEFAULT 'cross_community'," + " step_count INTEGER NOT NULL DEFAULT 0," + " entry_point_id INTEGER NOT NULL," + " terminal_id INTEGER NOT NULL" + ");" + "CREATE TABLE IF NOT EXISTS process_steps (" + " process_id INTEGER NOT NULL REFERENCES processes(id) ON DELETE CASCADE," + " node_id INTEGER NOT NULL," + " step INTEGER NOT NULL," + " PRIMARY KEY (process_id, step)" + ");" + "CREATE TABLE IF NOT EXISTS channels (" + " id INTEGER PRIMARY KEY AUTOINCREMENT," + " project TEXT NOT NULL REFERENCES projects(name) ON DELETE CASCADE," + " channel_name TEXT NOT NULL," + " direction TEXT NOT NULL," /* 'emit' or 'listen' */ + " transport TEXT NOT NULL DEFAULT 'socketio'," + " node_id INTEGER NOT NULL," + " file_path TEXT DEFAULT ''," + " function_name TEXT DEFAULT ''" + ");" + "CREATE INDEX IF NOT EXISTS idx_channels_name ON channels(channel_name);" + "CREATE INDEX IF NOT 
EXISTS idx_channels_project ON channels(project);" + "CREATE UNIQUE INDEX IF NOT EXISTS idx_channels_unique " + "ON channels(project, channel_name, direction, file_path, function_name);" + "CREATE TABLE IF NOT EXISTS project_summaries (" " project TEXT PRIMARY KEY," " summary TEXT NOT NULL," " source_hash TEXT NOT NULL," " created_at TEXT NOT NULL," " updated_at TEXT NOT NULL" - ");"; + ");" + "CREATE TABLE IF NOT EXISTS embeddings (" + " node_id INTEGER PRIMARY KEY," + " project TEXT NOT NULL," + " embedding BLOB NOT NULL," /* float32[N], N = CBM_EMBEDDING_DIMS */ + " dimensions INTEGER NOT NULL DEFAULT 0" + ");" + "CREATE INDEX IF NOT EXISTS idx_embeddings_project " + "ON embeddings(project);"; return exec_sql(s, ddl); } @@ -212,7 +250,60 @@ static int create_user_indexes(cbm_store_t *s) { "CREATE INDEX IF NOT EXISTS idx_edges_type ON edges(project, type);" "CREATE INDEX IF NOT EXISTS idx_edges_target_type ON edges(project, target_id, type);" "CREATE INDEX IF NOT EXISTS idx_edges_source_type ON edges(project, source_id, type);"; - return exec_sql(s, sql); + int rc = exec_sql(s, sql); + if (rc != SQLITE_OK) return rc; + + /* FTS5 full-text search index on node names for BM25 ranking. + * content='' makes it a contentless table — it stores only the inverted index, + * not the original text. This is required for camelCase token splitting: + * we index "createSession create Session" but the source table has "createSession". + * With content='nodes', FTS5 would re-verify against the source and fail to match + * the split tokens. Contentless mode trusts the inverted index directly. + * Trade-off: highlight()/snippet() unavailable, but we never use them. + * Each DDL statement must be executed separately for FTS5 compatibility. 
*/ + { + char *fts_err = NULL; + int fts_rc = sqlite3_exec(s->db, + "CREATE VIRTUAL TABLE IF NOT EXISTS nodes_fts USING fts5(" + "name, qualified_name, label, file_path," + "content='', content_rowid='id'," + "tokenize='unicode61 remove_diacritics 2'" + ");", + NULL, NULL, &fts_err); + if (fts_rc != SQLITE_OK) { + sqlite3_free(fts_err); + /* Non-fatal — FTS5 may not be compiled in. Fall back to regex search. */ + return SQLITE_OK; + } + } + + /* Sync triggers: keep FTS index up to date when nodes change. + * cbm_camel_split(name) splits camelCase into individual tokens so + * "updateCloudClient" is searchable as "update", "Cloud", "Client". + * Contentless FTS5 (content='') requires delete operations to provide the + * exact same tokenized content that was originally inserted. */ + exec_sql(s, "CREATE TRIGGER IF NOT EXISTS nodes_fts_ai AFTER INSERT ON nodes BEGIN" + " INSERT INTO nodes_fts(rowid, name, qualified_name, label, file_path)" + " VALUES (new.id, cbm_camel_split(new.name), new.qualified_name," + " new.label, new.file_path);" + "END;"); + + exec_sql(s, "CREATE TRIGGER IF NOT EXISTS nodes_fts_ad AFTER DELETE ON nodes BEGIN" + " INSERT INTO nodes_fts(nodes_fts, rowid, name, qualified_name, label, file_path)" + " VALUES ('delete', old.id, cbm_camel_split(old.name), old.qualified_name," + " old.label, old.file_path);" + "END;"); + + exec_sql(s, "CREATE TRIGGER IF NOT EXISTS nodes_fts_au AFTER UPDATE ON nodes BEGIN" + " INSERT INTO nodes_fts(nodes_fts, rowid, name, qualified_name, label, file_path)" + " VALUES ('delete', old.id, cbm_camel_split(old.name), old.qualified_name," + " old.label, old.file_path);" + " INSERT INTO nodes_fts(rowid, name, qualified_name, label, file_path)" + " VALUES (new.id, cbm_camel_split(new.name), new.qualified_name," + " new.label, new.file_path);" + "END;"); + + return SQLITE_OK; } static int configure_pragmas(cbm_store_t *s, bool in_memory) { @@ -269,6 +360,91 @@ static void sqlite_regexp(sqlite3_context *ctx, int argc, 
sqlite3_value **argv) sqlite3_result_int(ctx, rc == 0 ? 1 : 0); } +/* CamelCase token splitter for FTS5. + * "updateCloudClient" → "updateCloudClient update Cloud Client" + * "HTMLParser" → "HTMLParser HTML Parser" + * "getURL" → "getURL get URL" + * Preserves original name as first token for exact-match queries, + * then appends space-split words for broad keyword matching. */ +static void sqlite_camel_split(sqlite3_context *ctx, int argc, sqlite3_value **argv) { + (void)argc; + const char *input = (const char *)sqlite3_value_text(argv[0]); + if (!input || !input[0]) { + sqlite3_result_text(ctx, input ? input : "", -1, SQLITE_TRANSIENT); + return; + } + + char buf[2048]; + /* Start with the original name (preserves exact-match capability) */ + int len = snprintf(buf, sizeof(buf), "%s ", input); + + /* Walk input, insert space before each camelCase boundary: + * - lowercase→Uppercase: "updateCloud" → "update Cloud" + * - Uppercase→Uppercase+lowercase: "HTMLParser" → "HTML Parser" */ + for (int i = 0; input[i] && len < (int)sizeof(buf) - 2; i++) { + if (i > 0) { + bool split = false; + /* lowercase followed by Uppercase: updateC → update C */ + if (input[i] >= 'A' && input[i] <= 'Z' && + input[i - 1] >= 'a' && input[i - 1] <= 'z') { + split = true; + } + /* Uppercase followed by Uppercase+lowercase: HTMLParser → HTML Parser + * Only split before the LAST uppercase in a run */ + if (input[i] >= 'A' && input[i] <= 'Z' && + input[i - 1] >= 'A' && input[i - 1] <= 'Z' && + input[i + 1] >= 'a' && input[i + 1] <= 'z') { + split = true; + } + if (split) { + buf[len++] = ' '; + } + } + buf[len++] = input[i]; + } + buf[len] = '\0'; + sqlite3_result_text(ctx, buf, len, SQLITE_TRANSIENT); +} + +/* Cosine similarity for vector search. + * argv[0] = query vector (BLOB, float32[N]) + * argv[1] = stored embedding (BLOB, float32[N]) + * Returns cosine similarity as DOUBLE (1.0 = identical, 0.0 = orthogonal). 
*/ +static void sqlite_cosine_sim(sqlite3_context *ctx, int argc, sqlite3_value **argv) { + (void)argc; + if (sqlite3_value_type(argv[0]) != SQLITE_BLOB || + sqlite3_value_type(argv[1]) != SQLITE_BLOB) { + sqlite3_result_null(ctx); + return; + } + + const float *a = (const float *)sqlite3_value_blob(argv[0]); + const float *b = (const float *)sqlite3_value_blob(argv[1]); + int a_bytes = sqlite3_value_bytes(argv[0]); + int b_bytes = sqlite3_value_bytes(argv[1]); + + if (a_bytes != b_bytes || a_bytes == 0 || (a_bytes % (int)sizeof(float)) != 0) { + sqlite3_result_null(ctx); + return; + } + + int dims = a_bytes / (int)sizeof(float); + float dot = 0.0f, norm_a = 0.0f, norm_b = 0.0f; + for (int i = 0; i < dims; i++) { + dot += a[i] * b[i]; + norm_a += a[i] * a[i]; + norm_b += b[i] * b[i]; + } + + if (norm_a == 0.0f || norm_b == 0.0f) { + sqlite3_result_double(ctx, 0.0); + return; + } + + double similarity = (double)dot / (sqrt((double)norm_a) * sqrt((double)norm_b)); + sqlite3_result_double(ctx, similarity); +} + /* Case-insensitive REGEXP variant */ static void sqlite_iregexp(sqlite3_context *ctx, int argc, sqlite3_value **argv) { (void)argc; @@ -342,6 +518,12 @@ static cbm_store_t *store_open_internal(const char *path, bool in_memory) { /* Case-insensitive variant for search with case_sensitive=false */ sqlite3_create_function(s->db, "iregexp", 2, SQLITE_UTF8 | SQLITE_DETERMINISTIC, NULL, sqlite_iregexp, NULL, NULL); + /* CamelCase splitter for FTS5 indexing — used in triggers and backfill */ + sqlite3_create_function(s->db, "cbm_camel_split", 1, SQLITE_UTF8 | SQLITE_DETERMINISTIC, + NULL, sqlite_camel_split, NULL, NULL); + /* Cosine similarity for vector search — used in hybrid BM25+vector queries */ + sqlite3_create_function(s->db, "cbm_cosine_sim", 2, SQLITE_UTF8 | SQLITE_DETERMINISTIC, + NULL, sqlite_cosine_sim, NULL, NULL); if (configure_pragmas(s, in_memory) != CBM_STORE_OK || init_schema(s) != CBM_STORE_OK || create_user_indexes(s) != CBM_STORE_OK) { @@ -394,6 
+576,12 @@ cbm_store_t *cbm_store_open_path_query(const char *db_path) { sqlite_regexp, NULL, NULL); sqlite3_create_function(s->db, "iregexp", 2, SQLITE_UTF8 | SQLITE_DETERMINISTIC, NULL, sqlite_iregexp, NULL, NULL); + /* CamelCase splitter for FTS5 — must be registered before triggers fire */ + sqlite3_create_function(s->db, "cbm_camel_split", 1, SQLITE_UTF8 | SQLITE_DETERMINISTIC, + NULL, sqlite_camel_split, NULL, NULL); + /* Cosine similarity for vector search */ + sqlite3_create_function(s->db, "cbm_cosine_sim", 2, SQLITE_UTF8 | SQLITE_DETERMINISTIC, + NULL, sqlite_cosine_sim, NULL, NULL); if (configure_pragmas(s, false) != CBM_STORE_OK) { sqlite3_close(s->db); @@ -474,6 +662,10 @@ static void finalize_stmt(sqlite3_stmt **s) { } } +int cbm_store_exec(cbm_store_t *s, const char *sql) { + return exec_sql(s, sql); +} + void cbm_store_close(cbm_store_t *s) { if (!s) { return; @@ -1955,6 +2147,138 @@ int cbm_store_search(cbm_store_t *s, const cbm_search_params_t *params, cbm_sear char count_sql[4096]; int bind_idx = 0; + /* ── FTS5 BM25 path: when params->query is set, use full-text search ── */ + if (params->query && params->query[0]) { + /* Build FTS5 query: JOIN nodes_fts for BM25 ranking. + * Tokenize the user query into FTS5 OR terms for broader matching. 
+ * "authentication middleware" → "authentication OR middleware" */ + char fts_query[1024]; + { + const char *q = params->query; + int fqlen = 0; + bool in_word = false; + bool first_word = true; + while (*q && fqlen < (int)sizeof(fts_query) - 20) { + if ((*q >= 'a' && *q <= 'z') || (*q >= 'A' && *q <= 'Z') || + (*q >= '0' && *q <= '9') || *q == '_' || *q == '-') { + if (!in_word && !first_word) { + fqlen += snprintf(fts_query + fqlen, sizeof(fts_query) - fqlen, " OR "); + } + fts_query[fqlen++] = *q; + in_word = true; + first_word = false; + } else { + if (in_word) { + fts_query[fqlen++] = ' '; + } + in_word = false; + } + q++; + } + fts_query[fqlen] = '\0'; + } + + char fts_sql[4096]; + /* Join with FTS5 table, filter by project/label, order by pure BM25 relevance. + * Exclude noise labels (File, Folder, Module, Section, Variable, Project). + * Label-type boost: prefer Functions/Methods/Routes/Classes over generic nodes. + * No fan_in/popularity boost — that corrupts relevance for discovery queries + * (e.g. "update" with fan_in=222 would outrank the actually relevant match). + * in_deg/out_deg are still returned for display but do NOT affect ranking. 
*/ + int flen = snprintf(fts_sql, sizeof(fts_sql), + "SELECT n.id, n.project, n.label, n.name, n.qualified_name, " + "n.file_path, n.start_line, n.end_line, n.properties, " + "(SELECT COUNT(*) FROM edges e WHERE e.target_id = n.id AND e.type = 'CALLS') AS in_deg, " + "(SELECT COUNT(*) FROM edges e WHERE e.source_id = n.id AND e.type = 'CALLS') AS out_deg, " + "(bm25(nodes_fts) " + " - CASE WHEN n.label IN ('Function','Method') THEN 10.0 " + " WHEN n.label IN ('Class','Interface','Type') THEN 5.0 " + " WHEN n.label = 'Route' THEN 8.0 " + " ELSE 0.0 END " + ") AS rank " + "FROM nodes_fts " + "JOIN nodes n ON n.id = nodes_fts.rowid " + "WHERE nodes_fts MATCH ?1" + " AND n.label NOT IN ('File','Folder','Module','Section','Variable','Project')"); + + int fts_bind_idx = 1; + if (params->project) { + fts_bind_idx++; + flen += snprintf(fts_sql + flen, sizeof(fts_sql) - flen, + " AND n.project = ?%d", fts_bind_idx); + } + if (params->label) { + fts_bind_idx++; + flen += snprintf(fts_sql + flen, sizeof(fts_sql) - flen, + " AND n.label = ?%d", fts_bind_idx); + } + + int limit = params->limit > 0 ? params->limit : 50; + flen += snprintf(fts_sql + flen, sizeof(fts_sql) - flen, + " ORDER BY rank LIMIT %d OFFSET %d", limit, params->offset); + + /* Count query — same exclusions as main query */ + char fts_count[4096]; + snprintf(fts_count, sizeof(fts_count), + "SELECT COUNT(*) FROM nodes_fts " + "JOIN nodes n ON n.id = nodes_fts.rowid " + "WHERE nodes_fts MATCH ?1" + " AND n.label NOT IN ('File','Folder','Module','Section','Variable','Project')" + "%s%s", + params->project ? " AND n.project = ?2" : "", + params->label ? (params->project ? 
" AND n.label = ?3" : " AND n.label = ?2") : ""); + + /* Execute count */ + sqlite3_stmt *cnt_stmt = NULL; + if (sqlite3_prepare_v2(s->db, fts_count, -1, &cnt_stmt, NULL) == SQLITE_OK) { + bind_text(cnt_stmt, 1, fts_query); + int bi = 1; + if (params->project) { bi++; bind_text(cnt_stmt, bi, params->project); } + if (params->label) { bi++; bind_text(cnt_stmt, bi, params->label); } + if (sqlite3_step(cnt_stmt) == SQLITE_ROW) { + out->total = sqlite3_column_int(cnt_stmt, 0); + } + sqlite3_finalize(cnt_stmt); + } + + /* Execute main query */ + sqlite3_stmt *main_stmt = NULL; + int rc = sqlite3_prepare_v2(s->db, fts_sql, -1, &main_stmt, NULL); + if (rc != SQLITE_OK) { + /* FTS5 table may not exist for older DBs — fall through to regex path */ + /* FTS5 table may not exist for older DBs — silently fall through */ + goto regex_path; + } + bind_text(main_stmt, 1, fts_query); + { + int bi = 1; + if (params->project) { bi++; bind_text(main_stmt, bi, params->project); } + if (params->label) { bi++; bind_text(main_stmt, bi, params->label); } + } + + int cap = 16; + int n = 0; + cbm_search_result_t *results = malloc(cap * sizeof(cbm_search_result_t)); + while (sqlite3_step(main_stmt) == SQLITE_ROW) { + if (n >= cap) { + cap *= 2; + results = safe_realloc(results, cap * sizeof(cbm_search_result_t)); + } + memset(&results[n], 0, sizeof(cbm_search_result_t)); + scan_node(main_stmt, &results[n].node); + results[n].in_degree = sqlite3_column_int(main_stmt, 9); + results[n].out_degree = sqlite3_column_int(main_stmt, 10); + n++; + } + sqlite3_finalize(main_stmt); + out->results = results; + out->count = n; + return CBM_STORE_OK; + } + +regex_path: + /* ── Regex path: original regex-based search ── */ + /* We build a query that selects nodes with optional degree subqueries */ const char *select_cols = "SELECT n.id, n.project, n.label, n.name, n.qualified_name, " @@ -3951,6 +4275,309 @@ static bool want_aspect(const char **aspects, int aspect_count, const char *name return false; } +/* 
── Clusters via Louvain community detection ──────────────────── */ + +static int arch_clusters(cbm_store_t *s, const char *project, cbm_architecture_info_t *out) { + /* 1. Load all callable node IDs for this project */ + const char *nsql = "SELECT id FROM nodes WHERE project=?1 " + "AND label IN ('Function','Method','Class','Interface')"; + sqlite3_stmt *nstmt = NULL; + if (sqlite3_prepare_v2(s->db, nsql, -1, &nstmt, NULL) != SQLITE_OK) { + store_set_error_sqlite(s, "arch_clusters_nodes"); + return CBM_STORE_ERR; + } + bind_text(nstmt, 1, project); + + int ncap = 1024; + int nn = 0; + int64_t *node_ids = malloc((size_t)ncap * sizeof(int64_t)); + + while (sqlite3_step(nstmt) == SQLITE_ROW) { + if (nn >= ncap) { + ncap *= 2; + node_ids = safe_realloc(node_ids, (size_t)ncap * sizeof(int64_t)); + } + node_ids[nn++] = sqlite3_column_int64(nstmt, 0); + } + sqlite3_finalize(nstmt); + + if (nn < 2) { + free(node_ids); + return CBM_STORE_OK; /* Nothing to cluster */ + } + + /* 2. Load all CALLS edges for this project */ + const char *esql = "SELECT source_id, target_id FROM edges WHERE project=?1 AND type='CALLS'"; + sqlite3_stmt *estmt = NULL; + if (sqlite3_prepare_v2(s->db, esql, -1, &estmt, NULL) != SQLITE_OK) { + free(node_ids); + store_set_error_sqlite(s, "arch_clusters_edges"); + return CBM_STORE_ERR; + } + bind_text(estmt, 1, project); + + int ecap = 2048; + int en = 0; + cbm_louvain_edge_t *edges = malloc((size_t)ecap * sizeof(cbm_louvain_edge_t)); + + while (sqlite3_step(estmt) == SQLITE_ROW) { + if (en >= ecap) { + ecap *= 2; + edges = safe_realloc(edges, (size_t)ecap * sizeof(cbm_louvain_edge_t)); + } + edges[en].src = sqlite3_column_int64(estmt, 0); + edges[en].dst = sqlite3_column_int64(estmt, 1); + en++; + } + sqlite3_finalize(estmt); + + if (en < 1) { + free(node_ids); + free(edges); + return CBM_STORE_OK; + } + + /* 3. 
Run Louvain */ + cbm_louvain_result_t *lresults = NULL; + int lcount = 0; + int rc = cbm_louvain(node_ids, nn, edges, en, &lresults, &lcount); + free(node_ids); + free(edges); + + if (rc != CBM_STORE_OK || lcount == 0) { + free(lresults); + return CBM_STORE_OK; + } + + /* 4. Find max community ID to size the grouping array */ + int max_community = 0; + for (int i = 0; i < lcount; i++) { + if (lresults[i].community > max_community) { + max_community = lresults[i].community; + } + } + int num_communities = max_community + 1; + + /* 5. Count members per community */ + int *member_counts = calloc((size_t)num_communities, sizeof(int)); + for (int i = 0; i < lcount; i++) { + if (lresults[i].community >= 0 && lresults[i].community < num_communities) { + member_counts[lresults[i].community]++; + } + } + + /* Count non-empty communities */ + int active_count = 0; + for (int i = 0; i < num_communities; i++) { + if (member_counts[i] > 0) { + active_count++; + } + } + + if (active_count == 0) { + free(member_counts); + free(lresults); + return CBM_STORE_OK; + } + + /* Cap at 20 clusters, keep the largest */ + int max_clusters = active_count < 20 ? active_count : 20; + + /* 6. Build cluster info structs. + * For each community, find the top-5 nodes by CALLS in-degree. 
*/ + cbm_cluster_info_t *clusters = calloc((size_t)max_clusters, sizeof(cbm_cluster_info_t)); + int ci = 0; + + /* Sort communities by member count descending — simple selection of top N */ + int *sorted_ids = malloc((size_t)num_communities * sizeof(int)); + for (int i = 0; i < num_communities; i++) sorted_ids[i] = i; + /* Bubble sort is fine for small N (typically < 100 communities) */ + for (int i = 0; i < num_communities - 1 && i < max_clusters; i++) { + for (int j = i + 1; j < num_communities; j++) { + if (member_counts[sorted_ids[j]] > member_counts[sorted_ids[i]]) { + int tmp = sorted_ids[i]; + sorted_ids[i] = sorted_ids[j]; + sorted_ids[j] = tmp; + } + } + } + + for (int si = 0; si < max_clusters; si++) { + int comm_id = sorted_ids[si]; + if (member_counts[comm_id] == 0) break; + + clusters[ci].id = comm_id; + clusters[ci].members = member_counts[comm_id]; + clusters[ci].cohesion = 0.0; /* Would need intra-/inter-edge ratio to compute */ + + /* Collect node IDs in this community */ + int64_t *comm_nodes = malloc((size_t)member_counts[comm_id] * sizeof(int64_t)); + int cn = 0; + for (int i = 0; i < lcount; i++) { + if (lresults[i].community == comm_id) { + comm_nodes[cn++] = lresults[i].node_id; + } + } + + /* Find top 5 by in-degree via SQL */ + int top_n = cn < 5 ? cn : 5; + // NOLINTNEXTLINE(bugprone-multi-level-implicit-pointer-conversion) + const char **top_names = calloc((size_t)top_n, sizeof(const char *)); + int tn = 0; + + /* Build a simple query: SELECT name from nodes WHERE id IN (...) ordered by + * incoming CALLS count. For efficiency, just query each node's degree. */ + for (int k = 0; k < cn && tn < top_n; k++) { + int in_deg = 0; + int out_deg = 0; + cbm_store_node_degree(s, comm_nodes[k], &in_deg, &out_deg); + + /* Simple insertion into top-N by in-degree. + * We'll just pick the first top_n by iterating degree queries. 
*/ + cbm_node_t ninfo; + if (cbm_store_find_node_by_id(s, comm_nodes[k], &ninfo) == CBM_STORE_OK) { + /* Skip File/Folder/Module nodes */ + if (ninfo.label && strcmp(ninfo.label, "File") != 0 && + strcmp(ninfo.label, "Folder") != 0 && + strcmp(ninfo.label, "Module") != 0) { + if (ninfo.name) { + top_names[tn++] = heap_strdup(ninfo.name); + } + } + cbm_node_free_fields(&ninfo); + } + } + + clusters[ci].top_nodes = top_names; + clusters[ci].top_node_count = tn; + + /* Derive semantic label from most common directory in member file paths. + * E.g. members in controllers/ → "Controllers", components/ → "Components" */ + { + /* Query file paths for a sample of cluster members */ + char dir_counts[64][64]; /* directory names */ + int dir_freqs[64]; /* frequency counts */ + int dir_n = 0; + memset(dir_freqs, 0, sizeof(dir_freqs)); + + int sample_limit = cn < 50 ? cn : 50; + for (int k = 0; k < sample_limit; k++) { + cbm_node_t ni; + if (cbm_store_find_node_by_id(s, comm_nodes[k], &ni) == CBM_STORE_OK) { + if (ni.file_path && ni.file_path[0]) { + /* Extract the deepest meaningful directory segment. + * E.g. 
"src/controllers/users-controller.ts" → "controllers" */ + const char *fp = ni.file_path; + const char *best_dir = NULL; + const char *p2 = fp; + const char *prev_slash = NULL; + while (*p2) { + if (*p2 == '/') { + if (prev_slash) { + /* Extract segment between prev_slash+1 and p2 */ + int slen = (int)(p2 - prev_slash - 1); + if (slen > 0 && slen < 60) { + /* Skip generic dirs: src, lib, dist, build, test, node_modules */ + char seg[64]; + memcpy(seg, prev_slash + 1, (size_t)slen); + seg[slen] = '\0'; + if (strcmp(seg, "src") != 0 && strcmp(seg, "lib") != 0 && + strcmp(seg, "dist") != 0 && strcmp(seg, "build") != 0 && + strcmp(seg, "node_modules") != 0 && + strcmp(seg, "test") != 0 && strcmp(seg, "tests") != 0 && + strcmp(seg, "shared") != 0 && strcmp(seg, "utils") != 0 && + strcmp(seg, "internal") != 0 && strcmp(seg, "generated") != 0) { + best_dir = prev_slash + 1; + } + } + } + prev_slash = p2; + } + p2++; + } + if (best_dir) { + const char *end = strchr(best_dir, '/'); + int dlen = end ? 
(int)(end - best_dir) : (int)strlen(best_dir); + if (dlen > 0 && dlen < 60) { + char dname[64]; + memcpy(dname, best_dir, (size_t)dlen); + dname[dlen] = '\0'; + /* Find or add to dir_counts */ + bool found_dir = false; + for (int d = 0; d < dir_n; d++) { + if (strcmp(dir_counts[d], dname) == 0) { + dir_freqs[d]++; + found_dir = true; + break; + } + } + if (!found_dir && dir_n < 64) { + strncpy(dir_counts[dir_n], dname, 63); + dir_counts[dir_n][63] = '\0'; + dir_freqs[dir_n] = 1; + dir_n++; + } + } + } + } + cbm_node_free_fields(&ni); + } + } + + /* Pick the most frequent directory name */ + char label_buf[64]; + int best_freq = 0; + int best_di = -1; + for (int d = 0; d < dir_n; d++) { + if (dir_freqs[d] > best_freq) { + best_freq = dir_freqs[d]; + best_di = d; + } + } + if (best_di >= 0 && best_freq >= 3) { + /* Capitalize first letter */ + char cap_name[64]; + strncpy(cap_name, dir_counts[best_di], sizeof(cap_name) - 1); + cap_name[sizeof(cap_name) - 1] = '\0'; + if (cap_name[0] >= 'a' && cap_name[0] <= 'z') { + cap_name[0] = cap_name[0] - 'a' + 'A'; + } + /* Convert kebab-case to TitleCase: "users-controller" → "UsersController" */ + for (int j = 0; cap_name[j]; j++) { + if (cap_name[j] == '-' && cap_name[j + 1]) { + /* Remove dash and capitalize next */ + memmove(&cap_name[j], &cap_name[j + 1], strlen(&cap_name[j + 1]) + 1); + if (cap_name[j] >= 'a' && cap_name[j] <= 'z') { + cap_name[j] = cap_name[j] - 'a' + 'A'; + } + } + } + snprintf(label_buf, sizeof(label_buf), "%s", cap_name); + } else { + snprintf(label_buf, sizeof(label_buf), "Cluster_%d", comm_id); + } + clusters[ci].label = heap_strdup(label_buf); + } + + /* packages and edge_types are optional, leave as NULL/0 for now */ + clusters[ci].packages = NULL; + clusters[ci].package_count = 0; + clusters[ci].edge_types = NULL; + clusters[ci].edge_type_count = 0; + + free(comm_nodes); + ci++; + } + + free(sorted_ids); + free(member_counts); + free(lresults); + + out->clusters = clusters; + out->cluster_count 
= ci;
+    return CBM_STORE_OK;
+}
+
 int cbm_store_get_architecture(cbm_store_t *s, const char *project, const char **aspects,
                                int aspect_count, cbm_architecture_info_t *out) {
     memset(out, 0, sizeof(*out));
@@ -4008,6 +4635,12 @@ int cbm_store_get_architecture(cbm_store_t *s, const char *project, const char *
             return rc;
         }
     }
+    if (want_aspect(aspects, aspect_count, "clusters")) {
+        rc = arch_clusters(s, project, out);
+        if (rc != CBM_STORE_OK) {
+            return rc;
+        }
+    }
     return CBM_STORE_OK;
 }
 
@@ -4085,6 +4718,767 @@ void cbm_store_architecture_free(cbm_architecture_info_t *out) {
     memset(out, 0, sizeof(*out));
 }
 
+/* ── Processes (execution flows) ──────────────────────────────── */
+
+/* Detect execution flows: BFS from entry points, identify cross-community paths.
+ *
+ * Deletes the project's existing rows in processes/process_steps, then stores
+ * one process per entry point whose outbound BFS reaches enough nodes.
+ *
+ * s             — open store; NULL (or a store without a db handle) returns 0
+ * project       — project name used to filter nodes/edges; NULL returns 0
+ * max_processes — upper bound on the number of processes stored
+ *
+ * Returns the number of process rows successfully inserted. */
+int cbm_store_detect_processes(cbm_store_t *s, const char *project, int max_processes) {
+    if (!s || !s->db || !project) return 0;
+
+    /* Clear existing processes. The project name is bound as a parameter rather
+     * than formatted into the SQL text, so a name containing a quote can neither
+     * break the statement nor inject into it. */
+    {
+        static const char *const clear_sql[] = {
+            "DELETE FROM process_steps WHERE process_id IN "
+            "(SELECT id FROM processes WHERE project = ?1)",
+            "DELETE FROM processes WHERE project = ?1",
+        };
+        for (size_t q = 0; q < sizeof(clear_sql) / sizeof(clear_sql[0]); q++) {
+            sqlite3_stmt *del_stmt = NULL;
+            sqlite3_prepare_v2(s->db, clear_sql[q], -1, &del_stmt, NULL);
+            if (del_stmt) {
+                bind_text(del_stmt, 1, project);
+                sqlite3_step(del_stmt);
+                sqlite3_finalize(del_stmt);
+            }
+        }
+    }
+
+    /* 1. Find entry point node IDs */
+    const char *ep_sql =
+        "SELECT id, name FROM nodes WHERE project = ?1 "
+        "AND (json_extract(properties, '$.is_entry_point') = 1 OR label = 'Route') "
+        "AND label NOT IN ('File','Folder','Module','Project')";
+    sqlite3_stmt *ep_stmt = NULL;
+    if (sqlite3_prepare_v2(s->db, ep_sql, -1, &ep_stmt, NULL) != SQLITE_OK) return 0;
+    bind_text(ep_stmt, 1, project);
+
+    int ep_cap = 512;
+    int ep_count = 0;
+    int64_t *ep_ids = malloc((size_t)ep_cap * sizeof(int64_t));
+    char **ep_names = malloc((size_t)ep_cap * sizeof(char *));
+
+    while (sqlite3_step(ep_stmt) == SQLITE_ROW) {
+        if (ep_count >= ep_cap) {
+            ep_cap *= 2;
+            ep_ids = safe_realloc(ep_ids, (size_t)ep_cap * sizeof(int64_t));
+            ep_names = safe_realloc(ep_names, (size_t)ep_cap * sizeof(char *));
+        }
+        ep_ids[ep_count] = sqlite3_column_int64(ep_stmt, 0);
+        const char *nm = (const char *)sqlite3_column_text(ep_stmt, 1);
+        ep_names[ep_count] = heap_strdup(nm ? nm : "?");
+        ep_count++;
+    }
+    sqlite3_finalize(ep_stmt);
+
+    if (ep_count == 0) {
+        free(ep_ids);
+        free(ep_names);
+        return 0;
+    }
+
+    /* 1b. Resolve Route entry points to handler Functions.
+     * For each Route, find the Module that HANDLES it, then append every
+     * Function/Method node located in that Module's file as an additional
+     * entry point. The Route entries themselves stay in the list; the appended
+     * functions are the ones expected to have outgoing edges for the BFS. */
+    {
+        const char *resolve_sql =
+            "SELECT DISTINCT fn.id, fn.name FROM edges e "
+            "JOIN nodes m ON m.id = e.source_id AND m.label = 'Module' "
+            "JOIN nodes fn ON fn.file_path = m.file_path "
+            "AND fn.label IN ('Function','Method') AND fn.project = ?2 "
+            "WHERE e.target_id = ?1 AND e.type = 'HANDLES' AND e.project = ?2";
+        sqlite3_stmt *res_stmt = NULL;
+        sqlite3_prepare_v2(s->db, resolve_sql, -1, &res_stmt, NULL);
+
+        if (res_stmt) {
+            /* Only the entries found in step 1 are inspected; entries appended
+             * by this loop are functions, never Routes. */
+            int orig_count = ep_count;
+            for (int i = 0; i < orig_count; i++) {
+                /* Check if this entry point is a Route node */
+                const char *check_sql = "SELECT label FROM nodes WHERE id = ?1";
+                sqlite3_stmt *chk = NULL;
+                sqlite3_prepare_v2(s->db, check_sql, -1, &chk, NULL);
+                if (!chk) continue;
+                sqlite3_bind_int64(chk, 1, ep_ids[i]);
+                const char *label = NULL;
+                if (sqlite3_step(chk) == SQLITE_ROW) {
+                    label = (const char *)sqlite3_column_text(chk, 0);
+                }
+                /* Compare before finalizing: the column text belongs to chk. */
+                bool is_route = (label && strcmp(label, "Route") == 0);
+                sqlite3_finalize(chk);
+
+                if (!is_route) continue;
+
+                /* Resolve Route → Module → Functions */
+                sqlite3_reset(res_stmt);
+                sqlite3_bind_int64(res_stmt, 1, ep_ids[i]);
+                bind_text(res_stmt, 2, project);
+
+                while (sqlite3_step(res_stmt) == SQLITE_ROW) {
+                    if (ep_count >= ep_cap) {
+                        ep_cap *= 2;
+                        ep_ids = safe_realloc(ep_ids, (size_t)ep_cap * sizeof(int64_t));
+                        ep_names = safe_realloc(ep_names, (size_t)ep_cap * sizeof(char *));
+                    }
+                    ep_ids[ep_count] = sqlite3_column_int64(res_stmt, 0);
+                    const char *fn_name = (const char *)sqlite3_column_text(res_stmt, 1);
+                    ep_names[ep_count] = heap_strdup(fn_name ? fn_name : "?");
+                    ep_count++;
+                }
+            }
+            sqlite3_finalize(res_stmt);
+        }
+    }
+
+    /* 1c. Deduplicate entry points.
+     * Route resolution (1b) adds ALL functions from a handler file for EACH Route
+     * that maps to it. If 8 Routes → same file with 5 functions, each function
+     * appears 8 times. Remove duplicates by ep_id to avoid 8x duplicate processes. */
+    {
+        int deduped = 0;
+        for (int i = 0; i < ep_count; i++) {
+            bool dup = false;
+            for (int j = 0; j < deduped; j++) {
+                if (ep_ids[j] == ep_ids[i]) { dup = true; break; }
+            }
+            if (!dup) {
+                /* Keep this entry — compact it into position [deduped] */
+                if (deduped != i) {
+                    ep_ids[deduped] = ep_ids[i];
+                    ep_names[deduped] = ep_names[i];
+                    ep_names[i] = NULL; /* prevent double-free in cleanup */
+                }
+                deduped++;
+            } else {
+                /* Duplicate — free the name string */
+                free(ep_names[i]);
+                ep_names[i] = NULL;
+            }
+        }
+        ep_count = deduped;
+    }
+
+    /* 2. Load nodes + edges for Louvain */
+    const char *nsql = "SELECT id FROM nodes WHERE project=?1 "
+                       "AND label IN ('Function','Method','Class','Interface')";
+    sqlite3_stmt *nst = NULL;
+    int all_cap = 4096;
+    int all_count = 0;
+    int64_t *all_ids = malloc((size_t)all_cap * sizeof(int64_t));
+    if (sqlite3_prepare_v2(s->db, nsql, -1, &nst, NULL) == SQLITE_OK) {
+        bind_text(nst, 1, project);
+        while (sqlite3_step(nst) == SQLITE_ROW) {
+            if (all_count >= all_cap) {
+                all_cap *= 2;
+                all_ids = safe_realloc(all_ids, (size_t)all_cap * sizeof(int64_t));
+            }
+            all_ids[all_count++] = sqlite3_column_int64(nst, 0);
+        }
+        sqlite3_finalize(nst);
+    }
+
+    /* Include CALLS, HANDLES, HTTP_CALLS and ASYNC_CALLS for Louvain community
+     * detection. HANDLES connects Route → handler, HTTP_CALLS connects client →
+     * API endpoint. Without these, Express/Hapi route flows are invisible to
+     * process detection. */
+    const char *esql = "SELECT source_id, target_id FROM edges WHERE project=?1 "
+                       "AND type IN ('CALLS','HANDLES','HTTP_CALLS','ASYNC_CALLS')";
+    sqlite3_stmt *est = NULL;
+    int le_cap = 8192;
+    int le_count = 0;
+    cbm_louvain_edge_t *ledges = malloc((size_t)le_cap * sizeof(cbm_louvain_edge_t));
+    if (sqlite3_prepare_v2(s->db, esql, -1, &est, NULL) == SQLITE_OK) {
+        bind_text(est, 1, project);
+        while (sqlite3_step(est) == SQLITE_ROW) {
+            if (le_count >= le_cap) {
+                le_cap *= 2;
+                ledges = safe_realloc(ledges, (size_t)le_cap * sizeof(cbm_louvain_edge_t));
+            }
+            ledges[le_count].src = sqlite3_column_int64(est, 0);
+            ledges[le_count].dst = sqlite3_column_int64(est, 1);
+            le_count++;
+        }
+        sqlite3_finalize(est);
+    }
+
+    /* 3. Run Louvain */
+    cbm_louvain_result_t *lresults = NULL;
+    int lcount = 0;
+    if (all_count > 1 && le_count > 0) {
+        cbm_louvain(all_ids, all_count, ledges, le_count, &lresults, &lcount);
+    }
+    free(all_ids);
+    free(ledges);
+
+    /* Build node_id → community lookup (parallel arrays — O(n) scan per lookup,
+     * acceptable for entry_point_count * visited_count iterations) */
+    int64_t *comm_nids = NULL;
+    int *comm_vals = NULL;
+    int comm_size = 0;
+    if (lresults && lcount > 0) {
+        comm_nids = malloc((size_t)lcount * sizeof(int64_t));
+        comm_vals = malloc((size_t)lcount * sizeof(int));
+        for (int i = 0; i < lcount; i++) {
+            comm_nids[i] = lresults[i].node_id;
+            comm_vals[i] = lresults[i].community;
+        }
+        comm_size = lcount;
+    }
+    free(lresults);
+
+    /* 4. BFS from each entry point, detect cross-community flows */
+    sqlite3_stmt *ins_proc = NULL;
+    sqlite3_stmt *ins_step = NULL;
+    sqlite3_prepare_v2(s->db,
+                       "INSERT INTO processes(project,label,process_type,step_count,"
+                       "entry_point_id,terminal_id) VALUES(?1,?2,?3,?4,?5,?6)",
+                       -1, &ins_proc, NULL);
+    sqlite3_prepare_v2(s->db,
+                       "INSERT INTO process_steps(process_id,node_id,step) VALUES(?1,?2,?3)",
+                       -1, &ins_step, NULL);
+
+    exec_sql(s, "BEGIN TRANSACTION");
+    int proc_count = 0;
+
+    /* If the process insert could not be prepared nothing can be stored, so the
+     * loop is skipped entirely instead of traversing for no effect. */
+    for (int ei = 0; ins_proc && ei < ep_count && proc_count < max_processes; ei++) {
+        const char *bfs_types[] = {"CALLS", "HANDLES", "HTTP_CALLS", "ASYNC_CALLS"};
+        cbm_traverse_result_t tr = {0};
+        cbm_store_bfs(s, ep_ids[ei], "outbound", bfs_types, 4, 8, 50, &tr);
+
+        if (tr.visited_count < 2) {
+            cbm_store_traverse_free(&tr);
+            continue;
+        }
+
+        /* Find the best cross-community terminal node.
+         * Instead of just picking the deepest hop (which gives generic utility functions
+         * like "update", "findOne"), score candidates by domain specificity:
+         * - Longer names score higher (domain-specific names are longer)
+         * - Generic names (update, get, set, find, create, delete, push, pop, error,
+         *   log, emit, send, save, load, init, close, open) score 0
+         * - Names starting with uppercase score higher (likely domain classes/handlers) */
+        static const char *generic_names[] = {
+            "update", "get", "set", "find", "findOne", "findAll", "create", "delete",
+            "push", "pop", "error", "log", "emit", "send", "save", "load", "init",
+            "close", "open", "call", "apply", "bind", "then", "catch", "resolve",
+            "reject", "next", "done", "callback", "handler", "run", "execute",
+            "start", "stop", "reset", "clear", "add", "remove", "insert",
+            "forEach", "map", "filter", "reduce", "assign", "merge", "clone",
+            "parse", "format", "validate", "check", "test", "assert",
+            "toString", "valueOf", "toJSON", "default", "index", "main",
+            "getInstance", "getConnection", "getConfig", "getLogger",
+            "request", "response", "query", "result", "data", "value",
+            "defaultFilter", "_refreshCookies", NULL
+        };
+
+        /* Community of the entry point; -1 when Louvain produced no entry for it. */
+        int ep_comm = -1;
+        for (int c = 0; c < comm_size; c++) {
+            if (comm_nids[c] == ep_ids[ei]) { ep_comm = comm_vals[c]; break; }
+        }
+
+        int64_t terminal_id = ep_ids[ei];
+        const char *terminal_name = ep_names[ei];
+        int best_score = -1;
+        bool is_cross = false;
+
+        for (int v = 0; v < tr.visited_count; v++) {
+            int node_comm = -1;
+            for (int c = 0; c < comm_size; c++) {
+                if (comm_nids[c] == tr.visited[v].node.id) { node_comm = comm_vals[c]; break; }
+            }
+            if (node_comm != ep_comm && node_comm >= 0 && ep_comm >= 0) {
+                const char *nm = tr.visited[v].node.name;
+                if (!nm) continue;
+
+                /* Score: name length * 10 + hop * 5, minus penalty for generics */
+                int score = (int)strlen(nm) * 10 + tr.visited[v].hop * 5;
+
+                /* Penalty for generic names */
+                bool is_generic = false;
+                for (int g = 0; generic_names[g]; g++) {
+                    if (strcmp(nm, generic_names[g]) == 0) {
+                        is_generic = true;
+                        break;
+                    }
+                }
+                if (is_generic) score = 0;
+
+                /* Bonus for CamelCase names starting with uppercase (domain handlers) */
+                if (nm[0] >= 'A' && nm[0] <= 'Z') score += 50;
+
+                /* Bonus for names containing domain verbs */
+                if (strstr(nm, "Handler") || strstr(nm, "Controller") ||
+                    strstr(nm, "Service") || strstr(nm, "Storage") ||
+                    strstr(nm, "Plugin") || strstr(nm, "Middleware") ||
+                    strstr(nm, "Permission") || strstr(nm, "Authorization") ||
+                    strstr(nm, "Scope") || strstr(nm, "Role") ||
+                    strstr(nm, "Session") || strstr(nm, "User") ||
+                    strstr(nm, "Course") || strstr(nm, "Evaluation") ||
+                    strstr(nm, "Scenario")) {
+                    score += 100;
+                }
+
+                if (score > best_score) {
+                    best_score = score;
+                    terminal_id = tr.visited[v].node.id;
+                    terminal_name = nm;
+                    is_cross = true;
+                }
+            }
+        }
+
+        /* If no cross-community terminal was found, still accept flows with ≥3 steps.
+         * This prevents filtering out legitimate API flows (route → controller → storage)
+         * that happen to stay within one Louvain community due to flat call patterns.
+         * Pick the deepest non-generic node as terminal for the label. */
+        if (!is_cross) {
+            if (tr.visited_count < 3) {
+                cbm_store_traverse_free(&tr);
+                continue;
+            }
+            /* Find best terminal by hop depth + name quality */
+            for (int v = 0; v < tr.visited_count; v++) {
+                const char *nm = tr.visited[v].node.name;
+                if (!nm) continue;
+                bool is_generic = false;
+                for (int g = 0; generic_names[g]; g++) {
+                    if (strcmp(nm, generic_names[g]) == 0) { is_generic = true; break; }
+                }
+                if (is_generic) continue;
+                int score = (int)strlen(nm) * 10 + tr.visited[v].hop * 5;
+                if (nm[0] >= 'A' && nm[0] <= 'Z') score += 50;
+                if (score > best_score) {
+                    best_score = score;
+                    terminal_id = tr.visited[v].node.id;
+                    terminal_name = nm;
+                }
+            }
+        }
+
+        /* Label: "[module] EntryPoint → Terminal" (UTF-8 arrow)
+         * Derive module from entry point's parent directory for navigability.
+         * "server/src/controllers/clients-controller.js" → "controllers" */
+        char module[128] = "";
+        {
+            const char *fp_sql = "SELECT file_path FROM nodes WHERE id=?1";
+            sqlite3_stmt *fp_stmt = NULL;
+            sqlite3_prepare_v2(s->db, fp_sql, -1, &fp_stmt, NULL);
+            if (fp_stmt) {
+                sqlite3_bind_int64(fp_stmt, 1, ep_ids[ei]);
+                if (sqlite3_step(fp_stmt) == SQLITE_ROW) {
+                    const char *fp = (const char *)sqlite3_column_text(fp_stmt, 0);
+                    if (fp) {
+                        /* Find the parent directory name (second-to-last path segment) */
+                        const char *last_slash = strrchr(fp, '/');
+                        if (last_slash && last_slash > fp) {
+                            const char *seg_start = last_slash - 1;
+                            while (seg_start > fp && *seg_start != '/') seg_start--;
+                            if (*seg_start == '/') seg_start++;
+                            int mlen = (int)(last_slash - seg_start);
+                            if (mlen > 0 && mlen < (int)sizeof(module)) {
+                                memcpy(module, seg_start, (size_t)mlen);
+                                module[mlen] = '\0';
+                            }
+                        }
+                    }
+                }
+                sqlite3_finalize(fp_stmt);
+            }
+        }
+
+        /* terminal_name may point into tr, so the label is built before tr is freed. */
+        char label[512];
+        if (module[0]) {
+            snprintf(label, sizeof(label), "[%s] %s \xe2\x86\x92 %s",
+                     module, ep_names[ei], terminal_name);
+        } else {
+            snprintf(label, sizeof(label), "%s \xe2\x86\x92 %s",
+                     ep_names[ei], terminal_name);
+        }
+
+        /* Persist the process row. The step rows below are keyed by its rowid, so
+         * a failed insert must skip the flow: sqlite3_last_insert_rowid() would
+         * otherwise return the id of an earlier, unrelated insert. */
+        sqlite3_reset(ins_proc);
+        bind_text(ins_proc, 1, project);
+        bind_text(ins_proc, 2, label);
+        bind_text(ins_proc, 3, "cross_community");
+        sqlite3_bind_int(ins_proc, 4, tr.visited_count + 1);
+        sqlite3_bind_int64(ins_proc, 5, ep_ids[ei]);
+        sqlite3_bind_int64(ins_proc, 6, terminal_id);
+        if (sqlite3_step(ins_proc) != SQLITE_DONE) {
+            cbm_store_traverse_free(&tr);
+            continue;
+        }
+
+        int64_t proc_id = sqlite3_last_insert_rowid(s->db);
+
+        /* Insert steps: the entry point as step 0, then every visited node with
+         * its BFS hop as the step number. */
+        if (ins_step) {
+            sqlite3_reset(ins_step);
+            sqlite3_bind_int64(ins_step, 1, proc_id);
+            sqlite3_bind_int64(ins_step, 2, ep_ids[ei]);
+            sqlite3_bind_int(ins_step, 3, 0);
+            sqlite3_step(ins_step);
+
+            for (int v = 0; v < tr.visited_count; v++) {
+                sqlite3_reset(ins_step);
+                sqlite3_bind_int64(ins_step, 1, proc_id);
+                sqlite3_bind_int64(ins_step, 2, tr.visited[v].node.id);
+                sqlite3_bind_int(ins_step, 3, tr.visited[v].hop);
+                sqlite3_step(ins_step);
+            }
+        }
+
+        cbm_store_traverse_free(&tr);
+        proc_count++;
+    }
+
+    exec_sql(s, "COMMIT");
+    if (ins_proc) sqlite3_finalize(ins_proc);
+    if (ins_step) sqlite3_finalize(ins_step);
+
+    free(comm_nids);
+    free(comm_vals);
+    for (int i = 0; i < ep_count; i++) free(ep_names[i]);
+    free(ep_names);
+    free(ep_ids);
+
+    return proc_count;
+}
+
+/* List up to 300 stored processes of a project, largest step_count first.
+ * On return *out is a heap array of *count entries to be released with
+ * cbm_store_free_processes(); it stays NULL/0 when the query cannot be prepared. */
+int cbm_store_list_processes(cbm_store_t *s, const char *project,
+                             cbm_process_info_t **out, int *count) {
+    *out = NULL;
+    *count = 0;
+    const char *sql = "SELECT p.id, p.label, p.process_type, p.step_count, "
+                      "p.entry_point_id, p.terminal_id "
+                      "FROM processes p WHERE p.project = ?1 "
+                      "ORDER BY p.step_count DESC LIMIT 300";
+    sqlite3_stmt *stmt = NULL;
+    if (sqlite3_prepare_v2(s->db, sql, -1, &stmt, NULL) != SQLITE_OK) {
+        return CBM_STORE_OK; /* Table may not exist yet */
+    }
+    bind_text(stmt, 1, project);
+
+    int cap = 64;
+    int n = 0;
+    cbm_process_info_t *arr = calloc((size_t)cap, sizeof(cbm_process_info_t));
+
+    while (sqlite3_step(stmt) == SQLITE_ROW) {
+        if (n >= cap) {
+            cap *= 2;
+            arr = safe_realloc(arr, (size_t)cap * sizeof(cbm_process_info_t));
+        }
+        arr[n].id = sqlite3_column_int64(stmt, 0);
+        arr[n].label = heap_strdup((const char *)sqlite3_column_text(stmt, 1));
+        arr[n].process_type = heap_strdup((const char *)sqlite3_column_text(stmt, 2));
+        arr[n].step_count = sqlite3_column_int(stmt, 3);
+        arr[n].entry_point_id = sqlite3_column_int64(stmt, 4);
+        arr[n].terminal_id = sqlite3_column_int64(stmt, 5);
+        n++;
+    }
+    sqlite3_finalize(stmt);
+    *out = arr;
+    *count = n;
+    return CBM_STORE_OK;
+}
+
+int cbm_store_get_process_steps(cbm_store_t *s, int64_t process_id,
+                                cbm_process_step_t **out, int *count) {
+    *out = NULL;
+    *count = 0;
+    const char *sql = "SELECT ps.node_id, n.name, n.qualified_name, n.file_path, ps.step "
+                      "FROM process_steps ps JOIN nodes n ON n.id = ps.node_id "
+                      "WHERE ps.process_id = ?1 ORDER BY ps.step";
+    sqlite3_stmt 
*stmt = NULL;
+    if (sqlite3_prepare_v2(s->db, sql, -1, &stmt, NULL) != SQLITE_OK) {
+        return CBM_STORE_OK;
+    }
+    sqlite3_bind_int64(stmt, 1, process_id);
+
+    int cap = 16;
+    int n = 0;
+    cbm_process_step_t *arr = calloc((size_t)cap, sizeof(cbm_process_step_t));
+
+    while (sqlite3_step(stmt) == SQLITE_ROW) {
+        if (n >= cap) {
+            cap *= 2;
+            arr = safe_realloc(arr, (size_t)cap * sizeof(cbm_process_step_t));
+        }
+        arr[n].node_id = sqlite3_column_int64(stmt, 0);
+        arr[n].name = heap_strdup((const char *)sqlite3_column_text(stmt, 1));
+        arr[n].qualified_name = heap_strdup((const char *)sqlite3_column_text(stmt, 2));
+        arr[n].file_path = heap_strdup((const char *)sqlite3_column_text(stmt, 3));
+        arr[n].step = sqlite3_column_int(stmt, 4);
+        n++;
+    }
+    sqlite3_finalize(stmt);
+    *out = arr;
+    *count = n;
+    return CBM_STORE_OK;
+}
+
+/* Release an array of `count` process records: frees each entry's label and
+ * process_type strings, then the array itself. */
+void cbm_store_free_processes(cbm_process_info_t *arr, int count) {
+    for (int i = 0; i < count; i++) {
+        free((void *)arr[i].label);
+        free((void *)arr[i].process_type);
+    }
+    free(arr);
+}
+
+/* Release an array of `count` process steps: frees each entry's name,
+ * qualified_name and file_path strings, then the array itself. */
+void cbm_store_free_process_steps(cbm_process_step_t *arr, int count) {
+    for (int i = 0; i < count; i++) {
+        free((void *)arr[i].name);
+        free((void *)arr[i].qualified_name);
+        free((void *)arr[i].file_path);
+    }
+    free(arr);
+}
+
+/* ── Channels (cross-service message tracing) ────────────────────── */
+
+/* Forward declaration of channel extractors from httplink.c */
+typedef struct {
+    char channel[256];
+    char direction[8];
+    char transport[32];
+} cbm_channel_match_t;
+int cbm_extract_channels(const char *source, cbm_channel_match_t *out, int max_out);
+int cbm_extract_csharp_channels(const char *source, cbm_channel_match_t *out, int max_out);
+int cbm_extract_js_channels_constants(const char *source, cbm_channel_match_t *out, int max_out);
+
+/* Scan a project's source files for message-channel usage and store the matches.
+ *
+ * Pass 1 reads, for every Function/Method/Module/Class node in a .ts/.js/.tsx/
+ * .py/.cs file, the node's line range from disk and runs a channel extractor on
+ * it. Pass 2 re-reads whole JS/TS files (up to 512 KiB) to resolve channel names
+ * held in file-scope constants. Existing channel rows of the project are
+ * deleted first.
+ *
+ * s         — open store; NULL (or no db handle) returns 0
+ * project   — project name; NULL returns 0
+ * repo_path — directory the stored relative file paths are resolved against
+ *
+ * Returns the number of insert attempts made; rows ignored by
+ * INSERT OR IGNORE are still counted. */
+int cbm_store_detect_channels(cbm_store_t *s, const char *project, const char *repo_path) {
+    if (!s || !s->db || !project || !repo_path) return 0;
+
+    /* Clear existing channels for this project (parameterized — no SQL injection) */
+    {
+        sqlite3_stmt *del_stmt = NULL;
+        sqlite3_prepare_v2(s->db, "DELETE FROM channels WHERE project = ?1", -1, &del_stmt, NULL);
+        if (del_stmt) {
+            bind_text(del_stmt, 1, project);
+            sqlite3_step(del_stmt);
+            sqlite3_finalize(del_stmt);
+        }
+    }
+
+    /* Find all Function/Method/Module/Class nodes with source file references
+     * in supported languages */
+    const char *sql = "SELECT id, name, file_path, start_line, end_line FROM nodes "
+                      "WHERE project = ?1 AND label IN ('Function','Method','Module','Class') "
+                      "AND (file_path LIKE '%.ts' OR file_path LIKE '%.js' "
+                      "OR file_path LIKE '%.tsx' OR file_path LIKE '%.py' "
+                      "OR file_path LIKE '%.cs')";
+    sqlite3_stmt *stmt = NULL;
+    if (sqlite3_prepare_v2(s->db, sql, -1, &stmt, NULL) != SQLITE_OK) return 0;
+    bind_text(stmt, 1, project);
+
+    sqlite3_stmt *ins = NULL;
+    sqlite3_prepare_v2(s->db,
+        "INSERT OR IGNORE INTO channels(project,channel_name,direction,transport,node_id,file_path,function_name) "
+        "VALUES(?1,?2,?3,?4,?5,?6,?7)", -1, &ins, NULL);
+
+    exec_sql(s, "BEGIN TRANSACTION");
+    int total = 0;
+
+    while (sqlite3_step(stmt) == SQLITE_ROW) {
+        int64_t node_id = sqlite3_column_int64(stmt, 0);
+        const char *name = (const char *)sqlite3_column_text(stmt, 1);
+        const char *fpath = (const char *)sqlite3_column_text(stmt, 2);
+        int start = sqlite3_column_int(stmt, 3);
+        int end = sqlite3_column_int(stmt, 4);
+
+        if (!fpath || !fpath[0] || start <= 0 || end <= 0) continue;
+
+        /* Read source lines from disk. A path that does not fit the buffer is
+         * skipped: opening its truncated form could read an unrelated file. */
+        char full_path[2048];
+        int plen = snprintf(full_path, sizeof(full_path), "%s/%s", repo_path, fpath);
+        if (plen < 0 || (size_t)plen >= sizeof(full_path)) continue;
+
+        FILE *f = fopen(full_path, "r");
+        if (!f) continue;
+
+        /* Read relevant lines */
+        char *source = NULL;
+        size_t src_len = 0;
+        size_t src_cap = 0;
+        int line_num = 0;
+        char line[4096];
+
+        while (fgets(line, sizeof(line), f)) {
+            line_num++;
+            if (line_num < start) continue;
+            if (line_num > end) break;
+            size_t ll = strlen(line);
+            /* ">=" keeps one spare byte for the terminator written below. One
+             * doubling is always enough because ll < sizeof(line) <= src_cap. */
+            if (src_len + ll >= src_cap) {
+                src_cap = (src_cap == 0) ? 4096 : src_cap * 2;
+                source = safe_realloc(source, src_cap);
+            }
+            memcpy(source + src_len, line, ll);
+            src_len += ll;
+        }
+        fclose(f);
+
+        if (source) {
+            source[src_len] = '\0';
+            cbm_channel_match_t matches[64];
+            int mc = 0;
+            /* Use language-appropriate extractor. The test is on the extension
+             * itself; a substring search for ".cs" would also match paths that
+             * merely contain it (e.g. "views.cshtml/page.ts"). */
+            size_t fplen = strlen(fpath);
+            bool is_cs = fplen >= 3 && strcmp(fpath + fplen - 3, ".cs") == 0;
+            if (is_cs) {
+                mc = cbm_extract_csharp_channels(source, matches, 64);
+            } else {
+                mc = cbm_extract_channels(source, matches, 64);
+            }
+            for (int i = 0; i < mc && ins; i++) {
+                sqlite3_reset(ins);
+                bind_text(ins, 1, project);
+                bind_text(ins, 2, matches[i].channel);
+                bind_text(ins, 3, matches[i].direction);
+                bind_text(ins, 4, matches[i].transport);
+                sqlite3_bind_int64(ins, 5, node_id);
+                bind_text(ins, 6, fpath);
+                bind_text(ins, 7, name ? name : "");
+                sqlite3_step(ins);
+                total++;
+            }
+            free(source);
+        }
+    }
+
+    exec_sql(s, "COMMIT");
+    sqlite3_finalize(stmt);
+
+    /* Second pass: JS/TS constant resolution on full files.
+     * The per-node pass above only sees function bodies — constants defined at file
+     * scope are invisible. This pass reads complete JS/TS files that contain Socket.IO
+     * patterns and resolves constant channel names. */
+    {
+        const char *file_sql =
+            "SELECT DISTINCT file_path FROM nodes WHERE project = ?1 "
+            "AND (file_path LIKE '%.js' OR file_path LIKE '%.ts' OR file_path LIKE '%.tsx') "
+            "AND label NOT IN ('File','Folder','Project')";
+        sqlite3_stmt *fst = NULL;
+        sqlite3_prepare_v2(s->db, file_sql, -1, &fst, NULL);
+        if (fst) {
+            bind_text(fst, 1, project);
+            exec_sql(s, "BEGIN TRANSACTION");
+
+            /* Re-prepare insert for this transaction: node_id is fixed to 0 and
+             * function_name to '(file-level)' for file-scope matches. */
+            sqlite3_stmt *ins2 = NULL;
+            sqlite3_prepare_v2(s->db,
+                "INSERT OR IGNORE INTO channels"
+                "(project,channel_name,direction,transport,node_id,file_path,function_name) "
+                "VALUES(?1,?2,?3,?4,0,?5,'(file-level)')", -1, &ins2, NULL);
+
+            while (sqlite3_step(fst) == SQLITE_ROW) {
+                const char *fpath = (const char *)sqlite3_column_text(fst, 0);
+                if (!fpath) continue;
+
+                char full_path[2048];
+                int plen = snprintf(full_path, sizeof(full_path), "%s/%s", repo_path, fpath);
+                if (plen < 0 || (size_t)plen >= sizeof(full_path)) continue;
+
+                FILE *f = fopen(full_path, "r");
+                if (!f) continue;
+
+                /* Read entire file */
+                fseek(f, 0, SEEK_END);
+                long fsize = ftell(f);
+                fseek(f, 0, SEEK_SET);
+                if (fsize <= 0 || fsize > 512 * 1024) { fclose(f); continue; } /* skip huge files */
+                char *full_source = malloc((size_t)fsize + 1);
+                if (!full_source) { fclose(f); continue; } /* out of memory: skip this file */
+                size_t nread = fread(full_source, 1, (size_t)fsize, f);
+                full_source[nread] = '\0';
+                fclose(f);
+
+                cbm_channel_match_t matches[64];
+                int mc = cbm_extract_js_channels_constants(full_source, matches, 64);
+
+                for (int i = 0; i < mc && ins2; i++) {
+                    /* Filter out short constant names (single-letter variables) */
+                    if (strlen(matches[i].channel) < 3) continue;
+                    sqlite3_reset(ins2);
+                    bind_text(ins2, 1, project);
+                    bind_text(ins2, 2, matches[i].channel);
+                    bind_text(ins2, 3, matches[i].direction);
+                    bind_text(ins2, 4, matches[i].transport);
+                    bind_text(ins2, 5, fpath);
+                    sqlite3_step(ins2);
+                    total++;
+                }
+                free(full_source);
+            }
+            exec_sql(s, "COMMIT");
+            sqlite3_finalize(fst);
+            if (ins2) sqlite3_finalize(ins2);
+        }
+    }
+
+    if (ins) sqlite3_finalize(ins);
+    return total;
+}
+
+int 
cbm_store_find_channels(cbm_store_t *s, const char *project, const char *channel,
+                            cbm_channel_info_t **out, int *count) {
+    /* Look up stored channels, at most 500 rows ordered by channel name.
+     *   project — exact project filter, NULL = all projects
+     *   channel — substring matched with LIKE '%channel%', NULL = all channels
+     * On return *out is a heap array of *count entries to be released with
+     * cbm_store_free_channels(); it stays NULL/0 when the query cannot be
+     * prepared, which is reported as CBM_STORE_OK. */
+    *out = NULL;
+    *count = 0;
+    if (!s || !s->db) return CBM_STORE_ERR;
+
+    /* Pick the query for the filter combination. DISTINCT prevents duplicate
+     * rows from different extraction passes. */
+    const char *sql;
+    if (project && channel) {
+        sql = "SELECT DISTINCT channel_name, direction, transport, project, file_path, function_name "
+              "FROM channels WHERE project = ?1 AND channel_name LIKE ?2 "
+              "ORDER BY channel_name LIMIT 500";
+    } else if (project) {
+        sql = "SELECT DISTINCT channel_name, direction, transport, project, file_path, function_name "
+              "FROM channels WHERE project = ?1 ORDER BY channel_name LIMIT 500";
+    } else if (channel) {
+        sql = "SELECT DISTINCT channel_name, direction, transport, project, file_path, function_name "
+              "FROM channels WHERE channel_name LIKE ?1 ORDER BY channel_name LIMIT 500";
+    } else {
+        sql = "SELECT DISTINCT channel_name, direction, transport, project, file_path, function_name "
+              "FROM channels ORDER BY channel_name LIMIT 500";
+    }
+
+    sqlite3_stmt *stmt = NULL;
+    if (sqlite3_prepare_v2(s->db, sql, -1, &stmt, NULL) != SQLITE_OK) return CBM_STORE_OK;
+
+    /* Build "%channel%" in a buffer sized from the input. A fixed-size buffer
+     * would silently cut long names and drop the trailing wildcard, turning a
+     * substring search into a prefix-of-a-truncation match. */
+    char *pat = NULL;
+    if (channel) {
+        size_t pat_size = strlen(channel) + 3; /* '%' + channel + '%' + NUL */
+        pat = malloc(pat_size);
+        if (!pat) {
+            sqlite3_finalize(stmt);
+            return CBM_STORE_ERR;
+        }
+        snprintf(pat, pat_size, "%%%s%%", channel);
+    }
+
+    /* Parameters are numbered in the order the filters appear in the SQL. */
+    int bi = 0;
+    if (project) bind_text(stmt, ++bi, project);
+    if (pat) bind_text(stmt, ++bi, pat);
+
+    int cap = 64;
+    int n = 0;
+    cbm_channel_info_t *arr = calloc((size_t)cap, sizeof(cbm_channel_info_t));
+    while (sqlite3_step(stmt) == SQLITE_ROW) {
+        if (n >= cap) { cap *= 2; arr = safe_realloc(arr, (size_t)cap * sizeof(cbm_channel_info_t)); }
+        arr[n].channel_name = heap_strdup((const char *)sqlite3_column_text(stmt, 0));
+        arr[n].direction = heap_strdup((const char *)sqlite3_column_text(stmt, 1));
+        arr[n].transport = heap_strdup((const char *)sqlite3_column_text(stmt, 2));
+        arr[n].project = heap_strdup((const char *)sqlite3_column_text(stmt, 3));
+        arr[n].file_path = heap_strdup((const char *)sqlite3_column_text(stmt, 4));
+        arr[n].function_name = heap_strdup((const char *)sqlite3_column_text(stmt, 5));
+        n++;
+    }
+    sqlite3_finalize(stmt);
+    /* Released only after the statement is gone, so the bound pattern stays
+     * valid for as long as the statement could read it. */
+    free(pat);
+    *out = arr;
+    *count = n;
+    return CBM_STORE_OK;
+}
+
+/* Release an array of `count` channel records: frees every string field of
+ * each entry, then the array itself. */
+void cbm_store_free_channels(cbm_channel_info_t *arr, int count) {
+    for (int i = 0; i < count; i++) {
+        free((void *)arr[i].channel_name);
+        free((void *)arr[i].direction);
+        free((void *)arr[i].transport);
+        free((void *)arr[i].project);
+        free((void *)arr[i].file_path);
+        free((void *)arr[i].function_name);
+    }
+    free(arr);
+}
+
 /* ── ADR (Architecture Decision Record) ────────────────────────── */
 
 static const char *canonical_sections[] = {"PURPOSE", "STACK", "ARCHITECTURE",
@@ -4567,3 +5961,153 @@ void cbm_store_free_file_hashes(cbm_file_hash_t *hashes, int count) {
     }
     free(hashes);
 }
+
+/* ── Embeddings (vector search) ─────────────────────────────────── */
+
+int cbm_store_upsert_embedding(cbm_store_t *s, int64_t node_id, const char *project,
+                               const float *embedding, int dims) {
+    if (!s || !s->db || !embedding || dims <= 0) return CBM_STORE_ERR;
+
+    const char *sql =
+        "INSERT OR REPLACE INTO embeddings(node_id, project, embedding, dimensions) "
+        "VALUES(?1, ?2, ?3, ?4)";
+    sqlite3_stmt *stmt = NULL;
+    if (sqlite3_prepare_v2(s->db, sql, -1, &stmt, NULL) != SQLITE_OK) {
+        return CBM_STORE_ERR;
+    }
+    sqlite3_bind_int64(stmt, 1, node_id);
+    bind_text(stmt, 2, project);
+    sqlite3_bind_blob(stmt, 3, embedding, dims * (int)sizeof(float), SQLITE_STATIC);
+    sqlite3_bind_int(stmt, 4, dims);
+    int rc = (sqlite3_step(stmt) == SQLITE_DONE) ? 
CBM_STORE_OK : CBM_STORE_ERR;
+    sqlite3_finalize(stmt);
+    return rc;
+}
+
+/* Insert or replace `count` embeddings with one prepared statement.
+ *
+ * node_ids   — `count` node ids, one per embedding
+ * embeddings — `count` vectors of `dims` floats each, stored back to back
+ *
+ * Returns CBM_STORE_OK when every row was written. Stops at the first failing
+ * row and returns CBM_STORE_ERR; this function opens no transaction itself, so
+ * whether earlier rows persist depends on the caller. */
+int cbm_store_upsert_embedding_batch(cbm_store_t *s, const int64_t *node_ids,
+                                     const char *project, const float *embeddings,
+                                     int dims, int count) {
+    if (!s || !s->db || !node_ids || !embeddings || dims <= 0 || count <= 0) {
+        return CBM_STORE_ERR;
+    }
+
+    const char *sql =
+        "INSERT OR REPLACE INTO embeddings(node_id, project, embedding, dimensions) "
+        "VALUES(?1, ?2, ?3, ?4)";
+    sqlite3_stmt *stmt = NULL;
+    if (sqlite3_prepare_v2(s->db, sql, -1, &stmt, NULL) != SQLITE_OK) {
+        return CBM_STORE_ERR;
+    }
+
+    int blob_size = dims * (int)sizeof(float);
+    for (int i = 0; i < count; i++) {
+        /* Offset computed in size_t: i * dims as int can overflow for large batches. */
+        const float *vec = embeddings + (size_t)i * (size_t)dims;
+        sqlite3_reset(stmt);
+        sqlite3_bind_int64(stmt, 1, node_ids[i]);
+        bind_text(stmt, 2, project);
+        /* SQLITE_STATIC: the caller's buffer outlives the step below. */
+        sqlite3_bind_blob(stmt, 3, vec, blob_size, SQLITE_STATIC);
+        sqlite3_bind_int(stmt, 4, dims);
+        if (sqlite3_step(stmt) != SQLITE_DONE) {
+            sqlite3_finalize(stmt);
+            return CBM_STORE_ERR;
+        }
+    }
+    sqlite3_finalize(stmt);
+    return CBM_STORE_OK;
+}
+
+/* Count stored embeddings, for one project or (project == NULL) for all.
+ * Returns 0 when the store is unusable or the query cannot be prepared. */
+int cbm_store_count_embeddings(cbm_store_t *s, const char *project) {
+    if (!s || !s->db) return 0;
+    const char *sql = project
+        ? "SELECT COUNT(*) FROM embeddings WHERE project = ?1"
+        : "SELECT COUNT(*) FROM embeddings";
+    sqlite3_stmt *stmt = NULL;
+    if (sqlite3_prepare_v2(s->db, sql, -1, &stmt, NULL) != SQLITE_OK) return 0;
+    if (project) bind_text(stmt, 1, project);
+    int count = 0;
+    if (sqlite3_step(stmt) == SQLITE_ROW) {
+        count = sqlite3_column_int(stmt, 0);
+    }
+    sqlite3_finalize(stmt);
+    return count;
+}
+
+/* Delete every embedding of a project.
+ * The project name is bound as a parameter instead of being formatted into the
+ * SQL text, so names containing quotes are handled correctly and cannot inject
+ * SQL. Returns CBM_STORE_OK on success, CBM_STORE_ERR otherwise. */
+int cbm_store_delete_embeddings(cbm_store_t *s, const char *project) {
+    if (!s || !s->db || !project) return CBM_STORE_ERR;
+    sqlite3_stmt *stmt = NULL;
+    if (sqlite3_prepare_v2(s->db, "DELETE FROM embeddings WHERE project = ?1", -1, &stmt,
+                           NULL) != SQLITE_OK) {
+        return CBM_STORE_ERR;
+    }
+    bind_text(stmt, 1, project);
+    int rc = (sqlite3_step(stmt) == SQLITE_DONE) ? CBM_STORE_OK : CBM_STORE_ERR;
+    sqlite3_finalize(stmt);
+    return rc;
+}
+
+/* Semantic search: find top-k nodes by cosine similarity to query vector.
+ * Returns node IDs and similarity scores, ordered by similarity descending.
+ * Only searches nodes with embeddings in the given project.
+ * Filters to embeddable labels (Function, Method, Class, Interface, Route).
+ * Rows with similarity <= 0.3 are dropped; a non-positive limit means 50.
+ * Release *out with cbm_store_free_vector_results(). */
+int cbm_store_vector_search(cbm_store_t *s, const char *project,
+                            const float *query_vec, int dims, int limit,
+                            cbm_vector_result_t **out, int *out_count) {
+    if (!s || !s->db || !query_vec || dims <= 0 || !out || !out_count) {
+        return CBM_STORE_ERR;
+    }
+    *out = NULL;
+    *out_count = 0;
+
+    /* Brute-force cosine similarity scan using registered cbm_cosine_sim() */
+    const char *sql =
+        "SELECT e.node_id, n.name, n.label, n.qualified_name, n.file_path, "
+        "n.start_line, n.end_line, n.properties, "
+        "cbm_cosine_sim(?1, e.embedding) AS similarity "
+        "FROM embeddings e "
+        "JOIN nodes n ON n.id = e.node_id "
+        "WHERE e.project = ?2 "
+        "AND n.label IN ('Function','Method','Class','Interface','Route') "
+        "AND similarity > 0.3 "
+        "ORDER BY similarity DESC "
+        "LIMIT ?3";
+
+    sqlite3_stmt *stmt = NULL;
+    if (sqlite3_prepare_v2(s->db, sql, -1, &stmt, NULL) != SQLITE_OK) {
+        return CBM_STORE_ERR;
+    }
+
+    int blob_size = dims * (int)sizeof(float);
+    sqlite3_bind_blob(stmt, 1, query_vec, blob_size, SQLITE_STATIC);
+    bind_text(stmt, 2, project);
+    sqlite3_bind_int(stmt, 3, limit > 0 ? limit : 50);
+
+    /* The result array is sized to the same value bound to LIMIT above. */
+    int cap = limit > 0 ? limit : 50;
+    cbm_vector_result_t *results = calloc((size_t)cap, sizeof(cbm_vector_result_t));
+    if (!results) {
+        sqlite3_finalize(stmt);
+        return CBM_STORE_ERR;
+    }
+
+    int count = 0;
+    while (sqlite3_step(stmt) == SQLITE_ROW && count < cap) {
+        cbm_vector_result_t *r = &results[count];
+        r->node_id = sqlite3_column_int64(stmt, 0);
+        r->name = heap_strdup((const char *)sqlite3_column_text(stmt, 1));
+        r->label = heap_strdup((const char *)sqlite3_column_text(stmt, 2));
+        r->qualified_name = heap_strdup((const char *)sqlite3_column_text(stmt, 3));
+        r->file_path = heap_strdup((const char *)sqlite3_column_text(stmt, 4));
+        r->start_line = sqlite3_column_int(stmt, 5);
+        r->end_line = sqlite3_column_int(stmt, 6);
+        r->properties_json = heap_strdup((const char *)sqlite3_column_text(stmt, 7));
+        r->similarity = sqlite3_column_double(stmt, 8);
+        count++;
+    }
+    sqlite3_finalize(stmt);
+
+    *out = results;
+    *out_count = count;
+    return CBM_STORE_OK;
+}
+
+/* Release an array of `count` search results: frees every string field of each
+ * entry, then the array itself. A NULL array is ignored. */
+void cbm_store_free_vector_results(cbm_vector_result_t *results, int count) {
+    if (!results) return;
+    for (int i = 0; i < count; i++) {
+        free((void *)results[i].name);
+        free((void *)results[i].label);
+        free((void *)results[i].qualified_name);
+        free((void *)results[i].file_path);
+        free((void *)results[i].properties_json);
+    }
+    free(results);
+}
diff --git a/src/store/store.h b/src/store/store.h
index 17b0df11..17a39b25 100644
--- a/src/store/store.h
+++ b/src/store/store.h
@@ -108,6 +108,7 @@ typedef struct {
     const char *name_pattern; /* regex on name, NULL = any */
     const char *qn_pattern; /* regex on qualified_name, NULL = any */
     const char *file_pattern; /* glob on file_path, NULL = any */
+    const char *query; /* free-text BM25 query via FTS5, NULL = disabled */
     const char *relationship; /* edge type filter, NULL = any */
     const char *direction; /* "inbound" / "outbound" / "any", NULL = any */
     int min_degree; /* -1 = no filter (default), 0+ = minimum */
@@ -209,6 +210,9 @@ cbm_store_t *cbm_store_open(const char 
/* Close the store and free all resources. NULL-safe. */
void cbm_store_close(cbm_store_t *s);

/* Execute a raw SQL statement (for DDL, DML, etc.). */
int cbm_store_exec(cbm_store_t *s, const char *sql);

/* Get the underlying sqlite3 handle (for testing only). */
struct sqlite3 *cbm_store_get_db(cbm_store_t *s);

void cbm_store_architecture_free(cbm_architecture_info_t *out);

/* ── Processes (execution flows) ─────────────────────────────────── */

/* Summary record for one detected execution flow. */
typedef struct {
    int64_t id;
    const char *label;        /* "EntryPoint → Terminal" */
    const char *process_type; /* "cross_community" or "intra_community" */
    int step_count;
    int64_t entry_point_id;
    int64_t terminal_id;
} cbm_process_info_t;

/* One node visited by a process; `step` is its position in the flow. */
typedef struct {
    int64_t node_id;
    const char *name;
    const char *qualified_name;
    const char *file_path;
    int step;
} cbm_process_step_t;

/* List / inspect processes. Results are returned through out/count and are
 * released with the matching cbm_store_free_* function below. */
int cbm_store_list_processes(cbm_store_t *s, const char *project,
                             cbm_process_info_t **out, int *count);
int cbm_store_get_process_steps(cbm_store_t *s, int64_t process_id,
                                cbm_process_step_t **out, int *count);
void cbm_store_free_processes(cbm_process_info_t *arr, int count);
void cbm_store_free_process_steps(cbm_process_step_t *arr, int count);

/* Detect execution flows from entry points via BFS + Louvain community crossing. */
int cbm_store_detect_processes(cbm_store_t *s, const char *project, int max_processes);

/* ── Channels (cross-service message tracing) ────────────────────── */

/* One emit/listen site. cbm_store_free_channels() frees each string field
 * individually, so arrays handed to it must own heap copies of every string. */
typedef struct {
    const char *channel_name;
    const char *direction; /* "emit" or "listen" */
    const char *transport; /* "socketio", "eventemitter" */
    const char *project;
    const char *file_path;
    const char *function_name;
} cbm_channel_info_t;

/* Detect channel emit/listen patterns in indexed source files.
 * Reads source from disk for JS/TS/Python files and scans for
 * socket.emit/on, emitter.emit/on patterns. */
int cbm_store_detect_channels(cbm_store_t *s, const char *project, const char *repo_path);

/* Query channels by name (partial match). If channel is NULL, returns all.
 * If project is NULL, searches across all loaded projects.
 * Release the returned array with cbm_store_free_channels(). */
int cbm_store_find_channels(cbm_store_t *s, const char *project, const char *channel,
                            cbm_channel_info_t **out, int *count);
void cbm_store_free_channels(cbm_channel_info_t *arr, int count);

/* ── ADR (Architecture Decision Record) ────────────────────────── */

#define CBM_ADR_MAX_LENGTH 8000

/* Free an array of file hashes. */
void cbm_store_free_file_hashes(cbm_file_hash_t *hashes, int count);

/* ── Embeddings (semantic vector search) ─────────────────────────── */

/* One hit of cbm_store_vector_search(). All string fields are heap copies
 * owned by the result array; cbm_store_free_vector_results() frees them. */
typedef struct {
    int64_t node_id;
    const char *name;
    const char *label;
    const char *qualified_name;
    const char *file_path;
    int start_line;
    int end_line;
    const char *properties_json;
    double similarity; /* cosine similarity to query vector (0.0 - 1.0);
                        * the search only returns rows scoring above 0.3 */
} cbm_vector_result_t;

/* Store a single embedding for a node. embedding is float32[dims].
 * An existing row for the node is replaced (INSERT OR REPLACE).
 * Returns CBM_STORE_OK or CBM_STORE_ERR. */
int cbm_store_upsert_embedding(cbm_store_t *s, int64_t node_id, const char *project,
                               const float *embedding, int dims);

/* Store embeddings in batch. embeddings is a flat float32[count * dims] array;
 * row i belongs to node_ids[i]. Stops at the first row that fails and
 * returns CBM_STORE_ERR; returns CBM_STORE_OK when every row was written. */
int cbm_store_upsert_embedding_batch(cbm_store_t *s, const int64_t *node_ids,
                                     const char *project, const float *embeddings,
                                     int dims, int count);

/* Count embeddings for a project (or all projects if project is NULL).
 * Returns 0 when the store is NULL or the query cannot be prepared. */
int cbm_store_count_embeddings(cbm_store_t *s, const char *project);

/* Delete all embeddings for a project. project must not be NULL
 * (CBM_STORE_ERR is returned otherwise). */
int cbm_store_delete_embeddings(cbm_store_t *s, const char *project);

/* Semantic search: top-k nodes by cosine similarity to query vector.
 * Brute-force scan — fast enough for <100K vectors at 384-768 dims.
 * Only nodes labelled Function, Method, Class, Interface or Route are
 * searched. A limit <= 0 selects the default of 50 results.
 * Returns allocated array via out/out_count. Caller must free with
 * cbm_store_free_vector_results(). */
int cbm_store_vector_search(cbm_store_t *s, const char *project,
                            const float *query_vec, int dims, int limit,
                            cbm_vector_result_t **out, int *out_count);

/* Free a result array from cbm_store_vector_search(). NULL-safe. */
void cbm_store_free_vector_results(cbm_vector_result_t *results, int count);

#endif /* CBM_STORE_H */