From c2513c089f3a462b4200149487edb526c4acbda9 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 28 Jun 2026 01:08:00 +0000
Subject: [PATCH 01/18] Port graphrag-rs core pipeline to Swift

Reimplement the core of the Rust crate graphrag-rs
(https://github.com/automataIA/graphrag-rs) as an idiomatic, dependency-free
Swift 6 package.

Modules:
- Core: Document/TextChunk/Entity/Relationship model, typed IDs, GraphRAGError,
  and pluggable protocols (LanguageModel, EmbeddingModel, EntityExtracting,
  ChunkingStrategy).
- Text: HierarchicalChunker, TextProcessor, TfIdfKeywordExtractor.
- Graph: KnowledgeGraph (value-type adjacency), PageRank, traversal
  (BFS/DFS/ego/paths), analytics (degree/closeness/betweenness/components).
- Retrieval: BM25Retriever, cosine InMemoryVectorStore, HybridRetriever with
  RRF/weighted/CombSUM/MaxScore fusion.
- Entity: PatternEntityExtractor (offline) and LLMEntityExtractor with the
  upstream extraction prompts and staged JSON-recovery parsing.
- Embeddings: deterministic offline HashEmbedder and OllamaEmbedder; OllamaClient
  for LLM completion.
- Orchestration: GraphRAG actor (ingest -> build -> ask), GraphRAGBuilder, Config.

Defaults mirror the Rust crate (PageRank d=0.85/tol=1e-6, BM25 k1=1.2/b=0.75,
hybrid RRF k=60, weights 0.7/0.3, traversal depth 3, min strength 0.5). Runs
fully offline by default; optional local Ollama for LLM-backed extraction and
answer generation. Includes Swift Testing coverage and a README.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01VUzx3BzstYuB68txcHvAUh
---
 README.md                                     |  98 +++++++
 Sources/GraphRAG/Core/Error.swift             |  85 ++++++
 Sources/GraphRAG/Core/Identifiers.swift       |  39 +++
 Sources/GraphRAG/Core/Models.swift            | 158 +++++++++++
 Sources/GraphRAG/Core/Protocols.swift         |  59 +++++
 Sources/GraphRAG/Core/Types.swift             | 113 ++++++++
 .../GraphRAG/Embeddings/HashEmbedder.swift    |  70 +++++
 Sources/GraphRAG/Embeddings/Ollama.swift      | 193 ++++++++++++++
 Sources/GraphRAG/Entity/LLMExtractor.swift    | 220 ++++++++++++++++
 .../GraphRAG/Entity/PatternExtractor.swift    | 247 ++++++++++++++++++
 Sources/GraphRAG/Entity/Prompts.swift         | 131 ++++++++++
 Sources/GraphRAG/Graph/Analytics.swift        | 205 +++++++++++++++
 Sources/GraphRAG/Graph/KnowledgeGraph.swift   | 229 ++++++++++++++++
 Sources/GraphRAG/Graph/PageRank.swift         |  88 +++++++
 Sources/GraphRAG/Graph/Traversal.swift        | 193 ++++++++++++++
 Sources/GraphRAG/GraphRAG.swift               |  42 ++-
 Sources/GraphRAG/GraphRAG/Builder.swift       | 139 ++++++++++
 Sources/GraphRAG/GraphRAG/Config.swift        |  95 +++++++
 Sources/GraphRAG/GraphRAG/Engine.swift        | 180 +++++++++++++
 Sources/GraphRAG/Retrieval/BM25.swift         | 137 ++++++++++
 Sources/GraphRAG/Retrieval/Hybrid.swift       | 207 +++++++++++++++
 Sources/GraphRAG/Retrieval/VectorStore.swift  |  84 ++++++
 Sources/GraphRAG/Text/Chunking.swift          | 237 +++++++++++++++++
 Sources/GraphRAG/Text/KeywordExtraction.swift | 100 +++++++
 Tests/GraphRAGTests/GraphRAGTests.swift       | 222 +++++++++++++++-
 25 files changed, 3565 insertions(+), 6 deletions(-)
 create mode 100644 README.md
 create mode 100644 Sources/GraphRAG/Core/Error.swift
 create mode 100644 Sources/GraphRAG/Core/Identifiers.swift
 create mode 100644 Sources/GraphRAG/Core/Models.swift
 create mode 100644 Sources/GraphRAG/Core/Protocols.swift
 create mode 100644 Sources/GraphRAG/Core/Types.swift
 create mode 100644 Sources/GraphRAG/Embeddings/HashEmbedder.swift
 create mode 100644 Sources/GraphRAG/Embeddings/Ollama.swift
 create mode 100644 Sources/GraphRAG/Entity/LLMExtractor.swift
 create mode 100644 Sources/GraphRAG/Entity/PatternExtractor.swift
 create mode 100644 Sources/GraphRAG/Entity/Prompts.swift
 create mode 100644 Sources/GraphRAG/Graph/Analytics.swift
 create mode 100644 Sources/GraphRAG/Graph/KnowledgeGraph.swift
 create mode 100644 Sources/GraphRAG/Graph/PageRank.swift
 create mode 100644 Sources/GraphRAG/Graph/Traversal.swift
 create mode 100644 Sources/GraphRAG/GraphRAG/Builder.swift
 create mode 100644 Sources/GraphRAG/GraphRAG/Config.swift
 create mode 100644 Sources/GraphRAG/GraphRAG/Engine.swift
 create mode 100644 Sources/GraphRAG/Retrieval/BM25.swift
 create mode 100644 Sources/GraphRAG/Retrieval/Hybrid.swift
 create mode 100644 Sources/GraphRAG/Retrieval/VectorStore.swift
 create mode 100644 Sources/GraphRAG/Text/Chunking.swift
 create mode 100644 Sources/GraphRAG/Text/KeywordExtraction.swift

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..c9b3632
--- /dev/null
+++ b/README.md
@@ -0,0 +1,98 @@
+# GraphRAG (Swift)
+
+A Swift port of the Rust crate [`graphrag-rs`](https://github.com/automataIA/graphrag-rs):
+Graph-based Retrieval Augmented Generation. It builds a knowledge graph from
+documents and answers natural-language questions using graph-based context
+retrieval.
+
+This package ports the **core library** (`graphrag-core`) — the parts that make
+GraphRAG work end to end — into idiomatic, Swift 6, dependency-free code. It runs
+fully offline out of the box, and can optionally talk to a local
+[Ollama](https://ollama.com) server for LLM-backed extraction and answer
+generation.
+
+## Installation
+
+Add the package to your `Package.swift`:
+
+```swift
+.package(url: "https://github.com/picomlx/graphrag.git", branch: "main")
+```
+
+and depend on the `GraphRAG` product.
+
+## Quick start
+
+```swift
+import GraphRAG
+
+// Offline pipeline: hash embeddings + pattern-based entity extraction.
+let rag = try GraphRAGBuilder()
+    .withChunkSize(800)
+    .withChunkOverlap(100)
+    .withTopK(5)
+    .build()
+
+await rag.addDocument(text: """
+    Ada Lovelace collaborated with Charles Babbage on the Analytical Engine,
+    an early mechanical general-purpose computer.
+    """)
+
+try await rag.build()                       // chunk → extract → embed → index
+let answer = try await rag.ask("Who worked on the Analytical Engine?")
+print(answer.text)
+print(answer.sources)                        // grounding chunk ids
+```
+
+### Using a local LLM (Ollama)
+
+```swift
+let rag = try GraphRAGBuilder()
+    .withLocalDefaults()                      // Ollama chat + embeddings
+    .build()
+```
+
+With Ollama enabled, entity/relationship extraction uses the LLM extraction
+prompt, and `ask` synthesizes a natural-language answer from the retrieved
+context. Without it, extraction is pattern-based and `ask` returns an extractive
+summary of the top chunks.
+
+## What's included
+
+| Area | Types |
+| --- | --- |
+| Core model | `Document`, `TextChunk`, `Entity`, `Relationship`, `EntityMention`, typed IDs, `GraphRAGError` |
+| Abstractions | `LanguageModel`, `EmbeddingModel`, `EntityExtracting`, `ChunkingStrategy` |
+| Text | `HierarchicalChunker`, `TextProcessor`, `TfIdfKeywordExtractor` |
+| Graph | `KnowledgeGraph`, `PageRank`, `GraphTraversal` (BFS/DFS/ego/paths), `GraphAnalytics` (degree/closeness/betweenness/components) |
+| Retrieval | `BM25Retriever`, `InMemoryVectorStore` (cosine), `HybridRetriever` (RRF / weighted / CombSUM / MaxScore fusion) |
+| Extraction | `PatternEntityExtractor`, `LLMEntityExtractor`, `Prompts` |
+| Embeddings | `HashEmbedder` (offline, deterministic), `OllamaEmbedder` |
+| LLM | `OllamaClient` |
+| Orchestration | `GraphRAG` (actor), `GraphRAGBuilder`, `Config` |
+
+## Design notes / port fidelity
+
+- **Defaults match the Rust crate**: PageRank damping `0.85` / tolerance `1e-6`,
+  BM25 `k1 = 1.2`, `b = 0.75`, hybrid `RRF k = 60`, semantic/keyword weights
+  `0.7 / 0.3`, traversal `maxDepth = 3`, min relationship strength `0.5`, etc.
+- **Concurrency**: `GraphRAG` is an `actor`; backends are `Sendable` existentials
+  (`any EmbeddingModel`, `any LanguageModel`, `any EntityExtracting`). Builds
+  cleanly under Swift 6 strict concurrency.
+- **Unicode safety**: the Rust chunker works on UTF-8 byte offsets guarded by
+  `is_char_boundary`. This port operates on `Character` (grapheme) arrays, which
+  are always valid boundaries; sizes and offsets are measured in characters.
+- **Scope**: this is the portable core pipeline. The Rust workspace's
+  server/WASM/CLI crates and heavier optional subsystems (LightRAG, ROGRAG,
+  Leiden communities, distributed caching, persistence backends) are out of
+  scope for this port.
+
+## Testing
+
+```bash
+swift test
+```
+
+The suite covers chunking, keyword extraction, BM25 ranking, cosine/vector
+search, the knowledge graph, PageRank, traversal, analytics, pattern extraction,
+and the end-to-end offline build/ask pipeline.
diff --git a/Sources/GraphRAG/Core/Error.swift b/Sources/GraphRAG/Core/Error.swift
new file mode 100644
index 0000000..a03944c
--- /dev/null
+++ b/Sources/GraphRAG/Core/Error.swift
@@ -0,0 +1,85 @@
+// Error.swift
+// Ported from graphrag-rs `core::error::GraphRAGError`.
+
+import Foundation
+
+/// The unified error type for every fallible GraphRAG operation.
+///
+/// Mirrors the variants of the Rust `GraphRAGError` enum. Each case carries a
+/// human-readable message (and, where relevant, structured fields) so callers
+/// can pattern-match or surface a description.
+public enum GraphRAGError: Error, Sendable, CustomStringConvertible, LocalizedError {
+    case config(message: String)
+    case notInitialized
+    case noDocuments
+    case io(message: String)
+    case http(message: String)
+    case json(message: String)
+    case textProcessing(message: String)
+    case graphConstruction(message: String)
+    case vectorSearch(message: String)
+    case entityExtraction(message: String)
+    case retrieval(message: String)
+    case generation(message: String)
+    case functionCall(message: String)
+    case storage(message: String)
+    case embedding(message: String)
+    case languageModel(message: String)
+    case parallel(message: String)
+    case serialization(message: String)
+    case validation(message: String)
+    case network(message: String)
+    case auth(message: String)
+    case notFound(resource: String, id: String)
+    case alreadyExists(resource: String, id: String)
+    case timeout(operation: String, seconds: Double)
+    case resourceLimit(resource: String, limit: Int)
+    case dataCorruption(message: String)
+    case unsupported(operation: String, reason: String)
+    case rateLimit(message: String)
+    case conflictResolution(message: String)
+    case incrementalUpdate(message: String)
+
+    /// `LocalizedError` hook: without it, `localizedDescription` on this enum
+    /// yields Foundation's generic "operation couldn't be completed" text
+    /// instead of the case-specific message below.
+    public var errorDescription: String? { description }
+
+    public var description: String {
+        switch self {
+        case .config(let m): return "Configuration error: \(m)"
+        case .notInitialized: return "GraphRAG system is not initialized"
+        case .noDocuments: return "No documents have been added"
+        case .io(let m): return "I/O error: \(m)"
+        case .http(let m): return "HTTP error: \(m)"
+        case .json(let m): return "JSON error: \(m)"
+        case .textProcessing(let m): return "Text processing error: \(m)"
+        case .graphConstruction(let m): return "Graph construction error: \(m)"
+        case .vectorSearch(let m): return "Vector search error: \(m)"
+        case .entityExtraction(let m): return "Entity extraction error: \(m)"
+        case .retrieval(let m): return "Retrieval error: \(m)"
+        case .generation(let m): return "Generation error: \(m)"
+        case .functionCall(let m): return "Function call error: \(m)"
+        case .storage(let m): return "Storage error: \(m)"
+        case .embedding(let m): return "Embedding error: \(m)"
+        case .languageModel(let m): return "Language model error: \(m)"
+        case .parallel(let m): return "Parallel processing error: \(m)"
+        case .serialization(let m): return "Serialization error: \(m)"
+        case .validation(let m): return "Validation error: \(m)"
+        case .network(let m): return "Network error: \(m)"
+        case .auth(let m): return "Authentication error: \(m)"
+        case .notFound(let resource, let id): return "\(resource) not found: \(id)"
+        case .alreadyExists(let resource, let id): return "\(resource) already exists: \(id)"
+        case .timeout(let operation, let seconds): return "Operation '\(operation)' timed out after \(seconds)s"
+        case .resourceLimit(let resource, let limit): return "Resource limit exceeded for \(resource): \(limit)"
+        case .dataCorruption(let m): return "Data corruption: \(m)"
+        case .unsupported(let operation, let reason): return "Unsupported operation '\(operation)': \(reason)"
+        case .rateLimit(let m): return "Rate limit exceeded: \(m)"
+        case .conflictResolution(let m): return "Conflict resolution error: \(m)"
+        case .incrementalUpdate(let m): return "Incremental update error: \(m)"
+        }
+    }
+}
+
+/// Convenience matching the Rust `pub type Result<T> = ...` alias.
+public typealias GraphRAGResult<T> = Swift.Result<T, GraphRAGError>
diff --git a/Sources/GraphRAG/Core/Identifiers.swift b/Sources/GraphRAG/Core/Identifiers.swift
new file mode 100644
index 0000000..03671c0
--- /dev/null
+++ b/Sources/GraphRAG/Core/Identifiers.swift
@@ -0,0 +1,39 @@
+// Identifiers.swift
+// Strongly-typed identifier wrappers, ported from graphrag-rs `core::DocumentId`,
+// `core::EntityId` and `core::ChunkId`.
+
+/// Stable identifier for a `Document`.
+public struct DocumentID: Hashable, Codable, Sendable, CustomStringConvertible,
+    ExpressibleByStringLiteral
+{
+    public var raw: String
+
+    public init(_ raw: String) { self.raw = raw }
+    public init(stringLiteral value: String) { self.raw = value }
+
+    public var description: String { raw }
+}
+
+/// Stable identifier for an `Entity`.
+public struct EntityID: Hashable, Codable, Sendable, CustomStringConvertible,
+    ExpressibleByStringLiteral
+{
+    public var raw: String
+
+    public init(_ raw: String) { self.raw = raw }
+    public init(stringLiteral value: String) { self.raw = value }
+
+    public var description: String { raw }
+}
+
+/// Stable identifier for a `TextChunk`.
+public struct ChunkID: Hashable, Codable, Sendable, CustomStringConvertible,
+    ExpressibleByStringLiteral
+{
+    public var raw: String
+
+    public init(_ raw: String) { self.raw = raw }
+    public init(stringLiteral value: String) { self.raw = value }
+
+    public var description: String { raw }
+}
diff --git a/Sources/GraphRAG/Core/Models.swift b/Sources/GraphRAG/Core/Models.swift
new file mode 100644
index 0000000..8cefbea
--- /dev/null
+++ b/Sources/GraphRAG/Core/Models.swift
@@ -0,0 +1,158 @@
+// Models.swift
+// Core domain model, ported from graphrag-rs `core::mod`.
+
+import Foundation
+
+/// Optional metadata attached to a chunk during enrichment.
+///
+/// In the Rust source this is a dedicated `ChunkMetadata` struct; here it keeps
+/// the most useful fields plus an open key/value bag for extensions.
+public struct ChunkMetadata: Codable, Sendable, Equatable {
+    /// Zero-based index of the chunk within its source document.
+    public var index: Int
+    /// Approximate token / word count of the chunk content.
+    public var wordCount: Int
+    /// Keywords extracted from the chunk, if any.
+    public var keywords: [String]
+    /// Arbitrary extra fields.
+    public var extra: [String: String]
+
+    public init(
+        index: Int = 0,
+        wordCount: Int = 0,
+        keywords: [String] = [],
+        extra: [String: String] = [:]
+    ) {
+        self.index = index
+        self.wordCount = wordCount
+        self.keywords = keywords
+        self.extra = extra
+    }
+}
+
+/// A contiguous span of a document produced by the chunking stage.
+public struct TextChunk: Codable, Sendable, Identifiable, Equatable {
+    public var id: ChunkID
+    public var documentID: DocumentID
+    public var content: String
+    /// Character (grapheme) offset of the chunk start within the original document content.
+    public var startOffset: Int
+    /// Character (grapheme) offset of the chunk end within the original document content.
+    public var endOffset: Int
+    /// Optional dense embedding for semantic search.
+    public var embedding: [Float]?
+    /// Entities mentioned within this chunk.
+    public var entities: [EntityID]
+    public var metadata: ChunkMetadata
+
+    public init(
+        id: ChunkID,
+        documentID: DocumentID,
+        content: String,
+        startOffset: Int,
+        endOffset: Int,
+        embedding: [Float]? = nil,
+        entities: [EntityID] = [],
+        metadata: ChunkMetadata = ChunkMetadata()
+    ) {
+        self.id = id
+        self.documentID = documentID
+        self.content = content
+        self.startOffset = startOffset
+        self.endOffset = endOffset
+        self.embedding = embedding
+        self.entities = entities
+        self.metadata = metadata
+    }
+}
+
+/// A source document and its derived chunks.
+public struct Document: Codable, Sendable, Identifiable, Equatable {
+    public var id: DocumentID
+    public var title: String
+    public var content: String
+    public var metadata: [String: String]
+    public var chunks: [TextChunk]
+
+    public init(
+        id: DocumentID,
+        title: String,
+        content: String,
+        metadata: [String: String] = [:],
+        chunks: [TextChunk] = []
+    ) {
+        self.id = id
+        self.title = title
+        self.content = content
+        self.metadata = metadata
+        self.chunks = chunks
+    }
+}
+
+/// A single mention (occurrence) of an entity inside a chunk.
+public struct EntityMention: Codable, Sendable, Equatable {
+    public var chunkID: ChunkID
+    public var startOffset: Int
+    public var endOffset: Int
+    public var confidence: Float
+
+    public init(chunkID: ChunkID, startOffset: Int, endOffset: Int, confidence: Float) {
+        self.chunkID = chunkID
+        self.startOffset = startOffset
+        self.endOffset = endOffset
+        self.confidence = confidence
+    }
+}
+
+/// A node in the knowledge graph.
+public struct Entity: Codable, Sendable, Identifiable, Equatable {
+    public var id: EntityID
+    public var name: String
+    public var entityType: String
+    public var confidence: Float
+    public var mentions: [EntityMention]
+    public var embedding: [Float]?
+
+    public init(
+        id: EntityID,
+        name: String,
+        entityType: String,
+        confidence: Float = 1.0,
+        mentions: [EntityMention] = [],
+        embedding: [Float]? = nil
+    ) {
+        self.id = id
+        self.name = name
+        self.entityType = entityType
+        self.confidence = confidence
+        self.mentions = mentions
+        self.embedding = embedding
+    }
+}
+
+/// A directed, typed edge between two entities.
+public struct Relationship: Codable, Sendable, Equatable {
+    public var source: EntityID
+    public var target: EntityID
+    public var relationType: String
+    public var confidence: Float
+    /// Chunks that provide evidence for this relationship.
+    public var context: [ChunkID]
+    public var embedding: [Float]?
+
+    public init(
+        source: EntityID,
+        target: EntityID,
+        relationType: String,
+        confidence: Float = 1.0,
+        context: [ChunkID] = [],
+        embedding: [Float]? = nil
+    ) {
+        self.source = source
+        self.target = target
+        self.relationType = relationType
+        self.confidence = confidence
+        self.context = context
+        self.embedding = embedding
+    }
+}
diff --git a/Sources/GraphRAG/Core/Protocols.swift b/Sources/GraphRAG/Core/Protocols.swift
new file mode 100644
index 0000000..155ddd4
--- /dev/null
+++ b/Sources/GraphRAG/Core/Protocols.swift
@@ -0,0 +1,59 @@
+// Protocols.swift
+// Pluggable abstractions, ported from graphrag-rs `core::traits`.
+//
+// The Rust crate exposes both synchronous and async variants of each trait.
+// In Swift we model the async variants (the ones the pipeline actually uses)
+// with `async` requirements and require `Sendable` so implementations can cross
+// concurrency domains.
+
+/// A text-generation backend (the "LLM").
+public protocol LanguageModel: Sendable {
+    /// Complete `prompt` with default parameters.
+    func complete(_ prompt: String) async throws -> String
+    /// Complete `prompt` with explicit generation parameters.
+    func complete(_ prompt: String, params: GenerationParams) async throws -> String
+    /// Whether the backend is reachable / configured.
+    func isAvailable() async -> Bool
+    /// Static model description.
+    var modelInfo: ModelInfo { get }
+}
+
+extension LanguageModel {
+    public func complete(_ prompt: String) async throws -> String {
+        try await complete(prompt, params: .default)
+    }
+}
+
+/// An embedding backend that turns text into dense vectors.
+public protocol EmbeddingModel: Sendable {
+    /// Embed a single string.
+    func embed(_ text: String) async throws -> [Float]
+    /// Embed a batch of strings (default: sequential `embed`).
+    func embedBatch(_ texts: [String]) async throws -> [[Float]]
+    /// Dimensionality of produced vectors.
+    var dimension: Int { get }
+    /// Whether the backend is ready.
+    func isAvailable() async -> Bool
+}
+
+extension EmbeddingModel {
+    /// Default batch implementation: embeds each text in input order with one
+    /// `embed` call at a time, rethrowing the first error encountered.
+    public func embedBatch(_ texts: [String]) async throws -> [[Float]] {
+        var vectors = [[Float]]()
+        vectors.reserveCapacity(texts.count)
+        for item in texts { vectors.append(try await embed(item)) }
+        return vectors
+    }
+}
+
+/// A strategy that splits raw text into chunks.
+public protocol ChunkingStrategy: Sendable {
+    /// Split `text` belonging to `documentID` into ordered chunks.
+    func chunk(_ text: String, documentID: DocumentID) -> [TextChunk]
+}
+
+/// Extracts entities (and optionally relationships) from text.
+public protocol EntityExtracting: Sendable {
+    func extract(from chunk: TextChunk) async throws -> (entities: [Entity], relationships: [Relationship])
+}
diff --git a/Sources/GraphRAG/Core/Types.swift b/Sources/GraphRAG/Core/Types.swift
new file mode 100644
index 0000000..1f6d737
--- /dev/null
+++ b/Sources/GraphRAG/Core/Types.swift
@@ -0,0 +1,113 @@
+// Types.swift
+// Shared supporting value types, ported from graphrag-rs `core::traits` helpers.
+
+import Foundation
+
+/// A single hit returned by a vector store search.
+public struct SearchResult: Sendable, Equatable {
+    public var id: String
+    /// Distance (lower is closer) — for cosine stores this is `1 - similarity`.
+    public var distance: Float
+    public var metadata: [String: String]?
+
+    public init(id: String, distance: Float, metadata: [String: String]? = nil) {
+        self.id = id
+        self.distance = distance
+        self.metadata = metadata
+    }
+
+    /// Convenience similarity score for cosine-based stores.
+    public var similarity: Float { 1.0 - distance }
+}
+
+/// Knobs passed to a `LanguageModel` completion call.
+public struct GenerationParams: Sendable, Equatable {
+    public var maxTokens: Int?
+    public var temperature: Float?
+    public var topP: Float?
+    public var stopSequences: [String]?
+
+    public init(
+        maxTokens: Int? = nil,
+        temperature: Float? = nil,
+        topP: Float? = nil,
+        stopSequences: [String]? = nil
+    ) {
+        self.maxTokens = maxTokens
+        self.temperature = temperature
+        self.topP = topP
+        self.stopSequences = stopSequences
+    }
+
+    public static let `default` = GenerationParams()
+}
+
+/// Static description of a language model.
+public struct ModelInfo: Sendable, Equatable {
+    public var name: String
+    public var version: String?
+    public var maxContextLength: Int?
+    public var supportsStreaming: Bool
+
+    public init(
+        name: String,
+        version: String? = nil,
+        maxContextLength: Int? = nil,
+        supportsStreaming: Bool = false
+    ) {
+        self.name = name
+        self.version = version
+        self.maxContextLength = maxContextLength
+        self.supportsStreaming = supportsStreaming
+    }
+}
+
+/// Aggregate counts describing a graph.
+public struct GraphStats: Sendable, Equatable {
+    public var nodeCount: Int
+    public var edgeCount: Int
+    public var averageDegree: Float
+    public var maxDepth: Int
+
+    public init(nodeCount: Int, edgeCount: Int, averageDegree: Float, maxDepth: Int) {
+        self.nodeCount = nodeCount
+        self.edgeCount = edgeCount
+        self.averageDegree = averageDegree
+        self.maxDepth = maxDepth
+    }
+}
+
+/// Counts produced by `GraphRAG.stats()`.
+public struct Stats: Sendable, Equatable {
+    public var documentCount: Int
+    public var chunkCount: Int
+    public var entityCount: Int
+    public var relationshipCount: Int
+
+    public init(
+        documentCount: Int = 0,
+        chunkCount: Int = 0,
+        entityCount: Int = 0,
+        relationshipCount: Int = 0
+    ) {
+        self.documentCount = documentCount
+        self.chunkCount = chunkCount
+        self.entityCount = entityCount
+        self.relationshipCount = relationshipCount
+    }
+}
+
+/// The result of an `ask` query.
+public struct Answer: Sendable, Equatable {
+    public var text: String
+    /// Confidence in `[0, 1]`, when available.
+    public var confidence: Float
+    /// Chunk identifiers used to ground the answer.
+    public var sources: [ChunkID]
+
+    public init(text: String, confidence: Float = 0.0, sources: [ChunkID] = []) {
+        self.text = text
+        self.confidence = confidence
+        self.sources = sources
+    }
+}
diff --git a/Sources/GraphRAG/Embeddings/HashEmbedder.swift b/Sources/GraphRAG/Embeddings/HashEmbedder.swift
new file mode 100644
index 0000000..579e8ab
--- /dev/null
+++ b/Sources/GraphRAG/Embeddings/HashEmbedder.swift
@@ -0,0 +1,70 @@
+// HashEmbedder.swift
+// Offline, deterministic embedding backend (the default in graphrag-rs when no
+// neural/remote provider is configured).
+//
+// Each token is hashed (FNV-1a, stable across runs) into a bucket with a signed
+// contribution; the accumulated vector is L2-normalized. Texts sharing tokens
+// land near each other under cosine similarity, which is enough to drive the
+// retrieval pipeline without any model download or network call.
+
+import Foundation
+
+/// Offline `EmbeddingModel` that hashes each token into one of `dimension`
+/// buckets with a +1/-1 contribution and L2-normalizes the result.
+public struct HashEmbedder: EmbeddingModel {
+    public let dimension: Int
+
+    public init(dimension: Int = 384) {
+        self.dimension = max(1, dimension)
+    }
+
+    public func isAvailable() async -> Bool { true }
+
+    public func embed(_ text: String) async throws -> [Float] {
+        embedSync(text)
+    }
+
+    /// Synchronous variant (the hashing is pure and cheap).
+    public func embedSync(_ text: String) -> [Float] {
+        var vector = [Float](repeating: 0, count: dimension)
+        let tokens = tokenize(text)
+        guard !tokens.isEmpty else { return vector }
+
+        for token in tokens {
+            let hash = HashEmbedder.fnv1a(token)
+            let bucket = Int(hash % UInt64(dimension))
+            // Sign comes from the top bit: the low bit fixes the parity of
+            // `hash % dimension` for even dimensions, tying sign to bucket.
+            let sign: Float = (hash >> 63) == 0 ? 1 : -1
+            vector[bucket] += sign
+        }
+
+        // L2 normalize.
+        let norm = vector.reduce(Float(0)) { $0 + $1 * $1 }.squareRoot()
+        guard norm > 0 else { return vector }
+        return vector.map { $0 / norm }
+    }
+
+    private func tokenize(_ text: String) -> [String] {
+        var tokens: [String] = []
+        for rawWord in text.split(whereSeparator: { $0.isWhitespace }) {
+            var cleaned = ""
+            for ch in rawWord where ch.isLetter || ch.isNumber {
+                cleaned.append(contentsOf: ch.lowercased())
+            }
+            if !cleaned.isEmpty { tokens.append(cleaned) }
+        }
+        return tokens
+    }
+
+    /// 64-bit FNV-1a hash — stable across processes (unlike Swift's `Hasher`).
+    static func fnv1a(_ string: String) -> UInt64 {
+        var hash: UInt64 = 0xcbf2_9ce4_8422_2325
+        let prime: UInt64 = 0x0000_0100_0000_01b3
+        for byte in string.utf8 {
+            hash ^= UInt64(byte)
+            hash = hash &* prime
+        }
+        return hash
+    }
+}
diff --git a/Sources/GraphRAG/Embeddings/Ollama.swift b/Sources/GraphRAG/Embeddings/Ollama.swift
new file mode 100644
index 0000000..228f43b
--- /dev/null
+++ b/Sources/GraphRAG/Embeddings/Ollama.swift
@@ -0,0 +1,193 @@
+// Ollama.swift
+// Ported from graphrag-rs `ollama` and `embeddings::ollama`.
+//
+// Talks to a local Ollama daemon over HTTP: `/api/generate` for completions and
+// `/api/embeddings` for embeddings. Network calls go through URLSession.
+
+import Foundation
+
+#if canImport(FoundationNetworking)
+    import FoundationNetworking
+#endif
+
+/// Connection + generation settings for a local Ollama server.
+public struct OllamaConfig: Sendable {
+    public var host: String  // scheme + host, e.g. "http://localhost"
+    public var port: Int
+    public var chatModel: String  // sent as `model` to `/api/generate`
+    public var embeddingModel: String  // sent as `model` to `/api/embeddings`
+    public var embeddingDimension: Int  // NOTE(review): confirm the default matches `embeddingModel`
+    public var temperature: Float  // used when a request sets no temperature
+    public var maxTokens: Int  // used as `num_predict` when a request sets no limit
+    public var timeoutSeconds: Double  // per-request timeout
+    public var keepAlive: String?  // forwarded as `keep_alive` when set
+    public var numCtx: Int?  // forwarded as `num_ctx` when set
+
+    public init(
+        host: String = "http://localhost",
+        port: Int = 11434,
+        chatModel: String = "llama3.2:3b",
+        embeddingModel: String = "nomic-embed-text",
+        embeddingDimension: Int = 1024,
+        temperature: Float = 0.7,
+        maxTokens: Int = 2000,
+        timeoutSeconds: Double = 30,
+        keepAlive: String? = nil,
+        numCtx: Int? = nil
+    ) {
+        self.host = host
+        self.port = port
+        self.chatModel = chatModel
+        self.embeddingModel = embeddingModel
+        self.embeddingDimension = embeddingDimension
+        self.temperature = temperature
+        self.maxTokens = maxTokens
+        self.timeoutSeconds = timeoutSeconds
+        self.keepAlive = keepAlive
+        self.numCtx = numCtx
+    }
+    /// `host:port`; one trailing "/" on `host` is dropped so paths can be appended.
+    var baseURL: String { "\(host.hasSuffix("/") ? String(host.dropLast()) : host):\(port)" }
+}
+
+/// Shared low-level HTTP helpers for the Ollama REST API.
+enum OllamaHTTP {
+    /// Serialize a JSON object to `Data` (synchronous; nothing crosses an await).
+    static func encode(_ body: [String: Any]) throws -> Data {
+        do {
+            return try JSONSerialization.data(withJSONObject: body)
+        } catch {
+            throw GraphRAGError.serialization(message: error.localizedDescription)
+        }
+    }
+
+    /// POST `jsonBody` to `urlString` as `application/json`; returns the response body.
+    static func post(urlString: String, jsonBody: Data, timeout: Double) async throws -> Data {
+        var request = try makeRequest(urlString, method: "POST", timeout: timeout)
+        request.setValue("application/json", forHTTPHeaderField: "Content-Type")
+        request.httpBody = jsonBody
+        return try await perform(request)
+    }
+
+    /// GET `urlString`; returns the response body.
+    static func get(urlString: String, timeout: Double) async throws -> Data {
+        try await perform(makeRequest(urlString, method: "GET", timeout: timeout))
+    }
+
+    /// Build a request; throws `.network` when `urlString` is not a valid URL.
+    private static func makeRequest(_ urlString: String, method: String, timeout: Double) throws -> URLRequest {
+        guard let url = URL(string: urlString) else {
+            throw GraphRAGError.network(message: "Invalid Ollama URL: \(urlString)")
+        }
+        var request = URLRequest(url: url)
+        request.httpMethod = method
+        request.timeoutInterval = timeout
+        return request
+    }
+
+    /// Run `request` on `URLSession.shared`. Transport failures throw `.network`;
+    /// non-2xx statuses throw `.http` with the code and the start of the body.
+    private static func perform(_ request: URLRequest) async throws -> Data {
+        try await withCheckedThrowingContinuation { continuation in
+            URLSession.shared.dataTask(with: request) { data, response, error in
+                if let error {
+                    continuation.resume(throwing: GraphRAGError.network(message: error.localizedDescription))
+                } else if let http = response as? HTTPURLResponse, !(200..<300).contains(http.statusCode) {
+                    let body = data.flatMap { String(data: $0, encoding: .utf8) }?.prefix(500) ?? ""
+                    let detail = body.isEmpty ? "" : ": \(body)"
+                    continuation.resume(throwing: GraphRAGError.http(message: "HTTP \(http.statusCode)\(detail)"))
+                } else {
+                    continuation.resume(returning: data ?? Data())
+                }
+            }.resume()
+        }
+    }
+}
+
+/// `LanguageModel` backed by Ollama's `/api/generate`.
+public struct OllamaClient: LanguageModel {
+    /// Server address, model names and generation defaults.
+    public let config: OllamaConfig
+
+    public init(config: OllamaConfig = OllamaConfig()) {
+        self.config = config
+    }
+
+    /// Describes the configured chat model; the context length is `config.numCtx`.
+    public var modelInfo: ModelInfo {
+        ModelInfo(
+            name: config.chatModel, maxContextLength: config.numCtx, supportsStreaming: true)
+    }
+
+    /// `true` when a GET of `/api/tags` succeeds within the configured timeout.
+    public func isAvailable() async -> Bool {
+        let probe = try? await OllamaHTTP.get(
+            urlString: "\(config.baseURL)/api/tags", timeout: config.timeoutSeconds)
+        return probe != nil
+    }
+
+    /// Request one non-streamed completion for `prompt`.
+    ///
+    /// Values set in `params` override the config's temperature and token limit.
+    /// - Throws: Errors from `OllamaHTTP`, or `GraphRAGError.generation` when the
+    ///   reply has no decodable `response` field.
+    public func complete(_ prompt: String, params: GenerationParams) async throws -> String {
+        struct GenerateResponse: Codable { let response: String }
+
+        var options: [String: Any] = [
+            "temperature": Double(params.temperature ?? config.temperature),
+            "num_predict": params.maxTokens ?? config.maxTokens,
+        ]
+        if let topP = params.topP { options["top_p"] = Double(topP) }
+        if let numCtx = config.numCtx { options["num_ctx"] = numCtx }
+        if let stop = params.stopSequences { options["stop"] = stop }
+
+        var payload: [String: Any] = ["model": config.chatModel, "prompt": prompt, "stream": false]
+        payload["options"] = options
+        if let keepAlive = config.keepAlive { payload["keep_alive"] = keepAlive }
+
+        let jsonBody = try OllamaHTTP.encode(payload)
+        let reply = try await OllamaHTTP.post(
+            urlString: "\(config.baseURL)/api/generate", jsonBody: jsonBody,
+            timeout: config.timeoutSeconds)
+        guard let decoded = try? JSONDecoder().decode(GenerateResponse.self, from: reply) else {
+            throw GraphRAGError.generation(message: "Failed to decode Ollama response")
+        }
+        return decoded.response
+    }
+}
+
+/// `EmbeddingModel` backed by Ollama's `/api/embeddings`.
+public struct OllamaEmbedder: EmbeddingModel {
+    public let config: OllamaConfig
+
+    public init(config: OllamaConfig = OllamaConfig()) {
+        self.config = config
+    }
+
+    /// Configured size (`config.embeddingDimension`); not verified against replies.
+    public var dimension: Int { config.embeddingDimension }
+
+    /// `true` when a GET of `/api/tags` succeeds within the configured timeout.
+    public func isAvailable() async -> Bool {
+        let probe = try? await OllamaHTTP.get(
+            urlString: "\(config.baseURL)/api/tags", timeout: config.timeoutSeconds)
+        return probe != nil
+    }
+
+    /// Embed `text` with the configured embedding model.
+    /// - Throws: Errors from `OllamaHTTP`, or `GraphRAGError.embedding` when the
+    ///   reply has no decodable `embedding` array.
+    public func embed(_ text: String) async throws -> [Float] {
+        struct EmbeddingResponse: Codable { let embedding: [Float] }
+        let payload: [String: Any] = ["model": config.embeddingModel, "prompt": text]
+        let jsonBody = try OllamaHTTP.encode(payload)
+        let reply = try await OllamaHTTP.post(
+            urlString: "\(config.baseURL)/api/embeddings", jsonBody: jsonBody,
+            timeout: config.timeoutSeconds)
+        guard let decoded = try? JSONDecoder().decode(EmbeddingResponse.self, from: reply) else {
+            throw GraphRAGError.embedding(message: "Failed to decode Ollama embedding")
+        }
+        return decoded.embedding
+    }
+}
diff --git a/Sources/GraphRAG/Entity/LLMExtractor.swift b/Sources/GraphRAG/Entity/LLMExtractor.swift
new file mode 100644
index 0000000..84f62ed
--- /dev/null
+++ b/Sources/GraphRAG/Entity/LLMExtractor.swift
@@ -0,0 +1,220 @@
+// LLMExtractor.swift
+// Ported from graphrag-rs `entity::llm_extractor`.
+
+import Foundation
+
+/// LLM-driven entity & relationship extractor.
+///
+/// Builds the extraction prompt, calls a `LanguageModel`, and parses the JSON
+/// response with the same staged fallbacks as the Rust version (direct decode →
+/// fenced code block → first/last brace slice).
+public struct LLMEntityExtractor<Model: LanguageModel>: EntityExtracting {
+    public let model: Model
+    public var entityTypes: [String]
+    public var temperature: Float
+    public var maxTokens: Int
+    /// Extra gleaning passes to recover missed items (0 = single pass).
+    public var gleaningRounds: Int
+
+    public init(
+        model: Model,
+        entityTypes: [String] = Prompts.defaultEntityTypes,
+        temperature: Float = 0.0,
+        maxTokens: Int = 1500,
+        gleaningRounds: Int = 0
+    ) {
+        self.model = model
+        self.entityTypes = entityTypes
+        self.temperature = temperature
+        self.maxTokens = maxTokens
+        self.gleaningRounds = gleaningRounds
+    }
+
+    /// Extract entities and relationships from `chunk` with the language model.
+    ///
+    /// One extraction prompt is always sent; an unparseable reply counts as an
+    /// empty result. Up to `gleaningRounds` follow-up prompts then ask for missed
+    /// items, stopping early once a reply is unparseable or adds nothing.
+    /// - Throws: Whatever `model.complete` throws.
+    public func extract(from chunk: TextChunk) async throws
+        -> (entities: [Entity], relationships: [Relationship])
+    {
+        let typesList = entityTypes.joined(separator: ", ")
+        let params = GenerationParams(maxTokens: maxTokens, temperature: temperature)
+        let firstPrompt = Prompts.fill(
+            Prompts.entityExtraction,
+            ["entity_types": typesList, "input_text": chunk.content])
+        let firstReply = try await model.complete(firstPrompt, params: params)
+        var output = LLMEntityExtractor.parse(firstReply) ?? ExtractionOutput()
+
+        // Optional gleaning passes; `stride` yields nothing for counts <= 0.
+        for _ in stride(from: 0, to: gleaningRounds, by: 1) {
+            let knownEntities = output.entities
+                .map { "- \($0.name) (\($0.type))" }.joined(separator: "\n")
+            let knownRelationships = output.relationships
+                .map { "- \($0.source) -> \($0.target)" }.joined(separator: "\n")
+            let gleanPrompt = Prompts.fill(
+                Prompts.gleaningContinuation,
+                [
+                    "entity_types": typesList,
+                    "input_text": chunk.content,
+                    "previous_entities": knownEntities.isEmpty ? "(none)" : knownEntities,
+                    "previous_relationships": knownRelationships.isEmpty ? "(none)" : knownRelationships,
+                ])
+            let gleanReply = try await model.complete(gleanPrompt, params: params)
+            guard let extra = LLMEntityExtractor.parse(gleanReply),
+                !(extra.entities.isEmpty && extra.relationships.isEmpty)
+            else { break }
+            output.entities += extra.entities
+            output.relationships += extra.relationships
+        }
+        return convert(output, chunk: chunk)
+    }
+
+    // MARK: - Conversion
+
+    /// Turn decoded LLM output into graph entities and relationships for `chunk`.
+    /// Blank names are dropped and an entity id that repeats (e.g. re-emitted by
+    /// a gleaning pass) is kept once; relationships need both endpoints resolved.
+    /// `strength` is clamped to 0...1 because models may answer on another scale.
+    private func convert(_ output: ExtractionOutput, chunk: TextChunk)
+        -> (entities: [Entity], relationships: [Relationship])
+    {
+        var entities: [Entity] = []
+        var seenIDs: Set<EntityID> = []
+        var idByName: [String: EntityID] = [:]
+        let lowerContent = chunk.content.lowercased()
+
+        for data in output.entities {
+            let name = data.name.trimmingCharacters(in: .whitespacesAndNewlines)
+            guard !name.isEmpty else { continue }
+            let type = data.type.isEmpty ? "CONCEPT" : data.type.uppercased()
+            let id = PatternEntityExtractor.makeEntityID(type: type, name: name)
+            idByName[name.lowercased()] = id
+            guard seenIDs.insert(id).inserted else { continue }
+            // Record the first case-insensitive occurrence as the mention.
+            var mentions: [EntityMention] = []
+            if let range = lowerContent.range(of: name.lowercased()) {
+                let start = lowerContent.distance(from: lowerContent.startIndex, to: range.lowerBound)
+                mentions.append(EntityMention(
+                    chunkID: chunk.id, startOffset: start, endOffset: start + name.count, confidence: 0.9))
+            }
+            entities.append(
+                Entity(id: id, name: name, entityType: type, confidence: 0.9, mentions: mentions))
+        }
+
+        let relationships = output.relationships.compactMap { data -> Relationship? in
+            let src = data.source.lowercased().trimmingCharacters(in: .whitespacesAndNewlines)
+            let tgt = data.target.lowercased().trimmingCharacters(in: .whitespacesAndNewlines)
+            guard let sourceID = idByName[src], let targetID = idByName[tgt] else { return nil }
+            return Relationship(
+                source: sourceID, target: targetID,
+                relationType: LLMEntityExtractor.relationTypeLabel(from: data.description),
+                confidence: min(max(data.strength ?? 0.7, 0), 1), context: [chunk.id])
+        }
+        return (entities, relationships)
+    }
+
+    /// Coarse relation label: the first three words of `description`, stripped to
+    /// letters/digits, upper-cased and joined by "_"; "RELATED_TO" if none remain.
+    private static func relationTypeLabel(from description: String) -> String {
+        let parts = description.trimmingCharacters(in: .whitespacesAndNewlines)
+            .split(whereSeparator: \.isWhitespace)
+            .prefix(3)
+            .map { word in word.filter { $0.isLetter || $0.isNumber }.uppercased() }
+            .filter { !$0.isEmpty }
+        return parts.isEmpty ? "RELATED_TO" : parts.joined(separator: "_")
+    }
+
+    // MARK: - Parsing
+
+    /// Top-level JSON payload expected from the model. Decoding is lenient: a
+    /// missing or undecodable array becomes `[]` instead of failing the decode.
+    struct ExtractionOutput: Codable {
+        var entities: [EntityData] = []
+        var relationships: [RelationshipData] = []
+        init() {}
+        enum CodingKeys: String, CodingKey { case entities, relationships }
+        init(from decoder: Decoder) throws {
+            let c = try decoder.container(keyedBy: CodingKeys.self)
+            entities = (try? c.decode([EntityData].self, forKey: .entities)) ?? []
+            relationships = (try? c.decode([RelationshipData].self, forKey: .relationships)) ?? []
+        }
+    }
+
+    /// One entity object; undecodable `name`/`type` become "", `description` nil.
+    struct EntityData: Codable {
+        var name: String
+        var type: String
+        var description: String?
+        enum CodingKeys: String, CodingKey { case name, type, description }
+        init(from decoder: Decoder) throws {
+            let c = try decoder.container(keyedBy: CodingKeys.self)
+            name = (try? c.decode(String.self, forKey: .name)) ?? ""
+            type = (try? c.decode(String.self, forKey: .type)) ?? ""
+            description = try? c.decode(String.self, forKey: .description)
+        }
+    }
+
+    /// One relationship object; undecodable strings become "", `strength` nil.
+    struct RelationshipData: Codable {
+        var source: String
+        var target: String
+        var description: String
+        var strength: Float?
+        enum CodingKeys: String, CodingKey { case source, target, description, strength }
+        init(from decoder: Decoder) throws {
+            let c = try decoder.container(keyedBy: CodingKeys.self)
+            source = (try? c.decode(String.self, forKey: .source)) ?? ""
+            target = (try? c.decode(String.self, forKey: .target)) ?? ""
+            description = (try? c.decode(String.self, forKey: .description)) ?? ""
+            strength = try? c.decode(Float.self, forKey: .strength)
+        }
+    }
+
+    /// Recover an `ExtractionOutput` from a possibly-noisy LLM response.
+    ///
+    /// Candidates are tried in order and the first one that decodes wins:
+    /// 1. the whole response,
+    /// 2. the contents of the first fenced code block,
+    /// 3. the text from the first "{" through the last "}".
+    ///
+    /// - Returns: The decoded output, or `nil` when no candidate decodes.
+    static func parse(_ response: String) -> ExtractionOutput? {
+        let decoder = JSONDecoder()
+        /// Decode one candidate string; `nil` input or invalid JSON gives `nil`.
+        func decode(_ candidate: String?) -> ExtractionOutput? {
+            guard let data = candidate?.data(using: .utf8) else { return nil }
+            return try? decoder.decode(ExtractionOutput.self, from: data)
+        }
+
+        // 1. Direct decode.
+        if let whole = decode(response) { return whole }
+
+        // 2. Fenced code block.
+        if let fenced = decode(extractFencedJSON(response)) { return fenced }
+
+        // 3. First '{' to last '}'.
+        guard let open = response.firstIndex(of: "{"),
+            let close = response.lastIndex(of: "}"), open < close
+        else {
+            return nil
+        }
+        return decode(String(response[open...close]))
+    }
+
+    /// Contents of the first ``` fenced block in `text`, or nil without a complete
+    /// fence pair. A first line that is blank or just "json" is dropped as the tag.
+    private static func extractFencedJSON(_ text: String) -> String? {
+        guard let opening = text.range(of: "```") else { return nil }
+        var body = text[opening.upperBound...]
+        if let lineEnd = body.firstIndex(of: "\n") {
+            let tag = body[..<lineEnd].trimmingCharacters(in: .whitespaces).lowercased()
+            if tag.isEmpty || tag == "json" {
+                body = body[body.index(after: lineEnd)...]
+            }
+        }
+        guard let closing = body.range(of: "```") else { return nil }
+        return String(body[..<closing.lowerBound])
+    }
+}
diff --git a/Sources/GraphRAG/Entity/PatternExtractor.swift b/Sources/GraphRAG/Entity/PatternExtractor.swift
new file mode 100644
index 0000000..91ce98a
--- /dev/null
+++ b/Sources/GraphRAG/Entity/PatternExtractor.swift
@@ -0,0 +1,247 @@
+// PatternExtractor.swift
+// Ported from graphrag-rs pattern-based fallback extractor in `entity::mod`.
+//
+// This is the offline default: it finds Title-Case spans, classifies them with
+// suffix/prefix/known-list heuristics, then infers typed relationships between
+// co-occurring entities from surrounding context keywords.
+
+import Foundation
+
+/// Deterministic capitalization/heuristic entity extractor.
+public struct PatternEntityExtractor: EntityExtracting {
+    public var minConfidence: Float
+
+    public init(minConfidence: Float = 0.5) {
+        self.minConfidence = minConfidence
+    }
+
+    /// Extract heuristic entities and relationships from `chunk`.
+    ///
+    /// Capitalized spans are classified; those at or above `minConfidence` become
+    /// entities keyed by their exact text, in first-seen order. Repeats add a
+    /// mention and raise the confidence to the highest seen. Relationships are
+    /// then inferred between the resulting entities.
+    public func extract(from chunk: TextChunk) async throws
+        -> (entities: [Entity], relationships: [Relationship])
+    {
+        var entities: [Entity] = []
+        var indexByName: [String: Int] = [:]
+
+        for span in capitalizedSpans(in: chunk.content) {
+            guard let (type, confidence) = classify(span.text),
+                confidence >= minConfidence
+            else { continue }
+            let mention = EntityMention(
+                chunkID: chunk.id, startOffset: span.start, endOffset: span.end,
+                confidence: confidence)
+
+            if let index = indexByName[span.text] {
+                entities[index].mentions.append(mention)
+                entities[index].confidence = max(entities[index].confidence, confidence)
+            } else {
+                indexByName[span.text] = entities.count
+                entities.append(
+                    Entity(
+                        id: PatternEntityExtractor.makeEntityID(type: type, name: span.text),
+                        name: span.text, entityType: type, confidence: confidence,
+                        mentions: [mention]))
+            }
+        }
+
+        let relationships = inferRelationships(entities: entities, chunk: chunk)
+        return (entities, relationships)
+    }
+
+    /// Stable `"type_normalized_name"` identifier.
+    ///
+    /// Both parts are lowercased. In the name, every run of characters that are
+    /// neither letters nor digits becomes a single "_", and leading/trailing
+    /// separators are dropped: ("PERSON", "Dr. Jane Doe") -> "person_dr_jane_doe".
+    ///
+    /// - Parameters:
+    ///   - type: Entity type label, e.g. "PERSON".
+    ///   - name: Surface form of the entity.
+    /// - Returns: The identifier; the lowercased type plus "_" if `name` has no letters/digits.
+    public static func makeEntityID(type: String, name: String) -> EntityID {
+        // Splitting on non-alphanumerics and re-joining with "_" collapses
+        // separator runs and trims both ends in one step.
+        let slug = name.lowercased()
+            .split(whereSeparator: { !($0.isLetter || $0.isNumber) })
+            .joined(separator: "_")
+        let prefix = type.lowercased()
+        return EntityID("\(prefix)_\(slug)")
+    }
+
+    // MARK: - Span detection
+
+    private struct Span { var text: String; var start: Int; var end: Int }
+
+    /// Maximal runs of Title-Case words (allowing a leading title like "Dr.").
+    /// A run starts at an uppercase character that begins a whitespace-delimited
+    /// word and extends over capitalized words separated by exactly one space,
+    /// optionally through a lowercase "of"/"the". It never extends past a word
+    /// that `endsClause`, so "London. The" is two spans. Offsets count `Character`s.
+    private func capitalizedSpans(in text: String) -> [Span] {
+        let chars = Array(text)
+        let n = chars.count
+        var spans: [Span] = []
+        var i = 0
+        while i < n {
+            guard isWordStart(chars, i), chars[i].isUppercase else { i += 1; continue }
+            var j = i
+            while true {
+                let wordStart = j
+                while j < n && !chars[j].isWhitespace { j += 1 }
+                // Never extend past a word that closes a clause or sentence.
+                if endsClause(chars[wordStart..<j]) { break }
+                // The next word must follow exactly one space.
+                let k = j + 1
+                guard j < n, chars[j] == " ", k < n else { break }
+                if chars[k].isUppercase { j = k; continue }
+                guard chars[k].isLowercase else { break }
+                // Allow a lowercase connector ("of"/"the") between caps.
+                var c = k
+                while c < n && !chars[c].isWhitespace { c += 1 }
+                let connector = String(chars[k..<c]).lowercased()
+                var after = c
+                while after < n && chars[after] == " " { after += 1 }
+                guard connector == "of" || connector == "the",
+                    after < n, chars[after].isUppercase
+                else { break }
+                j = after
+            }
+            let cleaned = String(chars[i..<j]).trimmingCharacters(
+                in: CharacterSet(charactersIn: ".,;:!?\"'()"))
+            if cleaned.count >= 2 {
+                spans.append(Span(text: cleaned, start: i, end: i + cleaned.count))
+            }
+            i = j
+        }
+        return spans
+    }
+
+    /// True when `word` ends in punctuation that closes a clause or sentence.
+    /// A trailing "." after a person title or a single initial is not a break.
+    private func endsClause(_ word: ArraySlice<Character>) -> Bool {
+        guard let last = word.last, ".,;:!?".contains(last) else { return false }
+        guard last == "." else { return true }
+        let stem = String(word.dropLast()).lowercased()
+        return !(stem.count == 1 || PatternEntityExtractor.personTitles.contains(stem))
+    }
+
+    /// True when index `i` opens a word: it is the first character or directly
+    /// follows a whitespace character.
+    private func isWordStart(_ chars: [Character], _ i: Int) -> Bool { i == 0 || chars[i - 1].isWhitespace }
+
+    // MARK: - Classification
+
+    /// Heuristically classify a capitalized span.
+    ///
+    /// Rules are tried in order and the first match wins:
+    /// 1. a lone blocklisted word (weekday, month, "The", ...) -> not an entity
+    /// 2. last word is an organization suffix -> ORGANIZATION (0.9)
+    /// 3. text starts with an organization prefix -> ORGANIZATION (0.9)
+    /// 4. text is a known location -> LOCATION (0.9)
+    /// 5. first word is a person title -> PERSON (0.9)
+    /// 6. two or more words -> PERSON (0.8)
+    /// 7. one word of at least three characters -> CONCEPT (0.6)
+    ///
+    /// - Returns: The type label and confidence, or `nil` when nothing matches.
+    private func classify(_ text: String) -> (type: String, confidence: Float)? {
+        let words = text.split(separator: " ").map(String.init)
+        guard let first = words.first, let last = words.last else { return nil }
+        let lower = text.lowercased()
+        let bareFirst = first.trimmingCharacters(in: CharacterSet(charactersIn: ".")).lowercased()
+
+        if words.count == 1 && Self.blocklist.contains(first.lowercased()) {
+            return nil
+        }
+        if Self.orgSuffixes.contains(last.lowercased()) {
+            return ("ORGANIZATION", 0.9)
+        }
+        if Self.orgPrefixes.contains(where: { lower.hasPrefix($0) }) {
+            return ("ORGANIZATION", 0.9)
+        }
+        if Self.knownLocations.contains(lower) {
+            return ("LOCATION", 0.9)
+        }
+        if Self.personTitles.contains(bareFirst) {
+            return ("PERSON", 0.9)
+        }
+        if words.count >= 2 {
+            return ("PERSON", 0.8)
+        }
+        if first.count >= 3 {
+            return ("CONCEPT", 0.6)
+        }
+        return nil
+    }
+
+    // MARK: - Relationship inference
+
+    /// Relate every unordered pair of entities from the same chunk. The type
+    /// depends on both entity types and on keywords in the chunk text; each
+    /// relationship gets confidence 0.6, and a repeated (source id, target id,
+    /// type) triple is emitted once.
+    private func inferRelationships(entities: [Entity], chunk: TextChunk) -> [Relationship] {
+        guard entities.count >= 2 else { return [] }
+        let context = chunk.content.lowercased()
+        var emitted: Set<String> = []
+        var relationships: [Relationship] = []
+        for (index, source) in entities.enumerated() {
+            for target in entities[(index + 1)...] {
+                let relType = relationType(for: source.entityType, target.entityType, context: context)
+                let key = "\(source.id.raw)|\(target.id.raw)|\(relType)"
+                guard emitted.insert(key).inserted else { continue }
+                relationships.append(
+                    Relationship(
+                        source: source.id, target: target.id, relationType: relType,
+                        confidence: 0.6, context: [chunk.id]))
+            }
+        }
+        return relationships
+    }
+
+    /// Pick a relation label for an entity-type pair, in either order.
+    /// Keywords are matched anywhere in `context`; unknown pairs give "RELATED_TO".
+    private func relationType(for a: String, _ b: String, context: String) -> String {
+        func mentions(_ phrases: String...) -> Bool { phrases.contains { context.contains($0) } }
+        let pair: Set<String> = [a, b]
+        if pair == ["PERSON", "ORGANIZATION"] {
+            if mentions("works for", "employed by") { return "WORKS_FOR" }
+            return mentions("founded", "ceo") ? "LEADS" : "ASSOCIATED_WITH"
+        }
+        if pair == ["PERSON", "LOCATION"] {
+            if mentions("born in", " from ") { return "BORN_IN" }
+            return mentions("lives in", "based in") ? "LOCATED_IN" : "ASSOCIATED_WITH"
+        }
+        if pair == ["ORGANIZATION", "LOCATION"] {
+            return mentions("headquartered", "based in") ? "HEADQUARTERED_IN" : "LOCATED_IN"
+        }
+        if pair == ["PERSON"] {
+            if mentions("married", "spouse") { return "MARRIED_TO" }
+            return mentions("colleague", "partner") ? "COLLEAGUE_OF" : "KNOWS"
+        }
+        return "RELATED_TO"
+    }
+
+    // MARK: - Lexicons
+
+    static let orgSuffixes: Set<String> = [
+        "inc", "inc.", "corp", "corp.", "llc", "ltd", "ltd.", "company",
+        "corporation", "group", "solutions", "technologies",
+    ]
+    static let orgPrefixes: [String] = ["university of", "institute of", "department of"]
+    static let knownLocations: Set<String> = [
+        "united states", "new york", "california", "london", "paris", "tokyo",
+        "berlin", "washington", "boston", "chicago",
+    ]
+    static let personTitles: Set<String> = ["dr", "prof", "mr", "mrs", "ms"]
+    static let blocklist: Set<String> = [
+        "the", "and", "but", "or", "chapter", "section", "however", "therefore",
+        "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday",
+        "january", "february", "march", "april", "may", "june", "july", "august",
+        "september", "october", "november", "december", "this", "that", "these",
+        "those", "there", "here", "when", "where", "what", "who", "why", "how",
+    ]
+}
diff --git a/Sources/GraphRAG/Entity/Prompts.swift b/Sources/GraphRAG/Entity/Prompts.swift
new file mode 100644
index 0000000..fd627a2
--- /dev/null
+++ b/Sources/GraphRAG/Entity/Prompts.swift
@@ -0,0 +1,131 @@
+// Prompts.swift
+// Ported from graphrag-rs `entity::prompts` and the answer-generation template
+// in `graphrag::ask`. Templates use `{placeholder}` markers filled by callers.
+
+import Foundation
+
+public enum Prompts {
+    /// Default entity types requested from the LLM.
+    public static let defaultEntityTypes: [String] = [
+        "PERSON", "ORGANIZATION", "LOCATION", "EVENT", "CONCEPT", "OBJECT",
+    ]
+
+    /// Single-pass entity + relationship extraction prompt.
+    public static let entityExtraction = """
+        -Goal-
+        Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities.
+
+        -Steps-
+        1. Identify all entities. For each identified entity, extract the following information:
+        - entity_name: Name of the entity, capitalized
+        - entity_type: One of the following types: [{entity_types}]
+        - entity_description: Comprehensive description of the entity's attributes and activities
+
+        2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
+        For each pair of related entities, extract the following information:
+        - source_entity: name of the source entity, as identified in step 1
+        - target_entity: name of the target entity, as identified in step 1
+        - relationship_description: explanation as to why you think the source entity and the target entity are related to each other
+        - relationship_strength: a numeric score indicating strength of the relationship between the source entity and target entity
+
+        3. Return output in JSON format with the following structure:
+        {
+          "entities": [
+            { "name": "entity name", "type": "entity type", "description": "entity description" }
+          ],
+          "relationships": [
+            { "source": "source entity name", "target": "target entity name", "description": "relationship description", "strength": 0.8 }
+          ]
+        }
+
+        -Real Data-
+        ######################
+        Entity Types: {entity_types}
+        Text: {input_text}
+        ######################
+        Output:
+        """
+
+    /// Gleaning continuation prompt to catch entities/relationships missed on
+    /// the first pass.
+    public static let gleaningContinuation = """
+        -Goal-
+        You previously extracted entities and relationships from a text document. Review your previous extraction and the original text to identify any additional entities or relationships you may have missed in the first pass.
+
+        -Steps-
+        1. Review the entities you previously identified:
+        {previous_entities}
+
+        2. Review the relationships you previously identified:
+        {previous_relationships}
+
+        3. Carefully review the original text again and identify any entities or relationships you may have missed.
+
+        4. Return ONLY the NEW entities and relationships you discovered in this pass, using the same JSON format:
+        {
+          "entities": [
+            { "name": "entity name", "type": "entity type", "description": "entity description" }
+          ],
+          "relationships": [
+            { "source": "source entity name", "target": "target entity name", "description": "relationship description", "strength": 0.8 }
+          ]
+        }
+
+        If you found no additional entities or relationships, return empty arrays.
+
+        -Real Data-
+        ######################
+        Entity Types: {entity_types}
+        Text: {input_text}
+        ######################
+        Output:
+        """
+
+    /// Completion-check prompt; the model answers only YES or NO.
+    public static let completionCheck = """
+        Based on the text below and the entities/relationships already extracted, are there any significant entities or relationships that have been missed?
+
+        Text:
+        {input_text}
+
+        Current Entities ({entity_count}):
+        {entities_summary}
+
+        Current Relationships ({relationship_count}):
+        {relationships_summary}
+
+        Respond with ONLY "YES" if the extraction is complete and thorough, or "NO" if there are still significant entities or relationships missing.
+
+        Answer (YES or NO):
+        """
+
+    /// Answer-generation prompt used by `GraphRAG.ask`.
+    public static let answerGeneration = """
+        You are a knowledgeable assistant specialized in answering questions based on a knowledge graph.
+
+        IMPORTANT INSTRUCTIONS:
+        - Answer ONLY using information from the provided context below
+        - Synthesize information from ALL context sections to give a comprehensive answer
+        - Provide direct, conversational, and natural responses
+        - Do NOT show your reasoning process or use <think> tags
+        - If the context lacks sufficient information, clearly state: "I don't have enough information to answer this question."
+        - Aim for a complete answer (3-6 sentences) that covers different aspects found across the context
+        - Use a natural, helpful tone as if speaking to a person
+
+        CONTEXT:
+        {context}
+
+        QUESTION: {query}
+
+        ANSWER (direct response only, no reasoning):
+        """
+
+    /// Fill `{key}` placeholders in one pass; inserted values are never re-scanned.
+    public static func fill(_ template: String, _ values: [String: String]) -> String {
+        template.components(separatedBy: "{").enumerated().map { index, piece -> String in
+            let parts = piece.split(separator: "}", maxSplits: 1, omittingEmptySubsequences: false)
+            let value = index > 0 && parts.count == 2 ? values[String(parts[0])] : nil  // piece 0 precedes any "{"
+            return value.map { $0 + String(parts[1]) } ?? (index > 0 ? "{" : "") + piece  // unknown keys stay verbatim
+        }.joined()
+    }
+}
diff --git a/Sources/GraphRAG/Graph/Analytics.swift b/Sources/GraphRAG/Graph/Analytics.swift
new file mode 100644
index 0000000..4ad9b89
--- /dev/null
+++ b/Sources/GraphRAG/Graph/Analytics.swift
@@ -0,0 +1,205 @@
+// Analytics.swift
+// Ported from graphrag-rs `graph::analytics`.
+//
+// Centrality measures treat the graph as undirected (edges connect both
+// endpoints), matching the bidirectional neighbour semantics used elsewhere.
+
+import Foundation
+
+/// Degree, betweenness and closeness centrality of a single entity.
+public struct CentralityScores: Sendable, Equatable {
+    /// The three scores; each defaults to zero in the initializer.
+    public var degree, betweenness, closeness: Float
+
+    public init(degree: Float = 0, betweenness: Float = 0, closeness: Float = 0) {
+        // Assignment order has no observable effect.
+        self.closeness = closeness
+        self.betweenness = betweenness
+        self.degree = degree
+    }
+}
+
+/// Graph-level and node-level structural metrics.
+public struct GraphAnalytics: Sendable {
+    private let graph: KnowledgeGraph
+    private let nodes: [EntityID]
+    private let adjacency: [EntityID: [EntityID]]
+
+    /// Builds a deduplicated, undirected adjacency list over the graph's entities.
+    public init(_ graph: KnowledgeGraph) {
+        self.graph = graph
+        let nodeList = graph.entities.map(\.id)
+        self.nodes = nodeList
+        let nodeSet = Set(nodeList)
+        var adj: [EntityID: Set<EntityID>] = [:]
+        for id in nodeList { adj[id] = [] }
+        for rel in graph.relationships where rel.source != rel.target {  // a node is never its own neighbour
+            guard nodeSet.contains(rel.source), nodeSet.contains(rel.target) else { continue }
+            adj[rel.source, default: []].insert(rel.target)
+            adj[rel.target, default: []].insert(rel.source)
+        }
+        self.adjacency = adj.mapValues { ids in ids.sorted { $0.raw < $1.raw } }  // stable order across runs
+    }
+
+    private func neighbors(_ id: EntityID) -> [EntityID] { adjacency[id, default: []] }  // empty for unknown ids
+
+    // MARK: - Degree
+
+    /// Degree centrality: neighbour count divided by `n - 1`; 0 when the graph has at most one node.
+    public func degreeCentrality(_ id: EntityID) -> Float {
+        let others = nodes.count - 1
+        if others < 1 { return 0 }
+        return Float(neighbors(id).count) / Float(others)
+    }
+
+    // MARK: - Closeness
+
+    /// Closeness centrality of `id`.
+    ///
+    /// Computed as the number of nodes reachable from `id` divided by the sum of
+    /// their BFS hop distances. Returns 0 when nothing else is reachable.
+    public func closenessCentrality(_ id: EntityID) -> Float {
+        // Hop counts to every reached node except `id` itself.
+        let hops = bfsDistances(from: id).filter { $0.key != id }.values
+        let totalDistance = hops.reduce(0, +)
+        // Every other node is at distance >= 1, so a zero total means none was reached.
+        if totalDistance <= 0 { return 0 }
+        return Float(hops.count) / Float(totalDistance)
+    }
+
+    // MARK: - Betweenness (Brandes, unweighted)
+
+    /// Normalized betweenness centrality for every node, via Brandes' algorithm (all zeros when n <= 2).
+    public func betweennessCentrality() -> [EntityID: Float] {
+        var betweenness: [EntityID: Double] = [:]
+        for id in nodes { betweenness[id] = 0 }
+        let n = nodes.count
+        guard n > 2 else { return betweenness.mapValues { Float($0) } }
+
+        for source in nodes {  // one BFS plus one back-propagation pass per source
+            var stack: [EntityID] = []  // nodes in the order the BFS dequeues them
+            var predecessors: [EntityID: [EntityID]] = [:]  // shortest-path parents of each node
+            var sigma: [EntityID: Double] = [:]  // number of shortest paths from `source`
+            var dist: [EntityID: Int] = [:]  // hop distance from `source`; -1 = not reached
+            for id in nodes { sigma[id] = 0; dist[id] = -1; predecessors[id] = [] }
+            sigma[source] = 1
+            dist[source] = 0
+
+            var queue: [EntityID] = [source]
+            var head = 0  // index of the next queue element to dequeue
+            while head < queue.count {
+                let v = queue[head]; head += 1
+                stack.append(v)
+                for w in neighbors(v) {  // neighbours are always graph nodes, so the lookups below exist
+                    if dist[w]! < 0 {  // w reached for the first time
+                        dist[w] = dist[v]! + 1
+                        queue.append(w)
+                    }
+                    if dist[w]! == dist[v]! + 1 {  // v lies on a shortest path to w
+                        sigma[w]! += sigma[v]!
+                        predecessors[w]!.append(v)
+                    }
+                }
+            }
+
+            var delta: [EntityID: Double] = [:]  // dependency of `source` on each node
+            for id in nodes { delta[id] = 0 }
+            while let w = stack.popLast() {  // farthest nodes first
+                for v in predecessors[w]! {
+                    delta[v]! += (sigma[v]! / sigma[w]!) * (1 + delta[w]!)
+                }
+                if w != source { betweenness[w]! += delta[w]! }
+            }
+        }
+
+        // Undirected: each pair counted twice; normalize to [0, 1].
+        let norm = Double((n - 1) * (n - 2))
+        var result: [EntityID: Float] = [:]
+        for (id, value) in betweenness {
+            result[id] = norm > 0 ? Float(value / norm) : 0
+        }
+        return result
+    }
+
+    /// Degree, betweenness and closeness of `id`, bundled into one value.
+    public func centrality(_ id: EntityID) -> CentralityScores {
+        // Betweenness is computed for the whole graph; only this node's entry is kept.
+        let betweenness = betweennessCentrality()[id] ?? 0
+        let degree = degreeCentrality(id)
+        let closeness = closenessCentrality(id)
+        return CentralityScores(degree: degree, betweenness: betweenness, closeness: closeness)
+    }
+
+    // MARK: - Components
+
+    /// The undirected connected component containing `start`, in breadth-first order
+    /// (empty when `start` is not an entity of the graph).
+    public func connectedComponent(containing start: EntityID) -> [EntityID] {
+        if !graph.contains(start) { return [] }
+        var seen: Set<EntityID> = [start]
+        // `order` is both the BFS queue and the result: each node is appended exactly once.
+        var order: [EntityID] = [start]
+        var cursor = 0
+        while cursor < order.count {
+            let node = order[cursor]
+            cursor += 1
+            for next in neighbors(node) where seen.insert(next).inserted {
+                order.append(next)
+            }
+        }
+        return order
+    }
+
+    /// All connected components of the graph, each listed once, ordered by the
+    /// first node of `nodes` that belongs to them.
+    public func connectedComponents() -> [[EntityID]] {
+        var assigned: Set<EntityID> = []
+        return nodes.reduce(into: [[EntityID]]()) { groups, node in
+            guard !assigned.contains(node) else { return }
+            let members = connectedComponent(containing: node)
+            assigned.formUnion(members)
+            groups.append(members)
+        }
+    }
+
+    // MARK: - Global
+
+    /// Graph density `2E / (n(n-1))`, with `E` the number of distinct undirected edges between entities.
+    public func density() -> Float {
+        guard nodes.count > 1 else { return 0 }
+        // Neighbour lists are deduplicated and hold each edge at both endpoints, so their sizes sum to 2E.
+        return Float(adjacency.values.reduce(0) { $0 + $1.count }) / Float(nodes.count * (nodes.count - 1))
+    }
+
+    /// Local clustering coefficient of `id`: the share of unordered pairs of its
+    /// neighbours that are themselves adjacent. Returns 0 when `id` has fewer than
+    /// two neighbours.
+    public func clusteringCoefficient(_ id: EntityID) -> Float {
+        let around = neighbors(id)
+        if around.count <= 1 { return 0 }
+        var linkedPairs = 0
+        for (i, first) in around.enumerated() {
+            let reach = Set(neighbors(first))
+            // Only later neighbours are checked, so each unordered pair is tested once.
+            linkedPairs += around[(i + 1)...].filter { reach.contains($0) }.count
+        }
+        // k neighbours form k(k-1)/2 unordered pairs.
+        let allPairs = around.count * (around.count - 1) / 2
+        return allPairs > 0 ? Float(linkedPairs) / Float(allPairs) : 0
+    }
+
+    /// Hop distances from `source` to every node reachable from it; `source` maps to 0.
+    /// Expands one BFS layer per loop iteration.
+    private func bfsDistances(from source: EntityID) -> [EntityID: Int] {
+        var hops: [EntityID: Int] = [source: 0]
+        var layer: [EntityID] = [source]
+        var depth = 0
+        while !layer.isEmpty {
+            depth += 1
+            var next: [EntityID] = []
+            for n in layer.flatMap({ neighbors($0) }) where hops[n] == nil { hops[n] = depth; next.append(n) }
+            layer = next
+        }
+        return hops
+    }
+}
diff --git a/Sources/GraphRAG/Graph/KnowledgeGraph.swift b/Sources/GraphRAG/Graph/KnowledgeGraph.swift
new file mode 100644
index 0000000..4f67279
--- /dev/null
+++ b/Sources/GraphRAG/Graph/KnowledgeGraph.swift
@@ -0,0 +1,229 @@
+// KnowledgeGraph.swift
+// Ported from graphrag-rs `core::KnowledgeGraph`.
+//
+// The Rust version is backed by petgraph plus side indexes. This port uses a
+// value-type adjacency representation: entities/relationships are stored in
+// insertion order with `[ID: Int]` indexes for O(1) lookup, mirroring the
+// `entity_index` HashMap and IndexMap behaviour.
+
+import Foundation
+
+public struct KnowledgeGraph: Sendable, Codable {
+    // Entities, in insertion order.
+    private var entitiesByID: [EntityID: Entity]
+    private var entityOrder: [EntityID]
+
+    // Relationships, in insertion order, with adjacency indexes into the array.
+    public private(set) var relationships: [Relationship]
+    private var outgoing: [EntityID: [Int]]
+    private var incoming: [EntityID: [Int]]
+
+    // Documents and chunks, in insertion order.
+    private var documentsByID: [DocumentID: Document]
+    private var documentOrder: [DocumentID]
+    private var chunksByID: [ChunkID: TextChunk]
+    private var chunkOrder: [ChunkID]
+
+    /// Creates an empty graph with no entities, relationships, documents or chunks.
+    public init() {
+        // Entities.
+        entitiesByID = [:]; entityOrder = []
+        // Relationships and their per-entity edge indexes.
+        relationships = []; outgoing = [:]; incoming = [:]
+        // Documents.
+        documentsByID = [:]; documentOrder = []
+        // Chunks.
+        chunksByID = [:]; chunkOrder = []
+    }
+
+    // MARK: - Mutation
+
+    /// Insert `entity`, or merge it into the stored entity with the same id: mentions are appended,
+    /// the higher confidence wins, and a missing embedding or empty type is taken from `entity`.
+    public mutating func addEntity(_ entity: Entity) {
+        guard var merged = entitiesByID[entity.id] else {
+            entitiesByID[entity.id] = entity
+            entityOrder.append(entity.id)
+            return
+        }
+        merged.mentions.append(contentsOf: entity.mentions)
+        merged.confidence = max(merged.confidence, entity.confidence)
+        merged.embedding = merged.embedding ?? entity.embedding
+        if merged.entityType.isEmpty { merged.entityType = entity.entityType }
+        entitiesByID[entity.id] = merged
+    }
+
+    /// Insert a directed relationship, merging into an existing edge with the same
+    /// (source, target, type): the max confidence is kept and context entries not
+    /// yet present are appended.
+    public mutating func addRelationship(_ relationship: Relationship) {
+        let duplicate = (outgoing[relationship.source] ?? []).first { idx in
+            relationships[idx].target == relationship.target
+                && relationships[idx].relationType == relationship.relationType
+        }
+        if let idx = duplicate {
+            relationships[idx].confidence = max(
+                relationships[idx].confidence, relationship.confidence)
+            for ctx in relationship.context where !relationships[idx].context.contains(ctx) {
+                relationships[idx].context.append(ctx)
+            }
+            return
+        }
+        // New edge: store it and index its array position under both endpoints.
+        let index = relationships.endIndex
+        relationships.append(relationship)
+        outgoing[relationship.source, default: []].append(index)
+        incoming[relationship.target, default: []].append(index)
+    }
+
+    /// Insert `document`, replacing any stored document with the same id (its position is kept).
+    public mutating func addDocument(_ document: Document) {
+        if documentsByID.updateValue(document, forKey: document.id) == nil { documentOrder.append(document.id) }
+    }
+
+    /// Insert `chunk`, replacing any stored chunk with the same id (its position is kept).
+    public mutating func addChunk(_ chunk: TextChunk) {
+        if chunksByID.updateValue(chunk, forKey: chunk.id) == nil { chunkOrder.append(chunk.id) }
+    }
+
+    /// Remove every entity and relationship, plus the edge indexes; documents and chunks stay.
+    public mutating func clearEntitiesAndRelationships() {
+        // Edges and their per-entity indexes first, then the entities themselves.
+        relationships.removeAll()
+        incoming.removeAll(); outgoing.removeAll()
+        entityOrder.removeAll()
+        entitiesByID.removeAll()
+    }
+
+    // MARK: - Lookup
+
+    public func entity(_ id: EntityID) -> Entity? { return entitiesByID[id] }  // nil for an unknown id
+    public func document(_ id: DocumentID) -> Document? { return documentsByID[id] }
+    public func chunk(_ id: ChunkID) -> TextChunk? { return chunksByID[id] }
+    public func contains(_ id: EntityID) -> Bool { entity(id) != nil }
+
+    public var entities: [Entity] { entityOrder.compactMap { id in entitiesByID[id] } }  // insertion order
+    public var documents: [Document] { documentOrder.compactMap { id in documentsByID[id] } }
+    public var chunks: [TextChunk] { chunkOrder.compactMap { id in chunksByID[id] } }
+
+    public var entityCount: Int { return entitiesByID.count }
+    public var relationshipCount: Int { return relationships.count }
+    public var documentCount: Int { return documentsByID.count }
+    public var chunkCount: Int { return chunksByID.count }
+
+    /// Bidirectional neighbors: one entry per incident edge (outgoing edges first,
+    /// then incoming), pairing the opposite endpoint with the relationship. Entries
+    /// are not deduplicated, so a neighbor linked by several edges appears repeatedly.
+    public func neighbors(of id: EntityID) -> [(neighbor: EntityID, relationship: Relationship)] {
+        let forward = (outgoing[id] ?? []).map { idx in
+            (neighbor: relationships[idx].target, relationship: relationships[idx])
+        }
+        let backward = (incoming[id] ?? []).map { idx in
+            (neighbor: relationships[idx].source, relationship: relationships[idx])
+        }
+        return forward + backward
+    }
+
+    /// All relationships where `id` is the source or target: outgoing edges first,
+    /// then incoming ones, each group in insertion order.
+    public func entityRelationships(_ id: EntityID) -> [Relationship] {
+        // Both index lists point into `relationships`.
+        let touching = (outgoing[id] ?? []) + (incoming[id] ?? [])
+        return touching.map { relationships[$0] }
+    }
+
+    public func outDegree(_ id: EntityID) -> Int { outgoing[id]?.count ?? 0 }  // edges leaving `id`
+    public func inDegree(_ id: EntityID) -> Int { incoming[id]?.count ?? 0 }  // edges entering `id`
+    public func degree(_ id: EntityID) -> Int { inDegree(id) + outDegree(id) }  // all incident edges
+
+    /// Entities whose lowercased name contains the lowercased `name`, in insertion order.
+    public func findEntitiesByName(_ name: String) -> [Entity] {
+        let wanted = name.lowercased()
+        return entities.filter { candidate in candidate.name.lowercased().contains(wanted) }
+    }
+
+    /// Shortest path (by hop count) between two entities via BFS, inclusive of
+    /// endpoints, or nil if `target` is not reachable within `maxDepth` hops.
+    public func findRelationshipPath(
+        from source: EntityID, to target: EntityID, maxDepth: Int = 5
+    ) -> [EntityID]? {
+        if source == target { return [source] }
+        var visited: Set<EntityID> = [source]
+        var queue: [(EntityID, [EntityID])] = [(source, [source])]
+        var head = 0  // index-based dequeue; `removeFirst()` shifts the whole array on every call
+        while head < queue.count {
+            let (current, path) = queue[head]; head += 1
+            if path.count > maxDepth { continue }  // `path` already spans `maxDepth` hops
+            for (neighbor, _) in neighbors(of: current) where visited.insert(neighbor).inserted {
+                let newPath = path + [neighbor]
+                if neighbor == target { return newPath }
+                queue.append((neighbor, newPath))
+            }
+        }
+        return nil
+    }
+
+    /// Node count, edge count and average degree (`2E / n`, 0 for an empty graph); `maxDepth` is reported as 0.
+    public func stats() -> GraphStats {
+        let edges = relationshipCount
+        let nodes = entityCount
+        var meanDegree: Float = 0
+        if nodes > 0 { meanDegree = Float(2 * edges) / Float(nodes) }
+        return GraphStats(
+            nodeCount: nodes, edgeCount: edges, averageDegree: meanDegree, maxDepth: 0
+        )
+    }
+
+    // MARK: - Codable
+
+    // Top-level keys of the encoded form: the four ordered element arrays.
+    // The id and adjacency indexes have no key of their own.
+    private enum CodingKeys: String, CodingKey { case entities, relationships, documents, chunks }
+
+    public init(from decoder: Decoder) throws {
+        self.init()  // start empty; every index is rebuilt by the add… calls below
+        let keyed = try decoder.container(keyedBy: CodingKeys.self)
+        let entityList = try keyed.decode([Entity].self, forKey: .entities)
+        let documentList = try keyed.decode([Document].self, forKey: .documents)
+        let chunkList = try keyed.decode([TextChunk].self, forKey: .chunks)
+        let relationshipList = try keyed.decode([Relationship].self, forKey: .relationships)
+        for item in entityList { addEntity(item) }
+        for item in documentList { addDocument(item) }
+        for item in chunkList { addChunk(item) }
+        for item in relationshipList { addRelationship(item) }
+    }
+
+    public func encode(to encoder: Encoder) throws {  // writes the four ordered arrays only
+        var keyed = encoder.container(keyedBy: CodingKeys.self)
+        try keyed.encode(self.entities, forKey: .entities)
+        try keyed.encode(self.relationships, forKey: .relationships)
+        try keyed.encode(self.documents, forKey: .documents)
+        try keyed.encode(self.chunks, forKey: .chunks)
+    }
+
+    /// Serialize the graph to a JSON file; the write is atomic, so a failure cannot leave a partial file.
+    public func save(toJSON path: String) throws {
+        let encoder = JSONEncoder()
+        encoder.outputFormatting = [.prettyPrinted, .sortedKeys]
+        do {
+            let data = try encoder.encode(self)
+            try data.write(to: URL(fileURLWithPath: path), options: .atomic)
+        } catch let error as GraphRAGError {
+            throw error
+        } catch {  // encoding and file-system failures are both reported as `.io`
+            throw GraphRAGError.io(message: error.localizedDescription)
+        }
+    }
+
+    /// Read the JSON file at `path` and decode a graph from it; non-`GraphRAGError` failures surface as `.io`.
+    public static func load(fromJSON path: String) throws -> KnowledgeGraph {
+        let url = URL(fileURLWithPath: path)
+        do {
+            return try JSONDecoder().decode(KnowledgeGraph.self, from: Data(contentsOf: url))
+        } catch let known as GraphRAGError {
+            throw known
+        } catch {
+            throw GraphRAGError.io(message: error.localizedDescription)
+        }
+    }
+}
diff --git a/Sources/GraphRAG/Graph/PageRank.swift b/Sources/GraphRAG/Graph/PageRank.swift
new file mode 100644
index 0000000..f78622e
--- /dev/null
+++ b/Sources/GraphRAG/Graph/PageRank.swift
@@ -0,0 +1,88 @@
+// PageRank.swift
+// Ported from graphrag-rs `graph::pagerank`.
+
+import Foundation
+
+/// Weighted PageRank over the knowledge graph's directed relationships.
+public struct PageRank: Sendable {
+    /// Probability of following a link vs. teleporting (default 0.85).
+    public var dampingFactor: Double
+    /// Maximum power iterations (default 100).
+    public var maxIterations: Int
+    /// L-infinity convergence threshold (default 1e-6).
+    public var tolerance: Double
+
+    public init(dampingFactor: Double = 0.85, maxIterations: Int = 100, tolerance: Double = 1e-6) {
+        self.tolerance = tolerance  // stored as given; assignment order is irrelevant
+        self.maxIterations = maxIterations
+        self.dampingFactor = dampingFactor
+    }
+
+    /// Compute a PageRank score in `[0, 1]` for each entity (edges weighted by confidence). Scores sum to 1.
+    public func compute(_ graph: KnowledgeGraph) -> [EntityID: Double] {
+        let nodes = graph.entities.map(\.id)
+        let n = nodes.count
+        guard n > 0 else { return [:] }
+        if n == 1 { return [nodes[0]: 1.0] }
+
+        var indexOf: [EntityID: Int] = [:]  // entity id -> dense index into the arrays below
+        for (i, id) in nodes.enumerated() { indexOf[id] = i }
+
+        // Incoming contributions: for each target i, list of (source j, weight).
+        var incomingEdges: [[(source: Int, weight: Double)]] = Array(repeating: [], count: n)
+        var outWeight = [Double](repeating: 0, count: n)  // total outgoing weight per node
+        for rel in graph.relationships {
+            guard let s = indexOf[rel.source], let t = indexOf[rel.target] else { continue }  // skip dangling endpoints
+            let w = Double(max(rel.confidence, 0.0001))  // floor keeps every edge weight positive
+            incomingEdges[t].append((s, w))
+            outWeight[s] += w
+        }
+
+        let d = dampingFactor
+        let teleport = (1.0 - d) / Double(n)
+        var scores = [Double](repeating: 1.0 / Double(n), count: n)  // uniform start
+
+        for _ in 0..<maxIterations {
+            // Dangling-node mass: nodes with no out-edges redistribute uniformly.
+            var danglingMass = 0.0
+            for i in 0..<n where outWeight[i] == 0 { danglingMass += scores[i] }
+            let danglingShare = d * danglingMass / Double(n)
+
+            var next = [Double](repeating: teleport + danglingShare, count: n)
+            for i in 0..<n {
+                var sum = 0.0
+                for edge in incomingEdges[i] {
+                    sum += (edge.weight / outWeight[edge.source]) * scores[edge.source]  // source has out-weight > 0
+                }
+                next[i] += d * sum
+            }
+
+            var delta = 0.0  // largest per-node change in this iteration
+            for i in 0..<n { delta = max(delta, abs(next[i] - scores[i])) }
+            scores = next
+            if delta < tolerance { break }
+        }
+
+        // Normalize to a probability distribution.
+        let total = scores.reduce(0, +)
+        if total > 0 {
+            for i in 0..<n { scores[i] /= total }
+        }
+
+        var result: [EntityID: Double] = [:]
+        result.reserveCapacity(n)
+        for (i, id) in nodes.enumerated() { result[id] = scores[i] }
+        return result
+    }
+
+    /// Top-`k` entities by PageRank score, highest first; equal scores are ordered by raw id.
+    /// Returns an empty array when `k <= 0`.
+    public func topEntities(_ graph: KnowledgeGraph, k: Int) -> [(id: EntityID, score: Double)] {
+        // `prefix(_:)` traps on a negative length, so reject non-positive `k` up front.
+        guard k > 0 else { return [] }
+        let ranked = compute(graph).sorted { lhs, rhs in
+            lhs.value == rhs.value ? lhs.key.raw < rhs.key.raw : lhs.value > rhs.value
+        }
+        return ranked.prefix(k).map { (id: $0.key, score: $0.value) }
+    }
+}
diff --git a/Sources/GraphRAG/Graph/Traversal.swift b/Sources/GraphRAG/Graph/Traversal.swift
new file mode 100644
index 0000000..4853cee
--- /dev/null
+++ b/Sources/GraphRAG/Graph/Traversal.swift
@@ -0,0 +1,193 @@
+// Traversal.swift
+// Ported from graphrag-rs `graph::traversal`.
+
+import Foundation
+
+/// Settings read by `GraphTraversal`: depth and path limits plus the edge-strength filter.
+public struct TraversalConfig: Sendable {
+    public var maxDepth: Int  // defaults to 3
+    public var maxPaths: Int  // defaults to 100
+    public var useEdgeWeights: Bool  // defaults to true
+    public var minRelationshipStrength: Float  // defaults to 0.5
+
+    public init(
+        maxDepth: Int = 3,
+        maxPaths: Int = 100,
+        useEdgeWeights: Bool = true,
+        minRelationshipStrength: Float = 0.5
+    ) {
+        self.minRelationshipStrength = minRelationshipStrength
+        self.useEdgeWeights = useEdgeWeights
+        self.maxPaths = maxPaths
+        self.maxDepth = maxDepth
+    }
+}
+
+/// Outcome of a traversal: the entities found, the relationships recorded on the
+/// way, and the depth at which each entity was recorded.
+public struct TraversalResult: Sendable {
+    public var entities: [EntityID]  // in discovery order
+    public var relationships: [Relationship]
+    public var distances: [EntityID: Int]
+
+    public init(
+        entities: [EntityID] = [],
+        relationships: [Relationship] = [],
+        distances: [EntityID: Int] = [:]
+    ) {
+        self.distances = distances
+        self.relationships = relationships
+        self.entities = entities
+    }
+}
+
+/// Breadth-/depth-first traversal of the knowledge graph with edge-strength
+/// filtering.
+public struct GraphTraversal: Sendable {
+    public var config: TraversalConfig
+
+    public init(config: TraversalConfig = .init()) {  // default-constructed config when omitted
+        self.config = config
+    }
+
+    private func passesFilter(_ relationship: Relationship) -> Bool {  // every edge passes when weights are off
+        config.useEdgeWeights ? relationship.confidence >= config.minRelationshipStrength : true
+    }
+
+    /// Breadth-first search from one source; delegates to `multiSourceBFS` with a single seed.
+    public func bfs(_ graph: KnowledgeGraph, from source: EntityID) -> TraversalResult {
+        return multiSourceBFS(graph, from: [source])
+    }
+
+    /// Breadth-first search from multiple sources simultaneously.
+    ///
+    /// Sources that are not in the graph, or that repeat an earlier source, are
+    /// ignored; every accepted source starts at distance 0.
+    public func multiSourceBFS(_ graph: KnowledgeGraph, from sources: [EntityID]) -> TraversalResult {
+        var found = TraversalResult()
+        var seen: Set<EntityID> = []
+        // Seed phase.
+        for seed in sources where graph.contains(seed) && seen.insert(seed).inserted {
+            found.distances[seed] = 0
+            found.entities.append(seed)
+        }
+
+        // `found.entities` grows in discovery order, so it doubles as the BFS queue.
+        var cursor = 0
+        while cursor < found.entities.count {
+            let node = found.entities[cursor]
+            cursor += 1
+            let depth = found.distances[node] ?? 0
+            // Entities at the depth limit are kept but not expanded further.
+            guard depth < config.maxDepth else { continue }
+            for (neighbor, edge) in graph.neighbors(of: node) where passesFilter(edge) {
+                // Only the edge that first reaches a neighbour is recorded.
+                guard seen.insert(neighbor).inserted else { continue }
+                found.distances[neighbor] = depth + 1
+                found.entities.append(neighbor)
+                found.relationships.append(edge)
+            }
+        }
+        return found
+    }
+
+    /// Depth-first search from a single source; the result is empty when `source` is not in the graph.
+    public func dfs(_ graph: KnowledgeGraph, from source: EntityID) -> TraversalResult {
+        var found = TraversalResult()
+        var seen: Set<EntityID> = []
+        // The recursive helper does all the work once the start entity is known to exist.
+        if graph.contains(source) { dfsVisit(graph, current: source, depth: 0, visited: &seen, result: &found) }
+        return found
+    }
+
+    /// Recursive step of `dfs`: records `current` at `depth`, then descends over edges that pass the filter.
+    private func dfsVisit(
+        _ graph: KnowledgeGraph,
+        current: EntityID,
+        depth: Int,
+        visited: inout Set<EntityID>,
+        result: inout TraversalResult
+    ) {
+        if depth > config.maxDepth || visited.contains(current) { return }
+        visited.insert(current)
+        result.distances[current] = depth
+        result.entities.append(current)
+        guard depth < config.maxDepth else { return }  // never record an edge to an entity past the limit
+        for (neighbor, relationship) in graph.neighbors(of: current) {
+            guard passesFilter(relationship), !visited.contains(neighbor) else { continue }
+            result.relationships.append(relationship)
+            dfsVisit(graph, current: neighbor, depth: depth + 1, visited: &visited, result: &result)
+        }
+    }
+
+    /// k-hop ego network expanding layer by layer around `center`; each edge is reported from one endpoint only.
+    public func egoNetwork(_ graph: KnowledgeGraph, center: EntityID, hops: Int? = nil) -> TraversalResult {
+        let k = hops ?? config.maxDepth
+        var result = TraversalResult()
+        guard graph.contains(center) else { return result }
+        var visited: Set<EntityID> = [center]
+        var expanded: Set<EntityID> = []  // entities whose incident edges were already reported
+        result.distances[center] = 0
+        result.entities.append(center)
+        var currentLayer = [center]
+        var hop = 1
+        while hop <= k && !currentLayer.isEmpty {
+            var nextLayer: [EntityID] = []
+            for entity in currentLayer {
+                for (neighbor, relationship) in graph.neighbors(of: entity) {
+                    guard passesFilter(relationship), !expanded.contains(neighbor) else { continue }
+                    result.relationships.append(relationship)
+                    if visited.insert(neighbor).inserted {
+                        result.distances[neighbor] = hop
+                        result.entities.append(neighbor)
+                        nextLayer.append(neighbor)
+                    }
+                }
+                expanded.insert(entity)
+            }
+            currentLayer = nextLayer
+            hop += 1
+        }
+        return result
+    }
+
+    /// Enumerate simple paths from `source` to `target` of at most `config.maxDepth`
+    /// hops, stopping once `config.maxPaths` paths have been collected.
+    public func findAllPaths(_ graph: KnowledgeGraph, from source: EntityID, to target: EntityID) -> [[EntityID]] {
+        var found: [[EntityID]] = []
+        if !graph.contains(source) || !graph.contains(target) { return found }
+        var onPath: Set<EntityID> = []
+        var trail: [EntityID] = [source]
+        pathDFS(graph, current: source, target: target, remaining: config.maxDepth,
+                path: &trail, visited: &onPath, paths: &found)
+        return found
+    }
+
+    /// Backtracking step of `findAllPaths`: extends `path` from `current` with up to `remaining` more hops.
+    private func pathDFS(
+        _ graph: KnowledgeGraph,
+        current: EntityID,
+        target: EntityID,
+        remaining: Int,
+        path: inout [EntityID],
+        visited: inout Set<EntityID>,
+        paths: inout [[EntityID]]
+    ) {
+        guard paths.count < config.maxPaths else { return }
+        guard current != target else {
+            paths.append(path)
+            return
+        }
+        guard remaining != 0 else { return }
+        visited.insert(current)
+        // Un-mark on the way out so other branches may pass through `current` again.
+        defer { visited.remove(current) }
+        for (neighbor, edge) in graph.neighbors(of: current) where passesFilter(edge) {
+            if visited.contains(neighbor) { continue }
+            path.append(neighbor)
+            pathDFS(graph, current: neighbor, target: target, remaining: remaining - 1,
+                    path: &path, visited: &visited, paths: &paths)
+            path.removeLast()
+        }
+    }
+}
diff --git a/Sources/GraphRAG/GraphRAG.swift b/Sources/GraphRAG/GraphRAG.swift
index 08b22b8..87791e7 100644
--- a/Sources/GraphRAG/GraphRAG.swift
+++ b/Sources/GraphRAG/GraphRAG.swift
@@ -1,2 +1,40 @@
-// The Swift Programming Language
-// https://docs.swift.org/swift-book
+// GraphRAG.swift
+// Umbrella documentation for the GraphRAG Swift package — a port of the Rust
+// crate graphrag-rs (https://github.com/automataIA/graphrag-rs).
+//
+// GraphRAG builds a knowledge graph from documents and answers natural-language
+// questions using graph-based context retrieval.
+//
+// Quick start:
+// ```swift
+// import GraphRAG
+//
+// let rag = try GraphRAGBuilder()
+//     .withChunkSize(800)
+//     .withChunkOverlap(100)
+//     .withTopK(5)
+//     .build()
+//
+// await rag.addDocument(text: "Ada Lovelace worked with Charles Babbage ...")
+// try await rag.build()
+// let answer = try await rag.ask("Who did Ada Lovelace work with?")
+// print(answer.text)
+// ```
+//
+// The principal public entry points of this package are:
+//   - `GraphRAG`            the orchestrating actor (ingest → build → ask)
+//   - `GraphRAGBuilder`     fluent configuration
+//   - `Config`              tunable defaults
+//   - `KnowledgeGraph`      the entity/relationship graph + documents/chunks
+//   - `HybridRetriever`     BM25 + vector fusion retrieval
+//   - `PageRank`, `GraphTraversal`, `GraphAnalytics`  graph algorithms
+//
+// Pluggable backends conform to `EmbeddingModel`, `LanguageModel`, and
+// `EntityExtracting`. Offline defaults (`HashEmbedder`, `PatternEntityExtractor`)
+// require no network or model download; `OllamaClient` / `OllamaEmbedder` enable
+// local LLM-backed extraction and generation.
+
+/// Namespace holding the version string of this GraphRAG port.
+public enum GraphRAGVersion {
+    public static let current: String = "0.2.0"
+}
diff --git a/Sources/GraphRAG/GraphRAG/Builder.swift b/Sources/GraphRAG/GraphRAG/Builder.swift
new file mode 100644
index 0000000..87a64c8
--- /dev/null
+++ b/Sources/GraphRAG/GraphRAG/Builder.swift
@@ -0,0 +1,139 @@
+// Builder.swift
+// Ported from graphrag-rs `builder::mod` (the fluent GraphRAGBuilder).
+
+import Foundation
+
+/// Fluent builder for assembling a configured `GraphRAG` instance.
+///
+/// ```swift
+/// let rag = try GraphRAGBuilder()
+///     .withChunkSize(800)
+///     .withTopK(5)
+///     .build()
+/// ```
+public struct GraphRAGBuilder: Sendable {
+    private var config: Config
+    private var ollamaConfig: OllamaConfig
+    private var useOllamaChat: Bool = false
+    private var useOllamaEmbeddings: Bool = false
+
+    public init(config: Config = .default) {
+        self.config = config
+        self.ollamaConfig = OllamaConfig()
+    }
+
+    // MARK: - General config
+
+    public func withOutputDir(_ dir: String) -> Self {
+        var copy = self
+        copy.config.outputDir = dir
+        return copy
+    }
+
+    public func withChunkSize(_ size: Int) -> Self {
+        var copy = self
+        copy.config.chunkSize = size
+        copy.config.text.chunkSize = size
+        return copy
+    }
+
+    public func withChunkOverlap(_ overlap: Int) -> Self {
+        var copy = self
+        copy.config.chunkOverlap = overlap
+        copy.config.text.overlap = overlap
+        return copy
+    }
+
+    public func withTopK(_ k: Int) -> Self {
+        var copy = self
+        copy.config.topKResults = k
+        copy.config.retrieval.topK = k
+        return copy
+    }
+
+    public func withSimilarityThreshold(_ threshold: Float) -> Self {
+        var copy = self
+        copy.config.similarityThreshold = threshold
+        copy.config.retrieval.similarityThreshold = threshold
+        return copy
+    }
+
+    public func withApproach(_ approach: String) -> Self {
+        var copy = self
+        copy.config.approach = approach
+        return copy
+    }
+
+    public func withEmbeddingDimension(_ dimension: Int) -> Self {
+        var copy = self
+        copy.config.embedding.dimension = dimension
+        return copy
+    }
+
+    // MARK: - Backend selection
+
+    /// Use the offline, deterministic hash embedder (the default).
+    public func withHashEmbeddings() -> Self {
+        var copy = self
+        copy.config.embedding.backend = "hash"
+        copy.useOllamaEmbeddings = false
+        return copy
+    }
+
+    /// Enable a local Ollama chat model (also used for LLM-based extraction).
+    public func withOllama(
+        host: String = "http://localhost", port: Int = 11434, chatModel: String = "llama3.2:3b"
+    ) -> Self {
+        var copy = self
+        copy.ollamaConfig.host = host
+        copy.ollamaConfig.port = port
+        copy.ollamaConfig.chatModel = chatModel
+        copy.useOllamaChat = true
+        return copy
+    }
+
+    /// Use Ollama for embeddings instead of the hash embedder; `dimension` is copied into both configs.
+    public func withOllamaEmbeddings(model: String = "nomic-embed-text", dimension: Int = 1024) -> Self {
+        var copy = self
+        copy.ollamaConfig.embeddingModel = model
+        copy.ollamaConfig.embeddingDimension = dimension  // NOTE(review): nomic-embed-text is documented as 768-d; confirm the 1024 default.
+        copy.config.embedding.backend = "ollama"
+        copy.config.embedding.dimension = dimension
+        copy.useOllamaEmbeddings = true  // `build()` picks the embedder from this flag, not from `backend`.
+        return copy
+    }
+
+    /// Preconfigure for a fully local Ollama setup (chat + embeddings).
+    public func withLocalDefaults() -> Self {
+        self.withOllama().withOllamaEmbeddings()
+    }
+
+    public func withConfig(_ config: Config) -> Self {
+        var copy = self
+        copy.config = config
+        return copy
+    }
+
+    // MARK: - Build
+
+    /// Construct the configured `GraphRAG` engine.
+    public func build() throws -> GraphRAG {
+        let embedder: any EmbeddingModel =
+            useOllamaEmbeddings
+            ? OllamaEmbedder(config: ollamaConfig)
+            : HashEmbedder(dimension: config.embedding.dimension)
+
+        let languageModel: (any LanguageModel)? =
+            useOllamaChat ? OllamaClient(config: ollamaConfig) : nil
+
+        let extractor: any EntityExtracting
+        if useOllamaChat {
+            extractor = LLMEntityExtractor(model: OllamaClient(config: ollamaConfig))
+        } else {
+            extractor = PatternEntityExtractor(minConfidence: config.entity.minConfidence)
+        }
+
+        return try GraphRAG(
+            config: config, embedder: embedder, languageModel: languageModel, extractor: extractor)
+    }
+}
diff --git a/Sources/GraphRAG/GraphRAG/Config.swift b/Sources/GraphRAG/GraphRAG/Config.swift
new file mode 100644
index 0000000..7847110
--- /dev/null
+++ b/Sources/GraphRAG/GraphRAG/Config.swift
@@ -0,0 +1,95 @@
+// Config.swift
+// Ported from graphrag-rs `config::mod`. Defaults mirror the Rust crate.
+
+import Foundation
+
+public struct EmbeddingConfig: Sendable {
+    public var dimension: Int
+    /// "hash" (offline, deterministic) or "ollama".
+    public var backend: String
+
+    public init(dimension: Int = 384, backend: String = "hash") {
+        self.dimension = dimension
+        self.backend = backend
+    }
+}
+
+public struct GraphConfig: Sendable {
+    public var maxConnections: Int
+    public var threshold: Float
+
+    public init(maxConnections: Int = 10, threshold: Float = 0.8) {
+        self.maxConnections = maxConnections
+        self.threshold = threshold
+    }
+}
+
+public struct TextConfig: Sendable {
+    public var chunkSize: Int
+    public var overlap: Int
+    public var languages: [String]
+
+    public init(chunkSize: Int = 1000, overlap: Int = 200, languages: [String] = ["en"]) {
+        self.chunkSize = chunkSize
+        self.overlap = overlap
+        self.languages = languages
+    }
+}
+
+public struct EntityConfig: Sendable {
+    public var minConfidence: Float
+    public var extractRelationships: Bool
+
+    public init(minConfidence: Float = 0.7, extractRelationships: Bool = true) {
+        self.minConfidence = minConfidence
+        self.extractRelationships = extractRelationships
+    }
+}
+
+/// Top-level GraphRAG configuration.
+public struct Config: Sendable {
+    public var outputDir: String
+    public var chunkSize: Int
+    public var chunkOverlap: Int
+    public var maxEntitiesPerChunk: Int
+    public var topKResults: Int
+    public var similarityThreshold: Float
+    /// "semantic", "keyword", or "hybrid".
+    public var approach: String
+
+    public var embedding: EmbeddingConfig
+    public var graph: GraphConfig
+    public var text: TextConfig
+    public var entity: EntityConfig
+    public var retrieval: RetrievalConfig
+
+    public init(
+        outputDir: String = "./output",
+        chunkSize: Int = 1000,
+        chunkOverlap: Int = 200,
+        maxEntitiesPerChunk: Int = 10,
+        topKResults: Int = 10,
+        similarityThreshold: Float = 0.8,
+        approach: String = "hybrid",
+        embedding: EmbeddingConfig = EmbeddingConfig(),
+        graph: GraphConfig = GraphConfig(),
+        text: TextConfig = TextConfig(),
+        entity: EntityConfig = EntityConfig(),
+        retrieval: RetrievalConfig = RetrievalConfig()
+    ) {
+        self.outputDir = outputDir
+        self.chunkSize = chunkSize
+        self.chunkOverlap = chunkOverlap
+        self.maxEntitiesPerChunk = maxEntitiesPerChunk
+        self.topKResults = topKResults
+        self.similarityThreshold = similarityThreshold
+        self.approach = approach
+        self.embedding = embedding
+        self.graph = graph
+        self.text = text
+        self.entity = entity
+        self.retrieval = retrieval
+    }
+
+    public static let `default` = Config()
+}
diff --git a/Sources/GraphRAG/GraphRAG/Engine.swift b/Sources/GraphRAG/GraphRAG/Engine.swift
new file mode 100644
index 0000000..8b81ecb
--- /dev/null
+++ b/Sources/GraphRAG/GraphRAG/Engine.swift
@@ -0,0 +1,180 @@
+// Engine.swift
+// Ported from graphrag-rs `graphrag::mod` / `build` / `ask`.
+//
+// `GraphRAG` is the high-level orchestrator. It is an `actor` so its mutable
+// graph/index state is safe to share across tasks. Pluggable backends (embedder,
+// optional LLM, entity extractor) are injected as existentials.
+
+import Foundation
+
+public actor GraphRAG {
+    public let config: Config
+
+    private var graph: KnowledgeGraph
+    private let embedder: any EmbeddingModel
+    private let languageModel: (any LanguageModel)?
+    private let extractor: any EntityExtracting
+    private let textProcessor: TextProcessor
+    private var retriever: HybridRetriever
+    private var isBuilt: Bool = false
+
+    /// Designated initializer.
+    public init(
+        config: Config = .default,
+        embedder: (any EmbeddingModel)? = nil,
+        languageModel: (any LanguageModel)? = nil,
+        extractor: (any EntityExtracting)? = nil
+    ) throws {
+        self.config = config
+        self.graph = KnowledgeGraph()
+        self.embedder = embedder ?? HashEmbedder(dimension: config.embedding.dimension)
+        self.languageModel = languageModel
+        self.extractor = extractor ?? PatternEntityExtractor(minConfidence: config.entity.minConfidence)
+        self.textProcessor = try TextProcessor(
+            chunkSize: config.chunkSize, chunkOverlap: config.chunkOverlap)
+        self.retriever = HybridRetriever(
+            config: HybridConfig(maxCandidates: max(100, config.topKResults * 10)))
+    }
+
+    // MARK: - Ingestion
+
+    /// Add raw text as a new document (auto-titled, UUID id) and chunk it.
+    @discardableResult
+    public func addDocument(text: String, title: String? = nil) -> DocumentID {
+        let id = DocumentID(UUID().uuidString)
+        let document = Document(
+            id: id, title: title ?? "Document \(graph.documentCount + 1)", content: text)
+        addDocument(document)
+        return id
+    }
+
+    /// Add a pre-built document, chunking it if it has no chunks yet.
+    public func addDocument(_ document: Document) {
+        var doc = document
+        if doc.chunks.isEmpty {
+            doc.chunks = textProcessor.chunk(doc)
+        }
+        graph.addDocument(doc)
+        for chunk in doc.chunks { graph.addChunk(chunk) }
+        isBuilt = false
+    }
+
+    // MARK: - Build
+
+    /// Run the full indexing pipeline: extract entities/relationships, embed chunks,
+    /// and build the retrieval index. Throws `.noDocuments` when nothing was added.
+    public func build() async throws {
+        guard graph.documentCount > 0 else { throw GraphRAGError.noDocuments }
+        graph.clearEntitiesAndRelationships()
+
+        // Stage 1: entity & relationship extraction per chunk.
+        for chunk in graph.chunks {
+            let (entities, relationships) = try await extractor.extract(from: chunk)
+            for entity in entities { graph.addEntity(entity) }
+            if config.entity.extractRelationships {
+                for relationship in relationships { graph.addRelationship(relationship) }
+            }
+            // Always rewrite the chunk's entity list, even when it is empty, so
+            // that IDs recorded by an earlier build (whose entities were cleared
+            // above) cannot linger on a chunk that now yields nothing.
+            var updated = chunk
+            updated.entities = entities.map(\.id)
+            graph.addChunk(updated)
+        }
+
+        // Stage 2: embed chunks.
+        for chunk in graph.chunks {
+            let embedding = try await embedder.embed(chunk.content)
+            var updated = chunk
+            updated.embedding = embedding
+            graph.addChunk(updated)
+        }
+
+        // Stage 3: build the hybrid retrieval index.
+        retriever.clear()
+        retriever.index(graph: graph)
+
+        isBuilt = true
+    }
+
+    // MARK: - Query
+
+    /// Answer a natural-language question over the indexed corpus.
+    public func ask(_ query: String) async throws -> Answer {
+        guard isBuilt else { throw GraphRAGError.notInitialized }
+
+        let queryEmbedding = try await embedder.embed(query)
+        let results = retriever.search(
+            query: query, queryEmbedding: queryEmbedding, limit: config.topKResults)
+
+        guard !results.isEmpty else {
+            return Answer(
+                text: "I don't have enough information to answer this question.",
+                confidence: 0)
+        }
+
+        let context = assembleContext(results)
+        let sources = results.map { ChunkID($0.id) }
+        let confidence = min(1.0, Float(results.count) / Float(max(1, config.topKResults)))
+
+        // If an LLM is configured, synthesize a natural-language answer.
+        if let languageModel, await languageModel.isAvailable() {
+            let prompt = Prompts.fill(
+                Prompts.answerGeneration, ["context": context, "query": query])
+            let raw = try await languageModel.complete(prompt)
+            return Answer(
+                text: GraphRAG.stripThinkingTags(raw), confidence: confidence, sources: sources)
+        }
+
+        // Otherwise return an extractive summary of the top chunks.
+        let extractive = results.prefix(3).map(\.content).joined(separator: "\n\n")
+        return Answer(
+            text: "Based on the retrieved context:\n\n\(extractive)",
+            confidence: confidence, sources: sources)
+    }
+
+    /// Hybrid search without answer synthesis.
+    public func search(_ query: String, limit: Int? = nil) async throws -> [HybridSearchResult] {
+        guard isBuilt else { throw GraphRAGError.notInitialized }
+        let queryEmbedding = try await embedder.embed(query)
+        return retriever.search(
+            query: query, queryEmbedding: queryEmbedding, limit: limit ?? config.topKResults)
+    }
+
+    // MARK: - Introspection
+
+    public func stats() -> Stats {
+        Stats(
+            documentCount: graph.documentCount,
+            chunkCount: graph.chunkCount,
+            entityCount: graph.entityCount,
+            relationshipCount: graph.relationshipCount)
+    }
+
+    /// Direct access to the underlying knowledge graph (a value-type snapshot).
+    public func knowledgeGraph() -> KnowledgeGraph { graph }
+
+    /// Persist the knowledge graph to JSON.
+    public func save(toJSON path: String) throws { try graph.save(toJSON: path) }
+
+    // MARK: - Helpers
+
+    private func assembleContext(_ results: [HybridSearchResult]) -> String {
+        results.map { result in
+            let score = String(format: "%.3f", result.score)
+            return "[Chunk | Relevance: \(score)]\n\(result.content)"
+        }.joined(separator: "\n\n---\n\n")
+    }
+
+    /// Remove `<think>...</think>` blocks emitted by some reasoning models.
+    static func stripThinkingTags(_ text: String) -> String {
+        var result = text
+        // Pair each opener with the first closer after it; a stray earlier `</think>` is ignored.
+        while let open = result.range(of: "<think>"),
+            let close = result.range(of: "</think>", range: open.upperBound..<result.endIndex)
+        {
+            result.removeSubrange(open.lowerBound..<close.upperBound)
+        }
+        return result.trimmingCharacters(in: .whitespacesAndNewlines)
+    }
+}
diff --git a/Sources/GraphRAG/Retrieval/BM25.swift b/Sources/GraphRAG/Retrieval/BM25.swift
new file mode 100644
index 0000000..3cf8b60
--- /dev/null
+++ b/Sources/GraphRAG/Retrieval/BM25.swift
@@ -0,0 +1,137 @@
+// BM25.swift
+// Ported from graphrag-rs `retrieval::bm25`.
+
+import Foundation
+
+/// A single BM25 hit.
+public struct BM25Result: Sendable, Equatable {
+    public var id: String
+    public var score: Float
+    public var content: String
+
+    public init(id: String, score: Float, content: String) {
+        self.id = id
+        self.score = score
+        self.content = content
+    }
+}
+
+/// Okapi BM25 keyword retriever over an in-memory document collection.
+///
+/// Matches the Rust implementation: term frequency is normalized by document
+/// length, IDF is `log(N / df) + 1`, with `k1 = 1.2` and `b = 0.75`.
+public struct BM25Retriever: Sendable {
+    public let k1: Float
+    public let b: Float
+
+    private struct Entry {
+        var content: String
+        var length: Int
+        var termCounts: [String: Int]
+    }
+
+    private var entries: [String: Entry] = [:]
+    private var order: [String] = []
+    private var documentFrequency: [String: Int] = [:]
+    private var totalLength: Int = 0
+
+    public init(k1: Float = 1.2, b: Float = 0.75) {
+        self.k1 = k1
+        self.b = b
+    }
+
+    public var documentCount: Int { entries.count }
+    public var termCount: Int { documentFrequency.count }
+    public var averageDocumentLength: Float {
+        entries.isEmpty ? 0 : Float(totalLength) / Float(entries.count)
+    }
+
+    /// Index a document under `id` with the given `content`.
+    public mutating func index(id: String, content: String) {
+        if entries[id] != nil { remove(id: id) }
+
+        let tokens = BM25Retriever.tokenize(content)
+        var counts: [String: Int] = [:]
+        for token in tokens { counts[token, default: 0] += 1 }
+
+        for term in counts.keys { documentFrequency[term, default: 0] += 1 }
+
+        let entry = Entry(content: content, length: tokens.count, termCounts: counts)
+        entries[id] = entry
+        order.append(id)
+        totalLength += tokens.count
+    }
+
+    /// Remove a previously indexed document.
+    @discardableResult
+    public mutating func remove(id: String) -> Bool {
+        guard let entry = entries.removeValue(forKey: id) else { return false }
+        order.removeAll { $0 == id }
+        totalLength -= entry.length
+        for term in entry.termCounts.keys {
+            if let df = documentFrequency[term] {
+                if df <= 1 { documentFrequency.removeValue(forKey: term) }
+                else { documentFrequency[term] = df - 1 }
+            }
+        }
+        return true
+    }
+
+    public mutating func clear() {
+        entries.removeAll()
+        order.removeAll()
+        documentFrequency.removeAll()
+        totalLength = 0
+    }
+
+    public func content(for id: String) -> String? { entries[id]?.content }
+
+    /// Score and rank documents against `query`, returning the top `limit` (ties ordered by id).
+    public func search(_ query: String, limit: Int) -> [BM25Result] {
+        guard !entries.isEmpty, limit > 0 else { return [] }
+        // Sorted because `Set` order varies per process and Float sums are order-sensitive.
+        let queryTerms = Set(BM25Retriever.tokenize(query)).sorted()
+        guard !queryTerms.isEmpty else { return [] }
+        let n = Float(entries.count)
+        let avgdl = averageDocumentLength
+
+        var results: [BM25Result] = []
+        for id in order {
+            guard let entry = entries[id] else { continue }
+            var score: Float = 0
+            for term in queryTerms {
+                guard let rawCount = entry.termCounts[term], rawCount > 0 else { continue }
+                let df = Float(documentFrequency[term] ?? 1)
+                let idf = log(n / df) + 1.0
+                let tf = Float(rawCount) / Float(max(entry.length, 1))
+                let denom = tf + k1 * (1 - b + b * (Float(entry.length) / max(avgdl, 1)))
+                score += idf * (tf * (k1 + 1)) / max(denom, 0.0001)
+            }
+            if score > 0 {
+                results.append(BM25Result(id: id, score: score, content: entry.content))
+            }
+        }
+
+        results.sort { lhs, rhs in
+            if lhs.score == rhs.score { return lhs.id < rhs.id }
+            return lhs.score > rhs.score
+        }
+        return Array(results.prefix(limit))
+    }
+
+    // MARK: - Tokenization
+
+    static func tokenize(_ text: String) -> [String] {
+        text.split(whereSeparator: \.isWhitespace).compactMap { word -> String? in
+            // Keep letters and digits only, lowercasing one character at a time.
+            let normalized = word
+                .filter { $0.isLetter || $0.isNumber }
+                .map { $0.lowercased() }
+                .joined()
+            // Drop tokens of two characters or fewer, then stopwords.
+            guard normalized.count > 2 else { return nil }
+            guard !TfIdfKeywordExtractor.defaultStopwords.contains(normalized) else { return nil }
+            return normalized
+        }
+    }
+}
diff --git a/Sources/GraphRAG/Retrieval/Hybrid.swift b/Sources/GraphRAG/Retrieval/Hybrid.swift
new file mode 100644
index 0000000..e42b4dc
--- /dev/null
+++ b/Sources/GraphRAG/Retrieval/Hybrid.swift
@@ -0,0 +1,207 @@
+// Hybrid.swift
+// Ported from graphrag-rs `retrieval::hybrid`.
+
+import Foundation
+
+/// Strategy used to merge ranked lists from different retrievers.
+public enum FusionMethod: Sendable, Equatable {
+    /// Reciprocal Rank Fusion (default).
+    case rrf
+    /// Weighted sum of max-normalized scores.
+    case weighted
+    /// Raw sum of scores.
+    case combSum
+    /// Maximum of the per-method scores.
+    case maxScore
+}
+
+/// Configuration for `HybridRetriever`.
+public struct HybridConfig: Sendable {
+    public var semanticWeight: Float
+    public var keywordWeight: Float
+    public var fusionMethod: FusionMethod
+    public var rrfK: Float
+    public var maxCandidates: Int
+    public var minScoreThreshold: Float
+
+    public init(
+        semanticWeight: Float = 0.7,
+        keywordWeight: Float = 0.3,
+        fusionMethod: FusionMethod = .rrf,
+        rrfK: Float = 60.0,
+        maxCandidates: Int = 100,
+        minScoreThreshold: Float = 0.1
+    ) {
+        self.semanticWeight = semanticWeight
+        self.keywordWeight = keywordWeight
+        self.fusionMethod = fusionMethod
+        self.rrfK = rrfK
+        self.maxCandidates = maxCandidates
+        self.minScoreThreshold = minScoreThreshold
+    }
+}
+
+/// A fused search hit combining keyword and semantic signals.
+public struct HybridSearchResult: Sendable, Equatable {
+    public var id: String
+    public var content: String
+    public var score: Float
+    public var semanticScore: Float
+    public var keywordScore: Float
+
+    public init(id: String, content: String, score: Float, semanticScore: Float, keywordScore: Float) {
+        self.id = id
+        self.content = content
+        self.score = score
+        self.semanticScore = semanticScore
+        self.keywordScore = keywordScore
+    }
+}
+
+/// Combines BM25 keyword search with cosine vector search over a chunk corpus.
+public struct HybridRetriever: Sendable {
+    public var config: HybridConfig
+    private var bm25: BM25Retriever
+    private var vectors: InMemoryVectorStore
+    private var contents: [String: String] = [:]
+
+    public init(config: HybridConfig = HybridConfig()) {
+        self.config = config
+        self.bm25 = BM25Retriever()
+        self.vectors = InMemoryVectorStore()
+    }
+
+    public var isInitialized: Bool { !contents.isEmpty }
+    public var documentCount: Int { contents.count }
+
+    /// Index a chunk for keyword search, and for semantic search if it carries an
+    /// embedding. Re-indexing an id without one also drops its stale vector.
+    public mutating func index(id: String, content: String, embedding: [Float]?) {
+        contents[id] = content
+        bm25.index(id: id, content: content)
+        if let embedding { vectors.add(id: id, vector: embedding) } else { vectors.remove(id: id) }
+    }
+
+    /// Index all chunks of a knowledge graph.
+    public mutating func index(graph: KnowledgeGraph) {
+        for chunk in graph.chunks {
+            index(id: chunk.id.raw, content: chunk.content, embedding: chunk.embedding)
+        }
+    }
+
+    public mutating func clear() {
+        bm25.clear()
+        vectors.clear()
+        contents.removeAll()
+    }
+
+    /// Run both retrievers and fuse the results.
+    ///
+    /// - Parameters:
+    ///   - query: The raw query text (for BM25).
+    ///   - queryEmbedding: Optional query vector (for semantic search).
+    ///   - limit: Number of fused results to return; non-positive yields `[]`.
+    public func search(query: String, queryEmbedding: [Float]?, limit: Int) -> [HybridSearchResult] {
+        // `prefix(_:)` traps on a negative length, so reject bad limits up front
+        // (mirrors the `limit > 0` guard in `BM25Retriever.search`).
+        guard limit > 0 else { return [] }
+        let keyword = bm25.search(query, limit: config.maxCandidates)
+            .map { (id: $0.id, score: $0.score) }
+        let semantic: [(id: String, score: Float)] =
+            queryEmbedding.map { vectors.search($0, k: config.maxCandidates) } ?? []
+
+        let fused = fuse(semantic: semantic, keyword: keyword)
+        // RRF scores are rank-based and inherently small (≈ 1/(k+rank)); the
+        // absolute `minScoreThreshold` only makes sense for magnitude-based
+        // fusion (weighted / CombSUM / MaxScore).
+        let applyThreshold = config.fusionMethod != .rrf
+        let kept = fused.filter { !applyThreshold || $0.score >= config.minScoreThreshold }
+        return Array(kept.prefix(limit))
+    }
+
+    // MARK: - Fusion
+
+    private func fuse(
+        semantic: [(id: String, score: Float)],
+        keyword: [(id: String, score: Float)]
+    ) -> [HybridSearchResult] {
+        var semScore: [String: Float] = [:]
+        var kwScore: [String: Float] = [:]
+        var semRank: [String: Int] = [:]
+        var kwRank: [String: Int] = [:]
+        for (rank, item) in semantic.enumerated() {
+            semScore[item.id] = item.score
+            semRank[item.id] = rank
+        }
+        for (rank, item) in keyword.enumerated() {
+            kwScore[item.id] = item.score
+            kwRank[item.id] = rank
+        }
+
+        let maxSem = semantic.map(\.score).max() ?? 0
+        let maxKw = keyword.map(\.score).max() ?? 0
+        let allIDs = Set(semScore.keys).union(kwScore.keys)
+
+        var results: [HybridSearchResult] = []
+        for id in allIDs {
+            let sem = semScore[id] ?? 0
+            let kw = kwScore[id] ?? 0
+            let combined: Float
+            switch config.fusionMethod {
+            case .rrf:
+                var s: Float = 0
+                if let r = semRank[id] {
+                    s += (1.0 / (config.rrfK + Float(r) + 1.0)) * config.semanticWeight
+                }
+                if let r = kwRank[id] {
+                    s += (1.0 / (config.rrfK + Float(r) + 1.0)) * config.keywordWeight
+                }
+                combined = s
+            case .weighted:
+                let nSem = maxSem > 0 ? sem / maxSem : 0
+                let nKw = maxKw > 0 ? kw / maxKw : 0
+                combined = nSem * config.semanticWeight + nKw * config.keywordWeight
+            case .combSum:
+                combined = sem + kw
+            case .maxScore:
+                combined = max(sem, kw)
+            }
+            results.append(
+                HybridSearchResult(
+                    id: id, content: contents[id] ?? "",
+                    score: combined, semanticScore: sem, keywordScore: kw))
+        }
+
+        results.sort { lhs, rhs in
+            if lhs.score == rhs.score { return lhs.id < rhs.id }
+            return lhs.score > rhs.score
+        }
+        return results
+    }
+}
+
+/// Retrieval-tuning knobs mirroring the Rust `RetrievalConfig`.
+public struct RetrievalConfig: Sendable {
+    public var topK: Int
+    public var similarityThreshold: Float
+    public var maxExpansionDepth: Int
+    public var entityWeight: Float
+    public var chunkWeight: Float
+    public var graphWeight: Float
+
+    public init(
+        topK: Int = 10,
+        similarityThreshold: Float = 0.7,
+        maxExpansionDepth: Int = 2,
+        entityWeight: Float = 0.4,
+        chunkWeight: Float = 0.4,
+        graphWeight: Float = 0.2
+    ) {
+        self.topK = topK
+        self.similarityThreshold = similarityThreshold
+        self.maxExpansionDepth = maxExpansionDepth
+        self.entityWeight = entityWeight
+        self.chunkWeight = chunkWeight
+        self.graphWeight = graphWeight
+    }
+}
diff --git a/Sources/GraphRAG/Retrieval/VectorStore.swift b/Sources/GraphRAG/Retrieval/VectorStore.swift
new file mode 100644
index 0000000..639f24f
--- /dev/null
+++ b/Sources/GraphRAG/Retrieval/VectorStore.swift
@@ -0,0 +1,84 @@
+// VectorStore.swift
+// Ported from graphrag-rs `storage` in-memory vector store.
+
+import Foundation
+
+/// Cosine of the angle between `a` and `b`. Yields 0 when the inputs are empty,
+/// differ in length, or either one has zero magnitude.
+public func cosineSimilarity(_ a: [Float], _ b: [Float]) -> Float {
+    guard a.count == b.count, !a.isEmpty else { return 0 }
+    var dotProduct: Float = 0
+    var squaredA: Float = 0
+    var squaredB: Float = 0
+    for (x, y) in zip(a, b) {
+        dotProduct += x * y
+        squaredA += x * x
+        squaredB += y * y
+    }
+    let magnitude = squaredA.squareRoot() * squaredB.squareRoot()
+    return magnitude > 0 ? dotProduct / magnitude : 0
+}
+
+/// A brute-force, cosine-similarity in-memory vector store.
+public struct InMemoryVectorStore: Sendable {
+    private var vectors: [String: [Float]] = [:]
+    private var order: [String] = []
+
+    public init() {}
+
+    public var count: Int { vectors.count }
+    public var isEmpty: Bool { vectors.isEmpty }
+    public var ids: [String] { order }
+    public var dimension: Int? { order.first.flatMap { vectors[$0]?.count } }
+
+    public func contains(_ id: String) -> Bool { vectors[id] != nil }
+    public func embedding(for id: String) -> [Float]? { vectors[id] }
+
+    /// Insert or replace a vector.
+    public mutating func add(id: String, vector: [Float]) {
+        if vectors[id] == nil { order.append(id) }
+        vectors[id] = vector
+    }
+
+    public mutating func addBatch(_ items: [(id: String, vector: [Float])]) {
+        for item in items { add(id: item.id, vector: item.vector) }
+    }
+
+    @discardableResult
+    public mutating func remove(id: String) -> Bool {
+        guard vectors.removeValue(forKey: id) != nil else { return false }
+        order.removeAll { $0 == id }
+        return true
+    }
+
+    public mutating func clear() {
+        vectors.removeAll()
+        order.removeAll()
+    }
+
+    /// Top-`k` ids by descending cosine similarity to `query`.
+    public func search(_ query: [Float], k: Int) -> [(id: String, score: Float)] {
+        guard !vectors.isEmpty, k > 0 else { return [] }
+        var scored: [(id: String, score: Float)] = []
+        scored.reserveCapacity(order.count)
+        for id in order {
+            guard let v = vectors[id] else { continue }
+            scored.append((id, cosineSimilarity(query, v)))
+        }
+        scored.sort { lhs, rhs in
+            if lhs.score == rhs.score { return lhs.id < rhs.id }
+            return lhs.score > rhs.score
+        }
+        return Array(scored.prefix(k))
+    }
+
+    /// Like `search`, but discards results below `threshold`.
+    public func search(_ query: [Float], k: Int, threshold: Float) -> [(id: String, score: Float)] {
+        search(query, k: k).filter { $0.score >= threshold }
+    }
+
+    /// All vectors whose similarity to `query` is at least `threshold`.
+    public func findSimilar(_ query: [Float], threshold: Float) -> [(id: String, score: Float)] {
+        search(query, k: order.count, threshold: threshold)
+    }
+}
diff --git a/Sources/GraphRAG/Text/Chunking.swift b/Sources/GraphRAG/Text/Chunking.swift
new file mode 100644
index 0000000..287ff75
--- /dev/null
+++ b/Sources/GraphRAG/Text/Chunking.swift
@@ -0,0 +1,237 @@
+// Chunking.swift
+// Ported from graphrag-rs `text::chunking` (HierarchicalChunker) and the
+// `TextProcessor` API in `text::mod`.
+//
+// The Rust implementation works on UTF-8 byte indices and guards every slice
+// with `is_char_boundary`. Swift's `Character` (extended grapheme cluster) is
+// always a valid boundary, so this port operates over a `[Character]` array and
+// measures sizes/offsets in characters. For typical text this matches the byte
+// behaviour while remaining Unicode-safe by construction.
+
+import Foundation
+
+/// A chunk's content together with its character offsets in the source text.
+public struct ChunkSpan: Sendable, Equatable {
+    public var content: String  // text of the chunk
+    public var startOffset: Int  // offset of the chunk's first character
+    public var endOffset: Int  // `HierarchicalChunker` stores an exclusive end (one past the last character)
+
+    public init(content: String, startOffset: Int, endOffset: Int) {
+        self.content = content
+        self.startOffset = startOffset
+        self.endOffset = endOffset
+    }
+}
+
+/// Recursive, separator-aware chunker.
+///
+/// Splits on a hierarchy of separators (paragraph → line → sentence → clause →
+/// word), preferring the "highest" separator that yields a boundary past the
+/// first quarter of the window. Sizes and offsets are measured in `Character`s.
+public struct HierarchicalChunker: Sendable {
+    /// Ordered, most-significant-first list of separators.
+    public var separators: [String]
+    /// Chunks whose trimmed length is below this are discarded.
+    public var minChunkSize: Int
+
+    /// Paragraph, line, sentence, clause and word separators; `""` is skipped when matching.
+    public static let defaultSeparators: [String] = [
+        "\n\n", "\n", ". ", "! ", "? ", "; ", ": ", " ", "",
+    ]
+
+    public init(separators: [String] = HierarchicalChunker.defaultSeparators, minChunkSize: Int = 50) {
+        self.separators = separators
+        self.minChunkSize = minChunkSize
+    }
+
+    /// Copy of this chunker that uses `separators` instead.
+    public func withSeparators(_ separators: [String]) -> HierarchicalChunker {
+        HierarchicalChunker(separators: separators, minChunkSize: minChunkSize)
+    }
+
+    /// Copy of this chunker that uses `size` as its minimum chunk size.
+    public func withMinSize(_ size: Int) -> HierarchicalChunker {
+        HierarchicalChunker(separators: separators, minChunkSize: size)
+    }
+
+    /// Split `text` into chunk strings of approximately `chunkSize` characters,
+    /// overlapping consecutive chunks by up to `overlap` characters.
+    public func chunkText(_ text: String, chunkSize: Int, overlap: Int) -> [String] {
+        chunkSpans(text, chunkSize: chunkSize, overlap: overlap).map(\.content)
+    }
+
+    /// Like `chunkText` but also returns character offsets for each chunk.
+    /// Returns `[]` for empty text or a non-positive `chunkSize`. A negative
+    /// `overlap` is treated as 0; otherwise the next chunk would start past the
+    /// previous chunk's end and the text in between would be silently skipped.
+    public func chunkSpans(_ text: String, chunkSize: Int, overlap: Int) -> [ChunkSpan] {
+        let chars = Array(text)
+        let n = chars.count
+        guard n > 0, chunkSize > 0 else { return [] }
+        let overlap = max(0, overlap)
+
+        var spans: [ChunkSpan] = []
+        var start = 0
+
+        while start < n {
+            var end = min(start + chunkSize, n)
+
+            // Final chunk: take the remainder (kept even when short if it is the only chunk).
+            if end >= n {
+                let slice = chars[start..<n]
+                if trimmedCount(slice) >= minChunkSize || spans.isEmpty {
+                    spans.append(makeSpan(slice, start: start, end: n))
+                }
+                break
+            }
+
+            let optimalEnd = findOptimalBoundary(chars, start: start, maxEnd: end)
+            if optimalEnd > start { end = optimalEnd }
+
+            let slice = chars[start..<end]
+            if trimmedCount(slice) >= minChunkSize {
+                spans.append(makeSpan(slice, start: start, end: end))
+            }
+
+            // Advance with overlap, snapped back to a word boundary inside this chunk.
+            let nextStart = findWordBoundaryBackward(chars, pos: end - overlap, lowerBound: start)
+            // Guarantee forward progress when no boundary lies after `start`.
+            start = nextStart > start ? nextStart : end
+        }
+
+        return spans
+    }
+
+    // MARK: - Boundary helpers
+
+    private func makeSpan(_ slice: ArraySlice<Character>, start: Int, end: Int) -> ChunkSpan {
+        ChunkSpan(content: String(slice), startOffset: start, endOffset: end)
+    }
+
+    private func trimmedCount(_ slice: ArraySlice<Character>) -> Int {
+        String(slice).trimmingCharacters(in: .whitespacesAndNewlines).count
+    }
+
+    /// Find the best split point in `chars[start..<maxEnd]` by walking the
+    /// separator hierarchy and taking the last occurrence that falls past the
+    /// first quarter of the window. Falls back to the last whitespace boundary
+    /// after `start`, and returns `start` when the window has none.
+    func findOptimalBoundary(_ chars: [Character], start: Int, maxEnd: Int) -> Int {
+        let rangeLen = maxEnd - start
+        guard rangeLen > 0 else { return maxEnd }
+        let quarter = rangeLen / 4
+
+        for separator in separators where !separator.isEmpty {
+            let sep = Array(separator)
+            guard let matchStart = lastRange(of: sep, in: chars, start: start, end: maxEnd) else { continue }
+            let boundary = matchStart + sep.count
+            if boundary > start + quarter { return boundary }
+        }
+        return findWordBoundaryBackward(chars, pos: maxEnd, lowerBound: start)
+    }
+
+    /// Largest `p` in `(lowerBound, pos]` whose preceding character is whitespace, else
+    /// `lowerBound` (clamped to 0). The bound keeps chunking linear on whitespace-free text.
+    func findWordBoundaryBackward(_ chars: [Character], pos: Int, lowerBound: Int = 0) -> Int {
+        let lowest = max(lowerBound, 0)
+        var p = min(pos, chars.count)
+        while p > lowest {
+            if chars[p - 1].isWhitespace { return p }
+            p -= 1
+        }
+        return lowest
+    }
+
+    /// Last start-index of `needle` within `chars[start..<end]`, or nil. A `"\n"`
+    /// in `needle` also matches the CRLF grapheme, which is a single `Character`.
+    private func lastRange(of needle: [Character], in chars: [Character], start: Int, end: Int) -> Int? {
+        guard !needle.isEmpty, end - start >= needle.count else { return nil }
+        for i in stride(from: end - needle.count, through: start, by: -1) {
+            let window = chars[i..<(i + needle.count)]
+            if window.elementsEqual(needle, by: { $0 == $1 || ($1 == "\n" && $0 == "\r\n") }) { return i }
+        }
+        return nil
+    }
+}
+
+/// High-level text-processing facade mirroring the Rust `TextProcessor`.
+public struct TextProcessor: Sendable {
+    public var chunkSize: Int  // target chunk length, in characters
+    public var chunkOverlap: Int  // characters shared by consecutive chunks
+    private let chunker: HierarchicalChunker
+    private let keywordExtractor: TfIdfKeywordExtractor
+
+    /// - Throws: `GraphRAGError.config` unless `chunkSize > 0` and `0 <= chunkOverlap < chunkSize`.
+    public init(chunkSize: Int = 1000, chunkOverlap: Int = 200) throws {
+        guard chunkSize > 0 else {
+            throw GraphRAGError.config(message: "chunk_size must be > 0")
+        }
+        // A negative overlap would start each chunk past the previous one's end, skipping text.
+        guard chunkOverlap >= 0 else {
+            throw GraphRAGError.config(message: "chunk_overlap must be >= 0")
+        }
+        guard chunkOverlap < chunkSize else {
+            throw GraphRAGError.config(message: "chunk_overlap must be < chunk_size")
+        }
+        self.chunkSize = chunkSize
+        self.chunkOverlap = chunkOverlap
+        // For shorter documents the 50-char minimum can drop everything; scale
+        // the floor down for small chunk sizes.
+        let minSize = min(50, max(1, chunkSize / 4))
+        self.chunker = HierarchicalChunker(minChunkSize: minSize)
+        self.keywordExtractor = TfIdfKeywordExtractor()
+    }
+
+    /// Hierarchically chunk a document into `TextChunk`s with offsets and metadata.
+    /// Chunk ids are `"<document id>_<index>"`; each chunk records its word count
+    /// and up to five keywords.
+    public func chunk(_ document: Document) -> [TextChunk] {
+        // The properties are mutable after `init`, so re-clamp the overlap here.
+        let overlap = max(0, chunkOverlap)
+        let spans = chunker.chunkSpans(document.content, chunkSize: chunkSize, overlap: overlap)
+        return spans.enumerated().map { index, span in
+            let metadata = ChunkMetadata(
+                index: index,
+                wordCount: wordCount(span.content),
+                keywords: extractKeywords(span.content, maxKeywords: 5)
+            )
+            return TextChunk(
+                id: ChunkID("\(document.id.raw)_\(index)"), documentID: document.id,
+                content: span.content, startOffset: span.startOffset, endOffset: span.endOffset,
+                metadata: metadata
+            )
+        }
+    }
+
+    /// Extract up to `maxKeywords` keywords from `text`.
+    public func extractKeywords(_ text: String, maxKeywords: Int) -> [String] {
+        keywordExtractor.extractKeywordStrings(text, topK: maxKeywords)
+    }
+
+    /// Naive sentence splitter on `.`, `!`, `?`, and newlines (including CRLF).
+    public func extractSentences(_ text: String) -> [String] {
+        var pieces: [String] = []
+        var current = ""
+        for ch in text {
+            current.append(ch)
+            // `isNewline` also covers "\r\n", which Swift treats as one `Character`.
+            guard ch == "." || ch == "!" || ch == "?" || ch.isNewline else { continue }
+            pieces.append(current)
+            current = ""
+        }
+        pieces.append(current)
+        return pieces.map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }
+            .filter { !$0.isEmpty }
+    }
+
+    /// Collapse runs of whitespace and trim.
+    public func cleanText(_ text: String) -> String {
+        let collapsed = text.split(whereSeparator: { $0.isWhitespace }).joined(separator: " ")
+        return collapsed.trimmingCharacters(in: .whitespacesAndNewlines)
+    }
+
+    /// Number of whitespace-separated words in `text`.
+    public func wordCount(_ text: String) -> Int {
+        text.split(whereSeparator: { $0.isWhitespace }).count
+    }
+}
diff --git a/Sources/GraphRAG/Text/KeywordExtraction.swift b/Sources/GraphRAG/Text/KeywordExtraction.swift
new file mode 100644
index 0000000..963261c
--- /dev/null
+++ b/Sources/GraphRAG/Text/KeywordExtraction.swift
@@ -0,0 +1,100 @@
+// KeywordExtraction.swift
+// Ported from graphrag-rs `text::keyword_extraction` (TfIdfKeywordExtractor).
+
+import Foundation
+
+/// TF-IDF keyword extractor.
+///
+/// Maintains corpus document frequencies so IDF can be computed across a growing
+/// collection. Terms absent from the corpus assume a document frequency of 1, and
+/// IDF is smoothed by `+ 1`, so with an empty corpus scoring is plain TF weighting.
+public struct TfIdfKeywordExtractor: Sendable {
+    /// Number of corpus documents containing each term.
+    public private(set) var documentFrequencies: [String: Int]
+    /// Corpus size `N` used for IDF; never below 1.
+    public private(set) var totalDocuments: Int
+    /// Lowercased tokens that are never reported as keywords.
+    public let stopwords: Set<String>
+
+    /// - Parameter totalDocuments: Corpus size; values below 1 are raised to 1.
+    public init(documentFrequencies: [String: Int] = [:], totalDocuments: Int = 1) {
+        self.documentFrequencies = documentFrequencies
+        self.totalDocuments = max(1, totalDocuments)
+        self.stopwords = TfIdfKeywordExtractor.defaultStopwords
+    }
+
+    /// Extract the top-`topK` `(term, score)` pairs, sorted by descending score
+    /// (ties broken by ascending term). Returns `[]` when no token survives
+    /// tokenization or `topK <= 0`.
+    public func extractKeywords(_ text: String, topK: Int) -> [(term: String, score: Float)] {
+        let tokens = tokenize(text)
+        guard !tokens.isEmpty, topK > 0 else { return [] }
+
+        // Term frequency (normalized by document length).
+        let counts = tokens.reduce(into: [String: Int]()) { $0[$1, default: 0] += 1 }
+        let totalTerms = Float(tokens.count)
+
+        var scored: [(term: String, score: Float)] = counts.map { term, count in
+            (term: term, score: Float(count) / totalTerms * inverseDocumentFrequency(term))
+        }
+        scored.sort { lhs, rhs in
+            if lhs.score == rhs.score { return lhs.term < rhs.term }
+            return lhs.score > rhs.score
+        }
+        return Array(scored.prefix(topK))
+    }
+
+    /// Extract just the top-`topK` keyword strings.
+    public func extractKeywordStrings(_ text: String, topK: Int) -> [String] {
+        extractKeywords(text, topK: topK).map(\.term)
+    }
+
+    /// Add a document's terms to the corpus statistics (for IDF).
+    public mutating func addDocumentToCorpus(_ text: String) {
+        let unique = Set(tokenize(text))
+        for term in unique { documentFrequencies[term, default: 0] += 1 }
+        totalDocuments += 1
+    }
+
+    /// Current corpus size and number of distinct terms seen.
+    public func corpusStats() -> (totalDocuments: Int, uniqueTerms: Int) {
+        (totalDocuments, documentFrequencies.count)
+    }
+
+    // MARK: - Internals
+
+    /// Smoothed IDF: `max(ln(N / df), 0) + 1`, where unseen terms assume `df = 1`.
+    /// The `+ 1` keeps the TF signal when `N == df` — notably for the default empty
+    /// corpus, where an unsmoothed IDF would be 0 for every term.
+    private func inverseDocumentFrequency(_ term: String) -> Float {
+        let df = max(documentFrequencies[term] ?? 1, 1)
+        return max(log(Float(totalDocuments) / Float(df)), 0.0) + 1.0
+    }
+
+    /// Lowercase, keep alphanumerics/`-`/`_`, drop short / numeric / stopword tokens.
+    /// "Short" means two characters or fewer after cleaning.
+    func tokenize(_ text: String) -> [String] {
+        text.split(whereSeparator: { $0.isWhitespace }).compactMap { rawWord -> String? in
+            let kept = rawWord.filter { $0.isLetter || $0.isNumber || $0 == "-" || $0 == "_" }
+            let cleaned = kept.map { $0.lowercased() }.joined()
+            let isKeyword = cleaned.count > 2 && !stopwords.contains(cleaned)
+                && !cleaned.allSatisfy({ $0.isNumber })
+            return isKeyword ? cleaned : nil
+        }
+    }
+
+    /// Default English stopword list installed by `init`.
+    public static let defaultStopwords: Set<String> = [
+        "the", "be", "to", "of", "and", "a", "in", "that", "have", "i", "it",
+        "for", "not", "on", "with", "he", "as", "you", "do", "at", "this", "but",
+        "his", "by", "from", "they", "we", "say", "her", "she", "or", "an", "will",
+        "my", "one", "all", "would", "there", "their", "what", "so", "up", "out",
+        "if", "about", "who", "get", "which", "go", "me", "when", "make", "can",
+        "like", "time", "no", "just", "him", "know", "take", "people", "into",
+        "year", "your", "good", "some", "could", "them", "see", "other", "than",
+        "then", "now", "look", "only", "come", "its", "over", "think", "also",
+        "back", "after", "use", "two", "how", "our", "work", "first", "well",
+        "way", "even", "new", "want", "because", "any", "these", "give", "day",
+        "most", "us", "is", "was", "are", "been", "has", "had", "were", "said", "did",
+    ]
+}
diff --git a/Tests/GraphRAGTests/GraphRAGTests.swift b/Tests/GraphRAGTests/GraphRAGTests.swift
index 5616821..75d3ac9 100644
--- a/Tests/GraphRAGTests/GraphRAGTests.swift
+++ b/Tests/GraphRAGTests/GraphRAGTests.swift
@@ -1,8 +1,222 @@
 import Testing
+
 @testable import GraphRAG
 
-@Test func example() async throws {
-    // Write your test here and use APIs like `#expect(...)` to check expected conditions.
-    // Swift Testing Documentation
-    // https://developer.apple.com/documentation/testing
+// MARK: - Text chunking
+
+@Test func chunkerProducesOverlappingChunks() throws {
+    let sentencePair =
+        "The quick brown fox jumps over the lazy dog. Knowledge graphs connect entities. "
+    let text = String(repeating: sentencePair, count: 20)
+    let spans = HierarchicalChunker(minChunkSize: 10)
+        .chunkSpans(text, chunkSize: 200, overlap: 50)
+
+    #expect(spans.count > 1)
+    // Offsets are ordered and within bounds.
+    for span in spans {
+        #expect(span.startOffset >= 0)
+        #expect(span.endOffset <= text.count)
+        #expect(span.startOffset < span.endOffset)
+    }
+    // Consecutive chunks overlap (next start is before previous end). Pairing via
+    // `zip` records a failure instead of trapping on `1..<0` when `spans` is empty.
+    for (previous, next) in zip(spans, spans.dropFirst()) {
+        #expect(next.startOffset < previous.endOffset)
+    }
+}
+
+@Test func textProcessorChunksShortDocument() throws {
+    let processor = try TextProcessor(chunkSize: 1000, chunkOverlap: 100)
+    let sentence = "Ada Lovelace collaborated with Charles Babbage on the Analytical Engine."
+    let chunks = processor.chunk(Document(id: "doc1", title: "T", content: sentence))
+    #expect(chunks.count == 1)
+    // `#require` stops the test with a recorded failure instead of trapping on `chunks[0]`.
+    let only = try #require(chunks.first)
+    #expect(only.id == ChunkID("doc1_0"))
+    #expect(only.metadata.wordCount > 0)
+}
+
+// MARK: - Keyword extraction
+
+@Test func tfidfExtractsContentKeywords() {
+    let sentence = "Knowledge graphs represent entities and relationships between entities."
+    let topTerms = TfIdfKeywordExtractor().extractKeywordStrings(sentence, topK: 3)
+    #expect(topTerms.isEmpty == false)
+    // "and" is in the default stopword list, so it must never be reported
+    // (note that "between" is not in that list).
+    #expect(topTerms.contains("and") == false)
+}
+
+// MARK: - BM25
+
+@Test func bm25RanksRelevantDocumentFirst() {
+    var retriever = BM25Retriever()
+    retriever.index(id: "a", content: "Graph databases store nodes and edges efficiently.")
+    retriever.index(id: "b", content: "Cooking recipes for delicious pasta dishes.")
+    retriever.index(id: "c", content: "Knowledge graphs use nodes edges and graph traversal.")
+    let hits = retriever.search("graph nodes edges", limit: 3)
+    let top = hits.first
+    #expect(hits.isEmpty == false)
+    // A graph-related doc should outrank the cooking doc.
+    #expect(top?.id == "a" || top?.id == "c")
+    #expect(hits.contains(where: { $0.id == "b" && $0.score > (top?.score ?? 0) }) == false)
+}
+
+// MARK: - Vector store & embeddings
+
+@Test func cosineSimilarityBasics() {
+    #expect(abs(1.0 - cosineSimilarity([1, 0], [1, 0])) < 1e-6)  // same direction → 1
+    #expect(cosineSimilarity([1, 0], [0, 1]).magnitude < 1e-6)  // orthogonal → 0
+}
+
+@Test func hashEmbedderIsDeterministicAndDimensioned() {
+    let (dimension, phrase) = (64, "knowledge graph retrieval")
+    let embedder = HashEmbedder(dimension: dimension)
+    let first = embedder.embedSync(phrase)
+    // Embedding the same text twice must give the same vector of the requested size.
+    #expect(first == embedder.embedSync(phrase))
+    #expect(first.count == dimension)
+}
+
+@Test func vectorStoreReturnsNearestNeighbor() {
+    let embedder = HashEmbedder(dimension: 128)
+    var store = InMemoryVectorStore()
+    let corpus = [("graphs", "graphs nodes edges entities"), ("cooking", "cooking pasta tomato recipe")]
+    for (id, text) in corpus { store.add(id: id, vector: embedder.embedSync(text)) }
+
+    // The query text shares words with the "graphs" entry only.
+    let nearest = store.search(embedder.embedSync("entities and nodes in graphs"), k: 2).first
+    #expect(nearest?.id == "graphs")
+}
+
+// MARK: - Knowledge graph
+
+@Test func knowledgeGraphStoresEntitiesAndNeighbors() {
+    let ada = Entity(id: "person_ada", name: "Ada Lovelace", entityType: "PERSON")
+    let babbage = Entity(id: "person_babbage", name: "Charles Babbage", entityType: "PERSON")
+    let colleagues = Relationship(source: ada.id, target: babbage.id, relationType: "COLLEAGUE_OF")
+    var graph = KnowledgeGraph()
+    for person in [ada, babbage] { graph.addEntity(person) }
+    graph.addRelationship(colleagues)
+
+    #expect(graph.entityCount == 2)
+    #expect(graph.relationshipCount == 1)
+    // The edge must be reachable from its source...
+    let fromAda = graph.neighbors(of: ada.id)
+    #expect(fromAda.contains(where: { $0.neighbor == babbage.id }))
+    // ...and from its target (bidirectional lookup).
+    #expect(graph.neighbors(of: babbage.id).contains(where: { $0.neighbor == ada.id }))
+}
+
+@Test func knowledgeGraphMergesDuplicateRelationships() {
+    var graph = KnowledgeGraph()
+    for id in ["a", "b"] { graph.addEntity(Entity(id: EntityID(id), name: id.uppercased(), entityType: "X")) }
+    // Same (source, target, type) inserted twice: expect a single edge with confidence 0.9.
+    graph.addRelationship(Relationship(source: "a", target: "b", relationType: "R", confidence: 0.5))
+    graph.addRelationship(Relationship(source: "a", target: "b", relationType: "R", confidence: 0.9))
+    #expect(graph.relationshipCount == 1)
+    #expect(graph.relationships.first?.confidence == 0.9)
+}
+
+// MARK: - Graph algorithms
+
+@Test func pageRankScoresSumToOneAndRankHub() {
+    let spokes = ["a", "b", "c"]
+    var graph = KnowledgeGraph()
+    for name in spokes + ["hub"] {
+        graph.addEntity(Entity(id: EntityID(name), name: name, entityType: "X"))
+    }
+    // Everyone points to the hub.
+    for spoke in spokes {
+        let edge = Relationship(source: EntityID(spoke), target: "hub", relationType: "R")
+        graph.addRelationship(edge)
+    }
+    let scores = PageRank().compute(graph)
+    #expect(abs(scores.values.reduce(0, +) - 1.0) < 1e-6)
+    let (hubScore, spokeScore) = (scores[EntityID("hub")] ?? 0, scores[EntityID("a")] ?? 0)
+    #expect(hubScore > spokeScore)
+}
+
+@Test func bfsTraversalRespectsDepth() {
+    let chain = ["a", "b", "c", "d"]
+    var graph = KnowledgeGraph()
+    for name in chain { graph.addEntity(Entity(id: EntityID(name), name: name, entityType: "X")) }
+    // Link consecutive names into the path a - b - c - d.
+    for (from, to) in zip(chain, chain.dropFirst()) {
+        graph.addRelationship(
+            Relationship(source: EntityID(from), target: EntityID(to), relationType: "R", confidence: 1))
+    }
+    let config = TraversalConfig(maxDepth: 2, minRelationshipStrength: 0.5)
+    let distances = GraphTraversal(config: config).bfs(graph, from: "a").distances
+    #expect(distances[EntityID("a")] == 0)
+    #expect(distances[EntityID("b")] == 1)
+    #expect(distances[EntityID("c")] == 2)
+    // 'd' is at depth 3, beyond maxDepth.
+    #expect(distances[EntityID("d")] == nil)
+}
+
+@Test func analyticsDegreeAndComponents() {
+    var graph = KnowledgeGraph()
+    let names = ["a", "b", "c"]
+    for name in names { graph.addEntity(Entity(id: EntityID(name), name: name, entityType: "X")) }
+    graph.addRelationship(Relationship(source: "a", target: "b", relationType: "R"))
+    let analytics = GraphAnalytics(graph)
+    // 'a' has 1 neighbor out of 2 other nodes -> 0.5.
+    let centralityOfA = analytics.degreeCentrality("a")
+    #expect(abs(centralityOfA - 0.5) < 1e-6)
+    // 'a' and 'b' are linked while 'c' is isolated -> 2 components.
+    #expect(analytics.connectedComponents().count == 2)
+}
+
+// MARK: - Pattern extraction
+
+@Test func patternExtractorFindsPeople() async throws {
+    let chunk = TextChunk(
+        id: "c0", documentID: "d0",
+        content: "Ada Lovelace worked with Charles Babbage in London.",
+        startOffset: 0, endOffset: 0)
+    let extraction = try await PatternEntityExtractor(minConfidence: 0.5).extract(from: chunk)
+    let foundNames = Set(extraction.0.map(\.name))
+    // Both full names in the sentence must appear among the extracted entities.
+    #expect(foundNames.contains("Ada Lovelace"))
+    #expect(foundNames.contains("Charles Babbage"))
+}
+
+// MARK: - End-to-end pipeline
+
+@Test func endToEndBuildAndAskWithoutLLM() async throws {
+    let computingText = """
+        Ada Lovelace was an English mathematician. She collaborated with Charles Babbage
+        on the Analytical Engine, an early mechanical general-purpose computer. Ada is
+        often regarded as the first computer programmer.
+        """
+    let cookingText =
+        "Pasta is cooked in boiling water with salt. Tomato sauce is a common topping."
+    let rag = try GraphRAGBuilder()
+        .withChunkSize(400)
+        .withChunkOverlap(50)
+        .withTopK(3)
+        .build()
+    await rag.addDocument(text: computingText)
+    await rag.addDocument(text: cookingText)
+    try await rag.build()
+
+    let stats = await rag.stats()
+    #expect(stats.documentCount == 2)
+    #expect(stats.chunkCount >= 2)
+    #expect(stats.entityCount > 0)
+
+    let answer = try await rag.ask("Who worked on the Analytical Engine?")
+    let answerText = answer.text.lowercased()
+    #expect(!answerText.isEmpty)
+    #expect(!answer.sources.isEmpty)
+    // The relevant (computing) chunk should be retrieved over the pasta chunk.
+    #expect(answerText.contains("babbage") || answerText.contains("ada"))
+}
+
+@Test func askBeforeBuildThrows() async throws {
+    let unbuilt = try GraphRAGBuilder().build()
+    await unbuilt.addDocument(text: "Some content about graphs and entities.")
+    // The instance's own `build()` step is never run here, so `ask` is expected
+    // to throw a `GraphRAGError`.
+    await #expect(throws: GraphRAGError.self) { _ = try await unbuilt.ask("anything") }
 }

From 71b04cbe7d78fe1ced7bd5813f581a880b7a9517 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 28 Jun 2026 01:49:28 +0000
Subject: [PATCH 02/18] Address Gemini review: BM25 tf, offset accuracy, async
 URLSession, neighbor dedup

- BM25: use raw term frequency; document-length normalization is handled only by
  the |D|/avgdl denominator factor (previously normalized twice).
- PatternExtractor: trim leading/trailing punctuation on the index range so
  recorded span offsets stay aligned with the source text.
- Ollama: replace the continuation-wrapped dataTask with async-native
  URLSession.data(for:), which supports task cancellation.
- LLMExtractor: derive mention endOffset from the matched range's upperBound
  (case folding can change grapheme counts).
- KnowledgeGraph.neighbors(of:): deduplicate by (neighbor, relationType) to match
  the documented contract.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01VUzx3BzstYuB68txcHvAUh
---
 Sources/GraphRAG/Embeddings/Ollama.swift      | 26 +++++++++---------
 Sources/GraphRAG/Entity/LLMExtractor.swift    |  5 +++-
 .../GraphRAG/Entity/PatternExtractor.swift    | 27 +++++++++++++++----
 Sources/GraphRAG/Graph/KnowledgeGraph.swift   | 13 +++++++--
 Sources/GraphRAG/Retrieval/BM25.swift         |  6 ++++-
 5 files changed, 54 insertions(+), 23 deletions(-)

diff --git a/Sources/GraphRAG/Embeddings/Ollama.swift b/Sources/GraphRAG/Embeddings/Ollama.swift
index 228f43b..e1147a2 100644
--- a/Sources/GraphRAG/Embeddings/Ollama.swift
+++ b/Sources/GraphRAG/Embeddings/Ollama.swift
@@ -86,21 +86,19 @@ enum OllamaHTTP {
     }
 
     private static func perform(_ request: URLRequest) async throws -> Data {
-        try await withCheckedThrowingContinuation { continuation in
-            let task = URLSession.shared.dataTask(with: request) { data, response, error in
-                if let error {
-                    continuation.resume(throwing: GraphRAGError.network(message: error.localizedDescription))
-                    return
-                }
-                if let http = response as? HTTPURLResponse, !(200..<300).contains(http.statusCode) {
-                    continuation.resume(
-                        throwing: GraphRAGError.http(message: "HTTP \(http.statusCode)"))
-                    return
-                }
-                continuation.resume(returning: data ?? Data())
-            }
-            task.resume()
+        // Async-native URLSession supports task cancellation, unlike the legacy
+        // callback API wrapped in a continuation.
+        let data: Data
+        let response: URLResponse
+        do {
+            (data, response) = try await URLSession.shared.data(for: request)
+        } catch {
+            throw GraphRAGError.network(message: error.localizedDescription)
+        }
+        if let http = response as? HTTPURLResponse, !(200..<300).contains(http.statusCode) {
+            throw GraphRAGError.http(message: "HTTP \(http.statusCode)")
         }
+        return data
     }
 }
 
diff --git a/Sources/GraphRAG/Entity/LLMExtractor.swift b/Sources/GraphRAG/Entity/LLMExtractor.swift
index 84f62ed..db77237 100644
--- a/Sources/GraphRAG/Entity/LLMExtractor.swift
+++ b/Sources/GraphRAG/Entity/LLMExtractor.swift
@@ -88,11 +88,14 @@ public struct LLMEntityExtractor<Model: LanguageModel>: EntityExtracting {
 
             var mentions: [EntityMention] = []
             if let range = lowerContent.range(of: name.lowercased()) {
+                // Derive both offsets from the matched range; case folding can
+                // change grapheme counts, so `start + name.count` is unreliable.
                 let start = lowerContent.distance(from: lowerContent.startIndex, to: range.lowerBound)
+                let end = lowerContent.distance(from: lowerContent.startIndex, to: range.upperBound)
                 mentions.append(
                     EntityMention(
                         chunkID: chunk.id, startOffset: start,
-                        endOffset: start + name.count, confidence: 0.9))
+                        endOffset: end, confidence: 0.9))
             }
 
             entities.append(
diff --git a/Sources/GraphRAG/Entity/PatternExtractor.swift b/Sources/GraphRAG/Entity/PatternExtractor.swift
index 91ce98a..437e670 100644
--- a/Sources/GraphRAG/Entity/PatternExtractor.swift
+++ b/Sources/GraphRAG/Entity/PatternExtractor.swift
@@ -115,11 +115,28 @@ public struct PatternEntityExtractor: EntityExtracting {
                     }
                     break
                 }
-                let raw = String(chars[runStart..<j])
-                let cleaned = raw.trimmingCharacters(
-                    in: CharacterSet(charactersIn: ".,;:!?\"'()"))
-                if cleaned.count >= 2 {
-                    spans.append(Span(text: cleaned, start: runStart, end: runStart + cleaned.count))
+                // Trim leading/trailing punctuation on the index range so the
+                // recorded offsets stay aligned with the original text (trimming
+                // the string alone would leave `runStart` pointing at a dropped
+                // leading character).
+                let trimSet = CharacterSet(charactersIn: ".,;:!?\"'()")
+                var spanStart = runStart
+                var spanEnd = j
+                while spanStart < spanEnd,
+                    let scalar = chars[spanStart].unicodeScalars.first,
+                    trimSet.contains(scalar)
+                {
+                    spanStart += 1
+                }
+                while spanEnd > spanStart,
+                    let scalar = chars[spanEnd - 1].unicodeScalars.first,
+                    trimSet.contains(scalar)
+                {
+                    spanEnd -= 1
+                }
+                if spanEnd - spanStart >= 2 {
+                    let cleaned = String(chars[spanStart..<spanEnd])
+                    spans.append(Span(text: cleaned, start: spanStart, end: spanEnd))
                 }
                 i = j
             } else {
diff --git a/Sources/GraphRAG/Graph/KnowledgeGraph.swift b/Sources/GraphRAG/Graph/KnowledgeGraph.swift
index 4f67279..f2699d1 100644
--- a/Sources/GraphRAG/Graph/KnowledgeGraph.swift
+++ b/Sources/GraphRAG/Graph/KnowledgeGraph.swift
@@ -115,11 +115,20 @@ public struct KnowledgeGraph: Sendable, Codable {
     /// the relationship. Deduplicated per (neighbor, relationType).
     public func neighbors(of id: EntityID) -> [(neighbor: EntityID, relationship: Relationship)] {
         var result: [(neighbor: EntityID, relationship: Relationship)] = []
+        var seen: Set<String> = []
         for idx in outgoing[id] ?? [] {
-            result.append((relationships[idx].target, relationships[idx]))
+            let target = relationships[idx].target
+            let key = "\(target.raw)|\(relationships[idx].relationType)"
+            if seen.insert(key).inserted {
+                result.append((target, relationships[idx]))
+            }
         }
         for idx in incoming[id] ?? [] {
-            result.append((relationships[idx].source, relationships[idx]))
+            let source = relationships[idx].source
+            let key = "\(source.raw)|\(relationships[idx].relationType)"
+            if seen.insert(key).inserted {
+                result.append((source, relationships[idx]))
+            }
         }
         return result
     }
diff --git a/Sources/GraphRAG/Retrieval/BM25.swift b/Sources/GraphRAG/Retrieval/BM25.swift
index 3cf8b60..4146cc7 100644
--- a/Sources/GraphRAG/Retrieval/BM25.swift
+++ b/Sources/GraphRAG/Retrieval/BM25.swift
@@ -103,7 +103,11 @@ public struct BM25Retriever: Sendable {
                 guard let rawCount = entry.termCounts[term], rawCount > 0 else { continue }
                 let df = Float(documentFrequency[term] ?? 1)
                 let idf = log(n / df) + 1.0
-                let tf = Float(rawCount) / Float(max(entry.length, 1))
+                // Standard BM25 uses the raw term count; document-length
+                // normalization is handled solely by the `|D| / avgdl` factor in
+                // the denominator (normalizing tf here as well would penalize
+                // length twice).
+                let tf = Float(rawCount)
                 let denom = tf + k1 * (1 - b + b * (Float(entry.length) / max(avgdl, 1)))
                 score += idf * (tf * (k1 + 1)) / max(denom, 0.0001)
             }

From 33d42c83032480731a98a99fa371e032053e388a Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 28 Jun 2026 01:50:22 +0000
Subject: [PATCH 03/18] Update BM25 docstring to match raw-tf scoring

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01VUzx3BzstYuB68txcHvAUh
---
 Sources/GraphRAG/Retrieval/BM25.swift | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Sources/GraphRAG/Retrieval/BM25.swift b/Sources/GraphRAG/Retrieval/BM25.swift
index 4146cc7..0c9d132 100644
--- a/Sources/GraphRAG/Retrieval/BM25.swift
+++ b/Sources/GraphRAG/Retrieval/BM25.swift
@@ -18,8 +18,8 @@ public struct BM25Result: Sendable, Equatable {
 
 /// Okapi BM25 keyword retriever over an in-memory document collection.
 ///
-/// Matches the Rust implementation: term frequency is normalized by document
-/// length, IDF is `log(N / df) + 1`, with `k1 = 1.2` and `b = 0.75`.
+/// Uses raw term frequency with `|D| / avgdl` length normalization, IDF is
+/// `log(N / df) + 1`, with `k1 = 1.2` and `b = 0.75`.
 public struct BM25Retriever: Sendable {
     public let k1: Float
     public let b: Float

From af6d547ec6f7bd00b28e8c739d31a7f25bf1f02b Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 28 Jun 2026 02:02:17 +0000
Subject: [PATCH 04/18] Address Codex review: 10 correctness/robustness fixes

- Hybrid: drop non-positive cosine hits before RRF so off-topic/empty queries
  take the no-results path instead of surfacing arbitrary chunks.
- Hybrid: reject non-positive search limits (prefix traps on negatives).
- Engine: remove a document's stale chunks when it is replaced by the same id.
- Engine: guard against reentrant/overlapping builds; mark unbuilt on entry so a
  failed rebuild can't leave stale state queryable; only mark built if no
  document was ingested during the build (snapshot chunk ids).
- Engine/Builder: honor config.embedding.backend == "ollama" for a manually
  constructed/deserialized Config, not only the withOllamaEmbeddings() flag.
- Traversal: stop DFS expansion at maxDepth so edges to unvisited nodes are not
  recorded (matches BFS and the documented depth limit).
- KeywordExtraction: smoothed IDF (log((N+1)/(df+1))+1) so a fresh extractor
  ranks by term frequency instead of collapsing every score to zero.
- Chunking: reject negative chunk overlap (would skip text between chunks).
- PatternExtractor: stop a Title-Case run at separating punctuation so
  "Alice, Bob" yields two entities rather than one merged node.

Adds regression tests for negative overlap, document replacement, DFS depth
bounds, and negative top-K.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01VUzx3BzstYuB68txcHvAUh
---
 .../GraphRAG/Entity/PatternExtractor.swift    |  7 +++
 Sources/GraphRAG/Graph/KnowledgeGraph.swift   |  9 ++++
 Sources/GraphRAG/Graph/Traversal.swift        |  3 ++
 Sources/GraphRAG/GraphRAG/Builder.swift       |  7 ++-
 Sources/GraphRAG/GraphRAG/Engine.swift        | 47 +++++++++++++++--
 Sources/GraphRAG/Retrieval/Hybrid.swift       |  9 +++-
 Sources/GraphRAG/Text/Chunking.swift          |  3 ++
 Sources/GraphRAG/Text/KeywordExtraction.swift |  5 +-
 Tests/GraphRAGTests/GraphRAGTests.swift       | 52 +++++++++++++++++++
 9 files changed, 134 insertions(+), 8 deletions(-)

diff --git a/Sources/GraphRAG/Entity/PatternExtractor.swift b/Sources/GraphRAG/Entity/PatternExtractor.swift
index 437e670..75e5c96 100644
--- a/Sources/GraphRAG/Entity/PatternExtractor.swift
+++ b/Sources/GraphRAG/Entity/PatternExtractor.swift
@@ -91,6 +91,13 @@ public struct PatternEntityExtractor: EntityExtracting {
                 while true {
                     // advance to end of current word
                     while j < n && !chars[j].isWhitespace { j += 1 }
+                    // A separating punctuation (comma/semicolon/colon) ends the
+                    // run so "Alice, Bob" stays two entities rather than merging.
+                    if j > runStart, let last = chars[j - 1].unicodeScalars.first,
+                        CharacterSet(charactersIn: ",;:").contains(last)
+                    {
+                        break
+                    }
                     // peek next word
                     var k = j
                     while k < n && chars[k] == " " { k += 1 }
diff --git a/Sources/GraphRAG/Graph/KnowledgeGraph.swift b/Sources/GraphRAG/Graph/KnowledgeGraph.swift
index f2699d1..ce5efc6 100644
--- a/Sources/GraphRAG/Graph/KnowledgeGraph.swift
+++ b/Sources/GraphRAG/Graph/KnowledgeGraph.swift
@@ -86,6 +86,15 @@ public struct KnowledgeGraph: Sendable, Codable {
         chunksByID[chunk.id] = chunk
     }
 
+    /// Remove all chunks belonging to a document (used when a document is
+    /// replaced so stale chunks don't survive).
+    public mutating func removeChunks(forDocument documentID: DocumentID) {
+        let removed = Set(chunkOrder.filter { chunksByID[$0]?.documentID == documentID })
+        guard !removed.isEmpty else { return }
+        chunkOrder.removeAll { removed.contains($0) }
+        for id in removed { chunksByID.removeValue(forKey: id) }
+    }
+
     /// Drop all entities and relationships, preserving documents and chunks.
     public mutating func clearEntitiesAndRelationships() {
         entitiesByID.removeAll()
diff --git a/Sources/GraphRAG/Graph/Traversal.swift b/Sources/GraphRAG/Graph/Traversal.swift
index 4853cee..20895ef 100644
--- a/Sources/GraphRAG/Graph/Traversal.swift
+++ b/Sources/GraphRAG/Graph/Traversal.swift
@@ -111,6 +111,9 @@ public struct GraphTraversal: Sendable {
         visited.insert(current)
         result.distances[current] = depth
         result.entities.append(current)
+        // Stop expanding at the depth limit so we never record an edge to a node
+        // that won't itself be visited (matches BFS and the documented limit).
+        guard depth < config.maxDepth else { return }
         for (neighbor, relationship) in graph.neighbors(of: current) {
             guard passesFilter(relationship) else { continue }
             if !visited.contains(neighbor) {
diff --git a/Sources/GraphRAG/GraphRAG/Builder.swift b/Sources/GraphRAG/GraphRAG/Builder.swift
index 87a64c8..c52d379 100644
--- a/Sources/GraphRAG/GraphRAG/Builder.swift
+++ b/Sources/GraphRAG/GraphRAG/Builder.swift
@@ -118,8 +118,13 @@ public struct GraphRAGBuilder: Sendable {
 
     /// Construct the configured `GraphRAG` engine.
     public func build() throws -> GraphRAG {
+        // Honor an Ollama backend requested either via `withOllamaEmbeddings()`
+        // or directly through `config.embedding.backend` (e.g. a deserialized or
+        // hand-built Config).
+        let wantsOllamaEmbeddings =
+            useOllamaEmbeddings || config.embedding.backend.lowercased() == "ollama"
         let embedder: any EmbeddingModel =
-            useOllamaEmbeddings
+            wantsOllamaEmbeddings
             ? OllamaEmbedder(config: ollamaConfig)
             : HashEmbedder(dimension: config.embedding.dimension)
 
diff --git a/Sources/GraphRAG/GraphRAG/Engine.swift b/Sources/GraphRAG/GraphRAG/Engine.swift
index 8b81ecb..4eeadba 100644
--- a/Sources/GraphRAG/GraphRAG/Engine.swift
+++ b/Sources/GraphRAG/GraphRAG/Engine.swift
@@ -17,6 +17,10 @@ public actor GraphRAG {
     private let textProcessor: TextProcessor
     private var retriever: HybridRetriever
     private var isBuilt: Bool = false
+    private var isBuilding: Bool = false
+    /// Bumped on every ingestion so a `build()` can detect documents added while
+    /// it was suspended at an `await` (actors are reentrant).
+    private var ingestionVersion: Int = 0
 
     /// Designated initializer.
     public init(
@@ -27,7 +31,7 @@ public actor GraphRAG {
     ) throws {
         self.config = config
         self.graph = KnowledgeGraph()
-        self.embedder = embedder ?? HashEmbedder(dimension: config.embedding.dimension)
+        self.embedder = embedder ?? GraphRAG.defaultEmbedder(for: config)
         self.languageModel = languageModel
         self.extractor = extractor ?? PatternEntityExtractor(minConfidence: config.entity.minConfidence)
         self.textProcessor = try TextProcessor(
@@ -36,6 +40,16 @@ public actor GraphRAG {
             config: HybridConfig(maxCandidates: max(100, config.topKResults * 10)))
     }
 
+    /// Pick the default embedder honoring `config.embedding.backend` when no
+    /// embedder was injected.
+    private static func defaultEmbedder(for config: Config) -> any EmbeddingModel {
+        if config.embedding.backend.lowercased() == "ollama" {
+            return OllamaEmbedder(
+                config: OllamaConfig(embeddingDimension: config.embedding.dimension))
+        }
+        return HashEmbedder(dimension: config.embedding.dimension)
+    }
+
     // MARK: - Ingestion
 
     /// Add raw text as a new document (auto-titled, UUID id) and chunk it.
@@ -48,15 +62,19 @@ public actor GraphRAG {
         return id
     }
 
-    /// Add a pre-built document, chunking it if it has no chunks yet.
+    /// Add a pre-built document, chunking it if it has no chunks yet. Replacing a
+    /// document with the same id drops the previous version's chunks first, so
+    /// stale text can't linger in the index.
     public func addDocument(_ document: Document) {
         var doc = document
         if doc.chunks.isEmpty {
             doc.chunks = textProcessor.chunk(doc)
         }
+        graph.removeChunks(forDocument: doc.id)
         graph.addDocument(doc)
         for chunk in doc.chunks { graph.addChunk(chunk) }
         isBuilt = false
+        ingestionVersion += 1
     }
 
     // MARK: - Build
@@ -65,10 +83,26 @@ public actor GraphRAG {
     /// chunks, and build the retrieval index.
     public func build() async throws {
         guard graph.documentCount > 0 else { throw GraphRAGError.noDocuments }
+        // Actors are reentrant at `await`, so refuse overlapping builds.
+        guard !isBuilding else {
+            throw GraphRAGError.validation(message: "A build is already in progress")
+        }
+        isBuilding = true
+        // Any failure below leaves the system unbuilt: ask() must require a fresh,
+        // successful build rather than querying half-rebuilt state.
+        isBuilt = false
+        defer { isBuilding = false }
+
+        let startVersion = ingestionVersion
         graph.clearEntitiesAndRelationships()
 
+        // Operate on a fixed snapshot of chunk ids so documents ingested mid-build
+        // (which bump ingestionVersion) don't get half-processed this round.
+        let chunkIDs = graph.chunks.map(\.id)
+
         // Stage 1: entity & relationship extraction per chunk.
-        for chunk in graph.chunks {
+        for id in chunkIDs {
+            guard let chunk = graph.chunk(id) else { continue }
             let (entities, relationships) = try await extractor.extract(from: chunk)
             for entity in entities { graph.addEntity(entity) }
             if config.entity.extractRelationships {
@@ -83,7 +117,8 @@ public actor GraphRAG {
         }
 
         // Stage 2: embed chunks.
-        for chunk in graph.chunks {
+        for id in chunkIDs {
+            guard let chunk = graph.chunk(id) else { continue }
             let embedding = try await embedder.embed(chunk.content)
             var updated = chunk
             updated.embedding = embedding
@@ -94,7 +129,9 @@ public actor GraphRAG {
         retriever.clear()
         retriever.index(graph: graph)
 
-        isBuilt = true
+        // Only declare success if no new documents arrived during the build;
+        // otherwise the index is already stale and a rebuild is required.
+        isBuilt = (ingestionVersion == startVersion)
     }
 
     // MARK: - Query
diff --git a/Sources/GraphRAG/Retrieval/Hybrid.swift b/Sources/GraphRAG/Retrieval/Hybrid.swift
index e42b4dc..226506e 100644
--- a/Sources/GraphRAG/Retrieval/Hybrid.swift
+++ b/Sources/GraphRAG/Retrieval/Hybrid.swift
@@ -102,10 +102,17 @@ public struct HybridRetriever: Sendable {
     ///   - queryEmbedding: Optional query vector (for semantic search).
     ///   - limit: Number of fused results to return.
     public func search(query: String, queryEmbedding: [Float]?, limit: Int) -> [HybridSearchResult] {
+        // A negative limit would trap in `prefix`; treat anything <= 0 as empty.
+        guard limit > 0 else { return [] }
         let keyword = bm25.search(query, limit: config.maxCandidates)
             .map { (id: $0.id, score: $0.score) }
+        // Drop non-positive cosine hits: the vector store always returns its
+        // nearest `maxCandidates`, so for an off-topic/empty query (no keyword
+        // matches and every similarity <= 0) RRF would otherwise surface
+        // arbitrary chunks instead of letting `ask()` take its no-results path.
         let semantic: [(id: String, score: Float)] =
-            queryEmbedding.map { vectors.search($0, k: config.maxCandidates) } ?? []
+            queryEmbedding.map { vectors.search($0, k: config.maxCandidates).filter { $0.score > 0 } }
+            ?? []
 
         let fused = fuse(semantic: semantic, keyword: keyword)
         // RRF scores are rank-based and inherently small (≈ 1/(k+rank)); the
diff --git a/Sources/GraphRAG/Text/Chunking.swift b/Sources/GraphRAG/Text/Chunking.swift
index 287ff75..5c7004d 100644
--- a/Sources/GraphRAG/Text/Chunking.swift
+++ b/Sources/GraphRAG/Text/Chunking.swift
@@ -165,6 +165,9 @@ public struct TextProcessor: Sendable {
         guard chunkSize > 0 else {
             throw GraphRAGError.config(message: "chunk_size must be > 0")
         }
+        guard chunkOverlap >= 0 else {
+            throw GraphRAGError.config(message: "chunk_overlap must be >= 0")
+        }
         guard chunkOverlap < chunkSize else {
             throw GraphRAGError.config(message: "chunk_overlap must be < chunk_size")
         }
diff --git a/Sources/GraphRAG/Text/KeywordExtraction.swift b/Sources/GraphRAG/Text/KeywordExtraction.swift
index 963261c..6139ede 100644
--- a/Sources/GraphRAG/Text/KeywordExtraction.swift
+++ b/Sources/GraphRAG/Text/KeywordExtraction.swift
@@ -64,7 +64,10 @@ public struct TfIdfKeywordExtractor: Sendable {
 
     private func inverseDocumentFrequency(_ term: String) -> Float {
         let df = documentFrequencies[term] ?? 1
-        let idf = log(Float(totalDocuments) / Float(df))
+        // Smoothed IDF: stays strictly positive even for an empty corpus
+        // (N = 1, df = 1 -> 1.0), so ranking falls back to term frequency rather
+        // than collapsing every score to zero.
+        let idf = log(Float(totalDocuments + 1) / Float(df + 1)) + 1.0
         return max(idf, 0.0)
     }
 
diff --git a/Tests/GraphRAGTests/GraphRAGTests.swift b/Tests/GraphRAGTests/GraphRAGTests.swift
index 75d3ac9..ddd73af 100644
--- a/Tests/GraphRAGTests/GraphRAGTests.swift
+++ b/Tests/GraphRAGTests/GraphRAGTests.swift
@@ -220,3 +220,55 @@ import Testing
         _ = try await rag.ask("anything")
     }
 }
+
+// MARK: - Review regressions
+
+@Test func negativeChunkOverlapRejected() {
+    #expect(throws: GraphRAGError.self) {
+        _ = try TextProcessor(chunkSize: 100, chunkOverlap: -10)
+    }
+}
+
+@Test func replacingDocumentRemovesStaleChunks() async throws {
+    let rag = try GraphRAGBuilder().withChunkSize(500).withChunkOverlap(50).build()
+    let id = DocumentID("fixed-id")
+    await rag.addDocument(Document(id: id, title: "v1", content: "First version about apples."))
+    await rag.addDocument(Document(id: id, title: "v2", content: "Second version about oranges."))
+    try await rag.build()
+    let stats = await rag.stats()
+    // Only the replacement's chunk(s) should remain, not both versions'.
+    #expect(stats.documentCount == 1)
+    #expect(stats.chunkCount == 1)
+    let answer = try await rag.ask("oranges")
+    #expect(!answer.text.lowercased().contains("apples"))
+}
+
+@Test func dfsDoesNotRecordEdgesBeyondMaxDepth() {
+    var graph = KnowledgeGraph()
+    for name in ["a", "b", "c", "d"] {
+        graph.addEntity(Entity(id: EntityID(name), name: name, entityType: "X"))
+    }
+    graph.addRelationship(Relationship(source: "a", target: "b", relationType: "R", confidence: 1))
+    graph.addRelationship(Relationship(source: "b", target: "c", relationType: "R", confidence: 1))
+    graph.addRelationship(Relationship(source: "c", target: "d", relationType: "R", confidence: 1))
+
+    let traversal = GraphTraversal(config: TraversalConfig(maxDepth: 2, minRelationshipStrength: 0.5))
+    let result = traversal.dfs(graph, from: "a")
+    let visited = Set(result.entities)
+    #expect(!visited.contains(EntityID("d")))
+    // Every recorded edge must connect two visited nodes.
+    for rel in result.relationships {
+        #expect(visited.contains(rel.source))
+        #expect(visited.contains(rel.target))
+    }
+}
+
+@Test func negativeTopKDoesNotCrashSearch() async throws {
+    let config = Config(topKResults: -5)
+    let rag = try GraphRAGBuilder().withConfig(config).build()
+    await rag.addDocument(text: "Graphs connect entities and relationships.")
+    try await rag.build()
+    let answer = try await rag.ask("graphs")
+    // Should degrade to the no-results answer rather than trapping.
+    #expect(answer.sources.isEmpty)
+}

From 7bd410f3e95e32b9bb6ccc215dfa116491f329ad Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 28 Jun 2026 02:05:49 +0000
Subject: [PATCH 05/18] Add GitHub Actions CI to build and test on every commit

Runs `swift build`/`swift test` in the official swift:6.3 container on every
push and pull request, matching the package's swift-tools-version. Adds a CI
status badge to the README.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01VUzx3BzstYuB68txcHvAUh
---
 .github/workflows/ci.yml | 30 ++++++++++++++++++++++++++++++
 README.md                |  2 ++
 2 files changed, 32 insertions(+)
 create mode 100644 .github/workflows/ci.yml

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..c403274
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,30 @@
+name: CI
+
+on:
+  push:
+    branches: ["**"]
+  pull_request:
+
+concurrency:
+  group: ci-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  linux:
+    name: Build & Test (Linux / Swift)
+    runs-on: ubuntu-latest
+    # Official Swift toolchain image. Must be >= the package's
+    # swift-tools-version (6.3); bump this tag when raising the manifest.
+    container: swift:6.3
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Swift version
+        run: swift --version
+
+      - name: Build
+        run: swift build --build-tests
+
+      - name: Test
+        run: swift test --skip-build
diff --git a/README.md b/README.md
index c9b3632..a073aa7 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,7 @@
 # GraphRAG (Swift)
 
+[![CI](https://github.com/PicoMLX/GraphRAG/actions/workflows/ci.yml/badge.svg)](https://github.com/PicoMLX/GraphRAG/actions/workflows/ci.yml)
+
 A Swift port of the Rust crate [`graphrag-rs`](https://github.com/automataIA/graphrag-rs):
 Graph-based Retrieval Augmented Generation. It builds a knowledge graph from
 documents and answers natural-language questions using graph-based context

From c845d99b90f2210f6d21ccf328383e40265cb92e Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 28 Jun 2026 02:07:32 +0000
Subject: [PATCH 06/18] CI: target stable Swift 6.1 toolchain

The swift:6.3 image does not exist on Docker Hub yet (6.3 is newly announced
and not in the stable registry), so the container failed to start. No released
toolchain can parse swift-tools-version 6.3 either. The port only uses standard
Swift 6 features (actors, Sendable, async/await, swift-testing), so this lowers
the manifest floor to 6.1 and pins the CI container to swift:6.1, letting builds
and tests actually run. Bump both back to 6.3 once a stable 6.3 toolchain/image
ships.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01VUzx3BzstYuB68txcHvAUh
---
 .github/workflows/ci.yml | 4 ++--
 Package.swift            | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c403274..8eb3281 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -14,8 +14,8 @@ jobs:
     name: Build & Test (Linux / Swift)
     runs-on: ubuntu-latest
     # Official Swift toolchain image. Must be >= the package's
-    # swift-tools-version (6.3); bump this tag when raising the manifest.
-    container: swift:6.3
+    # swift-tools-version (6.1); bump this tag when raising the manifest.
+    container: swift:6.1
     steps:
       - name: Checkout
         uses: actions/checkout@v4
diff --git a/Package.swift b/Package.swift
index 3bc01d3..6a05933 100644
--- a/Package.swift
+++ b/Package.swift
@@ -1,4 +1,4 @@
-// swift-tools-version: 6.3
+// swift-tools-version: 6.1
 // The swift-tools-version declares the minimum version of Swift required to build this package.
 
 import PackageDescription

From 159b1162acffaf85cbf0e1513f5e06f3c94ee87f Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 28 Jun 2026 02:25:28 +0000
Subject: [PATCH 07/18] Address Codex review (round 3): honor config knobs +
 graph edge cases

- Engine: honor `approach` (hybrid/keyword/semantic) by toggling the BM25 and
  vector stages in retrieval.
- Engine: thread `retrieval.similarityThreshold` into the semantic stage so weak
  matches can be suppressed as configured.
- Engine: enforce `maxEntitiesPerChunk` (keep highest-confidence) and drop
  relationships whose endpoints didn't survive the cap.
- Engine: always write a chunk's entity ids during build (empty clears stale ids
  left by a prior build when extraction now returns none).
- Config/Builder: remove redundant chunkSize/overlap from TextConfig so the
  top-level chunkSize/chunkOverlap is the single source of truth (a manually
  constructed Config can no longer chunk at the wrong size).
- Analytics.density(): count unique undirected pairs so reciprocal/multiple typed
  edges can't push density above 1.0.
- KnowledgeGraph.neighbors(of:): keep the highest-confidence edge per
  (neighbor, relationType) so a weak A->B can't hide a strong reciprocal B->A
  from strength-filtered traversals.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01VUzx3BzstYuB68txcHvAUh
---
 Sources/GraphRAG/Graph/Analytics.swift      |  8 +++-
 Sources/GraphRAG/Graph/KnowledgeGraph.swift | 34 ++++++++------
 Sources/GraphRAG/GraphRAG/Builder.swift     |  2 -
 Sources/GraphRAG/GraphRAG/Config.swift      |  6 +--
 Sources/GraphRAG/GraphRAG/Engine.swift      | 51 +++++++++++++++------
 Sources/GraphRAG/Retrieval/Hybrid.swift     | 31 +++++++++----
 6 files changed, 87 insertions(+), 45 deletions(-)

diff --git a/Sources/GraphRAG/Graph/Analytics.swift b/Sources/GraphRAG/Graph/Analytics.swift
index 4ad9b89..be68a67 100644
--- a/Sources/GraphRAG/Graph/Analytics.swift
+++ b/Sources/GraphRAG/Graph/Analytics.swift
@@ -164,11 +164,15 @@ public struct GraphAnalytics: Sendable {
 
     // MARK: - Global
 
-    /// Graph density: `2E / (n(n-1))`.
+    /// Graph density: `2E / (n(n-1))`, where `E` is the number of unique
+    /// undirected pairs. Counting unique pairs (rather than raw stored edges)
+    /// keeps density in `[0, 1]` even with reciprocal or multiple typed edges
+    /// between the same two entities.
     public func density() -> Float {
         let n = nodes.count
         guard n > 1 else { return 0 }
-        return Float(2 * graph.relationshipCount) / Float(n * (n - 1))
+        let uniqueEdges = adjacency.values.reduce(0) { $0 + $1.count } / 2
+        return Float(2 * uniqueEdges) / Float(n * (n - 1))
     }
 
     /// Local clustering coefficient: fraction of a node's neighbour pairs that
diff --git a/Sources/GraphRAG/Graph/KnowledgeGraph.swift b/Sources/GraphRAG/Graph/KnowledgeGraph.swift
index ce5efc6..34825b7 100644
--- a/Sources/GraphRAG/Graph/KnowledgeGraph.swift
+++ b/Sources/GraphRAG/Graph/KnowledgeGraph.swift
@@ -123,23 +123,29 @@ public struct KnowledgeGraph: Sendable, Codable {
     /// Bidirectional neighbors: for every incident edge, the other endpoint and
     /// the relationship. Deduplicated per (neighbor, relationType).
     public func neighbors(of id: EntityID) -> [(neighbor: EntityID, relationship: Relationship)] {
-        var result: [(neighbor: EntityID, relationship: Relationship)] = []
-        var seen: Set<String> = []
-        for idx in outgoing[id] ?? [] {
-            let target = relationships[idx].target
-            let key = "\(target.raw)|\(relationships[idx].relationType)"
-            if seen.insert(key).inserted {
-                result.append((target, relationships[idx]))
+        // Keep the highest-confidence edge per (neighbor, relationType) so a weak
+        // A->B can't hide a stronger reciprocal B->A from strength-filtered
+        // traversals.
+        var bestIndexByKey: [String: Int] = [:]
+        var order: [String] = []
+        func consider(_ index: Int, neighbor: EntityID) {
+            let key = "\(neighbor.raw)|\(relationships[index].relationType)"
+            if let existing = bestIndexByKey[key] {
+                if relationships[index].confidence > relationships[existing].confidence {
+                    bestIndexByKey[key] = index
+                }
+            } else {
+                bestIndexByKey[key] = index
+                order.append(key)
             }
         }
-        for idx in incoming[id] ?? [] {
-            let source = relationships[idx].source
-            let key = "\(source.raw)|\(relationships[idx].relationType)"
-            if seen.insert(key).inserted {
-                result.append((source, relationships[idx]))
-            }
+        for idx in outgoing[id] ?? [] { consider(idx, neighbor: relationships[idx].target) }
+        for idx in incoming[id] ?? [] { consider(idx, neighbor: relationships[idx].source) }
+        return order.map { key in
+            let rel = relationships[bestIndexByKey[key]!]
+            let neighbor = rel.source == id ? rel.target : rel.source
+            return (neighbor, rel)
         }
-        return result
     }
 
     /// All relationships where `id` is the source or target.
diff --git a/Sources/GraphRAG/GraphRAG/Builder.swift b/Sources/GraphRAG/GraphRAG/Builder.swift
index c52d379..08a9992 100644
--- a/Sources/GraphRAG/GraphRAG/Builder.swift
+++ b/Sources/GraphRAG/GraphRAG/Builder.swift
@@ -33,14 +33,12 @@ public struct GraphRAGBuilder: Sendable {
     public func withChunkSize(_ size: Int) -> Self {
         var copy = self
         copy.config.chunkSize = size
-        copy.config.text.chunkSize = size
         return copy
     }
 
     public func withChunkOverlap(_ overlap: Int) -> Self {
         var copy = self
         copy.config.chunkOverlap = overlap
-        copy.config.text.overlap = overlap
         return copy
     }
 
diff --git a/Sources/GraphRAG/GraphRAG/Config.swift b/Sources/GraphRAG/GraphRAG/Config.swift
index 7847110..f55e4c9 100644
--- a/Sources/GraphRAG/GraphRAG/Config.swift
+++ b/Sources/GraphRAG/GraphRAG/Config.swift
@@ -25,13 +25,9 @@ public struct GraphConfig: Sendable {
 }
 
 public struct TextConfig: Sendable {
-    public var chunkSize: Int
-    public var overlap: Int
     public var languages: [String]
 
-    public init(chunkSize: Int = 1000, overlap: Int = 200, languages: [String] = ["en"]) {
-        self.chunkSize = chunkSize
-        self.overlap = overlap
+    public init(languages: [String] = ["en"]) {
         self.languages = languages
     }
 }
diff --git a/Sources/GraphRAG/GraphRAG/Engine.swift b/Sources/GraphRAG/GraphRAG/Engine.swift
index 4eeadba..5651393 100644
--- a/Sources/GraphRAG/GraphRAG/Engine.swift
+++ b/Sources/GraphRAG/GraphRAG/Engine.swift
@@ -103,17 +103,30 @@ public actor GraphRAG {
         // Stage 1: entity & relationship extraction per chunk.
         for id in chunkIDs {
             guard let chunk = graph.chunk(id) else { continue }
-            let (entities, relationships) = try await extractor.extract(from: chunk)
+            var (entities, relationships) = try await extractor.extract(from: chunk)
+
+            // Honor the per-chunk entity cap, keeping the highest-confidence ones.
+            if config.maxEntitiesPerChunk > 0, entities.count > config.maxEntitiesPerChunk {
+                entities = Array(
+                    entities.sorted { $0.confidence > $1.confidence }
+                        .prefix(config.maxEntitiesPerChunk))
+            }
+
             for entity in entities { graph.addEntity(entity) }
             if config.entity.extractRelationships {
-                for relationship in relationships { graph.addRelationship(relationship) }
-            }
-            // Record which entities were found in this chunk.
-            if !entities.isEmpty {
-                var updated = chunk
-                updated.entities = entities.map(\.id)
-                graph.addChunk(updated)
+                // Keep only relationships whose endpoints exist in the graph
+                // (drops edges to entities removed by the per-chunk cap).
+                for relationship in relationships
+                where graph.contains(relationship.source) && graph.contains(relationship.target) {
+                    graph.addRelationship(relationship)
+                }
             }
+
+            // Always record the chunk's entity ids — writing an empty list clears
+            // stale ids from a prior build when extraction now yields nothing.
+            var updated = chunk
+            updated.entities = entities.map(\.id)
+            graph.addChunk(updated)
         }
 
         // Stage 2: embed chunks.
@@ -140,9 +153,7 @@ public actor GraphRAG {
     public func ask(_ query: String) async throws -> Answer {
         guard isBuilt else { throw GraphRAGError.notInitialized }
 
-        let queryEmbedding = try await embedder.embed(query)
-        let results = retriever.search(
-            query: query, queryEmbedding: queryEmbedding, limit: config.topKResults)
+        let results = try await runRetrieval(query, limit: config.topKResults)
 
         guard !results.isEmpty else {
             return Answer(
@@ -173,9 +184,23 @@ public actor GraphRAG {
     /// Hybrid search without answer synthesis.
     public func search(_ query: String, limit: Int? = nil) async throws -> [HybridSearchResult] {
         guard isBuilt else { throw GraphRAGError.notInitialized }
-        let queryEmbedding = try await embedder.embed(query)
+        return try await runRetrieval(query, limit: limit ?? config.topKResults)
+    }
+
+    /// Run retrieval honoring the configured `approach` (hybrid / keyword /
+    /// semantic) and `retrieval.similarityThreshold`.
+    private func runRetrieval(_ query: String, limit: Int) async throws -> [HybridSearchResult] {
+        let approach = config.approach.lowercased()
+        let includeKeyword = approach != "semantic"
+        let includeSemantic = approach != "keyword"
+        let queryEmbedding = includeSemantic ? try await embedder.embed(query) : nil
         return retriever.search(
-            query: query, queryEmbedding: queryEmbedding, limit: limit ?? config.topKResults)
+            query: query,
+            queryEmbedding: queryEmbedding,
+            limit: limit,
+            semanticThreshold: config.retrieval.similarityThreshold,
+            includeKeyword: includeKeyword,
+            includeSemantic: includeSemantic)
     }
 
     // MARK: - Introspection
diff --git a/Sources/GraphRAG/Retrieval/Hybrid.swift b/Sources/GraphRAG/Retrieval/Hybrid.swift
index 226506e..3635254 100644
--- a/Sources/GraphRAG/Retrieval/Hybrid.swift
+++ b/Sources/GraphRAG/Retrieval/Hybrid.swift
@@ -101,18 +101,31 @@ public struct HybridRetriever: Sendable {
     ///   - query: The raw query text (for BM25).
     ///   - queryEmbedding: Optional query vector (for semantic search).
     ///   - limit: Number of fused results to return.
-    public func search(query: String, queryEmbedding: [Float]?, limit: Int) -> [HybridSearchResult] {
+    ///   - semanticThreshold: Minimum cosine similarity for a semantic hit.
+    ///   - includeKeyword: Include BM25 results (false for a semantic-only approach).
+    ///   - includeSemantic: Include vector results (false for a keyword-only approach).
+    public func search(
+        query: String,
+        queryEmbedding: [Float]?,
+        limit: Int,
+        semanticThreshold: Float = 0,
+        includeKeyword: Bool = true,
+        includeSemantic: Bool = true
+    ) -> [HybridSearchResult] {
         // A negative limit would trap in `prefix`; treat anything <= 0 as empty.
         guard limit > 0 else { return [] }
-        let keyword = bm25.search(query, limit: config.maxCandidates)
-            .map { (id: $0.id, score: $0.score) }
-        // Drop non-positive cosine hits: the vector store always returns its
-        // nearest `maxCandidates`, so for an off-topic/empty query (no keyword
-        // matches and every similarity <= 0) RRF would otherwise surface
-        // arbitrary chunks instead of letting `ask()` take its no-results path.
+        let keyword: [(id: String, score: Float)] =
+            includeKeyword
+            ? bm25.search(query, limit: config.maxCandidates).map { (id: $0.id, score: $0.score) }
+            : []
+        // Drop non-positive cosine hits (off-topic protection: the vector store
+        // always returns its nearest `maxCandidates`) and anything below the
+        // caller's similarity threshold.
         let semantic: [(id: String, score: Float)] =
-            queryEmbedding.map { vectors.search($0, k: config.maxCandidates).filter { $0.score > 0 } }
-            ?? []
+            (includeSemantic ? queryEmbedding : nil).map {
+                vectors.search($0, k: config.maxCandidates)
+                    .filter { $0.score > 0 && $0.score >= semanticThreshold }
+            } ?? []
 
         let fused = fuse(semantic: semantic, keyword: keyword)
         // RRF scores are rank-based and inherently small (≈ 1/(k+rank)); the

From 5cf557671a04e37f1970a6367308410dff6c9faa Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 28 Jun 2026 02:46:02 +0000
Subject: [PATCH 08/18] Address Codex review (round 4): config unification,
 guards, extractor heuristics

Pattern extractor:
- Stop Title-Case runs at sentence punctuation (./!/?) with an abbreviation/title
  exception so "Acme. Bob" splits but "Dr. Smith"/"Acme Inc." stay merged.
- Recognize entities preceded by opening punctuation (quotes/parens).
- Orient asymmetric inferred relationships (WORKS_FOR/LEADS/BORN_IN/
  HEADQUARTERED_IN/LOCATED_IN) by entity role, not text order.

Config (single source of truth):
- Remove duplicated topK/similarityThreshold from RetrievalConfig; top-level
  Config.topKResults/similarityThreshold are authoritative (Engine threads the
  latter into the semantic stage).
- Builder: drive the embedding backend solely from config.embedding.backend (no
  sticky flag, so a later withConfig can switch back to hash), and sync the
  Ollama embedder dimension from the config.

Robustness:
- Engine.build re-fetches each chunk before writing so a document replaced during
  an extraction/embedding await isn't clobbered by the pre-await snapshot.
- PageRank clamps negative maxIterations and guards topEntities against negative k.
- Traversal findAllPaths treats negative depth as no expansion; egoNetwork
  de-duplicates emitted edges.
- HybridRetriever.index drops a stale vector when reindexing an id without an
  embedding.
- Prompts.fill scans once and never re-substitutes inserted values (keeps literal
  braces intact).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01VUzx3BzstYuB68txcHvAUh
---
 .../GraphRAG/Entity/PatternExtractor.swift    | 64 ++++++++++++++++---
 Sources/GraphRAG/Entity/Prompts.swift         | 23 ++++++-
 Sources/GraphRAG/Graph/PageRank.swift         |  3 +-
 Sources/GraphRAG/Graph/Traversal.swift        | 12 +++-
 Sources/GraphRAG/GraphRAG/Builder.swift       | 25 ++++----
 Sources/GraphRAG/GraphRAG/Engine.swift        | 30 +++++----
 Sources/GraphRAG/Retrieval/Hybrid.swift       | 18 ++++--
 7 files changed, 128 insertions(+), 47 deletions(-)

diff --git a/Sources/GraphRAG/Entity/PatternExtractor.swift b/Sources/GraphRAG/Entity/PatternExtractor.swift
index 75e5c96..bf8b63e 100644
--- a/Sources/GraphRAG/Entity/PatternExtractor.swift
+++ b/Sources/GraphRAG/Entity/PatternExtractor.swift
@@ -91,12 +91,20 @@ public struct PatternEntityExtractor: EntityExtracting {
                 while true {
                     // advance to end of current word
                     while j < n && !chars[j].isWhitespace { j += 1 }
-                    // A separating punctuation (comma/semicolon/colon) ends the
-                    // run so "Alice, Bob" stays two entities rather than merging.
-                    if j > runStart, let last = chars[j - 1].unicodeScalars.first,
-                        CharacterSet(charactersIn: ",;:").contains(last)
-                    {
-                        break
+                    // Clause punctuation (comma/semicolon/colon) ends the run so
+                    // "Alice, Bob" stays two entities. Sentence punctuation
+                    // (./!/?) also ends it ("Acme. Bob") — unless the word is a
+                    // known abbreviation/title like "Dr." so "Dr. Smith" merges.
+                    if j > runStart, let last = chars[j - 1].unicodeScalars.first {
+                        if CharacterSet(charactersIn: ",;:").contains(last) { break }
+                        if CharacterSet(charactersIn: ".!?").contains(last) {
+                            var ws = j - 1
+                            while ws > runStart && !chars[ws - 1].isWhitespace { ws -= 1 }
+                            let word = String(chars[ws..<j])
+                                .trimmingCharacters(in: CharacterSet(charactersIn: ".!?"))
+                                .lowercased()
+                            if !PatternEntityExtractor.abbreviations.contains(word) { break }
+                        }
                     }
                     // peek next word
                     var k = j
@@ -154,7 +162,11 @@ public struct PatternEntityExtractor: EntityExtracting {
     }
 
     private func isWordStart(_ chars: [Character], _ i: Int) -> Bool {
-        i == 0 || chars[i - 1].isWhitespace
+        if i == 0 { return true }
+        let prev = chars[i - 1]
+        // Start a run after whitespace or opening punctuation, so quoted or
+        // parenthesized names (e.g. "Ada Lovelace" or (Paris)) aren't skipped.
+        return prev.isWhitespace || "\"'([{".contains(prev)
     }
 
     // MARK: - Classification
@@ -214,18 +226,46 @@ public struct PatternEntityExtractor: EntityExtracting {
                 let a = entities[i]
                 let b = entities[j]
                 let relType = relationType(for: a.entityType, b.entityType, context: context)
-                let key = "\(a.id.raw)|\(b.id.raw)|\(relType)"
+                // Orient asymmetric relations by entity role, independent of the
+                // order the spans happened to appear in the text.
+                let (source, target) = orient(relType, a, b)
+                let key = "\(source.id.raw)|\(target.id.raw)|\(relType)"
                 if seen.contains(key) { continue }
                 seen.insert(key)
                 relationships.append(
                     Relationship(
-                        source: a.id, target: b.id, relationType: relType,
+                        source: source.id, target: target.id, relationType: relType,
                         confidence: 0.6, context: [chunk.id]))
             }
         }
         return relationships
     }
 
+    /// Order (source, target) for a typed relation by the entities' roles.
+    /// Symmetric relations keep their text order.
+    private func orient(_ relType: String, _ a: Entity, _ b: Entity) -> (Entity, Entity) {
+        func pick(_ source: String, _ target: String) -> (Entity, Entity)? {
+            if a.entityType == source && b.entityType == target { return (a, b) }
+            if b.entityType == source && a.entityType == target { return (b, a) }
+            return nil
+        }
+        switch relType {
+        case "WORKS_FOR", "LEADS":
+            return pick("PERSON", "ORGANIZATION") ?? (a, b)
+        case "BORN_IN":
+            return pick("PERSON", "LOCATION") ?? (a, b)
+        case "HEADQUARTERED_IN":
+            return pick("ORGANIZATION", "LOCATION") ?? (a, b)
+        case "LOCATED_IN":
+            // The location is the target; check b first so two locations keep text order.
+            if b.entityType == "LOCATION" { return (a, b) }
+            if a.entityType == "LOCATION" { return (b, a) }
+            return (a, b)
+        default:
+            return (a, b)  // ASSOCIATED_WITH / KNOWS / MARRIED_TO / RELATED_TO ...
+        }
+    }
+
     private func relationType(for a: String, _ b: String, context: String) -> String {
         func has(_ s: String) -> Bool { context.contains(s) }
         switch (a, b) {
@@ -261,6 +301,12 @@ public struct PatternEntityExtractor: EntityExtracting {
         "berlin", "washington", "boston", "chicago",
     ]
     static let personTitles: Set<String> = ["dr", "prof", "mr", "mrs", "ms"]
+    /// Words ending in `.` that should NOT end a Title-Case run (titles and
+    /// common abbreviations), so e.g. "Dr. Smith" / "Acme Inc." stay merged.
+    static let abbreviations: Set<String> = [
+        "dr", "prof", "mr", "mrs", "ms", "jr", "sr", "st", "vs", "etc",
+        "inc", "corp", "ltd", "co",
+    ]
     static let blocklist: Set<String> = [
         "the", "and", "but", "or", "chapter", "section", "however", "therefore",
         "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday",
diff --git a/Sources/GraphRAG/Entity/Prompts.swift b/Sources/GraphRAG/Entity/Prompts.swift
index fd627a2..1aa1fbb 100644
--- a/Sources/GraphRAG/Entity/Prompts.swift
+++ b/Sources/GraphRAG/Entity/Prompts.swift
@@ -121,10 +121,27 @@ public enum Prompts {
         """
 
     /// Fill `{key}` placeholders in `template` with `values`.
+    ///
+    /// Scans the template once: a `{key}` is replaced only when `key` is a known
+    /// value, and inserted values are never re-scanned. This keeps literal braces
+    /// in the template (e.g. JSON examples) intact and prevents a value that
+    /// itself contains `{query}`-style text from being substituted further.
     public static func fill(_ template: String, _ values: [String: String]) -> String {
-        var result = template
-        for (key, value) in values {
-            result = result.replacingOccurrences(of: "{\(key)}", with: value)
+        var result = ""
+        var i = template.startIndex
+        while i < template.endIndex {
+            if template[i] == "{",
+                let close = template[template.index(after: i)...].firstIndex(of: "}")
+            {
+                let key = String(template[template.index(after: i)..<close])
+                if let value = values[key] {
+                    result += value
+                    i = template.index(after: close)
+                    continue
+                }
+            }
+            result.append(template[i])
+            i = template.index(after: i)
         }
         return result
     }
diff --git a/Sources/GraphRAG/Graph/PageRank.swift b/Sources/GraphRAG/Graph/PageRank.swift
index f78622e..7265f81 100644
--- a/Sources/GraphRAG/Graph/PageRank.swift
+++ b/Sources/GraphRAG/Graph/PageRank.swift
@@ -42,7 +42,7 @@ public struct PageRank: Sendable {
         let teleport = (1.0 - d) / Double(n)
         var scores = [Double](repeating: 1.0 / Double(n), count: n)
 
-        for _ in 0..<maxIterations {
+        for _ in 0..<max(0, maxIterations) {
             // Dangling-node mass: nodes with no out-edges redistribute uniformly.
             var danglingMass = 0.0
             for i in 0..<n where outWeight[i] == 0 { danglingMass += scores[i] }
@@ -77,6 +77,7 @@ public struct PageRank: Sendable {
 
     /// Top-`k` entities by PageRank score, highest first.
     public func topEntities(_ graph: KnowledgeGraph, k: Int) -> [(id: EntityID, score: Double)] {
+        guard k > 0 else { return [] }
         let scores = compute(graph)
         return scores.sorted { lhs, rhs in
             if lhs.value == rhs.value { return lhs.key.raw < rhs.key.raw }
diff --git a/Sources/GraphRAG/Graph/Traversal.swift b/Sources/GraphRAG/Graph/Traversal.swift
index 20895ef..9535e83 100644
--- a/Sources/GraphRAG/Graph/Traversal.swift
+++ b/Sources/GraphRAG/Graph/Traversal.swift
@@ -132,6 +132,9 @@ public struct GraphTraversal: Sendable {
         result.distances[center] = 0
         result.entities.append(center)
         var currentLayer = [center]
+        // De-duplicate emitted edges: neighbors of adjacent layers can revisit
+        // the same edge, which would otherwise overcount evidence/degrees.
+        var emittedEdges: Set<String> = []
 
         var hop = 1
         while hop <= k && !currentLayer.isEmpty {
@@ -139,7 +142,11 @@ public struct GraphTraversal: Sendable {
             for entity in currentLayer {
                 for (neighbor, relationship) in graph.neighbors(of: entity) {
                     guard passesFilter(relationship) else { continue }
-                    result.relationships.append(relationship)
+                    let edgeKey =
+                        "\(relationship.source.raw)|\(relationship.target.raw)|\(relationship.relationType)"
+                    if emittedEdges.insert(edgeKey).inserted {
+                        result.relationships.append(relationship)
+                    }
                     if !visited.contains(neighbor) {
                         visited.insert(neighbor)
                         result.distances[neighbor] = hop
@@ -180,7 +187,8 @@ public struct GraphTraversal: Sendable {
             paths.append(path)
             return
         }
-        if remaining == 0 { return }
+        // `<= 0` (not `== 0`) so a negative configured maxDepth yields no expansion.
+        if remaining <= 0 { return }
         visited.insert(current)
         for (neighbor, relationship) in graph.neighbors(of: current) {
             guard passesFilter(relationship) else { continue }
diff --git a/Sources/GraphRAG/GraphRAG/Builder.swift b/Sources/GraphRAG/GraphRAG/Builder.swift
index 08a9992..532c3ee 100644
--- a/Sources/GraphRAG/GraphRAG/Builder.swift
+++ b/Sources/GraphRAG/GraphRAG/Builder.swift
@@ -15,7 +15,6 @@ public struct GraphRAGBuilder: Sendable {
     private var config: Config
     private var ollamaConfig: OllamaConfig
     private var useOllamaChat: Bool = false
-    private var useOllamaEmbeddings: Bool = false
 
     public init(config: Config = .default) {
         self.config = config
@@ -45,14 +44,12 @@ public struct GraphRAGBuilder: Sendable {
     public func withTopK(_ k: Int) -> Self {
         var copy = self
         copy.config.topKResults = k
-        copy.config.retrieval.topK = k
         return copy
     }
 
     public func withSimilarityThreshold(_ threshold: Float) -> Self {
         var copy = self
         copy.config.similarityThreshold = threshold
-        copy.config.retrieval.similarityThreshold = threshold
         return copy
     }
 
@@ -74,7 +71,6 @@ public struct GraphRAGBuilder: Sendable {
     public func withHashEmbeddings() -> Self {
         var copy = self
         copy.config.embedding.backend = "hash"
-        copy.useOllamaEmbeddings = false
         return copy
     }
 
@@ -97,7 +93,6 @@ public struct GraphRAGBuilder: Sendable {
         copy.ollamaConfig.embeddingDimension = dimension
         copy.config.embedding.backend = "ollama"
         copy.config.embedding.dimension = dimension
-        copy.useOllamaEmbeddings = true
         return copy
     }
 
@@ -116,15 +111,17 @@ public struct GraphRAGBuilder: Sendable {
 
     /// Construct the configured `GraphRAG` engine.
     public func build() throws -> GraphRAG {
-        // Honor an Ollama backend requested either via `withOllamaEmbeddings()`
-        // or directly through `config.embedding.backend` (e.g. a deserialized or
-        // hand-built Config).
-        let wantsOllamaEmbeddings =
-            useOllamaEmbeddings || config.embedding.backend.lowercased() == "ollama"
-        let embedder: any EmbeddingModel =
-            wantsOllamaEmbeddings
-            ? OllamaEmbedder(config: ollamaConfig)
-            : HashEmbedder(dimension: config.embedding.dimension)
+        // The embedding backend is driven solely by `config.embedding.backend`,
+        // so a later `withConfig(...)` can switch it back to hash (no sticky
+        // flag). Sync the Ollama embedder's dimension from the config.
+        let embedder: any EmbeddingModel
+        if config.embedding.backend.lowercased() == "ollama" {
+            var oc = ollamaConfig
+            oc.embeddingDimension = config.embedding.dimension
+            embedder = OllamaEmbedder(config: oc)
+        } else {
+            embedder = HashEmbedder(dimension: config.embedding.dimension)
+        }
 
         let languageModel: (any LanguageModel)? =
             useOllamaChat ? OllamaClient(config: ollamaConfig) : nil
diff --git a/Sources/GraphRAG/GraphRAG/Engine.swift b/Sources/GraphRAG/GraphRAG/Engine.swift
index 5651393..2bbcd79 100644
--- a/Sources/GraphRAG/GraphRAG/Engine.swift
+++ b/Sources/GraphRAG/GraphRAG/Engine.swift
@@ -103,7 +103,9 @@ public actor GraphRAG {
         // Stage 1: entity & relationship extraction per chunk.
         for id in chunkIDs {
             guard let chunk = graph.chunk(id) else { continue }
-            var (entities, relationships) = try await extractor.extract(from: chunk)
+            let extracted = try await extractor.extract(from: chunk)
+            var entities = extracted.entities
+            let relationships = extracted.relationships
 
             // Honor the per-chunk entity cap, keeping the highest-confidence ones.
             if config.maxEntitiesPerChunk > 0, entities.count > config.maxEntitiesPerChunk {
@@ -122,20 +124,26 @@ public actor GraphRAG {
                 }
             }
 
-            // Always record the chunk's entity ids — writing an empty list clears
-            // stale ids from a prior build when extraction now yields nothing.
-            var updated = chunk
-            updated.entities = entities.map(\.id)
-            graph.addChunk(updated)
+            // Re-fetch before writing: if the document was replaced during the
+            // extraction await, write entity ids onto the current chunk rather
+            // than clobbering new content with the pre-await snapshot. Always
+            // writing (even an empty list) clears stale ids from a prior build.
+            if var current = graph.chunk(id) {
+                current.entities = entities.map(\.id)
+                graph.addChunk(current)
+            }
         }
 
         // Stage 2: embed chunks.
         for id in chunkIDs {
             guard let chunk = graph.chunk(id) else { continue }
             let embedding = try await embedder.embed(chunk.content)
-            var updated = chunk
-            updated.embedding = embedding
-            graph.addChunk(updated)
+            // Re-fetch so a document replaced during the embedding await isn't
+            // overwritten by the stale snapshot.
+            if var current = graph.chunk(id) {
+                current.embedding = embedding
+                graph.addChunk(current)
+            }
         }
 
         // Stage 3: build the hybrid retrieval index.
@@ -188,7 +196,7 @@ public actor GraphRAG {
     }
 
     /// Run retrieval honoring the configured `approach` (hybrid / keyword /
-    /// semantic) and `retrieval.similarityThreshold`.
+    /// semantic) and the top-level `similarityThreshold`.
     private func runRetrieval(_ query: String, limit: Int) async throws -> [HybridSearchResult] {
         let approach = config.approach.lowercased()
         let includeKeyword = approach != "semantic"
@@ -198,7 +206,7 @@ public actor GraphRAG {
             query: query,
             queryEmbedding: queryEmbedding,
             limit: limit,
-            semanticThreshold: config.retrieval.similarityThreshold,
+            semanticThreshold: config.similarityThreshold,
             includeKeyword: includeKeyword,
             includeSemantic: includeSemantic)
     }
diff --git a/Sources/GraphRAG/Retrieval/Hybrid.swift b/Sources/GraphRAG/Retrieval/Hybrid.swift
index 3635254..9d07a4d 100644
--- a/Sources/GraphRAG/Retrieval/Hybrid.swift
+++ b/Sources/GraphRAG/Retrieval/Hybrid.swift
@@ -79,7 +79,13 @@ public struct HybridRetriever: Sendable {
     public mutating func index(id: String, content: String, embedding: [Float]?) {
         contents[id] = content
         bm25.index(id: id, content: content)
-        if let embedding { vectors.add(id: id, vector: embedding) }
+        if let embedding {
+            vectors.add(id: id, vector: embedding)
+        } else {
+            // Drop any vector from a previous version so semantic search can't
+            // return this id using a stale embedding.
+            vectors.remove(id: id)
+        }
     }
 
     /// Index all chunks of a knowledge graph.
@@ -201,24 +207,22 @@ public struct HybridRetriever: Sendable {
 }
 
 /// Retrieval-tuning knobs mirroring the Rust `RetrievalConfig`.
+///
+/// Top-k and similarity threshold live on the top-level `Config`
+/// (`topKResults` / `similarityThreshold`) to avoid two sources of truth; the
+/// fields here govern graph-expansion scoring.
 public struct RetrievalConfig: Sendable {
-    public var topK: Int
-    public var similarityThreshold: Float
     public var maxExpansionDepth: Int
     public var entityWeight: Float
     public var chunkWeight: Float
     public var graphWeight: Float
 
     public init(
-        topK: Int = 10,
-        similarityThreshold: Float = 0.7,
         maxExpansionDepth: Int = 2,
         entityWeight: Float = 0.4,
         chunkWeight: Float = 0.4,
         graphWeight: Float = 0.2
     ) {
-        self.topK = topK
-        self.similarityThreshold = similarityThreshold
         self.maxExpansionDepth = maxExpansionDepth
         self.entityWeight = entityWeight
         self.chunkWeight = chunkWeight

From 74524ba374e5bdd08c0f1ddf070a218c76c0e72a Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 28 Jun 2026 03:06:21 +0000
Subject: [PATCH 09/18] Address Codex review (round 5): cap scoping, sync,
 tokenization, validation

- PatternExtractor: only person titles cross a sentence period (org suffixes like
  "Acme Inc. Bob" now split); infer relationships only between entities sharing a
  sentence, using that sentence as keyword context (prevents one "works for"
  phrase linking every pair in a multi-fact chunk).
- KnowledgeGraph: addRelationship ignores dangling edges (both endpoints must be
  nodes); addChunk/removeChunks keep the document's embedded chunk copies in sync
  so document(id)?.chunks and saved JSON reflect enrichment.
- Engine: scope the per-chunk relationship filter to the entities that survived
  this chunk's maxEntitiesPerChunk cap (not global graph state).
- HybridRetriever.index(graph:) clears first so it's a safe full reindex.
- BM25 tokenizer splits on punctuation ("graph-based" -> "graph","based").
- Ollama baseURL normalizes bare hosts ("localhost" -> "http://localhost").
- LLMExtractor matches entity mentions on token boundaries ("Ann" no longer
  matches inside "Annabelle").

findAllPaths zero-hop was already handled by the `remaining <= 0` guard.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01VUzx3BzstYuB68txcHvAUh
---
 Sources/GraphRAG/Embeddings/Ollama.swift      |  7 ++-
 Sources/GraphRAG/Entity/LLMExtractor.swift    | 23 ++++++++-
 .../GraphRAG/Entity/PatternExtractor.swift    | 50 ++++++++++++++++---
 Sources/GraphRAG/Graph/KnowledgeGraph.swift   | 19 +++++++
 Sources/GraphRAG/GraphRAG/Engine.swift        | 11 ++--
 Sources/GraphRAG/Retrieval/BM25.swift         | 22 +++++---
 Sources/GraphRAG/Retrieval/Hybrid.swift       |  5 +-
 7 files changed, 114 insertions(+), 23 deletions(-)

diff --git a/Sources/GraphRAG/Embeddings/Ollama.swift b/Sources/GraphRAG/Embeddings/Ollama.swift
index e1147a2..9103255 100644
--- a/Sources/GraphRAG/Embeddings/Ollama.swift
+++ b/Sources/GraphRAG/Embeddings/Ollama.swift
@@ -47,7 +47,12 @@ public struct OllamaConfig: Sendable {
         self.numCtx = numCtx
     }
 
-    var baseURL: String { "\(host):\(port)" }
+    var baseURL: String {
+        // Accept bare hosts ("localhost", "127.0.0.1"): without a scheme, URL
+        // parses the host as the scheme and the request fails.
+        let normalizedHost = host.contains("://") ? host : "http://\(host)"
+        return "\(normalizedHost):\(port)"
+    }
 }
 
 /// Shared low-level HTTP helpers for the Ollama REST API.
diff --git a/Sources/GraphRAG/Entity/LLMExtractor.swift b/Sources/GraphRAG/Entity/LLMExtractor.swift
index db77237..72814d8 100644
--- a/Sources/GraphRAG/Entity/LLMExtractor.swift
+++ b/Sources/GraphRAG/Entity/LLMExtractor.swift
@@ -87,7 +87,9 @@ public struct LLMEntityExtractor<Model: LanguageModel>: EntityExtracting {
             let id = PatternEntityExtractor.makeEntityID(type: type, name: name)
 
             var mentions: [EntityMention] = []
-            if let range = lowerContent.range(of: name.lowercased()) {
+            if let range = LLMEntityExtractor.tokenBoundaryRange(
+                of: name.lowercased(), in: lowerContent)
+            {
                 // Derive both offsets from the matched range; case folding can
                 // change grapheme counts, so `start + name.count` is unreliable.
                 let start = lowerContent.distance(from: lowerContent.startIndex, to: range.lowerBound)
@@ -129,6 +131,25 @@ public struct LLMEntityExtractor<Model: LanguageModel>: EntityExtracting {
         return label.isEmpty ? "RELATED_TO" : label
     }
 
+    /// First occurrence of `needle` in `haystack` that sits on token boundaries
+    /// (not embedded inside a larger word), so "Ann" won't match "Annabelle".
+    static func tokenBoundaryRange(of needle: String, in haystack: String) -> Range<String.Index>? {
+        guard !needle.isEmpty else { return nil }
+        func isWordChar(_ c: Character) -> Bool { c.isLetter || c.isNumber }
+        var searchStart = haystack.startIndex
+        while let range = haystack.range(of: needle, range: searchStart..<haystack.endIndex) {
+            let beforeOK =
+                range.lowerBound == haystack.startIndex
+                || !isWordChar(haystack[haystack.index(before: range.lowerBound)])
+            let afterOK =
+                range.upperBound == haystack.endIndex || !isWordChar(haystack[range.upperBound])
+            if beforeOK && afterOK { return range }
+            // Step one character (not the whole match) so overlapping candidates are still tried.
+            searchStart = haystack.index(after: range.lowerBound)
+        }
+        return nil
+    }
+
     // MARK: - Parsing
 
     struct ExtractionOutput: Codable {
diff --git a/Sources/GraphRAG/Entity/PatternExtractor.swift b/Sources/GraphRAG/Entity/PatternExtractor.swift
index bf8b63e..378c963 100644
--- a/Sources/GraphRAG/Entity/PatternExtractor.swift
+++ b/Sources/GraphRAG/Entity/PatternExtractor.swift
@@ -103,7 +103,10 @@ public struct PatternEntityExtractor: EntityExtracting {
                             let word = String(chars[ws..<j])
                                 .trimmingCharacters(in: CharacterSet(charactersIn: ".!?"))
                                 .lowercased()
-                            if !PatternEntityExtractor.abbreviations.contains(word) { break }
+                            // Only person titles (which grammatically precede a
+                            // name) cross the period — "Dr. Smith" merges, but an
+                            // org suffix like "Acme Inc. Bob" must split.
+                            if !PatternEntityExtractor.personTitles.contains(word) { break }
                         }
                     }
                     // peek next word
@@ -217,7 +220,21 @@ public struct PatternEntityExtractor: EntityExtracting {
 
     private func inferRelationships(entities: [Entity], chunk: TextChunk) -> [Relationship] {
         guard entities.count >= 2 else { return [] }
-        let context = chunk.content.lowercased()
+        let chars = Array(chunk.content)
+
+        // Assign a sentence id to every character offset (incremented after
+        // ./!/?), so relationships are only inferred between entities that
+        // co-occur in the SAME sentence — otherwise one "works for" phrase would
+        // wrongly link every person/org pair sharing a chunk.
+        var sentenceID = [Int](repeating: 0, count: chars.count + 1)
+        var sid = 0
+        for k in 0..<chars.count {
+            sentenceID[k] = sid
+            if chars[k] == "." || chars[k] == "!" || chars[k] == "?" { sid += 1 }
+        }
+        sentenceID[chars.count] = sid
+        func sentence(of offset: Int) -> Int { sentenceID[max(0, min(offset, chars.count))] }
+
         var relationships: [Relationship] = []
         var seen: Set<String> = []
 
@@ -225,6 +242,17 @@ public struct PatternEntityExtractor: EntityExtracting {
             for j in (i + 1)..<entities.count {
                 let a = entities[i]
                 let b = entities[j]
+                // Find a mention pair that shares a sentence; skip the pair if none.
+                guard let (aOff, bOff) = sameSentenceMentions(a, b, sentence: sentence) else {
+                    continue
+                }
+                // Localize the keyword context to that sentence.
+                var lo = min(aOff, bOff)
+                while lo > 0 && sentence(of: lo - 1) == sentence(of: aOff) { lo -= 1 }
+                var hi = max(aOff, bOff)
+                while hi < chars.count && sentence(of: hi) == sentence(of: aOff) { hi += 1 }
+                let context = (lo < hi ? String(chars[lo..<hi]) : "").lowercased()
+
                 let relType = relationType(for: a.entityType, b.entityType, context: context)
                 // Orient asymmetric relations by entity role, independent of the
                 // order the spans happened to appear in the text.
@@ -241,6 +269,18 @@ public struct PatternEntityExtractor: EntityExtracting {
         return relationships
     }
 
+    /// First mention pair of `a` and `b` that falls in the same sentence.
+    private func sameSentenceMentions(
+        _ a: Entity, _ b: Entity, sentence: (Int) -> Int
+    ) -> (Int, Int)? {
+        for ma in a.mentions {
+            for mb in b.mentions where sentence(ma.startOffset) == sentence(mb.startOffset) {
+                return (ma.startOffset, mb.startOffset)
+            }
+        }
+        return nil
+    }
+
     /// Order (source, target) for a typed relation by the entities' roles.
     /// Symmetric relations keep their text order.
     private func orient(_ relType: String, _ a: Entity, _ b: Entity) -> (Entity, Entity) {
@@ -301,12 +341,6 @@ public struct PatternEntityExtractor: EntityExtracting {
         "berlin", "washington", "boston", "chicago",
     ]
     static let personTitles: Set<String> = ["dr", "prof", "mr", "mrs", "ms"]
-    /// Words ending in `.` that should NOT end a Title-Case run (titles and
-    /// common abbreviations), so e.g. "Dr. Smith" / "Acme Inc." stay merged.
-    static let abbreviations: Set<String> = [
-        "dr", "prof", "mr", "mrs", "ms", "jr", "sr", "st", "vs", "etc",
-        "inc", "corp", "ltd", "co",
-    ]
     static let blocklist: Set<String> = [
         "the", "and", "but", "or", "chapter", "section", "however", "therefore",
         "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday",
diff --git a/Sources/GraphRAG/Graph/KnowledgeGraph.swift b/Sources/GraphRAG/Graph/KnowledgeGraph.swift
index 34825b7..823549b 100644
--- a/Sources/GraphRAG/Graph/KnowledgeGraph.swift
+++ b/Sources/GraphRAG/Graph/KnowledgeGraph.swift
@@ -56,6 +56,11 @@ public struct KnowledgeGraph: Sendable, Codable {
     /// Insert a directed relationship. Duplicate (source, target, type) edges are
     /// merged: their evidence context is unioned and the max confidence kept.
     public mutating func addRelationship(_ relationship: Relationship) {
+        // Ignore dangling edges: both endpoints must be nodes, otherwise
+        // `neighbors(of:)`/traversals could surface an EntityID with no node.
+        guard entitiesByID[relationship.source] != nil,
+            entitiesByID[relationship.target] != nil
+        else { return }
         // Merge duplicates.
         if let existingIndices = outgoing[relationship.source] {
             for idx in existingIndices
@@ -84,6 +89,16 @@ public struct KnowledgeGraph: Sendable, Codable {
     public mutating func addChunk(_ chunk: TextChunk) {
         if chunksByID[chunk.id] == nil { chunkOrder.append(chunk.id) }
         chunksByID[chunk.id] = chunk
+        // Keep the copy embedded in its document in sync, so
+        // `document(id)?.chunks` and saved JSON reflect enrichment too.
+        if var doc = documentsByID[chunk.documentID] {
+            if let idx = doc.chunks.firstIndex(where: { $0.id == chunk.id }) {
+                doc.chunks[idx] = chunk
+            } else {
+                doc.chunks.append(chunk)
+            }
+            documentsByID[chunk.documentID] = doc
+        }
     }
 
     /// Remove all chunks belonging to a document (used when a document is
@@ -93,6 +108,10 @@ public struct KnowledgeGraph: Sendable, Codable {
         guard !removed.isEmpty else { return }
         chunkOrder.removeAll { removed.contains($0) }
         for id in removed { chunksByID.removeValue(forKey: id) }
+        if var doc = documentsByID[documentID] {
+            doc.chunks.removeAll { removed.contains($0.id) }
+            documentsByID[documentID] = doc
+        }
     }
 
     /// Drop all entities and relationships, preserving documents and chunks.
diff --git a/Sources/GraphRAG/GraphRAG/Engine.swift b/Sources/GraphRAG/GraphRAG/Engine.swift
index 2bbcd79..e3d5fbc 100644
--- a/Sources/GraphRAG/GraphRAG/Engine.swift
+++ b/Sources/GraphRAG/GraphRAG/Engine.swift
@@ -114,12 +114,14 @@ public actor GraphRAG {
                         .prefix(config.maxEntitiesPerChunk))
             }
 
+            let keptIDs = Set(entities.map(\.id))
             for entity in entities { graph.addEntity(entity) }
             if config.entity.extractRelationships {
-                // Keep only relationships whose endpoints exist in the graph
-                // (drops edges to entities removed by the per-chunk cap).
+                // Scope to entities that survived THIS chunk's cap — not global
+                // graph state — so the cap can't be defeated by an id that
+                // already exists from an earlier chunk.
                 for relationship in relationships
-                where graph.contains(relationship.source) && graph.contains(relationship.target) {
+                where keptIDs.contains(relationship.source) && keptIDs.contains(relationship.target) {
                     graph.addRelationship(relationship)
                 }
             }
@@ -146,8 +148,7 @@ public actor GraphRAG {
             }
         }
 
-        // Stage 3: build the hybrid retrieval index.
-        retriever.clear()
+        // Stage 3: build the hybrid retrieval index (index(graph:) clears first).
         retriever.index(graph: graph)
 
         // Only declare success if no new documents arrived during the build;
diff --git a/Sources/GraphRAG/Retrieval/BM25.swift b/Sources/GraphRAG/Retrieval/BM25.swift
index 0c9d132..d20ea38 100644
--- a/Sources/GraphRAG/Retrieval/BM25.swift
+++ b/Sources/GraphRAG/Retrieval/BM25.swift
@@ -127,15 +127,23 @@ public struct BM25Retriever: Sendable {
 
     static func tokenize(_ text: String) -> [String] {
         var tokens: [String] = []
-        for rawWord in text.split(whereSeparator: { $0.isWhitespace }) {
-            var cleaned = ""
-            for ch in rawWord where ch.isLetter || ch.isNumber {
-                cleaned.append(contentsOf: ch.lowercased())
+        var current = ""
+        func flush() {
+            if current.count > 2, !TfIdfKeywordExtractor.defaultStopwords.contains(current) {
+                tokens.append(current)
             }
-            if cleaned.count <= 2 { continue }
-            if TfIdfKeywordExtractor.defaultStopwords.contains(cleaned) { continue }
-            tokens.append(cleaned)
+            current = ""
         }
+        // Split on any non-alphanumeric so punctuation separates terms
+        // ("graph-based" -> "graph", "based") instead of concatenating them.
+        for ch in text {
+            if ch.isLetter || ch.isNumber {
+                current.append(contentsOf: ch.lowercased())
+            } else {
+                flush()
+            }
+        }
+        flush()
         return tokens
     }
 }
diff --git a/Sources/GraphRAG/Retrieval/Hybrid.swift b/Sources/GraphRAG/Retrieval/Hybrid.swift
index 9d07a4d..6497ff4 100644
--- a/Sources/GraphRAG/Retrieval/Hybrid.swift
+++ b/Sources/GraphRAG/Retrieval/Hybrid.swift
@@ -88,8 +88,11 @@ public struct HybridRetriever: Sendable {
         }
     }
 
-    /// Index all chunks of a knowledge graph.
+    /// Index all chunks of a knowledge graph as a full (re)index. Clears any
+    /// previously indexed content first, so ids removed since the last index
+    /// can't linger in `contents`, BM25, or the vector store.
     public mutating func index(graph: KnowledgeGraph) {
+        clear()
         for chunk in graph.chunks {
             index(id: chunk.id.raw, content: chunk.content, embedding: chunk.embedding)
         }

From 3bfd74fa267037f7f6eb5766dd0a3c6d111770aa Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 28 Jun 2026 03:24:48 +0000
Subject: [PATCH 10/18] Address Codex review (round 6): tokenizer, corpus
 count, guards, relation scoping

- HashEmbedder tokenizer splits on punctuation (matches BM25) so "graph-based"
  and the query "graph based" share tokens in semantic-only mode.
- TfIdfKeywordExtractor starts totalDocuments at 0 (no phantom document); the
  smoothed IDF still handles an empty corpus.
- PageRank clamps dampingFactor to [0, 1] so a bad value can't yield negative
  scores.
- KnowledgeGraph.stats() reports the real maxDepth (undirected graph diameter)
  instead of a hardcoded 0.
- PatternExtractor: an abbreviation period ending a person title ("Dr.") no
  longer splits the sentence, so "Dr. Smith works for Acme Inc." keeps its
  WORKS_FOR edge; relationship keyword context is now the span between the two
  mentions, and pairs with an intervening same-type entity are skipped, removing
  false edges in multi-fact sentences.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01VUzx3BzstYuB68txcHvAUh
---
 .../GraphRAG/Embeddings/HashEmbedder.swift    | 15 ++++---
 .../GraphRAG/Entity/PatternExtractor.swift    | 45 +++++++++++++++----
 Sources/GraphRAG/Graph/KnowledgeGraph.swift   | 30 ++++++++++++-
 Sources/GraphRAG/Graph/PageRank.swift         |  4 +-
 Sources/GraphRAG/Text/KeywordExtraction.swift |  6 ++-
 5 files changed, 83 insertions(+), 17 deletions(-)

diff --git a/Sources/GraphRAG/Embeddings/HashEmbedder.swift b/Sources/GraphRAG/Embeddings/HashEmbedder.swift
index 579e8ab..ca7c9fc 100644
--- a/Sources/GraphRAG/Embeddings/HashEmbedder.swift
+++ b/Sources/GraphRAG/Embeddings/HashEmbedder.swift
@@ -46,14 +46,19 @@ public struct HashEmbedder: EmbeddingModel {
     }
 
     private func tokenize(_ text: String) -> [String] {
+        // Split on any non-alphanumeric so "graph-based" hashes as the same two
+        // tokens as the query "graph based" (preserves semantic overlap).
         var tokens: [String] = []
-        for rawWord in text.split(whereSeparator: { $0.isWhitespace }) {
-            var cleaned = ""
-            for ch in rawWord where ch.isLetter || ch.isNumber {
-                cleaned.append(contentsOf: ch.lowercased())
+        var current = ""
+        for ch in text {
+            if ch.isLetter || ch.isNumber {
+                current.append(contentsOf: ch.lowercased())
+            } else if !current.isEmpty {
+                tokens.append(current)
+                current = ""
             }
-            if !cleaned.isEmpty { tokens.append(cleaned) }
         }
+        if !current.isEmpty { tokens.append(current) }
         return tokens
     }
 
diff --git a/Sources/GraphRAG/Entity/PatternExtractor.swift b/Sources/GraphRAG/Entity/PatternExtractor.swift
index 378c963..78d8fdc 100644
--- a/Sources/GraphRAG/Entity/PatternExtractor.swift
+++ b/Sources/GraphRAG/Entity/PatternExtractor.swift
@@ -225,12 +225,21 @@ public struct PatternEntityExtractor: EntityExtracting {
         // Assign a sentence id to every character offset (incremented after
         // ./!/?), so relationships are only inferred between entities that
         // co-occur in the SAME sentence — otherwise one "works for" phrase would
-        // wrongly link every person/org pair sharing a chunk.
+        // wrongly link every person/org pair sharing a chunk. A period that ends
+        // a person title ("Dr.") is an abbreviation, not a sentence boundary, so
+        // "Dr. Smith works for Acme Inc." stays one sentence.
+        func periodEndsTitle(_ periodIndex: Int) -> Bool {
+            var s = periodIndex
+            while s > 0 && chars[s - 1].isLetter { s -= 1 }
+            return PatternEntityExtractor.personTitles.contains(
+                String(chars[s..<periodIndex]).lowercased())
+        }
         var sentenceID = [Int](repeating: 0, count: chars.count + 1)
         var sid = 0
         for k in 0..<chars.count {
             sentenceID[k] = sid
-            if chars[k] == "." || chars[k] == "!" || chars[k] == "?" { sid += 1 }
+            let c = chars[k]
+            if c == "!" || c == "?" || (c == "." && !periodEndsTitle(k)) { sid += 1 }
         }
         sentenceID[chars.count] = sid
         func sentence(of offset: Int) -> Int { sentenceID[max(0, min(offset, chars.count))] }
@@ -246,12 +255,18 @@ public struct PatternEntityExtractor: EntityExtracting {
                 guard let (aOff, bOff) = sameSentenceMentions(a, b, sentence: sentence) else {
                     continue
                 }
-                // Localize the keyword context to that sentence.
-                var lo = min(aOff, bOff)
-                while lo > 0 && sentence(of: lo - 1) == sentence(of: aOff) { lo -= 1 }
-                var hi = max(aOff, bOff)
-                while hi < chars.count && sentence(of: hi) == sentence(of: aOff) { hi += 1 }
-                let context = (lo < hi ? String(chars[lo..<hi]) : "").lowercased()
+                let lo = min(aOff, bOff)
+                let hi = max(aOff, bOff)
+                // Proximity heuristic: skip the pair if another entity of the same
+                // type as an endpoint lies between them — the connecting phrase
+                // most likely belongs to that nearer pair. (Offline extractor; not
+                // a full relation classifier, so this trades some recall for far
+                // fewer false edges in multi-fact sentences.)
+                if hasInterveningSameType(a, b, lo: lo, hi: hi, among: entities) { continue }
+                // Keyword context is just the span between the two mentions, so a
+                // phrase belonging to a different pair in the sentence can't leak.
+                let upper = min(hi + 1, chars.count)
+                let context = (lo < upper ? String(chars[lo..<upper]) : "").lowercased()
 
                 let relType = relationType(for: a.entityType, b.entityType, context: context)
                 // Orient asymmetric relations by entity role, independent of the
@@ -269,6 +284,20 @@ public struct PatternEntityExtractor: EntityExtracting {
         return relationships
     }
 
+    /// Whether an entity (other than `a`/`b`) of the same type as one endpoint
+    /// has a mention strictly between offsets `lo` and `hi`.
+    private func hasInterveningSameType(
+        _ a: Entity, _ b: Entity, lo: Int, hi: Int, among entities: [Entity]
+    ) -> Bool {
+        for c in entities where c.id != a.id && c.id != b.id {
+            guard c.entityType == a.entityType || c.entityType == b.entityType else { continue }
+            for m in c.mentions where m.startOffset > lo && m.startOffset < hi {
+                return true
+            }
+        }
+        return false
+    }
+
     /// First mention pair of `a` and `b` that falls in the same sentence.
     private func sameSentenceMentions(
         _ a: Entity, _ b: Entity, sentence: (Int) -> Int
diff --git a/Sources/GraphRAG/Graph/KnowledgeGraph.swift b/Sources/GraphRAG/Graph/KnowledgeGraph.swift
index 823549b..507a4fc 100644
--- a/Sources/GraphRAG/Graph/KnowledgeGraph.swift
+++ b/Sources/GraphRAG/Graph/KnowledgeGraph.swift
@@ -213,10 +213,38 @@ public struct KnowledgeGraph: Sendable, Codable {
             nodeCount: n,
             edgeCount: relationshipCount,
             averageDegree: avgDegree,
-            maxDepth: 0
+            maxDepth: diameter()
         )
     }
 
+    /// Longest shortest-path (in hops) over the undirected graph — i.e. the
+    /// graph's diameter. O(V·(V+E)); intended for occasional stats calls.
+    private func diameter() -> Int {
+        guard entityOrder.count > 1 else { return 0 }
+        var adjacency: [EntityID: [EntityID]] = [:]
+        for rel in relationships {
+            adjacency[rel.source, default: []].append(rel.target)
+            adjacency[rel.target, default: []].append(rel.source)
+        }
+        var maxDist = 0
+        for start in entityOrder {
+            var dist: [EntityID: Int] = [start: 0]
+            var queue: [EntityID] = [start]
+            var head = 0
+            while head < queue.count {
+                let current = queue[head]
+                head += 1
+                let d = dist[current]!
+                if d > maxDist { maxDist = d }
+                for neighbor in adjacency[current] ?? [] where dist[neighbor] == nil {
+                    dist[neighbor] = d + 1
+                    queue.append(neighbor)
+                }
+            }
+        }
+        return maxDist
+    }
+
     // MARK: - Codable
 
     private enum CodingKeys: String, CodingKey {
diff --git a/Sources/GraphRAG/Graph/PageRank.swift b/Sources/GraphRAG/Graph/PageRank.swift
index 7265f81..8de2844 100644
--- a/Sources/GraphRAG/Graph/PageRank.swift
+++ b/Sources/GraphRAG/Graph/PageRank.swift
@@ -38,7 +38,9 @@ public struct PageRank: Sendable {
             outWeight[s] += w
         }
 
-        let d = dampingFactor
+        // Clamp to a valid probability so a misconfigured factor can't produce a
+        // negative teleport term (and negative scores).
+        let d = min(max(dampingFactor, 0), 1)
         let teleport = (1.0 - d) / Double(n)
         var scores = [Double](repeating: 1.0 / Double(n), count: n)
 
diff --git a/Sources/GraphRAG/Text/KeywordExtraction.swift b/Sources/GraphRAG/Text/KeywordExtraction.swift
index 6139ede..727bccc 100644
--- a/Sources/GraphRAG/Text/KeywordExtraction.swift
+++ b/Sources/GraphRAG/Text/KeywordExtraction.swift
@@ -13,9 +13,11 @@ public struct TfIdfKeywordExtractor: Sendable {
     public private(set) var totalDocuments: Int
     public let stopwords: Set<String>
 
-    public init(documentFrequencies: [String: Int] = [:], totalDocuments: Int = 1) {
+    public init(documentFrequencies: [String: Int] = [:], totalDocuments: Int = 0) {
         self.documentFrequencies = documentFrequencies
-        self.totalDocuments = max(1, totalDocuments)
+        // Start at the true count (0 for a fresh corpus). The smoothed IDF below
+        // handles an empty corpus without a phantom document.
+        self.totalDocuments = max(0, totalDocuments)
         self.stopwords = TfIdfKeywordExtractor.defaultStopwords
     }
 

From b3de614f127c3aa75406b718e2394b2b66904a70 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 28 Jun 2026 06:45:04 +0000
Subject: [PATCH 11/18] Address Codex review (round 7): sentence abbreviations,
 clamps, clear, BM25

- PatternExtractor: sentence segmentation treats org suffixes (Inc./Corp./Ltd.)
  and common abbreviations as non-sentence-ending, so "Acme Inc. was founded by
  Sam Altman" keeps its LEADS edge. Entity-span splitting still uses personTitles.
- KnowledgeGraph.clearEntitiesAndRelationships clears chunk entity references in
  both chunksByID and the document copies, so no chunk points at a removed id.
- BM25 keeps 2-letter acronym terms (AI/ML/EU); only single chars are dropped.
- HierarchicalChunker clamps negative overlap (public API parity with TextProcessor).
- LLMExtractor clamps model-provided relationship strength to [0, 1].
- Engine annotates a chunk during build only when its content still matches what
  was extracted/embedded, so a document replaced mid-await isn't tagged with
  stale entities/embeddings.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01VUzx3BzstYuB68txcHvAUh
---
 Sources/GraphRAG/Entity/LLMExtractor.swift     |  5 ++++-
 Sources/GraphRAG/Entity/PatternExtractor.swift | 14 +++++++++++---
 Sources/GraphRAG/Graph/KnowledgeGraph.swift    | 12 ++++++++++++
 Sources/GraphRAG/GraphRAG/Engine.swift         | 16 ++++++++--------
 Sources/GraphRAG/Retrieval/BM25.swift          |  4 +++-
 Sources/GraphRAG/Text/Chunking.swift           |  3 +++
 6 files changed, 41 insertions(+), 13 deletions(-)

diff --git a/Sources/GraphRAG/Entity/LLMExtractor.swift b/Sources/GraphRAG/Entity/LLMExtractor.swift
index 72814d8..16ce62e 100644
--- a/Sources/GraphRAG/Entity/LLMExtractor.swift
+++ b/Sources/GraphRAG/Entity/LLMExtractor.swift
@@ -111,10 +111,13 @@ public struct LLMEntityExtractor<Model: LanguageModel>: EntityExtracting {
             let tgt = data.target.lowercased().trimmingCharacters(in: .whitespacesAndNewlines)
             guard let sourceID = idByName[src], let targetID = idByName[tgt] else { continue }
             let relType = LLMEntityExtractor.relationTypeLabel(from: data.description)
+            // Clamp model-provided strength to a valid confidence; out-of-range
+            // values would distort traversal filtering and PageRank weights.
+            let confidence = min(max(data.strength ?? 0.7, 0), 1)
             relationships.append(
                 Relationship(
                     source: sourceID, target: targetID, relationType: relType,
-                    confidence: data.strength ?? 0.7, context: [chunk.id]))
+                    confidence: confidence, context: [chunk.id]))
         }
 
         return (entities, relationships)
diff --git a/Sources/GraphRAG/Entity/PatternExtractor.swift b/Sources/GraphRAG/Entity/PatternExtractor.swift
index 78d8fdc..792b67d 100644
--- a/Sources/GraphRAG/Entity/PatternExtractor.swift
+++ b/Sources/GraphRAG/Entity/PatternExtractor.swift
@@ -228,10 +228,10 @@ public struct PatternEntityExtractor: EntityExtracting {
         // wrongly link every person/org pair sharing a chunk. A period that ends
         // a person title ("Dr.") is an abbreviation, not a sentence boundary, so
         // "Dr. Smith works for Acme Inc." stays one sentence.
-        func periodEndsTitle(_ periodIndex: Int) -> Bool {
+        func periodEndsAbbreviation(_ periodIndex: Int) -> Bool {
             var s = periodIndex
             while s > 0 && chars[s - 1].isLetter { s -= 1 }
-            return PatternEntityExtractor.personTitles.contains(
+            return PatternEntityExtractor.sentenceAbbreviations.contains(
                 String(chars[s..<periodIndex]).lowercased())
         }
         var sentenceID = [Int](repeating: 0, count: chars.count + 1)
@@ -239,7 +239,7 @@ public struct PatternEntityExtractor: EntityExtracting {
         for k in 0..<chars.count {
             sentenceID[k] = sid
             let c = chars[k]
-            if c == "!" || c == "?" || (c == "." && !periodEndsTitle(k)) { sid += 1 }
+            if c == "!" || c == "?" || (c == "." && !periodEndsAbbreviation(k)) { sid += 1 }
         }
         sentenceID[chars.count] = sid
         func sentence(of offset: Int) -> Int { sentenceID[max(0, min(offset, chars.count))] }
@@ -370,6 +370,14 @@ public struct PatternEntityExtractor: EntityExtracting {
         "berlin", "washington", "boston", "chicago",
     ]
     static let personTitles: Set<String> = ["dr", "prof", "mr", "mrs", "ms"]
+    /// Words whose trailing period is an abbreviation rather than a sentence end,
+    /// used only for sentence segmentation in relationship inference (so
+    /// "Acme Inc. was founded by Sam Altman" stays one sentence). Entity-span
+    /// splitting still uses the narrower `personTitles`.
+    static let sentenceAbbreviations: Set<String> = [
+        "dr", "prof", "mr", "mrs", "ms", "jr", "sr", "st",
+        "inc", "corp", "ltd", "llc", "co", "etc", "vs",
+    ]
     static let blocklist: Set<String> = [
         "the", "and", "but", "or", "chapter", "section", "however", "therefore",
         "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday",
diff --git a/Sources/GraphRAG/Graph/KnowledgeGraph.swift b/Sources/GraphRAG/Graph/KnowledgeGraph.swift
index 507a4fc..bafda2c 100644
--- a/Sources/GraphRAG/Graph/KnowledgeGraph.swift
+++ b/Sources/GraphRAG/Graph/KnowledgeGraph.swift
@@ -115,12 +115,24 @@ public struct KnowledgeGraph: Sendable, Codable {
     }
 
     /// Drop all entities and relationships, preserving documents and chunks.
+    /// Chunk entity references are cleared too (in both `chunksByID` and the
+    /// document copies) so no chunk points at an entity id that no longer exists.
     public mutating func clearEntitiesAndRelationships() {
         entitiesByID.removeAll()
         entityOrder.removeAll()
         relationships.removeAll()
         outgoing.removeAll()
         incoming.removeAll()
+        for id in chunkOrder where !(chunksByID[id]?.entities.isEmpty ?? true) {
+            chunksByID[id]?.entities = []
+        }
+        for did in documentOrder {
+            guard var doc = documentsByID[did] else { continue }
+            for i in doc.chunks.indices where !doc.chunks[i].entities.isEmpty {
+                doc.chunks[i].entities = []
+            }
+            documentsByID[did] = doc
+        }
     }
 
     // MARK: - Lookup
diff --git a/Sources/GraphRAG/GraphRAG/Engine.swift b/Sources/GraphRAG/GraphRAG/Engine.swift
index e3d5fbc..4c330b8 100644
--- a/Sources/GraphRAG/GraphRAG/Engine.swift
+++ b/Sources/GraphRAG/GraphRAG/Engine.swift
@@ -126,11 +126,11 @@ public actor GraphRAG {
                 }
             }
 
-            // Re-fetch before writing: if the document was replaced during the
-            // extraction await, write entity ids onto the current chunk rather
-            // than clobbering new content with the pre-await snapshot. Always
-            // writing (even an empty list) clears stale ids from a prior build.
-            if var current = graph.chunk(id) {
+            // Only annotate the chunk if it wasn't replaced during the await
+            // (content still matches what we extracted from). A replacement bumps
+            // ingestionVersion, so the build is already marked unbuilt and will
+            // redo this next round rather than tagging new text with stale ids.
+            if var current = graph.chunk(id), current.content == chunk.content {
                 current.entities = entities.map(\.id)
                 graph.addChunk(current)
             }
@@ -140,9 +140,9 @@ public actor GraphRAG {
         for id in chunkIDs {
             guard let chunk = graph.chunk(id) else { continue }
             let embedding = try await embedder.embed(chunk.content)
-            // Re-fetch so a document replaced during the embedding await isn't
-            // overwritten by the stale snapshot.
-            if var current = graph.chunk(id) {
+            // Skip if the chunk was replaced during the embedding await (content
+            // changed), so we never attach an old-content embedding to new text.
+            if var current = graph.chunk(id), current.content == chunk.content {
                 current.embedding = embedding
                 graph.addChunk(current)
             }
diff --git a/Sources/GraphRAG/Retrieval/BM25.swift b/Sources/GraphRAG/Retrieval/BM25.swift
index d20ea38..4e86490 100644
--- a/Sources/GraphRAG/Retrieval/BM25.swift
+++ b/Sources/GraphRAG/Retrieval/BM25.swift
@@ -129,7 +129,9 @@ public struct BM25Retriever: Sendable {
         var tokens: [String] = []
         var current = ""
         func flush() {
-            if current.count > 2, !TfIdfKeywordExtractor.defaultStopwords.contains(current) {
+            // Keep 2-letter acronyms (AI, ML, EU); only single chars are dropped.
+            // Common short words are already removed by the stopword filter.
+            if current.count >= 2, !TfIdfKeywordExtractor.defaultStopwords.contains(current) {
                 tokens.append(current)
             }
             current = ""
diff --git a/Sources/GraphRAG/Text/Chunking.swift b/Sources/GraphRAG/Text/Chunking.swift
index 5c7004d..22c3f00 100644
--- a/Sources/GraphRAG/Text/Chunking.swift
+++ b/Sources/GraphRAG/Text/Chunking.swift
@@ -62,6 +62,9 @@ public struct HierarchicalChunker: Sendable {
         let chars = Array(text)
         let n = chars.count
         guard n > 0, chunkSize > 0 else { return [] }
+        // Clamp: a negative overlap would advance past the chunk end and skip
+        // text. (TextProcessor rejects it, but this public API must be safe too.)
+        let overlap = max(0, overlap)
 
         var spans: [ChunkSpan] = []
         var start = 0

From f6bdaf4fea58d6178bc2c2a87938cb87aef27cfc Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 28 Jun 2026 07:04:47 +0000
Subject: [PATCH 12/18] Address Codex review (round 8): sentence-end context,
 minConfidence, dedup, reingest

- PatternExtractor: an abbreviation period ends a sentence when the next word is
  capitalized, so "Acme Inc. was founded by Sam Altman" stays one sentence while
  "...Acme Inc. Bob Jones..." splits (person titles still never split).
- Engine: apply EntityConfig.minConfidence uniformly to every extractor's output
  (LLM/injected extractors don't self-filter), before the per-chunk cap.
- LLMExtractor: deduplicate entities by id in convert (gleaning passes can repeat
  an entity) so a duplicate can't consume the per-chunk cap.
- KnowledgeGraph.addDocument: when replacing an existing document id, purge the
  previous version's chunks so direct callers don't retain stale chunk text.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01VUzx3BzstYuB68txcHvAUh
---
 Sources/GraphRAG/Entity/LLMExtractor.swift     | 12 ++++++++++--
 Sources/GraphRAG/Entity/PatternExtractor.swift | 18 ++++++++++++++----
 Sources/GraphRAG/Graph/KnowledgeGraph.swift    |  8 +++++++-
 Sources/GraphRAG/GraphRAG/Engine.swift         |  6 +++++-
 4 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/Sources/GraphRAG/Entity/LLMExtractor.swift b/Sources/GraphRAG/Entity/LLMExtractor.swift
index 16ce62e..f9eff92 100644
--- a/Sources/GraphRAG/Entity/LLMExtractor.swift
+++ b/Sources/GraphRAG/Entity/LLMExtractor.swift
@@ -77,6 +77,7 @@ public struct LLMEntityExtractor<Model: LanguageModel>: EntityExtracting {
         -> (entities: [Entity], relationships: [Relationship])
     {
         var entities: [Entity] = []
+        var indexByID: [EntityID: Int] = [:]
         var idByName: [String: EntityID] = [:]
         let lowerContent = chunk.content.lowercased()
 
@@ -100,8 +101,15 @@ public struct LLMEntityExtractor<Model: LanguageModel>: EntityExtracting {
                         endOffset: end, confidence: 0.9))
             }
 
-            entities.append(
-                Entity(id: id, name: name, entityType: type, confidence: 0.9, mentions: mentions))
+            // Deduplicate by id (e.g. a gleaning pass repeating an entity) so a
+            // duplicate doesn't later consume the per-chunk cap.
+            if let idx = indexByID[id] {
+                entities[idx].mentions.append(contentsOf: mentions)
+            } else {
+                indexByID[id] = entities.count
+                entities.append(
+                    Entity(id: id, name: name, entityType: type, confidence: 0.9, mentions: mentions))
+            }
             idByName[name.lowercased()] = id
         }
 
diff --git a/Sources/GraphRAG/Entity/PatternExtractor.swift b/Sources/GraphRAG/Entity/PatternExtractor.swift
index 792b67d..a762337 100644
--- a/Sources/GraphRAG/Entity/PatternExtractor.swift
+++ b/Sources/GraphRAG/Entity/PatternExtractor.swift
@@ -228,18 +228,28 @@ public struct PatternEntityExtractor: EntityExtracting {
         // wrongly link every person/org pair sharing a chunk. A period that ends
         // a person title ("Dr.") is an abbreviation, not a sentence boundary, so
         // "Dr. Smith works for Acme Inc." stays one sentence.
-        func periodEndsAbbreviation(_ periodIndex: Int) -> Bool {
+        func periodIsSentenceEnd(_ periodIndex: Int) -> Bool {
             var s = periodIndex
             while s > 0 && chars[s - 1].isLetter { s -= 1 }
-            return PatternEntityExtractor.sentenceAbbreviations.contains(
-                String(chars[s..<periodIndex]).lowercased())
+            let word = String(chars[s..<periodIndex]).lowercased()
+            // Person titles precede a name, so never a sentence end ("Dr. Smith").
+            if PatternEntityExtractor.personTitles.contains(word) { return false }
+            // Other abbreviations (Inc./Corp.) end a sentence only when the next
+            // word is capitalized — "Acme Inc. was ..." stays one sentence, but
+            // "... Acme Inc. Bob ..." splits.
+            if PatternEntityExtractor.sentenceAbbreviations.contains(word) {
+                var t = periodIndex + 1
+                while t < chars.count && chars[t] == " " { t += 1 }
+                return t < chars.count && chars[t].isUppercase
+            }
+            return true
         }
         var sentenceID = [Int](repeating: 0, count: chars.count + 1)
         var sid = 0
         for k in 0..<chars.count {
             sentenceID[k] = sid
             let c = chars[k]
-            if c == "!" || c == "?" || (c == "." && !periodEndsAbbreviation(k)) { sid += 1 }
+            if c == "!" || c == "?" || (c == "." && periodIsSentenceEnd(k)) { sid += 1 }
         }
         sentenceID[chars.count] = sid
         func sentence(of offset: Int) -> Int { sentenceID[max(0, min(offset, chars.count))] }
diff --git a/Sources/GraphRAG/Graph/KnowledgeGraph.swift b/Sources/GraphRAG/Graph/KnowledgeGraph.swift
index bafda2c..4a7e8bc 100644
--- a/Sources/GraphRAG/Graph/KnowledgeGraph.swift
+++ b/Sources/GraphRAG/Graph/KnowledgeGraph.swift
@@ -82,7 +82,13 @@ public struct KnowledgeGraph: Sendable, Codable {
     }
 
     public mutating func addDocument(_ document: Document) {
-        if documentsByID[document.id] == nil { documentOrder.append(document.id) }
+        if documentsByID[document.id] == nil {
+            documentOrder.append(document.id)
+        } else {
+            // Replacing an existing id: purge the previous version's chunks so
+            // direct KnowledgeGraph callers don't retain stale chunk text.
+            removeChunks(forDocument: document.id)
+        }
         documentsByID[document.id] = document
     }
 
diff --git a/Sources/GraphRAG/GraphRAG/Engine.swift b/Sources/GraphRAG/GraphRAG/Engine.swift
index 4c330b8..a8c8bc7 100644
--- a/Sources/GraphRAG/GraphRAG/Engine.swift
+++ b/Sources/GraphRAG/GraphRAG/Engine.swift
@@ -104,7 +104,11 @@ public actor GraphRAG {
         for id in chunkIDs {
             guard let chunk = graph.chunk(id) else { continue }
             let extracted = try await extractor.extract(from: chunk)
-            var entities = extracted.entities
+            // Apply the configured confidence threshold uniformly — injected/LLM
+            // extractors don't self-filter the way the pattern extractor does.
+            var entities = extracted.entities.filter {
+                $0.confidence >= config.entity.minConfidence
+            }
             let relationships = extracted.relationships
 
             // Honor the per-chunk entity cap, keeping the highest-confidence ones.

From 420f1ccf7437441768a0eea6b573539e3c66ad2a Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 28 Jun 2026 07:18:47 +0000
Subject: [PATCH 13/18] Address Codex review (round 9): newline boundaries,
 best relation pair, PageRank weights

- PatternExtractor: a newline (or end of text) after an org-suffix abbreviation
  is a sentence boundary, so "Acme Inc.\nBob Jones ..." splits into two sentences.
- PatternExtractor: scan each entity pair's same-sentence mention pairs and keep
  the first specific typed relation found (falling back to the first generic
  one), so a later "works for" co-occurrence isn't shadowed by an earlier generic
  match.
- PageRank: skip non-positive-confidence edges (a source left with no positive
  out-edge is handled as dangling) so a single zero-confidence edge can't capture
  all of a node's mass.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01VUzx3BzstYuB68txcHvAUh
---
 .../GraphRAG/Entity/PatternExtractor.swift    | 76 ++++++++++---------
 Sources/GraphRAG/Graph/PageRank.swift         |  6 +-
 2 files changed, 44 insertions(+), 38 deletions(-)

diff --git a/Sources/GraphRAG/Entity/PatternExtractor.swift b/Sources/GraphRAG/Entity/PatternExtractor.swift
index a762337..8a08042 100644
--- a/Sources/GraphRAG/Entity/PatternExtractor.swift
+++ b/Sources/GraphRAG/Entity/PatternExtractor.swift
@@ -239,8 +239,11 @@ public struct PatternEntityExtractor: EntityExtracting {
             // "... Acme Inc. Bob ..." splits.
             if PatternEntityExtractor.sentenceAbbreviations.contains(word) {
                 var t = periodIndex + 1
-                while t < chars.count && chars[t] == " " { t += 1 }
-                return t < chars.count && chars[t].isUppercase
+                while t < chars.count && (chars[t] == " " || chars[t] == "\t") { t += 1 }
+                // End of text or a line break is a boundary; otherwise only when
+                // the next word is capitalized.
+                if t >= chars.count || chars[t].isNewline { return true }
+                return chars[t].isUppercase
             }
             return true
         }
@@ -261,39 +264,50 @@ public struct PatternEntityExtractor: EntityExtracting {
             for j in (i + 1)..<entities.count {
                 let a = entities[i]
                 let b = entities[j]
-                // Find a mention pair that shares a sentence; skip the pair if none.
-                guard let (aOff, bOff) = sameSentenceMentions(a, b, sentence: sentence) else {
-                    continue
+                // Evaluate every same-sentence mention pair and keep the most
+                // specific typed relation found — so "Alice met Acme. Alice works
+                // for Acme." yields WORKS_FOR, not just the earlier ASSOCIATED_WITH.
+                var chosen: (relType: String, source: EntityID, target: EntityID)?
+                search: for ma in a.mentions {
+                    for mb in b.mentions
+                    where sentence(of: ma.startOffset) == sentence(of: mb.startOffset) {
+                        let lo = min(ma.startOffset, mb.startOffset)
+                        let hi = max(ma.startOffset, mb.startOffset)
+                        // Proximity heuristic: skip if another same-type entity
+                        // lies between them — the phrase likely belongs to that
+                        // nearer pair. (Offline extractor, not a full classifier.)
+                        if hasInterveningSameType(a, b, lo: lo, hi: hi, among: entities) { continue }
+                        // Keyword context is just the span between the two mentions.
+                        let upper = min(hi + 1, chars.count)
+                        let context = (lo < upper ? String(chars[lo..<upper]) : "").lowercased()
+                        let relType = relationType(for: a.entityType, b.entityType, context: context)
+                        let (source, target) = orient(relType, a, b)
+                        if chosen == nil { chosen = (relType, source.id, target.id) }
+                        if PatternEntityExtractor.specificRelations.contains(relType) {
+                            chosen = (relType, source.id, target.id)
+                            break search
+                        }
+                    }
                 }
-                let lo = min(aOff, bOff)
-                let hi = max(aOff, bOff)
-                // Proximity heuristic: skip the pair if another entity of the same
-                // type as an endpoint lies between them — the connecting phrase
-                // most likely belongs to that nearer pair. (Offline extractor; not
-                // a full relation classifier, so this trades some recall for far
-                // fewer false edges in multi-fact sentences.)
-                if hasInterveningSameType(a, b, lo: lo, hi: hi, among: entities) { continue }
-                // Keyword context is just the span between the two mentions, so a
-                // phrase belonging to a different pair in the sentence can't leak.
-                let upper = min(hi + 1, chars.count)
-                let context = (lo < upper ? String(chars[lo..<upper]) : "").lowercased()
-
-                let relType = relationType(for: a.entityType, b.entityType, context: context)
-                // Orient asymmetric relations by entity role, independent of the
-                // order the spans happened to appear in the text.
-                let (source, target) = orient(relType, a, b)
-                let key = "\(source.id.raw)|\(target.id.raw)|\(relType)"
+                guard let result = chosen else { continue }
+                let key = "\(result.source.raw)|\(result.target.raw)|\(result.relType)"
                 if seen.contains(key) { continue }
                 seen.insert(key)
                 relationships.append(
                     Relationship(
-                        source: source.id, target: target.id, relationType: relType,
-                        confidence: 0.6, context: [chunk.id]))
+                        source: result.source, target: result.target,
+                        relationType: result.relType, confidence: 0.6, context: [chunk.id]))
             }
         }
         return relationships
     }
 
+    /// Typed relations preferred over the generic ASSOCIATED_WITH/KNOWS/RELATED_TO.
+    static let specificRelations: Set<String> = [
+        "WORKS_FOR", "LEADS", "BORN_IN", "LOCATED_IN", "HEADQUARTERED_IN",
+        "MARRIED_TO", "COLLEAGUE_OF",
+    ]
+
     /// Whether an entity (other than `a`/`b`) of the same type as one endpoint
     /// has a mention strictly between offsets `lo` and `hi`.
     private func hasInterveningSameType(
@@ -308,18 +322,6 @@ public struct PatternEntityExtractor: EntityExtracting {
         return false
     }
 
-    /// First mention pair of `a` and `b` that falls in the same sentence.
-    private func sameSentenceMentions(
-        _ a: Entity, _ b: Entity, sentence: (Int) -> Int
-    ) -> (Int, Int)? {
-        for ma in a.mentions {
-            for mb in b.mentions where sentence(ma.startOffset) == sentence(mb.startOffset) {
-                return (ma.startOffset, mb.startOffset)
-            }
-        }
-        return nil
-    }
-
     /// Order (source, target) for a typed relation by the entities' roles.
     /// Symmetric relations keep their text order.
     private func orient(_ relType: String, _ a: Entity, _ b: Entity) -> (Entity, Entity) {
diff --git a/Sources/GraphRAG/Graph/PageRank.swift b/Sources/GraphRAG/Graph/PageRank.swift
index 8de2844..a6f8284 100644
--- a/Sources/GraphRAG/Graph/PageRank.swift
+++ b/Sources/GraphRAG/Graph/PageRank.swift
@@ -33,7 +33,11 @@ public struct PageRank: Sendable {
         var outWeight = [Double](repeating: 0, count: n)
         for rel in graph.relationships {
             guard let s = indexOf[rel.source], let t = indexOf[rel.target] else { continue }
-            let w = Double(max(rel.confidence, 0.0001))
+            // Skip non-positive-confidence edges entirely; otherwise a single
+            // zero-confidence edge would receive all of a node's PageRank mass.
+            // Nodes left with no positive out-edge are handled as dangling.
+            let w = Double(rel.confidence)
+            guard w > 0 else { continue }
             incomingEdges[t].append((s, w))
             outWeight[s] += w
         }

From 38c78b756042e0d9e4885c3884b50c3f1c199123 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 28 Jun 2026 07:35:14 +0000
Subject: [PATCH 14/18] Address Codex review (round 10): org-name commas, LLM
 keys, host port, articles, evidence scrub
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- PatternExtractor: a comma followed by an org suffix doesn't split the run, so
  "Acme, Inc." stays one organization span; leading articles ("The United States",
  "The University of California") are stripped before classification so they map
  to location/organization instead of person.
- LLMExtractor: EntityData/RelationshipData decode both the short JSON-example
  keys and the prompt's verbose names (entity_name/source_entity/…), so Ollama
  responses in either shape parse.
- Ollama baseURL: don't append a second port when the host already includes one
  ("http://localhost:11434").
- KnowledgeGraph.removeChunks: scrub entity mentions and relationship context that
  reference removed chunks, so replacing a document leaves no stale evidence.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01VUzx3BzstYuB68txcHvAUh
---
 Sources/GraphRAG/Embeddings/Ollama.swift      |  7 +++
 Sources/GraphRAG/Entity/LLMExtractor.swift    | 47 +++++++++++++++----
 .../GraphRAG/Entity/PatternExtractor.swift    | 30 ++++++++++--
 Sources/GraphRAG/Graph/KnowledgeGraph.swift   | 14 ++++++
 4 files changed, 85 insertions(+), 13 deletions(-)

diff --git a/Sources/GraphRAG/Embeddings/Ollama.swift b/Sources/GraphRAG/Embeddings/Ollama.swift
index 9103255..124a439 100644
--- a/Sources/GraphRAG/Embeddings/Ollama.swift
+++ b/Sources/GraphRAG/Embeddings/Ollama.swift
@@ -51,6 +51,13 @@ public struct OllamaConfig: Sendable {
         // Accept bare hosts ("localhost", "127.0.0.1"): without a scheme, URL
         // parses the host as the scheme and the request fails.
         let normalizedHost = host.contains("://") ? host : "http://\(host)"
+        // If the host already includes a port (e.g. "http://localhost:11434"),
+        // don't append another.
+        if let schemeRange = normalizedHost.range(of: "://"),
+            normalizedHost[schemeRange.upperBound...].contains(":")
+        {
+            return normalizedHost
+        }
         return "\(normalizedHost):\(port)"
     }
 }
diff --git a/Sources/GraphRAG/Entity/LLMExtractor.swift b/Sources/GraphRAG/Entity/LLMExtractor.swift
index f9eff92..4b38109 100644
--- a/Sources/GraphRAG/Entity/LLMExtractor.swift
+++ b/Sources/GraphRAG/Entity/LLMExtractor.swift
@@ -182,12 +182,25 @@ public struct LLMEntityExtractor<Model: LanguageModel>: EntityExtracting {
         var type: String
         var description: String?
 
-        enum CodingKeys: String, CodingKey { case name, type, description }
+        // Accept both the short JSON-example keys and the verbose names the
+        // prompt's bullet instructions use (entity_name / entity_type / ...).
+        enum CodingKeys: String, CodingKey {
+            case name, type, description
+            case entityName = "entity_name"
+            case entityType = "entity_type"
+            case entityDescription = "entity_description"
+        }
         init(from decoder: Decoder) throws {
             let c = try decoder.container(keyedBy: CodingKeys.self)
-            name = (try? c.decode(String.self, forKey: .name)) ?? ""
-            type = (try? c.decode(String.self, forKey: .type)) ?? ""
-            description = try? c.decode(String.self, forKey: .description)
+            name =
+                (try? c.decode(String.self, forKey: .name))
+                ?? (try? c.decode(String.self, forKey: .entityName)) ?? ""
+            type =
+                (try? c.decode(String.self, forKey: .type))
+                ?? (try? c.decode(String.self, forKey: .entityType)) ?? ""
+            description =
+                (try? c.decode(String.self, forKey: .description))
+                ?? (try? c.decode(String.self, forKey: .entityDescription))
         }
     }
 
@@ -197,13 +210,29 @@ public struct LLMEntityExtractor<Model: LanguageModel>: EntityExtracting {
         var description: String
         var strength: Float?
 
-        enum CodingKeys: String, CodingKey { case source, target, description, strength }
+        // Accept both the short JSON-example keys and the verbose names from the
+        // prompt's bullet instructions (source_entity / relationship_strength / ...).
+        enum CodingKeys: String, CodingKey {
+            case source, target, description, strength
+            case sourceEntity = "source_entity"
+            case targetEntity = "target_entity"
+            case relationshipDescription = "relationship_description"
+            case relationshipStrength = "relationship_strength"
+        }
         init(from decoder: Decoder) throws {
             let c = try decoder.container(keyedBy: CodingKeys.self)
-            source = (try? c.decode(String.self, forKey: .source)) ?? ""
-            target = (try? c.decode(String.self, forKey: .target)) ?? ""
-            description = (try? c.decode(String.self, forKey: .description)) ?? ""
-            strength = try? c.decode(Float.self, forKey: .strength)
+            source =
+                (try? c.decode(String.self, forKey: .source))
+                ?? (try? c.decode(String.self, forKey: .sourceEntity)) ?? ""
+            target =
+                (try? c.decode(String.self, forKey: .target))
+                ?? (try? c.decode(String.self, forKey: .targetEntity)) ?? ""
+            description =
+                (try? c.decode(String.self, forKey: .description))
+                ?? (try? c.decode(String.self, forKey: .relationshipDescription)) ?? ""
+            strength =
+                (try? c.decode(Float.self, forKey: .strength))
+                ?? (try? c.decode(Float.self, forKey: .relationshipStrength))
         }
     }
 
diff --git a/Sources/GraphRAG/Entity/PatternExtractor.swift b/Sources/GraphRAG/Entity/PatternExtractor.swift
index 8a08042..8b977fc 100644
--- a/Sources/GraphRAG/Entity/PatternExtractor.swift
+++ b/Sources/GraphRAG/Entity/PatternExtractor.swift
@@ -96,8 +96,11 @@ public struct PatternEntityExtractor: EntityExtracting {
                     // (./!/?) also ends it ("Acme. Bob") — unless the word is a
                     // known abbreviation/title like "Dr." so "Dr. Smith" merges.
                     if j > runStart, let last = chars[j - 1].unicodeScalars.first {
-                        if CharacterSet(charactersIn: ",;:").contains(last) { break }
-                        if CharacterSet(charactersIn: ".!?").contains(last) {
+                        if CharacterSet(charactersIn: ",;:").contains(last) {
+                            // Keep "Acme, Inc." together: a comma immediately
+                            // followed by an org suffix doesn't end the run.
+                            if !(last == "," && nextWordIsOrgSuffix(chars, from: j)) { break }
+                        } else if CharacterSet(charactersIn: ".!?").contains(last) {
                             var ws = j - 1
                             while ws > runStart && !chars[ws - 1].isWhitespace { ws -= 1 }
                             let word = String(chars[ws..<j])
@@ -172,10 +175,29 @@ public struct PatternEntityExtractor: EntityExtracting {
         return prev.isWhitespace || "\"'([{".contains(prev)
     }
 
+    /// Whether the next word starting at/after `from` is an organization suffix
+    /// (e.g. "Inc"/"Inc."), used to keep "Acme, Inc." as one span.
+    private func nextWordIsOrgSuffix(_ chars: [Character], from: Int) -> Bool {
+        var t = from
+        while t < chars.count && (chars[t] == " " || chars[t] == "\t") { t += 1 }
+        var e = t
+        while e < chars.count && chars[e].isLetter { e += 1 }
+        guard e > t else { return false }
+        let word = String(chars[t..<e]).lowercased()
+        return PatternEntityExtractor.orgSuffixes.contains(word)
+            || PatternEntityExtractor.orgSuffixes.contains(word + ".")
+    }
+
     // MARK: - Classification
 
     private func classify(_ text: String) -> (type: String, confidence: Float)? {
-        let words = text.split(separator: " ").map(String.init)
+        var words = text.split(separator: " ").map(String.init)
+        guard !words.isEmpty else { return nil }
+        // Drop a leading article so "The United States" / "The University of
+        // California" classify as location/organization instead of person.
+        if words.count > 1, ["the", "a", "an"].contains(words[0].lowercased()) {
+            words.removeFirst()
+        }
         guard !words.isEmpty else { return nil }
 
         // Blocklist single sentence-initial words that are common/structural.
@@ -190,7 +212,7 @@ public struct PatternEntityExtractor: EntityExtracting {
             return ("ORGANIZATION", 0.9)
         }
         // Organizations by prefix ("University of ...", etc.).
-        let lower = text.lowercased()
+        let lower = words.joined(separator: " ").lowercased()
         for prefix in PatternEntityExtractor.orgPrefixes where lower.hasPrefix(prefix) {
             return ("ORGANIZATION", 0.9)
         }
diff --git a/Sources/GraphRAG/Graph/KnowledgeGraph.swift b/Sources/GraphRAG/Graph/KnowledgeGraph.swift
index 4a7e8bc..98396a3 100644
--- a/Sources/GraphRAG/Graph/KnowledgeGraph.swift
+++ b/Sources/GraphRAG/Graph/KnowledgeGraph.swift
@@ -118,6 +118,20 @@ public struct KnowledgeGraph: Sendable, Codable {
             doc.chunks.removeAll { removed.contains($0.id) }
             documentsByID[documentID] = doc
         }
+        // Scrub evidence pointing at the removed chunks: drop entity mentions and
+        // relationship-context entries that reference them, so traversal/stats and
+        // saved JSON don't expose facts from a document version that's gone.
+        for eid in entityOrder {
+            guard var entity = entitiesByID[eid], !entity.mentions.isEmpty else { continue }
+            let kept = entity.mentions.filter { !removed.contains($0.chunkID) }
+            if kept.count != entity.mentions.count {
+                entity.mentions = kept
+                entitiesByID[eid] = entity
+            }
+        }
+        for idx in relationships.indices where !relationships[idx].context.isEmpty {
+            relationships[idx].context.removeAll { removed.contains($0) }
+        }
     }
 
     /// Drop all entities and relationships, preserving documents and chunks.

From f872b93bc44da58b467d2f1ee7357df52247ab33 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 28 Jun 2026 07:36:47 +0000
Subject: [PATCH 15/18] Fix build: make LLM parse structs Decodable (CI compile
 error)

The alias CodingKeys added in round 10 (entity_name, source_entity, ...) broke
synthesized Encodable conformance, since every CodingKeys case must map to a
stored property. These structs are only decoded, so declare them Decodable
instead of Codable.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01VUzx3BzstYuB68txcHvAUh
---
 Sources/GraphRAG/Entity/LLMExtractor.swift | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Sources/GraphRAG/Entity/LLMExtractor.swift b/Sources/GraphRAG/Entity/LLMExtractor.swift
index 4b38109..9c81faa 100644
--- a/Sources/GraphRAG/Entity/LLMExtractor.swift
+++ b/Sources/GraphRAG/Entity/LLMExtractor.swift
@@ -163,7 +163,7 @@ public struct LLMEntityExtractor<Model: LanguageModel>: EntityExtracting {
 
     // MARK: - Parsing
 
-    struct ExtractionOutput: Codable {
+    struct ExtractionOutput: Decodable {
         var entities: [EntityData] = []
         var relationships: [RelationshipData] = []
 
@@ -177,7 +177,7 @@ public struct LLMEntityExtractor<Model: LanguageModel>: EntityExtracting {
         }
     }
 
-    struct EntityData: Codable {
+    struct EntityData: Decodable {
         var name: String
         var type: String
         var description: String?
@@ -204,7 +204,7 @@ public struct LLMEntityExtractor<Model: LanguageModel>: EntityExtracting {
         }
     }
 
-    struct RelationshipData: Codable {
+    struct RelationshipData: Decodable {
         var source: String
         var target: String
         var description: String

From ab37e9a485df7135400062c25f5e10367fe5183a Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 28 Jun 2026 07:58:49 +0000
Subject: [PATCH 16/18] Address Codex review (round 11): path guard, trailing
 cues, newlines, dead config

- KnowledgeGraph.findRelationshipPath returns nil unless both endpoints exist, so
  an unknown entity no longer gets a self path.
- PatternExtractor: treat bare newlines as sentence boundaries; extend a pair's
  keyword context to the sentence end when no other entity mention follows the
  later mention (captures trailing cues like "... are married" without leaking
  into a following pair). Documented the deliberate precision-over-recall
  trade-off for coordinated subjects.
- Remove the unused RetrievalConfig (maxExpansionDepth / *Weight knobs were never
  read) so Config exposes no silently-ignored settings.
- Engine: documented that extractRelationships gates insertion only (skipping an
  LLM's relationship prompting would need a protocol change).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01VUzx3BzstYuB68txcHvAUh
---
 .../GraphRAG/Entity/PatternExtractor.swift    | 38 +++++++++++++++++--
 Sources/GraphRAG/Graph/KnowledgeGraph.swift   |  2 +
 Sources/GraphRAG/GraphRAG/Config.swift        |  5 +--
 Sources/GraphRAG/GraphRAG/Engine.swift        |  5 +++
 Sources/GraphRAG/Retrieval/Hybrid.swift       | 23 -----------
 5 files changed, 43 insertions(+), 30 deletions(-)

diff --git a/Sources/GraphRAG/Entity/PatternExtractor.swift b/Sources/GraphRAG/Entity/PatternExtractor.swift
index 8b977fc..8036fa9 100644
--- a/Sources/GraphRAG/Entity/PatternExtractor.swift
+++ b/Sources/GraphRAG/Entity/PatternExtractor.swift
@@ -274,7 +274,11 @@ public struct PatternEntityExtractor: EntityExtracting {
         for k in 0..<chars.count {
             sentenceID[k] = sid
             let c = chars[k]
-            if c == "!" || c == "?" || (c == "." && periodIsSentenceEnd(k)) { sid += 1 }
+            // A newline also separates facts (line-per-fact chunks without
+            // terminal punctuation).
+            if c.isNewline || c == "!" || c == "?" || (c == "." && periodIsSentenceEnd(k)) {
+                sid += 1
+            }
         }
         sentenceID[chars.count] = sid
         func sentence(of offset: Int) -> Int { sentenceID[max(0, min(offset, chars.count))] }
@@ -298,9 +302,23 @@ public struct PatternEntityExtractor: EntityExtracting {
                         // Proximity heuristic: skip if another same-type entity
                         // lies between them — the phrase likely belongs to that
                         // nearer pair. (Offline extractor, not a full classifier.)
+                        //
+                        // Known limitation: this also skips coordinated subjects
+                        // ("Alice and Bob are employed by Acme" drops Alice→Acme).
+                        // Relaxing it to allow coordination would reintroduce
+                        // cross-fact false edges ("Alice works for Acme and Bob
+                        // works for Beta" → Alice→Beta), so precision is preferred.
                         if hasInterveningSameType(a, b, lo: lo, hi: hi, among: entities) { continue }
-                        // Keyword context is just the span between the two mentions.
-                        let upper = min(hi + 1, chars.count)
+                        // Context is the span between the mentions. If no other
+                        // entity mention follows the later one in this sentence,
+                        // extend to the sentence end so a trailing cue ("... are
+                        // married") is seen — without leaking into a later pair.
+                        var upper = min(hi + 1, chars.count)
+                        if !entityMentionFollows(after: hi, sentence: sentence, entities: entities, a: a, b: b) {
+                            var e = hi
+                            while e < chars.count && sentence(of: e) == sentence(of: hi) { e += 1 }
+                            upper = e
+                        }
                         let context = (lo < upper ? String(chars[lo..<upper]) : "").lowercased()
                         let relType = relationType(for: a.entityType, b.entityType, context: context)
                         let (source, target) = orient(relType, a, b)
@@ -330,6 +348,20 @@ public struct PatternEntityExtractor: EntityExtracting {
         "MARRIED_TO", "COLLEAGUE_OF",
     ]
 
+    /// Whether an entity other than `a`/`b` has a mention after `offset` within
+    /// the same sentence (used to decide if a trailing cue window is safe).
+    private func entityMentionFollows(
+        after offset: Int, sentence: (Int) -> Int, entities: [Entity], a: Entity, b: Entity
+    ) -> Bool {
+        let s = sentence(offset)
+        for c in entities where c.id != a.id && c.id != b.id {
+            for m in c.mentions where m.startOffset > offset && sentence(m.startOffset) == s {
+                return true
+            }
+        }
+        return false
+    }
+
     /// Whether an entity (other than `a`/`b`) of the same type as one endpoint
     /// has a mention strictly between offsets `lo` and `hi`.
     private func hasInterveningSameType(
diff --git a/Sources/GraphRAG/Graph/KnowledgeGraph.swift b/Sources/GraphRAG/Graph/KnowledgeGraph.swift
index 98396a3..ed903f5 100644
--- a/Sources/GraphRAG/Graph/KnowledgeGraph.swift
+++ b/Sources/GraphRAG/Graph/KnowledgeGraph.swift
@@ -222,6 +222,8 @@ public struct KnowledgeGraph: Sendable, Codable {
     public func findRelationshipPath(
         from source: EntityID, to target: EntityID, maxDepth: Int = 5
     ) -> [EntityID]? {
+        // Endpoints must exist; otherwise even the self-path is meaningless.
+        guard contains(source), contains(target) else { return nil }
         if source == target { return [source] }
         var visited: Set<EntityID> = [source]
         var queue: [(EntityID, [EntityID])] = [(source, [source])]
diff --git a/Sources/GraphRAG/GraphRAG/Config.swift b/Sources/GraphRAG/GraphRAG/Config.swift
index f55e4c9..e16d9e0 100644
--- a/Sources/GraphRAG/GraphRAG/Config.swift
+++ b/Sources/GraphRAG/GraphRAG/Config.swift
@@ -57,7 +57,6 @@ public struct Config: Sendable {
     public var graph: GraphConfig
     public var text: TextConfig
     public var entity: EntityConfig
-    public var retrieval: RetrievalConfig
 
     public init(
         outputDir: String = "./output",
@@ -70,8 +69,7 @@ public struct Config: Sendable {
         embedding: EmbeddingConfig = EmbeddingConfig(),
         graph: GraphConfig = GraphConfig(),
         text: TextConfig = TextConfig(),
-        entity: EntityConfig = EntityConfig(),
-        retrieval: RetrievalConfig = RetrievalConfig()
+        entity: EntityConfig = EntityConfig()
     ) {
         self.outputDir = outputDir
         self.chunkSize = chunkSize
@@ -84,7 +82,6 @@ public struct Config: Sendable {
         self.graph = graph
         self.text = text
         self.entity = entity
-        self.retrieval = retrieval
     }
 
     public static let `default` = Config()
diff --git a/Sources/GraphRAG/GraphRAG/Engine.swift b/Sources/GraphRAG/GraphRAG/Engine.swift
index a8c8bc7..2593c64 100644
--- a/Sources/GraphRAG/GraphRAG/Engine.swift
+++ b/Sources/GraphRAG/GraphRAG/Engine.swift
@@ -120,6 +120,11 @@ public actor GraphRAG {
 
             let keptIDs = Set(entities.map(\.id))
             for entity in entities { graph.addEntity(entity) }
+            // `extractRelationships == false` gates insertion here. We don't try
+            // to also suppress the extractor's own relationship work: the
+            // EntityExtracting protocol returns entities and relationships from a
+            // single call, so skipping the LLM's relationship prompting would
+            // require a protocol/prompt change. Deferred deliberately.
             if config.entity.extractRelationships {
                 // Scope to entities that survived THIS chunk's cap — not global
                 // graph state — so the cap can't be defeated by an id that
diff --git a/Sources/GraphRAG/Retrieval/Hybrid.swift b/Sources/GraphRAG/Retrieval/Hybrid.swift
index 6497ff4..9aac413 100644
--- a/Sources/GraphRAG/Retrieval/Hybrid.swift
+++ b/Sources/GraphRAG/Retrieval/Hybrid.swift
@@ -209,26 +209,3 @@ public struct HybridRetriever: Sendable {
     }
 }
 
-/// Retrieval-tuning knobs mirroring the Rust `RetrievalConfig`.
-///
-/// Top-k and similarity threshold live on the top-level `Config`
-/// (`topKResults` / `similarityThreshold`) to avoid two sources of truth; the
-/// fields here govern graph-expansion scoring.
-public struct RetrievalConfig: Sendable {
-    public var maxExpansionDepth: Int
-    public var entityWeight: Float
-    public var chunkWeight: Float
-    public var graphWeight: Float
-
-    public init(
-        maxExpansionDepth: Int = 2,
-        entityWeight: Float = 0.4,
-        chunkWeight: Float = 0.4,
-        graphWeight: Float = 0.2
-    ) {
-        self.maxExpansionDepth = maxExpansionDepth
-        self.entityWeight = entityWeight
-        self.chunkWeight = chunkWeight
-        self.graphWeight = graphWeight
-    }
-}

From b6dfa551cd9ea96422c13e76c69bd710657587de Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 28 Jun 2026 08:15:53 +0000
Subject: [PATCH 17/18] Address Codex review (round 12): drop evidence-less
 edges, dead graph config, path dedup

- KnowledgeGraph.removeChunks now drops a relationship whose only evidence was a
  removed chunk (a context-less leftover would expose a stale fact) and rebuilds
  the adjacency index; edges that never had context are preserved.
- Remove the unused GraphConfig (maxConnections/threshold were never read) so
  Config exposes no silently-ignored knobs.
- findAllPaths collapses parallel edges (multiple relation types between the same
  nodes) to one neighbor, so entity-only paths aren't duplicated and don't waste
  the maxPaths budget.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01VUzx3BzstYuB68txcHvAUh
---
 Sources/GraphRAG/Graph/KnowledgeGraph.swift | 28 +++++++++++++++++++--
 Sources/GraphRAG/Graph/Traversal.swift      | 20 +++++++++------
 Sources/GraphRAG/GraphRAG/Config.swift      | 13 ----------
 3 files changed, 38 insertions(+), 23 deletions(-)

diff --git a/Sources/GraphRAG/Graph/KnowledgeGraph.swift b/Sources/GraphRAG/Graph/KnowledgeGraph.swift
index ed903f5..0986077 100644
--- a/Sources/GraphRAG/Graph/KnowledgeGraph.swift
+++ b/Sources/GraphRAG/Graph/KnowledgeGraph.swift
@@ -129,8 +129,32 @@ public struct KnowledgeGraph: Sendable, Codable {
                 entitiesByID[eid] = entity
             }
         }
-        for idx in relationships.indices where !relationships[idx].context.isEmpty {
-            relationships[idx].context.removeAll { removed.contains($0) }
+        // Scrub relationship context; drop a relationship whose only evidence was
+        // a removed chunk (a context-less leftover would expose a stale fact).
+        // Edges that never had context are kept.
+        var survived: [Relationship] = []
+        survived.reserveCapacity(relationships.count)
+        for var rel in relationships {
+            let hadContext = !rel.context.isEmpty
+            rel.context.removeAll { removed.contains($0) }
+            if hadContext && rel.context.isEmpty { continue }
+            survived.append(rel)
+        }
+        if survived.count != relationships.count {
+            relationships = survived
+            rebuildAdjacency()
+        } else {
+            relationships = survived
+        }
+    }
+
+    /// Rebuild the outgoing/incoming index after the `relationships` array changes.
+    private mutating func rebuildAdjacency() {
+        outgoing.removeAll()
+        incoming.removeAll()
+        for (index, rel) in relationships.enumerated() {
+            outgoing[rel.source, default: []].append(index)
+            incoming[rel.target, default: []].append(index)
         }
     }
 
diff --git a/Sources/GraphRAG/Graph/Traversal.swift b/Sources/GraphRAG/Graph/Traversal.swift
index 9535e83..2712fff 100644
--- a/Sources/GraphRAG/Graph/Traversal.swift
+++ b/Sources/GraphRAG/Graph/Traversal.swift
@@ -190,14 +190,18 @@ public struct GraphTraversal: Sendable {
         // `<= 0` (not `== 0`) so a negative configured maxDepth yields no expansion.
         if remaining <= 0 { return }
         visited.insert(current)
-        for (neighbor, relationship) in graph.neighbors(of: current) {
-            guard passesFilter(relationship) else { continue }
-            if !visited.contains(neighbor) {
-                path.append(neighbor)
-                pathDFS(graph, current: neighbor, target: target, remaining: remaining - 1,
-                        path: &path, visited: &visited, paths: &paths)
-                path.removeLast()
-            }
+        // Paths are entity-only, so collapse parallel edges (multiple relation
+        // types between the same nodes) to one neighbor to avoid duplicate paths.
+        var uniqueNeighbors: [EntityID] = []
+        var seenNeighbors: Set<EntityID> = []
+        for (neighbor, relationship) in graph.neighbors(of: current) where passesFilter(relationship) {
+            if seenNeighbors.insert(neighbor).inserted { uniqueNeighbors.append(neighbor) }
+        }
+        for neighbor in uniqueNeighbors where !visited.contains(neighbor) {
+            path.append(neighbor)
+            pathDFS(graph, current: neighbor, target: target, remaining: remaining - 1,
+                    path: &path, visited: &visited, paths: &paths)
+            path.removeLast()
         }
         visited.remove(current)
     }
diff --git a/Sources/GraphRAG/GraphRAG/Config.swift b/Sources/GraphRAG/GraphRAG/Config.swift
index e16d9e0..f940465 100644
--- a/Sources/GraphRAG/GraphRAG/Config.swift
+++ b/Sources/GraphRAG/GraphRAG/Config.swift
@@ -14,16 +14,6 @@ public struct EmbeddingConfig: Sendable {
     }
 }
 
-public struct GraphConfig: Sendable {
-    public var maxConnections: Int
-    public var threshold: Float
-
-    public init(maxConnections: Int = 10, threshold: Float = 0.8) {
-        self.maxConnections = maxConnections
-        self.threshold = threshold
-    }
-}
-
 public struct TextConfig: Sendable {
     public var languages: [String]
 
@@ -54,7 +44,6 @@ public struct Config: Sendable {
     public var approach: String
 
     public var embedding: EmbeddingConfig
-    public var graph: GraphConfig
     public var text: TextConfig
     public var entity: EntityConfig
 
@@ -67,7 +56,6 @@ public struct Config: Sendable {
         similarityThreshold: Float = 0.8,
         approach: String = "hybrid",
         embedding: EmbeddingConfig = EmbeddingConfig(),
-        graph: GraphConfig = GraphConfig(),
         text: TextConfig = TextConfig(),
         entity: EntityConfig = EntityConfig()
     ) {
@@ -79,7 +67,6 @@ public struct Config: Sendable {
         self.similarityThreshold = similarityThreshold
         self.approach = approach
         self.embedding = embedding
-        self.graph = graph
         self.text = text
         self.entity = entity
     }

From 8f51562e4a193aa2d74378b9468a18fcdd4b408a Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 28 Jun 2026 08:32:05 +0000
Subject: [PATCH 18/18] Address Codex review (round 13): drop orphaned
 entities, skip keyword-only embeds, fix offset docs

- KnowledgeGraph.removeChunks removes entities whose mentions are entirely
  exhausted by the removal (no independent evidence) and drops relationships whose
  endpoint was removed, alongside the existing context scrub.
- Engine.build skips the embedding stage entirely when approach is "keyword"
  (compared case-insensitively), so a keyword-only corpus doesn't pay embedder
  latency or fail on a remote embedder.
- TextChunk start/end offsets are documented as character (grapheme) offsets, not
  UTF-8 byte offsets, matching how the chunker computes them.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01VUzx3BzstYuB68txcHvAUh
---
 Sources/GraphRAG/Core/Models.swift          |  6 ++--
 Sources/GraphRAG/Graph/KnowledgeGraph.swift | 31 +++++++++++++--------
 Sources/GraphRAG/GraphRAG/Engine.swift      | 22 +++++++++------
 3 files changed, 36 insertions(+), 23 deletions(-)

diff --git a/Sources/GraphRAG/Core/Models.swift b/Sources/GraphRAG/Core/Models.swift
index 8cefbea..dd08b44 100644
--- a/Sources/GraphRAG/Core/Models.swift
+++ b/Sources/GraphRAG/Core/Models.swift
@@ -35,9 +35,11 @@ public struct TextChunk: Codable, Sendable, Identifiable, Equatable {
     public var id: ChunkID
     public var documentID: DocumentID
     public var content: String
-    /// Byte offset of the chunk start within the original document content.
+    /// Character (grapheme) offset of the chunk start within the original
+    /// document content — not a UTF-8 byte offset.
     public var startOffset: Int
-    /// Byte offset of the chunk end within the original document content.
+    /// Character (grapheme) offset of the chunk end within the original document
+    /// content — not a UTF-8 byte offset.
     public var endOffset: Int
     /// Optional dense embedding for semantic search.
     public var embedding: [Float]?
diff --git a/Sources/GraphRAG/Graph/KnowledgeGraph.swift b/Sources/GraphRAG/Graph/KnowledgeGraph.swift
index 0986077..5ab1d83 100644
--- a/Sources/GraphRAG/Graph/KnowledgeGraph.swift
+++ b/Sources/GraphRAG/Graph/KnowledgeGraph.swift
@@ -118,34 +118,41 @@ public struct KnowledgeGraph: Sendable, Codable {
             doc.chunks.removeAll { removed.contains($0.id) }
             documentsByID[documentID] = doc
         }
-        // Scrub evidence pointing at the removed chunks: drop entity mentions and
-        // relationship-context entries that reference them, so traversal/stats and
-        // saved JSON don't expose facts from a document version that's gone.
+        // Scrub evidence pointing at the removed chunks. Drop mentions that reference
+        // them; if that exhausts an entity's mentions (no independent evidence remains),
+        // remove the entity too so it doesn't linger in stats/traversal/JSON from a
+        // document version that's gone. Entities that never had mentions are kept.
+        var removedEntities: Set<EntityID> = []
         for eid in entityOrder {
             guard var entity = entitiesByID[eid], !entity.mentions.isEmpty else { continue }
             let kept = entity.mentions.filter { !removed.contains($0.chunkID) }
-            if kept.count != entity.mentions.count {
+            if kept.isEmpty {
+                removedEntities.insert(eid)
+            } else if kept.count != entity.mentions.count {
                 entity.mentions = kept
                 entitiesByID[eid] = entity
             }
         }
+        if !removedEntities.isEmpty {
+            for eid in removedEntities { entitiesByID.removeValue(forKey: eid) }
+            entityOrder.removeAll { removedEntities.contains($0) }
+        }
         // Scrub relationship context; drop a relationship whose only evidence was
-        // a removed chunk (a context-less leftover would expose a stale fact).
-        // Edges that never had context are kept.
+        // a removed chunk, or whose endpoint entity was just removed (a leftover
+        // would expose a stale fact). Edges that never had context are kept.
         var survived: [Relationship] = []
         survived.reserveCapacity(relationships.count)
         for var rel in relationships {
+            if removedEntities.contains(rel.source) || removedEntities.contains(rel.target) {
+                continue
+            }
             let hadContext = !rel.context.isEmpty
             rel.context.removeAll { removed.contains($0) }
             if hadContext && rel.context.isEmpty { continue }
             survived.append(rel)
         }
-        if survived.count != relationships.count {
-            relationships = survived
-            rebuildAdjacency()
-        } else {
-            relationships = survived
-        }
+        relationships = survived
+        rebuildAdjacency()
     }
 
     /// Rebuild the outgoing/incoming index after the `relationships` array changes.
diff --git a/Sources/GraphRAG/GraphRAG/Engine.swift b/Sources/GraphRAG/GraphRAG/Engine.swift
index 2593c64..7f312eb 100644
--- a/Sources/GraphRAG/GraphRAG/Engine.swift
+++ b/Sources/GraphRAG/GraphRAG/Engine.swift
@@ -145,15 +145,19 @@ public actor GraphRAG {
             }
         }
 
-        // Stage 2: embed chunks.
-        for id in chunkIDs {
-            guard let chunk = graph.chunk(id) else { continue }
-            let embedding = try await embedder.embed(chunk.content)
-            // Skip if the chunk was replaced during the embedding await (content
-            // changed), so we never attach an old-content embedding to new text.
-            if var current = graph.chunk(id), current.content == chunk.content {
-                current.embedding = embedding
-                graph.addChunk(current)
+        // Stage 2: embed chunks. Skipped entirely for keyword-only retrieval,
+        // which never uses embeddings, so a BM25-only build neither waits on nor
+        // fails because of the embedder (e.g. a remote Ollama embedder).
+        if config.approach.lowercased() != "keyword" {
+            for id in chunkIDs {
+                guard let chunk = graph.chunk(id) else { continue }
+                let embedding = try await embedder.embed(chunk.content)
+                // Skip if the chunk was replaced during the embedding await
+                // (content changed), so we never attach an old-content embedding.
+                if var current = graph.chunk(id), current.content == chunk.content {
+                    current.embedding = embedding
+                    graph.addChunk(current)
+                }
             }
         }