From 6eb1bcfad3b9d15b0375c140710fc2c294a3b341 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Mon, 16 Mar 2026 13:56:55 +0000 Subject: [PATCH] DAOS-18541 rebuild: accumulate more OIDs per migrate RPC to reduce RPC count Fix yield-count accounting in the scanner. A send-side batching policy is also introduced: the send ULT defers flushing until at least REBUILD_SEND_BATCH_MIN OIDs are queued or REBUILD_SEND_BATCH_TIMEOUT_SEC seconds have elapsed. Without batching, a fast scanner floods the destination rank with many small RPCs, exhausting IB receive buffers and triggering timeouts. This is especially severe during reintegration, where all OIDs are concentrated on a single target rank. Signed-off-by: Wang Shilong --- src/rebuild/rebuild_internal.h | 5 +-- src/rebuild/scan.c | 59 ++++++++++++++++++++++++++++++++-- src/rebuild/srv.c | 1 + 3 files changed, 60 insertions(+), 5 deletions(-) diff --git a/src/rebuild/rebuild_internal.h b/src/rebuild/rebuild_internal.h index 4eb7f8ef2b5..79dcb0d9514 100644 --- a/src/rebuild/rebuild_internal.h +++ b/src/rebuild/rebuild_internal.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2017-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -249,7 +249,8 @@ struct rebuild_pool_tls { uint64_t rebuild_pool_reclaim_obj_count; unsigned int rebuild_pool_ver; uint32_t rebuild_pool_gen; - uint64_t rebuild_pool_leader_term; + uint64_t rebuild_pool_leader_term; + uint64_t rebuild_pool_obj_send_pending; int rebuild_pool_status; unsigned int rebuild_pool_scanning:1, rebuild_pool_scan_done:1; diff --git a/src/rebuild/scan.c b/src/rebuild/scan.c index 9ee42ebf855..cb06a153e83 100644 --- a/src/rebuild/scan.c +++ b/src/rebuild/scan.c @@ -29,8 +29,14 @@ #include "rebuild_internal.h" #define REBUILD_SEND_LIMIT 4096 +/* Minimum pending objects before the send ULT flushes a batch (25% of max). 
*/ +#define REBUILD_SEND_BATCH_MIN (REBUILD_SEND_LIMIT / 4) +/* Maximum seconds to wait for a batch to fill before flushing anyway. */ +#define REBUILD_SEND_BATCH_TIMEOUT_SEC 1 + struct rebuild_send_arg { struct rebuild_tgt_pool_tracker *rpt; + struct rebuild_pool_tls *tls; daos_unit_oid_t *oids; daos_epoch_t *ephs; daos_epoch_t *punched_ephs; @@ -76,6 +82,10 @@ rebuild_obj_fill_buf(daos_handle_t ih, d_iov_t *key_iov, if (rc != 0) return rc; + /* This OID is now removed from the btree; account for it. */ + D_ASSERT(arg->tls->rebuild_pool_obj_send_pending > 0); + arg->tls->rebuild_pool_obj_send_pending--; + /* re-probe the dbtree after delete */ rc = dbtree_iter_probe(ih, BTR_PROBE_FIRST, DAOS_INTENT_MIGRATION, NULL, NULL); @@ -274,6 +284,7 @@ rebuild_objects_send_ult(void *data) daos_epoch_t *punched_ephs = NULL; unsigned int *shards = NULL; int rc = 0; + uint64_t rebuild_send_wait_start; tls = rebuild_pool_tls_lookup(rpt->rt_pool_uuid, rpt->rt_rebuild_ver, rpt->rt_rebuild_gen); @@ -301,17 +312,55 @@ rebuild_objects_send_ult(void *data) arg.ephs = ephs; arg.punched_ephs = punched_ephs; arg.rpt = rpt; + arg.tls = tls; + + rebuild_send_wait_start = daos_gettime_coarse(); + + /* + * Batch OIDs before sending migrate RPCs to avoid RPC fragmentation. + * The scan ULT yields every ~SCAN_YIELD_CNT placement-cost units + * (1 per OID for small objects, up to more than 128 per OID for EC_16P3GX + * depends on cluster size), so without batching the send ULT would flush + * at most 64 OIDs per RPC instead of the REBUILD_SEND_LIMIT maximum. + * Hold the flush until REBUILD_SEND_BATCH_MIN OIDs are pending or + * REBUILD_SEND_BATCH_TIMEOUT_SEC seconds have elapsed; flush immediately + * when the scan is done. 
+ */ while (!tls->rebuild_pool_scan_done || !dbtree_is_empty(tls->rebuild_tree_hdl)) { + bool scan_done; + bool tree_empty; + uint64_t now; + uint64_t elapsed; + if (rpt->rt_stable_epoch == 0) { dss_sleep(0); continue; } - if (dbtree_is_empty(tls->rebuild_tree_hdl)) { - dss_sleep(0); + tree_empty = dbtree_is_empty(tls->rebuild_tree_hdl); + scan_done = tls->rebuild_pool_scan_done; + now = daos_gettime_coarse(); + + if (tree_empty) { + /* Reset wait clock and yield to let scan make progress. */ + rebuild_send_wait_start = now; + dss_sleep(10); + continue; + } + + elapsed = now - rebuild_send_wait_start; + if (!scan_done && tls->rebuild_pool_obj_send_pending < REBUILD_SEND_BATCH_MIN && + elapsed < REBUILD_SEND_BATCH_TIMEOUT_SEC) { + dss_sleep(10); continue; } + D_DEBUG(DB_REBUILD, + DF_RB " send batch: pending %" PRIu64 " elapsed %" PRIu64 "s" + " scan_done %d\n", + DP_RB_RPT(rpt), tls->rebuild_pool_obj_send_pending, elapsed, + (int)scan_done); + /* walk through the rebuild tree and send the rebuild objects */ rc = dbtree_iterate(tls->rebuild_tree_hdl, DAOS_INTENT_MIGRATION, false, rebuild_cont_iter_cb, &arg); @@ -319,6 +368,8 @@ rebuild_objects_send_ult(void *data) D_ERROR("dbtree iterate failed: "DF_RC"\n", DP_RC(rc)); break; } + + rebuild_send_wait_start = now; dss_sleep(0); } @@ -389,6 +440,8 @@ rebuild_object_insert(struct rebuild_tgt_pool_tracker *rpt, uuid_t co_uuid, DP_UUID(co_uuid), DP_UOID(oid), tgt_id); rc = 0; } else { + if (rc == 0) + tls->rebuild_pool_obj_send_pending++; D_DEBUG(DB_REBUILD, "insert "DF_UOID"/"DF_UUID" tgt %u "DF_U64"/"DF_U64": " DF_RC"\n", DP_UOID(oid), DP_UUID(co_uuid), tgt_id, epoch, punched_epoch, DP_RC(rc)); @@ -790,7 +843,7 @@ rebuild_obj_scan_cb(daos_handle_t ch, vos_iter_entry_t *ent, if (map != NULL) pl_map_decref(map); - if (--arg->yield_cnt <= 0) { + if (arg->yield_cnt <= 0) { D_DEBUG(DB_REBUILD, DF_UUID" rebuild yield: %d\n", DP_UUID(rpt->rt_pool_uuid), rc); arg->yield_cnt = SCAN_YIELD_CNT; diff --git a/src/rebuild/srv.c 
b/src/rebuild/srv.c index 1e3b618e1f6..d435bc4af9c 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -127,6 +127,7 @@ rebuild_pool_tls_create(uuid_t pool_uuid, uuid_t poh_uuid, uuid_t coh_uuid, rebuild_pool_tls->rebuild_pool_scanning = 1; rebuild_pool_tls->rebuild_pool_scan_done = 0; rebuild_pool_tls->rebuild_pool_obj_count = 0; + rebuild_pool_tls->rebuild_pool_obj_send_pending = 0; rebuild_pool_tls->rebuild_pool_reclaim_obj_count = 0; rebuild_pool_tls->rebuild_tree_hdl = DAOS_HDL_INVAL; /* Only 1 thread will access the list, no need lock */