diff --git a/src/syscall/mem.c b/src/syscall/mem.c index b682b75..00f93b0 100644 --- a/src/syscall/mem.c +++ b/src/syscall/mem.c @@ -266,17 +266,21 @@ static void split_regions_at_boundary(guest_t *g, uint64_t boundary) static uint64_t find_free_gap_inner(const guest_t *g, uint64_t length, uint64_t min_addr, - uint64_t max_addr) + uint64_t max_addr, + uint64_t align) { - /* Round the search start up to the next host-page boundary so an unaligned - * addr hint cannot return a result that lands inside a host page already - * covered by a preceding region's overlay tail (the overlay extends to + /* Round the search start up to the requested alignment so an unaligned addr + * hint cannot return a result that lands inside a host page already covered + * by a preceding region's overlay tail (the overlay extends to * ALIGN_UP(r->end, hps)). Apple Silicon enforces 16 KiB host pages; * aligning to the guest 4 KiB page is not enough. Advance past each walked - * region to the same boundary for the same reason. + * region to the same boundary for the same reason. MAP_SHARED file-backed + * allocations may request 2 MiB alignment as a best-effort placement + * preference so consecutive mappings usually avoid sharing an HVF stage-2 + * segment, which reduces segment-table fragmentation for memfd-style + * allocation patterns. */ - size_t hps = host_page_size_cached(); - uint64_t gap_start = ALIGN_UP(min_addr, hps); + uint64_t gap_start = ALIGN_UP(min_addr, align); /* Skip the prefix of regions entirely below gap_start in O(log n). 
After a * successful allocation the gap hint advances near or past the existing @@ -307,8 +311,10 @@ static uint64_t find_free_gap_inner(const guest_t *g, g->regions[i].start >= gap_start + length) return gap_start; - /* Region overlaps; advance past it and round to the next host page */ - gap_start = ALIGN_UP(g->regions[i].end, hps); + /* Region overlaps; advance past it and round to the next aligned + * boundary so the caller's alignment promise holds across allocations. + */ + gap_start = ALIGN_UP(g->regions[i].end, align); } /* Check trailing space after all regions */ @@ -321,12 +327,16 @@ static uint64_t find_free_gap_inner(const guest_t *g, * The hint tracks the first address after the last successful mapping in each * region, which avoids rescanning the same prefix on sequential mmap activity. * A miss falls back to the region base so holes reopened by munmap are still - * reusable. + * reusable. The align argument is the per-call start boundary the result must + * satisfy; some sys_mmap callers first pass BLOCK_2MIB as a best-effort + * placement preference for MAP_SHARED file-backed allocations, then retry with + * host-page alignment when no 2 MiB-aligned gap is available. */ static uint64_t find_free_gap(guest_t *g, uint64_t length, uint64_t min_addr, - uint64_t max_addr) + uint64_t max_addr, + uint64_t align) { /* RX and RW mappings advance independently, so keep separate hints. */ uint64_t *hint = @@ -334,15 +344,20 @@ static uint64_t find_free_gap(guest_t *g, /* Advance the hint to the next host-page boundary so the following * sequential allocation lands on an address that the kernel accepts for - * mmap MAP_FIXED (Apple Silicon enforces 16 KiB host pages). The tradeoff - * is up to host_page-1 bytes of address-space waste per small allocation; - * physical pages are still demand-paged, so RAM cost is unchanged. + * mmap MAP_FIXED (Apple Silicon enforces 16 KiB host pages). 
Round to the + * host page even when the current call requested a larger align (e.g. + * BLOCK_2MIB for MAP_SHARED file-backed): a subsequent MAP_PRIVATE 4 KiB + * allocation should still be able to occupy the trailing space inside the + * 2 MiB block. find_free_gap_inner re-applies the caller's align on its + * next entry, so a subsequent MAP_SHARED allocation skips past the small + * tenant and lands on the next 2 MiB boundary anyway. */ size_t hps = host_page_size_cached(); /* Try cached hint first (only if within the valid range) */ if (*hint >= min_addr && *hint < max_addr) { - uint64_t result = find_free_gap_inner(g, length, *hint, max_addr); + uint64_t result = + find_free_gap_inner(g, length, *hint, max_addr, align); if (result != UINT64_MAX) { *hint = ALIGN_UP(result + length, hps); return result; @@ -350,7 +365,7 @@ static uint64_t find_free_gap(guest_t *g, } /* Full scan from base */ - uint64_t result = find_free_gap_inner(g, length, min_addr, max_addr); + uint64_t result = find_free_gap_inner(g, length, min_addr, max_addr, align); if (result != UINT64_MAX) *hint = ALIGN_UP(result + length, hps); return result; @@ -2184,6 +2199,35 @@ int64_t sys_mmap(guest_t *g, if (high_hint >= 0) return high_hint; } + /* Open the backing fd before the gap-finder so the alignment heuristic + * can read the host fd's access mode through overlay_fd_writable. + * Closes on every failure path within the non-fixed branch. + */ + if (!is_anon) { + if (host_fd_ref_open(fd, &backing_ref) < 0) + return -LINUX_EBADF; + host_backing_fd = backing_ref.fd; + } + /* Prefer stage-2 2 MiB block boundaries for non-fixed MAP_SHARED + * file-backed allocations. Without this each shared file mmap whose + * result lands mid-block forces hvf_apply_file_overlay_quiesced to + * split the containing HVF segment at both ends; back-to-back memfd + * allocations burn segments at roughly two per mmap and run the table + * to GUEST_MAX_HVF_SEGMENTS quickly. 
This is a placement preference, + * not a Linux-visible constraint: if no 2 MiB-aligned gap exists, the + * allocation retries with host-page alignment. The condition mirrors + * the overlay fast-path's gate (host-page-aligned offset, writable + * backer) so read-only MAP_SHARED mappings that fall through to the + * pread snapshot do not pay the alignment cost without the + * segment-table benefit. + */ + size_t hps = host_page_size_cached(); + uint64_t align = (uint64_t) hps; + if (!is_anon && fd >= 0 && (flags & LINUX_MAP_SHARED) && + ((uint64_t) offset % hps == 0) && + overlay_fd_writable(host_backing_fd)) + align = BLOCK_2MIB; + uint64_t fallback_align = (uint64_t) hps; if (needs_exec && !(prot & LINUX_PROT_WRITE)) { /* PROT_EXEC without PROT_WRITE: allocate from the RX mmap region. * Apple HVF enforces W^X on 2MiB block page table entries, so @@ -2191,7 +2235,11 @@ int64_t sys_mmap(guest_t *g, * ones. The RX region at MMAP_RX_BASE is pre-mapped with execute * permission. */ - result_off = find_free_gap(g, length, MMAP_RX_BASE, g->mmap_limit); + result_off = + find_free_gap(g, length, MMAP_RX_BASE, g->mmap_limit, align); + if (result_off == UINT64_MAX && align != fallback_align) + result_off = find_free_gap(g, length, MMAP_RX_BASE, + g->mmap_limit, fallback_align); if (result_off == UINT64_MAX) { log_debug( "mmap: RX address space exhausted " @@ -2199,6 +2247,7 @@ int64_t sys_mmap(guest_t *g, (unsigned long long) length, (unsigned long long) g->mmap_limit, g->ipa_bits, (unsigned long long) (g->guest_size >> 30)); + host_fd_ref_close(&backing_ref); return -LINUX_ENOMEM; } /* High-water mark for fork IPC state transfer */ @@ -2232,12 +2281,26 @@ int64_t sys_mmap(guest_t *g, */ uint64_t hint_max = (hint_off < MMAP_BASE) ? 
MMAP_BASE : g->mmap_limit;
-                result_off =
-                    find_free_gap_inner(g, length, hint_off, hint_max);
+                /* Exact hint first, but only when it fits under hint_max. */
+                if (align != fallback_align && hint_off <= hint_max &&
+                    length <= hint_max - hint_off)
+                    result_off =
+                        find_free_gap_inner(g, length, hint_off,
+                                            hint_off + length, fallback_align);
+                if (result_off == UINT64_MAX)
+                    result_off = find_free_gap_inner(g, length, hint_off,
+                                                     hint_max, align);
+                if (result_off == UINT64_MAX && align != fallback_align)
+                    result_off = find_free_gap_inner(
+                        g, length, hint_off, hint_max, fallback_align);
             }
         }
         if (result_off == UINT64_MAX)
-            result_off = find_free_gap(g, length, MMAP_BASE, g->mmap_limit);
+            result_off =
+                find_free_gap(g, length, MMAP_BASE, g->mmap_limit, align);
+        if (result_off == UINT64_MAX && align != fallback_align)
+            result_off = find_free_gap(g, length, MMAP_BASE, g->mmap_limit,
+                                       fallback_align);
         if (result_off == UINT64_MAX) {
             log_debug(
                 "mmap: RW address space exhausted "
@@ -2245,6 +2308,7 @@ int64_t sys_mmap(guest_t *g,
                 (unsigned long long) length,
                 (unsigned long long) g->mmap_limit, g->ipa_bits,
                 (unsigned long long) (g->guest_size >> 30));
+            host_fd_ref_close(&backing_ref);
             return -LINUX_ENOMEM;
         }
         /* High-water mark for fork IPC state transfer */
@@ -2252,11 +2316,6 @@ int64_t sys_mmap(guest_t *g,
         if (rw_hwm > g->mmap_next)
             g->mmap_next = rw_hwm;
     }
-    if (!is_anon) {
-        if (host_fd_ref_open(fd, &backing_ref) < 0)
-            return -LINUX_EBADF;
-        host_backing_fd = backing_ref.fd;
-    }
     if (!region_has_capacity_after_removes(g, NULL, 0, 1)) {
         host_fd_ref_close(&backing_ref);
         return -LINUX_ENOMEM;
@@ -2931,10 +2990,17 @@ int64_t sys_mremap(guest_t *g,
         int needs_exec = (prot & LINUX_PROT_EXEC) != 0;
         uint64_t new_off;
 
+        /* mremap moves the data via read_file_range_to_guest and does not
+         * reinstall a file overlay at the destination, so 2 MiB alignment
+         * would not narrow segment-table growth. Stay at host-page alignment.
+         */
+        size_t mremap_align = host_page_size_cached();
         if (needs_exec && !(prot & LINUX_PROT_WRITE))
-            new_off = find_free_gap(g, new_size, MMAP_RX_BASE, g->mmap_limit);
+            new_off = find_free_gap(g, new_size, MMAP_RX_BASE, g->mmap_limit,
+                                    mremap_align);
         else
-            new_off = find_free_gap(g, new_size, MMAP_BASE, g->mmap_limit);
+            new_off = find_free_gap(g, new_size, MMAP_BASE, g->mmap_limit,
+                                    mremap_align);
 
         if (new_off == UINT64_MAX) {
             if (track_backing_fd >= 0)
diff --git a/tests/test-mmap-hint.c b/tests/test-mmap-hint.c
index a5caab9..9542328 100644
--- a/tests/test-mmap-hint.c
+++ b/tests/test-mmap-hint.c
@@ -10,7 +10,9 @@
  */
 
 #include
+#include
 #include
+#include
 #include
 #include
 
@@ -69,9 +71,74 @@ static void test_low_hint_exact(void)
     munmap(p, len);
 }
 
+static void test_shared_file_hint_falls_back_from_2m_alignment(void)
+{
+    TEST("MAP_SHARED file hint falls back from 2MiB alignment");
+
+    const size_t page = (size_t) sysconf(_SC_PAGESIZE);
+    const uintptr_t block_2m = 2ULL * 1024ULL * 1024ULL;
+    char path[] = "/tmp/elfuse-mmap-hint-XXXXXX";
+    void *p = MAP_FAILED;
+    int fd = -1;
+
+    /* Occupy the first page of a 2 MiB block; the hint below sits 64 KiB
+     * under that boundary, so rounding the hint up to 2 MiB would land on
+     * this occupied page.
+     */
+    void *anchor =
+        mmap(NULL, page, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (anchor == MAP_FAILED) {
+        FAIL("anchor mmap failed");
+        return;
+    }
+
+    uintptr_t anchor_addr = (uintptr_t) anchor;
+    uintptr_t anchor_block = anchor_addr & ~(block_2m - 1);
+    if (anchor_addr != anchor_block) {
+        FAIL("anchor not 2MiB-aligned");
+        goto out;
+    }
+    if (anchor_block < 0x00400000ULL + 0x10000ULL) {
+        FAIL("anchor too low for regression hint");
+        goto out;
+    }
+    uintptr_t hint_addr = anchor_block - 0x10000ULL;
+
+    fd = mkstemp(path);
+    if (fd < 0) {
+        FAIL("mkstemp failed");
+        goto out;
+    }
+    unlink(path);
+
+    if (ftruncate(fd, (off_t) page) < 0) {
+        FAIL("ftruncate failed");
+        goto out;
+    }
+
+    void *hint = (void *) hint_addr;
+    p = mmap(hint, page, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+    if (p == MAP_FAILED) {
+        FAIL("shared file mmap failed");
+        goto out;
+    }
+
+    EXPECT_TRUE((uintptr_t) p == hint_addr,
+                "shared file mmap should honor host-page-aligned hint");
+
+out:
+    /* Single exit: release whatever was acquired, newest first. */
+    if (p != MAP_FAILED)
+        munmap(p, page);
+    if (fd >= 0)
+        close(fd);
+    munmap(anchor, page);
+}
+
 int main(void)
 {
     test_low_hint_exact();
+    test_shared_file_hint_falls_back_from_2m_alignment();
     SUMMARY("test-mmap-hint");
     return fails ? 1 : 0;
 }
diff --git a/tests/test-msync.c b/tests/test-msync.c
index 7249c97..da5986a 100644
--- a/tests/test-msync.c
+++ b/tests/test-msync.c
@@ -15,6 +15,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -542,6 +543,84 @@ static void test_shared_large_mapping_crosses_split_hvf_segments(void)
     close(large_fd);
 }
 
+/* Consecutive non-fixed MAP_SHARED file-backed mmap allocations must succeed
+ * even when a previous shared mmap split an HVF stage-2 segment. The overlay
+ * path tolerates multi-segment ranges and the gap finder keeps shared
+ * file-backed allocations aligned to 2 MiB so subsequent mmaps do not re-split
+ * mid-segment. Cover both: each chunk must mmap, accept guest writes, and
+ * stay backed by its own memfd.
+ */
+static void test_shared_back_to_back_memfd_mappings(void)
+{
+    TEST("back-to-back non-fixed MAP_SHARED memfd mappings stay file-backed");
+    /* asm-generic (arm64) syscall number, for libcs lacking the macro. */
+#ifndef SYS_memfd_create
+#define SYS_memfd_create 279
+#endif
+    enum { CHUNKS = 4 };
+    const size_t chunk_len = (size_t) 16 * 1024 * 1024;
+    int fds[CHUNKS];
+    void *maps[CHUNKS];
+    for (int i = 0; i < CHUNKS; i++) {
+        fds[i] = -1;
+        maps[i] = MAP_FAILED;
+    }
+
+    /* Create and map every chunk before touching any of them, so all
+     * mappings are live at the same time.
+     */
+    for (int i = 0; i < CHUNKS; i++) {
+        char name[32];
+        snprintf(name, sizeof(name), "elfuse-msync-bb-%d", i);
+        fds[i] = (int) syscall(SYS_memfd_create, name, 0u);
+        if (fds[i] < 0) {
+            FAIL("memfd_create failed");
+            goto out;
+        }
+        if (ftruncate(fds[i], (off_t) chunk_len) != 0) {
+            FAIL("ftruncate failed");
+            goto out;
+        }
+        maps[i] = mmap(NULL, chunk_len, PROT_READ | PROT_WRITE, MAP_SHARED,
+                       fds[i], 0);
+        if (maps[i] == MAP_FAILED) {
+            FAIL("consecutive shared mmap failed");
+            goto out;
+        }
+    }
+
+    /* Tag the first and last byte of each mapping with per-chunk values. */
+    for (int i = 0; i < CHUNKS; i++) {
+        unsigned char *p = (unsigned char *) maps[i];
+        p[0] = (unsigned char) (0xA0 + i);
+        p[chunk_len - 1] = (unsigned char) (0xB0 + i);
+    }
+    /* Read the tags back through each fd, bypassing the mapping. */
+    for (int i = 0; i < CHUNKS; i++) {
+        unsigned char first = 0, last = 0;
+        if (pread(fds[i], &first, 1, 0) != 1 ||
+            pread(fds[i], &last, 1, (off_t) (chunk_len - 1)) != 1) {
+            FAIL("pread failed");
+            goto out;
+        }
+        if (first != (unsigned char) (0xA0 + i) ||
+            last != (unsigned char) (0xB0 + i)) {
+            FAIL("shared write did not reach its own memfd");
+            goto out;
+        }
+    }
+    /* Every failure above jumps to out, so reaching here means success. */
+    PASS();
+
+out:
+    for (int i = 0; i < CHUNKS; i++) {
+        if (maps[i] != MAP_FAILED)
+            munmap(maps[i], chunk_len);
+        if (fds[i] >= 0)
+            close(fds[i]);
+    }
+}
+
 int main(void)
 {
     printf("test-msync: MAP_SHARED msync tests\n\n");
@@ -553,6 +632,7 @@ int main(void)
     test_shared_guest_write_lands_in_file();
     test_shared_adjacent_fixed_mapping_does_not_alias_file();
     test_shared_large_mapping_crosses_split_hvf_segments();
+
test_shared_back_to_back_memfd_mappings(); test_shm_name_visible_after_fork(); SUMMARY("test-msync");