From 36d3441dc03d1551d48aa424c55905b6c9e5c14d Mon Sep 17 00:00:00 2001
From: Shikhar
Date: Thu, 1 Jan 2026 08:09:00 +0530
Subject: [PATCH 1/4] add bench_uint16

Signed-off-by: Shikhar
---
 benchmarks/CMakeLists.txt   |   6 ++
 benchmarks/bench_uint16.cpp | 149 ++++++++++++++++++++++++++++++++++++
 2 files changed, 155 insertions(+)
 create mode 100644 benchmarks/bench_uint16.cpp

diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 4ee57895..81ea92a6 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -11,7 +11,9 @@ FetchContent_MakeAvailable(counters)
 add_executable(realbenchmark benchmark.cpp)
 target_link_libraries(realbenchmark PRIVATE counters::counters)
 add_executable(bench_ip bench_ip.cpp)
+add_executable(bench_uint16 bench_uint16.cpp)
 target_link_libraries(bench_ip PRIVATE counters::counters)
+target_link_libraries(bench_uint16 PRIVATE counters::counters)
 
 set_property(
   TARGET realbenchmark
@@ -19,8 +21,12 @@ set_property(
 set_property(
   TARGET bench_ip
   PROPERTY CXX_STANDARD 17)
+set_property(
+  TARGET bench_uint16
+  PROPERTY CXX_STANDARD 17)
 target_link_libraries(realbenchmark PUBLIC fast_float)
 target_link_libraries(bench_ip PUBLIC fast_float)
+target_link_libraries(bench_uint16 PUBLIC fast_float)
 
 include(ExternalProject)
 
diff --git a/benchmarks/bench_uint16.cpp b/benchmarks/bench_uint16.cpp
new file mode 100644
index 00000000..c4cef81b
--- /dev/null
+++ b/benchmarks/bench_uint16.cpp
@@ -0,0 +1,149 @@
+#include "counters/bench.h"
+#include "fast_float/fast_float.h"
+#include <charconv>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <random>
+#include <string>
+#include <system_error>
+#include <vector>
+
+// Prints one result line for the benchmark `name`: throughput and per-value
+// timings derived from the fastest run recorded in `agg`, plus cycle and
+// instruction figures when the event collector reports that it has events.
+// `volume` is the number of parsed values and `bytes` the input size in bytes.
+void pretty_print(size_t volume, size_t bytes, std::string name,
+                  counters::event_aggregate agg) {
+  if (agg.inner_count > 1) {
+    printf("# (inner count: %d)\n", agg.inner_count);
+  }
+  printf("%-40s : ", name.c_str());
+  printf(" %5.2f GB/s ", bytes / agg.fastest_elapsed_ns());
+  printf(" %5.1f Mip/s ", volume * 1000.0 / agg.fastest_elapsed_ns());
+  printf(" %5.2f ns/ip ", agg.fastest_elapsed_ns() / volume);
+  if (counters::event_collector().has_events()) {
+    printf(" %5.2f GHz ", agg.fastest_cycles() / agg.fastest_elapsed_ns());
+    printf(" %5.2f c/ip ", agg.fastest_cycles() / volume);
+    printf(" %5.2f i/ip ", agg.fastest_instructions() / volume);
+    printf(" %5.2f c/b ", agg.fastest_cycles() / bytes);
+    printf(" %5.2f i/b ", agg.fastest_instructions() / bytes);
+    printf(" %5.2f i/c ", agg.fastest_instructions() / agg.fastest_cycles());
+  }
+  printf("\n");
+}
+
+enum class parse_method { standard, fast_float };
+
+// Parses `buffer` as `delimiter`-separated uint16 values with
+// fast_float::from_chars and aborts unless every value matches `expected`,
+// every separator is `delimiter`, and no bytes remain after the last value.
+void validate(const std::string &buffer, const std::vector<uint16_t> &expected,
+              char delimiter) {
+  const char *p = buffer.data();
+  const char *pend = p + buffer.size();
+
+  for (size_t i = 0; i < expected.size(); i++) {
+    uint16_t val = 0;
+    auto r = fast_float::from_chars(p, pend, val);
+    if (r.ec != std::errc() || val != expected[i]) {
+      printf("Validation failed at index %zu: expected %u, got %u\n", i,
+             unsigned(expected[i]), unsigned(val));
+      std::abort();
+    }
+    p = r.ptr;
+    if (i + 1 < expected.size()) {
+      if (p >= pend || *p != delimiter) {
+        printf("Validation failed at index %zu: delimiter mismatch\n", i);
+        std::abort();
+      }
+      ++p;
+    }
+  }
+
+  if (p != pend) {
+    printf("Validation failed: trailing bytes remain\n");
+    std::abort();
+  }
+  printf("Validation passed!\n");
+}
+
+// Builds a comma-separated list of N pseudo-random uint16 values, validates
+// it, then benchmarks std::from_chars against fast_float::from_chars on it.
+int main() {
+  constexpr size_t N = 500000;
+  constexpr char delimiter = ',';
+  std::mt19937 rng(1234);
+  std::uniform_int_distribution<int> dist(0, 65535);
+
+  std::vector<uint16_t> expected;
+  expected.reserve(N);
+
+  std::string buffer;
+  buffer.reserve(N * 6); // up to 5 digits + delimiter
+
+  for (size_t i = 0; i < N; ++i) {
+    uint16_t val = static_cast<uint16_t>(dist(rng));
+    expected.push_back(val);
+    std::string s = std::to_string(val);
+    buffer.append(s);
+    if (i + 1 < N) {
+      buffer.push_back(delimiter);
+    }
+  }
+
+  size_t total_bytes = buffer.size();
+
+  validate(buffer, expected, delimiter);
+
+  // Sums go to a volatile so the parsing loops keep an observable effect.
+  volatile uint64_t sink = 0;
+
+  pretty_print(N, total_bytes, "parse_uint16_std_fromchars",
+               counters::bench([&]() {
+                 uint64_t sum = 0;
+                 const char *p = buffer.data();
+                 const char *pend = p + buffer.size();
+                 for (size_t i = 0; i < N; ++i) {
+                   uint16_t value = 0;
+                   auto r = std::from_chars(p, pend, value);
+                   if (r.ec != std::errc())
+                     std::abort();
+                   sum += value;
+                   p = r.ptr;
+                   if (i + 1 < N) {
+                     if (p >= pend || *p != delimiter)
+                       std::abort();
+                     ++p;
+                   }
+                 }
+                 if (p != pend)
+                   std::abort();
+                 sink += sum;
+               }));
+
+  pretty_print(N, total_bytes, "parse_uint16_fastfloat", counters::bench([&]() {
+                 uint64_t sum = 0;
+                 const char *p = buffer.data();
+                 const char *pend = p + buffer.size();
+                 for (size_t i = 0; i < N; ++i) {
+                   uint16_t value = 0;
+                   auto r = fast_float::from_chars(p, pend, value);
+                   if (r.ec != std::errc())
+                     std::abort();
+                   sum += value;
+                   p = r.ptr;
+                   if (i + 1 < N) {
+                     if (p >= pend || *p != delimiter)
+                       std::abort();
+                     ++p;
+                   }
+                 }
+                 if (p != pend)
+                   std::abort();
+                 sink += sum;
+               }));
+
+  return EXIT_SUCCESS;
+}

From d0af1cfdbd7dfa781f2e40c89bd6b85d886a538a Mon Sep 17 00:00:00 2001
From: Shikhar
Date: Thu, 1 Jan 2026 08:29:14 +0530
Subject: [PATCH 2/4] optimize uint16 parsing

Signed-off-by: Shikhar
---
 include/fast_float/ascii_number.h | 116 ++++++++++++++++++++++++++++++
 1 file changed, 116 insertions(+)

diff --git a/include/fast_float/ascii_number.h b/include/fast_float/ascii_number.h
index 5609ba1a..b5826f6d 100644
--- a/include/fast_float/ascii_number.h
+++ b/include/fast_float/ascii_number.h
@@ -68,6 +68,26 @@ read8_to_u64(UC const *chars) {
   return val;
 }
 
+// Read 4 UC into a u32. Truncates UC if not char.
+template <typename UC>
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint32_t
+read4_to_u32(UC const *chars) {
+  if (cpp20_and_in_constexpr() || !std::is_same<UC, char>::value) {
+    uint32_t val = 0;
+    for (int i = 0; i < 4; ++i) {
+      val |= uint32_t(uint8_t(*chars)) << (i * 8);
+      ++chars;
+    }
+    return val;
+  }
+  uint32_t val;
+  ::memcpy(&val, chars, sizeof(uint32_t));
+#if FASTFLOAT_IS_BIG_ENDIAN == 1
+  val = byteswap(val);
+#endif
+  return val;
+}
+
 #ifdef FASTFLOAT_SSE2
 
 fastfloat_really_inline uint64_t simd_read8_to_u64(__m128i const data) {
@@ -149,6 +169,13 @@ is_made_of_eight_digits_fast(uint64_t val) noexcept {
             0x8080808080808080));
 }
 
+fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint32_t
+parse_four_digits_unrolled(uint32_t val) noexcept {
+  val -= 0x30303030;
+  val = (val * 10) + (val >> 8);
+  return (((val & 0x00FF00FF) * 0x00640001) >> 16) & 0xFFFF;
+}
+
 #ifdef FASTFLOAT_HAS_SIMD
 
 // Call this if chars might not be 8 digits.
@@ -606,6 +633,95 @@ parse_int_string(UC const *p, UC const *pend, T &value,
     }
   }
 
+  FASTFLOAT_IF_CONSTEXPR17((std::is_same<T, std::uint16_t>::value)) {
+    if (base == 10) {
+      const size_t len = size_t(pend - p);
+      if (len == 0) {
+        if (has_leading_zeros) {
+          value = 0;
+          answer.ec = std::errc();
+          answer.ptr = p;
+        } else {
+          answer.ec = std::errc::invalid_argument;
+          answer.ptr = first;
+        }
+        return answer;
+      }
+
+      uint32_t digits;
+      if (len >= 4) {
+        digits = read4_to_u32(p);
+      } else {
+        uint32_t b0 = uint32_t(uint8_t(p[0]));
+        uint32_t b1 = (len > 1) ? uint32_t(uint8_t(p[1])) : 0xFFu;
+        uint32_t b2 = (len > 2) ? uint32_t(uint8_t(p[2])) : 0xFFu;
+        digits = b0 | (b1 << 8) | (b2 << 16) | (0xFFu << 24);
+      }
+
+      uint32_t magic =
+          ((digits + 0x46464646u) | (digits - 0x30303030u)) & 0x80808080u;
+      uint32_t nd = (magic == 0) ? 4u : (uint32_t(countr_zero_32(magic)) >> 3);
+
+      if (nd == 0) {
+        if (has_leading_zeros) {
+          value = 0;
+          answer.ec = std::errc();
+          answer.ptr = p;
+          return answer;
+        }
+        answer.ec = std::errc::invalid_argument;
+        answer.ptr = first;
+        return answer;
+      }
+
+      if (nd < 4) {
+        // mask out non-digit bytes and replace with '0' (0x30)
+        uint32_t mask = 0xFFFFFFFFu >> ((4u - nd) * 8u);
+        uint32_t padded = (digits & mask) | (~mask & 0x30303030u);
+        uint32_t v = parse_four_digits_unrolled(padded);
+        static constexpr uint32_t divs[] = {0, 1000, 100, 10};
+        value = (uint16_t)(v / divs[nd]);
+        answer.ec = std::errc();
+        answer.ptr = p + nd;
+        return answer;
+      }
+
+      uint32_t v = parse_four_digits_unrolled(digits);
+
+      uint32_t d4 = (len > 4) ? uint32_t(p[4] - '0') : 10u;
+      if (d4 > 9u) {
+        value = (uint16_t)v;
+        answer.ec = std::errc();
+        answer.ptr = p + 4;
+        return answer;
+      }
+
+      if (len > 5) {
+        uint32_t d5 = uint32_t(p[5]) - uint32_t('0');
+        if (d5 <= 9u) {
+          const UC *q = p + 6;
+          while (q < pend && uint32_t(*q) - uint32_t('0') <= 9u)
+            ++q;
+          answer.ec = std::errc::result_out_of_range;
+          answer.ptr = q;
+          return answer;
+        }
+      }
+
+      // overflow check
+      if (v > 6553u || (v == 6553u && d4 > 5u)) {
+        answer.ec = std::errc::result_out_of_range;
+        answer.ptr = p + 5;
+        return answer;
+      }
+
+      value = (uint16_t)(v * 10u + d4);
+      answer.ec = std::errc();
+      answer.ptr = p + 5;
+      return answer;
+    }
+  }
+
   uint64_t i = 0;
   if (base == 10) {
     loop_parse_if_eight_digits(p, pend, i); // use SIMD if possible

From 13d4b9418364ab8c68b9f235065c700f88b3719a Mon Sep 17 00:00:00 2001
From: Shikhar
Date: Thu, 1 Jan 2026 17:42:30 +0530
Subject: [PATCH 3/4] small fix

---
 include/fast_float/ascii_number.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/fast_float/ascii_number.h b/include/fast_float/ascii_number.h
index b5826f6d..7422e74f 100644
--- a/include/fast_float/ascii_number.h
+++ b/include/fast_float/ascii_number.h
@@ -679,7 +679,7 @@ parse_int_string(UC const *p, UC const *pend, T &value,
         uint32_t mask = 0xFFFFFFFFu >> ((4u - nd) * 8u);
         uint32_t padded = (digits & mask) | (~mask & 0x30303030u);
         uint32_t v = parse_four_digits_unrolled(padded);
-        static constexpr uint32_t divs[] = {0, 1000, 100, 10};
+        constexpr uint32_t divs[] = {0, 1000, 100, 10};
         value = (uint16_t)(v / divs[nd]);
         answer.ec = std::errc();
         answer.ptr = p + nd;

From b14e6a466aba8813a9e8da2defb0671fa2db3410 Mon Sep 17 00:00:00 2001
From: Shikhar
Date: Fri, 2 Jan 2026 02:45:11 +0530
Subject: [PATCH 4/4] simpler optimizations

read4_to_u32 now stores code units that do not fit in one byte as 0xFF
instead of truncating them, so wide-character input cannot be mistaken
for ASCII digits by the uint16 fast path.

Signed-off-by: Shikhar
---
 include/fast_float/ascii_number.h | 115 ++++++++++++++++--------------
 1 file changed, 45 insertions(+), 70 deletions(-)

diff --git a/include/fast_float/ascii_number.h b/include/fast_float/ascii_number.h
index 7422e74f..85435373 100644
--- a/include/fast_float/ascii_number.h
+++ b/include/fast_float/ascii_number.h
@@ -32,7 +32,7 @@ template <typename UC> fastfloat_really_inline constexpr bool has_simd_opt() {
 // able to optimize it well.
 template <typename UC>
 fastfloat_really_inline constexpr bool is_integer(UC c) noexcept {
-  return !(c > UC('9') || c < UC('0'));
+  return (unsigned)(c - UC('0')) <= 9u;
 }
 
 fastfloat_really_inline constexpr uint64_t byteswap(uint64_t val) {
@@ -68,26 +68,30 @@ read8_to_u64(UC const *chars) {
   return val;
 }
 
-// Read 4 UC into a u32. Truncates UC if not char.
+// Read 4 UC into a u32, first code unit in the lowest byte. A code unit that
+// does not fit in one byte is stored as 0xFF (never an ASCII digit) instead
+// of being truncated, so wide characters cannot alias '0'..'9'.
 template <typename UC>
 fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint32_t
 read4_to_u32(UC const *chars) {
   if (cpp20_and_in_constexpr() || !std::is_same<UC, char>::value) {
     uint32_t val = 0;
     for (int i = 0; i < 4; ++i) {
-      val |= uint32_t(uint8_t(*chars)) << (i * 8);
+      uint32_t const c =
+          uint32_t(typename std::make_unsigned<UC>::type(*chars));
+      val |= (c > 0xFFu ? 0xFFu : c) << (i * 8);
       ++chars;
     }
     return val;
   }
   uint32_t val;
   ::memcpy(&val, chars, sizeof(uint32_t));
 #if FASTFLOAT_IS_BIG_ENDIAN == 1
-  val = byteswap(val);
+  val = byteswap_32(val);
 #endif
   return val;
 }
 
 #ifdef FASTFLOAT_SSE2
 
 fastfloat_really_inline uint64_t simd_read8_to_u64(__m128i const data) {
@@ -169,6 +173,13 @@ is_made_of_eight_digits_fast(uint64_t val) noexcept {
             0x8080808080808080));
 }
 
+// Checks whether the four bytes packed in val are all ASCII digits;
+// 32-bit counterpart of is_made_of_eight_digits_fast.
+fastfloat_really_inline constexpr bool
+is_made_of_four_digits_fast(uint32_t val) noexcept {
+  return !((((val + 0x46464646) | (val - 0x30303030)) & 0x80808080));
+}
+
 fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint32_t
 parse_four_digits_unrolled(uint32_t val) noexcept {
   val -= 0x30303030;
@@ -648,77 +659,41 @@ parse_int_string(UC const *p, UC const *pend, T &value,
         return answer;
       }
 
-      uint32_t digits;
       if (len >= 4) {
-        digits = read4_to_u32(p);
-      } else {
-        uint32_t b0 = uint32_t(uint8_t(p[0]));
-        uint32_t b1 = (len > 1) ? uint32_t(uint8_t(p[1])) : 0xFFu;
-        uint32_t b2 = (len > 2) ? uint32_t(uint8_t(p[2])) : 0xFFu;
-        digits = b0 | (b1 << 8) | (b2 << 16) | (0xFFu << 24);
-      }
-
-      uint32_t magic =
-          ((digits + 0x46464646u) | (digits - 0x30303030u)) & 0x80808080u;
-      uint32_t nd = (magic == 0) ? 4u : (uint32_t(countr_zero_32(magic)) >> 3);
-
-      if (nd == 0) {
-        if (has_leading_zeros) {
-          value = 0;
+        // Fast path: load four code units at once and test them together.
+        uint32_t digits = read4_to_u32(p);
+        if (is_made_of_four_digits_fast(digits)) {
+          uint32_t v = parse_four_digits_unrolled(digits);
+          if (len >= 5 && is_integer(p[4])) {
+            v = v * 10 + uint32_t(p[4] - '0');
+            if (len >= 6 && is_integer(p[5])) {
+              // Six or more digits overflow uint16_t (assuming leading zeros
+              // were already skipped); leave ptr past the whole digit run.
+              answer.ec = std::errc::result_out_of_range;
+              const UC *q = p + 5;
+              while (q != pend && is_integer(*q)) {
+                q++;
+              }
+              answer.ptr = q;
+              return answer;
+            }
+            if (v > 65535) {
+              answer.ec = std::errc::result_out_of_range;
+              answer.ptr = p + 5;
+              return answer;
+            }
+            value = uint16_t(v);
+            answer.ec = std::errc();
+            answer.ptr = p + 5;
+            return answer;
+          }
+          // 4 digits
+          value = uint16_t(v);
           answer.ec = std::errc();
-          answer.ptr = p;
-          return answer;
-        }
-        answer.ec = std::errc::invalid_argument;
-        answer.ptr = first;
-        return answer;
-      }
-
-      if (nd < 4) {
-        // mask out non-digit bytes and replace with '0' (0x30)
-        uint32_t mask = 0xFFFFFFFFu >> ((4u - nd) * 8u);
-        uint32_t padded = (digits & mask) | (~mask & 0x30303030u);
-        uint32_t v = parse_four_digits_unrolled(padded);
-        constexpr uint32_t divs[] = {0, 1000, 100, 10};
-        value = (uint16_t)(v / divs[nd]);
-        answer.ec = std::errc();
-        answer.ptr = p + nd;
-        return answer;
-      }
-
-      uint32_t v = parse_four_digits_unrolled(digits);
-
-      uint32_t d4 = (len > 4) ? uint32_t(p[4] - '0') : 10u;
-      if (d4 > 9u) {
-        value = (uint16_t)v;
-        answer.ec = std::errc();
-        answer.ptr = p + 4;
-        return answer;
-      }
-
-      if (len > 5) {
-        uint32_t d5 = uint32_t(p[5]) - uint32_t('0');
-        if (d5 <= 9u) {
-          const UC *q = p + 6;
-          while (q < pend && uint32_t(*q) - uint32_t('0') <= 9u)
-            ++q;
-          answer.ec = std::errc::result_out_of_range;
-          answer.ptr = q;
+          answer.ptr = p + 4;
           return answer;
         }
       }
-
-      // overflow check
-      if (v > 6553u || (v == 6553u && d4 > 5u)) {
-        answer.ec = std::errc::result_out_of_range;
-        answer.ptr = p + 5;
-        return answer;
-      }
-
-      value = (uint16_t)(v * 10u + d4);
-      answer.ec = std::errc();
-      answer.ptr = p + 5;
-      return answer;
     }
   }
 