From 36d3441dc03d1551d48aa424c55905b6c9e5c14d Mon Sep 17 00:00:00 2001
From: Shikhar <shikharish05@gmail.com>
Date: Thu, 1 Jan 2026 08:09:00 +0530
Subject: [PATCH 1/4] add bench_uint16

Signed-off-by: Shikhar <shikharish05@gmail.com>
---
 benchmarks/CMakeLists.txt   |   6 ++
 benchmarks/bench_uint16.cpp | 139 ++++++++++++++++++++++++++++++++++++
 2 files changed, 145 insertions(+)
 create mode 100644 benchmarks/bench_uint16.cpp
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 4ee57895..81ea92a6 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -11,7 +11,9 @@ FetchContent_MakeAvailable(counters)
 add_executable(realbenchmark benchmark.cpp)
 target_link_libraries(realbenchmark PRIVATE counters::counters)
 add_executable(bench_ip bench_ip.cpp)
+add_executable(bench_uint16 bench_uint16.cpp)
 target_link_libraries(bench_ip PRIVATE counters::counters)
+target_link_libraries(bench_uint16 PRIVATE counters::counters)
 
 set_property(
     TARGET realbenchmark
@@ -19,8 +21,12 @@ set_property(
 set_property(
     TARGET bench_ip
     PROPERTY CXX_STANDARD 17)
+set_property(
+    TARGET bench_uint16
+    PROPERTY CXX_STANDARD 17)
 target_link_libraries(realbenchmark PUBLIC fast_float)
 target_link_libraries(bench_ip PUBLIC fast_float)
+target_link_libraries(bench_uint16 PUBLIC fast_float)
 
 include(ExternalProject)
 
diff --git a/benchmarks/bench_uint16.cpp b/benchmarks/bench_uint16.cpp
new file mode 100644
index 00000000..c4cef81b
--- /dev/null
+++ b/benchmarks/bench_uint16.cpp
@@ -0,0 +1,139 @@
+#include "counters/bench.h"
+#include "fast_float/fast_float.h"
+#include <charconv>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <random>
+#include <atomic>
+#include <string>
+#include <vector>
+
+void pretty_print(size_t volume, size_t bytes, std::string name,
+                  counters::event_aggregate agg) { // prints one result row
+  if (agg.inner_count > 1) {
+    printf("# (inner count: %d)\n", agg.inner_count);
+  }
+  printf("%-40s : ", name.c_str()); // label, left-aligned in 40 columns
+  printf(" %5.2f GB/s ", bytes / agg.fastest_elapsed_ns()); // bytes per ns
+  printf(" %5.1f Mip/s ", volume * 1000.0 / agg.fastest_elapsed_ns());
+  printf(" %5.2f ns/ip ", agg.fastest_elapsed_ns() / volume);
+  if (counters::event_collector().has_events()) { // cycle/instruction columns
+    printf(" %5.2f GHz ", agg.fastest_cycles() / agg.fastest_elapsed_ns());
+    printf(" %5.2f c/ip ", agg.fastest_cycles() / volume);
+    printf(" %5.2f i/ip ", agg.fastest_instructions() / volume);
+    printf(" %5.2f c/b ", agg.fastest_cycles() / bytes);
+    printf(" %5.2f i/b ", agg.fastest_instructions() / bytes);
+    printf(" %5.2f i/c ", agg.fastest_instructions() / agg.fastest_cycles());
+  }
+  printf("\n"); // NOTE(review): "ip" = one of `volume` items - confirm naming
+}
+
+enum class parse_method { standard, fast_float }; // unused in this file
+
+void validate(const std::string &buffer, const std::vector<uint16_t> &expected,
+              char delimiter) { // aborts unless buffer re-parses to expected
+  const char *p = buffer.data();
+  const char *pend = p + buffer.size();
+
+  for (size_t i = 0; i < expected.size(); i++) {
+    uint16_t val = 0; // defined value for the message below if parsing fails
+    auto r = fast_float::from_chars(p, pend, val);
+    if (r.ec != std::errc() || val != expected[i]) {
+      printf("Validation failed at index %zu: expected %u, got %u\n", i,
+             unsigned(expected[i]), unsigned(val)); // %u requires unsigned
+      std::abort();
+    }
+    p = r.ptr;
+    if (i + 1 < expected.size()) { // a delimiter follows all but the last value
+      if (p >= pend || *p != delimiter) {
+        printf("Validation failed at index %zu: delimiter mismatch\n", i);
+        std::abort();
+      }
+      ++p;
+    }
+  }
+
+  if (p != pend) {
+    printf("Validation failed: trailing bytes remain\n");
+    std::abort();
+  }
+  printf("Validation passed!\n");
+}
+
+int main() { // times std vs fast_float parsing of N random uint16 values
+  constexpr size_t N = 500000; // number of values to generate and parse
+  constexpr char delimiter = ',';
+  std::mt19937 rng(1234); // fixed seed, so the input is reproducible
+  std::uniform_int_distribution<int> dist(0, 65535); // full uint16_t range
+
+  std::vector<uint16_t> expected;
+  expected.reserve(N);
+
+  std::string buffer;
+  buffer.reserve(N * 6); // up to 5 digits + delimiter
+
+  for (size_t i = 0; i < N; ++i) { // build "v0,v1,..." plus expected values
+    uint16_t val = (uint16_t)dist(rng);
+    expected.push_back(val);
+    std::string s = std::to_string(val);
+    buffer.append(s);
+    if (i + 1 < N) {
+      buffer.push_back(delimiter);
+    }
+  }
+
+  size_t total_bytes = buffer.size(); // digits plus delimiters
+
+  validate(buffer, expected, delimiter); // check fast_float before timing
+
+  volatile uint64_t sink = 0; // volatile: accumulating into it is a side effect
+
+  pretty_print(N, total_bytes, "parse_uint16_std_fromchars", // baseline
+               counters::bench([&]() {
+                 uint64_t sum = 0; // checksum of parsed values, fed to sink
+                 const char *p = buffer.data();
+                 const char *pend = p + buffer.size();
+                 for (size_t i = 0; i < N; ++i) {
+                   uint16_t value = 0;
+                   auto r = std::from_chars(p, pend, value);
+                   if (r.ec != std::errc())
+                     std::abort();
+                   sum += value;
+                   p = r.ptr;
+                   if (i + 1 < N) { // skip delimiter between values
+                     if (p >= pend || *p != delimiter)
+                       std::abort();
+                     ++p;
+                   }
+                 }
+                 if (p != pend)
+                   std::abort();
+                 sink += sum;
+               }));
+
+  pretty_print(N, total_bytes, "parse_uint16_fastfloat", counters::bench([&]() {
+                 uint64_t sum = 0; // same loop, fast_float parser
+                 const char *p = buffer.data();
+                 const char *pend = p + buffer.size();
+                 for (size_t i = 0; i < N; ++i) {
+                   uint16_t value = 0;
+                   auto r = fast_float::from_chars(p, pend, value);
+                   if (r.ec != std::errc())
+                     std::abort();
+                   sum += value;
+                   p = r.ptr;
+                   if (i + 1 < N) {
+                     if (p >= pend || *p != delimiter)
+                       std::abort();
+                     ++p;
+                   }
+                 }
+                 if (p != pend)
+                   std::abort();
+                 sink += sum;
+               }));
+
+  return EXIT_SUCCESS;
+}

From d0af1cfdbd7dfa781f2e40c89bd6b85d886a538a Mon Sep 17 00:00:00 2001
From: Shikhar <shikharish05@gmail.com>
Date: Thu, 1 Jan 2026 08:29:14 +0530
Subject: [PATCH 2/4] optimize uint16 parsing

Signed-off-by: Shikhar <shikharish05@gmail.com>
---
 include/fast_float/ascii_number.h | 116 ++++++++++++++++++++++++++++++
 1 file changed, 116 insertions(+)

diff --git a/include/fast_float/ascii_number.h b/include/fast_float/ascii_number.h
index 5609ba1a..b5826f6d 100644
--- a/include/fast_float/ascii_number.h
+++ b/include/fast_float/ascii_number.h
@@ -68,6 +68,26 @@ read8_to_u64(UC const *chars) {
   return val;
 }
 
+// Read 4 UC into a u32; a code unit above 0xFF becomes 0xFF, never a digit.
+template <typename UC>
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint32_t
+read4_to_u32(UC const *chars) {
+  if (cpp20_and_in_constexpr() || !std::is_same<UC, char>::value) {
+    uint32_t val = 0;
+    for (int i = 0; i < 4; ++i) { // chars[0] goes into the lowest byte
+      uint32_t const c = uint32_t(chars[i]) & (sizeof(UC) == 1 ? 0xFFu : ~0u);
+      val |= (c > 0xFFu ? 0xFFu : c) << (i * 8); // U+0131 must not read as '1'
+    }
+    return val;
+  }
+  uint32_t val;
+  ::memcpy(&val, chars, sizeof(uint32_t));
+#if FASTFLOAT_IS_BIG_ENDIAN == 1
+  val = byteswap(val);
+#endif
+  return val;
+}
+
 #ifdef FASTFLOAT_SSE2
 
 fastfloat_really_inline uint64_t simd_read8_to_u64(__m128i const data) {
@@ -149,6 +169,13 @@ is_made_of_eight_digits_fast(uint64_t val) noexcept {
             0x8080808080808080));
 }
 
+fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint32_t
+parse_four_digits_unrolled(uint32_t val) noexcept {
+  val -= 0x30303030;
+  val = (val * 10) + (val >> 8); // b0=10*d0+d1, b2=10*d2+d3 (d0: low byte)
+  return (((val & 0x00FF00FF) * 0x00640001) >> 16) & 0xFFFF; // 100*b0 + b2
+}
+
 #ifdef FASTFLOAT_HAS_SIMD
 
 // Call this if chars might not be 8 digits.
@@ -606,6 +633,95 @@ parse_int_string(UC const *p, UC const *pend, T &value,
     }
   }
 
+  FASTFLOAT_IF_CONSTEXPR17((std::is_same<T, std::uint16_t>::value)) {
+    if (base == 10) {
+      const size_t len = size_t(pend - p);
+      if (len == 0) {
+        if (has_leading_zeros) {
+          value = 0;
+          answer.ec = std::errc();
+          answer.ptr = p;
+        } else {
+          answer.ec = std::errc::invalid_argument;
+          answer.ptr = first;
+        }
+        return answer;
+      }
+
+      uint32_t digits;
+      if (len >= 4) {
+        digits = read4_to_u32(p);
+      } else {
+        uint32_t b0 = uint32_t(uint8_t(p[0]));
+        uint32_t b1 = (len > 1) ? uint32_t(uint8_t(p[1])) : 0xFFu;
+        uint32_t b2 = (len > 2) ? uint32_t(uint8_t(p[2])) : 0xFFu;
+        digits = b0 | (b1 << 8) | (b2 << 16) | (0xFFu << 24);
+      }
+
+      uint32_t magic =
+          ((digits + 0x46464646u) | (digits - 0x30303030u)) & 0x80808080u;
+      uint32_t nd = (magic == 0) ? 4u : (uint32_t(countr_zero_32(magic)) >> 3);
+
+      if (nd == 0) {
+        if (has_leading_zeros) {
+          value = 0;
+          answer.ec = std::errc();
+          answer.ptr = p;
+          return answer;
+        }
+        answer.ec = std::errc::invalid_argument;
+        answer.ptr = first;
+        return answer;
+      }
+
+      if (nd < 4) {
+        // mask out non-digit bytes and replace with '0' (0x30)
+        uint32_t mask = 0xFFFFFFFFu >> ((4u - nd) * 8u);
+        uint32_t padded = (digits & mask) | (~mask & 0x30303030u);
+        uint32_t v = parse_four_digits_unrolled(padded);
+        static constexpr uint32_t divs[] = {0, 1000, 100, 10};
+        value = (uint16_t)(v / divs[nd]);
+        answer.ec = std::errc();
+        answer.ptr = p + nd;
+        return answer;
+      }
+
+      uint32_t v = parse_four_digits_unrolled(digits);
+
+      uint32_t d4 = (len > 4) ? uint32_t(p[4] - '0') : 10u;
+      if (d4 > 9u) {
+        value = (uint16_t)v;
+        answer.ec = std::errc();
+        answer.ptr = p + 4;
+        return answer;
+      }
+
+      if (len > 5) {
+        uint32_t d5 = uint32_t(p[5]) - uint32_t('0');
+        if (d5 <= 9u) {
+          const UC *q = p + 6;
+          while (q < pend && uint32_t(*q) - uint32_t('0') <= 9u)
+            ++q;
+          answer.ec = std::errc::result_out_of_range;
+          answer.ptr = q;
+          return answer;
+        }
+      }
+
+      // overflow check
+      if (v > 6553u || (v == 6553u && d4 > 5u)) {
+        answer.ec = std::errc::result_out_of_range;
+        answer.ptr = p + 5;
+        return answer;
+      }
+
+      value = (uint16_t)(v * 10u + d4);
+      answer.ec = std::errc();
+      answer.ptr = p + 5;
+      return answer;
+    }
+  }
+
   uint64_t i = 0;
   if (base == 10) {
     loop_parse_if_eight_digits(p, pend, i); // use SIMD if possible

From 13d4b9418364ab8c68b9f235065c700f88b3719a Mon Sep 17 00:00:00 2001
From: Shikhar <shikharish05@gmail.com>
Date: Thu, 1 Jan 2026 17:42:30 +0530
Subject: [PATCH 3/4] make divs lookup table non-static

---
 include/fast_float/ascii_number.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/fast_float/ascii_number.h b/include/fast_float/ascii_number.h
index b5826f6d..7422e74f 100644
--- a/include/fast_float/ascii_number.h
+++ b/include/fast_float/ascii_number.h
@@ -679,7 +679,7 @@ parse_int_string(UC const *p, UC const *pend, T &value,
         uint32_t mask = 0xFFFFFFFFu >> ((4u - nd) * 8u);
         uint32_t padded = (digits & mask) | (~mask & 0x30303030u);
         uint32_t v = parse_four_digits_unrolled(padded);
-        static constexpr uint32_t divs[] = {0, 1000, 100, 10};
+        constexpr uint32_t divs[] = {0, 1000, 100, 10};
         value = (uint16_t)(v / divs[nd]);
         answer.ec = std::errc();
         answer.ptr = p + nd;

From b14e6a466aba8813a9e8da2defb0671fa2db3410 Mon Sep 17 00:00:00 2001
From: Shikhar <shikharish05@gmail.com>
Date: Fri, 2 Jan 2026 02:45:11 +0530
Subject: [PATCH 4/4] simplify uint16 fast path; use byteswap_32 on big-endian

Signed-off-by: Shikhar <shikharish05@gmail.com>
---
 include/fast_float/ascii_number.h | 103 ++++++++++--------------------
 1 file changed, 34 insertions(+), 69 deletions(-)

diff --git a/include/fast_float/ascii_number.h b/include/fast_float/ascii_number.h
index 7422e74f..85435373 100644
--- a/include/fast_float/ascii_number.h
+++ b/include/fast_float/ascii_number.h
@@ -32,7 +32,7 @@ template <typename UC> fastfloat_really_inline constexpr bool has_simd_opt() {
 // able to optimize it well.
 template <typename UC>
 fastfloat_really_inline constexpr bool is_integer(UC c) noexcept {
-  return !(c > UC('9') || c < UC('0'));
+  return unsigned(c) - unsigned(UC('0')) <= 9u; // wraps; no signed overflow
 }
 
 fastfloat_really_inline constexpr uint64_t byteswap(uint64_t val) {
@@ -83,11 +83,10 @@ read4_to_u32(UC const *chars) {
   uint32_t val;
   ::memcpy(&val, chars, sizeof(uint32_t));
 #if FASTFLOAT_IS_BIG_ENDIAN == 1
-  val = byteswap(val);
+  val = byteswap_32(val);
 #endif
   return val;
 }
-
 #ifdef FASTFLOAT_SSE2
 
 fastfloat_really_inline uint64_t simd_read8_to_u64(__m128i const data) {
@@ -169,6 +168,11 @@ is_made_of_eight_digits_fast(uint64_t val) noexcept {
             0x8080808080808080));
 }
 
+fastfloat_really_inline constexpr bool
+is_made_of_four_digits_fast(uint32_t val) noexcept { // true iff 4 ASCII digits
+  return (((val + 0x46464646) | (val - 0x30303030)) & 0x80808080) == 0;
+}
+
 fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint32_t
 parse_four_digits_unrolled(uint32_t val) noexcept {
   val -= 0x30303030;
@@ -648,77 +652,38 @@ parse_int_string(UC const *p, UC const *pend, T &value,
         return answer;
       }
 
-      uint32_t digits;
       if (len >= 4) {
-        digits = read4_to_u32(p);
-      } else {
-        uint32_t b0 = uint32_t(uint8_t(p[0]));
-        uint32_t b1 = (len > 1) ? uint32_t(uint8_t(p[1])) : 0xFFu;
-        uint32_t b2 = (len > 2) ? uint32_t(uint8_t(p[2])) : 0xFFu;
-        digits = b0 | (b1 << 8) | (b2 << 16) | (0xFFu << 24);
-      }
-
-      uint32_t magic =
-          ((digits + 0x46464646u) | (digits - 0x30303030u)) & 0x80808080u;
-      uint32_t nd = (magic == 0) ? 4u : (uint32_t(countr_zero_32(magic)) >> 3);
-
-      if (nd == 0) {
-        if (has_leading_zeros) {
-          value = 0;
+        uint32_t digits = read4_to_u32(p);
+        if (is_made_of_four_digits_fast(digits)) {
+          uint32_t v = parse_four_digits_unrolled(digits);
+          if (len >= 5 && is_integer(p[4])) {
+            v = v * 10 + uint32_t(p[4] - '0');
+            if (len >= 6 && is_integer(p[5])) {
+              answer.ec = std::errc::result_out_of_range;
+              const UC *q = p + 5;
+              while (q != pend && is_integer(*q)) {
+                q++;
+              }
+              answer.ptr = q;
+              return answer;
+            }
+            if (v > 65535) {
+              answer.ec = std::errc::result_out_of_range;
+              answer.ptr = p + 5;
+              return answer;
+            }
+            value = uint16_t(v);
+            answer.ec = std::errc();
+            answer.ptr = p + 5;
+            return answer;
+          }
+          // 4 digits
+          value = uint16_t(v);
           answer.ec = std::errc();
-          answer.ptr = p;
-          return answer;
-        }
-        answer.ec = std::errc::invalid_argument;
-        answer.ptr = first;
-        return answer;
-      }
-
-      if (nd < 4) {
-        // mask out non-digit bytes and replace with '0' (0x30)
-        uint32_t mask = 0xFFFFFFFFu >> ((4u - nd) * 8u);
-        uint32_t padded = (digits & mask) | (~mask & 0x30303030u);
-        uint32_t v = parse_four_digits_unrolled(padded);
-        constexpr uint32_t divs[] = {0, 1000, 100, 10};
-        value = (uint16_t)(v / divs[nd]);
-        answer.ec = std::errc();
-        answer.ptr = p + nd;
-        return answer;
-      }
-
-      uint32_t v = parse_four_digits_unrolled(digits);
-
-      uint32_t d4 = (len > 4) ? uint32_t(p[4] - '0') : 10u;
-      if (d4 > 9u) {
-        value = (uint16_t)v;
-        answer.ec = std::errc();
-        answer.ptr = p + 4;
-        return answer;
-      }
-
-      if (len > 5) {
-        uint32_t d5 = uint32_t(p[5]) - uint32_t('0');
-        if (d5 <= 9u) {
-          const UC *q = p + 6;
-          while (q < pend && uint32_t(*q) - uint32_t('0') <= 9u)
-            ++q;
-          answer.ec = std::errc::result_out_of_range;
-          answer.ptr = q;
+          answer.ptr = p + 4;
           return answer;
         }
       }
-
-      // overflow check
-      if (v > 6553u || (v == 6553u && d4 > 5u)) {
-        answer.ec = std::errc::result_out_of_range;
-        answer.ptr = p + 5;
-        return answer;
-      }
-
-      value = (uint16_t)(v * 10u + d4);
-      answer.ec = std::errc();
-      answer.ptr = p + 5;
-      return answer;
     }
   }