diff --git a/BIBLIOGRAPHY.md b/BIBLIOGRAPHY.md index 9783c2b7a..1b284974c 100644 --- a/BIBLIOGRAPHY.md +++ b/BIBLIOGRAPHY.md @@ -219,7 +219,7 @@ source code and documentation. - [dev/x86_64/src/pointwise_acc_l4.S](dev/x86_64/src/pointwise_acc_l4.S) - [dev/x86_64/src/pointwise_acc_l5.S](dev/x86_64/src/pointwise_acc_l5.S) - [dev/x86_64/src/pointwise_acc_l7.S](dev/x86_64/src/pointwise_acc_l7.S) - - [dev/x86_64/src/poly_caddq_avx2.c](dev/x86_64/src/poly_caddq_avx2.c) + - [dev/x86_64/src/poly_caddq_avx2.S](dev/x86_64/src/poly_caddq_avx2.S) - [dev/x86_64/src/poly_chknorm_avx2.c](dev/x86_64/src/poly_chknorm_avx2.c) - [dev/x86_64/src/poly_decompose_32_avx2.c](dev/x86_64/src/poly_decompose_32_avx2.c) - [dev/x86_64/src/poly_decompose_88_avx2.c](dev/x86_64/src/poly_decompose_88_avx2.c) @@ -237,7 +237,7 @@ source code and documentation. - [mldsa/src/native/x86_64/src/pointwise_acc_l4.S](mldsa/src/native/x86_64/src/pointwise_acc_l4.S) - [mldsa/src/native/x86_64/src/pointwise_acc_l5.S](mldsa/src/native/x86_64/src/pointwise_acc_l5.S) - [mldsa/src/native/x86_64/src/pointwise_acc_l7.S](mldsa/src/native/x86_64/src/pointwise_acc_l7.S) - - [mldsa/src/native/x86_64/src/poly_caddq_avx2.c](mldsa/src/native/x86_64/src/poly_caddq_avx2.c) + - [mldsa/src/native/x86_64/src/poly_caddq_avx2.S](mldsa/src/native/x86_64/src/poly_caddq_avx2.S) - [mldsa/src/native/x86_64/src/poly_chknorm_avx2.c](mldsa/src/native/x86_64/src/poly_chknorm_avx2.c) - [mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c](mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c) - [mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c](mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c) diff --git a/dev/x86_64/src/poly_caddq_avx2.c b/dev/x86_64/src/poly_caddq_avx2.S similarity index 53% rename from dev/x86_64/src/poly_caddq_avx2.c rename to dev/x86_64/src/poly_caddq_avx2.S index 05d86d14f..882424451 100644 --- a/dev/x86_64/src/poly_caddq_avx2.c +++ b/dev/x86_64/src/poly_caddq_avx2.S @@ -17,14 +17,6 @@ * AVX2 Dilithium implementation @[REF_AVX2]. */ -#include "../../../common.h" - -#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ - !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) - -#include -#include "arith_native_x86_64.h" -#include "consts.h" /************************************************* * Name: mld_poly_caddq_avx2 @@ -34,28 +26,45 @@ * * Arguments: - int32_t *r: pointer to input/output polynomial **************************************************/ -void mld_poly_caddq_avx2(int32_t *r) -{ - unsigned int i; - __m256i f, g; - const __m256i q = _mm256_set1_epi32(MLDSA_Q); - const __m256i zero = _mm256_setzero_si256(); - __m256i *rr = (__m256i *)r; - - for (i = 0; i < MLDSA_N / 8; i++) - { - f = _mm256_load_si256(&rr[i]); - g = _mm256_cmpgt_epi32(zero, f); - g = _mm256_and_si256(g, q); - f = _mm256_add_epi32(f, g); - _mm256_store_si256(&rr[i], f); - } -} - -#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ - */ - -MLD_EMPTY_CU(avx2_reduce) - -#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \ - !MLD_CONFIG_MULTILEVEL_NO_SHARED) */ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +/* simpasm: header-end */ + +.macro caddq_vector offset, reg +vpcmpgtd \offset(%rdi), %ymm2, \reg +vpand %ymm1, \reg, \reg +vpaddd \offset(%rdi), \reg, \reg +vmovdqa \reg, \offset(%rdi) +.endm + +.text +.global MLD_ASM_NAMESPACE(poly_caddq_avx2) +.balign 16 +MLD_ASM_FN_SYMBOL(poly_caddq_avx2) + +mov $8380417, %edx +leaq 1024(%rdi), %rax +vpxor %xmm2, %xmm2, %xmm2 +vmovd %edx, %xmm1 +vpbroadcastd %xmm1, %ymm1 + +poly_caddq_avx2_loop: + +caddq_vector 0, %ymm0 +caddq_vector 32, %ymm3 +caddq_vector 64, %ymm4 +caddq_vector 96, %ymm5 + +addq $128, %rdi /* advance by 128 bytes (4 vectors) */ +cmpq %rdi, %rax +jne poly_caddq_avx2_loop /* 8 iterations (32/4 = 8) */ +ret + +/* simpasm: footer-start */ + +#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ + */ diff --git a/mldsa/mldsa_native.c b/mldsa/mldsa_native.c index e20a82f2c..9ad43f785 100644 --- a/mldsa/mldsa_native.c +++ b/mldsa/mldsa_native.c @@ -82,7 +82,6 @@ #endif /* MLD_SYS_AARCH64 */ #if defined(MLD_SYS_X86_64) #include "src/native/x86_64/src/consts.c" -#include "src/native/x86_64/src/poly_caddq_avx2.c" #include "src/native/x86_64/src/poly_chknorm_avx2.c" #include "src/native/x86_64/src/poly_decompose_32_avx2.c" #include "src/native/x86_64/src/poly_decompose_88_avx2.c" diff --git a/mldsa/mldsa_native_asm.S b/mldsa/mldsa_native_asm.S index 36296bb5e..f0c54203f 100644 --- a/mldsa/mldsa_native_asm.S +++ b/mldsa/mldsa_native_asm.S @@ -86,6 +86,7 @@ #include "src/native/x86_64/src/pointwise_acc_l4.S" #include "src/native/x86_64/src/pointwise_acc_l5.S" #include "src/native/x86_64/src/pointwise_acc_l7.S" +#include "src/native/x86_64/src/poly_caddq_avx2.S" #endif /* MLD_SYS_X86_64 */ #endif /* MLD_CONFIG_USE_NATIVE_BACKEND_ARITH */ diff --git a/mldsa/src/native/x86_64/src/poly_caddq_avx2.S b/mldsa/src/native/x86_64/src/poly_caddq_avx2.S new file mode 100644 index 000000000..127dc7d01 --- /dev/null +++ b/mldsa/src/native/x86_64/src/poly_caddq_avx2.S @@ -0,0 +1,84 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + */ + + +/************************************************* + * Name: mld_poly_caddq_avx2 + * + * Description: For all coefficients of in/out polynomial add Q if + * coefficient is negative. + * + * Arguments: - int32_t *r: pointer to input/output polynomial + **************************************************/ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + + +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/x86_64/src/poly_caddq_avx2.S using scripts/simpasm. Do not modify it directly. + */ + +#if defined(__ELF__) +.section .note.GNU-stack,"",@progbits +#endif + +.text +.balign 4 +.global MLD_ASM_NAMESPACE(poly_caddq_avx2) +MLD_ASM_FN_SYMBOL(poly_caddq_avx2) + + .cfi_startproc + movl $0x7fe001, %edx # imm = 0x7FE001 + leaq 0x400(%rdi), %rax + vpxor %xmm2, %xmm2, %xmm2 + vmovd %edx, %xmm1 + vpbroadcastd %xmm1, %ymm1 + +Lpoly_caddq_avx2_loop: + vpcmpgtd (%rdi), %ymm2, %ymm0 + vpand %ymm1, %ymm0, %ymm0 + vpaddd (%rdi), %ymm0, %ymm0 + vmovdqa %ymm0, (%rdi) + vpcmpgtd 0x20(%rdi), %ymm2, %ymm3 + vpand %ymm1, %ymm3, %ymm3 + vpaddd 0x20(%rdi), %ymm3, %ymm3 + vmovdqa %ymm3, 0x20(%rdi) + vpcmpgtd 0x40(%rdi), %ymm2, %ymm4 + vpand %ymm1, %ymm4, %ymm4 + vpaddd 0x40(%rdi), %ymm4, %ymm4 + vmovdqa %ymm4, 0x40(%rdi) + vpcmpgtd 0x60(%rdi), %ymm2, %ymm5 + vpand %ymm1, %ymm5, %ymm5 + vpaddd 0x60(%rdi), %ymm5, %ymm5 + vmovdqa %ymm5, 0x60(%rdi) + addq $0x80, %rdi + cmpq %rdi, %rax + jne Lpoly_caddq_avx2_loop + retq + .cfi_endproc + +MLD_ASM_FN_SIZE(poly_caddq_avx2) + + +#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ + */ diff --git a/mldsa/src/native/x86_64/src/poly_caddq_avx2.c b/mldsa/src/native/x86_64/src/poly_caddq_avx2.c deleted file mode 100644 index 05d86d14f..000000000 --- a/mldsa/src/native/x86_64/src/poly_caddq_avx2.c +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) The mldsa-native project authors - * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - */ - -/* References - * ========== - * - * - [REF_AVX2] - * CRYSTALS-Dilithium optimized AVX2 implementation - * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé - * https://github.com/pq-crystals/dilithium/tree/master/avx2 - */ - -/* - * This file is derived from the public domain - * AVX2 Dilithium implementation @[REF_AVX2]. - */ - -#include "../../../common.h" - -#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ - !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) - -#include -#include "arith_native_x86_64.h" -#include "consts.h" - -/************************************************* - * Name: mld_poly_caddq_avx2 - * - * Description: For all coefficients of in/out polynomial add Q if - * coefficient is negative. - * - * Arguments: - int32_t *r: pointer to input/output polynomial - **************************************************/ -void mld_poly_caddq_avx2(int32_t *r) -{ - unsigned int i; - __m256i f, g; - const __m256i q = _mm256_set1_epi32(MLDSA_Q); - const __m256i zero = _mm256_setzero_si256(); - __m256i *rr = (__m256i *)r; - - for (i = 0; i < MLDSA_N / 8; i++) - { - f = _mm256_load_si256(&rr[i]); - g = _mm256_cmpgt_epi32(zero, f); - g = _mm256_and_si256(g, q); - f = _mm256_add_epi32(f, g); - _mm256_store_si256(&rr[i], f); - } -} - -#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ - */ - -MLD_EMPTY_CU(avx2_reduce) - -#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \ - !MLD_CONFIG_MULTILEVEL_NO_SHARED) */ diff --git a/test/bench/bench_components_mldsa.c b/test/bench/bench_components_mldsa.c index fb83ddf82..6cf8627e5 100644 --- a/test/bench/bench_components_mldsa.c +++ b/test/bench/bench_components_mldsa.c @@ -84,6 +84,8 @@ static int bench(void) /* polyz_unpack */ BENCH("polyz_unpack", mld_polyz_unpack(&poly_out, (const uint8_t *)data0)) + BENCH("poly_caddq", mld_poly_caddq((mld_poly *)data0)); + return 0; }