Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions BIBLIOGRAPHY.md
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ source code and documentation.
- [dev/x86_64/src/pointwise_acc_l4.S](dev/x86_64/src/pointwise_acc_l4.S)
- [dev/x86_64/src/pointwise_acc_l5.S](dev/x86_64/src/pointwise_acc_l5.S)
- [dev/x86_64/src/pointwise_acc_l7.S](dev/x86_64/src/pointwise_acc_l7.S)
- [dev/x86_64/src/poly_caddq_avx2.c](dev/x86_64/src/poly_caddq_avx2.c)
- [dev/x86_64/src/poly_caddq_avx2.S](dev/x86_64/src/poly_caddq_avx2.S)
- [dev/x86_64/src/poly_chknorm_avx2.c](dev/x86_64/src/poly_chknorm_avx2.c)
- [dev/x86_64/src/poly_decompose_32_avx2.c](dev/x86_64/src/poly_decompose_32_avx2.c)
- [dev/x86_64/src/poly_decompose_88_avx2.c](dev/x86_64/src/poly_decompose_88_avx2.c)
Expand All @@ -237,7 +237,7 @@ source code and documentation.
- [mldsa/src/native/x86_64/src/pointwise_acc_l4.S](mldsa/src/native/x86_64/src/pointwise_acc_l4.S)
- [mldsa/src/native/x86_64/src/pointwise_acc_l5.S](mldsa/src/native/x86_64/src/pointwise_acc_l5.S)
- [mldsa/src/native/x86_64/src/pointwise_acc_l7.S](mldsa/src/native/x86_64/src/pointwise_acc_l7.S)
- [mldsa/src/native/x86_64/src/poly_caddq_avx2.c](mldsa/src/native/x86_64/src/poly_caddq_avx2.c)
- [mldsa/src/native/x86_64/src/poly_caddq_avx2.S](mldsa/src/native/x86_64/src/poly_caddq_avx2.S)
- [mldsa/src/native/x86_64/src/poly_chknorm_avx2.c](mldsa/src/native/x86_64/src/poly_chknorm_avx2.c)
- [mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c](mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c)
- [mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c](mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,6 @@
* AVX2 Dilithium implementation @[REF_AVX2].
*/

#include "../../../common.h"

#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
!defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)

#include <immintrin.h>
#include "arith_native_x86_64.h"
#include "consts.h"

/*************************************************
* Name: mld_poly_caddq_avx2
Expand All @@ -34,28 +26,45 @@
*
* Arguments: - int32_t *r: pointer to input/output polynomial
**************************************************/
void mld_poly_caddq_avx2(int32_t *r)
{
unsigned int i;
__m256i f, g;
const __m256i q = _mm256_set1_epi32(MLDSA_Q);
const __m256i zero = _mm256_setzero_si256();
__m256i *rr = (__m256i *)r;

for (i = 0; i < MLDSA_N / 8; i++)
{
f = _mm256_load_si256(&rr[i]);
g = _mm256_cmpgt_epi32(zero, f);
g = _mm256_and_si256(g, q);
f = _mm256_add_epi32(f, g);
_mm256_store_si256(&rr[i], f);
}
}

#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
*/

MLD_EMPTY_CU(avx2_reduce)

#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \
!MLD_CONFIG_MULTILEVEL_NO_SHARED) */

#include "../../../common.h"

#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
!defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)

/* simpasm: header-end */

.macro caddq_vector offset, reg
vpcmpgtd \offset(%rdi), %ymm2, \reg
vpand %ymm1, \reg, \reg
vpaddd \offset(%rdi), \reg, \reg
vmovdqa \reg, \offset(%rdi)
.endm

.text
.global MLD_ASM_NAMESPACE(poly_caddq_avx2)
.balign 16
MLD_ASM_FN_SYMBOL(poly_caddq_avx2)

mov $8380417, %edx
leaq 1024(%rdi), %rax
vpxor %xmm2, %xmm2, %xmm2
vmovd %edx, %xmm1
vpbroadcastd %xmm1, %ymm1

poly_caddq_avx2_loop:

caddq_vector 0, %ymm0
caddq_vector 32, %ymm3
caddq_vector 64, %ymm4
caddq_vector 96, %ymm5

addq $128, %rdi /* advance by 128 bytes (4 vectors) */
cmpq %rdi, %rax
jne poly_caddq_avx2_loop /* 8 iterations (32/4 = 8) */
ret

/* simpasm: footer-start */

#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
*/
1 change: 0 additions & 1 deletion mldsa/mldsa_native.c
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,6 @@
#endif /* MLD_SYS_AARCH64 */
#if defined(MLD_SYS_X86_64)
#include "src/native/x86_64/src/consts.c"
#include "src/native/x86_64/src/poly_caddq_avx2.c"
#include "src/native/x86_64/src/poly_chknorm_avx2.c"
#include "src/native/x86_64/src/poly_decompose_32_avx2.c"
#include "src/native/x86_64/src/poly_decompose_88_avx2.c"
Expand Down
1 change: 1 addition & 0 deletions mldsa/mldsa_native_asm.S
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@
#include "src/native/x86_64/src/pointwise_acc_l4.S"
#include "src/native/x86_64/src/pointwise_acc_l5.S"
#include "src/native/x86_64/src/pointwise_acc_l7.S"
#include "src/native/x86_64/src/poly_caddq_avx2.S"
#endif /* MLD_SYS_X86_64 */
#endif /* MLD_CONFIG_USE_NATIVE_BACKEND_ARITH */

Expand Down
84 changes: 84 additions & 0 deletions mldsa/src/native/x86_64/src/poly_caddq_avx2.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
/*
* Copyright (c) The mldsa-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/

/* References
* ==========
*
* - [REF_AVX2]
* CRYSTALS-Dilithium optimized AVX2 implementation
* Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
* https://github.com/pq-crystals/dilithium/tree/master/avx2
*/

/*
* This file is derived from the public domain
* AVX2 Dilithium implementation @[REF_AVX2].
*/


/*************************************************
* Name: mld_poly_caddq_avx2
*
* Description: For all coefficients of in/out polynomial add Q if
* coefficient is negative.
*
* Arguments: - int32_t *r: pointer to input/output polynomial
**************************************************/

#include "../../../common.h"

#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
!defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)


/*
* WARNING: This file is auto-derived from the mldsa-native source file
* dev/x86_64/src/poly_caddq_avx2.S using scripts/simpasm. Do not modify it directly.
*/

#if defined(__ELF__)
.section .note.GNU-stack,"",@progbits
#endif

.text
.balign 4
.global MLD_ASM_NAMESPACE(poly_caddq_avx2)
MLD_ASM_FN_SYMBOL(poly_caddq_avx2)

.cfi_startproc
movl $0x7fe001, %edx # imm = 0x7FE001
leaq 0x400(%rdi), %rax
vpxor %xmm2, %xmm2, %xmm2
vmovd %edx, %xmm1
vpbroadcastd %xmm1, %ymm1

Lpoly_caddq_avx2_loop:
vpcmpgtd (%rdi), %ymm2, %ymm0
vpand %ymm1, %ymm0, %ymm0
vpaddd (%rdi), %ymm0, %ymm0
vmovdqa %ymm0, (%rdi)
vpcmpgtd 0x20(%rdi), %ymm2, %ymm3
vpand %ymm1, %ymm3, %ymm3
vpaddd 0x20(%rdi), %ymm3, %ymm3
vmovdqa %ymm3, 0x20(%rdi)
vpcmpgtd 0x40(%rdi), %ymm2, %ymm4
vpand %ymm1, %ymm4, %ymm4
vpaddd 0x40(%rdi), %ymm4, %ymm4
vmovdqa %ymm4, 0x40(%rdi)
vpcmpgtd 0x60(%rdi), %ymm2, %ymm5
vpand %ymm1, %ymm5, %ymm5
vpaddd 0x60(%rdi), %ymm5, %ymm5
vmovdqa %ymm5, 0x60(%rdi)
addq $0x80, %rdi
cmpq %rdi, %rax
jne Lpoly_caddq_avx2_loop
retq
.cfi_endproc

MLD_ASM_FN_SIZE(poly_caddq_avx2)


#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
*/
61 changes: 0 additions & 61 deletions mldsa/src/native/x86_64/src/poly_caddq_avx2.c

This file was deleted.

2 changes: 2 additions & 0 deletions test/bench/bench_components_mldsa.c
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@ static int bench(void)
/* polyz_unpack */
BENCH("polyz_unpack", mld_polyz_unpack(&poly_out, (const uint8_t *)data0))

BENCH("poly_caddq", mld_poly_caddq((mld_poly *)data0));

return 0;
}

Expand Down
Loading