; This file is needed to enable AVX-512 code for FastMM4-AVX.
; Use "nasm.exe -Ox -f win64 FastMM4_AVX512.asm" to compile this file
; You can get The Netwide Assembler (NASM) from http://www.nasm.us/
; This file is a part of FastMM4-AVX.
; - Copyright (C) 2017-2020 Ritlabs, SRL. All rights reserved.
; - Copyright (C) 2020-2021 Maxim Masiutin. All rights reserved.
; Written by Maxim Masiutin <maxim@masiutin.com>
; FastMM4-AVX is a fork of the Fast Memory Manager 4.992 by Pierre le Riche
; FastMM4-AVX is released under a dual license, and you may choose to use it
; under either the Mozilla Public License 2.0 (MPL 2.0, available from
; https://www.mozilla.org/en-US/MPL/2.0/) or the GNU Lesser General Public
; License Version 3, dated 29 June 2007 (LGPL 3, available from
; https://www.gnu.org/licenses/lgpl.html).
; This code uses the zmm26 - zmm31 registers to avoid the AVX-SSE transition penalty.
; These registers (zmm16 - zmm31) have no non-VEX counterpart. According to
; Agner Fog's advice, there is no state transition and no penalty for mixing
; zmm16 - zmm31 with non-VEX SSE code. By using these registers (zmm16 - zmm31)
; rather than zmm0 - zmm15, we avoid having to call "vzeroupper".
; Source:
; https://stackoverflow.com/questions/43879935/avoiding-avx-sse-vex-transition-penalties/54587480#54587480
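;
; Register usage (Microsoft x64 calling convention, as used by the code below):
;   rcx - source address, rdx - destination address;
;   MoveX32LpAvx512WithErms additionally receives the byte count in r8.
; Each fixed-size MoveNNNAVX512 routine copies exactly NNN bytes and then
; clears the vector registers it used (vpxord) before returning.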
%define EVEXR512N0 zmm31
%define EVEXR512N1 zmm30
%define EVEXR512N2 zmm29
%define EVEXR512N3 zmm28
%define EVEXR512N4 zmm27
%define EVEXR512N5 zmm26
%define EVEXR256N0 ymm31
%define EVEXR256N1 ymm30
%define EVEXR256N2 ymm29
%define EVEXR256N3 ymm28
%define EVEXR256N4 ymm27
%define EVEXR256N5 ymm26
%define EVEXR128N0 xmm31
%define EVEXR128N1 xmm30
%define EVEXR128N2 xmm29
%define EVEXR128N3 xmm28
%define EVEXR128N4 xmm27
%define EVEXR128N5 xmm26
section .text
global Move24AVX512
global Move56AVX512
global Move88AVX512
global Move120AVX512
global Move152AVX512
global Move184AVX512
global Move216AVX512
global Move248AVX512
global Move280AVX512
global Move312AVX512
global Move344AVX512
global MoveX32LpAvx512WithErms
%use smartalign
ALIGNMODE p6, 32 ; p6 NOP strategy, and jump over the NOPs only if they're 32B or larger.
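; Move24AVX512 copies a 24-byte block: 16 bytes through an EVEX-encoded xmm
; register and the remaining 8 bytes through a general-purpose register.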
align 16
Move24AVX512:
vmovdqa64 EVEXR128N0, [rcx]
mov r8, [rcx+10h]
vmovdqa64 [rdx], EVEXR128N0
mov [rdx+10h], r8
vpxord EVEXR128N0, EVEXR128N0, EVEXR128N0
ret
Move56AVX512:
vmovdqa64 EVEXR256N0, [rcx+00h]
vmovdqa64 EVEXR128N1, [rcx+20h]
mov r8, [rcx+30h]
vmovdqa64 [rdx+00h], EVEXR256N0
vmovdqa64 [rdx+20h], EVEXR128N1
mov [rdx+30h], r8
vpxord EVEXR256N0, EVEXR256N0, EVEXR256N0
vpxord EVEXR128N1, EVEXR128N1, EVEXR128N1
ret
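; From Move88AVX512 onwards, full 64-byte chunks are loaded and stored with
; vmovdqu64 (unaligned), presumably because the blocks are only guaranteed to
; be 32-byte aligned; 32- and 16-byte chunks keep the aligned vmovdqa64 form.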
align 16
Move88AVX512:
vmovdqu64 EVEXR512N0, [rcx]
vmovdqa64 EVEXR128N1, [rcx+40h]
mov rcx, [rcx+50h]
vmovdqu64 [rdx], EVEXR512N0
vmovdqa64 [rdx+40h], EVEXR128N1
mov [rdx+50h], rcx
vpxord EVEXR512N0,EVEXR512N0,EVEXR512N0
vpxord EVEXR128N1,EVEXR128N1,EVEXR128N1
ret
align 16
Move120AVX512:
vmovdqu64 EVEXR512N0, [rcx]
vmovdqa64 EVEXR256N1, [rcx+40h]
vmovdqa64 EVEXR128N2, [rcx+60h]
mov rcx, [rcx+70h]
vmovdqu64 [rdx], EVEXR512N0
vmovdqa64 [rdx+40h], EVEXR256N1
vmovdqa64 [rdx+60h], EVEXR128N2
mov [rdx+70h], rcx
vpxord EVEXR512N0,EVEXR512N0,EVEXR512N0
vpxord EVEXR256N1,EVEXR256N1,EVEXR256N1
vpxord EVEXR128N2,EVEXR128N2,EVEXR128N2
ret
align 16
Move152AVX512:
vmovdqu64 EVEXR512N0, [rcx+00h]
vmovdqu64 EVEXR512N1, [rcx+40h]
vmovdqa64 EVEXR128N2, [rcx+80h]
mov rcx, [rcx+90h]
vmovdqu64 [rdx+00h], EVEXR512N0
vmovdqu64 [rdx+40h], EVEXR512N1
vmovdqa64 [rdx+80h], EVEXR128N2
mov [rdx+90h], rcx
vpxord EVEXR512N0,EVEXR512N0,EVEXR512N0
vpxord EVEXR512N1,EVEXR512N1,EVEXR512N1
vpxord EVEXR128N2,EVEXR128N2,EVEXR128N2
ret
align 16
Move184AVX512:
vmovdqu64 EVEXR512N0, [rcx+00h]
vmovdqu64 EVEXR512N1, [rcx+40h]
vmovdqa64 EVEXR256N2, [rcx+80h]
vmovdqa64 EVEXR128N3, [rcx+0A0h]
mov rcx, [rcx+0B0h]
vmovdqu64 [rdx+00h], EVEXR512N0
vmovdqu64 [rdx+40h], EVEXR512N1
vmovdqa64 [rdx+80h], EVEXR256N2
vmovdqa64 [rdx+0A0h], EVEXR128N3
mov [rdx+0B0h], rcx
vpxord EVEXR512N0,EVEXR512N0,EVEXR512N0
vpxord EVEXR512N1,EVEXR512N1,EVEXR512N1
vpxord EVEXR256N2,EVEXR256N2,EVEXR256N2
vpxord EVEXR128N3,EVEXR128N3,EVEXR128N3
ret
align 16
Move216AVX512:
vmovdqu64 EVEXR512N0, [rcx+00h]
vmovdqu64 EVEXR512N1, [rcx+40h]
vmovdqu64 EVEXR512N2, [rcx+80h]
vmovdqa64 EVEXR128N3, [rcx+0C0h]
mov rcx, [rcx+0D0h]
vmovdqu64 [rdx+00h], EVEXR512N0
vmovdqu64 [rdx+40h], EVEXR512N1
vmovdqu64 [rdx+80h], EVEXR512N2
vmovdqa64 [rdx+0C0h], EVEXR128N3
mov [rdx+0D0h], rcx
vpxord EVEXR512N0,EVEXR512N0,EVEXR512N0
vpxord EVEXR512N1,EVEXR512N1,EVEXR512N1
vpxord EVEXR512N2,EVEXR512N2,EVEXR512N2
vpxord EVEXR128N3,EVEXR128N3,EVEXR128N3
ret
align 16
Move248AVX512:
vmovdqu64 EVEXR512N0, [rcx+00h]
vmovdqu64 EVEXR512N1, [rcx+40h]
vmovdqu64 EVEXR512N2, [rcx+80h]
vmovdqa64 EVEXR256N3, [rcx+0C0h]
vmovdqa64 EVEXR128N4, [rcx+0E0h]
mov rcx, [rcx+0F0h]
vmovdqu64 [rdx+00h], EVEXR512N0
vmovdqu64 [rdx+40h], EVEXR512N1
vmovdqu64 [rdx+80h], EVEXR512N2
vmovdqa64 [rdx+0C0h], EVEXR256N3
vmovdqa64 [rdx+0E0h], EVEXR128N4
mov [rdx+0F0h], rcx
vpxord EVEXR512N0,EVEXR512N0,EVEXR512N0
vpxord EVEXR512N1,EVEXR512N1,EVEXR512N1
vpxord EVEXR512N2,EVEXR512N2,EVEXR512N2
vpxord EVEXR256N3,EVEXR256N3,EVEXR256N3
vpxord EVEXR128N4,EVEXR128N4,EVEXR128N4
ret
align 16
Move280AVX512:
vmovdqu64 EVEXR512N0, [rcx+00h]
vmovdqu64 EVEXR512N1, [rcx+40h]
vmovdqu64 EVEXR512N2, [rcx+80h]
vmovdqu64 EVEXR512N3, [rcx+0C0h]
vmovdqa64 EVEXR128N4, [rcx+100h]
mov rcx, [rcx+110h]
vmovdqu64 [rdx+00h], EVEXR512N0
vmovdqu64 [rdx+40h], EVEXR512N1
vmovdqu64 [rdx+80h], EVEXR512N2
vmovdqu64 [rdx+0C0h], EVEXR512N3
vmovdqa64 [rdx+100h], EVEXR128N4
mov [rdx+110h], rcx
vpxord EVEXR512N0,EVEXR512N0,EVEXR512N0
vpxord EVEXR512N1,EVEXR512N1,EVEXR512N1
vpxord EVEXR512N2,EVEXR512N2,EVEXR512N2
vpxord EVEXR512N3,EVEXR512N3,EVEXR512N3
vpxord EVEXR128N4,EVEXR128N4,EVEXR128N4
ret
align 16
Move312AVX512:
vmovdqu64 EVEXR512N0, [rcx+00h]
vmovdqu64 EVEXR512N1, [rcx+40h]
vmovdqu64 EVEXR512N2, [rcx+80h]
vmovdqu64 EVEXR512N3, [rcx+0C0h]
vmovdqa64 EVEXR256N4, [rcx+100h]
vmovdqa64 EVEXR128N5, [rcx+120h]
mov rcx, [rcx+130h]
vmovdqu64 [rdx+00h], EVEXR512N0
vmovdqu64 [rdx+40h], EVEXR512N1
vmovdqu64 [rdx+80h], EVEXR512N2
vmovdqu64 [rdx+0C0h], EVEXR512N3
vmovdqa64 [rdx+100h], EVEXR256N4
vmovdqa64 [rdx+120h], EVEXR128N5
mov [rdx+130h], rcx
vpxord EVEXR512N0,EVEXR512N0,EVEXR512N0
vpxord EVEXR512N1,EVEXR512N1,EVEXR512N1
vpxord EVEXR512N2,EVEXR512N2,EVEXR512N2
vpxord EVEXR512N3,EVEXR512N3,EVEXR512N3
vpxord EVEXR256N4,EVEXR256N4,EVEXR256N4
vpxord EVEXR128N5,EVEXR128N5,EVEXR128N5
ret
align 16
Move344AVX512:
vmovdqu64 EVEXR512N0, [rcx+00h]
vmovdqu64 EVEXR512N1, [rcx+40h]
vmovdqu64 EVEXR512N2, [rcx+80h]
vmovdqu64 EVEXR512N3, [rcx+0C0h]
vmovdqu64 EVEXR512N4, [rcx+100h]
vmovdqa64 EVEXR128N5, [rcx+140h]
mov rcx, [rcx+150h]
vmovdqu64 [rdx+00h], EVEXR512N0
vmovdqu64 [rdx+40h], EVEXR512N1
vmovdqu64 [rdx+80h], EVEXR512N2
vmovdqu64 [rdx+0C0h], EVEXR512N3
vmovdqu64 [rdx+100h], EVEXR512N4
vmovdqa64 [rdx+140h], EVEXR128N5
mov [rdx+150h], rcx
vpxord EVEXR512N0,EVEXR512N0,EVEXR512N0
vpxord EVEXR512N1,EVEXR512N1,EVEXR512N1
vpxord EVEXR512N2,EVEXR512N2,EVEXR512N2
vpxord EVEXR512N3,EVEXR512N3,EVEXR512N3
vpxord EVEXR512N4,EVEXR512N4,EVEXR512N4
vpxord EVEXR128N5,EVEXR128N5,EVEXR128N5
ret
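; MoveX32LpAvx512WithErms copies a variable-size block (byte count in r8).
; For blocks of 2048 bytes and above it uses "rep movsb" (Enhanced REP MOVSB);
; otherwise it runs an AVX-512 loop that moves 128 bytes per iteration,
; followed by a 16-byte loop for the remainder; the final 8 bytes are copied
; through a general-purpose register.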
align 16
MoveX32LpAvx512WithErms:
; Make the counter negative based: the last 8 bytes are moved separately
mov eax, 8
sub r8, rax
add rcx, r8
add rdx, r8
neg r8
jns @MoveLast8
cmp r8, -2048 ; According to the Intel Manual, rep movsb outperforms AVX copy on blocks of 2048 bytes and above
jg @DontDoRepMovsb
align 4
@DoRepMovsb:
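; rsi and rdi are non-volatile under the Microsoft x64 calling convention,
; so preserve them around the "rep movsb" and restore them afterwards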
mov r10, rsi
mov r9, rdi
lea rsi, [rcx+r8]
lea rdi, [rdx+r8]
neg r8
add r8, rax
mov rcx, r8
cld
rep movsb
mov rdi, r9
mov rsi, r10
jmp @exit
align 16
@DontDoRepMovsb:
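; If fewer than 128+64 bytes remain, jump to the 16-byte loop at @SmallAvxMove;
; otherwise run the main AVX-512 loop: 128 bytes per iteration, unaligned
; 64-byte loads (vmovdqu64) and 64-byte-aligned stores (vmovdqa64), with an
; optional 32-byte move first to bring the destination to a 64-byte boundary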
cmp r8, -(128+64)
jg @SmallAvxMove
mov eax, 128
sub rcx, rax
sub rdx, rax
add r8, rax
lea r9, [rdx+r8]
test r9b, 63
jz @Avx512BigMoveDestAligned
; the destination is already 32-byte aligned, so a single 32-byte move brings it to a 64-byte boundary
vmovdqa64 EVEXR256N0, [rcx+r8]
vmovdqa64 [rdx+r8], EVEXR256N0
add r8, 20h
align 16
@Avx512BigMoveDestAligned:
vmovdqu64 EVEXR512N0, [rcx+r8+00h]
vmovdqu64 EVEXR512N1, [rcx+r8+40h]
vmovdqa64 [rdx+r8+00h], EVEXR512N0
vmovdqa64 [rdx+r8+40h], EVEXR512N1
add r8, rax
js @Avx512BigMoveDestAligned
sub r8, rax
add rcx, rax
add rdx, rax
align 16
@SmallAvxMove:
@MoveLoopAvx:
; Move a 16 byte block
vmovdqa64 EVEXR128N0, [rcx+r8]
vmovdqa64 [rdx+r8], EVEXR128N0
; Is there another 16-byte block to move?
add r8, 16
js @MoveLoopAvx
vpxord EVEXR512N0,EVEXR512N0,EVEXR512N0
vpxord EVEXR512N1,EVEXR512N1,EVEXR512N1
align 8
@MoveLast8:
; Do the last 8 bytes
mov rcx, [rcx+r8]
mov [rdx+r8], rcx
@exit:
ret