; This file is needed to enable AVX-512 code for FastMM4-AVX.
; Use "nasm.exe -Ox -f win64 FastMM4_AVX512.asm" to compile this file.
; You can get The Netwide Assembler (NASM) from http://www.nasm.us/

; This file is a part of FastMM4-AVX.
; - Copyright (C) 2017-2020 Ritlabs, SRL. All rights reserved.
; - Copyright (C) 2020-2021 Maxim Masiutin. All rights reserved.
; Written by Maxim Masiutin <maxim@masiutin.com>

; FastMM4-AVX is a fork of the Fast Memory Manager 4.992 by Pierre le Riche.

; FastMM4-AVX is released under a dual license, and you may choose to use it
; under either the Mozilla Public License 2.0 (MPL 2.0, available from
; https://www.mozilla.org/en-US/MPL/2.0/) or the GNU Lesser General Public
; License Version 3, dated 29 June 2007 (LGPL 3, available from
; https://www.gnu.org/licenses/lgpl.html).
; This code uses the zmm26 - zmm31 registers to avoid the AVX-SSE transition penalty.
; These registers (zmm16 - zmm31) have no non-VEX counterpart. According to the
; advice of Agner Fog, there is no state transition and no penalty for mixing
; zmm16 - zmm31 with non-VEX SSE code. By using these registers (zmm16 - zmm31)
; rather than zmm0 - zmm15, we avoid having to call "vzeroupper".
; Source:
; https://stackoverflow.com/questions/43879935/avoiding-avx-sse-vex-transition-penalties/54587480#54587480
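
; For illustration only (not assembled): a minimal sketch of the difference,
; assuming the mixing rules cited above. Code that used zmm0 - zmm15 would
; have to issue "vzeroupper" before control returns to legacy SSE code:
;
;     vmovdqu64 zmm0, [rcx]
;     vmovdqu64 [rdx], zmm0
;     vzeroupper              ; required: zmm0 aliases the legacy xmm0 state
;     ret
;
; whereas the EVEX-only registers used in this file need no such cleanup:
;
;     vmovdqu64 zmm31, [rcx]
;     vmovdqu64 [rdx], zmm31
;     ret                     ; no "vzeroupper" needed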

%define EVEXR512N0 zmm31
%define EVEXR512N1 zmm30
%define EVEXR512N2 zmm29
%define EVEXR512N3 zmm28
%define EVEXR512N4 zmm27
%define EVEXR512N5 zmm26
%define EVEXR256N0 ymm31
%define EVEXR256N1 ymm30
%define EVEXR256N2 ymm29
%define EVEXR256N3 ymm28
%define EVEXR256N4 ymm27
%define EVEXR256N5 ymm26
%define EVEXR128N0 xmm31
%define EVEXR128N1 xmm30
%define EVEXR128N2 xmm29
%define EVEXR128N3 xmm28
%define EVEXR128N4 xmm27
%define EVEXR128N5 xmm26
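
; Naming convention: EVEXR<bits>N<index> is logical register <index> at the
; given width, mapped onto the EVEX-only registers zmm31 down to zmm26; the
; ymm/xmm forms address the lower lanes of the same physical registers.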

section .text

global Move24AVX512
global Move56AVX512
global Move88AVX512
global Move120AVX512
global Move152AVX512
global Move184AVX512
global Move216AVX512
global Move248AVX512
global Move280AVX512
global Move312AVX512
global Move344AVX512
global MoveX32LpAvx512WithErms
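
; All routines below follow the Microsoft x64 calling convention implied by
; the "-f win64" target: rcx holds the source address and rdx the destination
; address. The fixed-size MoveNNN routines copy exactly the number of bytes
; in their name; MoveX32LpAvx512WithErms additionally takes the byte count in r8.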

%use smartalign
ALIGNMODE p6, 32 ; p6 NOP strategy, and jump over the NOPs only if they're 32B or larger.
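
; Each fixed-size MoveNNN routine below loads the whole block into EVEX-only
; vector registers (plus one general-purpose register for the trailing 8
; bytes), stores it to the destination, and finally zeroes the vector
; registers it used.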

align 16
Move24AVX512:
vmovdqa64 EVEXR128N0, [rcx]
mov r8, [rcx+10h]
vmovdqa64 [rdx], EVEXR128N0
mov [rdx+10h], r8
vpxord EVEXR128N0, EVEXR128N0, EVEXR128N0
ret

Move56AVX512:
vmovdqa64 EVEXR256N0, [rcx+00h]
vmovdqa64 EVEXR128N1, [rcx+20h]
mov r8, [rcx+30h]
vmovdqa64 [rdx+00h], EVEXR256N0
vmovdqa64 [rdx+20h], EVEXR128N1
mov [rdx+30h], r8
vpxord EVEXR256N0, EVEXR256N0, EVEXR256N0
vpxord EVEXR128N1, EVEXR128N1, EVEXR128N1
ret

align 16
Move88AVX512:
vmovdqu64 EVEXR512N0, [rcx]
vmovdqa64 EVEXR128N1, [rcx+40h]
mov rcx, [rcx+50h]
vmovdqu64 [rdx], EVEXR512N0
vmovdqa64 [rdx+40h], EVEXR128N1
mov [rdx+50h], rcx
vpxord EVEXR512N0, EVEXR512N0, EVEXR512N0
vpxord EVEXR128N1, EVEXR128N1, EVEXR128N1
ret

align 16
Move120AVX512:
vmovdqu64 EVEXR512N0, [rcx]
vmovdqa64 EVEXR256N1, [rcx+40h]
vmovdqa64 EVEXR128N2, [rcx+60h]
mov rcx, [rcx+70h]
vmovdqu64 [rdx], EVEXR512N0
vmovdqa64 [rdx+40h], EVEXR256N1
vmovdqa64 [rdx+60h], EVEXR128N2
mov [rdx+70h], rcx
vpxord EVEXR512N0, EVEXR512N0, EVEXR512N0
vpxord EVEXR256N1, EVEXR256N1, EVEXR256N1
vpxord EVEXR128N2, EVEXR128N2, EVEXR128N2
ret

align 16
Move152AVX512:
vmovdqu64 EVEXR512N0, [rcx+00h]
vmovdqu64 EVEXR512N1, [rcx+40h]
vmovdqa64 EVEXR128N2, [rcx+80h]
mov rcx, [rcx+90h]
vmovdqu64 [rdx+00h], EVEXR512N0
vmovdqu64 [rdx+40h], EVEXR512N1
vmovdqa64 [rdx+80h], EVEXR128N2
mov [rdx+90h], rcx
vpxord EVEXR512N0, EVEXR512N0, EVEXR512N0
vpxord EVEXR512N1, EVEXR512N1, EVEXR512N1
vpxord EVEXR128N2, EVEXR128N2, EVEXR128N2
ret

align 16
Move184AVX512:
vmovdqu64 EVEXR512N0, [rcx+00h]
vmovdqu64 EVEXR512N1, [rcx+40h]
vmovdqa64 EVEXR256N2, [rcx+80h]
vmovdqa64 EVEXR128N3, [rcx+0A0h]
mov rcx, [rcx+0B0h]
vmovdqu64 [rdx+00h], EVEXR512N0
vmovdqu64 [rdx+40h], EVEXR512N1
vmovdqa64 [rdx+80h], EVEXR256N2
vmovdqa64 [rdx+0A0h], EVEXR128N3
mov [rdx+0B0h], rcx
vpxord EVEXR512N0, EVEXR512N0, EVEXR512N0
vpxord EVEXR512N1, EVEXR512N1, EVEXR512N1
vpxord EVEXR256N2, EVEXR256N2, EVEXR256N2
vpxord EVEXR128N3, EVEXR128N3, EVEXR128N3
ret

align 16
Move216AVX512:
vmovdqu64 EVEXR512N0, [rcx+00h]
vmovdqu64 EVEXR512N1, [rcx+40h]
vmovdqu64 EVEXR512N2, [rcx+80h]
vmovdqa64 EVEXR128N3, [rcx+0C0h]
mov rcx, [rcx+0D0h]
vmovdqu64 [rdx+00h], EVEXR512N0
vmovdqu64 [rdx+40h], EVEXR512N1
vmovdqu64 [rdx+80h], EVEXR512N2
vmovdqa64 [rdx+0C0h], EVEXR128N3
mov [rdx+0D0h], rcx
vpxord EVEXR512N0, EVEXR512N0, EVEXR512N0
vpxord EVEXR512N1, EVEXR512N1, EVEXR512N1
vpxord EVEXR512N2, EVEXR512N2, EVEXR512N2
vpxord EVEXR128N3, EVEXR128N3, EVEXR128N3
ret

align 16
Move248AVX512:
vmovdqu64 EVEXR512N0, [rcx+00h]
vmovdqu64 EVEXR512N1, [rcx+40h]
vmovdqu64 EVEXR512N2, [rcx+80h]
vmovdqa64 EVEXR256N3, [rcx+0C0h]
vmovdqa64 EVEXR128N4, [rcx+0E0h]
mov rcx, [rcx+0F0h]
vmovdqu64 [rdx+00h], EVEXR512N0
vmovdqu64 [rdx+40h], EVEXR512N1
vmovdqu64 [rdx+80h], EVEXR512N2
vmovdqa64 [rdx+0C0h], EVEXR256N3
vmovdqa64 [rdx+0E0h], EVEXR128N4
mov [rdx+0F0h], rcx
vpxord EVEXR512N0, EVEXR512N0, EVEXR512N0
vpxord EVEXR512N1, EVEXR512N1, EVEXR512N1
vpxord EVEXR512N2, EVEXR512N2, EVEXR512N2
vpxord EVEXR256N3, EVEXR256N3, EVEXR256N3
vpxord EVEXR128N4, EVEXR128N4, EVEXR128N4
ret

align 16
Move280AVX512:
vmovdqu64 EVEXR512N0, [rcx+00h]
vmovdqu64 EVEXR512N1, [rcx+40h]
vmovdqu64 EVEXR512N2, [rcx+80h]
vmovdqu64 EVEXR512N3, [rcx+0C0h]
vmovdqa64 EVEXR128N4, [rcx+100h]
mov rcx, [rcx+110h]
vmovdqu64 [rdx+00h], EVEXR512N0
vmovdqu64 [rdx+40h], EVEXR512N1
vmovdqu64 [rdx+80h], EVEXR512N2
vmovdqu64 [rdx+0C0h], EVEXR512N3
vmovdqa64 [rdx+100h], EVEXR128N4
mov [rdx+110h], rcx
vpxord EVEXR512N0, EVEXR512N0, EVEXR512N0
vpxord EVEXR512N1, EVEXR512N1, EVEXR512N1
vpxord EVEXR512N2, EVEXR512N2, EVEXR512N2
vpxord EVEXR512N3, EVEXR512N3, EVEXR512N3
vpxord EVEXR128N4, EVEXR128N4, EVEXR128N4
ret

align 16
Move312AVX512:
vmovdqu64 EVEXR512N0, [rcx+00h]
vmovdqu64 EVEXR512N1, [rcx+40h]
vmovdqu64 EVEXR512N2, [rcx+80h]
vmovdqu64 EVEXR512N3, [rcx+0C0h]
vmovdqa64 EVEXR256N4, [rcx+100h]
vmovdqa64 EVEXR128N5, [rcx+120h]
mov rcx, [rcx+130h]
vmovdqu64 [rdx+00h], EVEXR512N0
vmovdqu64 [rdx+40h], EVEXR512N1
vmovdqu64 [rdx+80h], EVEXR512N2
vmovdqu64 [rdx+0C0h], EVEXR512N3
vmovdqa64 [rdx+100h], EVEXR256N4
vmovdqa64 [rdx+120h], EVEXR128N5
mov [rdx+130h], rcx
vpxord EVEXR512N0, EVEXR512N0, EVEXR512N0
vpxord EVEXR512N1, EVEXR512N1, EVEXR512N1
vpxord EVEXR512N2, EVEXR512N2, EVEXR512N2
vpxord EVEXR512N3, EVEXR512N3, EVEXR512N3
vpxord EVEXR256N4, EVEXR256N4, EVEXR256N4
vpxord EVEXR128N5, EVEXR128N5, EVEXR128N5
ret

align 16
Move344AVX512:
vmovdqu64 EVEXR512N0, [rcx+00h]
vmovdqu64 EVEXR512N1, [rcx+40h]
vmovdqu64 EVEXR512N2, [rcx+80h]
vmovdqu64 EVEXR512N3, [rcx+0C0h]
vmovdqu64 EVEXR512N4, [rcx+100h]
vmovdqa64 EVEXR128N5, [rcx+140h]
mov rcx, [rcx+150h]
vmovdqu64 [rdx+00h], EVEXR512N0
vmovdqu64 [rdx+40h], EVEXR512N1
vmovdqu64 [rdx+80h], EVEXR512N2
vmovdqu64 [rdx+0C0h], EVEXR512N3
vmovdqu64 [rdx+100h], EVEXR512N4
vmovdqa64 [rdx+140h], EVEXR128N5
mov [rdx+150h], rcx
vpxord EVEXR512N0, EVEXR512N0, EVEXR512N0
vpxord EVEXR512N1, EVEXR512N1, EVEXR512N1
vpxord EVEXR512N2, EVEXR512N2, EVEXR512N2
vpxord EVEXR512N3, EVEXR512N3, EVEXR512N3
vpxord EVEXR512N4, EVEXR512N4, EVEXR512N4
vpxord EVEXR128N5, EVEXR128N5, EVEXR128N5
ret


align 16
MoveX32LpAvx512WithErms:
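; Overview of the code below: the byte count in r8 is converted to a negative
; index so the copy loops can count up towards zero; blocks of 2048 bytes or
; more are copied with "rep movsb" (ERMS), medium blocks with a 128-byte
; AVX-512 loop after aligning the destination to 64 bytes, small remainders
; with a 16-byte loop, and the final 8 bytes with a general-purpose register.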

; Make the counter negative based: the last 8 bytes are moved separately (see @MoveLast8)

mov eax, 8
sub r8, rax
add rcx, r8
add rdx, r8
neg r8
jns @MoveLast8
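
; From here on r8 runs from -(count-8) up towards zero, so [rcx+r8] and
; [rdx+r8] address the data to copy, and a single "add r8, .../js" pair both
; advances the index and terminates the loops; the "jns" above catches the
; case where no more than the final 8 bytes remain.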

cmp r8, -2048 ; According to the Intel Manual, rep movsb outperforms AVX copy on blocks of 2048 bytes and above
jg @DontDoRepMovsb

align 4

@DoRepMovsb:
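; rsi and rdi are non-volatile in the Win64 calling convention, so they are
; saved in r10/r9 and restored after the "rep movsb"; rsi/rdi are pointed
; back at the start of the source/destination and rcx receives the full byte
; count, so this path also copies the final 8 bytes and jumps straight to @exit.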
mov r10, rsi
mov r9, rdi
lea rsi, [rcx+r8]
lea rdi, [rdx+r8]
neg r8
add r8, rax
mov rcx, r8
cld
rep movsb
mov rdi, r9
mov rsi, r10
jmp @exit

align 16

@DontDoRepMovsb:
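; fewer than 128+64 bytes remain: presumably not enough to make the 128-byte
; main loop worthwhile, so jump straight to the 16-byte loop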
cmp r8, -(128+64)
jg @SmallAvxMove

mov eax, 128

sub rcx, rax
sub rdx, rax
add r8, rax


lea r9, [rdx+r8]
test r9b, 63
jz @Avx512BigMoveDestAligned

; the destination is already 32-byte aligned, so a single 32-byte move is
; enough to bring it to 64-byte alignment
vmovdqa64 EVEXR256N0, [rcx+r8]
vmovdqa64 [rdx+r8], EVEXR256N0
add r8, 20h

align 16

@Avx512BigMoveDestAligned:
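; main loop: 128 bytes per iteration, unaligned 64-byte loads and aligned
; 64-byte stores (the destination was aligned to 64 bytes above)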
vmovdqu64 EVEXR512N0, [rcx+r8+00h]
vmovdqu64 EVEXR512N1, [rcx+r8+40h]
vmovdqa64 [rdx+r8+00h], EVEXR512N0
vmovdqa64 [rdx+r8+40h], EVEXR512N1
add r8, rax
js @Avx512BigMoveDestAligned

sub r8, rax
add rcx, rax
add rdx, rax

align 16

@SmallAvxMove:

@MoveLoopAvx:
; Move a 16-byte block
vmovdqa64 EVEXR128N0, [rcx+r8]
vmovdqa64 [rdx+r8], EVEXR128N0

; Is there another 16-byte block to move?
add r8, 16
js @MoveLoopAvx

vpxord EVEXR512N0, EVEXR512N0, EVEXR512N0
vpxord EVEXR512N1, EVEXR512N1, EVEXR512N1

align 8
@MoveLast8:
; Do the last 8 bytes
mov rcx, [rcx+r8]
mov [rdx+r8], rcx
@exit:
ret