xtool/contrib/FastMM4-AVX/FastMM4LockFreeStack.pas

353 lines
9.7 KiB
ObjectPascal

// Based on TOmniBaseBoundedStack class from the OmniThreadLibrary,
// originally written by GJ and Primoz Gabrijelcic.
unit FastMM4LockFreeStack;
interface
type
PReferencedPtr = ^TReferencedPtr;
TReferencedPtr = record
PData : pointer;
Reference: NativeInt;
end;
PLinkedData = ^TLinkedData;
TLinkedData = packed record
Next: PLinkedData;
Data: record end; //user data, variable size
end;
TLFStack = record
strict private
FDataBuffer : pointer;
FElementSize : integer;
FNumElements : integer;
FPublicChainP : PReferencedPtr;
FRecycleChainP: PReferencedPtr;
class var
class var obsIsInitialized: boolean; //default is false
class var obsTaskPopLoops : NativeInt;
class var obsTaskPushLoops: NativeInt;
class function PopLink(var chain: TReferencedPtr): PLinkedData; static;
class procedure PushLink(const link: PLinkedData; var chain: TReferencedPtr); static;
procedure MeasureExecutionTimes;
public
procedure Empty;
procedure Initialize(numElements, elementSize: integer);
procedure Finalize;
function IsEmpty: boolean; inline;
function IsFull: boolean; inline;
function Pop(var value): boolean;
function Push(const value): boolean;
property ElementSize: integer read FElementSize;
property NumElements: integer read FNumElements;
end;
implementation
uses
Windows;
{$IF CompilerVersion < 23}
{$IFNDEF CPUX64}
type
NativeInt = integer;
NativeUInt = cardinal;
{$ENDIF}
{$IFEND}
var
CASAlignment: integer; //required alignment for the CAS function - 8 or 16, depending on the platform
function RoundUpTo(value: pointer; granularity: integer): pointer;
begin
Result := pointer((((NativeInt(value) - 1) div granularity) + 1) * granularity);
end;
function GetCPUTimeStamp: int64;
asm
rdtsc
{$IFDEF CPUX64}
shl rdx, 32
or rax, rdx
{$ENDIF CPUX64}
end;
function GetThreadId: NativeInt;
//result := GetCurrentThreadId;
asm
{$IFNDEF CPUX64}
mov eax, fs:[$18] //eax := thread information block
mov eax, [eax + $24] //eax := thread id
{$ELSE CPUX64}
mov rax, gs:[abs $30]
mov eax, [rax + $48]
{$ENDIF CPUX64}
end;
function CAS(const oldValue, newValue: NativeInt; var destination): boolean; overload;
asm
{$IFDEF CPUX64}
mov rax, oldValue
{$ENDIF CPUX64}
lock cmpxchg [destination], newValue
setz al
end;
function CAS(const oldValue, newValue: pointer; var destination): boolean; overload;
asm
{$IFDEF CPUX64}
mov rax, oldValue
{$ENDIF CPUX64}
lock cmpxchg [destination], newValue
setz al
end;
function CAS(const oldData: pointer; oldReference: NativeInt; newData: pointer;
newReference: NativeInt; var destination): boolean; overload;
asm
{$IFNDEF CPUX64}
push edi
push ebx
mov ebx, newData
mov ecx, newReference
mov edi, destination
lock cmpxchg8b qword ptr [edi]
pop ebx
pop edi
{$ELSE CPUX64}
.noframe
push rbx //rsp := rsp - 8 !
mov rax, oldData
mov rbx, newData
mov rcx, newReference
mov r8, [destination + 8] //+8 with respect to .noframe
lock cmpxchg16b [r8]
pop rbx
{$ENDIF CPUX64}
setz al
end;
{ TLFStack }
procedure TLFStack.Empty;
var
linkedData: PLinkedData;
begin
repeat
linkedData := PopLink(FPublicChainP^);
if not assigned(linkedData) then
break; //repeat
PushLink(linkedData, FRecycleChainP^);
until false;
end;
procedure TLFStack.Finalize;
begin
HeapFree(GetProcessHeap, 0, FDataBuffer);
end;
procedure TLFStack.Initialize(numElements, elementSize: integer);
var
bufferElementSize : integer;
currElement : PLinkedData;
dataBuffer : pointer;
iElement : integer;
nextElement : PLinkedData;
roundedElementSize: integer;
begin
Assert(SizeOf(NativeInt) = SizeOf(pointer));
Assert(numElements > 0);
Assert(elementSize > 0);
FNumElements := numElements;
FElementSize := elementSize;
//calculate element size, round up to next aligned value
roundedElementSize := (elementSize + SizeOf(pointer) - 1) AND NOT (SizeOf(pointer) - 1);
//calculate buffer element size, round up to next aligned value
bufferElementSize := ((SizeOf(TLinkedData) + roundedElementSize) + SizeOf(pointer) - 1) AND NOT (SizeOf(pointer) - 1);
//calculate DataBuffer
FDataBuffer := HeapAlloc(GetProcessHeap, HEAP_GENERATE_EXCEPTIONS, bufferElementSize * numElements + 2 * SizeOf(TReferencedPtr) + CASAlignment);
dataBuffer := RoundUpTo(FDataBuffer, CASAlignment);
if NativeInt(dataBuffer) AND (SizeOf(pointer) - 1) <> 0 then
// TODO 1 raise exception - how?
Halt; //raise Exception.Create('TOmniBaseContainer: obcBuffer is not aligned');
FPublicChainP := dataBuffer;
inc(NativeInt(dataBuffer), SizeOf(TReferencedPtr));
FRecycleChainP := dataBuffer;
inc(NativeInt(dataBuffer), SizeOf(TReferencedPtr));
//Format buffer to recycleChain, init obsRecycleChain and obsPublicChain.
//At the beginning, all elements are linked into the recycle chain.
FRecycleChainP^.PData := dataBuffer;
currElement := FRecycleChainP^.PData;
for iElement := 0 to FNumElements - 2 do begin
nextElement := PLinkedData(NativeInt(currElement) + bufferElementSize);
currElement.Next := nextElement;
currElement := nextElement;
end;
currElement.Next := nil; // terminate the chain
FPublicChainP^.PData := nil;
MeasureExecutionTimes;
end;
function TLFStack.IsEmpty: boolean;
begin
Result := not assigned(FPublicChainP^.PData);
end;
function TLFStack.IsFull: boolean;
begin
Result := not assigned(FRecycleChainP^.PData);
end;
procedure TLFStack.MeasureExecutionTimes;
const
NumOfSamples = 10;
var
TimeTestField: array [0..1] of array [1..NumOfSamples] of int64;
function GetMinAndClear(routine, count: cardinal): int64;
var
m: cardinal;
n: integer;
x: integer;
begin
Result := 0;
for m := 1 to count do begin
x:= 1;
for n:= 2 to NumOfSamples do
if TimeTestField[routine, n] < TimeTestField[routine, x] then
x := n;
Inc(Result, TimeTestField[routine, x]);
TimeTestField[routine, x] := MaxLongInt;
end;
end;
var
oldAffinity: NativeUInt;
currElement: PLinkedData;
n : integer;
begin
if not obsIsInitialized then begin
oldAffinity := SetThreadAffinityMask(GetCurrentThread, 1);
try
//Calculate TaskPopDelay and TaskPushDelay counter values depend on CPU speed!!!}
obsTaskPopLoops := 1;
obsTaskPushLoops := 1;
for n := 1 to NumOfSamples do begin
SwitchToThread;
//Measure RemoveLink rutine delay
TimeTestField[0, n] := GetCPUTimeStamp;
currElement := PopLink(FRecycleChainP^);
TimeTestField[0, n] := GetCPUTimeStamp - TimeTestField[0, n];
//Measure InsertLink rutine delay
TimeTestField[1, n] := GetCPUTimeStamp;
PushLink(currElement, FRecycleChainP^);
TimeTestField[1, n] := GetCPUTimeStamp - TimeTestField[1, n];
end;
//Calculate first 4 minimum average for RemoveLink rutine
obsTaskPopLoops := GetMinAndClear(0, 4) div 4;
//Calculate first 4 minimum average for InsertLink rutine
obsTaskPushLoops := GetMinAndClear(1, 4) div 4;
//This gives better performance (determined experimentally)
obsTaskPopLoops := obsTaskPopLoops * 2;
obsTaskPushLoops := obsTaskPushLoops * 2;
obsIsInitialized := true;
finally SetThreadAffinityMask(GetCurrentThread, oldAffinity); end;
end;
end;
function TLFStack.Pop(var value): boolean;
var
linkedData: PLinkedData;
begin
linkedData := PopLink(FPublicChainP^);
Result := assigned(linkedData);
if not Result then
Exit;
Move(linkedData.Data, value, ElementSize);
PushLink(linkedData, FRecycleChainP^);
end;
class function TLFStack.PopLink(var chain: TReferencedPtr): PLinkedData;
//nil << Link.Next << Link.Next << ... << Link.Next
// ^------ < chainHead
var
AtStartReference: NativeInt;
CurrentReference: NativeInt;
TaskCounter : NativeInt;
ThreadReference : NativeInt;
label
TryAgain;
begin
ThreadReference := GetThreadId + 1; //Reference.bit0 := 1
with chain do begin
TryAgain:
TaskCounter := obsTaskPopLoops;
AtStartReference := Reference OR 1; //Reference.bit0 := 1
repeat
CurrentReference := Reference;
Dec(TaskCounter);
until (TaskCounter = 0) or (CurrentReference AND 1 = 0);
if (CurrentReference AND 1 <> 0) and (AtStartReference <> CurrentReference) or
not CAS(CurrentReference, ThreadReference, Reference)
then
goto TryAgain;
//Reference is set...
Result := PData;
//Empty test
if result = nil then
CAS(ThreadReference, 0, Reference) //Clear Reference if task own reference
else if not CAS(Result, ThreadReference, Result.Next, 0, chain) then
goto TryAgain;
end; //with chain
end;
function TLFStack.Push(const value): boolean;
var
linkedData: PLinkedData;
begin
linkedData := PopLink(FRecycleChainP^);
Result := assigned(linkedData);
if not Result then
Exit;
Move(value, linkedData.Data, ElementSize);
PushLink(linkedData, FPublicChainP^);
end;
class procedure TLFStack.PushLink(const link: PLinkedData; var chain: TReferencedPtr);
var
PMemData : pointer;
TaskCounter: NativeInt;
begin
with chain do begin
for TaskCounter := 0 to obsTaskPushLoops do
if (Reference AND 1 = 0) then
break;
repeat
PMemData := PData;
link.Next := PMemData;
until CAS(PMemData, link, PData);
end;
end;
procedure InitializeTimingInfo;
var
stack: TLFStack;
begin
stack.Initialize(10, 4); // enough for initialization
stack.Finalize;
end;
initialization
{$IFDEF CPUX64}
CASAlignment := 16;
{$ELSE}
CASAlignment := 8;
{$ENDIF CPUX64}
InitializeTimingInfo;
end.