xtool/contrib/fundamentals/Utils/flcUTF.pas

1238 lines
33 KiB
ObjectPascal

{******************************************************************************}
{ }
{ Library: Fundamentals 5.00 }
{ File name: flcUTF.pas }
{ File version: 5.05 }
{ Description: UTF encoding and decoing functions. }
{ }
{ Copyright: Copyright (c) 2015-2020, David J Butler }
{ All rights reserved. }
{ Redistribution and use in source and binary forms, with }
{ or without modification, are permitted provided that }
{ the following conditions are met: }
{ Redistributions of source code must retain the above }
{ copyright notice, this list of conditions and the }
{ following disclaimer. }
{ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND }
{ CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED }
{ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED }
{ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A }
{ PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL }
{ THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, }
{ INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR }
{ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, }
{ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF }
{ USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) }
{ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER }
{ IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING }
{ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE }
{ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE }
{ POSSIBILITY OF SUCH DAMAGE. }
{ }
{ Github: https://github.com/fundamentalslib }
{ E-mail: fundamentals.library at gmail.com }
{ }
{ Revision history: }
{ }
{ 2015/05/06 4.01 Add UTF functions from unit cUnicodeCodecs. }
{ 2017/10/07 5.02 Move to flcUTF unit. }
{ 2018/08/12 5.03 String type changes. }
{ 2019/10/03 5.04 UTF16LEToUCS4Char. }
{ 2020/03/12 5.05 NativeInt changes. }
{ Remove dependencies on flcUtils and flcAscii. }
{ }
{ Supported compilers: }
{ }
{ Delphi 2010-10.4 Win32/Win64 5.05 2020/06/02 }
{ Delphi 10.2-10.4 Linux64 5.05 2020/06/02 }
{ FreePascal 3.0.4 Win64 5.05 2020/06/02 }
{ }
{******************************************************************************}
{$INCLUDE ..\flcInclude.inc}
{$IFDEF FREEPASCAL}
{$WARNINGS OFF}
{$HINTS OFF}
{$ENDIF}
{$IFDEF DEBUG}
{$IFDEF TEST}
{$DEFINE UTF_TEST}
{$ENDIF}
{$ENDIF}
unit flcUTF;
interface
uses
{ Fundamentals }
flcStdTypes;
{ }
{ UTF-8 character conversion functions }
{ }
type
TUTF8Error = (
UTF8ErrorNone,
UTF8ErrorInvalidEncoding,
UTF8ErrorIncompleteEncoding,
UTF8ErrorInvalidBuffer,
UTF8ErrorOutOfRange
);
function UTF8ToUCS4Char(
const Buf: Pointer;
const Size: NativeInt;
out SeqSize: Integer;
out Ch: UCS4Char): TUTF8Error;
function UTF8ToWideChar(
const Buf: Pointer;
const Size: NativeInt;
out SeqSize: Integer;
out Ch: WideChar): TUTF8Error;
procedure UCS4CharToUTF8(
const Ch: UCS4Char;
const DestBuf: Pointer;
const DestSize: NativeInt;
out SeqSize: Integer);
procedure WideCharToUTF8(
const Ch: WideChar;
const DestBuf: Pointer;
const DestSize: NativeInt;
out SeqSize: Integer);
{ }
{ UTF-16 character conversion functions }
{ }
procedure UCS4CharToUTF16BE(
const Ch: UCS4Char;
const DestBuf: Pointer;
const DestSize: NativeInt;
out SeqSize: Integer);
procedure UCS4CharToUTF16LE(
const Ch: UCS4Char;
const DestBuf: Pointer;
const DestSize: NativeInt;
out SeqSize: Integer);
function UTF16LEToUCS4Char(
const Buf: Pointer;
const Size: NativeInt;
out Ch: UCS4Char;
out SeqSize: Integer): Boolean;
{ }
{ UTF-8 string functions }
{ }
const
UTF8BOMSize = 3;
function DetectUTF8BOM(const Buf: Pointer; const Size: NativeInt): Boolean;
function UTF8CharSize(const Buf: Pointer; const Size: NativeInt): Integer;
function UTF8BufLength(const Buf: Pointer; const Size: NativeInt): NativeInt;
function UTF8StringLength(const S: RawByteString): NativeInt;
function UTF8StringToUnicodeString(const S: RawByteString): UnicodeString;
function UTF8StringToUnicodeStringP(const S: Pointer; const Size: NativeInt): UnicodeString;
function UTF8StringToLongString(const S: RawByteString): RawByteString;
function UTF8StringToString(const S: RawByteString): String;
function UCS4CharToUTF8CharSize(const Ch: UCS4Char): Integer;
function WideBufToUTF8Size(const Buf: PWideChar; const Len: NativeInt): NativeInt;
function UnicodeStringToUTF8Size(const S: UnicodeString): NativeInt;
function WideBufToUTF8String(const Buf: PWideChar; const Len: NativeInt): RawByteString;
function UnicodeStringToUTF8String(const S: UnicodeString): RawByteString;
function RawByteBufToUTF8Size(const Buf: Pointer; const Len: NativeInt): NativeInt;
function RawByteStringToUTF8Size(const S: RawByteString): NativeInt;
function RawByteStringToUTF8String(const S: RawByteString): RawByteString;
function UCS4CharToUTF8String(const Ch: UCS4Char): RawByteString;
function ISO8859_1StringToUTF8String(const S: RawByteString): RawByteString;
function StringToUTF8String(const S: String): RawByteString;
{ }
{ UTF-16 functions }
{ }
const
UTF16BOMSize = 2;
function DetectUTF16BEBOM(const P: Pointer; const Size: NativeInt): Boolean;
function DetectUTF16LEBOM(const P: Pointer; const Size: NativeInt): Boolean;
function DetectUTF16BOM(
const P: Pointer;
const Size: NativeInt;
out SwapEndian: Boolean): Boolean;
function SwapUTF16Endian(const P: WideChar): WideChar;
{ }
{ Tests }
{ }
{$IFDEF UTF_TEST}
procedure Test;
{$ENDIF}
implementation
uses
{ System }
SysUtils;
{ }
{ UTF-8 character conversion functions }
{ }
resourcestring
SInvalidCodePoint = '$%x is not a valid %s code point';
SUTFStringConvertError = 'UTF string conversion error';
{ UTF8ToUCS4Char returns UTF8ErrorNone if a valid UTF-8 sequence was decoded }
{ (and Ch contains the decoded UCS4 character and SeqSize contains the size }
{ of the UTF-8 sequence). If an incomplete UTF-8 sequence is encountered, the }
{ function returns UTF8ErrorIncompleteEncoding and SeqSize > Size. If an }
{ invalid UTF-8 sequence is encountered, the function returns }
{ UTF8ErrorInvalidEncoding and SeqSize (<= Size) is the size of the }
{ invalid sequence, and Ch may be the intended character. }
function UTF8ToUCS4Char(
const Buf: Pointer;
const Size: NativeInt;
out SeqSize: Integer;
out Ch: UCS4Char): TUTF8Error;
var
C : Byte;
D : Byte;
V : Word32;
I : Integer;
Q : PByte;
begin
if not Assigned(Buf) or (Size <= 0) then
begin
SeqSize := 0;
Ch := 0;
Result := UTF8ErrorInvalidBuffer;
exit;
end;
C := PByte(Buf)^;
if C < $80 then
begin
SeqSize := 1;
Ch := C;
Result := UTF8ErrorNone;
exit;
end;
// multi-byte characters always start with 11xxxxxx ($C0)
// following bytes always start with 10xxxxxx ($80)
if C and $C0 = $80 then
begin
SeqSize := 1;
Ch := C;
Result := UTF8ErrorInvalidEncoding;
exit;
end;
if C and $20 = 0 then // 2-byte sequence
begin
SeqSize := 2;
V := C and $1F;
end
else
if C and $10 = 0 then // 3-byte sequence
begin
SeqSize := 3;
V := C and $0F;
end
else
if C and $08 = 0 then // 4-byte sequence (max needed for Unicode $0-$1FFFFF)
begin
SeqSize := 4;
V := C and $07;
end
else
begin
SeqSize := 1;
Ch := C;
Result := UTF8ErrorInvalidEncoding;
exit;
end;
if Size < SeqSize then // incomplete
begin
Ch := C;
Result := UTF8ErrorIncompleteEncoding;
exit;
end;
Q := Buf;
for I := 1 to SeqSize - 1 do
begin
Inc(Q);
D := Ord(Q^);
if D and $C0 <> $80 then // following byte must start with 10xxxxxx
begin
SeqSize := 1;
Ch := C;
Result := UTF8ErrorInvalidEncoding;
exit;
end;
V := (V shl 6) or (D and $3F); // decode 6 bits
end;
Ch := V;
Result := UTF8ErrorNone;
end;
function UTF8ToWideChar(
const Buf: Pointer;
const Size: NativeInt;
out SeqSize: Integer;
out Ch: WideChar): TUTF8Error;
var Ch4 : UCS4Char;
begin
Result := UTF8ToUCS4Char(Buf, Size, SeqSize, Ch4);
if Ch4 > $FFFF then
begin
Result := UTF8ErrorOutOfRange;
Ch := #$0000;
end
else
Ch := WideChar(Ch4);
end;
{ UCS4CharToUTF8 transforms the UCS4 char Ch to UTF-8 encoding. SeqSize }
{ returns the number of bytes needed to transform Ch. Up to DestSize }
{ bytes of the UTF-8 encoding will be placed in Dest. }
procedure UCS4CharToUTF8(
const Ch: UCS4Char;
const DestBuf: Pointer;
const DestSize: NativeInt;
out SeqSize: Integer);
var P : PByte;
begin
P := DestBuf;
if Ch < $80 then // US-ASCII (1-byte sequence)
begin
SeqSize := 1;
if not Assigned(P) or (DestSize <= 0) then
exit;
P^ := Byte(Ch);
end else
if Ch < $800 then // 2-byte sequence
begin
SeqSize := 2;
if not Assigned(P) or (DestSize <= 0) then
exit;
P^ := $C0 or Byte(Ch shr 6);
if DestSize = 1 then
exit;
Inc(P);
P^ := $80 or (Ch and $3F);
end else
if Ch < $10000 then // 3-byte sequence
begin
SeqSize := 3;
if not Assigned(P) or (DestSize <= 0) then
exit;
P^ := $E0 or Byte(Ch shr 12);
if DestSize = 1 then
exit;
Inc(P);
P^ := $80 or ((Ch shr 6) and $3F);
if DestSize = 2 then
exit;
Inc(P);
P^ := $80 or (Ch and $3F);
end else
if Ch < $200000 then // 4-byte sequence
begin
SeqSize := 4;
if not Assigned(P) or (DestSize <= 0) then
exit;
P^ := $F0 or Byte(Ch shr 18);
if DestSize = 1 then
exit;
Inc(P);
P^ := $80 or ((Ch shr 12) and $3F);
if DestSize = 2 then
exit;
Inc(P);
P^ := $80 or ((Ch shr 6) and $3F);
if DestSize = 3 then
exit;
Inc(P);
P^ := $80 or (Ch and $3F);
end
else
raise EConvertError.CreateFmt(SInvalidCodePoint, [Ord(Ch), 'Unicode']);
end;
procedure WideCharToUTF8(
const Ch: WideChar;
const DestBuf: Pointer;
const DestSize: NativeInt;
out SeqSize: Integer);
begin
UCS4CharToUTF8(Ord(Ch), DestBuf, DestSize, SeqSize);
end;
{ }
{ UTF-16 character conversion functions }
{ }
resourcestring
SCannotConvertUCS4 = 'Cannot convert $%8.8X to %s';
{ UCS4CharToUTF16BE transforms the UCS4 char Ch to UTF-16BE encoding. SeqSize }
{ returns the number of bytes needed to transform Ch. Up to DestSize }
{ bytes of the UTF-16BE encoding will be placed in Dest. }
procedure UCS4CharToUTF16BE(
const Ch: UCS4Char;
const DestBuf: Pointer;
const DestSize: NativeInt;
out SeqSize: Integer);
var
P : PByte;
HighSurrogate : Word16;
LowSurrogate : Word16;
begin
P := DestBuf;
case Ch of
$00000000..$0000D7FF, $0000E000..$0000FFFF :
begin
SeqSize := 2;
if not Assigned(P) or (DestSize <= 0) then
exit;
{$IFDEF FREEPASCAL}
P^ := Byte((Ch and $FF00) shr 8);
{$ELSE}
P^ := Hi(Ch);
{$ENDIF}
if DestSize <= 1 then
exit;
Inc(P);
{$IFDEF FREEPASCAL}
P^ := Byte(Ch and $FF);
{$ELSE}
P^ := Lo(Ch);
{$ENDIF}
end;
$0000D800..$0000DFFF :
raise EConvertError.CreateFmt(SInvalidCodePoint, [Ch, 'UCS-4']);
$00010000..$0010FFFF :
begin
SeqSize := 4;
if not Assigned(P) or (DestSize <= 0) then
exit;
HighSurrogate := $D7C0 + (Ch shr 10);
P^ := Hi(HighSurrogate);
if DestSize <= 1 then
exit;
Inc(P);
P^ := Lo(HighSurrogate);
if DestSize <= 2 then
exit;
LowSurrogate := $DC00 xor (Ch and $3FF);
Inc(P);
P^ := Hi(LowSurrogate);
if DestSize <= 3 then
exit;
Inc(P);
P^ := Lo(LowSurrogate);
end;
else // out of UTF-16 range
raise EConvertError.CreateFmt(SCannotConvertUCS4, [Ch, 'UTF-16BE']);
end;
end;
{ UCS4CharToUTF16LE transforms the UCS4 char Ch to UTF-16LE encoding. SeqSize }
{ returns the number of bytes needed to transform Ch. Up to DestSize }
{ bytes of the UTF-16LE encoding will be placed in Dest. }
procedure UCS4CharToUTF16LE(
const Ch: UCS4Char;
const DestBuf: Pointer;
const DestSize: NativeInt;
out SeqSize: Integer);
var
P : PByte;
HighSurrogate : Word16;
LowSurrogate : Word16;
begin
P := DestBuf;
case Ch of
$00000000..$0000D7FF, $0000E000..$0000FFFF :
begin
SeqSize := 2;
if not Assigned(P) or (DestSize <= 0) then
exit;
{$IFDEF FREEPASCAL}
P^ := Byte(Ch and $FF);
{$ELSE}
P^ := Lo(Ch);
{$ENDIF}
if DestSize <= 1 then
exit;
Inc(P);
{$IFDEF FREEPASCAL}
P^ := Byte((Ch and $FF00) shr 8);
{$ELSE}
P^ := Hi(Ch);
{$ENDIF}
end;
$0000D800..$0000DFFF :
raise EConvertError.CreateFmt(SInvalidCodePoint, [Ch, 'UCS-4']);
$00010000..$0010FFFF:
begin
SeqSize := 4;
if not Assigned(P) or (DestSize <= 0) then
exit;
HighSurrogate := $D7C0 + (Ch shr 10);
P^ := Lo(HighSurrogate);
if DestSize <= 1 then
exit;
Inc(P);
P^ := Hi(HighSurrogate);
if DestSize <= 2 then
exit;
LowSurrogate := $DC00 xor (Ch and $3FF);
Inc(P);
P^ := Lo(LowSurrogate);
if DestSize <= 3 then
exit;
Inc(P);
P^ := Hi(LowSurrogate);
end;
else // out of UTF-16 range
raise EConvertError.CreateFmt(SCannotConvertUCS4, [Ch, 'UTF-16LE']);
end;
end;
// Returns True if valid encoding
// If invalid, Returns False with Ch the invalid character and SeqSize the
// size of a valid encoding
function UTF16LEToUCS4Char(
const Buf: Pointer;
const Size: NativeInt;
out Ch: UCS4Char;
out SeqSize: Integer): Boolean;
var
ChP : PWideChar;
C : Word16;
LowSurrogate : Word16;
begin
if Size < 2 then
begin
// Too few bytes in Source
Ch := 0;
SeqSize := 2;
Result := False;
exit;
end;
ChP := Buf;
C := Ord(ChP^); // UCS4Chars are stored in Little Endian mode
case C of
$D800..$DBFF: // High surrogate of Unicode character [$10000..$10FFFF]
begin
if Size < 4 then
begin
// Too few bytes in Source
Ch := C;
SeqSize := 4;
Result := False;
exit;
end;
Inc(ChP);
LowSurrogate := Ord(ChP^);
case LowSurrogate shr 8 of
$DC..$DF :
begin
Ch := ((C - $D7C0) shl 10) + (((LowSurrogate shr 8) xor $DC) shl 8) + (LowSurrogate and $FF);
SeqSize := 4;
Result := True;
end;
else
begin
// Invalid encoding
Ch := C;
SeqSize := 4;
Result := False;
end;
end;
end;
$DC00..$DFFF: // Low surrogate of Unicode character [$10000..$10FFFF]
begin
// Invalid encoding
Ch := C;
SeqSize := 2;
Result := False;
end
else
begin
// 2 byte character
Ch := C;
SeqSize := 2;
Result := True;
end;
end;
end;
{ }
{ UTF-8 string functions }
{ }
function DetectUTF8BOM(const Buf: Pointer; const Size: NativeInt): Boolean;
var
Q : PByte;
begin
Result := False;
if Assigned(Buf) and (Size >= 3) and (PByte(Buf)^ = $EF) then
begin
Q := Buf;
Inc(Q);
if Q^ = $BB then
begin
Inc(Q);
if Q^ = $BF then
Result := True;
end;
end;
end;
function UTF8CharSize(const Buf: Pointer; const Size: NativeInt): Integer;
var
C : Byte;
I : Integer;
Q : PByte;
begin
if not Assigned(Buf) or (Size <= 0) then
begin
Result := 0;
exit;
end;
C := PByte(Buf)^;
if C < $80 then // 1-byte (US-ASCII value)
Result := 1
else
if C and $C0 = $80 then // invalid encoding
Result := 1
else
begin
// multi-byte character
if C and $20 = 0 then
Result := 2
else
if C and $10 = 0 then
Result := 3
else
if C and $08 = 0 then
Result := 4
else
begin
Result := 1; // invalid encoding
exit;
end;
if Size < Result then // incomplete encoding
exit;
Q := Buf;
Inc(Q);
for I := 1 to Result - 1 do
if Ord(Q^) and $C0 <> $80 then
begin
Result := 1; // invalid encoding
exit;
end
else
Inc(Q);
end;
end;
function UTF8BufLength(const Buf: Pointer; const Size: NativeInt): NativeInt;
var
Q : PByte;
L : NativeInt;
C : Integer;
begin
Q := Buf;
L := Size;
Result := 0;
while L > 0 do
begin
C := UTF8CharSize(Q, L);
Dec(L, C);
Inc(Q, C);
Inc(Result);
end;
end;
function UTF8StringLength(const S: RawByteString): NativeInt;
begin
Result := UTF8BufLength(Pointer(S), Length(S));
end;
function UCS4CharToUTF8CharSize(const Ch: UCS4Char): Integer;
begin
if Ch < $80 then
Result := 1 else
if Ch < $800 then
Result := 2 else
if Ch < $10000 then
Result := 3 else
if Ch < $200000 then
Result := 4
else
raise EConvertError.CreateFmt(SInvalidCodePoint, [Ord(Ch), 'Unicode']);
end;
function WideBufToUTF8Size(const Buf: PWideChar; const Len: NativeInt): NativeInt;
var
P : PWideChar;
I : NativeInt;
C : UCS4Char;
begin
P := Buf;
Result := 0;
for I := 1 to Len do
begin
C := UCS4Char(P^);
Inc(Result);
if C >= $80 then
if C >= $800 then
Inc(Result, 2) else
Inc(Result);
Inc(P);
end;
end;
function RawByteBufToUTF8Size(const Buf: Pointer; const Len: NativeInt): NativeInt;
var
P : PByte;
I : NativeInt;
begin
P := Buf;
Result := 0;
for I := 1 to Len do
begin
Inc(Result);
if P^ >= $80 then
Inc(Result);
Inc(P);
end;
end;
function UnicodeStringToUTF8Size(const S: UnicodeString): NativeInt;
begin
Result := WideBufToUTF8Size(Pointer(S), Length(S));
end;
function RawByteStringToUTF8Size(const S: RawByteString): NativeInt;
begin
Result := RawByteBufToUTF8Size(Pointer(S), Length(S));
end;
procedure RawByteBufToWideBuf(
const Buf: Pointer;
const BufSize: NativeInt;
const DestBuf: Pointer);
var
I : NativeInt;
P : PWord32;
Q : PWord32;
V : Word32;
N : Integer;
E : PByte;
F : PWord16;
begin
if BufSize <= 0 then
exit;
P := Buf;
Q := DestBuf;
for I := 1 to BufSize div 4 do
begin
// convert 4 characters per iteration
V := P^;
Inc(P);
Q^ := (V and $FF) or ((V and $FF00) shl 8);
Inc(Q);
V := V shr 16;
Q^ := (V and $FF) or ((V and $FF00) shl 8);
Inc(Q);
end;
// convert remaining (<4)
N := BufSize mod 4;
if N > 0 then
begin
E := Pointer(P);
F := Pointer(Q);
for I := 1 to N do
begin
F^ := E^;
Inc(E);
Inc(F);
end;
end;
end;
function RawByteStrPtrToUnicodeString(const S: Pointer; const Len: NativeInt): UnicodeString; {$IFDEF UseInline}inline;{$ENDIF}
begin
if Len <= 0 then
Result := ''
else
begin
SetLength(Result, Len);
RawByteBufToWideBuf(S, Len, PWideChar(Result));
end;
end;
function IsAsciiBufB(const Buf: Pointer; const Len: NativeInt): Boolean; {$IFDEF UseInline}inline;{$ENDIF}
var
P : PByte;
I : NativeInt;
begin
P := Buf;
for I := 1 to Len do
if P^ >= $80 then
begin
Result := False;
exit;
end
else
Inc(P);
Result := True;
end;
function IsAsciiStringB(const S: RawByteString): Boolean; {$IFDEF UseInline}inline;{$ENDIF}
begin
Result := IsAsciiBufB(Pointer(S), Length(S));
end;
function UTF8StringToUnicodeStringP(const S: Pointer; const Size: NativeInt): UnicodeString;
var
P : PByte;
Q : PWideChar;
L : NativeInt;
M : NativeInt;
I : Integer;
C : WideChar;
begin
if Size = 0 then
begin
Result := '';
exit;
end;
if IsAsciiBufB(S, Size) then // optimize for US-ASCII strings
begin
Result := RawByteStrPtrToUnicodeString(S, Size);
exit;
end;
// Decode UTF-8
L := Size;
P := S;
SetLength(Result, L); // maximum size
Q := Pointer(Result);
M := 0;
repeat
UTF8ToWideChar(P, Size, I, C);
Assert(I > 0);
Q^ := C;
Inc(Q);
Inc(M);
Inc(P, I);
Dec(L, I);
until L <= 0;
if M < Size then
SetLength(Result, M); // actual size
end;
function UTF8StringToUnicodeString(const S: RawByteString): UnicodeString;
begin
Result := UTF8StringToUnicodeStringP(Pointer(S), Length(S));
end;
function UTF8StringToLongString(const S: RawByteString): RawByteString;
var
N : NativeInt;
L : NativeInt;
P : PByte;
Q : PByte;
M : NativeInt;
I : Integer;
C : WideChar;
begin
N := Length(S);
if N = 0 then
begin
Result := '';
exit;
end;
if IsAsciiStringB(S) then // optimize for US-ASCII strings
begin
Result := S;
exit;
end;
// Decode UTF-8
L := N;
P := Pointer(S);
SetLength(Result, L); // maximum size
Q := Pointer(Result);
M := 0;
repeat
UTF8ToWideChar(P, L, I, C);
Assert(I > 0, 'I > 0');
if Ord(C) > $FF then
raise EConvertError.Create(SUTFStringConvertError);
Q^ := Byte(Ord(C));
Inc(Q);
Inc(M);
Inc(P, I);
Dec(L, I);
until L <= 0;
if M < N then
SetLength(Result, M); // actual size
end;
function UTF8StringToString(const S: RawByteString): String;
begin
{$IFDEF StringIsUnicode}
Result := UTF8StringToUnicodeString(S);
{$ELSE}
Result := S;
{$ENDIF}
end;
function WideBufToRawByteString(const P: PWideChar; const Len: NativeInt): RawByteString; {$IFDEF UseInline}inline;{$ENDIF}
var
S : PWideChar;
Q : PByte;
I : NativeInt;
V : WideChar;
begin
if Len <= 0 then
begin
Result := '';
exit;
end;
SetLength(Result, Len);
S := P;
Q := Pointer(Result);
for I := 1 to Len do
begin
V := S^;
if Ord(V) > $FF then
raise EConvertError.Create(SUTFStringConvertError);
Q^ := Byte(V);
Inc(S);
Inc(Q);
end;
end;
function WideBufToUTF8String(const Buf: PWideChar; const Len: NativeInt): RawByteString;
var
N : NativeInt;
P : PWideChar;
Q : PByte;
M : NativeInt;
I : NativeInt;
J : Integer;
begin
if Len = 0 then
begin
Result := '';
exit;
end;
N := WideBufToUTF8Size(Buf, Len);
if N = Len then // optimize for US-ASCII strings
begin
Result := WideBufToRawByteString(Buf, Len);
exit;
end;
SetLength(Result, N);
P := Buf;
Q := Pointer(Result);
M := 0;
for I := 1 to Len do
begin
UCS4CharToUTF8(UCS4Char(P^), Q, N, J);
Inc(P);
Inc(Q, J);
Dec(N, J);
Inc(M, J);
end;
if M < N then
SetLength(Result, M); // actual size
end;
function RawByteStringToUTF8String(const S: RawByteString): RawByteString;
var
P : PByte;
L : NativeInt;
N : NativeInt;
Q : PByte;
M : NativeInt;
I : NativeInt;
J : Integer;
begin
P := Pointer(S);
L := Length(S);
if L = 0 then
begin
Result := '';
exit;
end;
N := RawByteBufToUTF8Size(P, L);
if N = L then // optimize for US-ASCII strings
begin
Result := S;
exit;
end;
SetLength(Result, N);
Q := Pointer(Result);
M := 0;
for I := 1 to L do
begin
UCS4CharToUTF8(UCS4Char(Ord(P^)), Q, N, J);
Inc(P);
Inc(Q, J);
Dec(N, J);
Inc(M, J);
end;
if M < N then
SetLength(Result, M); // actual size
end;
function UnicodeStringToUTF8String(const S: UnicodeString): RawByteString;
begin
Result := WideBufToUTF8String(Pointer(S), Length(S));
end;
const
MaxUTF8SequenceSize = 4;
function UCS4CharToUTF8String(const Ch: UCS4Char): RawByteString;
var
Buf : array[0..MaxUTF8SequenceSize - 1] of Byte;
Size : Integer;
I : Integer;
P, Q : PByte;
begin
Size := 0;
UCS4CharToUTF8(Ch, @Buf, Sizeof(Buf), Size);
SetLength(Result, Size);
if Size > 0 then
begin
P := Pointer(Result);
Q := @Buf;
for I := 0 to Size - 1 do
begin
P^ := Q^;
Inc(P);
Inc(Q);
end;
end;
end;
function ISO8859_1StringToUTF8String(const S: RawByteString): RawByteString;
var
P, Q : PByte;
L, I : NativeInt;
M : NativeInt;
J : Integer;
begin
L := Length(S);
if L = 0 then
begin
Result := '';
exit;
end;
// Calculate size
M := L;
P := Pointer(S);
for I := 1 to L do
begin
if Ord(P^) >= $80 then
Inc(M); // 2 bytes required for #$80-#$FF
Inc(P);
end;
// Check if conversion is required
if M = L then
begin
// All characters are US-ASCII, return reference to same string
Result := S;
exit;
end;
// Convert
SetLength(Result, M);
Q := Pointer(Result);
P := Pointer(S);
for I := 1 to L do
begin
WideCharToUTF8(WideChar(P^), Q, M, J);
Inc(P);
Inc(Q, J);
Dec(M, J);
end;
end;
function StringToUTF8String(const S: String): RawByteString;
begin
{$IFDEF StringIsUnicode}
Result := UnicodeStringToUTF8String(S);
{$ELSE}
Result := S;
{$ENDIF}
end;
{ }
{ UTF-16 functions }
{ }
function DetectUTF16BEBOM(const P: Pointer; const Size: NativeInt): Boolean;
begin
Result := Assigned(P) and (Size >= Sizeof(WideChar)) and
(PWideChar(P)^ = WideChar($FFFE));
end;
function DetectUTF16LEBOM(const P: Pointer; const Size: NativeInt): Boolean;
begin
Result := Assigned(P) and (Size >= Sizeof(WideChar)) and
(PWideChar(P)^ = WideChar($FEFF));
end;
{ DetectUTF16Encoding returns True if the encoding was confirmed to be UTF-16. }
{ SwapEndian is True if it was detected that the UTF-16 data is in reverse }
{ endian from that used by the cpu. }
function DetectUTF16BOM(
const P: Pointer;
const Size: NativeInt;
out SwapEndian: Boolean): Boolean;
begin
if not Assigned(P) or (Size < Sizeof(WideChar)) then
begin
SwapEndian := False;
Result := False;
end else
if PWideChar(P)^ = WideChar($FEFF) then
begin
SwapEndian := False;
Result := True;
end else
if PWideChar(P)^ = WideChar($FFFE) then
begin
SwapEndian := True;
Result := True;
end
else
begin
SwapEndian := False;
Result := False;
end;
end;
function SwapUTF16Endian(const P: WideChar): WideChar;
begin
Result := WideChar(((Ord(P) and $FF) shl 8) or (Ord(P) shr 8));
end;
{ }
{ Tests }
{ }
{$IFDEF UTF_TEST}
{$ASSERTIONS ON}
procedure Test_UTF8;
const
W1 : array[0..3] of WideChar = (#$0041, #$2262, #$0391, #$002E);
W2 : array[0..2] of WideChar = (#$D55C, #$AD6D, #$C5B4);
W3 : array[0..2] of WideChar = (#$65E5, #$672C, #$8A9E);
S1 = RawByteString(#$41#$E2#$89#$A2#$CE#$91#$2E);
S2 = RawByteString(#$ED#$95#$9C#$EA#$B5#$AD#$EC#$96#$B4);
S3 = RawByteString(#$E6#$97#$A5#$E6#$9C#$AC#$E8#$AA#$9E);
begin
// UTF-8 test cases from RFC 2279
Assert(UnicodeStringToUTF8String(W1) = #$41#$E2#$89#$A2#$CE#$91#$2E, 'UnicodeStringToUTF8String');
Assert(UnicodeStringToUTF8String(W2) = #$ED#$95#$9C#$EA#$B5#$AD#$EC#$96#$B4, 'UnicodeStringToUTF8String');
Assert(UnicodeStringToUTF8String(W3) = #$E6#$97#$A5#$E6#$9C#$AC#$E8#$AA#$9E, 'UnicodeStringToUTF8String');
Assert(UTF8StringToUnicodeString(S1) = W1, 'UTF8StringToUnicodeString');
Assert(UTF8StringToUnicodeString(S2) = W2, 'UTF8StringToUnicodeString');
Assert(UTF8StringToUnicodeString(S3) = W3, 'UTF8StringToUnicodeString');
Assert(UTF8StringLength(S1) = 4, 'UTF8StringLength');
Assert(UTF8StringLength(S2) = 3, 'UTF8StringLength');
Assert(UTF8StringLength(S3) = 3, 'UTF8StringLength');
end;
procedure Test_UTF16;
const
W1 : array[0..1] of WideChar = (#$D83D, #$DE00);
W2 : array[0..1] of WideChar = (#$D83D, #$0000);
var
Ch : UCS4Char;
Size : Integer;
D1 : array[0..1] of WideChar;
begin
UCS4CharToUTF16LE($00, @D1[0], 4, Size);
Assert(Size = 2);
Assert(D1[0] = #$0000);
UCS4CharToUTF16LE($41, @D1[0], 4, Size);
Assert(Size = 2);
Assert(D1[0] = #$0041);
UCS4CharToUTF16LE($1234, @D1[0], 4, Size);
Assert(Size = 2);
Assert(D1[0] = #$1234);
UCS4CharToUTF16LE($1F600, @D1[0], 4, Size);
Assert(Size = 4);
Assert(D1[0] = #$D83D);
Assert(D1[1] = #$DE00);
Assert(UTF16LEToUCS4Char(@W1[0], 4, Ch, Size));
Assert(Size = 4);
Assert(Ch = $1F600);
Assert(not UTF16LEToUCS4Char(@W1[0], 2, Ch, Size));
Assert(Size = 4);
Assert(Ch = $D83D);
Assert(not UTF16LEToUCS4Char(@W1[1], 2, Ch, Size));
Assert(Size = 2);
Assert(Ch = $DE00);
Assert(not UTF16LEToUCS4Char(@W2[0], 4, Ch, Size));
Assert(Size = 4);
Assert(Ch = $D83D);
Assert(UTF16LEToUCS4Char(@W2[1], 2, Ch, Size));
Assert(Size = 2);
Assert(Ch = $0000);
end;
procedure Test;
begin
Test_UTF8;
Test_UTF16;
end;
{$ENDIF}
end.