{******************************************************************************} { } { Library: Fundamentals 5.00 } { File name: flcUTF.pas } { File version: 5.05 } { Description: UTF encoding and decoing functions. } { } { Copyright: Copyright (c) 2015-2020, David J Butler } { All rights reserved. } { Redistribution and use in source and binary forms, with } { or without modification, are permitted provided that } { the following conditions are met: } { Redistributions of source code must retain the above } { copyright notice, this list of conditions and the } { following disclaimer. } { THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND } { CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED } { WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED } { WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A } { PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL } { THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, } { INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR } { CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, } { PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF } { USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) } { HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER } { IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING } { NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE } { USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE } { POSSIBILITY OF SUCH DAMAGE. } { } { Github: https://github.com/fundamentalslib } { E-mail: fundamentals.library at gmail.com } { } { Revision history: } { } { 2015/05/06 4.01 Add UTF functions from unit cUnicodeCodecs. } { 2017/10/07 5.02 Move to flcUTF unit. } { 2018/08/12 5.03 String type changes. } { 2019/10/03 5.04 UTF16LEToUCS4Char. } { 2020/03/12 5.05 NativeInt changes. } { Remove dependencies on flcUtils and flcAscii. } { } { Supported compilers: } { } { Delphi 2010-10.4 Win32/Win64 5.05 2020/06/02 } { Delphi 10.2-10.4 Linux64 5.05 2020/06/02 } { FreePascal 3.0.4 Win64 5.05 2020/06/02 } { } {******************************************************************************} {$INCLUDE ..\flcInclude.inc} {$IFDEF FREEPASCAL} {$WARNINGS OFF} {$HINTS OFF} {$ENDIF} {$IFDEF DEBUG} {$IFDEF TEST} {$DEFINE UTF_TEST} {$ENDIF} {$ENDIF} unit flcUTF; interface uses { Fundamentals } flcStdTypes; { } { UTF-8 character conversion functions } { } type TUTF8Error = ( UTF8ErrorNone, UTF8ErrorInvalidEncoding, UTF8ErrorIncompleteEncoding, UTF8ErrorInvalidBuffer, UTF8ErrorOutOfRange ); function UTF8ToUCS4Char( const Buf: Pointer; const Size: NativeInt; out SeqSize: Integer; out Ch: UCS4Char): TUTF8Error; function UTF8ToWideChar( const Buf: Pointer; const Size: NativeInt; out SeqSize: Integer; out Ch: WideChar): TUTF8Error; procedure UCS4CharToUTF8( const Ch: UCS4Char; const DestBuf: Pointer; const DestSize: NativeInt; out SeqSize: Integer); procedure WideCharToUTF8( const Ch: WideChar; const DestBuf: Pointer; const DestSize: NativeInt; out SeqSize: Integer); { } { UTF-16 character conversion functions } { } procedure UCS4CharToUTF16BE( const Ch: UCS4Char; const DestBuf: Pointer; const DestSize: NativeInt; out SeqSize: Integer); procedure UCS4CharToUTF16LE( const Ch: UCS4Char; const DestBuf: Pointer; const DestSize: NativeInt; out SeqSize: Integer); function UTF16LEToUCS4Char( const Buf: Pointer; const Size: NativeInt; out Ch: UCS4Char; out SeqSize: Integer): Boolean; { } { UTF-8 string functions } { } const UTF8BOMSize = 3; function DetectUTF8BOM(const Buf: Pointer; const Size: NativeInt): Boolean; function UTF8CharSize(const Buf: Pointer; const Size: NativeInt): Integer; function UTF8BufLength(const Buf: Pointer; const Size: NativeInt): NativeInt; function UTF8StringLength(const S: RawByteString): NativeInt; function UTF8StringToUnicodeString(const S: RawByteString): UnicodeString; function UTF8StringToUnicodeStringP(const S: Pointer; const Size: NativeInt): UnicodeString; function UTF8StringToLongString(const S: RawByteString): RawByteString; function UTF8StringToString(const S: RawByteString): String; function UCS4CharToUTF8CharSize(const Ch: UCS4Char): Integer; function WideBufToUTF8Size(const Buf: PWideChar; const Len: NativeInt): NativeInt; function UnicodeStringToUTF8Size(const S: UnicodeString): NativeInt; function WideBufToUTF8String(const Buf: PWideChar; const Len: NativeInt): RawByteString; function UnicodeStringToUTF8String(const S: UnicodeString): RawByteString; function RawByteBufToUTF8Size(const Buf: Pointer; const Len: NativeInt): NativeInt; function RawByteStringToUTF8Size(const S: RawByteString): NativeInt; function RawByteStringToUTF8String(const S: RawByteString): RawByteString; function UCS4CharToUTF8String(const Ch: UCS4Char): RawByteString; function ISO8859_1StringToUTF8String(const S: RawByteString): RawByteString; function StringToUTF8String(const S: String): RawByteString; { } { UTF-16 functions } { } const UTF16BOMSize = 2; function DetectUTF16BEBOM(const P: Pointer; const Size: NativeInt): Boolean; function DetectUTF16LEBOM(const P: Pointer; const Size: NativeInt): Boolean; function DetectUTF16BOM( const P: Pointer; const Size: NativeInt; out SwapEndian: Boolean): Boolean; function SwapUTF16Endian(const P: WideChar): WideChar; { } { Tests } { } {$IFDEF UTF_TEST} procedure Test; {$ENDIF} implementation uses { System } SysUtils; { } { UTF-8 character conversion functions } { } resourcestring SInvalidCodePoint = '$%x is not a valid %s code point'; SUTFStringConvertError = 'UTF string conversion error'; { UTF8ToUCS4Char returns UTF8ErrorNone if a valid UTF-8 sequence was decoded } { (and Ch contains the decoded UCS4 character and SeqSize contains the size } { of the UTF-8 sequence). If an incomplete UTF-8 sequence is encountered, the } { function returns UTF8ErrorIncompleteEncoding and SeqSize > Size. If an } { invalid UTF-8 sequence is encountered, the function returns } { UTF8ErrorInvalidEncoding and SeqSize (<= Size) is the size of the } { invalid sequence, and Ch may be the intended character. } function UTF8ToUCS4Char( const Buf: Pointer; const Size: NativeInt; out SeqSize: Integer; out Ch: UCS4Char): TUTF8Error; var C : Byte; D : Byte; V : Word32; I : Integer; Q : PByte; begin if not Assigned(Buf) or (Size <= 0) then begin SeqSize := 0; Ch := 0; Result := UTF8ErrorInvalidBuffer; exit; end; C := PByte(Buf)^; if C < $80 then begin SeqSize := 1; Ch := C; Result := UTF8ErrorNone; exit; end; // multi-byte characters always start with 11xxxxxx ($C0) // following bytes always start with 10xxxxxx ($80) if C and $C0 = $80 then begin SeqSize := 1; Ch := C; Result := UTF8ErrorInvalidEncoding; exit; end; if C and $20 = 0 then // 2-byte sequence begin SeqSize := 2; V := C and $1F; end else if C and $10 = 0 then // 3-byte sequence begin SeqSize := 3; V := C and $0F; end else if C and $08 = 0 then // 4-byte sequence (max needed for Unicode $0-$1FFFFF) begin SeqSize := 4; V := C and $07; end else begin SeqSize := 1; Ch := C; Result := UTF8ErrorInvalidEncoding; exit; end; if Size < SeqSize then // incomplete begin Ch := C; Result := UTF8ErrorIncompleteEncoding; exit; end; Q := Buf; for I := 1 to SeqSize - 1 do begin Inc(Q); D := Ord(Q^); if D and $C0 <> $80 then // following byte must start with 10xxxxxx begin SeqSize := 1; Ch := C; Result := UTF8ErrorInvalidEncoding; exit; end; V := (V shl 6) or (D and $3F); // decode 6 bits end; Ch := V; Result := UTF8ErrorNone; end; function UTF8ToWideChar( const Buf: Pointer; const Size: NativeInt; out SeqSize: Integer; out Ch: WideChar): TUTF8Error; var Ch4 : UCS4Char; begin Result := UTF8ToUCS4Char(Buf, Size, SeqSize, Ch4); if Ch4 > $FFFF then begin Result := UTF8ErrorOutOfRange; Ch := #$0000; end else Ch := WideChar(Ch4); end; { UCS4CharToUTF8 transforms the UCS4 char Ch to UTF-8 encoding. SeqSize } { returns the number of bytes needed to transform Ch. Up to DestSize } { bytes of the UTF-8 encoding will be placed in Dest. } procedure UCS4CharToUTF8( const Ch: UCS4Char; const DestBuf: Pointer; const DestSize: NativeInt; out SeqSize: Integer); var P : PByte; begin P := DestBuf; if Ch < $80 then // US-ASCII (1-byte sequence) begin SeqSize := 1; if not Assigned(P) or (DestSize <= 0) then exit; P^ := Byte(Ch); end else if Ch < $800 then // 2-byte sequence begin SeqSize := 2; if not Assigned(P) or (DestSize <= 0) then exit; P^ := $C0 or Byte(Ch shr 6); if DestSize = 1 then exit; Inc(P); P^ := $80 or (Ch and $3F); end else if Ch < $10000 then // 3-byte sequence begin SeqSize := 3; if not Assigned(P) or (DestSize <= 0) then exit; P^ := $E0 or Byte(Ch shr 12); if DestSize = 1 then exit; Inc(P); P^ := $80 or ((Ch shr 6) and $3F); if DestSize = 2 then exit; Inc(P); P^ := $80 or (Ch and $3F); end else if Ch < $200000 then // 4-byte sequence begin SeqSize := 4; if not Assigned(P) or (DestSize <= 0) then exit; P^ := $F0 or Byte(Ch shr 18); if DestSize = 1 then exit; Inc(P); P^ := $80 or ((Ch shr 12) and $3F); if DestSize = 2 then exit; Inc(P); P^ := $80 or ((Ch shr 6) and $3F); if DestSize = 3 then exit; Inc(P); P^ := $80 or (Ch and $3F); end else raise EConvertError.CreateFmt(SInvalidCodePoint, [Ord(Ch), 'Unicode']); end; procedure WideCharToUTF8( const Ch: WideChar; const DestBuf: Pointer; const DestSize: NativeInt; out SeqSize: Integer); begin UCS4CharToUTF8(Ord(Ch), DestBuf, DestSize, SeqSize); end; { } { UTF-16 character conversion functions } { } resourcestring SCannotConvertUCS4 = 'Cannot convert $%8.8X to %s'; { UCS4CharToUTF16BE transforms the UCS4 char Ch to UTF-16BE encoding. SeqSize } { returns the number of bytes needed to transform Ch. Up to DestSize } { bytes of the UTF-16BE encoding will be placed in Dest. } procedure UCS4CharToUTF16BE( const Ch: UCS4Char; const DestBuf: Pointer; const DestSize: NativeInt; out SeqSize: Integer); var P : PByte; HighSurrogate : Word16; LowSurrogate : Word16; begin P := DestBuf; case Ch of $00000000..$0000D7FF, $0000E000..$0000FFFF : begin SeqSize := 2; if not Assigned(P) or (DestSize <= 0) then exit; {$IFDEF FREEPASCAL} P^ := Byte((Ch and $FF00) shr 8); {$ELSE} P^ := Hi(Ch); {$ENDIF} if DestSize <= 1 then exit; Inc(P); {$IFDEF FREEPASCAL} P^ := Byte(Ch and $FF); {$ELSE} P^ := Lo(Ch); {$ENDIF} end; $0000D800..$0000DFFF : raise EConvertError.CreateFmt(SInvalidCodePoint, [Ch, 'UCS-4']); $00010000..$0010FFFF : begin SeqSize := 4; if not Assigned(P) or (DestSize <= 0) then exit; HighSurrogate := $D7C0 + (Ch shr 10); P^ := Hi(HighSurrogate); if DestSize <= 1 then exit; Inc(P); P^ := Lo(HighSurrogate); if DestSize <= 2 then exit; LowSurrogate := $DC00 xor (Ch and $3FF); Inc(P); P^ := Hi(LowSurrogate); if DestSize <= 3 then exit; Inc(P); P^ := Lo(LowSurrogate); end; else // out of UTF-16 range raise EConvertError.CreateFmt(SCannotConvertUCS4, [Ch, 'UTF-16BE']); end; end; { UCS4CharToUTF16LE transforms the UCS4 char Ch to UTF-16LE encoding. SeqSize } { returns the number of bytes needed to transform Ch. Up to DestSize } { bytes of the UTF-16LE encoding will be placed in Dest. } procedure UCS4CharToUTF16LE( const Ch: UCS4Char; const DestBuf: Pointer; const DestSize: NativeInt; out SeqSize: Integer); var P : PByte; HighSurrogate : Word16; LowSurrogate : Word16; begin P := DestBuf; case Ch of $00000000..$0000D7FF, $0000E000..$0000FFFF : begin SeqSize := 2; if not Assigned(P) or (DestSize <= 0) then exit; {$IFDEF FREEPASCAL} P^ := Byte(Ch and $FF); {$ELSE} P^ := Lo(Ch); {$ENDIF} if DestSize <= 1 then exit; Inc(P); {$IFDEF FREEPASCAL} P^ := Byte((Ch and $FF00) shr 8); {$ELSE} P^ := Hi(Ch); {$ENDIF} end; $0000D800..$0000DFFF : raise EConvertError.CreateFmt(SInvalidCodePoint, [Ch, 'UCS-4']); $00010000..$0010FFFF: begin SeqSize := 4; if not Assigned(P) or (DestSize <= 0) then exit; HighSurrogate := $D7C0 + (Ch shr 10); P^ := Lo(HighSurrogate); if DestSize <= 1 then exit; Inc(P); P^ := Hi(HighSurrogate); if DestSize <= 2 then exit; LowSurrogate := $DC00 xor (Ch and $3FF); Inc(P); P^ := Lo(LowSurrogate); if DestSize <= 3 then exit; Inc(P); P^ := Hi(LowSurrogate); end; else // out of UTF-16 range raise EConvertError.CreateFmt(SCannotConvertUCS4, [Ch, 'UTF-16LE']); end; end; // Returns True if valid encoding // If invalid, Returns False with Ch the invalid character and SeqSize the // size of a valid encoding function UTF16LEToUCS4Char( const Buf: Pointer; const Size: NativeInt; out Ch: UCS4Char; out SeqSize: Integer): Boolean; var ChP : PWideChar; C : Word16; LowSurrogate : Word16; begin if Size < 2 then begin // Too few bytes in Source Ch := 0; SeqSize := 2; Result := False; exit; end; ChP := Buf; C := Ord(ChP^); // UCS4Chars are stored in Little Endian mode case C of $D800..$DBFF: // High surrogate of Unicode character [$10000..$10FFFF] begin if Size < 4 then begin // Too few bytes in Source Ch := C; SeqSize := 4; Result := False; exit; end; Inc(ChP); LowSurrogate := Ord(ChP^); case LowSurrogate shr 8 of $DC..$DF : begin Ch := ((C - $D7C0) shl 10) + (((LowSurrogate shr 8) xor $DC) shl 8) + (LowSurrogate and $FF); SeqSize := 4; Result := True; end; else begin // Invalid encoding Ch := C; SeqSize := 4; Result := False; end; end; end; $DC00..$DFFF: // Low surrogate of Unicode character [$10000..$10FFFF] begin // Invalid encoding Ch := C; SeqSize := 2; Result := False; end else begin // 2 byte character Ch := C; SeqSize := 2; Result := True; end; end; end; { } { UTF-8 string functions } { } function DetectUTF8BOM(const Buf: Pointer; const Size: NativeInt): Boolean; var Q : PByte; begin Result := False; if Assigned(Buf) and (Size >= 3) and (PByte(Buf)^ = $EF) then begin Q := Buf; Inc(Q); if Q^ = $BB then begin Inc(Q); if Q^ = $BF then Result := True; end; end; end; function UTF8CharSize(const Buf: Pointer; const Size: NativeInt): Integer; var C : Byte; I : Integer; Q : PByte; begin if not Assigned(Buf) or (Size <= 0) then begin Result := 0; exit; end; C := PByte(Buf)^; if C < $80 then // 1-byte (US-ASCII value) Result := 1 else if C and $C0 = $80 then // invalid encoding Result := 1 else begin // multi-byte character if C and $20 = 0 then Result := 2 else if C and $10 = 0 then Result := 3 else if C and $08 = 0 then Result := 4 else begin Result := 1; // invalid encoding exit; end; if Size < Result then // incomplete encoding exit; Q := Buf; Inc(Q); for I := 1 to Result - 1 do if Ord(Q^) and $C0 <> $80 then begin Result := 1; // invalid encoding exit; end else Inc(Q); end; end; function UTF8BufLength(const Buf: Pointer; const Size: NativeInt): NativeInt; var Q : PByte; L : NativeInt; C : Integer; begin Q := Buf; L := Size; Result := 0; while L > 0 do begin C := UTF8CharSize(Q, L); Dec(L, C); Inc(Q, C); Inc(Result); end; end; function UTF8StringLength(const S: RawByteString): NativeInt; begin Result := UTF8BufLength(Pointer(S), Length(S)); end; function UCS4CharToUTF8CharSize(const Ch: UCS4Char): Integer; begin if Ch < $80 then Result := 1 else if Ch < $800 then Result := 2 else if Ch < $10000 then Result := 3 else if Ch < $200000 then Result := 4 else raise EConvertError.CreateFmt(SInvalidCodePoint, [Ord(Ch), 'Unicode']); end; function WideBufToUTF8Size(const Buf: PWideChar; const Len: NativeInt): NativeInt; var P : PWideChar; I : NativeInt; C : UCS4Char; begin P := Buf; Result := 0; for I := 1 to Len do begin C := UCS4Char(P^); Inc(Result); if C >= $80 then if C >= $800 then Inc(Result, 2) else Inc(Result); Inc(P); end; end; function RawByteBufToUTF8Size(const Buf: Pointer; const Len: NativeInt): NativeInt; var P : PByte; I : NativeInt; begin P := Buf; Result := 0; for I := 1 to Len do begin Inc(Result); if P^ >= $80 then Inc(Result); Inc(P); end; end; function UnicodeStringToUTF8Size(const S: UnicodeString): NativeInt; begin Result := WideBufToUTF8Size(Pointer(S), Length(S)); end; function RawByteStringToUTF8Size(const S: RawByteString): NativeInt; begin Result := RawByteBufToUTF8Size(Pointer(S), Length(S)); end; procedure RawByteBufToWideBuf( const Buf: Pointer; const BufSize: NativeInt; const DestBuf: Pointer); var I : NativeInt; P : PWord32; Q : PWord32; V : Word32; N : Integer; E : PByte; F : PWord16; begin if BufSize <= 0 then exit; P := Buf; Q := DestBuf; for I := 1 to BufSize div 4 do begin // convert 4 characters per iteration V := P^; Inc(P); Q^ := (V and $FF) or ((V and $FF00) shl 8); Inc(Q); V := V shr 16; Q^ := (V and $FF) or ((V and $FF00) shl 8); Inc(Q); end; // convert remaining (<4) N := BufSize mod 4; if N > 0 then begin E := Pointer(P); F := Pointer(Q); for I := 1 to N do begin F^ := E^; Inc(E); Inc(F); end; end; end; function RawByteStrPtrToUnicodeString(const S: Pointer; const Len: NativeInt): UnicodeString; {$IFDEF UseInline}inline;{$ENDIF} begin if Len <= 0 then Result := '' else begin SetLength(Result, Len); RawByteBufToWideBuf(S, Len, PWideChar(Result)); end; end; function IsAsciiBufB(const Buf: Pointer; const Len: NativeInt): Boolean; {$IFDEF UseInline}inline;{$ENDIF} var P : PByte; I : NativeInt; begin P := Buf; for I := 1 to Len do if P^ >= $80 then begin Result := False; exit; end else Inc(P); Result := True; end; function IsAsciiStringB(const S: RawByteString): Boolean; {$IFDEF UseInline}inline;{$ENDIF} begin Result := IsAsciiBufB(Pointer(S), Length(S)); end; function UTF8StringToUnicodeStringP(const S: Pointer; const Size: NativeInt): UnicodeString; var P : PByte; Q : PWideChar; L : NativeInt; M : NativeInt; I : Integer; C : WideChar; begin if Size = 0 then begin Result := ''; exit; end; if IsAsciiBufB(S, Size) then // optimize for US-ASCII strings begin Result := RawByteStrPtrToUnicodeString(S, Size); exit; end; // Decode UTF-8 L := Size; P := S; SetLength(Result, L); // maximum size Q := Pointer(Result); M := 0; repeat UTF8ToWideChar(P, Size, I, C); Assert(I > 0); Q^ := C; Inc(Q); Inc(M); Inc(P, I); Dec(L, I); until L <= 0; if M < Size then SetLength(Result, M); // actual size end; function UTF8StringToUnicodeString(const S: RawByteString): UnicodeString; begin Result := UTF8StringToUnicodeStringP(Pointer(S), Length(S)); end; function UTF8StringToLongString(const S: RawByteString): RawByteString; var N : NativeInt; L : NativeInt; P : PByte; Q : PByte; M : NativeInt; I : Integer; C : WideChar; begin N := Length(S); if N = 0 then begin Result := ''; exit; end; if IsAsciiStringB(S) then // optimize for US-ASCII strings begin Result := S; exit; end; // Decode UTF-8 L := N; P := Pointer(S); SetLength(Result, L); // maximum size Q := Pointer(Result); M := 0; repeat UTF8ToWideChar(P, L, I, C); Assert(I > 0, 'I > 0'); if Ord(C) > $FF then raise EConvertError.Create(SUTFStringConvertError); Q^ := Byte(Ord(C)); Inc(Q); Inc(M); Inc(P, I); Dec(L, I); until L <= 0; if M < N then SetLength(Result, M); // actual size end; function UTF8StringToString(const S: RawByteString): String; begin {$IFDEF StringIsUnicode} Result := UTF8StringToUnicodeString(S); {$ELSE} Result := S; {$ENDIF} end; function WideBufToRawByteString(const P: PWideChar; const Len: NativeInt): RawByteString; {$IFDEF UseInline}inline;{$ENDIF} var S : PWideChar; Q : PByte; I : NativeInt; V : WideChar; begin if Len <= 0 then begin Result := ''; exit; end; SetLength(Result, Len); S := P; Q := Pointer(Result); for I := 1 to Len do begin V := S^; if Ord(V) > $FF then raise EConvertError.Create(SUTFStringConvertError); Q^ := Byte(V); Inc(S); Inc(Q); end; end; function WideBufToUTF8String(const Buf: PWideChar; const Len: NativeInt): RawByteString; var N : NativeInt; P : PWideChar; Q : PByte; M : NativeInt; I : NativeInt; J : Integer; begin if Len = 0 then begin Result := ''; exit; end; N := WideBufToUTF8Size(Buf, Len); if N = Len then // optimize for US-ASCII strings begin Result := WideBufToRawByteString(Buf, Len); exit; end; SetLength(Result, N); P := Buf; Q := Pointer(Result); M := 0; for I := 1 to Len do begin UCS4CharToUTF8(UCS4Char(P^), Q, N, J); Inc(P); Inc(Q, J); Dec(N, J); Inc(M, J); end; if M < N then SetLength(Result, M); // actual size end; function RawByteStringToUTF8String(const S: RawByteString): RawByteString; var P : PByte; L : NativeInt; N : NativeInt; Q : PByte; M : NativeInt; I : NativeInt; J : Integer; begin P := Pointer(S); L := Length(S); if L = 0 then begin Result := ''; exit; end; N := RawByteBufToUTF8Size(P, L); if N = L then // optimize for US-ASCII strings begin Result := S; exit; end; SetLength(Result, N); Q := Pointer(Result); M := 0; for I := 1 to L do begin UCS4CharToUTF8(UCS4Char(Ord(P^)), Q, N, J); Inc(P); Inc(Q, J); Dec(N, J); Inc(M, J); end; if M < N then SetLength(Result, M); // actual size end; function UnicodeStringToUTF8String(const S: UnicodeString): RawByteString; begin Result := WideBufToUTF8String(Pointer(S), Length(S)); end; const MaxUTF8SequenceSize = 4; function UCS4CharToUTF8String(const Ch: UCS4Char): RawByteString; var Buf : array[0..MaxUTF8SequenceSize - 1] of Byte; Size : Integer; I : Integer; P, Q : PByte; begin Size := 0; UCS4CharToUTF8(Ch, @Buf, Sizeof(Buf), Size); SetLength(Result, Size); if Size > 0 then begin P := Pointer(Result); Q := @Buf; for I := 0 to Size - 1 do begin P^ := Q^; Inc(P); Inc(Q); end; end; end; function ISO8859_1StringToUTF8String(const S: RawByteString): RawByteString; var P, Q : PByte; L, I : NativeInt; M : NativeInt; J : Integer; begin L := Length(S); if L = 0 then begin Result := ''; exit; end; // Calculate size M := L; P := Pointer(S); for I := 1 to L do begin if Ord(P^) >= $80 then Inc(M); // 2 bytes required for #$80-#$FF Inc(P); end; // Check if conversion is required if M = L then begin // All characters are US-ASCII, return reference to same string Result := S; exit; end; // Convert SetLength(Result, M); Q := Pointer(Result); P := Pointer(S); for I := 1 to L do begin WideCharToUTF8(WideChar(P^), Q, M, J); Inc(P); Inc(Q, J); Dec(M, J); end; end; function StringToUTF8String(const S: String): RawByteString; begin {$IFDEF StringIsUnicode} Result := UnicodeStringToUTF8String(S); {$ELSE} Result := S; {$ENDIF} end; { } { UTF-16 functions } { } function DetectUTF16BEBOM(const P: Pointer; const Size: NativeInt): Boolean; begin Result := Assigned(P) and (Size >= Sizeof(WideChar)) and (PWideChar(P)^ = WideChar($FFFE)); end; function DetectUTF16LEBOM(const P: Pointer; const Size: NativeInt): Boolean; begin Result := Assigned(P) and (Size >= Sizeof(WideChar)) and (PWideChar(P)^ = WideChar($FEFF)); end; { DetectUTF16Encoding returns True if the encoding was confirmed to be UTF-16. } { SwapEndian is True if it was detected that the UTF-16 data is in reverse } { endian from that used by the cpu. } function DetectUTF16BOM( const P: Pointer; const Size: NativeInt; out SwapEndian: Boolean): Boolean; begin if not Assigned(P) or (Size < Sizeof(WideChar)) then begin SwapEndian := False; Result := False; end else if PWideChar(P)^ = WideChar($FEFF) then begin SwapEndian := False; Result := True; end else if PWideChar(P)^ = WideChar($FFFE) then begin SwapEndian := True; Result := True; end else begin SwapEndian := False; Result := False; end; end; function SwapUTF16Endian(const P: WideChar): WideChar; begin Result := WideChar(((Ord(P) and $FF) shl 8) or (Ord(P) shr 8)); end; { } { Tests } { } {$IFDEF UTF_TEST} {$ASSERTIONS ON} procedure Test_UTF8; const W1 : array[0..3] of WideChar = (#$0041, #$2262, #$0391, #$002E); W2 : array[0..2] of WideChar = (#$D55C, #$AD6D, #$C5B4); W3 : array[0..2] of WideChar = (#$65E5, #$672C, #$8A9E); S1 = RawByteString(#$41#$E2#$89#$A2#$CE#$91#$2E); S2 = RawByteString(#$ED#$95#$9C#$EA#$B5#$AD#$EC#$96#$B4); S3 = RawByteString(#$E6#$97#$A5#$E6#$9C#$AC#$E8#$AA#$9E); begin // UTF-8 test cases from RFC 2279 Assert(UnicodeStringToUTF8String(W1) = #$41#$E2#$89#$A2#$CE#$91#$2E, 'UnicodeStringToUTF8String'); Assert(UnicodeStringToUTF8String(W2) = #$ED#$95#$9C#$EA#$B5#$AD#$EC#$96#$B4, 'UnicodeStringToUTF8String'); Assert(UnicodeStringToUTF8String(W3) = #$E6#$97#$A5#$E6#$9C#$AC#$E8#$AA#$9E, 'UnicodeStringToUTF8String'); Assert(UTF8StringToUnicodeString(S1) = W1, 'UTF8StringToUnicodeString'); Assert(UTF8StringToUnicodeString(S2) = W2, 'UTF8StringToUnicodeString'); Assert(UTF8StringToUnicodeString(S3) = W3, 'UTF8StringToUnicodeString'); Assert(UTF8StringLength(S1) = 4, 'UTF8StringLength'); Assert(UTF8StringLength(S2) = 3, 'UTF8StringLength'); Assert(UTF8StringLength(S3) = 3, 'UTF8StringLength'); end; procedure Test_UTF16; const W1 : array[0..1] of WideChar = (#$D83D, #$DE00); W2 : array[0..1] of WideChar = (#$D83D, #$0000); var Ch : UCS4Char; Size : Integer; D1 : array[0..1] of WideChar; begin UCS4CharToUTF16LE($00, @D1[0], 4, Size); Assert(Size = 2); Assert(D1[0] = #$0000); UCS4CharToUTF16LE($41, @D1[0], 4, Size); Assert(Size = 2); Assert(D1[0] = #$0041); UCS4CharToUTF16LE($1234, @D1[0], 4, Size); Assert(Size = 2); Assert(D1[0] = #$1234); UCS4CharToUTF16LE($1F600, @D1[0], 4, Size); Assert(Size = 4); Assert(D1[0] = #$D83D); Assert(D1[1] = #$DE00); Assert(UTF16LEToUCS4Char(@W1[0], 4, Ch, Size)); Assert(Size = 4); Assert(Ch = $1F600); Assert(not UTF16LEToUCS4Char(@W1[0], 2, Ch, Size)); Assert(Size = 4); Assert(Ch = $D83D); Assert(not UTF16LEToUCS4Char(@W1[1], 2, Ch, Size)); Assert(Size = 2); Assert(Ch = $DE00); Assert(not UTF16LEToUCS4Char(@W2[0], 4, Ch, Size)); Assert(Size = 4); Assert(Ch = $D83D); Assert(UTF16LEToUCS4Char(@W2[1], 2, Ch, Size)); Assert(Size = 2); Assert(Ch = $0000); end; procedure Test; begin Test_UTF8; Test_UTF16; end; {$ENDIF} end.