{******************************************************************************}
{                                                                              }
{   Library:          Fundamentals 5.00                                        }
{   File name:        flcUTF.pas                                               }
{   File version:     5.05                                                     }
{   Description:      UTF encoding and decoing functions.                      }
{                                                                              }
{   Copyright:        Copyright (c) 2015-2020, David J Butler                  }
{                     All rights reserved.                                     }
{                     Redistribution and use in source and binary forms, with  }
{                     or without modification, are permitted provided that     }
{                     the following conditions are met:                        }
{                     Redistributions of source code must retain the above     }
{                     copyright notice, this list of conditions and the        }
{                     following disclaimer.                                    }
{                     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND   }
{                     CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED          }
{                     WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED   }
{                     WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A          }
{                     PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL     }
{                     THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,    }
{                     INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR             }
{                     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,    }
{                     PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF     }
{                     USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)         }
{                     HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER   }
{                     IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING        }
{                     NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE   }
{                     USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             }
{                     POSSIBILITY OF SUCH DAMAGE.                              }
{                                                                              }
{   Github:           https://github.com/fundamentalslib                       }
{   E-mail:           fundamentals.library at gmail.com                        }
{                                                                              }
{ Revision history:                                                            }
{                                                                              }
{   2015/05/06  4.01  Add UTF functions from unit cUnicodeCodecs.              }
{   2017/10/07  5.02  Move to flcUTF unit.                                     }
{   2018/08/12  5.03  String type changes.                                     }
{   2019/10/03  5.04  UTF16LEToUCS4Char.                                       }
{   2020/03/12  5.05  NativeInt changes.                                       }
{                     Remove dependencies on flcUtils and flcAscii.            }
{                                                                              }
{ Supported compilers:                                                         }
{                                                                              }
{   Delphi 2010-10.4 Win32/Win64        5.05  2020/06/02                       }
{   Delphi 10.2-10.4 Linux64            5.05  2020/06/02                       }
{   FreePascal 3.0.4 Win64              5.05  2020/06/02                       }
{                                                                              }
{******************************************************************************}

{$INCLUDE ..\flcInclude.inc}

{$IFDEF FREEPASCAL}
  {$WARNINGS OFF}
  {$HINTS OFF}
{$ENDIF}

{$IFDEF DEBUG}
{$IFDEF TEST}
  {$DEFINE UTF_TEST}
{$ENDIF}
{$ENDIF}

unit flcUTF;

interface

uses
  { Fundamentals }
  flcStdTypes;


{                                                                              }
{ UTF-8 character conversion functions                                         }
{                                                                              }
type
  TUTF8Error = (
      UTF8ErrorNone,
      UTF8ErrorInvalidEncoding,
      UTF8ErrorIncompleteEncoding,
      UTF8ErrorInvalidBuffer,
      UTF8ErrorOutOfRange
    );

function  UTF8ToUCS4Char(
          const Buf: Pointer;
          const Size: NativeInt;
          out SeqSize: Integer;
          out Ch: UCS4Char): TUTF8Error;

function  UTF8ToWideChar(
          const Buf: Pointer;
          const Size: NativeInt;
          out SeqSize: Integer;
          out Ch: WideChar): TUTF8Error;

procedure UCS4CharToUTF8(
          const Ch: UCS4Char;
          const DestBuf: Pointer;
          const DestSize: NativeInt;
          out SeqSize: Integer);

procedure WideCharToUTF8(
          const Ch: WideChar;
          const DestBuf: Pointer;
          const DestSize: NativeInt;
          out SeqSize: Integer);


{                                                                              }
{ UTF-16 character conversion functions                                        }
{                                                                              }
procedure UCS4CharToUTF16BE(
          const Ch: UCS4Char;
          const DestBuf: Pointer;
          const DestSize: NativeInt;
          out SeqSize: Integer);

procedure UCS4CharToUTF16LE(
          const Ch: UCS4Char;
          const DestBuf: Pointer;
          const DestSize: NativeInt;
          out SeqSize: Integer);

function  UTF16LEToUCS4Char(
          const Buf: Pointer;
          const Size: NativeInt;
          out Ch: UCS4Char;
          out SeqSize: Integer): Boolean;


{                                                                              }
{ UTF-8 string functions                                                       }
{                                                                              }
const
  UTF8BOMSize = 3;

function  DetectUTF8BOM(const Buf: Pointer; const Size: NativeInt): Boolean;

function  UTF8CharSize(const Buf: Pointer; const Size: NativeInt): Integer;
function  UTF8BufLength(const Buf: Pointer; const Size: NativeInt): NativeInt;
function  UTF8StringLength(const S: RawByteString): NativeInt;
function  UTF8StringToUnicodeString(const S: RawByteString): UnicodeString;
function  UTF8StringToUnicodeStringP(const S: Pointer; const Size: NativeInt): UnicodeString;
function  UTF8StringToLongString(const S: RawByteString): RawByteString;
function  UTF8StringToString(const S: RawByteString): String;

function  UCS4CharToUTF8CharSize(const Ch: UCS4Char): Integer;
function  WideBufToUTF8Size(const Buf: PWideChar; const Len: NativeInt): NativeInt;
function  UnicodeStringToUTF8Size(const S: UnicodeString): NativeInt;
function  WideBufToUTF8String(const Buf: PWideChar; const Len: NativeInt): RawByteString;
function  UnicodeStringToUTF8String(const S: UnicodeString): RawByteString;
function  RawByteBufToUTF8Size(const Buf: Pointer; const Len: NativeInt): NativeInt;
function  RawByteStringToUTF8Size(const S: RawByteString): NativeInt;
function  RawByteStringToUTF8String(const S: RawByteString): RawByteString;
function  UCS4CharToUTF8String(const Ch: UCS4Char): RawByteString;
function  ISO8859_1StringToUTF8String(const S: RawByteString): RawByteString;
function  StringToUTF8String(const S: String): RawByteString;


{                                                                              }
{ UTF-16 functions                                                             }
{                                                                              }
const
  UTF16BOMSize = 2;

function  DetectUTF16BEBOM(const P: Pointer; const Size: NativeInt): Boolean;
function  DetectUTF16LEBOM(const P: Pointer; const Size: NativeInt): Boolean;
function  DetectUTF16BOM(
          const P: Pointer;
          const Size: NativeInt;
          out SwapEndian: Boolean): Boolean;
function  SwapUTF16Endian(const P: WideChar): WideChar;


{                                                                              }
{ Tests                                                                        }
{                                                                              }
{$IFDEF UTF_TEST}
procedure Test;
{$ENDIF}


implementation

uses
  { System }
  SysUtils;


{                                                                              }
{ UTF-8 character conversion functions                                         }
{                                                                              }

resourcestring
  SInvalidCodePoint = '$%x is not a valid %s code point';
  SUTFStringConvertError = 'UTF string conversion error';


{ UTF8ToUCS4Char returns UTF8ErrorNone if a valid UTF-8 sequence was decoded   }
{ (and Ch contains the decoded UCS4 character and SeqSize contains the size    }
{ of the UTF-8 sequence). If an incomplete UTF-8 sequence is encountered, the  }
{ function returns UTF8ErrorIncompleteEncoding and SeqSize > Size. If an       }
{ invalid UTF-8 sequence is encountered, the function returns                  }
{ UTF8ErrorInvalidEncoding and SeqSize (<= Size) is the size of the            }
{ invalid sequence, and Ch may be the intended character.                      }
function UTF8ToUCS4Char(
         const Buf: Pointer;
         const Size: NativeInt;
         out SeqSize: Integer;
         out Ch: UCS4Char): TUTF8Error;
var
  C : Byte;
  D : Byte;
  V : Word32;
  I : Integer;
  Q : PByte;
begin
  if not Assigned(Buf) or (Size <= 0) then
    begin
      SeqSize := 0;
      Ch := 0;
      Result := UTF8ErrorInvalidBuffer;
      exit;
    end;
  C := PByte(Buf)^;
  if C < $80 then
    begin
      SeqSize := 1;
      Ch := C;
      Result := UTF8ErrorNone;
      exit;
    end;
  // multi-byte characters always start with 11xxxxxx ($C0)
  // following bytes always start with 10xxxxxx ($80)
  if C and $C0 = $80 then
    begin
      SeqSize := 1;
      Ch := C;
      Result := UTF8ErrorInvalidEncoding;
      exit;
    end;
  if C and $20 = 0 then // 2-byte sequence
    begin
      SeqSize := 2;
      V := C and $1F;
    end
  else
  if C and $10 = 0 then // 3-byte sequence
    begin
      SeqSize := 3;
      V := C and $0F;
    end
  else
  if C and $08 = 0 then // 4-byte sequence (max needed for Unicode $0-$1FFFFF)
    begin
      SeqSize := 4;
      V := C and $07;
    end
  else
    begin
      SeqSize := 1;
      Ch := C;
      Result := UTF8ErrorInvalidEncoding;
      exit;
    end;
  if Size < SeqSize then // incomplete
    begin
      Ch := C;
      Result := UTF8ErrorIncompleteEncoding;
      exit;
    end;
  Q := Buf;
  for I := 1 to SeqSize - 1 do
    begin
      Inc(Q);
      D := Ord(Q^);
      if D and $C0 <> $80 then // following byte must start with 10xxxxxx
        begin
          SeqSize := 1;
          Ch := C;
          Result := UTF8ErrorInvalidEncoding;
          exit;
        end;
      V := (V shl 6) or (D and $3F); // decode 6 bits
    end;
  Ch := V;
  Result := UTF8ErrorNone;
end;

function UTF8ToWideChar(
         const Buf: Pointer;
         const Size: NativeInt;
         out SeqSize: Integer;
         out Ch: WideChar): TUTF8Error;
var Ch4 : UCS4Char;
begin
  Result := UTF8ToUCS4Char(Buf, Size, SeqSize, Ch4);
  if Ch4 > $FFFF then
    begin
      Result := UTF8ErrorOutOfRange;
      Ch := #$0000;
    end
  else
    Ch := WideChar(Ch4);
end;

{ UCS4CharToUTF8 transforms the UCS4 char Ch to UTF-8 encoding. SeqSize        }
{ returns the number of bytes needed to transform Ch. Up to DestSize           }
{ bytes of the UTF-8 encoding will be placed in Dest.                          }
procedure UCS4CharToUTF8(
          const Ch: UCS4Char;
          const DestBuf: Pointer;
          const DestSize: NativeInt;
          out SeqSize: Integer);
var P : PByte;
begin
  P := DestBuf;
  if Ch < $80 then // US-ASCII (1-byte sequence)
    begin
      SeqSize := 1;
      if not Assigned(P) or (DestSize <= 0) then
        exit;
      P^ := Byte(Ch);
    end else
  if Ch < $800 then // 2-byte sequence
    begin
      SeqSize := 2;
      if not Assigned(P) or (DestSize <= 0) then
        exit;
      P^ := $C0 or Byte(Ch shr 6);
      if DestSize = 1 then
        exit;
      Inc(P);
      P^ := $80 or (Ch and $3F);
    end else
  if Ch < $10000 then // 3-byte sequence
    begin
      SeqSize := 3;
      if not Assigned(P) or (DestSize <= 0) then
        exit;
      P^ := $E0 or Byte(Ch shr 12);
      if DestSize = 1 then
        exit;
      Inc(P);
      P^ := $80 or ((Ch shr 6) and $3F);
      if DestSize = 2 then
        exit;
      Inc(P);
      P^ := $80 or (Ch and $3F);
    end else
  if Ch < $200000 then // 4-byte sequence
    begin
      SeqSize := 4;
      if not Assigned(P) or (DestSize <= 0) then
        exit;
      P^ := $F0 or Byte(Ch shr 18);
      if DestSize = 1 then
        exit;
      Inc(P);
      P^ := $80 or ((Ch shr 12) and $3F);
      if DestSize = 2 then
        exit;
      Inc(P);
      P^ := $80 or ((Ch shr 6) and $3F);
      if DestSize = 3 then
        exit;
      Inc(P);
      P^ := $80 or (Ch and $3F);
    end
  else
    raise EConvertError.CreateFmt(SInvalidCodePoint, [Ord(Ch), 'Unicode']);
end;

procedure WideCharToUTF8(
          const Ch: WideChar;
          const DestBuf: Pointer;
          const DestSize: NativeInt;
          out SeqSize: Integer);
begin
  UCS4CharToUTF8(Ord(Ch), DestBuf, DestSize, SeqSize);
end;


{                                                                              }
{ UTF-16 character conversion functions                                        }
{                                                                              }

resourcestring
  SCannotConvertUCS4 = 'Cannot convert $%8.8X to %s';

{ UCS4CharToUTF16BE transforms the UCS4 char Ch to UTF-16BE encoding. SeqSize  }
{ returns the number of bytes needed to transform Ch. Up to DestSize           }
{ bytes of the UTF-16BE encoding will be placed in Dest.                       }
procedure UCS4CharToUTF16BE(
          const Ch: UCS4Char;
          const DestBuf: Pointer;
          const DestSize: NativeInt;
          out SeqSize: Integer);
var
  P : PByte;
  HighSurrogate : Word16;
  LowSurrogate : Word16;
begin
  P := DestBuf;
  case Ch of
    $00000000..$0000D7FF, $0000E000..$0000FFFF :
      begin
        SeqSize := 2;
        if not Assigned(P) or (DestSize <= 0) then
          exit;
        {$IFDEF FREEPASCAL}
        P^ := Byte((Ch and $FF00) shr 8);
        {$ELSE}
        P^ := Hi(Ch);
        {$ENDIF}
        if DestSize <= 1 then
          exit;
        Inc(P);
        {$IFDEF FREEPASCAL}
        P^ := Byte(Ch and $FF);
        {$ELSE}
        P^ := Lo(Ch);
        {$ENDIF}
      end;
    $0000D800..$0000DFFF :
      raise EConvertError.CreateFmt(SInvalidCodePoint, [Ch, 'UCS-4']);
    $00010000..$0010FFFF :
      begin
        SeqSize := 4;
        if not Assigned(P) or (DestSize <= 0) then
          exit;
        HighSurrogate := $D7C0 + (Ch shr 10);
        P^ := Hi(HighSurrogate);
        if DestSize <= 1 then
          exit;
        Inc(P);
        P^ := Lo(HighSurrogate);
        if DestSize <= 2 then
          exit;
        LowSurrogate := $DC00 xor (Ch and $3FF);
        Inc(P);
        P^ := Hi(LowSurrogate);
        if DestSize <= 3 then
          exit;
        Inc(P);
        P^ := Lo(LowSurrogate);
      end;
  else // out of UTF-16 range
    raise EConvertError.CreateFmt(SCannotConvertUCS4, [Ch, 'UTF-16BE']);
  end;
end;

{ UCS4CharToUTF16LE transforms the UCS4 char Ch to UTF-16LE encoding. SeqSize  }
{ returns the number of bytes needed to transform Ch. Up to DestSize           }
{ bytes of the UTF-16LE encoding will be placed in Dest.                       }
procedure UCS4CharToUTF16LE(
          const Ch: UCS4Char;
          const DestBuf: Pointer;
          const DestSize: NativeInt;
          out SeqSize: Integer);
var
  P : PByte;
  HighSurrogate : Word16;
  LowSurrogate : Word16;
begin
  P := DestBuf;
  case Ch of
    $00000000..$0000D7FF, $0000E000..$0000FFFF :
      begin
        SeqSize := 2;
        if not Assigned(P) or (DestSize <= 0) then
          exit;
        {$IFDEF FREEPASCAL}
        P^ := Byte(Ch and $FF);
        {$ELSE}
        P^ := Lo(Ch);
        {$ENDIF}
        if DestSize <= 1 then
          exit;
        Inc(P);
        {$IFDEF FREEPASCAL}
        P^ := Byte((Ch and $FF00) shr 8);
        {$ELSE}
        P^ := Hi(Ch);
        {$ENDIF}
      end;
    $0000D800..$0000DFFF :
      raise EConvertError.CreateFmt(SInvalidCodePoint, [Ch, 'UCS-4']);
    $00010000..$0010FFFF:
      begin
        SeqSize := 4;
        if not Assigned(P) or (DestSize <= 0) then
          exit;
        HighSurrogate := $D7C0 + (Ch shr 10);
        P^ := Lo(HighSurrogate);
        if DestSize <= 1 then
          exit;
        Inc(P);
        P^ := Hi(HighSurrogate);
        if DestSize <= 2 then
          exit;
        LowSurrogate := $DC00 xor (Ch and $3FF);
        Inc(P);
        P^ := Lo(LowSurrogate);
        if DestSize <= 3 then
          exit;
        Inc(P);
        P^ := Hi(LowSurrogate);
      end;
  else // out of UTF-16 range
    raise EConvertError.CreateFmt(SCannotConvertUCS4, [Ch, 'UTF-16LE']);
  end;
end;

// Returns True if valid encoding
// If invalid, Returns False with Ch the invalid character and SeqSize the
// size of a valid encoding
function UTF16LEToUCS4Char(
         const Buf: Pointer;
         const Size: NativeInt;
         out Ch: UCS4Char;
         out SeqSize: Integer): Boolean;
var
  ChP : PWideChar;
  C : Word16;
  LowSurrogate : Word16;
begin
  if Size < 2 then
    begin
      // Too few bytes in Source
      Ch := 0;
      SeqSize := 2;
      Result := False;
      exit;
    end;
  ChP := Buf;
  C := Ord(ChP^); // UCS4Chars are stored in Little Endian mode
  case C of
    $D800..$DBFF: // High surrogate of Unicode character [$10000..$10FFFF]
      begin
        if Size < 4 then
          begin
            // Too few bytes in Source
            Ch := C;
            SeqSize := 4;
            Result := False;
            exit;
          end;
        Inc(ChP);
        LowSurrogate := Ord(ChP^);
        case LowSurrogate shr 8 of
          $DC..$DF :
            begin
              Ch := ((C - $D7C0) shl 10) + (((LowSurrogate shr 8) xor $DC) shl 8) + (LowSurrogate and $FF);
              SeqSize := 4;
              Result := True;
            end;
          else
            begin
              // Invalid encoding
              Ch := C;
              SeqSize := 4;
              Result := False;
            end;
        end;
      end;
    $DC00..$DFFF: // Low surrogate of Unicode character [$10000..$10FFFF]
      begin
        // Invalid encoding
        Ch := C;
        SeqSize := 2;
        Result := False;
      end
    else
      begin
        // 2 byte character
        Ch := C;
        SeqSize := 2;
        Result := True;
      end;
  end;
end;


{                                                                              }
{ UTF-8 string functions                                                       }
{                                                                              }
function DetectUTF8BOM(const Buf: Pointer; const Size: NativeInt): Boolean;
var
  Q : PByte;
begin
  Result := False;
  if Assigned(Buf) and (Size >= 3) and (PByte(Buf)^ = $EF) then
    begin
      Q := Buf;
      Inc(Q);
      if Q^ = $BB then
        begin
          Inc(Q);
          if Q^ = $BF then
            Result := True;
        end;
    end;
end;

function UTF8CharSize(const Buf: Pointer; const Size: NativeInt): Integer;
var
  C : Byte;
  I : Integer;
  Q : PByte;
begin
  if not Assigned(Buf) or (Size <= 0) then
    begin
      Result := 0;
      exit;
    end;
  C := PByte(Buf)^;
  if C < $80 then // 1-byte (US-ASCII value)
    Result := 1
  else
  if C and $C0 = $80 then // invalid encoding
    Result := 1
  else
    begin
      // multi-byte character
      if C and $20 = 0 then
        Result := 2
      else
      if C and $10 = 0 then
        Result := 3
      else
      if C and $08 = 0 then
        Result := 4
      else
        begin
          Result := 1; // invalid encoding
          exit;
        end;
      if Size < Result then // incomplete encoding
        exit;
      Q := Buf;
      Inc(Q);
      for I := 1 to Result - 1 do
        if Ord(Q^) and $C0 <> $80 then
          begin
            Result := 1; // invalid encoding
            exit;
          end
        else
          Inc(Q);
    end;
end;

function UTF8BufLength(const Buf: Pointer; const Size: NativeInt): NativeInt;
var
  Q : PByte;
  L : NativeInt;
  C : Integer;
begin
  Q := Buf;
  L := Size;
  Result := 0;
  while L > 0 do
    begin
      C := UTF8CharSize(Q, L);
      Dec(L, C);
      Inc(Q, C);
      Inc(Result);
    end;
end;

function UTF8StringLength(const S: RawByteString): NativeInt;
begin
  Result := UTF8BufLength(Pointer(S), Length(S));
end;

function UCS4CharToUTF8CharSize(const Ch: UCS4Char): Integer;
begin
  if Ch < $80 then
    Result := 1 else
  if Ch < $800 then
    Result := 2 else
  if Ch < $10000 then
    Result := 3 else
  if Ch < $200000 then
    Result := 4
  else
    raise EConvertError.CreateFmt(SInvalidCodePoint, [Ord(Ch), 'Unicode']);
end;

function WideBufToUTF8Size(const Buf: PWideChar; const Len: NativeInt): NativeInt;
var
  P : PWideChar;
  I : NativeInt;
  C : UCS4Char;
begin
  P := Buf;
  Result := 0;
  for I := 1 to Len do
    begin
      C := UCS4Char(P^);
      Inc(Result);
      if C >= $80 then
        if C >= $800 then
          Inc(Result, 2) else
          Inc(Result);
      Inc(P);
    end;
end;

function RawByteBufToUTF8Size(const Buf: Pointer; const Len: NativeInt): NativeInt;
var
  P : PByte;
  I : NativeInt;
begin
  P := Buf;
  Result := 0;
  for I := 1 to Len do
    begin
      Inc(Result);
      if P^ >= $80 then
        Inc(Result);
      Inc(P);
    end;
end;

function UnicodeStringToUTF8Size(const S: UnicodeString): NativeInt;
begin
  Result := WideBufToUTF8Size(Pointer(S), Length(S));
end;

function RawByteStringToUTF8Size(const S: RawByteString): NativeInt;
begin
  Result := RawByteBufToUTF8Size(Pointer(S), Length(S));
end;

procedure RawByteBufToWideBuf(
          const Buf: Pointer;
          const BufSize: NativeInt;
          const DestBuf: Pointer);
var
  I : NativeInt;
  P : PWord32;
  Q : PWord32;
  V : Word32;
  N : Integer;
  E : PByte;
  F : PWord16;
begin
  if BufSize <= 0 then
    exit;
  P := Buf;
  Q := DestBuf;
  for I := 1 to BufSize div 4 do
    begin
      // convert 4 characters per iteration
      V := P^;
      Inc(P);
      Q^ := (V and $FF) or ((V and $FF00) shl 8);
      Inc(Q);
      V := V shr 16;
      Q^ := (V and $FF) or ((V and $FF00) shl 8);
      Inc(Q);
    end;
  // convert remaining (<4)
  N := BufSize mod 4;
  if N > 0 then
    begin
      E := Pointer(P);
      F := Pointer(Q);
      for I := 1 to N do
        begin
          F^ := E^;
          Inc(E);
          Inc(F);
        end;
    end;
end;

function RawByteStrPtrToUnicodeString(const S: Pointer; const Len: NativeInt): UnicodeString; {$IFDEF UseInline}inline;{$ENDIF}
begin
  if Len <= 0 then
    Result := ''
  else
    begin
      SetLength(Result, Len);
      RawByteBufToWideBuf(S, Len, PWideChar(Result));
    end;
end;

function IsAsciiBufB(const Buf: Pointer; const Len: NativeInt): Boolean; {$IFDEF UseInline}inline;{$ENDIF}
var
  P : PByte;
  I : NativeInt;
begin
  P := Buf;
  for I := 1 to Len do
    if P^ >= $80 then
      begin
        Result := False;
        exit;
      end
    else
      Inc(P);
  Result := True;
end;

function IsAsciiStringB(const S: RawByteString): Boolean; {$IFDEF UseInline}inline;{$ENDIF}
begin
  Result := IsAsciiBufB(Pointer(S), Length(S));
end;

function UTF8StringToUnicodeStringP(const S: Pointer; const Size: NativeInt): UnicodeString;
var
  P : PByte;
  Q : PWideChar;
  L : NativeInt;
  M : NativeInt;
  I : Integer;
  C : WideChar;
begin
  if Size = 0 then
    begin
      Result := '';
      exit;
    end;
  if IsAsciiBufB(S, Size) then // optimize for US-ASCII strings
    begin
      Result := RawByteStrPtrToUnicodeString(S, Size);
      exit;
    end;
  // Decode UTF-8
  L := Size;
  P := S;
  SetLength(Result, L); // maximum size
  Q := Pointer(Result);
  M := 0;
  repeat
    UTF8ToWideChar(P, Size, I, C);
    Assert(I > 0);
    Q^ := C;
    Inc(Q);
    Inc(M);
    Inc(P, I);
    Dec(L, I);
  until L <= 0;
  if M < Size then
    SetLength(Result, M); // actual size
end;

function UTF8StringToUnicodeString(const S: RawByteString): UnicodeString;
begin
  Result := UTF8StringToUnicodeStringP(Pointer(S), Length(S));
end;

function UTF8StringToLongString(const S: RawByteString): RawByteString;
var
  N : NativeInt;
  L : NativeInt;
  P : PByte;
  Q : PByte;
  M : NativeInt;
  I : Integer;
  C : WideChar;
begin
  N := Length(S);
  if N = 0 then
    begin
      Result := '';
      exit;
    end;
  if IsAsciiStringB(S) then // optimize for US-ASCII strings
    begin
      Result := S;
      exit;
    end;
  // Decode UTF-8
  L := N;
  P := Pointer(S);
  SetLength(Result, L); // maximum size
  Q := Pointer(Result);
  M := 0;
  repeat
    UTF8ToWideChar(P, L, I, C);
    Assert(I > 0, 'I > 0');
    if Ord(C) > $FF then
      raise EConvertError.Create(SUTFStringConvertError);
    Q^ := Byte(Ord(C));
    Inc(Q);
    Inc(M);
    Inc(P, I);
    Dec(L, I);
  until L <= 0;
  if M < N then
    SetLength(Result, M); // actual size
end;

function UTF8StringToString(const S: RawByteString): String;
begin
  {$IFDEF StringIsUnicode}
  Result := UTF8StringToUnicodeString(S);
  {$ELSE}
  Result := S;
  {$ENDIF}
end;

function WideBufToRawByteString(const P: PWideChar; const Len: NativeInt): RawByteString; {$IFDEF UseInline}inline;{$ENDIF}
var
  S : PWideChar;
  Q : PByte;
  I : NativeInt;
  V : WideChar;
begin
  if Len <= 0 then
    begin
      Result := '';
      exit;
    end;
  SetLength(Result, Len);
  S := P;
  Q := Pointer(Result);
  for I := 1 to Len do
    begin
      V := S^;
      if Ord(V) > $FF then
        raise EConvertError.Create(SUTFStringConvertError);
      Q^ := Byte(V);
      Inc(S);
      Inc(Q);
    end;
end;

function WideBufToUTF8String(const Buf: PWideChar; const Len: NativeInt): RawByteString;
var
  N : NativeInt;
  P : PWideChar;
  Q : PByte;
  M : NativeInt;
  I : NativeInt;
  J : Integer;
begin
  if Len = 0 then
    begin
      Result := '';
      exit;
    end;
  N := WideBufToUTF8Size(Buf, Len);
  if N = Len then // optimize for US-ASCII strings
    begin
      Result := WideBufToRawByteString(Buf, Len);
      exit;
    end;
  SetLength(Result, N);
  P := Buf;
  Q := Pointer(Result);
  M := 0;
  for I := 1 to Len do
    begin
      UCS4CharToUTF8(UCS4Char(P^), Q, N, J);
      Inc(P);
      Inc(Q, J);
      Dec(N, J);
      Inc(M, J);
    end;
  if M < N then
    SetLength(Result, M); // actual size
end;

function RawByteStringToUTF8String(const S: RawByteString): RawByteString;
var
  P       : PByte;
  L       : NativeInt;
  N       : NativeInt;
  Q       : PByte;
  M       : NativeInt;
  I       : NativeInt;
  J       : Integer;
begin
  P := Pointer(S);
  L := Length(S);
  if L = 0 then
    begin
      Result := '';
      exit;
    end;
  N := RawByteBufToUTF8Size(P, L);
  if N = L then // optimize for US-ASCII strings
    begin
      Result := S;
      exit;
    end;
  SetLength(Result, N);
  Q := Pointer(Result);
  M := 0;
  for I := 1 to L do
    begin
      UCS4CharToUTF8(UCS4Char(Ord(P^)), Q, N, J);
      Inc(P);
      Inc(Q, J);
      Dec(N, J);
      Inc(M, J);
    end;
  if M < N then
    SetLength(Result, M); // actual size
end;

function UnicodeStringToUTF8String(const S: UnicodeString): RawByteString;
begin
  Result := WideBufToUTF8String(Pointer(S), Length(S));
end;

const
  MaxUTF8SequenceSize = 4;

function UCS4CharToUTF8String(const Ch: UCS4Char): RawByteString;
var
  Buf  : array[0..MaxUTF8SequenceSize - 1] of Byte;
  Size : Integer;
  I    : Integer;
  P, Q : PByte;
begin
  Size := 0;
  UCS4CharToUTF8(Ch, @Buf, Sizeof(Buf), Size);
  SetLength(Result, Size);
  if Size > 0 then
    begin
      P := Pointer(Result);
      Q := @Buf;
      for I := 0 to Size - 1 do
        begin
          P^ := Q^;
          Inc(P);
          Inc(Q);
        end;
    end;
end;

function ISO8859_1StringToUTF8String(const S: RawByteString): RawByteString;
var
  P, Q : PByte;
  L, I : NativeInt;
  M    : NativeInt;
  J    : Integer;
begin
  L := Length(S);
  if L = 0 then
    begin
      Result := '';
      exit;
    end;
  // Calculate size
  M := L;
  P := Pointer(S);
  for I := 1 to L do
    begin
      if Ord(P^) >= $80 then
        Inc(M); // 2 bytes required for #$80-#$FF
      Inc(P);
    end;
  // Check if conversion is required
  if M = L then
    begin
      // All characters are US-ASCII, return reference to same string
      Result := S;
      exit;
    end;
  // Convert
  SetLength(Result, M);
  Q := Pointer(Result);
  P := Pointer(S);
  for I := 1 to L do
    begin
      WideCharToUTF8(WideChar(P^), Q, M, J);
      Inc(P);
      Inc(Q, J);
      Dec(M, J);
    end;
end;

function StringToUTF8String(const S: String): RawByteString;
begin
  {$IFDEF StringIsUnicode}
  Result := UnicodeStringToUTF8String(S);
  {$ELSE}
  Result := S;
  {$ENDIF}
end;


{                                                                              }
{ UTF-16 functions                                                             }
{                                                                              }
function DetectUTF16BEBOM(const P: Pointer; const Size: NativeInt): Boolean;
begin
  Result := Assigned(P) and (Size >= Sizeof(WideChar)) and
            (PWideChar(P)^ = WideChar($FFFE));
end;

function DetectUTF16LEBOM(const P: Pointer; const Size: NativeInt): Boolean;
begin
  Result := Assigned(P) and (Size >= Sizeof(WideChar)) and
            (PWideChar(P)^ = WideChar($FEFF));
end;

{ DetectUTF16Encoding returns True if the encoding was confirmed to be UTF-16. }
{ SwapEndian is True if it was detected that the UTF-16 data is in reverse     }
{ endian from that used by the cpu.                                            }
function DetectUTF16BOM(
         const P: Pointer;
         const Size: NativeInt;
         out SwapEndian: Boolean): Boolean;
begin
  if not Assigned(P) or (Size < Sizeof(WideChar)) then
    begin
      SwapEndian := False;
      Result := False;
    end else
  if PWideChar(P)^ = WideChar($FEFF) then
    begin
      SwapEndian := False;
      Result := True;
    end else
  if PWideChar(P)^ = WideChar($FFFE) then
    begin
      SwapEndian := True;
      Result := True;
    end
  else
    begin
      SwapEndian := False;
      Result := False;
    end;
end;

function SwapUTF16Endian(const P: WideChar): WideChar;
begin
  Result := WideChar(((Ord(P) and $FF) shl 8) or (Ord(P) shr 8));
end;


{                                                                              }
{ Tests                                                                        }
{                                                                              }
{$IFDEF UTF_TEST}
{$ASSERTIONS ON}
procedure Test_UTF8;
const
  W1 : array[0..3] of WideChar = (#$0041, #$2262, #$0391, #$002E);
  W2 : array[0..2] of WideChar = (#$D55C, #$AD6D, #$C5B4);
  W3 : array[0..2] of WideChar = (#$65E5, #$672C, #$8A9E);
  S1 = RawByteString(#$41#$E2#$89#$A2#$CE#$91#$2E);
  S2 = RawByteString(#$ED#$95#$9C#$EA#$B5#$AD#$EC#$96#$B4);
  S3 = RawByteString(#$E6#$97#$A5#$E6#$9C#$AC#$E8#$AA#$9E);
begin
  // UTF-8 test cases from RFC 2279
  Assert(UnicodeStringToUTF8String(W1) = #$41#$E2#$89#$A2#$CE#$91#$2E, 'UnicodeStringToUTF8String');
  Assert(UnicodeStringToUTF8String(W2) = #$ED#$95#$9C#$EA#$B5#$AD#$EC#$96#$B4, 'UnicodeStringToUTF8String');
  Assert(UnicodeStringToUTF8String(W3) = #$E6#$97#$A5#$E6#$9C#$AC#$E8#$AA#$9E, 'UnicodeStringToUTF8String');
  Assert(UTF8StringToUnicodeString(S1) = W1, 'UTF8StringToUnicodeString');
  Assert(UTF8StringToUnicodeString(S2) = W2, 'UTF8StringToUnicodeString');
  Assert(UTF8StringToUnicodeString(S3) = W3, 'UTF8StringToUnicodeString');
  Assert(UTF8StringLength(S1) = 4, 'UTF8StringLength');
  Assert(UTF8StringLength(S2) = 3, 'UTF8StringLength');
  Assert(UTF8StringLength(S3) = 3, 'UTF8StringLength');
end;

procedure Test_UTF16;
const
  W1 : array[0..1] of WideChar = (#$D83D, #$DE00);
  W2 : array[0..1] of WideChar = (#$D83D, #$0000);
var
  Ch : UCS4Char;
  Size : Integer;
  D1 : array[0..1] of WideChar;
begin
  UCS4CharToUTF16LE($00, @D1[0], 4, Size);
  Assert(Size = 2);
  Assert(D1[0] = #$0000);

  UCS4CharToUTF16LE($41, @D1[0], 4, Size);
  Assert(Size = 2);
  Assert(D1[0] = #$0041);

  UCS4CharToUTF16LE($1234, @D1[0], 4, Size);
  Assert(Size = 2);
  Assert(D1[0] = #$1234);

  UCS4CharToUTF16LE($1F600, @D1[0], 4, Size);
  Assert(Size = 4);
  Assert(D1[0] = #$D83D);
  Assert(D1[1] = #$DE00);

  Assert(UTF16LEToUCS4Char(@W1[0], 4, Ch, Size));
  Assert(Size = 4);
  Assert(Ch = $1F600);

  Assert(not UTF16LEToUCS4Char(@W1[0], 2, Ch, Size));
  Assert(Size = 4);
  Assert(Ch = $D83D);

  Assert(not UTF16LEToUCS4Char(@W1[1], 2, Ch, Size));
  Assert(Size = 2);
  Assert(Ch = $DE00);

  Assert(not UTF16LEToUCS4Char(@W2[0], 4, Ch, Size));
  Assert(Size = 4);
  Assert(Ch = $D83D);

  Assert(UTF16LEToUCS4Char(@W2[1], 2, Ch, Size));
  Assert(Size = 2);
  Assert(Ch = $0000);
end;

procedure Test;
begin
  Test_UTF8;
  Test_UTF16;
end;
{$ENDIF}


end.