xtool/contrib/fundamentals/XMLParser/flcXMLFunctions.pas

{******************************************************************************}
{                                                                              }
{   Library:          Fundamentals 5.00                                        }
{   File name:        flcXMLFunctions.pas                                      }
{   File version:     5.10                                                     }
{   Description:      XML functions                                            }
{                                                                              }
{   Copyright:        Copyright (c) 2000-2019, David J Butler                  }
{                     All rights reserved.                                     }
{                     Redistribution and use in source and binary forms, with  }
{                     or without modification, are permitted provided that     }
{                     the following conditions are met:                        }
{                     Redistributions of source code must retain the above     }
{                     copyright notice, this list of conditions and the        }
{                     following disclaimer.                                    }
{                     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND   }
{                     CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED          }
{                     WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED   }
{                     WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A          }
{                     PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL     }
{                     THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,    }
{                     INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR             }
{                     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,    }
{                     PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF     }
{                     USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)         }
{                     HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER   }
{                     IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING        }
{                     NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE   }
{                     USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE             }
{                     POSSIBILITY OF SUCH DAMAGE.                              }
{                                                                              }
{   Github:           https://github.com/fundamentalslib                       }
{   E-mail:           fundamentals.library at gmail.com                        }
{                                                                              }
{ Revision history:                                                            }
{                                                                              }
{   2000/05/11  1.01  Created cXML from cInternetStandards.                    }
{   2002/02/14  2.02  Fixed bug in xmlValidEncName. Thanks to Thomas Jensen    }
{                     <tjensen@xs4all.nl> for finding it.                      }
{   2002/04/17  2.03  Created cXMLFunctions from cXML.                         }
{   2002/04/26  2.04  Unicode support.                                         }
{   2003/09/07  3.05  Revised for Fundamentals 3.                              }
{   2004/02/21  3.06  Added xmlResolveEntityReference function.                }
{   2004/04/01  3.07  Compilable with FreePascal 1.9.2 Win32 i386.             }
{   2005/12/07  4.08  Compilable with FreePascal 2.0.1 Linux i386.             }
{   2019/04/28  5.09  String type changes.                                     }
{   2019/04/28  5.10  Revised for Fundamentals 5.                              }
{                                                                              }
{******************************************************************************}

{$INCLUDE flcXML.inc}

unit flcXMLFunctions;

interface

uses
  { System }
  SysUtils,

  { Fundamentals }
  flcStdTypes,
  flcUnicodeCodecs,
  flcZeroTermStrings;


{                                                                              }
{ Constants                                                                    }
{                                                                              }
const
  xmlVersion = '1.0';


{                                                                              }
{ Exception                                                                    }
{                                                                              }
type
  Exml = class(Exception);


{                                                                              }
{ Decoding                                                                     }
{                                                                              }
function  xmlValidChar(const Ch: AnsiChar): Boolean; overload;
function  xmlValidChar(const Ch: UCS4Char): Boolean; overload;
function  xmlValidChar(const Ch: WideChar): Boolean; overload;

function  xmlIsSpaceChar(const Ch: WideChar): Boolean;
function  xmlIsLetter(const Ch: WideChar): Boolean;
function  xmlIsDigit(const Ch: WideChar): Boolean;

function  xmlIsNameStartChar(const Ch: WideChar): Boolean;
function  xmlIsNameChar(const Ch: WideChar): Boolean;
function  xmlIsPubidChar(const Ch: WideChar): Boolean;
function  xmlValidName(const Text: UnicodeString): Boolean;

const
  xmlSpace = [#$20, #$9, #$D, #$A];

function  xmlSkipSpace(var P: PWideChar): Boolean;
function  xmlSkipEq(var P: PWideChar): Boolean;
function  xmlExtractQuotedText(var P: PWideChar; var S: UnicodeString): Boolean;

{ xmlGetEntityEncoding                                                         }
{   Buf is a pointer to the first part of an xml entity. It must include       }
{   the xml declaration. HeaderSize returns the number of bytes offset in      }
{   Buf to the actual xml.                                                     }
function  xmlGetEntityEncoding(const Buf: Pointer; const BufSize: Integer;
          out HeaderSize: Integer): TUnicodeCodecClass;

function  xmlResolveEntityReference(const RefName: UnicodeString): WideChar;


{                                                                              }
{ Encoding                                                                     }
{                                                                              }
function  xmlTag(const Tag: UnicodeString): UnicodeString;
function  xmlEndTag(const Tag: UnicodeString): UnicodeString;
function  xmlAttrTag(const Tag: UnicodeString;
          const Attr: UnicodeString = ''): UnicodeString;
function  xmlEmptyTag(const Tag, Attr: UnicodeString): UnicodeString;
procedure xmlSafeTextInPlace(var Txt: UnicodeString);
function  xmlSafeText(const Txt: UnicodeString): UnicodeString;
function  xmlSpaceIndent(const IndentLength: Integer;
          const IndentLevel: Integer): UnicodeString;
function  xmlTabIndent(const IndentLevel: Integer): UnicodeString;
function  xmlComment(const Comment: UnicodeString): UnicodeString;


{                                                                              }
{ Test cases                                                                   }
{                                                                              }
{$IFDEF XML_TEST}
procedure Test;
{$ENDIF}


implementation

uses
  { Fundamentals }
  flcUtils,
  flcStrings,
  flcUnicodeChar;


{                                                                              }
{ Decoding                                                                     }
{                                                                              }

{   [2]   Char ::=  #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |        }
{                   [#x10000-#x10FFFF]                                         }
function xmlValidChar(const Ch: AnsiChar): Boolean;
begin
  Result := Ch in [#$9, #$A, #$D, #$20..#$FF];
end;

function xmlValidChar(const Ch: UCS4Char): Boolean;
begin
  if Ch <= $D7FF then
    if Ch >= $20 then
      Result := True
    else
      Result := Ch in [$9, $A, $D]
  else
    if Ch > $10FFFF then
      Result := False
    else
      case Ch of
        $D800..$DFFF,
        $FFFE..$FFFF : Result := False;
      else
        Result := True;
      end;
end;

function xmlValidChar(const Ch: WideChar): Boolean;
begin
  if Ord(Ch) <= $D7FF then
    if Ord(Ch) >= $20 then
      Result := True
    else
      Result := Byte(Ord(Ch)) in [$9, $A, $D]
  else
    case Ch of
      #$D800..#$DFFF,
      #$FFFE..#$FFFF : Result := False;
    else
      Result := True;
    end;
end;

{   [3]   S ::=  (#x20 | #x9 | #xD | #xA)+                                     }
function xmlIsSpaceChar(const Ch: WideChar): Boolean;
begin
  if Ord(Ch) > $20 then
    Result := False
  else
    Result := Byte(Ch) in [$20, $09, $0D, $0A];
end;

{   [84]  Letter ::=  BaseChar | Ideographic                                   }
function xmlIsLetter(const Ch: WideChar): Boolean;
begin
  Result := UnicodeIsLetter(Ch);
end;

{   [88]  Digit ::=  [#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] |     }
{                    [#x0966-#x096F] | [#x09E6-#x09EF] | [#x0A66-#x0A6F] |     }
{                    [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] | [#x0BE7-#x0BEF] |     }
{                    [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] |     }
{                    [#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]       }
function xmlIsDigit(const Ch: WideChar): Boolean;
begin
  Result := UnicodeIsDecimalDigit(Ch);
end;


{   [4]   NameChar ::=  Letter | Digit | '.' | '-' | '_' | ':' |               }
{                 CombiningChar | Extender                                     }
function xmlIsNameChar(const Ch: WideChar): Boolean;
begin
  Result := xmlIsLetter(Ch) or xmlIsDigit(Ch);
  if Result then
    exit;
  case Ch of
    '.', '-', '_', ':' : Result := True;
  end;
end;

{   [5]   Name ::=  (Letter | '_' | ':') (NameChar)*                           }
function xmlIsNameStartChar(const Ch: WideChar): Boolean;
begin
  Result := xmlIsLetter(Ch) or (Ch = '_') or (Ch = ':');
end;

{   [13]  PubidChar ::=  #x20 | #xD | #xA | [a-zA-Z0-9] |                      }
{                        [-'()+,./:=?;!*#@$_%]                                 }
function xmlIsPubidChar(const Ch: WideChar): Boolean;
begin
  case Ch of
    ' ', #$D, #$A, 'A'..'Z', 'a'..'z', '0'..'9',
    '-', '''', '(', ')', '+', ',', '.', '/', ':',
    '=', '?', ';', '!', '*', '#', '@', '$', '_', '%' :
      Result := True;
  else
    Result := False;
  end;
end;

function xmlValidName(const Text: UnicodeString): Boolean;
var P : PWideChar;
    L : Integer;
begin
  Result := False;
  P := Pointer(Text);
  L := Length(Text);
  if not Assigned(P) or (L = 0) then
    exit;
  if not xmlIsNameStartChar(P^) then
    exit;
  Inc(P);
  Dec(L);
  Result := StrPMatchCharW(P, L, xmlIsNameChar);
end;

{   [3]   S ::=  (#x20 | #x9 | #xD | #xA)+                                     }
function xmlSkipSpace(var P: PWideChar): Boolean;
begin
  Assert(Assigned(P));
  Result := StrZSkipAllW(P, xmlIsSpaceChar) > 0;
end;

{   [25]  Eq ::=  S? '=' S?                                                    }
function xmlSkipEq(var P: PWideChar): Boolean;
var Q : PWideChar;
begin
  Q := P;
  xmlSkipSpace(Q);
  Result := Ord(Q^) = Ord('=');
  if not Result then
    exit;
  Inc(Q);
  xmlSkipSpace(Q);
  P := Q;
end;

{   [15]  Comment ::=  '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'       }
function xmlSkipComment(var P: PWideChar): Boolean;
var I : Integer;
begin
  Result := StrZMatchStrAsciiBW(P, '<!--', True);
  if not Result then
    exit;
  I := StrZPosBW('-->', P);
  Result := I >= 0;
  if not Result then
    exit;
  Inc(P, I + 3);
end;

{   [..]  QuotedText  ::=  ' Text ' | " Text "                                 }
function xmlExtractQuotedText(var P: PWideChar; var S: UnicodeString): Boolean;
begin
  Assert(Assigned(P));
  Result := StrZExtractQuotedU(P, S, ['''']);
  if Result then
    exit;
  Result := StrZExtractQuotedU(P, S, ['"']);
end;

{ Entity encoding                                                              }
{   [23]  XMLDecl ::=  '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'       }
{   [24]  VersionInfo ::=  S 'version' Eq (' VersionNum ' | " VersionNum ")    }
{   [26]  VersionNum ::=  ([a-zA-Z0-9_.:] | '-')+                              }
{   [80]  EncodingDecl ::=  S 'encoding' Eq ('"' EncName '"' |                 }
{         "'" EncName "'" )                                                    }
function xmlGetEntityEncoding(const Buf: Pointer; const BufSize: Integer;
    out HeaderSize: Integer): TUnicodeCodecClass;
var S    : UnicodeString;
    R    : PByteChar;
    P, Q : PWideChar;
    L    : Integer;
begin
  R := Buf;
  L := BufSize;
  // Detect Unicode markings
  Result := DetectUTFEncoding(R, L, HeaderSize);
  if Assigned(Result) then
    exit;
  HeaderSize := 0;
  Result := TUTF8Codec;
  // Check if first character is a likely XML UTF-16 character
  if L >= 2 then
    begin
      P := Pointer(R);
      case Ord(P^) of
        $0009, $000A, $000D, $0020, $003C : Result := TUTF16LECodec;
        $0900, $0A00, $0D00, $2000, $3C00 : Result := TUTF16BECodec;
      end;
    end;
  if L < 16 then
    exit;
  // Decode
  S := EncodingToUTF16U(Result, R, MinInt(L, 4096));
  // Locate XML declaration
  P := Pointer(S);
  while xmlSkipSpace(P) or xmlSkipComment(P) do ;
  if not StrZSkipStrBW(P, '<?xml', False) then
    exit;
  if not xmlSkipSpace(P) then
    exit;
  // Locate encoding attribute
  while (Ord(P^) <> 0) and (P^ <> '>') do
    begin
      if xmlIsSpaceChar(P^) then
        begin
          Q := P;
          Inc(Q);
          if StrZSkipStrBW(Q, 'encoding', False) then
            if xmlSkipEq(Q) then
              begin
                // Extract encoding
                if not xmlExtractQuotedText(Q, S) then
                  S := StrZExtractToU(Q, ['>', #0] + xmlSpace);
                StrTrimInPlaceU(S, UnicodeIsWhiteSpace);
                if S <> '' then
                  // Get codec type from encoding name
                  begin
                    Result := GetCodecClassByAlias(S);
                  end;
                // Found encoding attribute
                exit;
              end;
        end;
      // Next character
      Inc(P);
    end;
end;

function xmlResolveEntityReference(const RefName: UnicodeString): WideChar;
begin
  if StrEqualU(RefName, 'amp', True) then
    Result := '&'
  else
  if StrEqualU(RefName, 'lt', True) then
    Result := '<'
  else
  if StrEqualU(RefName, 'gt', True) then
    Result := '>'
  else
  if StrEqualU(RefName, 'quot', True) then
    Result := '"'
  else
  if StrEqualU(RefName, 'apos', True) then
    Result := ''''
  else
    Result := WideChar(#0);
end;


{                                                                              }
{ Encoding                                                                     }
{                                                                              }
function xmlTag(const Tag: UnicodeString): UnicodeString;
begin
  Result := '<' + Tag + '>';
end;

function xmlAttrTag(const Tag: UnicodeString; const Attr: UnicodeString): UnicodeString;
begin
  if Attr = '' then
    Result := xmlTag(Tag)
  else
    Result := '<' + Tag + ' ' + Attr + '>';
end;

function xmlEndTag(const Tag: UnicodeString): UnicodeString;
begin
  Result := '</' + Tag + '>';
end;

function xmlEmptyTag(const Tag, Attr: UnicodeString): UnicodeString;
begin
  if Attr = '' then
    Result := '<' + Tag + '/>'
  else
    Result := '<' + Tag + ' ' + Attr + '/>';
end;

procedure xmlSafeTextInPlace(var Txt: UnicodeString);
begin
  if PosCharSetU([#0, '&', '''', '"', '>', '<'], Txt) <= 0 then
    exit;
  Txt := StrReplaceCharStrU(#0, '', Txt);
  Txt := StrReplaceCharStrU('&', '&amp;', Txt);
  Txt := StrReplaceCharStrU('''', '&apos;', Txt);
  Txt := StrReplaceCharStrU('"', '&quot;', Txt);
  Txt := StrReplaceCharStrU('>', '&gt;', Txt);
  Txt := StrReplaceCharStrU('<', '&lt;', Txt);
end;

function xmlSafeText(const Txt: UnicodeString): UnicodeString;
begin
  Result := Txt;
  xmlSafeTextInPlace(Result);
end;

function xmlSpaceIndent(const IndentLength: Integer; const IndentLevel: Integer): UnicodeString;
begin
  Result := DupCharU(#32, IndentLevel * IndentLength);
end;

function xmlTabIndent(const IndentLevel: Integer): UnicodeString;
begin
  Result := DupCharU(#9, IndentLevel);
end;

{   [15]  Comment ::=  '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'       }
function xmlComment(const Comment: UnicodeString): UnicodeString;
begin
  Assert(PosStrU('--', Comment) = 0, 'Invalid sequence in comment');
  Result := '<!--' + Comment + '-->';
end;


{                                                                              }
{ Test cases                                                                   }
{                                                                              }
{$IFDEF XML_TEST}
{$ASSERTIONS ON}
procedure Test;
var S : RawByteString;
    T : UnicodeString;
    I : Integer;
begin
  Assert(xmlValidChar(WideChar(#32)), 'xmlValidChar');
  Assert(xmlValidChar(WideChar(#13)), 'xmlValidChar');
  Assert(not xmlValidChar(WideChar(#0)), 'xmlValidChar');
  Assert(not xmlValidChar(WideChar(#11)), 'xmlValidChar');
  Assert(xmlValidName('_Test_0'), 'xmlValidName');
  Assert(not xmlValidName('X '), 'xmlValidName');
  Assert(not xmlValidName('X$'), 'xmlValidName');
  Assert(not xmlValidName('5X'), 'xmlValidName');
  Assert(xmlTag('Test') = '<Test>', 'xmlTag');
  Assert(xmlComment('Test') = '<!--Test-->', 'xmlComment');
  Assert(xmlSafeText('(abc) [123]') = '(abc) [123]', 'xmlSafeText');
  Assert(xmlSafeText('a&<b>') = 'a&amp;&lt;b&gt;', 'xmlSafeText');
  Assert(xmlIsSpaceChar(#32), 'xmlIsSpaceChar');
  Assert(xmlIsSpaceChar(#13), 'xmlIsSpaceChar');
  Assert(not xmlIsSpaceChar('_'), 'xmlIsSpaceChar');
  S := '<?xml version="1.0" encoding="ascii">';
  Assert(xmlGetEntityEncoding(Pointer(S), Length(S), I) = TUSASCIICodec, 'xmlGetEntityEncoding');
  S := '  <?xml  attr  encoding=ISO-8859-10 >  ';
  Assert(xmlGetEntityEncoding(Pointer(S), Length(S), I) = TISO8859_10Codec, 'xmlGetEntityEncoding');
  T := '<?xml version="1.0">';
  Assert(xmlGetEntityEncoding(Pointer(T), Length(T) * SizeOf(WideChar), I) = TUTF16LECodec, 'xmlGetEntityEncoding');
end;
{$ENDIF}


end.