{******************************************************************************} { } { Library: Fundamentals 5.00 } { File name: flcXMLParser.pas } { File version: 5.09 } { Description: XML parser } { } { Copyright: Copyright (c) 2000-2019, David J Butler } { All rights reserved. } { Redistribution and use in source and binary forms, with } { or without modification, are permitted provided that } { the following conditions are met: } { Redistributions of source code must retain the above } { copyright notice, this list of conditions and the } { following disclaimer. } { THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND } { CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED } { WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED } { WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A } { PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL } { THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, } { INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR } { CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, } { PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF } { USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) } { HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER } { IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING } { NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE } { USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE } { POSSIBILITY OF SUCH DAMAGE. } { } { Github: https://github.com/fundamentalslib } { E-mail: fundamentals.library at gmail.com } { } { Revision history: } { } { 2000/05/11 1.01 Created cXML from cInternetStandards. } { 2001/05/08 1.02 Complete revision. } { 2001/05/11 1.03 Added DTD parser. } { 2001/07/07 1.04 Small revisions. } { 2002/04/17 2.05 Created cXMLParser from cXML. } { 2002/04/29 2.06 Refactored for Unicode support. } { 2003/09/07 3.07 Revised for Fundamentals 3. } { 2019/04/28 5.08 String type changes. } { 2019/04/28 5.09 Revised for Fundamentals 5. } { } { Supported compilers: } { } { Delphi 10.2 Win32 5.09 2019/04/28 } { Delphi 10.2 Win64 5.09 2018/04/28 } { Delphi 10.2 Linux64 5.09 2018/04/28 } { } {******************************************************************************} {$INCLUDE flcXML.inc} unit flcXMLParser; interface uses { System } SysUtils, { Fundamentals } flcStdTypes, flcUtils, flcStreams, flcUnicodeCodecs, flcUnicodeReader, flcXMLFunctions, flcXMLDocument; { } { XML Parser } { } type TxmlParseOptions = set of ( xmlPreserveSpaceAroundContent, xmlCheckWellFormed); const xmlDefaultParseOptions = []; type TxmlKnownReference = record Reference : UnicodeString; Value : UnicodeString; end; TxmlParser = class; TxmlParserObjectEvent = procedure (Sender: TxmlParser; Obj: AxmlType) of object; TxmlParser = class protected FOptions : TxmlParseOptions; FEncoding : UnicodeString; FOnTag : TxmlParserObjectEvent; FOnElement : TxmlParserObjectEvent; FOnPI : TxmlParserObjectEvent; FOnComment : TxmlParserObjectEvent; FToken : AxmlType; FRawString : RawByteString; FRawReader : AReaderEx; FRawReaderOwner : Boolean; FReader : TUnicodeReader; FReaderOwner : Boolean; function GetCheckWellFormed: Boolean; procedure ParseError(const Msg: String); procedure GetNextToken; virtual; function ParseProlog: TxmlProlog; function ParseElement: AxmlElement; function CreateCharData(const Text: UnicodeString): AxmlType; virtual; function CreateCharRef(const Number: Word32; const Hex: Boolean): AxmlType; virtual; function CreatePEReference(const Name: UnicodeString): TxmlPEReference; virtual; function CreateGeneralEntityRef(const Name: UnicodeString): AxmlType; virtual; function CreateAttribute(const Name: UnicodeString; const Value: TxmlAttValue): AxmlType; virtual; function CreateTextAttribute(const Name: UnicodeString; const Value: TxmlQuotedText): AxmlType; virtual; function CreateAttributeList(const TagName: UnicodeString; const List: TxmlTypeList): AxmlAttributeList; function CreateProcessingInstruction(const PITarget, Text: UnicodeString): AxmlType; function CreateComment(const Text: UnicodeString): AxmlType; virtual; function CreateStartTag(const Name: UnicodeString; const Attributes: AxmlAttributeList): AxmlType; virtual; function CreateEndTag(const Name: UnicodeString): AxmlType; virtual; function CreateEmptyElementTag(const Name: UnicodeString; const Attributes: AxmlAttributeList): AxmlType; virtual; function CreateElement(const StartTag: TxmlStartTag; const EndTag: TxmlEndTag; const Content: TxmlElementContent): AxmlElement; virtual; function CreateEmptyElement(const Tag: TxmlEmptyElementTag): AxmlElement; virtual; function CreateElementContent(const StartTag: TxmlStartTag): TxmlElementContent; virtual; function CreateDocTypeDeclarationList: TxmlDocTypeDeclarationList; virtual; function CreateDocument(const Prolog: TxmlProlog; const RootElement: AxmlElement): TxmlDocument; virtual; procedure ExpectChar(const Ch: WideChar); procedure ExpectRawByteStr(const S: RawByteString); function ExtractName(const Required: Boolean = False): UnicodeString; function ExtractNmToken(const Required: Boolean = False): UnicodeString; function MatchSpace: Boolean; function SkipSpace: Boolean; function MatchSpaceDelimited(const Text: RawByteString; const SkipText: Boolean = True): Boolean; function ExtractEq(const Required: Boolean = True): Boolean; function ExtractTextString(const Delimiters: ByteCharSet): UnicodeString; function ExtractText(const Delimiters: ByteCharSet): AxmlType; function ExtractPEReference: TxmlPEReference; function ExtractCharRef: AxmlType; function ExtractEntityRef: AxmlType; function ExtractReference: AxmlType; procedure ExtractReferenceText(const List: TxmlTypeList; const Delimiters: ByteCharSet; const InclPEReference: Boolean); function ExtractQuote(out Quote: WideChar): Boolean; function ExtractQuotedTextString(out Quote: WideChar; const Delimiters: ByteCharSet): UnicodeString; function ExtractQuotedText(const Delimiters: ByteCharSet): TxmlQuotedText; function ExtractQuotedReferenceText(const TextClass: CxmlQuotedReferenceText; const Delimiters: ByteCharSet; const InclPEReference: Boolean): TxmlQuotedReferenceText; function ExtractTextAttribute: AxmlType; function ExtractAttributeValue: TxmlAttValue; function ExtractAttribute: AxmlType; function ExtractAttributeList(const TagName: UnicodeString): AxmlAttributeList; function ExtractXMLDeclaration: TxmlXMLDecl; function ExtractProcessingInstruction: AxmlType; function ExtractQTag: AxmlType; function ExtractComment: AxmlType; function ExtractCDATASection: AxmlType; procedure ExtractNamesRest(const L: TxmlTypeList; const NmToken: Boolean; const Delimiter: AnsiChar); procedure ExtractNames(const L: TxmlTypeList; const NmToken: Boolean; const Delimiter: AnsiChar); function ExtractElementDeclaration: AxmlType; function ExtractAttDef: TxmlAttDef; procedure ExtractAttDefList(const L: TxmlTypeList); function ExtractAttListDeclaration: AxmlType; function ExtractExternalID(const NData: Boolean; const PublicID: Boolean): TxmlExternalID; function ExtractEntityDeclaration: AxmlType; function ExtractNotationDeclaration: AxmlType; function ExtractMarkupDeclaration: AxmlType; function ExtractDeclarations: TxmlDocTypeDeclarationList; function ExtractDTD: AxmlType; function ExtractETag: AxmlType; function ExtractTag: AxmlType; procedure Init; public constructor Create; destructor Destroy; override; property Options: TxmlParseOptions read FOptions write FOptions default xmlDefaultParseOptions; property Encoding: UnicodeString read FEncoding write FEncoding; property OnTag: TxmlParserObjectEvent read FOnTag write FOnTag; property OnElement: TxmlParserObjectEvent read FOnElement write FOnElement; property OnPI: TxmlParserObjectEvent read FOnPI write FOnPI; property OnComment: TxmlParserObjectEvent read FOnComment write FOnComment; procedure Clear; procedure SetUnicodeReader(const Reader: TUnicodeReader; const ReaderOwner: Boolean = False); procedure SetReader(const Reader: AReaderEx; const ReaderOwner: Boolean = False); procedure SetBuffer(const Buf: Pointer; const Size: Integer); procedure SetStringB(const Buf: RawByteString); procedure SetFileName(const FileName: String); function ExtractDocument: TxmlDocument; end; ExmlParser = class(Exml); TxmlParserClass = class of TxmlParser; { } { Parse functions } { } function ParseXMLBuffer(const Buffer: Pointer; const Size: Integer): TxmlDocument; function ParseXMLStringB(const S: RawByteString): TxmlDocument; function ParseXMLStringU(const S: UnicodeString): TxmlDocument; function ParseXMLFile(const FileName: String): TxmlDocument; { } { Test cases } { } {$IFDEF XML_TEST} procedure Test; {$ENDIF} implementation uses { Fundamentals } flcStrings, flcCharSet; { } { TxmlParser } { } constructor TxmlParser.Create; begin inherited Create; Init; end; destructor TxmlParser.Destroy; begin if FRawReaderOwner then FreeAndNil(FRawReader); if FReaderOwner then FreeAndNil(FReader); inherited Destroy; end; procedure TxmlParser.Init; begin FOptions := xmlDefaultParseOptions; end; function TxmlParser.GetCheckWellFormed: Boolean; begin Result := xmlCheckWellFormed in FOptions; end; procedure TxmlParser.Clear; begin FRawString := ''; if FRawReaderOwner then FreeAndNil(FRawReader) else FRawReader := nil; if FReaderOwner then FreeAndNil(FReader) else FReader := nil; end; procedure TxmlParser.SetUnicodeReader(const Reader: TUnicodeReader; const ReaderOwner: Boolean); begin if FReaderOwner then FreeAndNil(FReader); FReader := Reader; FReaderOwner := ReaderOwner; end; procedure TxmlParser.SetReader(const Reader: AReaderEx; const ReaderOwner: Boolean); var T : TUnicodeCodecClass; B : array[0..1023] of Byte; L, N : Integer; begin if FRawReaderOwner then FreeAndNil(FRawReader); FRawReader := Reader; FRawReaderOwner := ReaderOwner; if Assigned(Reader) then begin if FEncoding <> '' then T := GetCodecClassByAliasU(FEncoding) else T := nil; if not Assigned(T) then begin L := Reader.Peek(B[0], Sizeof(B)); T := xmlGetEntityEncoding(@B[0], L, N); if Assigned(T) then Reader.Skip(N); end; SetUnicodeReader(TUnicodeReader.Create(Reader, False, T.Create, True), True); end else SetUnicodeReader(nil); end; procedure TxmlParser.SetBuffer(const Buf: Pointer; const Size: Integer); begin if Assigned(Buf) and (Size > 0) then SetReader(TMemoryReader.Create(Buf, Size), True) else SetReader(nil, False); end; procedure TxmlParser.SetStringB(const Buf: RawByteString); begin FRawString := Buf; SetBuffer(Pointer(FRawString), Length(FRawString)); end; procedure TxmlParser.SetFileName(const FileName: String); begin SetReader(TFileReader.Create(FileName), True); end; procedure TxmlParser.ParseError(const Msg: String); begin raise EXMLParser.Create(Msg); end; { } { XML STRUCTURES } { } function TxmlParser.CreateCharData(const Text: UnicodeString): AxmlType; begin Result := TxmlCharData.Create(Text); end; function TxmlParser.CreateCharRef(const Number: Word32; const Hex: Boolean): AxmlType; begin Result := TxmlCharRef.Create(Number, Hex); end; function TxmlParser.CreatePEReference(const Name: UnicodeString): TxmlPEReference; begin Result := TxmlPEReference.Create(Name); end; function TxmlParser.CreateGeneralEntityRef(const Name: UnicodeString): AxmlType; begin Result := TxmlGeneralEntityRef.Create(Name); end; function TxmlParser.CreateAttribute(const Name: UnicodeString; const Value: TxmlAttValue): AxmlType; begin Result := TxmlAttribute.Create(Name, Value); end; function TxmlParser.CreateAttributeList(const TagName: UnicodeString; const List: TxmlTypeList): AxmlAttributeList; begin Result := TxmlAttributeList.Create(List); end; function TxmlParser.CreateTextAttribute(const Name: UnicodeString; const Value: TxmlQuotedText): AxmlType; begin Result := TxmlTextAttribute.Create(Name, Value); end; function TxmlParser.CreateProcessingInstruction(const PITarget, Text: UnicodeString): AxmlType; begin Result := TxmlProcessingInstruction.Create(PITarget, Text); end; function TxmlParser.CreateComment(const Text: UnicodeString): AxmlType; begin Result := TxmlComment.Create(Text); end; function TxmlParser.CreateStartTag(const Name: UnicodeString; const Attributes: AxmlAttributeList): AxmlType; begin Result := TxmlStartTag.Create(Name, Attributes); end; function TxmlParser.CreateEndTag(const Name: UnicodeString): AxmlType; begin Result := TxmlEndTag.Create(Name); end; function TxmlParser.CreateEmptyElementTag(const Name: UnicodeString; const Attributes: AxmlAttributeList): AxmlType; begin Result := TxmlEmptyElementTag.Create(Name, Attributes); end; function TxmlParser.CreateElement(const StartTag: TxmlStartTag; const EndTag: TxmlEndTag; const Content: TxmlElementContent): AxmlElement; begin Result := TxmlElement.Create(StartTag, EndTag, Content); end; function TxmlParser.CreateEmptyElement(const Tag: TxmlEmptyElementTag): AxmlElement; begin Result := TxmlEmptyElement.Create(Tag); end; function TxmlParser.CreateElementContent(const StartTag: TxmlStartTag): TxmlElementContent; begin Result := TxmlElementContent.Create; end; function TxmlParser.CreateDocTypeDeclarationList: TxmlDocTypeDeclarationList; begin Result := TxmlDocTypeDeclarationList.Create; end; function TxmlParser.CreateDocument(const Prolog: TxmlProlog; const RootElement: AxmlElement): TxmlDocument; begin Result := TxmlDocument.Create(Prolog, RootElement); end; { } { TOKENS } { } procedure TxmlParser.ExpectChar(const Ch: WideChar); begin if not FReader.MatchWideChar(Ch, True) then ParseError(Ch + ' expected'); end; procedure TxmlParser.ExpectRawByteStr(const S: RawByteString); begin if not FReader.MatchRawByteStr(S, True, True) then ParseError(String(S) + ' expected'); end; { [3] S ::= (#x20 | #x9 | #xD | #xA)+ } function TxmlParser.MatchSpace: Boolean; begin Result := FReader.MatchChar(xmlIsSpaceChar, False); end; function TxmlParser.SkipSpace: Boolean; begin Result := FReader.SkipAll(xmlIsSpaceChar) > 0; end; function TxmlParser.MatchSpaceDelimited(const Text: RawByteString; const SkipText: Boolean = True): Boolean; begin Result := FReader.MatchRawByteStrDelimited(Text, True, xmlIsSpaceChar, SkipText); end; { [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | } { CombiningChar | Extender } { [5] Name ::= (Letter | '_' | ':') (NameChar)* } function TxmlParser.ExtractName(const Required: Boolean): UnicodeString; begin if FReader.MatchChar(xmlIsNameStartChar, False) then begin Result := FReader.ReadChar; Result := Result + FReader.ReadChars(xmlIsNameChar); // keep as two statements end else if Required then ParseError('Name expected') else Result := ''; end; { [7] Nmtoken ::= (NameChar)+ } function TxmlParser.ExtractNmToken(const Required: Boolean): UnicodeString; begin Result := FReader.ReadChars(xmlIsNameChar); if Result = '' then if Required then ParseError('Name expected') else exit; end; { [25] Eq ::= S? '=' S? } function TxmlParser.ExtractEq(const Required: Boolean): Boolean; begin SkipSpace; Result := FReader.MatchWideChar('=', True); if Required and not Result then ParseError('= expected'); SkipSpace; end; { [..] TextString ::= [^Delimiters]* } function TxmlParser.ExtractTextString(const Delimiters: ByteCharSet): UnicodeString; begin Result := FReader.ReadToRawByteChar(Delimiters); end; { [..] Text ::= TextString } function TxmlParser.ExtractText(const Delimiters: ByteCharSet): AxmlType; var Text : UnicodeString; begin Text := ExtractTextString(Delimiters); if not (xmlPreserveSpaceAroundContent in FOptions) then StrTrimInPlaceU(Text, xmlIsSpaceChar); if Text = '' then Result := nil else Result := CreateCharData(Text); end; { [69] PEReference ::= '%' Name ';' } function TxmlParser.ExtractPEReference: TxmlPEReference; var Name : UnicodeString; begin if not FReader.MatchWideChar('%', True) then begin Result := nil; exit; end; Name := ExtractName(True); ExpectChar(';'); Result := CreatePEReference(Name); end; { [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' } function TxmlParser.ExtractCharRef: AxmlType; var Str : RawByteString; Val : Word32; begin Result := nil; if not FReader.MatchRawByteStr('&#', True, True) then exit; if FReader.MatchWideChar('x', True) then begin Str := FReader.ReadRawByteChars(['0'..'9', 'A'..'F', 'a'..'f']); if Str = '' then if GetCheckWellFormed then ParseError('Hexadecimal number expected') else Result := CreateCharData('&#x') else if not FReader.MatchWideChar(';', True) then if GetCheckWellFormed then ParseError('; expected') else Result := CreateCharData('&#x' + ToStringB(Str)) else begin Val := HexToWord32B(Str); Result := CreateCharRef(Val, True); end; end else begin Str := FReader.ReadRawByteChars(['0'..'9']); if Str = '' then if GetCheckWellFormed then ParseError('Number expected') else Result := CreateCharData('&#') else if not FReader.MatchWideChar(';', True) then if GetCheckWellFormed then ParseError('; expected') else Result := CreateCharData('&#' + ToStringB(Str)) else begin Val := StringToWord32B(Str); Result := CreateCharRef(Val, False); end; end; end; { [68] EntityRef ::= '&' Name ';' } function TxmlParser.ExtractEntityRef: AxmlType; var Name : UnicodeString; begin Result := nil; if not FReader.MatchWideChar('&', True) then exit; Name := ExtractName; if Name = '' then begin if GetCheckWellFormed then ParseError('Entity name expected') else Result := CreateCharData('&'); end else begin if FReader.MatchWideChar(';', False) then begin Result := CreateGeneralEntityRef(Name); FReader.Skip(1); end else if GetCheckWellFormed then ParseError('; expected') else Result := CreateCharData('&' + Name); end; end; { [67] Reference ::= EntityRef | CharRef } function TxmlParser.ExtractReference: AxmlType; begin Result := ExtractCharRef; if Assigned(Result) then exit; Result := ExtractEntityRef; end; { [..] ReferenceText ::= (Text | Reference | PEReference)* } procedure TxmlParser.ExtractReferenceText(const List: TxmlTypeList; const Delimiters: ByteCharSet; const InclPEReference: Boolean); var C : AxmlType; R : Boolean; begin Assert(Assigned(List)); repeat C := ExtractText(Delimiters); if not Assigned(C) then C := ExtractReference; if not Assigned(C) and InclPEReference then C := ExtractPEReference; R := Assigned(C); if R then List.AddChild(C); until not R; end; { [..] Quote ::= "'" | '"' } function TxmlParser.ExtractQuote(out Quote: WideChar): Boolean; var C : WideChar; begin C := FReader.PeekChar; if (C = '''') or (C = '"') then begin FReader.Skip(1); Quote := C; Result := True; end else begin Quote := #0; Result := False; end; end; { [..] QuotedText ::= "'" Text "'" | '"' Text '"' } function TxmlParser.ExtractQuotedTextString(out Quote: WideChar; const Delimiters: ByteCharSet): UnicodeString; var D : ByteCharSet; begin if not ExtractQuote(Quote) then begin Result := ''; exit; end; D := Delimiters; Include(D, AnsiChar(Ord(Quote))); Result := ExtractTextString(D); ExpectChar(Quote); end; function TxmlParser.ExtractQuotedText(const Delimiters: ByteCharSet): TxmlQuotedText; var Q : WideChar; T : UnicodeString; begin T := ExtractQuotedTextString(Q, Delimiters); if Q = #0 then Result := nil else Result := TxmlQuotedText.Create(T); end; { [..] QuotedReferenceText ::= "'" ReferenceText "'" | } { '"' ReferenceText '"' } function TxmlParser.ExtractQuotedReferenceText(const TextClass: CxmlQuotedReferenceText; const Delimiters: ByteCharSet; const InclPEReference: Boolean): TxmlQuotedReferenceText; var D : ByteCharSet; C : WideChar; begin if not ExtractQuote(C) then begin Result := nil; exit; end; Result := TextClass.Create; AssignCharSet(D, Delimiters); Include(D, AnsiChar(Ord(C))); ExtractReferenceText(Result, D, InclPEReference); ExpectChar(C); end; { [..] TextAttribute ::= Name Eq QuotedText } function TxmlParser.ExtractTextAttribute: AxmlType; var Name : UnicodeString; Val : TxmlQuotedText; begin Name := ExtractName; if Name = '' then begin Result := nil; exit; end; ExtractEq(True); Val := ExtractQuotedText(['<']); Result := CreateTextAttribute(Name, Val); end; { [10] AttValue ::= '"' ([^<&"] | Reference)* '"' } { | "'" ([^<&'] | Reference)* "'" } function TxmlParser.ExtractAttributeValue: TxmlAttValue; begin Result := TxmlAttValue(ExtractQuotedReferenceText(TxmlAttValue, ['<'], False)); end; { [41] Attribute ::= Name Eq AttValue } function TxmlParser.ExtractAttribute: AxmlType; var Name : UnicodeString; Val : TxmlAttValue; begin Name := ExtractName; if Name = '' then begin Result := nil; exit; end; ExtractEq(True); Val := ExtractAttributeValue; Result := CreateAttribute(Name, Val); end; { [..] (S Attribute)* S? } function TxmlParser.ExtractAttributeList(const TagName: UnicodeString): AxmlAttributeList; var D : AxmlType; L : TxmlTypeList; R : Boolean; begin SkipSpace; D := ExtractAttribute; if not Assigned(D) then begin Result := nil; exit; end; L := TxmlTypeList.Create(D); repeat SkipSpace; D := ExtractAttribute; R := Assigned(D); if R then L.AddChild(D); until not R; Result := CreateAttributeList(TagName, L); end; { [23] XMLDecl ::= '' } { [24] VersionInfo ::= S 'version' Eq (' VersionNum ' | " VersionNum ") } { [80] EncodingDecl ::= S 'encoding' Eq } { ('"' EncName '"' | "'" EncName "'" ) } { [32] SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | } { ('"' ('yes' | 'no') '"')) } function TxmlParser.ExtractXMLDeclaration: TxmlXMLDecl; var C : AxmlType; R : Boolean; begin Result := TxmlXMLDecl.Create; R := False; repeat if not SkipSpace then R := True else begin C := ExtractTextAttribute; if Assigned(C) then Result.AddChild(C) else R := True; end; until R; ExpectRawByteStr('?>'); end; { [16] PI ::= '' Char*)))? '?>' } function TxmlParser.ExtractProcessingInstruction: AxmlType; var Target, Text : UnicodeString; begin Target := ExtractName; if Target = '' then Result := nil else begin Text := FReader.ReadToRawByteStr('?>', True); ExpectRawByteStr('?>'); Result := CreateProcessingInstruction(Target, Text); if Assigned(FOnPI) then FOnPI(self, Result); end; end; { [23] XMLDecl ::= '' } { [16] PI ::= '' Char*)))? '?>' } function TxmlParser.ExtractQTag: AxmlType; begin FReader.Skip(1); if FReader.MatchRawByteStr('xml', False, True) then Result := ExtractXMLDeclaration else Result := ExtractProcessingInstruction; end; { [15] Comment ::= '' } function TxmlParser.ExtractComment: AxmlType; var S : UnicodeString; begin FReader.Skip(2); S := FReader.ReadToRawByteStr('-->', True); ExpectRawByteStr('-->'); Result := CreateComment(S); if Assigned(FOnComment) then FOnComment(self, Result); end; { [18] CDSect ::= CDStart CData CDEnd } { [19] CDStart ::= '' Char*)) } { [21] CDEnd ::= ']]>' } function TxmlParser.ExtractCDATASection: AxmlType; var S : UnicodeString; begin FReader.Skip(7); S := FReader.ReadToRawByteStr(']]>', True); ExpectRawByteStr(']]>'); Result := TxmlCDSect.Create(S); end; { [..] NamesRest ::= (S? 'Delimiter' S? Name)* ')' } procedure TxmlParser.ExtractNamesRest(const L: TxmlTypeList; const NmToken: Boolean; const Delimiter: AnsiChar); var R : Boolean; begin R := False; repeat SkipSpace; if FReader.MatchWideChar(WideChar(Delimiter), True) then begin SkipSpace; if NmToken then L.AddChild(TxmlLiteralFormatting.Create(ExtractNmToken(True))) else L.AddChild(TxmlLiteralFormatting.Create(ExtractName(True))); end else if FReader.MatchWideChar(')', True) then R := True else ParseError(') expected'); until R; end; { [..] Names ::= S? Name NamesRest } procedure TxmlParser.ExtractNames(const L: TxmlTypeList; const NmToken: Boolean; const Delimiter: AnsiChar); var N : UnicodeString; begin SkipSpace; if FReader.MatchWideChar(WideChar(')'), True) then exit; if NmToken then N := ExtractNmToken else N := ExtractName; if N = '' then exit; L.AddChild(TxmlLiteralFormatting.Create(N)); ExtractNamesRest(L, NmToken, Delimiter); end; { [45] elementdecl ::= '' } { [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | children } { [51] Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*' } { | '(' S? '#PCDATA' S? ')' } function TxmlParser.ExtractElementDeclaration: AxmlType; { [47] children ::= (choice | seq) ('?' | '*' | '+')? } { [48] cp ::= (Name | choice | seq) ('?' | '*' | '+')? } { [49] choice ::= '(' S? cp ( S? '|' S? cp )* S? ')' } { [50] seq ::= '(' S? cp ( S? ',' S? cp )* S? ')' } function ExtractChildren: AxmlListChildSpec; var IsChoice, IsSeq : Boolean; C : AxmlChildSpec; R : Boolean; Ch : WideChar; begin Result := nil; SkipSpace; if FReader.MatchWideChar(')', True) then exit; Ch := #0; IsChoice := False; IsSeq := False; try R := False; repeat if FReader.MatchWideChar('(', True) then C := ExtractChildren else C := TxmlNameChildSpec.Create(ExtractName(True)); SkipSpace; if FReader.MatchWideChar(',', True) then begin if IsChoice then ParseError('List must be either choice or seq') else if not IsSeq then begin IsSeq := True; Result := TxmlSeqChildSpec.Create; Result.List := TxmlTypeList.Create; end; Ch := ','; end else if FReader.MatchWideChar(WideChar('|'), True) then begin if IsSeq then ParseError('List must be either choice or seq') else if not IsChoice then begin IsChoice := True; Result := TxmlChoiceChildSpec.Create; Result.List := TxmlTypeList.Create; end; Ch := '|'; end else if FReader.MatchWideChar(WideChar(')'), True) then begin if not IsChoice and not IsSeq then begin Result := TxmlSeqChildSpec.Create; Result.List := TxmlTypeList.Create; end; R := True; end else ParseError(') expected'); // Parsing problem for +,?, * ??? Result.List.AddAssigned(C); if not R then begin Result.List.AddAssigned(TxmlLiteralFormatting.Create(Ch)); SkipSpace; end; until R; except FreeAndNil(Result); raise; end; end; var N : UnicodeString; E : TxmlElementDeclaration; begin if not MatchSpaceDelimited(''); except FreeAndNil(E); raise; end; Result := E; end; end; { [..] AttDef ::= Name S AttType S DefaultDecl } { [54] AttType ::= StringType | TokenizedType | EnumeratedType } { [55] StringType ::= 'CDATA' } { [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY' } { | 'ENTITIES' | 'NMTOKEN' | 'NMTOKENS' } { [57] EnumeratedType ::= NotationType | Enumeration } { [58] NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* } { S? ')' } { [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' } { [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED' } { | (('#FIXED' S)? AttValue) } function TxmlParser.ExtractAttDef: TxmlAttDef; var N : UnicodeString; P : TxmlTypeList; T : TxmlAttType; D : TxmlDefaultType; A : TxmlAttValue; begin N := ExtractName; if N = '' then begin Result := nil; exit; end; if not SkipSpace then ParseError('Unexpected token'); P := nil; A := nil; T := atNone; try if FReader.MatchWideChar('(', True) then begin T := atEnumeratedEnumerationType; P := TxmlTypeList.Create; ExtractNames(P, True, '|'); end else if MatchSpaceDelimited('NOTATION') then begin T := atEnumeratedNotationType; SkipSpace; P := TxmlTypeList.Create; ExtractNames(P, False, '|'); end else if MatchSpaceDelimited('CDATA') then T := atStringType else if MatchSpaceDelimited('ID') then T := atTokenizedTypeID else if MatchSpaceDelimited('IDREF') then T := atTokenizedTypeIDREF else if MatchSpaceDelimited('IDREFS') then T := atTokenizedTypeIDREFS else if MatchSpaceDelimited('ENTITY') then T := atTokenizedTypeENTITY else if MatchSpaceDelimited('ENTITIES') then T := atTokenizedTypeENTITIES else if MatchSpaceDelimited('NMTOKEN') then T := atTokenizedTypeNMTOKEN else if MatchSpaceDelimited('NMTOKENS') then T := atTokenizedTypeNMTOKENS else ParseError('Invalid AttType'); SkipSpace; if MatchSpaceDelimited('#REQUIRED', True) then D := dtRequired else if MatchSpaceDelimited('#IMPLIED', True) then D := dtImplied else begin if MatchSpaceDelimited('#FIXED', True) then begin D := dtFixed; SkipSpace; end else D := dtValue; A := ExtractAttributeValue; end; except FreeAndNil(A); FreeAndNil(P); raise; end; case T of atEnumeratedEnumerationType, atEnumeratedNotationType : Result := TxmlAttDef.Create(N, T, P, D, A); else Result := TxmlAttDef.Create(N, T, nil, D, A); end; end; { [..] AttDef* S? } procedure TxmlParser.ExtractAttDefList(const L: TxmlTypeList); begin repeat SkipSpace; until not L.AddAssigned(ExtractAttDef); end; { [52] AttlistDecl ::= '' } function TxmlParser.ExtractAttListDeclaration: AxmlType; var N : UnicodeString; begin if not MatchSpaceDelimited(''); except FreeAndNil(Result); raise; end; end; { [75] ExternalID ::= 'SYSTEM' S SystemLiteral } { | 'PUBLIC' S PubidLiteral S SystemLiteral } { [76] NDataDecl ::= S 'NDATA' S Name } function TxmlParser.ExtractExternalID(const NData: Boolean; const PublicID: Boolean): TxmlExternalID; var T, U : TxmlQuotedText; C : CxmlExternalID; begin if NData then C := TxmlExternalIDNData else C := TxmlExternalID; if not PublicID and FReader.MatchRawByteStr('SYSTEM', True, True) then begin if not SkipSpace then Result := nil else begin T := ExtractQuotedText([]); if not Assigned(T) then ParseError('SystemID expected'); Result := C.CreateSystemID(T); end; end else Result := nil; if not Assigned(Result) and FReader.MatchRawByteStr('PUBLIC', True, True) then begin if not SkipSpace then Result := nil else begin U := nil; T := nil; try T := ExtractQuotedText([]); if not Assigned(T) then ParseError('PublicID expected'); if not PublicID then begin if not SkipSpace then ParseError('SystemID expected'); U := ExtractQuotedText([]); if not Assigned(U) then ParseError('SystemID expected'); end; except FreeAndNil(T); FreeAndNil(U); raise; end; Result := C.CreatePublicID(T, U); end; end; if Assigned(Result) and NData then begin SkipSpace; if MatchSpaceDelimited('NDATA') then begin SkipSpace; TxmlExternalIDNData(Result).NData := ExtractName(True); end; end; end; { [70] EntityDecl ::= GEDecl | PEDecl } { [71] GEDecl ::= '' } { [72] PEDecl ::= '' } { [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?) } { [74] PEDef ::= EntityValue | ExternalID } { [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' } { | "'" ([^%&'] | PEReference | Reference)* "'" } function TxmlParser.ExtractEntityDeclaration: AxmlType; var N : UnicodeString; PE : Boolean; D : AxmlType; begin if not MatchSpaceDelimited(''), True) then ParseError('> expected'); Result := TxmlEntityDeclaration.Create(PE, N, D); end; { [82] NotationDecl ::= '' } { [83] PublicID ::= 'PUBLIC' S PubidLiteral } function TxmlParser.ExtractNotationDeclaration: AxmlType; var X : TxmlExternalID; N : UnicodeString; begin if not MatchSpaceDelimited(''), True) then ParseError('> expected'); except FreeAndNil(X); raise; end; Result := TxmlNotationDeclaration.Create(N, X); end; { [29] markupdecl ::= elementdecl | AttlistDecl | EntityDecl | } { NotationDecl | PI | Comment } function TxmlParser.ExtractMarkupDeclaration: AxmlType; begin Result := ExtractElementDeclaration; if Assigned(Result) then exit; Result := ExtractAttListDeclaration; if Assigned(Result) then exit; Result := ExtractEntityDeclaration; if Assigned(Result) then exit; Result := ExtractNotationDeclaration; if Assigned(Result) then exit; Result := ExtractProcessingInstruction; if Assigned(Result) then exit; if FReader.MatchRawByteStr('' } { [19] CDStart ::= '' } function TxmlParser.ExtractETag: AxmlType; var S : UnicodeString; begin FReader.Skip(1); if FReader.MatchRawByteStr('--', True, False) then Result := ExtractComment else if FReader.MatchRawByteStr('[CDATA[', True, False) then Result := ExtractCDATASection else begin S := FReader.ReadToRawByteChar(xmlSpace + ['[', '<', '>', '&']); if S = 'DOCTYPE' then Result := ExtractDTD else begin Result := nil; ParseError('Unrecognised ' } { [42] ETag ::= '' } { [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' } function TxmlParser.ExtractTag: AxmlType; var N : UnicodeString; EmptyTag, EndTag : Boolean; C : AxmlAttributeList; begin EndTag := FReader.MatchWideChar(WideChar('/'), True); N := ExtractName(True); C := ExtractAttributeList(N); try EmptyTag := FReader.MatchWideChar(WideChar('/'), True); if not FReader.MatchWideChar(WideChar('>'), True) then ParseError('> expected'); if EmptyTag and EndTag then ParseError('Invalid tag'); if EndTag then begin if Assigned(C) then ParseError('Attributes not allowed in end tag'); Result := CreateEndTag(N); end else if EmptyTag then Result := CreateEmptyElementTag(N, C) else Result := CreateStartTag(N, C); if Assigned(FOnTag) then FOnTag(self, Result); except FreeAndNil(C); raise; end; end; { Returns S, CharData, Reference, XMLDecl, PI, Comment, CDSect, STag, ETag, } { EmptyElemTag } procedure TxmlParser.GetNextToken; var Ch : WideChar; begin SkipSpace; if FReader.EOF then begin FToken := nil; exit; end; FToken := ExtractText(['<', '&']); if Assigned(FToken) then exit; Ch := FReader.PeekChar; if Ch = '&' then FToken := ExtractReference else begin Assert(Ch = '<', 'Unexpected character'); FReader.Skip(1); Ch := FReader.PeekChar; if Ch = '?' then FToken := ExtractQTag else if Ch = '!' then FToken := ExtractETag else FToken := ExtractTag; end; end; { [27] Misc ::= Comment | PI | S } function IsMiscToken(const Token: AxmlType): Boolean; begin Result := (Token is TxmlSpace) or (Token is TxmlComment) or (Token is TxmlProcessingInstruction); end; { [22] prolog ::= XMLDecl? Misc* (doctypedecl Misc*)? } function TxmlParser.ParseProlog: TxmlProlog; var DocTypeDecl : Boolean; FirstToken : Boolean; begin Result := nil; DocTypeDecl := False; FirstToken := True; try while IsMiscToken(FToken) or (FToken is TxmlXMLDecl) or (FToken is TxmlDocTypeDecl) do begin if FToken is TxmlXMLDecl then if not FirstToken then ParseError('XML Declaration must be first item in document prolog'); if FToken is TxmlDocTypeDecl then if DocTypeDecl then ParseError('Duplicate Document Type Declaration (DTD)') else DocTypeDecl := True; if FirstToken then begin Result := TxmlProlog.Create; FirstToken := False; end; Result.AddChild(FToken); GetNextToken; end; except FreeAndNil(Result); raise; end; end; { [39] element ::= EmptyElemTag | STag content ETag } { [43] content ::= (element | CharData | Reference | CDSect | } { PI | Comment)* } function TxmlParser.ParseElement: AxmlElement; var StartTag : TxmlStartTag; EndTag : TxmlEndTag; Content : TxmlElementContent; C : AxmlType; Closed : Boolean; begin if FToken is TxmlStartTag then begin StartTag := TxmlStartTag(FToken); GetNextToken; Closed := False; Content := nil; EndTag := nil; try repeat if FToken is TxmlEndTag then if TxmlEndTag(FToken).Name = StartTag.Name then Closed := True else ParseError('Close tag without matching open tag: Expected ') else begin if (FToken is TxmlStartTag) or (FToken is TxmlEmptyElementTag) then C := ParseElement else begin C := FToken; GetNextToken; end; if IsMiscToken(C) or (C is AxmlElement) or (C is TxmlCharData) or (C is AxmlReference) or (C is TxmlCDSect) then begin if not Assigned(Content) then Content := CreateElementContent(StartTag); Content.AddChild(C); end else ParseError('Closing tag expected'); end; until Closed; EndTag := TxmlEndTag(FToken); GetNextToken; except FreeAndNil(StartTag); FreeAndNil(EndTag); FreeAndNil(Content); raise; end; Result := CreateElement(StartTag, EndTag, Content); end else if FToken is TxmlEmptyElementTag then begin Result := CreateEmptyElement(TxmlEmptyElementTag(FToken)); GetNextToken; end else Result := nil; if Assigned(Result) then if Assigned(FOnElement) then FOnElement(self, Result); end; { [1] document ::= prolog element Misc* } function TxmlParser.ExtractDocument: TxmlDocument; var Prolog : TxmlProlog; RootElement : AxmlElement; begin if not Assigned(FReader) then ParseError('No xml text'); GetNextToken; Prolog := ParseProlog; RootElement := ParseElement; if not Assigned(RootElement) then begin FreeAndNil(Prolog); ParseError('Document has no root element'); end; Result := CreateDocument(Prolog, RootElement); try while IsMiscToken(FToken) do begin Result.AddChild(FToken); GetNextToken; end; if Assigned(FToken) then ParseError('Unexpected token'); except FreeAndNil(Result); raise; end; end; { } { Parse functions } { } function ParseXMLBuffer(const Buffer: Pointer; const Size: Integer): TxmlDocument; var P : TxmlParser; begin P := TxmlParser.Create; try P.SetBuffer(Buffer, Size); Result := P.ExtractDocument; finally FreeAndNil(P); end; end; function ParseXMLStringB(const S: RawByteString): TxmlDocument; begin Result := ParseXMLBuffer(Pointer(S), Length(S)); end; function ParseXMLStringU(const S: UnicodeString): TxmlDocument; var P : TxmlParser; begin P := TxmlParser.Create; try P.SetUnicodeReader(TUnicodeMemoryReader.Create(Pointer(S), Length(S) * SizeOf(WideChar), nil, True)); Result := P.ExtractDocument; finally FreeAndNil(P); end; end; function ParseXMLFile(const FileName: String): TxmlDocument; var P : TxmlParser; begin P := TxmlParser.Create; try P.SetFileName(FileName); Result := P.ExtractDocument; finally FreeAndNil(P); end; end; { } { Test cases } { } {$IFDEF XML_TEST} {$ASSERTIONS ON} procedure TestParser(const S: RawByteString); var D : TxmlDocument; begin D := ParseXMLStringB(S); try Assert(D.AsUTF8String([xmloNoFormatting], 0) = S, 'ParseXML'); finally D.Free; end; end; procedure Test; begin TestParser('Test'); TestParser('Test'); end; {$ENDIF} end.