{******************************************************************************} { } { Library: Fundamentals 5.00 - HTML Parser } { File name: flcHTMLParser.pas } { File version: 5.07 } { Description: HTML Parser } { } { Copyright: Copyright (c) 2000-2020, David J Butler } { All rights reserved. } { Redistribution and use in source and binary forms, with } { or without modification, are permitted provided that } { the following conditions are met: } { Redistributions of source code must retain the above } { copyright notice, this list of conditions and the } { following disclaimer. } { THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND } { CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED } { WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED } { WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A } { PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL } { THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, } { INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR } { CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, } { PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF } { USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) } { HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER } { IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING } { NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE } { USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE } { POSSIBILITY OF SUCH DAMAGE. } { } { Github: https://github.com/fundamentalslib } { E-mail: fundamentals.library at gmail.com } { } { Revision history: } { } { 2002/10/21 1.00 Initial version. } { 2002/10/29 1.01 Unicode support. } { 2002/10/30 1.02 Support for overlapping tags. } { 2002/11/02 1.03 Refactored. } { 2002/11/04 1.04 Optimizations. } { 2002/12/16 1.05 Improved white-space and line-break handling. } { 2015/04/11 1.06 UnicodeString changes. } { 2019/02/22 5.07 Revise for Fundamentals 5. } { } {******************************************************************************} {$INCLUDE flcHTML.inc} unit flcHTMLParser; interface uses { System } SysUtils, Classes, { Fundamentals } flcStdTypes, flcDataStructs, { HTML } flcHTMLElements, flcHTMLLexer, flcHTMLDocBase, flcHTMLDoc; { } { ThtmlParser } { } type ThtmlParser = class; // forward { ThtmlOpenElementInfo } { Structure to store a nested open element } ThtmlOpenElementInfo = class(TDoublyLinkedItem) protected FTagID : ThtmlTagID; FName : String; FElement : AhtmlElement; public property TagID: ThtmlTagID read FTagID write FTagID; property Name: String read FName write FName; property Element: AhtmlElement read FElement write FElement; end; { ThtmlParserState } { Class that represents the state of the HTML parser while parsing. } ThtmlParserState = class protected FParser : ThtmlParser; FDocument : ThtmlDocument; FElementList : TDoublyLinkedList; FFreeList : TDoublyLinkedList; FReopenList : TDoublyLinkedList; function GetOpenCount: Integer; function GetInnerItem: ThtmlOpenElementInfo; function GetInnerTagID: ThtmlTagID; function GetInnerContainer: AhtmlContainerObject; function GetOuterItem: ThtmlOpenElementInfo; function AddToContainer(const Container: AhtmlContainerObject; const Obj: AhtmlObject): Boolean; function AddToInnerContainer(const Obj: AhtmlObject): Boolean; procedure OpenElement(const Item: ThtmlOpenElementInfo); overload; procedure OpenElement(const TagID: ThtmlTagID; const Name: String; const Element: AhtmlElement); overload; procedure OpenAutoStartElements(const TagID: ThtmlTagID); function ReleaseElement: ThtmlOpenElementInfo; procedure CloseElementToFree; procedure CloseElementToReopen; procedure ReOpenElements; procedure CloseOverlapping(const OpenElement: ThtmlOpenElementInfo); procedure CloseElement(const OpenElement: ThtmlOpenElementInfo); function GetOpenElementForEndTag(const TagID: ThtmlTagID; const Name: String): ThtmlOpenElementInfo; function GetAutoCloseElementForStartTag( const TagID: ThtmlTagID): ThtmlOpenElementInfo; public constructor Create(const Parser: ThtmlParser); destructor Destroy; override; procedure Reset; property Parser: ThtmlParser read FParser; property Document: ThtmlDocument read FDocument; property OpenCount: Integer read GetOpenCount; property InnerItem: ThtmlOpenElementInfo read GetInnerItem; property InnerTagID: ThtmlTagID read GetInnerTagID; property InnerContainer: AhtmlContainerObject read GetInnerContainer; property OuterItem: ThtmlOpenElementInfo read GetOuterItem; function LocateTagID(const TagID: ThtmlTagID): ThtmlOpenElementInfo; function LocateName(const Name: String): ThtmlOpenElementInfo; function IsTagIDOpen(const TagID: ThtmlTagID): Boolean; function IsNameOpen(const Name: String): Boolean; function IsBodyOpen: Boolean; end; { ThtmlParser event types } ThtmlParserEvent = procedure (Sender: ThtmlParser) of object; ThtmlParserMessageType = ( hmWarning, hmError); ThtmlParserMessageEvent = procedure (Sender: ThtmlParser; MsgType: ThtmlParserMessageType; Msg: String) of object; ThtmlParserTokenEvent = procedure (Sender: ThtmlParser; TokenType: ThtmlTokenType) of object; ThtmlParserTokenStrEvent = procedure (Sender: ThtmlParser; TokenType: ThtmlTokenType; TokenStr: String) of object; ThtmlParserTextContext = (tcContent, tcAttributeValue); ThtmlParserTextEvent = procedure (Sender: ThtmlParser; TextContext: ThtmlParserTextContext; var Text: String) of object; ThtmlParserResolveEntityRefEvent = procedure (Sender: ThtmlParser; EntityRef: String; var Text: String; var Resolved: Boolean) of object; ThtmlParserStringEvent = procedure (Sender: ThtmlParser; Text: String) of object; ThtmlParserCommentEvent = procedure (Sender: ThtmlParser; Comments: array of String) of object; ThtmlParserDeclarationEvent = procedure (Sender: ThtmlParser; Name, Text: String) of object; ThtmlParserProcessingInstrEvent = procedure (Sender: ThtmlParser; Target, PI: String) of object; ThtmlParserStartTagEvent = procedure (Sender: ThtmlParser; TagID: ThtmlTagID; Name: String) of object; ThtmlParserStartTagAttributesEvent = procedure (Sender: ThtmlParser; TagID: ThtmlTagID; Name: String; AttributeNames, AttributeValues: array of String; Element: AhtmlElement) of object; ThtmlParserEndTagEvent = procedure (Sender: ThtmlParser; TagID: ThtmlTagID; Name: String) of object; ThtmlParserElementEvent = procedure (Sender: ThtmlParser; TagID: ThtmlTagID; Name: String; Element: AhtmlElement) of object; ThtmlParserDocumentObjectEvent = procedure (Sender: ThtmlParser; const Obj: AhtmlObject) of object; { ThtmlParser types } ThtmlParserOptions = set of ( poDisableNotifications, poStopOnError, poDontProduceDocument, poDontResolveReferences, poDontAllowOverlappedTags ); { ThtmlParser } ThtmlParser = class(TComponent) protected FOptions : ThtmlParserOptions; FRawText : RawByteString; FFileName : String; FEncoding : RawByteString; FOnMessage : ThtmlParserMessageEvent; FOnToken : ThtmlParserTokenEvent; FOnTokenStr : ThtmlParserTokenStrEvent; FOnResolveEntityRef : ThtmlParserResolveEntityRefEvent; FOnText : ThtmlParserTextEvent; FOnUnresolvedText : ThtmlParserTextEvent; FOnLineBreak : ThtmlParserEvent; FOnComment : ThtmlParserCommentEvent; FOnDeclaration : ThtmlParserDeclarationEvent; FOnMarkedSection : ThtmlParserStringEvent; FOnCDATA : ThtmlParserStringEvent; FOnProcessingInstr : ThtmlParserProcessingInstrEvent; FOnStartTag : ThtmlParserStartTagEvent; FOnStartTagAttributes : ThtmlParserStartTagAttributesEvent; FOnEndTag : ThtmlParserEndTagEvent; FOnElementOpen : ThtmlParserElementEvent; FOnElementClose : ThtmlParserElementEvent; FOnDocumentObject : ThtmlParserDocumentObjectEvent; FLexer : ThtmlLexer; FState : ThtmlParserState; FTokenType : ThtmlTokenType; procedure LogMessage(const MsgType: ThtmlParserMessageType; const Msg: String); procedure LogUnexpectedToken; procedure LogUnmatchedEndTag(const TagID: ThtmlTagID; const Name: String); procedure TriggerElementOpen(const Item: ThtmlOpenElementInfo); virtual; procedure TriggerElementClose(const Item: ThtmlOpenElementInfo); virtual; procedure TriggerDocumentObject(const Obj: AhtmlObject); virtual; function CreateText(const Context: ThtmlParserTextContext; var Text: String): AhtmlObject; virtual; function CreateCharRef(const CharVal: LongWord; const HasTrailer: Boolean): AhtmlObject; virtual; function CreateEntityRef(const EntityName: String; const HasTrailer: Boolean): AhtmlObject; virtual; function CreateLineBreak: AhtmlObject; virtual; function CreateEmptyComment: AhtmlObject; virtual; function CreateComment(const Comments: StringArray): AhtmlObject; virtual; function CreateDeclaration(const Name, Text: String): AhtmlObject; virtual; function CreateMarkedSection(const Text: String): AhtmlObject; virtual; function CreateCDATA(const CDATA: String): AhtmlObject; virtual; function CreatePI(const Target, PI: String): AhtmlObject; virtual; function CreateElement(const TagID: ThtmlTagID; const Name: String): AhtmlElement; virtual; function GetTokenStr: String; function GetNextToken: ThtmlTokenType; function ResolveRef(var Text: String): Boolean; function ParsePlainText: String; function ParseText(const TextContext: ThtmlParserTextContext; var Text: String): AhtmlObject; function ParseContentText: AhtmlObject; function ParseLineBreak: AhtmlObject; function ParseComment: AhtmlObject; function ParseDeclaration: AhtmlObject; function ParseMarkedSection: AhtmlObject; function ParseCDATA: AhtmlObject; function ParsePI: AhtmlObject; function ParseTagAttributeValue( const Attribute: ThtmlElementAttribute): String; function ParseTagAttribute(const TagID: ThtmlTagID; const TagName: String; const Element: AhtmlElement; var AttrID: ThtmlAttrID; var Name, Value: String): ThtmlElementAttribute; procedure ParseTagAttributes(const TagID: ThtmlTagID; const Name: String; const Element: AhtmlElement); function ParseStartTag: AhtmlObject; function ParseEndTag: AhtmlObject; function ParseToken: AhtmlObject; function GetDocument: ThtmlDocument; public constructor Create(AOwner: TComponent); override; destructor Destroy; override; property Options: ThtmlParserOptions read FOptions write FOptions; property RawText: RawByteString read FRawText write FRawText; property FileName: String read FFileName write FFileName; property Encoding: RawByteString read FEncoding write FEncoding; property OnMessage: ThtmlParserMessageEvent read FOnMessage write FOnMessage; property OnToken: ThtmlParserTokenEvent read FOnToken write FOnToken; property OnTokenStr: ThtmlParserTokenStrEvent read FOnTokenStr write FOnTokenStr; property OnResolveEntityRef: ThtmlParserResolveEntityRefEvent read FOnResolveEntityRef write FOnResolveEntityRef; property OnText: ThtmlParserTextEvent read FOnText write FOnText; property OnUnresolvedText: ThtmlParserTextEvent read FOnUnresolvedText write FOnUnresolvedText; property OnLineBreak: ThtmlParserEvent read FOnLineBreak write FOnLineBreak; property OnComment: ThtmlParserCommentEvent read FOnComment write FOnComment; property OnDeclaration: ThtmlParserDeclarationEvent read FOnDeclaration write FOnDeclaration; property OnMarkedSection: ThtmlParserStringEvent read FOnMarkedSection write FOnMarkedSection; property OnCDATA: ThtmlParserStringEvent read FOnCDATA write FOnCDATA; property OnProcessingInstr: ThtmlParserProcessingInstrEvent read FOnProcessingInstr write FOnProcessingInstr; property OnStartTag: ThtmlParserStartTagEvent read FOnStartTag write FOnStartTag; property OnStartTagAttributes: ThtmlParserStartTagAttributesEvent read FOnStartTagAttributes write FOnStartTagAttributes; property OnEndTag: ThtmlParserEndTagEvent read FOnEndTag write FOnEndTag; property OnElementOpen: ThtmlParserElementEvent read FOnElementOpen write FOnElementOpen; property OnElementClose: ThtmlParserElementEvent read FOnElementClose write FOnElementClose; property OnDocumentObject: ThtmlParserDocumentObjectEvent read FOnDocumentObject write FOnDocumentObject; function ParseDocument: ThtmlDocument; property State: ThtmlParserState read FState; property Document: ThtmlDocument read GetDocument; end; EhtmlParser = class(Exception); { } { TfclHTMLParser } { } type TfclHTMLParser = class (ThtmlParser) published property Options; property RawText; property FileName; property Encoding; property OnMessage; property OnToken; property OnTokenStr; property OnResolveEntityRef; property OnText; property OnUnresolvedText; property OnLineBreak; property OnComment; property OnDeclaration; property OnMarkedSection; property OnCDATA; property OnProcessingInstr; property OnStartTag; property OnStartTagAttributes; property OnEndTag; property OnElementOpen; property OnElementClose; property OnDocumentObject; end; { } { Test cases } { } {$IFDEF HTML_TEST} procedure Test; {$ENDIF} implementation uses { Fundamentals } flcUtils, flcDynArrays, flcStrings, flcUnicodeReader, { HTML } flcHTMLCharEntity, flcHTMLDocElements, flcHTMLReader; { } { ThtmlParserState } { } constructor ThtmlParserState.Create(const Parser: ThtmlParser); begin inherited Create; Assert(Assigned(Parser)); FParser := Parser; FElementList := TDoublyLinkedList.Create; FFreeList := TDoublyLinkedList.Create; FReopenList := TDoublyLinkedList.Create; end; destructor ThtmlParserState.Destroy; begin FreeAndNil(FDocument); FreeAndNil(FReopenList); FreeAndNil(FFreeList); FreeAndNil(FElementList); inherited Destroy; end; procedure ThtmlParserState.Reset; begin FreeAndNil(FDocument); FReopenList.DeleteList; FElementList.DeleteList; if not (poDontProduceDocument in FParser.FOptions) then FDocument := ThtmlDocument.Create; OpenElement(HTML_TAG_Document, '', nil); end; function ThtmlParserState.GetOpenCount: Integer; begin Result := FElementList.Count; end; function ThtmlParserState.GetInnerItem: ThtmlOpenElementInfo; begin Result := ThtmlOpenElementInfo(FElementList.Last); end; function ThtmlParserState.GetInnerTagID: ThtmlTagID; begin if not Assigned(FElementList.Last) then Result := HTML_TAG_None else Result := ThtmlOpenElementInfo(FElementList.Last).TagID; end; function ThtmlParserState.GetInnerContainer: AhtmlContainerObject; var I: ThtmlOpenElementInfo; begin I := GetInnerItem; if Assigned(I) then if I.TagID = HTML_TAG_Document then Result := FDocument else Result := I.Element else Result := FDocument; end; function ThtmlParserState.GetOuterItem: ThtmlOpenElementInfo; begin Result := ThtmlOpenElementInfo(FElementList.First); end; function ThtmlParserState.AddToContainer(const Container: AhtmlContainerObject; const Obj: AhtmlObject): Boolean; begin Result := False; if not Assigned(Obj) then exit; if Assigned(Container) then begin Container.AddItem(Obj); Result := True; end; FParser.TriggerDocumentObject(Obj); if not Assigned(Container) then Obj.Free; end; function ThtmlParserState.AddToInnerContainer(const Obj: AhtmlObject): Boolean; begin Result := AddToContainer(GetInnerContainer, Obj); end; function ThtmlParserState.LocateTagID(const TagID: ThtmlTagID): ThtmlOpenElementInfo; begin Result := ThtmlOpenElementInfo(FElementList.Last); while Assigned(Result) do if TagID = Result.TagID then exit else Result := ThtmlOpenElementInfo(Result.Prev); end; function ThtmlParserState.LocateName(const Name: String): ThtmlOpenElementInfo; begin Result := ThtmlOpenElementInfo(FElementList.Last); while Assigned(Result) do if StrEqualNoAsciiCaseU(Name, Result.Name) then exit else Result := ThtmlOpenElementInfo(Result.Prev); end; function ThtmlParserState.IsTagIDOpen(const TagID: ThtmlTagID): Boolean; begin Result := Assigned(LocateTagID(TagID));; end; function ThtmlParserState.IsNameOpen(const Name: String): Boolean; begin Result := Assigned(LocateName(Name)); end; function ThtmlParserState.IsBodyOpen: Boolean; var I: ThtmlOpenElementInfo; begin I := ThtmlOpenElementInfo(FElementList.First); Result := False; while Assigned(I) do if I.TagID = HTML_TAG_HTML then begin Result := True; break; end else I := ThtmlOpenElementInfo(I.Next); if not Result then exit; Result := False; I := ThtmlOpenElementInfo(I.Next); while Assigned(I) do if I.TagID = HTML_TAG_BODY then begin Result := True; break; end else I := ThtmlOpenElementInfo(I.Next); end; procedure ThtmlParserState.OpenElement(const Item: ThtmlOpenElementInfo); begin Assert(Assigned(Item), 'Assigned(Item)'); FElementList.Append(Item); FParser.TriggerElementOpen(Item); end; procedure ThtmlParserState.OpenElement(const TagID: ThtmlTagID; const Name: String; const Element: AhtmlElement); var Item: ThtmlOpenElementInfo; begin Item := ThtmlOpenElementInfo(FFreeList.RemoveFirst); if not Assigned(Item) then Item := ThtmlOpenElementInfo.Create; Item.TagID := TagID; Item.Name := Name; Item.Element := Element; OpenElement(Item); end; procedure ThtmlParserState.OpenAutoStartElements(const TagID: ThtmlTagID); var Item : ThtmlOpenElementInfo; NewTag, Tag : ThtmlTagID; Element : AhtmlElement; Container : AhtmlContainerObject; begin Item := InnerItem; while Assigned(Item) do begin if TagID = Item.TagID then break; NewTag := htmlAutoOpenTag(TagID, Item.TagID); if NewTag <> HTML_TAG_None then begin if NewTag = Item.TagID then break; // no auto-start element if Item.TagID = HTML_TAG_Document then Container := FDocument else Container := Item.Element; repeat // create auto-start element Element := FParser.CreateElement(NewTag, ''); if Assigned(Element) then if not AddToContainer(Container, Element) then Element := nil; OpenElement(NewTag, '', Element); // next Tag := htmlAutoOpenTag(TagID, NewTag); if (Tag = HTML_TAG_None) or (Tag = NewTag) then break; Container := Element; NewTag := Tag; until False; break; end else Item := ThtmlOpenElementInfo(Item.Prev); end; end; function ThtmlParserState.ReleaseElement: ThtmlOpenElementInfo; begin Result := ThtmlOpenElementInfo(FElementList.Last); if not Assigned(Result) then exit; FParser.TriggerElementClose(Result); FElementList.RemoveLast; end; procedure ThtmlParserState.CloseElementToFree; var Item: ThtmlOpenElementInfo; begin Item := ReleaseElement; if not Assigned(Item) then exit; FFreeList.Append(Item); end; procedure ThtmlParserState.CloseElementToReOpen; var Item: ThtmlOpenElementInfo; begin Item := ReleaseElement; if not Assigned(Item) then exit; FReopenList.Append(Item); end; procedure ThtmlParserState.ReOpenElements; var Container : AhtmlContainerObject; Item : ThtmlOpenElementInfo; begin if FReopenList.IsEmpty then exit; Container := InnerContainer; repeat Item := ThtmlOpenElementInfo(FReopenList.RemoveLast); if not Assigned(Item) then break; if Assigned(Item.Element) then begin Item.Element := Item.Element.DuplicateElement; if not AddToContainer(Container, Item.Element) then Item.Element := nil; Container := Item.Element; end else Container := nil; OpenElement(Item); until False; end; procedure ThtmlParserState.CloseOverlapping (const OpenElement: ThtmlOpenElementInfo); var Item: ThtmlOpenElementInfo; begin // Close open overlapping tags up to, but excluding, OpenElement if poDontAllowOverlappedTags in FParser.FOptions then // Strict: Close all open overlapped elements repeat Item := InnerItem; if OpenElement = Item then break; CloseElementToFree; until False else // Overlaps allowed: Close certian overlapping elements repeat Item := InnerItem; if OpenElement = Item then break; if not htmlDoesCloseTagCloseOpenTag(OpenElement.TagID, Item.TagID) then CloseElementToReopen else CloseElementToFree; until False; end; procedure ThtmlParserState.CloseElement(const OpenElement: ThtmlOpenElementInfo); begin CloseOverlapping(OpenElement); CloseElementToFree; ReOpenElements; end; function ThtmlParserState.GetOpenElementForEndTag(const TagID: ThtmlTagID; const Name: String): ThtmlOpenElementInfo; begin Result := InnerItem; while Assigned(Result) do begin if htmlIsSameTag(TagID, Name, Result.TagID, Result.Name) then // found matching open tag exit else if poDontAllowOverlappedTags in FParser.FOptions then begin if not htmlIsElementEndTagOptional(Result.TagID) then begin Result := nil; // strict: overlapping closes only allowed exit; // through elements with optional end tags end; end else if not htmlDoesCloseTagCloseOutside(TagID, Result.TagID) then begin Result := nil; // tag does not close through this level exit; end; Result := ThtmlOpenElementInfo(Result.Prev); end; Result := nil; // no matching open tag found end; function ThtmlParserState.GetAutoCloseElementForStartTag( const TagID: ThtmlTagID): ThtmlOpenElementInfo; begin Result := InnerItem; while Assigned(Result) do begin // 1. check if auto-close element if TagID = Result.TagID then begin if htmlIsElementEndTagOptional(TagID) then // close matching open tag with optional end tag exit; // no further searching Result := nil; exit; end else if htmlDoesOpenTagAutoCloseOpenTag(TagID, Result.TagID) then // found exit else // 2. check if iteration should continue outside if poDontAllowOverlappedTags in FParser.FOptions then begin if not htmlIsElementEndTagOptional(Result.TagID) then begin Result := nil; // strict: overlapping closes only allowed exit; // through elements with optional end tags end; end else if not htmlDoesOpenTagAutoCloseOutside(TagID, Result.TagID) then begin Result := nil; // tag does not close through this level exit; end; // 3. next open level Result := ThtmlOpenElementInfo(Result.Prev); end; Result := nil; // no matching open tag found end; { } { ThtmlParser } { } constructor ThtmlParser.Create(AOwner: TComponent); begin inherited Create(AOwner); FOptions := []; FState := ThtmlParserState.Create(self); end; destructor ThtmlParser.Destroy; begin FreeAndNil(FLexer); FreeAndNil(FState); inherited Destroy; end; { Messages } procedure ThtmlParser.LogMessage(const MsgType: ThtmlParserMessageType; const Msg: String); begin if not (poDisableNotifications in FOptions) and Assigned(FOnMessage) then FOnMessage(self, MsgType, Msg); if (MsgType = hmError) and (poStopOnError in FOptions) then raise EhtmlParser.Create('Parse error: ' + Msg); end; procedure ThtmlParser.LogUnexpectedToken; begin LogMessage(hmError, 'Unexpected token (' + FLexer.TokenTypeDescription + ')'); end; procedure ThtmlParser.LogUnmatchedEndTag(const TagID: ThtmlTagID; const Name: String); var S: String; begin S := Name; if (S = '') and (TagID <> HTML_TAG_None) then S := htmlGetTagName(TagID); LogMessage(hmError, 'Unmatched end tag (' + ToStringU(S) + ')'); end; { Triggers } procedure ThtmlParser.TriggerElementOpen(const Item: ThtmlOpenElementInfo); begin if not (poDisableNotifications in FOptions) and Assigned(FOnElementOpen) then with Item do FOnElementOpen(self, TagID, Name, Element); end; procedure ThtmlParser.TriggerElementClose(const Item: ThtmlOpenElementInfo); begin if not (poDisableNotifications in FOptions) and Assigned(FOnElementClose) then with Item do FOnElementClose(self, TagID, Name, Element); end; procedure ThtmlParser.TriggerDocumentObject(const Obj: AhtmlObject); begin if not (poDisableNotifications in FOptions) then begin if Assigned(FOnDocumentObject) then FOnDocumentObject(self, Obj); end; end; { Create } function ThtmlParser.CreateText(const Context: ThtmlParserTextContext; var Text: String): AhtmlObject; begin if not (poDisableNotifications in FOptions) and Assigned(FOnText) then FOnText(self, Context, Text); if poDontProduceDocument in FOptions then Result := nil else Result := ThtmlText.Create(Text); end; function ThtmlParser.CreateCharRef(const CharVal: LongWord; const HasTrailer: Boolean): AhtmlObject; begin if poDontProduceDocument in FOptions then Result := nil else Result := ThtmlCharRef.Create(CharVal, HasTrailer); end; function ThtmlParser.CreateEntityRef(const EntityName: String; const HasTrailer: Boolean): AhtmlObject; begin if poDontProduceDocument in FOptions then Result := nil else Result := ThtmlEntityRef.Create(EntityName, HasTrailer); end; function ThtmlParser.CreateLineBreak: AhtmlObject; begin if not (poDisableNotifications in FOptions) and Assigned(FOnLineBreak) then FOnLineBreak(self); if poDontProduceDocument in FOptions then Result := nil else Result := ThtmlLineBreak.Create; end; function ThtmlParser.CreateEmptyComment: AhtmlObject; begin if poDontProduceDocument in FOptions then Result := nil else Result := ThtmlEmptyComment.Create; end; function ThtmlParser.CreateComment(const Comments: StringArray): AhtmlObject; begin if not (poDisableNotifications in FOptions) and Assigned(FOnComment) then FOnComment(self, Comments); if poDontProduceDocument in FOptions then Result := nil else Result := ThtmlComment.Create(Comments); end; function ThtmlParser.CreateDeclaration(const Name, Text: String): AhtmlObject; begin if not (poDisableNotifications in FOptions) and Assigned(FOnDeclaration) then FOnDeclaration(self, Name, Text); if poDontProduceDocument in FOptions then Result := nil else Result := ThtmlRawTag.Create('!' + Name + iifU(Text <> '', ' ', '') + Text); end; function ThtmlParser.CreateMarkedSection(const Text: String): AhtmlObject; begin if not (poDisableNotifications in FOptions) and Assigned(FOnMarkedSection) then FOnMarkedSection(self, Text); if poDontProduceDocument in FOptions then Result := nil else Result := ThtmlRawTag.Create('![' + Text + ']'); end; function ThtmlParser.CreateCDATA(const CDATA: String): AhtmlObject; begin if not (poDisableNotifications in FOptions) and Assigned(FOnCDATA) then FOnCDATA(self, CDATA); if poDontProduceDocument in FOptions then Result := nil else Result := ThtmlCDATA.Create(CDATA); end; function ThtmlParser.CreatePI(const Target, PI: String): AhtmlObject; begin if not (poDisableNotifications in FOptions) and Assigned(FOnProcessingInstr) then FOnProcessingInstr(self, Target, PI); if poDontProduceDocument in FOptions then Result := nil else Result := ThtmlPI.Create(Target, PI); end; function ThtmlParser.CreateElement(const TagID: ThtmlTagID; const Name: String): AhtmlElement; var C: AhtmlContainerObject; begin if poDontProduceDocument in FOptions then Result := nil else begin C := FState.GetInnerContainer; if Assigned(C) then Result := C.CreateItem(Ord(TagID), Name) as AhtmlElement else Result := htmlCreateElement(TagID, Name); end; end; { Token } function ThtmlParser.GetTokenStr: String; begin Result := FLexer.TokenStr; end; function ThtmlParser.GetNextToken: ThtmlTokenType; begin if not (poDisableNotifications in FOptions) and (FTokenType <> htNone) then begin // note: notify token After it has been examined if Assigned(FOnToken) then FOnToken(self, FTokenType); if Assigned(FOnTokenStr) then FOnTokenStr(self, FTokenType, FLexer.TokenStr); end; // get next token from lexer FTokenType := FLexer.GetNextToken; // return token type Result := FTokenType; end; function ThtmlParser.ResolveRef(var Text: String): Boolean; begin Assert(FTokenType in htmlTokensRef, 'FTokenType in htmlTokensRef'); Text := ''; if poDontResolveReferences in FOptions then Result := False else case FTokenType of htEntityRef: begin Result := FLexer.ResolveEntityRef(Text); if not (poDisableNotifications in FOptions) and Assigned(FOnResolveEntityRef) then FOnResolveEntityRef(self, GetTokenStr, Text, Result); end; htCharRef, htCharRefHex: Result := FLexer.ResolveCharRef(Text); else Result := False; end; end; { Parse } function ThtmlParser.ParsePlainText: String; var T: String; begin Assert(FTokenType in htmlTokensText, 'FTokenType in htmlTokensText'); Result := ''; repeat case FTokenType of htText: Result := Result + GetTokenStr; htCharRef, htCharRefHex, htEntityRef: if ResolveRef(T) then Result := Result + T else break; else break; end; GetNextToken; until False; end; function ThtmlParser.ParseText(const TextContext: ThtmlParserTextContext; var Text: String): AhtmlObject; var CharVal : LongWord; S : String; R : Boolean; begin // extract resolved text Assert(FTokenType in htmlTokensText, 'FTokenType in htmlTokensText'); Text := ParsePlainText; if Text <> '' then Result := CreateText(TextContext, Text) else // extract unresolved entity reference case FTokenType of htCharRefHex, htCharRef: begin CharVal := FLexer.CharRefValue; Text := htmlCharRef(CharVal, FTokenType = htCharRefHex); GetNextToken; R := (FTokenType = htRefTrailer); if R then GetNextToken; Result := CreateCharRef(CharVal, R); if not (poDisableNotifications in FOptions) and Assigned(FOnUnresolvedText) then FOnUnresolvedText(self, TextContext, Text); end; htEntityRef: begin S := GetTokenStr; Text := '&' + S + ';'; GetNextToken; R := (FTokenType = htRefTrailer); if R then GetNextToken; Result := CreateEntityRef(S, R); if not (poDisableNotifications in FOptions) and Assigned(FOnUnresolvedText) then FOnUnresolvedText(self, TextContext, Text); end; else Result := nil; end; end; function ThtmlParser.ParseContentText: AhtmlObject; var T: String; begin Result := ParseText(tcContent, T); end; function ThtmlParser.ParseLineBreak: AhtmlObject; begin Assert(FTokenType = htLineBreak, 'FTokenType = htLineBreak'); Result := CreateLineBreak; GetNextToken; end; function ThtmlParser.ParseComment: AhtmlObject; var S: StringArray; begin Assert(FTokenType in htmlTokensComment, 'FTokenType in htmlTokensComment'); case FTokenType of htEmptyComment: begin GetNextToken; Result := CreateEmptyComment; end; htComment: begin S := nil; repeat DynArrayAppend(S, GetTokenStr); until GetNextToken <> htComment; if FTokenType = htCommentEnd then GetNextToken else LogUnexpectedToken; Result := CreateComment(S); end; else Result := nil; end; end; function ThtmlParser.ParseDeclaration: AhtmlObject; var Name, Text: String; begin Assert(FTokenType = htDeclaration, 'FTokenType = htDeclaration'); Name := GetTokenStr; if GetNextToken = htDeclarationText then begin Text := GetTokenStr; GetNextToken; end else Text := ''; Result := CreateDeclaration(Name, Text); end; function ThtmlParser.ParseMarkedSection: AhtmlObject; var S: String; begin Assert(FTokenType = htMarkedSection, 'FTokenType = htMarkedSection'); S := GetTokenStr; GetNextToken; Result := CreateMarkedSection(S); end; function ThtmlParser.ParseCDATA: AhtmlObject; var S: String; begin Assert(FTokenType = htCDATA, 'FTokenType = htCDATA'); S := GetTokenStr; GetNextToken; Result := CreateCDATA(S); end; function ThtmlParser.ParsePI: AhtmlObject; var Target, PI: String; begin Assert(FTokenType = htPITarget, 'FTokenType = htPITarget'); Target := GetTokenStr; if GetNextToken = htPI then begin PI := GetTokenStr; GetNextToken; Result := CreatePI(Target, PI); end else Result := CreatePI(Target, ''); end; function ThtmlParser.ParseTagAttributeValue( const Attribute: ThtmlElementAttribute): String; var Obj : AhtmlObject; Text : String; begin Assert(FTokenType = htTagAttrValueStart, 'FTokenType = htTagAttrValueStart'); GetNextToken; Result := ''; while FTokenType in htmlTokensTextInclLineBreak do begin Obj := ParseText(tcAttributeValue, Text); Result := Result + Text; if Assigned(Attribute) and Assigned(Obj) then Attribute.AddItem(Obj); end; if FTokenType = htTagAttrValueEnd then GetNextToken else LogUnexpectedToken; end; function ThtmlParser.ParseTagAttribute(const TagID: ThtmlTagID; const TagName: String; const Element: AhtmlElement; var AttrID: ThtmlAttrID; var Name, Value: String): ThtmlElementAttribute; begin Assert(FTokenType = htTagAttrName, 'FTokenType = htTagAttrName'); AttrID := FLexer.AttrID; if AttrID = HTML_ATTR_None then Name := GetTokenStr else Name := ''; GetNextToken; if poDontProduceDocument in FOptions then Result := nil else Result := ThtmlElementAttribute.Create(AttrID, Name); // Parse value if FTokenType = htTagAttrValueStart then Value := ParseTagAttributeValue(Result) else Value := ''; end; procedure ThtmlParser.ParseTagAttributes(const TagID: ThtmlTagID; const Name: String; const Element: AhtmlElement); var Attr : ThtmlElementAttribute; AttrID : ThtmlAttrID; AttrName : String; Value : String; Names : StringArray; Values : StringArray; Notify : Boolean; begin Notify := not (poDisableNotifications in FOptions) and Assigned(FOnStartTagAttributes); repeat case FTokenType of htTagAttrName: begin Attr := ParseTagAttribute(TagID, Name, Element, AttrID, AttrName, Value); if Assigned(Element) and Assigned(Attr) then Element.Attributes.AddItem(Attr); if Notify then begin if (AttrName = '') and (AttrID <> HTML_ATTR_None) then AttrName := htmlGetAttrName(AttrID); DynArrayAppend(Names, AttrName); DynArrayAppend(Values, Value); end; end; htTagAttrValueStart: repeat LogUnexpectedToken; GetNextToken; until not (FTokenType in htmlTokensText); htTagAttrValueEnd: begin LogUnexpectedToken; GetNextToken; end; else break; end; until False; if Notify then FOnStartTagAttributes(self, TagID, Name, Names, Values, Element); end; function ThtmlParser.ParseStartTag: AhtmlObject; var TagID : ThtmlTagID; Name : String; Element : AhtmlElement; Item : ThtmlOpenElementInfo; ContentTag : Boolean; begin Assert(FTokenType = htStartTag, 'FTokenType = htStartTag'); // get tag name TagID := FLexer.TagID; Name := GetTokenStr; GetNextToken; if not (poDisableNotifications in FOptions) and Assigned(FOnStartTag) then FOnStartTag(self, TagID, Name); // auto-close nesting levels Item := FState.GetAutoCloseElementForStartTag(TagID); if Assigned(Item) then FState.CloseElement(Item); // auto-close // auto-start nesting levels FState.OpenAutoStartElements(TagID); // create element Element := CreateElement(TagID, Name); // parse rest of tag ParseTagAttributes(TagID, Name, Element); // set tag type if FTokenType = htEmptyTag then // empty tag begin if Assigned(Element) then Element.TagType := ttEmptyTag; ContentTag := False; GetNextToken; end else if htmlIsEmptyElement(TagID) then // required to be empty begin if Assigned(Element) then Element.TagType := ttStartTag; ContentTag := False; end else begin // content tag if Assigned(Element) then Element.TagType := ttContentTags; ContentTag := True; end; // update nesting levels if not FState.AddToInnerContainer(Element) then Element := nil; if ContentTag then // new nest level FState.OpenElement(TagID, Name, Element); // object added to container Result := nil; end; function ThtmlParser.ParseEndTag: AhtmlObject; var TagID : ThtmlTagID; Item : ThtmlOpenElementInfo; Name : String; begin // parse tag TagID := FLexer.TagID; if TagID = HTML_TAG_None then Name := GetTokenStr else Name := ''; GetNextToken; // notify if not (poDisableNotifications in FOptions) and Assigned(FOnEndTag) then FOnEndTag(self, TagID, Name); // close element Item := FState.GetOpenElementForEndTag(TagID, Name); if Assigned(Item) then FState.CloseElement(Item) else LogUnmatchedEndTag(TagID, Name); // object in container Result := nil; end; function ThtmlParser.ParseToken: AhtmlObject; begin case FTokenType of htText, htEntityRef, htCharRef, htCharRefHex : Result := ParseContentText; htStartTag : Result := ParseStartTag; htEndTag : Result := ParseEndTag; htLineBreak : Result := ParseLineBreak; htEmptyComment, htComment : Result := ParseComment; htMarkedSection : Result := ParseMarkedSection; htCDATA : Result := ParseCDATA; htPITarget : Result := ParsePI; htDeclaration : Result := ParseDeclaration; htEOF : Result := nil; else begin LogUnexpectedToken; GetNextToken; Result := nil; end; end; end; function ThtmlParser.ParseDocument: ThtmlDocument; var Reader: TUnicodeReader; begin // Initialize FreeAndNil(FLexer); if FFileName <> '' then Reader := htmlGetDocumentReaderForFile(FFileName, FEncoding) else Reader := htmlGetDocumentReaderForRawString(FRawText, FEncoding); FLexer := ThtmlLexer.Create(Reader, True); FLexer.NoLineBreakToken := False; FState.Reset; // Parse FTokenType := htNone; GetNextToken; while FTokenType <> htEOF do FState.AddToInnerContainer(ParseToken); // Success FreeAndNil(FLexer); Result := FState.Document; if Assigned(Result) then Result.PrepareStructure; end; function ThtmlParser.GetDocument: ThtmlDocument; begin Assert(Assigned(FState)); Result := FState.Document; end; { } { Test cases } { } {$IFDEF HTML_TEST} {$ASSERTIONS ON} procedure Test; var P : ThtmlParser; procedure TestHTML(const RawHTML: RawByteString); var D : ThtmlDocument; begin P.RawText := RawHTML; D := P.ParseDocument; Assert(D.HTMLText = UTF8ToString(RawHTML)); end; begin P := ThtmlParser.Create(nil); try TestHTML('Test'); TestHTML('<_Test1>Test<_Test2>'); TestHTML('<_Test1/>Test<_Test2/>'); finally P.Free; end; end; {$ENDIF} end.