1391 lines
44 KiB
ObjectPascal
1391 lines
44 KiB
ObjectPascal
{******************************************************************************}
|
|
{ }
|
|
{ Library: Fundamentals 5.00 - HTML Parser }
|
|
{ File name: flcHTMLParser.pas }
|
|
{ File version: 5.07 }
|
|
{ Description: HTML Parser }
|
|
{ }
|
|
{ Copyright: Copyright (c) 2000-2020, David J Butler }
|
|
{ All rights reserved. }
|
|
{ Redistribution and use in source and binary forms, with }
|
|
{ or without modification, are permitted provided that }
|
|
{ the following conditions are met: }
|
|
{ Redistributions of source code must retain the above }
|
|
{ copyright notice, this list of conditions and the }
|
|
{ following disclaimer. }
|
|
{ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND }
|
|
{ CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED }
|
|
{ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED }
|
|
{ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A }
|
|
{ PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL }
|
|
{ THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, }
|
|
{ INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR }
|
|
{ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, }
|
|
{ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF }
|
|
{ USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) }
|
|
{ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER }
|
|
{ IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING }
|
|
{ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE }
|
|
{ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE }
|
|
{ POSSIBILITY OF SUCH DAMAGE. }
|
|
{ }
|
|
{ Github: https://github.com/fundamentalslib }
|
|
{ E-mail: fundamentals.library at gmail.com }
|
|
{ }
|
|
{ Revision history: }
|
|
{ }
|
|
{ 2002/10/21 1.00 Initial version. }
|
|
{ 2002/10/29 1.01 Unicode support. }
|
|
{ 2002/10/30 1.02 Support for overlapping tags. }
|
|
{ 2002/11/02 1.03 Refactored. }
|
|
{ 2002/11/04 1.04 Optimizations. }
|
|
{ 2002/12/16 1.05 Improved white-space and line-break handling. }
|
|
{ 2015/04/11 1.06 UnicodeString changes. }
|
|
{ 2019/02/22 5.07 Revise for Fundamentals 5. }
|
|
{ }
|
|
{******************************************************************************}
|
|
|
|
{$INCLUDE flcHTML.inc}
|
|
|
|
unit flcHTMLParser;
|
|
|
|
interface
|
|
|
|
uses
|
|
{ System }
|
|
SysUtils,
|
|
Classes,
|
|
|
|
{ Fundamentals }
|
|
flcStdTypes,
|
|
flcDataStructs,
|
|
|
|
{ HTML }
|
|
flcHTMLElements,
|
|
flcHTMLLexer,
|
|
flcHTMLDocBase,
|
|
flcHTMLDoc;
|
|
|
|
|
|
|
|
{ }
|
|
{ ThtmlParser }
|
|
{ }
|
|
type
|
|
ThtmlParser = class; // forward
|
|
|
|
{ ThtmlOpenElementInfo }
|
|
{ Structure to store a nested open element }
|
|
ThtmlOpenElementInfo = class(TDoublyLinkedItem)
|
|
protected
|
|
FTagID : ThtmlTagID;
|
|
FName : String;
|
|
FElement : AhtmlElement;
|
|
|
|
public
|
|
property TagID: ThtmlTagID read FTagID write FTagID;
|
|
property Name: String read FName write FName;
|
|
property Element: AhtmlElement read FElement write FElement;
|
|
end;
|
|
|
|
{ ThtmlParserState }
|
|
{ Class that represents the state of the HTML parser while parsing. }
|
|
ThtmlParserState = class
|
|
protected
|
|
FParser : ThtmlParser;
|
|
FDocument : ThtmlDocument;
|
|
FElementList : TDoublyLinkedList;
|
|
FFreeList : TDoublyLinkedList;
|
|
FReopenList : TDoublyLinkedList;
|
|
|
|
function GetOpenCount: Integer;
|
|
function GetInnerItem: ThtmlOpenElementInfo;
|
|
function GetInnerTagID: ThtmlTagID;
|
|
function GetInnerContainer: AhtmlContainerObject;
|
|
function GetOuterItem: ThtmlOpenElementInfo;
|
|
|
|
function AddToContainer(const Container: AhtmlContainerObject;
|
|
const Obj: AhtmlObject): Boolean;
|
|
function AddToInnerContainer(const Obj: AhtmlObject): Boolean;
|
|
|
|
procedure OpenElement(const Item: ThtmlOpenElementInfo); overload;
|
|
procedure OpenElement(const TagID: ThtmlTagID; const Name: String;
|
|
const Element: AhtmlElement); overload;
|
|
procedure OpenAutoStartElements(const TagID: ThtmlTagID);
|
|
|
|
function ReleaseElement: ThtmlOpenElementInfo;
|
|
procedure CloseElementToFree;
|
|
procedure CloseElementToReopen;
|
|
procedure ReOpenElements;
|
|
|
|
procedure CloseOverlapping(const OpenElement: ThtmlOpenElementInfo);
|
|
procedure CloseElement(const OpenElement: ThtmlOpenElementInfo);
|
|
function GetOpenElementForEndTag(const TagID: ThtmlTagID;
|
|
const Name: String): ThtmlOpenElementInfo;
|
|
function GetAutoCloseElementForStartTag(
|
|
const TagID: ThtmlTagID): ThtmlOpenElementInfo;
|
|
|
|
public
|
|
constructor Create(const Parser: ThtmlParser);
|
|
destructor Destroy; override;
|
|
|
|
procedure Reset;
|
|
|
|
property Parser: ThtmlParser read FParser;
|
|
property Document: ThtmlDocument read FDocument;
|
|
|
|
property OpenCount: Integer read GetOpenCount;
|
|
property InnerItem: ThtmlOpenElementInfo read GetInnerItem;
|
|
property InnerTagID: ThtmlTagID read GetInnerTagID;
|
|
property InnerContainer: AhtmlContainerObject read GetInnerContainer;
|
|
property OuterItem: ThtmlOpenElementInfo read GetOuterItem;
|
|
|
|
function LocateTagID(const TagID: ThtmlTagID): ThtmlOpenElementInfo;
|
|
function LocateName(const Name: String): ThtmlOpenElementInfo;
|
|
function IsTagIDOpen(const TagID: ThtmlTagID): Boolean;
|
|
function IsNameOpen(const Name: String): Boolean;
|
|
function IsBodyOpen: Boolean;
|
|
end;
|
|
|
|
{ ThtmlParser event types }
|
|
ThtmlParserEvent = procedure (Sender: ThtmlParser) of object;
|
|
|
|
ThtmlParserMessageType = (
|
|
hmWarning,
|
|
hmError);
|
|
ThtmlParserMessageEvent = procedure (Sender: ThtmlParser;
|
|
MsgType: ThtmlParserMessageType; Msg: String) of object;
|
|
|
|
ThtmlParserTokenEvent = procedure (Sender: ThtmlParser;
|
|
TokenType: ThtmlTokenType) of object;
|
|
ThtmlParserTokenStrEvent = procedure (Sender: ThtmlParser;
|
|
TokenType: ThtmlTokenType; TokenStr: String) of object;
|
|
|
|
ThtmlParserTextContext = (tcContent, tcAttributeValue);
|
|
ThtmlParserTextEvent = procedure (Sender: ThtmlParser;
|
|
TextContext: ThtmlParserTextContext; var Text: String) of object;
|
|
ThtmlParserResolveEntityRefEvent = procedure (Sender: ThtmlParser;
|
|
EntityRef: String;
|
|
var Text: String; var Resolved: Boolean) of object;
|
|
|
|
ThtmlParserStringEvent = procedure (Sender: ThtmlParser;
|
|
Text: String) of object;
|
|
ThtmlParserCommentEvent = procedure (Sender: ThtmlParser;
|
|
Comments: array of String) of object;
|
|
ThtmlParserDeclarationEvent = procedure (Sender: ThtmlParser;
|
|
Name, Text: String) of object;
|
|
ThtmlParserProcessingInstrEvent = procedure (Sender: ThtmlParser;
|
|
Target, PI: String) of object;
|
|
|
|
ThtmlParserStartTagEvent = procedure (Sender: ThtmlParser;
|
|
TagID: ThtmlTagID; Name: String) of object;
|
|
ThtmlParserStartTagAttributesEvent = procedure (Sender: ThtmlParser;
|
|
TagID: ThtmlTagID; Name: String;
|
|
AttributeNames, AttributeValues: array of String;
|
|
Element: AhtmlElement) of object;
|
|
ThtmlParserEndTagEvent = procedure (Sender: ThtmlParser;
|
|
TagID: ThtmlTagID; Name: String) of object;
|
|
|
|
ThtmlParserElementEvent = procedure (Sender: ThtmlParser;
|
|
TagID: ThtmlTagID; Name: String; Element: AhtmlElement) of object;
|
|
|
|
ThtmlParserDocumentObjectEvent = procedure (Sender: ThtmlParser;
|
|
const Obj: AhtmlObject) of object;
|
|
|
|
{ ThtmlParser types }
|
|
ThtmlParserOptions = set of (
|
|
poDisableNotifications,
|
|
poStopOnError,
|
|
poDontProduceDocument,
|
|
poDontResolveReferences,
|
|
poDontAllowOverlappedTags
|
|
);
|
|
|
|
{ ThtmlParser }
|
|
ThtmlParser = class(TComponent)
|
|
protected
|
|
FOptions : ThtmlParserOptions;
|
|
FRawText : RawByteString;
|
|
FFileName : String;
|
|
FEncoding : RawByteString;
|
|
FOnMessage : ThtmlParserMessageEvent;
|
|
FOnToken : ThtmlParserTokenEvent;
|
|
FOnTokenStr : ThtmlParserTokenStrEvent;
|
|
FOnResolveEntityRef : ThtmlParserResolveEntityRefEvent;
|
|
FOnText : ThtmlParserTextEvent;
|
|
FOnUnresolvedText : ThtmlParserTextEvent;
|
|
FOnLineBreak : ThtmlParserEvent;
|
|
FOnComment : ThtmlParserCommentEvent;
|
|
FOnDeclaration : ThtmlParserDeclarationEvent;
|
|
FOnMarkedSection : ThtmlParserStringEvent;
|
|
FOnCDATA : ThtmlParserStringEvent;
|
|
FOnProcessingInstr : ThtmlParserProcessingInstrEvent;
|
|
FOnStartTag : ThtmlParserStartTagEvent;
|
|
FOnStartTagAttributes : ThtmlParserStartTagAttributesEvent;
|
|
FOnEndTag : ThtmlParserEndTagEvent;
|
|
FOnElementOpen : ThtmlParserElementEvent;
|
|
FOnElementClose : ThtmlParserElementEvent;
|
|
FOnDocumentObject : ThtmlParserDocumentObjectEvent;
|
|
|
|
FLexer : ThtmlLexer;
|
|
FState : ThtmlParserState;
|
|
FTokenType : ThtmlTokenType;
|
|
|
|
procedure LogMessage(const MsgType: ThtmlParserMessageType;
|
|
const Msg: String);
|
|
procedure LogUnexpectedToken;
|
|
procedure LogUnmatchedEndTag(const TagID: ThtmlTagID; const Name: String);
|
|
|
|
procedure TriggerElementOpen(const Item: ThtmlOpenElementInfo); virtual;
|
|
procedure TriggerElementClose(const Item: ThtmlOpenElementInfo); virtual;
|
|
procedure TriggerDocumentObject(const Obj: AhtmlObject); virtual;
|
|
|
|
function CreateText(const Context: ThtmlParserTextContext;
|
|
var Text: String): AhtmlObject; virtual;
|
|
function CreateCharRef(const CharVal: LongWord;
|
|
const HasTrailer: Boolean): AhtmlObject; virtual;
|
|
function CreateEntityRef(const EntityName: String;
|
|
const HasTrailer: Boolean): AhtmlObject; virtual;
|
|
function CreateLineBreak: AhtmlObject; virtual;
|
|
function CreateEmptyComment: AhtmlObject; virtual;
|
|
function CreateComment(const Comments: StringArray): AhtmlObject; virtual;
|
|
function CreateDeclaration(const Name, Text: String): AhtmlObject; virtual;
|
|
function CreateMarkedSection(const Text: String): AhtmlObject; virtual;
|
|
function CreateCDATA(const CDATA: String): AhtmlObject; virtual;
|
|
function CreatePI(const Target, PI: String): AhtmlObject; virtual;
|
|
function CreateElement(const TagID: ThtmlTagID;
|
|
const Name: String): AhtmlElement; virtual;
|
|
|
|
function GetTokenStr: String;
|
|
function GetNextToken: ThtmlTokenType;
|
|
function ResolveRef(var Text: String): Boolean;
|
|
|
|
function ParsePlainText: String;
|
|
function ParseText(const TextContext: ThtmlParserTextContext;
|
|
var Text: String): AhtmlObject;
|
|
function ParseContentText: AhtmlObject;
|
|
function ParseLineBreak: AhtmlObject;
|
|
function ParseComment: AhtmlObject;
|
|
function ParseDeclaration: AhtmlObject;
|
|
function ParseMarkedSection: AhtmlObject;
|
|
function ParseCDATA: AhtmlObject;
|
|
function ParsePI: AhtmlObject;
|
|
function ParseTagAttributeValue(
|
|
const Attribute: ThtmlElementAttribute): String;
|
|
function ParseTagAttribute(const TagID: ThtmlTagID; const TagName: String;
|
|
const Element: AhtmlElement;
|
|
var AttrID: ThtmlAttrID;
|
|
var Name, Value: String): ThtmlElementAttribute;
|
|
procedure ParseTagAttributes(const TagID: ThtmlTagID; const Name: String;
|
|
const Element: AhtmlElement);
|
|
function ParseStartTag: AhtmlObject;
|
|
function ParseEndTag: AhtmlObject;
|
|
|
|
function ParseToken: AhtmlObject;
|
|
|
|
function GetDocument: ThtmlDocument;
|
|
|
|
public
|
|
constructor Create(AOwner: TComponent); override;
|
|
destructor Destroy; override;
|
|
|
|
property Options: ThtmlParserOptions read FOptions write FOptions;
|
|
property RawText: RawByteString read FRawText write FRawText;
|
|
property FileName: String read FFileName write FFileName;
|
|
property Encoding: RawByteString read FEncoding write FEncoding;
|
|
|
|
property OnMessage: ThtmlParserMessageEvent read FOnMessage write FOnMessage;
|
|
|
|
property OnToken: ThtmlParserTokenEvent read FOnToken write FOnToken;
|
|
property OnTokenStr: ThtmlParserTokenStrEvent read FOnTokenStr write FOnTokenStr;
|
|
|
|
property OnResolveEntityRef: ThtmlParserResolveEntityRefEvent read FOnResolveEntityRef write FOnResolveEntityRef;
|
|
property OnText: ThtmlParserTextEvent read FOnText write FOnText;
|
|
property OnUnresolvedText: ThtmlParserTextEvent read FOnUnresolvedText write FOnUnresolvedText;
|
|
property OnLineBreak: ThtmlParserEvent read FOnLineBreak write FOnLineBreak;
|
|
|
|
property OnComment: ThtmlParserCommentEvent read FOnComment write FOnComment;
|
|
property OnDeclaration: ThtmlParserDeclarationEvent read FOnDeclaration write FOnDeclaration;
|
|
property OnMarkedSection: ThtmlParserStringEvent read FOnMarkedSection write FOnMarkedSection;
|
|
property OnCDATA: ThtmlParserStringEvent read FOnCDATA write FOnCDATA;
|
|
property OnProcessingInstr: ThtmlParserProcessingInstrEvent read FOnProcessingInstr write FOnProcessingInstr;
|
|
|
|
property OnStartTag: ThtmlParserStartTagEvent read FOnStartTag write FOnStartTag;
|
|
property OnStartTagAttributes: ThtmlParserStartTagAttributesEvent read FOnStartTagAttributes write FOnStartTagAttributes;
|
|
property OnEndTag: ThtmlParserEndTagEvent read FOnEndTag write FOnEndTag;
|
|
|
|
property OnElementOpen: ThtmlParserElementEvent read FOnElementOpen write FOnElementOpen;
|
|
property OnElementClose: ThtmlParserElementEvent read FOnElementClose write FOnElementClose;
|
|
|
|
property OnDocumentObject: ThtmlParserDocumentObjectEvent read FOnDocumentObject write FOnDocumentObject;
|
|
|
|
function ParseDocument: ThtmlDocument;
|
|
property State: ThtmlParserState read FState;
|
|
|
|
property Document: ThtmlDocument read GetDocument;
|
|
end;
|
|
EhtmlParser = class(Exception);
|
|
|
|
|
|
|
|
{ }
|
|
{ TfclHTMLParser }
|
|
{ }
|
|
type
|
|
TfclHTMLParser = class (ThtmlParser)
|
|
published
|
|
property Options;
|
|
property RawText;
|
|
property FileName;
|
|
property Encoding;
|
|
|
|
property OnMessage;
|
|
property OnToken;
|
|
property OnTokenStr;
|
|
property OnResolveEntityRef;
|
|
property OnText;
|
|
property OnUnresolvedText;
|
|
property OnLineBreak;
|
|
property OnComment;
|
|
property OnDeclaration;
|
|
property OnMarkedSection;
|
|
property OnCDATA;
|
|
property OnProcessingInstr;
|
|
property OnStartTag;
|
|
property OnStartTagAttributes;
|
|
property OnEndTag;
|
|
property OnElementOpen;
|
|
property OnElementClose;
|
|
property OnDocumentObject;
|
|
end;
|
|
|
|
|
|
|
|
{ }
|
|
{ Test cases }
|
|
{ }
|
|
{$IFDEF HTML_TEST}
|
|
procedure Test;
|
|
{$ENDIF}
|
|
|
|
|
|
|
|
implementation
|
|
|
|
uses
|
|
{ Fundamentals }
|
|
flcUtils,
|
|
flcDynArrays,
|
|
flcStrings,
|
|
flcUnicodeReader,
|
|
|
|
{ HTML }
|
|
flcHTMLCharEntity,
|
|
flcHTMLDocElements,
|
|
flcHTMLReader;
|
|
|
|
|
|
|
|
{ }
|
|
{ ThtmlParserState }
|
|
{ }
|
|
constructor ThtmlParserState.Create(const Parser: ThtmlParser);
|
|
begin
|
|
inherited Create;
|
|
Assert(Assigned(Parser));
|
|
FParser := Parser;
|
|
FElementList := TDoublyLinkedList.Create;
|
|
FFreeList := TDoublyLinkedList.Create;
|
|
FReopenList := TDoublyLinkedList.Create;
|
|
end;
|
|
|
|
destructor ThtmlParserState.Destroy;
|
|
begin
|
|
FreeAndNil(FDocument);
|
|
FreeAndNil(FReopenList);
|
|
FreeAndNil(FFreeList);
|
|
FreeAndNil(FElementList);
|
|
inherited Destroy;
|
|
end;
|
|
|
|
procedure ThtmlParserState.Reset;
|
|
begin
|
|
FreeAndNil(FDocument);
|
|
FReopenList.DeleteList;
|
|
FElementList.DeleteList;
|
|
if not (poDontProduceDocument in FParser.FOptions) then
|
|
FDocument := ThtmlDocument.Create;
|
|
OpenElement(HTML_TAG_Document, '<!DOC!>', nil);
|
|
end;
|
|
|
|
function ThtmlParserState.GetOpenCount: Integer;
|
|
begin
|
|
Result := FElementList.Count;
|
|
end;
|
|
|
|
function ThtmlParserState.GetInnerItem: ThtmlOpenElementInfo;
|
|
begin
|
|
Result := ThtmlOpenElementInfo(FElementList.Last);
|
|
end;
|
|
|
|
function ThtmlParserState.GetInnerTagID: ThtmlTagID;
|
|
begin
|
|
if not Assigned(FElementList.Last) then
|
|
Result := HTML_TAG_None
|
|
else
|
|
Result := ThtmlOpenElementInfo(FElementList.Last).TagID;
|
|
end;
|
|
|
|
function ThtmlParserState.GetInnerContainer: AhtmlContainerObject;
|
|
var I: ThtmlOpenElementInfo;
|
|
begin
|
|
I := GetInnerItem;
|
|
if Assigned(I) then
|
|
if I.TagID = HTML_TAG_Document then
|
|
Result := FDocument else
|
|
Result := I.Element
|
|
else
|
|
Result := FDocument;
|
|
end;
|
|
|
|
function ThtmlParserState.GetOuterItem: ThtmlOpenElementInfo;
|
|
begin
|
|
Result := ThtmlOpenElementInfo(FElementList.First);
|
|
end;
|
|
|
|
function ThtmlParserState.AddToContainer(const Container: AhtmlContainerObject;
|
|
const Obj: AhtmlObject): Boolean;
|
|
begin
|
|
Result := False;
|
|
if not Assigned(Obj) then
|
|
exit;
|
|
if Assigned(Container) then
|
|
begin
|
|
Container.AddItem(Obj);
|
|
Result := True;
|
|
end;
|
|
FParser.TriggerDocumentObject(Obj);
|
|
if not Assigned(Container) then
|
|
Obj.Free;
|
|
end;
|
|
|
|
function ThtmlParserState.AddToInnerContainer(const Obj: AhtmlObject): Boolean;
|
|
begin
|
|
Result := AddToContainer(GetInnerContainer, Obj);
|
|
end;
|
|
|
|
function ThtmlParserState.LocateTagID(const TagID: ThtmlTagID): ThtmlOpenElementInfo;
|
|
begin
|
|
Result := ThtmlOpenElementInfo(FElementList.Last);
|
|
while Assigned(Result) do
|
|
if TagID = Result.TagID then
|
|
exit
|
|
else
|
|
Result := ThtmlOpenElementInfo(Result.Prev);
|
|
end;
|
|
|
|
function ThtmlParserState.LocateName(const Name: String): ThtmlOpenElementInfo;
|
|
begin
|
|
Result := ThtmlOpenElementInfo(FElementList.Last);
|
|
while Assigned(Result) do
|
|
if StrEqualNoAsciiCaseU(Name, Result.Name) then
|
|
exit
|
|
else
|
|
Result := ThtmlOpenElementInfo(Result.Prev);
|
|
end;
|
|
|
|
function ThtmlParserState.IsTagIDOpen(const TagID: ThtmlTagID): Boolean;
|
|
begin
|
|
Result := Assigned(LocateTagID(TagID));;
|
|
end;
|
|
|
|
function ThtmlParserState.IsNameOpen(const Name: String): Boolean;
|
|
begin
|
|
Result := Assigned(LocateName(Name));
|
|
end;
|
|
|
|
function ThtmlParserState.IsBodyOpen: Boolean;
|
|
var I: ThtmlOpenElementInfo;
|
|
|
|
begin
|
|
I := ThtmlOpenElementInfo(FElementList.First);
|
|
Result := False;
|
|
while Assigned(I) do
|
|
if I.TagID = HTML_TAG_HTML then
|
|
begin
|
|
Result := True;
|
|
break;
|
|
end else
|
|
I := ThtmlOpenElementInfo(I.Next);
|
|
if not Result then
|
|
exit;
|
|
Result := False;
|
|
I := ThtmlOpenElementInfo(I.Next);
|
|
while Assigned(I) do
|
|
if I.TagID = HTML_TAG_BODY then
|
|
begin
|
|
Result := True;
|
|
break;
|
|
end
|
|
else
|
|
I := ThtmlOpenElementInfo(I.Next);
|
|
end;
|
|
|
|
procedure ThtmlParserState.OpenElement(const Item: ThtmlOpenElementInfo);
|
|
begin
|
|
Assert(Assigned(Item), 'Assigned(Item)');
|
|
FElementList.Append(Item);
|
|
FParser.TriggerElementOpen(Item);
|
|
end;
|
|
|
|
procedure ThtmlParserState.OpenElement(const TagID: ThtmlTagID;
|
|
const Name: String; const Element: AhtmlElement);
|
|
var Item: ThtmlOpenElementInfo;
|
|
begin
|
|
Item := ThtmlOpenElementInfo(FFreeList.RemoveFirst);
|
|
if not Assigned(Item) then
|
|
Item := ThtmlOpenElementInfo.Create;
|
|
Item.TagID := TagID;
|
|
Item.Name := Name;
|
|
Item.Element := Element;
|
|
OpenElement(Item);
|
|
end;
|
|
|
|
procedure ThtmlParserState.OpenAutoStartElements(const TagID: ThtmlTagID);
|
|
var Item : ThtmlOpenElementInfo;
|
|
NewTag, Tag : ThtmlTagID;
|
|
Element : AhtmlElement;
|
|
Container : AhtmlContainerObject;
|
|
begin
|
|
Item := InnerItem;
|
|
while Assigned(Item) do
|
|
begin
|
|
if TagID = Item.TagID then
|
|
break;
|
|
NewTag := htmlAutoOpenTag(TagID, Item.TagID);
|
|
if NewTag <> HTML_TAG_None then
|
|
begin
|
|
if NewTag = Item.TagID then
|
|
break; // no auto-start element
|
|
if Item.TagID = HTML_TAG_Document then
|
|
Container := FDocument else
|
|
Container := Item.Element;
|
|
repeat
|
|
// create auto-start element
|
|
Element := FParser.CreateElement(NewTag, '');
|
|
if Assigned(Element) then
|
|
if not AddToContainer(Container, Element) then
|
|
Element := nil;
|
|
OpenElement(NewTag, '', Element);
|
|
// next
|
|
Tag := htmlAutoOpenTag(TagID, NewTag);
|
|
if (Tag = HTML_TAG_None) or (Tag = NewTag) then
|
|
break;
|
|
Container := Element;
|
|
NewTag := Tag;
|
|
until False;
|
|
break;
|
|
end
|
|
else
|
|
Item := ThtmlOpenElementInfo(Item.Prev);
|
|
end;
|
|
end;
|
|
|
|
function ThtmlParserState.ReleaseElement: ThtmlOpenElementInfo;
|
|
begin
|
|
Result := ThtmlOpenElementInfo(FElementList.Last);
|
|
if not Assigned(Result) then
|
|
exit;
|
|
FParser.TriggerElementClose(Result);
|
|
FElementList.RemoveLast;
|
|
end;
|
|
|
|
procedure ThtmlParserState.CloseElementToFree;
|
|
var Item: ThtmlOpenElementInfo;
|
|
begin
|
|
Item := ReleaseElement;
|
|
if not Assigned(Item) then
|
|
exit;
|
|
FFreeList.Append(Item);
|
|
end;
|
|
|
|
procedure ThtmlParserState.CloseElementToReOpen;
|
|
var Item: ThtmlOpenElementInfo;
|
|
begin
|
|
Item := ReleaseElement;
|
|
if not Assigned(Item) then
|
|
exit;
|
|
FReopenList.Append(Item);
|
|
end;
|
|
|
|
procedure ThtmlParserState.ReOpenElements;
|
|
var Container : AhtmlContainerObject;
|
|
Item : ThtmlOpenElementInfo;
|
|
begin
|
|
if FReopenList.IsEmpty then
|
|
exit;
|
|
Container := InnerContainer;
|
|
repeat
|
|
Item := ThtmlOpenElementInfo(FReopenList.RemoveLast);
|
|
if not Assigned(Item) then
|
|
break;
|
|
if Assigned(Item.Element) then
|
|
begin
|
|
Item.Element := Item.Element.DuplicateElement;
|
|
if not AddToContainer(Container, Item.Element) then
|
|
Item.Element := nil;
|
|
Container := Item.Element;
|
|
end
|
|
else
|
|
Container := nil;
|
|
OpenElement(Item);
|
|
until False;
|
|
end;
|
|
|
|
procedure ThtmlParserState.CloseOverlapping
|
|
(const OpenElement: ThtmlOpenElementInfo);
|
|
var Item: ThtmlOpenElementInfo;
|
|
begin
|
|
// Close open overlapping tags up to, but excluding, OpenElement
|
|
if poDontAllowOverlappedTags in FParser.FOptions then
|
|
// Strict: Close all open overlapped elements
|
|
repeat
|
|
Item := InnerItem;
|
|
if OpenElement = Item then
|
|
break;
|
|
CloseElementToFree;
|
|
until False
|
|
else
|
|
// Overlaps allowed: Close certian overlapping elements
|
|
repeat
|
|
Item := InnerItem;
|
|
if OpenElement = Item then
|
|
break;
|
|
if not htmlDoesCloseTagCloseOpenTag(OpenElement.TagID, Item.TagID) then
|
|
CloseElementToReopen
|
|
else
|
|
CloseElementToFree;
|
|
until False;
|
|
end;
|
|
|
|
procedure ThtmlParserState.CloseElement(const OpenElement: ThtmlOpenElementInfo);
|
|
begin
|
|
CloseOverlapping(OpenElement);
|
|
CloseElementToFree;
|
|
ReOpenElements;
|
|
end;
|
|
|
|
function ThtmlParserState.GetOpenElementForEndTag(const TagID: ThtmlTagID;
|
|
const Name: String): ThtmlOpenElementInfo;
|
|
begin
|
|
Result := InnerItem;
|
|
while Assigned(Result) do
|
|
begin
|
|
if htmlIsSameTag(TagID, Name, Result.TagID, Result.Name) then
|
|
// found matching open tag
|
|
exit else
|
|
if poDontAllowOverlappedTags in FParser.FOptions then
|
|
begin
|
|
if not htmlIsElementEndTagOptional(Result.TagID) then
|
|
begin
|
|
Result := nil; // strict: overlapping closes only allowed
|
|
exit; // through elements with optional end tags
|
|
end;
|
|
end else
|
|
if not htmlDoesCloseTagCloseOutside(TagID, Result.TagID) then
|
|
begin
|
|
Result := nil; // tag does not close through this level
|
|
exit;
|
|
end;
|
|
Result := ThtmlOpenElementInfo(Result.Prev);
|
|
end;
|
|
Result := nil; // no matching open tag found
|
|
end;
|
|
|
|
function ThtmlParserState.GetAutoCloseElementForStartTag(
|
|
const TagID: ThtmlTagID): ThtmlOpenElementInfo;
|
|
begin
|
|
Result := InnerItem;
|
|
while Assigned(Result) do
|
|
begin
|
|
// 1. check if auto-close element
|
|
if TagID = Result.TagID then
|
|
begin
|
|
if htmlIsElementEndTagOptional(TagID) then
|
|
// close matching open tag with optional end tag
|
|
exit;
|
|
// no further searching
|
|
Result := nil;
|
|
exit;
|
|
end else
|
|
if htmlDoesOpenTagAutoCloseOpenTag(TagID, Result.TagID) then
|
|
// found
|
|
exit else
|
|
// 2. check if iteration should continue outside
|
|
if poDontAllowOverlappedTags in FParser.FOptions then
|
|
begin
|
|
if not htmlIsElementEndTagOptional(Result.TagID) then
|
|
begin
|
|
Result := nil; // strict: overlapping closes only allowed
|
|
exit; // through elements with optional end tags
|
|
end;
|
|
end else
|
|
if not htmlDoesOpenTagAutoCloseOutside(TagID, Result.TagID) then
|
|
begin
|
|
Result := nil; // tag does not close through this level
|
|
exit;
|
|
end;
|
|
// 3. next open level
|
|
Result := ThtmlOpenElementInfo(Result.Prev);
|
|
end;
|
|
Result := nil; // no matching open tag found
|
|
end;
|
|
|
|
|
|
|
|
{ }
|
|
{ ThtmlParser }
|
|
{ }
|
|
constructor ThtmlParser.Create(AOwner: TComponent);
|
|
begin
|
|
inherited Create(AOwner);
|
|
FOptions := [];
|
|
FState := ThtmlParserState.Create(self);
|
|
end;
|
|
|
|
destructor ThtmlParser.Destroy;
|
|
begin
|
|
FreeAndNil(FLexer);
|
|
FreeAndNil(FState);
|
|
inherited Destroy;
|
|
end;
|
|
|
|
{ Messages }
|
|
procedure ThtmlParser.LogMessage(const MsgType: ThtmlParserMessageType;
|
|
const Msg: String);
|
|
begin
|
|
if not (poDisableNotifications in FOptions) and Assigned(FOnMessage) then
|
|
FOnMessage(self, MsgType, Msg);
|
|
if (MsgType = hmError) and (poStopOnError in FOptions) then
|
|
raise EhtmlParser.Create('Parse error: ' + Msg);
|
|
end;
|
|
|
|
procedure ThtmlParser.LogUnexpectedToken;
|
|
begin
|
|
LogMessage(hmError, 'Unexpected token (' + FLexer.TokenTypeDescription + ')');
|
|
end;
|
|
|
|
procedure ThtmlParser.LogUnmatchedEndTag(const TagID: ThtmlTagID; const Name: String);
|
|
var S: String;
|
|
begin
|
|
S := Name;
|
|
if (S = '') and (TagID <> HTML_TAG_None) then
|
|
S := htmlGetTagName(TagID);
|
|
LogMessage(hmError, 'Unmatched end tag (' + ToStringU(S) + ')');
|
|
end;
|
|
|
|
{ Triggers }
|
|
procedure ThtmlParser.TriggerElementOpen(const Item: ThtmlOpenElementInfo);
|
|
begin
|
|
if not (poDisableNotifications in FOptions) and
|
|
Assigned(FOnElementOpen) then
|
|
with Item do
|
|
FOnElementOpen(self, TagID, Name, Element);
|
|
end;
|
|
|
|
procedure ThtmlParser.TriggerElementClose(const Item: ThtmlOpenElementInfo);
|
|
begin
|
|
if not (poDisableNotifications in FOptions) and
|
|
Assigned(FOnElementClose) then
|
|
with Item do
|
|
FOnElementClose(self, TagID, Name, Element);
|
|
end;
|
|
|
|
procedure ThtmlParser.TriggerDocumentObject(const Obj: AhtmlObject);
|
|
begin
|
|
if not (poDisableNotifications in FOptions) then
|
|
begin
|
|
if Assigned(FOnDocumentObject) then
|
|
FOnDocumentObject(self, Obj);
|
|
end;
|
|
end;
|
|
|
|
{ Create }
|
|
function ThtmlParser.CreateText(const Context: ThtmlParserTextContext;
|
|
var Text: String): AhtmlObject;
|
|
begin
|
|
if not (poDisableNotifications in FOptions) and
|
|
Assigned(FOnText) then
|
|
FOnText(self, Context, Text);
|
|
if poDontProduceDocument in FOptions then
|
|
Result := nil
|
|
else
|
|
Result := ThtmlText.Create(Text);
|
|
end;
|
|
|
|
function ThtmlParser.CreateCharRef(const CharVal: LongWord; const HasTrailer: Boolean): AhtmlObject;
|
|
begin
|
|
if poDontProduceDocument in FOptions then
|
|
Result := nil
|
|
else
|
|
Result := ThtmlCharRef.Create(CharVal, HasTrailer);
|
|
end;
|
|
|
|
function ThtmlParser.CreateEntityRef(const EntityName: String; const HasTrailer: Boolean): AhtmlObject;
|
|
begin
|
|
if poDontProduceDocument in FOptions then
|
|
Result := nil
|
|
else
|
|
Result := ThtmlEntityRef.Create(EntityName, HasTrailer);
|
|
end;
|
|
|
|
function ThtmlParser.CreateLineBreak: AhtmlObject;
|
|
begin
|
|
if not (poDisableNotifications in FOptions) and
|
|
Assigned(FOnLineBreak) then
|
|
FOnLineBreak(self);
|
|
if poDontProduceDocument in FOptions then
|
|
Result := nil
|
|
else
|
|
Result := ThtmlLineBreak.Create;
|
|
end;
|
|
|
|
function ThtmlParser.CreateEmptyComment: AhtmlObject;
|
|
begin
|
|
if poDontProduceDocument in FOptions then
|
|
Result := nil
|
|
else
|
|
Result := ThtmlEmptyComment.Create;
|
|
end;
|
|
|
|
function ThtmlParser.CreateComment(const Comments: StringArray): AhtmlObject;
|
|
begin
|
|
if not (poDisableNotifications in FOptions) and
|
|
Assigned(FOnComment) then
|
|
FOnComment(self, Comments);
|
|
if poDontProduceDocument in FOptions then
|
|
Result := nil
|
|
else
|
|
Result := ThtmlComment.Create(Comments);
|
|
end;
|
|
|
|
function ThtmlParser.CreateDeclaration(const Name, Text: String): AhtmlObject;
|
|
begin
|
|
if not (poDisableNotifications in FOptions) and
|
|
Assigned(FOnDeclaration) then
|
|
FOnDeclaration(self, Name, Text);
|
|
if poDontProduceDocument in FOptions then
|
|
Result := nil
|
|
else
|
|
Result := ThtmlRawTag.Create('!' + Name + iifU(Text <> '', ' ', '') + Text);
|
|
end;
|
|
|
|
function ThtmlParser.CreateMarkedSection(const Text: String): AhtmlObject;
|
|
begin
|
|
if not (poDisableNotifications in FOptions) and
|
|
Assigned(FOnMarkedSection) then
|
|
FOnMarkedSection(self, Text);
|
|
if poDontProduceDocument in FOptions then
|
|
Result := nil
|
|
else
|
|
Result := ThtmlRawTag.Create('![' + Text + ']');
|
|
end;
|
|
|
|
function ThtmlParser.CreateCDATA(const CDATA: String): AhtmlObject;
|
|
begin
|
|
if not (poDisableNotifications in FOptions) and
|
|
Assigned(FOnCDATA) then
|
|
FOnCDATA(self, CDATA);
|
|
if poDontProduceDocument in FOptions then
|
|
Result := nil
|
|
else
|
|
Result := ThtmlCDATA.Create(CDATA);
|
|
end;
|
|
|
|
function ThtmlParser.CreatePI(const Target, PI: String): AhtmlObject;
|
|
begin
|
|
if not (poDisableNotifications in FOptions) and
|
|
Assigned(FOnProcessingInstr) then
|
|
FOnProcessingInstr(self, Target, PI);
|
|
if poDontProduceDocument in FOptions then
|
|
Result := nil
|
|
else
|
|
Result := ThtmlPI.Create(Target, PI);
|
|
end;
|
|
|
|
function ThtmlParser.CreateElement(const TagID: ThtmlTagID;
|
|
const Name: String): AhtmlElement;
|
|
var C: AhtmlContainerObject;
|
|
begin
|
|
if poDontProduceDocument in FOptions then
|
|
Result := nil
|
|
else
|
|
begin
|
|
C := FState.GetInnerContainer;
|
|
if Assigned(C) then
|
|
Result := C.CreateItem(Ord(TagID), Name) as AhtmlElement
|
|
else
|
|
Result := htmlCreateElement(TagID, Name);
|
|
end;
|
|
end;
|
|
|
|
{ Token }
|
|
function ThtmlParser.GetTokenStr: String;
|
|
begin
|
|
Result := FLexer.TokenStr;
|
|
end;
|
|
|
|
function ThtmlParser.GetNextToken: ThtmlTokenType;
|
|
begin
|
|
if not (poDisableNotifications in FOptions) and (FTokenType <> htNone) then
|
|
begin
|
|
// note: notify token After it has been examined
|
|
if Assigned(FOnToken) then
|
|
FOnToken(self, FTokenType);
|
|
if Assigned(FOnTokenStr) then
|
|
FOnTokenStr(self, FTokenType, FLexer.TokenStr);
|
|
end;
|
|
// get next token from lexer
|
|
FTokenType := FLexer.GetNextToken;
|
|
// return token type
|
|
Result := FTokenType;
|
|
end;
|
|
|
|
function ThtmlParser.ResolveRef(var Text: String): Boolean;
|
|
begin
|
|
Assert(FTokenType in htmlTokensRef, 'FTokenType in htmlTokensRef');
|
|
Text := '';
|
|
if poDontResolveReferences in FOptions then
|
|
Result := False
|
|
else
|
|
case FTokenType of
|
|
htEntityRef:
|
|
begin
|
|
Result := FLexer.ResolveEntityRef(Text);
|
|
if not (poDisableNotifications in FOptions) and
|
|
Assigned(FOnResolveEntityRef) then
|
|
FOnResolveEntityRef(self, GetTokenStr, Text, Result);
|
|
end;
|
|
htCharRef,
|
|
htCharRefHex:
|
|
Result := FLexer.ResolveCharRef(Text);
|
|
else
|
|
Result := False;
|
|
end;
|
|
end;
|
|
|
|
{ Parse }
|
|
function ThtmlParser.ParsePlainText: String;
|
|
var T: String;
|
|
begin
|
|
Assert(FTokenType in htmlTokensText, 'FTokenType in htmlTokensText');
|
|
Result := '';
|
|
repeat
|
|
case FTokenType of
|
|
htText:
|
|
Result := Result + GetTokenStr;
|
|
htCharRef,
|
|
htCharRefHex,
|
|
htEntityRef:
|
|
if ResolveRef(T) then
|
|
Result := Result + T
|
|
else
|
|
break;
|
|
else
|
|
break;
|
|
end;
|
|
GetNextToken;
|
|
until False;
|
|
end;
|
|
|
|
function ThtmlParser.ParseText(const TextContext: ThtmlParserTextContext;
|
|
var Text: String): AhtmlObject;
|
|
var CharVal : LongWord;
|
|
S : String;
|
|
R : Boolean;
|
|
begin
|
|
// extract resolved text
|
|
Assert(FTokenType in htmlTokensText, 'FTokenType in htmlTokensText');
|
|
Text := ParsePlainText;
|
|
if Text <> '' then
|
|
Result := CreateText(TextContext, Text)
|
|
else
|
|
// extract unresolved entity reference
|
|
case FTokenType of
|
|
htCharRefHex,
|
|
htCharRef:
|
|
begin
|
|
CharVal := FLexer.CharRefValue;
|
|
Text := htmlCharRef(CharVal, FTokenType = htCharRefHex);
|
|
GetNextToken;
|
|
R := (FTokenType = htRefTrailer);
|
|
if R then
|
|
GetNextToken;
|
|
Result := CreateCharRef(CharVal, R);
|
|
if not (poDisableNotifications in FOptions) and
|
|
Assigned(FOnUnresolvedText) then
|
|
FOnUnresolvedText(self, TextContext, Text);
|
|
end;
|
|
htEntityRef:
|
|
begin
|
|
S := GetTokenStr;
|
|
Text := '&' + S + ';';
|
|
GetNextToken;
|
|
R := (FTokenType = htRefTrailer);
|
|
if R then
|
|
GetNextToken;
|
|
Result := CreateEntityRef(S, R);
|
|
if not (poDisableNotifications in FOptions) and
|
|
Assigned(FOnUnresolvedText) then
|
|
FOnUnresolvedText(self, TextContext, Text);
|
|
end;
|
|
else
|
|
Result := nil;
|
|
end;
|
|
end;
|
|
|
|
function ThtmlParser.ParseContentText: AhtmlObject;
|
|
var T: String;
|
|
begin
|
|
Result := ParseText(tcContent, T);
|
|
end;
|
|
|
|
function ThtmlParser.ParseLineBreak: AhtmlObject;
|
|
begin
|
|
Assert(FTokenType = htLineBreak, 'FTokenType = htLineBreak');
|
|
Result := CreateLineBreak;
|
|
GetNextToken;
|
|
end;
|
|
|
|
function ThtmlParser.ParseComment: AhtmlObject;
|
|
var S: StringArray;
|
|
begin
|
|
Assert(FTokenType in htmlTokensComment, 'FTokenType in htmlTokensComment');
|
|
case FTokenType of
|
|
htEmptyComment:
|
|
begin
|
|
GetNextToken;
|
|
Result := CreateEmptyComment;
|
|
end;
|
|
htComment:
|
|
begin
|
|
S := nil;
|
|
repeat
|
|
DynArrayAppend(S, GetTokenStr);
|
|
until GetNextToken <> htComment;
|
|
if FTokenType = htCommentEnd then
|
|
GetNextToken
|
|
else
|
|
LogUnexpectedToken;
|
|
Result := CreateComment(S);
|
|
end;
|
|
else
|
|
Result := nil;
|
|
end;
|
|
end;
|
|
|
|
function ThtmlParser.ParseDeclaration: AhtmlObject;
|
|
var Name, Text: String;
|
|
begin
|
|
Assert(FTokenType = htDeclaration, 'FTokenType = htDeclaration');
|
|
Name := GetTokenStr;
|
|
if GetNextToken = htDeclarationText then
|
|
begin
|
|
Text := GetTokenStr;
|
|
GetNextToken;
|
|
end else
|
|
Text := '';
|
|
Result := CreateDeclaration(Name, Text);
|
|
end;
|
|
|
|
function ThtmlParser.ParseMarkedSection: AhtmlObject;
|
|
var S: String;
|
|
begin
|
|
Assert(FTokenType = htMarkedSection, 'FTokenType = htMarkedSection');
|
|
S := GetTokenStr;
|
|
GetNextToken;
|
|
Result := CreateMarkedSection(S);
|
|
end;
|
|
|
|
function ThtmlParser.ParseCDATA: AhtmlObject;
|
|
var S: String;
|
|
begin
|
|
Assert(FTokenType = htCDATA, 'FTokenType = htCDATA');
|
|
S := GetTokenStr;
|
|
GetNextToken;
|
|
Result := CreateCDATA(S);
|
|
end;
|
|
|
|
function ThtmlParser.ParsePI: AhtmlObject;
|
|
var Target, PI: String;
|
|
begin
|
|
Assert(FTokenType = htPITarget, 'FTokenType = htPITarget');
|
|
Target := GetTokenStr;
|
|
if GetNextToken = htPI then
|
|
begin
|
|
PI := GetTokenStr;
|
|
GetNextToken;
|
|
Result := CreatePI(Target, PI);
|
|
end else
|
|
Result := CreatePI(Target, '');
|
|
end;
|
|
|
|
function ThtmlParser.ParseTagAttributeValue(
|
|
const Attribute: ThtmlElementAttribute): String;
|
|
var Obj : AhtmlObject;
|
|
Text : String;
|
|
begin
|
|
Assert(FTokenType = htTagAttrValueStart, 'FTokenType = htTagAttrValueStart');
|
|
GetNextToken;
|
|
Result := '';
|
|
while FTokenType in htmlTokensTextInclLineBreak do
|
|
begin
|
|
Obj := ParseText(tcAttributeValue, Text);
|
|
Result := Result + Text;
|
|
if Assigned(Attribute) and Assigned(Obj) then
|
|
Attribute.AddItem(Obj);
|
|
end;
|
|
if FTokenType = htTagAttrValueEnd then
|
|
GetNextToken
|
|
else
|
|
LogUnexpectedToken;
|
|
end;
|
|
|
|
function ThtmlParser.ParseTagAttribute(const TagID: ThtmlTagID;
|
|
const TagName: String; const Element: AhtmlElement;
|
|
var AttrID: ThtmlAttrID; var Name, Value: String): ThtmlElementAttribute;
|
|
begin
|
|
Assert(FTokenType = htTagAttrName, 'FTokenType = htTagAttrName');
|
|
AttrID := FLexer.AttrID;
|
|
if AttrID = HTML_ATTR_None then
|
|
Name := GetTokenStr
|
|
else
|
|
Name := '';
|
|
GetNextToken;
|
|
if poDontProduceDocument in FOptions then
|
|
Result := nil
|
|
else
|
|
Result := ThtmlElementAttribute.Create(AttrID, Name);
|
|
// Parse value
|
|
if FTokenType = htTagAttrValueStart then
|
|
Value := ParseTagAttributeValue(Result)
|
|
else
|
|
Value := '';
|
|
end;
|
|
|
|
procedure ThtmlParser.ParseTagAttributes(const TagID: ThtmlTagID;
|
|
const Name: String; const Element: AhtmlElement);
|
|
var Attr : ThtmlElementAttribute;
|
|
AttrID : ThtmlAttrID;
|
|
AttrName : String;
|
|
Value : String;
|
|
Names : StringArray;
|
|
Values : StringArray;
|
|
Notify : Boolean;
|
|
begin
|
|
Notify := not (poDisableNotifications in FOptions) and
|
|
Assigned(FOnStartTagAttributes);
|
|
repeat
|
|
case FTokenType of
|
|
htTagAttrName:
|
|
begin
|
|
Attr := ParseTagAttribute(TagID, Name, Element,
|
|
AttrID, AttrName, Value);
|
|
if Assigned(Element) and Assigned(Attr) then
|
|
Element.Attributes.AddItem(Attr);
|
|
if Notify then
|
|
begin
|
|
if (AttrName = '') and (AttrID <> HTML_ATTR_None) then
|
|
AttrName := htmlGetAttrName(AttrID);
|
|
DynArrayAppend(Names, AttrName);
|
|
DynArrayAppend(Values, Value);
|
|
end;
|
|
end;
|
|
htTagAttrValueStart:
|
|
repeat
|
|
LogUnexpectedToken;
|
|
GetNextToken;
|
|
until not (FTokenType in htmlTokensText);
|
|
htTagAttrValueEnd:
|
|
begin
|
|
LogUnexpectedToken;
|
|
GetNextToken;
|
|
end;
|
|
else
|
|
break;
|
|
end;
|
|
until False;
|
|
if Notify then
|
|
FOnStartTagAttributes(self, TagID, Name, Names, Values, Element);
|
|
end;
|
|
|
|
function ThtmlParser.ParseStartTag: AhtmlObject;
|
|
var TagID : ThtmlTagID;
|
|
Name : String;
|
|
Element : AhtmlElement;
|
|
Item : ThtmlOpenElementInfo;
|
|
ContentTag : Boolean;
|
|
begin
|
|
Assert(FTokenType = htStartTag, 'FTokenType = htStartTag');
|
|
// get tag name
|
|
TagID := FLexer.TagID;
|
|
Name := GetTokenStr;
|
|
GetNextToken;
|
|
if not (poDisableNotifications in FOptions) and Assigned(FOnStartTag) then
|
|
FOnStartTag(self, TagID, Name);
|
|
// auto-close nesting levels
|
|
Item := FState.GetAutoCloseElementForStartTag(TagID);
|
|
if Assigned(Item) then
|
|
FState.CloseElement(Item); // auto-close
|
|
// auto-start nesting levels
|
|
FState.OpenAutoStartElements(TagID);
|
|
// create element
|
|
Element := CreateElement(TagID, Name);
|
|
// parse rest of tag
|
|
ParseTagAttributes(TagID, Name, Element);
|
|
// set tag type
|
|
if FTokenType = htEmptyTag then // empty tag
|
|
begin
|
|
if Assigned(Element) then
|
|
Element.TagType := ttEmptyTag;
|
|
ContentTag := False;
|
|
GetNextToken;
|
|
end else
|
|
if htmlIsEmptyElement(TagID) then // required to be empty
|
|
begin
|
|
if Assigned(Element) then
|
|
Element.TagType := ttStartTag;
|
|
ContentTag := False;
|
|
end else
|
|
begin // content tag
|
|
if Assigned(Element) then
|
|
Element.TagType := ttContentTags;
|
|
ContentTag := True;
|
|
end;
|
|
// update nesting levels
|
|
if not FState.AddToInnerContainer(Element) then
|
|
Element := nil;
|
|
if ContentTag then
|
|
// new nest level
|
|
FState.OpenElement(TagID, Name, Element);
|
|
// object added to container
|
|
Result := nil;
|
|
end;
|
|
|
|
function ThtmlParser.ParseEndTag: AhtmlObject;
|
|
var TagID : ThtmlTagID;
|
|
Item : ThtmlOpenElementInfo;
|
|
Name : String;
|
|
begin
|
|
// parse tag
|
|
TagID := FLexer.TagID;
|
|
if TagID = HTML_TAG_None then
|
|
Name := GetTokenStr
|
|
else
|
|
Name := '';
|
|
GetNextToken;
|
|
// notify
|
|
if not (poDisableNotifications in FOptions) and Assigned(FOnEndTag) then
|
|
FOnEndTag(self, TagID, Name);
|
|
// close element
|
|
Item := FState.GetOpenElementForEndTag(TagID, Name);
|
|
if Assigned(Item) then
|
|
FState.CloseElement(Item)
|
|
else
|
|
LogUnmatchedEndTag(TagID, Name);
|
|
// object in container
|
|
Result := nil;
|
|
end;
|
|
|
|
function ThtmlParser.ParseToken: AhtmlObject;
|
|
begin
|
|
case FTokenType of
|
|
htText,
|
|
htEntityRef,
|
|
htCharRef,
|
|
htCharRefHex : Result := ParseContentText;
|
|
htStartTag : Result := ParseStartTag;
|
|
htEndTag : Result := ParseEndTag;
|
|
htLineBreak : Result := ParseLineBreak;
|
|
htEmptyComment,
|
|
htComment : Result := ParseComment;
|
|
htMarkedSection : Result := ParseMarkedSection;
|
|
htCDATA : Result := ParseCDATA;
|
|
htPITarget : Result := ParsePI;
|
|
htDeclaration : Result := ParseDeclaration;
|
|
htEOF : Result := nil;
|
|
else
|
|
begin
|
|
LogUnexpectedToken;
|
|
GetNextToken;
|
|
Result := nil;
|
|
end;
|
|
end;
|
|
end;
|
|
|
|
function ThtmlParser.ParseDocument: ThtmlDocument;
|
|
var Reader: TUnicodeReader;
|
|
begin
|
|
// Initialize
|
|
FreeAndNil(FLexer);
|
|
if FFileName <> '' then
|
|
Reader := htmlGetDocumentReaderForFile(FFileName, FEncoding)
|
|
else
|
|
Reader := htmlGetDocumentReaderForRawString(FRawText, FEncoding);
|
|
FLexer := ThtmlLexer.Create(Reader, True);
|
|
FLexer.NoLineBreakToken := False;
|
|
FState.Reset;
|
|
// Parse
|
|
FTokenType := htNone;
|
|
GetNextToken;
|
|
while FTokenType <> htEOF do
|
|
FState.AddToInnerContainer(ParseToken);
|
|
// Success
|
|
FreeAndNil(FLexer);
|
|
Result := FState.Document;
|
|
if Assigned(Result) then
|
|
Result.PrepareStructure;
|
|
end;
|
|
|
|
function ThtmlParser.GetDocument: ThtmlDocument;
|
|
begin
|
|
Assert(Assigned(FState));
|
|
Result := FState.Document;
|
|
end;
|
|
|
|
|
|
|
|
|
|
{ }
|
|
{ Test cases }
|
|
{ }
|
|
{$IFDEF HTML_TEST}
|
|
{$ASSERTIONS ON}
|
|
procedure Test;
|
|
var P : ThtmlParser;
|
|
|
|
procedure TestHTML(const RawHTML: RawByteString);
|
|
var D : ThtmlDocument;
|
|
begin
|
|
P.RawText := RawHTML;
|
|
D := P.ParseDocument;
|
|
Assert(D.HTMLText = UTF8ToString(RawHTML));
|
|
end;
|
|
|
|
begin
|
|
P := ThtmlParser.Create(nil);
|
|
try
|
|
TestHTML('<!DOCTYPE HTML><HTML><HEAD></HEAD><BODY>Test</BODY></HTML>');
|
|
TestHTML('<!DOCTYPE HTML><HTML><HEAD><_Test1></_Test1></HEAD><BODY>Test<_Test2></_Test2></BODY></HTML>');
|
|
TestHTML('<!DOCTYPE HTML><HTML><HEAD><_Test1/></HEAD><BODY>Test<_Test2/></BODY></HTML>');
|
|
finally
|
|
P.Free;
|
|
end;
|
|
end;
|
|
{$ENDIF}
|
|
|
|
|
|
|
|
end.
|
|
|