318 lines
16 KiB
ObjectPascal
318 lines
16 KiB
ObjectPascal
{******************************************************************************}
|
|
{ }
|
|
{ Library: Fundamentals 5.00 - HTML Parser }
|
|
{ File name: flcHTMLCharEntity.pas }
|
|
{ File version: 5.04 }
|
|
{ Description: HTML named character entities }
|
|
{ }
|
|
{ Copyright: Copyright (c) 2000-2020, David J Butler }
|
|
{ All rights reserved. }
|
|
{ Redistribution and use in source and binary forms, with }
|
|
{ or without modification, are permitted provided that }
|
|
{ the following conditions are met: }
|
|
{ Redistributions of source code must retain the above }
|
|
{ copyright notice, this list of conditions and the }
|
|
{ following disclaimer. }
|
|
{ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND }
|
|
{ CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED }
|
|
{ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED }
|
|
{ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A }
|
|
{ PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL }
|
|
{ THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, }
|
|
{ INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR }
|
|
{ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, }
|
|
{ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF }
|
|
{ USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) }
|
|
{ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER }
|
|
{ IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING }
|
|
{ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE }
|
|
{ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE }
|
|
{ POSSIBILITY OF SUCH DAMAGE. }
|
|
{ }
|
|
{ Github: https://github.com/fundamentalslib }
|
|
{ E-mail: fundamentals.library at gmail.com }
|
|
{ }
|
|
{ Revision history: }
|
|
{ }
|
|
{ 2002/11/03 1.00 Part of cHTMLUtils. }
|
|
{ 2002/12/08 1.01 Part of cHTMLConsts. }
|
|
{ 2015/04/04 1.02 RawByteString changes. }
|
|
{ 2015/04/11 1.03 UnicodeString changes. }
|
|
{ 2019/02/22 5.04 Revised for Fundamentals 5. }
|
|
{ }
|
|
{******************************************************************************}
|
|
|
|
{$INCLUDE flcHTML.inc}
|
|
|
|
unit flcHTMLCharEntity;
|
|
|
|
interface
|
|
|
|
|
|
|
|
{ }
|
|
{ HTML Named Character Entities }
|
|
{ }
|
|
function htmlDecodeCharEntity(const Entity: String): Word;
|
|
|
|
|
|
|
|
{ }
|
|
{ htmlCharRef }
|
|
{ }
|
|
function htmlCharRef(const CharVal: LongWord; const UseHex: Boolean): String;
|
|
|
|
|
|
|
|
{ }
|
|
{ Tests }
|
|
{ }
|
|
{$IFDEF HTML_TEST}
|
|
procedure Test;
|
|
{$ENDIF}
|
|
|
|
|
|
|
|
implementation
|
|
|
|
uses
|
|
flcStdTypes,
|
|
flcUtils,
|
|
flcDynArrays;
|
|
|
|
|
|
|
|
{ }
|
|
{ HTML Named Character Entities }
|
|
{ }
|
|
type
|
|
THtmlEntity = record
|
|
Name : String;
|
|
Char : Word;
|
|
end;
|
|
|
|
const
|
|
// HTML 4 character entity references
|
|
HtmlEntities = 253;
|
|
HtmlEntity: array[0..HtmlEntities - 1] of ThtmlEntity = (
|
|
(* Additional *)
|
|
(Name:'apos'; Char:39),
|
|
(* HTMLspecial.ent *)
|
|
(Name:'quot'; Char:34), (Name:'amp'; Char:38),
|
|
(Name:'lt'; Char:60), (Name:'gt'; Char:62),
|
|
{ Latin Extended-A }
|
|
(Name:'OElig'; Char:338), (Name:'oelig'; Char:339),
|
|
(Name:'Scaron'; Char:352), (Name:'scaron'; Char:353),
|
|
(Name:'Yuml'; Char:376),
|
|
{ Spacing Modifier Letters }
|
|
(Name:'circ'; Char:710), (Name:'tilde'; Char:732),
|
|
{ General Punctuation }
|
|
(Name:'ensp'; Char:8194), (Name:'emsp'; Char:8195),
|
|
(Name:'thinsp'; Char:8201), (Name:'zwnj'; Char:8204),
|
|
(Name:'zwj'; Char:8205), (Name:'lrm'; Char:8206),
|
|
(Name:'rlm'; Char:8207), (Name:'ndash'; Char:8211),
|
|
(Name:'mdash'; Char:8212), (Name:'lsquo'; Char:8216),
|
|
(Name:'rsquo'; Char:8217), (Name:'sbquo'; Char:8218),
|
|
(Name:'ldquo'; Char:8220), (Name:'rdquo'; Char:8221),
|
|
(Name:'bdquo'; Char:8222), (Name:'dagger'; Char:8224),
|
|
(Name:'Dagger'; Char:8225), (Name:'permil'; Char:8240),
|
|
(Name:'lsaquo'; Char:8249), (Name:'rsaquo'; Char:8250),
|
|
(Name:'euro'; Char:8364),
|
|
(* HTMLsymbol.ent *)
|
|
{ Latin Extended-B }
|
|
(Name:'fnof'; Char:402),
|
|
{ Greek }
|
|
(Name:'Alpha'; Char:913), (Name:'Beta'; Char:914),
|
|
(Name:'Gamma'; Char:915), (Name:'Delta'; Char:916),
|
|
(Name:'Epsilon'; Char:917), (Name:'Zeta'; Char:918),
|
|
(Name:'Eta'; Char:919), (Name:'Theta'; Char:920),
|
|
(Name:'Iota'; Char:921), (Name:'Kappa'; Char:922),
|
|
(Name:'Lambda'; Char:923), (Name:'Mu'; Char:924),
|
|
(Name:'Nu'; Char:925), (Name:'Xi'; Char:926),
|
|
(Name:'Omicron'; Char:927), (Name:'Pi'; Char:928),
|
|
(Name:'Rho'; Char:929), (Name:'Sigma'; Char:931),
|
|
(Name:'Tau'; Char:932), (Name:'Upsilon'; Char:933),
|
|
(Name:'Phi'; Char:934), (Name:'Chi'; Char:935),
|
|
(Name:'Psi'; Char:936), (Name:'Omega'; Char:937),
|
|
(Name:'alpha'; Char:945), (Name:'beta'; Char:946),
|
|
(Name:'gamma'; Char:947), (Name:'delta'; Char:948),
|
|
(Name:'epsilon'; Char:949), (Name:'zeta'; Char:950),
|
|
(Name:'eta'; Char:951), (Name:'theta'; Char:952),
|
|
(Name:'iota'; Char:953), (Name:'kappa'; Char:954),
|
|
(Name:'lambda'; Char:955), (Name:'mu'; Char:956),
|
|
(Name:'nu'; Char:957), (Name:'xi'; Char:958),
|
|
(Name:'omicron'; Char:959), (Name:'pi'; Char:960),
|
|
(Name:'rho'; Char:961), (Name:'sigmaf'; Char:962),
|
|
(Name:'sigma'; Char:963), (Name:'tau'; Char:964),
|
|
(Name:'upsilon'; Char:965), (Name:'phi'; Char:966),
|
|
(Name:'chi'; Char:967), (Name:'psi'; Char:968),
|
|
(Name:'omega'; Char:969), (Name:'thetasym'; Char:977),
|
|
(Name:'upsih'; Char:978), (Name:'piv'; Char:982),
|
|
{ General Punctuation }
|
|
(Name:'bull'; Char:8226), (Name:'hellip'; Char:8230),
|
|
(Name:'prime'; Char:8242), (Name:'Prime'; Char:8243),
|
|
(Name:'oline'; Char:8254), (Name:'frasl'; Char:8260),
|
|
{ Letterlike Symbols }
|
|
(Name:'weierp'; Char:8472), (Name:'image'; Char:8465),
|
|
(Name:'real'; Char:8476), (Name:'trade'; Char:8482),
|
|
(Name:'alefsym'; Char:8501),
|
|
{ Arrows }
|
|
(Name:'larr'; Char:8592), (Name:'uarr'; Char:8593),
|
|
(Name:'rarr'; Char:8594), (Name:'darr'; Char:8595),
|
|
(Name:'harr'; Char:8596), (Name:'crarr'; Char:8629),
|
|
(Name:'lArr'; Char:8656), (Name:'uArr'; Char:8657),
|
|
(Name:'rArr'; Char:8658), (Name:'dArr'; Char:8659),
|
|
(Name:'hArr'; Char:8660),
|
|
{ Mathematical Operators }
|
|
(Name:'forall'; Char:8704), (Name:'part'; Char:8706),
|
|
(Name:'exist'; Char:8707), (Name:'empty'; Char:8709),
|
|
(Name:'nabla'; Char:8711), (Name:'isin'; Char:8712),
|
|
(Name:'notin'; Char:8713), (Name:'ni'; Char:8715),
|
|
(Name:'prod'; Char:8719), (Name:'sum'; Char:8721),
|
|
(Name:'minus'; Char:8722), (Name:'lowast'; Char:8727),
|
|
(Name:'radic'; Char:8730), (Name:'prop'; Char:8733),
|
|
(Name:'infin'; Char:8734), (Name:'ang'; Char:8736),
|
|
(Name:'and'; Char:8743), (Name:'or'; Char:8744),
|
|
(Name:'cap'; Char:8745), (Name:'cup'; Char:8746),
|
|
(Name:'int'; Char:8747), (Name:'there4'; Char:8756),
|
|
(Name:'sim'; Char:8764), (Name:'cong'; Char:8773),
|
|
(Name:'asymp'; Char:8776), (Name:'ne'; Char:8800),
|
|
(Name:'equiv'; Char:8801), (Name:'le'; Char:8804),
|
|
(Name:'ge'; Char:8805), (Name:'sub'; Char:8834),
|
|
(Name:'sup'; Char:8835), (Name:'nsub'; Char:8836),
|
|
(Name:'sube'; Char:8838), (Name:'supe'; Char:8839),
|
|
(Name:'oplus'; Char:8853), (Name:'otimes'; Char:8855),
|
|
(Name:'perp'; Char:8869), (Name:'sdot'; Char:8901),
|
|
{ Miscellaneous Technical }
|
|
(Name:'lceil'; Char:8968), (Name:'rceil'; Char:8969),
|
|
(Name:'lfloor'; Char:8970), (Name:'rfloor'; Char:8971),
|
|
(Name:'lang'; Char:9001), (Name:'rang'; Char:9002),
|
|
(Name:'loz'; Char:9674),
|
|
{ Miscellaneous Symbols }
|
|
(Name:'spades'; Char:9824), (Name:'clubs'; Char:9827),
|
|
(Name:'hearts'; Char:9829), (Name:'diams'; Char:9830),
|
|
|
|
(* HTMLlat1.ent *)
|
|
(Name:'nbsp'; Char:160), (Name:'iexcl'; Char:161),
|
|
(Name:'cent'; Char:162), (Name:'pound'; Char:163),
|
|
(Name:'curren'; Char:164), (Name:'yen'; Char:165),
|
|
(Name:'brvbar'; Char:166), (Name:'sect'; Char:167),
|
|
(Name:'uml'; Char:168), (Name:'copy'; Char:169),
|
|
(Name:'ordf'; Char:170), (Name:'laquo'; Char:171),
|
|
(Name:'not'; Char:172), (Name:'shy'; Char:173),
|
|
(Name:'reg'; Char:174), (Name:'macr'; Char:175),
|
|
(Name:'deg'; Char:176), (Name:'plusmn'; Char:177),
|
|
(Name:'sup2'; Char:178), (Name:'sup3'; Char:179),
|
|
(Name:'acute'; Char:180), (Name:'micro'; Char:181),
|
|
(Name:'para'; Char:182), (Name:'middot'; Char:183),
|
|
(Name:'cedil'; Char:184), (Name:'sup1'; Char:185),
|
|
(Name:'ordm'; Char:186), (Name:'raquo'; Char:187),
|
|
(Name:'frac14'; Char:188), (Name:'frac12'; Char:189),
|
|
(Name:'frac34'; Char:190), (Name:'iquest'; Char:191),
|
|
(Name:'Agrave'; Char:192), (Name:'Aacute'; Char:193),
|
|
(Name:'Acirc'; Char:194), (Name:'Atilde'; Char:195),
|
|
(Name:'Auml'; Char:196), (Name:'Aring'; Char:197),
|
|
(Name:'AElig'; Char:198), (Name:'Ccedil'; Char:199),
|
|
(Name:'Egrave'; Char:200), (Name:'Eacute'; Char:201),
|
|
(Name:'Ecirc'; Char:202), (Name:'Euml'; Char:203),
|
|
(Name:'Igrave'; Char:204), (Name:'Iacute'; Char:205),
|
|
(Name:'Icirc'; Char:206), (Name:'Iuml'; Char:207),
|
|
(Name:'ETH'; Char:208), (Name:'Ntilde'; Char:209),
|
|
(Name:'Ograve'; Char:210), (Name:'Oacute'; Char:211),
|
|
(Name:'Ocirc'; Char:212), (Name:'Otilde'; Char:213),
|
|
(Name:'Ouml'; Char:214), (Name:'times'; Char:215),
|
|
(Name:'Oslash'; Char:216), (Name:'Ugrave'; Char:217),
|
|
(Name:'Uacute'; Char:218), (Name:'Ucirc'; Char:219),
|
|
(Name:'Uuml'; Char:220), (Name:'Yacute'; Char:221),
|
|
(Name:'THORN'; Char:222), (Name:'szlig'; Char:223),
|
|
(Name:'agrave'; Char:224), (Name:'aacute'; Char:225),
|
|
(Name:'acirc'; Char:226), (Name:'atilde'; Char:227),
|
|
(Name:'auml'; Char:228), (Name:'aring'; Char:229),
|
|
(Name:'aelig'; Char:230), (Name:'ccedil'; Char:231),
|
|
(Name:'egrave'; Char:232), (Name:'eacute'; Char:233),
|
|
(Name:'ecirc'; Char:234), (Name:'euml'; Char:235),
|
|
(Name:'igrave'; Char:236), (Name:'iacute'; Char:237),
|
|
(Name:'icirc'; Char:238), (Name:'iuml'; Char:239),
|
|
(Name:'eth'; Char:240), (Name:'ntilde'; Char:241),
|
|
(Name:'ograve'; Char:242), (Name:'oacute'; Char:243),
|
|
(Name:'ocirc'; Char:244), (Name:'otilde'; Char:245),
|
|
(Name:'ouml'; Char:246), (Name:'divide'; Char:247),
|
|
(Name:'oslash'; Char:248), (Name:'ugrave'; Char:249),
|
|
(Name:'uacute'; Char:250), (Name:'ucirc'; Char:251),
|
|
(Name:'uuml'; Char:252), (Name:'yacute'; Char:253),
|
|
(Name:'thorn'; Char:254), (Name:'yuml'; Char:255)
|
|
);
|
|
|
|
const
|
|
HtmlEntityHashSize = HtmlEntities;
|
|
|
|
var
|
|
HtmlEntityHashIndex : array of LongIntArray;
|
|
HtmlEntityHashInit : Boolean = False;
|
|
|
|
procedure InitHTMLEntityHash;
|
|
var I: Integer;
|
|
begin
|
|
HtmlEntityHashIndex := nil;
|
|
SetLength(HtmlEntityHashIndex, HtmlEntityHashSize);
|
|
for I := 0 to HtmlEntities - 1 do
|
|
DynArrayAppend(HtmlEntityHashIndex[HashStr(HtmlEntity[I].Name, 1, -1, True, HtmlEntityHashSize)], I);
|
|
HtmlEntityHashInit := True;
|
|
end;
|
|
|
|
function htmlDecodeCharEntity(const Entity: String): Word;
|
|
var I, J, H: Integer;
|
|
begin
|
|
if not HtmlEntityHashInit then
|
|
InitHTMLEntityHash;
|
|
H := HashStr(Entity, 1, -1, True, HtmlEntityHashSize);
|
|
for I := 0 to Length(HtmlEntityHashIndex[H]) - 1 do
|
|
begin
|
|
J := HtmlEntityHashIndex[H][I];
|
|
if Entity = HtmlEntity[J].Name then // case-sensitive
|
|
begin
|
|
Result := HtmlEntity[J].Char;
|
|
exit;
|
|
end;
|
|
end;
|
|
Result := 0;
|
|
end;
|
|
|
|
|
|
|
|
{ }
|
|
{ htmlCharRef }
|
|
{ }
|
|
function htmlCharRef(const CharVal: LongWord; const UseHex: Boolean): String;
|
|
begin
|
|
if UseHex then
|
|
if CharVal <= $FF then
|
|
Result := '#x' + Word32toHex(CharVal, 2) + ';'
|
|
else
|
|
if CharVal <= $FFFF then
|
|
Result := '#x' + Word32toHex(CharVal, 4) + ';'
|
|
else
|
|
Result := '#x' + Word32toHex(CharVal, 6) + ';'
|
|
else
|
|
Result := '#' + Word32ToStr(CharVal) + ';';
|
|
end;
|
|
|
|
|
|
|
|
{ }
|
|
{ Tests }
|
|
{ }
|
|
{$IFDEF HTML_TEST}
|
|
{$ASSERTIONS ON}
|
|
procedure Test;
|
|
begin
|
|
Assert(htmlDecodeCharEntity('quot') = 34, 'htmlDecodeCharEntity');
|
|
Assert(htmlDecodeCharEntity('QUOT') = 0, 'htmlDecodeCharEntity');
|
|
Assert(htmlDecodeCharEntity('pi') = 960, 'htmlDecodeCharEntity');
|
|
Assert(htmlDecodeCharEntity('xyz') = 0, 'htmlDecodeCharEntity');
|
|
end;
|
|
{$ENDIF}
|
|
|
|
|
|
end.
|