xtool/contrib/fundamentals/HTMLParser/flcHTMLReader.pas

193 lines
7.2 KiB
ObjectPascal

{******************************************************************************}
{ }
{ Library: Fundamentals 5.00 - HTML Parser }
{ File name: flcHTMLReader.pas }
{ File version: 5.02 }
{ Description: HTML reader utilities }
{ }
{ Copyright: Copyright (c) 2000-2020, David J Butler }
{ All rights reserved. }
{ Redistribution and use in source and binary forms, with }
{ or without modification, are permitted provided that }
{ the following conditions are met: }
{ Redistributions of source code must retain the above }
{ copyright notice, this list of conditions and the }
{ following disclaimer. }
{ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND }
{ CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED }
{ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED }
{ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A }
{ PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL }
{ THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, }
{ INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR }
{ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, }
{ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF }
{ USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) }
{ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER }
{ IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING }
{ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE }
{ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE }
{ POSSIBILITY OF SUCH DAMAGE. }
{ }
{ Github: https://github.com/fundamentalslib }
{ E-mail: fundamentals.library at gmail.com }
{ }
{ Revision history: }
{ }
{ 2001/04/13 1.01 Part of cHTML unit. }
{ 2019/02/21 5.02 Part of flcHTMLReader unit. }
{ }
{******************************************************************************}
{$INCLUDE flcHTML.inc}
unit flcHTMLReader;
interface
uses
flcStreams,
flcUnicodeCodecs,
flcUnicodeReader;
{ Encoding detection functions }
// Returns the codec class looked up by the alias Encoding, or nil when
// Encoding is an empty string.
function htmlGetUnicodeCodec(const Encoding: RawByteString): TUnicodeCodecClass;
// Returns an encoding name guessed from the leading bytes of a document:
// a UTF-16 name when a UTF-16 BOM is found, otherwise the text following a
// 'charset=' marker. Returns '' when DocumentTop is empty or nothing is found.
function htmlDetectEncoding(const DocumentTop: RawByteString): RawByteString;
// Combines the two functions above: detects an encoding name in DocumentTop
// and looks up its codec class. The result is nil when no name is detected.
function htmlDetectDocumentCodec(const DocumentTop: RawByteString): TUnicodeCodecClass;
// Chooses a codec class for a document: the explicitly given Encoding first,
// then detection from DocumentTop, then TISO8859_1Codec as the default.
function htmlGetDocumentCodec(const Encoding, DocumentTop: RawByteString): TUnicodeCodecClass;
{ Unicode Document Reader constructors }
// Wraps an existing reader in a TUnicodeReader. When Encoding is empty or
// does not resolve to a codec, the codec is detected from a sample read at
// the reader's current position (the position is restored afterwards).
// ReaderOwner is passed through to the TUnicodeReader constructor.
function htmlGetDocumentReader(
const Reader: AReaderEx; const ReaderOwner: Boolean = True;
const Encoding: RawByteString = ''): TUnicodeReader;
// Creates a TUnicodeMemoryReader over the bytes of Document, using the codec
// chosen by htmlGetDocumentCodec(Encoding, Document).
function htmlGetDocumentReaderForRawString(
const Document: RawByteString;
const Encoding: RawByteString = ''): TUnicodeReader;
// Creates a TFileReader for FileName and wraps it via htmlGetDocumentReader,
// with the file reader owned by the returned reader.
function htmlGetDocumentReaderForFile(
const FileName: String;
const Encoding: RawByteString = ''): TUnicodeReader;
implementation
uses
flcUTF,
flcStrings;
{ Encoding detection functions }
// Resolves an encoding alias to a Unicode codec class.
//
// Encoding: encoding alias; may be empty.
// Returns:  nil for an empty alias, otherwise the result of
//           GetCodecClassByAliasA for that alias.
function htmlGetUnicodeCodec(const Encoding: RawByteString): TUnicodeCodecClass;
begin
  Result := nil;
  // An empty alias is never looked up
  if Encoding = '' then
    exit;
  Result := GetCodecClassByAliasA(Encoding);
end;
// Guesses the encoding name of an HTML document from its leading bytes.
//
// DocumentTop: raw (undecoded) bytes from the start of the document.
// Returns:     an encoding name, or '' when DocumentTop is empty or no
//              indicator is found.
//
// Detection order:
//   1. UTF-16 byte order mark
//   2. 'text/html; charset=' as found in a meta content attribute
//   3. any other 'charset=' indicator
function htmlDetectEncoding(const DocumentTop: RawByteString): RawByteString;
var
  P          : PAnsiChar;
  L          : Integer;
  SwapEndian : Boolean;
begin
  L := Length(DocumentTop);
  if L = 0 then
    begin
      Result := '';
      exit;
    end;
  P := Pointer(DocumentTop);
  // check if document is UTF-16 Unicode encoding
  if DetectUTF16BOM(P, L, SwapEndian) then
    begin
      // NOTE(review): assumes the out parameter of DetectUTF16BOM is its
      // swap-endian flag, i.e. True when the BOM is byte-swapped relative to
      // the (little-endian) host - confirm against flcUTF.
      // A swapped BOM therefore means big-endian data ('utf16'); a BOM in
      // host order means little-endian data ('utf16le').
      if SwapEndian then
        Result := 'utf16'
      else
        Result := 'utf16le';
      exit;
    end;
  // check document html meta tag
  Result := StrBetweenB(DocumentTop, 'text/html; charset=', [#0..#32, '"', '''', '>', ';'],
      False, True, False);
  if Result <> '' then
    exit;
  // find any charset indicator; surrounding whitespace and quotes are trimmed.
  // Result is left empty when no indicator is present.
  Result := StrTrimB(StrBetweenB(DocumentTop, 'charset=', ['<', '>', ';', ']'],
      False, True, False), [#0..#32, '"', '''']);
end;
// Detects the codec class of a document from its leading bytes.
//
// DocumentTop: raw bytes from the start of the document.
// Returns:     the codec class for the encoding name found by
//              htmlDetectEncoding, or nil when no name is found.
function htmlDetectDocumentCodec(const DocumentTop: RawByteString): TUnicodeCodecClass;
var
  EncodingName : RawByteString;
begin
  EncodingName := htmlDetectEncoding(DocumentTop);
  Result := htmlGetUnicodeCodec(EncodingName);
end;
// Selects the codec class to use for decoding a document.
//
// Encoding:    explicitly requested encoding alias; may be empty.
// DocumentTop: raw bytes from the start of the document, used for detection.
// Returns:     never nil - the first available of:
//                1. the codec for the specified Encoding
//                2. the codec detected from DocumentTop
//                3. TISO8859_1Codec, the default for HTML (Latin1)
function htmlGetDocumentCodec(const Encoding, DocumentTop: RawByteString): TUnicodeCodecClass;
begin
  Result := htmlGetUnicodeCodec(Encoding);
  if not Assigned(Result) then
    begin
      // No usable explicit encoding: fall back to detection
      Result := htmlDetectDocumentCodec(DocumentTop);
      if not Assigned(Result) then
        // Nothing detected: fall back to ISO-8859-1
        Result := TISO8859_1Codec;
    end;
end;
{ Unicode Document Reader constructors }
// Creates a TUnicodeReader that decodes an HTML document read from Reader.
//
// Reader:      source of the raw document bytes.
// ReaderOwner: passed through to TUnicodeReader.Create together with Reader.
// Encoding:    encoding alias to use; when empty or unknown the encoding is
//              detected from a sample of up to DocumentSampleSize bytes read
//              at the reader's current position.
// Returns:     a new TUnicodeReader constructed with a newly created codec
//              instance. TISO8859_1Codec is used when no codec can be
//              determined.
function htmlGetDocumentReader(
    const Reader: AReaderEx; const ReaderOwner: Boolean;
    const Encoding: RawByteString): TUnicodeReader;
const
  DocumentSampleSize = 4096;
var
  CodecClass : TUnicodeCodecClass;
  SavedPos   : Int64;
  Sample     : RawByteString;
begin
  CodecClass := htmlGetUnicodeCodec(Encoding);
  if not Assigned(CodecClass) then
    begin
      // detect from document top
      // The position is kept in a 64-bit variable so that offsets beyond
      // 2 GiB are not truncated when saved and restored.
      SavedPos := Reader.Position;
      Sample := Reader.ReadStrB(DocumentSampleSize);
      // rewind so the returned reader starts where the caller left Reader
      Reader.Position := SavedPos;
      CodecClass := htmlDetectDocumentCodec(Sample);
    end;
  if not Assigned(CodecClass) then
    CodecClass := TISO8859_1Codec; // default codec
  Result := TUnicodeReader.Create(Reader, ReaderOwner, CodecClass.Create, True);
end;
// Creates a Unicode reader over an HTML document held in a raw byte string.
//
// Document: the raw (undecoded) document bytes.
// Encoding: encoding alias to use; when empty or unknown the codec is chosen
//           by htmlGetDocumentCodec from the document content.
// Returns:  a new TUnicodeMemoryReader constructed with a newly created codec
//           instance.
//
// NOTE(review): the reader is given a pointer into Document's buffer; it
// looks like the caller must keep Document alive for the lifetime of the
// returned reader - confirm whether TUnicodeMemoryReader copies the data.
function htmlGetDocumentReaderForRawString(const Document: RawByteString;
    const Encoding: RawByteString): TUnicodeReader;
var
  CodecClass : TUnicodeCodecClass;
begin
  CodecClass := htmlGetDocumentCodec(Encoding, Document);
  Result := TUnicodeMemoryReader.Create(
      Pointer(Document), Length(Document),
      CodecClass.Create, True);
end;
// Creates a Unicode reader over an HTML document stored in a file.
//
// FileName: name of the file, passed to TFileReader.Create.
// Encoding: encoding alias to use; when empty or unknown the encoding is
//           detected by htmlGetDocumentReader.
// Returns:  the TUnicodeReader produced by htmlGetDocumentReader, with the
//           file reader passed as owned (ReaderOwner = True).
function htmlGetDocumentReaderForFile(const FileName: String;
    const Encoding: RawByteString): TUnicodeReader;
var
  FileReader : TFileReader;
begin
  FileReader := TFileReader.Create(FileName);
  Result := htmlGetDocumentReader(FileReader, True, Encoding);
end;
end.