193 lines
7.2 KiB
ObjectPascal
193 lines
7.2 KiB
ObjectPascal
{******************************************************************************}
|
|
{ }
|
|
{ Library: Fundamentals 5.00 - HTML Parser }
|
|
{ File name: flcHTMLReader.pas }
|
|
{ File version: 5.02 }
|
|
{ Description: HTML reader utilities }
|
|
{ }
|
|
{ Copyright: Copyright (c) 2000-2020, David J Butler }
|
|
{ All rights reserved. }
|
|
{ Redistribution and use in source and binary forms, with }
|
|
{ or without modification, are permitted provided that }
|
|
{ the following conditions are met: }
|
|
{ Redistributions of source code must retain the above }
|
|
{ copyright notice, this list of conditions and the }
|
|
{ following disclaimer. }
|
|
{ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND }
|
|
{ CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED }
|
|
{ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED }
|
|
{ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A }
|
|
{ PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL }
|
|
{ THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, }
|
|
{ INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR }
|
|
{ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, }
|
|
{ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF }
|
|
{ USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) }
|
|
{ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER }
|
|
{ IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING }
|
|
{ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE }
|
|
{ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE }
|
|
{ POSSIBILITY OF SUCH DAMAGE. }
|
|
{ }
|
|
{ Github: https://github.com/fundamentalslib }
|
|
{ E-mail: fundamentals.library at gmail.com }
|
|
{ }
|
|
{ Revision history: }
|
|
{ }
|
|
{ 2001/04/13 1.01 Part of cHTML unit. }
|
|
{ 2019/02/21 5.02 Part of flcHTMLReader unit. }
|
|
{ }
|
|
{******************************************************************************}
|
|
|
|
{$INCLUDE flcHTML.inc}
|
|
|
|
unit flcHTMLReader;
|
|
|
|
interface
|
|
|
|
uses
|
|
flcStreams,
|
|
flcUnicodeCodecs,
|
|
flcUnicodeReader;
|
|
|
|
|
|
|
|
{ Encoding detection functions }
|
|
function htmlGetUnicodeCodec(const Encoding: RawByteString): TUnicodeCodecClass;
|
|
function htmlDetectEncoding(const DocumentTop: RawByteString): RawByteString;
|
|
function htmlDetectDocumentCodec(const DocumentTop: RawByteString): TUnicodeCodecClass;
|
|
function htmlGetDocumentCodec(const Encoding, DocumentTop: RawByteString): TUnicodeCodecClass;
|
|
|
|
|
|
|
|
{ Unicode Document Reader constructors }
|
|
function htmlGetDocumentReader(
|
|
const Reader: AReaderEx; const ReaderOwner: Boolean = True;
|
|
const Encoding: RawByteString = ''): TUnicodeReader;
|
|
function htmlGetDocumentReaderForRawString(
|
|
const Document: RawByteString;
|
|
const Encoding: RawByteString = ''): TUnicodeReader;
|
|
function htmlGetDocumentReaderForFile(
|
|
const FileName: String;
|
|
const Encoding: RawByteString = ''): TUnicodeReader;
|
|
|
|
|
|
|
|
implementation
|
|
|
|
uses
|
|
flcUTF,
|
|
flcStrings;
|
|
|
|
|
|
|
|
{ Encoding detection functions }
|
|
{ Resolves an encoding name to a Unicode codec class.                          }
{   Encoding - encoding name / alias; may be empty.                            }
{ Returns nil when Encoding is empty, otherwise whatever                       }
{ GetCodecClassByAliasA returns for the given alias.                           }
function htmlGetUnicodeCodec(const Encoding: RawByteString): TUnicodeCodecClass;
begin
  if Encoding = '' then
    Result := nil
  else
    Result := GetCodecClassByAliasA(Encoding);
end;
|
|
|
|
{ Guesses the encoding name of an HTML document from a sample of its start.    }
{   DocumentTop - leading bytes of the document.                               }
{ Checks are applied in this order:                                            }
{   1. UTF-16 byte order mark (via DetectUTF16BOM);                            }
{   2. the value following 'text/html; charset=';                              }
{   3. the value following any 'charset='.                                     }
{ Returns '' for an empty sample; otherwise the result of the first check      }
{ that yields a value, or of the last check if none of the earlier ones do.    }
function htmlDetectEncoding(const DocumentTop: RawByteString): RawByteString;
var
  TopPtr  : PAnsiChar;
  TopLen  : Integer;
  BomFlag : Boolean;
begin
  Result := '';
  TopLen := Length(DocumentTop);
  if TopLen = 0 then
    exit;
  TopPtr := Pointer(DocumentTop);
  // Step 1: a UTF-16 byte order mark decides the encoding outright; the flag
  // handed back by DetectUTF16BOM selects which of the two names is reported
  if DetectUTF16BOM(TopPtr, TopLen, BomFlag) then
    begin
      if BomFlag then
        Result := 'utf16le'
      else
        Result := 'utf16';
      exit;
    end;
  // Step 2: charset given as part of an html content-type value
  Result := StrBetweenB(DocumentTop, 'text/html; charset=',
      [#0..#32, '"', '''', '>', ';'], False, True, False);
  // Step 3: nothing found, so accept any charset indicator and strip
  // surrounding whitespace / quote characters from it
  if Result = '' then
    Result := StrTrimB(
        StrBetweenB(DocumentTop, 'charset=', ['<', '>', ';', ']'],
            False, True, False),
        [#0..#32, '"', '''']);
end;
|
|
|
|
{ Detects the codec class for a document from a sample of its start.           }
{   DocumentTop - leading bytes of the document.                               }
{ Runs htmlDetectEncoding on the sample and maps the detected name with        }
{ htmlGetUnicodeCodec; the result is nil when no encoding name was detected.   }
function htmlDetectDocumentCodec(const DocumentTop: RawByteString): TUnicodeCodecClass;
var
  DetectedName : RawByteString;
begin
  DetectedName := htmlDetectEncoding(DocumentTop);
  Result := htmlGetUnicodeCodec(DetectedName);
end;
|
|
|
|
{ Chooses the codec class to use for a document.                               }
{   Encoding    - explicitly requested encoding name; may be empty.            }
{   DocumentTop - leading bytes of the document, used for detection.           }
{ Order of preference: the codec for the specified Encoding, then the codec    }
{ detected from DocumentTop, then TISO8859_1Codec (Latin1) as the HTML         }
{ default.                                                                     }
function htmlGetDocumentCodec(const Encoding, DocumentTop: RawByteString): TUnicodeCodecClass;
begin
  // Explicitly specified encoding wins
  Result := htmlGetUnicodeCodec(Encoding);
  // Otherwise try to detect it from the document itself
  if not Assigned(Result) then
    Result := htmlDetectDocumentCodec(DocumentTop);
  // Otherwise use default for HTML: ISO-8859-1 (Latin1)
  if not Assigned(Result) then
    Result := TISO8859_1Codec;
end;
|
|
|
|
|
|
|
|
{ Unicode Document Reader constructors }
|
|
{ Creates a Unicode reader over an existing byte reader.                       }
{   Reader      - source of the raw document bytes.                            }
{   ReaderOwner - passed through to TUnicodeReader.Create together with        }
{                 Reader.                                                      }
{   Encoding    - explicitly requested encoding name; may be empty.            }
{ The codec is taken from Encoding when it resolves to a codec class.          }
{ Otherwise up to DocumentSampleSize bytes are read from the current position  }
{ of Reader to detect the encoding, and the position is restored afterwards.   }
{ If detection fails too, TISO8859_1Codec is used.                             }
function htmlGetDocumentReader(
         const Reader: AReaderEx; const ReaderOwner: Boolean;
         const Encoding: RawByteString): TUnicodeReader;
const
  DocumentSampleSize = 4096;
var
  C : TUnicodeCodecClass;
  P : Int64;
  T : RawByteString;
begin
  C := htmlGetUnicodeCodec(Encoding);
  if not Assigned(C) then
    begin
      // detect from document top
      // The saved position is kept in an Int64 so that an offset beyond the
      // 32-bit Integer range is restored unchanged instead of being truncated
      // (assumes Reader.Position is a 64-bit value - confirm against flcStreams)
      P := Reader.Position;
      T := Reader.ReadStrB(DocumentSampleSize);
      Reader.Position := P;
      C := htmlDetectDocumentCodec(T);
    end;
  if not Assigned(C) then
    C := TISO8859_1Codec; // default codec
  // NOTE(review): the final True is presumably the codec-owner flag of
  // TUnicodeReader.Create - confirm against flcUnicodeReader
  Result := TUnicodeReader.Create(Reader, ReaderOwner, C.Create, True);
end;
|
|
|
|
{ Creates a Unicode reader over an in-memory document.                         }
{   Document - raw document bytes.                                             }
{   Encoding - explicitly requested encoding name; may be empty.               }
{ The codec class is selected by htmlGetDocumentCodec, using Document itself   }
{ as the detection sample. The reader is constructed from the address and      }
{ length of Document.                                                          }
{ NOTE(review): presumably Document must stay alive for as long as the         }
{ returned reader is in use, since only its address is handed over - confirm.  }
function htmlGetDocumentReaderForRawString(const Document: RawByteString;
         const Encoding: RawByteString): TUnicodeReader;
var
  CodecClass : TUnicodeCodecClass;
begin
  CodecClass := htmlGetDocumentCodec(Encoding, Document);
  Result := TUnicodeMemoryReader.Create(
      Pointer(Document), Length(Document),
      CodecClass.Create, True);
end;
|
|
|
|
{ Creates a Unicode reader over a file.                                        }
{   FileName - name of the file to read.                                       }
{   Encoding - explicitly requested encoding name; may be empty.               }
{ A TFileReader is created for FileName and handed to htmlGetDocumentReader    }
{ with ReaderOwner set to True.                                                }
function htmlGetDocumentReaderForFile(const FileName: String;
         const Encoding: RawByteString): TUnicodeReader;
var
  FileSource : TFileReader;
begin
  FileSource := TFileReader.Create(FileName);
  Result := htmlGetDocumentReader(FileSource, True, Encoding);
end;
|
|
|
|
|
|
|
|
end.
|
|
|