{******************************************************************************} { } { Library: Fundamentals 5.00 - HTML Parser } { File name: flcHTMLReader.pas } { File version: 5.02 } { Description: HTML reader utilities } { } { Copyright: Copyright (c) 2000-2020, David J Butler } { All rights reserved. } { Redistribution and use in source and binary forms, with } { or without modification, are permitted provided that } { the following conditions are met: } { Redistributions of source code must retain the above } { copyright notice, this list of conditions and the } { following disclaimer. } { THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND } { CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED } { WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED } { WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A } { PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL } { THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, } { INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR } { CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, } { PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF } { USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) } { HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER } { IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING } { NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE } { USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE } { POSSIBILITY OF SUCH DAMAGE. } { } { Github: https://github.com/fundamentalslib } { E-mail: fundamentals.library at gmail.com } { } { Revision history: } { } { 2001/04/13 1.01 Part of cHTML unit. } { 2019/02/21 5.02 Part flcHTMLReader unit. 
}
{                                                                              }
{******************************************************************************}

{$INCLUDE flcHTML.inc}

unit flcHTMLReader;

interface

uses
  flcStreams,
  flcUnicodeCodecs,
  flcUnicodeReader;



{                                                                              }
{ Encoding detection functions                                                 }
{                                                                              }
function  htmlGetUnicodeCodec(const Encoding: RawByteString): TUnicodeCodecClass;
function  htmlDetectEncoding(const DocumentTop: RawByteString): RawByteString;
function  htmlDetectDocumentCodec(const DocumentTop: RawByteString): TUnicodeCodecClass;
function  htmlGetDocumentCodec(const Encoding, DocumentTop: RawByteString): TUnicodeCodecClass;



{                                                                              }
{ Unicode Document Reader constructors                                         }
{                                                                              }
function  htmlGetDocumentReader(
          const Reader: AReaderEx;
          const ReaderOwner: Boolean = True;
          const Encoding: RawByteString = ''): TUnicodeReader;

function  htmlGetDocumentReaderForRawString(
          const Document: RawByteString;
          const Encoding: RawByteString = ''): TUnicodeReader;

function  htmlGetDocumentReaderForFile(
          const FileName: String;
          const Encoding: RawByteString = ''): TUnicodeReader;



implementation

uses
  flcUTF,
  flcStrings;



{                                                                              }
{ Encoding detection functions                                                 }
{                                                                              }

{ Maps an encoding name to a codec class.                                      }
{   Encoding  Encoding name / alias; may be empty.                             }
{   Returns   nil when Encoding is empty, otherwise whatever                   }
{             GetCodecClassByAliasA returns for the name.                      }
function htmlGetUnicodeCodec(const Encoding: RawByteString): TUnicodeCodecClass;
begin
  Result := nil;
  if Encoding = '' then
    exit;
  Result := GetCodecClassByAliasA(Encoding);
end;

{ Guesses the encoding name of a document from its leading bytes.              }
{   DocumentTop  The first bytes of the document (raw, undecoded).             }
{   Returns      An encoding name, or '' when DocumentTop is empty or no       }
{                indicator was found.                                          }
{ Checks, in order: a UTF-16 byte order mark, the text following               }
{ 'text/html; charset=', and finally the text following any 'charset='.        }
function htmlDetectEncoding(const DocumentTop: RawByteString): RawByteString;
var
  Buf       : PAnsiChar;
  Len       : Integer;
  SwapBytes : Boolean;
begin
  Result := '';
  Len := Length(DocumentTop);
  if Len = 0 then
    exit;
  Buf := Pointer(DocumentTop);
  // a UTF-16 byte order mark is checked before any declared charset
  if DetectUTF16BOM(Buf, Len, SwapBytes) then
    begin
      // NOTE(review): the flag-to-name mapping below is kept exactly as it
      // was; confirm against DetectUTF16BOM and the codec alias table that
      // 'utf16' / 'utf16le' are the intended names for each flag value.
      if SwapBytes then
        Result := 'utf16le'
      else
        Result := 'utf16';
      exit;
    end;
  // charset given together with a 'text/html' content type (html meta tag)
  Result := StrBetweenB(DocumentTop, 'text/html; charset=',
      [#0..#32, '"', '''', '>', ';'], False, True, False);
  if Result = '' then
    begin
      // otherwise accept any charset indicator; the extracted value is
      // trimmed of control characters, spaces and quote characters
      Result := StrTrimB(
          StrBetweenB(DocumentTop, 'charset=',
              ['<', '>', ';', ']'], False, True, False),
          [#0..#32, '"', '''']);
    end;
end;

{ Detects the document encoding from its leading bytes and maps it to a        }
{ codec class.                                                                 }
{   DocumentTop  The first bytes of the document (raw, undecoded).             }
{   Returns      The result of htmlGetUnicodeCodec for the detected encoding   }
{                name.                                                         }
function htmlDetectDocumentCodec(const DocumentTop: RawByteString): TUnicodeCodecClass;
begin
  Result := htmlGetUnicodeCodec(htmlDetectEncoding(DocumentTop));
end;

{ Selects the codec class to use for a document.                               }
{   Encoding     Explicitly requested encoding name; may be empty.             }
{   DocumentTop  The first bytes of the document, used for detection.          }
{   Returns      The codec for Encoding if one is found, else the codec        }
{                detected from DocumentTop, else TISO8859_1Codec.              }
function htmlGetDocumentCodec(const Encoding, DocumentTop: RawByteString): TUnicodeCodecClass;
begin
  // Check specified encoding
  Result := htmlGetUnicodeCodec(Encoding);
  if Assigned(Result) then
    exit;
  // Detect encoding
  Result := htmlDetectDocumentCodec(DocumentTop);
  if Assigned(Result) then
    exit;
  // Use default for HTML: ISO-8859-1 (Latin1)
  Result := TISO8859_1Codec;
end;



{                                                                              }
{ Unicode Document Reader constructors                                         }
{                                                                              }

{ Creates a TUnicodeReader that decodes the document read from Reader.         }
{   Reader       Source of the raw document bytes.                             }
{   ReaderOwner  Passed through to TUnicodeReader.Create.                      }
{   Encoding     Explicitly requested encoding name; may be empty.             }
{   Returns      A new TUnicodeReader using a newly created codec instance.    }
{ When Encoding does not resolve to a codec, up to DocumentSampleSize bytes    }
{ are read from the current position to detect the encoding, after which the   }
{ reader position is restored.                                                 }
function htmlGetDocumentReader(
         const Reader: AReaderEx;
         const ReaderOwner: Boolean;
         const Encoding: RawByteString): TUnicodeReader;
const
  DocumentSampleSize = 4096;
var
  CodecClass : TUnicodeCodecClass;
  StartPos   : Int64;
  Sample     : RawByteString;
begin
  CodecClass := htmlGetUnicodeCodec(Encoding);
  if not Assigned(CodecClass) then
    begin
      // detect from document top
      // StartPos is 64-bit so that a position beyond the 32-bit Integer range
      // is restored without truncation
      StartPos := Reader.Position;
      Sample := Reader.ReadStrB(DocumentSampleSize);
      Reader.Position := StartPos;
      // Encoding did not resolve, so pass '' and let htmlGetDocumentCodec
      // detect from the sample and apply the ISO-8859-1 default
      CodecClass := htmlGetDocumentCodec('', Sample);
    end;
  Result := TUnicodeReader.Create(Reader, ReaderOwner, CodecClass.Create, True);
end;

{ Creates a Unicode reader over an in-memory raw document.                     }
{   Document  The raw document bytes.                                          }
{   Encoding  Explicitly requested encoding name; may be empty.                }
{   Returns   A new TUnicodeMemoryReader using the codec selected by           }
{             htmlGetDocumentCodec.                                            }
{ NOTE(review): the reader is handed a pointer to Document's data; presumably  }
{ the caller must keep Document alive and unmodified for the lifetime of the   }
{ reader - confirm against TUnicodeMemoryReader.                               }
function htmlGetDocumentReaderForRawString(const Document: RawByteString;
         const Encoding: RawByteString): TUnicodeReader;
begin
  Result := TUnicodeMemoryReader.Create(
      Pointer(Document), Length(Document),
      htmlGetDocumentCodec(Encoding, Document).Create, True);
end;

{ Creates a Unicode reader over the document stored in a file.                 }
{   FileName  Name of the file to read.                                        }
{   Encoding  Explicitly requested encoding name; may be empty.                }
{   Returns   The reader produced by htmlGetDocumentReader for a new           }
{             TFileReader, with ReaderOwner set to True.                       }
{ NOTE(review): if htmlGetDocumentReader raises before the result is           }
{ constructed, the TFileReader created here may not be freed - confirm the     }
{ ownership rules of TUnicodeReader before adding cleanup.                     }
function htmlGetDocumentReaderForFile(const FileName: String;
         const Encoding: RawByteString): TUnicodeReader;
begin
  Result := htmlGetDocumentReader(
      TFileReader.Create(FileName), True, Encoding);
end;



end.