1/* Part of SWI-Prolog 2 3 Author: Jan Wielemaker 4 E-mail: J.Wielemaker@vu.nl 5 WWW: http://www.swi-prolog.org 6 Copyright (c) 2000-2020, University of Amsterdam 7 VU University Amsterdam 8 CWI, Amsterdam 9 All rights reserved. 10 11 Redistribution and use in source and binary forms, with or without 12 modification, are permitted provided that the following conditions 13 are met: 14 15 1. Redistributions of source code must retain the above copyright 16 notice, this list of conditions and the following disclaimer. 17 18 2. Redistributions in binary form must reproduce the above copyright 19 notice, this list of conditions and the following disclaimer in 20 the documentation and/or other materials provided with the 21 distribution. 22 23 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 26 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 27 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 28 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 29 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 30 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 31 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 33 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 34 POSSIBILITY OF SUCH DAMAGE. 35*/ 36 37:- module(sgml, 38 [ load_html/3, % +Input, -DOM, +Options 39 load_xml/3, % +Input, -DOM, +Options 40 load_sgml/3, % +Input, -DOM, +Options 41 42 load_sgml_file/2, % +File, -ListOfContent 43 load_xml_file/2, % +File, -ListOfContent 44 load_html_file/2, % +File, -Document 45 46 load_structure/3, % +File, -Term, +Options 47 48 load_dtd/2, % +DTD, +File 49 load_dtd/3, % +DTD, +File, +Options 50 dtd/2, % +Type, -DTD 51 dtd_property/2, % +DTD, ?Property 52 53 new_dtd/2, % +Doctype, -DTD 54 free_dtd/1, % +DTD 55 open_dtd/3, % +DTD, +Options, -Stream 56 57 new_sgml_parser/2, % -Parser, +Options 58 free_sgml_parser/1, % +Parser 59 set_sgml_parser/2, % +Parser, +Options 60 get_sgml_parser/2, % +Parser, +Options 61 sgml_parse/2, % +Parser, +Options 62 63 sgml_register_catalog_file/2, % +File, +StartOrEnd 64 65 xml_quote_attribute/3, % +In, -Quoted, +Encoding 66 xml_quote_cdata/3, % +In, -Quoted, +Encoding 67 xml_quote_attribute/2, % +In, -Quoted 68 xml_quote_cdata/2, % +In, -Quoted 69 xml_name/1, % +In 70 xml_name/2, % +In, +Encoding 71 72 xsd_number_string/2, % ?Number, ?String 73 xsd_time_string/3, % ?Term, ?Type, ?String 74 75 xml_basechar/1, % +Code 76 xml_ideographic/1, % +Code 77 xml_combining_char/1, % +Code 78 xml_digit/1, % +Code 79 xml_extender/1, % +Code 80 81 iri_xml_namespace/2, % +IRI, -Namespace 82 iri_xml_namespace/3, % +IRI, -Namespace, -LocalName 83 xml_is_dom/1 % +Term 84 ]). 85:- autoload(library(error),[instantiation_error/1]). 86:- autoload(library(iostream),[open_any/5,close_any/1]). 87:- autoload(library(lists),[member/2,selectchk/3]). 88:- autoload(library(option),[select_option/3,merge_options/3]). 89 90:- meta_predicate 91 load_structure( , , ), 92 load_html( , , ), 93 load_xml( , , ), 94 load_sgml( , , ). 95 96:- predicate_options(load_structure/3, 3, 97 [ charpos(integer), 98 cdata(oneof([atom,string])), 99 defaults(boolean), 100 dialect(oneof([html,html4,html5,sgml,xhtml,xhtml5,xml,xmlns])), 101 doctype(atom), 102 dtd(any), 103 encoding(oneof(['iso-8859-1', 'utf-8', 'us-ascii'])), 104 entity(atom,atom), 105 keep_prefix(boolean), 106 file(atom), 107 line(integer), 108 offset(integer), 109 number(oneof([token,integer])), 110 qualify_attributes(boolean), 111 shorttag(boolean), 112 case_sensitive_attributes(boolean), 113 case_preserving_attributes(boolean), 114 system_entities(boolean), 115 max_memory(integer), 116 space(oneof([sgml,preserve,default,remove,strict])), 117 xmlns(atom), 118 xmlns(atom,atom), 119 pass_to(sgml_parse/2, 2) 120 ]). 121:- predicate_options(load_html/3, 3, 122 [ pass_to(load_structure/3, 3) 123 ]). 124:- predicate_options(load_xml/3, 3, 125 [ pass_to(load_structure/3, 3) 126 ]). 127:- predicate_options(load_sgml/3, 3, 128 [ pass_to(load_structure/3, 3) 129 ]). 130:- predicate_options(load_dtd/3, 3, 131 [ dialect(oneof([sgml,xml,xmlns])), 132 pass_to(open/4, 4) 133 ]). 134:- predicate_options(sgml_parse/2, 2, 135 [ call(oneof([begin,end,cdata,pi,decl,error,xmlns,urlns]), 136 callable), 137 cdata(oneof([atom,string])), 138 content_length(integer), 139 document(-any), 140 max_errors(integer), 141 parse(oneof([file,element,content,declaration,input])), 142 source(any), 143 syntax_errors(oneof([quiet,print,style])), 144 xml_no_ns(oneof([error,quiet])) 145 ]). 146:- predicate_options(new_sgml_parser/2, 2, 147 [ dtd(any) 148 ]).
178:- multifile user:file_search_path/2. 179:- dynamic user:file_search_path/2. 180 181user:file_search_path(dtd, '.'). 182user:file_search_path(dtd, swi('library/DTD')). 183 184sgml_register_catalog_file(File, Location) :- 185 prolog_to_os_filename(File, OsFile), 186 '_sgml_register_catalog_file'(OsFile, Location). 187 188:- use_foreign_library(foreign(sgml2pl)). 189 190register_catalog(Base) :- 191 absolute_file_name(dtd(Base), 192 [ extensions([soc]), 193 access(read), 194 file_errors(fail) 195 ], 196 SocFile), 197 sgml_register_catalog_file(SocFile, end). 198 199:- initialization 200 ignore(register_catalog('HTML4')). 201 202 203 /******************************* 204 * DTD HANDLING * 205 *******************************/ 206 207/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 208Note that concurrent access to DTD objects is not allowed, and hence we 209will allocate and destroy them in each thread. Possibibly it would be 210nicer to find out why concurrent access to DTD's is flawed. It is 211diagnosed to mess with the entity resolution by Fabien Todescato. 212- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ 213 214:- thread_local 215 current_dtd/2. 216:- volatile 217 current_dtd/2. 218:- thread_local 219 registered_cleanup/0. 220:- volatile 221 registered_cleanup/0. 222 223:- multifile 224 dtd_alias/2. 225 226:- create_prolog_flag(html_dialect, html5, [type(atom)]). 227 228dtd_alias(html4, 'HTML4'). 229dtd_alias(html5, 'HTML5'). 230dtd_alias(html, DTD) :- 231 current_prolog_flag(html_dialect, Dialect), 232 dtd_alias(Dialect, DTD).
dtd(Type)
. Loaded DTD
objects are cached. Note that DTD objects may not be shared
between threads. Therefore, dtd/2 maintains the pool of DTD
objects using a thread_local predicate. DTD objects are
destroyed if a thread terminates.
244dtd(Type, DTD) :- 245 current_dtd(Type, DTD), 246 !. 247dtd(Type, DTD) :- 248 new_dtd(Type, DTD), 249 ( dtd_alias(Type, Base) 250 -> true 251 ; Base = Type 252 ), 253 absolute_file_name(dtd(Base), 254 [ extensions([dtd]), 255 access(read) 256 ], DtdFile), 257 load_dtd(DTD, DtdFile), 258 register_cleanup, 259 asserta(current_dtd(Type, DTD)).
274load_dtd(DTD, DtdFile) :- 275 load_dtd(DTD, DtdFile, []). 276load_dtd(DTD, DtdFile, Options) :- 277 sgml_open_options(sgml:Options, OpenOptions, sgml:DTDOptions), 278 setup_call_cleanup( 279 open_dtd(DTD, DTDOptions, DtdOut), 280 setup_call_cleanup( 281 open(DtdFile, read, DtdIn, OpenOptions), 282 copy_stream_data(DtdIn, DtdOut), 283 close(DtdIn)), 284 close(DtdOut)). 285 286split_dtd_options([], [], []). 287split_dtd_options([H|T], [H|TD], S) :- 288 dtd_option(H), 289 !, 290 split_dtd_options(T, TD, S). 291split_dtd_options([H|T], TD, [H|S]) :- 292 split_dtd_options(T, TD, S). 293 294dtd_option(dialect(_)).
302destroy_dtds :-
303 ( current_dtd(_Type, DTD),
304 free_dtd(DTD),
305 fail
306 ; true
307 ).
313register_cleanup :- 314 registered_cleanup, 315 !. 316register_cleanup :- 317 ( current_prolog_flag(threads, true) 318 -> prolog_listen(this_thread_exit, destroy_dtds) 319 ; true 320 ), 321 assert(registered_cleanup). 322 323 324 /******************************* 325 * EXAMINE DTD * 326 *******************************/ 327 328prop(doctype(_), _). 329prop(elements(_), _). 330prop(entities(_), _). 331prop(notations(_), _). 332prop(entity(E, _), DTD) :- 333 ( nonvar(E) 334 -> true 335 ; '$dtd_property'(DTD, entities(EL)), 336 member(E, EL) 337 ). 338prop(element(E, _, _), DTD) :- 339 ( nonvar(E) 340 -> true 341 ; '$dtd_property'(DTD, elements(EL)), 342 member(E, EL) 343 ). 344prop(attributes(E, _), DTD) :- 345 ( nonvar(E) 346 -> true 347 ; '$dtd_property'(DTD, elements(EL)), 348 member(E, EL) 349 ). 350prop(attribute(E, A, _, _), DTD) :- 351 ( nonvar(E) 352 -> true 353 ; '$dtd_property'(DTD, elements(EL)), 354 member(E, EL) 355 ), 356 ( nonvar(A) 357 -> true 358 ; '$dtd_property'(DTD, attributes(E, AL)), 359 member(A, AL) 360 ). 361prop(notation(N, _), DTD) :- 362 ( nonvar(N) 363 -> true 364 ; '$dtd_property'(DTD, notations(NL)), 365 member(N, NL) 366 ). 367 368dtd_property(DTD, Prop) :- 369 prop(Prop, DTD), 370 '$dtd_property'(DTD, Prop). 371 372 373 /******************************* 374 * SGML * 375 *******************************/
A proper XML document contains only a single toplevel element whose name matches the document type. Nevertheless, a list is returned for consistency with the representation of element content.
The encoding(+Encoding)
option is treated special for
compatibility reasons:
iso-8859-1
, us-ascii
or utf-8
,
the stream is opened in binary mode and the option is passed
to the SGML parser.399load_structure(Spec, DOM, Options) :- 400 sgml_open_options(Options, OpenOptions, SGMLOptions), 401 setup_call_cleanup( 402 open_any(Spec, read, In, Close, OpenOptions), 403 load_structure_from_stream(In, DOM, SGMLOptions), 404 close_any(Close)). 405 406sgml_open_options(Options, OpenOptions, SGMLOptions) :- 407 Options = M:Plain, 408 ( select_option(encoding(Encoding), Plain, NoEnc) 409 -> ( sgml_encoding(Encoding) 410 -> merge_options(NoEnc, [type(binary)], OpenOptions), 411 SGMLOptions = Options 412 ; OpenOptions = Plain, 413 SGMLOptions = M:NoEnc 414 ) 415 ; merge_options(Plain, [type(binary)], OpenOptions), 416 SGMLOptions = Options 417 ). 418 419sgml_encoding(Enc) :- 420 downcase_atom(Enc, Enc1), 421 sgml_encoding_l(Enc1). 422 423sgml_encoding_l('iso-8859-1'). 424sgml_encoding_l('us-ascii'). 425sgml_encoding_l('utf-8'). 426sgml_encoding_l('utf8'). 427sgml_encoding_l('iso_latin_1'). 428sgml_encoding_l('ascii'). 429 430load_structure_from_stream(In, Term, M:Options) :- 431 ( select_option(dtd(DTD), Options, Options1) 432 -> ExplicitDTD = true 433 ; ExplicitDTD = false, 434 Options1 = Options 435 ), 436 move_front(Options1, dialect(_), Options2), % dialect sets defaults 437 setup_call_cleanup( 438 new_sgml_parser(Parser, 439 [ dtd(DTD) 440 ]), 441 parse(Parser, M:Options2, TermRead, In), 442 free_sgml_parser(Parser)), 443 ( ExplicitDTD == true 444 -> ( DTD = dtd(_, DocType), 445 dtd_property(DTD, doctype(DocType)) 446 -> true 447 ; true 448 ) 449 ; free_dtd(DTD) 450 ), 451 Term = TermRead. 452 453move_front(Options0, Opt, Options) :- 454 selectchk(Opt, Options0, Options1), 455 !, 456 Options = [Opt|Options1]. 457move_front(Options, _, Options). 458 459 460parse(Parser, M:Options, Document, In) :- 461 set_parser_options(Options, Parser, In, Options1), 462 parser_meta_options(Options1, M, Options2), 463 set_input_location(Parser, In), 464 sgml_parse(Parser, 465 [ document(Document), 466 source(In) 467 | Options2 468 ]). 469 470set_parser_options([], _, _, []). 471set_parser_options([H|T], Parser, In, Rest) :- 472 ( set_parser_option(H, Parser, In) 473 -> set_parser_options(T, Parser, In, Rest) 474 ; Rest = [H|R2], 475 set_parser_options(T, Parser, In, R2) 476 ). 477 478set_parser_option(Var, _Parser, _In) :- 479 var(Var), 480 !, 481 instantiation_error(Var). 482set_parser_option(Option, Parser, _) :- 483 def_entity(Option, Parser), 484 !. 485set_parser_option(offset(Offset), _Parser, In) :- 486 !, 487 seek(In, Offset, bof, _). 488set_parser_option(Option, Parser, _In) :- 489 parser_option(Option), 490 !, 491 set_sgml_parser(Parser, Option). 492set_parser_option(Name=Value, Parser, In) :- 493 Option =.. [Name,Value], 494 set_parser_option(Option, Parser, In). 495 496 497parser_option(dialect(_)). 498parser_option(shorttag(_)). 499parser_option(case_sensitive_attributes(_)). 500parser_option(case_preserving_attributes(_)). 501parser_option(system_entities(_)). 502parser_option(max_memory(_)). 503parser_option(file(_)). 504parser_option(line(_)). 505parser_option(space(_)). 506parser_option(number(_)). 507parser_option(defaults(_)). 508parser_option(doctype(_)). 509parser_option(qualify_attributes(_)). 510parser_option(encoding(_)). 511parser_option(keep_prefix(_)). 512 513 514def_entity(entity(Name, Value), Parser) :- 515 get_sgml_parser(Parser, dtd(DTD)), 516 xml_quote_attribute(Value, QValue), 517 setup_call_cleanup(open_dtd(DTD, [], Stream), 518 format(Stream, '<!ENTITY ~w "~w">~n', 519 [Name, QValue]), 520 close(Stream)). 521def_entity(xmlns(URI), Parser) :- 522 set_sgml_parser(Parser, xmlns(URI)). 523def_entity(xmlns(NS, URI), Parser) :- 524 set_sgml_parser(Parser, xmlns(NS, URI)).
530parser_meta_options([], _, []). 531parser_meta_options([call(When, Closure)|T0], M, [call(When, M:Closure)|T]) :- 532 !, 533 parser_meta_options(T0, M, T). 534parser_meta_options([H|T0], M, [H|T]) :- 535 parser_meta_options(T0, M, T).
542set_input_location(Parser, _In) :- 543 get_sgml_parser(Parser, file(_)), 544 !. 545set_input_location(Parser, In) :- 546 stream_property(In, file_name(File)), 547 !, 548 set_sgml_parser(Parser, file(File)), 549 stream_property(In, position(Pos)), 550 set_sgml_parser(Parser, position(Pos)). 551set_input_location(_, _). 552 553 /******************************* 554 * UTILITIES * 555 *******************************/
564load_sgml_file(File, Term) :-
565 load_sgml(File, Term, []).
574load_xml_file(File, Term) :-
575 load_xml(File, Term, []).
584load_html_file(File, DOM) :-
585 load_html(File, DOM, []).
dtd(html, DTD)
.html_dialect
You may also want to use the library(http/http_open) to support loading from HTTP and HTTPS URLs. For example:
:- use_module(library(http/http_open)). :- use_module(library(sgml)). load_html_url(URL, DOM) :- load_html(URL, DOM, []).
614load_html(File, Term, M:Options) :-
615 current_prolog_flag(html_dialect, Dialect),
616 dtd(Dialect, DTD),
617 merge_options(Options,
618 [ dtd(DTD),
619 dialect(Dialect),
620 max_errors(-1),
621 syntax_errors(quiet)
622 ], Options1),
623 load_structure(File, Term, M:Options1).
dialect(xml)
633load_xml(Input, DOM, M:Options) :-
634 merge_options(Options,
635 [ dialect(xml)
636 ], Options1),
637 load_structure(Input, DOM, M:Options1).
dialect(sgml)
647load_sgml(Input, DOM, M:Options) :- 648 merge_options(Options, 649 [ dialect(sgml) 650 ], Options1), 651 load_structure(Input, DOM, M:Options1). 652 653 654 655 /******************************* 656 * ENCODING * 657 *******************************/
667xml_quote_attribute(In, Quoted) :- 668 xml_quote_attribute(In, Quoted, ascii). 669 670xml_quote_cdata(In, Quoted) :- 671 xml_quote_cdata(In, Quoted, ascii).
677xml_name(In) :- 678 xml_name(In, ascii). 679 680 681 /******************************* 682 * XML CHARACTER CLASSES * 683 *******************************/
698 /******************************* 699 * TYPE CHECKING * 700 *******************************/
707xml_is_dom(0) :- !, fail. % catch variables 708xml_is_dom(List) :- 709 is_list(List), 710 !, 711 xml_is_content_list(List). 712xml_is_dom(Term) :- 713 xml_is_element(Term). 714 715xml_is_content_list([]). 716xml_is_content_list([H|T]) :- 717 xml_is_content(H), 718 xml_is_content_list(T). 719 720xml_is_content(0) :- !, fail. 721xml_is_content(pi(Pi)) :- 722 !, 723 atom(Pi). 724xml_is_content(CDATA) :- 725 atom(CDATA), 726 !. 727xml_is_content(CDATA) :- 728 string(CDATA), 729 !. 730xml_is_content(Term) :- 731 xml_is_element(Term). 732 733xml_is_element(element(Name, Attributes, Content)) :- 734 dom_name(Name), 735 dom_attributes(Attributes), 736 xml_is_content_list(Content). 737 738dom_name(NS:Local) :- 739 atom(NS), 740 atom(Local), 741 !. 742dom_name(Local) :- 743 atom(Local). 744 745dom_attributes(0) :- !, fail. 746dom_attributes([]). 747dom_attributes([H|T]) :- 748 dom_attribute(H), 749 dom_attributes(T). 750 751dom_attribute(Name=Value) :- 752 dom_name(Name), 753 atomic(Value). 754 755 756 /******************************* 757 * MESSAGES * 758 *******************************/ 759:- multifile 760 prolog:message/3. 761 762% Catch messages. sgml/4 is generated by the SGML2PL binding. 763 764prologmessage(sgml(Parser, File, Line, Message)) --> 765 { get_sgml_parser(Parser, dialect(Dialect)) 766 }, 767 [ 'SGML2PL(~w): ~w:~w: ~w'-[Dialect, File, Line, Message] ]. 768 769 770 /******************************* 771 * XREF SUPPORT * 772 *******************************/ 773 774:- multifile 775 prolog:called_by/2. 776 777prologcalled_by(sgml_parse(_, Options), Called) :- 778 findall(Meta, meta_call_term(_, Meta, Options), Called). 779 780meta_call_term(T, G+N, Options) :- 781 T = call(Event, G), 782 pmember(T, Options), 783 call_params(Event, Term), 784 functor(Term, _, N). 785 786pmember(X, List) :- % member for partial lists 787 nonvar(List), 788 List = [H|T], 789 ( X = H 790 ; pmember(X, T) 791 ). 792 793call_params(begin, begin(tag,attributes,parser)). 794call_params(end, end(tag,parser)). 795call_params(cdata, cdata(cdata,parser)). 796call_params(pi, pi(cdata,parser)). 797call_params(decl, decl(cdata,parser)). 798call_params(error, error(severity,message,parser)). 799call_params(xmlns, xmlns(namespace,url,parser)). 800call_params(urlns, urlns(url,url,parser)). 801 802 /******************************* 803 * SANDBOX * 804 *******************************/ 805 806:- multifile 807 sandbox:safe_primitive/1, 808 sandbox:safe_meta_predicate/1. 809 810sandbox:safe_meta_predicate(sgml:load_structure/3). 811sandbox:safe_primitive(sgml:dtd(Dialect, _)) :- 812 dtd_alias(Dialect, _). 813sandbox:safe_primitive(sgml:xml_quote_attribute(_,_,_)). 814sandbox:safe_primitive(sgml:xml_quote_cdata(_,_,_)). 815sandbox:safe_primitive(sgml:xml_name(_,_)). 816sandbox:safe_primitive(sgml:xml_basechar(_)). 817sandbox:safe_primitive(sgml:xml_ideographic(_)). 818sandbox:safe_primitive(sgml:xml_combining_char(_)). 819sandbox:safe_primitive(sgml:xml_digit(_)). 820sandbox:safe_primitive(sgml:xml_extender(_)). 821sandbox:safe_primitive(sgml:iri_xml_namespace(_,_,_)). 822sandbox:safe_primitive(sgml:xsd_number_string(_,_)). 823sandbox:safe_primitive(sgml:xsd_time_string(_,_,_))
SGML, XML and HTML parser
This library allows you to parse SGML, XML and HTML data into a Prolog data structure. The library defines several families of predicates:
The DOM structure can be used by library(xpath) to extract information from the document.