36
37:- module(sgml,
38 [ load_html/3, 39 load_xml/3, 40 load_sgml/3, 41
42 load_sgml_file/2, 43 load_xml_file/2, 44 load_html_file/2, 45
46 load_structure/3, 47
48 load_dtd/2, 49 load_dtd/3, 50 dtd/2, 51 dtd_property/2, 52
53 new_dtd/2, 54 free_dtd/1, 55 open_dtd/3, 56
57 new_sgml_parser/2, 58 free_sgml_parser/1, 59 set_sgml_parser/2, 60 get_sgml_parser/2, 61 sgml_parse/2, 62
63 sgml_register_catalog_file/2, 64
65 xml_quote_attribute/3, 66 xml_quote_cdata/3, 67 xml_quote_attribute/2, 68 xml_quote_cdata/2, 69 xml_name/1, 70 xml_name/2, 71
72 xsd_number_string/2, 73 xsd_time_string/3, 74
75 xml_basechar/1, 76 xml_ideographic/1, 77 xml_combining_char/1, 78 xml_digit/1, 79 xml_extender/1, 80
81 iri_xml_namespace/2, 82 iri_xml_namespace/3, 83 xml_is_dom/1 84 ]). 85:- autoload(library(error),[instantiation_error/1]). 86:- autoload(library(iostream),[open_any/5,close_any/1]). 87:- autoload(library(lists),[member/2,selectchk/3]). 88:- autoload(library(option),[select_option/3,merge_options/3]). 89
90:- meta_predicate
91 load_structure(+, -, :),
92 load_html(+, -, :),
93 load_xml(+, -, :),
94 load_sgml(+, -, :). 95
96:- predicate_options(load_structure/3, 3,
97 [ charpos(integer),
98 cdata(oneof([atom,string])),
99 defaults(boolean),
100 dialect(oneof([html,html4,html5,sgml,xhtml,xhtml5,xml,xmlns])),
101 doctype(atom),
102 dtd(any),
103 encoding(oneof(['iso-8859-1', 'utf-8', 'us-ascii'])),
104 entity(atom,atom),
105 keep_prefix(boolean),
106 file(atom),
107 line(integer),
108 offset(integer),
109 number(oneof([token,integer])),
110 qualify_attributes(boolean),
111 shorttag(boolean),
112 case_sensitive_attributes(boolean),
113 case_preserving_attributes(boolean),
114 system_entities(boolean),
115 max_memory(integer),
116 space(oneof([sgml,preserve,default,remove,strict])),
117 xmlns(atom),
118 xmlns(atom,atom),
119 pass_to(sgml_parse/2, 2)
120 ]). 121:- predicate_options(load_html/3, 3,
122 [ pass_to(load_structure/3, 3)
123 ]). 124:- predicate_options(load_xml/3, 3,
125 [ pass_to(load_structure/3, 3)
126 ]). 127:- predicate_options(load_sgml/3, 3,
128 [ pass_to(load_structure/3, 3)
129 ]). 130:- predicate_options(load_dtd/3, 3,
131 [ dialect(oneof([sgml,xml,xmlns])),
132 pass_to(open/4, 4)
133 ]). 134:- predicate_options(sgml_parse/2, 2,
135 [ call(oneof([begin,end,cdata,pi,decl,error,xmlns,urlns]),
136 callable),
137 cdata(oneof([atom,string])),
138 content_length(integer),
139 document(-any),
140 max_errors(integer),
141 parse(oneof([file,element,content,declaration,input])),
142 source(any),
143 syntax_errors(oneof([quiet,print,style])),
144 xml_no_ns(oneof([error,quiet]))
145 ]). 146:- predicate_options(new_sgml_parser/2, 2,
147 [ dtd(any)
148 ]). 149
150
177
178:- multifile user:file_search_path/2. 179:- dynamic user:file_search_path/2. 180
181user:file_search_path(dtd, '.').
182user:file_search_path(dtd, swi('library/DTD')).
183
184sgml_register_catalog_file(File, Location) :-
185 prolog_to_os_filename(File, OsFile),
186 '_sgml_register_catalog_file'(OsFile, Location).
187
188:- use_foreign_library(foreign(sgml2pl)). 189
190register_catalog(Base) :-
191 absolute_file_name(dtd(Base),
192 [ extensions([soc]),
193 access(read),
194 file_errors(fail)
195 ],
196 SocFile),
197 sgml_register_catalog_file(SocFile, end).
198
199:- initialization
200 ignore(register_catalog('HTML4')). 201
202
203 206
213
214:- thread_local
215 current_dtd/2. 216:- volatile
217 current_dtd/2. 218:- thread_local
219 registered_cleanup/0. 220:- volatile
221 registered_cleanup/0. 222
223:- multifile
224 dtd_alias/2. 225
226:- create_prolog_flag(html_dialect, html5, [type(atom)]). 227
228dtd_alias(html4, 'HTML4').
229dtd_alias(html5, 'HTML5').
230dtd_alias(html, DTD) :-
231 current_prolog_flag(html_dialect, Dialect),
232 dtd_alias(Dialect, DTD).
233
243
244dtd(Type, DTD) :-
245 current_dtd(Type, DTD),
246 !.
247dtd(Type, DTD) :-
248 new_dtd(Type, DTD),
249 ( dtd_alias(Type, Base)
250 -> true
251 ; Base = Type
252 ),
253 absolute_file_name(dtd(Base),
254 [ extensions([dtd]),
255 access(read)
256 ], DtdFile),
257 load_dtd(DTD, DtdFile),
258 register_cleanup,
259 asserta(current_dtd(Type, DTD)).
260
273
274load_dtd(DTD, DtdFile) :-
275 load_dtd(DTD, DtdFile, []).
276load_dtd(DTD, DtdFile, Options) :-
277 sgml_open_options(sgml:Options, OpenOptions, sgml:DTDOptions),
278 setup_call_cleanup(
279 open_dtd(DTD, DTDOptions, DtdOut),
280 setup_call_cleanup(
281 open(DtdFile, read, DtdIn, OpenOptions),
282 copy_stream_data(DtdIn, DtdOut),
283 close(DtdIn)),
284 close(DtdOut)).
285
286split_dtd_options([], [], []).
287split_dtd_options([H|T], [H|TD], S) :-
288 dtd_option(H),
289 !,
290 split_dtd_options(T, TD, S).
291split_dtd_options([H|T], TD, [H|S]) :-
292 split_dtd_options(T, TD, S).
293
294dtd_option(dialect(_)).
295
296
301
302destroy_dtds :-
303 ( current_dtd(_Type, DTD),
304 free_dtd(DTD),
305 fail
306 ; true
307 ).
308
312
313register_cleanup :-
314 registered_cleanup,
315 !.
316register_cleanup :-
317 ( current_prolog_flag(threads, true)
318 -> prolog_listen(this_thread_exit, destroy_dtds)
319 ; true
320 ),
321 assert(registered_cleanup).
322
323
324 327
328prop(doctype(_), _).
329prop(elements(_), _).
330prop(entities(_), _).
331prop(notations(_), _).
332prop(entity(E, _), DTD) :-
333 ( nonvar(E)
334 -> true
335 ; '$dtd_property'(DTD, entities(EL)),
336 member(E, EL)
337 ).
338prop(element(E, _, _), DTD) :-
339 ( nonvar(E)
340 -> true
341 ; '$dtd_property'(DTD, elements(EL)),
342 member(E, EL)
343 ).
344prop(attributes(E, _), DTD) :-
345 ( nonvar(E)
346 -> true
347 ; '$dtd_property'(DTD, elements(EL)),
348 member(E, EL)
349 ).
350prop(attribute(E, A, _, _), DTD) :-
351 ( nonvar(E)
352 -> true
353 ; '$dtd_property'(DTD, elements(EL)),
354 member(E, EL)
355 ),
356 ( nonvar(A)
357 -> true
358 ; '$dtd_property'(DTD, attributes(E, AL)),
359 member(A, AL)
360 ).
361prop(notation(N, _), DTD) :-
362 ( nonvar(N)
363 -> true
364 ; '$dtd_property'(DTD, notations(NL)),
365 member(N, NL)
366 ).
367
368dtd_property(DTD, Prop) :-
369 prop(Prop, DTD),
370 '$dtd_property'(DTD, Prop).
371
372
373 376
398
399load_structure(Spec, DOM, Options) :-
400 sgml_open_options(Options, OpenOptions, SGMLOptions),
401 setup_call_cleanup(
402 open_any(Spec, read, In, Close, OpenOptions),
403 load_structure_from_stream(In, DOM, SGMLOptions),
404 close_any(Close)).
405
406sgml_open_options(Options, OpenOptions, SGMLOptions) :-
407 Options = M:Plain,
408 ( select_option(encoding(Encoding), Plain, NoEnc)
409 -> ( sgml_encoding(Encoding)
410 -> merge_options(NoEnc, [type(binary)], OpenOptions),
411 SGMLOptions = Options
412 ; OpenOptions = Plain,
413 SGMLOptions = M:NoEnc
414 )
415 ; merge_options(Plain, [type(binary)], OpenOptions),
416 SGMLOptions = Options
417 ).
418
419sgml_encoding(Enc) :-
420 downcase_atom(Enc, Enc1),
421 sgml_encoding_l(Enc1).
422
423sgml_encoding_l('iso-8859-1').
424sgml_encoding_l('us-ascii').
425sgml_encoding_l('utf-8').
426sgml_encoding_l('utf8').
427sgml_encoding_l('iso_latin_1').
428sgml_encoding_l('ascii').
429
430load_structure_from_stream(In, Term, M:Options) :-
431 ( select_option(dtd(DTD), Options, Options1)
432 -> ExplicitDTD = true
433 ; ExplicitDTD = false,
434 Options1 = Options
435 ),
436 move_front(Options1, dialect(_), Options2), 437 setup_call_cleanup(
438 new_sgml_parser(Parser,
439 [ dtd(DTD)
440 ]),
441 parse(Parser, M:Options2, TermRead, In),
442 free_sgml_parser(Parser)),
443 ( ExplicitDTD == true
444 -> ( DTD = dtd(_, DocType),
445 dtd_property(DTD, doctype(DocType))
446 -> true
447 ; true
448 )
449 ; free_dtd(DTD)
450 ),
451 Term = TermRead.
452
453move_front(Options0, Opt, Options) :-
454 selectchk(Opt, Options0, Options1),
455 !,
456 Options = [Opt|Options1].
457move_front(Options, _, Options).
458
459
460parse(Parser, M:Options, Document, In) :-
461 set_parser_options(Options, Parser, In, Options1),
462 parser_meta_options(Options1, M, Options2),
463 set_input_location(Parser, In),
464 sgml_parse(Parser,
465 [ document(Document),
466 source(In)
467 | Options2
468 ]).
469
470set_parser_options([], _, _, []).
471set_parser_options([H|T], Parser, In, Rest) :-
472 ( set_parser_option(H, Parser, In)
473 -> set_parser_options(T, Parser, In, Rest)
474 ; Rest = [H|R2],
475 set_parser_options(T, Parser, In, R2)
476 ).
477
478set_parser_option(Var, _Parser, _In) :-
479 var(Var),
480 !,
481 instantiation_error(Var).
482set_parser_option(Option, Parser, _) :-
483 def_entity(Option, Parser),
484 !.
485set_parser_option(offset(Offset), _Parser, In) :-
486 !,
487 seek(In, Offset, bof, _).
488set_parser_option(Option, Parser, _In) :-
489 parser_option(Option),
490 !,
491 set_sgml_parser(Parser, Option).
492set_parser_option(Name=Value, Parser, In) :-
493 Option =.. [Name,Value],
494 set_parser_option(Option, Parser, In).
495
496
497parser_option(dialect(_)).
498parser_option(shorttag(_)).
499parser_option(case_sensitive_attributes(_)).
500parser_option(case_preserving_attributes(_)).
501parser_option(system_entities(_)).
502parser_option(max_memory(_)).
503parser_option(file(_)).
504parser_option(line(_)).
505parser_option(space(_)).
506parser_option(number(_)).
507parser_option(defaults(_)).
508parser_option(doctype(_)).
509parser_option(qualify_attributes(_)).
510parser_option(encoding(_)).
511parser_option(keep_prefix(_)).
512
513
514def_entity(entity(Name, Value), Parser) :-
515 get_sgml_parser(Parser, dtd(DTD)),
516 xml_quote_attribute(Value, QValue),
517 setup_call_cleanup(open_dtd(DTD, [], Stream),
518 format(Stream, '<!ENTITY ~w "~w">~n',
519 [Name, QValue]),
520 close(Stream)).
521def_entity(xmlns(URI), Parser) :-
522 set_sgml_parser(Parser, xmlns(URI)).
523def_entity(xmlns(NS, URI), Parser) :-
524 set_sgml_parser(Parser, xmlns(NS, URI)).
525
529
530parser_meta_options([], _, []).
531parser_meta_options([call(When, Closure)|T0], M, [call(When, M:Closure)|T]) :-
532 !,
533 parser_meta_options(T0, M, T).
534parser_meta_options([H|T0], M, [H|T]) :-
535 parser_meta_options(T0, M, T).
536
537
541
542set_input_location(Parser, _In) :-
543 get_sgml_parser(Parser, file(_)),
544 !.
545set_input_location(Parser, In) :-
546 stream_property(In, file_name(File)),
547 !,
548 set_sgml_parser(Parser, file(File)),
549 stream_property(In, position(Pos)),
550 set_sgml_parser(Parser, position(Pos)).
551set_input_location(_, _).
552
553 556
563
564load_sgml_file(File, Term) :-
565 load_sgml(File, Term, []).
566
573
574load_xml_file(File, Term) :-
575 load_xml(File, Term, []).
576
583
584load_html_file(File, DOM) :-
585 load_html(File, DOM, []).
586
613
614load_html(File, Term, M:Options) :-
615 current_prolog_flag(html_dialect, Dialect),
616 dtd(Dialect, DTD),
617 merge_options(Options,
618 [ dtd(DTD),
619 dialect(Dialect),
620 max_errors(-1),
621 syntax_errors(quiet)
622 ], Options1),
623 load_structure(File, Term, M:Options1).
624
632
633load_xml(Input, DOM, M:Options) :-
634 merge_options(Options,
635 [ dialect(xml)
636 ], Options1),
637 load_structure(Input, DOM, M:Options1).
638
646
647load_sgml(Input, DOM, M:Options) :-
648 merge_options(Options,
649 [ dialect(sgml)
650 ], Options1),
651 load_structure(Input, DOM, M:Options1).
652
653
654
655 658
666
667xml_quote_attribute(In, Quoted) :-
668 xml_quote_attribute(In, Quoted, ascii).
669
670xml_quote_cdata(In, Quoted) :-
671 xml_quote_cdata(In, Quoted, ascii).
672
676
677xml_name(In) :-
678 xml_name(In, ascii).
679
680
681 684
696
697
698 701
706
707xml_is_dom(0) :- !, fail. 708xml_is_dom(List) :-
709 is_list(List),
710 !,
711 xml_is_content_list(List).
712xml_is_dom(Term) :-
713 xml_is_element(Term).
714
715xml_is_content_list([]).
716xml_is_content_list([H|T]) :-
717 xml_is_content(H),
718 xml_is_content_list(T).
719
720xml_is_content(0) :- !, fail.
721xml_is_content(pi(Pi)) :-
722 !,
723 atom(Pi).
724xml_is_content(CDATA) :-
725 atom(CDATA),
726 !.
727xml_is_content(CDATA) :-
728 string(CDATA),
729 !.
730xml_is_content(Term) :-
731 xml_is_element(Term).
732
733xml_is_element(element(Name, Attributes, Content)) :-
734 dom_name(Name),
735 dom_attributes(Attributes),
736 xml_is_content_list(Content).
737
738dom_name(NS:Local) :-
739 atom(NS),
740 atom(Local),
741 !.
742dom_name(Local) :-
743 atom(Local).
744
745dom_attributes(0) :- !, fail.
746dom_attributes([]).
747dom_attributes([H|T]) :-
748 dom_attribute(H),
749 dom_attributes(T).
750
751dom_attribute(Name=Value) :-
752 dom_name(Name),
753 atomic(Value).
754
755
756 759:- multifile
760 prolog:message/3. 761
763
764prolog:message(sgml(Parser, File, Line, Message)) -->
765 { get_sgml_parser(Parser, dialect(Dialect))
766 },
767 [ 'SGML2PL(~w): ~w:~w: ~w'-[Dialect, File, Line, Message] ].
768
769
770 773
774:- multifile
775 prolog:called_by/2. 776
777prolog:called_by(sgml_parse(_, Options), Called) :-
778 findall(Meta, meta_call_term(_, Meta, Options), Called).
779
780meta_call_term(T, G+N, Options) :-
781 T = call(Event, G),
782 pmember(T, Options),
783 call_params(Event, Term),
784 functor(Term, _, N).
785
786pmember(X, List) :- 787 nonvar(List),
788 List = [H|T],
789 ( X = H
790 ; pmember(X, T)
791 ).
792
793call_params(begin, begin(tag,attributes,parser)).
794call_params(end, end(tag,parser)).
795call_params(cdata, cdata(cdata,parser)).
796call_params(pi, pi(cdata,parser)).
797call_params(decl, decl(cdata,parser)).
798call_params(error, error(severity,message,parser)).
799call_params(xmlns, xmlns(namespace,url,parser)).
800call_params(urlns, urlns(url,url,parser)).
801
802 805
806:- multifile
807 sandbox:safe_primitive/1,
808 sandbox:safe_meta_predicate/1. 809
810sandbox:safe_meta_predicate(sgml:load_structure/3).
811sandbox:safe_primitive(sgml:dtd(Dialect, _)) :-
812 dtd_alias(Dialect, _).
813sandbox:safe_primitive(sgml:xml_quote_attribute(_,_,_)).
814sandbox:safe_primitive(sgml:xml_quote_cdata(_,_,_)).
815sandbox:safe_primitive(sgml:xml_name(_,_)).
816sandbox:safe_primitive(sgml:xml_basechar(_)).
817sandbox:safe_primitive(sgml:xml_ideographic(_)).
818sandbox:safe_primitive(sgml:xml_combining_char(_)).
819sandbox:safe_primitive(sgml:xml_digit(_)).
820sandbox:safe_primitive(sgml:xml_extender(_)).
821sandbox:safe_primitive(sgml:iri_xml_namespace(_,_,_)).
822sandbox:safe_primitive(sgml:xsd_number_string(_,_)).
823sandbox:safe_primitive(sgml:xsd_time_string(_,_,_))