36
37:- module(sgml,
38 [ load_html/3, 39 load_xml/3, 40 load_sgml/3, 41
42 load_sgml_file/2, 43 load_xml_file/2, 44 load_html_file/2, 45
46 load_structure/3, 47
48 load_dtd/2, 49 load_dtd/3, 50 dtd/2, 51 dtd_property/2, 52
53 new_dtd/2, 54 free_dtd/1, 55 open_dtd/3, 56
57 new_sgml_parser/2, 58 free_sgml_parser/1, 59 set_sgml_parser/2, 60 get_sgml_parser/2, 61 sgml_parse/2, 62
63 sgml_register_catalog_file/2, 64
65 xml_quote_attribute/3, 66 xml_quote_cdata/3, 67 xml_quote_attribute/2, 68 xml_quote_cdata/2, 69 xml_name/1, 70 xml_name/2, 71
72 xsd_number_string/2, 73 xsd_time_string/3, 74
75 xml_basechar/1, 76 xml_ideographic/1, 77 xml_combining_char/1, 78 xml_digit/1, 79 xml_extender/1, 80
81 iri_xml_namespace/2, 82 iri_xml_namespace/3, 83 xml_is_dom/1 84 ]). 85:- autoload(library(error),[instantiation_error/1]). 86:- autoload(library(iostream),[open_any/5,close_any/1]). 87:- autoload(library(lists),[member/2,selectchk/3]). 88:- autoload(library(option),[select_option/3,merge_options/3]). 89
90:- meta_predicate
91 load_structure(+, -, :),
92 load_html(+, -, :),
93 load_xml(+, -, :),
94 load_sgml(+, -, :). 95
96:- predicate_options(load_structure/3, 3,
97 [ charpos(integer),
98 cdata(oneof([atom,string])),
99 defaults(boolean),
100 dialect(oneof([html,html4,html5,sgml,xhtml,xhtml5,xml,xmlns])),
101 doctype(atom),
102 dtd(any),
103 encoding(oneof(['iso-8859-1', 'utf-8', 'us-ascii'])),
104 entity(atom,atom),
105 keep_prefix(boolean),
106 file(atom),
107 line(integer),
108 offset(integer),
109 number(oneof([token,integer])),
110 qualify_attributes(boolean),
111 shorttag(boolean),
112 case_sensitive_attributes(boolean),
113 case_preserving_attributes(boolean),
114 system_entities(boolean),
115 max_memory(integer),
116 ignore_doctype(boolean),
117 space(oneof([sgml,preserve,default,remove,strict])),
118 xmlns(atom),
119 xmlns(atom,atom),
120 pass_to(sgml_parse/2, 2)
121 ]). 122:- predicate_options(load_html/3, 3,
123 [ pass_to(load_structure/3, 3)
124 ]). 125:- predicate_options(load_xml/3, 3,
126 [ pass_to(load_structure/3, 3)
127 ]). 128:- predicate_options(load_sgml/3, 3,
129 [ pass_to(load_structure/3, 3)
130 ]). 131:- predicate_options(load_dtd/3, 3,
132 [ dialect(oneof([sgml,xml,xmlns])),
133 pass_to(open/4, 4)
134 ]). 135:- predicate_options(sgml_parse/2, 2,
136 [ call(oneof([begin,end,cdata,pi,decl,error,xmlns,urlns]),
137 callable),
138 cdata(oneof([atom,string])),
139 content_length(integer),
140 document(-any),
141 max_errors(integer),
142 parse(oneof([file,element,content,declaration,input])),
143 source(any),
144 syntax_errors(oneof([quiet,print,style])),
145 xml_no_ns(oneof([error,quiet]))
146 ]). 147:- predicate_options(new_sgml_parser/2, 2,
148 [ dtd(any)
149 ]). 150
151
178
179:- multifile user:file_search_path/2. 180:- dynamic user:file_search_path/2. 181
182user:file_search_path(dtd, '.').
183user:file_search_path(dtd, swi('library/DTD')).
184
185sgml_register_catalog_file(File, Location) :-
186 prolog_to_os_filename(File, OsFile),
187 '_sgml_register_catalog_file'(OsFile, Location).
188
189:- use_foreign_library(foreign(sgml2pl)). 190
191register_catalog(Base) :-
192 absolute_file_name(dtd(Base),
193 [ extensions([soc]),
194 access(read),
195 file_errors(fail)
196 ],
197 SocFile),
198 sgml_register_catalog_file(SocFile, end).
199
200:- initialization
201 ignore(register_catalog('HTML4')). 202
203
204 207
214
215:- thread_local
216 current_dtd/2. 217:- volatile
218 current_dtd/2. 219:- thread_local
220 registered_cleanup/0. 221:- volatile
222 registered_cleanup/0. 223
224:- multifile
225 dtd_alias/2. 226
227:- create_prolog_flag(html_dialect, html5, [type(atom)]). 228
229dtd_alias(html4, 'HTML4').
230dtd_alias(html5, 'HTML5').
231dtd_alias(html, DTD) :-
232 current_prolog_flag(html_dialect, Dialect),
233 dtd_alias(Dialect, DTD).
234
244
245dtd(Type, DTD) :-
246 current_dtd(Type, DTD),
247 !.
248dtd(Type, DTD) :-
249 new_dtd(Type, DTD),
250 ( dtd_alias(Type, Base)
251 -> true
252 ; Base = Type
253 ),
254 absolute_file_name(dtd(Base),
255 [ extensions([dtd]),
256 access(read)
257 ], DtdFile),
258 load_dtd(DTD, DtdFile),
259 register_cleanup,
260 asserta(current_dtd(Type, DTD)).
261
274
275load_dtd(DTD, DtdFile) :-
276 load_dtd(DTD, DtdFile, []).
277load_dtd(DTD, DtdFile, Options) :-
278 sgml_open_options(sgml:Options, OpenOptions, sgml:DTDOptions),
279 setup_call_cleanup(
280 open_dtd(DTD, DTDOptions, DtdOut),
281 setup_call_cleanup(
282 open(DtdFile, read, DtdIn, OpenOptions),
283 copy_stream_data(DtdIn, DtdOut),
284 close(DtdIn)),
285 close(DtdOut)).
286
291
292destroy_dtds :-
293 ( current_dtd(_Type, DTD),
294 free_dtd(DTD),
295 fail
296 ; true
297 ).
298
302
303register_cleanup :-
304 registered_cleanup,
305 !.
306register_cleanup :-
307 ( thread_self(main)
308 -> at_halt(destroy_dtds)
309 ; current_prolog_flag(threads, true)
310 -> prolog_listen(this_thread_exit, destroy_dtds)
311 ; true
312 ),
313 assert(registered_cleanup).
314
315
316 319
320prop(doctype(_), _).
321prop(elements(_), _).
322prop(entities(_), _).
323prop(notations(_), _).
324prop(entity(E, _), DTD) :-
325 ( nonvar(E)
326 -> true
327 ; '$dtd_property'(DTD, entities(EL)),
328 member(E, EL)
329 ).
330prop(element(E, _, _), DTD) :-
331 ( nonvar(E)
332 -> true
333 ; '$dtd_property'(DTD, elements(EL)),
334 member(E, EL)
335 ).
336prop(attributes(E, _), DTD) :-
337 ( nonvar(E)
338 -> true
339 ; '$dtd_property'(DTD, elements(EL)),
340 member(E, EL)
341 ).
342prop(attribute(E, A, _, _), DTD) :-
343 ( nonvar(E)
344 -> true
345 ; '$dtd_property'(DTD, elements(EL)),
346 member(E, EL)
347 ),
348 ( nonvar(A)
349 -> true
350 ; '$dtd_property'(DTD, attributes(E, AL)),
351 member(A, AL)
352 ).
353prop(notation(N, _), DTD) :-
354 ( nonvar(N)
355 -> true
356 ; '$dtd_property'(DTD, notations(NL)),
357 member(N, NL)
358 ).
359
360dtd_property(DTD, Prop) :-
361 prop(Prop, DTD),
362 '$dtd_property'(DTD, Prop).
363
364
365 368
390
391load_structure(Spec, DOM, Options) :-
392 sgml_open_options(Options, OpenOptions, SGMLOptions),
393 setup_call_cleanup(
394 open_any(Spec, read, In, Close, OpenOptions),
395 load_structure_from_stream(In, DOM, SGMLOptions),
396 close_any(Close)).
397
398sgml_open_options(Options, OpenOptions, SGMLOptions) :-
399 Options = M:Plain,
400 ( select_option(encoding(Encoding), Plain, NoEnc)
401 -> ( sgml_encoding(Encoding)
402 -> merge_options(NoEnc, [type(binary)], OpenOptions),
403 SGMLOptions = Options
404 ; OpenOptions = Plain,
405 SGMLOptions = M:NoEnc
406 )
407 ; merge_options(Plain, [type(binary)], OpenOptions),
408 SGMLOptions = Options
409 ).
410
411sgml_encoding(Enc) :-
412 downcase_atom(Enc, Enc1),
413 sgml_encoding_l(Enc1).
414
415sgml_encoding_l('iso-8859-1').
416sgml_encoding_l('us-ascii').
417sgml_encoding_l('utf-8').
418sgml_encoding_l('utf8').
419sgml_encoding_l('iso_latin_1').
420sgml_encoding_l('ascii').
421
422load_structure_from_stream(In, Term, M:Options) :-
423 ( select_option(dtd(DTD), Options, Options1)
424 -> ExplicitDTD = true
425 ; ExplicitDTD = false,
426 Options1 = Options
427 ),
428 move_front(Options1, dialect(_), Options2), 429 setup_call_cleanup(
430 new_sgml_parser(Parser,
431 [ dtd(DTD)
432 ]),
433 parse(Parser, M:Options2, TermRead, In),
434 free_sgml_parser(Parser)),
435 ( ExplicitDTD == true
436 -> ( DTD = dtd(_, DocType),
437 dtd_property(DTD, doctype(DocType))
438 -> true
439 ; true
440 )
441 ; free_dtd(DTD)
442 ),
443 Term = TermRead.
444
445move_front(Options0, Opt, Options) :-
446 selectchk(Opt, Options0, Options1),
447 !,
448 Options = [Opt|Options1].
449move_front(Options, _, Options).
450
451
452parse(Parser, M:Options, Document, In) :-
453 set_parser_options(Options, Parser, In, Options1),
454 parser_meta_options(Options1, M, Options2),
455 set_input_location(Parser, In),
456 sgml_parse(Parser,
457 [ document(Document),
458 source(In)
459 | Options2
460 ]).
461
462set_parser_options([], _, _, []).
463set_parser_options([H|T], Parser, In, Rest) :-
464 ( set_parser_option(H, Parser, In)
465 -> set_parser_options(T, Parser, In, Rest)
466 ; Rest = [H|R2],
467 set_parser_options(T, Parser, In, R2)
468 ).
469
470set_parser_option(Var, _Parser, _In) :-
471 var(Var),
472 !,
473 instantiation_error(Var).
474set_parser_option(Option, Parser, _) :-
475 def_entity(Option, Parser),
476 !.
477set_parser_option(offset(Offset), _Parser, In) :-
478 !,
479 seek(In, Offset, bof, _).
480set_parser_option(Option, Parser, _In) :-
481 parser_option(Option),
482 !,
483 set_sgml_parser(Parser, Option).
484set_parser_option(Name=Value, Parser, In) :-
485 Option =.. [Name,Value],
486 set_parser_option(Option, Parser, In).
487
488
489parser_option(dialect(_)).
490parser_option(shorttag(_)).
491parser_option(case_sensitive_attributes(_)).
492parser_option(case_preserving_attributes(_)).
493parser_option(system_entities(_)).
494parser_option(max_memory(_)).
495parser_option(ignore_doctype(_)).
496parser_option(file(_)).
497parser_option(line(_)).
498parser_option(space(_)).
499parser_option(number(_)).
500parser_option(defaults(_)).
501parser_option(doctype(_)).
502parser_option(qualify_attributes(_)).
503parser_option(encoding(_)).
504parser_option(keep_prefix(_)).
505
506
507def_entity(entity(Name, Value), Parser) :-
508 get_sgml_parser(Parser, dtd(DTD)),
509 xml_quote_attribute(Value, QValue),
510 setup_call_cleanup(open_dtd(DTD, [], Stream),
511 format(Stream, '<!ENTITY ~w "~w">~n',
512 [Name, QValue]),
513 close(Stream)).
514def_entity(xmlns(URI), Parser) :-
515 set_sgml_parser(Parser, xmlns(URI)).
516def_entity(xmlns(NS, URI), Parser) :-
517 set_sgml_parser(Parser, xmlns(NS, URI)).
518
522
523parser_meta_options([], _, []).
524parser_meta_options([call(When, Closure)|T0], M, [call(When, M:Closure)|T]) :-
525 !,
526 parser_meta_options(T0, M, T).
527parser_meta_options([H|T0], M, [H|T]) :-
528 parser_meta_options(T0, M, T).
529
530
534
535set_input_location(Parser, _In) :-
536 get_sgml_parser(Parser, file(_)),
537 !.
538set_input_location(Parser, In) :-
539 stream_property(In, file_name(File)),
540 !,
541 set_sgml_parser(Parser, file(File)),
542 stream_property(In, position(Pos)),
543 set_sgml_parser(Parser, position(Pos)).
544set_input_location(_, _).
545
546 549
556
557load_sgml_file(File, Term) :-
558 load_sgml(File, Term, []).
559
566
567load_xml_file(File, Term) :-
568 load_xml(File, Term, []).
569
576
577load_html_file(File, DOM) :-
578 load_html(File, DOM, []).
579
606
607load_html(File, Term, M:Options) :-
608 current_prolog_flag(html_dialect, Dialect),
609 dtd(Dialect, DTD),
610 merge_options(Options,
611 [ dtd(DTD),
612 dialect(Dialect),
613 max_errors(-1),
614 syntax_errors(quiet)
615 ], Options1),
616 load_structure(File, Term, M:Options1).
617
625
626load_xml(Input, DOM, M:Options) :-
627 merge_options(Options,
628 [ dialect(xml)
629 ], Options1),
630 load_structure(Input, DOM, M:Options1).
631
639
640load_sgml(Input, DOM, M:Options) :-
641 merge_options(Options,
642 [ dialect(sgml)
643 ], Options1),
644 load_structure(Input, DOM, M:Options1).
645
646
647
648 651
659
660xml_quote_attribute(In, Quoted) :-
661 xml_quote_attribute(In, Quoted, ascii).
662
663xml_quote_cdata(In, Quoted) :-
664 xml_quote_cdata(In, Quoted, ascii).
665
669
670xml_name(In) :-
671 xml_name(In, ascii).
672
673
674 677
689
690
691 694
699
700xml_is_dom(0) :- !, fail. 701xml_is_dom(List) :-
702 is_list(List),
703 !,
704 xml_is_content_list(List).
705xml_is_dom(Term) :-
706 xml_is_element(Term).
707
708xml_is_content_list([]).
709xml_is_content_list([H|T]) :-
710 xml_is_content(H),
711 xml_is_content_list(T).
712
713xml_is_content(0) :- !, fail.
714xml_is_content(pi(Pi)) :-
715 !,
716 atom(Pi).
717xml_is_content(CDATA) :-
718 atom(CDATA),
719 !.
720xml_is_content(CDATA) :-
721 string(CDATA),
722 !.
723xml_is_content(Term) :-
724 xml_is_element(Term).
725
726xml_is_element(element(Name, Attributes, Content)) :-
727 dom_name(Name),
728 dom_attributes(Attributes),
729 xml_is_content_list(Content).
730
731dom_name(NS:Local) :-
732 atom(NS),
733 atom(Local),
734 !.
735dom_name(Local) :-
736 atom(Local).
737
738dom_attributes(0) :- !, fail.
739dom_attributes([]).
740dom_attributes([H|T]) :-
741 dom_attribute(H),
742 dom_attributes(T).
743
744dom_attribute(Name=Value) :-
745 dom_name(Name),
746 atomic(Value).
747
748
749 752:- multifile
753 prolog:message/3. 754
756
757prolog:message(sgml(Parser, File, Line, Message)) -->
758 { get_sgml_parser(Parser, dialect(Dialect))
759 },
760 [ 'SGML2PL(~w): ~w:~w: ~w'-[Dialect, File, Line, Message] ].
761
762
763 766
767:- multifile
768 prolog:called_by/2. 769
770prolog:called_by(sgml_parse(_, Options), Called) :-
771 findall(Meta, meta_call_term(_, Meta, Options), Called).
772
773meta_call_term(T, G+N, Options) :-
774 T = call(Event, G),
775 pmember(T, Options),
776 call_params(Event, Term),
777 functor(Term, _, N).
778
779pmember(X, List) :- 780 nonvar(List),
781 List = [H|T],
782 ( X = H
783 ; pmember(X, T)
784 ).
785
786call_params(begin, begin(tag,attributes,parser)).
787call_params(end, end(tag,parser)).
788call_params(cdata, cdata(cdata,parser)).
789call_params(pi, pi(cdata,parser)).
790call_params(decl, decl(cdata,parser)).
791call_params(error, error(severity,message,parser)).
792call_params(xmlns, xmlns(namespace,url,parser)).
793call_params(urlns, urlns(url,url,parser)).
794
795 798
799:- multifile
800 sandbox:safe_primitive/1,
801 sandbox:safe_meta_predicate/1. 802
803sandbox:safe_meta_predicate(sgml:load_structure/3).
804sandbox:safe_primitive(sgml:dtd(Dialect, _)) :-
805 dtd_alias(Dialect, _).
806sandbox:safe_primitive(sgml:xml_quote_attribute(_,_,_)).
807sandbox:safe_primitive(sgml:xml_quote_cdata(_,_,_)).
808sandbox:safe_primitive(sgml:xml_name(_,_)).
809sandbox:safe_primitive(sgml:xml_basechar(_)).
810sandbox:safe_primitive(sgml:xml_ideographic(_)).
811sandbox:safe_primitive(sgml:xml_combining_char(_)).
812sandbox:safe_primitive(sgml:xml_digit(_)).
813sandbox:safe_primitive(sgml:xml_extender(_)).
814sandbox:safe_primitive(sgml:iri_xml_namespace(_,_,_)).
815sandbox:safe_primitive(sgml:xsd_number_string(_,_)).
816sandbox:safe_primitive(sgml:xsd_time_string(_,_,_))