View source with formatted comments or as raw
    1/*  Part of SWI-Prolog
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2009-2019, University of Amsterdam
    7                              VU University Amsterdam
    8                              CWI, Amsterdam
    9    All rights reserved.
   10
   11    Redistribution and use in source and binary forms, with or without
   12    modification, are permitted provided that the following conditions
   13    are met:
   14
   15    1. Redistributions of source code must retain the above copyright
   16       notice, this list of conditions and the following disclaimer.
   17
   18    2. Redistributions in binary form must reproduce the above copyright
   19       notice, this list of conditions and the following disclaimer in
   20       the documentation and/or other materials provided with the
   21       distribution.
   22
   23    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   24    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   25    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   26    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   27    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   28    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   29    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   30    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   31    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   32    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   33    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   34    POSSIBILITY OF SUCH DAMAGE.
   35*/
   36
   37:- module(xpath,
   38          [ xpath/3,                    % +DOM, +Spec, -Value
   39            xpath_chk/3,                % +DOM, +Spec, -Value
   40
   41            op(400, fx, //),
   42            op(400, fx, /),
   43            op(200, fy, @)
   44          ]).   45:- use_module(library(record),[record/1, op(_,_,record)]).   46
   47:- autoload(library(debug),[assertion/1]).   48:- autoload(library(error),[instantiation_error/1,must_be/2]).   49:- autoload(library(lists),[member/2]).   50:- autoload(library(sgml),[xsd_number_string/2]).   51
   52/** <module> Select nodes in an XML DOM
   53
   54The library xpath.pl provides predicates to select nodes from an XML DOM
   55tree as produced by library(sgml) based  on descriptions inspired by the
   56XPath language.
   57
   58The   predicate   xpath/3   selects   a   sub-structure   of   the   DOM
   59non-deterministically based on an  XPath-like   specification.  Not  all
   60selectors of XPath are implemented, but the ability to mix xpath/3 calls
   61with arbitrary Prolog code  provides  a   powerful  tool  for extracting
   62information from XML parse-trees.
   63
   64@see http://www.w3.org/TR/xpath
   65*/
   66
   67:- record
   68    element(name, attributes, content).   69
   70%!  xpath_chk(+DOM, +Spec, ?Content) is semidet.
   71%
   72%   Semi-deterministic version of xpath/3.
   73
   74xpath_chk(DOM, Spec, Content) :-
   75    xpath(DOM, Spec, Content),
   76    !.
   77
   78%!  xpath(+DOM, +Spec, ?Content) is nondet.
   79%
   80%   Match an element in a DOM structure.   The syntax is inspired by
   81%   XPath, using () rather than  []   to  select  inside an element.
   82%   First we can construct paths using / and //:
   83%
   84%       $ =|//|=Term :
   85%       Select any node in the DOM matching term.
   86%       $ =|/|=Term :
   87%       Match the root against Term.
   88%       $ Term :
   89%       Select the immediate children of the root matching Term.
   90%
   91%   The Terms above are of type   _callable_.  The functor specifies
   92%   the element name. The element name   '*'  refers to any element.
   93%   The name =self= refers to the   top-element  itself and is often
   94%   used for processing matches of an  earlier xpath/3 query. A term
   95%   NS:Term refers to an XML  name   in  the  namespace NS. Optional
   96%   arguments specify additional  constraints   and  functions.  The
   97%   arguments are processed from left  to right. Defined conditional
   98%   argument values are:
   99%
  100%       $ index(?Index) :
  101%       True if the element is the Index-th child of its parent,
  102%       where 1 denotes the first child. Index can be one of:
  103%         $ `Var` :
  104%         `Var` is unified with the index of the matched element.
  105%         $ =last= :
  106%         True for the last element.
  107%         $ =last= - `IntExpr` :
  108%         True for the last-minus-nth element. For example,
  109%         `last-1` is the element directly preceding the last one.
  110%         $ `IntExpr` :
  111%         True for the element whose index equals `IntExpr`.
  112%       $ Integer :
  113%       The N-th element with the given name, with 1 denoting the
  114%       first element. Same as index(Integer).
  115%       $ =last= :
  116%       The last element with the given name. Same as
  117%       index(last).
  118%       $ =last= - IntExpr :
  119%       The IntExpr-th element before the last.
  120%       Same as index(last-IntExpr).
  121%
  122%   Defined function argument values are:
  123%
  124%       $ =self= :
  125%       Evaluate to the entire element
  126%       $ =content= :
  127%       Evaluate to the content of the element (a list)
  128%       $ =text= :
  129%       Evaluates to all text from the sub-tree as an atom
  130%       $ `text(As)` :
  131%       Evaluates to all text from the sub-tree according to
  132%       `As`, which is either `atom` or `string`.
  133%       $ =normalize_space= :
  134%       As =text=, but uses normalize_space/2 to normalise
  135%       white-space in the output
  136%       $ =number= :
  137%       Extract an integer or float from the value.  Ignores
  138%       leading and trailing white-space
  139%       $ =|@|=Attribute :
  140%       Evaluates to the value of the given attribute.  Attribute
  141%       can be a compound term. In this case the functor name
  142%       denotes the attribute and arguments perform transformations
  143%       on the attribute value.  Defined transformations are:
  144%
  145%         - number
  146%         Translate the value into a number using
  147%         xsd_number_string/2 from library(sgml).
  148%         - integer
  149%         As `number`, but subsequently transform the value
  150%         into an integer using the round/1 function.
  151%         - float
  152%         As `number`, but subsequently transform the value
  153%         into a float using the float/1 function.
  154%         - string
  155%         Translate the value into a Prolog string.
  156%         - lower
  157%         Translate the value to lower case, preserving
  158%         the type.
  159%         - upper
  160%         Translate the value to upper case, preserving
  161%         the type.
  162%
  163%   In addition, the argument-list can be _conditions_:
  164%
  165%       $ Left = Right :
  166%       Succeeds if the left-hand unifies with the right-hand.
  167%       If the left-hand side is a function, this is evaluated.
  168%       The right-hand side is _never_ evaluated, and thus the
  169%       condition `content = content` defines that the content
  170%       of the element is the atom `content`.
  171%       The functions `lower_case` and `upper_case` can be applied
  172%       to Right (see example below).
  173%       $ contains(Haystack, Needle) :
  174%       Succeeds if Needle is a sub-string of Haystack.
  175%       $ XPath :
  176%       Succeeds if XPath matches in the currently selected
  177%       sub-DOM.  For example, the following expression finds
  178%       an =h3= element inside a =div= element, where the =div=
  179%       element itself contains an =h2= child with a =strong=
  180%       child.
  181%
  182%         ==
  183%         //div(h2/strong)/h3
  184%         ==
  185%
  186%       This is equivalent to the conjunction of XPath goals below.
  187%
  188%         ==
  189%            ...,
  190%            xpath(DOM, //(div), Div),
  191%            xpath(Div, h2/strong, _),
  192%            xpath(Div, h3, Result)
  193%         ==
  194%
  195%   **Examples**:
  196%
  197%   Match each table-row in DOM:
  198%
  199%       ==
  200%       xpath(DOM, //tr, TR)
  201%       ==
  202%
  203%   Match the last cell  of  each   tablerow  in  DOM.  This example
  204%   illustrates that a result can be the input of subsequent xpath/3
  205%   queries. Using multiple queries  on   the  intermediate  TR term
  206%   guarantees that all results come from the same table-row:
  207%
  208%       ==
  209%       xpath(DOM, //tr, TR),
  210%       xpath(TR,  /td(last), TD)
  211%       ==
  212%
  213%   Match each =href= attribute in an <a> element
  214%
  215%       ==
  216%       xpath(DOM, //a(@href), HREF)
  217%       ==
  218%
  219%   Suppose we have a table containing  rows where each first column
  220%   is the name of a product with a   link to details and the second
  221%   is the price (a number).  The   following  predicate matches the
  222%   name, URL and price:
  223%
  224%       ==
  225%       product(DOM, Name, URL, Price) :-
  226%           xpath(DOM, //tr, TR),
  227%           xpath(TR, td(1), C1),
  228%           xpath(C1, /self(normalize_space), Name),
  229%           xpath(C1, a(@href), URL),
  230%           xpath(TR, td(2, number), Price).
  231%       ==
  232%
  233%   Suppose we want to select  books   with  genre="thriller" from a
  234%   tree containing elements =|<book genre=...>|=
  235%
  236%       ==
  237%       thriller(DOM, Book) :-
  238%           xpath(DOM, //book(@genre=thriller), Book).
  239%       ==
  240%
  241%   Match the elements =|<table align="center">|= _and_ =|<table
  242%   align="CENTER">|=:
  243%
  244%       ```prolog
  245%           //table(@align(lower) = center)
  246%       ```
  247%
  248%   Get the `width` and `height` of a `div` element as a number,
  249%   and the `div` node itself:
  250%
  251%       ==
  252%           xpath(DOM, //div(@width(number)=W, @height(number)=H), Div)
  253%       ==
  254%
  255%   Note that `div` is an infix operator, so parentheses must be
  256%   used in cases like the following:
  257%
  258%       ==
  259%           xpath(DOM, //(div), Div)
  260%       ==
  261
  262xpath(DOM, Spec, Content) :-
  263    in_dom(Spec, DOM, Content).
  264
  265in_dom(//Spec, DOM, Value) :-
  266    !,
  267    element_spec(Spec, Name, Modifiers),
  268    sub_dom(I, Len, Name, E, DOM),
  269    modifiers(Modifiers, I, Len, E, Value).
  270in_dom(/Spec, E, Value) :-
  271    !,
  272    element_spec(Spec, Name, Modifiers),
  273    (   Name == self
  274    ->  true
  275    ;   element_name(E, Name)
  276    ),
  277    modifiers(Modifiers, 1, 1, E, Value).
  278in_dom(A/B, DOM, Value) :-
  279    !,
  280    in_dom(A, DOM, Value0),
  281    in_dom(B, Value0, Value).
  282in_dom(A//B, DOM, Value) :-
  283    !,
  284    in_dom(A, DOM, Value0),
  285    in_dom(//B, Value0, Value).
  286in_dom(Spec, element(_, _, Content), Value) :-
  287    element_spec(Spec, Name, Modifiers),
  288    count_named_elements(Content, Name, CLen),
  289    CLen > 0,
  290    nth_element(N, Name, E, Content),
  291    modifiers(Modifiers, N, CLen, E, Value).
  292
  293element_spec(Var, _, _) :-
  294    var(Var),
  295    !,
  296    instantiation_error(Var).
  297element_spec(NS:Term, NS:Name, Modifiers) :-
  298    !,
  299    callable_name_arguments(Term, Name0, Modifiers),
  300    star(Name0, Name).
  301element_spec(Term, Name, Modifiers) :-
  302    !,
  303    callable_name_arguments(Term, Name0, Modifiers),
  304    star(Name0, Name).
  305
  306callable_name_arguments(Atom, Name, Arguments) :-
  307    atom(Atom),
  308    !,
  309    Name = Atom, Arguments = [].
  310callable_name_arguments(Compound, Name, Arguments) :-
  311    compound_name_arguments(Compound, Name, Arguments).
  312
  313
  314star(*, _) :- !.
  315star(Name, Name).
  316
  317
  318%!  sub_dom(-Index, -Count, +Name, -Sub, +DOM) is nondet.
  319%
  320%   Sub is a node in DOM with Name.
  321%
  322%   @param Count    is the total number of nodes in the content
  323%                   list Sub appears that have the same name.
  324%   @param Index    is the 1-based index of Sub of nodes with
  325%                   Name.
  326
  327sub_dom(1, 1, Name, DOM, DOM) :-
  328    element_name(DOM, Name0),
  329    \+ Name \= Name0.
  330sub_dom(N, Len, Name, E, element(_,_,Content)) :-
  331    !,
  332    sub_dom_2(N, Len, Name, E, Content).
  333sub_dom(N, Len, Name, E, Content) :-
  334    is_list(Content),
  335    sub_dom_2(N, Len, Name, E, Content).
  336
  337sub_dom_2(N, Len, Name, Element, Content) :-
  338    (   count_named_elements(Content, Name, Len),
  339        nth_element(N, Name, Element, Content)
  340    ;   member(element(_,_,C2), Content),
  341        sub_dom_2(N, Len, Name, Element, C2)
  342    ).
  343
  344
  345%!  count_named_elements(+Content, +Name, -Count) is det.
  346%
  347%   Count is the number of nodes with Name in Content.
  348
  349count_named_elements(Content, Name, Count) :-
  350    count_named_elements(Content, Name, 0, Count).
  351
  352count_named_elements([], _, Count, Count).
  353count_named_elements([element(Name,_,_)|T], Name0, C0, C) :-
  354    \+ Name \= Name0,
  355    !,
  356    C1 is C0+1,
  357    count_named_elements(T, Name0, C1, C).
  358count_named_elements([_|T], Name, C0, C) :-
  359    count_named_elements(T, Name, C0, C).
  360
  361
  362%!  nth_element(?N, +Name, -Element, +Content:list) is nondet.
  363%
  364%   True if Element is the N-th element with name in Content.
  365
  366nth_element(N, Name, Element, Content) :-
  367    nth_element_(1, N, Name, Element, Content).
  368
  369nth_element_(I, N, Name, E, [H|T]) :-
  370    element_name(H, Name0),
  371    \+ Name \= Name0,
  372    !,
  373    (   N = I,
  374        E = H
  375    ;   I2 is I + 1,
  376        (   nonvar(N), I2 > N
  377        ->  !, fail
  378        ;   true
  379        ),
  380        nth_element_(I2, N, Name, E, T)
  381    ).
  382nth_element_(I, N, Name, E, [_|T]) :-
  383    nth_element_(I, N, Name, E, T).
  384
  385
  386%!  modifiers(+Modifiers, +I, +Clen, +DOM, -Value)
  387%
  388%
  389
  390modifiers([], _, _, Value, Value).
  391modifiers([H|T], I, L, Value0, Value) :-
  392    modifier(H, I, L, Value0, Value1),
  393    modifiers(T, I, L, Value1, Value).
  394
  395modifier(M, _, _, _, _) :-
  396    var(M),
  397    !,
  398    instantiation_error(M).
  399modifier(Index, I, L, Value0, Value) :-
  400    implicit_index_modifier(Index),
  401    !,
  402    Value = Value0,
  403    index_modifier(Index, I, L).
  404modifier(index(Index), I, L, Value, Value) :-
  405    !,
  406    index_modifier(Index, I, L).
  407modifier(Function, _, _, In, Out) :-
  408    xpath_function(Function),
  409    !,
  410    xpath_function(Function, In, Out).
  411modifier(Function, _, _, In, Out) :-
  412    xpath_condition(Function, In),
  413    Out = In.
  414
  415implicit_index_modifier(I) :-
  416    integer(I),
  417    !.
  418implicit_index_modifier(last).
  419implicit_index_modifier(last-_Expr).
  420
  421index_modifier(Var, I, _L) :-
  422    var(Var),
  423    !,
  424    Var = I.
  425index_modifier(last, I, L) :-
  426    !,
  427    I =:= L.
  428index_modifier(last-Expr, I, L) :-
  429    !,
  430    I =:= L-Expr.
  431index_modifier(N, I, _) :-
  432    N =:= I.
  433
  434xpath_function(self, DOM, Value) :-                            % self
  435    !,
  436    Value = DOM.
  437xpath_function(content, Element, Value) :-                     % content
  438    !,
  439    element_content(Element, Value).
  440xpath_function(text, DOM, Text) :-                             % text
  441    !,
  442    text_of_dom(DOM, atom, Text).
  443xpath_function(text(As), DOM, Text) :-                         % text(As)
  444    !,
  445    text_of_dom(DOM, As, Text).
  446xpath_function(normalize_space, DOM, Text) :-                  % normalize_space
  447    !,
  448    text_of_dom(DOM, string, Text0),
  449    normalize_space(atom(Text), Text0).
  450xpath_function(number, DOM, Number) :-                         % number
  451    !,
  452    text_of_dom(DOM, string, Text0),
  453    normalize_space(string(Text), Text0),
  454    catch(xsd_number_string(Number, Text), _, fail).
  455xpath_function(@Name, element(_, Attrs, _), Value) :-          % @Name
  456    !,
  457    (   atom(Name)
  458    ->  memberchk(Name=Value, Attrs)
  459    ;   compound(Name)
  460    ->  compound_name_arguments(Name, AName, AOps),
  461        memberchk(AName=Value0, Attrs),
  462        translate_attribute(AOps, Value0, Value)
  463    ;   member(Name=Value, Attrs)
  464    ).
  465xpath_function(quote(Value), _, Value).                         % quote(Value)
  466
  467xpath_function(self).
  468xpath_function(content).
  469xpath_function(text).
  470xpath_function(text(_)).
  471xpath_function(normalize_space).
  472xpath_function(number).
  473xpath_function(@_).
  474xpath_function(quote(_)).
  475
  476translate_attribute([], Value, Value).
  477translate_attribute([H|T], Value0, Value) :-
  478    translate_attr(H, Value0, Value1),
  479    translate_attribute(T, Value1, Value).
  480
  481translate_attr(number, Value0, Value) :-
  482    xsd_number_string(Value, Value0).
  483translate_attr(integer, Value0, Value) :-
  484    xsd_number_string(Value1, Value0),
  485    Value is round(Value1).
  486translate_attr(float, Value0, Value) :-
  487    xsd_number_string(Value1, Value0),
  488    Value is float(Value1).
  489translate_attr(string, Value0, Value) :-
  490    atom_string(Value0, Value).
  491translate_attr(lower, Value0, Value) :-
  492    (   atom(Value0)
  493    ->  downcase_atom(Value0, Value)
  494    ;   string_lower(Value0, Value)
  495    ).
  496translate_attr(upper, Value0, Value) :-
  497    (   atom(Value0)
  498    ->  upcase_atom(Value0, Value)
  499    ;   string_upper(Value0, Value)
  500    ).
  501
  502xpath_condition(Left = Right, Value) :-                        % =
  503    !,
  504    var_or_function(Left, Value, LeftValue),
  505    process_equality(LeftValue, Right).
  506xpath_condition(contains(Haystack, Needle), Value) :-          % contains(Haystack, Needle)
  507    !,
  508    val_or_function(Haystack, Value, HaystackValue),
  509    val_or_function(Needle, Value, NeedleValue),
  510    atom(HaystackValue), atom(NeedleValue),
  511    (   sub_atom(HaystackValue, _, _, _, NeedleValue)
  512    ->  true
  513    ).
  514xpath_condition(Spec, Dom) :-
  515    in_dom(Spec, Dom, _).
  516
  517
  518%!  process_equality(+Left, +Right) is semidet.
  519%
  520%   Provides (very) partial support for XSLT   functions that can be
  521%   applied according to the XPath 2 specification.
  522%
  523%   For example the XPath expression  in   [1],  and  the equivalent
  524%   Prolog expression in [2], would both   match the HTML element in
  525%   [3].
  526%
  527%     ==
  528%     [1] //table[align=lower-case(center)]
  529%     [2] //table(@align=lower_case(center))
  530%     [3] <table align="CENTER">
  531%     ==
  532
  533process_equality(Left, Right) :-
  534    var(Right),
  535    !,
  536    Left = Right.
  537process_equality(Left, lower_case(Right)) :-
  538    !,
  539    downcase_atom(Left, Right).
  540process_equality(Left, upper_case(Right)) :-
  541    !,
  542    upcase_atom(Left, Right).
  543process_equality(Left, Right) :-
  544    Left = Right,
  545    !.
  546process_equality(Left, Right) :-
  547    atom(Left),
  548    atomic(Right),
  549    \+ atom(Left),
  550    atom_string(Left, Right).
  551
  552var_or_function(Arg, _, Arg) :-
  553    var(Arg),
  554    !.
  555var_or_function(Func, Value0, Value) :-
  556    xpath_function(Func),
  557    !,
  558    xpath_function(Func, Value0, Value).
  559var_or_function(Value, _, Value).
  560
  561val_or_function(Arg, _, Arg) :-
  562    var(Arg),
  563    !,
  564    instantiation_error(Arg).
  565val_or_function(Func, Value0, Value) :-                         % TBD
  566    xpath_function(Func, Value0, Value),
  567    !.
  568val_or_function(Value, _, Value).
  569
  570
  571%!  text_of_dom(+DOM, +As, -Text:atom) is det.
  572%
  573%   Text is the joined textual content of DOM.
  574
  575text_of_dom(DOM, As, Text) :-
  576    phrase(text_of(DOM), Tokens),
  577    (   As == atom
  578    ->  atomic_list_concat(Tokens, Text)
  579    ;   As == string
  580    ->  atomics_to_string(Tokens, Text)
  581    ;   must_be(oneof([atom,string]), As)
  582    ).
  583
  584text_of(element(_,_,Content)) -->
  585    text_of_list(Content).
  586text_of([]) -->
  587    [].
  588text_of([H|T]) -->
  589    text_of(H),
  590    text_of(T).
  591
  592
  593text_of_list([]) -->
  594    [].
  595text_of_list([H|T]) -->
  596    text_of_1(H),
  597    text_of_list(T).
  598
  599
  600text_of_1(element(_,_,Content)) -->
  601    !,
  602    text_of_list(Content).
  603text_of_1(Data) -->
  604    { assertion(atom_or_string(Data)) },
  605    [Data].
  606
  607atom_or_string(Data) :-
  608    (   atom(Data)
  609    ->  true
  610    ;   string(Data)
  611    )