View source with formatted comments or as raw
    1/*  Part of SWI-Prolog
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2009-2018, VU University Amsterdam
    7    All rights reserved.
    8
    9    Redistribution and use in source and binary forms, with or without
   10    modification, are permitted provided that the following conditions
   11    are met:
   12
   13    1. Redistributions of source code must retain the above copyright
   14       notice, this list of conditions and the following disclaimer.
   15
   16    2. Redistributions in binary form must reproduce the above copyright
   17       notice, this list of conditions and the following disclaimer in
   18       the documentation and/or other materials provided with the
   19       distribution.
   20
   21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   24    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   25    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   26    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   27    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   28    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   29    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   30    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   31    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   32    POSSIBILITY OF SUCH DAMAGE.
   33*/
   34
   35:- module(uri,
   36          [ uri_components/2,           % ?URI, ?Components
   37            uri_data/3,                 % ?Field, +Components, ?Data
   38            uri_data/4,                 % +Field, +Components, -Data, -New
   39
   40            uri_normalized/2,           % +URI, -NormalizedURI
   41            iri_normalized/2,           % +IRI, -NormalizedIRI
   42            uri_normalized_iri/2,       % +URI, -NormalizedIRI
   43            uri_normalized/3,           % +URI, +Base, -NormalizedURI
   44            iri_normalized/3,           % +IRI, +Base, -NormalizedIRI
   45            uri_normalized_iri/3,       % +URI, +Base, -NormalizedIRI
   46            uri_resolve/3,              % +URI, +Base, -AbsURI
   47            uri_is_global/1,            % +URI
   48            uri_query_components/2,     % ?QueryString, ?NameValueList
   49            uri_authority_components/2, % ?Authority, ?Components
   50            uri_authority_data/3,       % ?Field, ?Components, ?Data
   51                                        % Encoding
   52            uri_encoded/3,              % +Component, ?Value, ?Encoded
   53            uri_file_name/2,            % ?URI, ?Path
   54            uri_iri/2                   % ?URI, ?IRI
   55          ]).   56:- use_foreign_library(foreign(uri)).   57
   58/** <module> Process URIs
   59
   60This  library  provides   high-performance    C-based   primitives   for
   61manipulating URIs. We decided for a  C-based implementation for the much
   62better performance on raw character  manipulation. Notably, URI handling
   63primitives are used in  time-critical  parts   of  RDF  processing. This
   64implementation is based on RFC-3986:
   65
   66        http://labs.apache.org/webarch/uri/rfc/rfc3986.html
   67
   68The URI processing in this library is  rather liberal. That is, we break
   69URIs according to the rules, but we  do not validate that the components
   70are valid. Also, percent-decoding for IRIs   is  liberal. It first tries
   71UTF-8; then ISO-Latin-1 and finally accepts %-characters verbatim.
   72
   73Earlier experience has shown that strict   enforcement of the URI syntax
   74results in many errors that  are   accepted  by  many other web-document
   75processing tools.
   76*/
   77
   78%!  uri_components(+URI, -Components) is det.
   79%!  uri_components(-URI, +Components) is det.
   80%
   81%   Break a URI  into  its  5   basic  components  according  to the
   82%   RFC-3986 regular expression:
   83%
   84%       ==
   85%       ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
   86%        12            3  4          5       6  7        8 9
   87%       ==
   88%
   89%   @param Components is a   term  uri_components(Scheme, Authority,
   90%   Path, Search, Fragment). If a URI  is *parsed*, i.e., using mode
   91%   (+,-), components that are not   found are left _uninstantiated_
   92%   (variable). See uri_data/3 for accessing this structure.
   93
   94%!  uri_data(?Field, +Components, ?Data) is semidet.
   95%
   96%   Provide access the uri_component structure.  Defined field-names
   97%   are: =scheme=, =authority=, =path=, =search= and =fragment=
   98
   99uri_data(scheme,    uri_components(S, _, _, _, _), S).
  100uri_data(authority, uri_components(_, A, _, _, _), A).
  101uri_data(path,      uri_components(_, _, P, _, _), P).
  102uri_data(search,    uri_components(_, _, _, S, _), S).
  103uri_data(fragment,  uri_components(_, _, _, _, F), F).
  104
  105%!  uri_data(+Field, +Components, +Data, -NewComponents) is semidet.
  106%
  107%   NewComponents is the same as Components with Field set to Data.
  108
  109uri_data(scheme,    uri_components(_, A, P, Q, F), S,
  110                    uri_components(S, A, P, Q, F)).
  111uri_data(authority, uri_components(S, _, P, Q, F), A,
  112                    uri_components(S, A, P, Q, F)).
  113uri_data(path,      uri_components(S, A, _, Q, F), P,
  114                    uri_components(S, A, P, Q, F)).
  115uri_data(search,    uri_components(S, A, P, _, F), Q,
  116                    uri_components(S, A, P, Q, F)).
  117uri_data(fragment,  uri_components(S, A, P, Q, _), F,
  118                    uri_components(S, A, P, Q, F)).
  119
  120%!  uri_normalized(+URI, -NormalizedURI) is det.
  121%
  122%   NormalizedURI is the normalized form   of  URI. Normalization is
  123%   syntactic and involves the following steps:
  124%
  125%       * 6.2.2.1. Case Normalization
  126%       * 6.2.2.2. Percent-Encoding Normalization
  127%       * 6.2.2.3. Path Segment Normalization
  128
  129%!  iri_normalized(+IRI, -NormalizedIRI) is det.
  130%
  131%   NormalizedIRI is the normalized form   of  IRI. Normalization is
  132%   syntactic and involves the following steps:
  133%
  134%       * 6.2.2.1. Case Normalization
  135%       * 6.2.2.3. Path Segment Normalization
  136%
  137%   @see    This is similar to uri_normalized/2, but does not do
  138%           normalization of %-escapes.
  139
  140%!  uri_normalized_iri(+URI, -NormalizedIRI) is det.
  141%
  142%   As uri_normalized/2, but percent-encoding is translated into IRI
  143%   Unicode characters. The translation  is   liberal:  valid  UTF-8
  144%   sequences  of  %-encoded  bytes  are    mapped  to  the  Unicode
  145%   character. Other %XX-sequences are mapped   to the corresponding
  146%   ISO-Latin-1 character and sole % characters are left untouched.
  147%
  148%   @see uri_iri/2.
  149
  150
  151%!  uri_is_global(+URI) is semidet.
  152%
  153%   True if URI has a scheme. The  semantics   is  the  same as the code
  154%   below, but the implementation is more efficient  as it does not need
  155%   to parse the other components, nor  needs   to  bind the scheme. The
  156%   condition to demand a scheme of more  than one character is added to
  157%   avoid confusion with DOS path names.
  158%
  159%   ==
  160%   uri_is_global(URI) :-
  161%           uri_components(URI, Components),
  162%           uri_data(scheme, Components, Scheme),
  163%           nonvar(Scheme),
  164%           atom_length(Scheme, Len),
  165%           Len > 1.
  166%   ==
  167
  168%!  uri_resolve(+URI, +Base, -GlobalURI) is det.
  169%
  170%   Resolve a possibly local URI relative   to Base. This implements
  171%   http://labs.apache.org/webarch/uri/rfc/rfc3986.html#relative-transform
  172
  173%!  uri_normalized(+URI, +Base, -NormalizedGlobalURI) is det.
  174%
  175%   NormalizedGlobalURI is the normalized global version of URI.
  176%   Behaves as if defined by:
  177%
  178%   ==
  179%   uri_normalized(URI, Base, NormalizedGlobalURI) :-
  180%           uri_resolve(URI, Base, GlobalURI),
  181%           uri_normalized(GlobalURI, NormalizedGlobalURI).
  182%   ==
  183
  184%!  iri_normalized(+IRI, +Base, -NormalizedGlobalIRI) is det.
  185%
  186%   NormalizedGlobalIRI is the normalized  global   version  of IRI.
  187%   This is similar to uri_normalized/3, but   does  not do %-escape
  188%   normalization.
  189
  190%!  uri_normalized_iri(+URI, +Base, -NormalizedGlobalIRI) is det.
  191%
  192%   NormalizedGlobalIRI is the normalized global IRI of URI. Behaves
  193%   as if defined by:
  194%
  195%   ==
  196%   uri_normalized(URI, Base, NormalizedGlobalIRI) :-
  197%           uri_resolve(URI, Base, GlobalURI),
  198%           uri_normalized_iri(GlobalURI, NormalizedGlobalIRI).
  199%   ==
  200
  201%!  uri_query_components(+String, -Query) is det.
  202%!  uri_query_components(-String, +Query) is det.
  203%
  204%   Perform encoding and decoding of an URI query string. Query is a
  205%   list of fully decoded (Unicode) Name=Value pairs. In mode (-,+),
  206%   query elements of the forms Name(Value)  and Name-Value are also
  207%   accepted to enhance interoperability with   the option and pairs
  208%   libraries.  E.g.
  209%
  210%   ==
  211%   ?- uri_query_components(QS, [a=b, c('d+w'), n-'VU Amsterdam']).
  212%   QS = 'a=b&c=d%2Bw&n=VU%20Amsterdam'.
  213%
  214%   ?- uri_query_components('a=b&c=d%2Bw&n=VU%20Amsterdam', Q).
  215%   Q = [a=b, c='d+w', n='VU Amsterdam'].
  216%   ==
  217
  218
  219%!  uri_authority_components(+Authority, -Components) is det.
  220%!  uri_authority_components(-Authority, +Components) is det.
  221%
  222%   Break-down the authority component of a   URI. The fields of the
  223%   structure Components can be accessed using uri_authority_data/3.
  224
  225%!  uri_authority_data(+Field, ?Components, ?Data) is semidet.
  226%
  227%   Provide access the uri_authority  structure. Defined field-names
  228%   are: =user=, =password=, =host= and =port=
  229
  230uri_authority_data(user,     uri_authority(U, _, _, _), U).
  231uri_authority_data(password, uri_authority(_, P, _, _), P).
  232uri_authority_data(host,     uri_authority(_, _, H, _), H).
  233uri_authority_data(port,     uri_authority(_, _, _, P), P).
  234
  235
  236%!  uri_encoded(+Component, +Value, -Encoded) is det.
  237%!  uri_encoded(+Component, -Value, +Encoded) is det.
  238%
  239%   Encoded   is   the   URI   encoding   for   Value.   When   encoding
  240%   (Value->Encoded), Component specifies the URI   component  where the
  241%   value is used. It is  one   of  =query_value=, =fragment=, =path= or
  242%   =segment=.  Besides  alphanumerical   characters,    the   following
  243%   characters are passed verbatim (the set   is split in logical groups
  244%   according to RFC3986).
  245%
  246%       $ query_value, fragment :
  247%       "-._~" | "!$'()*,;" | "@" | "/?"
  248%       $ path :
  249%       "-._~" | "!$&'()*,;=" | "@" | "/"
  250%       $ segment :
  251%       "-._~" | "!$&'()*,;=" | "@"
  252
  253%!  uri_iri(+URI, -IRI) is det.
  254%!  uri_iri(-URI, +IRI) is det.
  255%
  256%   Convert between a URI, encoded in US-ASCII and an IRI. An IRI is
  257%   a fully expanded  Unicode  string.   Unicode  strings  are first
  258%   encoded into UTF-8, after which %-encoding takes place.
  259%
  260%   @error syntax_error(Culprit) in mode (+,-) if URI is not a
  261%   legally percent-encoded UTF-8 string.
  262
  263
  264%!  uri_file_name(+URI, -FileName) is semidet.
  265%!  uri_file_name(-URI, +FileName) is det.
  266%
  267%   Convert between a URI and a   local  file_name. This protocol is
  268%   covered by RFC 1738. Please note   that file-URIs use _absolute_
  269%   paths. The mode (-, +) translates  a possible relative path into
  270%   an absolute one.
  271
  272uri_file_name(URI, FileName) :-
  273    nonvar(URI),
  274    !,
  275    uri_components(URI, Components),
  276    uri_data(scheme, Components, File), File == file,
  277    (   uri_data(authority, Components, '')
  278    ->  true
  279    ;   uri_data(authority, Components, localhost)
  280    ),
  281    uri_data(path, Components, FileNameEnc),
  282    uri_encoded(path, FileName0, FileNameEnc),
  283    delete_leading_slash(FileName0, FileName).
  284uri_file_name(URI, FileName) :-
  285    nonvar(FileName),
  286    !,
  287    absolute_file_name(FileName, Path0),
  288    ensure_leading_slash(Path0, Path),
  289    uri_encoded(path, Path, PathEnc),
  290    uri_data(scheme, Components, file),
  291    uri_data(authority, Components, ''),
  292    uri_data(path, Components, PathEnc),
  293    uri_components(URI, Components).
  294
  295%!  ensure_leading_slash(+WinPath, -Path).
  296%!  delete_leading_slash(+Path, -WinPath).
  297%
  298%   Deal with the fact that absolute paths   in Windows start with a
  299%   drive letter rather than a  /.  For   URIs  we  need a path that
  300%   starts with a /.
  301
  302ensure_leading_slash(Path, SlashPath) :-
  303    (   sub_atom(Path, 0, _, _, /)
  304    ->  SlashPath = Path
  305    ;   atom_concat(/, Path, SlashPath)
  306    ).
  307
  308:- if(current_prolog_flag(windows, true)).  309delete_leading_slash(Path, WinPath) :-
  310    atom_concat(/, WinPath, Path),
  311    is_absolute_file_name(WinPath),
  312    !.
  313:- endif.  314delete_leading_slash(Path, Path).
  315
  316
  317                 /*******************************
  318                 *            SANDBOX           *
  319                 *******************************/
  320
  321:- multifile sandbox:safe_primitive/1.  322
  323sandbox:safe_primitive(uri:uri_components(_,_)).
  324sandbox:safe_primitive(uri:uri_normalized(_,_)).
  325sandbox:safe_primitive(uri:iri_normalized(_,_)).
  326sandbox:safe_primitive(uri:uri_normalized_iri(_,_)).
  327sandbox:safe_primitive(uri:uri_normalized(_,_,_)).
  328sandbox:safe_primitive(uri:iri_normalized(_,_,_)).
  329sandbox:safe_primitive(uri:uri_normalized_iri(_,_,_)).
  330sandbox:safe_primitive(uri:uri_resolve(_,_,_)).
  331sandbox:safe_primitive(uri:uri_is_global(_)).
  332sandbox:safe_primitive(uri:uri_query_components(_,_)).
  333sandbox:safe_primitive(uri:uri_authority_components(_,_)).
  334sandbox:safe_primitive(uri:uri_encoded(_,_,_)).
  335sandbox:safe_primitive(uri:uri_iri(_,_))