View source with raw comments or as raw
    1/*  Part of SWI-Prolog
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2009-2018, VU University Amsterdam
    7    All rights reserved.
    8
    9    Redistribution and use in source and binary forms, with or without
   10    modification, are permitted provided that the following conditions
   11    are met:
   12
   13    1. Redistributions of source code must retain the above copyright
   14       notice, this list of conditions and the following disclaimer.
   15
   16    2. Redistributions in binary form must reproduce the above copyright
   17       notice, this list of conditions and the following disclaimer in
   18       the documentation and/or other materials provided with the
   19       distribution.
   20
   21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   24    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   25    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   26    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   27    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   28    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   29    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   30    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   31    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   32    POSSIBILITY OF SUCH DAMAGE.
   33*/
   34
   35:- module(uri,
   36          [ uri_components/2,           % ?URI, ?Components
   37            uri_data/3,                 % ?Field, +Components, ?Data
   38            uri_data/4,                 % +Field, +Components, -Data, -New
   39
   40            uri_normalized/2,           % +URI, -NormalizedURI
   41            iri_normalized/2,           % +IRI, -NormalizedIRI
   42            uri_normalized_iri/2,       % +URI, -NormalizedIRI
   43            uri_normalized/3,           % +URI, +Base, -NormalizedURI
   44            iri_normalized/3,           % +IRI, +Base, -NormalizedIRI
   45            uri_normalized_iri/3,       % +URI, +Base, -NormalizedIRI
   46            uri_resolve/3,              % +URI, +Base, -AbsURI
   47            uri_is_global/1,            % +URI
   48            uri_query_components/2,     % ?QueryString, ?NameValueList
   49            uri_authority_components/2, % ?Authority, ?Components
   50            uri_authority_data/3,       % ?Field, ?Components, ?Data
   51                                        % Encoding
   52            uri_encoded/3,              % +Component, ?Value, ?Encoded
   53            uri_file_name/2,            % ?URI, ?Path
   54            uri_iri/2                   % ?URI, ?IRI
   55          ]).   56:- use_foreign_library(foreign(uri)).

Process URIs

This library provides high-performance C-based primitives for manipulating URIs. We decided for a C-based implementation for the much better performance on raw character manipulation. Notably, URI handling primitives are used in time-critical parts of RDF processing. This implementation is based on RFC-3986:

http://labs.apache.org/webarch/uri/rfc/rfc3986.html

The URI processing in this library is rather liberal. That is, we break URIs according to the rules, but we do not validate that the components are valid. Also, percent-decoding for IRIs is liberal. It first tries UTF-8; then ISO-Latin-1 and finally accepts %-characters verbatim.

Earlier experience has shown that strict enforcement of the URI syntax results in many errors that are accepted by many other web-document processing tools. */

 uri_components(+URI, -Components) is det
uri_components(-URI, +Components) is det
Break a URI into its 5 basic components according to the RFC-3986 regular expression:
^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
 12            3  4          5       6  7        8 9
Arguments:
Components- is a term uri_components(Scheme, Authority, Path, Search, Fragment). If a URI is parsed, i.e., using mode (+,-), components that are not found are left uninstantiated (variable). See uri_data/3 for accessing this structure.
 uri_data(?Field, +Components, ?Data) is semidet
Provide access the uri_component structure. Defined field-names are: scheme, authority, path, search and fragment
   99uri_data(scheme,    uri_components(S, _, _, _, _), S).
  100uri_data(authority, uri_components(_, A, _, _, _), A).
  101uri_data(path,      uri_components(_, _, P, _, _), P).
  102uri_data(search,    uri_components(_, _, _, S, _), S).
  103uri_data(fragment,  uri_components(_, _, _, _, F), F).
 uri_data(+Field, +Components, +Data, -NewComponents) is semidet
NewComponents is the same as Components with Field set to Data.
  109uri_data(scheme,    uri_components(_, A, P, Q, F), S,
  110                    uri_components(S, A, P, Q, F)).
  111uri_data(authority, uri_components(S, _, P, Q, F), A,
  112                    uri_components(S, A, P, Q, F)).
  113uri_data(path,      uri_components(S, A, _, Q, F), P,
  114                    uri_components(S, A, P, Q, F)).
  115uri_data(search,    uri_components(S, A, P, _, F), Q,
  116                    uri_components(S, A, P, Q, F)).
  117uri_data(fragment,  uri_components(S, A, P, Q, _), F,
  118                    uri_components(S, A, P, Q, F)).
 uri_normalized(+URI, -NormalizedURI) is det
NormalizedURI is the normalized form of URI. Normalization is syntactic and involves the following steps:
 iri_normalized(+IRI, -NormalizedIRI) is det
NormalizedIRI is the normalized form of IRI. Normalization is syntactic and involves the following steps:
See also
- This is similar to uri_normalized/2, but does not do normalization of %-escapes.
 uri_normalized_iri(+URI, -NormalizedIRI) is det
As uri_normalized/2, but percent-encoding is translated into IRI Unicode characters. The translation is liberal: valid UTF-8 sequences of %-encoded bytes are mapped to the Unicode character. Other %XX-sequences are mapped to the corresponding ISO-Latin-1 character and sole % characters are left untouched.
See also
- uri_iri/2.
 uri_is_global(+URI) is semidet
True if URI has a scheme. The semantics is the same as the code below, but the implementation is more efficient as it does not need to parse the other components, nor needs to bind the scheme. The condition to demand a scheme of more than one character is added to avoid confusion with DOS path names.
uri_is_global(URI) :-
        uri_components(URI, Components),
        uri_data(scheme, Components, Scheme),
        nonvar(Scheme),
        atom_length(Scheme, Len),
        Len > 1.
 uri_resolve(+URI, +Base, -GlobalURI) is det
Resolve a possibly local URI relative to Base. This implements http://labs.apache.org/webarch/uri/rfc/rfc3986.html#relative-transform
 uri_normalized(+URI, +Base, -NormalizedGlobalURI) is det
NormalizedGlobalURI is the normalized global version of URI. Behaves as if defined by:
uri_normalized(URI, Base, NormalizedGlobalURI) :-
        uri_resolve(URI, Base, GlobalURI),
        uri_normalized(GlobalURI, NormalizedGlobalURI).
 iri_normalized(+IRI, +Base, -NormalizedGlobalIRI) is det
NormalizedGlobalIRI is the normalized global version of IRI. This is similar to uri_normalized/3, but does not do %-escape normalization.
 uri_normalized_iri(+URI, +Base, -NormalizedGlobalIRI) is det
NormalizedGlobalIRI is the normalized global IRI of URI. Behaves as if defined by:
uri_normalized(URI, Base, NormalizedGlobalIRI) :-
        uri_resolve(URI, Base, GlobalURI),
        uri_normalized_iri(GlobalURI, NormalizedGlobalIRI).
 uri_query_components(+String, -Query) is det
uri_query_components(-String, +Query) is det
Perform encoding and decoding of an URI query string. Query is a list of fully decoded (Unicode) Name=Value pairs. In mode (-,+), query elements of the forms Name(Value) and Name-Value are also accepted to enhance interoperability with the option and pairs libraries. E.g.
?- uri_query_components(QS, [a=b, c('d+w'), n-'VU Amsterdam']).
QS = 'a=b&c=d%2Bw&n=VU%20Amsterdam'.

?- uri_query_components('a=b&c=d%2Bw&n=VU%20Amsterdam', Q).
Q = [a=b, c='d+w', n='VU Amsterdam'].
 uri_authority_components(+Authority, -Components) is det
uri_authority_components(-Authority, +Components) is det
Break-down the authority component of a URI. The fields of the structure Components can be accessed using uri_authority_data/3.
 uri_authority_data(+Field, ?Components, ?Data) is semidet
Provide access the uri_authority structure. Defined field-names are: user, password, host and port
  230uri_authority_data(user,     uri_authority(U, _, _, _), U).
  231uri_authority_data(password, uri_authority(_, P, _, _), P).
  232uri_authority_data(host,     uri_authority(_, _, H, _), H).
  233uri_authority_data(port,     uri_authority(_, _, _, P), P).
 uri_encoded(+Component, +Value, -Encoded) is det
uri_encoded(+Component, -Value, +Encoded) is det
Encoded is the URI encoding for Value. When encoding (Value->Encoded), Component specifies the URI component where the value is used. It is one of query_value, fragment, path or segment. Besides alphanumerical characters, the following characters are passed verbatim (the set is split in logical groups according to RFC3986).
query_value, fragment
"-._~" | "!$'()*,;" | "@" | "/?"
path
"-._~" | "!$&'()*,;=" | "@" | "/"
segment
"-._~" | "!$&'()*,;=" | "@"
 uri_iri(+URI, -IRI) is det
uri_iri(-URI, +IRI) is det
Convert between a URI, encoded in US-ASCII and an IRI. An IRI is a fully expanded Unicode string. Unicode strings are first encoded into UTF-8, after which %-encoding takes place.
Errors
- syntax_error(Culprit) in mode (+,-) if URI is not a legally percent-encoded UTF-8 string.
 uri_file_name(+URI, -FileName) is semidet
uri_file_name(-URI, +FileName) is det
Convert between a URI and a local file_name. This protocol is covered by RFC 1738. Please note that file-URIs use absolute paths. The mode (-, +) translates a possible relative path into an absolute one.
  272uri_file_name(URI, FileName) :-
  273    nonvar(URI),
  274    !,
  275    uri_components(URI, Components),
  276    uri_data(scheme, Components, File), File == file,
  277    (   uri_data(authority, Components, '')
  278    ->  true
  279    ;   uri_data(authority, Components, localhost)
  280    ),
  281    uri_data(path, Components, FileNameEnc),
  282    uri_encoded(path, FileName0, FileNameEnc),
  283    delete_leading_slash(FileName0, FileName).
  284uri_file_name(URI, FileName) :-
  285    nonvar(FileName),
  286    !,
  287    absolute_file_name(FileName, Path0),
  288    ensure_leading_slash(Path0, Path),
  289    uri_encoded(path, Path, PathEnc),
  290    uri_data(scheme, Components, file),
  291    uri_data(authority, Components, ''),
  292    uri_data(path, Components, PathEnc),
  293    uri_components(URI, Components).
 ensure_leading_slash(+WinPath, -Path)
 delete_leading_slash(+Path, -WinPath)
Deal with the fact that absolute paths in Windows start with a drive letter rather than a /. For URIs we need a path that starts with a /.
  302ensure_leading_slash(Path, SlashPath) :-
  303    (   sub_atom(Path, 0, _, _, /)
  304    ->  SlashPath = Path
  305    ;   atom_concat(/, Path, SlashPath)
  306    ).
  307
  308:- if(current_prolog_flag(windows, true)).  309delete_leading_slash(Path, WinPath) :-
  310    atom_concat(/, WinPath, Path),
  311    is_absolute_file_name(WinPath),
  312    !.
  313:- endif.  314delete_leading_slash(Path, Path).
  315
  316
  317                 /*******************************
  318                 *            SANDBOX           *
  319                 *******************************/
  320
  321:- multifile sandbox:safe_primitive/1.  322
  323sandbox:safe_primitive(uri:uri_components(_,_)).
  324sandbox:safe_primitive(uri:uri_normalized(_,_)).
  325sandbox:safe_primitive(uri:iri_normalized(_,_)).
  326sandbox:safe_primitive(uri:uri_normalized_iri(_,_)).
  327sandbox:safe_primitive(uri:uri_normalized(_,_,_)).
  328sandbox:safe_primitive(uri:iri_normalized(_,_,_)).
  329sandbox:safe_primitive(uri:uri_normalized_iri(_,_,_)).
  330sandbox:safe_primitive(uri:uri_resolve(_,_,_)).
  331sandbox:safe_primitive(uri:uri_is_global(_)).
  332sandbox:safe_primitive(uri:uri_query_components(_,_)).
  333sandbox:safe_primitive(uri:uri_authority_components(_,_)).
  334sandbox:safe_primitive(uri:uri_encoded(_,_,_)).
  335sandbox:safe_primitive(uri:uri_iri(_,_))