1/* Part of SWI-Prolog 2 3 Author: Jan Wielemaker 4 E-mail: J.Wielemaker@vu.nl 5 WWW: http://www.swi-prolog.org 6 Copyright (c) 2017, VU University Amsterdam 7 All rights reserved. 8 9 Redistribution and use in source and binary forms, with or without 10 modification, are permitted provided that the following conditions 11 are met: 12 13 1. Redistributions of source code must retain the above copyright 14 notice, this list of conditions and the following disclaimer. 15 16 2. Redistributions in binary form must reproduce the above copyright 17 notice, this list of conditions and the following disclaimer in 18 the documentation and/or other materials provided with the 19 distribution. 20 21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 29 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 31 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 32 POSSIBILITY OF SUCH DAMAGE. 33*/ 34 35:- module(pcre, 36 [ re_match/2, % +Regex, +String 37 re_match/3, % +Regex, +String, +Options 38 re_matchsub/4, % +Regex, +String, -Subs, +Options 39 re_foldl/6, % :Goal, +Regex, +String, ?V0, ?V, +Options 40 re_split/3, % +Pattern, +String, -Split:list 41 re_split/4, % +Pattern, +String, -Split:list, +Options 42 re_replace/4, % +Pattern, +With, +String, -NewString 43 44 re_compile/3, % +Pattern, -Regex, +Options 45 re_flush/0, 46 re_config/1 % ?Config 47 ]). 48:- autoload(library(apply),[maplist/3]). 49:- autoload(library(error),[must_be/2,existence_error/2]). 50:- autoload(library(dcg/basics),[string/3,eos/2,digit/3,digits/3]). 51 52:- use_foreign_library(foreign(pcre4pl)). 53 54:- meta_predicate 55 re_foldl( , , , , , ).
74:- predicate_options(re_match/3, 3, 75 [ anchored(boolean), 76 bol(boolean), 77 bsr(oneof([anycrlf,unicode])), 78 empty(boolean), 79 empty_atstart(boolean), 80 eol(boolean), 81 newline(oneof([any,anycrlf,cr,lf,crlf])), 82 start(integer) 83 ]). 84:- predicate_options(re_compile/3, 3, 85 [ anchored(boolean), 86 bsr(oneof([anycrlf,unicode])), 87 caseless(boolean), 88 dollar_endonly(boolean), 89 dotall(boolean), 90 dupnames(boolean), 91 extended(boolean), 92 extra(boolean), 93 firstline(boolean), 94 compat(oneof([javascript])), 95 multiline(boolean), 96 newline(oneof([any,anycrlf,cr,lf,crlf])), 97 ucp(boolean), 98 ungreedy(boolean) 99 ]).
?- re_match("^needle"/i, "Needle in a haystack"). true.
Options:
true
, match only at the first positionfalse
)anycrlf
, \R only matches CR, LF or CRLF. If unicode
,
\R matches all Unicode line endings.
Subject string is the end of a line (default false
)true
)true
)false
)any
, recognize any Unicode newline sequence,
if anycrlf
, recognize CR, LF, and CRLF as newline
sequences, if cr
, recognize CR, if lf
, recognize
LF and finally if crlf
recognize CRLF as newline.149re_match(Regex, String) :- 150 re_match(Regex, String, []). 151re_match(Regex, String, Options) :- 152 re_compiled(Regex, Compiled), 153 re_match_(Compiled, String, Options).
capture_type(Type)
option passed to re_compile/3, may be specified
using flags if Regex is of the form Pattern/Flags and may be
specified at the level of individual captures using a naming
convention for the caption name. See re_compile/3 for details.
The example below exploits the typed groups to parse a date specification:
?- re_matchsub("(?<date> (?<year_I>(?:\\d\\d)?\\d\\d) - (?<month_I>\\d\\d) - (?<day_I>\\d\\d) )"/e, "2017-04-20", Sub, []). Sub = re_match{0:"2017-04-20", date:"2017-04-20", day:20, month:4, year:2017}.
182re_matchsub(Regex, String, Subs, Options) :-
183 re_compiled(Regex, Compiled),
184 re_matchsub_(Compiled, String, Pairs, Options),
185 dict_pairs(Subs, re_match, Pairs).
call(Goal, Dict1, V0, V1), call(Goal, Dict2, V1, V2), ... call(Goal, Dictn, Vn, V).
This predicate is used to implement re_split/4 and re_replace/4. For example, we can count all matches of a Regex on String using this code:
re_match_count(Regex, String, Count) :- re_foldl(increment, Regex, String, 0, Count, []). increment(_Match, V0, V1) :- V1 is V0+1.
After which we can query
?- re_match_count("a", "aap", X). X = 2.
219re_foldl(Goal, Regex, String, V0, V, Options) :- 220 re_compiled(Regex, Compiled), 221 re_foldl_(Compiled, String, Goal, V0, V, Options). 222 223:- public re_call_folder/4. 224 225re_call_folder(Goal, Pairs, V0, V1) :- 226 dict_pairs(Dict, re_match, Pairs), 227 call(Goal, Dict, V0, V1).
?- re_split("a+", "abaac", Split, []). Split = ["","a","b","aa","c"]. ?- re_split(":\\s*"/n, "Age: 33", Split, []). Split = ['Age', ': ', 33].
253re_split(Pattern, String, Split) :- 254 re_split(Pattern, String, Split, []). 255re_split(Pattern, String, Split, Options) :- 256 range_regex(Pattern, Compiled, Type), 257 State = state(String, 0, Type), 258 re_foldl(split(State), Compiled, String, Split, [Last], Options), 259 arg(2, State, LastSkipStart), 260 typed_sub(Type, String, LastSkipStart, _, 0, Last). 261 262range_regex(Pattern/Flags, Compiled, Type) :- !, 263 atom_chars(Flags, Chars), 264 replace_flags(Chars, Chars1, Type), 265 atom_chars(RFlags, [r|Chars1]), 266 re_compiled(Pattern/RFlags, Compiled). 267range_regex(Pattern, Compiled, string) :- 268 re_compiled(Pattern/r, Compiled). 269 270replace_flags([], [], Type) :- 271 default(Type, string). 272replace_flags([H|T0], T, Type) :- 273 split_type(H, Type), 274 !, 275 replace_flags(T0, T, Type). 276replace_flags([H|T0], [H|T], Type) :- 277 replace_flags(T0, T, Type). 278 279split_type(a, atom). 280split_type(s, string). 281split_type(n, name). 282 283split(State, Dict, [Skipped,Sep|T], T) :- 284 matched(State, Dict.0, Sep), 285 skipped(State, Dict.0, Skipped). 286 287matched(state(String, _, Type), Start-Len, Matched) :- 288 typed_sub(Type, String, Start, Len, _, Matched). 289 290skipped(State, Start-Len, Skipped) :- 291 State = state(String, Here, Type), 292 SkipLen is Start-Here, 293 typed_sub(Type, String, Here, SkipLen, _, Skipped), 294 NextSkipStart is Start+Len, 295 nb_setarg(2, State, NextSkipStart). 296 297typed_sub(string, Haystack, B, L, A, String) :- 298 sub_string(Haystack, B, L, A, String). 299typed_sub(atom, Haystack, B, L, A, String) :- 300 sub_atom(Haystack, B, L, A, String). 301typed_sub(name, Haystack, B, L, A, Value) :- 302 sub_string(Haystack, B, L, A, String), 303 ( number_string(Number, String) 304 -> Value = Number 305 ; atom_string(Value, String) 306 ).
319re_replace(Pattern, With, String, NewString) :- 320 range_regex(Pattern, Compiled, All, Type), 321 compile_replacement(With, RCompiled), 322 State = state(String, 0, Type), 323 ( All == all 324 -> re_foldl(replace(State, RCompiled), Compiled, String, Parts, [Last], []) 325 ; ( re_matchsub(Compiled, String, Match, []) 326 -> replace(State, RCompiled, Match, Parts, [Last]) 327 ; Repl = false 328 ) 329 ), 330 ( Repl == false 331 -> parts_to_output(Type, [String], NewString) 332 ; arg(2, State, LastSkipStart), 333 sub_string(String, LastSkipStart, _, 0, Last), 334 parts_to_output(Type, Parts, NewString) 335 ). 336 337range_regex(Pattern/Flags, Compiled, All, Type) :- !, 338 atom_chars(Flags, Chars), 339 replace_flags(Chars, Chars1, All, Type), 340 atom_chars(RFlags, [r|Chars1]), 341 re_compiled(Pattern/RFlags, Compiled). 342range_regex(Pattern, Compiled, first, string) :- 343 re_compiled(Pattern/r, Compiled). 344 345replace_flags([], [], All, Type) :- 346 default(All, first), 347 default(Type, string). 348replace_flags([H|T0], T, All, Type) :- 349 ( all(H, All) 350 -> true 351 ; type(H, Type) 352 ), 353 !, 354 replace_flags(T0, T, All, Type). 355replace_flags([H|T0], [H|T], All, Type) :- 356 replace_flags(T0, T, All, Type). 357 358all(g, all). 359type(a, atom). 360type(s, string). 361 362default(Val, Val) :- !. 363default(_, _). 364 365replace(State, With, Dict, [Skipped|Parts], T) :- 366 State = state(String, _, _Type), 367 copy_term(With, r(PartsR, Skel)), 368 Skel :< Dict, 369 range_strings(PartsR, String, Parts, T), 370 skipped(State, Dict.0, Skipped). 371 372range_strings([], _, T, T). 373range_strings([Start-Len|T0], String, [S|T1], T) :- 374 !, 375 sub_string(String, Start, Len, _, S), 376 range_strings(T0, String, T1, T). 377range_strings([S|T0], String, [S|T1], T) :- 378 range_strings(T0, String, T1, T). 379 380parts_to_output(string, Parts, String) :- 381 atomics_to_string(Parts, String). 382parts_to_output(atom, Parts, String) :- 383 atomic_list_concat(Parts, String).
391:- dynamic replacement_cache/2. 392:- volatile replacement_cache/2. 393 394compile_replacement(With, Compiled) :- 395 replacement_cache(With, Compiled), 396 !. 397compile_replacement(With, Compiled) :- 398 compile_replacement_nocache(With, Compiled), 399 assertz(replacement_cache(With, Compiled)). 400 401compile_replacement_nocache(With, r(Parts, Extract)) :- 402 string_codes(With, Codes), 403 phrase(replacement_parts(Parts, Pairs), Codes), 404 dict_pairs(Extract, _, Pairs). 405 406replacement_parts(Parts, Extract) --> 407 string(HCodes), 408 ( ("\\" ; "$"), 409 capture_name(Name) 410 -> !, 411 { add_part(HCodes, Parts, T0), 412 T0 = [Repl|T1], 413 Extract = [Name-Repl|Extract1] 414 }, 415 replacement_parts(T1, Extract1) 416 ; eos 417 -> !, 418 { add_part(HCodes, Parts, []), 419 Extract = [] 420 } 421 ). 422 423add_part([], Parts, Parts) :- 424 !. 425add_part(Codes, [H|T], T) :- 426 string_codes(H, Codes). 427 428capture_name(Name) --> 429 "{", 430 ( digit(D0) 431 -> digits(DL), 432 "}", 433 { number_codes(Name, [D0|DL]) } 434 ; letter(A0), 435 alnums(AL), 436 "}", 437 { atom_codes(Name, [A0|AL]) } 438 ). 439capture_name(Name) --> 440 digit(D0), 441 !, 442 digits(DL), 443 { number_codes(Name, [D0|DL]) }. 444capture_name(Name) --> 445 letter(A0), 446 !, 447 alnums(AL), 448 { atom_codes(Name, [A0|AL]) }. 449 450letter(L) --> 451 [L], 452 { between(0'a,0'z,L) 453 ; between(0'A,0'Z,L) 454 ; L == 0'_ 455 }, !. 456 457alnums([H|T]) --> 458 alnum(H), 459 !, 460 alnums(T). 461alnums([]) --> 462 "". 463 464alnum(L) --> 465 [L], 466 { between(0'a,0'z,L) 467 ; between(0'A,0'Z,L) 468 ; between(0'0,0'9,L) 469 ; L == 0'_ 470 }, !.
regex
(see blob/2).
Defined Options are defined below. Please consult the PCRE
documentation for details.
anycrlf
, \R only matches CR, LF or CRLF. If unicode
,
\R matches all Unicode line endings.true
, do caseless matching.true
, $ not to match newline at endtrue
, . matches anything including NLtrue
, allow duplicate names for subpatternstrue
, ignore white space and # commentstrue
, PCRE extra features (not much use currently)true
, force matching to be before newlinejavascript
, JavaScript compatibilitytrue
, ^ and $ match newlines within dataany
, recognize any Unicode newline sequence,
if anycrlf
(default), recognize CR, LF, and CRLF as newline
sequences, if cr
, recognize CR, if lf
, recognize
LF and finally if crlf
recognize CRLF as newline.true
, use Unicode properties for \d, \w, etc.true
, invert greediness of quantifiersIn addition to the options above that directly map to pcre flags the following options are processed:
true
, study the regular expression.Start-Length
. Note
the we use
Start-Length` rather than the more conventional
Start-End
to allow for immediate use with sub_atom/5 and
sub_string/5.
The capture_type
specifies the default for this pattern. The
interface supports a different type for each named group using
the syntax (?<name_T>...)
, where T
is one of S
(string),
A
(atom), I
(integer), F
(float), N
(number), T
(term)
and R
(range). In the current implementation I
, F
and N
are
synonyms for T
. Future versions may act different if the parsed
value is not of the requested numeric type.
545:- dynamic re_pool/3. 546:- volatile re_pool/3. 547 548re_compiled(Regex, Regex) :- 549 blob(Regex, regex), 550 !. 551re_compiled(Text/Flags, Regex) :- 552 must_be(text, Text), 553 must_be(atom, Flags), 554 re_pool(Text, Flags, Regex), 555 !. 556re_compiled(Text/Flags, Regex) :- 557 !, 558 re_flags_options(Flags, Options), 559 re_compile(Text, Regex, Options), 560 assertz(re_pool(Text, Flags, Regex)). 561re_compiled(Text, Regex) :- 562 must_be(text, Text), 563 re_pool(Text, '', Regex), 564 !. 565re_compiled(Text, Regex) :- 566 re_compiled(Text/'', Regex). 567 568re_flags_options(Flags, Options) :- 569 atom_chars(Flags, Chars), 570 maplist(re_flag_option, Chars, Options). 571 572re_flag_option(Flag, Option) :- 573 re_flag_option_(Flag, Option), 574 !. 575re_flag_option(Flag, _) :- 576 existence_error(re_flag, Flag). 577 578re_flag_option_(i, caseless(true)). 579re_flag_option_(m, multiline(true)). 580re_flag_option_(x, extended(true)). 581re_flag_option_(s, dotall(true)). 582re_flag_option_(a, capture_type(atom)). 583re_flag_option_(r, capture_type(range)). 584re_flag_option_(t, capture_type(term)).
592re_flush :-
593 retractall(replacement_cache(_,_)),
594 retractall(re_pool(_,_,_)).
PCRE_CONFIG_*
constant after removing =PCRE_CONFIG_= and mapping the name to lower
case, e.g. utf8
, unicode_properties
, etc. Value is either a
Prolog boolean, integer or atom.
Finally, the functionality of pcre_version()
is available using the
configuration name version
.
Perl compatible regular expression matching for SWI-Prolog
This module provides an interface to the PCRE (Perl Compatible Regular Expression) library. This Prolog interface provides an almost comprehensive wrapper around PCRE.
Regular expressions are created from a pattern and options and represented as a SWI-Prolog blob. This implies they are subject to (atom) garbage collection. Compiled regular expressions can safely be used in multiple threads. Most predicates accept both an explicitly compiled regular expression, a pattern or a term Pattern/Flags. In the latter two cases a regular expression blob is created and stored in a cache. The cache can be cleared using re_flush/0.