csv.pl

View source with formatted comments or as raw
    1/*  Part of SWI-Prolog
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2009-2018, VU University Amsterdam
    7                              CWI, Amsterdam
    8    All rights reserved.
    9
   10    Redistribution and use in source and binary forms, with or without
   11    modification, are permitted provided that the following conditions
   12    are met:
   13
   14    1. Redistributions of source code must retain the above copyright
   15       notice, this list of conditions and the following disclaimer.
   16
   17    2. Redistributions in binary form must reproduce the above copyright
   18       notice, this list of conditions and the following disclaimer in
   19       the documentation and/or other materials provided with the
   20       distribution.
   21
   22    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   23    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   24    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   25    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   26    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   27    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   28    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   29    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   30    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   31    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   32    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   33    POSSIBILITY OF SUCH DAMAGE.
   34*/
   35
   36:- module(csv,
   37          [ csv//1,                     % +Rows
   38            csv//2,                     % +Rows, +Options
   39
   40            csv_read_file/2,            % +File, -Data
   41            csv_read_file/3,            % +File, -Data, +Options
   42            csv_read_stream/3,          % +Stream, -Data, +Options
   43
   44            csv_read_file_row/3,        % +File, -Row, +Options
   45            csv_read_row/3,		% +Stream, -Row, +CompiledOptions
   46            csv_options/2,		% -Compiled, +Options
   47
   48            csv_write_file/2,           % +File, +Data
   49            csv_write_file/3,           % +File, +Data, +Options
   50            csv_write_stream/3          % +Stream, +Data, +Options
   51          ]).   52:- use_module(library(record),[(record)/1, op(_,_,record)]).   53
   54:- autoload(library(apply),[maplist/2]).   55:- autoload(library(debug),[debug/3]).   56:- autoload(library(error),[must_be/2,domain_error/2]).   57:- autoload(library(lists),[append/3]).   58:- autoload(library(option),[option/2,select_option/4]).   59:- autoload(library(pure_input),
   60	    [phrase_from_file/3,phrase_from_stream/2]).   61:- autoload(library(readutil),[read_line_to_codes/2]).   62:- autoload(library(dcg/basics),[string//1,eos//0]).   63
   64
   65/** <module> Process CSV (Comma-Separated Values) data
   66
   67This library parses and generates CSV data.   CSV data is represented in
   68Prolog as a list of rows. Each row   is  a compound term, where all rows
   69have the same name and arity.
   70
   71@tbd    Implement immediate assert of the data to avoid possible stack
   72        overflows.
   73@tbd    Writing creates an intermediate code-list, possibly overflowing
   74        resources.  This waits for pure output!
   75@see RFC 4180
   76*/
   77
   78:- predicate_options(csv//2, 2,
   79                     [ separator(nonneg),       % mustv be code
   80                       strip(boolean),
   81                       ignore_quotes(boolean),
   82                       convert(boolean),
   83                       case(oneof([down,preserve,up])),
   84                       functor(atom),
   85                       arity(-nonneg),          % actually ?nonneg
   86                       match_arity(boolean)
   87                     ]).   88:- predicate_options(csv_read_file/3, 3,
   89                     [ pass_to(csv//2, 2),
   90                       pass_to(phrase_from_file/3, 3)
   91                     ]).   92:- predicate_options(csv_read_file_row/3, 3,
   93                     [ pass_to(csv//2, 2),
   94                       pass_to(open/4, 4)
   95                     ]).   96:- predicate_options(csv_write_file/3, 3,
   97                     [ pass_to(csv//2, 2),
   98                       pass_to(open/4, 4)
   99                     ]).  100:- predicate_options(csv_write_stream/3, 3,
  101                     [ pass_to(csv//2, 2)
  102                     ]).  103
  104
  105:- record
  106    csv_options(separator:integer=0',,
  107                strip:boolean=false,
  108                ignore_quotes:boolean=false,
  109                convert:boolean=true,
  110                case:oneof([down,preserve,up])=preserve,
  111                functor:atom=row,
  112                arity:integer,
  113                match_arity:boolean=true,
  114                skip_header:atom).  115
  116
  117%!  csv_read_file(+File, -Rows) is det.
  118%!  csv_read_file(+File, -Rows, +Options) is det.
  119%
  120%   Read a CSV file into a list of   rows. Each row is a Prolog term
  121%   with the same arity. Options  is   handed  to  csv//2. Remaining
  122%   options  are  processed  by    phrase_from_file/3.  The  default
  123%   separator depends on the file name   extension and is =|\t|= for
  124%   =|.tsv|= files and =|,|= otherwise.
  125%
  126%   Suppose we want to create a predicate   table/6  from a CSV file
  127%   that we know contains 6 fields  per   record.  This  can be done
  128%   using the code below. Without the   option  arity(6), this would
  129%   generate a predicate table/N, where N   is  the number of fields
  130%   per record in the data.
  131%
  132%       ==
  133%       ?- csv_read_file(File, Rows, [functor(table), arity(6)]),
  134%          maplist(assert, Rows).
  135%       ==
  136
  137
  138csv_read_file(File, Rows) :-
  139    csv_read_file(File, Rows, []).
  140
  141csv_read_file(File, Rows, Options) :-
  142    default_separator(File, Options, Options1),
  143    make_csv_options(Options1, Record, RestOptions),
  144    phrase_from_file(csv_roptions(Rows, Record), File, RestOptions).
  145
  146
  147default_separator(File, Options0, Options) :-
  148    (   option(separator(_), Options0)
  149    ->  Options = Options0
  150    ;   file_name_extension(_, Ext0, File),
  151        downcase_atom(Ext0, Ext),
  152        ext_separator(Ext, Sep)
  153    ->  Options = [separator(Sep)|Options0]
  154    ;   Options = Options0
  155    ).
  156
  157ext_separator(csv, 0',).
  158ext_separator(tsv, 0'\t).
  159
  160
  161%!  csv_read_stream(+Stream, -Rows, +Options) is det.
  162%
  163%   Read CSV data from Stream.  See also csv_read_row/3.
  164
  165csv_read_stream(Stream, Rows, Options) :-
  166    make_csv_options(Options, Record, _),
  167    phrase_from_stream(csv_roptions(Rows, Record), Stream).
  168
  169
  170%!  csv(?Rows)// is det.
  171%!  csv(?Rows, +Options)// is det.
  172%
  173%   Prolog DCG to `read/write' CSV data.  Options:
  174%
  175%       * separator(+Code)
  176%       The comma-separator.  Must be a character code.  Default is
  177%       (of course) the comma. Character codes can be specified
  178%       using the 0' notion. E.g., using =|separator(0';)|= parses
  179%       a semicolon separated file.
  180%
  181%       * ignore_quotes(+Boolean)
  182%       If =true= (default false), threat double quotes as a normal
  183%       character.
  184%
  185%       * strip(+Boolean)
  186%       If =true= (default =false=), strip leading and trailing
  187%       blank space.  RFC4180 says that blank space is part of the
  188%       data.
  189%
  190%       * skip_header(+CommentLead)
  191%       Skip leading lines that start with CommentLead.  There is
  192%       no standard for comments in CSV files, but some CSV files
  193%       have a header where each line starts with `#`.  After
  194%       skipping comment lines this option causes csv//2 to skip empty
  195%       lines.  Note that an empty line may not contain white space
  196%       characters (space or tab) as these may provide valid data.
  197%
  198%       * convert(+Boolean)
  199%       If =true= (default), use name/2 on the field data.  This
  200%       translates the field into a number if possible.
  201%
  202%       * case(+Action)
  203%       If =down=, downcase atomic values.  If =up=, upcase them
  204%       and if =preserve= (default), do not change the case.
  205%
  206%       * functor(+Atom)
  207%       Functor to use for creating row terms.  Default is =row=.
  208%
  209%       * arity(?Arity)
  210%       Number of fields in each row.  This predicate raises
  211%       a domain_error(row_arity(Expected), Found) if a row is
  212%       found with different arity.
  213%
  214%       * match_arity(+Boolean)
  215%       If =false= (default =true=), do not reject CSV files where
  216%       lines provide a varying number of fields (columns).  This
  217%       can be a work-around to use some incorrect CSV files.
  218
  219csv(Rows) -->
  220    csv(Rows, []).
  221
  222csv(Rows, Options) -->
  223    { make_csv_options(Options, Record, _) },
  224    csv_roptions(Rows, Record).
  225
  226csv_roptions(Rows, Record) -->
  227    { ground(Rows) },
  228    !,
  229    emit_csv(Rows, Record).
  230csv_roptions(Rows, Record) -->
  231    skip_header(Record),
  232    csv_data(Rows, Record).
  233
  234skip_header(Options) -->
  235    { csv_options_skip_header(Options, CommentStart),
  236      nonvar(CommentStart),
  237      atom_codes(CommentStart, Codes)
  238    },
  239    !,
  240    skip_header_lines(Codes),
  241    skip_blank_lines.
  242skip_header(_) -->
  243    [].
  244
  245skip_header_lines(CommentStart) -->
  246    string(CommentStart),
  247    !,
  248    (   string(_Comment),
  249        end_of_record
  250    ->  skip_header_lines(CommentStart)
  251    ).
  252skip_header_lines(_) -->
  253    [].
  254
  255skip_blank_lines -->
  256    eos,
  257    !.
  258skip_blank_lines -->
  259    end_of_record,
  260    !,
  261    skip_blank_lines.
  262skip_blank_lines -->
  263    [].
  264
  265csv_data([], _) -->
  266    eos,
  267    !.
  268csv_data([Row|More], Options) -->
  269    row(Row, Options),
  270    !,
  271    { debug(csv, 'Row: ~p', [Row]) },
  272    csv_data(More, Options).
  273
  274
  275row(Row, Options) -->
  276    fields(Fields, Options),
  277    { csv_options_functor(Options, Functor),
  278      Row =.. [Functor|Fields],
  279      functor(Row, _, Arity),
  280      check_arity(Options, Arity)
  281    }.
  282
  283check_arity(Options, Arity) :-
  284    csv_options_arity(Options, Arity),
  285    !.
  286check_arity(Options, _) :-
  287    csv_options_match_arity(Options, false),
  288    !.
  289check_arity(Options, Arity) :-
  290    csv_options_arity(Options, Expected),
  291    domain_error(row_arity(Expected), Arity).
  292
  293fields([F|T], Options) -->
  294    field(F, Options),
  295    (   separator(Options)
  296    ->  fields(T, Options)
  297    ;   end_of_record
  298    ->  { T = [] }
  299    ).
  300
  301field(Value, Options) -->
  302    "\"",
  303    { csv_options_ignore_quotes(Options, false) },
  304    !,
  305    string_codes(Codes),
  306    { make_value(Codes, Value, Options) }.
  307field(Value, Options) -->
  308    { csv_options_strip(Options, true) },
  309    !,
  310    stripped_field(Value, Options).
  311field(Value, Options) -->
  312    { csv_options_separator(Options, Sep) },
  313    field_codes(Codes, Sep),
  314    { make_value(Codes, Value, Options) }.
  315
  316
  317stripped_field(Value, Options) -->
  318    ws,
  319    (   "\"",
  320        { csv_options_strip(Options, false) }
  321    ->  string_codes(Codes),
  322        ws
  323    ;   { csv_options_separator(Options, Sep) },
  324        field_codes(Codes0, Sep),
  325        { strip_trailing_ws(Codes0, Codes) }
  326    ),
  327    { make_value(Codes, Value, Options) }.
  328
  329ws --> " ", !, ws.
  330ws --> "\t", !, ws.
  331ws --> "".
  332
  333strip_trailing_ws(List, Stripped) :-
  334    append(Stripped, WS, List),
  335    all_ws(WS).
  336
  337all_ws([]).
  338all_ws([32|T]) :- all_ws(T).
  339all_ws([9|T]) :- all_ws(T).
  340
  341
  342%!  string_codes(-Codes)
  343%
  344%   Process a double-quotes string where  the   quote  is escaped by
  345%   doubling it. Eats the terminating double-quote.
  346
  347string_codes(List) -->
  348    [H],
  349    (   { H == 0'" }
  350    ->  (   "\""
  351        ->  { List = [H|T] },
  352            string_codes(T)
  353        ;   { List = [] }
  354        )
  355    ;   { List = [H|T] },
  356        string_codes(T)
  357    ).
  358
  359field_codes([], Sep), [Sep] --> [Sep], !.
  360field_codes([], _), "\n" --> "\r\n", !.
  361field_codes([], _), "\n" --> "\n", !.
  362field_codes([], _), "\n" --> "\r", !.
  363field_codes([H|T], Sep) --> [H], !, field_codes(T, Sep).
  364field_codes([], _) --> [].              % unterminated last record
  365
  366%!  make_value(+Codes, -Value, +Options) is det.
  367%
  368%   Convert a list of character codes to the actual value, depending
  369%   on Options.
  370
  371make_value(Codes, Value, Options) :-
  372    csv_options_convert(Options, Convert),
  373    csv_options_case(Options, Case),
  374    make_value(Convert, Case, Codes, Value).
  375
  376make_value(true, preserve, Codes, Value) :-
  377    !,
  378    name(Value, Codes).
  379make_value(true, Case, Codes, Value) :-
  380    !,
  381    (   number_string(Value, Codes)
  382    ->  true
  383    ;   make_value(false, Case, Codes, Value)
  384    ).
  385make_value(false, preserve, Codes, Value) :-
  386    !,
  387    atom_codes(Value, Codes).
  388make_value(false, down, Codes, Value) :-
  389    !,
  390    string_codes(String, Codes),
  391    downcase_atom(String, Value).
  392make_value(false, up, Codes, Value) :-
  393    string_codes(String, Codes),
  394    upcase_atom(String, Value).
  395
  396separator(Options) -->
  397    { csv_options_separator(Options, Sep) },
  398    [Sep].
  399
  400end_of_record --> "\n".			% Unix files
  401end_of_record --> "\r\n".               % DOS files
  402end_of_record --> "\r".                 % MacOS files
  403end_of_record --> eos.                  % unterminated last record
  404
  405
  406%!  csv_read_file_row(+File, -Row, +Options) is nondet.
  407%
  408%   True when Row is a row in File.  First unifies Row with the first
  409%   row in File. Backtracking  yields  the   second,  ...  row.  This
  410%   interface  is  an  alternative  to  csv_read_file/3  that  avoids
  411%   loading all rows in memory.  Note   that  this interface does not
  412%   guarantee that all rows in File have the same arity.
  413%
  414%   In addition to the  options   of  csv_read_file/3, this predicate
  415%   processes the option:
  416%
  417%     * line(-Line)
  418%     Line is unified with the 1-based line-number from which Row is
  419%     read.  Note that Line is not the physical line, but rather the
  420%     _logical_ record number.
  421%
  422%   @tbd    Input is read line by line.  If a record separator is
  423%           embedded in a quoted field, parsing the record fails and
  424%           another line is added to the input.  This does not nicely
  425%           deal with other reasons why parsing the row may fail.
  426
  427csv_read_file_row(File, Row, Options) :-
  428    default_separator(File, Options, Options1),
  429    make_csv_options(Options1, RecordOptions, Options2),
  430    select_option(line(Line), Options2, RestOptions, _),
  431    setup_call_cleanup(
  432        open(File, read, Stream, RestOptions),
  433        csv_read_stream_row(Stream, Row, Line, RecordOptions),
  434        close(Stream)).
  435
  436csv_read_stream_row(Stream, Row, Line, Options) :-
  437    between(1, infinite, Line),
  438    (   csv_read_row(Stream, Row0, Options),
  439        Row0 \== end_of_file
  440    ->  Row = Row0
  441    ;   !,
  442        fail
  443    ).
  444
  445
  446%!  csv_read_row(+Stream, -Row, +CompiledOptions) is det.
  447%
  448%   Read the next CSV record from Stream  and unify the result with Row.
  449%   CompiledOptions is created from  options   defined  for csv//2 using
  450%   csv_options/2. Row is unified with   `end_of_file` upon reaching the
  451%   end of the input.
  452
  453csv_read_row(Stream, Row, _Record) :-
  454    at_end_of_stream(Stream),
  455    !,
  456    Row = end_of_file.
  457csv_read_row(Stream, Row, Record) :-
  458    read_lines_to_codes(Stream, Codes, Record, even),
  459    phrase(row(Row0, Record), Codes),
  460    !,
  461    Row = Row0.
  462
  463read_lines_to_codes(Stream, Codes, Options, QuoteQuantity) :-
  464    read_line_to_codes(Stream, Codes0),
  465    Codes0 \== end_of_file,
  466    (   (   csv_options_ignore_quotes(Options, true)
  467        ;   check_quotes(Codes0, QuoteQuantity, even)
  468        )
  469    ->  Codes = Codes0
  470    ;   append(Codes0, [0'\n|Tail], Codes),
  471        read_lines_to_codes(Stream, Tail, Options, odd)
  472    ).
  473
  474check_quotes([], QuoteQuantity, QuoteQuantity) :-
  475    !.
  476check_quotes([0'"|T], odd, Result) :-
  477    !,
  478    check_quotes(T, even, Result).
  479check_quotes([0'"|T], even, Result) :-
  480    !,
  481    check_quotes(T, odd, Result).
  482check_quotes([_|T], QuoteQuantity, Result) :-
  483    check_quotes(T, QuoteQuantity, Result).
  484
  485
  486%!  csv_options(-Compiled, +Options) is det.
  487%
  488%   Compiled is the  compiled  representation   of  the  CSV  processing
  489%   options as they may be passed into   csv//2,  etc. This predicate is
  490%   used in combination with csv_read_row/3 to avoid repeated processing
  491%   of the options.
  492
  493csv_options(Compiled, Options) :-
  494    make_csv_options(Options, Compiled, _Ignored).
  495
  496
  497                /*******************************
  498                *             OUTPUT           *
  499                *******************************/
  500
  501%!  csv_write_file(+File, +Data) is det.
  502%!  csv_write_file(+File, +Data, +Options) is det.
  503%
  504%   Write a list of Prolog terms to a CSV file.  Options are given
  505%   to csv//2.  Remaining options are given to open/4.  The  default
  506%   separator depends on the file name   extension and is =|\t|= for
  507%   =|.tsv|= files and =|,|= otherwise.
  508
  509csv_write_file(File, Data) :-
  510    csv_write_file(File, Data, []).
  511
  512csv_write_file(File, Data, Options) :-
  513    must_be(list, Data),
  514    default_separator(File, Options, Options1),
  515    make_csv_options(Options1, OptionsRecord, RestOptions),
  516    setup_call_cleanup(
  517        open(File, write, Out, RestOptions),
  518        maplist(csv_write_row(Out, OptionsRecord), Data),
  519        close(Out)).
  520
  521csv_write_row(Out, OptionsRecord, Row) :-
  522    phrase(emit_row(Row, OptionsRecord), String),
  523    format(Out, '~s', [String]).
  524
  525emit_csv([], _) --> [].
  526emit_csv([H|T], Options) -->
  527    emit_row(H, Options),
  528    emit_csv(T, Options).
  529
  530emit_row(Row, Options) -->
  531    { Row =.. [_|Fields] },
  532    emit_fields(Fields, Options),
  533    "\r\n".                                     % RFC 4180 demands \r\n
  534
  535emit_fields([], _) -->
  536    "".
  537emit_fields([H|T], Options) -->
  538    emit_field(H, Options),
  539    (   { T == [] }
  540        ->  []
  541        ;   { csv_options_separator(Options, Sep) },
  542        [Sep],
  543        emit_fields(T, Options)
  544    ).
  545
  546emit_field(H, Options) -->
  547    { (   atom(H)
  548      ->  atom_codes(H, Codes)
  549      ;   string(H)
  550      ->  string_codes(H, Codes)
  551      )
  552    },
  553    !,
  554    (   { needs_quotes(H, Options) }
  555    ->  "\"", emit_string(Codes), "\""
  556    ;   emit_codes(Codes)
  557    ).
  558emit_field([], _) -->
  559    !,
  560    { atom_codes('[]', Codes) },
  561    emit_codes(Codes).
  562emit_field(H, _) -->
  563    { number_codes(H,Codes) },
  564    emit_codes(Codes).
  565
  566needs_quotes(Atom, _) :-
  567    sub_atom(Atom, _, _, _, '"'),
  568    !.
  569needs_quotes(Atom, _) :-
  570    sub_atom(Atom, _, _, _, '\n'),
  571    !.
  572needs_quotes(Atom, _) :-
  573    sub_atom(Atom, _, _, _, '\r'),
  574    !.
  575needs_quotes(Atom, Options) :-
  576    csv_options_separator(Options, Sep),
  577    char_code(Char, Sep),
  578    sub_atom(Atom, _, _, _, Char),
  579    !.
  580
  581emit_string([]) --> "".
  582emit_string([0'"|T]) --> !, "\"\"", emit_string(T).
  583emit_string([H|T]) --> [H], emit_string(T).
  584
  585emit_codes([]) --> "".
  586emit_codes([0'"|T]) --> !, "\"\"", emit_codes(T).
  587emit_codes([H|T]) --> [H], emit_codes(T).
  588
  589
  590%%     csv_write_stream(+Stream, +Data, +Options) is det.
  591%
  592%      Write  the  rows  in  Data  to    Stream.   This  is  similar  to
  593%      csv_write_file/3,  but  can  deal  with  data  that  is  produced
  594%      incrementally. The example  below  saves   all  answers  from the
  595%      predicate data/3 to File.
  596%
  597%        ==
  598%        save_data(File) :-
  599%           setup_call_cleanup(
  600%               open(File, write, Out),
  601%               forall(data(C1,C2,C3),
  602%                      csv_write_stream(Out, [row(C1,C2,C3)], [])),
  603%               close(Out)),
  604%        ==
  605
  606csv_write_stream(Stream, Data, Options) :-
  607    must_be(list, Data),
  608    make_csv_options(Options, OptionsRecord, _),
  609    maplist(csv_write_row(Stream, OptionsRecord), Data)