#------------------------------------------------------------------ # csvmode.awk --- facilities for handling CSV data # # Author: Manuel Collado, , http://mcollado.z15.es # License: Public domain - see COPYING for details # Version: 0.2 - December 2020 # # Prefix for user visible items: csv or CSV # Prefix for internal only items: _csv_ # # Control variables: # CSVMODE = 0 Disable CSV input parsing (default) # = 1 Enable CSV input parsing # = -1 Convert CSV input records into regular awk records # CSVRECORD: For CSVMODE=-1, stores the original CSV input record # # Optional parameters: # CSVSEP Input field delimiter, default comma (,) # CSVQUOTE Input quoting character, default double quote (") # CSVTRIM = 0 Preserve all space in fields (default) # = 1 Discard unquoted leading or trailing space # = 2 And also collapse contiguous space into a single space # CSVONELINE Flag, disable multiline fields (default enabled) # CSVOSEP Output field delimiter, default CSVSEP # CSVOQUOTE Output quoting character, default CSVQUOTE # CSVOFS Field separator for CSVMODE=-1, default SUBSEP # # Public functions: # csvquote() Compose a CSV field from a clean value, quoting if necessary # csvcompose() Compose a CSV record from an array of fields # csvformat() Reformat a record with fixed field delimiters as a CSV one # csvunquote() Extract the clean value of a CSV field # csvsplit() Split a CSV record into an array of CSV fields # csvparse() Parse a CSV record and build an array of clean value fields # csvprint() Print the current record, formatted as CSV # csvfield() Access CSV fields by name #------------------------------------------------------------------ #------------------------------------------------------------------ # Auxiliary internal functions #------------------------------------------------------------------ # Select effective value, given or default # value = _csv_effective(given, default1, default2, default3) # given, default1, default2, default3: preference order function _csv_effective(given, default1, default2, default3) { if (given) return given if (default1) return default1 if (default2) return default2 return default3 } # Generate a regex that matches CSV fields # FPAT = _csv_fpat([,comma [,quote]]) # comma: Optional, the actual CSV field delimiter, default comma ',' # quote: Optional, the actual CSV quote character, default double quote '"' function _csv_fpat(comma, quote, trim) { comma = _csv_effective(comma, CSVSEP, ",") quote = _csv_effective(quote, CSVQUOTE, "\"") #" trim = _csv_effective(trim, CSVTRIM, 0) # CSV FPAT: "([^,]*)|(\"([^\"]|\"\")+\"[^,]*)" return "[ ]*(([^" comma "]*)|(" quote "([^" quote "]|" quote quote ")+" quote "[^" comma "]*))[ ]*" # return "[ ]*(([^" comma "]*)|(" quote "([^" quote "]|" quote quote ")*" quote "[^" comma "]*))[ ]*" } #------------------------------------------------------------------ # Generate CSV formatted data # Customized by CSVOSEP and CSVOQUOTE, if given #------------------------------------------------------------------ # Compose a CSV field from a clean value, quoting if necessary # csv_field = csvquote(string [,comma [,quote]]) # string: The value to be formatted as CSV field # comma: Optional, the desired CSV field delimiter, default comma ',' # quote: Optional, the desired quote character, default double quote '"' function csvquote(str, comma, quote) { comma = _csv_effective(comma, CSVOSEP, CSVSEP, ",") quote = _csv_effective(quote, CSVOQUOTE, CSVQUOTE, "\"") #" if (index(str, comma) || index(str, quote) || index(str, "\n")) { gsub(quote, quote quote, str) str = quote str quote } return str } # Compose a CSV record from an array of fields # csv_record = csvcompose(af [,comma [,quote]]) # af: Array[1..n] of values to be formatted as CSV fields # comma: Optional, the desired CSV field delimiter, default comma ',' # quote: Optional, the desired quote character, default double quote '"' function csvcompose(af, comma, quote, record, k, sep) { comma = _csv_effective(comma, CSVOSEP, CSVSEP, ",") record = "" sep = "" for (k=1; k in af; k++) { record = record sep csvquote(af[k], comma, quote) sep = comma } return record } # Reformat a record with fixed field delimiters as a CSV one # csv_record = csvformat(record [,fs [,comma [,quote]]]) # record: The record to be formatted as CSV # fs: Optional, the record field separator, default OFS # comma: Optional, the desired CSV field delimiter, default comma ',' # quote: Optional, the desired quote character, default double quote '"' function csvformat(record, fs, comma, quote, af) { fs = _csv_effective(fs, OFS, FS) split(record, af, fs) return csvcompose(af, comma, quote) } #------------------------------------------------------------------ # Extract clean values from CSV formatted data # Customized by CSVSEP, CSVQUOTE and CSVTRIM, if given #------------------------------------------------------------------ # Extract the clean value of a CSV field # value = csvunquote(csvfield [, quote [, trim]]) # csvfield: The CSV formatted field # quote: Optional, the actual CSV quote character, default double quote '"' # trim: Optional, the whitespace trim policy function csvunquote(csvfield, quote, trim) { quote = _csv_effective(quote, CSVQUOTE, "\"") #" trim = _csv_effective(trim, CSVTRIM, 0) if (trim > 0) { sub(/^[[:space:]]+/, "", csvfield) sub(/[[:space:]]+$/, "", csvfield) } if (csvfield ~ ("^" quote ".*" quote "$")) { csvfield = substr(csvfield, 2, length(csvfield)-2) gsub(quote quote, quote, csvfield) } if (trim > 1) { gsub( /[[:space:]]+/, " ", csvfield ) } return csvfield } # Split a CSV record into an array of CSV fields # num_fields = csvsplit(csvrecord, afield [, comma [, quote]]) # csvrecord: The CSV formatted record # afield: Array[1..n] of CSV formatted fields # comma: Optional, the actual CSV field delimiter, default comma ',' # quote: Optional, the actual CSV quote character, default double quote '"' function csvsplit(csvrecord, afield, comma, quote) { comma = _csv_effective(comma, CSVSEP, ",") quote = _csv_effective(quote, CSVQUOTE, "\"") #" return patsplit(csvrecord, afield, _csv_fpat(comma, quote)) } # Parse a CSV record and build an array of clean value fields # num_fields = csvparse(csvrecord, afield [, comma [, quote [, trim]]]) # csvrecord: The CSV formatted record # afield: Array[1..n] of clean value fields # comma: Optional, the actual CSV field delimiter, default comma ',' # quote: Optional, the actual CSV quote character, default double quote '"' # trim: Optional, the whitespace trim policy function csvparse(csvrecord, afield, comma, quote, trim, numfields, k) { comma = _csv_effective(comma, CSVSEP, ",") quote = _csv_effective(quote, CSVQUOTE, "\"") #" numfields = csvsplit(csvrecord, afield, comma, quote) for (k=1; k in afield; k++) { afield[k] = csvunquote(afield[k], quote, trim) } return numfields } #------------------------------------------------------------------ # CSVMODE aware record printing #------------------------------------------------------------------ # Print the current record, formatted as CSV function csvprint( k, sep, sep1) { if (_csv_mode==1) { # $0 is a CSV record print } else { # $0 contains clean field values sep = "" sep1 = _csv_effective(CSVOSEP, CSVSEP, ",") for (k=1; k<=NF; k++) { printf("%s", sep csvquote($k)) sep = sep1 } print "" } } #------------------------------------------------------------------ # Access CSV fields by name #------------------------------------------------------------------ # Retrieve the current value of the field with the given name # val = csvfield(name, missing) # name: The field name # missing: Optional, returned if there is no field with the given name function csvfield(name, missing) { if (name in _csv_column) { return $_csv_column[name] } else { return missing } } #------------------------------------------------------------------ # Parse whole CSV input files # Customized by CSVSEP, CSVQUOTE and CSVOFS, if given #------------------------------------------------------------------ # Test CSVMODE at beginfile and record parse options BEGINFILE { if (CSVMODE+0 != 0) { _csv_sep = _csv_effective(CSVSEP, ",") _csv_osep = _csv_effective(CSVOSEP, CSVSEP, ",") _csv_quote = _csv_effective(CSVQUOTE, "\"") #" _csv_trimlvl = _csv_effective(CSVTRIM, 0) _csv_ofs = _csv_effective(CSVOFS, SUBSEP) _csv_1line = _csv_effective(CSV1LINE, 0) _csv_save_input_mode = PROCINFO["FS"] _csv_save_fs = FS _csv_save_ofs = OFS _csv_save_fpat = FPAT _csv_save_rs = RS RS = "(\\r?\\n|\\r)" delete _csv_column if (CSVMODE > 0) { _csv_mode = 1 FS = _csv_osep OFS = _csv_osep FPAT = _csv_fpat(_csv_sep, _csv_quote) } else if (CSVMODE < 0) { _csv_mode = -1 FS = _csv_ofs OFS = _csv_ofs } } else { _csv_mode = 0 } } # Restore overriden predefined variables at endfile ENDFILE { if (_csv_mode) { RS = _csv_save_rs OFS = _csv_save_ofs FPAT = _csv_save_fpat FS = _csv_save_fs if (_csv_save_input_mode == "FPAT") FPAT = FPAT else if (_csv_save_input_mode == "FIELDWIDTHS") FIELDWIDTHS = FIELDWIDTHS } } # Process CSV input records _csv_mode+0 { # Collect multi-line data, if it is the case CSVRECORD = $0 while (!_csv_1line && gsub(_csv_quote, _csv_quote, CSVRECORD) % 2 == 1 && (_csv_multi = getline _csv_) > 0) { CSVRECORD = CSVRECORD "\n" _csv_ NR-- FNR-- } if (_csv_multi) { $0 = CSVRECORD } # Convert the CSV record at user request if (_csv_mode < 0) { _csv_nf = csvsplit($0, _csv_ff) _csv_record = "" _csv_sep = "" for (k=1; k in _csv_ff; k++) { _csv_record = _csv_record _csv_sep csvunquote(_csv_ff[k]) _csv_sep = OFS } $0 = _csv_record } else if (_csv_trimlvl > 0) { for (k=1; k<=NF; k++) { sub(/^[[:space:]]+/, "", $k) sub(/[[:space:]]+$/, "", $k) if (_csv_trimlvl > 1) { gsub( /[[:space:]]+/, " ", $k ) } } } # Store a possible header record if (FNR==1) { for (k=1; k<=NF; k++) { if (_csv_mode < 0) label = $k else label = csvunquote($k, _csv_quote, _csv_trimlvl) _csv_column[label] = k } } }