#------------------------------------------------------------------
# csvmode.awk --- facilities for handling CSV data
#
# Author: Manuel Collado, <m-collado@users.sourceforge.net>, http://mcollado.z15.es
# License: Public domain - see COPYING for details
# Version: 0.2 - December 2020
#
# Prefix for user visible items:  csv or CSV
# Prefix for internal only items: _csv_
#
# Control variables:
#   CSVMODE = 0  Disable CSV input parsing (default)
#           = 1  Enable CSV input parsing
#           = -1 Convert CSV input records into regular awk records
#   CSVRECORD: For CSVMODE=-1, stores the original CSV input record
#
# Optional parameters:
#   CSVSEP    Input field delimiter, default comma (,)
#   CSVQUOTE  Input quoting character, default double quote (")
#   CSVTRIM = 0  Preserve all space in fields (default)
#           = 1  Discard unquoted leading or trailing space
#           = 2  And also collapse contiguous space into a single space
#   CSVONELINE  Flag, disable multiline fields (default enabled)
#   CSVOSEP   Output field delimiter, default CSVSEP
#   CSVOQUOTE Output quoting character, default CSVQUOTE
#   CSVOFS    Field separator for CSVMODE=-1, default SUBSEP
#
# Public functions:
#   csvquote()    Compose a CSV field from a clean value, quoting if necessary
#   csvcompose()  Compose a CSV record from an array of fields
#   csvformat()   Reformat a record with fixed field delimiters as a CSV one
#   csvunquote()  Extract the clean value of a CSV field
#   csvsplit()    Split a CSV record into an array of CSV fields
#   csvparse()    Parse a CSV record and build an array of clean value fields
#   csvprint()    Print the current record, formatted as CSV
#   csvfield()    Access CSV fields by name
#------------------------------------------------------------------

#------------------------------------------------------------------
# Auxiliary internal functions
#------------------------------------------------------------------

# Select effective value, given or default
#   value = _csv_effective(given, default1, default2, default3)
#   given, default1, default2, default3: preference order
function _csv_effective(given, default1, default2, default3) {
    if (given) return given
    if (default1) return default1
    if (default2) return default2
    return default3
}

# Generate a regex that matches CSV fields
#   FPAT = _csv_fpat([,comma [,quote]])
#   comma: Optional, the actual CSV field delimiter, default comma ','
#   quote: Optional, the actual CSV quote character, default double quote '"'
function _csv_fpat(comma, quote, trim) {
    comma = _csv_effective(comma, CSVSEP, ",")
    quote = _csv_effective(quote, CSVQUOTE, "\"")                    #"
    trim = _csv_effective(trim, CSVTRIM, 0)
# CSV FPAT: "([^,]*)|(\"([^\"]|\"\")+\"[^,]*)"
    return "[ ]*(([^" comma "]*)|(" quote "([^" quote "]|" quote quote ")+" quote "[^" comma "]*))[ ]*"
#	return "[ ]*(([^" comma "]*)|(" quote "([^" quote "]|" quote quote ")*" quote "[^" comma "]*))[ ]*"
}


#------------------------------------------------------------------
# Generate CSV formatted data
# Customized by CSVOSEP and CSVOQUOTE, if given
#------------------------------------------------------------------

# Compose a CSV field from a clean value, quoting if necessary
#   csv_field = csvquote(string [,comma [,quote]])
#   string: The value to be formatted as CSV field
#   comma: Optional, the desired CSV field delimiter, default comma ','
#   quote: Optional, the desired quote character, default double quote '"'
function csvquote(str, comma, quote) {
    comma = _csv_effective(comma, CSVOSEP, CSVSEP, ",")
    quote = _csv_effective(quote, CSVOQUOTE, CSVQUOTE, "\"")         #"
    if (index(str, comma) || index(str, quote) || index(str, "\n")) {
        gsub(quote, quote quote, str)
        str = quote str quote
    }
    return str
}

# Compose a CSV record from an array of fields
#   csv_record = csvcompose(af [,comma [,quote]])
#   af: Array[1..n] of values to be formatted as CSV fields
#   comma: Optional, the desired CSV field delimiter, default comma ','
#   quote: Optional, the desired quote character, default double quote '"'
function csvcompose(af, comma, quote,        record, k, sep) {
    comma = _csv_effective(comma, CSVOSEP, CSVSEP, ",")
    record = ""
    sep = ""
    for (k=1; k in af; k++) {
        record = record sep csvquote(af[k], comma, quote)
        sep = comma
    }
    return record
}

# Reformat a record with fixed field delimiters as a CSV one
#   csv_record = csvformat(record [,fs [,comma [,quote]]])
#   record: The record to be formatted as CSV
#   fs: Optional, the record field separator, default OFS
#   comma: Optional, the desired CSV field delimiter, default comma ','
#   quote: Optional, the desired quote character, default double quote '"'
function csvformat(record, fs, comma, quote,     af) {
    fs = _csv_effective(fs, OFS, FS)
    split(record, af, fs)
    return csvcompose(af, comma, quote)
}


#------------------------------------------------------------------
# Extract clean values from CSV formatted data
# Customized by CSVSEP, CSVQUOTE and CSVTRIM, if given
#------------------------------------------------------------------

# Extract the clean value of a CSV field
#   value = csvunquote(csvfield [, quote [, trim]])
#   csvfield: The CSV formatted field
#   quote: Optional, the actual CSV quote character, default double quote '"'
#   trim: Optional, the whitespace trim policy
function csvunquote(csvfield, quote, trim) {
    quote = _csv_effective(quote, CSVQUOTE, "\"")                    #"
    trim = _csv_effective(trim, CSVTRIM, 0)
    if (trim > 0) {
        sub(/^[[:space:]]+/, "", csvfield)
        sub(/[[:space:]]+$/, "", csvfield)
    }
    if (csvfield ~ ("^" quote ".*" quote "$")) {
        csvfield = substr(csvfield, 2, length(csvfield)-2)
        gsub(quote quote, quote, csvfield)
    }
    if (trim > 1) {
        gsub( /[[:space:]]+/, " ", csvfield )
    }
    return csvfield
}

# Split a CSV record into an array of CSV fields
#   num_fields = csvsplit(csvrecord, afield [, comma [, quote]])
#   csvrecord: The CSV formatted record
#   afield: Array[1..n] of CSV formatted fields
#   comma: Optional, the actual CSV field delimiter, default comma ','
#   quote: Optional, the actual CSV quote character, default double quote '"'
function csvsplit(csvrecord, afield, comma, quote) {
    comma = _csv_effective(comma, CSVSEP, ",")
    quote = _csv_effective(quote, CSVQUOTE, "\"")                    #"
    return patsplit(csvrecord, afield, _csv_fpat(comma, quote))
}

# Parse a CSV record and build an array of clean value fields
#   num_fields = csvparse(csvrecord, afield [, comma [, quote [, trim]]])
#   csvrecord: The CSV formatted record
#   afield: Array[1..n] of clean value fields
#   comma: Optional, the actual CSV field delimiter, default comma ','
#   quote: Optional, the actual CSV quote character, default double quote '"'
#   trim: Optional, the whitespace trim policy
function csvparse(csvrecord, afield, comma, quote, trim,      numfields, k) {
    comma = _csv_effective(comma, CSVSEP, ",")
    quote = _csv_effective(quote, CSVQUOTE, "\"")                    #"
    numfields = csvsplit(csvrecord, afield, comma, quote)
    for (k=1; k in afield; k++) {
        afield[k] = csvunquote(afield[k], quote, trim)
    }
    return numfields
}

#------------------------------------------------------------------
# CSVMODE aware record printing
#------------------------------------------------------------------

# Print the current record, formatted as CSV
function csvprint(       k, sep, sep1) {
    if (_csv_mode==1) {  # $0 is a CSV record
        print
    } else {             # $0 contains clean field values
        sep = ""
        sep1 = _csv_effective(CSVOSEP, CSVSEP, ",")
        for (k=1; k<=NF; k++) {
            printf("%s", sep csvquote($k))
            sep = sep1
        }
        print ""
    }
}

#------------------------------------------------------------------
# Access CSV fields by name
#------------------------------------------------------------------

# Retrieve the current value of the field with the given name
#   val = csvfield(name, missing)
#   name: The field name
#   missing: Optional, returned if there is no field with the given name
function csvfield(name, missing) {
    if (name in _csv_column) {
        return $_csv_column[name]
    } else {
        return missing
    }
}

#------------------------------------------------------------------
# Parse whole CSV input files
# Customized by CSVSEP, CSVQUOTE and CSVOFS, if given
#------------------------------------------------------------------

# Test CSVMODE at beginfile and record parse options
BEGINFILE {
    if (CSVMODE+0 != 0) {
        _csv_sep = _csv_effective(CSVSEP, ",")
        _csv_osep = _csv_effective(CSVOSEP, CSVSEP, ",")
        _csv_quote = _csv_effective(CSVQUOTE, "\"")                  #"
        _csv_trimlvl = _csv_effective(CSVTRIM, 0)
        _csv_ofs = _csv_effective(CSVOFS, SUBSEP)
        _csv_1line = _csv_effective(CSV1LINE, 0)
        _csv_save_input_mode = PROCINFO["FS"]
        _csv_save_fs = FS
        _csv_save_ofs = OFS
        _csv_save_fpat = FPAT
        _csv_save_rs = RS
        RS = "(\\r?\\n|\\r)"
        delete _csv_column
        if (CSVMODE > 0) {
            _csv_mode = 1
            FS = _csv_osep
            OFS = _csv_osep
            FPAT = _csv_fpat(_csv_sep, _csv_quote)
        } else if (CSVMODE < 0) {
            _csv_mode = -1
            FS = _csv_ofs
            OFS = _csv_ofs
        }
    } else {
        _csv_mode = 0
    }
}

# Restore overriden predefined variables at endfile
ENDFILE {
    if (_csv_mode) {
        RS = _csv_save_rs
        OFS = _csv_save_ofs
        FPAT = _csv_save_fpat
        FS = _csv_save_fs
        if (_csv_save_input_mode == "FPAT") FPAT = FPAT
        else if (_csv_save_input_mode == "FIELDWIDTHS") FIELDWIDTHS = FIELDWIDTHS
    }
}

# Process CSV input records
_csv_mode+0 {
    # Collect multi-line data, if it is the case
    CSVRECORD = $0
    while (!_csv_1line && gsub(_csv_quote, _csv_quote, CSVRECORD) % 2 == 1 && (_csv_multi = getline _csv_) > 0) {
        CSVRECORD = CSVRECORD "\n" _csv_
        NR--
        FNR--
    }
    if (_csv_multi) {
        $0 = CSVRECORD
    }

    # Convert the CSV record at user request
    if (_csv_mode < 0) {
        _csv_nf = csvsplit($0, _csv_ff)
        _csv_record = ""
        _csv_sep = ""
        for (k=1; k in _csv_ff; k++) {
            _csv_record = _csv_record _csv_sep csvunquote(_csv_ff[k])
            _csv_sep = OFS
        }
        $0 = _csv_record
    } else if (_csv_trimlvl > 0) {
        for (k=1; k<=NF; k++) {
            sub(/^[[:space:]]+/, "", $k)
            sub(/[[:space:]]+$/, "", $k)
            if (_csv_trimlvl > 1) {
                gsub( /[[:space:]]+/, " ", $k )
            }
        }
    }

    # Store a possible header record
    if (FNR==1) {
        for (k=1; k<=NF; k++) {
            if (_csv_mode < 0) label = $k
            else label = csvunquote($k, _csv_quote, _csv_trimlvl)
            _csv_column[label] = k
        }
    }
}