nutools/lib/nulib/awk/csv.awk

202 lines
5.7 KiB
Awk

# -*- coding: utf-8 mode: awk -*- vim:sw=4:sts=4:et:ai:si:sta:fenc=utf-8
@include "base.core.awk"
@include "base.array.awk"
# Parse a quoted field located at the start of line and return its value with
# the surrounding quotes removed and escaped quotes unescaped.
#
# Parameters:
#   line   - text starting at the field; its first character is dropped
#            without being checked (the caller has already recognised it as
#            the opening quote)
#   destl  - output array: destl[0] receives the text remaining after the
#            field and its separator, destl[1] is 1 when the field was ended
#            by colsep (so another field follows, possibly empty), 0 otherwise
#   colsep - column separator; it is compared against a single character
#   qchar  - quote character
#   echar  - escape character; when empty, a quote is escaped by doubling it
#
# Malformed input is tolerated: an unterminated string yields everything up to
# the end of line, and a stray unescaped quote is kept as a literal character.
#
# All names after echar are local variables.
function csv__parse_quoted(line, destl, colsep, qchar, echar,    pos, tmpl, nextc, resl, prevc, quotec) {
    line = substr(line, 2);
    resl = "";
    while (1) {
        pos = index(line, qchar);
        if (pos == 0) {
            # unterminated string: take the rest of the line as the value
            resl = resl line;
            destl[0] = "";
            destl[1] = 0;
            return resl;
        }
        if (echar != "" && pos > 1) {
            # a quote may be escaped by a preceding escape character
            prevc = substr(line, pos - 1, 1);
            quotec = substr(line, pos, 1);
            nextc = substr(line, pos + 1, 1);
            if (prevc == echar) {
                # escaped quote: drop the escape character, keep the quote
                tmpl = substr(line, 1, pos - 2);
                resl = resl tmpl quotec;
                line = substr(line, pos + 1);
                continue;
            }
            tmpl = substr(line, 1, pos - 1);
            if (nextc == colsep || nextc == "") {
                # end of field or end of line; pos + 2 skips the closing
                # quote and the separator
                resl = resl tmpl;
                destl[0] = substr(line, pos + 2);
                destl[1] = nextc == colsep;
                return resl;
            } else {
                # syntax error: unescaped quote inside the field.
                # ignore the error and keep the quote anyway
                resl = resl tmpl quotec;
                line = substr(line, pos + 1);
            }
        } else {
            # no escape character applies here; the quote may be doubled
            tmpl = substr(line, 1, pos - 1);
            quotec = substr(line, pos, 1);
            nextc = substr(line, pos + 1, 1);
            if (nextc == colsep || nextc == "") {
                # end of field or end of line; pos + 2 skips the closing
                # quote and the separator
                resl = resl tmpl;
                destl[0] = substr(line, pos + 2);
                destl[1] = nextc == colsep;
                return resl;
            } else if (nextc == qchar) {
                # doubled quote: keep one and skip both
                resl = resl tmpl quotec;
                line = substr(line, pos + 2);
            } else {
                # syntax error: unescaped quote inside the field.
                # ignore the error and keep the quote anyway
                resl = resl tmpl quotec;
                line = substr(line, pos + 1);
            }
        }
    }
}
# Parse an unquoted field located at the start of line and return its value.
#
# The field runs up to the first occurrence of colsep, or to the end of line
# when there is none. destl[0] receives the text after the separator and
# destl[1] is 1 when a separator was found (another field follows), else 0.
# qchar and echar are accepted for signature symmetry with
# csv__parse_quoted() and are not used. seppos is a local variable.
function csv__parse_unquoted(line, destl, colsep, qchar, echar,    seppos) {
    seppos = index(line, colsep);
    if (seppos > 0) {
        # separator found: split around it
        destl[0] = substr(line, seppos + 1);
        destl[1] = 1;
        return substr(line, 1, seppos - 1);
    }
    # last field of the line
    destl[0] = "";
    destl[1] = 0;
    return line;
}
# Split the CSV record line into the array fields (indexed from 1) and return
# array_len(fields) once parsing and padding are done.
#
# Parameters:
#   fields   - destination array, reset with array_new() before parsing
#   line     - the CSV record to parse
#   nbfields - when non-zero, fields is padded with empty strings up to
#              int(nbfields) entries; fields beyond that count are kept
#   colsep   - column separator
#   qchar    - quote character; a field starting with it is parsed as quoted
#   echar    - escape character, passed through to the field parsers
#
# All names after echar are local variables.
function csv__array_parse(fields, line, nbfields, colsep, qchar, echar,    shouldparse, destl, i, value) {
    array_new(fields);
    array_new(destl);
    i = 1;
    shouldparse = 0;
    # shouldparse handles an empty field at the end of the line: after a
    # separator we must always parse once more, even when line == ""
    while (shouldparse || line != "") {
        if (index(line, qchar) == 1) {
            value = csv__parse_quoted(line, destl, colsep, qchar, echar);
        } else {
            value = csv__parse_unquoted(line, destl, colsep, qchar, echar);
        }
        # both parsers report the remaining text and whether a separator
        # was consumed through destl
        line = destl[0];
        shouldparse = destl[1];
        fields[i] = value;
        i = i + 1;
    }
    if (nbfields) {
        # pad with empty fields up to the requested count
        nbfields = int(nbfields);
        i = array_len(fields);
        while (i < nbfields) {
            i++;
            fields[i] = "";
        }
    }
    return array_len(fields);
}
# Defaults substituted by array_parsecsv() for a separator, quote or escape
# argument that is empty.
BEGIN {
# column separator
DEFAULT_COLSEP = ",";
# quote character
DEFAULT_QCHAR = "\"";
# escape character; empty means a quote is escaped by doubling it
DEFAULT_ECHAR = "";
}
# Parse the CSV record line into the array fields using colsep, qchar and
# echar exactly as given: unlike array_parsecsv(), empty values are not
# replaced by the DEFAULT_* settings. Returns the value returned by
# csv__array_parse().
function array_parsecsv2(fields, line, nbfields, colsep, qchar, echar) {
return csv__array_parse(fields, line, nbfields, colsep, qchar, echar);
}
# Parse the CSV record line into the array fields.
#
# Any of colsep, qchar and echar that is empty (or omitted) is replaced by the
# corresponding DEFAULT_COLSEP, DEFAULT_QCHAR or DEFAULT_ECHAR setting before
# parsing. nbfields is passed through unchanged. Returns the value returned by
# csv__array_parse().
function array_parsecsv(fields, line, nbfields, colsep, qchar, echar) {
    colsep = (colsep == "") ? DEFAULT_COLSEP : colsep;
    qchar = (qchar == "") ? DEFAULT_QCHAR : qchar;
    echar = (echar == "") ? DEFAULT_ECHAR : echar;
    return csv__array_parse(fields, line, nbfields, colsep, qchar, echar);
}
# Parse line as CSV with the default settings and load the result into the
# current record. fields is a local variable.
# Returns NF as it stands after array_getline() has run.
function parsecsv(line, fields) {
array_parsecsv(fields, line);
# presumably rebuilds $0 and $1..$NF from the array -- confirm against
# array_getline() in base.array.awk
array_getline(fields);
return NF;
}
# Read the next record and parse it as CSV into the current record.
#
# Parameters:
#   file - name of the file to read from; when empty (or 0), the next record
#          of the main input is read instead
#
# Returns the number of fields as reported by parsecsv(), or 0 when no record
# could be read (end of input or read error). In that case the current record
# is left untouched instead of being parsed a second time.
#
# fields is kept in the signature for compatibility and is not used; status
# is a local variable.
function getlinecsv(file,    fields, status) {
    if (file) {
        status = (getline <file);
    } else {
        status = getline;
    }
    # getline yields 0 at end of input and -1 on error: nothing to parse
    if (status <= 0) return 0;
    return parsecsv($0);
}
# Tell whether the value s needs quoting because of its edges: returns 1 when
# s starts or ends with a blank, control or whitespace character, 0 otherwise.
function csv__should_quote(s) {
    return (s ~ /^[[:blank:][:cntrl:][:space:]]/ || s ~ /[[:blank:][:cntrl:][:space:]]$/) ? 1 : 0;
}
# Format the values of the array fields as one CSV line and return it.
#
# Parameters:
#   fields - array of values; they are visited in the order given by
#            mkindices() (presumably ascending index order -- confirm against
#            its definition)
#   colsep - column separator written between values
#   mvsep  - multi-value separator; a value containing it gets quoted
#   qchar  - quote character; when empty, nothing is quoted or escaped
#   echar  - escape character written before each quote found inside a value;
#            when empty, the quote is doubled instead
#
# A value is quoted when it contains mvsep, colsep or qchar, or when
# csv__should_quote() says so.
#
# Quotes are escaped by literal string search rather than gsub(), so a qchar
# that happens to be a regex metacharacter is handled like any other.
#
# All names after echar are local variables.
function array_formatcsv2(fields, colsep, mvsep, qchar, echar,    count, indices, line, i, value, prefix, pos, rest, escaped) {
    line = "";
    count = mkindices(fields, indices);
    for (i = 1; i <= count; i++) {
        value = fields[indices[i]];
        if (i > 1) line = line colsep;
        if (qchar != "" && index(value, qchar) != 0) {
            # put echar, or a second qchar, in front of every quote
            prefix = (echar != "") ? echar : qchar;
            escaped = "";
            rest = value;
            while ((pos = index(rest, qchar)) != 0) {
                escaped = escaped substr(rest, 1, pos - 1) prefix qchar;
                rest = substr(rest, pos + length(qchar));
            }
            value = escaped rest;
        }
        if (qchar != "" && (index(value, mvsep) != 0 || index(value, colsep) != 0 || index(value, qchar) != 0 || csv__should_quote(value))) {
            line = line qchar value qchar;
        } else {
            line = line value;
        }
    }
    return line;
}
# Format the array fields as one CSV line using fixed settings: "," as column
# separator, ";" as multi-value separator, a double quote as quote character
# and no escape character (quotes inside values are doubled).
function array_formatcsv(fields) {
return array_formatcsv2(fields, ",", ";", "\"", "");
}
# Format the array fields with array_formatcsv() and hand the resulting line
# to printto() together with output.
# NOTE(review): printto() presumably writes the line to the destination
# designated by output -- confirm against its definition.
function array_printcsv(fields, output) {
printto(array_formatcsv(fields), output);
}
# Return the CSV line produced by array_formatcsv() for the array populated
# by array_fill(). fields is a local variable.
function get_formatcsv( fields) {
# presumably copies the fields of the current record into the array --
# confirm against array_fill() in base.array.awk
array_fill(fields);
return array_formatcsv(fields);
}
# Replace the current record ($0) with the CSV line returned by
# get_formatcsv().
function formatcsv() {
$0 = get_formatcsv();
}
# Populate a local array with array_fill() and print it as one CSV line via
# array_printcsv(), passing output along. fields is a local variable.
function printcsv(output, fields) {
# presumably copies the fields of the current record into the array --
# confirm against array_fill() in base.array.awk
array_fill(fields);
array_printcsv(fields, output);
}
# Search the CSV file input for the first record whose column field equals
# value.
#
# Parameters:
#   fields   - receives the parsed record when one is found; otherwise it is
#              emptied, then padded with empty strings up to int(nbfields)
#              entries when nbfields is non-zero
#   input    - name of the file to read; it is closed before returning
#   field    - index of the column to compare
#   value    - value looked for
#   nbfields - minimum number of entries wanted in fields
#
# Reading input overwrites $0, so the current record is saved with
# array_fill() beforehand and handed back to array_getline() afterwards.
# Returns 1 when a matching record was found, 0 otherwise.
#
# saved, found and i are local variables.
function array_findcsv(fields, input, field, value, nbfields,    saved, found, i) {
    array_new(saved);
    array_fill(saved);
    array_new(fields);
    found = 0;
    # stop reading as soon as a record matches
    while (!found && (getline <input) > 0) {
        array_parsecsv(fields, $0, nbfields);
        if (fields[field] == value) found = 1;
    }
    close(input);
    array_getline(saved);
    if (found) return found;

    # nothing matched: discard the last parsed record
    delete fields;
    if (nbfields) {
        nbfields = int(nbfields);
        for (i = array_len(fields) + 1; i <= nbfields; i++) {
            fields[i] = "";
        }
    }
    return found;
}