nutools/lib/nulib/awk/csv.awk

# -*- coding: utf-8 mode: awk -*- vim:sw=4:sts=4:et:ai:si:sta:fenc=utf-8
@include "base.core.awk"
@include "base.array.awk"

# Parse one quoted field located at the start of line.
#
# The first character of line is skipped unconditionally (it is expected to be
# the opening quote; the caller checks this before calling).
#
# Parameters:
#   line   - text starting with the opening quote
#   destl  - output array: destl[0] receives the text remaining after the field
#            (the closing quote and the character following it are skipped),
#            destl[1] is 1 when the field was followed by colsep, 0 otherwise
#   colsep - field separator
#   qchar  - quote character
#   echar  - escape character; when empty, a quote is escaped by doubling it
#
# Returns the field value with quoting/escaping removed. If the closing quote
# is missing, the rest of the line is returned and destl is set to ("", 0).
#
# All names after "echar" are local variables.
function csv__parse_quoted(line, destl, colsep, qchar, echar,       pos, tmpl, nextc, resl, prevc, quotec) {
  line = substr(line, 2);
  resl = "";
  while (1) {
    pos = index(line, qchar);
    if (pos == 0) {
      # unterminated string: take everything that is left
      resl = resl line;
      destl[0] = "";
      destl[1] = 0;
      return resl;
    }
    if (echar != "" && pos > 1) {
      # an escape character is configured and the quote has a predecessor
      # that may be escaping it
      prevc = substr(line, pos - 1, 1);
      quotec = substr(line, pos, 1);
      nextc = substr(line, pos + 1, 1);
      if (prevc == echar) {
        # escaped quote: drop the escape character, keep the quote
        tmpl = substr(line, 1, pos - 2);
        resl = resl tmpl quotec;
        line = substr(line, pos + 1);
        continue;
      }
      tmpl = substr(line, 1, pos - 1);
      if (nextc == colsep || nextc == "") {
        # end of field or end of line
        resl = resl tmpl;
        destl[0] = substr(line, pos + 2);
        destl[1] = nextc == colsep;
        return resl;
      } else {
        # syntax error: unescaped quote inside the field.
        # ignore the error and keep the quote anyway
        resl = resl tmpl quotec;
        line = substr(line, pos + 1);
      }
    } else {
      # no escape character applies here; the quote may be doubled instead
      tmpl = substr(line, 1, pos - 1);
      quotec = substr(line, pos, 1);
      nextc = substr(line, pos + 1, 1);
      if (nextc == colsep || nextc == "") {
        # end of field or end of line
        resl = resl tmpl;
        destl[0] = substr(line, pos + 2);
        destl[1] = nextc == colsep;
        return resl;
      } else if (nextc == qchar) {
        # doubled quote: keep a single one and skip both
        resl = resl tmpl quotec;
        line = substr(line, pos + 2);
      } else {
        # syntax error: unescaped quote inside the field.
        # ignore the error and keep the quote anyway
        resl = resl tmpl quotec;
        line = substr(line, pos + 1);
      }
    }
  }
}
# Extract the leading unquoted field of line.
#
# The field runs up to the first occurrence of colsep, or to the end of the
# line when there is none. destl[0] receives the text after the separator
# ("" if none) and destl[1] is 1 when a separator was found, 0 otherwise.
# qchar and echar are accepted for signature symmetry with
# csv__parse_quoted and are not used. Returns the field text.
function csv__parse_unquoted(line, destl, colsep, qchar, echar,     seppos) {
  seppos = index(line, colsep);
  if (seppos != 0) {
    # separator present: split around it
    destl[1] = 1;
    destl[0] = substr(line, seppos + 1);
    return substr(line, 1, seppos - 1);
  }
  # no separator: the whole line is the last field
  destl[1] = 0;
  destl[0] = "";
  return line;
}
# Split a CSV line into the array fields (indexed from 1).
#
# Parameters:
#   fields   - destination array, reset with array_new() before parsing
#   line     - the CSV text to split
#   nbfields - when non-zero, fields is padded with empty strings until it
#              holds at least int(nbfields) entries (it is never truncated)
#   colsep, qchar, echar - separator, quote and escape characters, used as is
#
# Returns array_len(fields) after parsing and optional padding.
#
# All names after "echar" are local variables.
function csv__array_parse(fields, line, nbfields, colsep, qchar, echar,     shouldparse, destl, i, value) {
  array_new(fields);
  array_new(destl);
  i = 1;
  shouldparse = 0;
  # shouldparse handles an empty field at the end of the line: after a
  # separator we must always parse once more, even when line is ""
  while (shouldparse || line != "") {
    if (index(line, qchar) == 1) {
      value = csv__parse_quoted(line, destl, colsep, qchar, echar);
    } else {
      value = csv__parse_unquoted(line, destl, colsep, qchar, echar);
    }
    # both parsers report the remaining text and whether a separator followed
    line = destl[0];
    shouldparse = destl[1];
    fields[i] = value;
    i = i + 1;
  }
  if (nbfields) {
    nbfields = int(nbfields);
    i = array_len(fields);
    while (i < nbfields) {
      i++;
      fields[i] = "";
    }
  }
  return array_len(fields);
}
# Defaults substituted by array_parsecsv() for empty arguments.
BEGIN {
  # column separator
  DEFAULT_COLSEP = ",";
  # quote character
  DEFAULT_QCHAR = "\"";
  # escape character; empty means none is configured
  DEFAULT_ECHAR = "";
}
# Parse line into fields using colsep, qchar and echar exactly as given:
# unlike array_parsecsv(), empty values are NOT replaced by the DEFAULT_*
# settings. Returns the value of csv__array_parse().
function array_parsecsv2(fields, line, nbfields, colsep, qchar, echar) {
  return csv__array_parse(fields, line, nbfields, colsep, qchar, echar);
}
# Parse line into fields, falling back to DEFAULT_COLSEP, DEFAULT_QCHAR and
# DEFAULT_ECHAR for any of colsep, qchar, echar that compares equal to "".
# Returns the value of csv__array_parse().
function array_parsecsv(fields, line, nbfields, colsep, qchar, echar) {
  colsep = (colsep == "") ? DEFAULT_COLSEP : colsep;
  qchar = (qchar == "") ? DEFAULT_QCHAR : qchar;
  echar = (echar == "") ? DEFAULT_ECHAR : echar;
  return csv__array_parse(fields, line, nbfields, colsep, qchar, echar);
}
# Parse line as CSV with the default settings, hand the resulting array to
# array_getline() (presumably this loads the values into the current record
# $1..$NF -- confirm in base.array.awk), and return NF.
# "fields" is a local variable.
function parsecsv(line,             fields) {
  array_parsecsv(fields, line);
  array_getline(fields);
  return NF;
}
# Read the next record, from file when it is given (truthy) or from the main
# input otherwise, then parse it as CSV through parsecsv().
#
# Returns the value of parsecsv() (NF after parsing), or 0 when no record
# could be read (end of input or read error). In that case $0 is left alone
# rather than re-parsing the stale previous record.
# "fields" and "status" are local variables.
function getlinecsv(file,          fields, status) {
  if (file) {
    status = (getline <file);
  } else {
    status = (getline);
  }
  # getline yields 1 on success, 0 at end of input, -1 on error
  if (status <= 0) return 0;
  return parsecsv($0);
}
# Tell whether s must be quoted because it starts or ends with a blank,
# control or whitespace character. Returns 1 if so, 0 otherwise.
function csv__should_quote(s) {
  return (s ~ /^[[:blank:][:cntrl:][:space:]]/ || s ~ /[[:blank:][:cntrl:][:space:]]$/) ? 1 : 0;
}
# Build one CSV line from the values of the array fields.
#
# Parameters:
#   fields - array of values; the iteration order is the one produced by
#            mkindices(fields, indices)
#   colsep - separator inserted between values
#   mvsep  - a value containing this string gets quoted
#   qchar  - quote character; when empty, values are never quoted or escaped
#   echar  - escape character placed before each qchar found in a value;
#            when empty, qchar is doubled instead
#
# A value is wrapped in qchar when it contains mvsep, colsep or qchar, or when
# csv__should_quote() says so. Returns the formatted line.
#
# qchar occurrences are replaced literally (index/substr) rather than with
# gsub(), so a qchar that is a regex metacharacter is handled correctly.
#
# All names after "echar" are local variables.
function array_formatcsv2(fields, colsep, mvsep, qchar, echar,      count, indices, line, i, value, escseq, rest, pos) {
  line = "";
  count = mkindices(fields, indices);
  # text substituted for every qchar found inside a value
  escseq = (echar != "" ? echar : qchar) qchar;
  for (i = 1; i <= count; i++) {
    value = fields[indices[i]];
    if (i > 1) line = line colsep;
    if (qchar != "" && index(value, qchar) != 0) {
      rest = value;
      value = "";
      while ((pos = index(rest, qchar)) != 0) {
        value = value substr(rest, 1, pos - 1) escseq;
        rest = substr(rest, pos + length(qchar));
      }
      value = value rest;
    }
    if (qchar != "" && (index(value, mvsep) != 0 || index(value, colsep) != 0 || index(value, qchar) != 0 || csv__should_quote(value))) {
      line = line qchar value qchar;
    } else {
      line = line value;
    }
  }
  return line;
}
# Format fields as a CSV line with fixed settings: "," as column separator,
# ";" as multi-value separator, a double quote as quote character and no
# escape character (quotes are doubled).
function array_formatcsv(fields) {
  return array_formatcsv2(fields, ",", ";", "\"", "");
}
# Format fields with array_formatcsv() and pass the resulting line to
# printto() together with output (presumably the print destination --
# confirm in base.core.awk).
function array_printcsv(fields, output) {
  printto(array_formatcsv(fields), output);
}
# Return the CSV line built from the array populated by array_fill()
# (presumably the fields of the current record -- confirm in base.array.awk).
# "fields" is a local variable.
function get_formatcsv(                 fields) {
  array_fill(fields);
  return array_formatcsv(fields);
}
# Replace $0 with the CSV line returned by get_formatcsv().
function formatcsv() {
  $0 = get_formatcsv();
}
# Populate a local array with array_fill() (presumably from the current
# record -- confirm in base.array.awk) and print it as CSV to output through
# array_printcsv().
# "fields" is a local variable.
function printcsv(output,           fields) {
  array_fill(fields);
  array_printcsv(fields, output);
}
# Scan the CSV file input for the first line whose column number field
# equals value.
#
# On success, fields holds the parsed matching line (padded to nbfields
# entries when nbfields is set) and 1 is returned. Otherwise fields is
# emptied, then padded with nbfields empty strings when nbfields is set, and
# 0 is returned. The file is closed in both cases.
#
# Reading with getline overwrites $0, so the state captured by
# array_fill(orig) is handed back to array_getline(orig) afterwards
# (presumably a save/restore of the current record -- confirm in
# base.array.awk).
#
# All names after "nbfields" are local variables.
function array_findcsv(fields, input, field, value, nbfields,          saved, matched, n) {
  array_new(saved);
  array_fill(saved);
  array_new(fields);

  matched = 0;
  # stop reading as soon as a line matches
  while (!matched && (getline <input) > 0) {
    array_parsecsv(fields, $0, nbfields);
    matched = (fields[field] == value);
  }
  close(input);
  array_getline(saved);

  if (matched) return matched;

  # nothing found: hand back an empty (optionally padded) array
  delete fields;
  if (nbfields) {
    nbfields = int(nbfields);
    for (n = array_len(fields); n < nbfields; ) {
      fields[++n] = "";
    }
  }
  return matched;
}