nutools/lib/nulib/awk/csv

# -*- coding: utf-8 mode: awk -*- vim:sw=4:sts=4:et:ai:si:sta:fenc=utf-8
@include "base.core"
@include "base.array"

# Parse one quoted field located at the very start of `line` and return its
# content with the surrounding quotes and quote-escaping removed.
#
# Parameters:
#   line   - text whose first character is the opening quote (it is skipped
#            unconditionally)
#   destl  - output array: destl[0] receives the text remaining after the
#            field and its separator; destl[1] is 1 when a separator followed
#            the closing quote (so another field must still be parsed, even if
#            the remainder is empty) and 0 otherwise
#   colsep - column separator; it is compared against a single character
#   qchar  - quote character
#   echar  - escape character; when empty, a quote is escaped by doubling it
#
# All names after the wide gap in the parameter list are locals. prevc and
# quotec are declared there so that they no longer overwrite global variables
# of the same name.
function csv__parse_quoted(line, destl, colsep, qchar, echar,       pos, tmpl, nextc, resl, prevc, quotec) {
  line = substr(line, 2)
  resl = ""
  while (1) {
    pos = index(line, qchar)
    if (pos == 0) {
      # unterminated string: take everything that is left as field content
      resl = resl line
      destl[0] = ""
      destl[1] = 0
      return resl
    }
    if (echar != "" && pos > 1) {
      # an escape character is configured and the quote is not the first
      # character, so it may be preceded by the escape character
      prevc = substr(line, pos - 1, 1)
      quotec = substr(line, pos, 1)
      nextc = substr(line, pos + 1, 1)
      if (prevc == echar) {
        # escaped quote: drop the escape character, keep the quote
        tmpl = substr(line, 1, pos - 2)
        resl = resl tmpl quotec
        line = substr(line, pos + 1)
        continue
      }
      tmpl = substr(line, 1, pos - 1)
      if (nextc == colsep || nextc == "") {
        # end of field or end of line
        resl = resl tmpl
        destl[0] = substr(line, pos + 2)
        destl[1] = nextc == colsep
        return resl
      } else {
        # syntax error: unescaped quote inside the field.
        # ignore the error and keep the quote as literal content
        resl = resl tmpl quotec
        line = substr(line, pos + 1)
      }
    } else {
      # no escape character applies to this quote; it may be doubled instead
      tmpl = substr(line, 1, pos - 1)
      quotec = substr(line, pos, 1)
      nextc = substr(line, pos + 1, 1)
      if (nextc == colsep || nextc == "") {
        # end of field or end of line
        resl = resl tmpl
        destl[0] = substr(line, pos + 2)
        destl[1] = nextc == colsep
        return resl
      } else if (nextc == qchar) {
        # doubled quote: emit a single quote and skip both characters
        resl = resl tmpl quotec
        line = substr(line, pos + 2)
      } else {
        # syntax error: unescaped quote inside the field.
        # ignore the error and keep the quote as literal content
        resl = resl tmpl quotec
        line = substr(line, pos + 1)
      }
    }
  }
}
# Split an unquoted field off the front of `line`.
# Returns the text up to (not including) the first colsep, or the whole line
# when no colsep is present.
#   destl[0] - text following the separator ("" when there is none)
#   destl[1] - 1 when a separator was found, 0 otherwise
# qchar and echar are accepted for signature symmetry with csv__parse_quoted
# and are not used here.
function csv__parse_unquoted(line, destl, colsep, qchar, echar,     seppos) {
  seppos = index(line, colsep)
  if (seppos != 0) {
    # a separator exists: everything after it is left for the next field
    destl[0] = substr(line, seppos + 1)
    destl[1] = 1
    return substr(line, 1, seppos - 1)
  }
  # last field of the line
  destl[0] = ""
  destl[1] = 0
  return line
}
# Parse a CSV `line` into the array `fields`, stored at indices 1, 2, ...
#
# Parameters:
#   fields   - destination array; reset with array_new() before parsing
#   line     - the CSV text to split
#   nbfields - when truthy, the minimum number of fields: missing trailing
#              fields are created as empty strings
#   colsep, qchar, echar - separator, quote and escape characters, passed
#              unchanged to csv__parse_quoted / csv__parse_unquoted
#
# Returns array_len(fields) after parsing and optional padding.
#
# `value` is declared as a local so that parsing no longer overwrites a global
# variable of the same name in the calling program.
function csv__array_parse(fields, line, nbfields, colsep, qchar, echar,     shouldparse, destl, i, value) {
  array_new(fields)
  array_new(destl)
  i = 1
  shouldparse = 0
  # shouldparse handles an empty field at the end of the line: after a
  # separator a field must always be parsed, even when line == ""
  while (shouldparse || line != "") {
    if (index(line, qchar) == 1) {
      # the field starts with the quote character
      value = csv__parse_quoted(line, destl, colsep, qchar, echar)
    } else {
      value = csv__parse_unquoted(line, destl, colsep, qchar, echar)
    }
    # both parsers report the remaining text and the "separator seen" flag
    # through destl
    line = destl[0]
    shouldparse = destl[1]
    fields[i] = value
    i = i + 1
  }
  if (nbfields) {
    # pad with empty fields up to the requested count
    nbfields = int(nbfields)
    i = array_len(fields)
    while (i < nbfields) {
      i++
      fields[i] = ""
    }
  }
  return array_len(fields)
}
# Defaults substituted by array_parsecsv() for parameters passed as "".
BEGIN {
  # no escape character: a quote inside a quoted field is written doubled
  DEFAULT_ECHAR = ""
  # fields are wrapped in double quotes
  DEFAULT_QCHAR = "\""
  # fields are separated by commas
  DEFAULT_COLSEP = ","
}
# Parse the CSV `line` into `fields` using exactly the colsep, qchar and echar
# given: no default is substituted for an empty value.
# Returns the value returned by csv__array_parse (the length of `fields`).
function array_parsecsv2(fields, line, nbfields, colsep, qchar, echar) {
  return csv__array_parse(fields, line, nbfields, colsep, qchar, echar)
}
# Parse the CSV `line` into `fields`. Any of colsep, qchar or echar that is
# empty (or omitted) is replaced by DEFAULT_COLSEP, DEFAULT_QCHAR or
# DEFAULT_ECHAR respectively before parsing.
# Returns the value returned by csv__array_parse (the length of `fields`).
function array_parsecsv(fields, line, nbfields, colsep, qchar, echar) {
  colsep = (colsep == "") ? DEFAULT_COLSEP : colsep
  qchar = (qchar == "") ? DEFAULT_QCHAR : qchar
  echar = (echar == "") ? DEFAULT_ECHAR : echar
  return csv__array_parse(fields, line, nbfields, colsep, qchar, echar)
}
# Parse `line` as CSV with the default separator/quote/escape characters, hand
# the resulting array to array_getline(), and return NF.
# NOTE(review): array_getline() is defined elsewhere; presumably it loads the
# array into the current record ($1..$NF) -- confirm against base.array.
function parsecsv(line,             cols) {
  array_parsecsv(cols, line)
  array_getline(cols)
  return NF
}
# Read the next record -- from `file` when it is truthy, otherwise from the
# main input -- and parse it as CSV via parsecsv().
#
# Returns the NF reported by parsecsv() when a record was read.
# Returns 0 when getline reports end of input or an error (status <= 0); in
# that case the current record is left untouched instead of re-parsing the
# stale $0.
#
# `fields` is kept in the local list for signature compatibility; `status`
# holds the getline result.
function getlinecsv(file,          fields, status) {
  if (file) {
    status = (getline <file)
  } else {
    status = (getline)
  }
  # nothing was read: do not re-parse the previous record
  if (status <= 0) return 0
  return parsecsv($0)
}
# Return 1 when `s` begins or ends with a blank, control or whitespace
# character (such a value has to be quoted to survive a round trip), else 0.
function csv__should_quote(s) {
  return (s ~ /^[[:blank:][:cntrl:][:space:]]/ || s ~ /[[:blank:][:cntrl:][:space:]]$/) ? 1 : 0
}
# Build one CSV line from the values of `fields`, visited in the order that
# mkindices() stores into its second argument.
#
# Parameters:
#   fields - array of values to format
#   colsep - separator written between consecutive values
#   mvsep  - a value containing this string is quoted
#   qchar  - quote character; when empty, values are never quoted or escaped
#   echar  - escape character written before each qchar inside a value; when
#            empty, qchar is doubled instead
#
# Returns the formatted line.
# NOTE(review): qchar is used as the regexp argument of gsub(), so a qchar
# that is a regexp metacharacter would not be matched literally -- confirm
# callers only pass plain characters.
function array_formatcsv2(fields, colsep, mvsep, qchar, echar,      nbkeys, keys, out, k, cell) {
  nbkeys = mkindices(fields, keys)
  out = ""
  for (k = 1; k <= nbkeys; k++) {
    cell = fields[keys[k]]
    if (k > 1) out = out colsep
    if (qchar == "") {
      # quoting disabled: emit the value verbatim
      out = out cell
      continue
    }
    if (index(cell, qchar) != 0) {
      # protect embedded quote characters
      if (echar != "") gsub(qchar, quote_subrepl(echar) "&", cell);
      else gsub(qchar, "&&", cell);
    }
    if (index(cell, mvsep) == 0 && index(cell, colsep) == 0 && index(cell, qchar) == 0 && !csv__should_quote(cell)) {
      # nothing in the value requires quoting
      out = out cell
    } else {
      out = out qchar cell qchar
    }
  }
  return out
}
# Format `fields` as a CSV line with fixed settings: "," as column separator,
# ";" as multi-value separator, a double quote as quote character and no
# escape character (embedded quotes are doubled).
# Returns the formatted line.
function array_formatcsv(fields) {
  return array_formatcsv2(fields, ",", ";", "\"", "")
}
# Format `fields` with array_formatcsv() and pass the line to printto()
# together with `output`.
# NOTE(review): printto() is defined elsewhere; presumably `output` names the
# destination and an empty value means standard output -- confirm.
function array_printcsv(fields, output) {
  printto(array_formatcsv(fields), output)
}
# Return the CSV line built by array_formatcsv() from the array populated by
# array_fill().
# NOTE(review): array_fill() is defined elsewhere; presumably it copies the
# current record's fields ($1..$NF) into the array -- confirm.
function get_formatcsv(                 cols) {
  array_fill(cols)
  return array_formatcsv(cols)
}
# Replace the current record ($0) with the CSV line returned by
# get_formatcsv(). Assigning $0 makes awk re-split the record into fields.
function formatcsv() {
  $0 = get_formatcsv()
}
# Populate a local array with array_fill(), format it as a CSV line with
# array_formatcsv() and hand that line to printto() together with `output`.
# NOTE(review): array_fill() and printto() are defined elsewhere; presumably
# they read the current record's fields and write to `output` -- confirm.
function printcsv(output,           cols) {
  array_fill(cols)
  printto(array_formatcsv(cols), output)
}
# Scan the CSV file `input` for the first line whose column `field` equals
# `value`.
#
# Parameters:
#   fields   - receives the parsed columns of the matching line; when no line
#              matches it is emptied, then padded with "" up to nbfields
#   input    - file read with getline; it is closed before returning
#   field    - index of the column to compare
#   value    - value searched for
#   nbfields - optional minimum number of columns (see array_parsecsv)
#
# Returns 1 when a matching line was found, 0 otherwise.
#
# Reading with `getline <input` overwrites $0, so the record is saved with
# array_fill() first and handed back to array_getline() afterwards.
# NOTE(review): array_fill()/array_getline() are defined elsewhere; presumably
# they save and restore the current record -- confirm.
function array_findcsv(fields, input, field, value, nbfields,          orig, found, i) {
  array_new(orig)
  array_fill(orig)
  array_new(fields)
  found = 0
  # stop reading as soon as a line matches
  while (!found && (getline <input) > 0) {
    array_parsecsv(fields, $0, nbfields)
    if (fields[field] == value) found = 1
  }
  close(input)
  array_getline(orig)
  if (found) return found
  # no match: do not leave the columns of the last line read in fields
  delete fields
  if (nbfields) {
    nbfields = int(nbfields)
    for (i = array_len(fields); i < nbfields; ) {
      fields[++i] = ""
    }
  }
  return found
}
début d'implémentation nulib 2018-04-26 23:19:17 +04:00			`# -- coding: utf-8 mode: awk -- vim:sw=4:sts=4:et:ai:si:sta:fenc=utf-8`
			`@include "base.core"`
			`@include "base.array"`

			`function csv__parse_quoted(line, destl, colsep, qchar, echar, pos, tmpl, nextc, resl) {`
			`line = substr(line, 2)`
			`resl = ""`
			`while (1) {`
			`pos = index(line, qchar)`
			`if (pos == 0) {`
			`# chaine mal terminee`
			`resl = resl line`
			`destl[0] = ""`
			`destl[1] = 0`
			`return resl`
			`}`
			`if (echar != "" && pos > 1) {`
			`# tenir compte du fait qu"un caratère peut être mis en échappement`
			`prevc = substr(line, pos - 1, 1)`
			`quotec = substr(line, pos, 1)`
			`nextc = substr(line, pos + 1, 1)`
			`if (prevc == echar) {`
			`# qchar en échappement`
			`tmpl = substr(line, 1, pos - 2)`
			`resl = resl tmpl quotec`
			`line = substr(line, pos + 1)`
			`continue`
			`}`
			`tmpl = substr(line, 1, pos - 1)`
			`if (nextc == colsep \|\| nextc == "") {`
			`# fin de champ ou fin de ligne`
			`resl = resl tmpl`
			`destl[0] = substr(line, pos + 2)`
			`destl[1] = nextc == colsep`
			`return resl`
			`} else {`
			`# erreur de syntaxe: guillemet non mis en échappement`
			`# ignorer cette erreur et prendre le guillemet quand meme`
			`resl = resl tmpl quotec`
			`line = substr(line, pos + 1)`
			`}`
			`} else {`
			`# pas d"échappement pour qchar. il est éventuellement doublé`
			`tmpl = substr(line, 1, pos - 1)`
			`quotec = substr(line, pos, 1)`
			`nextc = substr(line, pos + 1, 1)`
			`if (nextc == colsep \|\| nextc == "") {`
			`# fin de champ ou fin de ligne`
			`resl = resl tmpl`
			`destl[0] = substr(line, pos + 2)`
			`destl[1] = nextc == colsep`
			`return resl`
			`} else if (nextc == qchar) {`
			`# qchar en echappement`
			`resl = resl tmpl quotec`
			`line = substr(line, pos + 2)`
			`} else {`
			`# erreur de syntaxe: guillemet non mis en échappement`
			`# ignorer cette erreur et prendre le guillemet quand meme`
			`resl = resl tmpl quotec`
			`line = substr(line, pos + 1)`
			`}`
			`}`
			`}`
			`}`
			`function csv__parse_unquoted(line, destl, colsep, qchar, echar, pos) {`
			`pos = index(line, colsep)`
			`if (pos == 0) {`
			`destl[0] = ""`
			`destl[1] = 0`
			`return line`
			`} else {`
			`destl[0] = substr(line, pos + 1)`
			`destl[1] = 1`
			`return substr(line, 1, pos - 1)`
			`}`
			`}`
			`function csv__array_parse(fields, line, nbfields, colsep, qchar, echar, shouldparse, destl, i) {`
			`array_new(fields)`
			`array_new(destl)`
			`i = 1`
			`shouldparse = 0`
			`# shouldparse permet de gérer le cas où un champ vide est en fin de ligne.`
			`# en effet, après "," il faut toujours parser, même si line==""`
			`while (shouldparse \|\| line != "") {`
			`if (index(line, qchar) == 1) {`
			`value = csv__parse_quoted(line, destl, colsep, qchar, echar)`
			`line = destl[0]`
			`shouldparse = destl[1]`
			`} else {`
			`value = csv__parse_unquoted(line, destl, colsep, qchar, echar)`
			`line = destl[0]`
			`shouldparse = destl[1]`
			`}`
			`fields[i] = value`
			`i = i + 1`
			`}`
			`if (nbfields) {`
			`nbfields = int(nbfields)`
			`i = array_len(fields)`
			`while (i < nbfields) {`
			`i++`
			`fields[i] = ""`
			`}`
			`}`
			`return array_len(fields)`
			`}`
			`BEGIN {`
			`DEFAULT_COLSEP = ","`
			`DEFAULT_QCHAR = "\""`
			`DEFAULT_ECHAR = ""`
			`}`
			`function array_parsecsv2(fields, line, nbfields, colsep, qchar, echar) {`
			`return csv__array_parse(fields, line, nbfields, colsep, qchar, echar)`
			`}`
			`function array_parsecsv(fields, line, nbfields, colsep, qchar, echar) {`
			`if (colsep == "") colsep = DEFAULT_COLSEP`
			`if (qchar == "") qchar = DEFAULT_QCHAR`
			`if (echar == "") echar = DEFAULT_ECHAR`
			`return csv__array_parse(fields, line, nbfields, colsep, qchar, echar)`
			`}`
			`function parsecsv(line, fields) {`
			`array_parsecsv(fields, line)`
			`array_getline(fields)`
			`return NF`
			`}`
			`function getlinecsv(file, fields) {`
			`if (file) {`
			`getline <file`
			`} else {`
			`getline`
			`}`
			`return parsecsv($0)`
			`}`
			`function csv__should_quote(s) {`
			`if (s ~ /^[[:blank:][:cntrl:][:space:]]/) return 1`
			`if (s ~ /[[:blank:][:cntrl:][:space:]]$/) return 1`
			`return 0`
			`}`
			`function array_formatcsv2(fields, colsep, mvsep, qchar, echar, count, indices, line, i, value) {`
			`line = ""`
			`count = mkindices(fields, indices)`
			`for (i = 1; i <= count; i++) {`
			`value = fields[indices[i]]`
			`if (i > 1) line = line colsep`
			`if (qchar != "" && index(value, qchar) != 0) {`
			`if (echar != "") gsub(qchar, quote_subrepl(echar) "&", value);`
			`else gsub(qchar, "&&", value);`
			`}`
			`if (qchar != "" && (index(value, mvsep) != 0 \|\| index(value, colsep) != 0 \|\| index(value, qchar) != 0 \|\| csv__should_quote(value))) {`
			`line = line qchar value qchar`
			`} else {`
			`line = line value`
			`}`
			`}`
			`return line`
			`}`
			`function array_formatcsv(fields) {`
			`return array_formatcsv2(fields, ",", ";", "\"", "")`
			`}`
			`function array_printcsv(fields, output) {`
			`printto(array_formatcsv(fields), output)`
			`}`
			`function get_formatcsv( fields) {`
			`array_fill(fields)`
			`return array_formatcsv(fields)`
			`}`
			`function formatcsv() {`
			`$0 = get_formatcsv()`
			`}`
			`function printcsv(output, fields) {`
			`array_fill(fields)`
			`array_printcsv(fields, output)`
			`}`
			`function array_findcsv(fields, input, field, value, nbfields, orig, found, i) {`
			`array_new(orig)`
			`array_fill(orig)`
			`array_new(fields)`
			`found = 0`
			`while ((getline <input) > 0) {`
			`array_parsecsv(fields, $0, nbfields)`
			`if (fields[field] == value) {`
			`found = 1`
			`break`
			`}`
			`}`
			`close(input)`
			`array_getline(orig)`
			`if (!found) {`
			`delete fields`
			`if (nbfields) {`
			`nbfields = int(nbfields)`
			`i = array_len(fields)`
			`while (i < nbfields) {`
			`i++`
			`fields[i] = ""`
			`}`
			`}`
			`}`
			`return found`
			`}`