nutools/lib/nulib/python/nulib/encdetect.py

# -*- coding: utf-8 -*- vim:sw=4:sts=4:et:ai:si:sta:fenc=utf-8

"""Fonctions pour détecter l'encoding d'une chaine ou d'un fichier, et/ou tester
si c'est un fichier binaire.
"""

# Public names of this module (what ``from ... import *`` exposes).
__all__ = ('UNRECOGNIZED_ENCODING', 'UNKNOWN_ENCODING',
           'guess_encoding', 'guess_string_encoding', 'guess_stream_encoding',
           'detect_line_encoding', 'guess_line_encoding',
           'FileType',
           )

from os import path
import re

from .base import isstr, make_prop
from .encoding import LATIN1, UTF8, MACROMAN
from .env import get_default_encoding

# The following tables hold the byte values of these characters:
# àâçèéêîïñôû

# Single-byte values of the characters above in ISO-8859-1 (Latin-1).
ISO_8859_1_CHARS = [
    0xe0, 0xe2, 0xe7, 0xe8, 0xe9, 0xea,
    0xee, 0xef, 0xf1, 0xf4, 0xfb,
]

# Single-byte values of the characters above in Mac Roman.
MAC_ROMAN_CHARS = [
    0x88, 0x89, 0x8d, 0x8f, 0x8e, 0x90,
    0x94, 0x95, 0x96, 0x99, 0x9e,
]

# In UTF-8 the sequence is 0xc3 followed by one of the bytes of this table.
UTF_8_CHARS = [
    0xa0, 0xa2, 0xa7, 0xa8, 0xa9, 0xaa,
    0xae, 0xaf, 0xb1, 0xb4, 0xbb,
]

# Default result when the data is pure ASCII, so nothing can be inferred.
UNKNOWN_ENCODING = "Unknown"
# Default result when the data holds non-ASCII bytes matching no known table.
UNRECOGNIZED_ENCODING = "Unrecognized"

def _byte_value(item):
    """Return the integer value of one element of a byte or character string."""
    # Indexing/iterating ``bytes`` yields ints on Python 3, whereas Python 2
    # strings (and text strings) yield 1-character strings that need ord().
    if isinstance(item, int): return item
    return ord(item)

def guess_string_encoding(ins, unknown=UNKNOWN_ENCODING, unrecognized=UNRECOGNIZED_ENCODING):
    """Guess the encoding of a string from the first accented character found.

    The string is scanned from the start. ASCII bytes are skipped; the first
    non-ASCII byte decides the result:

    - 0xc3 followed by a byte of UTF_8_CHARS: return UTF8;
    - 0xc3 followed by any other byte: the scan goes on with that next byte,
      which must then itself be recognized;
    - a byte of ISO_8859_1_CHARS: return LATIN1;
    - a byte of MAC_ROMAN_CHARS: return MACROMAN;
    - anything else (including a 0xc3 that is the very last byte, which used
      to raise IndexError): return `unrecognized`.

    ins -- the data to examine; a byte string (elements may be 1-character
        strings or ints) or a text string.
    unknown -- value returned when the data is pure ASCII (or empty). When it
        is None, the result of get_default_encoding() is returned instead.
    unrecognized -- value returned when non-ASCII data matches no table.
    """
    only_ascii = True
    last = len(ins) - 1
    for pos, item in enumerate(ins):
        code = _byte_value(item)
        # Plain ASCII carries no information as long as nothing else was seen.
        if only_ascii and code < 128:
            continue
        only_ascii = False
        if code == 0xc3:
            # A lead byte with nothing after it cannot be a UTF-8 sequence,
            # and 0xc3 belongs to neither single-byte table.
            if pos == last:
                return unrecognized
            if _byte_value(ins[pos + 1]) in UTF_8_CHARS:
                return UTF8
            # Otherwise keep scanning: the next byte is examined on its own.
        elif code in ISO_8859_1_CHARS:
            return LATIN1
        elif code in MAC_ROMAN_CHARS:
            return MACROMAN
        else:
            return unrecognized
    if unknown is None:
        return get_default_encoding()
    return unknown

def guess_stream_encoding(inf, unknown=UNKNOWN_ENCODING, unrecognized=UNRECOGNIZED_ENCODING):
    """Guess the encoding of the whole content of a file.

    inf -- either a file name (as recognized by isstr()), which is opened in
        binary mode and closed again before returning, or an object with a
        read() method, which is read but left open for the caller.
    unknown, unrecognized -- passed through to guess_string_encoding().

    Returns whatever guess_string_encoding() returns for the content.
    """
    if not isstr(inf):
        # Caller-owned stream: read it, never close it.
        return guess_string_encoding(inf.read(), unknown, unrecognized)

    stream = open(inf, 'rb')
    try:
        return guess_string_encoding(stream.read(), unknown, unrecognized)
    finally:
        stream.close()

def guess_encoding(ins=None, inf=None, unknown=None, unrecognized=UNRECOGNIZED_ENCODING):
    """Guess an encoding from a string or, failing that, from a file.

    ins -- data to examine; takes precedence over `inf` when not None.
    inf -- file name or stream to examine when `ins` is None.
    unknown, unrecognized -- passed through to the underlying function.

    When both `ins` and `inf` are None, `unknown` is returned as is (so None
    by default).
    """
    if ins is not None:
        return guess_string_encoding(ins, unknown, unrecognized)
    if inf is not None:
        return guess_stream_encoding(inf, unknown, unrecognized)
    return unknown

# Matches a declaration such as "coding: utf-8" or "Encoding: latin-1"
# (case-insensitive, exactly one space after the colon).
RE_ENCODING = re.compile(r'(?i)\b(?:en)?coding: (\S+)\b')
def detect_line_encoding(lines, examine_lines=10):
    """Look for an explicit encoding declaration in a list of lines.

    Only the first and the last `examine_lines` lines are searched, first
    lines first; when there are fewer than twice that many lines, all of them
    are searched once.

    lines -- a sequence of lines supporting len() and slicing.
    examine_lines -- size of the window examined at each end. Zero or a
        negative value examines nothing (previously 0 caused every line to
        be scanned, because ``lines[-0:]`` is the whole sequence).

    Returns the declared encoding name (group 1 of RE_ENCODING) of the first
    matching line, or None when no declaration is found.
    """
    total = len(lines)
    window = max(examine_lines, 0)
    if total < 2 * window:
        # The two windows would overlap: scan everything exactly once.
        head, tail = lines, []
    else:
        # Explicit start index avoids the ``lines[-0:]`` pitfall.
        head, tail = lines[:window], lines[total - window:]

    for chunk in (head, tail):
        for line in chunk:
            mo = RE_ENCODING.search(line)
            if mo is not None:
                return mo.group(1)
    return None

# Private sentinels used to tell the two "no answer" results of
# guess_string_encoding() apart from any real encoding name.
_UNKNOWN = object()
_UNRECOGNIZED = object()
def guess_line_encoding(lines, unknown=None, unrecognized=UNRECOGNIZED_ENCODING):
    """Guess the encoding of a list of lines.

    Each line is passed in turn to guess_string_encoding(). Lines for which
    nothing can be inferred are skipped; the first line giving any other
    result decides: `unrecognized` is returned if that line was not
    recognized, otherwise the guessed encoding.

    When no line is conclusive, `unknown` is returned, or the result of
    get_default_encoding() when `unknown` is None.
    """
    for text in lines:
        guessed = guess_string_encoding(text, _UNKNOWN, _UNRECOGNIZED)
        if guessed is _UNRECOGNIZED:
            return unrecognized
        if guessed is not _UNKNOWN:
            return guessed
    if unknown is not None:
        return unknown
    return get_default_encoding()

# Versioned shared objects such as "libfoo.so.1" or "libfoo.so.1.2.3": their
# splitext() extension is only the last numeric part, so they cannot be
# matched through the extension list.
_RE_VERSIONED_SO = re.compile(r'(?i)\.so(?:\.\d+)+$')

class FileType(object):
    """An object used to determine the type of a file:
    - text or binary
    - encoding

    XXX finish this class, and integrate the functions of paths
    """
    # NOTE(review): make_prop() comes from .base; its first two results are
    # presumably the default value and the matching property -- confirm there.
    _check_ext, check_ext = make_prop('_check_ext', True)[:2]
    _check_content, check_content = make_prop('_check_content', True)[:2]
    _file, file = make_prop('_file')[:2]

    def __init__(self, file):
        """Remember the path of the file to examine."""
        self._file = file

    def is_binary(self):
        """Tell whether the file is binary, caching the answer in _binary.

        The extension is tested first when check_ext is true; the content is
        tested only if no answer was obtained and check_content is true.
        Since is_binary_ext() always answers True or False, the content is
        only consulted when the extension test is disabled.

        Returns the answer, or None when it could not be determined.
        """
        binary = self._binary
        if binary is None and self.check_ext:
            binary = self.is_binary_ext(self.file)
        # Was self.check_context: no such attribute exists on this class.
        if binary is None and self.check_content:
            content = self.get_content(self.file)
            binary = self.is_binary_content(content)
        if binary is not None:
            self._binary = binary
        return binary
    _binary, binary = make_prop('_binary', getter=is_binary)[:2]

    def is_binary_ext(self, file):
        """Tell from its name alone whether `file` is a binary file.

        Returns True for '.DS_Store', for versioned shared objects
        ('.so.N[.N...]') and for names whose extension, compared in lower
        case, is in the list below; False otherwise.
        """
        _, filename = path.split(file)
        _, ext = path.splitext(filename)
        if filename == '.DS_Store': return True
        # Replaces the former '.so.*' list entry, which could never be equal
        # to an extension returned by splitext().
        if _RE_VERSIONED_SO.search(filename) is not None: return True
        return ext.lower() in (
            # executables and object files
            '.bin', '.com', '.co_', '.exe', '.ex_', '.dll',
            '.pyc', '.pyd', '.pyo', '.class',
            '.o', '.so', '.lib', '.ovl',
            # archives
            '.gz', '.bz2', '.tar', '.tgz', '.tbz2',
            '.hqx', '.sit', '.zip', '.jar', '.rpm', '.srpm', '.deb',
            # multimedia
            '.bmp', '.gif', '.png', '.jpeg', '.jpg', '.tif', '.tiff',
            '.xbm', '.icns', '.ico', '.avi', '.mov', '.mpg', '.swf',
            '.mp3', '.snd', '.ogg', '.dat',
            # documents
            '.doc', '.ppt', '.xls', '.pdf',
            # miscellaneous
            '.bpt', '.bro', '.eps', '.fm', '.ins', '.mcp', '.objectplant',
            '.ofp', '.opn','.pqg', '.prj', '.ps', '.sl', '.strings', '.wordbreak',
            )

    def get_content(self, file):
        """Not implemented yet: currently returns None."""
        pass #XXX

    def is_binary_content(self, content):
        """Not implemented yet: currently returns None (no answer)."""
        pass #XXX