nutools/lib/nulib/python/nulib/encdetect.py

153 lines
5.1 KiB
Python

# -*- coding: utf-8 -*- vim:sw=4:sts=4:et:ai:si:sta:fenc=utf-8
"""Fonctions pour détecter l'encoding d'une chaine ou d'un fichier, et/ou tester
si c'est un fichier binaire.
"""
# Public API of this module: names exported by "from ... import *".
__all__ = (
    # fallback values returned when nothing could be guessed
    'UNRECOGNIZED_ENCODING',
    'UNKNOWN_ENCODING',
    # guessing from the content
    'guess_encoding',
    'guess_string_encoding',
    'guess_stream_encoding',
    # working on lists of lines
    'detect_line_encoding',
    'guess_line_encoding',
    # file type helper
    'FileType',
)
from os import path
import re
from .base import isstr, make_prop
from .encoding import LATIN1, UTF8, MACROMAN
from .env import get_default_encoding
# The following tables hold the encoded form of these characters, in this
# order:
# àâçèéêîïñôû
ISO_8859_1_CHARS = [
    0xe0, 0xe2, 0xe7, 0xe8, 0xe9, 0xea,
    0xee, 0xef, 0xf1, 0xf4, 0xfb,
]
MAC_ROMAN_CHARS = [
    0x88, 0x89, 0x8d, 0x8f, 0x8e, 0x90,
    0x94, 0x95, 0x96, 0x99, 0x9e,
]
# In UTF-8 the sequence is 0xc3 followed by one of the bytes of this table
UTF_8_CHARS = [
    0xa0, 0xa2, 0xa7, 0xa8, 0xa9, 0xaa,
    0xae, 0xaf, 0xb1, 0xb4, 0xbb,
]

# Default fallback values returned by the guess_* functions
UNKNOWN_ENCODING = "Unknown"
UNRECOGNIZED_ENCODING = "Unrecognized"


def guess_string_encoding(ins, unknown=UNKNOWN_ENCODING, unrecognized=UNRECOGNIZED_ENCODING):
    """Guess the encoding of a string from the accented characters it contains.

    The data is scanned item by item, looking for the encoded form of one of
    the characters listed in UTF_8_CHARS (after a 0xc3 lead byte),
    ISO_8859_1_CHARS or MAC_ROMAN_CHARS. The first match decides the result.

    ins -- the data to examine. Items may be one-character strings (they are
        converted with ord()) or integers (as yielded by indexing a Python 3
        bytes object).
    unknown -- value returned when the data only holds ASCII, i.e. there is
        nothing to guess from. When None, get_default_encoding() is returned
        instead.
    unrecognized -- value returned when the data holds non-ASCII items that
        match none of the known patterns.

    Returns UTF8, LATIN1, MACROMAN, unrecognized, or the "unknown" fallback.
    """
    def byte_at(pos):
        # Accept both integer items and one-character strings
        item = ins[pos]
        if isinstance(item, int):
            return item
        return ord(item)

    seen_non_ascii = False
    length = len(ins)
    pos = 0
    while pos < length:
        b = byte_at(pos)
        if b >= 128:
            seen_non_ascii = True
        if b == 0xc3:
            # Only look at the following byte when there is one: a 0xc3 at
            # the very end of the data used to raise IndexError.
            if pos + 1 < length and byte_at(pos + 1) in UTF_8_CHARS:
                return UTF8
        elif b in ISO_8859_1_CHARS:
            return LATIN1
        elif b in MAC_ROMAN_CHARS:
            return MACROMAN
        elif seen_non_ascii:
            return unrecognized
        pos += 1
    if seen_non_ascii:
        # Only reachable when the data ends with 0xc3 byte(s) that do not
        # start a known UTF-8 sequence: non-ASCII data, but nothing matched.
        return unrecognized
    if unknown is None:
        return get_default_encoding()
    return unknown
def guess_stream_encoding(inf, unknown=UNKNOWN_ENCODING, unrecognized=UNRECOGNIZED_ENCODING):
    """Guess the encoding of the whole content of a file.

    inf -- either an object with a read() method, or a value for which
        isstr() is true; in the latter case it is passed to open() in binary
        mode, and the file is closed before returning.
    unknown, unrecognized -- passed unchanged to guess_string_encoding().

    Returns what guess_string_encoding() returns for the data that was read.
    A stream supplied by the caller is left open.
    """
    opened_here = isstr(inf)
    if opened_here:
        inf = open(inf, 'rb')
    try:
        data = inf.read()
        return guess_string_encoding(data, unknown, unrecognized)
    finally:
        # Only close what this function opened itself
        if opened_here:
            inf.close()
def guess_encoding(ins=None, inf=None, unknown=None, unrecognized=UNRECOGNIZED_ENCODING):
    """Guess an encoding from a string or, failing that, from a file.

    ins -- data to examine with guess_string_encoding(); takes precedence
        over inf when both are given.
    inf -- file to examine with guess_stream_encoding(), only used when ins
        is None.
    unknown, unrecognized -- passed unchanged to the function that is called.

    When neither ins nor inf is given, the value of unknown is returned as is
    (None by default).
    """
    if ins is not None:
        return guess_string_encoding(ins, unknown, unrecognized)
    if inf is not None:
        return guess_stream_encoding(inf, unknown, unrecognized)
    return unknown
# Matches, case-insensitively, a declaration of the form "coding: NAME" or
# "encoding: NAME"; group 1 captures NAME.
RE_ENCODING = re.compile(r'(?i)\b(?:en)?coding: (\S+)\b')


def detect_line_encoding(lines, examine_lines=10):
    """Look for an explicit encoding declaration in a list of lines.

    Only the first and the last examine_lines lines are searched with
    RE_ENCODING. When there are fewer than 2 * examine_lines lines, all of
    them are searched (once).

    lines -- a sequence of lines supporting len() and slicing.
    examine_lines -- number of lines to look at from each end. A value that
        is zero or negative means that nothing is examined.

    Returns the declared encoding name, or None if no declaration is found.
    """
    if examine_lines <= 0:
        # Without this guard, lines[-0:] would select *every* line, and a
        # negative count would select incoherent slices.
        return None
    nb_lines = len(lines)
    if nb_lines < 2 * examine_lines:
        examine_lines = nb_lines
    # Head first, then tail when it has not already been covered by the head
    chunks = [lines[:examine_lines]]
    if nb_lines > examine_lines:
        chunks.append(lines[-examine_lines:])
    for chunk in chunks:
        for line in chunk:
            mo = RE_ENCODING.search(line)
            if mo is not None:
                return mo.group(1)
    return None
# Private sentinels handed to guess_string_encoding() so that its "nothing to
# guess from" and "no pattern matched" outcomes can be told apart from any
# value the caller may pass as unknown/unrecognized.
_UNKNOWN = object()
_UNRECOGNIZED = object()


def guess_line_encoding(lines, unknown=None, unrecognized=UNRECOGNIZED_ENCODING):
    """Guess an encoding by examining lines one after the other.

    Each line is given to guess_string_encoding(). Lines for which the result
    is "unknown" are skipped; the first line giving any other result decides.

    lines -- an iterable of strings.
    unknown -- value returned when no line allowed a decision. When None,
        get_default_encoding() is returned instead.
    unrecognized -- value returned when the deciding line matches no known
        pattern.
    """
    for line in lines:
        verdict = guess_string_encoding(line, _UNKNOWN, _UNRECOGNIZED)
        if verdict is _UNKNOWN:
            continue
        if verdict is _UNRECOGNIZED:
            return unrecognized
        return verdict
    if unknown is not None:
        return unknown
    return get_default_encoding()
class FileType(object):
    """An object used to determine the type of a file:
    - text or binary
    - encoding

    XXX finish this class, and integrate the functions from paths
    """
    # NOTE(review): make_prop comes from .base and is not visible here. The
    # first two items of its result are bound to a private name and a public
    # name -- presumably a default value and a property; confirm in .base.
    _check_ext, check_ext = make_prop('_check_ext', True)[:2]
    _check_content, check_content = make_prop('_check_content', True)[:2]
    _file, file = make_prop('_file')[:2]

    def __init__(self, file):
        """Remember the path of the file to examine."""
        self._file = file

    def is_binary(self):
        """Tell whether the file is binary, remembering the answer.

        The extension is tried first (when check_ext is true), then the
        content (when check_content is true and no answer was found yet).

        Returns the answer, or None when it could not be determined. A
        non-None answer is stored in self._binary.
        """
        binary = self._binary
        if binary is None and self.check_ext:
            binary = self.is_binary_ext(self.file)
        # This used to read "self.check_context", an attribute that is never
        # defined, and raised AttributeError whenever this test was reached.
        if binary is None and self.check_content:
            content = self.get_content(self.file)
            binary = self.is_binary_content(content)
        if binary is not None:
            self._binary = binary
        return binary
    _binary, binary = make_prop('_binary', getter=is_binary)[:2]

    def is_binary_ext(self, file):
        """Tell whether the name of file denotes a binary file.

        file -- path of the file; only its last component is examined.

        Returns True for '.DS_Store', for versioned shared objects such as
        'libfoo.so.1.2', and for a fixed list of extensions compared
        case-insensitively; False otherwise.
        """
        _, filename = path.split(file)
        _, ext = path.splitext(filename)
        if filename == '.DS_Store':
            return True
        # The list used to hold a '.so.*' entry, which could never be equal
        # to what splitext() returns. Versioned shared objects are matched
        # explicitly instead.
        if re.search(r'\.so(?:\.\d+)+$', filename.lower()) is not None:
            return True
        return ext.lower() in (
            # executables and object files
            '.bin', '.com', '.co_', '.exe', '.ex_', '.dll',
            '.pyc', '.pyd', '.pyo', '.class',
            '.o', '.so', '.lib', '.ovl',
            # archives
            '.gz', '.bz2', '.tar', '.tgz', '.tbz2',
            '.hqx', '.sit', '.zip', '.jar', '.rpm', '.srpm', '.deb',
            # multimedia
            '.bmp', '.gif', '.png', '.jpeg', '.jpg', '.tif', '.tiff',
            '.xbm', '.icns', '.ico', '.avi', '.mov', '.mpg', '.swf',
            '.mp3', '.snd', '.ogg', '.dat',
            # documents
            '.doc', '.ppt', '.xls', '.pdf',
            # miscellaneous
            '.bpt', '.bro', '.eps', '.fm', '.ins', '.mcp', '.objectplant',
            '.ofp', '.opn', '.pqg', '.prj', '.ps', '.sl', '.strings', '.wordbreak',
        )

    def get_content(self, file):
        """Not implemented yet: always returns None."""
        pass #XXX

    def is_binary_content(self, content):
        """Not implemented yet: always returns None."""
        pass #XXX