153 lines
5.1 KiB
Python
153 lines
5.1 KiB
Python
# -*- coding: utf-8 -*- vim:sw=4:sts=4:et:ai:si:sta:fenc=utf-8
|
|
|
|
"""Functions to detect the encoding of a string or of a file, and/or to test
whether a file is binary.
"""
|
|
|
|
# Names exported by ``from <this module> import *``.
__all__ = (
    'UNRECOGNIZED_ENCODING',
    'UNKNOWN_ENCODING',
    'guess_encoding',
    'guess_string_encoding',
    'guess_stream_encoding',
    'detect_line_encoding',
    'guess_line_encoding',
    'FileType',
)
|
|
|
|
from os import path
|
|
import re
|
|
|
|
from .base import isstr, make_prop
|
|
from .encoding import LATIN1, UTF8, MACROMAN
|
|
from .env import get_default_encoding
|
|
|
|
# The following tables hold, for each encoding, the byte values of these
# characters:
# àâçèéêîïñôû
|
|
|
|
# Byte values of a few accented characters when encoded in ISO-8859-1.
ISO_8859_1_CHARS = [
    0xe0,  # à
    0xe2,  # â
    0xe7,  # ç
    0xe8,  # è
    0xe9,  # é
    0xea,  # ê
    0xee,  # î
    0xef,  # ï
    0xf1,  # ñ
    0xf4,  # ô
    0xfb,  # û
]
|
|
|
|
# Byte values of the same accented characters when encoded in Mac OS Roman.
MAC_ROMAN_CHARS = [
    0x88,  # à
    0x89,  # â
    0x8d,  # ç
    0x8f,  # è
    0x8e,  # é
    0x90,  # ê
    0x94,  # î
    0x95,  # ï
    0x96,  # ñ
    0x99,  # ô
    0x9e,  # û
]
|
|
|
|
# In UTF-8 each of these characters is the two-byte sequence 0xc3 followed by
# one of the values of this table.
UTF_8_CHARS = [
    0xa0,  # à
    0xa2,  # â
    0xa7,  # ç
    0xa8,  # è
    0xa9,  # é
    0xaa,  # ê
    0xae,  # î
    0xaf,  # ï
    0xb1,  # ñ
    0xb4,  # ô
    0xbb,  # û
]
|
|
|
|
# Default markers returned by the guess_* functions when no encoding could be
# named: UNKNOWN_ENCODING when the scan ended without any decision,
# UNRECOGNIZED_ENCODING when non-ASCII data matched none of the tables.
UNKNOWN_ENCODING = "Unknown"
UNRECOGNIZED_ENCODING = "Unrecognized"


def guess_string_encoding(ins, unknown=UNKNOWN_ENCODING, unrecognized=UNRECOGNIZED_ENCODING):
    """Guess the encoding of ins from the first accented character it contains.

    ins is scanned from the start and the function returns as soon as a
    decision can be made:

    - UTF8 when 0xc3 is followed by a value of UTF_8_CHARS;
    - LATIN1 when a value of ISO_8859_1_CHARS is met;
    - MACROMAN when a value of MAC_ROMAN_CHARS is met;
    - unrecognized when non-ASCII data has been seen and the current item
      matches none of the tables, or when ins ends with a lone 0xc3.

    When the scan reaches the end without a decision, unknown is returned, or
    the result of get_default_encoding() if unknown is None.

    ins may be a byte string or a text string: items that are already
    integers (Python 3 bytes) are used as is, the others go through ord().
    """
    seen_non_ascii = False
    last = len(ins) - 1
    for pos, item in enumerate(ins):
        # Indexing/iterating Python 3 bytes yields ints; str yields 1-char strings.
        byte = item if isinstance(item, int) else ord(item)
        if byte >= 128:
            seen_non_ascii = True
        if byte == 0xc3:
            if pos == last:
                # 0xc3 with nothing after it: there is no second byte to look
                # at (the previous code raised IndexError here). Non-ASCII
                # data was seen and nothing matched, hence "unrecognized".
                return unrecognized
            follower = ins[pos + 1]
            if not isinstance(follower, int):
                follower = ord(follower)
            if follower in UTF_8_CHARS:
                return UTF8
            # Otherwise keep scanning: the following item is examined on its
            # own at the next iteration.
        elif byte in ISO_8859_1_CHARS:
            return LATIN1
        elif byte in MAC_ROMAN_CHARS:
            return MACROMAN
        elif seen_non_ascii:
            return unrecognized
    if unknown is None:
        return get_default_encoding()
    return unknown
|
|
|
|
def guess_stream_encoding(inf, unknown=UNKNOWN_ENCODING, unrecognized=UNRECOGNIZED_ENCODING):
    """Guess the encoding of the whole content of a file.

    inf is either a file name (anything for which isstr() is true), in which
    case the file is opened in binary mode and closed before returning, or an
    object with a read() method, which is read but left open.

    The content is handed to guess_string_encoding() together with unknown
    and unrecognized, and its result is returned.
    """
    if not isstr(inf):
        # Caller-owned stream: read it, never close it.
        return guess_string_encoding(inf.read(), unknown, unrecognized)
    with open(inf, 'rb') as stream:
        return guess_string_encoding(stream.read(), unknown, unrecognized)
|
|
|
|
def guess_encoding(ins=None, inf=None, unknown=None, unrecognized=UNRECOGNIZED_ENCODING):
    """Guess an encoding from a string (ins) or from a file/stream (inf).

    ins takes precedence: when it is not None, guess_string_encoding() is
    used. Otherwise, when inf is not None, guess_stream_encoding() is used.
    When both are None, unknown is returned unchanged.
    """
    if ins is not None:
        return guess_string_encoding(ins, unknown, unrecognized)
    if inf is not None:
        return guess_stream_encoding(inf, unknown, unrecognized)
    return unknown
|
|
|
|
# Matches "coding: NAME" or "encoding: NAME" (case-insensitive); group 1 is NAME.
RE_ENCODING = re.compile(r'(?i)\b(?:en)?coding: (\S+)\b')


def detect_line_encoding(lines, examine_lines=10):
    """Look for an explicit "coding: NAME" declaration in lines.

    lines is a sequence of strings (it must support len() and slicing). Only
    the first examine_lines and the last examine_lines lines are searched, in
    that order; when there are fewer than 2 * examine_lines lines, all of them
    are searched once.

    Return the declared name (group 1 of the first RE_ENCODING match), or None
    if no declaration is found. A zero or negative examine_lines searches
    nothing and returns None.
    """
    if examine_lines <= 0:
        # Without this guard the tail slice lines[-0:] is the whole list, so
        # asking for zero lines used to scan every line.
        return None
    nb_lines = len(lines)
    if nb_lines < 2 * examine_lines:
        # Head and tail windows would overlap: scan everything, once.
        windows = (lines,)
    else:
        windows = (lines[:examine_lines], lines[-examine_lines:])
    for window in windows:
        for line in window:
            match = RE_ENCODING.search(line)
            if match is not None:
                return match.group(1)
    return None
|
|
|
|
# Private sentinels handed to guess_string_encoding() so that its "unknown" and
# "unrecognized" outcomes can be told apart from any real return value.
_UNKNOWN = object()
_UNRECOGNIZED = object()


def guess_line_encoding(lines, unknown=None, unrecognized=UNRECOGNIZED_ENCODING):
    """Guess an encoding by applying guess_string_encoding() to each line.

    Lines for which nothing can be decided are skipped. The first line that
    yields a decision ends the search: unrecognized is returned if the line
    was not recognized, otherwise the encoding that was found.

    When no line yields a decision, return unknown, or the result of
    get_default_encoding() if unknown is None.
    """
    for line in lines:
        verdict = guess_string_encoding(line, _UNKNOWN, _UNRECOGNIZED)
        if verdict is _UNRECOGNIZED:
            return unrecognized
        if verdict is not _UNKNOWN:
            return verdict
    if unknown is None:
        return get_default_encoding()
    return unknown
|
|
|
|
class FileType(object):
    """Helper object used to determine the type of a file:
    - text or binary
    - encoding

    XXX finish this class, and integrate the functions from paths
    """
    # make_prop comes from .base; only the first two items it returns are kept
    # and bound to the private and the public name.
    # NOTE(review): presumably (initial value, property object) -- confirm
    # against .base.make_prop.
    _check_ext, check_ext = make_prop('_check_ext', True)[:2]
    _check_content, check_content = make_prop('_check_content', True)[:2]
    _file, file = make_prop('_file')[:2]

    def __init__(self, file):
        self._file = file

    def is_binary(self):
        """Tell whether the file is binary.

        The value stored in self._binary is used when it is not None.
        Otherwise the extension is checked if check_ext is set, then the
        content if check_content is set and the answer is still None. An
        answer other than None is stored back into self._binary.
        """
        binary = self._binary
        if binary is None and self.check_ext:
            binary = self.is_binary_ext(self.file)
        # Was self.check_context, an attribute that is never defined: it
        # raised AttributeError whenever the extension check was disabled.
        if binary is None and self.check_content:
            content = self.get_content(self.file)
            binary = self.is_binary_content(content)
        if binary is not None:
            self._binary = binary
        return binary
    # Must stay after is_binary, which is passed as the getter.
    _binary, binary = make_prop('_binary', getter=is_binary)[:2]

    def is_binary_ext(self, file):
        """Return True if the name of file denotes a binary file.

        This is the case for a file named '.DS_Store' and for a file whose
        extension, compared case-insensitively, is in the list below.
        Return False otherwise.
        """
        _, filename = path.split(file)
        _, ext = path.splitext(filename)
        if filename == '.DS_Store':
            return True
        return ext.lower() in (
            # executables and object files
            # NOTE(review): '.so.*' can never be equal to an extension
            # returned by path.splitext(); kept as is, behavior unchanged.
            '.bin', '.com', '.co_', '.exe', '.ex_', '.dll',
            '.pyc', '.pyd', '.pyo', '.class',
            '.o', '.so', '.so.*', '.lib', '.ovl',
            # archives
            '.gz', '.bz2', '.tar', '.tgz', '.tbz2',
            '.hqx', '.sit', '.zip', '.jar', '.rpm', '.srpm', '.deb',
            # multimedia
            '.bmp', '.gif', '.png', '.jpeg', '.jpg', '.tif', '.tiff',
            '.xbm', '.icns', '.ico', '.avi', '.mov', '.mpg', '.swf',
            '.mp3', '.snd', '.ogg', '.dat',
            # documents
            '.doc', '.ppt', '.xls', '.pdf',
            # miscellaneous
            '.bpt', '.bro', '.eps', '.fm', '.ins', '.mcp', '.objectplant',
            '.ofp', '.opn', '.pqg', '.prj', '.ps', '.sl', '.strings', '.wordbreak',
        )

    def get_content(self, file):
        """Not implemented yet: always returns None."""
        pass  # XXX

    def is_binary_content(self, content):
        """Not implemented yet: always returns None."""
        pass  # XXX
|