Update encoding to use bytes for strings.

This commit is contained in:
Jonathan Springer 2020-01-03 13:33:15 -05:00
commit 0debfdcc80
2 changed files with 15 additions and 11 deletions

View file

@@ -5,6 +5,8 @@ from __future__ import (
unicode_literals, unicode_literals,
) )
from builtins import bytes
import os import os
from six import text_type from six import text_type
@@ -13,7 +15,7 @@ import core
from core import logger from core import logger
def char_replace(name): def char_replace(name_in):
# Special character hex range: # Special character hex range:
# CP850: 0x80-0xA5 (fortunately not used in ISO-8859-15) # CP850: 0x80-0xA5 (fortunately not used in ISO-8859-15)
# UTF-8: 1st hex code 0xC2-0xC3 followed by a 2nd hex code 0xA1-0xFF # UTF-8: 1st hex code 0xC2-0xC3 followed by a 2nd hex code 0xA1-0xFF
@@ -22,36 +24,38 @@ def char_replace(name):
# If there is special character, detects if it is a UTF-8, CP850 or ISO-8859-15 encoding # If there is special character, detects if it is a UTF-8, CP850 or ISO-8859-15 encoding
encoded = False encoded = False
encoding = None encoding = None
if isinstance(name, text_type): if isinstance(name_in, text_type):
return encoded, name.encode(core.SYS_ENCODING) return encoded, name_in.encode(core.SYS_ENCODING)
name=bytes(name_in)
for Idx in range(len(name)): for Idx in range(len(name)):
print('Trying to intuit the encoding')
# /!\ detection is done 2char by 2char for UTF-8 special character # /!\ detection is done 2char by 2char for UTF-8 special character
if (len(name) != 1) & (Idx < (len(name) - 1)): if (len(name) != 1) & (Idx < (len(name) - 1)):
# Detect UTF-8 # Detect UTF-8
if ((name[Idx] == '\xC2') | (name[Idx] == '\xC3')) & ( if ((name[Idx] == 0xC2) | (name[Idx] == 0xC3)) & (
(name[Idx + 1] >= '\xA0') & (name[Idx + 1] <= '\xFF')): (name[Idx + 1] >= 0xA0) & (name[Idx + 1] <= 0xFF)):
encoding = 'utf-8' encoding = 'utf-8'
break break
# Detect CP850 # Detect CP850
elif (name[Idx] >= '\x80') & (name[Idx] <= '\xA5'): elif (name[Idx] >= 0x80) & (name[Idx] <= 0xA5):
encoding = 'cp850' encoding = 'cp850'
break break
# Detect ISO-8859-15 # Detect ISO-8859-15
elif (name[Idx] >= '\xA6') & (name[Idx] <= '\xFF'): elif (name[Idx] >= 0xA6) & (name[Idx] <= 0xFF):
encoding = 'iso-8859-15' encoding = 'iso-8859-15'
break break
else: else:
# Detect CP850 # Detect CP850
if (name[Idx] >= '\x80') & (name[Idx] <= '\xA5'): if (name[Idx] >= 0x80) & (name[Idx] <= 0xA5):
encoding = 'cp850' encoding = 'cp850'
break break
# Detect ISO-8859-15 # Detect ISO-8859-15
elif (name[Idx] >= '\xA6') & (name[Idx] <= '\xFF'): elif (name[Idx] >= 0xA6) & (name[Idx] <= 0xFF):
encoding = 'iso-8859-15' encoding = 'iso-8859-15'
break break
if encoding and not encoding == core.SYS_ENCODING: if encoding and not encoding == core.SYS_ENCODING:
encoded = True encoded = True
name = name.decode(encoding).encode(core.SYS_ENCODING) name_enc = name.decode(encoding).encode(core.SYS_ENCODING)
return encoded, name return encoded, name

View file

@@ -1,4 +1,4 @@
#!/usr/bin/env python #!/usr/bin/env python3
# coding=utf-8 # coding=utf-8
# #
############################################################################## ##############################################################################