208 lines
6.6 KiB
Python
208 lines
6.6 KiB
Python
# -*- coding: utf-8 -*- #
|
|
|
|
# Copyright 2015 Google LLC. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
"""A module for dealing with unknown string and environment encodings."""
|
|
|
|
from __future__ import absolute_import
|
|
from __future__ import division
|
|
from __future__ import unicode_literals
|
|
|
|
import sys
|
|
|
|
import six
|
|
|
|
|
|
def Encode(string, encoding=None):
  """Turns a text string into a byte string when running on Python 2.

  Args:
    string: str, The text string to encode. None and values that are already
      byte strings are handed back untouched.
    encoding: The suggested encoding if known. When it is falsy the result of
      _GetEncoding() is used instead.

  Returns:
    None if string is None. On Python 3 the string itself, unchanged. On
    Python 2 the byte string produced by encoding string (or string itself if
    it already was a byte string).
  """
  if string is None:
    return None

  # Two pass-through cases share one exit:
  #  - Python 3: the environment sets and gets accept and return text strings
  #    only and handle the encoding themselves, so no encoding is needed.
  #  - Python 2 with a value that is already an encoded byte string.
  if not six.PY2 or isinstance(string, six.binary_type):
    return string

  codec = encoding if encoding else _GetEncoding()
  return string.encode(codec)
|
|
|
|
|
|
def Decode(data, encoding=None):
  """Returns string with non-ascii characters decoded to UNICODE.

  ASCII, the suggested encoding, UTF-8, the filesystem encoding and the system
  default encoding are attempted in that order. If none of them can decode the
  bytes, ISO-8859-1 is used as a last resort because it maps every possible
  byte value and therefore cannot fail.

  Args:
    data: A string or object that has str() and unicode() methods that may
      contain an encoding incompatible with the standard output encoding.
    encoding: The suggested encoding if known. A suggestion that names an
      unknown codec, or that cannot decode the data, is ignored.

  Returns:
    A text string representing the decoded byte string, or None if data is
    None.
  """
  if data is None:
    return None

  # First we are going to get the data object to be a text string.
  # Don't use six.string_types here because on Python 3 bytes is not considered
  # a string type and we want to include that.
  if isinstance(data, (six.text_type, six.binary_type)):
    string = data
  else:
    # Some non-string type of object.
    try:
      string = six.text_type(data)
    except (TypeError, UnicodeError):
      # The object cannot be converted to unicode -- default to str() which
      # will catch objects with special __str__ methods.
      string = str(data)

  if isinstance(string, six.text_type):
    # Our work is done here.
    return string

  # Candidate encodings, in priority order:
  #  1. ASCII, so pure ASCII data is returned as is.
  #  2. The caller's suggestion, if any.
  #  3. UTF-8, tried before the remaining encodings because those could be
  #     extended ASCII. It would be exceptional if a valid extended ASCII
  #     encoding with extended chars were also a valid UTF-8 encoding.
  #  4. The filesystem encoding.
  #  5. The system default encoding.
  candidates = ['ascii']
  if encoding:
    candidates.append(encoding)
  candidates.append('utf8')
  candidates.append(sys.getfilesystemencoding())
  candidates.append(sys.getdefaultencoding())

  for candidate in candidates:
    if not candidate:
      # sys.getfilesystemencoding() may report no encoding at all; decoding
      # with None would raise TypeError instead of moving on to the next one.
      continue
    try:
      return string.decode(candidate)
    except (UnicodeError, LookupError):
      # UnicodeError: the bytes are not valid in this encoding.
      # LookupError: the name does not refer to a known codec (for example a
      # bad suggestion from the caller). Either way, try the next candidate.
      continue

  # We don't know the string encoding.
  # This works around a Python str.encode() "feature" that throws
  # an ASCII *decode* exception on str strings that contain 8th bit set
  # bytes. For example, this sequence throws an exception:
  #   string = '\xdc'  # iso-8859-1 'Ü'
  #   string = string.encode('ascii', 'backslashreplace')
  # even though 'backslashreplace' is documented to handle encoding
  # errors. We work around the problem by first decoding the str string
  # from an 8-bit encoding to unicode, selecting any 8-bit encoding that
  # uses all 256 bytes (such as ISO-8859-1):
  #   string = string.decode('iso-8859-1')
  # Using this produces a sequence that works:
  #   string = '\xdc'
  #   string = string.decode('iso-8859-1')
  #   string = string.encode('ascii', 'backslashreplace')
  return string.decode('iso-8859-1')
|
|
|
|
|
|
def GetEncodedValue(env, name, default=None):
  """Looks up an env var and returns its value as decoded text.

  Args:
    env: {str: str}, The env dict.
    name: str, The env var name. It is passed through Encode() before the
      lookup.
    default: The value to return if name is not in env (or maps to None).

  Returns:
    The result of Decode() on the stored value, or default when there is no
    value for name.
  """
  raw = env.get(Encode(name))
  # The default is returned as given; only a value found in env is decoded.
  return default if raw is None else Decode(raw)
|
|
|
|
|
|
def SetEncodedValue(env, name, value, encoding=None):
  """Stores an encoded value under an encoded name in env, or removes it.

  Args:
    env: {str: str}, The env dict. It is modified in place.
    name: str, The env var name.
    value: str or unicode, The value for name. If None then name is removed
      from env (a missing name is not an error).
    encoding: str, The encoding to use or None to try to infer it.
  """
  # Unicode support in both Python 2 and Python 3 breaks down at the
  # filesystem/argv/environment boundaries. On most systems the user controls
  # how filesystem paths and environment variable names/values are encoded, and
  # a value alone does not reveal which encoding produced it. The best hints
  # available are sys.getfilesystemencoding() and sys.getdefaultencoding(), so
  # they are used to encode values in a way the receiving process has a chance
  # of decoding. Passing unicode strings through unencoded would make the os
  # module raise Unicode exceptions.
  key = Encode(name, encoding=encoding)
  if value is not None:
    env[key] = Encode(value, encoding=encoding)
  else:
    env.pop(key, None)
|
|
|
|
|
|
def EncodeEnv(env, encoding=None):
  """Builds a copy of env with every key and value run through Encode().

  Intended for preparing an environment to hand to subprocess.

  Args:
    env: {str: str}, The environment you are going to pass to subprocess. It
      is not modified.
    encoding: str, The encoding to use or None to use the default from
      _GetEncoding().

  Returns:
    A new dict mapping Encode(key) to Encode(value) for each item in env.
  """
  codec = encoding or _GetEncoding()
  encoded_env = {}
  for key, value in six.iteritems(env):
    encoded_key = Encode(key, encoding=codec)
    encoded_env[encoded_key] = Encode(value, encoding=codec)
  return encoded_env
|
|
|
|
|
|
def _GetEncoding():
  """Gets the default encoding to use.

  Returns:
    The filesystem encoding when sys reports one, otherwise the system default
    encoding.
  """
  filesystem_encoding = sys.getfilesystemencoding()
  if filesystem_encoding:
    return filesystem_encoding
  return sys.getdefaultencoding()
|