# ------------------------------------------------------------------------------
# The MIT License (MIT)
#
# Copyright (c) 2014-2019 Digital Sapphire
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# ------------------------------------------------------------------------------
import chardet
import logging
import six

if not six.PY2:
    # Helper for Python 2 and 3 compatibility
    unicode = str

log = logging.getLogger(__name__)


def make_compat_str(in_str):
    """
    Tries to guess the encoding of [str/bytes] and decode it into a
    unicode object.

    Text that is already unicode (``unicode`` on Python 2, ``str`` on
    Python 3) is returned unchanged. Byte strings are passed to
    ``chardet.detect`` and decoded with the encoding it reports. Any
    falsy input yields an empty unicode string.

    :param in_str: ``bytes``, ``str`` or ``unicode`` value to convert.
    :return: the decoded text, without a leading byte order mark.
    :raises AssertionError: if ``in_str`` is not one of the accepted types
        (only while assertions are enabled).
    :raises UnicodeDecodeError: if the bytes are not valid in the encoding
        reported by chardet.
    """
    assert isinstance(in_str, (bytes, str, unicode))
    if not in_str:
        return unicode()

    # Chardet only works on byte strings (``str``/``bytes`` on Py2, ``bytes``
    # on Py3). Anything else is already decoded text: hand it back untouched.
    if not isinstance(in_str, bytes):
        return in_str

    # Detect the encoding now
    enc = chardet.detect(in_str)
    encoding = enc.get('encoding')

    if encoding is None:
        # The detection result carries no encoding; decode(None) would raise
        # a TypeError, so fall back to a lossy UTF-8 decode instead.
        log.debug('No encoding detected, falling back to UTF-8')
        out_str = in_str.decode('utf-8', 'replace')
    else:
        # Decode the object into a unicode object
        out_str = in_str.decode(encoding)

    # Cleanup: some codecs (e.g. the endian-specific UTF-16 ones) keep the
    # byte order mark (BOM) in the decoded text. The ``u`` prefix is needed
    # so the literal is a real U+FEFF character on Python 2 as well.
    if out_str.startswith(u'\ufeff'):
        out_str = out_str[1:]

    # Return the decoded string
    return out_str