# Source code for pyfcstm.utils.decode

"""
Automatic text decoding utilities with a focus on Chinese encodings.

This module provides helpers for decoding byte sequences by trying a series of
likely encodings. It is designed to work well with Windows-centric Chinese
encodings while still supporting Unicode variants. The decoding strategy
attempts multiple encodings in a defined order and returns the first successful
result.

The module contains the following public components:

* :data:`windows_chinese_encodings` - Ordered list of common Chinese encodings
* :func:`auto_decode` - Robust decoding function with auto-detection

.. note::
   This module relies on :mod:`chardet` for probabilistic encoding detection.

Example::

    >>> from pyfcstm.utils.decode import auto_decode
    >>> text_bytes = b'\\xc4\\xe3\\xba\\xc3'  # "你好" in GBK encoding
    >>> auto_decode(text_bytes)
    '你好'
"""

import sys
from typing import Union

import chardet
from hbutils.collection import unique

# Candidate encodings, listed in the order in which they are attempted.
windows_chinese_encodings = [
    # --- Unicode standard (UTF-8) ---
    'utf-8',

    # --- Simplified Chinese ---
    'gbk',  # most common default on Chinese Windows
    'gb2312',  # subset of GBK
    'gb18030',  # Chinese national standard, includes all Chinese characters

    # --- Traditional Chinese (Taiwan, Hong Kong) ---
    'big5',

    # --- Windows code pages ---
    'cp936',  # Simplified Chinese, essentially an alias for GBK
    'cp950',  # Traditional Chinese, approximately equivalent to Big5

    # --- Older / Unix-oriented encodings ---
    'hz',  # early Chinese character encoding
    # 'iso-2022-cn' (ISO standard encoding for Chinese) is currently disabled.
    'euc-cn',  # Extended Unix Code for Chinese

    # --- UTF-16 variants ---
    'utf-16',  # default Unicode encoding used by Windows Notepad
    'utf-16-le',  # little-endian, commonly used in Windows
    'utf-16-be',  # big-endian

    # --- UTF-32 variants ---
    'utf-32',
    'utf-32-le',  # little-endian
    'utf-32-be',  # big-endian
]


def _decode(data: bytes, encoding: str) -> str:
    """
    Decode bytes data using the specified encoding.

    :param data: Bytes to decode.
    :type data: bytes
    :param encoding: Text encoding to use for decoding.
    :type encoding: str
    :return: Decoded text.
    :rtype: str
    :raises UnicodeDecodeError: If the bytes cannot be decoded using ``encoding``.
    """
    return data.decode(encoding)


def auto_decode(data: Union[bytes, bytearray]) -> str:
    """
    Automatically decode bytes by trying multiple encodings.

    The decoding order depends on the input length:

    * For inputs with length >= 30, the order is:

      1) encoding detected by :mod:`chardet`
      2) entries in :data:`windows_chinese_encodings`
      3) system default encoding

    * For shorter inputs, the order is:

      1) entries in :data:`windows_chinese_encodings`
      2) system default encoding
      3) encoding detected by :mod:`chardet`

    Duplicate and empty candidates are dropped. The function tries each
    remaining encoding until one succeeds. A candidate whose name is not a
    codec known to Python (which raises :class:`LookupError`) is skipped
    instead of aborting the whole search. If all attempts fail, the
    :class:`UnicodeDecodeError` that progressed furthest (i.e., the error
    with the highest ``start`` position) is raised.

    :param data: The bytes data to decode.
    :type data: Union[bytes, bytearray]
    :return: The decoded string.
    :rtype: str
    :raises UnicodeDecodeError: If decoding fails for all attempted encodings.

    Example::

        >>> text_bytes = b'\\xc4\\xe3\\xba\\xc3'  # "你好" in GBK encoding
        >>> auto_decode(text_bytes)
        '你好'
    """
    # May be None when chardet cannot make a guess; filtered out below.
    detected = chardet.detect(data)['encoding']
    fallbacks = [*windows_chinese_encodings, sys.getdefaultencoding()]

    # The detector's guess is only given priority for inputs of at least
    # 30 bytes; for shorter inputs it is tried last.
    if len(data) >= 30:
        candidates = [detected, *fallbacks]
    else:
        candidates = [*fallbacks, detected]

    furthest_err = None
    for enc in unique(candidates):
        if not enc:
            continue
        try:
            return _decode(data, enc)
        except LookupError:
            # The candidate name has no matching Python codec (possible for
            # names reported by the detector); move on to the next one.
            continue
        except UnicodeDecodeError as err:
            # Keep the failure that got furthest into the data; on a tie the
            # earlier candidate's error is kept.
            if furthest_err is None or err.start > furthest_err.start:
                furthest_err = err

    if furthest_err is None:
        # No candidate produced a decode error at all (e.g. every candidate
        # was unusable); still honour the documented exception type rather
        # than failing on ``raise None``.
        raise UnicodeDecodeError(
            'unknown', bytes(data), 0, len(data),
            'no usable encoding candidate',
        )
    raise furthest_err