From 72c552a04a6410be610f792bd2f8444c84874c44 Mon Sep 17 00:00:00 2001 From: yzqzss Date: Wed, 26 Jul 2023 02:25:54 +0800 Subject: [PATCH] feat: remove XML illegal chars in metadata before uploading --- biliarchiver/_biliarchiver_upload_bvid.py | 36 +++++++---- biliarchiver/utils/xml_chars.py | 73 +++++++++++++++++++++++ 2 files changed, 96 insertions(+), 13 deletions(-) create mode 100644 biliarchiver/utils/xml_chars.py diff --git a/biliarchiver/_biliarchiver_upload_bvid.py b/biliarchiver/_biliarchiver_upload_bvid.py index 13a632d..959a8ce 100644 --- a/biliarchiver/_biliarchiver_upload_bvid.py +++ b/biliarchiver/_biliarchiver_upload_bvid.py @@ -11,6 +11,7 @@ from biliarchiver.exception import VideosBasePathNotFoundError from biliarchiver.utils.identifier import human_readable_upper_part_map from biliarchiver.config import BILIBILI_IDENTIFIER_PERFIX, config from biliarchiver.utils.dirLock import UploadLock, AlreadyRunningError +from biliarchiver.utils.xml_chars import xml_chars_legalize from biliarchiver.version import BILI_ARCHIVER_VERSION def upload_bvid(bvid: str, *, update_existing: bool = False, collection: str): @@ -154,17 +155,17 @@ def _upload_bvid(bvid: str, *, update_existing: bool = False, collection: str): 'scanner': f'biliarchiver v{BILI_ARCHIVER_VERSION} (dev)', } - # XML 中不能有 \b 等特殊控制字符,IA 会拒收。 - # 先只简单删 \b ,如果以后再发现元数据里出现其它非法字符,再说。 - _md_str = json.dumps(md, ensure_ascii=False) - if "\\b" in _md_str: - print("WARNING: \\b in metadata, removing it") - md = json.loads(_md_str.replace("\\b", "")) - del(_md_str) - print(filedict) print(md) + # remove XML illegal characters + _md_before = hash(json.dumps(md)) + md = xml_chars_legalize(obj=md) + assert isinstance(md, dict) + if hash(json.dumps(md)) != _md_before: + print(f"Removed XML illegal characters from metadata, cleaned metadata:") + print(md) + if filedict: r = item.upload( files=filedict, @@ -186,18 +187,27 @@ def _upload_bvid(bvid: str, *, update_existing: bool = False, collection: str): new_md = 
{} if item.metadata.get("upload-state") != "uploaded": - new_md.update({"upload-state": "uploaded"}) + new_md["upload-state"] = "uploaded" if item.metadata.get("creator") != md['creator']: - new_md.update({"creator": md['creator']}) + new_md["creator"] = md['creator'] if item.metadata.get("description", "") != bv_info['data']['View']['desc']: - new_md.update({"description": bv_info['data']['View']['desc']}) + new_md["description"] = bv_info['data']['View']['desc'] if item.metadata.get("scanner") != md['scanner']: - new_md.update({"scanner": md['scanner']}) + new_md["scanner"] = md['scanner'] if item.metadata.get("external-identifier") != md['external-identifier']: - new_md.update({"external-identifier": md['external-identifier']}) + new_md["external-identifier"] = md['external-identifier'] if new_md: print(f"Updating metadata:") print(new_md) + + # remove XML illegal characters + _md_before = hash(json.dumps(md)) + new_md = xml_chars_legalize(obj=new_md) + assert isinstance(new_md, dict) + if hash(json.dumps(new_md)) != _md_before: + print(f"Removed XML illegal characters from metadata, cleaned metadata:") + print(new_md) + r = item.modify_metadata( metadata=new_md, access_key=access_key, diff --git a/biliarchiver/utils/xml_chars.py b/biliarchiver/utils/xml_chars.py new file mode 100644 index 0000000..32a90d5 --- /dev/null +++ b/biliarchiver/utils/xml_chars.py @@ -0,0 +1,73 @@ +from typing import Union + +_xml_ILLEGAL_CHARS = [] +_xml_ILLEGAL_CHARS.extend([bytes([i]).decode('ascii') for i in range(int('01', 16), int('08', 16)+1)]) +_xml_ILLEGAL_CHARS.extend([bytes([i]).decode('ascii') for i in range(int('0b', 16), int('0c', 16)+1)]) +_xml_ILLEGAL_CHARS.extend([bytes([i]).decode('ascii') for i in range(int('0e', 16), int('1f', 16)+1)]) +_xml_ILLEGAL_CHARS.extend(['\x7f']) +# NOTE: The following are non-ASCII characters, which will not appear in string obj in Python. 
+# range(int('80', 16), int('84', 16)+1)])
+# range(int('86', 16), int('9f', 16)+1)])
+# NOTE(review): U+0080-U+009F can in fact occur in a Python str. They are kept
+# out of the list because the XML 1.0 Char production permits them; only
+# XML 1.1 lists them as restricted characters.
+# NUL, U+FFFE and U+FFFF are outside the XML 1.0 Char production as well, so
+# they are removed together with the control characters collected above.
+XML_ILLEGAL_CHARS = ['\x00'] + _xml_ILLEGAL_CHARS + ['\ufffe', '\uffff']
+
+
+def _legalize_str(s: str, print_info: bool=False):
+    """Return ``s`` with every character listed in XML_ILLEGAL_CHARS removed.
+
+    When ``print_info`` is true, one line is printed for each distinct
+    character that was actually removed.
+    """
+    for c in XML_ILLEGAL_CHARS:
+        # A membership test states the intent directly; comparing hash(s)
+        # before and after the replace only approximates "did s change".
+        if c in s:
+            s = s.replace(c, '')
+            if print_info:
+                print(f"Removed XML illegal char \\x{ord(c):02x}")
+    return s
+
+
+def _legalize_value(v):
+    """Legalize ``v`` if it is a str, list or dict; return anything else as is."""
+    if isinstance(v, str):
+        return _legalize_str(v)
+    if isinstance(v, list):
+        return _legalize_list(v)
+    if isinstance(v, dict):
+        return _legalize_dict(v)
+    return v
+
+
+def _legalize_list(l: list):
+    """Legalize every item of ``l`` in place, recursively, and return ``l``."""
+    for i, v in enumerate(l):
+        l[i] = _legalize_value(v)
+    return l
+
+
+def _legalize_dict(d: dict):
+    """Legalize every value of ``d`` in place, recursively, and return ``d``.
+
+    Keys are left untouched.
+    """
+    # Only existing keys are reassigned, so the dict keeps its size while it
+    # is being iterated.
+    for k, v in d.items():
+        d[k] = _legalize_value(v)
+    return d
+
+
+def xml_chars_legalize(obj: Union[dict, str, list]) -> Union[dict, str, list]:
+    """Remove XML illegal characters from a dict, list or str.
+
+    A str is returned as a new string. A dict or list is cleaned in place,
+    including nested dicts and lists, and the same object is returned.
+    Values of any other type inside a container are left unchanged.
+
+    Raises:
+        TypeError: if ``obj`` itself is not a dict, list or str.
+    """
+    if not isinstance(obj, (str, dict, list)):
+        raise TypeError(f"Unexpected type: {type(obj)}")
+    return _legalize_value(obj)
+
+
+def _test_xml_chars_legalize():
+    _str = '\x0bA\x0cB\vC'
+    _str_after = 'ABC'
+    assert _legalize_str(_str) == _str_after
+    # NUL, U+FFFE and U+FFFF are not XML 1.0 characters either.
+    assert _legalize_str('\x00A\ufffeB\uffffC') == 'ABC'
+    # Tab, LF and CR are legal in XML and must survive.
+    assert _legalize_str('A\tB\nC\r') == 'A\tB\nC\r'
+    _list = ['\x0bA', '\x0cB', 'C\v']
+    _list_after = ['A', 'B', 'C']
+    assert _legalize_list(_list) == _list_after
+    _dict = {'A': '\x0bA', 'B': '\x0cB', 'C': 'C\v', 'list': _list}
+    _dict_after = {'A': 'A', 'B': 'B', 'C': 'C', 'list': _list_after}
+    assert _legalize_dict(_dict) == _dict_after
+    _dict_in_list_in_dict_in_dict = {"dict": [{'A': '\x0bA', 'B': '\x0cB', 'C': 'C\v', 'list': _list, 'dict': _dict}]}
+    _dict_balabala_after = {'dict': [{'A': 'A', 'B': 'B', 'C': 'C', 'list': _list_after, 'dict': _dict_after}]}
+    assert _legalize_dict(_dict_in_list_in_dict_in_dict) == _dict_balabala_after
+    # Non-str leaves are passed through unchanged.
+    assert xml_chars_legalize({'n': 1, 'x': None}) == {'n': 1, 'x': None}
+
+
+if __name__ == '__main__':
+    # TODO: use pytest
+    _test_xml_chars_legalize()
\ No newline at end of file