feat: remove XML illegal chars in metadata before uploading

This commit is contained in:
yzqzss 2023-07-26 02:25:54 +08:00
parent 4c0516d45f
commit 72c552a04a
2 changed files with 96 additions and 13 deletions

View File

@ -11,6 +11,7 @@ from biliarchiver.exception import VideosBasePathNotFoundError
from biliarchiver.utils.identifier import human_readable_upper_part_map
from biliarchiver.config import BILIBILI_IDENTIFIER_PERFIX, config
from biliarchiver.utils.dirLock import UploadLock, AlreadyRunningError
from biliarchiver.utils.xml_chars import xml_chars_legalize
from biliarchiver.version import BILI_ARCHIVER_VERSION
def upload_bvid(bvid: str, *, update_existing: bool = False, collection: str):
@ -154,17 +155,17 @@ def _upload_bvid(bvid: str, *, update_existing: bool = False, collection: str):
'scanner': f'biliarchiver v{BILI_ARCHIVER_VERSION} (dev)',
}
# XML 中不能有 \b 等特殊控制字符IA 会拒收。
# 先只简单删 \b ,如果以后再发现元数据里出现其它非法字符,再说。
_md_str = json.dumps(md, ensure_ascii=False)
if "\\b" in _md_str:
print("WARNING: \\b in metadata, removing it")
md = json.loads(_md_str.replace("\\b", ""))
del(_md_str)
print(filedict)
print(md)
# remove XML illegal characters
_md_before = hash(json.dumps(md))
md = xml_chars_legalize(obj=md)
assert isinstance(md, dict)
if hash(json.dumps(md)) != _md_before:
print(f"Removed XML illegal characters from metadata, cleaned metadata:")
print(md)
if filedict:
r = item.upload(
files=filedict,
@ -186,18 +187,27 @@ def _upload_bvid(bvid: str, *, update_existing: bool = False, collection: str):
new_md = {}
if item.metadata.get("upload-state") != "uploaded":
new_md.update({"upload-state": "uploaded"})
new_md["upload-state"] = "uploaded"
if item.metadata.get("creator") != md['creator']:
new_md.update({"creator": md['creator']})
new_md["creator"] = md['creator']
if item.metadata.get("description", "") != bv_info['data']['View']['desc']:
new_md.update({"description": bv_info['data']['View']['desc']})
new_md["description"] = bv_info['data']['View']['desc']
if item.metadata.get("scanner") != md['scanner']:
new_md.update({"scanner": md['scanner']})
new_md["scanner"] = md['scanner']
if item.metadata.get("external-identifier") != md['external-identifier']:
new_md.update({"external-identifier": md['external-identifier']})
new_md["external-identifier"] = md['external-identifier']
if new_md:
print(f"Updating metadata:")
print(new_md)
# remove XML illegal characters
_md_before = hash(json.dumps(md))
new_md = xml_chars_legalize(obj=new_md)
assert isinstance(new_md, dict)
if hash(json.dumps(new_md)) != _md_before:
print(f"Removed XML illegal characters from metadata, cleaned metadata:")
print(new_md)
r = item.modify_metadata(
metadata=new_md,
access_key=access_key,

View File

@ -0,0 +1,73 @@
from typing import Union
# Table of single characters that get stripped from strings by the helpers
# in this module.
#
# XML 1.0 only admits U+0009, U+000A, U+000D and code points from U+0020
# upward, so every other C0 control character is illegal. That includes
# NUL (U+0000): the table used to start at U+0001 and therefore let NUL
# through.
_xml_ILLEGAL_CHARS = []
_xml_ILLEGAL_CHARS.extend(chr(i) for i in range(0x00, 0x08 + 1))
_xml_ILLEGAL_CHARS.extend(chr(i) for i in range(0x0b, 0x0c + 1))
_xml_ILLEGAL_CHARS.extend(chr(i) for i in range(0x0e, 0x1f + 1))
# DEL (U+007F) is kept in the table as before, although XML 1.0 merely
# discourages it.
_xml_ILLEGAL_CHARS.append('\x7f')
# NOTE: The C1 control characters U+0080-U+0084 and U+0086-U+009F are NOT in
# the table. A Python str can contain them (e.g. '\x80'), but XML 1.0 only
# discourages them rather than forbidding them, so they are left alone.

# Public name for the table; it is the same list object as the private one.
XML_ILLEGAL_CHARS = _xml_ILLEGAL_CHARS
def _legalize_str(s: str, print_info: bool=False) -> str:
    """Return ``s`` with every character from XML_ILLEGAL_CHARS removed.

    Args:
        s: The string to clean.
        print_info: When true, print one line for each distinct illegal
            character that was actually found and removed.

    Returns:
        The cleaned string. ``s`` itself is returned when nothing in the
        table occurs in it.
    """
    for c in XML_ILLEGAL_CHARS:
        # A direct membership test is exact; comparing hash() values before
        # and after the replacement only detects the change indirectly.
        if c not in s:
            continue
        s = s.replace(c, '')
        if print_info:
            print(f"Removed XML illegal char \\x{ord(c):02x}")
    return s
def _legalize_list(l: list):
    """Strip XML-illegal characters from a list, in place, and return it.

    String elements are replaced by their cleaned version. Nested dicts and
    lists are cleaned recursively through ``_legalize_dict`` and
    ``_legalize_list``. Elements of any other type are left as they are.
    The list passed in is modified and is also the return value.
    """
    for pos, item in enumerate(l):
        if isinstance(item, str):
            cleaned = _legalize_str(item)
        elif isinstance(item, dict):
            cleaned = _legalize_dict(item)
        elif isinstance(item, list):
            cleaned = _legalize_list(item)
        else:
            # Not a str/dict/list: nothing to clean, keep the element.
            continue
        l[pos] = cleaned
    return l
def _legalize_dict(d: dict):
    """Strip XML-illegal characters from a dict's values, in place.

    String values are replaced by their cleaned version. Nested lists and
    dicts are cleaned recursively through ``_legalize_list`` and
    ``_legalize_dict``. Values of any other type are left as they are, and
    keys are never modified. The dict passed in is modified and is also the
    return value.
    """
    for key, value in d.items():
        if isinstance(value, str):
            cleaned = _legalize_str(value)
        elif isinstance(value, list):
            cleaned = _legalize_list(value)
        elif isinstance(value, dict):
            cleaned = _legalize_dict(value)
        else:
            # Not a str/list/dict: nothing to clean, keep the value.
            continue
        # Only existing keys are reassigned, so the dict's size does not
        # change while it is being iterated.
        d[key] = cleaned
    return d
def xml_chars_legalize(obj: Union[dict, str, list]) -> Union[dict, str, list]:
    """Remove XML illegal characters from a str, dict or list.

    Dispatches on the type of ``obj`` to the matching private helper and
    returns that helper's result.

    Raises:
        TypeError: If ``obj`` is not a str, dict or list.
    """
    if isinstance(obj, str):
        return _legalize_str(obj)
    if isinstance(obj, dict):
        return _legalize_dict(obj)
    if isinstance(obj, list):
        return _legalize_list(obj)
    raise TypeError(f"Unexpected type: {type(obj)}")
def _test_xml_chars_legalize():
    """Self-check for the str, list and dict helpers, including nesting."""
    # Plain string.
    dirty_text = '\x0bA\x0cB\vC'
    clean_text = 'ABC'
    assert _legalize_str(dirty_text) == clean_text

    # Flat list of strings.
    dirty_items = ['\x0bA', '\x0cB', 'C\v']
    clean_items = ['A', 'B', 'C']
    assert _legalize_list(dirty_items) == clean_items

    # Dict with string values and a nested list.
    dirty_map = {
        'A': '\x0bA',
        'B': '\x0cB',
        'C': 'C\v',
        'list': dirty_items,
    }
    clean_map = {'A': 'A', 'B': 'B', 'C': 'C', 'list': clean_items}
    assert _legalize_dict(dirty_map) == clean_map

    # Dict -> list -> dict -> (list, dict) nesting.
    dirty_nested = {
        "dict": [
            {
                'A': '\x0bA',
                'B': '\x0cB',
                'C': 'C\v',
                'list': dirty_items,
                'dict': dirty_map,
            },
        ],
    }
    clean_nested = {
        'dict': [
            {
                'A': 'A',
                'B': 'B',
                'C': 'C',
                'list': clean_items,
                'dict': clean_map,
            },
        ],
    }
    assert _legalize_dict(dirty_nested) == clean_nested
# Running this module as a script executes the built-in self-check.
if __name__ == "__main__":
    # TODO: use pytest
    _test_xml_chars_legalize()