mirror of
https://github.com/saveweb/biliarchiver.git
synced 2024-09-19 19:15:27 -07:00
feat: remove XML illegal chars in metadata before uploading
This commit is contained in:
parent
4c0516d45f
commit
72c552a04a
@ -11,6 +11,7 @@ from biliarchiver.exception import VideosBasePathNotFoundError
|
||||
from biliarchiver.utils.identifier import human_readable_upper_part_map
|
||||
from biliarchiver.config import BILIBILI_IDENTIFIER_PERFIX, config
|
||||
from biliarchiver.utils.dirLock import UploadLock, AlreadyRunningError
|
||||
from biliarchiver.utils.xml_chars import xml_chars_legalize
|
||||
from biliarchiver.version import BILI_ARCHIVER_VERSION
|
||||
|
||||
def upload_bvid(bvid: str, *, update_existing: bool = False, collection: str):
|
||||
@ -154,17 +155,17 @@ def _upload_bvid(bvid: str, *, update_existing: bool = False, collection: str):
|
||||
'scanner': f'biliarchiver v{BILI_ARCHIVER_VERSION} (dev)',
|
||||
}
|
||||
|
||||
# XML 中不能有 \b 等特殊控制字符,IA 会拒收。
|
||||
# 先只简单删 \b ,如果以后再发现元数据里出现其它非法字符,再说。
|
||||
_md_str = json.dumps(md, ensure_ascii=False)
|
||||
if "\\b" in _md_str:
|
||||
print("WARNING: \\b in metadata, removing it")
|
||||
md = json.loads(_md_str.replace("\\b", ""))
|
||||
del(_md_str)
|
||||
|
||||
print(filedict)
|
||||
print(md)
|
||||
|
||||
# remove XML illegal characters
|
||||
_md_before = hash(json.dumps(md))
|
||||
md = xml_chars_legalize(obj=md)
|
||||
assert isinstance(md, dict)
|
||||
if hash(json.dumps(md)) != _md_before:
|
||||
print(f"Removed XML illegal characters from metadata, cleaned metadata:")
|
||||
print(md)
|
||||
|
||||
if filedict:
|
||||
r = item.upload(
|
||||
files=filedict,
|
||||
@ -186,18 +187,27 @@ def _upload_bvid(bvid: str, *, update_existing: bool = False, collection: str):
|
||||
|
||||
new_md = {}
|
||||
if item.metadata.get("upload-state") != "uploaded":
|
||||
new_md.update({"upload-state": "uploaded"})
|
||||
new_md["upload-state"] = "uploaded"
|
||||
if item.metadata.get("creator") != md['creator']:
|
||||
new_md.update({"creator": md['creator']})
|
||||
new_md["creator"] = md['creator']
|
||||
if item.metadata.get("description", "") != bv_info['data']['View']['desc']:
|
||||
new_md.update({"description": bv_info['data']['View']['desc']})
|
||||
new_md["description"] = bv_info['data']['View']['desc']
|
||||
if item.metadata.get("scanner") != md['scanner']:
|
||||
new_md.update({"scanner": md['scanner']})
|
||||
new_md["scanner"] = md['scanner']
|
||||
if item.metadata.get("external-identifier") != md['external-identifier']:
|
||||
new_md.update({"external-identifier": md['external-identifier']})
|
||||
new_md["external-identifier"] = md['external-identifier']
|
||||
if new_md:
|
||||
print(f"Updating metadata:")
|
||||
print(new_md)
|
||||
|
||||
# remove XML illegal characters
|
||||
_md_before = hash(json.dumps(md))
|
||||
new_md = xml_chars_legalize(obj=new_md)
|
||||
assert isinstance(new_md, dict)
|
||||
if hash(json.dumps(new_md)) != _md_before:
|
||||
print(f"Removed XML illegal characters from metadata, cleaned metadata:")
|
||||
print(new_md)
|
||||
|
||||
r = item.modify_metadata(
|
||||
metadata=new_md,
|
||||
access_key=access_key,
|
||||
|
73
biliarchiver/utils/xml_chars.py
Normal file
73
biliarchiver/utils/xml_chars.py
Normal file
@ -0,0 +1,73 @@
|
||||
from typing import Union
|
||||
|
||||
# Characters that are stripped from metadata before it is serialised as XML.
#
# Among the C0 control characters XML 1.0 only permits TAB (0x09), LF (0x0a)
# and CR (0x0d); every other code point below 0x20 -- NUL included -- is
# illegal and therefore listed here.  DEL (0x7f) is legal in XML 1.0 but
# discouraged; it is kept in the table so it continues to be removed.
_xml_ILLEGAL_CHARS = [
    chr(code)
    for first, last in ((0x00, 0x08), (0x0b, 0x0c), (0x0e, 0x1f), (0x7f, 0x7f))
    for code in range(first, last + 1)
]
# NOTE: the C1 controls U+0080-U+0084 and U+0086-U+009F *can* occur in a
# Python str.  They are deliberately not listed: XML 1.0 allows them (it only
# discourages them), so they are left untouched.
XML_ILLEGAL_CHARS = _xml_ILLEGAL_CHARS
|
||||
|
||||
|
||||
def _legalize_str(s: str, print_info: bool = False) -> str:
    """Return ``s`` with every character listed in XML_ILLEGAL_CHARS removed.

    Args:
        s: Text to clean.
        print_info: When true, print one line for each distinct illegal
            character that was found in ``s``.

    Returns:
        The cleaned string.
    """
    for c in XML_ILLEGAL_CHARS:
        # Test membership directly: comparing hash() values before and after
        # the replace is indirect and could miss a removal on a collision.
        if c not in s:
            continue
        s = s.replace(c, '')
        if print_info:
            print(f"Removed XML illegal char \\x{ord(c):02x}")
    return s
|
||||
|
||||
def _legalize_list(l: list):
    """Clean every str, dict and list element of ``l`` in place.

    Elements of any other type are left as they are.  The same list object
    that was passed in is returned.
    """
    for position, element in enumerate(l):
        if isinstance(element, str):
            cleaned = _legalize_str(element)
        elif isinstance(element, dict):
            cleaned = _legalize_dict(element)
        elif isinstance(element, list):
            cleaned = _legalize_list(element)
        else:
            # Numbers, None, etc. cannot carry illegal characters.
            continue
        l[position] = cleaned
    return l
|
||||
|
||||
def _legalize_dict(d: dict):
    """Clean every str, list and dict value of ``d`` in place.

    Keys are not inspected and values of any other type are left as they are.
    The same dict object that was passed in is returned.
    """
    for key, value in d.items():
        if isinstance(value, str):
            cleaned = _legalize_str(value)
        elif isinstance(value, list):
            cleaned = _legalize_list(value)
        elif isinstance(value, dict):
            cleaned = _legalize_dict(value)
        else:
            # Nothing to clean for other value types.
            continue
        # Re-assigning an existing key does not resize the dict, so this is
        # safe while iterating over items().
        d[key] = cleaned
    return d
|
||||
|
||||
def xml_chars_legalize(obj: Union[dict, str, list]) -> Union[dict, str, list]:
    """ Remove XML illegal characters from a dict, list or str.

    Dispatches on the type of ``obj`` to the matching private helper and
    returns that helper's result.

    Raises:
        TypeError: if ``obj`` is not a str, dict or list.
    """
    dispatch = (
        (str, _legalize_str),
        (dict, _legalize_dict),
        (list, _legalize_list),
    )
    for kind, cleaner in dispatch:
        if isinstance(obj, kind):
            return cleaner(obj)
    raise TypeError(f"Unexpected type: {type(obj)}")
|
||||
|
||||
def _test_xml_chars_legalize():
    """Self-check for the legalize helpers; raises AssertionError on failure.

    The list and dict helpers clean their argument in place, so each
    assertion is given freshly built dirty input.  Sharing one fixture across
    assertions would let the later, nested cases run on data that an earlier
    call had already cleaned.
    """
    def dirty_list():
        return ['\x0bA', '\x0cB', 'C\v']

    def dirty_dict():
        return {'A': '\x0bA', 'B': '\x0cB', 'C': 'C\v', 'list': dirty_list()}

    clean_list = ['A', 'B', 'C']
    clean_dict = {'A': 'A', 'B': 'B', 'C': 'C', 'list': clean_list}

    assert _legalize_str('\x0bA\x0cB\vC') == 'ABC'
    assert _legalize_list(dirty_list()) == clean_list
    assert _legalize_dict(dirty_dict()) == clean_dict

    # dict -> list -> dict -> (list, dict): exercises the mutual recursion.
    nested = {
        "dict": [
            {
                'A': '\x0bA',
                'B': '\x0cB',
                'C': 'C\v',
                'list': dirty_list(),
                'dict': dirty_dict(),
            },
        ],
    }
    nested_clean = {
        'dict': [
            {
                'A': 'A',
                'B': 'B',
                'C': 'C',
                'list': clean_list,
                'dict': clean_dict,
            },
        ],
    }
    assert _legalize_dict(nested) == nested_clean
|
||||
|
||||
if __name__ == '__main__':
    # Running this module directly executes the ad-hoc self-check.
    # TODO: use pytest
    _test_xml_chars_legalize()
|
Loading…
Reference in New Issue
Block a user