From 72c552a04a6410be610f792bd2f8444c84874c44 Mon Sep 17 00:00:00 2001
From: yzqzss <yzqzss@yandex.com>
Date: Wed, 26 Jul 2023 02:25:54 +0800
Subject: [PATCH] feat: remove XML illegal chars in metadata before uploading

---
 biliarchiver/_biliarchiver_upload_bvid.py | 36 +++++++----
 biliarchiver/utils/xml_chars.py           | 73 +++++++++++++++++++++++
 2 files changed, 96 insertions(+), 13 deletions(-)
 create mode 100644 biliarchiver/utils/xml_chars.py

diff --git a/biliarchiver/_biliarchiver_upload_bvid.py b/biliarchiver/_biliarchiver_upload_bvid.py
index 13a632d..959a8ce 100644
--- a/biliarchiver/_biliarchiver_upload_bvid.py
+++ b/biliarchiver/_biliarchiver_upload_bvid.py
@@ -11,6 +11,7 @@ from biliarchiver.exception import VideosBasePathNotFoundError
 from biliarchiver.utils.identifier import human_readable_upper_part_map
 from biliarchiver.config import BILIBILI_IDENTIFIER_PERFIX, config
 from biliarchiver.utils.dirLock import UploadLock, AlreadyRunningError
+from biliarchiver.utils.xml_chars import xml_chars_legalize
 from biliarchiver.version import BILI_ARCHIVER_VERSION
 
 def upload_bvid(bvid: str, *, update_existing: bool = False, collection: str):
@@ -154,17 +155,17 @@ def _upload_bvid(bvid: str, *, update_existing: bool = False, collection: str):
             'scanner': f'biliarchiver v{BILI_ARCHIVER_VERSION} (dev)',
         }
 
-        # XML 中不能有 \b 等特殊控制字符，IA 会拒收。
-        # 先只简单删 \b ，如果以后再发现元数据里出现其它非法字符，再说。
-        _md_str = json.dumps(md, ensure_ascii=False)
-        if "\\b" in _md_str:
-            print("WARNING: \\b in metadata, removing it")
-            md = json.loads(_md_str.replace("\\b", ""))
-        del(_md_str)
-
         print(filedict)
         print(md)
 
+        # remove XML illegal characters
+        _md_before = hash(json.dumps(md))
+        md = xml_chars_legalize(obj=md)
+        assert isinstance(md, dict)
+        if hash(json.dumps(md)) != _md_before:
+            print(f"Removed XML illegal characters from metadata, cleaned metadata:")
+            print(md)
+
         if filedict:
             r = item.upload(
                 files=filedict,
@@ -186,18 +187,27 @@ def _upload_bvid(bvid: str, *, update_existing: bool = False, collection: str):
 
         new_md = {}
         if item.metadata.get("upload-state") != "uploaded":
-            new_md.update({"upload-state": "uploaded"})
+            new_md["upload-state"] = "uploaded"
         if item.metadata.get("creator") != md['creator']:
-            new_md.update({"creator": md['creator']})
+            new_md["creator"] = md['creator']
         if item.metadata.get("description", "") != bv_info['data']['View']['desc']:
-            new_md.update({"description": bv_info['data']['View']['desc']})
+            new_md["description"] = bv_info['data']['View']['desc']
         if item.metadata.get("scanner") != md['scanner']:
-            new_md.update({"scanner": md['scanner']})
+            new_md["scanner"] = md['scanner']
         if item.metadata.get("external-identifier") != md['external-identifier']:
-            new_md.update({"external-identifier": md['external-identifier']})
+            new_md["external-identifier"] = md['external-identifier']
         if new_md:
             print(f"Updating metadata:")
             print(new_md)
+
+            # remove XML illegal characters (snapshot new_md itself, not md, so the check is meaningful)
+            _md_before = hash(json.dumps(new_md))
+            new_md = xml_chars_legalize(obj=new_md)
+            assert isinstance(new_md, dict)
+            if hash(json.dumps(new_md)) != _md_before:
+                print(f"Removed XML illegal characters from metadata, cleaned metadata:")
+                print(new_md)
+
             r = item.modify_metadata(
                 metadata=new_md,
                 access_key=access_key,
diff --git a/biliarchiver/utils/xml_chars.py b/biliarchiver/utils/xml_chars.py
new file mode 100644
index 0000000..32a90d5
--- /dev/null
+++ b/biliarchiver/utils/xml_chars.py
@@ -0,0 +1,73 @@
+from typing import Union
+
+# Control characters stripped from metadata: invalid in XML 1.0, plus DEL (restricted in XML 1.1).
+_xml_ILLEGAL_CHARS = ['\x00']  # NUL is not a valid XML character in any XML version
+_xml_ILLEGAL_CHARS.extend(chr(i) for i in range(0x01, 0x08 + 1))
+_xml_ILLEGAL_CHARS.extend(chr(i) for i in range(0x0b, 0x0c + 1))
+_xml_ILLEGAL_CHARS.extend(chr(i) for i in range(0x0e, 0x1f + 1))
+_xml_ILLEGAL_CHARS.append('\x7f')
+# NOTE: U+0080-U+0084 and U+0086-U+009F (C1 controls) can occur in a Python str, but they are
+# valid XML 1.0 characters (only restricted in XML 1.1), so they are left untouched here.
+XML_ILLEGAL_CHARS = _xml_ILLEGAL_CHARS
+
+
+def _legalize_str(s: str, print_info: bool=False):
+    """ Return `s` with all XML_ILLEGAL_CHARS removed; report each removed char if `print_info`. """
+    for c in XML_ILLEGAL_CHARS:
+        if print_info and c in s:  # direct membership test instead of comparing hash(s)
+            print(f"Removed XML illegal char \\x{ord(c):02x}")
+        s = s.replace(c, '')
+    return s
+
+def _legalize_list(l: list):
+    """ Strip XML illegal chars from every str in `l`, recursing into nested dicts and lists.
+
+    The list is modified in place and is also returned; other value types are left as-is.
+    """
+    for pos, item in enumerate(l):
+        if isinstance(item, str):
+            l[pos] = _legalize_str(item)
+        elif isinstance(item, (dict, list)):
+            l[pos] = (_legalize_dict if isinstance(item, dict) else _legalize_list)(item)
+    return l
+
+def _legalize_dict(d: dict):
+    """ Strip XML illegal chars from every str value in `d`, recursing into nested lists and dicts.
+
+    Keys are not touched. The dict is modified in place and is also returned.
+    """
+    for key, value in d.items():
+        if isinstance(value, str):
+            d[key] = _legalize_str(value)
+        elif isinstance(value, (list, dict)):
+            d[key] = (_legalize_list if isinstance(value, list) else _legalize_dict)(value)
+    return d
+
+def xml_chars_legalize(obj: Union[dict, str, list]) -> Union[dict, str, list]:
+    """ Strip XML illegal characters from a str, or from the str values inside a dict or list.
+
+    Dicts and lists are cleaned in place and returned; any other type raises TypeError.
+    """
+    for kind, cleaner in ((str, _legalize_str), (dict, _legalize_dict), (list, _legalize_list)):
+        if isinstance(obj, kind):
+            return cleaner(obj)
+    # neither a str nor one of the supported containers
+    raise TypeError(f"Unexpected type: {type(obj)}")
+
+def _test_xml_chars_legalize():
+    """ Self-check of the legalize helpers; dirty inputs are rebuilt per case (cleaning is in place). """
+    def dirty_list():
+        return ['\x0bA', '\x0cB', 'C\v']
+    def dirty_dict():
+        return {'A': '\x0bA', 'B': '\x0cB', 'C': 'C\v', 'list': dirty_list()}
+    _list_after = ['A', 'B', 'C']
+    _dict_after = {'A': 'A', 'B': 'B', 'C': 'C', 'list': _list_after}
+    assert _legalize_str('\x0bA\x0cB\vC') == 'ABC'
+    assert _legalize_list(dirty_list()) == _list_after
+    assert _legalize_dict(dirty_dict()) == _dict_after
+    nested = {'dict': [{**dirty_dict(), 'dict': dirty_dict()}]}
+    assert _legalize_dict(nested) == {'dict': [{**_dict_after, 'dict': _dict_after}]}
+
+if __name__ == '__main__':
+    # Running this file directly executes the self-check above. TODO: use pytest
+    _test_xml_chars_legalize()
\ No newline at end of file