*upper_part

*refactor
*poetry
yzqzss 2023-06-07 03:38:26 +08:00
parent 3505eb975c
commit 191673d289
15 changed files with 271 additions and 96 deletions

6
.gitignore vendored

@@ -1,7 +1,7 @@
biliarchiver/
bvids/
sess_data.txt
biliarchiver.home
.venv/
__pycache__/
videos/
videos/
.vscode/
bilibili_archive_dir/


@@ -1,18 +1,21 @@
import json
import os
from pathlib import Path
import time
from internetarchive import get_item
from requests import Response
from rich import print
from glob import glob
from _biliarchiver_archive_bvid import BILIBILI_IDENTIFIER_PERFIX
from _uploadingLock import UploadLock, AlreadyRunningError
from biliarchiver.utils.string import human_readable_upper_part_map
from biliarchiver.config import BILIBILI_IDENTIFIER_PERFIX, config
from biliarchiver.utils.dirLock import UploadLock, AlreadyRunningError
from biliarchiver.version import BILI_ARCHIVER_VERSION
def upload_bvid(bvid):
try:
lock_dir = f'biliarchiver/.locks/{bvid}/'
lock_dir = config.storage_home_dir / '.locks' / bvid
os.makedirs(lock_dir, exist_ok=True)
with UploadLock(lock_dir):
with UploadLock(lock_dir): # type: ignore
_upload_bvid(bvid)
except AlreadyRunningError:
print(f'已经有一个上传 {bvid} 的进程在运行,跳过')
@@ -20,48 +23,55 @@ def upload_bvid(bvid):
print(f'上传 {bvid} 时出错:')
raise e
def _upload_bvid(bvid):
if not os.path.exists('biliarchiver.home'):
raise Exception('先创建 biliarchiver.home 文件')
access_key, secret_key = read_ia_keys(os.path.expanduser('~/.bili_ia_keys.txt'))
# sample: BiliBili-BV1Zh4y1x7RL_p3
videos_basepath = f'biliarchiver/videos/{bvid}'
for identifier in os.listdir(videos_basepath):
if os.path.exists(f'{videos_basepath}/{identifier}/_uploaded.mark'):
print(f'{identifier} 已经上传过了(_uploaded.mark)')
def _upload_bvid(bvid: str):
access_key, secret_key = read_ia_keys(config.ia_key_file)
# identifier format: BiliBili-{bvid}_p{pid}-{upper_part}
upper_part = human_readable_upper_part_map(string=bvid, backward=True)
OLD_videos_basepath: Path = config.storage_home_dir / 'videos' / bvid
videos_basepath: Path = config.storage_home_dir / 'videos' / f'{bvid}-{upper_part}'
if os.path.exists(OLD_videos_basepath):
print(f'检测到旧的视频主目录 {OLD_videos_basepath},将其重命名为 {videos_basepath}...')
os.rename(OLD_videos_basepath, videos_basepath)
for local_identifier in os.listdir(videos_basepath):
if os.path.exists(f'{videos_basepath}/{local_identifier}/_uploaded.mark'):
print(f'{local_identifier} 已经上传过了(_uploaded.mark)')
continue
if identifier.startswith('_') :
print(f'跳过 {identifier}')
if local_identifier.startswith('_') :
print(f'跳过 {local_identifier}')
continue
if not identifier.startswith(BILIBILI_IDENTIFIER_PERFIX):
print(f'{identifier} 不是以 {BILIBILI_IDENTIFIER_PERFIX} 开头的正确 identifier')
if not local_identifier.startswith(BILIBILI_IDENTIFIER_PERFIX):
print(f'{local_identifier} 不是以 {BILIBILI_IDENTIFIER_PERFIX} 开头的正确 local_identifier')
continue
if not os.path.exists(f'{videos_basepath}/{identifier}/_downloaded.mark'):
print(f'{identifier} 没有下载完成')
if not os.path.exists(f'{videos_basepath}/{local_identifier}/_downloaded.mark'):
print(f'{local_identifier} 没有下载完成')
continue
pid = identifier.split('_')[-1][1:]
file_basename = identifier[len(BILIBILI_IDENTIFIER_PERFIX)+1:]
pid = local_identifier.split('_')[-1][1:]
file_basename = local_identifier[len(BILIBILI_IDENTIFIER_PERFIX)+1:]
print(f'==== 开始上传 {identifier} ====')
item = get_item(identifier)
remote_identifier = f'{local_identifier}-{upper_part}'
print(f'=== 开始上传 {local_identifier} => {remote_identifier} ===')
item = get_item(remote_identifier)
if item.exists:
print(f'item {identifier} 已存在(item.exists)')
print(f'item {remote_identifier} 已存在(item.exists)')
if item.metadata.get("upload-state") == "uploaded":
print(f'{identifier} 已经上传过了,跳过(item.metadata.uploaded)')
with open(f'{videos_basepath}/{identifier}/_uploaded.mark', 'w', encoding='utf-8') as f:
print(f'{remote_identifier} 已经上传过了,跳过(item.metadata.uploaded)')
with open(f'{videos_basepath}/{local_identifier}/_uploaded.mark', 'w', encoding='utf-8') as f:
f.write('')
continue
filedict = {} # "remote filename": "local filename"
for filename in os.listdir(f'{videos_basepath}/{identifier}/extra'):
file = f'{videos_basepath}/{identifier}/extra/{filename}'
for filename in os.listdir(f'{videos_basepath}/{local_identifier}/extra'):
file = f'{videos_basepath}/{local_identifier}/extra/{filename}'
if os.path.isfile(file):
if file.startswith('_'):
continue
filedict[filename] = file
for filename in os.listdir(f'{videos_basepath}/{identifier}'):
file = f'{videos_basepath}/{identifier}/{filename}'
for filename in os.listdir(f'{videos_basepath}/{local_identifier}'):
file = f'{videos_basepath}/{local_identifier}/{filename}'
if os.path.isfile(file):
if os.path.basename(file).startswith('_'):
continue
@@ -74,10 +84,10 @@ def _upload_bvid(bvid):
for file_in_item in item.files:
if file_in_item["name"] in filedict:
filedict.pop(file_in_item["name"])
print(f"File {file_in_item['name']} already exists in {identifier}.")
print(f"File {file_in_item['name']} already exists in {remote_identifier}.")
with open(f'{videos_basepath}/{identifier}/extra/{file_basename}.info.json', 'r', encoding='utf-8') as f:
with open(f'{videos_basepath}/{local_identifier}/extra/{file_basename}.info.json', 'r', encoding='utf-8') as f:
bv_info = json.load(f)
# with open(f'{videos_basepath}/_videos_info.json', 'r', encoding='utf-8') as f:
# videos_info = json.load(f)
@@ -86,17 +96,22 @@ def _upload_bvid(bvid):
for tag in bv_info['data']['Tags']:
tags.append(tag['tag_name'])
pubdate = bv_info['data']['View']['pubdate']
cid = None
p_part = None
for page in bv_info['data']['View']['pages']:
if page['page'] == int(pid):
cid = page['cid']
part = page['part']
p_part = page['part']
break
assert cid is not None
assert p_part is not None
md = {
"mediatype": "movies",
"collection": 'opensource_movies',
"title": bv_info['data']['View']['title'] + f' P{pid} ' + part ,
"description": identifier + ' uploading...',
"title": bv_info['data']['View']['title'] + f' P{pid} ' + p_part ,
"description": remote_identifier + ' uploading...',
'creator': bv_info['data']['View']['owner']['name'], # UP 主
# UTC time
'date': time.strftime("%Y-%m-%d", time.gmtime(pubdate)),
@@ -110,8 +125,8 @@ def _upload_bvid(bvid):
), # Keywords should be separated by ; but it doesn't matter much; the alternative is to set one per field with subject[0], subject[1], ...
"upload-state": "uploading",
'originalurl': f'https://www.bilibili.com/video/{bvid}/?p={pid}',
'scanner': 'biliarchiver v0.0.5 (dev)',
}
'scanner': f'biliarchiver v{BILI_ARCHIVER_VERSION} (dev)',
}
print(filedict)
print(md)
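(The actual upload call is collapsed between these hunks. As a hedged sketch only — the flags below are illustrative assumptions, not read from this diff — the collected filedict and md would typically be handed to the internetarchive library roughly like this:)

r = item.upload(
    files=filedict,            # {"remote filename": "local filename"}
    metadata=md,
    access_key=access_key,
    secret_key=secret_key,
    verbose=True,              # assumed flag; the real value is not visible in this diff
    queue_derive=True,         # assumed flag
)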
@@ -127,11 +142,11 @@ def _upload_bvid(bvid):
)
tries = 30
item = get_item(identifier) # refresh item
item = get_item(remote_identifier) # refresh item
while not item.exists and tries > 0:
print(f"Waiting for item to be created ({tries}) ...", end='\r')
time.sleep(30)
item = get_item(identifier)
item = get_item(remote_identifier)
tries -= 1
new_md = {}
@@ -149,10 +164,11 @@ def _upload_bvid(bvid):
access_key=access_key,
secret_key=secret_key,
)
assert isinstance(r, Response)
r.raise_for_status()
with open(f'{videos_basepath}/{identifier}/_uploaded.mark', 'w', encoding='utf-8') as f:
with open(f'{videos_basepath}/{local_identifier}/_uploaded.mark', 'w', encoding='utf-8') as f:
f.write('')
print(f'==== {identifier} 上传完成 ====')
print(f'==== {remote_identifier} 上传完成 ====')
def read_ia_keys(keysfile):
''' Return: tuple(`access_key`, `secret_key`) '''
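The body of read_ia_keys is not shown in this diff. A minimal sketch consistent with its docstring — assuming the key file simply stores the access key on the first line and the secret key on the second — could be:

def read_ia_keys(keysfile):
    ''' Return: tuple(`access_key`, `secret_key`) '''
    with open(keysfile, 'r', encoding='utf-8') as f:
        key_lines = f.read().splitlines()
    access_key = key_lines[0].strip()
    secret_key = key_lines[1].strip()
    return access_key, secret_key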


@@ -1,5 +1,6 @@
import asyncio
import os
from pathlib import Path
import aiofiles
import httpx
@@ -12,9 +13,9 @@ from rich import print
import json
from bilix.sites.bilibili.downloader import DownloaderBilibili
BILIBILI_IDENTIFIER_PERFIX = 'BiliBili' # IA identifier 前缀,千万不要改。能与 tubeup 兼容。
from biliarchiver.config import BILIBILI_IDENTIFIER_PERFIX
from biliarchiver.config import config
from biliarchiver.utils.string import human_readable_upper_part_map
@raise_api_error
async def new_get_subtitle_info(client: httpx.AsyncClient, bvid, cid):
@@ -38,10 +39,17 @@ async def archive_bvid(d: DownloaderBilibili, bvid: str, logined: bool=False):
assert d.hierarchy is True, 'hierarchy 必须为 True' # 为保持后续目录结构、文件命名的一致性
assert d.client.cookies.get('SESSDATA') is not None, 'sess_data 不能为空' # 开个大会员呗,能下 4k 呢。
assert logined is True, '请先检查 SESSDATA 是否过期,再将 logined 设置为 True' # 防误操作
assert os.path.exists('biliarchiver.home'), '先创建 biliarchiver.home 文件' # 防误操作
videos_basepath = f'biliarchiver/videos/{bvid}'
if os.path.exists(f'{videos_basepath}/_all_downloaded.mark'):
upper_part = human_readable_upper_part_map(string=bvid, backward=True)
OLD_videos_basepath: Path = config.storage_home_dir / 'videos' / bvid
videos_basepath: Path = config.storage_home_dir / 'videos' / f'{bvid}-{upper_part}'
if os.path.exists(OLD_videos_basepath):
print(f'检测到旧的视频目录 {OLD_videos_basepath},将其重命名为 {videos_basepath}...')
os.rename(OLD_videos_basepath, videos_basepath)
if os.path.exists(videos_basepath / '_all_downloaded.mark'):
print(f'{bvid} 所有分p都已下载过了')
return
@@ -59,8 +67,8 @@ async def archive_bvid(d: DownloaderBilibili, bvid: str, logined: bool=False):
continue
file_basename = f'{bvid}_p{pid}'
video_basepath = f'{videos_basepath}/{BILIBILI_IDENTIFIER_PERFIX}-{file_basename}'
video_extrapath = f'{video_basepath}/extra'
video_basepath = videos_basepath / f'{BILIBILI_IDENTIFIER_PERFIX}-{file_basename}'
video_extrapath = video_basepath / 'extra'
if os.path.exists(f'{video_basepath}/_downloaded.mark'):
print(f'{file_basename}: 已经下载过了')
continue
@@ -89,14 +97,15 @@ async def archive_bvid(d: DownloaderBilibili, bvid: str, logined: bool=False):
for media in video_info.dash.videos:
if media.codec.startswith('hev'):
codec = media.codec
print(f'{file_basename}: "{codec}" "{media.quality}" ...')
break
if codec is None:
for media in video_info.dash.videos:
if media.codec.startswith('avc'):
codec = media.codec
print(f'{file_basename}: "{codec}" "{media.quality}" ...')
break
assert codec is not None, f'{file_basename}: 没有 avc 或 hevc 编码的视频'
print(f'{file_basename}: "{media.codec}" "{media.quality}" ...')
elif video_info.other:
print(f'{file_basename}: 未解析到dash资源交给 bilix 处理 ...')
codec = ''
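For a concrete sense of the refactored layout, here is a short sketch of the directories the new pathlib-based code builds, using the sample bvid and page number mentioned earlier in this diff (BiliBili-BV1Zh4y1x7RL_p3); the upper_part value in the comment was computed with the mapping defined later in this diff:

from pathlib import Path
from biliarchiver.utils.string import human_readable_upper_part_map

storage_home_dir = Path('bilibili_archive_dir/')   # default from config
bvid, pid = 'BV1Zh4y1x7RL', 3                      # sample values, for illustration

upper_part = human_readable_upper_part_map(string=bvid, backward=True)  # 'LR6Z1VB'
videos_basepath = storage_home_dir / 'videos' / f'{bvid}-{upper_part}'
video_basepath = videos_basepath / f'BiliBili-{bvid}_p{pid}'
video_extrapath = video_basepath / 'extra'
print(video_basepath)
# bilibili_archive_dir/videos/BV1Zh4y1x7RL-LR6Z1VB/BiliBili-BV1Zh4y1x7RL_p3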


@@ -2,23 +2,37 @@ import asyncio
import os
import argparse
from _biliarchiver_archive_bvid import archive_bvid
from biliarchiver.archive_bvid import archive_bvid
from biliarchiver.config import Config
from bilix.sites.bilibili.downloader import DownloaderBilibili
from rich.console import Console
from httpx import Client
from httpx import AsyncClient, Client
from rich.traceback import install
from biliarchiver.utils.string import human_readable_upper_part_map
install()
from _biliarchiver_archive_bvid import BILIBILI_IDENTIFIER_PERFIX
from biliarchiver.config import BILIBILI_IDENTIFIER_PERFIX
from dataclasses import dataclass
@dataclass
class Args:
cookies: str
bvids: str
skip_ia: bool
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--cookies', type=str, default='~/.cookies.txt')
parser.add_argument('--bvids', type=str, help='bvids 列表的文件路径', required=True)
parser.add_argument('--skip-exist', action='store_true',
help='跳过 IA 上已存在的 item (只检查 p1 是否存在)')
args = parser.parse_args()
parser.add_argument('--cookies', dest='cookies', type=str, default='~/.cookies.txt')
parser.add_argument('--bvids', dest='bvids', type=str, help='bvids 列表的文件路径', required=True)
parser.add_argument('-s', '--skip-ia-check', dest='skip_ia', action='store_true',
help='不检查 IA 上是否已存在对应 BVID 的 item ,直接开始下载')
parser.parse_args()
args = Args(**vars(parser.parse_args()))
return args
def check_ia_item_exist(client: Client, identifier: str) -> bool:
@@ -37,22 +51,22 @@ def check_ia_item_exist(client: Client, identifier: str) -> bool:
else:
raise ValueError(f'Unexpected code: {r_json["code"]}')
def main():
def _main():
args = parse_args()
assert args.bvids is not None, '必须指定 bvids 列表的文件路径'
with open(args.bvids, 'r', encoding='utf-8') as f:
bvids = f.read().splitlines()
bvids_from_file = f.read().splitlines()
config = Config()
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
from config import video_concurrency, part_concurrency, stream_retry
d = DownloaderBilibili(hierarchy=True, sess_data=None,
video_concurrency=video_concurrency,
part_concurrency=part_concurrency,
stream_retry=stream_retry,
d = DownloaderBilibili(hierarchy=True, sess_data=None, # sess_data 将在后面装载 cookies 时装载 # type: ignore
video_concurrency=config.video_concurrency,
part_concurrency=config.part_concurrency,
stream_retry=config.stream_retry,
)
update_cookies_from_file(d.client, args.cookies)
client = Client(cookies=d.client.cookies, headers=d.client.headers)
@@ -61,17 +75,18 @@ def main():
return
d.progress.start()
for index, bvid in enumerate(bvids):
if args.skip_exist:
identifier = f'{BILIBILI_IDENTIFIER_PERFIX}-{bvid}_p1'
if check_ia_item_exist(client, identifier):
print(f'IA 上已存在 {identifier} ,跳过')
for index, bvid in enumerate(bvids_from_file):
if not args.skip_ia:
upper_part = human_readable_upper_part_map(string=bvid, backward=True)
remote_identifier = f'{BILIBILI_IDENTIFIER_PERFIX}-{bvid}_p1-{upper_part}'
if check_ia_item_exist(client, remote_identifier):
print(f'IA 上已存在 {remote_identifier} ,跳过')
continue
while len(asyncio.all_tasks(loop)) > video_concurrency:
loop.run_until_complete(asyncio.sleep(0.01))
while len(asyncio.all_tasks(loop)) > config.video_concurrency:
loop.run_until_complete(asyncio.sleep(0.008))
print(f'=== {bvid} ({index+1}/{len(bvids)}) ===')
print(f'=== {bvid} ({index+1}/{len(bvids_from_file)}) ===')
task = loop.create_task(archive_bvid(d, bvid, logined=logined))
@@ -80,7 +95,7 @@ def main():
def update_cookies_from_file(client: Client, cookies_path: str):
def update_cookies_from_file(client: AsyncClient, cookies_path: str):
cookies_path = os.path.expanduser(cookies_path)
assert os.path.exists(cookies_path), f'cookies 文件不存在: {cookies_path}'
from http.cookiejar import MozillaCookieJar
@@ -90,11 +105,12 @@ def update_cookies_from_file(client: Client, cookies_path: str):
for cookie in cj:
# only load bilibili cookies
if 'bilibili' in cookie.domain:
assert cookie.value is not None
client.cookies.set(
cookie.name, cookie.value, domain=cookie.domain, path=cookie.path
)
loadded_cookies += 1
print(f'{cookies_path} 加载了 {loadded_cookies} cookies')
print(f'{cookies_path} 加载了 {loadded_cookies} cookies')
if loadded_cookies > 100:
print('可能加载了过多的 cookies可能导致 httpx.Client 响应非常慢')
@@ -111,12 +127,15 @@ def is_login(cilent: Client) -> bool:
print('未登录/SESSDATA无效/过期')
return False
if __name__ == '__main__':
def main():
try:
main()
_main()
except KeyboardInterrupt:
print('KeyboardInterrupt')
finally:
# 显示终端光标
console = Console()
console.show_cursor()
console.show_cursor()
if __name__ == '__main__':
main()
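The scheduling loop above caps concurrency by polling asyncio.all_tasks between run_until_complete calls. A self-contained sketch of that same throttling pattern (with a stand-in coroutine instead of this project's archive_bvid, purely for illustration):

import asyncio

async def fake_archive(bvid: str):
    # Stand-in for archive_bvid(d, bvid, ...): pretend to download something.
    await asyncio.sleep(0.1)
    print(f'{bvid} done')

def run_throttled(bvids, video_concurrency: int = 3):
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    for bvid in bvids:
        # Block until the number of unfinished tasks drops below the limit.
        while len(asyncio.all_tasks(loop)) > video_concurrency:
            loop.run_until_complete(asyncio.sleep(0.008))
        loop.create_task(fake_archive(bvid))
    # Drain whatever is still pending, then close the loop.
    while len(asyncio.all_tasks(loop)) > 0:
        loop.run_until_complete(asyncio.sleep(0.008))
    loop.close()

run_throttled([f'BV_demo_{i}' for i in range(10)])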


@@ -0,0 +1,14 @@
from biliarchiver._biliarchiver_upload_bvid import upload_bvid
from biliarchiver.config import config
import os
def main():
for bvid_with_upper_part in os.listdir(config.storage_home_dir / 'videos'):
bvid = bvid_with_upper_part
if '-' in bvid_with_upper_part:
bvid = bvid_with_upper_part.split('-')[0]
upload_bvid(bvid)
if __name__ == '__main__':
main()
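Because a bvid contains no '-', splitting the new '{bvid}-{upper_part}' directory name on the first '-' recovers the bvid, e.g. (values taken from the docstring example later in this diff):

bvid_with_upper_part = 'BV1HP411D7Rj-1R1D3PH1VB'
bvid = bvid_with_upper_part.split('-')[0]
print(bvid)  # BV1HP411D7Rj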

56
biliarchiver/config.py Normal file

@@ -0,0 +1,56 @@
from dataclasses import dataclass
import os
import json
from pathlib import Path
CONFIG_FILE = 'config.json'
BILIBILI_IDENTIFIER_PERFIX = 'BiliBili' # IA identifier 前缀。
class singleton(type):
_instances = {}
def __call__(cls, *args, **kwargs):
if cls not in cls._instances:
cls._instances[cls] = super(singleton, cls).__call__(*args, **kwargs)
return cls._instances[cls]
@dataclass
class Config(metaclass=singleton):
video_concurrency: int = 3
part_concurrency: int = 10
stream_retry: int = 20
storage_home_dir: Path = Path('bilibili_archive_dir/').expanduser()
ia_key_file: Path = Path('~/.bili_ia_keys.txt').expanduser()
def __init__(self):
self.is_right_pwd()
if not os.path.exists(CONFIG_FILE):
print(f'{CONFIG_FILE} 不存在,创建中...')
self.save()
with open(CONFIG_FILE, 'r', encoding='utf-8') as f:
print(f'读取 {CONFIG_FILE}...')
config_file = json.load(f)
self.video_concurrency: int = config_file['video_concurrency']
self.part_concurrency: int = config_file['part_concurrency']
self.stream_retry: int = config_file['stream_retry']
self.storage_home_dir: Path = Path(config_file['storage_home_dir']).expanduser()
self.ia_key_file: Path = Path(config_file['ia_key_file']).expanduser()
def save(self):
with open(CONFIG_FILE, 'w', encoding='utf-8') as f:
json.dump({
'video_concurrency': self.video_concurrency,
'part_concurrency': self.part_concurrency,
'stream_retry': self.stream_retry,
'storage_home_dir': str(self.storage_home_dir),
'ia_key_file': str(self.ia_key_file),
}, f, ensure_ascii=False, indent=4)
def is_right_pwd(self):
if not os.path.exists('biliarchiver.home'):
raise Exception('先在当前工作目录创建 biliarchiver.home 文件')
config = Config()
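A short usage sketch of this config object (run from a directory containing the biliarchiver.home marker, per is_right_pwd): the singleton metaclass makes every Config() call resolve to the same cached instance backed by config.json, and save() writes changes back:

from biliarchiver.config import Config, config

c = Config()
assert c is config                 # singleton: the cached instance is reused
print(config.video_concurrency, config.storage_home_dir)

config.video_concurrency = 5       # tweak a value...
config.save()                      # ...and persist it to config.json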


@@ -54,6 +54,7 @@ class UploadLock_Fcntl():
self.lock_file_fd = None
def __enter__(self):
assert self.fcntl is not None
self.lock_file_fd = open(self.lock_file, 'w')
try:
self.fcntl.lockf(self.lock_file_fd, self.fcntl.LOCK_EX | self.fcntl.LOCK_NB)
@@ -63,6 +64,7 @@ class UploadLock_Fcntl():
def __exit__(self, exc_type, exc_val, exc_tb):
assert self.fcntl is not None
if self.lock_file_fd is None:
raise IOError("Lock file not opened.")
self.fcntl.lockf(self.lock_file_fd, self.fcntl.LOCK_UN)
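The fcntl variant above follows the usual POSIX advisory-lock pattern. A standalone sketch of the same idea (not this module's API, just an illustration of fcntl.lockf with LOCK_EX | LOCK_NB):

import fcntl
import os

class SimpleDirLock:
    ''' Exclusive, non-blocking advisory lock on <lock_dir>/.lock (POSIX only). '''

    def __init__(self, lock_dir: str):
        os.makedirs(lock_dir, exist_ok=True)
        self.lock_file = os.path.join(lock_dir, '.lock')
        self.lock_file_fd = None

    def __enter__(self):
        self.lock_file_fd = open(self.lock_file, 'w')
        try:
            # Raises OSError immediately if another process already holds the lock.
            fcntl.lockf(self.lock_file_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except OSError:
            self.lock_file_fd.close()
            raise
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        fcntl.lockf(self.lock_file_fd, fcntl.LOCK_UN)
        self.lock_file_fd.close()

with SimpleDirLock('/tmp/biliarchiver-demo-lock'):
    print('lock held; do the upload here')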


@@ -0,0 +1,37 @@
''' 为同一字符串序列的不同大小写形式生成不碰撞的字符串。以便在大小写不敏感的系统中存储同一字符串的不同形式。 '''
from io import StringIO
def human_readable_upper_part_map(string: str, backward: bool):
''' 找到字符串中所有的 ASCII 大写字母,并返回一个能表示他们绝对位置的字符串。
其中每个非相邻的大写字母之间用数字表示相隔的字符数
params: backward: 可以表示是正着看还是倒着看
NOTE: 在我们的用例中我们 backward = True 这样产生的 upper_part 就不太像 BV 号或者类似的编号以免 upper_part 污染全文搜索
例如
backward = False
BV1HP411D7Rj -> BV1HP3D1R 长得像 bvid
backward = True
BV1HP411D7Rj -> 1R1D3PH1VB
'''
assert backward
if backward:
string = string[::-1]
result = StringIO()
steps = 0
for char in string:
if char.isascii() and char.isupper():
if steps == 0:
result.write(char)
else:
result.write(f'{steps}{char}')
steps = 0
else:
steps += 1
return result.getvalue()
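A quick sanity check of this mapping, reusing the docstring's own example (so the expected outputs are taken from above, not invented):

from biliarchiver.utils.string import human_readable_upper_part_map

print(human_readable_upper_part_map(string='BV1HP411D7Rj', backward=True))
# -> '1R1D3PH1VB'  (matches the docstring example)
print(human_readable_upper_part_map(string='bv1hp411d7rj', backward=True))
# -> ''  (an all-lowercase variant has no uppercase letters, so the two
#         case variants get different, non-colliding suffixes)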

1
biliarchiver/version.py Normal file

@@ -0,0 +1 @@
BILI_ARCHIVER_VERSION = '0.0.9'


@@ -1,9 +0,0 @@
from _biliarchiver_upload_bvid import upload_bvid
import os
def main():
for bvid in os.listdir('biliarchiver/videos'):
upload_bvid(bvid)
if __name__ == '__main__':
main()

7
config.json Normal file

@@ -0,0 +1,7 @@
{
"video_concurrency": 3,
"part_concurrency": 10,
"stream_retry": 20,
"storage_home_dir": "bilibili_archive_dir",
"ia_key_file": "/home/yzqzss/.bili_ia_keys.txt"
}


@@ -1,3 +0,0 @@
video_concurrency = 3
part_concurrency = 10
stream_retry = 20

7
poetry.lock generated Normal file

@@ -0,0 +1,7 @@
# This file is automatically @generated by Poetry and should not be changed by hand.
package = []
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "c595a0588c25d58f3e3834ad7169126836d262b925fe6ca9b5d540dcf301d254"

19
pyproject.toml Normal file

@@ -0,0 +1,19 @@
[tool.poetry]
name = "bilibili-archiver"
version = "0.1.0"
description = ""
authors = ["yzqzss <yzqzss@yandex.com>"]
readme = "README.md"
packages = [{include = "biliarchiver"}]
[tool.poetry.dependencies]
python = "^3.9"
[tool.poetry.scripts]
bili_archive_bvids = "biliarchiver:bili_archive_bvids.main"
bili_uploade = "biliarchiver:bili_uploade.main"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"