diff --git a/_biliup_archive_bvid.py b/_biliup_archive_bvid.py index c7c8d2a..5eee4ec 100644 --- a/_biliup_archive_bvid.py +++ b/_biliup_archive_bvid.py @@ -1,12 +1,5 @@ -""" -bilix 提供了各个网站的api,如果你有需要当然可以使用,并且它们都是异步的 - -bilix provides api for various websites. You can use them if you need, and they are asynchronous -""" import asyncio import os -import shutil -import time import aiofiles import httpx @@ -14,7 +7,6 @@ from bilix.download.utils import raise_api_error, req_retry from bilix.exception import APIError from bilix.sites.bilibili import api -from httpx import AsyncClient from rich import print import json @@ -22,7 +14,7 @@ import json from bilix.sites.bilibili.downloader import DownloaderBilibili -identifier_perfix = 'BiliBili' +BILIBILI_IDENTIFIER_PERFIX = 'BiliBili' # IA identifier 前缀,千万不要改。能与 tubeup 兼容。 @raise_api_error async def new_get_subtitle_info(client: httpx.AsyncClient, bvid, cid): @@ -31,46 +23,47 @@ async def new_get_subtitle_info(client: httpx.AsyncClient, bvid, cid): info = json.loads(res.text) if info['code'] == -400: raise APIError(f'未找到字幕信息', params) - # return lan + + # 这里 monkey patch 一下把返回 lan_doc 改成返回 lan,这样生成的字幕文件名就是 语言代码 而不是 中文名 了 + # 例如 + # lan_doc: 中文(中国) + # lang: zh-CN + + # return [[f'http:{i["subtitle_url"]}', i['lan_doc']] for i in info['data']['subtitle']['subtitles']] return [[f'http:{i["subtitle_url"]}', i['lan']] for i in info['data']['subtitle']['subtitles']] api.get_subtitle_info = new_get_subtitle_info async def archive_bvid(d: DownloaderBilibili, bvid: str): - assert d.hierarchy is True, 'hierarchy 必须为 True' # 为了保持后续目录结构、文件命名的一致性 + assert d.hierarchy is True, 'hierarchy 必须为 True' # 为保持后续目录结构、文件命名的一致性 assert d.client.cookies.get('SESSDATA') is not None, 'sess_data 不能为空' # 开个大会员呗,能下 4k 呢。 assert os.path.exists('biliup.home'), '先创建 biliup.home 文件' # 防误操作 - url = f'https://www.bilibili.com/video/{bvid}/' - # data = await api.get_video_info(client, "https://www.bilibili.com/video/BV1jK4y1N7ST?p=5") - - # d.update_cookies_from_browser('firefox') - videos_basepath = f'biliup/videos/{bvid}' if os.path.exists(f'{videos_basepath}/_all_downloaded.mark'): print(f'{bvid} 所有分p都已下载过了') return - videos_info = await api.get_video_info(d.client, url) + + url = f'https://www.bilibili.com/video/{bvid}/' + # 为了获取 pages,先请求一次 + first_video_info = await api.get_video_info(d.client, url) + os.makedirs(videos_basepath, exist_ok=True) - - # async with aiofiles.open(f'{videos_basepath}/_videos_info.json', 'w', encoding='utf-8') as f: - # # 用于 debug 的来自 bilix 输出的视频信息,包含用户敏感信息(mid 等) - # await f.write(json.dumps(videos_info.dict(), ensure_ascii=False, indent=4)) - pid = 0 - for page in videos_info.pages: - pid += 1 + for page in first_video_info.pages: + pid += 1 # pid 从 1 开始 if not page.p_url.endswith(f'?p={pid}'): - print(f'{bvid} 的第 {pid}P 不存在') + print(f'{bvid} 的 P{pid} 不存在 (可能视频被 UP主/B站 删了)') continue file_basename = f'{bvid}_p{pid}' - video_basepath = f'{videos_basepath}/{identifier_perfix}-{file_basename}' + video_basepath = f'{videos_basepath}/{BILIBILI_IDENTIFIER_PERFIX}-{file_basename}' video_extrapath = f'{video_basepath}/extra' if os.path.exists(f'{video_basepath}/_downloaded.mark'): - print(f'{bvid} 的第 {pid}p 已经下载过了') + print(f'{file_basename}: 已经下载过了') continue + video_info = await api.get_video_info(d.client, page.p_url) os.makedirs(video_basepath, exist_ok=True) os.makedirs(video_extrapath, exist_ok=True) @@ -79,10 +72,16 @@ async def archive_bvid(d: DownloaderBilibili, bvid: str): old_p_name = video_info.pages[video_info.p].p_name old_h1_title = video_info.h1_title - video_info.pages[video_info.p].p_name = file_basename - video_info.h1_title = 'tttttt' * 50 # 假装超长标题,强制 bilix fallback 到 file_basename 作为文件名 + # 在 d.hierarchy is True 且 h1_title 超长的情况下, bilix 会将 p_name 作为文件名 + video_info.pages[video_info.p].p_name = file_basename # 所以这里覆盖 p_name 为 file_basename + video_info.h1_title = 'iiiiii' * 50 # 然后假装超长标题 + # 这样 bilix 保存的文件名就是我们想要的了(谁叫 bilix 不支持自定义文件名呢) + # NOTE: p_name 似乎也不宜过长,否则还是会被 bilix 截断。 + # 但是我们以 {bvid}_p{pid} 作为文件名,这个长度是没问题的。 - # 选择编码,优先 hevc + + # 选择编码,优先 hevc,没有的话就 avc + # 不选 av0 ,毕竟目前没几个设备能拖得动 codec = None for media in video_info.dash.videos: if media.codec.startswith('hev'): @@ -93,26 +92,32 @@ async def archive_bvid(d: DownloaderBilibili, bvid: str): if media.codec.startswith('avc'): codec = media.codec break - assert codec is not None, f'{bvid}_{pid}p: 没有 avc 或 hevc 编码的视频' - print(f'{bvid}_{pid}p: "{media.codec}" "{media.quality}" ...') + assert codec is not None, f'{file_basename}: 没有 avc 或 hevc 编码的视频' + print(f'{file_basename}: "{media.codec}" "{media.quality}" ...') cor1 = d.get_video(page.p_url ,video_info=video_info, path=video_basepath, - quality=0, codec=codec, # 选择最高画质 + quality=0, # 选择最高画质 + codec=codec, # 编码 # 下载 ass 弹幕(bilix 会自动调用 danmukuC 将 pb 弹幕转为 ass)、封面、字幕 - # 他们会被放进 extra 子目录里 + # 弹幕、封面、字幕都会被放进 extra 子目录里,所以需要 d.hierarchy is True dm=True, image=True, subtitle=True ) # 下载原始的 pb 弹幕 cor2 = d.get_dm(page.p_url, video_info=video_info, path=video_extrapath) - # 下载视频详细信息 + # 下载视频超详细信息(BV 级别,不是分 P 级别) cor3 = download_bilibili_video_detail(d.client, bvid, f'{video_extrapath}/{file_basename}.info.json') await asyncio.gather(cor1, cor2, cor3) + # 还原为了自定义文件名而做的覆盖 video_info.pages[video_info.p].p_name = old_p_name video_info.h1_title = old_h1_title + # 单 p 下好了 async with aiofiles.open(f'{video_basepath}/_downloaded.mark', 'w', encoding='utf-8') as f: await f.write('') + + + # bv 对应的全部 p 下好了 async with aiofiles.open(f'{videos_basepath}/_all_downloaded.mark', 'w', encoding='utf-8') as f: await f.write('') @@ -124,7 +129,7 @@ async def download_bilibili_video_detail(client, bvid, filename): print(f'{bvid} 视频详情已存在') return # url = 'https://api.bilibili.com/x/web-interface/view' - url = 'https://api.bilibili.com/x/web-interface/view/detail' # 超详细 + url = 'https://api.bilibili.com/x/web-interface/view/detail' # 超详细 API(BV 级别,不是分 P 级别) params = {'bvid': bvid} r = await req_retry(client, url, params=params ,follow_redirects=True) r.raise_for_status() @@ -133,8 +138,3 @@ async def download_bilibili_video_detail(client, bvid, filename): # f.write(json.dumps(r.json(), indent=4, ensure_ascii=False)) await f.write(r.text) print(f'{bvid} 视频详情已保存') - -# d = DownloaderBilibili(video_concurrency=2, part_concurrency=1, hierarchy=True, sess_data=None) -# d.progress.start() -# asyncio.run(archive_bvid(d=d, bvid='BV1Zh4y1x7RL')) -# d.progress.stop() \ No newline at end of file diff --git a/_biliup_upload_bvid.py b/_biliup_upload_bvid.py index 726471c..0bb35ad 100644 --- a/_biliup_upload_bvid.py +++ b/_biliup_upload_bvid.py @@ -30,7 +30,7 @@ def upload_bvid(bvid): pid = identifier.split('_')[-1][1:] file_basename = identifier[len(identifier_perfix)+1:] - print(f'开始上传 {identifier}') + print(f'==== 开始上传 {identifier} ====') item = get_item(identifier) if item.exists: print(f'item {identifier} 已存在(item.exists)') @@ -96,8 +96,7 @@ def upload_bvid(bvid): ), # Keywords should be separated by ; but it doesn't matter much; the alternative is to set one per field with subject[0], subject[1], ... "upload-state": "uploading", 'originalurl': f'https://www.bilibili.com/video/{bvid}/?p={pid}', - 'project': 'bilibili top100 daily archive', - 'scanner': 'biliup v2233.0.2 (dev)', + 'scanner': 'biliup v2233.0.3 (dev)', } print(filedict) print(md) @@ -136,7 +135,7 @@ def upload_bvid(bvid): r.raise_for_status() with open(f'{videos_basepath}/{identifier}/_uploaded.mark', 'w', encoding='utf-8') as f: f.write('') - print(f'{identifier} 上传完成') + print(f'==== {identifier} 上传完成 ====') def read_ia_keys(keysfile): ''' Return: tuple(`access_key`, `secret_key`) ''' diff --git a/biliup_archive_daily_bvids.py b/biliup_archive_bvids.py similarity index 80% rename from biliup_archive_daily_bvids.py rename to biliup_archive_bvids.py index e1b037b..8cb7b1c 100644 --- a/biliup_archive_daily_bvids.py +++ b/biliup_archive_bvids.py @@ -15,27 +15,28 @@ install() def parse_args(): parser = argparse.ArgumentParser() today = datetime.date.today() - parser.add_argument('--sess-data', type=str, default=get_sess_data()) - parser.add_argument('--bvids', type=str, default=f'bvids/bvids-{today.isoformat()}.txt') + parser.add_argument('--sess-data', type=str, default=get_sess_data(), + help='cookie SESSDATA。不指定则会从 ~/.sess_data.txt 读取,指定则直接使用提供的字符串') + parser.add_argument('--bvids', type=str, help='bvids 列表的文件路径') args = parser.parse_args() return args def main(): args = parse_args() - print(args.sess_data) + + assert args.bvids is not None, '必须指定 bvids 列表的文件路径' with open(args.bvids, 'r', encoding='utf-8') as f: bvids = f.read().splitlines() loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) - from tasks_limit import tasks_limit + from config import tasks_limit d = DownloaderBilibili(video_concurrency=tasks_limit, part_concurrency=1, hierarchy=True, sess_data=args.sess_data, ) d.progress.start() for bvid in bvids: - # 限制同时下载的数量 while len(asyncio.all_tasks(loop)) > tasks_limit: loop.run_until_complete(asyncio.sleep(0.01)) task = loop.create_task(archive_bvid(d, bvid)) diff --git a/biliup_get_bvids_from_top100_rank_by_rid.py b/biliup_get_bvids_from_top100_rank_by_rid.py new file mode 100644 index 0000000..0b10dfc --- /dev/null +++ b/biliup_get_bvids_from_top100_rank_by_rid.py @@ -0,0 +1,43 @@ +import os +import time +import requests +import json +import argparse + +def arg_parse(): + parser = argparse.ArgumentParser() + parser.add_argument('--rid', type=int, default=0, help='目标排行 rid,0 为全站排行榜 [default: 0]') + args = parser.parse_args() + return args + + + +def main(): + args = arg_parse() + rid: int = args.rid + bilibili_ranking_api = "https://api.bilibili.com/x/web-interface/ranking/v2" + bilibili_ranking_params = { + "rid": rid, + "type": "all" + } + + r = requests.get(bilibili_ranking_api, params=bilibili_ranking_params) + r.raise_for_status() + ranking_json = json.loads(r.text) + assert ranking_json['code'] == 0 # 0 为成功(HTTP 200 不能信) + + ranking = ranking_json['data']['list'] + bvids = [] + for video_info in ranking: + # print(video_info['title'], video_info['bvid'], video_info['pic']) + bvid = video_info['bvid'] + bvids.append(bvid) + + import datetime + today = datetime.date.today() + os.makedirs('bvids', exist_ok=True) + + with open(f'bvids/rank/by-rid/rid-{rid}-{int(time.time())}.txt', 'w', encoding='utf-8') as f: + for bvid in bvids: + f.write(f'{bvid}' + '\n') + print(f'已保存 {len(bvids)} 个 bvid 到 bvids/bvids-{today.isoformat()}.txt') \ No newline at end of file diff --git a/biliup_get_daily_bvids.py b/biliup_get_daily_bvids.py deleted file mode 100644 index 6f8d7e2..0000000 --- a/biliup_get_daily_bvids.py +++ /dev/null @@ -1,70 +0,0 @@ -import asyncio -import os -import sys -import requests -import json -from bilix.sites.bilibili import DownloaderBilibili -from internetarchive import get_item - - -bilibili_ranking_api = "https://api.bilibili.com/x/web-interface/ranking/v2" -bilibili_ranking_params = { - "rid": 0, - "type": "all" -} - -r = requests.get(bilibili_ranking_api, params=bilibili_ranking_params) -ranking_json = json.loads(r.text) -assert ranking_json['code'] == 0 - -ranking = ranking_json['data']['list'] -bvids = [] -for video_info in ranking: - # print(video_info['title'], video_info['bvid'], video_info['pic']) - bvid = video_info['bvid'] - bvids.append(bvid) - -import datetime -today = datetime.date.today() -os.makedirs('bvids', exist_ok=True) -with open(f'bvids/bvids-{today.isoformat()}.txt', 'w', encoding='utf-8') as f: - for bvid in bvids: - f.write(f'{bvid}' + '\n') - - - -# print(bvid) -# assert isinstance(bvid, str) - -# v = video.Video(bvid=bvid) -# video_info = sync(v.get_info()) - -# with open(f'bili/video/{bvid}/video-{bvid}.info.json', 'w', encoding='utf-8') as f: -# json.dump(video_info, f, ensure_ascii=False, indent=4) - -# # with open('ranking.json', 'w', encoding='utf-8') as f: -# # json.dump(ranking_json, f, ensure_ascii=False, indent=4) - - -# async def main(): -# d = DownloaderBilibili(video_concurrency=5, part_concurrency=10, hierarchy=False, -# sess_data=sess_data) - -# d.progress.start() -# # cor1 = d.get_series( -# # 'https://www.bilibili.com/bangumi/play/ss28277' -# # , quality=0) -# # cor2 = d.get_up(url_or_mid='436482484', quality=0) -# os.makedirs(f'bili/video/{bvid}', exist_ok=True) -# cor3 = d.get_series(url=f'https://www.bilibili.com/video/{bvid}', -# dm=True, quality=0, image=True, subtitle=True, path=f'bili/video/{bvid}') - -# await asyncio.gather(cor3) -# await d.aclose() - - -# if __name__ == '__main__': -# # asyncio.run(main()) -# pass - - diff --git a/biliup_upload_daily_bvids.py b/biliup_upload_to_ia_from_default_biliup_video_dir.py similarity index 100% rename from biliup_upload_daily_bvids.py rename to biliup_upload_to_ia_from_default_biliup_video_dir.py diff --git a/tasks_limit.py b/config.py similarity index 100% rename from tasks_limit.py rename to config.py