This commit is contained in:
yzqzss 2023-06-03 02:40:11 +08:00
parent ada66940f2
commit cf75a3e914
7 changed files with 93 additions and 120 deletions

View File

@ -1,12 +1,5 @@
"""
bilix 提供了各个网站的api如果你有需要当然可以使用并且它们都是异步的
bilix provides api for various websites. You can use them if you need, and they are asynchronous
"""
import asyncio
import os
import shutil
import time
import aiofiles
import httpx
@ -14,7 +7,6 @@ from bilix.download.utils import raise_api_error, req_retry
from bilix.exception import APIError
from bilix.sites.bilibili import api
from httpx import AsyncClient
from rich import print
import json
@ -22,7 +14,7 @@ import json
from bilix.sites.bilibili.downloader import DownloaderBilibili
identifier_perfix = 'BiliBili'
BILIBILI_IDENTIFIER_PERFIX = 'BiliBili' # IA identifier 前缀,千万不要改。能与 tubeup 兼容。
@raise_api_error
async def new_get_subtitle_info(client: httpx.AsyncClient, bvid, cid):
@ -31,46 +23,47 @@ async def new_get_subtitle_info(client: httpx.AsyncClient, bvid, cid):
info = json.loads(res.text)
if info['code'] == -400:
raise APIError(f'未找到字幕信息', params)
# return lan
# 这里 monkey patch 一下把返回 lan_doc 改成返回 lan这样生成的字幕文件名就是 语言代码 而不是 中文名 了
# 例如
# lan_doc: 中文(中国)
# lang: zh-CN
# return [[f'http:{i["subtitle_url"]}', i['lan_doc']] for i in info['data']['subtitle']['subtitles']]
return [[f'http:{i["subtitle_url"]}', i['lan']] for i in info['data']['subtitle']['subtitles']]
api.get_subtitle_info = new_get_subtitle_info
async def archive_bvid(d: DownloaderBilibili, bvid: str):
assert d.hierarchy is True, 'hierarchy 必须为 True' # 为了保持后续目录结构、文件命名的一致性
assert d.hierarchy is True, 'hierarchy 必须为 True' # 为保持后续目录结构、文件命名的一致性
assert d.client.cookies.get('SESSDATA') is not None, 'sess_data 不能为空' # 开个大会员呗,能下 4k 呢。
assert os.path.exists('biliup.home'), '先创建 biliup.home 文件' # 防误操作
url = f'https://www.bilibili.com/video/{bvid}/'
# data = await api.get_video_info(client, "https://www.bilibili.com/video/BV1jK4y1N7ST?p=5")
# d.update_cookies_from_browser('firefox')
videos_basepath = f'biliup/videos/{bvid}'
if os.path.exists(f'{videos_basepath}/_all_downloaded.mark'):
print(f'{bvid} 所有分p都已下载过了')
return
videos_info = await api.get_video_info(d.client, url)
url = f'https://www.bilibili.com/video/{bvid}/'
# 为了获取 pages先请求一次
first_video_info = await api.get_video_info(d.client, url)
os.makedirs(videos_basepath, exist_ok=True)
# async with aiofiles.open(f'{videos_basepath}/_videos_info.json', 'w', encoding='utf-8') as f:
# # 用于 debug 的来自 bilix 输出的视频信息包含用户敏感信息mid 等)
# await f.write(json.dumps(videos_info.dict(), ensure_ascii=False, indent=4))
pid = 0
for page in videos_info.pages:
pid += 1
for page in first_video_info.pages:
pid += 1 # pid 从 1 开始
if not page.p_url.endswith(f'?p={pid}'):
print(f'{bvid} {pid}P 不存在')
print(f'{bvid} 的 P{pid} 不存在 (可能视频被 UP主/B站 删了)')
continue
file_basename = f'{bvid}_p{pid}'
video_basepath = f'{videos_basepath}/{identifier_perfix}-{file_basename}'
video_basepath = f'{videos_basepath}/{BILIBILI_IDENTIFIER_PERFIX}-{file_basename}'
video_extrapath = f'{video_basepath}/extra'
if os.path.exists(f'{video_basepath}/_downloaded.mark'):
print(f'{bvid} 的第 {pid}p 已经下载过了')
print(f'{file_basename}: 已经下载过了')
continue
video_info = await api.get_video_info(d.client, page.p_url)
os.makedirs(video_basepath, exist_ok=True)
os.makedirs(video_extrapath, exist_ok=True)
@ -79,10 +72,16 @@ async def archive_bvid(d: DownloaderBilibili, bvid: str):
old_p_name = video_info.pages[video_info.p].p_name
old_h1_title = video_info.h1_title
video_info.pages[video_info.p].p_name = file_basename
video_info.h1_title = 'tttttt' * 50 # 假装超长标题,强制 bilix fallback 到 file_basename 作为文件名
# 在 d.hierarchy is True 且 h1_title 超长的情况下, bilix 会将 p_name 作为文件名
video_info.pages[video_info.p].p_name = file_basename # 所以这里覆盖 p_name 为 file_basename
video_info.h1_title = 'iiiiii' * 50 # 然后假装超长标题
# 这样 bilix 保存的文件名就是我们想要的了(谁叫 bilix 不支持自定义文件名呢)
# NOTE: p_name 似乎也不宜过长,否则还是会被 bilix 截断。
# 但是我们以 {bvid}_p{pid} 作为文件名,这个长度是没问题的。
# 选择编码,优先 hevc
# 选择编码,优先 hevc没有的话就 avc
# 不选 av0 ,毕竟目前没几个设备能拖得动
codec = None
for media in video_info.dash.videos:
if media.codec.startswith('hev'):
@ -93,26 +92,32 @@ async def archive_bvid(d: DownloaderBilibili, bvid: str):
if media.codec.startswith('avc'):
codec = media.codec
break
assert codec is not None, f'{bvid}_{pid}p: 没有 avc 或 hevc 编码的视频'
print(f'{bvid}_{pid}p: "{media.codec}" "{media.quality}" ...')
assert codec is not None, f'{file_basename}: 没有 avc 或 hevc 编码的视频'
print(f'{file_basename}: "{media.codec}" "{media.quality}" ...')
cor1 = d.get_video(page.p_url ,video_info=video_info, path=video_basepath,
quality=0, codec=codec, # 选择最高画质
quality=0, # 选择最高画质
codec=codec, # 编码
# 下载 ass 弹幕(bilix 会自动调用 danmukuC 将 pb 弹幕转为 ass)、封面、字幕
# 他们会被放进 extra 子目录里
# 弹幕、封面、字幕都会被放进 extra 子目录里,所以需要 d.hierarchy is True
dm=True, image=True, subtitle=True
)
# 下载原始的 pb 弹幕
cor2 = d.get_dm(page.p_url, video_info=video_info, path=video_extrapath)
# 下载视频详细信息
# 下载视频详细信息BV 级别,不是分 P 级别)
cor3 = download_bilibili_video_detail(d.client, bvid, f'{video_extrapath}/{file_basename}.info.json')
await asyncio.gather(cor1, cor2, cor3)
# 还原为了自定义文件名而做的覆盖
video_info.pages[video_info.p].p_name = old_p_name
video_info.h1_title = old_h1_title
# 单 p 下好了
async with aiofiles.open(f'{video_basepath}/_downloaded.mark', 'w', encoding='utf-8') as f:
await f.write('')
# bv 对应的全部 p 下好了
async with aiofiles.open(f'{videos_basepath}/_all_downloaded.mark', 'w', encoding='utf-8') as f:
await f.write('')
@ -124,7 +129,7 @@ async def download_bilibili_video_detail(client, bvid, filename):
print(f'{bvid} 视频详情已存在')
return
# url = 'https://api.bilibili.com/x/web-interface/view'
url = 'https://api.bilibili.com/x/web-interface/view/detail' # 超详细
url = 'https://api.bilibili.com/x/web-interface/view/detail' # 超详细 APIBV 级别,不是分 P 级别)
params = {'bvid': bvid}
r = await req_retry(client, url, params=params ,follow_redirects=True)
r.raise_for_status()
@ -133,8 +138,3 @@ async def download_bilibili_video_detail(client, bvid, filename):
# f.write(json.dumps(r.json(), indent=4, ensure_ascii=False))
await f.write(r.text)
print(f'{bvid} 视频详情已保存')
# d = DownloaderBilibili(video_concurrency=2, part_concurrency=1, hierarchy=True, sess_data=None)
# d.progress.start()
# asyncio.run(archive_bvid(d=d, bvid='BV1Zh4y1x7RL'))
# d.progress.stop()

View File

@ -30,7 +30,7 @@ def upload_bvid(bvid):
pid = identifier.split('_')[-1][1:]
file_basename = identifier[len(identifier_perfix)+1:]
print(f'开始上传 {identifier}')
print(f'==== 开始上传 {identifier} ====')
item = get_item(identifier)
if item.exists:
print(f'item {identifier} 已存在(item.exists)')
@ -96,8 +96,7 @@ def upload_bvid(bvid):
), # Keywords should be separated by ; but it doesn't matter much; the alternative is to set one per field with subject[0], subject[1], ...
"upload-state": "uploading",
'originalurl': f'https://www.bilibili.com/video/{bvid}/?p={pid}',
'project': 'bilibili top100 daily archive',
'scanner': 'biliup v2233.0.2 (dev)',
'scanner': 'biliup v2233.0.3 (dev)',
}
print(filedict)
print(md)
@ -136,7 +135,7 @@ def upload_bvid(bvid):
r.raise_for_status()
with open(f'{videos_basepath}/{identifier}/_uploaded.mark', 'w', encoding='utf-8') as f:
f.write('')
print(f'{identifier} 上传完成')
print(f'==== {identifier} 上传完成 ====')
def read_ia_keys(keysfile):
''' Return: tuple(`access_key`, `secret_key`) '''

View File

@ -15,27 +15,28 @@ install()
def parse_args():
parser = argparse.ArgumentParser()
today = datetime.date.today()
parser.add_argument('--sess-data', type=str, default=get_sess_data())
parser.add_argument('--bvids', type=str, default=f'bvids/bvids-{today.isoformat()}.txt')
parser.add_argument('--sess-data', type=str, default=get_sess_data(),
help='cookie SESSDATA。不指定则会从 ~/.sess_data.txt 读取,指定则直接使用提供的字符串')
parser.add_argument('--bvids', type=str, help='bvids 列表的文件路径')
args = parser.parse_args()
return args
def main():
args = parse_args()
print(args.sess_data)
assert args.bvids is not None, '必须指定 bvids 列表的文件路径'
with open(args.bvids, 'r', encoding='utf-8') as f:
bvids = f.read().splitlines()
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
from tasks_limit import tasks_limit
from config import tasks_limit
d = DownloaderBilibili(video_concurrency=tasks_limit, part_concurrency=1, hierarchy=True, sess_data=args.sess_data,
)
d.progress.start()
for bvid in bvids:
# 限制同时下载的数量
while len(asyncio.all_tasks(loop)) > tasks_limit:
loop.run_until_complete(asyncio.sleep(0.01))
task = loop.create_task(archive_bvid(d, bvid))

View File

@ -0,0 +1,43 @@
import os
import time
import requests
import json
import argparse
def arg_parse():
    """Parse command-line options for the ranking fetcher.

    Returns an argparse.Namespace with a single attribute:
        rid (int): target ranking id; 0 selects the site-wide ranking board.
    """
    argument_parser = argparse.ArgumentParser()
    argument_parser.add_argument(
        '--rid',
        type=int,
        default=0,
        help='目标排行 rid,0 为全站排行榜 [default: 0]',
    )
    return argument_parser.parse_args()
def main():
    """Fetch the BiliBili ranking list for the requested rid and save its bvids.

    Queries the web ranking API, validates the JSON status code, then writes
    one bvid per line to bvids/rank/by-rid/rid-<rid>-<unix_time>.txt.

    Raises:
        requests.HTTPError: if the API responds with an HTTP error status.
        AssertionError: if the API body reports a non-zero code.
    """
    args = arg_parse()
    rid: int = args.rid
    bilibili_ranking_api = "https://api.bilibili.com/x/web-interface/ranking/v2"
    bilibili_ranking_params = {
        "rid": rid,
        "type": "all"
    }
    r = requests.get(bilibili_ranking_api, params=bilibili_ranking_params)
    r.raise_for_status()
    ranking_json = json.loads(r.text)
    # code 0 means success; the HTTP 200 status alone cannot be trusted.
    assert ranking_json['code'] == 0
    ranking = ranking_json['data']['list']
    bvids = [video_info['bvid'] for video_info in ranking]

    # BUG FIX: the output lives under bvids/rank/by-rid/, but previously only
    # 'bvids' was created, so open() raised FileNotFoundError on a fresh run.
    output_dir = 'bvids/rank/by-rid'
    os.makedirs(output_dir, exist_ok=True)
    output_path = f'{output_dir}/rid-{rid}-{int(time.time())}.txt'
    with open(output_path, 'w', encoding='utf-8') as f:
        for bvid in bvids:
            f.write(bvid + '\n')
    # BUG FIX: the old message claimed bvids/bvids-<date>.txt, which is not
    # where this script writes; report the real path instead.
    print(f'已保存 {len(bvids)} 个 bvid 到 {output_path}')

View File

@ -1,70 +0,0 @@
"""One-shot script: everything below runs at import time (no main() guard).

Fetches the site-wide BiliBili ranking (rid=0) and writes the listed bvids,
one per line, to bvids/bvids-<today>.txt.
"""
import asyncio
import os
import sys
import requests
import json
from bilix.sites.bilibili import DownloaderBilibili
from internetarchive import get_item

# Site-wide ranking board: rid=0 with type "all".
bilibili_ranking_api = "https://api.bilibili.com/x/web-interface/ranking/v2"
bilibili_ranking_params = {
    "rid": 0,
    "type": "all"
}

# NOTE(review): module-level network call, and no raise_for_status() — an HTTP
# error only surfaces indirectly via the JSON assert below.
r = requests.get(bilibili_ranking_api, params=bilibili_ranking_params)
ranking_json = json.loads(r.text)
assert ranking_json['code'] == 0  # code 0 means success in the API body

ranking = ranking_json['data']['list']
bvids = []
for video_info in ranking:
    # print(video_info['title'], video_info['bvid'], video_info['pic'])
    bvid = video_info['bvid']
    bvids.append(bvid)

import datetime
today = datetime.date.today()

# Write one bvid per line; the parent directory is created if missing.
os.makedirs('bvids', exist_ok=True)
with open(f'bvids/bvids-{today.isoformat()}.txt', 'w', encoding='utf-8') as f:
    for bvid in bvids:
        f.write(f'{bvid}' + '\n')
# print(bvid)
# assert isinstance(bvid, str)
# v = video.Video(bvid=bvid)
# video_info = sync(v.get_info())
# with open(f'bili/video/{bvid}/video-{bvid}.info.json', 'w', encoding='utf-8') as f:
# json.dump(video_info, f, ensure_ascii=False, indent=4)
# # with open('ranking.json', 'w', encoding='utf-8') as f:
# # json.dump(ranking_json, f, ensure_ascii=False, indent=4)
# async def main():
# d = DownloaderBilibili(video_concurrency=5, part_concurrency=10, hierarchy=False,
# sess_data=sess_data)
# d.progress.start()
# # cor1 = d.get_series(
# # 'https://www.bilibili.com/bangumi/play/ss28277'
# # , quality=0)
# # cor2 = d.get_up(url_or_mid='436482484', quality=0)
# os.makedirs(f'bili/video/{bvid}', exist_ok=True)
# cor3 = d.get_series(url=f'https://www.bilibili.com/video/{bvid}',
# dm=True, quality=0, image=True, subtitle=True, path=f'bili/video/{bvid}')
# await asyncio.gather(cor3)
# await d.aclose()
# if __name__ == '__main__':
# # asyncio.run(main())
# pass