2023-06-01 07:00:37 -07:00
|
|
|
import json
|
|
|
|
import os
|
|
|
|
import time
|
|
|
|
from internetarchive import get_item
|
2023-06-01 20:55:42 -07:00
|
|
|
from rich import print
|
2023-06-02 12:09:57 -07:00
|
|
|
|
2023-06-02 13:32:11 -07:00
|
|
|
from _biliarchiver_archive_bvid import BILIBILI_IDENTIFIER_PERFIX
|
2023-06-02 12:09:57 -07:00
|
|
|
|
|
|
|
|
2023-06-01 07:00:37 -07:00
|
|
|
def upload_bvid(bvid):
    """Upload every downloaded part (one IA item per part) of BiliBili video `bvid`.

    Expects the current working directory to contain a `biliarchiver.home`
    marker file and the downloaded data under `biliarchiver/videos/{bvid}/`.

    Side effects: creates/updates Internet Archive items and writes an
    `_uploaded.mark` file into each part's directory once it is uploaded.

    Raises:
        Exception: if the `biliarchiver.home` marker is missing, or if the
            part number cannot be found in the video's info.json.
    """
    # Safety check: refuse to run outside the intended working directory.
    if not os.path.exists('biliarchiver.home'):
        raise Exception('先创建 biliarchiver.home 文件')

    access_key, secret_key = read_ia_keys(os.path.expanduser('~/.bili_ia_keys.txt'))

    # sample: BiliBili-BV1Zh4y1x7RL_p3
    videos_basepath = f'biliarchiver/videos/{bvid}'
    for identifier in os.listdir(videos_basepath):
        # Skip parts already uploaded (local marker).
        if os.path.exists(f'{videos_basepath}/{identifier}/_uploaded.mark'):
            print(f'{identifier} 已经上传过了(_uploaded.mark)')
            continue
        # Skip private/bookkeeping directories.
        if identifier.startswith('_'):
            print(f'跳过 {identifier}')
            continue
        if not identifier.startswith(BILIBILI_IDENTIFIER_PERFIX):
            print(f'{identifier} 不是以 {BILIBILI_IDENTIFIER_PERFIX} 开头的正确 identifier')
            continue
        # Only upload parts whose download finished.
        if not os.path.exists(f'{videos_basepath}/{identifier}/_downloaded.mark'):
            print(f'{identifier} 没有下载完成')
            continue

        # identifier looks like "<prefix>-BV..._p<pid>": pid is the part number.
        pid = identifier.split('_')[-1][1:]
        file_basename = identifier[len(BILIBILI_IDENTIFIER_PERFIX)+1:]

        print(f'==== 开始上传 {identifier} ====')
        item = get_item(identifier)
        if item.exists:
            print(f'item {identifier} 已存在(item.exists)')
            # Remote says uploaded: record the local marker and move on.
            if item.metadata.get("upload-state") == "uploaded":
                print(f'{identifier} 已经上传过了,跳过(item.metadata.uploaded)')
                with open(f'{videos_basepath}/{identifier}/_uploaded.mark', 'w', encoding='utf-8') as f:
                    f.write('')
                continue

        filedict = {}  # "remote filename": "local filename"
        for filename in os.listdir(f'{videos_basepath}/{identifier}/extra'):
            # BUGFIX: path was built with a literal placeholder instead of {filename}.
            file = f'{videos_basepath}/{identifier}/extra/{filename}'
            if os.path.isfile(file):
                # BUGFIX: test the basename, not the whole path — the full path
                # always starts with 'biliarchiver/...' so the old check never fired.
                if os.path.basename(file).startswith('_'):
                    continue
                filedict[filename] = file

        for filename in os.listdir(f'{videos_basepath}/{identifier}'):
            # BUGFIX: path was built with a literal placeholder instead of {filename}.
            file = f'{videos_basepath}/{identifier}/{filename}'
            if os.path.isfile(file):
                if os.path.basename(file).startswith('_'):
                    continue
                # (removed a redundant re-check of os.path.isfile here)
                filedict[filename] = file

        # IA dedupe: drop files that already exist in the remote item.
        for file_in_item in item.files:
            if file_in_item["name"] in filedict:
                filedict.pop(file_in_item["name"])
                print(f"File {file_in_item['name']} already exists in {identifier}.")

        with open(f'{videos_basepath}/{identifier}/extra/{file_basename}.info.json', 'r', encoding='utf-8') as f:
            bv_info = json.load(f)

        tags = ['BiliBili', 'video']
        for tag in bv_info['data']['Tags']:
            tags.append(tag['tag_name'])

        pubdate = bv_info['data']['View']['pubdate']
        cid = None
        part = None
        for page in bv_info['data']['View']['pages']:
            if page['page'] == int(pid):
                cid = page['cid']
                part = page['part']
                break
        # Robustness: previously cid/part stayed unbound (NameError below) when
        # the part number was absent from info.json — fail with a clear message.
        if cid is None:
            raise Exception(f'{identifier}: P{pid} not found in info.json pages')

        md = {
            "mediatype": "movies",
            "collection": 'opensource_movies',
            "title": bv_info['data']['View']['title'] + f' P{pid} ' + part ,
            "description": identifier + ' uploading...',
            'creator': bv_info['data']['View']['owner']['name'],  # uploader ("UP 主")
            # UTC time
            'date': time.strftime("%Y-%m-%d", time.gmtime(pubdate)),
            'year': time.strftime("%Y", time.gmtime(pubdate)),
            'aid': bv_info['data']['View']['aid'],
            'bvid': bvid,
            'cid': cid,
            'mid': bv_info['data']['View']['owner']['mid'],
            "subject": "; ".join(
                tags
            ),  # Keywords should be separated by ; but it doesn't matter much; the alternative is to set one per field with subject[0], subject[1], ...
            "upload-state": "uploading",
            'originalurl': f'https://www.bilibili.com/video/{bvid}/?p={pid}',
            'scanner': 'biliarchiver v0.0.5 (dev)',
        }
        print(filedict)
        print(md)

        # Upload only if something remains after dedupe.
        if filedict:
            r = item.upload(
                files=filedict,
                metadata=md,
                access_key=access_key,
                secret_key=secret_key,
                verbose=True,
                queue_derive=True,
                retries=5,
            )

        # Poll until IA has actually created the item (up to 30 * 30s).
        tries = 30
        item = get_item(identifier)  # refresh item
        while not item.exists and tries > 0:
            print(f"Waiting for item to be created ({tries}) ...", end='\r')
            time.sleep(30)
            item = get_item(identifier)
            tries -= 1

        # Finalize remote metadata: mark uploaded, set the real description,
        # and refresh the scanner version if it changed.
        new_md = {}
        if item.metadata.get("upload-state") != "uploaded":
            new_md.update({"upload-state": "uploaded"})
        if item.metadata.get("description") != bv_info['data']['View']['desc']:
            new_md.update({"description": bv_info['data']['View']['desc']})
        if item.metadata.get("scanner") != md['scanner']:
            new_md.update({"scanner": md['scanner']})
        if new_md:
            print("Updating metadata:")
            print(new_md)
            r = item.modify_metadata(
                metadata=new_md,
                access_key=access_key,
                secret_key=secret_key,
            )
            r.raise_for_status()
        with open(f'{videos_basepath}/{identifier}/_uploaded.mark', 'w', encoding='utf-8') as f:
            f.write('')
        print(f'==== {identifier} 上传完成 ====')
|
2023-06-01 07:00:37 -07:00
|
|
|
|
|
|
|
def read_ia_keys(keysfile):
    """Load Internet Archive S3 credentials from *keysfile*.

    The file's first line holds the access key and its second line the
    secret key; surrounding whitespace is stripped.

    Return: tuple(`access_key`, `secret_key`)
    """
    with open(keysfile, 'r', encoding='utf-8') as fp:
        lines = fp.readlines()
    return lines[0].strip(), lines[1].strip()
|