2024-04-18 11:24:39 -07:00
|
|
|
|
import copy
|
|
|
|
|
from dataclasses import dataclass
|
|
|
|
|
import re
|
|
|
|
|
import json
|
|
|
|
|
import time
|
2024-05-25 09:36:51 -07:00
|
|
|
|
from typing import Any, Dict, Optional, Union
|
2024-04-18 11:24:39 -07:00
|
|
|
|
|
|
|
|
|
import requests
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class ProjectMeta:
    """Descriptive metadata of a tracker project.

    All four fields are plain strings; this class performs no validation
    or parsing of them.
    """

    # Project identifier string.
    identifier: str
    # Project slug string.
    slug: str
    # Project icon string.
    icon: str
    # Project deadline, kept as an unparsed string.
    deadline: str
|
|
|
|
|
@dataclass
class ProjectStatus:
    """Boolean status flags of a tracker project."""

    # Value of the project's "public" flag.
    public: bool
    # Value of the project's "paused" flag.
    paused: bool
|
|
|
|
|
@dataclass
class ProjectClient:
    """Client-side settings that a tracker project publishes."""

    # Client version string the project expects.
    version: str
    # How long a client has to wait after a claim_task before it may
    # claim_task again (soft limit).
    claim_task_delay: float
|
|
|
|
|
@dataclass
class ProjectMongodb:
    """MongoDB naming settings of a tracker project."""

    # Database name.
    db_name: str
    # Name of the item collection.
    item_collection: str
    # Name of the queue collection.
    queue_collection: str
    # TODO: to be deprecated in the future.
    custom_doc_id_name: str
|
|
|
|
|
@dataclass
class Project:
    """A tracker project, assembled from four plain-dict sections.

    The hand-written ``__init__`` takes one dict per section and expands it
    into the matching dataclass, so a decoded JSON object can be passed as
    ``Project(**obj)``.
    """

    meta: ProjectMeta
    status: ProjectStatus
    client: ProjectClient
    mongodb: ProjectMongodb

    def __init__(self, meta: dict, status: dict, client: dict, mongodb: dict):
        """Build each section dataclass from its dict of keyword arguments.

        Raises:
            TypeError: if a dict's keys do not match the fields of the
                corresponding section dataclass.
        """
        # Sections are built in declaration order: meta, status, client, mongodb.
        sections = (
            ("meta", ProjectMeta, meta),
            ("status", ProjectStatus, status),
            ("client", ProjectClient, client),
            ("mongodb", ProjectMongodb, mongodb),
        )
        for attr_name, section_cls, raw in sections:
            setattr(self, attr_name, section_cls(**raw))
|
|
|
|
|
|
|
|
|
|
|
2024-05-25 09:36:51 -07:00
|
|
|
|
# Candidate tracker base URLs. Tracker.select_best_tracker() pings every entry
# and keeps the one that answers fastest. Each URL ends with "/" because
# request paths are appended to it by plain string concatenation.
TRACKER_NODES = [
    "https://0.tracker.saveweb.org/",
    "https://1.tracker.saveweb.org/",
    "https://2.tracker.saveweb.org/",
    # NOTE(review): plain http, unlike the three nodes above — confirm this is intentional.
    "http://3.tracker.saveweb.org/",
]

# Localhost tracker URL; its first entry is the initial value of Tracker.API_BASE.
TEST_TRACKER_NODES = [
    "http://localhost:8080/",
]
|
2024-04-18 11:24:39 -07:00
|
|
|
|
|
|
|
|
|
class Tracker:
    """HTTP client for the tracker API, bound to one project and one archivist.

    Constructing a Tracker pings every node in ``TRACKER_NODES``, selects the
    fastest one as ``API_BASE``, fetches the project and checks that the
    project's expected client version equals ``client_version``.

    All requests go through the ``requests.Session`` supplied by the caller.
    NOTE(review): apart from the ping requests, no request sets a timeout, so
    a stalled tracker can block the caller indefinitely.
    """

    # Overwritten per instance by select_best_tracker().
    API_BASE: str = TEST_TRACKER_NODES[0]
    API_VERSION = "v1"
    client_version: str
    # Must not contain any character matching [^a-zA-Z0-9_\-].
    project_id: str
    # Must not contain any character matching [^a-zA-Z0-9_\-].
    archivist: str
    session: requests.Session
    # Cached project and the time.time() at which it was fetched.
    __project: Project
    __project_last_fetched: float = 0
    # time.time() of the most recent claim_task request.
    __last_task_claimed_at: float = 0

    # Matches any character that is NOT an ASCII letter, digit, "_" or "-".
    # The original pattern was r"[^a-zA-Z0-9_\\-]"; in a raw string that puts a
    # literal backslash into the allowed set, so backslashes were accepted.
    _UNSAFE_CHARS = re.compile(r"[^a-zA-Z0-9_\-]")

    def _is_safe_string(self, s: str):
        """Return True when ``s`` consists only of ``a-z A-Z 0-9 _ -``."""
        return self._UNSAFE_CHARS.search(s) is None

    def __init__(self, project_id: str, client_version: str, archivist: str, session: requests.Session):
        """Validate the identifiers, pick a tracker node and verify the client version.

        Raises:
            AssertionError: if ``project_id`` or ``archivist`` contains an
                unsafe character, or if the project's client version differs
                from ``client_version``.
        """
        # Both values are interpolated into URL paths, hence the restriction.
        assert self._is_safe_string(project_id) and self._is_safe_string(archivist), "[^a-zA-Z0-9_\\-]"
        self.project_id = project_id
        self.client_version = client_version
        self.archivist = archivist
        self.session = session

        self.select_best_tracker()

        assert self.project.client.version == self.client_version, "client_version mismatch, please upgrade your client."

        print(f"[tracker] Hello, {self.archivist}!")
        print(f"[tracker] Project: {self.project}")

    @property
    def project(self):
        """Return a deep copy of the project, refetching it when the cache is stale.

        The cached project is reused while it is at most 60 seconds old;
        otherwise fetch_project() is called first.
        """
        if time.time() - self.__project_last_fetched > 60:
            self.__project = self.fetch_project()
            self.__project_last_fetched = time.time()
        return copy.deepcopy(self.__project)

    def select_best_tracker(self):
        """Ping every node in TRACKER_NODES and store the fastest one in ``self.API_BASE``.

        A node whose ping raises or returns an error status gets an infinite
        latency. If every node fails, the first node is still selected and a
        warning is printed.
        """
        print("[client->trackers] select_best_tracker...")
        for node in TRACKER_NODES:
            try:
                self.session.get(node + 'ping', timeout=0.05)  # DNS preload, dirty hack
            except Exception:
                # Deliberately ignored: this request only warms up DNS, and
                # the 50 ms timeout is expected to expire most of the time.
                pass

        result = []  # [(node, ping)]
        for node in TRACKER_NODES:
            print(f"[client->tracker({node})] ping...")
            # perf_counter() is monotonic, so the measured latency cannot be
            # distorted by wall-clock adjustments.
            start = time.perf_counter()
            try:
                r = self.session.get(node + 'ping', timeout=5)
                r.raise_for_status()
                elapsed = time.perf_counter() - start
                print(f"[client<-tracker({node})] ping OK. {elapsed:.2f}s")
                result.append((node, elapsed))
            except Exception as e:
                print(f"[client->tracker({node}) ping failed. {type(e)}")
                result.append((node, float('inf')))

        # min() returns the first of equally fast nodes, exactly like a stable
        # sort followed by taking element 0.
        best_node, best_latency = min(result, key=lambda pair: pair[1])
        if best_latency == float('inf'):
            print("[client->trackers] WARNING: no tracker answered ping; falling back to the first node.")
        self.API_BASE = best_node
        print("===============================")
        print(f"tracker selected: {self.API_BASE}")
        print("===============================")

    def fetch_project(self):
        """Fetch the project from the tracker and refresh the cache.

        Stores a deep copy of the fetched project together with the fetch
        time, and returns the freshly built Project.

        Raises:
            requests.HTTPError: if the tracker answers with an error status.
        """
        assert isinstance(self.session, (requests.Session))
        print("[client->tracker] fetch_project...")
        r = self.session.post(self.API_BASE + self.API_VERSION + '/project/' + self.project_id)
        r.raise_for_status()
        proj = Project(**r.json())
        self.__project = copy.deepcopy(proj)
        self.__project_last_fetched = time.time()
        print("[client<-tracker] project fetched.")
        return proj

    def get_projects(self):
        """Return every project the tracker lists, as a list of Project objects.

        Raises:
            requests.HTTPError: if the tracker answers with an error status.
        """
        assert isinstance(self.session, (requests.Session))
        print("[client->tracker] get_projects")
        r = self.session.post(self.API_BASE + self.API_VERSION + '/projects')
        r.raise_for_status()
        print("[client<-tracker] get_projects. OK")
        return [Project(**p) for p in r.json()]

    def claim_task(self, with_delay: bool = True):
        """Claim one task for this archivist.

        with_delay: when true, sleep first so that at least the project's
            ``claim_task_delay`` seconds separate two consecutive claims.

        Returns the decoded JSON body on HTTP 200, or None on HTTP 404
        (no tasks available).

        Raises:
            Exception: ``(status_code, text)`` for any other status code.
        """
        assert isinstance(self.session, (requests.Session))
        if with_delay:
            # Read the delay once: every access to self.project deep-copies
            # the cached project and may trigger a refetch.
            delay = self.project.client.claim_task_delay
            sleep_need = delay - (time.time() - self.__last_task_claimed_at)
            if sleep_need > 0:
                print(f"[tracker] slow down you {sleep_need:.2f}s, Qos: {delay}")
                time.sleep(sleep_need)
            elif sleep_need < 0:
                print(f"[tracker] you are {sleep_need:.2f}s late, Qos: {delay}")

        print("[client->tracker] claim_task")
        start = time.time()
        r = self.session.post(f'{self.API_BASE}{self.API_VERSION}/project/{self.project_id}/{self.client_version}/{self.archivist}/claim_task')
        self.__last_task_claimed_at = time.time()
        if r.status_code == 404:
            print(f'[client<-tracker] claim_task. (time cost: {time.time() - start:.2f}s):', r.text)
            return None  # No tasks available
        if r.status_code == 200:
            r_json = r.json()
            print(f'[client<-tracker] claim_task. OK (time cost: {time.time() - start:.2f}s):', r_json)
            return r_json
        raise Exception(r.status_code, r.text)

    def update_task(self, task_id: Union[str, int], status: str)->Dict[str, Any]:
        """Set the status of a task.

        task_id: must be passed deliberately as either an int or a str; the
            Python type is reported to the tracker as ``task_id_type``.
        status: the task status.

        Returns the decoded JSON body on HTTP 200.

        Raises:
            Exception: carrying the response text for any other status code.
        """
        assert isinstance(self.session, (requests.Session))
        assert isinstance(task_id, (str, int))
        assert isinstance(status, str)

        print(f"[client->tracker] update_task task({task_id}) to status({status})")
        r = self.session.post(
            f'{self.API_BASE}{self.API_VERSION}/project/{self.project_id}/{self.client_version}/{self.archivist}/update_task/{task_id}',
            data={
                'status': status,
                'task_id_type': 'int' if isinstance(task_id, int) else 'str'
            })
        if r.status_code == 200:
            r_json = r.json()
            print(f'[client<-tracker] update_task task({task_id}). OK:', r_json)
            return r_json
        raise Exception(r.text)

    def insert_item(self, item_id: Union[str, int], item_status: Optional[Union[str, int]] = None, payload: Optional[dict] = None)->Dict[str, Any]:
        """Insert an item into the project.

        item_id: must be passed deliberately as either an int or a str; the
            Python type is reported to the tracker as ``item_id_type``.
        item_status: status of the item, not of the task. Can be used to mark
            items that were deleted, hidden and so on, so that error
            responses do not have to be stuffed into the payload.
        payload: any object that can be converted to ext-json (including
            None); it is serialized here with ``json.dumps``.

        Returns the decoded JSON body on HTTP 200.

        Raises:
            ValueError: if ``item_id`` or ``item_status`` has an unsupported type.
            Exception: carrying the response text for any non-200 status code.
        """
        assert isinstance(self.session, (requests.Session))
        # item_id_type
        if isinstance(item_id, int):
            item_id_type = 'int'
        elif isinstance(item_id, str):
            item_id_type = 'str'
        else:
            raise ValueError("item_id must be int or str")

        if item_status is None:
            status_type = "None"
        elif isinstance(item_status, int):
            status_type = "int"
        elif isinstance(item_status, str):
            status_type = "str"
        else:
            raise ValueError("status must be int, str or None")

        # Compact separators and raw non-ASCII keep the form field small.
        payload_json_str = json.dumps(payload, ensure_ascii=False, separators=(',', ':'))
        print(f"[client->tracker] insert_item item({item_id}), len(payload)={len(payload_json_str)}")
        r = self.session.post(
            f'{self.API_BASE}{self.API_VERSION}/project/{self.project_id}/{self.client_version}/{self.archivist}/insert_item/{item_id}',
            data={
                'item_id_type': item_id_type,
                'item_status': item_status,
                'item_status_type': status_type,
                'payload': payload_json_str
            })
        if r.status_code == 200:
            r_json = r.json()
            print(f'[client<-tracker] insert_item item({item_id}). OK:', r_json)
            return r_json
        raise Exception(r.text)
|