commit c943b2023bc9c65be37e79c57effe1dbe59b33d7
Author: yzqzss
Date:   Sun Feb 18 23:33:21 2024 +0800

    public

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f4d214c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+.git
+.venv
+__pycache__
+search.log
+.vscode
+deploy.sh
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..6ae41a5
--- /dev/null
+++ b/README.md
@@ -0,0 +1,30 @@
+# saveweb-search-backend
+
+## Installation
+
+```bash
+pip install -r requirements.txt
+```
+
+## Set up environment variables
+
+```bash
+MEILI_KEY # MeiliSearch API key.
+# default: '' (empty string)
+MEILI_HOST # MeiliSearch host.
+# default: "http://127.0.0.1:7700"
+STWP_SEARCH_MAX_LOAD # If the system load is higher than this, the API returns 503.
+# default: cpu_count / 1.5
+STWP_SEARCH_MAX_FLYING_OPS # If the number of in-flight requests is higher than this, the API returns 503.
+# default: $STWP_SEARCH_MAX_LOAD * 2 (minimum: 1)
+STWP_SEARCH_CORS # CORS Allow-Origin header values, separated by `,`
+# default: *
+```
+
+## Run
+
+```bash
+python saveweb-search-backend.py
+# or
+hypercorn --bind '[::]:8077' saveweb-search-backend:app # to customize the bind address
+```
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..e751fae
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+fastapi
+hypercorn
+meilisearch-python-sdk
diff --git a/saveweb-search-backend.py b/saveweb-search-backend.py
new file mode 100644
index 0000000..74f6fba
--- /dev/null
+++ b/saveweb-search-backend.py
@@ -0,0 +1,221 @@
+from functools import wraps
+import asyncio
+import os
+import time
+
+from fastapi import FastAPI, Response
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import HTMLResponse, JSONResponse
+
+import meilisearch_python_sdk
+import meilisearch_python_sdk.errors
+
+
+MEILI_KEY = os.getenv('MEILI_KEY', '')
+print('$MEILI_KEY', 'set' if MEILI_KEY else 'not set')
+MEILI_HOST = os.getenv('MEILI_HOST', 'http://127.0.0.1:7700')
+print('$MEILI_HOST', MEILI_HOST)
+STWP_SEARCH_MAX_LOAD = float(os.getenv('STWP_SEARCH_MAX_LOAD')) if os.getenv('STWP_SEARCH_MAX_LOAD') else (
+    os.cpu_count() / 1.5 if os.cpu_count() else 1.5
+)
+print('$STWP_SEARCH_MAX_LOAD', STWP_SEARCH_MAX_LOAD)
+STWP_SEARCH_MAX_FLYING_OPS = int(os.getenv('STWP_SEARCH_MAX_FLYING_OPS')) if os.getenv('STWP_SEARCH_MAX_FLYING_OPS') else (
+    int(STWP_SEARCH_MAX_LOAD * 2)
+)
+STWP_SEARCH_MAX_FLYING_OPS = STWP_SEARCH_MAX_FLYING_OPS if STWP_SEARCH_MAX_FLYING_OPS >= 1 else 1
+print('$STWP_SEARCH_MAX_FLYING_OPS', STWP_SEARCH_MAX_FLYING_OPS)
+STWP_SEARCH_CORS = os.getenv('STWP_SEARCH_CORS', ','.join([
+    # 'https://search.saveweb.org',
+    '*'
+]))
+print('$STWP_SEARCH_CORS', STWP_SEARCH_CORS)
+
+app = FastAPI()
+# set CORS
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=STWP_SEARCH_CORS.split(','),
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+index_name = "entry"
+
+
+async def get_load():
+    with open('/proc/loadavg', 'r') as f:
+        load = f.read().split()[0]
+    return float(load)
+
+def load_limiter(func):
+    @wraps(func)
+    async def wrapper(*args, **kwargs):
+        if await get_load() > STWP_SEARCH_MAX_LOAD:
+            print('[INFO] 荷载过高')
+            return JSONResponse({
+                'hits': [
+                    {
+                        'title': '丑搜当前荷载过高,请稍后再试',
+                        'content': '服务器荷载过高,请稍后再试。原因:1. 数据库正在更新全文索引 2. 服务器没有摸鱼,在干其它重荷载的任务',
+                        'author': ';丑搜',
+                        'date': int(time.time()),
+                        'link': '#',
+                    },
+                ],
+                'error': '丑搜当前荷载过高,请稍后再试',
+            }, status_code=503, headers={'Retry-After': '30'})
+        return await func(*args, **kwargs)
+    return wrapper

+flying_ops = 0
+def ops_limiter(func):
+    @wraps(func)
+    async def wrapper(*args, **kwargs):
+        global flying_ops
+        if flying_ops >= STWP_SEARCH_MAX_FLYING_OPS:
+            print('[INFO] 操作过多')
+            return JSONResponse({
+                'hits': [
+                    {
+                        'title': '飞行中的搜索过多,请稍后再试',
+                        'content': '同一时间内的搜索请求过多。请稍后再试。',
+                        'author': ';丑搜',
+                        'date': int(time.time()),
+                        'link': '#',
+                    },
+                ],
+                'error': '操作过多,请稍后再试',
+            }, status_code=503, headers={'Retry-After': '30'})
+        flying_ops += 1
+        try:
+            return await func(*args, **kwargs)
+        finally:
+            flying_ops -= 1
+    return wrapper
+
+
+client = meilisearch_python_sdk.AsyncClient(MEILI_HOST, MEILI_KEY)
+
+
+@app.get('/api/')
+async def go_back_home():
+    # redirect to /
+    return Response(status_code=302, headers={'Location': '/'})
+
+@app.get('/api/entry/{entry_id}')
+@load_limiter
+@ops_limiter
+async def article(entry_id: int):
+    results = {}
+    results['data'] = await client.index(index_name).get_document(entry_id)
+    results['humans.txt'] = 'is_favorite 目前与主数据库不同步'
+
+    return results
+
+@app.get('/api/stats')
+@app.head('/api/stats')
+@load_limiter
+@ops_limiter
+async def stats():
+    stats = await client.index(index_name).get_stats()
+    return stats
+
+
+
+@app.get('/api/search')
+@load_limiter
+@ops_limiter
+async def search(q: str = 'saveweb', p: int = 0, f: str = 'false', h: str = 'false'):
+    query = q  # search terms
+    page = p  # 0-based
+    fulltext = f  # return the full content (the search always runs against the full text; this only limits how much is returned)
+    highlight = h  # whether to highlight matches
+
+    print(query, page, 'fulltext:', fulltext, 'highlight:', highlight)
+    with open('search.log', 'a') as fio:
+        fio.write(query + '\t' + str(page) + '\n')
+
+    # empty query, return an error response
+    if not query:
+        return JSONResponse({
+            'error': '搜索词为空',
+        }, status_code=400)
+
+    opt_params = {
+        'limit': 10,
+        'offset': 10 * page,
+        'attributes_to_retrieve': ['id', 'id_feed', 'title', 'content', 'link', 'date', 'tags', 'author', 'lastSeen'],
+    }
+
+    if fulltext != 'true':
+        opt_params['attributes_to_crop'] = ['content']
+        opt_params['crop_length'] = 120
+
+    if highlight == 'true':
+        opt_params['attributes_to_highlight'] = ['title', 'content', 'date', 'tags', 'author']
+        opt_params['highlight_pre_tag'] = ''
+        opt_params['highlight_post_tag'] = ''
+
+    # first search
+    try:
+        _results = await client.index(index_name).search(query, **opt_params)
+    except meilisearch_python_sdk.errors.MeilisearchError as e:
+        print('数据库错误', e)
+        return {
+            'hits': [
+                {
+                    'title': '数据库错误',
+                    'content': '查询数据库时出错。如果一直出现这个错误,说明数据库寄了,请反馈。',
+                    'author': ';丑搜',
+                    'date': int(time.time()),
+                    'link': '#',
+                },
+            ],
+            'error': '数据库错误',
+        }
+
+    lengths: dict[str, int] = {}
+    if fulltext != 'true':  # search again to get the full content length
+        opt_params = {
+            'limit': 10,
+            'offset': 10 * page,
+            'attributes_to_retrieve': ['id', 'id_feed', 'title', 'content', 'link', 'date', 'tags', 'author', 'lastSeen'],
+        }
+        _results2 = await client.index(index_name).search(query, **opt_params)
+        for hit in _results2.hits:
+            lengths.update({str(hit['id']): len(hit['content'])})
+
+    # replace the hit with _formatted
+    for hit in _results.hits:
+        if fulltext != 'true':
+            assert lengths != {}
+            if str(hit['id']) in lengths:
+                hit['content_length'] = lengths[str(hit['id'])]
+            else:
+                hit['content_length'] = len(hit['content'])
+        if '_formatted' in hit:
+            hit.update(hit['_formatted'])
+            del hit['_formatted']
+
+    results = {
+        'hits': _results.hits,
+        'estimatedTotalHits': _results.estimated_total_hits,  # TODO: rename the estimatedTotalHits key to estimated_total_hits
+        'humans.txt': '使用 API 时请检查 error 字段,高荷载/出错时会返回它。is_favorite 字段目前与主数据库不同步,只有在全库重新索引时才会更新。',
+    }
+
+    return results
+
+@app.route('/')
+async def root(request):
+    return HTMLResponse(open('templates/index.html', 'r').read())  # there is only one page anyway
+
+async def main():
+    import hypercorn.asyncio
+    config = hypercorn.Config()
+    config.bind = ['[::]:8077']
+    await hypercorn.asyncio.serve(app, config)
+
+if __name__ == '__main__':
+    # hypercorn --bind '[::]:8077' saveweb-search-backend:app
+    asyncio.run(main())
\ No newline at end of file
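
For reference, the search endpoint added above can be exercised with nothing but the standard library. The sketch below is illustrative and not part of the commit: it assumes the backend is reachable at http://localhost:8077 (the default bind is '[::]:8077'), and the helper name `search_saveweb` is made up; the query parameters (`q`, `p`, `f`, `h`) and the `error`/`estimatedTotalHits` response fields come from the handler above.

```python
# Illustrative client for /api/search (assumed reachable at localhost:8077).
import json
import urllib.error
import urllib.parse
import urllib.request

BASE = 'http://localhost:8077'  # the backend's default bind is '[::]:8077'

def search_saveweb(query: str, page: int = 0, fulltext: bool = False, highlight: bool = False) -> dict:
    # q = query, p = 0-based page, f/h = 'true'/'false' flags, as defined by the handler
    params = urllib.parse.urlencode({
        'q': query,
        'p': page,
        'f': 'true' if fulltext else 'false',
        'h': 'true' if highlight else 'false',
    })
    try:
        with urllib.request.urlopen(f'{BASE}/api/search?{params}') as resp:
            data = json.load(resp)
    except urllib.error.HTTPError as e:
        # The load/ops limiters answer 503 with a Retry-After header and an 'error' field.
        raise RuntimeError(f'backend rejected the request (HTTP {e.code}), '
                           f'Retry-After={e.headers.get("Retry-After")}') from e
    # Per humans.txt: always check the 'error' field; database errors come back with HTTP 200.
    if data.get('error'):
        raise RuntimeError(data['error'])
    return data

if __name__ == '__main__':
    results = search_saveweb('saveweb')
    print(results['estimatedTotalHits'], 'estimated hits')
    for hit in results['hits']:
        print(hit['title'], hit['link'])
```

The 400 for an empty query and the 503 from the limiters both surface through the HTTPError branch; a real client would honor Retry-After and retry instead of raising.
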
diff --git a/templates/index.html b/templates/index.html
new file mode 100644
index 0000000..b09f74f
--- /dev/null
+++ b/templates/index.html
@@ -0,0 +1,270 @@
[templates/index.html adds the single-page UI for 丑搜 ("Ugly Search"); its visible text:]

A page counter ("页") and a search box with the placeholder "随便打点字呗" ("just type something").

  • What an elegant search UI! Full-text search, fuzzy matching, simplified/traditional Chinese, pinyin and homophone search.
  • Nearly 130,000 Chinese blog posts (including a small number of podcasts) from 1.5K+ blogs are indexed.

Results are ranked by relevance only, with no recency weighting, which makes genuinely valuable articles easier to find. If you need more precise results, put your clever little brain to work: searching ";作者" (a semicolon followed by the author's name) filters posts by that author. The database is updated monthly, so if you need real-time information, please use some other elegant search engine. Hopefully you will find something useful among these hundreds of thousands of articles.

  • If nothing happens after you type, the database has blown up.
  • What, the left/right arrow keys move the cursor and turn the page at the same time? Yes, that is a feature. Paging past the last page shows nothing at all? Yes, that is also a feature!
  • Why don't the paging buttons below do anything? That's a fea... fine, it was a bug, and it has been fixed.
  • Why would you think this site needs a search button? Far too elegant; just type in the box and let the browser worry about the rest.
  • What, you say this site really is too elegant? Please send me your hand-written uGly CsS!

Expanding the full text is still not very elegant; maybe a Markdown renderer could be squeezed in. Never mind, isn't the human brain the best Markdown renderer anyway?

To request that your blog be indexed, message me: TG @yzqzss / Email: yzqzss@othing.xyz
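
For the backend to return real hits during local development, the `entry` index has to contain documents first. The following seeding sketch is illustrative and not part of the commit: it assumes the same defaults as the backend (MeiliSearch at http://127.0.0.1:7700, empty MEILI_KEY) and meilisearch-python-sdk's `add_documents` helper; the field names mirror `attributes_to_retrieve` in the search handler, and the sample values are made up.

```python
# Seed the 'entry' index with one sample document for local testing (illustrative only).
import asyncio
import time

import meilisearch_python_sdk

MEILI_HOST = 'http://127.0.0.1:7700'  # same defaults as the backend
MEILI_KEY = ''

async def seed():
    client = meilisearch_python_sdk.AsyncClient(MEILI_HOST, MEILI_KEY)
    doc = {
        'id': 1,
        'id_feed': 1,
        'title': 'Hello saveweb',
        'content': 'A short test article so /api/search has something to return.',
        'link': 'https://example.com/hello',
        'date': int(time.time()),
        'tags': ['test'],
        'author': 'example',
        'lastSeen': int(time.time()),
    }
    # Meilisearch indexes documents asynchronously; the returned task can be polled if needed.
    task = await client.index('entry').add_documents([doc])
    print('queued indexing task:', task)

if __name__ == '__main__':
    asyncio.run(seed())
```
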