public

2024-09-19 03:15:29 -07:00 · 2024-02-18 23:33:21 +08:00 · 2024-02-18 23:33:21 +08:00 · c943b2023b
commit c943b2023b
5 changed files with 530 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,6 @@
+.git
+.venv
+__pycache__
+search.log
+.vscode
+deploy.sh
--- a/README.md
+++ b/README.md
@ -0,0 +1,30 @@
+# saveweb-search-backend
+
+## Installation
+
+```bash
+pip install -r requirements.txt
+```
+
+## Setup environment variables
+
+```bash
+MEILI_KEY # MeiliSearch API key.
+# default: '' (empty string)
+MEILI_HOST # MeiliSearch host.
+# default: "http://127.0.0.1:7700"
+STWP_SEARCH_MAX_LOAD # If the 1-minute system load average is higher than this, the API returns 503.
+# default: cpu_count / 1.5 (or 1.5 if the CPU count cannot be determined)
+STWP_SEARCH_MAX_FLYING_OPS # If the number of in-flight requests has reached this value, the API returns 503.
+# default: $STWP_SEARCH_MAX_LOAD * 2, rounded down (min value: 1)
+STWP_SEARCH_CORS # Allowed CORS origins, separated by `,`
+# default: *
+```
+
+## Run
+
+```bash
+python saveweb-search-backend.py
+# or
+hypercorn --bind '[::]:8077' saveweb-search-backend:app # to customize the bind address
+```
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,3 @@
+fastapi
+hypercorn
+meilisearch-python-sdk
--- a/saveweb-search-backend.py
+++ b/saveweb-search-backend.py
@ -0,0 +1,221 @@
+from functools import wraps
+import asyncio
+import os
+import time
+
+from fastapi import FastAPI, Response
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import HTMLResponse, JSONResponse
+
+import meilisearch_python_sdk
+import meilisearch_python_sdk.errors
+
+
+# Runtime configuration, read once from the environment at import time.
+# Each value is echoed to stdout; for the API key only "set"/"not set" is shown.
+MEILI_KEY = os.getenv('MEILI_KEY', '')
+print('$MEILI_KEY', 'set' if MEILI_KEY else 'not set')
+
+MEILI_HOST = os.getenv('MEILI_HOST', 'http://127.0.0.1:7700')
+print('$MEILI_HOST', MEILI_HOST)
+
+# Load threshold checked by load_limiter. An unset or empty variable falls
+# back to cpu_count / 1.5, or to 1.5 when the CPU count is unavailable.
+_max_load_env = os.getenv('STWP_SEARCH_MAX_LOAD')
+if _max_load_env:
+    STWP_SEARCH_MAX_LOAD = float(_max_load_env)
+else:
+    _cpu_total = os.cpu_count()
+    STWP_SEARCH_MAX_LOAD = _cpu_total / 1.5 if _cpu_total else 1.5
+print('$STWP_SEARCH_MAX_LOAD', STWP_SEARCH_MAX_LOAD)
+
+# Concurrency cap checked by ops_limiter. An unset or empty variable falls
+# back to twice the load threshold (truncated); the result is never below 1.
+_max_ops_env = os.getenv('STWP_SEARCH_MAX_FLYING_OPS')
+STWP_SEARCH_MAX_FLYING_OPS = max(
+    1,
+    int(_max_ops_env) if _max_ops_env else int(STWP_SEARCH_MAX_LOAD * 2),
+)
+print('$STWP_SEARCH_MAX_FLYING_OPS', STWP_SEARCH_MAX_FLYING_OPS)
+
+# Comma-separated list of allowed CORS origins; '*' (any origin) by default.
+# A restrictive value would look like 'https://search.saveweb.org'.
+STWP_SEARCH_CORS = os.getenv('STWP_SEARCH_CORS', '*')
+print('$STWP_SEARCH_CORS', STWP_SEARCH_CORS)
+
+app = FastAPI()
+# CORS: STWP_SEARCH_CORS is a comma-separated list of allowed origins.
+# Whitespace around each item is stripped and empty items are dropped, so a
+# value such as "https://a.example, https://b.example" matches both origins.
+# NOTE(review): the default origin list is '*' combined with
+# allow_credentials=True. FastAPI's CORS documentation says wildcards are not
+# meant to be used together with credentials -- confirm this is intended.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=[
+        origin.strip()
+        for origin in STWP_SEARCH_CORS.split(',')
+        if origin.strip()
+    ],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Name of the Meilisearch index that every endpoint queries.
+index_name = "entry"
+
+
+async def get_load():
+    """Return the 1-minute system load average as a float.
+
+    Uses os.getloadavg() instead of parsing /proc/loadavg by hand, so the
+    service also runs on Unix systems that have no procfs (e.g. macOS, BSD).
+
+    Raises:
+        OSError: if the load average cannot be obtained.
+    """
+    # getloadavg() returns the (1, 5, 15)-minute averages; only the first is used.
+    return os.getloadavg()[0]
+
+def load_limiter(func):
+    """Decorator that refuses to run an async handler while load is too high.
+
+    Each call first awaits get_load(). If the value exceeds
+    STWP_SEARCH_MAX_LOAD, the handler is skipped and a 503 JSONResponse with a
+    'Retry-After: 30' header is returned; its body holds one placeholder hit
+    explaining the situation plus an 'error' field. Otherwise the wrapped
+    handler is awaited and its result returned unchanged.
+    """
+    @wraps(func)
+    async def guarded(*args, **kwargs):
+        current_load = await get_load()
+        if not (current_load > STWP_SEARCH_MAX_LOAD):
+            return await func(*args, **kwargs)
+
+        print('[INFO] 荷载过高')
+        # Shaped like a search hit so result-list consumers can render it as-is.
+        placeholder_hit = {
+            'title': '丑搜当前荷载过高，请稍后再试',
+            'content': '服务器荷载过高，请稍后再试。原因：1. 数据库正在更新全文索引 2. 服务器没有摸鱼，在干其它重荷载的任务',
+            'author': ';丑搜',
+            'date': int(time.time()),
+            'link': '#',
+        }
+        body = {
+            'hits': [placeholder_hit],
+            'error': '丑搜当前荷载过高，请稍后再试',
+        }
+        return JSONResponse(body, status_code=503, headers={'Retry-After': '30'})
+    return guarded
+
+# Number of calls currently running inside an ops_limiter-wrapped handler.
+flying_ops = 0
+
+
+def ops_limiter(func):
+    """Decorator that caps how many wrapped handlers may run at the same time.
+
+    While fewer than STWP_SEARCH_MAX_FLYING_OPS calls are in progress, the
+    module-level counter flying_ops is incremented, the handler is awaited,
+    and the counter is decremented again even if the handler raises. Once the
+    cap is reached, the handler is skipped and a 503 JSONResponse with a
+    'Retry-After: 30' header, one placeholder hit and an 'error' field is
+    returned instead.
+    """
+    @wraps(func)
+    async def counted(*args, **kwargs):
+        global flying_ops
+        if flying_ops < STWP_SEARCH_MAX_FLYING_OPS:
+            flying_ops += 1
+            try:
+                return await func(*args, **kwargs)
+            finally:
+                flying_ops -= 1
+
+        print('[INFO] 操作过多')
+        # Shaped like a search hit so result-list consumers can render it as-is.
+        placeholder_hit = {
+            'title': '飞行中的搜索过多，请稍后再试',
+            'content': '同一时间内的搜索请求过多。请稍后再试。',
+            'author': ';丑搜',
+            'date': int(time.time()),
+            'link': '#',
+        }
+        body = {
+            'hits': [placeholder_hit],
+            'error': '操作过多，请稍后再试',
+        }
+        return JSONResponse(body, status_code=503, headers={'Retry-After': '30'})
+    return counted
+
+
+# Shared asynchronous Meilisearch client, created once at import time from
+# MEILI_HOST / MEILI_KEY and used by every endpoint in this module.
+client = meilisearch_python_sdk.AsyncClient(MEILI_HOST, MEILI_KEY)
+
+
+@app.get('/api/')
+async def go_back_home():
+    """Answer GET /api/ with a 302 redirect to the site root ('/')."""
+    redirect_headers = {'Location': '/'}
+    return Response(headers=redirect_headers, status_code=302)
+
+@app.get('/api/entry/{entry_id}')
+@load_limiter
+@ops_limiter
+async def article(entry_id: int):
+    """Return a single indexed document by its id.
+
+    Args:
+        entry_id: id of the document in the Meilisearch index.
+
+    Returns:
+        A dict with the document under 'data' and a note for API users under
+        'humans.txt'. If Meilisearch answers 404 for the lookup, a 404
+        JSONResponse carrying an 'error' field is returned instead.
+
+    Raises:
+        meilisearch_python_sdk.errors.MeilisearchApiError: for any API error
+            other than a 404.
+    """
+    try:
+        document = await client.index(index_name).get_document(entry_id)
+    except meilisearch_python_sdk.errors.MeilisearchApiError as e:
+        # An unknown id used to surface as an unhandled exception (HTTP 500);
+        # report it to the caller as a plain "not found" instead.
+        if e.status_code == 404:
+            return JSONResponse({'error': '文章不存在'}, status_code=404)
+        raise
+
+    return {
+        'data': document,
+        'humans.txt': 'is_favorite 目前与主数据库不同步',
+    }
+
+@app.get('/api/stats')
+@app.head('/api/stats')
+@load_limiter
+@ops_limiter
+async def stats():
+    """Return the Meilisearch statistics of the index (served for GET and HEAD)."""
+    index = client.index(index_name)
+    return await index.get_stats()
+
+    
+    
+
+def _db_error_payload():
+    """Build the response body returned when a Meilisearch query fails.
+
+    The body contains one placeholder hit describing the failure, so clients
+    that only render 'hits' still show something, plus an 'error' field.
+    """
+    return {
+        'hits': [
+            {
+                'title': '数据库错误',
+                'content': '查询数据库时出错。如果一直出现这个错误，说明数据库寄了，请反馈。',
+                'author': ';丑搜',
+                'date': int(time.time()),
+                'link': '#',
+            },
+        ],
+        'error': '数据库错误',
+    }
+
+
+@app.get('/api/search')
+@load_limiter
+@ops_limiter
+async def search(q: str = 'saveweb', p: int = 0, f: str = 'false', h: str = 'false'):
+    """Search the index and return one page of ten hits.
+
+    Args:
+        q: search terms.
+        p: 0-based page number; negative values are treated as 0.
+        f: 'true' to return the full content of every hit; any other value
+            crops 'content' to a short excerpt. Matching is always done
+            against the full text, only the returned text is shortened.
+        h: 'true' to wrap matched terms in highlight <span> tags.
+
+    Returns:
+        A dict with 'hits', 'estimatedTotalHits' and 'humans.txt'. Every hit
+        gets a 'content_length' field (length of the uncropped content) when
+        it can be determined, and its '_formatted' variant, if present, is
+        merged into the hit itself. An empty query yields a 400 JSONResponse;
+        a Meilisearch failure yields a body with a placeholder hit and an
+        'error' field (sent with the default 200 status).
+    """
+    query = q
+    # A negative page would become a negative offset; clamp it to the first page.
+    page = max(p, 0)
+    fulltext = f == 'true'
+    highlight = h == 'true'
+
+    print(query, page, 'fulltext:', f, 'highlight:', h)
+    # Queries are frequently non-ASCII, so do not rely on the locale encoding.
+    with open('search.log', 'a', encoding='utf-8') as fio:
+        fio.write(query + '\t' + str(page) + '\n')
+
+    # Empty query in, empty answer out.
+    if not query:
+        return JSONResponse({
+            'error': '搜索词为空',
+        }, status_code=400)
+
+    # Parameters shared by the main query and the length-measuring query.
+    base_params = {
+        'limit': 10,
+        'offset': 10 * page,
+        'attributes_to_retrieve': ['id', 'id_feed', 'title', 'content', 'link', 'date', 'tags', 'author', 'lastSeen'],
+    }
+
+    search_params = dict(base_params)
+    if not fulltext:
+        search_params['attributes_to_crop'] = ['content']
+        search_params['crop_length'] = 120
+    if highlight:
+        search_params['attributes_to_highlight'] = ['title', 'content', 'date', 'tags', 'author']
+        search_params['highlight_pre_tag'] = '<span class="uglyHighlight text-purple-500">'
+        search_params['highlight_post_tag'] = '</span>'
+
+    index = client.index(index_name)
+    lengths: dict[str, int] = {}
+    try:
+        _results = await index.search(query, **search_params)
+        if not fulltext:
+            # The cropped hits no longer reveal how long each article is, so
+            # the same query is repeated uncropped purely to measure it.
+            _results2 = await index.search(query, **base_params)
+            for hit in _results2.hits:
+                lengths[str(hit['id'])] = len(hit.get('content') or '')
+    except meilisearch_python_sdk.errors.MeilisearchError as e:
+        print('数据库错误', e)
+        return _db_error_payload()
+
+    for hit in _results.hits:
+        # Measure the length before '_formatted' overwrites 'content' with
+        # the cropped / highlighted variant.
+        if fulltext:
+            hit['content_length'] = len(hit.get('content') or '')
+        elif str(hit['id']) in lengths:
+            hit['content_length'] = lengths[str(hit['id'])]
+        # Replace the plain fields with their formatted counterparts.
+        if '_formatted' in hit:
+            hit.update(hit.pop('_formatted'))
+
+    return {
+        'hits': _results.hits,
+        'estimatedTotalHits': _results.estimated_total_hits,  # TODO: rename estimatedTotalHits to estimated_total_hits
+        'humans.txt': '使用 API 时请检查 error 字段，高荷载/出错时会返回它。is_favorite 字段目前与主数据库不同步，只有在全库重新索引时才会更新。',
+    }
+
+@app.route('/')
+async def root(request):
+    """Serve the frontend: the contents of templates/index.html as HTML.
+
+    Args:
+        request: the incoming request (unused).
+
+    The file is re-read on every request; there is only this one page.
+    """
+    # Close the handle deterministically and decode as UTF-8 regardless of the
+    # process locale (the template contains non-ASCII text).
+    with open('templates/index.html', 'r', encoding='utf-8') as template:
+        return HTMLResponse(template.read())
+
+async def main():
+    """Serve the app with Hypercorn on [::]:8077 (used when run as a script)."""
+    from hypercorn.asyncio import serve
+    from hypercorn.config import Config
+
+    server_config = Config()
+    server_config.bind = ['[::]:8077']
+    await serve(app, server_config)
+
+
+if __name__ == '__main__':
+    # Equivalent CLI: hypercorn --bind '[::]:8077' saveweb-search-backend:app
+    asyncio.run(main())
--- a/templates/index.html
+++ b/templates/index.html
@ -0,0 +1,270 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="UTF-8">
+    <title>丑搜</title>
+    <style>
+        #searchBar {
+            width: 500px;
+            height: 30px;
+            margin-top: 20px;
+            margin-bottom: 20px;
+            padding: 5px;
+            border: solid 1px #ccc;
+            box-shadow: 0px 0px 5px #ccc;
+            font-size: 16px;
+        }
+        .resultItem {
+            margin-top: 10px;
+            /* padding: 5px; */
+            /* 居中 */
+            margin-left: auto;
+            margin-right: auto;
+            border: solid 1px #ccc;
+            box-shadow: 0px 0px 5px #ccc;
+            background-color: #eeeeeecb;
+            max-width: 1000px;
+        }
+        .resultTitle {
+            font-size: 18px;
+            font-weight: bold;
+        }
+        .resultInfo {
+            font-size: 14px;
+        }
+        .resultContent {
+            font-size: 14px;
+        }
+        #prevPage, #nextPage, #fullText {
+            width: 100px;
+            height: 30px;
+            margin-top: 20px;
+            margin-bottom: 20px;
+            margin-right: auto;
+            border: solid 1px #ccc;
+            box-shadow: 0px 0px 5px #ccc;
+            font-size: 16px;
+        }
+        #estimatedTotalHits, #page, #fullText, #prevPage, #nextPage {
+            display: inline-block;
+            /* font-size: 16px; */
+        }
+        #fullText {
+            color: red;
+        }
+        #buttonGroup_ {
+            margin-top: 20px;
+            margin-bottom: 20px;
+            margin-left: auto;
+            margin-right: auto;
+            max-width: 220px;
+        }
+        #prevPage_, #nextPage_ {
+            width: 100px;
+            height: 30px;
+            margin-top: 20px;
+            margin-bottom: 20px;
+            margin-right: auto;
+            border: solid 1px #ccc;
+            box-shadow: 0px 0px 5px #ccc;
+            font-size: 16px;
+        }
+        .uglyHighlight {
+            color: rgb(255, 0, 153);
+        }
+    </style>
+</head>
+<body>
+    <input type="text" id="searchBar" placeholder="请输入关键字">
+    
+    <div id="estimatedTotalHits"></div> 第 <div id="page"></div> 页
+    <button id="prevPage">上一页</button> <button id="nextPage">下一页</button>
+    <button id="fullText">展开全文</button>
+
+    <div id="searchResults">随便打点字呗</div>
+    <div id="buttonGroup_">
+        <button id="prevPage_">上一页</button> <button id="nextPage_">下一页</button>
+    </div>
+
+    <ul>
+        <li>多么优雅的搜索界面！全文搜索，模糊搜索，简繁同搜，拼音，同音字。</li>
+        <li>有近 13 万篇中文博客文章（包含少量播客），共收录有 1.5K+ 博客。</li>
+    </ul>
+    <p><strong>搜索结果以匹配度排序，没有时间权重，这样更容易找到真正有价值的文章</strong>。如果你需要更精准的搜索结果，请发动你的小脑瓜。可以用 ";作者" 来筛选同作者的文章。数据库月度更新，如果你需要实时信息，请使用其他优美的搜索引擎。希望你能在这十几万篇文章里找到有用的东西。</p>
+    <br>
+    <ul>
+        <li>输入文字后如果没反应说明数据库炸了。</li>
+        <li>什么，左右键能同时移动光标和翻页，是的，这是 feature 。翻页翻过了就啥都没有了，是的，这也是 feature ！</li>
+        <li>为什么下面的翻页按钮没用？这是 fea... 好吧，是 BUG，修了。</li>
+        <li>为什么你认为本站需要搜索按钮？那太优雅了，你只管在框框里打字，剩下的浏览器来想办法。</li>
+        <li>什么，你说本站真的太优雅了？请把您写好的 uGly CsS 直接发给我！</li>
+    </ul>
+    <p><del>展开全文还不太优雅，我看能不能塞个 MarkDown 渲染器。</del>不加了不加了，人脑不就是最好的 MarkDown 渲染器吗？</p>
+    <p>如需添加收录，给我发消息 TG: @yzqzss / Email: yzqzss@othing.xyz </p>
+    <script>
+        // Look up the search box, the result list, the total-hits label,
+        // both pairs of paging buttons and the full-text toggle button
+        const searchBar = document.getElementById('searchBar');
+        const searchResults = document.getElementById('searchResults');
+        const estimatedTotalHits = document.getElementById('estimatedTotalHits');
+        const prevPage = document.getElementById('prevPage');
+        const nextPage = document.getElementById('nextPage');
+        const prevPage_ = document.getElementById('prevPage_');
+        const nextPage_ = document.getElementById('nextPage_');
+        const fullText = document.getElementById('fullText');
+
+        // Client-side rate limiting: searches are counted in dosearchCount,
+        // and the counter is reset to zero every 10 seconds
+        let dosearchCount = 0;
+        setInterval(() => {
+            dosearchCount = 0;
+        }, 10 * 1000);
+
+        // Current page number (0-based); starts on the first page
+        let page = 0;
+
+        // Paging. Both button pairs and the left/right arrow keys share the
+        // two helpers below; every page change re-runs the search.
+        const goToPrevPage = () => {
+            // Already on the first page: nothing to do
+            if (!(page > 0)) {
+                return;
+            }
+            page--;
+            searchBar.dispatchEvent(new Event('dosearch'));
+        };
+        const goToNextPage = () => {
+            page++;
+            searchBar.dispatchEvent(new Event('dosearch'));
+        };
+
+        // "Previous page" buttons (bottom and top)
+        prevPage_.addEventListener('click', () => goToPrevPage());
+        prevPage.addEventListener('click', () => goToPrevPage());
+
+        // "Next page" buttons (bottom and top)
+        nextPage_.addEventListener('click', () => goToNextPage());
+        nextPage.addEventListener('click', () => goToNextPage());
+
+        // Arrow keys: left (keyCode 37) = previous page, right (keyCode 39) = next page
+        document.addEventListener('keydown', (event) => {
+            if (event.keyCode == 37) {
+                goToPrevPage();
+            } else if (event.keyCode == 39) {
+                goToNextPage();
+            }
+        });
+
+        let fullTextFlag = false;
+
+        // Full-text toggle: flip the flag, relabel the button to match,
+        // clear the current results and run the search again
+        fullText.addEventListener('click', () => {
+            fullTextFlag = !fullTextFlag;
+            fullText.innerHTML = fullTextFlag ? '收起全文' : '展开全文';
+            searchResults.innerHTML = '';
+            searchBar.dispatchEvent(new Event('dosearch'));
+        });
+
+
+        // On every edit of the search box: return to the first page in
+        // excerpt mode, then search once typing has paused
+        searchBar.addEventListener('input', () => {
+            // Reset paging and the full-text toggle
+            page = 0;
+            fullTextFlag = false;
+            fullText.innerHTML = '展开全文';
+            // Show the (1-based) page number
+            document.getElementById('page').innerHTML = page + 1;
+
+            // Debounce: cancel the pending search, if any, and schedule a
+            // new one 200 ms from now
+            clearTimeout(window.searchTimer);
+            const triggerSearch = () => {
+                searchBar.dispatchEvent(new Event('dosearch'));
+            };
+            window.searchTimer = setTimeout(triggerSearch, 200);
+        });
+        // Handle the custom 'dosearch' event: query /api/search with the
+        // current box content, page and full-text flag, then render the hits
+        searchBar.addEventListener('dosearch', () => {
+            const query = searchBar.value.trim();
+
+            // Nothing typed: clear the result list and stop
+            if (!query) {
+                searchResults.innerHTML = '';
+                return;
+            }
+
+            // Client-side rate limit (the counter is reset periodically elsewhere)
+            if (dosearchCount > 20) {
+                searchResults.innerHTML = '搜索太频繁了，休息一下吧。';
+                return;
+            }
+            dosearchCount++;
+
+            // p is the 0-based page, f asks for full text, h asks for highlighting
+            fetch('/api/search?q=' + encodeURIComponent(query) + '&p=' + page + '&f=' + fullTextFlag + '&h=' + true)
+                .then(response => response.json())
+                .then(data => {
+                    // Drop the previous results
+                    searchResults.innerHTML = '';
+
+                    // Estimated total number of hits. Error responses carry no
+                    // estimate; show nothing rather than "undefined".
+                    if (data.estimatedTotalHits === undefined) {
+                        estimatedTotalHits.innerHTML = '';
+                    } else if (data.estimatedTotalHits == 1000) {
+                        estimatedTotalHits.innerHTML = `约 999+ 条结果`;
+                    } else {
+                        estimatedTotalHits.innerHTML = `约 ${data.estimatedTotalHits} 条结果`;
+                    }
+
+                    // Show the (1-based) page number
+                    document.getElementById('page').innerHTML = page + 1;
+
+                    // NOTE(review): title, author and content are inserted as
+                    // HTML because the server returns highlight <span> tags
+                    // inside them. They originate from indexed articles, so
+                    // confirm they are sanitised somewhere before relying on this.
+                    // Some error responses contain no 'hits' at all.
+                    (data.hits || []).forEach(hit => {
+                        const resultItem = document.createElement('div');
+                        resultItem.classList.add('resultItem');
+                        const resultTitle = document.createElement('a');
+                        resultTitle.classList.add('resultTitle');
+                        resultTitle.innerHTML = hit.title.replace(/;/g, '');
+                        resultTitle.href = hit.link;
+
+                        // Prefer the server-provided length of the uncropped
+                        // article; hit.content may be a cropped excerpt that
+                        // also contains highlight markup.
+                        const contentLength = hit.content_length !== undefined ? hit.content_length : hit.content.length;
+                        const resultInfo = document.createElement('div');
+                        resultInfo.classList.add('resultInfo');
+                        resultInfo.innerHTML = "by " + hit.author + ' at ' + new Date(hit.date*1000).toLocaleString() + '. 大概字数: ' + contentLength;
+
+                        const resultContent = document.createElement('div');
+                        resultContent.classList.add('resultContent');
+                        if (fullTextFlag) {
+                            // Collapse runs of three or more newlines to two
+                            hit.content = hit.content.replace(/\n{3,}/g, '\n\n');
+                            // Turn each newline into <br>
+                            resultContent.innerHTML = hit.content.replace(/\n/g, '<br>');
+                        } else {
+                            resultContent.innerHTML = hit.content + '...';
+                        }
+
+                        resultItem.appendChild(resultTitle);
+                        resultItem.appendChild(resultInfo);
+                        resultItem.appendChild(resultContent);
+                        searchResults.appendChild(resultItem);
+                    });
+                })
+                .catch(error => console.error(error));
+        });
+    </script>
+</body>
+</html>