This commit is contained in:
yzqzss 2024-02-18 23:33:21 +08:00
commit c943b2023b
5 changed files with 530 additions and 0 deletions

6
.gitignore vendored Normal file
View File

@ -0,0 +1,6 @@
.git
.venv
__pycache__
search.log
.vscode
deploy.sh

30
README.md Normal file
View File

@ -0,0 +1,30 @@
# saveweb-search-backend
## Installation
```bash
pip install -r requirements.txt
```
## Setup environment variables
```bash
MEILI_KEY # MeiliSearch API key.
# default: '' (empty string)
MEILI_HOST # MeiliSearch host.
# default: "http://127.0.0.1:7700"
STWP_SEARCH_MAX_LOAD # If the load is higher than this, API will return 503.
# default: cpu_count / 1.5
STWP_SEARCH_MAX_FLYING_OPS # If the number of flying requests is higher than this, API will return 503.
# default: $STWP_SEARCH_MAX_LOAD * 2 (min value: 1)
STWP_SEARCH_CORS # CORS Allow-Origin header, split by `,`
# default: *
```
## Run
```bash
python saveweb-search-backend.py
# or
hypercorn --bind '[::]:8077' saveweb-search-backend:app # to customize the bind address
```

3
requirements.txt Normal file
View File

@ -0,0 +1,3 @@
fastapi
hypercorn
meilisearch-python-sdk

221
saveweb-search-backend.py Normal file
View File

@ -0,0 +1,221 @@
from functools import wraps
import asyncio
import os
import time
from fastapi import FastAPI, Response
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse, JSONResponse
import meilisearch_python_sdk
import meilisearch_python_sdk.errors
# ---------------------------------------------------------------------------
# Runtime configuration. Every setting is read from the environment once, at
# import time, and echoed to stdout so the effective values show up in logs.
# ---------------------------------------------------------------------------

# MeiliSearch API key. Only whether it is set is printed, never the value.
MEILI_KEY = os.getenv('MEILI_KEY', '')
print('$MEILI_KEY', 'set' if MEILI_KEY else 'not set')

# Base URL of the MeiliSearch instance.
MEILI_HOST = os.getenv('MEILI_HOST', 'http://127.0.0.1:7700')
print('$MEILI_HOST', MEILI_HOST)

# Load-average ceiling. Defaults to cpu_count / 1.5, or 1.5 when the CPU
# count cannot be determined.
_max_load_env = os.getenv('STWP_SEARCH_MAX_LOAD')
if _max_load_env:
    STWP_SEARCH_MAX_LOAD = float(_max_load_env)
else:
    _cpu_count = os.cpu_count()
    STWP_SEARCH_MAX_LOAD = _cpu_count / 1.5 if _cpu_count else 1.5
print('$STWP_SEARCH_MAX_LOAD', STWP_SEARCH_MAX_LOAD)

# Ceiling on concurrently running requests. Defaults to twice the load
# ceiling and is never allowed to drop below 1.
_max_ops_env = os.getenv('STWP_SEARCH_MAX_FLYING_OPS')
STWP_SEARCH_MAX_FLYING_OPS = max(
    int(_max_ops_env) if _max_ops_env else int(STWP_SEARCH_MAX_LOAD * 2),
    1,
)
print('$STWP_SEARCH_MAX_FLYING_OPS', STWP_SEARCH_MAX_FLYING_OPS)

# Comma-separated list of allowed CORS origins
# (e.g. 'https://search.saveweb.org'); the default allows every origin.
STWP_SEARCH_CORS = os.getenv('STWP_SEARCH_CORS', '*')
print('$STWP_SEARCH_CORS', STWP_SEARCH_CORS)
app = FastAPI()

# Configure CORS from the comma-separated STWP_SEARCH_CORS setting. Entries
# are stripped so that a value such as "https://a.example, https://b.example"
# works, and empty entries are dropped.
_cors_origins = [
    origin.strip() for origin in STWP_SEARCH_CORS.split(',') if origin.strip()
]
app.add_middleware(
    CORSMiddleware,
    allow_origins=_cors_origins,
    # Credentialed CORS must not be combined with a wildcard origin (see the
    # FastAPI CORS documentation), so credentials are only allowed when the
    # operator configured explicit origins.
    allow_credentials='*' not in _cors_origins,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Name of the MeiliSearch index that every endpoint queries.
index_name = "entry"
async def get_load():
    """Return the 1-minute system load average as a float.

    The value is read from ``/proc/loadavg`` when that file exists. On hosts
    without it (non-Linux systems) ``os.getloadavg()`` is used instead, so the
    limiter does not fail every request with ``FileNotFoundError``.

    Raises:
        OSError: if the fallback cannot obtain the load average.
    """
    try:
        with open('/proc/loadavg', 'r') as f:
            # The first whitespace-separated field is the 1-minute average.
            return float(f.read().split()[0])
    except FileNotFoundError:
        return os.getloadavg()[0]
def load_limiter(func):
    """Decorate an async handler so it is skipped while the host is overloaded.

    Before each call the current load (``get_load()``) is compared with
    ``STWP_SEARCH_MAX_LOAD``. When the load is higher, the handler is not
    invoked; a 503 JSON response with a ``Retry-After: 30`` header, a single
    placeholder hit and an ``error`` field is returned instead.
    """
    @wraps(func)
    async def wrapper(*args, **kwargs):
        overloaded = await get_load() > STWP_SEARCH_MAX_LOAD
        if not overloaded:
            return await func(*args, **kwargs)

        print('[INFO] 荷载过高')
        # Shaped like a regular search hit so clients can render it as-is.
        busy_notice = {
            'title': '丑搜当前荷载过高,请稍后再试',
            'content': '服务器荷载过高请稍后再试。原因1. 数据库正在更新全文索引 2. 服务器没有摸鱼,在干其它重荷载的任务',
            'author': ';丑搜',
            'date': int(time.time()),
            'link': '#',
        }
        payload = {
            'hits': [busy_notice],
            'error': '丑搜当前荷载过高,请稍后再试',
        }
        return JSONResponse(payload, status_code=503, headers={'Retry-After': '30'})

    return wrapper
# Number of decorated handlers that are currently running ("in flight").
flying_ops = 0


def ops_limiter(func):
    """Decorate an async handler so only a bounded number run concurrently.

    While fewer than ``STWP_SEARCH_MAX_FLYING_OPS`` decorated calls are in
    flight, the handler runs and the counter is incremented for its duration.
    Otherwise a 503 JSON response with a ``Retry-After: 30`` header, a single
    placeholder hit and an ``error`` field is returned without calling it.
    """
    @wraps(func)
    async def wrapper(*args, **kwargs):
        global flying_ops
        if flying_ops < STWP_SEARCH_MAX_FLYING_OPS:
            flying_ops += 1
            try:
                return await func(*args, **kwargs)
            finally:
                # Release the slot even when the handler raises.
                flying_ops -= 1

        print('[INFO] 操作过多')
        # Shaped like a regular search hit so clients can render it as-is.
        busy_notice = {
            'title': '飞行中的搜索过多,请稍后再试',
            'content': '同一时间内的搜索请求过多。请稍后再试。',
            'author': ';丑搜',
            'date': int(time.time()),
            'link': '#',
        }
        payload = {
            'hits': [busy_notice],
            'error': '操作过多,请稍后再试',
        }
        return JSONResponse(payload, status_code=503, headers={'Retry-After': '30'})

    return wrapper
# Module-level async MeiliSearch client, built from MEILI_HOST and MEILI_KEY
# at import time and shared by the API endpoints.
client = meilisearch_python_sdk.AsyncClient(MEILI_HOST, MEILI_KEY)
@app.get('/api/')
async def go_back_home():
    """Redirect requests for the bare API root to the site root (HTTP 302)."""
    redirect_headers = {'Location': '/'}
    return Response(status_code=302, headers=redirect_headers)
@app.get('/api/entry/{entry_id}')
@load_limiter
@ops_limiter
async def article(entry_id: int):
    """Return a single indexed document by its id.

    Args:
        entry_id: Id of the document in the MeiliSearch index.

    Returns:
        A dict with the document under ``data`` and a note under
        ``humans.txt``. When MeiliSearch reports an error, a JSON response
        with an ``error`` field is returned instead: status 404 if the SDK
        error carries a 404 status code, otherwise status 500.
    """
    try:
        document = await client.index(index_name).get_document(entry_id)
    except meilisearch_python_sdk.errors.MeilisearchError as e:
        # Previously this propagated as an unhandled exception; report it
        # through the ``error`` field like the search endpoint does.
        print('数据库错误', e)
        # Not every MeilisearchError subclass is guaranteed to expose a
        # status code, hence getattr.
        not_found = getattr(e, 'status_code', None) == 404
        return JSONResponse({
            'error': '条目不存在' if not_found else '数据库错误',
        }, status_code=404 if not_found else 500)
    return {
        'data': document,
        'humans.txt': 'is_favorite 目前与主数据库不同步',
    }
@app.get('/api/stats')
@app.head('/api/stats')
@load_limiter
@ops_limiter
async def stats():
    """Return the statistics MeiliSearch reports for the entry index."""
    entry_index = client.index(index_name)
    return await entry_index.get_stats()
@app.get('/api/search')
@load_limiter
@ops_limiter
async def search(q: str = 'saveweb', p: int = 0, f: str = 'false', h: str = 'false'):
    """Search the entry index and return one page of hits.

    Args:
        q: Search query. An empty query is rejected with HTTP 400.
        p: 0-based page number; a page holds 10 hits. Negative values are
            treated as 0 so they cannot produce a negative offset.
        f: ``'true'`` to return each hit's full content. Any other value
            crops the formatted content (the search itself always runs
            against the full text).
        h: ``'true'`` to wrap matches in highlight ``<span>`` tags.

    Returns:
        A dict with ``hits``, ``estimatedTotalHits`` and ``humans.txt``.
        Every hit gets a ``content_length`` field and has its ``_formatted``
        values merged over the plain ones. If the main query fails, a dict
        with one placeholder hit and an ``error`` field is returned instead.
    """
    query = q
    page = max(p, 0)
    want_fulltext = f == 'true'
    want_highlight = h == 'true'
    print(query, page, 'fulltext:', f, 'highlight:', h)

    # Explicit UTF-8 so non-ASCII queries never depend on the host locale.
    with open('search.log', 'a', encoding='utf-8') as fio:
        fio.write(f'{query}\t{page}\n')

    # Empty query in, error out.
    if not query:
        return JSONResponse({
            'error': '搜索词为空',
        }, status_code=400)

    index = client.index(index_name)
    paging = {
        'limit': 10,
        'offset': 10 * page,
    }
    opt_params = {
        **paging,
        'attributes_to_retrieve': ['id', 'id_feed', 'title', 'content', 'link', 'date', 'tags', 'author', 'lastSeen'],
    }
    if not want_fulltext:
        opt_params['attributes_to_crop'] = ['content']
        opt_params['crop_length'] = 120
    if want_highlight:
        opt_params['attributes_to_highlight'] = ['title', 'content', 'date', 'tags', 'author']
        opt_params['highlight_pre_tag'] = '<span class="uglyHighlight text-purple-500">'
        opt_params['highlight_post_tag'] = '</span>'

    # Main query.
    try:
        _results = await index.search(query, **opt_params)
    except meilisearch_python_sdk.errors.MeilisearchError as e:
        print('数据库错误', e)
        return {
            'hits': [
                {
                    'title': '数据库错误',
                    'content': '查询数据库时出错。如果一直出现这个错误,说明数据库寄了,请反馈。',
                    'author': ';丑搜',
                    'date': int(time.time()),
                    'link': '#',
                },
            ],
            'error': '数据库错误',
        }

    # Second query (cropped mode only): fetch the uncropped content to learn
    # each hit's full length. Only id and content are needed. It is
    # best-effort: if it fails, or returns a different set of ids, the loop
    # below falls back to the length of the content already in hand.
    # NOTE(review): this may be redundant if the top-level ``content`` of the
    # main query is already uncropped (cropping applied to ``_formatted``
    # only) — confirm against the MeiliSearch docs before removing.
    lengths: dict[str, int] = {}
    if not want_fulltext:
        try:
            full_results = await index.search(
                query, **paging, attributes_to_retrieve=['id', 'content'],
            )
        except meilisearch_python_sdk.errors.MeilisearchError as e:
            print('数据库错误', e)
        else:
            for hit in full_results.hits:
                lengths[str(hit['id'])] = len(hit['content'])

    for hit in _results.hits:
        hit_id = str(hit['id'])
        if hit_id in lengths:
            hit['content_length'] = lengths[hit_id]
        else:
            hit['content_length'] = len(hit['content'])
        # Replace the plain field values with their formatted counterparts.
        formatted = hit.pop('_formatted', None)
        if formatted is not None:
            hit.update(formatted)

    return {
        'hits': _results.hits,
        # TODO: rename estimatedTotalHits to estimated_total_hits
        'estimatedTotalHits': _results.estimated_total_hits,
        'humans.txt': '使用 API 时请检查 error 字段,高荷载/出错时会返回它。is_favorite 字段目前与主数据库不同步,只有在全库重新索引时才会更新。',
    }
@app.route('/')
async def root(request):
    """Serve the search UI (the site only has this one page).

    The template is read from ``templates/index.html``, relative to the
    current working directory, on every request.

    Args:
        request: The incoming request object; unused.
    """
    # The context manager closes the file handle; explicit UTF-8 keeps the
    # non-ASCII page text independent of the host locale.
    with open('templates/index.html', 'r', encoding='utf-8') as page:
        return HTMLResponse(page.read())
async def main():
    """Serve ``app`` with Hypercorn, bound to ``[::]:8077``."""
    import hypercorn.asyncio

    server_config = hypercorn.Config()
    server_config.bind = ['[::]:8077']
    await hypercorn.asyncio.serve(app, server_config)


if __name__ == '__main__':
    # Alternative launch from the command line:
    #   hypercorn --bind '[::]:8077' saveweb-search-backend:app
    asyncio.run(main())

270
templates/index.html Normal file
View File

@ -0,0 +1,270 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>丑搜</title>
<style>
/* Search input box. */
#searchBar {
width: 500px;
height: 30px;
margin-top: 20px;
margin-bottom: 20px;
padding: 5px;
border: solid 1px #ccc;
box-shadow: 0px 0px 5px #ccc;
font-size: 16px;
}
/* Card wrapping a single search result. */
.resultItem {
margin-top: 10px;
/* padding: 5px; */
/* center horizontally */
margin-left: auto;
margin-right: auto;
border: solid 1px #ccc;
box-shadow: 0px 0px 5px #ccc;
background-color: #eeeeeecb;
max-width: 1000px;
}
/* Title link of a result. */
.resultTitle {
font-size: 18px;
font-weight: bold;
}
/* Author / date / length line of a result. */
.resultInfo {
font-size: 14px;
}
/* Body text of a result. */
.resultContent {
font-size: 14px;
}
/* Buttons in the top toolbar. */
#prevPage, #nextPage, #fullText {
width: 100px;
height: 30px;
margin-top: 20px;
margin-bottom: 20px;
margin-right: auto;
border: solid 1px #ccc;
box-shadow: 0px 0px 5px #ccc;
font-size: 16px;
}
/* Keep the hit count, page number and toolbar buttons on one line. */
#estimatedTotalHits, #page, #fullText, #prevPage, #nextPage {
display: inline-block;
/* font-size: 16px; */
}
#fullText {
color: red;
}
/* Container of the bottom pagination buttons, centered. */
#buttonGroup_ {
margin-top: 20px;
margin-bottom: 20px;
margin-left: auto;
margin-right: auto;
max-width: 220px;
}
/* Bottom pagination buttons; same declarations as the top toolbar buttons. */
#prevPage_, #nextPage_ {
width: 100px;
height: 30px;
margin-top: 20px;
margin-bottom: 20px;
margin-right: auto;
border: solid 1px #ccc;
box-shadow: 0px 0px 5px #ccc;
font-size: 16px;
}
/* Class of the highlight <span> wrapped around matched terms. */
.uglyHighlight {
color: rgb(255, 0, 153);
}
</style>
</head>
<body>
<input type="text" id="searchBar" placeholder="请输入关键字">
<div id="estimatedTotalHits"></div><div id="page"></div>
<button id="prevPage">上一页</button> <button id="nextPage">下一页</button>
<button id="fullText">展开全文</button>
<div id="searchResults">随便打点字呗</div>
<div id="buttonGroup_">
<button id="prevPage_">上一页</button> <button id="nextPage_">下一页</button>
</div>
<li>多么优雅的搜索界面!全文搜索,模糊搜索,简繁同搜,拼音,同音字。</li>
<li>有近 13 万篇中文博客文章(包含少量播客),共收录有 1.5K+ 博客。</li>
<p><strong>搜索结果以匹配度排序,没有时间权重,这样更容易找到真正有价值的文章</strong>。如果你需要更精准的搜索结果,请发动你的小脑瓜。可以用 ";作者" 来筛选同作者的文章。数据库月度更新,如果你需要实时信息,请使用其他优美的搜索引擎。希望你能在这十几万篇文章里找到有用的东西。</p>
<br>
<li>输入文字后如果没反应说明数据库炸了。</li>
<li>什么,左右键能同时移动光标和翻页,是的,这是 feature 。翻页翻过了就啥都没有了,是的,这也是 feature </li>
<li>为什么下面的翻页按钮没用?这是 fea... 好吧,是 BUG修了。</li>
<li>为什么你认为本站需要搜索按钮?那太优雅了,你只管在框框里打字,剩下的浏览器来想办法。</li>
<li>什么,你说本站真的太优雅了?请把您写好的 uGly CsS 直接发给我!</li>
<p><del>展开全文还不太优雅,我看能不能塞个 MarkDown 渲染器。</del>不加了不加了,人脑不就是最好的 MarkDown 渲染器吗?</p>
<p>如需添加收录,给我发消息 TG: @yzqzss / Email: yzqzss@othing.xyz </p>
<script>
// Look up the search box, result list, hit counter and every button once.
const [
    searchBar,
    searchResults,
    estimatedTotalHits,
    prevPage,
    nextPage,
    prevPage_,
    nextPage_,
    fullText,
] = [
    'searchBar',
    'searchResults',
    'estimatedTotalHits',
    'prevPage',
    'nextPage',
    'prevPage_',
    'nextPage_',
    'fullText',
].map((id) => document.getElementById(id));

// Client-side rate limiting: counts searches and is reset every 10 seconds.
let dosearchCount = 0;
setInterval(function resetSearchCount() {
    dosearchCount = 0;
}, 10 * 1000);

// Current page number (0-based).
let page = 0;
// Pagination. The top and bottom buttons and the left/right arrow keys all
// share the same two actions.

// Go back one page and re-run the search; does nothing on the first page.
function goToPreviousPage() {
    if (page > 0) {
        page--;
        searchBar.dispatchEvent(new Event('dosearch'));
    }
}

// Advance one page and re-run the search. There is no upper bound: paging
// past the last result simply yields an empty result list.
function goToNextPage() {
    page++;
    searchBar.dispatchEvent(new Event('dosearch'));
}

[prevPage_, prevPage].forEach((button) => {
    button.addEventListener('click', goToPreviousPage);
});
[nextPage_, nextPage].forEach((button) => {
    button.addEventListener('click', goToNextPage);
});

// Arrow keys page as well. `event.key` replaces the deprecated
// `event.keyCode` (37 = left arrow, 39 = right arrow).
document.addEventListener('keydown', (event) => {
    if (event.key === 'ArrowLeft') {
        goToPreviousPage();
    } else if (event.key === 'ArrowRight') {
        goToNextPage();
    }
});
// Whether results are requested with their full content.
let fullTextFlag = false;

// Toggle full-text mode: flip the flag, relabel the button, clear the
// current results and search again.
fullText.addEventListener('click', function toggleFullText() {
    fullTextFlag = !fullTextFlag;
    fullText.innerHTML = fullTextFlag ? '收起全文' : '展开全文';
    searchResults.innerHTML = '';
    searchBar.dispatchEvent(new Event('dosearch'));
});
// Typing in the search box starts a new search from scratch.
searchBar.addEventListener('input', function onSearchInput() {
    // Back to the first page, in cropped (non-full-text) mode.
    page = 0;
    fullTextFlag = false;
    fullText.innerHTML = '展开全文';

    // Show the (1-based) page number right away.
    document.getElementById('page').innerHTML = page + 1;

    // Debounce: restart the 200 ms timer on every keystroke so the search
    // only fires once the user pauses typing.
    clearTimeout(window.searchTimer);
    window.searchTimer = setTimeout(function fireSearch() {
        searchBar.dispatchEvent(new Event('dosearch'));
    }, 200);
});
// Id of the most recent 'dosearch' event. Responses that belong to an older
// event are discarded so a slow reply cannot overwrite newer results.
let latestSearchId = 0;

// Return `link` when it resolves to an http(s) URL, otherwise '#'. Keeps
// schemes such as `javascript:` out of the result links.
function safeLink(link) {
    try {
        const protocol = new URL(link, window.location.href).protocol;
        return protocol === 'http:' || protocol === 'https:' ? link : '#';
    } catch (e) {
        return '#';
    }
}

// Handle the custom 'dosearch' event: query /api/search and render the hits.
searchBar.addEventListener('dosearch', () => {
    const searchId = ++latestSearchId;

    // An empty query just clears the result list.
    const query = searchBar.value.trim();
    if (!query) {
        searchResults.innerHTML = '';
        return;
    }
    // Client-side rate limit (the counter is reset periodically elsewhere).
    if (dosearchCount > 20) {
        searchResults.innerHTML = '搜索太频繁了,休息一下吧。';
        return;
    }
    dosearchCount++;

    // Remember the mode this request was made in, so rendering matches the
    // response even if the toggle changes while the request is in flight.
    const wantFullText = fullTextFlag;

    // p is the 0-based page, f requests full text, h requests highlighting.
    fetch('/api/search?q=' + encodeURIComponent(query) + '&p=' + page + '&f=' + wantFullText + '&h=' + true)
        .then(response => response.json())
        .then(data => {
            if (searchId !== latestSearchId) {
                return; // superseded by a newer search
            }
            searchResults.innerHTML = '';

            // Error payloads carry no estimatedTotalHits; show nothing then
            // rather than "undefined".
            if (typeof data.estimatedTotalHits !== 'number') {
                estimatedTotalHits.innerHTML = '';
            } else if (data.estimatedTotalHits == 1000) {
                estimatedTotalHits.innerHTML = `约 999+ 条结果`;
            } else {
                estimatedTotalHits.innerHTML = `约 ${data.estimatedTotalHits} 条结果`;
            }
            // Show the (1-based) page number.
            document.getElementById('page').innerHTML = page + 1;

            // Some error responses have an `error` field but no `hits`.
            const hits = Array.isArray(data.hits) ? data.hits : [];
            if (hits.length === 0 && data.error) {
                searchResults.textContent = data.error;
                return;
            }

            // NOTE(review): hit fields are inserted with innerHTML so the
            // server's highlight <span> tags render. If indexed titles or
            // content can contain arbitrary markup this is an XSS vector —
            // confirm that the indexer sanitizes them.
            hits.forEach(hit => {
                const resultItem = document.createElement('div');
                resultItem.classList.add('resultItem');

                const resultTitle = document.createElement('a');
                resultTitle.classList.add('resultTitle');
                resultTitle.innerHTML = hit.title.replace(/;/g, '');
                resultTitle.href = safeLink(hit.link);

                // Prefer the server-computed length of the full content;
                // hit.content may be cropped and contain highlight markup.
                const contentLength = typeof hit.content_length === 'number'
                    ? hit.content_length
                    : hit.content.length;
                const resultInfo = document.createElement('div');
                resultInfo.classList.add('resultInfo');
                resultInfo.innerHTML = "by " + hit.author + ' at ' + new Date(hit.date*1000).toLocaleString() + '. 大概字数: ' + contentLength;

                const resultContent = document.createElement('div');
                resultContent.classList.add('resultContent');
                if (wantFullText) {
                    // Collapse runs of three or more newlines to two, then
                    // turn newlines into <br>.
                    hit.content = hit.content.replace(/\n{3,}/g, '\n\n');
                    resultContent.innerHTML = hit.content.replace(/\n/g, '<br>');
                } else {
                    resultContent.innerHTML = hit.content + '...';
                }

                resultItem.appendChild(resultTitle);
                resultItem.appendChild(resultInfo);
                resultItem.appendChild(resultContent);
                searchResults.appendChild(resultItem);
            });
        })
        .catch(error => console.error(error));
});
</script>
</body>
</html>