mirror of
https://github.com/saveweb/saveweb-search-backend.git
synced 2024-09-19 03:15:29 -07:00
public
This commit is contained in:
commit
c943b2023b
6
.gitignore
vendored
Normal file
6
.gitignore
vendored
Normal file
@ -0,0 +1,6 @@
|
||||
.git
|
||||
.venv
|
||||
__pycache__
|
||||
search.log
|
||||
.vscode
|
||||
deploy.sh
|
30
README.md
Normal file
30
README.md
Normal file
@ -0,0 +1,30 @@
|
||||
# saveweb-search-backend
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## Setup environment variables
|
||||
|
||||
```bash
|
||||
MEILI_KEY # MeiliSearch API key.
|
||||
# default: '' (empty string)
|
||||
MEILI_HOST # MeiliSearch host.
|
||||
# default: "http://127.0.0.1:7700"
|
||||
STWP_SEARCH_MAX_LOAD # If the load is higher than this, API will return 503.
|
||||
# default: cpu_count / 1.5
|
||||
STWP_SEARCH_MAX_FLYING_OPS # If the number of flying requests is higher than this, API will return 503.
|
||||
# default: $STWP_SEARCH_MAX_LOAD * 2 (min value: 1)
|
||||
STWP_SEARCH_CORS # Allowed CORS origins (Access-Control-Allow-Origin), as a comma-separated list.
|
||||
# default: *
|
||||
```
|
||||
|
||||
## Run
|
||||
|
||||
```bash
|
||||
python saveweb-search-backend.py
|
||||
# or
|
||||
hypercorn --bind '[::]:8077' saveweb-search-backend:app # to customize the bind address
|
||||
```
|
3
requirements.txt
Normal file
3
requirements.txt
Normal file
@ -0,0 +1,3 @@
|
||||
fastapi
|
||||
hypercorn
|
||||
meilisearch-python-sdk
|
221
saveweb-search-backend.py
Normal file
221
saveweb-search-backend.py
Normal file
@ -0,0 +1,221 @@
|
||||
from functools import wraps
|
||||
import asyncio
|
||||
import os
|
||||
import time
|
||||
|
||||
from fastapi import FastAPI, Response
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import HTMLResponse, JSONResponse
|
||||
|
||||
import meilisearch_python_sdk
|
||||
import meilisearch_python_sdk.errors
|
||||
|
||||
|
||||
# --- Runtime configuration, read once from the environment at import time ---

# MeiliSearch API key; only whether it is set gets printed, never the value.
MEILI_KEY = os.getenv('MEILI_KEY', '')
print('$MEILI_KEY', 'set' if MEILI_KEY else 'not set')

# Base URL of the MeiliSearch instance.
MEILI_HOST = os.getenv('MEILI_HOST', 'http://127.0.0.1:7700')
print('$MEILI_HOST', MEILI_HOST)

# Load threshold above which load_limiter answers 503.
# Default: cpu_count / 1.5, or 1.5 when the CPU count cannot be determined.
_max_load_env = os.getenv('STWP_SEARCH_MAX_LOAD')
if _max_load_env:
    STWP_SEARCH_MAX_LOAD = float(_max_load_env)
else:
    _cpu_count = os.cpu_count()
    STWP_SEARCH_MAX_LOAD = _cpu_count / 1.5 if _cpu_count else 1.5
print('$STWP_SEARCH_MAX_LOAD', STWP_SEARCH_MAX_LOAD)

# Maximum number of concurrently running ("flying") requests before
# ops_limiter answers 503. Default: twice the load threshold; never below 1.
_max_flying_env = os.getenv('STWP_SEARCH_MAX_FLYING_OPS')
STWP_SEARCH_MAX_FLYING_OPS = max(
    1,
    int(_max_flying_env) if _max_flying_env else int(STWP_SEARCH_MAX_LOAD * 2),
)
print('$STWP_SEARCH_MAX_FLYING_OPS', STWP_SEARCH_MAX_FLYING_OPS)

# Comma-separated list of allowed CORS origins, e.g.
# 'https://search.saveweb.org'. Default: '*' (any origin).
STWP_SEARCH_CORS = os.getenv('STWP_SEARCH_CORS', '*')
print('$STWP_SEARCH_CORS', STWP_SEARCH_CORS)

# Tolerate spaces around the commas ("a.example, b.example") and drop empty
# entries so a trailing comma does not produce a bogus '' origin.
_cors_origins = [
    origin.strip() for origin in STWP_SEARCH_CORS.split(',') if origin.strip()
]

app = FastAPI()
# set CORS
app.add_middleware(
    CORSMiddleware,
    allow_origins=_cors_origins,
    # Credentialed requests must not be combined with a wildcard origin;
    # only enable them when explicit origins were configured.
    allow_credentials='*' not in _cors_origins,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Name of the MeiliSearch index that holds the searchable entries.
index_name = "entry"
|
||||
|
||||
|
||||
async def get_load():
    """Return the 1-minute system load average as a float.

    Uses ``os.getloadavg()`` instead of parsing ``/proc/loadavg`` by hand, so
    no blocking file I/O happens inside the coroutine and the function also
    works on Unix systems without procfs.

    Returns:
        The 1-minute load average, or ``0.0`` when the platform cannot report
        one (the load limiter then simply never triggers instead of turning
        every request into a server error).
    """
    try:
        return float(os.getloadavg()[0])
    except (OSError, AttributeError):
        # OSError: load average unobtainable; AttributeError: platform has no
        # os.getloadavg at all.
        return 0.0
|
||||
|
||||
def load_limiter(func):
    """Decorate an async handler so it is refused while the host is overloaded.

    Before every call the value of ``get_load()`` is compared with
    ``STWP_SEARCH_MAX_LOAD``. When the load is higher, the handler is not
    called; a 503 JSON response with a ``Retry-After: 30`` header is returned
    instead. Its body carries a single placeholder hit plus an ``error``
    field, mirroring the shape of a search result.
    """
    @wraps(func)
    async def guarded(*args, **kwargs):
        overloaded = await get_load() > STWP_SEARCH_MAX_LOAD
        if not overloaded:
            return await func(*args, **kwargs)

        print('[INFO] 荷载过高')
        placeholder_hit = {
            'title': '丑搜当前荷载过高,请稍后再试',
            'content': '服务器荷载过高,请稍后再试。原因:1. 数据库正在更新全文索引 2. 服务器没有摸鱼,在干其它重荷载的任务',
            'author': ';丑搜',
            'date': int(time.time()),
            'link': '#',
        }
        payload = {
            'hits': [placeholder_hit],
            'error': '丑搜当前荷载过高,请稍后再试',
        }
        return JSONResponse(payload, status_code=503, headers={'Retry-After': '30'})

    return guarded
|
||||
|
||||
# Number of requests currently executing inside an ops_limiter-wrapped handler.
flying_ops = 0


def ops_limiter(func):
    """Decorate an async handler so only a bounded number run at once.

    While ``flying_ops`` has reached ``STWP_SEARCH_MAX_FLYING_OPS`` the
    handler is not called; a 503 JSON response with a ``Retry-After: 30``
    header is returned instead. Otherwise the counter is incremented for the
    duration of the call and decremented again even if the handler raises.
    """
    @wraps(func)
    async def tracked(*args, **kwargs):
        global flying_ops

        if flying_ops >= STWP_SEARCH_MAX_FLYING_OPS:
            print('[INFO] 操作过多')
            placeholder_hit = {
                'title': '飞行中的搜索过多,请稍后再试',
                'content': '同一时间内的搜索请求过多。请稍后再试。',
                'author': ';丑搜',
                'date': int(time.time()),
                'link': '#',
            }
            payload = {
                'hits': [placeholder_hit],
                'error': '操作过多,请稍后再试',
            }
            return JSONResponse(payload, status_code=503, headers={'Retry-After': '30'})

        flying_ops += 1
        try:
            outcome = await func(*args, **kwargs)
        finally:
            flying_ops -= 1
        return outcome

    return tracked
|
||||
|
||||
|
||||
# Shared asynchronous MeiliSearch client, created once at import time from
# MEILI_HOST and MEILI_KEY and used by every request handler below.
client = meilisearch_python_sdk.AsyncClient(MEILI_HOST, MEILI_KEY)
|
||||
|
||||
|
||||
@app.get('/api/')
async def go_back_home():
    """Answer a bare ``/api/`` request with a 302 redirect to ``/``."""
    home = {'Location': '/'}
    return Response(headers=home, status_code=302)
|
||||
|
||||
@app.get('/api/entry/{entry_id}')
@load_limiter
@ops_limiter
async def article(entry_id: int):
    """Return a single indexed entry by its id.

    Args:
        entry_id: Document id of the entry in the ``index_name`` index.

    Returns:
        ``{'data': <document>, 'humans.txt': <note>}`` on success. When
        MeiliSearch reports an error, a JSON body with an ``error`` field is
        returned instead: status 404 if MeiliSearch answered 404 (no such
        document), otherwise 502.
    """
    try:
        document = await client.index(index_name).get_document(entry_id)
    except meilisearch_python_sdk.errors.MeilisearchError as e:
        # Previously any MeiliSearch failure surfaced as an unhandled 500;
        # handle it the way search() does and report it in an 'error' field.
        print('数据库错误', e)
        # Only API errors carry an HTTP status; fall back to None otherwise.
        if getattr(e, 'status_code', None) == 404:
            return JSONResponse({'error': '文章不存在'}, status_code=404)
        return JSONResponse({'error': '数据库错误'}, status_code=502)

    return {
        'data': document,
        'humans.txt': 'is_favorite 目前与主数据库不同步',
    }
|
||||
|
||||
@app.get('/api/stats')
@app.head('/api/stats')
@load_limiter
@ops_limiter
async def stats():
    """Return the statistics MeiliSearch reports for the entry index."""
    entry_index = client.index(index_name)
    return await entry_index.get_stats()
|
||||
|
||||
|
||||
|
||||
|
||||
@app.get('/api/search')
@load_limiter
@ops_limiter
async def search(q: str = 'saveweb', p: int = 0, f: str = 'false', h: str = 'false'):
    """Search the entry index and return one page (10 hits) of results.

    Args:
        q: Search terms.
        p: 0-based page number.
        f: ``'true'`` to return the full ``content`` of every hit. Any other
            value asks MeiliSearch to crop ``content`` (``crop_length`` 120);
            the search itself always runs against the full text.
        h: ``'true'`` to wrap matches in highlight ``<span>`` tags.

    Returns:
        A dict with ``hits``, ``estimatedTotalHits`` and a ``humans.txt``
        note. Every hit carries ``content_length``, the length of its
        uncropped, unhighlighted content. An empty query or a negative page
        yields a 400 JSON response; a MeiliSearch failure yields a
        placeholder hit plus an ``error`` field.
    """
    query = q  # search terms
    page = p  # 0-based
    fulltext = f == 'true'
    highlight = h == 'true'

    print(query, page, 'fulltext:', f, 'highlight:', h)

    # Keep the log strictly one tab-separated record per line: tabs and line
    # breaks inside the (user-supplied) query are replaced by spaces, and the
    # file is always written as UTF-8 regardless of the process locale.
    log_query = query.translate({ord('\t'): ' ', ord('\n'): ' ', ord('\r'): ' '})
    with open('search.log', 'a', encoding='utf-8') as fio:
        fio.write(log_query + '\t' + str(page) + '\n')

    # Empty query: nothing to search for.
    if not query:
        return JSONResponse({
            'error': '搜索词为空',
        }, status_code=400)

    # A negative page would become a negative offset and fail in the database.
    if page < 0:
        return JSONResponse({
            'error': '页码不能为负数',
        }, status_code=400)

    opt_params = {
        'limit': 10,
        'offset': 10 * page,
        'attributes_to_retrieve': ['id', 'id_feed', 'title', 'content', 'link', 'date', 'tags', 'author', 'lastSeen'],
    }

    if not fulltext:
        opt_params['attributes_to_crop'] = ['content']
        opt_params['crop_length'] = 120

    if highlight:
        opt_params['attributes_to_highlight'] = ['title', 'content', 'date', 'tags', 'author']
        opt_params['highlight_pre_tag'] = '<span class="uglyHighlight text-purple-500">'
        opt_params['highlight_post_tag'] = '</span>'

    try:
        _results = await client.index(index_name).search(query, **opt_params)
    except meilisearch_python_sdk.errors.MeilisearchError as e:
        print('数据库错误', e)
        return {
            'hits': [
                {
                    'title': '数据库错误',
                    'content': '查询数据库时出错。如果一直出现这个错误,说明数据库寄了,请反馈。',
                    'author': ';丑搜',
                    'date': int(time.time()),
                    'link': '#',
                },
            ],
            'error': '数据库错误',
        }

    for hit in _results.hits:
        # MeiliSearch puts cropped/highlighted values under '_formatted' and
        # leaves the top-level fields untouched, so the full-text length can
        # be taken here, before the merge. This replaces the former second,
        # identical query (which was not guarded against database errors).
        hit['content_length'] = len(hit.get('content') or '')

        # Replace the hit's fields with their formatted versions.
        formatted = hit.pop('_formatted', None)
        if formatted:
            hit.update(formatted)

    results = {
        'hits': _results.hits,
        'estimatedTotalHits': _results.estimated_total_hits,  # TODO: rename estimatedTotalHits to estimated_total_hits
        'humans.txt': '使用 API 时请检查 error 字段,高荷载/出错时会返回它。is_favorite 字段目前与主数据库不同步,只有在全库重新索引时才会更新。',
    }

    return results
|
||||
|
||||
@app.route('/')
async def root(request):
    """Serve the single-page search UI from ``templates/index.html``.

    Args:
        request: The incoming request (unused).

    Returns:
        An ``HTMLResponse`` with the template's contents.
    """
    # The template declares charset UTF-8, so decode it as UTF-8 explicitly
    # instead of relying on the process locale, and close the file handle
    # deterministically rather than leaking it until garbage collection.
    with open('templates/index.html', 'r', encoding='utf-8') as template:
        return HTMLResponse(template.read())  # there is only one page anyway
|
||||
|
||||
async def main():
    """Serve ``app`` with Hypercorn, bound to ``[::]:8077``."""
    # Imported here so Hypercorn is only loaded when the file runs as a script.
    from hypercorn.asyncio import serve
    from hypercorn.config import Config

    server_config = Config()
    server_config.bind = ['[::]:8077']
    await serve(app, server_config)


if __name__ == '__main__':
    # Equivalent CLI: hypercorn --bind '[::]:8077' saveweb-search-backend:app
    asyncio.run(main())
|
270
templates/index.html
Normal file
270
templates/index.html
Normal file
@ -0,0 +1,270 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<title>丑搜</title>
|
||||
<style>
|
||||
#searchBar {
|
||||
width: 500px;
|
||||
height: 30px;
|
||||
margin-top: 20px;
|
||||
margin-bottom: 20px;
|
||||
padding: 5px;
|
||||
border: solid 1px #ccc;
|
||||
box-shadow: 0px 0px 5px #ccc;
|
||||
font-size: 16px;
|
||||
}
|
||||
.resultItem {
|
||||
margin-top: 10px;
|
||||
/* padding: 5px; */
|
||||
/* 居中 */
|
||||
margin-left: auto;
|
||||
margin-right: auto;
|
||||
border: solid 1px #ccc;
|
||||
box-shadow: 0px 0px 5px #ccc;
|
||||
background-color: #eeeeeecb;
|
||||
max-width: 1000px;
|
||||
}
|
||||
.resultTitle {
|
||||
font-size: 18px;
|
||||
font-weight: bold;
|
||||
}
|
||||
.resultInfo {
|
||||
font-size: 14px;
|
||||
}
|
||||
.resultContent {
|
||||
font-size: 14px;
|
||||
}
|
||||
#prevPage, #nextPage, #fullText {
|
||||
width: 100px;
|
||||
height: 30px;
|
||||
margin-top: 20px;
|
||||
margin-bottom: 20px;
|
||||
margin-right: auto;
|
||||
border: solid 1px #ccc;
|
||||
box-shadow: 0px 0px 5px #ccc;
|
||||
font-size: 16px;
|
||||
}
|
||||
#estimatedTotalHits, #page, #fullText, #prevPage, #nextPage {
|
||||
display: inline-block;
|
||||
/* font-size: 16px; */
|
||||
}
|
||||
#fullText {
|
||||
color: red;
|
||||
}
|
||||
#buttonGroup_ {
|
||||
margin-top: 20px;
|
||||
margin-bottom: 20px;
|
||||
margin-left: auto;
|
||||
margin-right: auto;
|
||||
max-width: 220px;
|
||||
}
|
||||
#prevPage_, #nextPage_ {
|
||||
width: 100px;
|
||||
height: 30px;
|
||||
margin-top: 20px;
|
||||
margin-bottom: 20px;
|
||||
margin-right: auto;
|
||||
border: solid 1px #ccc;
|
||||
box-shadow: 0px 0px 5px #ccc;
|
||||
font-size: 16px;
|
||||
}
|
||||
.uglyHighlight {
|
||||
color: rgb(255, 0, 153);
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<input type="text" id="searchBar" placeholder="请输入关键字">
|
||||
|
||||
<div id="estimatedTotalHits"></div> 第 <div id="page"></div> 页
|
||||
<button id="prevPage">上一页</button> <button id="nextPage">下一页</button>
|
||||
<button id="fullText">展开全文</button>
|
||||
|
||||
<div id="searchResults">随便打点字呗</div>
|
||||
<div id="buttonGroup_">
|
||||
<button id="prevPage_">上一页</button> <button id="nextPage_">下一页</button>
|
||||
</div>
|
||||
|
||||
<li>多么优雅的搜索界面!全文搜索,模糊搜索,简繁同搜,拼音,同音字。</li>
|
||||
<li>有近 13 万篇中文博客文章(包含少量播客),共收录有 1.5K+ 博客。</li>
|
||||
<p><strong>搜索结果以匹配度排序,没有时间权重,这样更容易找到真正有价值的文章</strong>。如果你需要更精准的搜索结果,请发动你的小脑瓜。可以用 ";作者" 来筛选同作者的文章。数据库月度更新,如果你需要实时信息,请使用其他优美的搜索引擎。希望你能在这十几万篇文章里找到有用的东西。</p>
|
||||
<br>
|
||||
<li>输入文字后如果没反应说明数据库炸了。</li>
|
||||
<li>什么,左右键能同时移动光标和翻页,是的,这是 feature 。翻页翻过了就啥都没有了,是的,这也是 feature !</li>
|
||||
<li>为什么下面的翻页按钮没用?这是 fea... 好吧,是 BUG,修了。</li>
|
||||
<li>为什么你认为本站需要搜索按钮?那太优雅了,你只管在框框里打字,剩下的浏览器来想办法。</li>
|
||||
<li>什么,你说本站真的太优雅了?请把您写好的 uGly CsS 直接发给我!</li>
|
||||
<p><del>展开全文还不太优雅,我看能不能塞个 MarkDown 渲染器。</del>不加了不加了,人脑不就是最好的 MarkDown 渲染器吗?</p>
|
||||
<p>如需添加收录,给我发消息 TG: @yzqzss / Email: yzqzss@othing.xyz </p>
|
||||
<script>
|
||||
// Look up the elements the script works with: the search box, the result
// list, the hit-count label, both pairs of paging buttons and the
// full-text toggle.
const byId = (id) => document.getElementById(id);

const searchBar = byId('searchBar');
const searchResults = byId('searchResults');
const estimatedTotalHits = byId('estimatedTotalHits');
const prevPage = byId('prevPage');
const nextPage = byId('nextPage');
const prevPage_ = byId('prevPage_');
const nextPage_ = byId('nextPage_');
const fullText = byId('fullText');

// Flow control: searches are counted and the counter is cleared every 10 seconds.
let dosearchCount = 0;
setInterval(() => { dosearchCount = 0; }, 10000);

// Current page index; starts at the first page (0).
let page = 0;
|
||||
|
||||
// Go back one page (never below the first one) and re-run the search.
function goToPreviousPage() {
    if (page > 0) {
        page--;
        searchBar.dispatchEvent(new Event('dosearch'));
    }
}

// Both "previous page" buttons (below and above the results) share the handler.
[prevPage_, prevPage].forEach((button) => {
    button.addEventListener('click', () => goToPreviousPage());
});

// The left arrow key (keyCode 37) pages back as well.
document.addEventListener('keydown', (event) => {
    if (event.keyCode == 37) {
        goToPreviousPage();
    }
});
|
||||
|
||||
// Advance one page and re-run the search (there is no upper bound check).
function goToNextPage() {
    page++;
    searchBar.dispatchEvent(new Event('dosearch'));
}

// Both "next page" buttons (below and above the results) share the handler.
[nextPage_, nextPage].forEach((button) => {
    button.addEventListener('click', () => goToNextPage());
});

// The right arrow key (keyCode 39) pages forward as well.
document.addEventListener('keydown', (event) => {
    if (event.keyCode == 39) {
        goToNextPage();
    }
});
|
||||
|
||||
// Whether results are currently shown with their full text.
let fullTextFlag = false;

// Full-text button: flip the flag, relabel the button, clear the current
// results and search again in the new mode.
fullText.addEventListener('click', () => {
    fullTextFlag = !fullTextFlag;
    fullText.innerHTML = fullTextFlag ? '收起全文' : '展开全文';
    searchResults.innerHTML = '';
    searchBar.dispatchEvent(new Event('dosearch'));
});
|
||||
|
||||
|
||||
// Typing in the search box: go back to the first page in cropped mode,
// then search once the user has paused for 200 ms.
searchBar.addEventListener('input', () => {
    // Reset paging and the full-text mode.
    page = 0;
    fullTextFlag = false;
    fullText.innerHTML = '展开全文';

    // Show the (1-based) page number right away.
    document.getElementById('page').innerHTML = page + 1;

    // Debounce: restart the timer on every keystroke.
    clearTimeout(window.searchTimer);
    const fireSearch = () => searchBar.dispatchEvent(new Event('dosearch'));
    window.searchTimer = setTimeout(fireSearch, 200);
});
|
||||
// Id of the most recent search request, used to drop responses that arrive
// after a newer search has already been started.
let latestSearchId = 0;

// Highlight markers the backend wraps around matches (see '&h=true' below).
const HIGHLIGHT_OPEN = '<span class="uglyHighlight text-purple-500">';
const HIGHLIGHT_CLOSE = '</span>';

// Escape the characters that are significant in HTML.
function escapeHtml(text) {
    return String(text)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');
}

// Escape indexed (untrusted) text for innerHTML while keeping only the
// backend's highlight markers as real markup.
function renderHighlighted(text) {
    return escapeHtml(text == null ? '' : text)
        .split(escapeHtml(HIGHLIGHT_OPEN)).join(HIGHLIGHT_OPEN)
        .split(escapeHtml(HIGHLIGHT_CLOSE)).join(HIGHLIGHT_CLOSE);
}

// Custom 'dosearch' event on the search box: query the API and render the hits.
searchBar.addEventListener('dosearch', () => {
    const query = searchBar.value.trim();

    // Nothing to search for: clear the results and stop.
    if (!query) {
        searchResults.innerHTML = '';
        return;
    }

    // Flow control (the counter is reset periodically elsewhere).
    if (dosearchCount > 20) {
        searchResults.innerHTML = '搜索太频繁了,休息一下吧。';
        return;
    }
    dosearchCount++;

    const requestId = ++latestSearchId;
    // Remember the mode this request was sent with, so a toggle that happens
    // while the request is in flight cannot change how its response is rendered.
    const fullTextRequested = fullTextFlag;

    // p is 0-based; h asks the backend for highlighted matches.
    fetch('/api/search?q=' + encodeURIComponent(query) + '&p=' + page + '&f=' + fullTextRequested + '&h=' + true)
        .then(response => response.json())
        .then(data => {
            // A newer search was started meanwhile; ignore this late response.
            if (requestId !== latestSearchId) {
                return;
            }

            searchResults.innerHTML = '';

            // Estimated number of hits (error responses carry no estimate).
            if (typeof data.estimatedTotalHits !== 'number') {
                estimatedTotalHits.innerHTML = '';
            } else if (data.estimatedTotalHits == 1000) {
                estimatedTotalHits.innerHTML = `约 999+ 条结果`;
            } else {
                estimatedTotalHits.innerHTML = `约 ${data.estimatedTotalHits} 条结果`;
            }

            // Page number shown to the user is 1-based.
            document.getElementById('page').innerHTML = page + 1;

            // Error responses may have no hits at all; show the error text.
            if (!Array.isArray(data.hits)) {
                searchResults.textContent = data.error || '';
                return;
            }

            data.hits.forEach(hit => {
                const resultItem = document.createElement('div');
                resultItem.classList.add('resultItem');

                const resultTitle = document.createElement('a');
                resultTitle.classList.add('resultTitle');
                // Strip ';' from the raw title first; escaping adds its own ';'.
                const rawTitle = String(hit.title == null ? '' : hit.title).replace(/;/g, '');
                resultTitle.innerHTML = renderHighlighted(rawTitle);
                // Only follow web links; anything else (e.g. javascript:) becomes '#'.
                const link = String(hit.link == null ? '#' : hit.link);
                resultTitle.href = (/^https?:\/\//i.test(link) || link === '#') ? link : '#';

                const rawContent = String(hit.content == null ? '' : hit.content);
                // Prefer the backend's full-text length; the content in the
                // response may be cropped and contain highlight markup.
                const wordCount = typeof hit.content_length === 'number'
                    ? hit.content_length
                    : rawContent.length;
                // NOTE(review): with highlighting on, the date may come back as
                // a string containing highlight markup — strip tags before
                // converting. Confirm against the backend's actual responses.
                const timestamp = Number(String(hit.date).replace(/<[^>]*>/g, ''));

                const resultInfo = document.createElement('div');
                resultInfo.classList.add('resultInfo');
                resultInfo.innerHTML = "by " + renderHighlighted(hit.author) + ' at ' + new Date(timestamp * 1000).toLocaleString() + '. 大概字数: ' + wordCount;

                const resultContent = document.createElement('div');
                resultContent.classList.add('resultContent');
                if (fullTextRequested) {
                    // Collapse runs of three or more newlines to two, then
                    // turn the remaining newlines into <br>.
                    const compact = rawContent.replace(/\n{3,}/g, '\n\n');
                    resultContent.innerHTML = renderHighlighted(compact).replace(/\n/g, '<br>');
                } else {
                    resultContent.innerHTML = renderHighlighted(rawContent) + '...';
                }

                resultItem.appendChild(resultTitle);
                resultItem.appendChild(resultInfo);
                resultItem.appendChild(resultContent);
                searchResults.appendChild(resultItem);
            });
        })
        .catch(error => console.error(error));
});
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
Loading…
Reference in New Issue
Block a user