Compare commits

...

9 Commits
v0.2.0 ... main

Author SHA1 Message Date
0a984a1e3c Update stage3 status 2024-08-24 01:56:55 -07:00
313c0a0b57 stage2 finished 2024-07-19 00:05:16 +08:00
2d75209b43 feat: handle infinite loops gracefully
Some checks failed
Gitea Go Release Actions / Release Go Binary (amd64, linux) (push) Has been cancelled
Gitea Go Release Actions / Release Go Binary (arm64, darwin) (push) Has been cancelled
Gitea Go Release Actions / Release Go Binary (arm64, linux) (push) Has been cancelled
Gitea Go Release Actions / Release Go Binary (arm, linux) (push) Has been cancelled
Gitea Go Release Actions / Release Go Binary (amd64, windows) (push) Has been cancelled
Gitea Go Release Actions / Release Go Binary (amd64, darwin) (push) Has been cancelled
2024-07-18 16:59:30 +08:00
61816823df --label=com.centurylinklabs.watchtower.enable=true 2024-07-17 14:52:48 -07:00
9db3698d54 rm poweredby flag
All checks were successful
Gitea Go Release Actions / Release Go Binary (amd64, linux) (push) Successful in 50s
Gitea Go Release Actions / Release Go Binary (arm64, darwin) (push) Successful in 47s
Gitea Go Release Actions / Release Go Binary (arm64, linux) (push) Successful in 1m14s
Gitea Go Release Actions / Release Go Binary (amd64, darwin) (push) Successful in 39s
Gitea Go Release Actions / Release Go Binary (amd64, windows) (push) Successful in 1m9s
Gitea Go Release Actions / Release Go Binary (arm, linux) (push) Successful in 1m6s
2024-07-17 14:04:19 -07:00
8260784a7f format README 2024-07-18 04:02:17 +08:00
b44ace123d publish docker 2024-07-18 03:55:34 +08:00
3bc33e3201 README i18n 2024-07-18 02:11:10 +08:00
eebf9fe6a9 add README 2024-07-18 02:05:33 +08:00
3 changed files with 83 additions and 4 deletions

View File

@ -1 +1,49 @@
GOOOOOOO
# cnblogs archiver
## How can I help?
### Binary
Go to [release](https://git.saveweb.org/saveweb/cnblogs/releases) page, download `cnblogs_posts_list` and run it.
WARNING: DO NOT run `cnblogs_posts_list` concurrently (on same IP), you may be banned by cnblogs.
### With Docker
```bash
export ARCHIVIST=<your_node_name> # a string that can uniquely identify your node (for example: bob-gcloud-514). (Legal characters: letters, numbers, -, _)
```
```bash
if [[ -z "$ARCHIVIST" ]]; then
echo "WARN: ARCHIVIST must be set"
exit 1
fi
_image="icecodexi/saveweb:cnblogs"
docker pull "${_image}" \
&& docker stop cnblogs
docker rm -f cnblogs \
&& docker run --env ARCHIVIST="$ARCHIVIST" --restart always \
--volume /etc/localtime:/etc/localtime:ro \
--cpu-shares 512 --memory 512M --memory-swap 512M \
--label=com.centurylinklabs.watchtower.enable=true \
--detach --name cnblogs \
"${_image}"
```
## Archiving stages
### stage1: detect all blogids (~~finished~~)
run `cnblogs_rss_detect`
### stage2: iterate all blogids and collect all posts' URLs (~~finished~~)
run `cnblogs_posts_list`
<!-- ### stage3导出文章 urls.txt 并发送给 ArchiveTeam -->
### stage3: export all posts' URLs and send to ArchiveTeam (~~finished~~)
### stage4: also download all posts' HTMLs by ourselves (TODO)

View File

@ -69,6 +69,30 @@ func claimWorker(i int, tracker *savewebtracker.Tracker) {
}
}
// dedup_append merges newPostMetas into allPostMetas, keyed by post URL.
// When a URL already exists, the old entry is replaced in place and the
// duplicate counter is incremented; otherwise the new entry is appended.
// It returns the merged slice and the number of duplicates found.
func dedup_append(allPostMetas, newPostMetas []cnblogs_api.PostMeta) ([]cnblogs_api.PostMeta, int) {
	founds := 0
	// Index existing entries by URL once, so each lookup is O(1) instead of
	// rescanning allPostMetas for every new entry (previously O(n*m)).
	urlIndex := make(map[string]int, len(allPostMetas))
	for i, m := range allPostMetas {
		urlIndex[m.URL] = i
	}
	for _, newPostMeta := range newPostMetas {
		if i, ok := urlIndex[newPostMeta.URL]; ok {
			founds++
			// replace the old
			Logger.Println("dedup_append: replace the old", allPostMetas[i], "with", newPostMeta)
			allPostMetas[i] = newPostMeta
			continue
		}
		allPostMetas = append(allPostMetas, newPostMeta)
		// Record the appended entry so later duplicates within the same
		// newPostMetas batch are also detected (matches the original
		// inner-scan behavior, which saw freshly appended entries too).
		urlIndex[newPostMeta.URL] = len(allPostMetas) - 1
	}
	return allPostMetas, founds
}
var DUP_THRESHOLD_RATE = 3
func ProcesserWorker(i int, tracker *savewebtracker.Tracker) {
Logger.Println("[START] ProcesserWorker", i)
defer Logger.Println("[STOP] ProcesserWorker", i, " exited...")
@ -83,7 +107,13 @@ func ProcesserWorker(i int, tracker *savewebtracker.Tracker) {
Logger.Panicln(head, err)
}
all_postMetas := []cnblogs_api.PostMeta{}
dups_found := 0
for page := 1; ; page++ {
if dups_found > len(all_postMetas)*DUP_THRESHOLD_RATE {
Logger.Println(head, "Dups found", dups_found, "exceeds the threshold", DUP_THRESHOLD_RATE, ", break")
break
}
Logger.Println(head, "Processing", blogURI, "page:", page, "Got:", len(all_postMetas))
htmlBody, statusCode, err := cnblogs_api.GetBlogHomepage(tracker.HTTP_client, blogURI, page)
if err != nil {
@ -104,7 +134,9 @@ func ProcesserWorker(i int, tracker *savewebtracker.Tracker) {
break
}
Logger.Println(head, "Got", postMetas)
all_postMetas = append(all_postMetas, postMetas...)
_all_postMetas, _founds_in_the_page := dedup_append(all_postMetas, postMetas)
dups_found += _founds_in_the_page
all_postMetas = _all_postMetas
}
items := []savewebtracker.Item{}
@ -170,7 +202,7 @@ func ShowStatus(t *savewebtracker.Tracker) {
}
func main() {
tracker := savewebtracker.GetTracker(project_id, "0.2", savewebtracker.Archivist())
tracker := savewebtracker.GetTracker(project_id, "0.3.0", savewebtracker.Archivist())
tracker.PING_client = GetRetryableHttpClient(10*time.Second, DEBUG)
// tracker.HTTP_client = GetRetryableHttpClient(10*time.Second, DEBUG)
tracker.SelectBestTracker().StartSelectTrackerBackground().StartFetchProjectBackground()

View File

@ -145,7 +145,6 @@ func EnsureHomepageOK(htmldata string) bool {
"currentBlogApp",
"application/rss+xml",
"antiforgery_token",
"poweredby",
}
for _, flag := range Flags {
if !strings.Contains(htmldata, flag) {