Compare commits
9 Commits
Author | SHA1 | Date | |
---|---|---|---|
0a984a1e3c | |||
313c0a0b57 | |||
2d75209b43 | |||
61816823df | |||
9db3698d54 | |||
8260784a7f | |||
b44ace123d | |||
3bc33e3201 | |||
eebf9fe6a9 |
50
README.md
50
README.md
@ -1 +1,49 @@
|
||||
GOOOOOOO
|
||||
# cnblogs archiver
|
||||
|
||||
## How can I help?
|
||||
|
||||
### Binary
|
||||
|
||||
Go to [release](https://git.saveweb.org/saveweb/cnblogs/releases) page, download `cnblogs_posts_list` and run it.
|
||||
|
||||
WARNING: DO NOT run `cnblogs_posts_list` concurrently (on same IP), you may be banned by cnblogs.
|
||||
|
||||
### With Docker
|
||||
|
||||
```bash
|
||||
export ARCHIVIST=<your_node_name> # a string that can uniquely identify your node (for example: bob-gcloud-514). (Legal characters: letters, numbers, -, _)
|
||||
```
|
||||
|
||||
```bash
|
||||
if [[ -z "$ARCHIVIST" ]]; then
|
||||
echo "ERROR: ARCHIVIST must be set"
|
||||
exit 1
|
||||
fi
|
||||
_image="icecodexi/saveweb:cnblogs"
|
||||
docker pull "${_image}" \
|
||||
&& docker stop cnblogs
|
||||
docker rm -f cnblogs \
|
||||
&& docker run --env ARCHIVIST="$ARCHIVIST" --restart always \
|
||||
--volume /etc/localtime:/etc/localtime:ro \
|
||||
--cpu-shares 512 --memory 512M --memory-swap 512M \
|
||||
--label=com.centurylinklabs.watchtower.enable=true \
|
||||
--detach --name cnblogs \
|
||||
"${_image}"
|
||||
```
|
||||
|
||||
|
||||
## Archiving stages
|
||||
|
||||
### stage1: detect all blogids (~~finished~~)
|
||||
|
||||
run `cnblogs_rss_detect`
|
||||
|
||||
### stage2: iterate all blogids and collect all posts' URLs (~~finished~~)
|
||||
|
||||
run `cnblogs_posts_list`
|
||||
|
||||
<!-- ### stage3:导出文章 urls.txt 并发送给 ArchiveTeam -->
|
||||
|
||||
### stage3: export all posts' URLs and send to ArchiveTeam (~~finished~~)
|
||||
|
||||
### stage4: also download all posts' HTMLs by ourselves (TODO)
|
@ -69,6 +69,30 @@ func claimWorker(i int, tracker *savewebtracker.Tracker) {
|
||||
}
|
||||
}
|
||||
|
||||
// Refactored deduplication function
|
||||
func dedup_append(allPostMetas, newPostMetas []cnblogs_api.PostMeta) ([]cnblogs_api.PostMeta, int) {
|
||||
founds := 0
|
||||
for _, newPostMeta := range newPostMetas {
|
||||
_found := false
|
||||
for all_idx, allPostMeta := range allPostMetas {
|
||||
if newPostMeta.URL == allPostMeta.URL {
|
||||
_found = true
|
||||
founds++
|
||||
// replace the old
|
||||
Logger.Println("dedup_append: replace the old", allPostMeta, "with", newPostMeta)
|
||||
allPostMetas[all_idx] = newPostMeta
|
||||
break
|
||||
}
|
||||
}
|
||||
if !_found {
|
||||
allPostMetas = append(allPostMetas, newPostMeta)
|
||||
}
|
||||
}
|
||||
return allPostMetas, founds
|
||||
}
|
||||
|
||||
var DUP_THRESHOLD_RATE = 3
|
||||
|
||||
func ProcesserWorker(i int, tracker *savewebtracker.Tracker) {
|
||||
Logger.Println("[START] ProcesserWorker", i)
|
||||
defer Logger.Println("[STOP] ProcesserWorker", i, " exited...")
|
||||
@ -83,7 +107,13 @@ func ProcesserWorker(i int, tracker *savewebtracker.Tracker) {
|
||||
Logger.Panicln(head, err)
|
||||
}
|
||||
all_postMetas := []cnblogs_api.PostMeta{}
|
||||
dups_found := 0
|
||||
for page := 1; ; page++ {
|
||||
if dups_found > len(all_postMetas)*DUP_THRESHOLD_RATE {
|
||||
Logger.Println(head, "Dups found", dups_found, "exceeds the threshold", DUP_THRESHOLD_RATE, ", break")
|
||||
break
|
||||
}
|
||||
|
||||
Logger.Println(head, "Processing", blogURI, "page:", page, "Got:", len(all_postMetas))
|
||||
htmlBody, statusCode, err := cnblogs_api.GetBlogHomepage(tracker.HTTP_client, blogURI, page)
|
||||
if err != nil {
|
||||
@ -104,7 +134,9 @@ func ProcesserWorker(i int, tracker *savewebtracker.Tracker) {
|
||||
break
|
||||
}
|
||||
Logger.Println(head, "Got", postMetas)
|
||||
all_postMetas = append(all_postMetas, postMetas...)
|
||||
_all_postMetas, _founds_in_the_page := dedup_append(all_postMetas, postMetas)
|
||||
dups_found += _founds_in_the_page
|
||||
all_postMetas = _all_postMetas
|
||||
}
|
||||
|
||||
items := []savewebtracker.Item{}
|
||||
@ -170,7 +202,7 @@ func ShowStatus(t *savewebtracker.Tracker) {
|
||||
}
|
||||
|
||||
func main() {
|
||||
tracker := savewebtracker.GetTracker(project_id, "0.2", savewebtracker.Archivist())
|
||||
tracker := savewebtracker.GetTracker(project_id, "0.3.0", savewebtracker.Archivist())
|
||||
tracker.PING_client = GetRetryableHttpClient(10*time.Second, DEBUG)
|
||||
// tracker.HTTP_client = GetRetryableHttpClient(10*time.Second, DEBUG)
|
||||
tracker.SelectBestTracker().StartSelectTrackerBackground().StartFetchProjectBackground()
|
||||
|
@ -145,7 +145,6 @@ func EnsureHomepageOK(htmldata string) bool {
|
||||
"currentBlogApp",
|
||||
"application/rss+xml",
|
||||
"antiforgery_token",
|
||||
"poweredby",
|
||||
}
|
||||
for _, flag := range Flags {
|
||||
if !strings.Contains(htmldata, flag) {
|
||||
|
Loading…
Reference in New Issue
Block a user