diff --git a/cmd/cnblogs_posts_list/cnblogs_posts_list.go b/cmd/cnblogs_posts_list/cnblogs_posts_list.go index 6a57352..2ff2776 100644 --- a/cmd/cnblogs_posts_list/cnblogs_posts_list.go +++ b/cmd/cnblogs_posts_list/cnblogs_posts_list.go @@ -69,6 +69,30 @@ func claimWorker(i int, tracker *savewebtracker.Tracker) { } } +// Refactored deduplication function +func dedup_append(allPostMetas, newPostMetas []cnblogs_api.PostMeta) ([]cnblogs_api.PostMeta, int) { + founds := 0 + for _, newPostMeta := range newPostMetas { + _found := false + for all_idx, allPostMeta := range allPostMetas { + if newPostMeta.URL == allPostMeta.URL { + _found = true + founds++ + // replace the old + Logger.Println("dedup_append: replace the old", allPostMeta, "with", newPostMeta) + allPostMetas[all_idx] = newPostMeta + break + } + } + if !_found { + allPostMetas = append(allPostMetas, newPostMeta) + } + } + return allPostMetas, founds +} + +var DUP_THRESHOLD_RATE = 3 + func ProcesserWorker(i int, tracker *savewebtracker.Tracker) { Logger.Println("[START] ProcesserWorker", i) defer Logger.Println("[STOP] ProcesserWorker", i, " exited...") @@ -83,7 +107,13 @@ func ProcesserWorker(i int, tracker *savewebtracker.Tracker) { Logger.Panicln(head, err) } all_postMetas := []cnblogs_api.PostMeta{} + dups_found := 0 for page := 1; ; page++ { + if dups_found > len(all_postMetas)*DUP_THRESHOLD_RATE { + Logger.Println(head, "Dups found", dups_found, "exceeds the threshold", DUP_THRESHOLD_RATE, ", break") + break + } + Logger.Println(head, "Processing", blogURI, "page:", page, "Got:", len(all_postMetas)) htmlBody, statusCode, err := cnblogs_api.GetBlogHomepage(tracker.HTTP_client, blogURI, page) if err != nil { @@ -104,7 +134,9 @@ func ProcesserWorker(i int, tracker *savewebtracker.Tracker) { break } Logger.Println(head, "Got", postMetas) - all_postMetas = append(all_postMetas, postMetas...) + _all_postMetas, _founds_in_the_page := dedup_append(all_postMetas, postMetas) + dups_found += _founds_in_the_page + all_postMetas = _all_postMetas } items := []savewebtracker.Item{} @@ -170,7 +202,7 @@ func ShowStatus(t *savewebtracker.Tracker) { } func main() { - tracker := savewebtracker.GetTracker(project_id, "0.2", savewebtracker.Archivist()) + tracker := savewebtracker.GetTracker(project_id, "0.3.0", savewebtracker.Archivist()) tracker.PING_client = GetRetryableHttpClient(10*time.Second, DEBUG) // tracker.HTTP_client = GetRetryableHttpClient(10*time.Second, DEBUG) tracker.SelectBestTracker().StartSelectTrackerBackground().StartFetchProjectBackground()