From 592fa2dea55d8dbb7acf3daa4e7793c6a9d62503 Mon Sep 17 00:00:00 2001
From: yzqzss
Date: Wed, 17 Jul 2024 14:56:23 +0800
Subject: [PATCH] init

---
 .gitea/workflows/release.yml                 |  56 +++++
 .gitignore                                   |   2 +
 cmd/cnblogs_rss_detect/cnblogs_rss_detect.go | 277 +++++++++++++++++++
 go.mod                                       |  10 +
 go.sum                                       |  24 ++
 pkg/cnblogs_api.go                           |  64 ++++++
 6 files changed, 433 insertions(+)
 create mode 100644 .gitea/workflows/release.yml
 create mode 100644 .gitignore
 create mode 100644 cmd/cnblogs_rss_detect/cnblogs_rss_detect.go
 create mode 100644 go.mod
 create mode 100644 go.sum
 create mode 100644 pkg/cnblogs_api.go

diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml
new file mode 100644
index 0000000..1ba298a
--- /dev/null
+++ b/.gitea/workflows/release.yml
@@ -0,0 +1,56 @@
+name: Gitea Go Release Actions
+run-name: ${{ gitea.actor }} go🚀
+on:
+  push:
+    tags:
+      - '*'
+
+jobs:
+  releases-matrix:
+    name: Release Go Binary
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        goos: [linux, windows, darwin]
+        goarch: [amd64, arm64]
+        include:
+          - goarch: arm
+            goos: linux
+        exclude:
+          - goarch: arm64
+            goos: windows
+    steps:
+      - uses: actions/checkout@v4
+      - uses: wangyoucao577/go-release-action@v1
+        id: go-release-action
+        with:
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          goos: ${{ matrix.goos }}
+          goarch: ${{ matrix.goarch }}
+          multi_binaries: true
+          project_path: ./cmd/...
+          upload: false
+      # output is release_asset_dir
+      - name: rename artifact
+        # append os and arch to the artifact name (handle windows .exe)
+        run: |
+          cd ${{ steps.go-release-action.outputs.release_asset_dir }}
+          for f in *; do
+            if [[ $f == *.exe ]]; then
+              noextname=$(basename "$f" .exe)
+              mv "$f" "${noextname}-${{ matrix.goos }}-${{ matrix.goarch }}.exe"
+            else
+              mv "$f" "${f}-${{ matrix.goos }}-${{ matrix.goarch }}"
+            fi
+          done
+          cd -
+      - name: Compress binaries
+        continue-on-error: true
+        uses: svenstaro/upx-action@v2
+        with:
+          files: |
+            ${{ steps.go-release-action.outputs.release_asset_dir }}/**
+      - uses: akkuman/gitea-release-action@v1
+        with:
+          files: |-
+            ${{ steps.go-release-action.outputs.release_asset_dir }}/**
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..fba74fc
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+*.conf
+/cnblogs_rss_detect
\ No newline at end of file
diff --git a/cmd/cnblogs_rss_detect/cnblogs_rss_detect.go b/cmd/cnblogs_rss_detect/cnblogs_rss_detect.go
new file mode 100644
index 0000000..00bbaac
--- /dev/null
+++ b/cmd/cnblogs_rss_detect/cnblogs_rss_detect.go
@@ -0,0 +1,277 @@
+// Command cnblogs_rss_detect is a saveweb tracker worker: it claims blog IDs
+// from the tracker, sends a HEAD request for each blog's RSS feed and reports
+// the observed status back to the tracker.
+package main
+
+import (
+	"encoding/json"
+	"fmt"
+	"log"
+	"net/http"
+	"os"
+	"os/signal"
+	"strconv"
+	"strings"
+	"sync"
+	"sync/atomic"
+	"time"
+
+	cnblogs_api "git.saveweb.org/saveweb/cnblogs/pkg"
+	savewebtracker "git.saveweb.org/saveweb/saveweb_tracker/src/saveweb_tracker"
+	"github.com/hashicorp/go-retryablehttp"
+)
+
+// BASE_CONCURRENCY is the number of claim workers and of processing workers
+// started by main. The BASE_CONCURRENCY environment variable overrides it.
+var BASE_CONCURRENCY = 10
+
+// WITH_DELAY is the flag handed to tracker.ClaimTask. Setting the
+// NO_WITH_DELAY environment variable switches it off.
+var WITH_DELAY = true
+
+// tasks_chan carries claimed tasks from the claim workers to the processing
+// workers. It is allocated in init, once BASE_CONCURRENCY is final, so that
+// its capacity follows the configured concurrency.
+var tasks_chan chan savewebtracker.Task
+
+// Interrupted is set when the first interrupt signal arrives. It is written by
+// InterruptHandler and read by every claim worker, hence the atomic type.
+var Interrupted atomic.Bool
+
+// WaitClaimWorker and WaitProcesserWorker let main wait for each worker pool.
+var WaitClaimWorker sync.WaitGroup
+var WaitProcesserWorker sync.WaitGroup
+
+var project_id = "cnblogs_rss_detect"
+
+var Logger *log.Logger
+var DEBUG = false
+
+// custom_delay_lock makes the claim workers take their CUSTOM_DELAY pauses one
+// at a time.
+var custom_delay_lock = sync.Mutex{}
+
+// init sets up the logger, reads the configuration from the environment and
+// allocates the task channel.
+func init() {
+	Logger = log.New(os.Stdout, "["+project_id+"] ", log.Ldate|log.Ltime|log.Lmsgprefix)
+
+	if v := os.Getenv("BASE_CONCURRENCY"); v != "" {
+		fmt.Println("BASE_CONCURRENCY:", v)
+		// A value that is not a positive integer would start no workers at
+		// all, so the default is kept instead of silently accepting it.
+		if n, err := strconv.Atoi(v); err != nil || n < 1 {
+			Logger.Println("Ignoring invalid BASE_CONCURRENCY, using", BASE_CONCURRENCY)
+		} else {
+			BASE_CONCURRENCY = n
+		}
+	}
+	if v := os.Getenv("NO_WITH_DELAY"); v != "" {
+		fmt.Println("NO_WITH_DELAY:", v)
+		WITH_DELAY = false
+	}
+	if os.Getenv("DEBUG") != "" {
+		DEBUG = true
+	}
+	tasks_chan = make(chan savewebtracker.Task, BASE_CONCURRENCY)
+}
+
+// claimWorker claims tasks from the tracker and puts them on tasks_chan until
+// an interrupt has been requested.
+func claimWorker(i int, tracker *savewebtracker.Tracker) {
+	Logger.Println("[START] ClaimWorker", i)
+	defer Logger.Println("[STOP] ClaimWorker", i, " exited...")
+	defer WaitClaimWorker.Done()
+
+	// When CUSTOM_DELAY (a duration such as "500ms") is set, the worker pauses
+	// before every claim and calls ClaimTask with the delay flag off. The flag
+	// is a local so that workers never write WITH_DELAY concurrently.
+	with_delay := WITH_DELAY
+	custom_delay_env := os.Getenv("CUSTOM_DELAY")
+	var custom_delay time.Duration
+	if custom_delay_env != "" {
+		with_delay = false
+		d, err := time.ParseDuration(custom_delay_env)
+		if err != nil {
+			Logger.Println("Invalid CUSTOM_DELAY, not sleeping between claims:", err)
+		}
+		custom_delay = d
+	}
+
+	for {
+		if Interrupted.Load() {
+			return
+		}
+		if custom_delay_env != "" {
+			// The lock is held while sleeping, so the workers pause one after
+			// another rather than all at once.
+			custom_delay_lock.Lock()
+			Logger.Println("Custom delay:", custom_delay, "...")
+			time.Sleep(custom_delay)
+			custom_delay_lock.Unlock()
+		}
+		task := tracker.ClaimTask(with_delay)
+		if task == nil {
+			notask_sleep := max(
+				time.Duration(tracker.Project().Client.ClaimTaskDelay)*10*time.Second,
+				10*time.Second,
+			)
+			Logger.Println("No task to claim, sleep", notask_sleep)
+			time.Sleep(notask_sleep)
+			continue
+		}
+		Logger.Println("Claimed task", task.Id)
+		tasks_chan <- *task
+	}
+}
+
+// ProcesserWorker consumes tasks from tasks_chan until the channel is closed.
+// For every task it sends a HEAD request for the blog's RSS feed, stores the
+// result as an item and updates the task status on the tracker.
+//
+// Responses seen from the feed host:
+//
+//	HTTP/2 200
+//	date: Wed, 17 Jul 2024 06:23:58 GMT
+//	content-type: application/rss+xml   (or application/xml)
+//	content-length: 27623
+//	vary: Accept-Encoding
+//	set-cookie: ...
+//	last-modified: Wed, 17 Jul 2024 06:23:58 GMT
+//
+//	HTTP/2 500
+//	date: Wed, 17 Jul 2024 06:23:20 GMT
+//	content-length: 0
+//	set-cookie: ....
+//
+// Any other status, content type or content length makes the worker panic.
+func ProcesserWorker(i int, tracker *savewebtracker.Tracker) {
+	Logger.Println("[START] ProcesserWorker", i)
+	defer Logger.Println("[STOP] ProcesserWorker", i, " exited...")
+	defer WaitProcesserWorker.Done()
+	for task := range tasks_chan {
+		Logger.Println("Processing task", task.Id)
+
+		headers, r_status, ContentLength := cnblogs_api.GetRSSHeadHeaders(*tracker.HTTP_client, task.Id)
+
+		var payload map[string]any
+		var to_status savewebtracker.Status
+
+		switch r_status {
+		case 200:
+			content_type := headers.Get("Content-Type")
+			if !strings.Contains(content_type, "application/xml") && !strings.Contains(content_type, "application/rss") {
+				Logger.Panicln(task.Id, "unexpected Content-Type: ", content_type)
+			}
+			if ContentLength == -1 || ContentLength == 0 {
+				Logger.Panicln(task.Id, "unexpected content-length: ", ContentLength)
+			}
+			payload = map[string]any{
+				"content-length": ContentLength,
+			}
+			to_status = savewebtracker.StatusDONE
+		case 500:
+			// A 500 answer is recorded as an empty feed.
+			Logger.Println(task.Id, "empty content")
+			to_status = savewebtracker.StatusEMPTY
+		default:
+			Logger.Panicln(task.Id, "unexpected status code: ", r_status)
+		}
+
+		// payload stays nil in the 500 case and is stored as JSON null.
+		payload_str, err := json.Marshal(payload)
+		if err != nil {
+			panic(err)
+		}
+
+		Logger.Println("Inserting item", task.Id, r_status, string(payload_str))
+		tracker.InsertItem(task, strconv.Itoa(r_status), "int", string(payload_str))
+		Logger.Println("Inserted item", task.Id, to_status)
+		tracker.UpdateTask(task.Id, task.Id_type, to_status)
+		Logger.Println("Updated task", task.Id)
+	}
+}
+
+// InterruptHandler waits for interrupt signals. The first one asks the claim
+// workers to stop; a second one terminates the process immediately.
+func InterruptHandler() {
+	fmt.Println("Press Ctrl+C to exit")
+	interrupt_c := make(chan os.Signal, 1)
+	signal.Notify(interrupt_c, os.Interrupt)
+	for {
+		s := <-interrupt_c
+		Logger.Println("Interrupted by", s, "signal (Press Ctrl+C again to force exit)")
+		// Swap reports whether an earlier signal had already set the flag.
+		if Interrupted.Swap(true) {
+			Logger.Println("Force exit")
+			os.Exit(1)
+		}
+	}
+}
+
+// GetRetryableHttpClient returns a standard *http.Client built on
+// go-retryablehttp with RetryMax 3, a retry wait between 1s and 10s and the
+// given timeout on the underlying client. The retry client's logger is removed
+// unless debug is true.
+func GetRetryableHttpClient(timeout time.Duration, debug bool) *http.Client {
+	retryClient := retryablehttp.NewClient()
+	retryClient.RetryMax = 3
+	retryClient.RetryWaitMin = 1 * time.Second
+	retryClient.RetryWaitMax = 10 * time.Second
+	retryClient.HTTPClient.Timeout = timeout
+	if !debug {
+		retryClient.Logger = nil
+	}
+	standardClient := retryClient.StandardClient() // *http.Client
+	Logger.Println("standardClient.Timeout:", standardClient.Timeout)
+	return standardClient
+}
+
+// ShowStatus logs the tracker's project description as JSON once a minute.
+// It never returns, so it has to run in its own goroutine.
+func ShowStatus(t *savewebtracker.Tracker) {
+	for {
+		project_json, err := json.Marshal(t.Project())
+		if err != nil {
+			panic(err)
+		}
+		Logger.Println("Project:", string(project_json))
+		time.Sleep(60 * time.Second)
+	}
+}
+
+// main sets up the tracker, starts both worker pools and waits for them to
+// finish after an interrupt.
+func main() {
+	tracker := savewebtracker.GetTracker(project_id, "0.1", savewebtracker.Archivist())
+	tracker.PING_client = GetRetryableHttpClient(10*time.Second, DEBUG)
+	// tracker.HTTP_client = GetRetryableHttpClient(10*time.Second, DEBUG)
+	tracker.SelectBestTracker().StartSelectTrackerBackground().StartFetchProjectBackground()
+
+	go InterruptHandler()
+	go ShowStatus(tracker)
+
+	cnblogs_api.EnsureConnection(*tracker.HTTP_client)
+
+	Logger.Println("-- Start --")
+
+	for i := 0; i < BASE_CONCURRENCY; i++ {
+		// Add has to run before the goroutine starts: a worker that exits
+		// early would otherwise call Done on a counter that is still zero.
+		WaitClaimWorker.Add(1)
+		go claimWorker(i, tracker)
+		WaitProcesserWorker.Add(1)
+		go ProcesserWorker(i, tracker)
+	}
+
+	// Shutdown order: wait for the claim workers, then close the channel so
+	// that the processing workers drain what is left and exit.
+	WaitClaimWorker.Wait()
+	Logger.Println("[STOP] All claimWorker done")
+	close(tasks_chan)
+	Logger.Println("[STOP] task_chan closed")
+	WaitProcesserWorker.Wait()
+	Logger.Println("[STOP] All ProcesserWorker done")
+
+	Logger.Println("-- All done --")
+}
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..99a7041
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,10 @@
+module git.saveweb.org/saveweb/cnblogs
+
+go 1.22.4
+
+require (
+	git.saveweb.org/saveweb/saveweb_tracker v0.1.12
+	github.com/hashicorp/go-retryablehttp v0.7.7
+)
+
+require github.com/hashicorp/go-cleanhttp v0.5.2 // indirect
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..8baa5d8
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,24 @@
+git.saveweb.org/saveweb/saveweb_tracker v0.1.12 h1:zBYkMjABF5wwvSHZI9t3cVUjU0rhFFZJh0dFE0W59Nw=
+git.saveweb.org/saveweb/saveweb_tracker v0.1.12/go.mod h1:p891f4fshoA/Wiwmey23f2xJ9sKNEZwd5kmzG6lobik=
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM=
+github.com/fatih/color v1.16.0/go.mod h1:fL2Sau1YI5c0pdGEVCbKQbLXB6edEj1ZgiY4NijnWvE=
+github.com/hashicorp/go-cleanhttp v0.5.2 h1:035FKYIWjmULyFRBKPs8TBQoi0x6d9G4xc9neXJWAZQ=
+github.com/hashicorp/go-cleanhttp v0.5.2/go.mod h1:kO/YDlP8L1346E6Sodw+PrpBSV4/SoxCXGY6BqNFT48=
+github.com/hashicorp/go-hclog v1.6.3 h1:Qr2kF+eVWjTiYmU7Y31tYlP1h0q/X3Nl3tPGdaB11/k=
+github.com/hashicorp/go-hclog v1.6.3/go.mod h1:W4Qnvbt70Wk/zYJryRzDRU/4r0kIg0PVHBcfoyhpF5M=
+github.com/hashicorp/go-retryablehttp v0.7.7 h1:C8hUCYzor8PIfXHa4UrZkU4VvK8o9ISHxT2Q8+VepXU=
+github.com/hashicorp/go-retryablehttp v0.7.7/go.mod h1:pkQpWZeYWskR+D1tR2O5OcBFOxfA7DoAO6xtkuQnHTk=
+github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
+github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
+github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
+github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
+github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
+golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y=
+golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
diff --git a/pkg/cnblogs_api.go
b/pkg/cnblogs_api.go
new file mode 100644
index 0000000..8abd686
--- /dev/null
+++ b/pkg/cnblogs_api.go
@@ -0,0 +1,72 @@
+// Package cnblogs_api contains the HTTP requests this project sends to
+// cnblogs.com.
+package cnblogs_api
+
+import (
+	"io"
+	"log"
+	"net/http"
+	"os"
+	"strings"
+)
+
+// USER_AGENT is the User-Agent header value sent with every request.
+var USER_AGENT = "SaveTheWebProject cnblogs-preserve/0.1 (+saveweb@saveweb.org) and not Mozilla/5.0 (X11; Linux x86_64; rv:128.0) Gecko/20100101 Firefox/128.0"
+
+// Logger is the package logger; it writes to stdout with a "[cnblogs_api] " prefix.
+var Logger = log.New(os.Stdout, "[cnblogs_api] ", log.Ldate|log.Ltime|log.Lmsgprefix)
+
+// EnsureConnection fetches https://www.cnblogs.com/robots.txt and logs the
+// body. It panics when the request cannot be built, sent or read, and when
+// the body does not contain the string "User-Agent", i.e. when the answer
+// does not look like the expected robots.txt.
+func EnsureConnection(client http.Client) {
+	robotsReq, err := http.NewRequest(http.MethodGet, "https://www.cnblogs.com/robots.txt", nil)
+	if err != nil {
+		panic(err)
+	}
+	robotsReq.Header.Set("User-Agent", USER_AGENT)
+
+	resp, err := client.Do(robotsReq)
+	if err != nil {
+		panic(err)
+	}
+	defer resp.Body.Close()
+
+	raw, err := io.ReadAll(resp.Body)
+	if err != nil {
+		panic(err)
+	}
+	robotsTxt := string(raw)
+	Logger.Println(robotsTxt)
+	if strings.Contains(robotsTxt, "User-Agent") {
+		return
+	}
+	panic("NotImplementedError: " + robotsTxt)
+}
+
+// GetRSSHeadHeaders sends a HEAD request for the RSS feed of the blog with the
+// given ID and returns the response headers, the status code and the
+// Content-Length reported by net/http (-1 when unknown). It panics when the
+// request cannot be built or sent, or when draining the body fails.
+func GetRSSHeadHeaders(client http.Client, blogID string) (http.Header, int, int64) {
+	feedURL := "https://feed.cnblogs.com/blog/u/" + blogID + "/rss/"
+	headReq, err := http.NewRequest(http.MethodHead, feedURL, nil)
+	if err != nil {
+		panic(err)
+	}
+	headReq.Header.Set("User-Agent", USER_AGENT)
+
+	resp, err := client.Do(headReq)
+	if err != nil {
+		panic(err)
+	}
+	defer resp.Body.Close()
+
+	// Read the body to the end before it is closed; the data is not needed.
+	if _, err := io.Copy(io.Discard, resp.Body); err != nil {
+		panic(err)
+	}
+
+	return resp.Header, resp.StatusCode, resp.ContentLength
+}