Compare commits
10 Commits
Author | SHA1 | Date | |
---|---|---|---|
0a984a1e3c | |||
313c0a0b57 | |||
2d75209b43 | |||
61816823df | |||
9db3698d54 | |||
8260784a7f | |||
b44ace123d | |||
3bc33e3201 | |||
eebf9fe6a9 | |||
2848efdcc8 |
3
.gitignore
vendored
3
.gitignore
vendored
@ -1,2 +1,3 @@
|
||||
*.conf
|
||||
/cnblogs_rss_detect
|
||||
/cnblogs_rss_detect
|
||||
/cnblogs_posts_list
|
||||
|
50
README.md
50
README.md
@ -1 +1,49 @@
|
||||
GOOOOOOO
|
||||
# cnblogs archiver
|
||||
|
||||
## How can I help?
|
||||
|
||||
### Binary
|
||||
|
||||
Go to [release](https://git.saveweb.org/saveweb/cnblogs/releases) page, download `cnblogs_posts_list` and run it.
|
||||
|
||||
WARNING: DO NOT run `cnblogs_posts_list` concurrently (on same IP), you may be banned by cnblogs.
|
||||
|
||||
### With Docker
|
||||
|
||||
```bash
|
||||
export ARCHIVIST=<your_node_name> # a string that can uniquely identify your node (for example: bob-gcloud-514). (Legal characters: letters, numbers, -, _)
|
||||
```
|
||||
|
||||
```bash
|
||||
if [[ -z "$ARCHIVIST" ]]; then
|
||||
echo "ERROR: ARCHIVIST must be set"
|
||||
exit 1
|
||||
fi
|
||||
_image="icecodexi/saveweb:cnblogs"
|
||||
docker pull "${_image}" \
|
||||
&& docker stop cnblogs
|
||||
docker rm -f cnblogs \
|
||||
&& docker run --env ARCHIVIST="$ARCHIVIST" --restart always \
|
||||
--volume /etc/localtime:/etc/localtime:ro \
|
||||
--cpu-shares 512 --memory 512M --memory-swap 512M \
|
||||
--label=com.centurylinklabs.watchtower.enable=true \
|
||||
--detach --name cnblogs \
|
||||
"${_image}"
|
||||
```
|
||||
|
||||
|
||||
## Archiving stages
|
||||
|
||||
### stage1:detect all blogids (~~finished~~)
|
||||
|
||||
run `cnblogs_rss_detect`
|
||||
|
||||
### stage2:iterate all blogids and collect all posts' URLs (~~finished~~)
|
||||
|
||||
run `cnblogs_posts_list`
|
||||
|
||||
<!-- ### stage3:导出文章 urls.txt 并发送给 ArchiveTeam -->
|
||||
|
||||
### stage3:export all posts' URLs and send to ArchiveTeam (~~finished~~)
|
||||
|
||||
### stage4:also download all posts' HTMLs by ourselves (TODO)
|
236
cmd/cnblogs_posts_list/cnblogs_posts_list.go
Normal file
236
cmd/cnblogs_posts_list/cnblogs_posts_list.go
Normal file
@ -0,0 +1,236 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/signal"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"strconv"
|
||||
|
||||
cnblogs_api "git.saveweb.org/saveweb/cnblogs/pkg"
|
||||
savewebtracker "git.saveweb.org/saveweb/saveweb_tracker/src/saveweb_tracker"
|
||||
"github.com/hashicorp/go-retryablehttp"
|
||||
)
|
||||
|
||||
// BASE_CONCURRENCY is the number of claimWorker/ProcesserWorker pairs.
// Overridable at startup via the BASE_CONCURRENCY env var (see init).
var BASE_CONCURRENCY = 3

// WITH_DELAY controls whether ClaimTask asks the tracker to apply its
// claim delay; cleared when NO_WITH_DELAY is set (see init).
var WITH_DELAY = true

// tasks_chan carries claimed tasks from claimWorkers to ProcesserWorkers.
// NOTE(review): the buffer is sized with the compile-time default (3);
// an env override of BASE_CONCURRENCY in init() does not resize it.
var tasks_chan = make(chan savewebtracker.Task, BASE_CONCURRENCY)

// Interrupted is set by InterruptHandler to ask workers to stop claiming.
// NOTE(review): written and read from different goroutines with no
// synchronization — technically a data race; consider atomic.Bool.
var Interrupted = false
var WaitClaimWorker sync.WaitGroup     // tracks running claimWorkers
var WaitProcesserWorker sync.WaitGroup // tracks running ProcesserWorkers

// project_id is the saveweb tracker project this binary works on.
var project_id = "cnblogs_posts_list"

var Logger *log.Logger // initialized in init()
var DEBUG = false      // enables retryablehttp debug logging when set
|
||||
|
||||
func init() {
|
||||
if os.Getenv("BASE_CONCURRENCY") != "" {
|
||||
fmt.Println("BASE_CONCURRENCY:", os.Getenv("BASE_CONCURRENCY"))
|
||||
BASE_CONCURRENCY, _ = strconv.Atoi(os.Getenv("BASE_CONCURRENCY"))
|
||||
}
|
||||
if os.Getenv("NO_WITH_DELAY") != "" {
|
||||
fmt.Println("NO_WITH_DELAY:", os.Getenv("NO_WITH_DELAY"))
|
||||
WITH_DELAY = false
|
||||
}
|
||||
if os.Getenv("DEBUG") != "" {
|
||||
DEBUG = true
|
||||
}
|
||||
Logger = log.New(os.Stdout, "["+project_id+"] ", log.Ldate|log.Ltime|log.Lmsgprefix)
|
||||
}
|
||||
|
||||
// ClaimTask 并把任务放入 task_chan
|
||||
func claimWorker(i int, tracker *savewebtracker.Tracker) {
|
||||
Logger.Println("[START] ClaimWorker", i)
|
||||
defer Logger.Println("[STOP] ClaimWorker", i, " exited...")
|
||||
defer WaitClaimWorker.Done()
|
||||
for {
|
||||
if Interrupted {
|
||||
return
|
||||
}
|
||||
task := tracker.ClaimTask(WITH_DELAY)
|
||||
if task == nil {
|
||||
notask_sleep := max(
|
||||
time.Duration(tracker.Project().Client.ClaimTaskDelay)*10*time.Second,
|
||||
time.Duration(10)*time.Second,
|
||||
)
|
||||
Logger.Println("No task to claim, sleep", notask_sleep)
|
||||
time.Sleep(notask_sleep)
|
||||
continue
|
||||
}
|
||||
Logger.Println("Claimed task", task.Id)
|
||||
tasks_chan <- *task
|
||||
}
|
||||
}
|
||||
|
||||
// Refactored deduplication function
|
||||
func dedup_append(allPostMetas, newPostMetas []cnblogs_api.PostMeta) ([]cnblogs_api.PostMeta, int) {
|
||||
founds := 0
|
||||
for _, newPostMeta := range newPostMetas {
|
||||
_found := false
|
||||
for all_idx, allPostMeta := range allPostMetas {
|
||||
if newPostMeta.URL == allPostMeta.URL {
|
||||
_found = true
|
||||
founds++
|
||||
// replace the old
|
||||
Logger.Println("dedup_append: replace the old", allPostMeta, "with", newPostMeta)
|
||||
allPostMetas[all_idx] = newPostMeta
|
||||
break
|
||||
}
|
||||
}
|
||||
if !_found {
|
||||
allPostMetas = append(allPostMetas, newPostMeta)
|
||||
}
|
||||
}
|
||||
return allPostMetas, founds
|
||||
}
|
||||
|
||||
// DUP_THRESHOLD_RATE: a blog's pagination loop stops once the cumulative
// duplicate count exceeds DUP_THRESHOLD_RATE times the number of distinct
// posts collected so far (see ProcesserWorker).
var DUP_THRESHOLD_RATE = 3
|
||||
|
||||
// ProcesserWorker drains tasks_chan until it is closed. For each claimed
// blog task it resolves the blog's homepage URI, pages through the
// homepage collecting post metadata (stopping on an empty page or when
// the duplicate threshold is exceeded), uploads the collected items to
// the tracker, and marks the task done.
// Any failure panics via Logger.Panicln, aborting the whole process.
func ProcesserWorker(i int, tracker *savewebtracker.Tracker) {
	Logger.Println("[START] ProcesserWorker", i)
	defer Logger.Println("[STOP] ProcesserWorker", i, " exited...")
	defer WaitProcesserWorker.Done()
	for task := range tasks_chan {
		// Log prefix identifying this task in all messages below.
		head := "[" + task.Id + "]"
		Logger.Println("Processing task", task.Id)

		// Process the task here: the task id is the blog id.
		blogURI, err := cnblogs_api.GetBlogUri(tracker.HTTP_client, task.Id)
		if err != nil {
			Logger.Panicln(head, err)
		}
		all_postMetas := []cnblogs_api.PostMeta{}
		dups_found := 0
		for page := 1; ; page++ {
			// Stop paging once cumulative duplicates exceed
			// DUP_THRESHOLD_RATE x the collected-post count.
			// NOTE(review): presumably the site keeps serving repeated
			// content past the last page, so duplicates mark the end —
			// confirm against the site's pagination behavior.
			if dups_found > len(all_postMetas)*DUP_THRESHOLD_RATE {
				Logger.Println(head, "Dups found", dups_found, "exceeds the threshold", DUP_THRESHOLD_RATE, ", break")
				break
			}

			Logger.Println(head, "Processing", blogURI, "page:", page, "Got:", len(all_postMetas))
			htmlBody, statusCode, err := cnblogs_api.GetBlogHomepage(tracker.HTTP_client, blogURI, page)
			if err != nil {
				Logger.Panicln(head, err)
			}
			// Sanity-check the page before trusting its contents.
			if !cnblogs_api.EnsureHomepageOK(string(htmlBody)) {
				Logger.Panicln(head, "EnsureHomepageOK is false")
			}
			if statusCode != 200 {
				Logger.Panicln(head, "statusCode is not 200")
			}

			postMetas, err := cnblogs_api.ParsePostMetasFromHomepage(htmlBody)
			if err != nil {
				Logger.Panicln(head, err)
			}
			if len(postMetas) == 0 {
				// An empty page: no more posts, stop paging.
				break
			}
			Logger.Println(head, "Got", postMetas)
			_all_postMetas, _founds_in_the_page := dedup_append(all_postMetas, postMetas)
			dups_found += _founds_in_the_page
			all_postMetas = _all_postMetas
		}

		// Convert the collected metadata into tracker items
		// (one item per post, payload = JSON-encoded PostMeta).
		items := []savewebtracker.Item{}
		for _, postMeta := range all_postMetas {
			postMeta_json, err := json.Marshal(postMeta)
			if err != nil {
				Logger.Panicln(head, err)
			}
			items = append(items, savewebtracker.Item{
				Item_id:          postMeta.URL,
				Item_id_type:     "str",
				Item_status:      "None",
				Item_status_type: "None",
				Payload:          string(postMeta_json),
			})
		}
		resp_msg := tracker.InsertMany(items)
		Logger.Println(head, "InsertMany", resp_msg)
		tracker.UpdateTask(task.Id, task.Id_type, savewebtracker.StatusDONE)
		Logger.Println(head, "Updated task", task.Id)
	}
}
|
||||
|
||||
func InterruptHandler() {
|
||||
fmt.Println("\n\nPress Ctrl+C to exit\n ")
|
||||
interrupt_c := make(chan os.Signal, 1)
|
||||
signal.Notify(interrupt_c, os.Interrupt)
|
||||
for {
|
||||
s := <-interrupt_c
|
||||
Logger.Println("\n\nInterrupted by", s, "signal (Press Ctrl+C again to force exit)\n\n ")
|
||||
if Interrupted {
|
||||
Logger.Println("Force exit")
|
||||
os.Exit(1)
|
||||
return
|
||||
}
|
||||
Interrupted = true
|
||||
}
|
||||
}
|
||||
|
||||
func GetRetryableHttpClient(timeout time.Duration, debug bool) *http.Client {
|
||||
retryClient := retryablehttp.NewClient()
|
||||
retryClient.RetryMax = 3
|
||||
retryClient.RetryWaitMin = 1 * time.Second
|
||||
retryClient.RetryWaitMax = 10 * time.Second
|
||||
retryClient.HTTPClient.Timeout = timeout
|
||||
if !debug {
|
||||
retryClient.Logger = nil
|
||||
}
|
||||
standardClient := retryClient.StandardClient() // *http.Client
|
||||
Logger.Println("standardClient.Timeout:", standardClient.Timeout)
|
||||
return standardClient
|
||||
}
|
||||
|
||||
func ShowStatus(t *savewebtracker.Tracker) {
|
||||
for {
|
||||
project_json, err := json.Marshal(t.Project())
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
Logger.Println("Project:", string(project_json))
|
||||
time.Sleep(60 * time.Second)
|
||||
}
|
||||
}
|
||||
|
||||
func main() {
|
||||
tracker := savewebtracker.GetTracker(project_id, "0.3.0", savewebtracker.Archivist())
|
||||
tracker.PING_client = GetRetryableHttpClient(10*time.Second, DEBUG)
|
||||
// tracker.HTTP_client = GetRetryableHttpClient(10*time.Second, DEBUG)
|
||||
tracker.SelectBestTracker().StartSelectTrackerBackground().StartFetchProjectBackground()
|
||||
|
||||
go InterruptHandler()
|
||||
go ShowStatus(tracker)
|
||||
|
||||
cnblogs_api.EnsureConnection(*tracker.HTTP_client)
|
||||
|
||||
Logger.Println("-- Start --")
|
||||
|
||||
for i := 0; i < BASE_CONCURRENCY; i++ {
|
||||
go claimWorker(i, tracker)
|
||||
WaitClaimWorker.Add(1)
|
||||
go ProcesserWorker(i, tracker)
|
||||
WaitProcesserWorker.Add(1)
|
||||
}
|
||||
|
||||
// wait for all claimWorker to finish
|
||||
WaitClaimWorker.Wait()
|
||||
Logger.Println("[STOP] All claimWorker done")
|
||||
// close task_chan
|
||||
close(tasks_chan)
|
||||
Logger.Println("[STOP] task_chan closed")
|
||||
// wait for all task_chan to finish
|
||||
WaitProcesserWorker.Wait()
|
||||
Logger.Println("[STOP] All ProcesserWorker done")
|
||||
|
||||
Logger.Println("-- All done --")
|
||||
|
||||
}
|
7
go.mod
7
go.mod
@ -4,7 +4,12 @@ go 1.22.4
|
||||
|
||||
require (
|
||||
git.saveweb.org/saveweb/saveweb_tracker v0.1.12
|
||||
github.com/PuerkitoBio/goquery v1.9.2
|
||||
github.com/hashicorp/go-retryablehttp v0.7.7
|
||||
)
|
||||
|
||||
require github.com/hashicorp/go-cleanhttp v0.5.2 // indirect
|
||||
require (
|
||||
github.com/andybalholm/cascadia v1.3.2 // indirect
|
||||
github.com/hashicorp/go-cleanhttp v0.5.2 // indirect
|
||||
golang.org/x/net v0.24.0 // indirect
|
||||
)
|
||||
|
40
go.sum
40
go.sum
@ -1,5 +1,9 @@
|
||||
git.saveweb.org/saveweb/saveweb_tracker v0.1.12 h1:zBYkMjABF5wwvSHZI9t3cVUjU0rhFFZJh0dFE0W59Nw=
|
||||
git.saveweb.org/saveweb/saveweb_tracker v0.1.12/go.mod h1:p891f4fshoA/Wiwmey23f2xJ9sKNEZwd5kmzG6lobik=
|
||||
github.com/PuerkitoBio/goquery v1.9.2 h1:4/wZksC3KgkQw7SQgkKotmKljk0M6V8TUvA8Wb4yPeE=
|
||||
github.com/PuerkitoBio/goquery v1.9.2/go.mod h1:GHPCaP0ODyyxqcNoFGYlAprUFH81NuRPd0GX3Zu2Mvk=
|
||||
github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
|
||||
github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM=
|
||||
@ -18,7 +22,43 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
|
||||
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
|
||||
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
|
||||
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
|
||||
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
|
||||
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
|
||||
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
|
||||
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
|
||||
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
|
||||
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
|
||||
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
|
||||
golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
|
||||
golang.org/x/net v0.24.0 h1:1PcaxkF854Fu3+lvBIx5SYn9wRlBzzcnHZSiaFFAb0w=
|
||||
golang.org/x/net v0.24.0/go.mod h1:2Q7sJY5mzlzWjKtYUEXSlBWCdyaioyXzRB2RtU8KVE8=
|
||||
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y=
|
||||
golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
|
||||
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
|
||||
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
|
||||
golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY=
|
||||
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
||||
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
||||
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
|
||||
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
|
||||
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
|
||||
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
||||
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
|
||||
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
|
||||
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
|
||||
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
|
@ -1,11 +1,20 @@
|
||||
package cnblogs_api
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/xml"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"net/http"
|
||||
"os"
|
||||
"regexp"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
)
|
||||
|
||||
var USER_AGENT = "SaveTheWebProject cnblogs-preserve/0.1 (+saveweb@saveweb.org) and not Mozilla/5.0 (X11; Linux x86_64; rv:128.0) Gecko/20100101 Firefox/128.0"
|
||||
@ -62,3 +71,154 @@ func GetRSSHeadHeaders(client http.Client, blogID string) (http.Header, int, int
|
||||
return resp.Header, resp.StatusCode, resp.ContentLength
|
||||
|
||||
}
|
||||
|
||||
// exp matches the first <uri>...</uri> element; the capture group holds
// the URI text. Used as a fallback when feed XML fails to unmarshal.
var exp = "<uri[^>]*>(.*?)</uri>"
var compiledr = regexp.MustCompile(exp)

// ParseBlogUriByRegex extracts the first <uri> value from body, or
// returns the empty string when no <uri> element is present.
func ParseBlogUriByRegex(body []byte) string {
	// Only the first match matters.
	m := compiledr.FindSubmatch(body)
	if len(m) < 2 {
		return ""
	}
	return string(m[1])
}
|
||||
|
||||
// GetBlogUri resolves a blog's homepage URI from its cnblogs RSS feed
// (https://feed.cnblogs.com/blog/u/<BlogID>/rss). It first parses the
// feed's <author><uri> element as XML and, when the body fails strict
// XML parsing, falls back to a regex scan (ParseBlogUriByRegex).
// NOTE(review): the HTTP status code is not checked; a non-200 body just
// fails XML/regex parsing — consider checking resp.StatusCode explicitly.
func GetBlogUri(client *http.Client, BlogID string) (string, error) {
	req, err := http.NewRequest("GET", "https://feed.cnblogs.com/blog/u/"+BlogID+"/rss", nil)
	if err != nil {
		return "", err
	}
	req.Header.Add("User-Agent", USER_AGENT)

	resp, err := client.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}

	// Example feed payload this function parses:
	// <?xml version="1.0" encoding="utf-8"?>
	// <feed xmlns="http://www.w3.org/2005/Atom">
	// <title type="text">博客园 - 吴松~</title>
	// <id>uuid:0a75ddf1-c050-403f-937c-cf7790585fb1;id=1761407</id>
	// <updated>2018-06-19T09:44:31Z</updated>
	// <author>
	// <name>吴松~</name>
	// <uri>https://www.cnblogs.com/superws/</uri>
	// </author>
	// </feed>

	// Minimal structures: only <feed><author><uri> is of interest.
	type Author struct {
		URI string `xml:"uri"`
	}
	type Feed struct {
		XMLName xml.Name `xml:"feed"`
		Author  Author   `xml:"author"`
	}

	var feed Feed
	err = xml.Unmarshal(body, &feed)
	if err != nil {
		// Not well-formed XML: try the regex fallback before giving up.
		Logger.Println("xml.Unmarshal error", err, "fallback to regex")
		reg_result := ParseBlogUriByRegex(body)
		if reg_result != "" {
			return reg_result, nil
		} else {
			return "", errors.New("xml.Unmarshal error")
		}
	}

	if feed.Author.URI == "" {
		return "", errors.New("URI is empty")
	}

	return feed.Author.URI, nil
}
|
||||
|
||||
// EnsureHomepageOK reports whether htmldata looks like a real cnblogs
// blog homepage by checking a set of marker substrings. On the first
// missing marker it prints which one failed and returns false.
func EnsureHomepageOK(htmldata string) bool {
	markers := []string{
		"currentBlogId",
		"currentBlogApp",
		"application/rss+xml",
		"antiforgery_token",
	}
	for _, marker := range markers {
		if strings.Contains(htmldata, marker) {
			continue
		}
		fmt.Println("EnsureHomepageOK failed for", marker)
		return false
	}
	return true
}
|
||||
|
||||
var homepage_lock = sync.Mutex{}
|
||||
|
||||
func GetBlogHomepage(client *http.Client, BlogUri string, page int) ([]byte, int, error) {
|
||||
// replace last /
|
||||
for strings.HasSuffix(BlogUri, "/") {
|
||||
BlogUri = BlogUri[:len(BlogUri)-1]
|
||||
}
|
||||
|
||||
homepage_lock.Lock()
|
||||
time.Sleep(1 * time.Second)
|
||||
homepage_lock.Unlock()
|
||||
|
||||
req, err := http.NewRequest("GET", BlogUri+"?page="+fmt.Sprintf("%d", page), nil)
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
req.Header.Add("User-Agent", USER_AGENT)
|
||||
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, resp.StatusCode, err
|
||||
}
|
||||
|
||||
return body, resp.StatusCode, nil
|
||||
}
|
||||
|
||||
// PostMeta is the metadata collected for a single blog post, as scraped
// from a blog homepage listing (see ParsePostMetasFromHomepage).
type PostMeta struct {
	Title string // post title text, whitespace-trimmed
	URL   string // URL of the post page, taken from the anchor href
}
|
||||
|
||||
// ParsePostMetasFromHomepage extracts post titles and URLs from a blog
// homepage HTML document by selecting the `.postTitle2` anchors.
// If any matching anchor lacks an href attribute it still returns the
// posts it did parse, alongside a non-nil error.
func ParsePostMetasFromHomepage(htmlBody []byte) ([]PostMeta, error) {
	dom, err := goquery.NewDocumentFromReader(bytes.NewReader(htmlBody))
	if err != nil {
		return nil, err
	}

	postMetas := []PostMeta{}

	err_in_query := false
	// Example anchor this selector targets:
	// <a class="postTitle2 vertical-middle" href="https://www.cnblogs.com/superws/p/5960116.html"><span>类型转换</span></a>
	dom.Find(".postTitle2").Each(func(i int, s *goquery.Selection) {
		href, exists := s.Attr("href")
		if exists {
			title := s.Find("span").Text()
			// strip space
			title = strings.TrimSpace(title)
			postMetas = append(postMetas, PostMeta{
				Title: title,
				URL:   href,
			})
		} else {
			// Anchor without href: record the failure but keep
			// collecting the remaining posts.
			err_in_query = true
		}
	})
	if err_in_query {
		return postMetas, errors.New("error in query")
	}

	return postMetas, nil

}
|
||||
|
101
pkg/cnblogs_api_test.go
Normal file
101
pkg/cnblogs_api_test.go
Normal file
@ -0,0 +1,101 @@
|
||||
package cnblogs_api
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// client is the shared HTTP client for the live-network tests below.
// NOTE(review): these tests hit cnblogs.com for real; the generous
// timeout accommodates slow pages.
var client = &http.Client{
	Timeout: 120 * time.Second,
}
|
||||
|
||||
// TestGetBlogUri checks that a known blog ID resolves to a non-empty
// homepage URI. (Live-network test against cnblogs.com.)
func TestGetBlogUri(t *testing.T) {
	blogApp, err := GetBlogUri(client, "270749")
	if err != nil {
		t.Error(err)
	}
	if blogApp == "" {
		t.Error("blogApp is empty")
	}

	t.Log(blogApp)
}
|
||||
|
||||
func TestGetBlogHomepage(t *testing.T) {
|
||||
blogApp, err := GetBlogUri(client, "270749")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
htmlBody, statusCode, err := GetBlogHomepage(client, blogApp, 7)
|
||||
if EnsureHomepageOK(string(htmlBody)) == false {
|
||||
t.Fatal("EnsureHomepageOK is false")
|
||||
}
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if statusCode != 200 {
|
||||
t.Fatal("statusCode is not 200")
|
||||
}
|
||||
|
||||
t.Log(string(htmlBody))
|
||||
}
|
||||
|
||||
// TestParsePostsURLFromHomepage fetches a live homepage and checks that
// post metadata parsing succeeds. (Live-network test; it does not assert
// the parsed contents, only that parsing returns no error.)
func TestParsePostsURLFromHomepage(t *testing.T) {
	blogApp, err := GetBlogUri(client, "270749")
	if err != nil {
		t.Fatal(err)
	}
	htmlBody, _, err := GetBlogHomepage(client, blogApp, 1)
	if err != nil {
		t.Fatal(err)
	}
	postMetas, err := ParsePostMetasFromHomepage(htmlBody)
	if err != nil {
		t.Fatal(err)
	}

	t.Log(postMetas)
}
|
||||
|
||||
// TestParseBlogUriByRegex feeds a captured feed document to the regex
// fallback and checks that the FIRST <uri> (the blog author's homepage)
// is returned — not the <uri> elements repeated inside each <entry>.
// The fixture is deliberately truncated mid-document to mimic the
// malformed XML that triggers the fallback in GetBlogUri.
func TestParseBlogUriByRegex(t *testing.T) {
	text := `<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <title type="text">博客园 - 真幻de现实</title>
  <id>uuid:0a75ddf1-c050-403f-937c-cf7790585fb1;id=1780685</id>
  <updated>2018-11-28T09:56:51Z</updated>
  <author>
    <name>真幻de现实</name>
    <uri>https://www.cnblogs.com/lummon/</uri>
  </author>
  <generator>feed.cnblogs.com</generator>
  <entry>
    <id>https://www.cnblogs.com/lummon/p/10033657.html</id>
    <title type="text">EF 基础提供程序在 Open 上失败 - 真幻de现实</title>
    <summary type="text">搜来的思路: 客户端以管理员身份运行:netsh winsock reset命令,作用是重置 Winsock 目录。如果一台机器上的Winsock协议配置有问题的话将会导致网络连接等问题,就需要用netsh winsock reset命令来重置Winsock目录借以恢复网络。这个命令可以重新初始化网</summary>
    <published>2018-11-28T09:56:00Z</published>
    <updated>2018-11-28T09:56:00Z</updated>
    <author>
      <name>真幻de现实</name>
      <uri>https://www.cnblogs.com/lummon/</uri>
    </author>
    <link rel="alternate" href="https://www.cnblogs.com/lummon/p/10033657.html" />
    <link rel="alternate" type="text/html" href="https://www.cnblogs.com/lummon/p/10033657.html" />
    <content type="html">【摘要】搜来的思路: 客户端以管理员身份运行:netsh winsock reset命令,作用是重置 Winsock 目录。如果一台机器上的Winsock协议配置有问题的话将会导致网络连接等问题,就需要用netsh winsock reset命令来重置Winsock目录借以恢复网络。这个命令可以重新初始化网 <a href="https://www.cnblogs.com/lummon/p/10033657.html" target="_blank">阅读全文</a></content>
  </entry>
  <entry>
    <id>https://www.cnblogs.com/lummon/p/5950095.html</id>
    <title type="text">flexbox学习 - 真幻de现实</title>
    <summary type="text">https://philipwalton.github.io/solved-by-flexbox/ http://www.ruanyifeng.com/blog/2015/07/flex-grammar.html?utm_source=tuicool http://www.ruanyifeng.co</summary>
    <published>2016-10-11T09:24:00Z</published>
    <updated>2016-10-11T09:24:00Z</updated>
    <author>
      <name>真幻de现实</name>
      <uri>https://www.cnblogs.com/lummon/</uri>
    </author>
    <link rel="alternate" href="https://www.cnblogs.com/lummon/p/5950095.html" />`
	uri := ParseBlogUriByRegex([]byte(text))
	if uri != "https://www.cnblogs.com/lummon/" {
		t.Error("uri is not https://www.cnblogs.com/lummon/")
	}
}
|
Loading…
Reference in New Issue
Block a user