yzqzss
2beaab1e72
Some checks failed
Gitea Go Release Actions / Release Go Binary (arm, linux) (push) Waiting to run
Gitea Go Release Actions / Release Go Binary (amd64, darwin) (push) Failing after 24s
Gitea Go Release Actions / Release Go Binary (amd64, linux) (push) Failing after 45s
Gitea Go Release Actions / Release Go Binary (amd64, windows) (push) Failing after 46s
Gitea Go Release Actions / Release Go Binary (arm64, darwin) (push) Failing after 47s
Gitea Go Release Actions / Release Go Binary (arm64, linux) (push) Failing after 21s
226 lines
4.8 KiB
Go
226 lines
4.8 KiB
Go
package cnblogs_api
|
|
|
|
import (
|
|
"bytes"
|
|
"encoding/xml"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"log"
|
|
"net/http"
|
|
"os"
|
|
"regexp"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
)
|
|
|
|
// USER_AGENT is the User-Agent header value attached to the outgoing HTTP
// requests built in this file.
var USER_AGENT = "SaveTheWebProject cnblogs-preserve/0.1 (+saveweb@saveweb.org) and not Mozilla/5.0 (X11; Linux x86_64; rv:128.0) Gecko/20100101 Firefox/128.0"
|
|
|
|
// Logger writes to standard output. Each line carries the date and time,
// followed by the "[cnblogs_api] " prefix placed directly before the message
// (log.Lmsgprefix).
var Logger = log.New(os.Stdout, "[cnblogs_api] ", log.Ldate|log.Ltime|log.Lmsgprefix)
|
|
|
|
func EnsureConnection(client http.Client) {
|
|
req, err := http.NewRequest("GET", "https://www.cnblogs.com/robots.txt", nil)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
req.Header.Add("User-Agent", USER_AGENT)
|
|
|
|
resp, err := client.Do(req)
|
|
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
defer resp.Body.Close()
|
|
body, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
text := string(body)
|
|
Logger.Println(text)
|
|
if !strings.Contains(text, "User-Agent") {
|
|
panic("NotImplementedError: " + text)
|
|
}
|
|
}
|
|
|
|
func GetRSSHeadHeaders(client http.Client, blogID string) (http.Header, int, int64) {
|
|
req, err := http.NewRequest("HEAD", "https://feed.cnblogs.com/blog/u/"+blogID+"/rss/", nil)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
|
|
headers := map[string][]string{
|
|
"User-Agent": {USER_AGENT},
|
|
}
|
|
for k, v := range headers {
|
|
req.Header[k] = v
|
|
}
|
|
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
defer resp.Body.Close()
|
|
_, err = io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
|
|
return resp.Header, resp.StatusCode, resp.ContentLength
|
|
|
|
}
|
|
|
|
var (
	// exp matches a <uri ...>...</uri> element; capture group 1 is the
	// element's text (non-greedy, so it stops at the first closing tag).
	exp = "<uri[^>]*>(.*?)</uri>"

	// compiledr is exp compiled once at package initialization.
	compiledr = regexp.MustCompile(exp)
)

// ParseBlogUriByRegex returns the text of the first <uri> element found in
// body, or the empty string when body contains no such element.
func ParseBlogUriByRegex(body []byte) string {
	// FindSubmatch reports only the leftmost match: index 0 is the whole
	// match and index 1 is the captured element text.
	groups := compiledr.FindSubmatch(body)
	if len(groups) < 2 {
		return ""
	}
	return string(groups[1])
}
|
|
|
|
func GetBlogUri(client *http.Client, BlogID string) (string, error) {
|
|
req, err := http.NewRequest("GET", "https://feed.cnblogs.com/blog/u/"+BlogID+"/rss", nil)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
req.Header.Add("User-Agent", USER_AGENT)
|
|
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
defer resp.Body.Close()
|
|
body, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
// <?xml version="1.0" encoding="utf-8"?>
|
|
// <feed xmlns="http://www.w3.org/2005/Atom">
|
|
// <title type="text">博客园 - 吴松~</title>
|
|
// <id>uuid:0a75ddf1-c050-403f-937c-cf7790585fb1;id=1761407</id>
|
|
// <updated>2018-06-19T09:44:31Z</updated>
|
|
// <author>
|
|
// <name>吴松~</name>
|
|
// <uri>https://www.cnblogs.com/superws/</uri>
|
|
// </author>
|
|
// </feed>
|
|
type Author struct {
|
|
URI string `xml:"uri"`
|
|
}
|
|
type Feed struct {
|
|
XMLName xml.Name `xml:"feed"`
|
|
Author Author `xml:"author"`
|
|
}
|
|
|
|
var feed Feed
|
|
err = xml.Unmarshal(body, &feed)
|
|
if err != nil {
|
|
Logger.Println("xml.Unmarshal error", err, "fallback to regex")
|
|
reg_result := ParseBlogUriByRegex(body)
|
|
if reg_result != "" {
|
|
return reg_result, nil
|
|
} else {
|
|
return "", errors.New("xml.Unmarshal error")
|
|
}
|
|
}
|
|
|
|
if feed.Author.URI == "" {
|
|
return "", errors.New("URI is empty")
|
|
}
|
|
|
|
return feed.Author.URI, nil
|
|
}
|
|
|
|
// EnsureHomepageOK reports whether htmldata contains every marker substring
// this check requires. The markers are tested in order; on the first one that
// is missing, a message naming it is printed to standard output and false is
// returned.
func EnsureHomepageOK(htmldata string) bool {
	required := [...]string{
		"currentBlogId",
		"currentBlogApp",
		"application/rss+xml",
		"antiforgery_token",
		"poweredby",
	}

	for i := 0; i < len(required); i++ {
		marker := required[i]
		if strings.Contains(htmldata, marker) {
			continue
		}
		fmt.Println("EnsureHomepageOK failed for", marker)
		return false
	}
	return true
}
|
|
|
|
var homepage_lock = sync.Mutex{}
|
|
|
|
func GetBlogHomepage(client *http.Client, BlogUri string, page int) ([]byte, int, error) {
|
|
// replace last /
|
|
for strings.HasSuffix(BlogUri, "/") {
|
|
BlogUri = BlogUri[:len(BlogUri)-1]
|
|
}
|
|
|
|
homepage_lock.Lock()
|
|
time.Sleep(1 * time.Second)
|
|
homepage_lock.Unlock()
|
|
|
|
req, err := http.NewRequest("GET", BlogUri+"?page="+fmt.Sprintf("%d", page), nil)
|
|
if err != nil {
|
|
return nil, 0, err
|
|
}
|
|
req.Header.Add("User-Agent", USER_AGENT)
|
|
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
return nil, 0, err
|
|
}
|
|
defer resp.Body.Close()
|
|
body, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return nil, resp.StatusCode, err
|
|
}
|
|
|
|
return body, resp.StatusCode, nil
|
|
}
|
|
|
|
// PostMeta holds the title and link of a single post.
type PostMeta struct {
	// Title is the post's title text.
	Title string
	// URL is the post's link.
	URL string
}
|
|
|
|
func ParsePostMetasFromHomepage(htmlBody []byte) ([]PostMeta, error) {
|
|
dom, err := goquery.NewDocumentFromReader(bytes.NewReader(htmlBody))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
postMetas := []PostMeta{}
|
|
|
|
err_in_query := false
|
|
// <a class="postTitle2 vertical-middle" href="https://www.cnblogs.com/superws/p/5960116.html"><span>类型转换</span></a>
|
|
dom.Find(".postTitle2").Each(func(i int, s *goquery.Selection) {
|
|
href, exists := s.Attr("href")
|
|
if exists {
|
|
title := s.Find("span").Text()
|
|
// strip space
|
|
title = strings.TrimSpace(title)
|
|
postMetas = append(postMetas, PostMeta{
|
|
Title: title,
|
|
URL: href,
|
|
})
|
|
} else {
|
|
err_in_query = true
|
|
}
|
|
})
|
|
if err_in_query {
|
|
return postMetas, errors.New("error in query")
|
|
}
|
|
|
|
return postMetas, nil
|
|
|
|
}
|