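// Package cnblogs_api contains the HTTP helpers used to archive cnblogs.com
// blogs: a connectivity check, RSS feed probing, blog homepage fetching, and
// post-list parsing.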
package cnblogs_api

import (
	"bytes"
	"encoding/xml"
	"errors"
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"regexp"
	"strings"
	"sync"
	"time"

	"github.com/PuerkitoBio/goquery"
)

// USER_AGENT identifies the crawler to cnblogs.com and includes a contact
// address alongside a regular browser UA string.
var USER_AGENT = "SaveTheWebProject cnblogs-preserve/0.1 (+saveweb@saveweb.org) and not Mozilla/5.0 (X11; Linux x86_64; rv:128.0) Gecko/20100101 Firefox/128.0"

// Logger is the package-wide logger, writing to stdout with a "[cnblogs_api]" prefix.
var Logger = log.New(os.Stdout, "[cnblogs_api] ", log.Ldate|log.Ltime|log.Lmsgprefix)
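
// EnsureConnection fetches https://www.cnblogs.com/robots.txt with the crawler
// User-Agent and panics if the request fails or the response does not contain a
// User-Agent line, so callers fail fast when the site is unreachable or blocking us.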
func EnsureConnection(client http.Client) {
	req, err := http.NewRequest("GET", "https://www.cnblogs.com/robots.txt", nil)
	if err != nil {
		panic(err)
	}
	req.Header.Add("User-Agent", USER_AGENT)
	resp, err := client.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}
	text := string(body)
	Logger.Println(text)
	if !strings.Contains(text, "User-Agent") {
		panic("NotImplementedError: " + text)
	}
}
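
// GetRSSHeadHeaders issues a HEAD request against the blog's RSS feed and
// returns the response headers, status code, and Content-Length, panicking on
// transport errors.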
func GetRSSHeadHeaders(client http.Client, blogID string) (http.Header, int, int64) {
	req, err := http.NewRequest("HEAD", "https://feed.cnblogs.com/blog/u/"+blogID+"/rss/", nil)
	if err != nil {
		panic(err)
	}
	req.Header.Set("User-Agent", USER_AGENT)
	resp, err := client.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	// Drain the (normally empty) HEAD response body so the connection can be reused.
	_, err = io.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}
	return resp.Header, resp.StatusCode, resp.ContentLength
}

// exp matches the first <uri>...</uri> element of an Atom feed; it is the
// fallback used when strict XML parsing of the feed fails.
var exp = "<uri[^>]*>(.*?)</uri>"
var compiledr = regexp.MustCompile(exp)
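
// ParseBlogUriByRegex returns the first <uri> value found in body, or "" if
// there is no match.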
func ParseBlogUriByRegex(body []byte) string {
	// only find the first match
	matches := compiledr.FindSubmatch(body)
	if len(matches) > 1 {
		return string(matches[1])
	}
	return ""
}
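
// GetBlogUri fetches the blog's Atom feed and returns the blog homepage URI
// from the <author><uri> element, falling back to a regex scan when the feed
// cannot be parsed as XML.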
func GetBlogUri(client *http.Client, BlogID string) (string, error) {
	req, err := http.NewRequest("GET", "https://feed.cnblogs.com/blog/u/"+BlogID+"/rss", nil)
	if err != nil {
		return "", err
	}
	req.Header.Add("User-Agent", USER_AGENT)
	resp, err := client.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}
	// Example feed:
	// <?xml version="1.0" encoding="utf-8"?>
	// <feed xmlns="http://www.w3.org/2005/Atom">
	//   <title type="text">博客园 - 吴松~</title>
	//   <id>uuid:0a75ddf1-c050-403f-937c-cf7790585fb1;id=1761407</id>
	//   <updated>2018-06-19T09:44:31Z</updated>
	//   <author>
	//     <name>吴松~</name>
	//     <uri>https://www.cnblogs.com/superws/</uri>
	//   </author>
	// </feed>
	type Author struct {
		URI string `xml:"uri"`
	}
	type Feed struct {
		XMLName xml.Name `xml:"feed"`
		Author  Author   `xml:"author"`
	}
	var feed Feed
	err = xml.Unmarshal(body, &feed)
	if err != nil {
		Logger.Println("xml.Unmarshal error", err, "fallback to regex")
		if regResult := ParseBlogUriByRegex(body); regResult != "" {
			return regResult, nil
		}
		return "", fmt.Errorf("xml.Unmarshal error: %w", err)
	}
	if feed.Author.URI == "" {
		return "", errors.New("URI is empty")
	}
	return feed.Author.URI, nil
}
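
// EnsureHomepageOK reports whether htmldata looks like a fully rendered cnblogs
// blog homepage by checking for a few markers that every homepage contains.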
func EnsureHomepageOK(htmldata string) bool {
	flags := []string{
		"currentBlogId",
		"currentBlogApp",
		"application/rss+xml",
		"antiforgery_token",
	}
	for _, flag := range flags {
		if !strings.Contains(htmldata, flag) {
			Logger.Println("EnsureHomepageOK failed for", flag)
			return false
		}
	}
	return true
}
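
// homepage_lock serializes homepage fetches and enforces a one-second gap
// between them, keeping the crawler to at most one homepage request per second.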
var homepage_lock = sync.Mutex{}
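
// GetBlogHomepage fetches page `page` of the blog at BlogUri (trailing slashes
// are stripped) and returns the raw HTML body together with the HTTP status code.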
func GetBlogHomepage(client *http.Client, BlogUri string, page int) ([]byte, int, error) {
	// strip any trailing slashes
	for strings.HasSuffix(BlogUri, "/") {
		BlogUri = BlogUri[:len(BlogUri)-1]
	}
	// global rate limit: hold the lock for one second before each request
	homepage_lock.Lock()
	time.Sleep(1 * time.Second)
	homepage_lock.Unlock()
	req, err := http.NewRequest("GET", fmt.Sprintf("%s?page=%d", BlogUri, page), nil)
	if err != nil {
		return nil, 0, err
	}
	req.Header.Add("User-Agent", USER_AGENT)
	resp, err := client.Do(req)
	if err != nil {
		return nil, 0, err
	}
	defer resp.Body.Close()
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, resp.StatusCode, err
	}
	return body, resp.StatusCode, nil
}
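
// PostMeta is the title and URL of a single post as listed on a blog homepage.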
type PostMeta struct {
	Title string
	URL   string
}
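
// ParsePostMetasFromHomepage extracts post titles and URLs from the .postTitle2
// anchors of a blog homepage. It returns the posts it could parse, together
// with an error if any title anchor was missing an href attribute.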
func ParsePostMetasFromHomepage(htmlBody []byte) ([]PostMeta, error) {
	dom, err := goquery.NewDocumentFromReader(bytes.NewReader(htmlBody))
	if err != nil {
		return nil, err
	}
	postMetas := []PostMeta{}
	errInQuery := false
	// <a class="postTitle2 vertical-middle" href="https://www.cnblogs.com/superws/p/5960116.html"><span>类型转换</span></a>
	dom.Find(".postTitle2").Each(func(i int, s *goquery.Selection) {
		href, exists := s.Attr("href")
		if exists {
			title := strings.TrimSpace(s.Find("span").Text())
			postMetas = append(postMetas, PostMeta{
				Title: title,
				URL:   href,
			})
		} else {
			errInQuery = true
		}
	})
	if errInQuery {
		return postMetas, errors.New("post title element without href attribute")
	}
	return postMetas, nil
}
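
// crawlFirstPageSketch is a minimal usage sketch, not part of the original API:
// it shows how the helpers above are intended to compose: resolve the blog's
// homepage URI from its numeric blog ID, fetch the first homepage page, and
// list the posts found there. The blog ID "123456" is a placeholder, not a
// real account.
func crawlFirstPageSketch() {
	client := &http.Client{Timeout: 30 * time.Second}

	// Fail fast if cnblogs.com is unreachable or rejects our User-Agent.
	EnsureConnection(*client)

	uri, err := GetBlogUri(client, "123456")
	if err != nil {
		Logger.Fatalln("GetBlogUri:", err)
	}

	body, status, err := GetBlogHomepage(client, uri, 1)
	if err != nil || status != http.StatusOK {
		Logger.Fatalln("GetBlogHomepage:", status, err)
	}
	if !EnsureHomepageOK(string(body)) {
		Logger.Fatalln("homepage markers missing, page may be truncated or blocked")
	}

	posts, err := ParsePostMetasFromHomepage(body)
	if err != nil {
		Logger.Println("partial parse:", err)
	}
	for _, p := range posts {
		fmt.Println(p.Title, p.URL)
	}
}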