package cnblogs_api import ( "bytes" "encoding/xml" "errors" "fmt" "io" "log" "net/http" "os" "regexp" "strings" "sync" "time" "github.com/PuerkitoBio/goquery" ) var USER_AGENT = "SaveTheWebProject cnblogs-preserve/0.1 (+saveweb@saveweb.org) and not Mozilla/5.0 (X11; Linux x86_64; rv:128.0) Gecko/20100101 Firefox/128.0" var Logger = log.New(os.Stdout, "[cnblogs_api] ", log.Ldate|log.Ltime|log.Lmsgprefix) func EnsureConnection(client http.Client) { req, err := http.NewRequest("GET", "https://www.cnblogs.com/robots.txt", nil) if err != nil { panic(err) } req.Header.Add("User-Agent", USER_AGENT) resp, err := client.Do(req) if err != nil { panic(err) } defer resp.Body.Close() body, err := io.ReadAll(resp.Body) if err != nil { panic(err) } text := string(body) Logger.Println(text) if !strings.Contains(text, "User-Agent") { panic("NotImplementedError: " + text) } } func GetRSSHeadHeaders(client http.Client, blogID string) (http.Header, int, int64) { req, err := http.NewRequest("HEAD", "https://feed.cnblogs.com/blog/u/"+blogID+"/rss/", nil) if err != nil { panic(err) } headers := map[string][]string{ "User-Agent": {USER_AGENT}, } for k, v := range headers { req.Header[k] = v } resp, err := client.Do(req) if err != nil { panic(err) } defer resp.Body.Close() _, err = io.ReadAll(resp.Body) if err != nil { panic(err) } return resp.Header, resp.StatusCode, resp.ContentLength } var exp = "]*>(.*?)" var compiledr = regexp.MustCompile(exp) func ParseBlogUriByRegex(body []byte) string { // only find the first match matches := compiledr.FindSubmatch(body) if len(matches) > 1 { return string(matches[1]) } else { return "" } } func GetBlogUri(client *http.Client, BlogID string) (string, error) { req, err := http.NewRequest("GET", "https://feed.cnblogs.com/blog/u/"+BlogID+"/rss", nil) if err != nil { return "", err } req.Header.Add("User-Agent", USER_AGENT) resp, err := client.Do(req) if err != nil { return "", err } defer resp.Body.Close() body, err := io.ReadAll(resp.Body) if err != nil { return "", err } // // // 博客园 - 吴松~ // uuid:0a75ddf1-c050-403f-937c-cf7790585fb1;id=1761407 // 2018-06-19T09:44:31Z // // 吴松~ // https://www.cnblogs.com/superws/ // // type Author struct { URI string `xml:"uri"` } type Feed struct { XMLName xml.Name `xml:"feed"` Author Author `xml:"author"` } var feed Feed err = xml.Unmarshal(body, &feed) if err != nil { Logger.Println("xml.Unmarshal error", err, "fallback to regex") reg_result := ParseBlogUriByRegex(body) if reg_result != "" { return reg_result, nil } else { return "", errors.New("xml.Unmarshal error") } } if feed.Author.URI == "" { return "", errors.New("URI is empty") } return feed.Author.URI, nil } func EnsureHomepageOK(htmldata string) bool { Flags := []string{ "currentBlogId", "currentBlogApp", "application/rss+xml", "antiforgery_token", } for _, flag := range Flags { if !strings.Contains(htmldata, flag) { fmt.Println("EnsureHomepageOK failed for", flag) return false } } return true } var homepage_lock = sync.Mutex{} func GetBlogHomepage(client *http.Client, BlogUri string, page int) ([]byte, int, error) { // replace last / for strings.HasSuffix(BlogUri, "/") { BlogUri = BlogUri[:len(BlogUri)-1] } homepage_lock.Lock() time.Sleep(1 * time.Second) homepage_lock.Unlock() req, err := http.NewRequest("GET", BlogUri+"?page="+fmt.Sprintf("%d", page), nil) if err != nil { return nil, 0, err } req.Header.Add("User-Agent", USER_AGENT) resp, err := client.Do(req) if err != nil { return nil, 0, err } defer resp.Body.Close() body, err := io.ReadAll(resp.Body) if err != nil { return nil, resp.StatusCode, err } return body, resp.StatusCode, nil } type PostMeta struct { Title string URL string } func ParsePostMetasFromHomepage(htmlBody []byte) ([]PostMeta, error) { dom, err := goquery.NewDocumentFromReader(bytes.NewReader(htmlBody)) if err != nil { return nil, err } postMetas := []PostMeta{} err_in_query := false // 类型转换 dom.Find(".postTitle2").Each(func(i int, s *goquery.Selection) { href, exists := s.Attr("href") if exists { title := s.Find("span").Text() // strip space title = strings.TrimSpace(title) postMetas = append(postMetas, PostMeta{ Title: title, URL: href, }) } else { err_in_query = true } }) if err_in_query { return postMetas, errors.New("error in query") } return postMetas, nil }