2024-07-16 23:56:23 -07:00
package cnblogs_api
import (
2024-07-17 10:36:32 -07:00
"bytes"
"encoding/xml"
"errors"
"fmt"
2024-07-16 23:56:23 -07:00
"io"
"log"
"net/http"
"os"
2024-07-17 10:36:32 -07:00
"regexp"
2024-07-16 23:56:23 -07:00
"strings"
2024-07-17 10:36:32 -07:00
"sync"
"time"
"github.com/PuerkitoBio/goquery"
2024-07-16 23:56:23 -07:00
)
// Package-level values shared by the request helpers in this file.
var (
	// USER_AGENT is the value sent in the User-Agent header of every request.
	USER_AGENT = "SaveTheWebProject cnblogs-preserve/0.1 (+saveweb@saveweb.org) and not Mozilla/5.0 (X11; Linux x86_64; rv:128.0) Gecko/20100101 Firefox/128.0"

	// Logger writes to standard output. Each line starts with the date and
	// time, followed by the "[cnblogs_api] " prefix and then the message.
	Logger = log.New(os.Stdout, "[cnblogs_api] ", log.Ldate|log.Ltime|log.Lmsgprefix)
)
// EnsureConnection requests https://www.cnblogs.com/robots.txt with client,
// logs the response body, and panics unless that body contains the text
// "User-Agent". It also panics if the request cannot be built or sent, or if
// the body cannot be read.
func EnsureConnection(client http.Client) {
	const robotsURL = "https://www.cnblogs.com/robots.txt"

	request, err := http.NewRequest(http.MethodGet, robotsURL, nil)
	if err != nil {
		panic(err)
	}
	request.Header.Add("User-Agent", USER_AGENT)

	response, err := client.Do(request)
	if err != nil {
		panic(err)
	}
	defer response.Body.Close()

	raw, err := io.ReadAll(response.Body)
	if err != nil {
		panic(err)
	}

	robots := string(raw)
	Logger.Println(robots)

	// The body is accepted only when it mentions "User-Agent" (case-sensitive).
	if strings.Contains(robots, "User-Agent") {
		return
	}
	panic("NotImplementedError: " + robots)
}
// GetRSSHeadHeaders sends a HEAD request for the RSS feed of blogID and
// returns the response headers, the status code, and the reported content
// length, in that order. It panics if the request cannot be built or sent, or
// if reading the response body fails.
func GetRSSHeadHeaders(client http.Client, blogID string) (http.Header, int, int64) {
	feedURL := "https://feed.cnblogs.com/blog/u/" + blogID + "/rss/"

	request, err := http.NewRequest(http.MethodHead, feedURL, nil)
	if err != nil {
		panic(err)
	}
	// Assign the header slice directly; User-Agent is the only header set.
	request.Header["User-Agent"] = []string{USER_AGENT}

	response, err := client.Do(request)
	if err != nil {
		panic(err)
	}
	defer response.Body.Close()

	// Read the body to the end before returning; its content is not used.
	if _, err := io.ReadAll(response.Body); err != nil {
		panic(err)
	}

	return response.Header, response.StatusCode, response.ContentLength
}
2024-07-17 10:36:32 -07:00
// exp matches a <uri ...>...</uri> element; its single capture group is the
// text between the opening and closing tags.
var exp = "<uri[^>]*>(.*?)</uri>"

// compiledr is exp compiled once when the package is initialised.
var compiledr = regexp.MustCompile(exp)

// ParseBlogUriByRegex returns the text inside the first <uri> element found
// in body, or the empty string when no such element is present.
func ParseBlogUriByRegex(body []byte) string {
	// FindSubmatch yields the leftmost match only; index 1 is the capture group.
	if groups := compiledr.FindSubmatch(body); len(groups) > 1 {
		return string(groups[1])
	}
	return ""
}
// GetBlogUri downloads the feed for BlogID and returns the blog's home URI
// taken from the feed's <author><uri> element.
//
// The body is first decoded as XML. If decoding fails, the failure is logged
// and the first <uri> element located by ParseBlogUriByRegex is returned
// instead.
//
// An error is returned when the request cannot be built or sent, when the
// body cannot be read, when neither XML decoding nor the regex fallback
// produces a URI (the decoding error is wrapped, so errors.Is and errors.As
// can reach it), or when the feed decodes cleanly but the author URI is empty.
func GetBlogUri(client *http.Client, BlogID string) (string, error) {
	req, err := http.NewRequest("GET", "https://feed.cnblogs.com/blog/u/"+BlogID+"/rss", nil)
	if err != nil {
		return "", err
	}
	req.Header.Add("User-Agent", USER_AGENT)

	resp, err := client.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}

	// Only the part of the feed document that is needed is modelled:
	//
	//	<?xml version="1.0" encoding="utf-8"?>
	//	<feed xmlns="http://www.w3.org/2005/Atom">
	//	  <author>
	//	    <uri>https://www.cnblogs.com/superws/</uri>
	//	  </author>
	//	</feed>
	type Author struct {
		URI string `xml:"uri"`
	}
	type Feed struct {
		XMLName xml.Name `xml:"feed"`
		Author  Author   `xml:"author"`
	}

	var feed Feed
	if err := xml.Unmarshal(body, &feed); err != nil {
		Logger.Println("xml.Unmarshal error", err, "fallback to regex")
		if uri := ParseBlogUriByRegex(body); uri != "" {
			return uri, nil
		}
		// Keep the original message prefix, but retain the cause so callers
		// can tell why decoding failed.
		return "", fmt.Errorf("xml.Unmarshal error: %w", err)
	}

	if feed.Author.URI == "" {
		return "", errors.New("URI is empty")
	}
	return feed.Author.URI, nil
}
// EnsureHomepageOK reports whether htmldata contains every one of a fixed set
// of marker strings. On the first marker that is missing it prints a message
// naming that marker to standard output and returns false.
func EnsureHomepageOK(htmldata string) bool {
	required := [...]string{
		"currentBlogId",
		"currentBlogApp",
		"application/rss+xml",
		"antiforgery_token",
		"poweredby",
	}

	for i := 0; i < len(required); i++ {
		marker := required[i]
		if strings.Contains(htmldata, marker) {
			continue
		}
		fmt.Println("EnsureHomepageOK failed for", marker)
		return false
	}
	return true
}
// homepage_lock is held by GetBlogHomepage for the duration of its one-second
// pause before each request. The zero value is an unlocked mutex.
var homepage_lock sync.Mutex
// GetBlogHomepage fetches the listing at BlogUri for the given page number
// and returns the response body together with the HTTP status code.
//
// Trailing slashes are removed from BlogUri and "?page=<page>" is appended.
// Before the request is sent the function sleeps for one second while holding
// homepage_lock, so concurrent callers get past that point one at a time.
//
// If the request cannot be built or sent the status code is 0. If reading the
// body fails, the status code of the response is returned with the error.
func GetBlogHomepage(client *http.Client, BlogUri string, page int) ([]byte, int, error) {
	// Strip every trailing "/" so the query string attaches to the bare path.
	base := strings.TrimRight(BlogUri, "/")

	homepage_lock.Lock()
	time.Sleep(1 * time.Second)
	homepage_lock.Unlock()

	request, err := http.NewRequest("GET", fmt.Sprintf("%s?page=%d", base, page), nil)
	if err != nil {
		return nil, 0, err
	}
	request.Header.Add("User-Agent", USER_AGENT)

	response, err := client.Do(request)
	if err != nil {
		return nil, 0, err
	}
	defer response.Body.Close()

	content, err := io.ReadAll(response.Body)
	if err != nil {
		return nil, response.StatusCode, err
	}
	return content, response.StatusCode, nil
}
// PostMeta pairs the title of a post with its URL, as collected by
// ParsePostMetasFromHomepage.
type PostMeta struct {
	Title, URL string
}
// ParsePostMetasFromHomepage parses htmlBody as HTML and collects one
// PostMeta for every element with class "postTitle2" that has an href
// attribute, for example:
//
//	<a class="postTitle2 vertical-middle" href="https://www.cnblogs.com/superws/p/5960116.html"><span>...</span></a>
//
// The title is the whitespace-trimmed text of the element's <span>
// descendants and the URL is the href value. If the document cannot be
// parsed, nil and the parse error are returned. If any matching element has
// no href, the entries gathered from the other elements are returned together
// with an "error in query" error.
func ParsePostMetasFromHomepage(htmlBody []byte) ([]PostMeta, error) {
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(htmlBody))
	if err != nil {
		return nil, err
	}

	metas := []PostMeta{}
	missingHref := false

	doc.Find(".postTitle2").Each(func(_ int, link *goquery.Selection) {
		target, ok := link.Attr("href")
		if !ok {
			// Remember the problem but keep scanning the remaining elements.
			missingHref = true
			return
		}
		metas = append(metas, PostMeta{
			Title: strings.TrimSpace(link.Find("span").Text()),
			URL:   target,
		})
	})

	if missingHref {
		return metas, errors.New("error in query")
	}
	return metas, nil
}