diff --git a/README.md b/README.md
index 67ae70f..d99829c 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,18 @@
+# is-github-page
+
 Check whether a website is hosted on GitHub Pages (data from timqian/chinese-independent-blogs)
 
 - main.py automatically batch-checks the sites listed in timqian/chinese-independent-blogs for GH-Pages hosting
 - sigle-check.py checks a manually entered domain/link for GH-Pages hosting
+- gh-pages-check.py
+  - uses the `dnspython` library for DNS lookups
+  - queries a domain's `A`, `CNAME`, and `AAAA` records in turn and reports a match as soon as any answer fits the GitHub Pages fingerprint (sketched below)
+  - nameservers can be set via a command-line option
+  - use the `--urls` option to pass URLs to check directly
+  - use the `--csv` option to check URLs from a remote or local `.csv` file; its format must match [blogs-original.csv](https://raw.githubusercontent.com/timqian/chinese-independent-blogs/master/blogs-original.csv)
+  - use the `--output` option to write the results to a file
+  - use the `--help` option to show the script's usage information
 
 Output: gh-domains.txt
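For context, the Pages fingerprint referenced above can be sketched as standalone Python. This is a simplified illustration under stated assumptions, not part of the patch: `looks_like_pages` is a hypothetical helper name, only `A` records are checked, `pages.github.com` is merely a convenient test domain, and the module-level `dns.resolver.resolve()` call requires dnspython >= 2.0:

```python
# Sketch: a domain counts as GitHub Pages if a DNS answer falls inside one of
# the networks GitHub publishes under the "pages" key of /meta.
import ipaddress
import json
import urllib.request

import dns.exception
import dns.resolver

meta = json.loads(urllib.request.urlopen("https://api.github.com/meta").read())
pages_nets = [ipaddress.ip_network(n) for n in meta.get("pages", [])]


def looks_like_pages(domain: str) -> bool:  # hypothetical helper, not in the patch
    try:
        answers = dns.resolver.resolve(domain, "A")
    except dns.exception.DNSException:  # covers NXDOMAIN, NoAnswer, Timeout, ...
        return False
    return any(
        ipaddress.ip_address(rr.to_text()) in net
        for rr in answers
        for net in pages_nets
    )


print(looks_like_pages("pages.github.com"))  # expected: True
```

The sketch keeps `ip_network` objects and tests membership, while `get_pages_ip()` in the patch below expands each network into individual addresses; both stay cheap as long as GitHub's published entries remain single-address networks (`/32`, `/128`), as they currently appear to be.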
diff --git a/gh-pages-check.py b/gh-pages-check.py
new file mode 100644
index 0000000..b63b8ad
--- /dev/null
+++ b/gh-pages-check.py
@@ -0,0 +1,122 @@
+# coding: utf-8
+
+import argparse
+import ipaddress
+import json
+import logging
+import os
+import urllib.parse
+import urllib.request
+
+import dns.resolver
+
+logger = logging.getLogger(__name__)
+
+
+def get_pages_ip():
+    # GitHub publishes its Pages networks under the "pages" key of /meta.
+    api = "https://api.github.com/meta"
+    meta = urllib.request.urlopen(api)
+
+    pages_network = json.loads(meta.read()).get("pages", [])
+    pages_ip = [i for n in pages_network for i in ipaddress.ip_network(n)]
+
+    return pages_ip
+
+
+def check_domain(domain: str, resolver: dns.resolver.Resolver, pages_ip: list):
+    # *.github.io domains are GitHub Pages by definition.
+    if domain.endswith("github.io"):
+        return True
+
+    for rdtype in ["A", "CNAME", "AAAA"]:
+        try:
+            answers = resolver.resolve(domain, rdtype=rdtype)
+        except dns.resolver.NXDOMAIN:
+            logger.debug(f"The DNS query name does not exist: {domain}")
+            continue
+        except dns.resolver.NoAnswer:
+            logger.debug(
+                f"The DNS response does not contain an answer to the question: {domain} IN {rdtype}")
+            continue
+        except dns.resolver.Timeout:
+            logger.debug(f"The DNS operation has timed out to {domain}")
+            continue
+        except dns.resolver.NoNameservers:
+            logger.debug(
+                f"All nameservers failed to answer the query {domain}")
+            continue
+
+        # A/AAAA answers must fall inside GitHub's Pages ranges;
+        # CNAME answers must point at github.io.
+        for answer in answers.rrset:
+            if rdtype in ["A", "AAAA"]:
+                if ipaddress.ip_address(answer.to_text()) in pages_ip:
+                    return True
+            else:
+                if "github.io" in answer.to_text():
+                    return True
+
+    return False
+
+
+def csv_lines_to_dict_list(csv_lines: list):
+    # Naive comma splitting; assumes the blogs-original.csv layout
+    # (intro, url, rss, tags) with no quoted commas inside fields.
+    header = ["intro", "url", "rss", "tags"]
+
+    csv_list = []
+    for line in csv_lines[1:]:
+        line_dict = {}
+        for k, v in zip(header, [i.strip() for i in line.split(",")]):
+            if k == "tags":
+                v = [t.strip() for t in v.split(";")]
+            line_dict.update({k: v})
+        csv_list.append(line_dict)
+
+    return csv_list
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-n", "--nameservers", default="8.8.8.8,1.0.0.1",
+                        help="comma-separated nameservers, default: %(default)s")
+    parser.add_argument("-u", "--urls", nargs="+",
+                        metavar="URL", help="urls to check")
+    parser.add_argument(
+        "-c", "--csv", help="use remote/local csv file as input")
+    parser.add_argument("-o", "--output", help="output file to write")
+    args = parser.parse_args()
+
+    urls = []
+    if args.urls:
+        urls.extend(args.urls)
+
+    if args.csv and args.csv.startswith("http"):
+        resp = urllib.request.urlopen(args.csv)
+        csv_lines = resp.read().decode().splitlines()
+        csv_list = csv_lines_to_dict_list(csv_lines)
+        urls.extend([item["url"] for item in csv_list])
+
+    if args.csv and os.path.exists(os.path.expanduser(args.csv)):
+        with open(os.path.expanduser(args.csv)) as f:
+            csv_lines = f.readlines()
+        csv_list = csv_lines_to_dict_list(csv_lines)
+        urls.extend([item["url"] for item in csv_list])
+
+    resolver = dns.resolver.Resolver()
+    resolver.nameservers = [ns.strip() for ns in args.nameservers.split(",")]
+    pages_ip = get_pages_ip()
+
+    for url in urls:
+        domain = urllib.parse.urlparse(url).netloc
+        result = check_domain(domain, resolver, pages_ip)
+
+        print(f"{result}, {url}")
+        if args.output:
+            with open(args.output, "a") as f:
+                f.write(f"{result}, {url}\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..2f73596
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+dnspython
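The functions in the new script can also be driven from other Python code rather than through the CLI. A minimal sketch, assuming the file were saved under an importable name such as `gh_pages_check.py` (the actual filename, `gh-pages-check.py`, is not a valid module name) and with `example.com` as a placeholder domain:

```python
# Hypothetical programmatic use of get_pages_ip()/check_domain() from the
# patch above; assumes the script was saved as gh_pages_check.py.
import dns.resolver

from gh_pages_check import check_domain, get_pages_ip

resolver = dns.resolver.Resolver()
resolver.nameservers = ["8.8.8.8", "1.0.0.1"]  # same defaults as the CLI
pages_ip = get_pages_ip()  # one GitHub API call, reused for every domain

for domain in ["pages.github.com", "example.com"]:  # placeholder domains
    print(domain, check_domain(domain, resolver, pages_ip))
```

The CLI equivalent would be `python gh-pages-check.py --urls https://pages.github.com https://example.com`.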