From e4e557377cf93b971e119a1261ccee89f8762c54 Mon Sep 17 00:00:00 2001 From: Jack Date: Fri, 22 Mar 2024 23:44:48 +0800 Subject: [PATCH] :truck: chore(spider file): rename spider file --- create_markdown.py => telegram_spider.py | 246 +++++++++++------------ 1 file changed, 123 insertions(+), 123 deletions(-) rename create_markdown.py => telegram_spider.py (97%) diff --git a/create_markdown.py b/telegram_spider.py similarity index 97% rename from create_markdown.py rename to telegram_spider.py index 133dfbd..e76d9f5 100644 --- a/create_markdown.py +++ b/telegram_spider.py @@ -1,123 +1,123 @@ -import codecs -import posixpath -import re -from itertools import chain -from urllib.parse import urljoin -import random -import requests -from jinja2 import Template -from lxml import etree - - -class CreateMarkdown: - """ Create GitHub Markdown """ - - def __init__(self): - self.url = 'https://github.com/jackhawks/rectg' - self.template_file = '_template.md' - - def readme_handler(self): - readme_url = posixpath.join(self.url, "blob/main/README.md") - response = requests.get(readme_url) - html = etree.HTML(response.text) - elements = html.xpath('//*[contains(@href,"t.me")]/@href') - for element in elements: - yield element.replace('\\"', '') - - def issues_handler(self): - issues_url = posixpath.join(self.url, "issues") - response = requests.get(issues_url) - html = etree.HTML(response.text) - elements = html.xpath("//div[contains(@role,'group')]//a[contains(@id,'issue_')]/@href") - for element in elements: - issues_title_url = urljoin(self.url, element) - iss_resp = requests.get(issues_title_url) - iss_html = etree.HTML(iss_resp.text) - iss_elements = iss_html.xpath("//a[contains(@href,'t.me')]/@href")[0] - yield iss_elements - - def url_join(self, *args): - return chain(*args) - - def get_info(self, urls): - for idx, url in enumerate(urls): - - print(idx, ' ---> ', url) - - response = requests.get(url) - html = etree.HTML(response.text) - - tg_me_page_url = url - - try: - tg_me_page_title_raw = dict(enumerate(html.xpath( - "//div[contains(@class,'tgme_page')]//div[contains(@class,'tgme_page_title')]//span/text()"))).get( - 0) - tg_me_page_title = tg_me_page_title_raw.replace('|', '') - except: - continue - - tg_me_page_extra = dict(enumerate( - html.xpath( - "//div[contains(@class,'tgme_page')]//div[contains(@class,'tgme_page_extra')]/text()"))).get( - 0) - - try: - tg_me_page_description_raw = dict(enumerate(html.xpath( - "//div[contains(@class,'tgme_page')]//div[contains(@class,'tgme_page_description')]/text()"))).get( - 0) - if 'If you have' in tg_me_page_description_raw: - continue - - tg_me_page_description = tg_me_page_description_raw.replace('|', '') - - except: - tg_me_page_description = None - - # 数据处理 - tg_me_audience = None - tg_me_category = None - if '@' in tg_me_page_extra: - tg_me_category = '机器人' - tg_me_audience = None - elif 'subscribers' in tg_me_page_extra: - tg_me_category = '频道' - tg_me_audience = re.match(r'\d+', re.sub(' ', '', tg_me_page_extra)).group() - elif 'members' in tg_me_page_extra: - tg_me_category = '群组' - tg_me_audience = re.match(r'\d+', re.sub(' ', '', tg_me_page_extra)).group() - - yield { - 'tg_me_page_url': tg_me_page_url, - 'tg_me_page_title': tg_me_page_title, - 'tg_me_audience': tg_me_audience, - 'tg_me_page_description': tg_me_page_description, - 'tg_me_category': tg_me_category, - } - - def create_md(self, repo): - with open('_template.md', 'r', encoding='utf-8') as file: - template = Template(file.read(), trim_blocks=True) - rendered_file = template.render(repo=repo) - output_file = codecs.open("README.md", "w", "utf-8") - output_file.write(rendered_file) - output_file.close() - - def shuffle(self, generator): - lst = list(generator) - lst = list(set(lst)) - random.shuffle(lst) - return (y for y in lst) - - def start(self): - issues = self.issues_handler() - readme = self.readme_handler() - urls = self.url_join(issues, readme) - suf = self.shuffle(urls) - info = self.get_info(suf) - self.create_md(info) - - -if __name__ == '__main__': - cm = CreateMarkdown() - cm.start() +import codecs +import posixpath +import re +from itertools import chain +from urllib.parse import urljoin +import random +import requests +from jinja2 import Template +from lxml import etree + + +class CreateMarkdown: + """ Create GitHub Markdown """ + + def __init__(self): + self.url = 'https://github.com/jackhawks/rectg' + self.template_file = '_template.md' + + def readme_handler(self): + readme_url = posixpath.join(self.url, "blob/main/README.md") + response = requests.get(readme_url) + html = etree.HTML(response.text) + elements = html.xpath('//*[contains(@href,"t.me")]/@href') + for element in elements: + yield element.replace('\\"', '') + + def issues_handler(self): + issues_url = posixpath.join(self.url, "issues") + response = requests.get(issues_url) + html = etree.HTML(response.text) + elements = html.xpath("//div[contains(@role,'group')]//a[contains(@id,'issue_')]/@href") + for element in elements: + issues_title_url = urljoin(self.url, element) + iss_resp = requests.get(issues_title_url) + iss_html = etree.HTML(iss_resp.text) + iss_elements = iss_html.xpath("//a[contains(@href,'t.me')]/@href")[0] + yield iss_elements + + def url_join(self, *args): + return chain(*args) + + def get_info(self, urls): + for idx, url in enumerate(urls): + + print(idx, ' ---> ', url) + + response = requests.get(url) + html = etree.HTML(response.text) + + tg_me_page_url = url + + try: + tg_me_page_title_raw = dict(enumerate(html.xpath( + "//div[contains(@class,'tgme_page')]//div[contains(@class,'tgme_page_title')]//span/text()"))).get( + 0) + tg_me_page_title = tg_me_page_title_raw.replace('|', '') + except: + continue + + tg_me_page_extra = dict(enumerate( + html.xpath( + "//div[contains(@class,'tgme_page')]//div[contains(@class,'tgme_page_extra')]/text()"))).get( + 0) + + try: + tg_me_page_description_raw = dict(enumerate(html.xpath( + "//div[contains(@class,'tgme_page')]//div[contains(@class,'tgme_page_description')]/text()"))).get( + 0) + if 'If you have' in tg_me_page_description_raw: + continue + + tg_me_page_description = tg_me_page_description_raw.replace('|', '') + + except: + tg_me_page_description = None + + # 数据处理 + tg_me_audience = None + tg_me_category = None + if '@' in tg_me_page_extra: + tg_me_category = '机器人' + tg_me_audience = None + elif 'subscribers' in tg_me_page_extra: + tg_me_category = '频道' + tg_me_audience = re.match(r'\d+', re.sub(' ', '', tg_me_page_extra)).group() + elif 'members' in tg_me_page_extra: + tg_me_category = '群组' + tg_me_audience = re.match(r'\d+', re.sub(' ', '', tg_me_page_extra)).group() + + yield { + 'tg_me_page_url': tg_me_page_url, + 'tg_me_page_title': tg_me_page_title, + 'tg_me_audience': tg_me_audience, + 'tg_me_page_description': tg_me_page_description, + 'tg_me_category': tg_me_category, + } + + def create_md(self, repo): + with open('_template.md', 'r', encoding='utf-8') as file: + template = Template(file.read(), trim_blocks=True) + rendered_file = template.render(repo=repo) + output_file = codecs.open("README.md", "w", "utf-8") + output_file.write(rendered_file) + output_file.close() + + def shuffle(self, generator): + lst = list(generator) + lst = list(set(lst)) + random.shuffle(lst) + return (y for y in lst) + + def start(self): + issues = self.issues_handler() + readme = self.readme_handler() + urls = self.url_join(issues, readme) + suf = self.shuffle(urls) + info = self.get_info(suf) + self.create_md(info) + + +if __name__ == '__main__': + cm = CreateMarkdown() + cm.start()