Add post crawling for the four NexT theme schemes, merged with the Yun rule
Remove the bs4 dependency
Remove the legacy feed-rule parsers
release 4.2.3
hiltay committed Feb 10, 2022
1 parent ba83e5d commit 4e5c1b1
Showing 4 changed files with 82 additions and 161 deletions.
README.md (5 changes: 4 additions & 1 deletion)
@@ -6,7 +6,7 @@

⭐ Starting with version 4.1.3, you must configure the friend-link page fetch strategy in the settings
```
-Current release 4.2.2:
+Current release 4.2.3:
- Supports fetching friend links from issues on gitee and github
- Supports friend-link and post crawling for the butterfly, volantis, matery, sakura, fluid, nexmoe, Yun, stun, and stellar themes
- Supports feed rules such as atom and rss (including wordpress-style blogs)
@@ -19,6 +19,7 @@
- Added data-storage configuration with multiple storage backends
- Added deployment configuration; the project can be deployed on a local server
- Merged the api into the main repository
+- Added post crawling for the four NexT theme schemes, merged with the Yun rule; friend-link page crawling is not supported yet
Bug fixes:
- Time-format issue for wordpress-style blogs
@@ -31,5 +32,7 @@ Bug fixes:
- Feed parsing is more accurate
- Fixed crawler errors when the scheduled tasks run the crawler in docker and server deployments
- Improved the logic and formatting of the check for post dates later than the current time
+- Removed the bs4 dependency
+- Removed the legacy feed-rule parsers
```
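
One feed parser can replace the old per-rule atom/rss2/wordpress parsers because feedparser normalizes all of those formats into a single entries structure. A minimal sketch of that idea, with a hypothetical helper name, not the project's actual post_feed_parse:

```
import time
import feedparser

def parse_feed(xml_text, name, avatar):
    # feedparser handles atom, rss 2.0, and wordpress feeds alike
    feed = feedparser.parse(xml_text)
    for entry in feed.entries[:5]:  # the spider keeps at most 5 posts per friend
        # the *_parsed fields are time.struct_time regardless of the feed's
        # native date format, so no per-format date handling is needed
        created = entry.get("published_parsed") or entry.get("updated_parsed")
        updated = entry.get("updated_parsed") or created
        yield {
            "title": entry.get("title", ""),
            "time": time.strftime("%Y-%m-%d", created) if created else "",
            "updated": time.strftime("%Y-%m-%d", updated) if updated else "",
            "link": entry.get("link", ""),
            "name": name,
            "img": avatar,
            "rule": "feed",
        }
```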

hexo_circle_of_friends/requirements.txt (2 changes: 0 additions & 2 deletions)
@@ -3,7 +3,6 @@ arrow==1.2.1
asgiref==3.4.1
attrs==21.2.0
Automat==20.2.0
-beautifulsoup4==4.10.0
certifi==2021.10.8
cffi==1.15.0
charset-normalizer==2.0.8
@@ -47,7 +46,6 @@ Scrapy==2.5.1
secure-cookie==0.2.0
service-identity==21.1.0
six==1.16.0
-soupsieve==2.3.1
SQLAlchemy==1.4.31
starlette==0.14.2
Twisted==21.7.0
Expand Down
hexo_circle_of_friends/settings.py (14 changes: 7 additions & 7 deletions)
@@ -97,7 +97,7 @@

############################## Do not modify anything below unless you understand this project ################################

VERSION = "4.2.2"
VERSION = "4.2.3"

# debug
# debug mode
@@ -116,12 +116,12 @@
# used in debug mode

# https://yun.yunyoujun.cn/demo/ , Yun
-FRIENDPAGE_LINK = [
-    "https://www.yyyzyyyz.cn/link/",  # butterfly
-    "https://akilar.top/link/",  # butterfly
-    "https://www.zyoushuo.cn/friends/",  # volantis
-]
-# FRIENDPAGE_LINK = ["https://www.yyyzyyyz.cn/link/"]
+# FRIENDPAGE_LINK = [
+#     "https://www.yyyzyyyz.cn/link/",  # butterfly
+#     "https://akilar.top/link/",  # butterfly
+#     "https://www.zyoushuo.cn/friends/",  # volantis
+# ]
+FRIENDPAGE_LINK = ["https://www.yyyzyyyz.cn/link/"]


BOT_NAME = 'hexo_circle_of_friends'
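
In practice the strategy is configured by pointing FRIENDPAGE_LINK at one or more friend-link pages; the theme-specific rules are then applied to each page. A hypothetical multi-theme setup with placeholder URLs:

```
FRIENDPAGE_LINK = [
    "https://example-butterfly.blog/link/",     # a butterfly link page
    "https://example-volantis.blog/friends/",   # a volantis friends page
]
```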
hexo_circle_of_friends/spiders/hexo_circle_of_friends.py (222 changes: 71 additions & 151 deletions)
@@ -2,19 +2,33 @@

import datetime
import os
-import time
import scrapy
import queue
import feedparser
from scrapy.http.request import Request
from hexo_circle_of_friends import settings
-from bs4 import BeautifulSoup
from hexo_circle_of_friends.utils.get_url import get_theme_url, Yun_async_link_handler
from hexo_circle_of_friends.utils.regulations import *
from hexo_circle_of_friends.utils.process_time import format_time
-# from hexo_circle_of_friends import items todo use items

+# post_parsers = []
+post_parsers = [
+    "post_feed_parse", "theme_butterfly_parse", "theme_fluid_parse", "theme_matery_parse", "theme_sakura_parse",
+    "theme_volantis_parse", "theme_nexmoe_parse", "theme_next_parse", "theme_stun_parse", "theme_stellar_parse",
+]
+
+feed_suffix = [
+    "atom.xml", "feed/atom", "rss.xml", "rss2.xml", "feed"
+]
+
+# a Request whose dont_filter defaults to True, so the many probes sent to
+# the same domain are not dropped by Scrapy's duplicate filter
+class CRequest(Request):
+    def __init__(self, url, callback=None, meta=None, dont_filter=True,
+                 errback=None,
+                 *args, **kwargs):
+        super(CRequest, self).__init__(url, callback, meta=meta, dont_filter=dont_filter,
+                                       errback=errback, *args, **kwargs)
+
+# from hexo_circle_of_friends import items todo use items
class FriendpageLinkSpider(scrapy.Spider):
    name = 'hexo_circle_of_friends'
    allowed_domains = ['*']
@@ -25,7 +39,7 @@ def __init__(self, name=None, **kwargs):
        self.friend_list = queue.Queue()
        self.today = datetime.datetime.now().strftime('%Y-%m-%d')

-        super().__init__(name, **kwargs)
+        super(FriendpageLinkSpider, self).__init__(name, **kwargs)

    def start_requests(self):
        # load the friend-link list from the settings file
@@ -72,7 +86,7 @@ def friend_poor_parse(self, response):
        if main_content:
            for item in main_content:
                issueslink = response.meta["gitee"]["domain"] + item
-                yield Request(issueslink, self.friend_poor_parse, meta={"gitee-issues": None}, dont_filter=True)
+                yield CRequest(issueslink, self.friend_poor_parse, meta={"gitee-issues": None})
        if "gitee-issues" in response.meta.keys():
            try:
                content = ''.join(response.css("code *::text").extract())
@@ -93,7 +107,7 @@ def friend_poor_parse(self, response):
        if main_content:
            for item in main_content:
                issueslink = response.meta["github"]["domain"] + item
-                yield Request(issueslink, self.friend_poor_parse, meta={"github-issues": None}, dont_filter=True)
+                yield CRequest(issueslink, self.friend_poor_parse, meta={"github-issues": None})
        if "github-issues" in response.meta.keys():
            try:
                content = ''.join(response.css("pre *::text").extract())
Expand All @@ -116,57 +130,28 @@ def friend_poor_parse(self, response):
            async_link = get_theme_url(theme, response, self.friend_poor)
            if async_link:
                # temporary workaround for the Yun theme's async_link
-                yield Request(async_link, callback=self.friend_poor_parse, meta={"async_link": async_link},
-                              dont_filter=True)
+                yield CRequest(async_link, self.friend_poor_parse, meta={"async_link": async_link})
            else:
                pass
        if "async_link" in response.meta.keys():
            Yun_async_link_handler(response, self.friend_poor)

        # to add a theme extension, add a request here
        while not self.friend_poor.empty():
            friend = self.friend_poor.get()
            friend[1] += "/" if not friend[1].endswith("/") else ""
            if settings.SETTINGS_FRIENDS_LINKS['enable'] and len(friend) == 4:
                url = friend[1] + friend[3]
-                yield Request(url, callback=self.post_feed_parse, meta={"friend": friend}, dont_filter=True,
-                              errback=self.errback_handler)
+                yield CRequest(url, self.post_feed_parse, meta={"friend": friend}, errback=self.errback_handler)
                self.friend_list.put(friend[:3])
                continue
            self.friend_list.put(friend)
-            yield Request(friend[1] + "atom.xml", callback=self.post_feed_parse, meta={"friend": friend},
-                          dont_filter=True, errback=self.errback_handler)
-            yield Request(friend[1] + "feed/atom", callback=self.post_feed_parse, meta={"friend": friend},
-                          dont_filter=True, errback=self.typecho_errback_handler)
-            yield Request(friend[1] + "rss.xml", callback=self.post_feed_parse, meta={"friend": friend},
-                          dont_filter=True, errback=self.errback_handler)
-            yield Request(friend[1] + "rss2.xml", callback=self.post_feed_parse, meta={"friend": friend},
-                          dont_filter=True, errback=self.errback_handler)
-            yield Request(friend[1] + "feed", callback=self.post_feed_parse, meta={"friend": friend},
-                          dont_filter=True, errback=self.errback_handler)
-            yield Request(friend[1], callback=self.theme_butterfly_parse, meta={"friend": friend}, dont_filter=True,
-                          errback=self.errback_handler)
-            yield Request(friend[1], callback=self.theme_fluid_parse, meta={"friend": friend}, dont_filter=True,
-                          errback=self.errback_handler)
-            yield Request(friend[1], callback=self.theme_matery_parse, meta={"friend": friend}, dont_filter=True,
-                          errback=self.errback_handler)
-            yield Request(friend[1], callback=self.theme_sakura_parse, meta={"friend": friend}, dont_filter=True,
-                          errback=self.errback_handler)
-            yield Request(friend[1], callback=self.theme_volantis_parse, meta={"friend": friend}, dont_filter=True,
-                          errback=self.errback_handler)
-            yield Request(friend[1], callback=self.theme_nexmoe_parse, meta={"friend": friend}, dont_filter=True,
-                          errback=self.errback_handler)
-            yield Request(friend[1], callback=self.theme_Yun_parse, meta={"friend": friend}, dont_filter=True,
-                          errback=self.errback_handler)
-            yield Request(friend[1], callback=self.theme_stun_parse, meta={"friend": friend}, dont_filter=True,
-                          errback=self.errback_handler)
-            yield Request(friend[1], callback=self.theme_stellar_parse, meta={"friend": friend}, dont_filter=True,
-                          errback=self.errback_handler)

            # friend = ['小冰博客', 'https://shujin.fun/', 'https://zfe.space/images/headimage.png']
+            for r in self.start_post_requests(friend[1], post_parsers, feed_suffix, meta={"friend": friend}):
+                yield r

            # friend = ['小冰博客', 'https://blog.zzbd.org/', 'https://zfe.space/images/headimage.png']
            # friend = ['小冰博客', 'https://copur.xyz/', 'https://zfe.space/images/headimage.png']
            # [[1,1,1],[2,3,2]]
            # yield Request(friend[1], callback=self.theme_stellar_parse, meta={"friend": friend}, dont_filter=True,
            #               errback=self.errback_handler)
+            # yield CRequest(friend[1], callback=self.theme_next_parse, meta={"friend": friend})

        # hand the collected friend list to the pipeline
        while not self.friend_list.empty():
@@ -178,6 +163,15 @@ def friend_poor_parse(self, response):
userdata["userdata"] = "userdata"
yield userdata

+    def start_post_requests(self, domain, parsers, suffixs, meta, errback=None):
+        # fan out one CRequest per post parser on the domain root; the feed
+        # parser is additionally tried against each common feed suffix
+        errback = self.errback_handler if not errback else errback
+        for p in parsers:
+            parser = getattr(self, p)
+            if p == "post_feed_parse":
+                for suffix in suffixs:
+                    yield CRequest(domain + suffix, parser, meta, errback=errback)
+            yield CRequest(domain, parser, meta, errback=errback)

    def post_feed_parse(self, response):
        # print("post_feed_parse---------->" + response.url)
        friend = response.meta.get("friend")
@@ -223,91 +217,6 @@ def post_feed_parse(self, response):
        except:
            pass

-    def post_atom_parse(self, response):
-        # print("post_atom_parse---------->" + response.url)
-        friend = response.meta.get("friend")
-        soup = BeautifulSoup(response.text, "html.parser")
-        items = soup.find_all("entry")
-        if items:
-            if 0 < len(items) < 5:
-                l = len(items)
-            else:
-                l = 5
-            try:
-                for i in range(l):
-                    post_info = {}
-                    item = items[i]
-                    title = item.find("title").text
-                    url = item.find("link")['href']
-                    date = item.find("published").text[:10]
-                    updated = item.find("updated").text[:10]
-                    post_info['title'] = title
-                    post_info['time'] = date
-                    post_info['updated'] = updated
-                    post_info['link'] = url
-                    post_info['name'] = friend[0]
-                    post_info['img'] = friend[2]
-                    post_info['rule'] = "atom"
-                    yield post_info
-            except:
-                pass

-    def post_rss2_parse(self, response):
-        # print("post_rss2_parse---------->" + response.url)
-        friend = response.meta.get("friend")
-        sel = scrapy.Selector(text=response.text)
-        title = sel.css("item title::text").extract()
-        link = sel.css("item guid::text").extract()
-        pubDate = sel.css("item pubDate::text").extract()
-        if len(link) > 0:
-            l = len(link) if len(link) < 5 else 5
-            try:
-                for i in range(l):
-                    m = pubDate[i].split(" ")
-                    ts = time.strptime(m[3] + "-" + m[2] + "-" + m[1], "%Y-%b-%d")
-                    date = time.strftime("%Y-%m-%d", ts)
-                    if link[i].startswith("/"):
-                        link[i] = friend[1] + link[i].split("/", 1)[1]
-                    post_info = {
-                        'title': title[i],
-                        'time': date,
-                        'updated': date,
-                        'link': link[i],
-                        'name': friend[0],
-                        'img': friend[2],
-                        'rule': "rss"
-                    }
-                    yield post_info
-            except:
-                pass

-    def post_wordpress_parse(self, response):
-        # print("post_wordpress_parse---------->" + response.url)
-        friend = response.meta.get("friend")
-        sel = scrapy.Selector(text=response.text)
-        title = sel.css("item title::text").extract()
-        link = [comm.split("#comments")[0] for comm in sel.css("item link+comments::text").extract()]
-        pubDate = sel.css("item pubDate::text").extract()
-        if len(link) > 0:
-            l = len(link) if len(link) < 5 else 5
-            try:
-                for i in range(l):
-                    m = pubDate[i].split(" ")
-                    ts = time.strptime(m[3] + "-" + m[2] + "-" + m[1], "%Y-%b-%d")
-                    date = time.strftime("%Y-%m-%d", ts)
-                    post_info = {
-                        'title': title[i],
-                        'time': date,
-                        'updated': date,
-                        'link': link[i],
-                        'name': friend[0],
-                        'img': friend[2],
-                        'rule': "wordpress"
-                    }
-                    yield post_info
-            except:
-                pass

    def theme_butterfly_parse(self, response):
        # print("theme_butterfly_parse---------->" + response.url)
        friend = response.meta.get("friend")
@@ -463,29 +372,6 @@ def theme_nexmoe_parse(self, response):
        except:
            pass

-    def theme_Yun_parse(self, response):
-        # print("theme_Yun_parse---------->" + response.url)
-        friend = response.meta.get("friend")
-        titles = response.css("article .post-title a::text").extract()
-        links = response.css("article link::attr(href)").extract()
-        createds = response.css("article time[itemprop*=dateCreated]::text").extract()
-        updateds = response.css("article time[itemprop=dateModified]::text").extract()
-        try:
-            l = len(links) if len(links) < 5 else 5
-            titles = self.process_title(titles, l)
-            createds, updateds = self.process_time(createds, updateds, l)
-            init_post_info = self.init_post_info(friend, "Yun")
-            for i in range(l):
-                link = self.process_link(links[i], friend[1])
-                yield self.generate_postinfo(
-                    init_post_info,
-                    titles[i],
-                    createds[i] if createds else self.today,
-                    updateds[i] if updateds else self.today,
-                    link
-                )
-        except:
-            pass

    def theme_stun_parse(self, response):
        # print("theme_stun_parse---------->" + response.url)
@@ -535,6 +421,37 @@ def theme_stellar_parse(self, response):
        except:
            pass

+    def theme_next_parse(self, response):
+        # print("theme_next_parse---------->" + response.url)
+        friend = response.meta.get("friend")
+        # the NexT schemes mark up post titles differently, so try each
+        # candidate selector and keep the one that matches the most links
+        base_css = ["article h2", "article .post-title", "article .post-title-link"]
+        links_l = []
+        for css in base_css:
+            links = response.css("%s a:first-child::attr(href)" % css).extract()
+            links_l.append(len(links))
+        ind = links_l.index(max(links_l))
+        links = response.css("%s a:first-child::attr(href)" % base_css[ind]).extract()
+        titles = response.css("%s a:first-child::text" % base_css[ind]).extract()
+        createds = response.css("article time[itemprop*=dateCreated]::text").extract()
+        updateds = response.css("article time[itemprop=dateModified]::text").extract()
+        try:
+            l = len(links) if len(links) < 5 else 5
+            titles = self.process_title(titles, l)
+            createds, updateds = self.process_time(createds, updateds, l)
+            init_post_info = self.init_post_info(friend, "next/Yun")
+            for i in range(l):
+                link = self.process_link(links[i], friend[1])
+                yield self.generate_postinfo(
+                    init_post_info,
+                    titles[i],
+                    createds[i] if createds else self.today,
+                    updateds[i] if updateds else self.today,
+                    link
+                )
+        except:
+            pass

    def init_post_info(self, friend, rule):
        post_info = {
            "name": friend[0],
@@ -605,3 +522,6 @@ def errback_handler(self, error):
    def typecho_errback_handler(self, error):
        yield Request(error.request.url, callback=self.post_feed_parse, dont_filter=True, meta=error.request.meta,
                      errback=self.errback_handler)
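
The shape of the refactor is easiest to see outside Scrapy: every parser name in post_parsers probes the friend's domain root, and the feed parser additionally probes each common feed suffix, exactly the fan-out that the removed wall of yield Request(...) calls spelled out by hand. A standalone sketch with a (parser, url) pair standing in for each CRequest (parser list truncated for brevity):

```
post_parsers = ["post_feed_parse", "theme_butterfly_parse", "theme_next_parse"]
feed_suffix = ["atom.xml", "feed/atom", "rss.xml", "rss2.xml", "feed"]

def fan_out(domain):
    # mirrors start_post_requests: one probe per parser on the domain root,
    # plus one probe per feed suffix for the feed parser
    for name in post_parsers:
        if name == "post_feed_parse":
            for suffix in feed_suffix:
                yield name, domain + suffix
        yield name, domain

for parser, url in fan_out("https://example.com/"):
    print(parser, "->", url)
```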


