-
Notifications
You must be signed in to change notification settings - Fork 780
/
service.py
253 lines (223 loc) · 7.83 KB
/
service.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
from weibo import Weibo, handle_config_renaming
import const
import logging
import logging.config
import os
from flask import Flask, jsonify, request
import sqlite3
import json
from concurrent.futures import ThreadPoolExecutor
import threading
import uuid
import time
from datetime import datetime
# 1896820725 天津股侠 2024-12-09T16:47:04
DATABASE_PATH = './weibo/weibodata.db'
print(DATABASE_PATH)
# 如果日志文件夹不存在,则创建
if not os.path.isdir("log/"):
os.makedirs("log/")
logging_path = os.path.split(os.path.realpath(__file__))[0] + os.sep + "logging.conf"
logging.config.fileConfig(logging_path)
logger = logging.getLogger("api")
config = {
"user_id_list": [
"6067225218",
"1445403190"
],
"only_crawl_original": 1,
"since_date": 1,
"start_page": 1,
"write_mode": [
"csv",
"json",
"sqlite"
],
"original_pic_download": 0,
"retweet_pic_download": 0,
"original_video_download": 0,
"retweet_video_download": 0,
"download_comment": 0,
"comment_max_download_count": 100,
"download_repost": 0,
"repost_max_download_count": 100,
"user_id_as_folder_name": 0,
"remove_html_tag": 1,
"cookie": "your weibo cookie",
"mysql_config": {
"host": "localhost",
"port": 3306,
"user": "root",
"password": "123456",
"charset": "utf8mb4"
},
"mongodb_URI": "mongodb://[username:password@]host[:port][/[defaultauthdb][?options]]",
"post_config": {
"api_url": "https://api.example.com",
"api_token": ""
}
}
app = Flask(__name__)
app.config['JSON_AS_ASCII'] = False # 确保JSON响应中的中文不会被转义
app.config['JSONIFY_MIMETYPE'] = 'application/json;charset=utf-8'
# 添加线程池和任务状态跟踪
executor = ThreadPoolExecutor(max_workers=1) # 限制只有1个worker避免并发爬取
tasks = {} # 存储任务状态
# 在executor定义后添加任务锁相关变量
current_task_id = None
task_lock = threading.Lock()
def get_running_task():
"""获取当前运行的任务信息"""
if current_task_id and current_task_id in tasks:
task = tasks[current_task_id]
if task['state'] in ['PENDING', 'PROGRESS']:
return current_task_id, task
return None, None
def get_config(user_id_list=None):
"""获取配置,允许动态设置user_id_list"""
current_config = config.copy()
if user_id_list:
current_config['user_id_list'] = user_id_list
handle_config_renaming(current_config, oldName="filter", newName="only_crawl_original")
handle_config_renaming(current_config, oldName="result_dir_name", newName="user_id_as_folder_name")
return current_config
def run_refresh_task(task_id, user_id_list=None):
global current_task_id
try:
tasks[task_id]['state'] = 'PROGRESS'
tasks[task_id]['progress'] = 0
config = get_config(user_id_list)
wb = Weibo(config)
tasks[task_id]['progress'] = 50
wb.start() # 爬取微博信息
tasks[task_id]['progress'] = 100
tasks[task_id]['state'] = 'SUCCESS'
tasks[task_id]['result'] = {"message": "微博列表已刷新"}
except Exception as e:
tasks[task_id]['state'] = 'FAILED'
tasks[task_id]['error'] = str(e)
logger.exception(e)
finally:
with task_lock:
if current_task_id == task_id:
current_task_id = None
@app.route('/refresh', methods=['POST'])
def refresh():
global current_task_id
# 获取请求参数
data = request.get_json()
user_id_list = data.get('user_id_list') if data else None
# 验证参数
if not user_id_list or not isinstance(user_id_list, list):
return jsonify({
'error': 'Invalid user_id_list parameter'
}), 400
# 检查是否有正在运行的任务
with task_lock:
running_task_id, running_task = get_running_task()
if running_task:
return jsonify({
'task_id': running_task_id,
'status': 'Task already running',
'state': running_task['state'],
'progress': running_task['progress']
}), 409 # 409 Conflict
# 创建新任务
task_id = str(uuid.uuid4())
tasks[task_id] = {
'state': 'PENDING',
'progress': 0,
'created_at': datetime.now().isoformat(),
'user_id_list': user_id_list
}
current_task_id = task_id
executor.submit(run_refresh_task, task_id, user_id_list)
return jsonify({
'task_id': task_id,
'status': 'Task started',
'state': 'PENDING',
'progress': 0,
'user_id_list': user_id_list
}), 202
@app.route('/task/<task_id>', methods=['GET'])
def get_task_status(task_id):
task = tasks.get(task_id)
if not task:
return jsonify({'error': 'Task not found'}), 404
response = {
'state': task['state'],
'progress': task['progress']
}
if task['state'] == 'SUCCESS':
response['result'] = task.get('result')
elif task['state'] == 'FAILED':
response['error'] = task.get('error')
return jsonify(response)
@app.route('/weibos', methods=['GET'])
def get_weibos():
try:
conn = sqlite3.connect(DATABASE_PATH)
cursor = conn.cursor()
# 按created_at倒序查询所有微博
cursor.execute("SELECT * FROM weibo ORDER BY created_at DESC")
columns = [column[0] for column in cursor.description]
weibos = []
for row in cursor.fetchall():
weibo = dict(zip(columns, row))
weibos.append(weibo)
conn.close()
res1 = json.dumps(weibos, ensure_ascii=False)
print(res1)
res = jsonify(weibos)
print(res)
return res, 200
except Exception as e:
logger.exception(e)
return {"error": str(e)}, 500
@app.route('/weibos/<weibo_id>', methods=['GET'])
def get_weibo_detail(weibo_id):
try:
conn = sqlite3.connect(DATABASE_PATH)
cursor = conn.cursor()
cursor.execute("SELECT * FROM weibo WHERE id=?", (weibo_id,))
columns = [column[0] for column in cursor.description]
row = cursor.fetchone()
conn.close()
if row:
weibo = dict(zip(columns, row))
return jsonify(weibo), 200
else:
return {"error": "Weibo not found"}, 404
except Exception as e:
logger.exception(e)
return {"error": str(e)}, 500
def schedule_refresh():
"""定时刷新任务"""
while True:
try:
# 检查是否有运行中的任务
running_task_id, running_task = get_running_task()
if not running_task:
task_id = str(uuid.uuid4())
tasks[task_id] = {
'state': 'PENDING',
'progress': 0,
'created_at': datetime.now().isoformat(),
'user_id_list': config['user_id_list'] # 使用默认配置
}
with task_lock:
global current_task_id
current_task_id = task_id
executor.submit(run_refresh_task, task_id, config['user_id_list'])
logger.info(f"Scheduled task {task_id} started")
time.sleep(600) # 10分钟间隔
except Exception as e:
logger.exception("Schedule task error")
time.sleep(60) # 发生错误时等待1分钟后重试
if __name__ == "__main__":
# 启动定时任务线程
scheduler_thread = threading.Thread(target=schedule_refresh, daemon=True)
scheduler_thread.start()
logger.info("服务启动")
# 启动Flask应用
app.run(debug=True, use_reloader=False) # 关闭reloader避免启动两次