fetch_player.py

import glob
import re
import scraping as sp
from typing import List, Tuple
import crawling
import write_db as wd
import datetime
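
# This module leans on three local helpers. The interfaces below are assumptions
# inferred from how they are used in this file, not guaranteed by this module:
#   crawling.Page              - takes a URL; send_request() fills .res (response)
#                                and .soup (parsed HTML) and keeps the URL on .url.
#   write_db.DbOperator        - takes a database name; exposes a DB connection as .cnn.
#   scraping.PlayerPageScraper - takes a saved HTML file path; take_player_profile()
#                                returns a dict of profile fields.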


class MemberPage(crawling.Page):
    # Shared across all MemberPage instances: every fetched player profile URL
    # is accumulated here so it can be read back after crawling all teams.
    player_list: list = []

    @classmethod
    def append_player_url_list(cls, player_url: str) -> None:
        cls.player_list.append(player_url)

    @classmethod
    def get_player_url_list(cls) -> list:
        return cls.player_list

    def fetch_players(self) -> list:
        """Collect the profile URL of every player listed on this member page."""
        url_list: list = []
        root_url = 'https://baseball.yahoo.co.jp'
        tr_list = self.soup.select('#tm_plyr > tr')
        for tr in tr_list:
            td_list = tr.select('td')
            # The second cell holds the link to the player's profile page.
            player_profile_url = root_url + td_list[1].a.get('href')
            url_list.append(player_profile_url)
            self.append_player_url_list(player_profile_url)
        return url_list


class PlayerPage(crawling.Page):
    def storage_html(self) -> None:
        """Save the fetched profile page under ./HTML/player/<player_id>.html."""
        # Profile URLs look like https://baseball.yahoo.co.jp/npb/player/<id>/...,
        # so the sixth path component is the player id.
        player_id = self.url.split('/')[5]
        player_dir: str = './HTML/player'
        html_name: str = f'{player_id}.html'
        # Assumes the ./HTML/player directory already exists.
        with open(player_dir + '/' + html_name, 'w', encoding='utf-8') as f:
            f.write(self.res.text)
        print(f'Done(player): {player_id}')


class PlayerDbWrite(wd.DbOperator):
    def write_player(self, insert_data_list: List[Tuple[int, str, str, int, str, str, int, int, str, str, int, str, int]]) -> None:
        # Each tuple must supply all 13 columns of the player table.
        cur = self.cnn.cursor()
        cur.executemany(
            'INSERT INTO player VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
            insert_data_list)
        self.cnn.commit()

    def take_player_id_list(self) -> List[Tuple[int]]:
        # Returns the ids already stored, so already-written players can be skipped.
        cur = self.cnn.cursor()
        cur.execute('select id from player')
        return cur.fetchall()
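
# A minimal sketch (an assumption, not defined anywhere in this module) of the
# `player` table that the 13-column INSERT above expects. Column names follow
# the profile fields used in write_db() below; the types are guesses.
#
#   CREATE TABLE IF NOT EXISTS player (
#       id             INTEGER PRIMARY KEY,
#       player_name    TEXT,
#       team_name      TEXT,
#       uniform_number INTEGER,
#       position       TEXT,
#       date_of_birth  TEXT,
#       height         INTEGER,
#       weight         INTEGER,
#       throw_arm      TEXT,
#       batting_arm    TEXT,
#       draft_year     INTEGER,
#       draft_rank     TEXT,
#       total_year     INTEGER
#   )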


def write_db():
    """Parse every saved player HTML file and insert new players into the DB."""
    now_year = datetime.date.today().year
    db_name = now_year
    playerDbWriter = PlayerDbWrite(db_name)
    player_files = glob.glob('./HTML/player/*.html')
    player_id_list = playerDbWriter.take_player_id_list()
    save_data_list = []
    for file in player_files:
        playerPage = sp.PlayerPageScraper(file)
        pdd = playerPage.take_player_profile()  # player_data_dict
        # The file name is '<player_id>.html', so stripping non-digits yields the id.
        player_id = int(re.sub(r'\D+', '', file))
        # Skip players that are already in the database.
        if (player_id,) in player_id_list:
            continue
        save_data_list.append(
            (player_id, pdd['player_name'], pdd['team_name'], pdd['uniform_number'], pdd['position'],
             pdd['date_of_birth'], pdd['height'], pdd['weight'], pdd['throw_arm'],
             pdd['batting_arm'], pdd['draft_year'], pdd['draft_rank'], pdd['total_year']))
    playerDbWriter.write_player(save_data_list)


def fetch_player_html():
    """Crawl each team's member lists and save every player's profile page."""
    # Yahoo! JAPAN baseball team ids to crawl.
    team_number_list = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '11', '12', '376']
    root_url = 'https://baseball.yahoo.co.jp/npb/teams'
    for team_num in team_number_list:
        # Crawl both the pitcher ('kind=p') and batter ('kind=b') member lists.
        for kind in ('p', 'b'):
            memberPage = MemberPage(f'{root_url}/{team_num}/memberlist?kind={kind}')
            memberPage.send_request()
            memberPage.fetch_players()
    # fetch_players() accumulates every profile URL on the class-level list.
    player_url_list = MemberPage.get_player_url_list()
    for player_url in player_url_list:
        playerPage = PlayerPage(player_url)
        playerPage.send_request()
        playerPage.storage_html()


if __name__ == '__main__':
    fetch_player_html()
    write_db()