-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsector.py
106 lines (91 loc) · 3.47 KB
/
sector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import json
import requests
from bs4 import BeautifulSoup
# Taiwan Stock Exchange (TWSE) announcement pages that can be scraped.
# Each entry is (page URL, enabled); only enabled pages end up in `urls`.
_announcement_sources = (
    ("https://isin.twse.com.tw/isin/C_public.jsp?strMode=2", True),   # listed securities
    ("https://isin.twse.com.tw/isin/C_public.jsp?strMode=4", False),  # OTC securities
    ("https://isin.twse.com.tw/isin/C_public.jsp?strMode=5", False),  # emerging-market securities
)
urls = [address for address, enabled in _announcement_sources if enabled]
# Classification rules.
# Every category currently matches exactly one industry label with the same
# name. The values are lists so that a category can match several labels.
_category_names = (
    "半導體業",
    "電腦及週邊設備業",
    "電子零組件業",
    "航運業",
    "通信網路業",
    "其他",
    "汽車工業",
    "橡膠工業",
    "建材營造",
    "玻璃陶瓷",
    "造紙工業",
    "鋼鐵工業",
    "塑膠工業",
    "紡織纖維",
    "生技醫療業",
    "化學工業",
    "水泥工業",
    "油電燃氣業",
    "貿易百貨",
    "觀光事業",
    "資訊服務業",
    "電機機械",
    "電器電纜",
    "金融保險",
    "食品工業",
    "其他電子業",
    "光電業",
    "電子通路業",
    "綜合",
)
category_mapping = {label: [label] for label in _category_names}

# Classification result: one independent, initially empty list per category.
classified_data = {label: [] for label in category_mapping}
# Fetch each announcement page and collect one record per table row.
# `data` maps the security code to its record; if the same code appears
# again (on the same or a later page), the later row overwrites the earlier.
data = {}
total_urls = len(urls)
for index, url in enumerate(urls, start=1):
    print(f"Processing URL {index}/{total_urls}: {url}")
    try:
        # A timeout keeps an unresponsive server from hanging the script.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as error:
        # Best effort: report the failure and move on to the next page,
        # the same way a page without the expected table is handled below.
        print(f"Request failed for URL: {url} ({error})")
        continue
    response.encoding = 'big5'  # decode the response body as Big5

    # Parse the HTML with BeautifulSoup and locate the table of class "h4".
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table', {'class': 'h4'})
    if not table:
        print(f"Table not found for URL: {url}")
        continue

    for row in table.find_all('tr')[1:]:  # skip the header row
        cells = row.find_all('td')
        # Only rows with exactly 7 cells are treated as security records.
        if len(cells) != 7:
            continue
        try:
            # The first cell holds "<code>\u3000<name>" (ideographic space).
            code, name = cells[0].text.split("\u3000")
        except ValueError:
            continue  # the cell did not split into exactly two parts

        internationality = cells[1].text
        list_date = cells[2].text
        market_type = cells[3].text
        industry_type = cells[4].text
        data[code] = {
            "名稱": name,                  # name
            "代號": code,                  # code
            "市場別": market_type,         # market type
            "產業別": industry_type,       # industry type
            "上市日期": list_date,         # listing date
            "國際代碼": internationality,  # international code
        }
# Sort the collected records into their categories.
for code, info in data.items():
    # Only purely numeric codes strictly between 1000 and 10000 are handled.
    if not code.isdigit():
        continue
    if not 1000 < int(code) < 10000:
        continue

    record_industry = info["產業別"]
    # Pick the first category (in mapping order) that lists this industry.
    matched_category = next(
        (
            label
            for label, members in category_mapping.items()
            if record_industry in members
        ),
        None,
    )
    if matched_category is not None:
        classified_data[matched_category].append(info)
# Save the complete data set and the classification result as JSON files.
_outputs = (
    ("stock_infos.json", data),
    ("classified_stocks.json", classified_data),
)
for _filename, _payload in _outputs:
    with open(_filename, "w", encoding="utf-8") as _handle:
        # ensure_ascii=False writes non-ASCII text as-is instead of \u escapes.
        json.dump(_payload, _handle, ensure_ascii=False, indent=2)
print("資料已處理完畢,分別儲存到 stock_infos.json 和 classified_stocks.json")