"""Download SEC EDGAR financial report workbooks (XLSX) and convert them to CSV."""

import os

import pandas as pd
from requests import get

from SECedgarpyExceptions import ErrorFoundWhileGETRequest
from SECedgarpyProcessing import HEAD
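# NOTE: EDGAR rejects anonymous requests, so HEAD (imported above) is assumed
# to carry a descriptive User-Agent roughly along the lines of
#   {"User-Agent": "Sample Company admin@example.com"}
# Its actual contents live in SECedgarpyProcessing and may differ. Reading
# .xlsx files with pd.read_excel also requires the openpyxl package.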
# download each XLSX file in the URL list and convert it to CSV
def getCSVfile(URLlist: list[str], nameOfFile: str) -> None:
    counter = 0
    # iterate through every URL in the list
    for urlelt in URLlist:
        # perform an HTTP GET request to download the XLSX file
        # (requests' timeout is in seconds, so a short value is enough)
        urlresp = get(urlelt, timeout=30, headers=HEAD)
        # check the status code of the response
        statcode = urlresp.status_code
        if statcode == 404:
            # file not found: skip this URL and move on to the next one
            print(f"File not found at {urlelt}. Skipping...")
            continue
        elif statcode != 200:
            # any other non-success status is treated as a fatal error
            raise ErrorFoundWhileGETRequest
        else:
            # request succeeded: increment the counter for unique file names
            counter += 1
            FinalNameXLSX = f"{nameOfFile}_{counter}.xlsx"
            FinalNameCSV = f"{nameOfFile}_{counter}.csv"
            # write the response body to disk in binary mode
            with open(FinalNameXLSX, "wb") as f:
                f.write(urlresp.content)
            print(f"Downloaded file: {FinalNameXLSX}")
            # convert the downloaded XLSX file to CSV
            try:
                excel_data = pd.read_excel(FinalNameXLSX)
                excel_data.to_csv(FinalNameCSV, index=False)
                print(f"Converted {FinalNameXLSX} to {FinalNameCSV}")
            except Exception as e:
                print(f"Failed to convert {FinalNameXLSX} to CSV: {e}")
# getXLSXfile performs the same download-and-convert work as getCSVfile, so it
# delegates to it to keep the logic in one place while the old name keeps working
def getXLSXfile(URLlist: list[str], nameOfFile: str) -> None:
    getCSVfile(URLlist, nameOfFile)
# generate a CSV report from a single workbook, keeping only the target sheets
def GenerateCSVreport(nameOfFile):
    # path of the XLSX file to filter; the parameter is the file path
    xlsx_file_path = nameOfFile
    # target sheet names (lowercase, with spaces instead of underscores)
    target_sheets = ["consolidated statements of income", "consolidated balance sheets"]
    # load the Excel file and extract its sheet names
    xlsx_file = pd.ExcelFile(xlsx_file_path)
    all_sheet_names = xlsx_file.sheet_names
    # normalize sheet names (lowercase, "_" -> " ") for comparison
    formatted_sheet_names = [sheet_name.lower().replace("_", " ") for sheet_name in all_sheet_names]
    # collect the dataframes of the matched sheets
    sheets_to_merge = []
    # match each normalized sheet name against the target list
    for original_sheet, formatted_sheet in zip(all_sheet_names, formatted_sheet_names):
        if formatted_sheet in target_sheets:
            # parse the matching sheet and queue it for merging
            sheet = xlsx_file.parse(original_sheet)
            sheets_to_merge.append(sheet)
            print(f"Sheet {original_sheet} matched and added for merging")
    # concatenate the matched sheets, if any, into a single CSV
    if sheets_to_merge:
        merged_sheets = pd.concat(sheets_to_merge, ignore_index=True)
        # derive the CSV filename from the XLSX file name
        csv_file_name = os.path.splitext(os.path.basename(xlsx_file_path))[0] + ".csv"
        merged_sheets.to_csv(csv_file_name, index=False)
        print(f"Matched sheets have been saved to {csv_file_name}")
    else:
        print("No matching sheets found.")
# combined helper: download each workbook and convert only the filtered sheets
def download_and_convert_filtered_xlsx(URLlist: list[str], nameOfFile: str, target_sheets: list[str]) -> None:
    counter = 0
    for urlelt in URLlist:
        try:
            # perform the HTTP GET request (timeout in seconds)
            urlresp = get(urlelt, timeout=30, headers=HEAD)
            if urlresp.status_code == 404:
                print(f"File not found at {urlelt}. Skipping...")
                continue
            elif urlresp.status_code != 200:
                raise ErrorFoundWhileGETRequest
            # increment the counter and save the XLSX file
            counter += 1
            FinalNameXLSX = f"{nameOfFile}_{counter}.xlsx"
            with open(FinalNameXLSX, "wb") as f:
                f.write(urlresp.content)
            print(f"Downloaded file: {FinalNameXLSX}")
            # convert to CSV, keeping only the filtered sheets
            FinalNameCSV = f"{nameOfFile}_{counter}.csv"
            filter_and_convert_to_csv(FinalNameXLSX, FinalNameCSV, target_sheets)
        except Exception as e:
            print(f"Failed to download or process {urlelt}: {e}")
# filter the relevant sheets out of a workbook and write them to a single CSV
def filter_and_convert_to_csv(xlsx_file_path: str, csv_file_path: str, target_sheets: list[str]) -> None:
    try:
        # load the Excel file and extract its sheet names
        xlsx_file = pd.ExcelFile(xlsx_file_path)
        all_sheet_names = xlsx_file.sheet_names
        # normalize sheet names (lowercase, "_" -> " ") for comparison
        formatted_sheet_names = [sheet_name.lower().replace("_", " ") for sheet_name in all_sheet_names]
        # collect the dataframes of the matched sheets
        sheets_to_merge = []
        # match each normalized sheet name against the target list
        for original_sheet, formatted_sheet in zip(all_sheet_names, formatted_sheet_names):
            if formatted_sheet in target_sheets:
                # parse the matching sheet and queue it for merging
                sheet = xlsx_file.parse(original_sheet)
                sheets_to_merge.append(sheet)
                print(f"Sheet {original_sheet} matched and added for merging")
        # concatenate the matched sheets, if any, and save to CSV
        if sheets_to_merge:
            merged_sheets = pd.concat(sheets_to_merge, ignore_index=True)
            merged_sheets.to_csv(csv_file_path, index=False)
            print(f"Filtered sheets saved to {csv_file_path}")
        else:
            print("No matching sheets found.")
    except Exception as e:
        print(f"Failed to process and convert {xlsx_file_path}: {e}")