forked from arsen41531/opendatam-egov-am-budget-parser
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbudget_parser.py
executable file
·69 lines (53 loc) · 2.15 KB
/
budget_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/usr/bin/env python
import datetime
import logging
from pathlib import Path
import pandas as pd
import requests
# Read the clock once so the year and the date stamp come from the same instant.
NOW = datetime.datetime.now()
CURRENT_YEAR = NOW.year
CURRENT_DATE = f"{NOW:%Y-%m-%d}"

# Root of the site that every download URL is built from.
BASE_URL = "https://www.e-gov.am"

# XML files requested for each budget year.
XML_FILES = [
    "GOV_BUDGET.XML",
    "GOV_CONTR.XML",
    "GOV_GROUP.XML",
    "GOV_CONTR_F.XML",
    "GOV_ITEM.XML",
    "GOV_MIN.XML",
]

# Timestamped INFO-level logging for progress messages.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s %(message)s",
)
def fetch_gov_budget_data(
    year: int, xml_file: str, *, timeout: float = 60.0
) -> pd.DataFrame:
    """Download one budget XML file from e-gov.am and parse it into a DataFrame.

    Args:
        year: Budget year to fetch. The current year is served from a
            different location than archived years.
        xml_file: Name of the XML file to download (e.g. ``"GOV_BUDGET.XML"``).
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole run.

    Returns:
        A DataFrame with one row per ``ROW`` element found in the document.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    # Local import keeps this block self-contained; the file does not import io.
    import io

    # for current year, data is in a different location
    if year == CURRENT_YEAR:
        url_prefix = f"{BASE_URL}/interactive/data"
    else:
        url_prefix = f"{BASE_URL}/budget_archive/{year}/data"

    # some servers block requests with default user agent
    # fake user agent to avoid this
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(
        f"{url_prefix}/{xml_file}", headers=headers, timeout=timeout
    )
    # Fail here with a clear HTTP error instead of handing an error page
    # to the XML parser.
    response.raise_for_status()

    # Passing a literal XML string to read_xml is deprecated in pandas;
    # wrapping the decoded text in StringIO keeps the same parsing behavior.
    return pd.read_xml(io.StringIO(response.text), xpath=".//ROW")
if __name__ == "__main__":
    # Every file of this run goes into one folder named after the run date.
    # Created once up front; parents=True so a missing "_data" directory
    # does not abort the run.
    data_folder = Path.cwd() / "_data" / CURRENT_DATE
    data_folder.mkdir(parents=True, exist_ok=True)

    for year in range(2016, CURRENT_YEAR + 1):
        if year == 2018:
            # data for 2018 is not available
            logging.info("Skipping %s...", year)
            continue

        for xml_file in XML_FILES:
            logging.info("Fetching %s for %s...", xml_file, year)
            contents = fetch_gov_budget_data(year, xml_file)

            # e.g. "2016-gov_budget.xml"; the suffix is swapped per output format.
            base_name = Path(f"{year}-{xml_file.lower()}")

            csv_filename = base_name.with_suffix(".csv")
            csv_filepath = data_folder / csv_filename
            logging.info("Writing %s...", csv_filename)
            # Let pandas write the file itself: explicit UTF-8 and no second
            # newline translation of an already-terminated CSV string.
            contents.to_csv(csv_filepath, index=False, encoding="utf-8")

            json_filename = base_name.with_suffix(".json")
            json_filepath = data_folder / json_filename
            logging.info("Writing %s...", json_filename)
            # force_ascii=False emits raw non-ASCII characters, so the file
            # encoding must be explicit rather than the platform default.
            json_filepath.write_text(
                contents.to_json(orient="records", force_ascii=False, indent=2),
                encoding="utf-8",
            )