-
Notifications
You must be signed in to change notification settings - Fork 75
/
Copy path3-data-transform-solutions.py
168 lines (146 loc) · 6.72 KB
/
3-data-transform-solutions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
print(
"################################################################################"
)
print("Use standard python libraries to do the transformations")
print(
"################################################################################"
)
import csv
# Question: How do you read data from a CSV file into a list of dictionaries?
data = []
with open("./data/sample_data.csv", "r", newline="") as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
data.append(row)
# Question: How do you remove duplicate rows based on customer ID?
data_unique = []
customer_ids_seen = set()
for row in data:
if row["Customer_ID"] not in customer_ids_seen:
data_unique.append(row)
customer_ids_seen.add(row["Customer_ID"])
else:
print(f'duplicate customer id {row["Customer_ID"]}')
# Question: How do you handle missing values by replacing them with 0?
for row in data_unique:
if not row["Age"]:
print(f'Customer {row["Customer_Name"]} does not have Age value')
row["Age"] = 0
if not row["Purchase_Amount"]:
row["Purchase_Amount"] = 0.0
# Question: How do you remove outliers such as age > 100 or purchase amount > 1000?
data_cleaned = [
row
for row in data_unique
if int(row["Age"]) <= 100 and float(row["Purchase_Amount"]) <= 1000
]
# Question: How do you convert the Gender column to a binary format (0 for Female, 1 for Male)?
for row in data_cleaned:
if row["Gender"] == "Female":
row["Gender"] = 0
elif row["Gender"] == "Male":
row["Gender"] = 1
# Question: How do you split the Customer_Name column into separate First_Name and Last_Name columns?
for row in data_cleaned:
first_name, last_name = row["Customer_Name"].split(" ", 1)
row["First_Name"] = first_name
row["Last_Name"] = last_name
del row["Customer_Name"]
# Question: How do you calculate the total purchase amount by Gender?
total_purchase_by_gender = {}
for row in data_cleaned:
total_purchase_by_gender[row["Gender"]] += float(row["Purchase_Amount"])
# Question: How do you calculate the average purchase amount by Age group?
# assume age_groups is the grouping we want
# hint: Why do we convert to float?
age_groups = {"18-30": [], "31-40": [], "41-50": [], "51-60": [], "61-70": []}
for row in data_cleaned:
age = int(row["Age"])
if age <= 30:
age_groups["18-30"].append(float(row["Purchase_Amount"]))
elif age <= 40:
age_groups["31-40"].append(float(row["Purchase_Amount"]))
elif age <= 50:
age_groups["41-50"].append(float(row["Purchase_Amount"]))
elif age <= 60:
age_groups["51-60"].append(float(row["Purchase_Amount"]))
else:
age_groups["61-70"].append(float(row["Purchase_Amount"]))
average_purchase_by_age_group = {
group: sum(amounts) / len(amounts) for group, amounts in age_groups.items()
}
# Question: How do you print the results for total purchase amount by Gender and average purchase amount by Age group?
print("Total purchase amount by Gender:", total_purchase_by_gender)
print("Average purchase amount by Age group:", average_purchase_by_age_group)
print(
"################################################################################"
)
print("Use DuckDB to do the transformations")
print(
"################################################################################"
)
import duckdb
# Question: How do you connect to DuckDB and load data from a CSV file into a DuckDB table?
# Connect to DuckDB and load data
con = duckdb.connect(database=":memory:", read_only=False)
con.execute(
"CREATE TABLE data (Customer_ID INTEGER, Customer_Name VARCHAR, Age INTEGER, Gender VARCHAR, Purchase_Amount FLOAT, Purchase_Date DATE)"
)
# Read data from CSV file into DuckDB table
con.execute("COPY data FROM './data/sample_data.csv' WITH HEADER CSV")
# Question: How do you remove duplicate rows based on customer ID in DuckDB?
con.execute("CREATE TABLE data_unique AS SELECT DISTINCT * FROM data")
# Question: How do you handle missing values by replacing them with 0 in DuckDB?
con.execute(
"CREATE TABLE data_cleaned_missing AS SELECT \
Customer_ID, Customer_Name, \
COALESCE(Age, 0) AS Age, \
Gender, \
COALESCE(Purchase_Amount, 0.0) AS Purchase_Amount, \
Purchase_Date \
FROM data_unique"
)
# Question: How do you remove outliers (e.g., age > 100 or purchase amount > 1000) in DuckDB?
con.execute(
"CREATE TABLE data_cleaned_outliers AS SELECT * FROM data_cleaned_missing \
WHERE Age <= 100 AND Purchase_Amount <= 1000"
)
# Question: How do you convert the Gender column to a binary format (0 for Female, 1 for Male) in DuckDB?
con.execute(
"CREATE TABLE data_cleaned_gender AS SELECT *, \
CASE WHEN Gender = 'Female' THEN 0 ELSE 1 END AS Gender_Binary \
FROM data_cleaned_outliers"
)
# Question: How do you split the Customer_Name column into separate First_Name and Last_Name columns in DuckDB?
con.execute(
"CREATE TABLE data_cleaned AS SELECT \
Customer_ID, \
SPLIT_PART(Customer_Name, ' ', 1) AS First_Name, \
SPLIT_PART(Customer_Name, ' ', 2) AS Last_Name, \
Age, Gender_Binary, Purchase_Amount, Purchase_Date \
FROM data_cleaned_gender"
)
# Question: How do you calculate the total purchase amount by Gender in DuckDB?
total_purchase_by_gender = con.execute(
"SELECT Gender_Binary, SUM(Purchase_Amount) AS Total_Purchase_Amount \
FROM data_cleaned_gender \
GROUP BY Gender_Binary"
).fetchall()
# Question: How do you calculate the average purchase amount by Age group in DuckDB?
average_purchase_by_age_group = con.execute(
"SELECT CASE \
WHEN Age BETWEEN 18 AND 30 THEN '18-30' \
WHEN Age BETWEEN 31 AND 40 THEN '31-40' \
WHEN Age BETWEEN 41 AND 50 THEN '41-50' \
WHEN Age BETWEEN 51 AND 60 THEN '51-60' \
ELSE '61-70' END AS Age_Group, \
AVG(Purchase_Amount) AS Average_Purchase_Amount \
FROM data_cleaned \
GROUP BY Age_Group"
).fetchall()
# Question: How do you print the results for total purchase amount by Gender and average purchase amount by Age group in DuckDB?
print("====================== Results ======================")
print("Total purchase amount by Gender:")
print(total_purchase_by_gender)
print("Average purchase amount by Age group:")
print(average_purchase_by_age_group)