-
Notifications
You must be signed in to change notification settings - Fork 2
/
popden_feature.py
152 lines (119 loc) · 4.7 KB
/
popden_feature.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
"""This script takes the data about Population Density and gets it ready to be
used as a feature for the models.
"""
import os
import numpy as np
import pandas as pd
from io import StringIO
from sklearn.cluster import KMeans
from classes.plt_handler import plot_clustering
from classes.csv_handler import read_csv_as_df, df_to_csv
# Column names of the raw population density data. The rest of the module
# addresses them by position: index 1 is the country name and the last entry
# is the density value.
popden_raw_cols = [
    "Rank",
    "name",
    "pop2019",
    "pop2018",
    "GrowthRate",
    "area",
    "Density",
]
def _read_popden_raw(raw_csv, handicaps=None):
    """Load the raw population density CSV and return the density feature.

    Only the country name and density columns are kept. The density is
    optionally scaled by per-country handicaps and then, when the column is
    numeric, rounded to the nearest multiple of 5 with a minimum of 5.

    Args:
        raw_csv: Location of the raw CSV, passed unchanged to
            ``read_csv_as_df``.
        handicaps: Optional dictionary mapping a country name to the factor
            its density is multiplied by. Names that do not appear in the
            data have no effect.

    Returns:
        A dataframe holding the ``name`` and ``Density`` columns.

    Raises:
        Exception: If ``read_csv_as_df`` returns ``None`` for ``raw_csv``.
    """
    raw_df = read_csv_as_df(raw_csv)
    if raw_df is None:
        raise Exception(f"Raw data not found in file {raw_csv}")

    name = popden_raw_cols[1]
    col = popden_raw_cols[-1]

    # Keep only the country name and its density. ``columns=`` is used because
    # pandas >= 2.0 no longer accepts ``axis`` as a positional argument.
    unwanted = [c for c in popden_raw_cols if c not in (name, col)]
    raw_df = raw_df.drop(columns=unwanted)

    is_numeric = np.issubdtype(raw_df[col].dtype, np.number)

    # Apply existing handicaps
    if handicaps is not None:
        if is_numeric:
            # Handicaps are usually fractional; switch to float up front so
            # that scaling an integer column does not rely on implicit
            # upcasting during assignment.
            raw_df[col] = raw_df[col].astype(float)
        for country, factor in handicaps.items():
            affected = raw_df[name] == country
            raw_df.loc[affected, col] = raw_df.loc[affected, col] * factor

    if is_numeric:
        # Make the results a multiple of 5 ...
        raw_df[col] = (raw_df[col] / 5).round() * 5
        # ... and at least 5: a density that rounded to 0 is bumped up to 5.
        raw_df[col] = raw_df[col] + (raw_df[col] == 0) * 5
    return raw_df
def _apply_classification(popden_df, K=3):
    """Cluster the population densities with K-Means.

    Args:
        popden_df: Dataframe holding the values to cluster in its
            ``Density`` column.
        K: Number of clusters to compute.

    Returns:
        A dictionary mapping each K-Means label to the list of density
        values assigned to that cluster, in their original row order.
    """
    # Convert the dataframe to an array
    col = popden_raw_cols[-1]
    popden_array = popden_df[col].to_numpy(dtype=float)

    # K-Means expects two-dimensional input, so every density is paired with
    # a constant 0 as a second coordinate.
    data_2_feed = [[density, 0] for density in popden_array]

    # Apply K-Means
    kmeans = KMeans(n_clusters=K, random_state=0).fit(data_2_feed)

    # Group the densities by label in a single pass instead of rebuilding the
    # whole membership list once per sample.
    clusters = {}
    for label, density in zip(kmeans.labels_, popden_array):
        clusters.setdefault(label, []).append(density)
    return clusters
def gen_popden_feat(input_raw="./data/raw/popden/",
                    output_folder="./data/features/popden",
                    handicaps=None,
                    remove_over=False):
    """Build the population density feature and write it to disk.

    The raw data is loaded, clustered with K-Means into three groups, plotted,
    and every country is then assigned a class (1, 2 or 3) by comparing its
    density against the frontiers returned by ``plot_clustering``.

    Args:
        input_raw: Location of the raw data, forwarded to
            ``_read_popden_raw``.
        output_folder: Folder in which ``popden.png`` and ``popden.csv`` are
            written.
        handicaps: Optional dictionary mapping a country name to a correction
            factor applied to its density.
        remove_over: When true, countries with a density of 10000 or more are
            left out of the clustering. They are still classified in the
            output.

    Returns:
        The feature dataframe with the ``Country`` and ``Classification``
        columns, which is also what gets written to ``popden.csv``.
    """
    name = popden_raw_cols[1]
    den = popden_raw_cols[-1]

    # 1. Get the data from the raw CSV file
    # Results are already multiple of 5, with the handicaps applied
    popden_raw_df = _read_popden_raw(input_raw, handicaps)

    # 2. We apply the classification
    if remove_over:
        # A boolean mask selects the rows directly, so the result does not
        # depend on the index labels matching the row positions.
        classification_df = popden_raw_df[popden_raw_df[den] < 10000]
    else:
        classification_df = popden_raw_df
    clusters = _apply_classification(classification_df, K=3)

    # Draw it
    draw_title = "Population Density Feature - Clustering by K-Means"
    output_png = os.path.join(output_folder, "popden.png")
    frontiers = plot_clustering(clusters, draw_title, "Density", output_png)

    # Generate the final classification from the frontiers calculated when
    # plotting.
    # NOTE(review): frontiers[0] and frontiers[1] are used as the lower and
    # upper thresholds - presumably sorted ascending; confirm against
    # plot_clustering.
    lower, upper = frontiers[0], frontiers[1]
    classification = [
        3 if density >= upper else 2 if density >= lower else 1
        for density in popden_raw_df[den]
    ]

    # Work on a new dataframe: add the class, drop the density (no longer
    # needed) and rename the name column to "Country".
    popden_feat_df = (
        popden_raw_df
        .assign(Classification=classification)
        .drop(columns=[den])
        .rename(columns={name: "Country"})
    )

    # 3. Write to features folder, removing any previous output first
    output_csv = os.path.join(output_folder, "popden.csv")
    if os.path.isfile(output_csv):
        os.remove(output_csv)
    df_to_csv(output_csv, popden_feat_df)
    return popden_feat_df