forked from abdulmalikadeyemo/Rainfall_Prediction
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfeature_selection_vif.py
50 lines (43 loc) · 2.17 KB
/
feature_selection_vif.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# Utility function to return the VIF value for each feature provided
#from statsmodels.stats.outliers_influence import variance_inflation_factor
def compute_vif(features, df):
"""
Returns a DataFrame containing features and their corresponding variance inflation factor
features: list of features whoes multicollinearity check is needed
df: DataFrame of the data under review
"""
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd
X = df[features]
X['intercept'] = 1
# Create dataframe to store vif values
vif = pd.DataFrame()
vif['Feature'] = X.columns
vif['Vif Factor'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif = vif[vif['Feature']!='intercept'].sort_values('Vif Factor', ascending=False)
return vif
def select_features(train_df, threshold):
"""Returns two objects;
1. a DataFrame containing features and their corresponding variance inflation factor, and
2. Pandas Index object containing the list of features that have the least Multicollinearity in accordance with
the supplied threshold.
train_df: The training dataset whoes Multicollinearity is to be checked
threshold: value to compare VIF value with, above which, the feature droped
"""
data = train_df.copy()
flag = True
while flag:
features_to_consider = data.columns
# Calling the "compute_vif" utility function the Variance Inflation Factor dataframe
sorted_vif_df = (compute_vif(features_to_consider, data).reset_index().drop('index', axis=1))
# Get the highest vif value to compare against a threshold
highest_vif = sorted_vif_df.at[0, 'Vif Factor']
# Compare the highest_vif with a threshold (5 was decided for this problem by the team)
if highest_vif > threshold: # or highest_vif=='inf':
# Select the feature corresponding to the highest_vif (index 0 for both)
feature = sorted_vif_df.at[0, 'Feature']
# Drop the feature
data.drop(feature, axis=1, inplace=True)
else:
flag = False
return sorted_vif_df, data.columns