# random_forest.html

# -*- coding: utf-8 -*-
"""Random_Forest.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1-LuddYLJSQNKKfILGmlBB_lrzFph1JxH
"""

import numpy as np
import pandas as pd
import os
import math
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
import matplotlib.pyplot as plt
import plotly.offline as py
import plotly.graph_objs as go
from sklearn import tree
from IPython. display import Image as PImage 
from subprocess import check_call 
from PIL import Image, ImageDraw, ImageFont 
import re 
import pydotplus

# Load the raw HR attrition table; any "?" cell is parsed as a missing value.
df = pd.read_csv(
    'HR Employee Attrition.csv',
    na_values='?',
)
# Notebook preview of the first 25 rows (no visible effect in a plain script).
df.head(25)


# Seed passed to the random forest as its random_state.
seed = 0

# Keyword arguments forwarded to RandomForestClassifier(**parameter).
parameter = {
    'n_jobs': -1,
    'n_estimators': 800,
    'warm_start': True,
    # The original literal listed 'max_features' twice (0.3, then 'sqrt').
    # Python keeps the last duplicate, so 'sqrt' was the value actually used;
    # the shadowed 0.3 entry has been removed to make that explicit.
    'max_features': 'sqrt',
    'max_depth': 9,
    'min_samples_leaf': 2,
    'random_state': seed,
    'verbose': 0,
}

# Columns carried forward from the raw table. 'Attrition' is listed first,
# ahead of all the other columns.
selected_columns = [
    'Attrition',
    'Age',
    'BusinessTravel',
    'DailyRate',
    'Department',
    'DistanceFromHome',
    'Education',
    'EducationField',
    'EmployeeCount',
    'EmployeeNumber',
    'EnvironmentSatisfaction',
    'Gender',
    'HourlyRate',
    'JobInvolvement',
    'JobLevel',
    'JobRole',
    'JobSatisfaction',
    'MaritalStatus',
    'MonthlyIncome',
    'MonthlyRate',
    'NumCompaniesWorked',
    'Over18',
    'OverTime',
    'PercentSalaryHike',
    'PerformanceRating',
    'RelationshipSatisfaction',
    'StandardHours',
    'StockOptionLevel',
    'TotalWorkingYears',
    'TrainingTimesLastYear',
    'WorkLifeBalance',
    'YearsAtCompany',
    'YearsInCurrentRole',
    'YearsSinceLastPromotion',
    'YearsWithCurrManager',
]
df = df[selected_columns]

# Columns excluded from the modelling table, removed in a single pass.
excluded_columns = [
    'EmployeeCount',
    'StandardHours',
    'Over18',
    'EmployeeNumber',
    'PerformanceRating',
]
dataset = df.drop(columns=excluded_columns)

# Notebook preview of the reduced table.
dataset.head()

# Split by position: column 0 becomes the label, every other column a feature.
y = dataset.iloc[:, 0]
X = dataset.iloc[:, 1:]

# Notebook previews of the feature table and the label series.
X.head()

y.head()

# Expand the categorical feature columns into indicator (dummy) columns.
X = pd.get_dummies(X)
X.shape

# Hold out 20% of the rows for evaluation. random_state pins the shuffle so
# the split, and every metric computed from it, is repeatable between runs
# (the original call was unseeded, so results changed on each execution).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed
)

# Fit the scaler on the training rows only, then apply that same fitted
# transform to the test rows.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# NOTE(review): none of these settings is referenced by the code visible in
# this script; they look like leftovers from a different experiment --
# confirm before removing.
dropout = 0.1
epochs = 100
batch_size = 30
optimizer = 'adam'
k = 20

# A stray notebook text cell stood here as a bare string containing only
# "seed = 0"; it had no effect on execution and is kept as this comment.

# Build the forest from the `parameter` settings and train it in one step;
# fit() returns the fitted estimator itself.
rf = RandomForestClassifier(**parameter).fit(X_train, y_train)

# Predicted labels for the held-out rows.
rf_predictions = rf.predict(X_test)

# The bare expressions below are notebook outputs; nothing is stored.
accuracy_score(y_true=y_test, y_pred=rf_predictions)

confusion_matrix(y_true=y_test, y_pred=rf_predictions)

# NOTE(review): both scores are computed with 'No' as the positive class --
# confirm that this is the class of interest.
f1_score(y_true=y_test, y_pred=rf_predictions, pos_label='No')

precision_score(y_true=y_test, y_pred=rf_predictions, pos_label='No')

# Enable inline plotly rendering in the notebook.
py.init_notebook_mode(connected=True)

# Scatter plot of the forest's per-feature importances, one marker per
# column of the one-hot encoded feature table X.
importances = rf.feature_importances_
feature_labels = X.columns.values

marker_style = dict(
    sizemode='diameter',
    sizeref=1,
    size=6,
    color=importances,  # colour each marker by its importance value
    colorscale='Portland',
    showscale=True,
)
trace = go.Scatter(
    x=feature_labels,
    y=importances,
    mode='markers',
    marker=marker_style,
    text=feature_labels,
)
data = [trace]

layout = go.Layout(
    autosize=True,
    title='Random Forest Feature Importance',
    hovermode='closest',
    xaxis=dict(
        ticklen=5,
        showgrid=False,
        zeroline=False,
        showline=False,
    ),
    yaxis=dict(
        title='Feature Importance',
        showgrid=False,
        zeroline=False,
        ticklen=5,
        gridwidth=2,
    ),
    showlegend=False,
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='scatter2010')

# Shallow single tree, fitted on the same training split as the forest.
# random_state=seed makes the fitted tree repeatable: scikit-learn permutes
# the candidate features at every split, so an unseeded tree can differ
# between runs when several splits score equally.
decision_tree = tree.DecisionTreeClassifier(max_depth=4, random_state=seed)
decision_tree.fit(X_train, y_train)

# Predictions of the single tree on the held-out rows.
y_pred = decision_tree.predict(X_test)

# Write the fitted tree to tree1.dot in Graphviz format.
# The original rebound `f` to export_graphviz's return value inside the
# `with` block, clobbering the open file handle with None; the call is now
# made purely for its side effect of writing to `f`.
with open("tree1.dot", 'w') as f:
    tree.export_graphviz(
        decision_tree,
        out_file=f,
        max_depth=4,
        impurity=False,
        feature_names=X.columns.values,
        # Class names are expected in ascending label order.
        class_names=['No', 'Yes'],
        rounded=True,
        filled=True,
    )

# Render the .dot file to PNG with the Graphviz `dot` executable;
# check_call raises CalledProcessError if the command exits non-zero.
check_call(['dot', '-Tpng', 'tree1.dot', '-o', 'tree1.png'])

# Re-save the rendered image under a second name. The context manager closes
# the underlying file handle, which the original left open.
with Image.open("tree1.png") as img:
    # NOTE(review): `draw` is never used in the code visible here; kept so
    # the name stays defined -- confirm whether it can be dropped.
    draw = ImageDraw.Draw(img)
    img.save('sample-out.png')

# Display the saved image in the notebook.
PImage("sample-out.png")