-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy path01_transforming_data.r
133 lines (83 loc) · 2.93 KB
/
01_transforming_data.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
library(tidyverse)
link <- 'https://github.com/gumdropsteve/datasets/raw/master/december_real_estate.csv'
df <- read.csv(link)
# view dataset tidy (note the difference with how these display in console)
df
as_tibble(df)
df <- as_tibble(df)
# view like excel or github
View(df)
### filter function
# single filter
filter(df, closing_price > 1000000)
filter(df, client == 'seller')
filter(df, ds > '12/14/2019')
# multiple filters
filter(df, client != 'seller', commission > 10000)
filter(df, ds <= '12/14/2019', realtor_id == 0)
filter(df, client == 'seller', ds == '12/14/2019', realtor_id == 0)
filter(df,
realtor_id == 3,
location == 'pleasanton')
### arrange function
# order by single column
arrange(df, ds)
arrange(df, closing_price)
# descending order by single column
arrange(df, desc(ds))
# order by multiple columns
arrange(df, ds, realtor_id)
arrange(df, ds, realtor_id, closing_price)
# descending order by multiple columns
arrange(df, desc(realtor_id), desc(closing_price))
# mixed order by multiple columns
arrange(df, desc(realtor_id), desc(commission), location)
### select function
# select a single column
select(df, ds)
# select multiple columns
select(df, ds, closing_price)
select(df, realtor_id, closing_price, ds)
# select all columns, but put one (realtor_id in our case) first (most left)
select(df, realtor_id, everything())
select(df, realtor_id, commission, everything())
# keep results around as a variable
r_df <- select(df, ds, client, realtor_id, everything())
View(r_df)
### mutate function
df_small <- select(df, ds, client, realtor_id, closing_price, commission_rate)
df_small
# create (add) a single new column
mutate(df_small,
commission = closing_price * commission_rate)
# create (add) multiple new columns
mutate(df_small,
commission = closing_price * commission_rate,
was_listing_agent = client == 'seller')
### transmute function
# create a single column (drop all others)
transmute(df_small,
commission = closing_price * commission_rate)
# create multiple columns (drop all others)
transmute(df_small,
commission = closing_price * commission_rate,
was_listing_agent = client == 'seller')
# create multiple columns (keep a few original around as well, drop all others)
transmute(df_small,
ds,
closing_price,
commission = closing_price * commission_rate,
was_listing_agent = client == 'seller')
### summarize function
summarize(df, mean(closing_price))
# best when used with group by
by_id <- group_by(df, realtor_id)
summarize(by_id, count = n())
summarize(by_id,
count = n(),
avg_closing_price = mean(closing_price),
avg_commission = mean(commission),
gross_commission = sum(commission))
s_df <- summarize(by_id, count=n(), avg_closing_price=mean(closing_price),
avg_commission=mean(commission), gross_commission=sum(commission))
filter(s_df, count > 5)