generated from KSUDS/p3_spatial
-
Notifications
You must be signed in to change notification settings - Fork 1
/
eda_safegraph.py
75 lines (59 loc) · 2.3 KB
/
eda_safegraph.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#%%
import pandas as pd
import numpy as np
import geopandas as gpd
import folium
from plotnine import *
import safegraph_functions as sgf
import requests
#%%
import re #regular expression
#range number - if the size is 1
#%%
# Download the SafeGraph helper module into the working directory so that it
# can be imported as `safegraph_functions`.
# NOTE(review): `import safegraph_functions as sgf` at the top of the file runs
# before this cell; on a fresh checkout this cell presumably has to be run
# first -- confirm.
your_location = "safegraph_functions.py"
url = "https://gist.githubusercontent.com/hathawayj/ddb41bb308aaf4e95cede353311fb4f5/raw/02184ca131c0b145931a028feba5c38f8c7e4b52/safegraph_functions.py"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of saving an error page as a .py file.
response.raise_for_status()
print(response.headers.get('content-type'))
# The context manager guarantees the file is flushed and closed.
with open(your_location, "wb") as helper_file:
    helper_file.write(response.content)
# %%
# Read the Chipotle core POI + patterns CSV straight from the course
# repository on GitHub.
url_loc = "https://github.com/KSUDS/p3_spatial/raw/main/SafeGraph%20-%20Patterns%20and%20Core%20Data%20-%20Chipotle%20-%20July%202021/Core%20Places%20and%20Patterns%20Data/chipotle_core_poi_and_patterns.csv"
dat = pd.read_csv(url_loc)
# Small working sample: the first ten rows, all columns.
datl = dat.head(10)
#%%
# Use sgf.expand_json() and sgf.expand_list() to get the embedded data out of
# the dataframe. The two lists below record which columns hold list-encoded
# values and which hold JSON-encoded values.
list_cols = ['visits_by_day', 'popularity_by_hour']
json_cols = [
    'open_hours',
    'visitor_home_cbgs',
    'visitor_country_of_origin',  # was misspelled 'visitor_country_of_orgin'
    'bucketed_dwell_times',
    'related_same_day_brand',
    'related_same_month_brand',
    'popularity_by_day',
    'device_type',
    'visitor_home_aggregation',
    'visitor_daytime_cbgs',
]
# Try each helper on the ten-row sample (datl) before running on the full data.
dat_pbd = sgf.expand_json('popularity_by_day', datl)
# row is a store by day of the week
dat_rsdb = sgf.expand_json('related_same_day_brand', datl)
# different stores by seeing who went to each.
dat_vbd = sgf.expand_list("visits_by_day", datl)
# return 0 if no visits, number by day of the month
dat_pbh = sgf.expand_list("popularity_by_hour", datl)
#%%
# What are the top three brands that Chipotle customers visit on the same day?
# Create a bar chart of the top 10 to show us.
# Expand the JSON column on the full dataset (not just the ten-row sample).
dat_rsdb = sgf.expand_json('related_same_day_brand', dat)
#%%
# Collapse the expanded table to one row per brand: sum each column over all
# stores, turn the resulting Series into a two-column frame, and keep the 20
# largest totals. The result is bound to `dat20` because the plot cell below
# reads it.
dat20 = (
    dat_rsdb
    .drop(columns=["placekey"])
    .sum()
    # Series -> DataFrame; the old index becomes column "index", values column 0.
    .reset_index()
    .rename(columns={"index": "brand", 0: "visits"})
    .sort_values(by="visits", ascending=False)
    # NOTE(review): assumes expand_json prefixes each brand column with
    # "related_same_day_brand-"; if it does not, this replace is a no-op --
    # confirm against safegraph_functions.py.
    .assign(brand=lambda x: x.brand.str.replace(
        "related_same_day_brand-", "", regex=False))
    .head(20)
    .reset_index(drop=True)
)
dat20
#%%
# Horizontal bar chart of visits per brand.
(
    ggplot(dat20, aes(x="brand", y="visits"))
    + geom_col()
    + coord_flip()
)
#%%
# Distribution of hourly popularity across all stores, one box per hour.
dat_pbh = sgf.expand_list("popularity_by_hour", dat)
# Plot the frame just computed (dat_pbh), not the day-of-week sample dat_pbd.
# NOTE(review): assumes expand_list adds an "hour" column; the x expression
# zero-pads the hour as text so the discrete axis sorts 01, 02, ..., 24 rather
# than 1, 10, 11, ... -- confirm against safegraph_functions.py.
(
    ggplot(dat_pbh, aes(x="hour.astype(str).str.zfill(2)",
                        y="popularity_by_hour"))
    + geom_boxplot()
)
#last code is similar to cbind (pd.concat)in r code/ if code not order then row
#
# %%