Skip to content

Commit

Permalink
add features
Browse files Browse the repository at this point in the history
  • Loading branch information
mkearney committed Dec 28, 2018
1 parent fc2f70d commit c536836
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 19 deletions.
76 changes: 57 additions & 19 deletions R/features-exp.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,19 @@ extract_features_exp <- function(x) {
x <- x %>%
dplyr::group_by(user_id) %>%
dplyr::summarise(
## tweets features
n_sincelast = count_mean(since_last(.data$created_at)),
n_timeofday = count_mean(hourofweekday(.data$created_at)),
n = dplyr::n(),
n_retweets = sum_(.data$is_retweet),
n_quotes = sum_(.data$is_quote),
n_langs = tfse::n_uq(.data$lang),
retweet_count = mean_(c(0, .data$retweet_count)),
favorite_count = mean_(c(0, .data$favorite_count)),
favourites_count = max_(c(0, .data$favourites_count)),
n_tweets = sum_(!.data$is_retweet & !.data$is_quote),

n_places = sum_(!is.na(.data$place_name)),
n_geo_coords = ncoord(.data$geo_coords),
n_bbox_coords = ncoord(.data$bbox_coords),
iphone = sum_("Twitter for iPhone" %in% .data$source) / .data$n,
webclient = sum_("Twitter Web Client" %in% .data$source) / .data$n,
android = sum_("Twitter for Android" %in% .data$source) / .data$n,
Expand All @@ -25,37 +28,43 @@ extract_features_exp <- function(x) {
google = sum_("Google" %in% .data$source) / .data$n,
ifttt = sum_("IFTTT" %in% .data$source) / .data$n,
facebook = sum_("Facebook" %in% .data$source) / .data$n,

twittbotnet = sum_("twittbot.net" %in% .data$source) / .data$n,
tweetdeck = sum_("TweetDeck" %in% .data$source) / .data$n,
twitterforblackberry = sum_("Twitter for BlackBerry®" %in% .data$source) / .data$n,
twitterforblackberry = sum_(
"Twitter for BlackBerry®" %in% .data$source) / .data$n,
dlvrit = sum_("dlvr.it" %in% .data$source) / .data$n,
instagram = sum_("Instagram" %in% .data$source) / .data$n,
curiouscat = sum_("Curious Cat" %in% .data$source) / .data$n,
echofon = sum_("Echofon" %in% .data$source) / .data$n,
ubersocialforblackberry = sum_("UberSocial for BlackBerry" %in% .data$source) / .data$n,
ubersocialforblackberry = sum_(
"UberSocial for BlackBerry" %in% .data$source) / .data$n,
athkarapp = sum_("athkarApp" %in% .data$source) / .data$n,
mobilewebm2 = sum_("Mobile Web (M2)" %in% .data$source) / .data$n,
twitterfeed = sum_("twitterfeed" %in% .data$source) / .data$n,
tweetbotforiοs = sum_("Tweetbot for iΟS" %in% .data$source) / .data$n,
tweetcasterforandroid = sum_("TweetCaster for Android" %in% .data$source) / .data$n,
twitcomcomunidades = sum_("Twitcom - Comunidades " %in% .data$source) / .data$n,
tweetcasterforandroid = sum_(
"TweetCaster for Android" %in% .data$source) / .data$n,
twitcomcomunidades = sum_(
"Twitcom - Comunidades " %in% .data$source) / .data$n,
cloudhopper = sum_("Cloudhopper" %in% .data$source) / .data$n,
twicca = sum_("twicca" %in% .data$source) / .data$n,
wordpresscom = sum_("WordPress.com" %in% .data$source) / .data$n,
mobileweb = sum_("Mobile Web" %in% .data$source) / .data$n,
foursquare = sum_("Foursquare" %in% .data$source) / .data$n,
showroomlive = sum_("SHOWROOM-LIVE" %in% .data$source) / .data$n,
twitterforwebsites = sum_("Twitter for Websites" %in% .data$source) / .data$n,
twitterforwebsites = sum_(
"Twitter for Websites" %in% .data$source) / .data$n,
ios = sum_("iOS" %in% .data$source) / .data$n,
tumblr = sum_("Tumblr" %in% .data$source) / .data$n,
tweetlogix = sum_("Tweetlogix" %in% .data$source) / .data$n,
socialoomph = sum_("SocialOomph" %in% .data$source) / .data$n,
buffer = sum_("Buffer" %in% .data$source) / .data$n,
twitcleplus = sum_("twitcle plus" %in% .data$source) / .data$n,
keitaiweb = sum_("Keitai Web" %in% .data$source) / .data$n,
sandaysoftcumulus = sum_("Sandaysoft Cumulus" %in% .data$source) / .data$n,
twitpaneforandroid = sum_("TwitPane for Android" %in% .data$source) / .data$n,
sandaysoftcumulus = sum_(
"Sandaysoft Cumulus" %in% .data$source) / .data$n,
twitpaneforandroid = sum_(
"TwitPane for Android" %in% .data$source) / .data$n,
playstationr4 = sum_("PlayStation(R)4" %in% .data$source) / .data$n,
writelonger = sum_("Write Longer" %in% .data$source) / .data$n,
featherforios = sum_("feather for iOS " %in% .data$source) / .data$n,
Expand All @@ -66,28 +75,50 @@ extract_features_exp <- function(x) {
janetter = sum_("Janetter" %in% .data$source) / .data$n,
dynamictweets = sum_("Dynamic Tweets" %in% .data$source) / .data$n,
twitcasting = sum_("TwitCasting" %in% .data$source) / .data$n,
ubersocialforandroid = sum_("UberSocial for Android" %in% .data$source) / .data$n,
janetterforandroid = sum_("Janetter for Android" %in% .data$source) / .data$n,
twitterforandroidtablets = sum_("Twitter for Android Tablets" %in% .data$source) / .data$n,
ubersocialforandroid = sum_(
"UberSocial for Android" %in% .data$source) / .data$n,
janetterforandroid = sum_(
"Janetter for Android" %in% .data$source) / .data$n,
twitterforandroidtablets = sum_(
"Twitter for Android Tablets" %in% .data$source) / .data$n,
twitterformac = sum_("Twitter for Mac" %in% .data$source) / .data$n,

## users features
lang_und = as.integer(.data$account_lang[1] == "und"),
lang_tr = as.integer(.data$account_lang[1] == "tr"),
lang_ru = as.integer(.data$account_lang[1] == "ru"),
lang_pt = as.integer(.data$account_lang[1] == "pt"),
lang_ja = as.integer(.data$account_lang[1] == "ja"),
lang_in = as.integer(.data$account_lang[1] == "in"),
lang_fr = as.integer(.data$account_lang[1] == "fr"),
lang_es = as.integer(.data$account_lang[1] == "es"),
lang_en = as.integer(.data$account_lang[1] == "en"),
lang_are = as.integer(.data$account_lang[1] == "ar"),
lang_de = as.integer(.data$account_lang[1] == "de"),
lang_it = as.integer(.data$account_lang[1] == "it"),
lang_id = as.integer(.data$account_lang[1] == "id"),
lang_ko = as.integer(.data$account_lang[1] == "ko"),
lang_nl = as.integer(.data$account_lang[1] == "nl"),
lang_hi = as.integer(.data$account_lang[1] == "hi"),
lang_fil = as.integer(.data$account_lang[1] == "fil"),
lang_th = as.integer(.data$account_lang[1] == "th"),
lang_engb = as.integer(.data$account_lang[1] == "en-gb"),
screen_name_alpha = nchar_(.data$screen_name[1]),
screen_name_num = ndigit_(.data$screen_name[1]),
prof_image_na = sum_(is.na(.data$profile_image_url[1])),
prof_image_type = sum_(grepl("\\.jpg", .data$profile_image_url[1])),

profile_bg_na = sum_(is.na(.data$profile_background_url[1])),
profile_bg_type = sum_(grepl("\\.png", .data$profile_background_url[1])),

profile_bn_na = sum_(is.na(.data$profile_banner_url[1])),

verified = as.integer(.data$verified[1]),
profile_url = !is.na(.data$profile_url[1]),
years_on_twitter = relative_twitter_age(.data$account_created_at[1]),
tweets_per_year = .data$n_tweets / (1 + .data$years_on_twitter),

## i added one here so it wouldn't return NaN or undefined values (0 / x)
statuses_count = max_(c(0, .data$statuses_count)),
followers_count = max_(c(0, .data$followers_count)),
friends_count = max_(c(0, .data$friends_count)),
listed_count = max_(c(0, .data$listed_count)),
favourites_count = max_(c(0, .data$favourites_count)),
tweets_to_followers = (.data$statuses_count + 1) /
(.data$followers_count + 1),
statuses_rate = (.data$statuses_count + 1) /
Expand All @@ -106,8 +137,15 @@ age_of_twitter <- function() {
}

## Express an account's age relative to the age of Twitter itself.
##
## account_created_at: account creation timestamp(s); passed as the second
##   argument to difftime(), with Sys.time() as the first.
##
## Returns the account age in 365-day years, divided by age_of_twitter()
## and multiplied by 15.
## NOTE(review): age_of_twitter() is defined elsewhere in this file and is
## presumably also expressed in years -- confirm.
relative_twitter_age <- function(account_created_at) {
  ## elapsed time since account creation, converted from days to years
  years <- as.numeric(difftime(
    Sys.time(), account_created_at, units = "days")) / 365
  aot <- age_of_twitter()
  ## set it at 15
  (years / aot) * 15
}

## Count the elements of `x` whose first value is not NA.
##
## x: a list (or vector) that is iterated element by element.
##   NOTE(review): presumably a list column of coordinate vectors such as
##   geo_coords / bbox_coords -- confirm against the caller.
##
## Returns a single integer count.
ncoord <- function(x) {
  ## 1 when the leading value of an element is present, 0 when it is NA
  has_first_value <- function(el) {
    first <- el[1]
    !is.na(first)
  }
  flags <- vapply(x, has_first_value, FUN.VALUE = integer(1), USE.NAMES = FALSE)
  sum(flags)
}


11 changes: 11 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -67,3 +67,14 @@ count_mean <- function(x) {
x <- as.integer(x) - 1L
mean(x, na.rm = TRUE)
}


## NA-safe character count.
##
## x: a character vector.
##
## Returns, for each element of `x`, the number of characters as given by
## nchar(), with 0 substituted wherever `x` is NA.
nchar_ <- function(x) {
  ## Use an integer zero (0L) so the result is integer whether or not `x`
  ## contains NAs; a double 0 made the result double only when an NA was
  ## present, so the return type depended on the data.
  ifelse(is.na(x), 0L, nchar(x))
}


## NA-safe count of digit characters.
##
## x: a character vector.
##
## Returns, for each element of `x`, the number of characters that remain
## after every non-digit (regex "\\D") is removed, with 0 substituted
## wherever `x` is NA.
ndigit_ <- function(x) {
  ## Use an integer zero (0L) so the result is integer whether or not `x`
  ## contains NAs; a double 0 made the result double only when an NA was
  ## present, so the return type depended on the data.
  ifelse(is.na(x), 0L, nchar(gsub("\\D", "", x)))
}

0 comments on commit c536836

Please sign in to comment.