diff --git a/R/features-exp.R b/R/features-exp.R index 44eaf38..88392d9 100644 --- a/R/features-exp.R +++ b/R/features-exp.R @@ -6,16 +6,19 @@ extract_features_exp <- function(x) { x <- x %>% dplyr::group_by(user_id) %>% dplyr::summarise( + ## tweets features n_sincelast = count_mean(since_last(.data$created_at)), n_timeofday = count_mean(hourofweekday(.data$created_at)), n = dplyr::n(), n_retweets = sum_(.data$is_retweet), n_quotes = sum_(.data$is_quote), + n_langs = tfse::n_uq(.data$lang), retweet_count = mean_(c(0, .data$retweet_count)), favorite_count = mean_(c(0, .data$favorite_count)), - favourites_count = max_(c(0, .data$favourites_count)), n_tweets = sum_(!.data$is_retweet & !.data$is_quote), - + n_places = sum_(!is.na(.data$place_name)), + n_geo_coords = ncoord(.data$geo_coords), + n_bbox_coords = ncoord(.data$bbox_coords), iphone = sum_("Twitter for iPhone" %in% .data$source) / .data$n, webclient = sum_("Twitter Web Client" %in% .data$source) / .data$n, android = sum_("Twitter for Android" %in% .data$source) / .data$n, @@ -25,28 +28,32 @@ extract_features_exp <- function(x) { google = sum_("Google" %in% .data$source) / .data$n, ifttt = sum_("IFTTT" %in% .data$source) / .data$n, facebook = sum_("Facebook" %in% .data$source) / .data$n, - twittbotnet = sum_("twittbot.net" %in% .data$source) / .data$n, tweetdeck = sum_("TweetDeck" %in% .data$source) / .data$n, - twitterforblackberry = sum_("Twitter for BlackBerry®" %in% .data$source) / .data$n, + twitterforblackberry = sum_( + "Twitter for BlackBerry®" %in% .data$source) / .data$n, dlvrit = sum_("dlvr.it" %in% .data$source) / .data$n, instagram = sum_("Instagram" %in% .data$source) / .data$n, curiouscat = sum_("Curious Cat" %in% .data$source) / .data$n, echofon = sum_("Echofon" %in% .data$source) / .data$n, - ubersocialforblackberry = sum_("UberSocial for BlackBerry" %in% .data$source) / .data$n, + ubersocialforblackberry = sum_( + "UberSocial for BlackBerry" %in% .data$source) / .data$n, athkarapp = sum_("athkarApp" %in% .data$source) / .data$n, mobilewebm2 = sum_("Mobile Web (M2)" %in% .data$source) / .data$n, twitterfeed = sum_("twitterfeed" %in% .data$source) / .data$n, tweetbotforiοs = sum_("Tweetbot for iΟS" %in% .data$source) / .data$n, - tweetcasterforandroid = sum_("TweetCaster for Android" %in% .data$source) / .data$n, - twitcomcomunidades = sum_("Twitcom - Comunidades " %in% .data$source) / .data$n, + tweetcasterforandroid = sum_( + "TweetCaster for Android" %in% .data$source) / .data$n, + twitcomcomunidades = sum_( + "Twitcom - Comunidades " %in% .data$source) / .data$n, cloudhopper = sum_("Cloudhopper" %in% .data$source) / .data$n, twicca = sum_("twicca" %in% .data$source) / .data$n, wordpresscom = sum_("WordPress.com" %in% .data$source) / .data$n, mobileweb = sum_("Mobile Web" %in% .data$source) / .data$n, foursquare = sum_("Foursquare" %in% .data$source) / .data$n, showroomlive = sum_("SHOWROOM-LIVE" %in% .data$source) / .data$n, - twitterforwebsites = sum_("Twitter for Websites" %in% .data$source) / .data$n, + twitterforwebsites = sum_( + "Twitter for Websites" %in% .data$source) / .data$n, ios = sum_("iOS" %in% .data$source) / .data$n, tumblr = sum_("Tumblr" %in% .data$source) / .data$n, tweetlogix = sum_("Tweetlogix" %in% .data$source) / .data$n, @@ -54,8 +61,10 @@ extract_features_exp <- function(x) { buffer = sum_("Buffer" %in% .data$source) / .data$n, twitcleplus = sum_("twitcle plus" %in% .data$source) / .data$n, keitaiweb = sum_("Keitai Web" %in% .data$source) / .data$n, - sandaysoftcumulus = sum_("Sandaysoft Cumulus" %in% .data$source) / .data$n, - twitpaneforandroid = sum_("TwitPane for Android" %in% .data$source) / .data$n, + sandaysoftcumulus = sum_( + "Sandaysoft Cumulus" %in% .data$source) / .data$n, + twitpaneforandroid = sum_( + "TwitPane for Android" %in% .data$source) / .data$n, playstationr4 = sum_("PlayStation(R)4" %in% .data$source) / .data$n, writelonger = sum_("Write Longer" %in% .data$source) / .data$n, featherforios = sum_("feather for iOS " %in% .data$source) / .data$n, @@ -66,28 +75,50 @@ extract_features_exp <- function(x) { janetter = sum_("Janetter" %in% .data$source) / .data$n, dynamictweets = sum_("Dynamic Tweets" %in% .data$source) / .data$n, twitcasting = sum_("TwitCasting" %in% .data$source) / .data$n, - ubersocialforandroid = sum_("UberSocial for Android" %in% .data$source) / .data$n, - janetterforandroid = sum_("Janetter for Android" %in% .data$source) / .data$n, - twitterforandroidtablets = sum_("Twitter for Android Tablets" %in% .data$source) / .data$n, + ubersocialforandroid = sum_( + "UberSocial for Android" %in% .data$source) / .data$n, + janetterforandroid = sum_( + "Janetter for Android" %in% .data$source) / .data$n, + twitterforandroidtablets = sum_( + "Twitter for Android Tablets" %in% .data$source) / .data$n, twitterformac = sum_("Twitter for Mac" %in% .data$source) / .data$n, + ## users features + lang_und = as.integer(.data$account_lang[1] == "und"), + lang_tr = as.integer(.data$account_lang[1] == "tr"), + lang_ru = as.integer(.data$account_lang[1] == "ru"), + lang_pt = as.integer(.data$account_lang[1] == "pt"), + lang_ja = as.integer(.data$account_lang[1] == "ja"), + lang_in = as.integer(.data$account_lang[1] == "in"), + lang_fr = as.integer(.data$account_lang[1] == "fr"), + lang_es = as.integer(.data$account_lang[1] == "es"), + lang_en = as.integer(.data$account_lang[1] == "en"), + lang_are = as.integer(.data$account_lang[1] == "ar"), + lang_de = as.integer(.data$account_lang[1] == "de"), + lang_it = as.integer(.data$account_lang[1] == "it"), + lang_id = as.integer(.data$account_lang[1] == "id"), + lang_ko = as.integer(.data$account_lang[1] == "ko"), + lang_nl = as.integer(.data$account_lang[1] == "nl"), + lang_hi = as.integer(.data$account_lang[1] == "hi"), + lang_fil = as.integer(.data$account_lang[1] == "fil"), + lang_th = as.integer(.data$account_lang[1] == "th"), + lang_engb = as.integer(.data$account_lang[1] == "en-gb"), + screen_name_alpha = nchar_(.data$screen_name[1]), + screen_name_num = ndigit_(.data$screen_name[1]), prof_image_na = sum_(is.na(.data$profile_image_url[1])), prof_image_type = sum_(grepl("\\.jpg", .data$profile_image_url[1])), - profile_bg_na = sum_(is.na(.data$profile_background_url[1])), profile_bg_type = sum_(grepl("\\.png", .data$profile_background_url[1])), - profile_bn_na = sum_(is.na(.data$profile_banner_url[1])), - verified = as.integer(.data$verified[1]), + profile_url = !is.na(.data$profile_url[1]), years_on_twitter = relative_twitter_age(.data$account_created_at[1]), tweets_per_year = .data$n_tweets / (1 + .data$years_on_twitter), - - ## i added one here so it wouldn't return NaN or undefined values (0 / x) statuses_count = max_(c(0, .data$statuses_count)), followers_count = max_(c(0, .data$followers_count)), friends_count = max_(c(0, .data$friends_count)), listed_count = max_(c(0, .data$listed_count)), + favourites_count = max_(c(0, .data$favourites_count)), tweets_to_followers = (.data$statuses_count + 1) / (.data$followers_count + 1), statuses_rate = (.data$statuses_count + 1) / @@ -106,8 +137,15 @@ age_of_twitter <- function() { } relative_twitter_age <- function(account_created_at) { - years <- as.numeric(difftime(Sys.time(), account_created_at, units = "days"))/365 + years <- as.numeric(difftime( + Sys.time(), account_created_at, units = "days"))/365 aot <- age_of_twitter() ## set it at 15 (years / aot) * 15 } + +ncoord <- function(x) { + sum(vapply(x, function(.x) !is.na(.x[1]), integer(1), USE.NAMES = FALSE)) +} + + diff --git a/R/utils.R b/R/utils.R index 5a2f027..f6a16cb 100644 --- a/R/utils.R +++ b/R/utils.R @@ -67,3 +67,14 @@ count_mean <- function(x) { x <- as.integer(x) - 1L mean(x, na.rm = TRUE) } + + +nchar_ <- function(x) { + ifelse(is.na(x), 0, nchar(x)) +} + + +ndigit_ <- function(x) { + ifelse(is.na(x), 0, nchar(gsub("\\D", "", x))) +} +