Skip to content

Commit

Permalink
Final files
Browse files Browse the repository at this point in the history
  • Loading branch information
ViswanathRavindran authored Nov 20, 2018
1 parent f8bf25c commit e9429a7
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 79 deletions.
17 changes: 3 additions & 14 deletions New_LGB.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,6 @@
}
],
"source": [
"# Find frequency of is_attributed for each unique value in column\n",
"freqs = {}\n",
"for cols in CLICK_ATTR_CATS:\n",
" \n",
Expand Down Expand Up @@ -345,9 +344,6 @@
"source": [
"# Define all the groupby transformations\n",
"GROUPBY_AGGREGATIONS = [\n",
" # V1 - GroupBy Features #\n",
" ######################### \n",
" # Variance in day, for user_id-prod-campaign_id\n",
" {'groupby': ['user_id','prod','campaign_id'], 'select': 'day', 'agg': 'var'},\n",
" # Variance in hour, for user_id-prod-product_category_1\n",
" {'groupby': ['user_id','prod','product_category_1'], 'select': 'hour', 'agg': 'var'},\n",
Expand All @@ -364,21 +360,15 @@
" # Mean hour, for user_id-prod-campaign_id\n",
" {'groupby': ['user_id','prod','campaign_id'], 'select': 'hour', 'agg': 'mean'}, \n",
" \n",
" # V2 - GroupBy Features #\n",
" #########################\n",
" # Average clicks on app by distinct users; is it an app they return to?\n",
" {'groupby': ['prod'], \n",
" 'select': 'user_id', \n",
" 'agg': lambda x: float(len(x)) / len(x.unique()), \n",
" 'agg_name': 'AvgprodPerDistinct'\n",
" },\n",
" # How popular is the app or channel?\n",
"\n",
" {'groupby': ['prod'], 'select': 'campaign_id', 'agg': 'count'},\n",
" {'groupby': ['campaign_id'], 'select': 'prod', 'agg': 'count'},\n",
" \n",
" # V3 - GroupBy Features #\n",
" # https://www.kaggle.com/bk0000/non-blending-lightgbm-model-lb-0-977 #\n",
" ###################################################################### \n",
"\n",
" {'groupby': ['user_id'], 'select': 'campaign_id', 'agg': 'nunique'}, \n",
" {'groupby': ['user_id'], 'select': 'prod', 'agg': 'nunique'}, \n",
" {'groupby': ['user_id','day'], 'select': 'hour', 'agg': 'nunique'}, \n",
Expand Down Expand Up @@ -427,7 +417,7 @@
}
],
"source": [
"# Apply all the groupby transformations\n",
"\n",
"for spec in GROUPBY_AGGREGATIONS:\n",
" \n",
" # Name of the aggregation we're applying\n",
Expand All @@ -451,7 +441,6 @@
" reset_index(). \\\n",
" rename(index=str, columns={spec['select']: new_feature})\n",
" \n",
" # Merge back to X_total\n",
" if 'cumcount' == spec['agg']:\n",
" merge[new_feature] = gp[0].values\n",
" else:\n",
Expand Down
41 changes: 4 additions & 37 deletions New_XGB.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,6 @@
}
],
"source": [
"# Find frequency of is_attributed for each unique value in column\n",
"freqs = {}\n",
"for cols in CLICK_ATTR_CATS:\n",
" \n",
Expand Down Expand Up @@ -286,23 +285,15 @@
}
],
"source": [
"# Identify the previous ads and history ads\n",
"\n",
"HISTORY_ADS = {\n",
" 'identical_': ['user_id', 'prod', 'product_category_1', 'webpage_id', 'campaign_id'],\n",
" 'user_prods': ['user_id', 'prod']\n",
"}\n",
"\n",
"# Go through different group-by combinations\n",
"for fname, fset in HISTORY_ADS.items():\n",
" \n",
" # Clicks in the past\n",
" merge['prev_'+fname] = merge.groupby(fset).cumcount().rename('prev_'+fname)\n",
" \n",
" # Clicks in the future\n",
" merge['future_'+fname] = merge.iloc[::-1].groupby(fset).cumcount().rename('future_'+fname).iloc[::-1]\n",
"\n",
"# Count cumulative subsequent clicks\n",
"print(merge.shape)"
]
},
Expand All @@ -312,42 +303,26 @@
"metadata": {},
"outputs": [],
"source": [
"# Define all the groupby transformations\n",
"\n",
"GROUPBY_AGGREGATIONS = [\n",
" # V1 - GroupBy Features #\n",
" ######################### \n",
" # Variance in day, for user_id-prod-campaign_id\n",
" {'groupby': ['user_id','prod','campaign_id'], 'select': 'day', 'agg': 'var'},\n",
" # Variance in hour, for user_id-prod-product_category_1\n",
" {'groupby': ['user_id','prod','product_category_1'], 'select': 'hour', 'agg': 'var'},\n",
" # Variance in hour, for user_id-day-campaign_id\n",
" {'groupby': ['user_id','day','campaign_id'], 'select': 'hour', 'agg': 'var'},\n",
" # Count, for user_id-day-hour'dow','hour'\n",
" {'groupby': ['user_id','day','hour'], 'select': 'campaign_id', 'agg': 'count'},\n",
" # Count, for user_id-prod\n",
" {'groupby': ['user_id', 'prod'], 'select': 'campaign_id', 'agg': 'count'}, \n",
" # Count, for user_id-prod-webpage_id\n",
" {'groupby': ['user_id', 'prod', 'webpage_id'], 'select': 'campaign_id', 'agg': 'count'},\n",
" # Count, for user_id-prod-day-hour\n",
" {'groupby': ['user_id','prod','day','hour'], 'select': 'campaign_id', 'agg': 'count'},\n",
" # Mean hour, for user_id-prod-campaign_id\n",
" {'groupby': ['user_id','prod','campaign_id'], 'select': 'hour', 'agg': 'mean'}, \n",
" \n",
" # V2 - GroupBy Features #\n",
" #########################\n",
" # Average clicks on app by distinct users; is it an app they return to?\n",
"\n",
" {'groupby': ['prod'], \n",
" 'select': 'user_id', \n",
" 'agg': lambda x: float(len(x)) / len(x.unique()), \n",
" 'agg_name': 'AvgprodPerDistinct'\n",
" },\n",
" # How popular is the app or channel?\n",
" {'groupby': ['prod'], 'select': 'campaign_id', 'agg': 'count'},\n",
" {'groupby': ['campaign_id'], 'select': 'prod', 'agg': 'count'},\n",
" \n",
" # V3 - GroupBy Features #\n",
" # https://www.kaggle.com/bk0000/non-blending-lightgbm-model-lb-0-977 #\n",
" ###################################################################### \n",
" {'groupby': ['user_id'], 'select': 'campaign_id', 'agg': 'nunique'}, \n",
" {'groupby': ['user_id'], 'select': 'prod', 'agg': 'nunique'}, \n",
" {'groupby': ['user_id','day'], 'select': 'hour', 'agg': 'nunique'}, \n",
Expand Down Expand Up @@ -399,34 +374,26 @@
"# Apply all the groupby transformations\n",
"for spec in GROUPBY_AGGREGATIONS:\n",
" \n",
" # Name of the aggregation we're applying\n",
"\n",
" agg_name = spec['agg_name'] if 'agg_name' in spec else spec['agg']\n",
" \n",
" # Name of new feature\n",
" new_feature = '{}_{}_{}'.format('_'.join(spec['groupby']), agg_name, spec['select'])\n",
" \n",
" # Info\n",
" print(\"Grouping by {}, and aggregating {} with {}\".format(\n",
" spec['groupby'], spec['select'], agg_name\n",
" ))\n",
" \n",
" # Unique list of features to select\n",
" all_features = list(set(spec['groupby'] + [spec['select']]))\n",
" \n",
" # Perform the groupby\n",
" gp = merge[all_features]. \\\n",
" groupby(spec['groupby'])[spec['select']]. \\\n",
" agg(spec['agg']). \\\n",
" reset_index(). \\\n",
" rename(index=str, columns={spec['select']: new_feature})\n",
" \n",
" # Merge back to X_total\n",
"\n",
" if 'cumcount' == spec['agg']:\n",
" merge[new_feature] = gp[0].values\n",
" else:\n",
" merge = merge.merge(gp, on=spec['groupby'], how='left')\n",
" \n",
" # Clear memory\n",
" del gp\n",
" gc.collect()\n",
"\n",
Expand Down
27 changes: 8 additions & 19 deletions New_XGB_LGB2.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@
}
],
"source": [
"# Find frequency of is_attributed for each unique value in column\n",
"# Find frequency of is_click for each unique value in column\n",
"freqs = {}\n",
"for cols in CLICK_ATTR_CATS:\n",
" \n",
Expand All @@ -195,9 +195,8 @@
" cols, new_feature,group_sizes.max(), np.round(group_sizes.mean(), 2), np.round(group_sizes.median(), 2),\n",
" group_sizes.min()))\n",
" \n",
" # Aggregation function\n",
" # Calculate the click rate. Scale by confidence\n",
" def rate_calculation(x):\n",
" \"\"\"Calculate the click rate. Scale by confidence\"\"\"\n",
" rate = x.sum() / float(x.count())\n",
" conf = np.min([1, np.log(x.count()) / log_group])\n",
" return rate * conf\n",
Expand Down Expand Up @@ -316,9 +315,7 @@
"outputs": [],
"source": [
"# Define all the groupby transformations\n",
"GROUPBY_AGGREGATIONS = [\n",
" # V1 - GroupBy Features #\n",
" ######################### \n",
"GROUPBY_AGGREGATIONS = [ \n",
" # Variance in day, for user_id-prod-campaign_id\n",
" {'groupby': ['user_id','prod','campaign_id'], 'select': 'day', 'agg': 'var'},\n",
" # Variance in hour, for user_id-prod-product_category_1\n",
Expand All @@ -336,21 +333,13 @@
" # Mean hour, for user_id-prod-campaign_id\n",
" {'groupby': ['user_id','prod','campaign_id'], 'select': 'hour', 'agg': 'mean'}, \n",
" \n",
" # V2 - GroupBy Features #\n",
" #########################\n",
" # Average clicks on app by distinct users; is it an app they return to?\n",
" {'groupby': ['prod'], \n",
" 'select': 'user_id', \n",
" 'agg': lambda x: float(len(x)) / len(x.unique()), \n",
" 'agg_name': 'AvgprodPerDistinct'\n",
" },\n",
" # How popular is the app or channel?\n",
" \n",
" {'groupby': ['prod'], 'select': 'user_id', 'agg': lambda x: float(len(x)) / len(x.unique()), \n",
" 'agg_name': 'AvgprodPerDistinct'},\n",
"\n",
" {'groupby': ['prod'], 'select': 'campaign_id', 'agg': 'count'},\n",
" {'groupby': ['campaign_id'], 'select': 'prod', 'agg': 'count'},\n",
" \n",
" # V3 - GroupBy Features #\n",
" # https://www.kaggle.com/bk0000/non-blending-lightgbm-model-lb-0-977 #\n",
" ###################################################################### \n",
" {'groupby': ['user_id'], 'select': 'campaign_id', 'agg': 'nunique'}, \n",
" {'groupby': ['user_id'], 'select': 'prod', 'agg': 'nunique'}, \n",
" {'groupby': ['user_id','day'], 'select': 'hour', 'agg': 'nunique'}, \n",
Expand Down Expand Up @@ -423,7 +412,7 @@
" reset_index(). \\\n",
" rename(index=str, columns={spec['select']: new_feature})\n",
" \n",
" # Merge back to X_total\n",
" # Merge back to data\n",
" if 'cumcount' == spec['agg']:\n",
" merge[new_feature] = gp[0].values\n",
" else:\n",
Expand Down
10 changes: 1 addition & 9 deletions XGboost_Basic.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -285,8 +285,6 @@
"source": [
"# Define all the groupby transformations\n",
"GROUPBY_AGGREGATIONS = [\n",
" # V1 - GroupBy Features #\n",
" ######################### \n",
" # Variance in day, for user_id-prod-campaign_id\n",
" {'groupby': ['user_id','prod','campaign_id'], 'select': 'day', 'agg': 'var'},\n",
" # Variance in hour, for user_id-prod-product_category_1\n",
Expand All @@ -304,9 +302,6 @@
" # Mean hour, for user_id-prod-campaign_id\n",
" {'groupby': ['user_id','prod','campaign_id'], 'select': 'hour', 'agg': 'mean'}, \n",
" \n",
" # V2 - GroupBy Features #\n",
" #########################\n",
" # Average clicks on app by distinct users; is it an app they return to?\n",
" {'groupby': ['prod'], \n",
" 'select': 'user_id', \n",
" 'agg': lambda x: float(len(x)) / len(x.unique()), \n",
Expand All @@ -315,10 +310,7 @@
" # How popular is the app or channel?\n",
" {'groupby': ['prod'], 'select': 'campaign_id', 'agg': 'count'},\n",
" {'groupby': ['campaign_id'], 'select': 'prod', 'agg': 'count'},\n",
" \n",
" # V3 - GroupBy Features #\n",
" # https://www.kaggle.com/bk0000/non-blending-lightgbm-model-lb-0-977 #\n",
" ###################################################################### \n",
"\n",
" {'groupby': ['user_id'], 'select': 'campaign_id', 'agg': 'nunique'}, \n",
" {'groupby': ['user_id'], 'select': 'prod', 'agg': 'nunique'}, \n",
" {'groupby': ['user_id','day'], 'select': 'hour', 'agg': 'nunique'}, \n",
Expand Down

0 comments on commit e9429a7

Please sign in to comment.