diff --git a/New_LGB.ipynb b/New_LGB.ipynb index 2edd4c4..ab62033 100644 --- a/New_LGB.ipynb +++ b/New_LGB.ipynb @@ -206,7 +206,6 @@ } ], "source": [ - "# Find frequency of is_attributed for each unique value in column\n", "freqs = {}\n", "for cols in CLICK_ATTR_CATS:\n", " \n", @@ -345,9 +344,6 @@ "source": [ "# Define all the groupby transformations\n", "GROUPBY_AGGREGATIONS = [\n", - " # V1 - GroupBy Features #\n", - " ######################### \n", - " # Variance in day, for user_id-prod-campaign_id\n", " {'groupby': ['user_id','prod','campaign_id'], 'select': 'day', 'agg': 'var'},\n", " # Variance in hour, for user_id-prod-product_category_1\n", " {'groupby': ['user_id','prod','product_category_1'], 'select': 'hour', 'agg': 'var'},\n", @@ -364,21 +360,15 @@ " # Mean hour, for user_id-prod-campaign_id\n", " {'groupby': ['user_id','prod','campaign_id'], 'select': 'hour', 'agg': 'mean'}, \n", " \n", - " # V2 - GroupBy Features #\n", - " #########################\n", - " # Average clicks on app by distinct users; is it an app they return to?\n", " {'groupby': ['prod'], \n", " 'select': 'user_id', \n", " 'agg': lambda x: float(len(x)) / len(x.unique()), \n", " 'agg_name': 'AvgprodPerDistinct'\n", " },\n", - " # How popular is the app or channel?\n", + "\n", " {'groupby': ['prod'], 'select': 'campaign_id', 'agg': 'count'},\n", " {'groupby': ['campaign_id'], 'select': 'prod', 'agg': 'count'},\n", - " \n", - " # V3 - GroupBy Features #\n", - " # https://www.kaggle.com/bk0000/non-blending-lightgbm-model-lb-0-977 #\n", - " ###################################################################### \n", + "\n", " {'groupby': ['user_id'], 'select': 'campaign_id', 'agg': 'nunique'}, \n", " {'groupby': ['user_id'], 'select': 'prod', 'agg': 'nunique'}, \n", " {'groupby': ['user_id','day'], 'select': 'hour', 'agg': 'nunique'}, \n", @@ -427,7 +417,7 @@ } ], "source": [ - "# Apply all the groupby transformations\n", + "\n", "for spec in GROUPBY_AGGREGATIONS:\n", " \n", " # Name of the aggregation we're applying\n", @@ -451,7 +441,6 @@ " reset_index(). \\\n", " rename(index=str, columns={spec['select']: new_feature})\n", " \n", - " # Merge back to X_total\n", " if 'cumcount' == spec['agg']:\n", " merge[new_feature] = gp[0].values\n", " else:\n", diff --git a/New_XGB.ipynb b/New_XGB.ipynb index 87aa4cb..81962ae 100644 --- a/New_XGB.ipynb +++ b/New_XGB.ipynb @@ -175,7 +175,6 @@ } ], "source": [ - "# Find frequency of is_attributed for each unique value in column\n", "freqs = {}\n", "for cols in CLICK_ATTR_CATS:\n", " \n", @@ -286,23 +285,15 @@ } ], "source": [ - "# Identify the previous ads and history ads\n", - "\n", "HISTORY_ADS = {\n", " 'identical_': ['user_id', 'prod', 'product_category_1', 'webpage_id', 'campaign_id'],\n", " 'user_prods': ['user_id', 'prod']\n", "}\n", "\n", - "# Go through different group-by combinations\n", "for fname, fset in HISTORY_ADS.items():\n", " \n", - " # Clicks in the past\n", " merge['prev_'+fname] = merge.groupby(fset).cumcount().rename('prev_'+fname)\n", - " \n", - " # Clicks in the future\n", " merge['future_'+fname] = merge.iloc[::-1].groupby(fset).cumcount().rename('future_'+fname).iloc[::-1]\n", - "\n", - "# Count cumulative subsequent clicks\n", "print(merge.shape)" ] }, @@ -312,42 +303,26 @@ "metadata": {}, "outputs": [], "source": [ - "# Define all the groupby transformations\n", + "\n", "GROUPBY_AGGREGATIONS = [\n", - " # V1 - GroupBy Features #\n", - " ######################### \n", - " # Variance in day, for user_id-prod-campaign_id\n", " {'groupby': ['user_id','prod','campaign_id'], 'select': 'day', 'agg': 'var'},\n", - " # Variance in hour, for user_id-prod-product_category_1\n", " {'groupby': ['user_id','prod','product_category_1'], 'select': 'hour', 'agg': 'var'},\n", - " # Variance in hour, for user_id-day-campaign_id\n", " {'groupby': ['user_id','day','campaign_id'], 'select': 'hour', 'agg': 'var'},\n", - " # Count, for user_id-day-hour'dow','hour'\n", " {'groupby': ['user_id','day','hour'], 'select': 'campaign_id', 'agg': 'count'},\n", - " # Count, for user_id-prod\n", " {'groupby': ['user_id', 'prod'], 'select': 'campaign_id', 'agg': 'count'}, \n", - " # Count, for user_id-prod-webpage_id\n", " {'groupby': ['user_id', 'prod', 'webpage_id'], 'select': 'campaign_id', 'agg': 'count'},\n", - " # Count, for user_id-prod-day-hour\n", " {'groupby': ['user_id','prod','day','hour'], 'select': 'campaign_id', 'agg': 'count'},\n", - " # Mean hour, for user_id-prod-campaign_id\n", " {'groupby': ['user_id','prod','campaign_id'], 'select': 'hour', 'agg': 'mean'}, \n", " \n", - " # V2 - GroupBy Features #\n", - " #########################\n", - " # Average clicks on app by distinct users; is it an app they return to?\n", + "\n", " {'groupby': ['prod'], \n", " 'select': 'user_id', \n", " 'agg': lambda x: float(len(x)) / len(x.unique()), \n", " 'agg_name': 'AvgprodPerDistinct'\n", " },\n", - " # How popular is the app or channel?\n", " {'groupby': ['prod'], 'select': 'campaign_id', 'agg': 'count'},\n", " {'groupby': ['campaign_id'], 'select': 'prod', 'agg': 'count'},\n", " \n", - " # V3 - GroupBy Features #\n", - " # https://www.kaggle.com/bk0000/non-blending-lightgbm-model-lb-0-977 #\n", - " ###################################################################### \n", " {'groupby': ['user_id'], 'select': 'campaign_id', 'agg': 'nunique'}, \n", " {'groupby': ['user_id'], 'select': 'prod', 'agg': 'nunique'}, \n", " {'groupby': ['user_id','day'], 'select': 'hour', 'agg': 'nunique'}, \n", @@ -399,34 +374,26 @@ "# Apply all the groupby transformations\n", "for spec in GROUPBY_AGGREGATIONS:\n", " \n", - " # Name of the aggregation we're applying\n", + "\n", " agg_name = spec['agg_name'] if 'agg_name' in spec else spec['agg']\n", - " \n", - " # Name of new feature\n", " new_feature = '{}_{}_{}'.format('_'.join(spec['groupby']), agg_name, spec['select'])\n", " \n", - " # Info\n", " print(\"Grouping by {}, and aggregating {} with {}\".format(\n", " spec['groupby'], spec['select'], agg_name\n", " ))\n", " \n", - " # Unique list of features to select\n", " all_features = list(set(spec['groupby'] + [spec['select']]))\n", - " \n", - " # Perform the groupby\n", " gp = merge[all_features]. \\\n", " groupby(spec['groupby'])[spec['select']]. \\\n", " agg(spec['agg']). \\\n", " reset_index(). \\\n", " rename(index=str, columns={spec['select']: new_feature})\n", - " \n", - " # Merge back to X_total\n", + "\n", " if 'cumcount' == spec['agg']:\n", " merge[new_feature] = gp[0].values\n", " else:\n", " merge = merge.merge(gp, on=spec['groupby'], how='left')\n", " \n", - " # Clear memory\n", " del gp\n", " gc.collect()\n", "\n", diff --git a/New_XGB_LGB2.ipynb b/New_XGB_LGB2.ipynb index f058fe3..4a1c527 100644 --- a/New_XGB_LGB2.ipynb +++ b/New_XGB_LGB2.ipynb @@ -178,7 +178,7 @@ } ], "source": [ - "# Find frequency of is_attributed for each unique value in column\n", + "# Find frequency of is_click for each unique value in column\n", "freqs = {}\n", "for cols in CLICK_ATTR_CATS:\n", " \n", @@ -195,9 +195,8 @@ " cols, new_feature,group_sizes.max(), np.round(group_sizes.mean(), 2), np.round(group_sizes.median(), 2),\n", " group_sizes.min()))\n", " \n", - " # Aggregation function\n", + " # Calculate the click rate. Scale by confidence\n", " def rate_calculation(x):\n", - " \"\"\"Calculate the click rate. Scale by confidence\"\"\"\n", " rate = x.sum() / float(x.count())\n", " conf = np.min([1, np.log(x.count()) / log_group])\n", " return rate * conf\n", @@ -316,9 +315,7 @@ "outputs": [], "source": [ "# Define all the groupby transformations\n", - "GROUPBY_AGGREGATIONS = [\n", - " # V1 - GroupBy Features #\n", - " ######################### \n", + "GROUPBY_AGGREGATIONS = [ \n", " # Variance in day, for user_id-prod-campaign_id\n", " {'groupby': ['user_id','prod','campaign_id'], 'select': 'day', 'agg': 'var'},\n", " # Variance in hour, for user_id-prod-product_category_1\n", @@ -336,21 +333,13 @@ " # Mean hour, for user_id-prod-campaign_id\n", " {'groupby': ['user_id','prod','campaign_id'], 'select': 'hour', 'agg': 'mean'}, \n", " \n", - " # V2 - GroupBy Features #\n", - " #########################\n", - " # Average clicks on app by distinct users; is it an app they return to?\n", - " {'groupby': ['prod'], \n", - " 'select': 'user_id', \n", - " 'agg': lambda x: float(len(x)) / len(x.unique()), \n", - " 'agg_name': 'AvgprodPerDistinct'\n", - " },\n", - " # How popular is the app or channel?\n", + " \n", + " {'groupby': ['prod'], 'select': 'user_id', 'agg': lambda x: float(len(x)) / len(x.unique()), \n", + " 'agg_name': 'AvgprodPerDistinct'},\n", + "\n", " {'groupby': ['prod'], 'select': 'campaign_id', 'agg': 'count'},\n", " {'groupby': ['campaign_id'], 'select': 'prod', 'agg': 'count'},\n", " \n", - " # V3 - GroupBy Features #\n", - " # https://www.kaggle.com/bk0000/non-blending-lightgbm-model-lb-0-977 #\n", - " ###################################################################### \n", " {'groupby': ['user_id'], 'select': 'campaign_id', 'agg': 'nunique'}, \n", " {'groupby': ['user_id'], 'select': 'prod', 'agg': 'nunique'}, \n", " {'groupby': ['user_id','day'], 'select': 'hour', 'agg': 'nunique'}, \n", @@ -423,7 +412,7 @@ " reset_index(). \\\n", " rename(index=str, columns={spec['select']: new_feature})\n", " \n", - " # Merge back to X_total\n", + " # Merge back to data\n", " if 'cumcount' == spec['agg']:\n", " merge[new_feature] = gp[0].values\n", " else:\n", diff --git a/XGboost_Basic.ipynb b/XGboost_Basic.ipynb index d3291e6..c3742b5 100644 --- a/XGboost_Basic.ipynb +++ b/XGboost_Basic.ipynb @@ -285,8 +285,6 @@ "source": [ "# Define all the groupby transformations\n", "GROUPBY_AGGREGATIONS = [\n", - " # V1 - GroupBy Features #\n", - " ######################### \n", " # Variance in day, for user_id-prod-campaign_id\n", " {'groupby': ['user_id','prod','campaign_id'], 'select': 'day', 'agg': 'var'},\n", " # Variance in hour, for user_id-prod-product_category_1\n", @@ -304,9 +302,6 @@ " # Mean hour, for user_id-prod-campaign_id\n", " {'groupby': ['user_id','prod','campaign_id'], 'select': 'hour', 'agg': 'mean'}, \n", " \n", - " # V2 - GroupBy Features #\n", - " #########################\n", - " # Average clicks on app by distinct users; is it an app they return to?\n", " {'groupby': ['prod'], \n", " 'select': 'user_id', \n", " 'agg': lambda x: float(len(x)) / len(x.unique()), \n", @@ -315,10 +310,7 @@ " # How popular is the app or channel?\n", " {'groupby': ['prod'], 'select': 'campaign_id', 'agg': 'count'},\n", " {'groupby': ['campaign_id'], 'select': 'prod', 'agg': 'count'},\n", - " \n", - " # V3 - GroupBy Features #\n", - " # https://www.kaggle.com/bk0000/non-blending-lightgbm-model-lb-0-977 #\n", - " ###################################################################### \n", + "\n", " {'groupby': ['user_id'], 'select': 'campaign_id', 'agg': 'nunique'}, \n", " {'groupby': ['user_id'], 'select': 'prod', 'agg': 'nunique'}, \n", " {'groupby': ['user_id','day'], 'select': 'hour', 'agg': 'nunique'}, \n",