Final files

Sinchit · Nov 20, 2018 · e9429a7 · e9429a7
1 parent f8bf25c
commit e9429a7
Show file tree

Hide file tree

Showing 4 changed files with 16 additions and 79 deletions.
diff --git a/New_LGB.ipynb b/New_LGB.ipynb
@@ -206,7 +206,6 @@
     }
    ],
    "source": [
-    "# Find frequency of is_attributed for each unique value in column\n",
     "freqs = {}\n",
     "for cols in CLICK_ATTR_CATS:\n",
     "    \n",
@@ -345,9 +344,6 @@
    "source": [
     "# Define all the groupby transformations\n",
     "GROUPBY_AGGREGATIONS = [\n",
-    "    # V1 - GroupBy Features #\n",
-    "    #########################    \n",
-    "    # Variance in day, for user_id-prod-campaign_id\n",
     "    {'groupby': ['user_id','prod','campaign_id'], 'select': 'day', 'agg': 'var'},\n",
     "    # Variance in hour, for user_id-prod-product_category_1\n",
     "    {'groupby': ['user_id','prod','product_category_1'], 'select': 'hour', 'agg': 'var'},\n",
@@ -364,21 +360,15 @@
     "    # Mean hour, for user_id-prod-campaign_id\n",
     "    {'groupby': ['user_id','prod','campaign_id'], 'select': 'hour', 'agg': 'mean'}, \n",
     "    \n",
-    "    # V2 - GroupBy Features #\n",
-    "    #########################\n",
-    "    # Average clicks on app by distinct users; is it an app they return to?\n",
     "    {'groupby': ['prod'], \n",
     "     'select': 'user_id', \n",
     "     'agg': lambda x: float(len(x)) / len(x.unique()), \n",
     "     'agg_name': 'AvgprodPerDistinct'\n",
     "    },\n",
-    "    # How popular is the app or channel?\n",
+    "\n",
     "    {'groupby': ['prod'], 'select': 'campaign_id', 'agg': 'count'},\n",
     "    {'groupby': ['campaign_id'], 'select': 'prod', 'agg': 'count'},\n",
-    "    \n",
-    "    # V3 - GroupBy Features                                              #\n",
-    "    # https://www.kaggle.com/bk0000/non-blending-lightgbm-model-lb-0-977 #\n",
-    "    ###################################################################### \n",
+    "\n",
     "    {'groupby': ['user_id'], 'select': 'campaign_id', 'agg': 'nunique'}, \n",
     "    {'groupby': ['user_id'], 'select': 'prod', 'agg': 'nunique'}, \n",
     "    {'groupby': ['user_id','day'], 'select': 'hour', 'agg': 'nunique'}, \n",
@@ -427,7 +417,7 @@
     }
    ],
    "source": [
-    "# Apply all the groupby transformations\n",
+    "\n",
     "for spec in GROUPBY_AGGREGATIONS:\n",
     "    \n",
     "    # Name of the aggregation we're applying\n",
@@ -451,7 +441,6 @@
     "        reset_index(). \\\n",
     "        rename(index=str, columns={spec['select']: new_feature})\n",
     "        \n",
-    "    # Merge back to X_total\n",
     "    if 'cumcount' == spec['agg']:\n",
     "        merge[new_feature] = gp[0].values\n",
     "    else:\n",

diff --git a/New_XGB.ipynb b/New_XGB.ipynb
@@ -175,7 +175,6 @@
     }
    ],
    "source": [
-    "# Find frequency of is_attributed for each unique value in column\n",
     "freqs = {}\n",
     "for cols in CLICK_ATTR_CATS:\n",
     "    \n",
@@ -286,23 +285,15 @@
     }
    ],
    "source": [
-    "# Identify the previous ads and history ads\n",
-    "\n",
     "HISTORY_ADS = {\n",
     "    'identical_': ['user_id', 'prod', 'product_category_1', 'webpage_id', 'campaign_id'],\n",
     "    'user_prods': ['user_id', 'prod']\n",
     "}\n",
     "\n",
-    "# Go through different group-by combinations\n",
     "for fname, fset in HISTORY_ADS.items():\n",
     "    \n",
-    "    # Clicks in the past\n",
     "    merge['prev_'+fname] = merge.groupby(fset).cumcount().rename('prev_'+fname)\n",
-    "        \n",
-    "    # Clicks in the future\n",
     "    merge['future_'+fname] = merge.iloc[::-1].groupby(fset).cumcount().rename('future_'+fname).iloc[::-1]\n",
-    "\n",
-    "# Count cumulative subsequent clicks\n",
     "print(merge.shape)"
    ]
   },
@@ -312,42 +303,26 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Define all the groupby transformations\n",
+    "\n",
     "GROUPBY_AGGREGATIONS = [\n",
-    "    # V1 - GroupBy Features #\n",
-    "    #########################    \n",
-    "    # Variance in day, for user_id-prod-campaign_id\n",
     "    {'groupby': ['user_id','prod','campaign_id'], 'select': 'day', 'agg': 'var'},\n",
-    "    # Variance in hour, for user_id-prod-product_category_1\n",
     "    {'groupby': ['user_id','prod','product_category_1'], 'select': 'hour', 'agg': 'var'},\n",
-    "    # Variance in hour, for user_id-day-campaign_id\n",
     "    {'groupby': ['user_id','day','campaign_id'], 'select': 'hour', 'agg': 'var'},\n",
-    "    # Count, for user_id-day-hour'dow','hour'\n",
     "    {'groupby': ['user_id','day','hour'], 'select': 'campaign_id', 'agg': 'count'},\n",
-    "    # Count, for user_id-prod\n",
     "    {'groupby': ['user_id', 'prod'], 'select': 'campaign_id', 'agg': 'count'},        \n",
-    "    # Count, for user_id-prod-webpage_id\n",
     "    {'groupby': ['user_id', 'prod', 'webpage_id'], 'select': 'campaign_id', 'agg': 'count'},\n",
-    "    # Count, for user_id-prod-day-hour\n",
     "    {'groupby': ['user_id','prod','day','hour'], 'select': 'campaign_id', 'agg': 'count'},\n",
-    "    # Mean hour, for user_id-prod-campaign_id\n",
     "    {'groupby': ['user_id','prod','campaign_id'], 'select': 'hour', 'agg': 'mean'}, \n",
     "    \n",
-    "    # V2 - GroupBy Features #\n",
-    "    #########################\n",
-    "    # Average clicks on app by distinct users; is it an app they return to?\n",
+    "\n",
     "    {'groupby': ['prod'], \n",
     "     'select': 'user_id', \n",
     "     'agg': lambda x: float(len(x)) / len(x.unique()), \n",
     "     'agg_name': 'AvgprodPerDistinct'\n",
     "    },\n",
-    "    # How popular is the app or channel?\n",
     "    {'groupby': ['prod'], 'select': 'campaign_id', 'agg': 'count'},\n",
     "    {'groupby': ['campaign_id'], 'select': 'prod', 'agg': 'count'},\n",
     "    \n",
-    "    # V3 - GroupBy Features                                              #\n",
-    "    # https://www.kaggle.com/bk0000/non-blending-lightgbm-model-lb-0-977 #\n",
-    "    ###################################################################### \n",
     "    {'groupby': ['user_id'], 'select': 'campaign_id', 'agg': 'nunique'}, \n",
     "    {'groupby': ['user_id'], 'select': 'prod', 'agg': 'nunique'}, \n",
     "    {'groupby': ['user_id','day'], 'select': 'hour', 'agg': 'nunique'}, \n",
@@ -399,34 +374,26 @@
     "# Apply all the groupby transformations\n",
     "for spec in GROUPBY_AGGREGATIONS:\n",
     "    \n",
-    "    # Name of the aggregation we're applying\n",
+    "\n",
     "    agg_name = spec['agg_name'] if 'agg_name' in spec else spec['agg']\n",
-    "    \n",
-    "    # Name of new feature\n",
     "    new_feature = '{}_{}_{}'.format('_'.join(spec['groupby']), agg_name, spec['select'])\n",
     "    \n",
-    "    # Info\n",
     "    print(\"Grouping by {}, and aggregating {} with {}\".format(\n",
     "        spec['groupby'], spec['select'], agg_name\n",
     "    ))\n",
     "    \n",
-    "    # Unique list of features to select\n",
     "    all_features = list(set(spec['groupby'] + [spec['select']]))\n",
-    "    \n",
-    "    # Perform the groupby\n",
     "    gp = merge[all_features]. \\\n",
     "        groupby(spec['groupby'])[spec['select']]. \\\n",
     "        agg(spec['agg']). \\\n",
     "        reset_index(). \\\n",
     "        rename(index=str, columns={spec['select']: new_feature})\n",
-    "        \n",
-    "    # Merge back to X_total\n",
+    "\n",
     "    if 'cumcount' == spec['agg']:\n",
     "        merge[new_feature] = gp[0].values\n",
     "    else:\n",
     "        merge = merge.merge(gp, on=spec['groupby'], how='left')\n",
     "        \n",
-    "     # Clear memory\n",
     "    del gp\n",
     "    gc.collect()\n",
     "\n",

diff --git a/New_XGB_LGB2.ipynb b/New_XGB_LGB2.ipynb
@@ -178,7 +178,7 @@
     }
    ],
    "source": [
-    "# Find frequency of is_attributed for each unique value in column\n",
+    "# Find frequency of is_click for each unique value in column\n",
     "freqs = {}\n",
     "for cols in CLICK_ATTR_CATS:\n",
     "    \n",
@@ -195,9 +195,8 @@
     "        cols, new_feature,group_sizes.max(), np.round(group_sizes.mean(), 2), np.round(group_sizes.median(), 2),\n",
     "        group_sizes.min()))\n",
     "    \n",
-    "    # Aggregation function\n",
+    "    # Calculate the click rate. Scale by confidence\n",
     "    def rate_calculation(x):\n",
-    "        \"\"\"Calculate the click rate. Scale by confidence\"\"\"\n",
     "        rate = x.sum() / float(x.count())\n",
     "        conf = np.min([1, np.log(x.count()) / log_group])\n",
     "        return rate * conf\n",
@@ -316,9 +315,7 @@
    "outputs": [],
    "source": [
     "# Define all the groupby transformations\n",
-    "GROUPBY_AGGREGATIONS = [\n",
-    "    # V1 - GroupBy Features #\n",
-    "    #########################    \n",
+    "GROUPBY_AGGREGATIONS = [  \n",
     "    # Variance in day, for user_id-prod-campaign_id\n",
     "    {'groupby': ['user_id','prod','campaign_id'], 'select': 'day', 'agg': 'var'},\n",
     "    # Variance in hour, for user_id-prod-product_category_1\n",
@@ -336,21 +333,13 @@
     "    # Mean hour, for user_id-prod-campaign_id\n",
     "    {'groupby': ['user_id','prod','campaign_id'], 'select': 'hour', 'agg': 'mean'}, \n",
     "    \n",
-    "    # V2 - GroupBy Features #\n",
-    "    #########################\n",
-    "    # Average clicks on app by distinct users; is it an app they return to?\n",
-    "    {'groupby': ['prod'], \n",
-    "     'select': 'user_id', \n",
-    "     'agg': lambda x: float(len(x)) / len(x.unique()), \n",
-    "     'agg_name': 'AvgprodPerDistinct'\n",
-    "    },\n",
-    "    # How popular is the app or channel?\n",
+    "   \n",
+    "    {'groupby': ['prod'], 'select': 'user_id', 'agg': lambda x: float(len(x)) / len(x.unique()), \n",
+    "     'agg_name': 'AvgprodPerDistinct'},\n",
+    "\n",
     "    {'groupby': ['prod'], 'select': 'campaign_id', 'agg': 'count'},\n",
     "    {'groupby': ['campaign_id'], 'select': 'prod', 'agg': 'count'},\n",
     "    \n",
-    "    # V3 - GroupBy Features                                              #\n",
-    "    # https://www.kaggle.com/bk0000/non-blending-lightgbm-model-lb-0-977 #\n",
-    "    ###################################################################### \n",
     "    {'groupby': ['user_id'], 'select': 'campaign_id', 'agg': 'nunique'}, \n",
     "    {'groupby': ['user_id'], 'select': 'prod', 'agg': 'nunique'}, \n",
     "    {'groupby': ['user_id','day'], 'select': 'hour', 'agg': 'nunique'}, \n",
@@ -423,7 +412,7 @@
     "        reset_index(). \\\n",
     "        rename(index=str, columns={spec['select']: new_feature})\n",
     "        \n",
-    "    # Merge back to X_total\n",
+    "    # Merge back to data\n",
     "    if 'cumcount' == spec['agg']:\n",
     "        merge[new_feature] = gp[0].values\n",
     "    else:\n",

diff --git a/XGboost_Basic.ipynb b/XGboost_Basic.ipynb
@@ -285,8 +285,6 @@
    "source": [
     "# Define all the groupby transformations\n",
     "GROUPBY_AGGREGATIONS = [\n",
-    "    # V1 - GroupBy Features #\n",
-    "    #########################    \n",
     "    # Variance in day, for user_id-prod-campaign_id\n",
     "    {'groupby': ['user_id','prod','campaign_id'], 'select': 'day', 'agg': 'var'},\n",
     "    # Variance in hour, for user_id-prod-product_category_1\n",
@@ -304,9 +302,6 @@
     "    # Mean hour, for user_id-prod-campaign_id\n",
     "    {'groupby': ['user_id','prod','campaign_id'], 'select': 'hour', 'agg': 'mean'}, \n",
     "    \n",
-    "    # V2 - GroupBy Features #\n",
-    "    #########################\n",
-    "    # Average clicks on app by distinct users; is it an app they return to?\n",
     "    {'groupby': ['prod'], \n",
     "     'select': 'user_id', \n",
     "     'agg': lambda x: float(len(x)) / len(x.unique()), \n",
@@ -315,10 +310,7 @@
     "    # How popular is the app or channel?\n",
     "    {'groupby': ['prod'], 'select': 'campaign_id', 'agg': 'count'},\n",
     "    {'groupby': ['campaign_id'], 'select': 'prod', 'agg': 'count'},\n",
-    "    \n",
-    "    # V3 - GroupBy Features                                              #\n",
-    "    # https://www.kaggle.com/bk0000/non-blending-lightgbm-model-lb-0-977 #\n",
-    "    ###################################################################### \n",
+    "\n",
     "    {'groupby': ['user_id'], 'select': 'campaign_id', 'agg': 'nunique'}, \n",
     "    {'groupby': ['user_id'], 'select': 'prod', 'agg': 'nunique'}, \n",
     "    {'groupby': ['user_id','day'], 'select': 'hour', 'agg': 'nunique'}, \n",