From 70c8bb0a187f95a11ac2bdfe07f4fe06fb132b89 Mon Sep 17 00:00:00 2001
From: vladislavovich-d <48733961+vladislavovich-d@users.noreply.github.com>
Date: Mon, 13 May 2019 12:23:18 +0300
Subject: [PATCH] Add files via upload

---
 .../NB_Classif.ipynb | 438 ++++++++++++++++++
 1 file changed, 438 insertions(+)
 create mode 100644 Naive_Bayes_text_classification/NB_Classif.ipynb

diff --git a/Naive_Bayes_text_classification/NB_Classif.ipynb b/Naive_Bayes_text_classification/NB_Classif.ipynb
new file mode 100644
index 0000000..51afb2f
--- /dev/null
+++ b/Naive_Bayes_text_classification/NB_Classif.ipynb
@@ -0,0 +1,438 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Naive Bayes Text Classification"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 81,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from collections import Counter"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 82,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       " Text Class\n",
+       "0 One of a kind Money maker Try it for free Fro... 0\n",
+       "1 link to my webcam you wanted Wanna see sexuall... 0\n",
+       "2 Re How to manage multiple Internet connection... 1\n",
+       "3 [SPAM] Give her hour rodeoEnhance your desi... 0\n",
+       "4 Best Price on the netf f m suddenlysusan Sto... 0"
+      ]
+     },
+     "execution_count": 82,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# load the labelled dataset: message text and class label\n",
+    "data = pd.read_csv('data.csv', sep = ',', header = None)\n",
+    "data.columns = ['Text', 'Class']\n",
+    "data.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Splitting Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 83,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def split(data, numtest = 0.2):\n",
+    "    # numtest - share of the dataset reserved for testing\n",
+    "    ltest = int(data.shape[0] * numtest)  # total number of test samples\n",
+    "    half = int(ltest / 2)                 # test samples taken from each class\n",
+    "    # the first `half` samples of each class go to the test set, the rest to the train set\n",
+    "    test_data_1 = data[data.Class == 1][:half]\n",
+    "    train_data_1 = data[data.Class == 1][half:]\n",
+    "    test_data_0 = data[data.Class == 0][:half]\n",
+    "    train_data_0 = data[data.Class == 0][half:]\n",
+    "    # concatenate the per-class parts into the full test and train sets\n",
+    "    test = pd.concat((test_data_1, test_data_0))\n",
+    "    train = pd.concat((train_data_1, train_data_0))\n",
+    "    return train, test"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 84,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "train shape (896, 2), test_shape (222, 2)\n"
+     ]
+    }
+   ],
+   "source": [
+    "train_data, test_data = split(data, numtest = 0.2)\n",
+    "print(f'train shape {train_data.shape}, test_shape {test_data.shape}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 85,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def filtres(string):\n",
+    "    string = string.lower()  # lowercase the text\n",
+    "    patt = r'[a-z]{3,}'      # keep alphabetic tokens of at least three letters\n",
+    "    words = re.findall(patt, string)\n",
+    "    return words"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Naive Bayes Functions"
+   ]
+  },
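+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For reference, the decision rule that `fit` and `predict` below compute is multinomial Naive Bayes with Laplace (add-one) smoothing. `fit` estimates the class priors $P(c) = n_c / n$ from the labels and counts word occurrences per class; `predict` then scores a document $d$ for each class $c \\in \\{0, 1\\}$ as\n",
+    "\n",
+    "$$P(c \\mid d) \\propto P(c) \\prod_{w \\in d} \\frac{count(w, c) + 1}{N_c + V},$$\n",
+    "\n",
+    "where $count(w, c)$ is how often word $w$ occurs in class $c$, $N_c$ is the total number of word occurrences in class $c$, and $V$ is the vocabulary size, and picks the class with the larger score."
+   ]
+  },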
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TextClass
0One of a kind Money maker Try it for free Fro...0
1link to my webcam you wanted Wanna see sexuall...0
2Re How to manage multiple Internet connection...1
3[SPAM] Give her hour rodeoEnhance your desi...0
4Best Price on the netf f m suddenlysusan Sto...0
\n", + "
" + ], + "text/plain": [ + " Text Class\n", + "0 One of a kind Money maker Try it for free Fro... 0\n", + "1 link to my webcam you wanted Wanna see sexuall... 0\n", + "2 Re How to manage multiple Internet connection... 1\n", + "3 [SPAM] Give her hour rodeoEnhance your desi... 0\n", + "4 Best Price on the netf f m suddenlysusan Sto... 0" + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#data initializing\n", + "data = pd.read_csv('data.csv', sep = ',', header = None)\n", + "data.columns = ['Text', 'Class']\n", + "data.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Spliting Data" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [], + "source": [ + "def split(data, numtest = 0.2):\n", + " #choice test semples \n", + " ltest = int(data.shape[0] * numtest) # 0 - Class of Spam semples\n", + " half = int(ltest / 2) # half - half of test dataset len\n", + " class1 = data[data.Class == 1].shape[0] #all samples 1st class\n", + " class0 = data[data.Class == 0].shape[0] #all samples 0 class\n", + " #choice test and train sets for 1 and 0 classes\n", + " test_data_1 = data[data.Class == 1][:half]\n", + " train_data_1 = data[data.Class == 1][half:]\n", + " test_data_0 = data[data.Class == 0][:half]\n", + " train_data_0 = data[data.Class == 0][half:]\n", + " #choice test and train sets for all dataset\n", + " test=pd.concat((test_data_1,test_data_0))\n", + " train=pd.concat((train_data_1,train_data_0))\n", + " return train, test" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "train shape (896, 2), test_shape (222, 2)\n" + ] + } + ], + "source": [ + "train_data, test_data = split(data, numtest = 0.2)\n", + "print(f'train shape {train_data.shape}, test_shape {test_data.shape}')" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [], + "source": [ + "def filtres(string):\n", + " string = string.lower() #lowering data text\n", + " patt = r'[A-Z,a-z]{3,}' #Sort strings \n", + " words = re.findall(patt, string)\n", + " return words" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Bayes Funcs" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [], + "source": [ + "def fit(x, Y):\n", + " \n", + " w0 = []\n", + " w1 = []\n", + " w01 = []\n", + " x = np.array(x)\n", + " Y = np.array(Y)\n", + " for i in range (x.shape[0]):\n", + " w01 += filtres(x[i].lower())\n", + " if Y[i] == 0:\n", + " w0 += filtres(x[i].lower())\n", + " if Y[i] == 1:\n", + " w1 += filtres(x[i].lower()) \n", + " \n", + " wfreq = Counter(w01)\n", + " freq0 = Counter(w0)\n", + " freq1 = Counter(w1)\n", + " cl0_n = Counter(Y)[0]\n", + " cl1_n = Counter(Y)[1]\n", + " pcl_0 = cl0_n / Y.shape[0]\n", + " pcl_1 = cl1_n / Y.shape[0]\n", + " \n", + " V = len(wfreq.keys())\n", + " N0 = len(w0)\n", + " N1 = len(w1)\n", + " \n", + " print(f'prob_class = [{pcl_0},{pcl_1}], N_classes = [{N0},{N1}], Vocabulary = {V}')\n", + " \n", + " return [pcl_0, pcl_1], [N0,N1], V, [freq0, freq1]" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [], + "source": [ + "def predict(x, prob_classes, n_classes, V, freq):\n", + " x = np.array(x)\n", + " pcl_0 = prob_classes[0]\n", + " pcl_1 = prob_classes[1]\n", + " N0 = n_classes[0]\n", + " N1 = n_classes[1]\n", + " \n", + " 
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Test"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 90,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# a small hand-made example to sanity-check the classifier\n",
+    "train = ['Captain Morgan', 'Captain Jack', 'Captain John', 'Pirate Jack']\n",
+    "Class = [0,0,0,1]\n",
+    "test = ['Captain Jack Captain John Pirate Jack']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 91,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "prob_class = [0.33989266547406083,0.6601073345259392], N_classes = [6,2], Vocabulary = 5\n",
+      "accuracy = 66.01073345259391%\n"
+     ]
+    }
+   ],
+   "source": [
+    "prob_classes, n_classes, V, freq = fit(train, Class)\n",
+    "predictions = predict(test, prob_classes, n_classes, V, freq)\n",
+    "print(f'predicted class = {predictions}')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## TF-IDF"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 92,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def fit_tf(Z):\n",
+    "    Z['words'] = Z.Text.apply(filtres)\n",
+    "    # token lists: all documents, class-0 documents, class-1 documents\n",
+    "    w01 = Z.words.sum()\n",
+    "    w0 = Z[Z.Class == 0].words.sum()\n",
+    "    w1 = Z[Z.Class == 1].words.sum()\n",
+    "    wfreq = Counter(w01)\n",
+    "    freq0 = Counter(w0)\n",
+    "    freq1 = Counter(w1)\n",
+    "    # document frequencies per class (the IDF denominators)\n",
+    "    idf_dic0 = {}\n",
+    "    idf_dic1 = {}\n",
+    "    ncl_0 = Z[Z.Class == 0].shape[0]\n",
+    "    ncl_1 = Z[Z.Class == 1].shape[0]\n",
+    "    for w in freq0.keys():\n",
+    "        idf_dic0[w] = np.sum([w in Z[Z.Class == 0].words.iloc[i] for i in range(ncl_0)])\n",
+    "    for w in freq1.keys():\n",
+    "        idf_dic1[w] = np.sum([w in Z[Z.Class == 1].words.iloc[i] for i in range(ncl_1)])\n",
+    "    \n",
+    "    cl0_n = Counter(Z.Class)[0]\n",
+    "    cl1_n = Counter(Z.Class)[1]\n",
+    "    \n",
+    "    pcl_0 = cl0_n / Z.shape[0]\n",
+    "    pcl_1 = cl1_n / Z.shape[0]\n",
+    "    V = len(wfreq.keys())\n",
+    "    N0 = len(w0)\n",
+    "    N1 = len(w1)\n",
+    "    \n",
+    "    return [pcl_0, pcl_1], [N0,N1], V, [freq0, freq1], [idf_dic0, idf_dic1]"
+   ]
+  },
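+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For reference, `predict_tf` below weights each word $w$ of a document $d$ for class $c$ as\n",
+    "\n",
+    "$$\\mathrm{tfidf}(w, d, c) = \\frac{count(w, d)}{|d|} \\cdot \\log\\frac{n_c}{df_c(w)},$$\n",
+    "\n",
+    "where $n_c$ is the number of training documents of class $c$ (passed in as `n_classes`) and $df_c(w)$ is the number of class-$c$ documents containing $w$. The per-word weights are multiplied together with the class prior, mirroring the Naive Bayes product above; for a word never seen in class $c$ the code falls back to $df_c(w) \\approx n_c$, which makes the log factor close to zero."
+   ]
+  },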
+  {
+   "cell_type": "code",
+   "execution_count": 93,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def predict_tf(x, prob_classes, n_classes, V, freq, idf_dic):\n",
+    "\n",
+    "    x = np.array(x)\n",
+    "    pcl_0 = prob_classes[0]\n",
+    "    pcl_1 = prob_classes[1]\n",
+    "    N0 = n_classes[0]\n",
+    "    N1 = n_classes[1]\n",
+    "    \n",
+    "    freq0 = freq[0]\n",
+    "    freq1 = freq[1]\n",
+    "    \n",
+    "    idf_dic0 = idf_dic[0]\n",
+    "    idf_dic1 = idf_dic[1]\n",
+    "    \n",
+    "    predict = []\n",
+    "    \n",
+    "    for j in x:\n",
+    "        words = filtres(j)\n",
+    "        words_freq = Counter(words)\n",
+    "        P0 = 1\n",
+    "        P1 = 1\n",
+    "        for i in words:\n",
+    "            # tf-idf weight: term frequency in the document times log(n_docs / document frequency)\n",
+    "            pwc_0 = (words_freq[i] / len(words)) * np.log(N0/idf_dic0.get(i,(N0-0.00001)))\n",
+    "            pwc_1 = (words_freq[i] / len(words)) * np.log(N1/idf_dic1.get(i,(N1-0.00001)))\n",
+    "            P0 *= pwc_0\n",
+    "            P1 *= pwc_1\n",
+    "        P0 *= pcl_0\n",
+    "        P1 *= pcl_1\n",
+    "        \n",
+    "        if P0 > P1:\n",
+    "            predict.append(0)\n",
+    "        else:\n",
+    "            predict.append(1)\n",
+    "    \n",
+    "    return predict"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prob_classes, n_classes, V, freq, idf_dic = fit_tf(train_data)\n",
+    "\n",
+    "# number of training documents per class, used as the IDF numerator\n",
+    "x0 = train_data[train_data.Class == 0].shape[0]\n",
+    "x1 = train_data[train_data.Class == 1].shape[0]\n",
+    "\n",
+    "predictions = predict_tf(test_data.Text, prob_classes, [x0,x1], V, freq, idf_dic)\n",
+    "print(f'Accuracy with TF-IDF = {accuracy(test_data.Class,predictions)*100}%')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}