From 70c8bb0a187f95a11ac2bdfe07f4fe06fb132b89 Mon Sep 17 00:00:00 2001
From: vladislavovich-d <48733961+vladislavovich-d@users.noreply.github.com>
Date: Mon, 13 May 2019 12:23:18 +0300
Subject: [PATCH] Add files via upload

---
 .../NB_Classif.ipynb | 438 ++++++++++++++++++
 1 file changed, 438 insertions(+)
 create mode 100644 Naive_Bayes_text_classification/NB_Classif.ipynb

diff --git a/Naive_Bayes_text_classification/NB_Classif.ipynb b/Naive_Bayes_text_classification/NB_Classif.ipynb
new file mode 100644
index 0000000..51afb2f
--- /dev/null
+++ b/Naive_Bayes_text_classification/NB_Classif.ipynb
@@ -0,0 +1,438 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Naive Bayes Text Classification"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 81,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from collections import Counter"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 82,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       " Text Class\n",
+       "0 One of a kind Money maker Try it for free Fro... 0\n",
+       "1 link to my webcam you wanted Wanna see sexuall... 0\n",
+       "2 Re How to manage multiple Internet connection... 1\n",
+       "3 [SPAM] Give her hour rodeoEnhance your desi... 0\n",
+       "4 Best Price on the netf f m suddenlysusan Sto... 0"
+      ]
+     },
+     "execution_count": 82,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# load the labelled dataset: message text and class label\n",
+    "data = pd.read_csv('data.csv', sep = ',', header = None)\n",
+    "data.columns = ['Text', 'Class']\n",
+    "data.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Splitting Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 83,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def split(data, numtest = 0.2):\n",
+    "    # numtest - share of the dataset reserved for testing\n",
+    "    ltest = int(data.shape[0] * numtest)  # total number of test samples\n",
+    "    half = int(ltest / 2)                 # test samples taken from each class\n",
+    "    # the first `half` samples of each class go to the test set, the rest to the train set\n",
+    "    test_data_1 = data[data.Class == 1][:half]\n",
+    "    train_data_1 = data[data.Class == 1][half:]\n",
+    "    test_data_0 = data[data.Class == 0][:half]\n",
+    "    train_data_0 = data[data.Class == 0][half:]\n",
+    "    # concatenate the per-class parts into the full test and train sets\n",
+    "    test = pd.concat((test_data_1, test_data_0))\n",
+    "    train = pd.concat((train_data_1, train_data_0))\n",
+    "    return train, test"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 84,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "train shape (896, 2), test_shape (222, 2)\n"
+     ]
+    }
+   ],
+   "source": [
+    "train_data, test_data = split(data, numtest = 0.2)\n",
+    "print(f'train shape {train_data.shape}, test_shape {test_data.shape}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 85,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def filtres(string):\n",
+    "    string = string.lower()  # lowercase the text\n",
+    "    patt = r'[a-z]{3,}'      # keep alphabetic tokens of at least three letters\n",
+    "    words = re.findall(patt, string)\n",
+    "    return words"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Naive Bayes Functions"
+   ]
+  },
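+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For reference, the decision rule that `fit` and `predict` below compute is multinomial Naive Bayes with Laplace (add-one) smoothing. `fit` estimates the class priors $P(c) = n_c / n$ from the labels and counts word occurrences per class; `predict` then scores a document $d$ for each class $c \\in \\{0, 1\\}$ as\n",
+    "\n",
+    "$$P(c \\mid d) \\propto P(c) \\prod_{w \\in d} \\frac{count(w, c) + 1}{N_c + V},$$\n",
+    "\n",
+    "where $count(w, c)$ is how often word $w$ occurs in class $c$, $N_c$ is the total number of word occurrences in class $c$, and $V$ is the vocabulary size, and picks the class with the larger score."
+   ]
+  },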
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TextClass
0One of a kind Money maker Try it for free Fro...0
1link to my webcam you wanted Wanna see sexuall...0
2Re How to manage multiple Internet connection...1
3[SPAM] Give her hour rodeoEnhance your desi...0
4Best Price on the netf f m suddenlysusan Sto...0
\n", + "
" + ], + "text/plain": [ + " Text Class\n", + "0 One of a kind Money maker Try it for free Fro... 0\n", + "1 link to my webcam you wanted Wanna see sexuall... 0\n", + "2 Re How to manage multiple Internet connection... 1\n", + "3 [SPAM] Give her hour rodeoEnhance your desi... 0\n", + "4 Best Price on the netf f m suddenlysusan Sto... 0" + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#data initializing\n", + "data = pd.read_csv('data.csv', sep = ',', header = None)\n", + "data.columns = ['Text', 'Class']\n", + "data.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Spliting Data" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [], + "source": [ + "def split(data, numtest = 0.2):\n", + " #choice test semples \n", + " ltest = int(data.shape[0] * numtest) # 0 - Class of Spam semples\n", + " half = int(ltest / 2) # half - half of test dataset len\n", + " class1 = data[data.Class == 1].shape[0] #all samples 1st class\n", + " class0 = data[data.Class == 0].shape[0] #all samples 0 class\n", + " #choice test and train sets for 1 and 0 classes\n", + " test_data_1 = data[data.Class == 1][:half]\n", + " train_data_1 = data[data.Class == 1][half:]\n", + " test_data_0 = data[data.Class == 0][:half]\n", + " train_data_0 = data[data.Class == 0][half:]\n", + " #choice test and train sets for all dataset\n", + " test=pd.concat((test_data_1,test_data_0))\n", + " train=pd.concat((train_data_1,train_data_0))\n", + " return train, test" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "train shape (896, 2), test_shape (222, 2)\n" + ] + } + ], + "source": [ + "train_data, test_data = split(data, numtest = 0.2)\n", + "print(f'train shape {train_data.shape}, test_shape {test_data.shape}')" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [], + "source": [ + "def filtres(string):\n", + " string = string.lower() #lowering data text\n", + " patt = r'[A-Z,a-z]{3,}' #Sort strings \n", + " words = re.findall(patt, string)\n", + " return words" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Bayes Funcs" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [], + "source": [ + "def fit(x, Y):\n", + " \n", + " w0 = []\n", + " w1 = []\n", + " w01 = []\n", + " x = np.array(x)\n", + " Y = np.array(Y)\n", + " for i in range (x.shape[0]):\n", + " w01 += filtres(x[i].lower())\n", + " if Y[i] == 0:\n", + " w0 += filtres(x[i].lower())\n", + " if Y[i] == 1:\n", + " w1 += filtres(x[i].lower()) \n", + " \n", + " wfreq = Counter(w01)\n", + " freq0 = Counter(w0)\n", + " freq1 = Counter(w1)\n", + " cl0_n = Counter(Y)[0]\n", + " cl1_n = Counter(Y)[1]\n", + " pcl_0 = cl0_n / Y.shape[0]\n", + " pcl_1 = cl1_n / Y.shape[0]\n", + " \n", + " V = len(wfreq.keys())\n", + " N0 = len(w0)\n", + " N1 = len(w1)\n", + " \n", + " print(f'prob_class = [{pcl_0},{pcl_1}], N_classes = [{N0},{N1}], Vocabulary = {V}')\n", + " \n", + " return [pcl_0, pcl_1], [N0,N1], V, [freq0, freq1]" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [], + "source": [ + "def predict(x, prob_classes, n_classes, V, freq):\n", + " x = np.array(x)\n", + " pcl_0 = prob_classes[0]\n", + " pcl_1 = prob_classes[1]\n", + " N0 = n_classes[0]\n", + " N1 = n_classes[1]\n", + " \n", + " 
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Test"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 90,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# a small hand-made example to sanity-check the classifier\n",
+    "train = ['Captain Morgan', 'Captain Jack', 'Captain John', 'Pirate Jack']\n",
+    "Class = [0,0,0,1]\n",
+    "test = ['Captain Jack Captain John Pirate Jack']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 91,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "prob_class = [0.33989266547406083,0.6601073345259392], N_classes = [6,2], Vocabulary = 5\n",
+      "accuracy = 66.01073345259391%\n"
+     ]
+    }
+   ],
+   "source": [
+    "prob_classes, n_classes, V, freq = fit(train, Class)\n",
+    "predictions = predict(test, prob_classes, n_classes, V, freq)\n",
+    "print(f'predicted class = {predictions}')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## TF-IDF"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 92,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def fit_tf(Z):\n",
+    "    Z['words'] = Z.Text.apply(filtres)\n",
+    "    # token lists: all documents, class-0 documents, class-1 documents\n",
+    "    w01 = Z.words.sum()\n",
+    "    w0 = Z[Z.Class == 0].words.sum()\n",
+    "    w1 = Z[Z.Class == 1].words.sum()\n",
+    "    wfreq = Counter(w01)\n",
+    "    freq0 = Counter(w0)\n",
+    "    freq1 = Counter(w1)\n",
+    "    # document frequencies per class (the IDF denominators)\n",
+    "    idf_dic0 = {}\n",
+    "    idf_dic1 = {}\n",
+    "    ncl_0 = Z[Z.Class == 0].shape[0]\n",
+    "    ncl_1 = Z[Z.Class == 1].shape[0]\n",
+    "    for w in freq0.keys():\n",
+    "        idf_dic0[w] = np.sum([w in Z[Z.Class == 0].words.iloc[i] for i in range(ncl_0)])\n",
+    "    for w in freq1.keys():\n",
+    "        idf_dic1[w] = np.sum([w in Z[Z.Class == 1].words.iloc[i] for i in range(ncl_1)])\n",
+    "    \n",
+    "    cl0_n = Counter(Z.Class)[0]\n",
+    "    cl1_n = Counter(Z.Class)[1]\n",
+    "    \n",
+    "    pcl_0 = cl0_n / Z.shape[0]\n",
+    "    pcl_1 = cl1_n / Z.shape[0]\n",
+    "    V = len(wfreq.keys())\n",
+    "    N0 = len(w0)\n",
+    "    N1 = len(w1)\n",
+    "    \n",
+    "    return [pcl_0, pcl_1], [N0,N1], V, [freq0, freq1], [idf_dic0, idf_dic1]"
+   ]
+  },
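+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For reference, `predict_tf` below weights each word $w$ of a document $d$ for class $c$ as\n",
+    "\n",
+    "$$\\mathrm{tfidf}(w, d, c) = \\frac{count(w, d)}{|d|} \\cdot \\log\\frac{n_c}{df_c(w)},$$\n",
+    "\n",
+    "where $n_c$ is the number of training documents of class $c$ (passed in as `n_classes`) and $df_c(w)$ is the number of class-$c$ documents containing $w$. The per-word weights are multiplied together with the class prior, mirroring the Naive Bayes product above; for a word never seen in class $c$ the code falls back to $df_c(w) \\approx n_c$, which makes the log factor close to zero."
+   ]
+  },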
+  {
+   "cell_type": "code",
+   "execution_count": 93,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def predict_tf(x, prob_classes, n_classes, V, freq, idf_dic):\n",
+    "\n",
+    "    x = np.array(x)\n",
+    "    pcl_0 = prob_classes[0]\n",
+    "    pcl_1 = prob_classes[1]\n",
+    "    N0 = n_classes[0]\n",
+    "    N1 = n_classes[1]\n",
+    "    \n",
+    "    freq0 = freq[0]\n",
+    "    freq1 = freq[1]\n",
+    "    \n",
+    "    idf_dic0 = idf_dic[0]\n",
+    "    idf_dic1 = idf_dic[1]\n",
+    "    \n",
+    "    predict = []\n",
+    "    \n",
+    "    for j in x:\n",
+    "        words = filtres(j)\n",
+    "        words_freq = Counter(words)\n",
+    "        P0 = 1\n",
+    "        P1 = 1\n",
+    "        for i in words:\n",
+    "            # tf-idf weight: term frequency in the document times log(n_docs / document frequency)\n",
+    "            pwc_0 = (words_freq[i] / len(words)) * np.log(N0/idf_dic0.get(i,(N0-0.00001)))\n",
+    "            pwc_1 = (words_freq[i] / len(words)) * np.log(N1/idf_dic1.get(i,(N1-0.00001)))\n",
+    "            P0 *= pwc_0\n",
+    "            P1 *= pwc_1\n",
+    "        P0 *= pcl_0\n",
+    "        P1 *= pcl_1\n",
+    "        \n",
+    "        if P0 > P1:\n",
+    "            predict.append(0)\n",
+    "        else:\n",
+    "            predict.append(1)\n",
+    "    \n",
+    "    return predict"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prob_classes, n_classes, V, freq, idf_dic = fit_tf(train_data)\n",
+    "\n",
+    "# number of training documents per class, used as the IDF numerator\n",
+    "x0 = train_data[train_data.Class == 0].shape[0]\n",
+    "x1 = train_data[train_data.Class == 1].shape[0]\n",
+    "\n",
+    "predictions = predict_tf(test_data.Text, prob_classes, [x0,x1], V, freq, idf_dic)\n",
+    "print(f'Accuracy with TF-IDF = {accuracy(test_data.Class,predictions)*100}%')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}