From 70c8bb0a187f95a11ac2bdfe07f4fe06fb132b89 Mon Sep 17 00:00:00 2001
From: vladislavovich-d <48733961+vladislavovich-d@users.noreply.github.com>
Date: Mon, 13 May 2019 12:23:18 +0300
Subject: [PATCH] Add files via upload
---
.../NB_Classif.ipynb | 438 ++++++++++++++++++
1 file changed, 438 insertions(+)
create mode 100644 Naive_Bayes_text_classification/NB_Classif.ipynb
diff --git a/Naive_Bayes_text_classification/NB_Classif.ipynb b/Naive_Bayes_text_classification/NB_Classif.ipynb
new file mode 100644
index 0000000..51afb2f
--- /dev/null
+++ b/Naive_Bayes_text_classification/NB_Classif.ipynb
@@ -0,0 +1,438 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Bayes Classific Code"
+ ]
+ },
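+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A short recap of the rule implemented below: a document $d$ with words $w_1, \\dots, w_n$ is assigned to the class\n",
+ "\n",
+ "$$\\hat{c} = \\arg\\max_c \\; P(c) \\prod_{i=1}^{n} P(w_i \\mid c),$$\n",
+ "\n",
+ "i.e. Bayes' theorem combined with the naive assumption that words are conditionally independent given the class."
+ ]
+ },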
+ {
+ "cell_type": "code",
+ "execution_count": 81,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import re\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from collections import Counter"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 82,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Text | \n",
+ " Class | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " One of a kind Money maker Try it for free Fro... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " link to my webcam you wanted Wanna see sexuall... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Re How to manage multiple Internet connection... | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " [SPAM] Give her hour rodeoEnhance your desi... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Best Price on the netf f m suddenlysusan Sto... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Text Class\n",
+ "0 One of a kind Money maker Try it for free Fro... 0\n",
+ "1 link to my webcam you wanted Wanna see sexuall... 0\n",
+ "2 Re How to manage multiple Internet connection... 1\n",
+ "3 [SPAM] Give her hour rodeoEnhance your desi... 0\n",
+ "4 Best Price on the netf f m suddenlysusan Sto... 0"
+ ]
+ },
+ "execution_count": 82,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#data initializing\n",
+ "data = pd.read_csv('data.csv', sep = ',', header = None)\n",
+ "data.columns = ['Text', 'Class']\n",
+ "data.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Spliting Data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 83,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def split(data, numtest = 0.2):\n",
+ " #choice test semples \n",
+ " ltest = int(data.shape[0] * numtest) # 0 - Class of Spam semples\n",
+ " half = int(ltest / 2) # half - half of test dataset len\n",
+ " class1 = data[data.Class == 1].shape[0] #all samples 1st class\n",
+ " class0 = data[data.Class == 0].shape[0] #all samples 0 class\n",
+ " #choice test and train sets for 1 and 0 classes\n",
+ " test_data_1 = data[data.Class == 1][:half]\n",
+ " train_data_1 = data[data.Class == 1][half:]\n",
+ " test_data_0 = data[data.Class == 0][:half]\n",
+ " train_data_0 = data[data.Class == 0][half:]\n",
+ " #choice test and train sets for all dataset\n",
+ " test=pd.concat((test_data_1,test_data_0))\n",
+ " train=pd.concat((train_data_1,train_data_0))\n",
+ " return train, test"
+ ]
+ },
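+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "`split()` above is deterministic: it takes the first `ltest/2` rows of each class for the test set, without shuffling. A shuffled, stratified split could also be obtained with scikit-learn; this is only an illustrative alternative (`train_df`/`test_df` are hypothetical names), and the rest of the notebook keeps using `split()`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# illustrative alternative split (assumes scikit-learn is installed); not used below\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "train_df, test_df = train_test_split(data, test_size=0.2, stratify=data.Class, random_state=0)"
+ ]
+ },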
+ {
+ "cell_type": "code",
+ "execution_count": 84,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "train shape (896, 2), test_shape (222, 2)\n"
+ ]
+ }
+ ],
+ "source": [
+ "train_data, test_data = split(data, numtest = 0.2)\n",
+ "print(f'train shape {train_data.shape}, test_shape {test_data.shape}')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 85,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def filtres(string):\n",
+ " string = string.lower() #lowering data text\n",
+ " patt = r'[A-Z,a-z]{3,}' #Sort strings \n",
+ " words = re.findall(patt, string)\n",
+ " return words"
+ ]
+ },
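+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A quick, illustrative sanity check of the tokenizer: `filtres` keeps lower-cased alphabetic tokens of three or more letters, so short words such as 're' and 'to' are dropped."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# expected: ['how', 'manage', 'multiple', 'internet', 'connections']\n",
+ "filtres('Re: How to manage multiple Internet connections?')"
+ ]
+ },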
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Bayes Funcs"
+ ]
+ },
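+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "`fit()` estimates the class priors $P(c)$ from the label frequencies and collects per-class word counts; `predict()` then scores each document with Laplace (add-one) smoothed likelihoods\n",
+ "\n",
+ "$$P(w \\mid c) = \\frac{\\mathrm{count}(w, c) + 1}{N_c + V},$$\n",
+ "\n",
+ "where $N_c$ is the total number of word occurrences in class $c$ and $V$ is the vocabulary size, so words unseen in a class still get a small non-zero probability."
+ ]
+ },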
+ {
+ "cell_type": "code",
+ "execution_count": 86,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def fit(x, Y):\n",
+ " \n",
+ " w0 = []\n",
+ " w1 = []\n",
+ " w01 = []\n",
+ " x = np.array(x)\n",
+ " Y = np.array(Y)\n",
+ " for i in range (x.shape[0]):\n",
+ " w01 += filtres(x[i].lower())\n",
+ " if Y[i] == 0:\n",
+ " w0 += filtres(x[i].lower())\n",
+ " if Y[i] == 1:\n",
+ " w1 += filtres(x[i].lower()) \n",
+ " \n",
+ " wfreq = Counter(w01)\n",
+ " freq0 = Counter(w0)\n",
+ " freq1 = Counter(w1)\n",
+ " cl0_n = Counter(Y)[0]\n",
+ " cl1_n = Counter(Y)[1]\n",
+ " pcl_0 = cl0_n / Y.shape[0]\n",
+ " pcl_1 = cl1_n / Y.shape[0]\n",
+ " \n",
+ " V = len(wfreq.keys())\n",
+ " N0 = len(w0)\n",
+ " N1 = len(w1)\n",
+ " \n",
+ " print(f'prob_class = [{pcl_0},{pcl_1}], N_classes = [{N0},{N1}], Vocabulary = {V}')\n",
+ " \n",
+ " return [pcl_0, pcl_1], [N0,N1], V, [freq0, freq1]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 87,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def predict(x, prob_classes, n_classes, V, freq):\n",
+ " x = np.array(x)\n",
+ " pcl_0 = prob_classes[0]\n",
+ " pcl_1 = prob_classes[1]\n",
+ " N0 = n_classes[0]\n",
+ " N1 = n_classes[1]\n",
+ " \n",
+ " freq0 = freq[0]\n",
+ " freq1 = freq[1]\n",
+ " predict = []\n",
+ " \n",
+ " for j in range(x.shape[0]):\n",
+ " words = filtres(x[j].lower())\n",
+ " P0 = 1\n",
+ " P1 = 1\n",
+ " for i in words: \n",
+ " pwc0 = (freq0.get(i,0) +1) / (N0 + V)\n",
+ " pwc1 = (freq1.get(i,0) + 1) / (N1 + V)\n",
+ " P0 *= pwc0\n",
+ " P1 *= pwc1\n",
+ " P0 = P0 * pcl_0\n",
+ " P1 = P1 * pcl_1 \n",
+ " if P0>P1:\n",
+ " predict.append(0)\n",
+ " else:\n",
+ " predict.append(1)\n",
+ " \n",
+ " return predict"
+ ]
+ },
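+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Multiplying many probabilities below 1 can underflow to 0.0 in double precision for long documents, which makes the final comparison meaningless. A common remedy is to sum log-probabilities instead. A minimal sketch of such a variant (a hypothetical helper, not used elsewhere in this notebook):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def predict_log(x, prob_classes, n_classes, V, freq):\n",
+ " # same model as predict(), but accumulates log-probabilities to avoid numerical underflow\n",
+ " x = np.array(x)\n",
+ " predictions = []\n",
+ " for j in range(x.shape[0]):\n",
+ " words = filtres(x[j])\n",
+ " logP0 = np.log(prob_classes[0])\n",
+ " logP1 = np.log(prob_classes[1])\n",
+ " for i in words:\n",
+ " logP0 += np.log((freq[0].get(i, 0) + 1) / (n_classes[0] + V))\n",
+ " logP1 += np.log((freq[1].get(i, 0) + 1) / (n_classes[1] + V))\n",
+ " predictions.append(0 if logP0 > logP1 else 1)\n",
+ " return predictions"
+ ]
+ },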
+ {
+ "cell_type": "code",
+ "execution_count": 88,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def accuracy(Y, predict):\n",
+ " Y = np.array(Y)\n",
+ " predict = np.array(predict)\n",
+ " acc = np.sum(Y == predict) / Y.shape[0]\n",
+ " return acc"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 89,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "prob_class = [0.33989266547406083,0.6601073345259392], N_classes = [107762,168259], Vocabulary = 32449\n",
+ "accuracy = 79.60644007155635%\n"
+ ]
+ }
+ ],
+ "source": [
+ "prod_classes, n_classes, V, freq = fit(data.Text, data.Class)\n",
+ "predictions = predict(data.Text, prob_classes, n_classes, V, freq)\n",
+ "print(f'accuracy = {accuracy(data.Class, predictions)*100}%')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Test "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 90,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train=['Captain Morgan', 'Captain Jack', 'Captain John', 'Pirate Jack']\n",
+ "Class=[0,0,0,1]\n",
+ "test=['Captain Jack Captain John Pirate Jack']"
+ ]
+ },
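+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A quick hand check with the toy labels `Class = [0, 0, 0, 1]`: the vocabulary is {captain, morgan, jack, john, pirate}, so $V = 5$; class 0 has $N_0 = 6$ word occurrences and class 1 has $N_1 = 2$; the priors are $3/4$ and $1/4$. For the test sentence the smoothed scores come out to\n",
+ "\n",
+ "$$P_0 = \\tfrac{3}{4}\\cdot\\tfrac{4}{11}\\cdot\\tfrac{2}{11}\\cdot\\tfrac{4}{11}\\cdot\\tfrac{2}{11}\\cdot\\tfrac{1}{11}\\cdot\\tfrac{2}{11} \\approx 5.4\\times 10^{-5}, \\qquad P_1 = \\tfrac{1}{4}\\cdot\\tfrac{1}{7}\\cdot\\tfrac{2}{7}\\cdot\\tfrac{1}{7}\\cdot\\tfrac{1}{7}\\cdot\\tfrac{2}{7}\\cdot\\tfrac{2}{7} \\approx 1.7\\times 10^{-5},$$\n",
+ "\n",
+ "so `predict` should assign the test sentence to class 0."
+ ]
+ },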
+ {
+ "cell_type": "code",
+ "execution_count": 91,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "prob_class = [0.33989266547406083,0.6601073345259392], N_classes = [6,2], Vocabulary = 5\n",
+ "accuracy = 66.01073345259391%\n"
+ ]
+ }
+ ],
+ "source": [
+ "prod_classes, n_classes, V, freq = fit(train, data.Class)\n",
+ "predictions = predict(test, prob_classes, n_classes, V, freq)\n",
+ "print(f'accuracy = {accuracy(data.Class, predictions)*100}%')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## TF/IDF"
+ ]
+ },
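+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In this variant the per-word score inside a document $d$ is a TF-IDF weight rather than a smoothed probability:\n",
+ "\n",
+ "$$\\mathrm{tf}(w, d) = \\frac{\\mathrm{count}(w, d)}{|d|}, \\qquad \\mathrm{idf}_c(w) = \\log\\frac{N_c}{\\mathrm{df}_c(w)},$$\n",
+ "\n",
+ "where $N_c$ is the number of training documents in class $c$ and $\\mathrm{df}_c(w)$ is the number of those documents containing $w$; `fit_tf()` precomputes the document frequencies and `predict_tf()` multiplies the weights together with the class prior."
+ ]
+ },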
+ {
+ "cell_type": "code",
+ "execution_count": 92,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def fit_tf(Z):\n",
+ " Z['words'] = Z.Text.apply(filtres)\n",
+ " w01 = Z.words.sum()\n",
+ " w0 = Z[Z.Class == 0].words.sum()\n",
+ " w1 = Z[Z.Class == 1].words.sum()\n",
+ " wfreq = Counter(w01)\n",
+ " freq0 = Counter(w0)\n",
+ " freq1 = Counter(w1)\n",
+ " #TF-IDF, frequencies calculation \n",
+ " idf_dic0 = {}\n",
+ " idf_dic1 = {}\n",
+ " ncl_0 = Z[Z.Class == 0].shape[0]\n",
+ " ncl_1 = Z[Z.Class == 1].shape[0]\n",
+ " for w in freq0.keys():\n",
+ " idf_dic0[w] = np.sum([w in Z[Z.Class == 0].words.iloc[i] for i in range(ncl_0)])\n",
+ " for w in freq1.keys():\n",
+ " idf_dic1[w] = np.sum([w in Z[Z.Class == 1].words.iloc[i] for i in range(ncl_1)])\n",
+ " \n",
+ " cl0_n = Counter(Z.Class)[0]\n",
+ " cl1_n = Counter(Z.Class)[1]\n",
+ " \n",
+ " pcl_0 = cl0_n / Z.shape[0]\n",
+ " pcl_1 = cl1_n / Z.shape[0]\n",
+ " V = len(wfreq.keys())\n",
+ " N0 = len(w0)\n",
+ " N1 = len(w1)\n",
+ " \n",
+ " return [pcl_0, pcl_1], [N0,N1], V, [freq0, freq1 ], [idf_dic0,idf_dic1]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 93,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def predict_tf(x, prob_classes, n_classes, V, freq, idf_dic): \n",
+ "\n",
+ " x = np.array(x)\n",
+ " pcl_0 = prob_classes[0]\n",
+ " pcl_1 = prob_classes[1]\n",
+ " N0 = n_classes[0]\n",
+ " N1 = n_classes[1]\n",
+ " \n",
+ " freq0 = freq[0]\n",
+ " freq1 = freq[1]\n",
+ " \n",
+ " idf_dic0 = idf_dic[0]\n",
+ " idf_dic1 = idf_dic[1]\n",
+ " \n",
+ " predict = []\n",
+ " \n",
+ " for j in docs:\n",
+ " words = filtres(j)\n",
+ " words_freq = Counter(words)\n",
+ " P0=1\n",
+ " P1=1\n",
+ " for i in words:\n",
+ " pwc_0=(words_freq[i] / len(words)) * np.log(N0/idf_dic0.get(i,(N0-0.00001)))\n",
+ " pwc_1=(words_freq[i] / len(words)) * np.log(N1/idf_dic1.get(i,(N1-0.00001)))\n",
+ " P0 *= pwc_0\n",
+ " P1 *= pwc_1\n",
+ " P0 *= pcl_0\n",
+ " P1 *= pcl_1\n",
+ " \n",
+ " if P0>P1:\n",
+ " predict.append(0)\n",
+ " else:\n",
+ " predict.append(1)\n",
+ " \n",
+ " return predict"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "prob_classes, n_classes, V, freq, idf_dic = fit_tf(train_data)\n",
+ "\n",
+ "x0 = train_data[train_data.Class == 0].shape[0]\n",
+ "x1 = train_data[train_data.Class == 1].shape[0]\n",
+ "\n",
+ "predictions = predict_tf(test_data.Text, prob_classes, [x0,x1], V, freq, idf_dic)\n",
+ "print(f'Accuracy with TF-IDF = {accuracy(test_data.Class,predictions)*100}%')"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.1"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}