<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta content="width=device-width, initial-scale=1.0" name="viewport">
<title>Naive Bayes</title>
<meta content="" name="description">
<meta content="" name="keywords">
<!-- Favicons -->
<link href="assets/img/Favicon-1.png" rel="icon">
<link href="assets/img/Favicon-1.png" rel="apple-touch-icon">
<!-- Google Fonts -->
<link href="https://fonts.googleapis.com/css?family=Open+Sans:300,300i,400,400i,600,600i,700,700i|Raleway:300,300i,400,400i,500,500i,600,600i,700,700i|Poppins:300,300i,400,400i,500,500i,600,600i,700,700i" rel="stylesheet">
<!-- Vendor CSS Files -->
<link href="assets/vendor/aos/aos.css" rel="stylesheet">
<link href="assets/vendor/bootstrap/css/bootstrap.min.css" rel="stylesheet">
<link href="assets/vendor/bootstrap-icons/bootstrap-icons.css" rel="stylesheet">
<link href="assets/vendor/boxicons/css/boxicons.min.css" rel="stylesheet">
<link href="assets/vendor/glightbox/css/glightbox.min.css" rel="stylesheet">
<link href="assets/vendor/swiper/swiper-bundle.min.css" rel="stylesheet">
<!-- Creating a python code section-->
<link rel="stylesheet" href="assets/css/prism.css">
<script src="assets/js/prism.js"></script>
<!-- Template Main CSS File -->
<link href="assets/css/style.css" rel="stylesheet">
<!-- To set the icon, visit https://fontawesome.com/account-->
<script src="https://kit.fontawesome.com/5d25c1efd3.js" crossorigin="anonymous"></script>
<!-- end of icon-->
<script type="text/javascript" async
src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js?config=TeX-MML-AM_CHTML">
</script>
<!-- =======================================================
* Template Name: iPortfolio
* Updated: Sep 18 2023 with Bootstrap v5.3.2
* Template URL: https://bootstrapmade.com/iportfolio-bootstrap-portfolio-websites-template/
* Author: BootstrapMade.com
* License: https://bootstrapmade.com/license/
======================================================== -->
</head>
<body>
<!-- ======= Mobile nav toggle button ======= -->
<i class="bi bi-list mobile-nav-toggle d-xl-none"></i>
<!-- ======= Header ======= -->
<header id="header">
<div class="d-flex flex-column">
<div class="profile">
<img src="assets/img/myphoto.jpeg" alt="" class="img-fluid rounded-circle">
<h1 class="text-light"><a href="index.html">Arun</a></h1>
<div class="social-links mt-3 text-center">
<a href="https://www.linkedin.com/in/arunp77/" target="_blank" class="linkedin"><i class="bx bxl-linkedin"></i></a>
<a href="https://github.com/arunp77" target="_blank" class="github"><i class="bx bxl-github"></i></a>
<a href="https://twitter.com/arunp77_" target="_blank" class="twitter"><i class="bx bxl-twitter"></i></a>
<a href="https://www.instagram.com/arunp77/" target="_blank" class="instagram"><i class="bx bxl-instagram"></i></a>
<a href="https://arunp77.medium.com/" target="_blank" class="medium"><i class="bx bxl-medium"></i></a>
</div>
</div>
<nav id="navbar" class="nav-menu navbar">
<ul>
<li><a href="index.html#hero" class="nav-link scrollto active"><i class="bx bx-home"></i> <span>Home</span></a></li>
<li><a href="index.html#about" class="nav-link scrollto"><i class="bx bx-user"></i> <span>About</span></a></li>
<li><a href="index.html#resume" class="nav-link scrollto"><i class="bx bx-file-blank"></i> <span>Resume</span></a></li>
<li><a href="index.html#portfolio" class="nav-link scrollto"><i class="bx bx-book-content"></i> <span>Portfolio</span></a></li>
<li><a href="index.html#skills-and-tools" class="nav-link scrollto"><i class="bx bx-wrench"></i> <span>Skills and Tools</span></a></li>
<li><a href="index.html#language" class="nav-link scrollto"><i class="bi bi-menu-up"></i> <span>Languages</span></a></li>
<li><a href="index.html#awards" class="nav-link scrollto"><i class="bi bi-award-fill"></i> <span>Awards</span></a></li>
<li><a href="index.html#professionalcourses" class="nav-link scrollto"><i class="bx bx-book-alt"></i> <span>Professional Certification</span></a></li>
<li><a href="index.html#publications" class="nav-link scrollto"><i class="bx bx-news"></i> <span>Publications</span></a></li>
<li><a href="index.html#extra-curricular" class="nav-link scrollto"><i class="bx bx-rocket"></i> <span>Extra-Curricular Activities</span></a></li>
<!-- <li><a href="#contact" class="nav-link scrollto"><i class="bx bx-envelope"></i> <span>Contact</span></a></li> -->
</ul>
</nav><!-- .nav-menu -->
</div>
</header><!-- End Header -->
<main id="main">
<!-- ======= Breadcrumbs ======= -->
<section id="breadcrumbs" class="breadcrumbs">
<div class="container">
<div class="d-flex justify-content-between align-items-center">
<h2>Machine Learning</h2>
<ol>
<li><a href="machine-learning.html" class="clickable-box">Content section</a></li>
<li><a href="index.html#portfolio" class="clickable-box">Portfolio section</a></li>
</ol>
</div>
</div>
</section><!-- End Breadcrumbs -->
<!------ right dropdown menue ------->
<div class="right-side-list">
<div class="dropdown">
<button class="dropbtn"><strong>Shortcuts:</strong></button>
<div class="dropdown-content">
<ul>
<li><a href="cloud-compute.html"><i class="fas fa-cloud"></i> Cloud</a></li>
<li><a href="AWS-GCP.html"><i class="fas fa-cloud"></i> AWS-GCP</a></li>
<li><a href="amazon-s3.html"><i class="fas fa-cloud"></i> AWS S3</a></li>
<li><a href="ec2-confi.html"><i class="fas fa-server"></i> EC2</a></li>
<li><a href="Docker-Container.html"><i class="fab fa-docker" style="color: rgb(29, 27, 27);"></i> Docker</a></li>
<li><a href="Jupyter-nifi.html"><i class="fab fa-python" style="color: rgb(34, 32, 32);"></i> Jupyter-nifi</a></li>
<li><a href="snowflake-task-stream.html"><i class="fas fa-snowflake"></i> Snowflake</a></li>
<li><a href="data-model.html"><i class="fas fa-database"></i> Data modeling</a></li>
<li><a href="sql-basics.html"><i class="fas fa-table"></i> QL</a></li>
<li><a href="sql-basic-details.html"><i class="fas fa-database"></i> SQL</a></li>
<li><a href="Bigquerry-sql.html"><i class="fas fa-database"></i> Bigquerry</a></li>
<li><a href="scd.html"><i class="fas fa-archive"></i> SCD</a></li>
<li><a href="sql-project.html"><i class="fas fa-database"></i> SQL project</a></li>
<!-- Add more subsections as needed -->
</ul>
</div>
</div>
</div>
<!-- ======= Portfolio Details Section ======= -->
<section id="portfolio-details" class="portfolio-details">
<div class="container">
<div class="row gy-4">
<h1>Naive Bayes: Classification methods</h1>
<div class="col-lg-8">
<div class="portfolio-details-slider swiper">
<div class="swiper-wrapper align-items-center">
<div class="swiper-slide">
<figure>
<img src="assets/img/machine-ln/classification-naive.png" alt="" style="max-width: 90%; max-height: auto;">
<figcaption style="text-align: center;"></figcaption>
</figure>
</div>
</div>
</div>
</div>
<div class="col-lg-4 grey-box">
<div class="section-title">
<h3>Content</h3>
<ol>
<li><a href="#introduction">Introduction</a></li>
<ul>
<li><a href="#principle-bayes">Principle of Naive Bayes</a></li>
<li><a href="#working">Working of Naive Bayes</a></li>
<li><a href="#types">Types of Naive Bayes Classifiers</a></li>
<li><a href="#advantages">Advantages of Naive Bayes</a></li>
<li><a href="#limitations">Limitations of Naive Bayes</a></li>
<li><a href="#applications-bayes">Applications of Naive Bayes</a></li>
</ul>
<li><a href="#example">Example</a></li>
<ul>
<li><a href="#example-weather">Example on weather outlook</a></li>
<li><a href="#example-iris">Irdis dataset</a></li>
<li><a href="#example-userdata">User_Data.csv</a></li>
</ul>
<li><a href="#reference">Reference</a></li>
</ol>
</div>
</div>
</div>
<section>
<h2 id="introdction">Introduction</h2>
<p>Naive Bayes algorithms are a family of probabilistic classifiers based on Bayes' theorem with the "naive" assumption of independence between features. Despite their simplicity, they are powerful and widely used for classification tasks in various fields, including text classification, spam filtering, and medical diagnosis. They work by calculating the probability of a given data point belonging to each class and selecting the class with the highest probability. </p>
<p>Naive Bayes algorithms are efficient, particularly for large datasets, and they perform well even with limited training data. Popular variants include Gaussian Naive Bayes, Multinomial Naive Bayes, and Bernoulli Naive Bayes. </p>
<h4 id="principle-bayes">Principle of Naive Bayes</h4>
<p>At the core of Naive Bayes is Bayes' theorem, which calculates the probability of a hypothesis (class label) given the observed evidence (features). The "naive" assumption in Naive Bayes is that the features are conditionally independent given the class label, which simplifies the calculation of probabilities.</p>
<div class="important-box">
Bayes' theorem is a fundamental concept in probability theory and statistics, used to update the probability of a hypothesis (or event) based on new evidence. It is expressed mathematically as:
$$P(A|B) = \frac{P(B|A) \times P(A)}{P(B)}$$
where:
<ul>
<li>\(P(A|B)\) is the probability of event \(A\) occurring given that event \(B\) has occurred. This is called the posterior probability. The posterior probability represents the updated belief about the probability of event A occurring after considering the new evidence (event B). It is what we are interested in calculating using Bayes' theorem.</li>
<li>\(P(B|A)\) is the probability of event \(B\) occurring given that event \(A\) has occurred. This is called the likelihood. The likelihood represents the probability of observing the new evidence (event B) given that the hypothesis (event A) is true. It quantifies how well the evidence supports the hypothesis.</li>
<li>\(P(A)\) is the probability of event \(A\) occurring. This is called the prior probability. The prior probability represents our initial belief about the probability of event \(A\) occurring before considering any new evidence.</li>
<li>\(P(B)\) is the probability of event \(B\) occurring. This is called the marginal probability. The marginal probability represents the total probability of observing event B, irrespective of whether event A is true or not. It serves as a normalization factor to ensure that the posterior probability is properly scaled.</li>
</ul>
<p><strong>Example: </strong>Let's say we have a medical test to detect a disease, and:
<ul>
<li>\(P(A)\) is the prior probability of having the disease.</li>
<li>\(P(B|A)\) is the probability of testing positive given that the person has the disease.</li>
<li>\(P(B)\) is the probability of testing positive (with or without having the disease).</li>
<li>\(P(A|B)\) is the probability of having the disease given that the person tested positive</li>
</ul>
Using Bayes' theorem, we can update our belief about the probability of having the disease \(P(A|B)\) based on the test result \(P(B|A)\), the prior probability of having the disease \(P(A)\), and the probability of testing positive \(P(B)\).
</p>
</div><br><br>
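<p>To make the example concrete, the posterior can be computed in a few lines of Python. The numbers below (prevalence, sensitivity, false-positive rate) are illustrative assumptions chosen for the sketch, not values from any real test:</p>
<pre><code class="language-python">
# Worked Bayes' theorem example with assumed, illustrative numbers
p_disease = 0.01             # P(A): prior probability of having the disease (1% prevalence)
p_pos_given_disease = 0.95   # P(B|A): probability of testing positive if diseased (sensitivity)
p_pos_given_healthy = 0.05   # P(B|not A): false-positive rate

# P(B): total probability of testing positive (law of total probability)
p_pos = p_pos_given_disease * p_disease + p_pos_given_healthy * (1 - p_disease)

# P(A|B): posterior probability of having the disease given a positive test
p_disease_given_pos = p_pos_given_disease * p_disease / p_pos
print(f"P(disease | positive test) = {p_disease_given_pos:.3f}")   # approximately 0.161
</code></pre>
<p>Note how a positive result from a fairly accurate test still yields only a ~16% posterior, because the disease is rare: the prior \(P(A)\) strongly shapes the answer.</p>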
<h4 id="working">Working of Naive Bayes</h4>
<ol>
<li><strong>Training Phase: </strong>During the training phase, Naive Bayes learns the probabilities of each feature given each class label from the training data.</li>
$$P(C_j) = \frac{\text{Number of instances with class} ~ C_j}{\text{Total number of instances}}$$
where \(P(C_j)\) represents the prior probability of each class occurring in the dataset, calculated as the frequency of that class divided by the total number of instances.
<li><strong>Probability Calculation: </strong> To classify a new data point:
<ul>
<li>Calculate the posterior probability of each class given the observed features using Bayes' theorem i.e. \(P(X_i|C_j)\). Depending on the type of feature (e.g., continuous, categorical), different probability distributions (e.g., Gaussian, multinomial) can be used. <strong>For example</strong>, for continuous features, Gaussian Naive Bayes assumes a normal (Gaussian) distribution for each feature given each class. Thus, the likelihood \(P(X_i|C_j)\) can be calculated using the mean \(\mu_{ij}\) and standard deviation \(\sigma_{ij}\) of feature \(X_i\) in class \(C_j\).
$$P(X_i|C_j) = \frac{1}{\sqrt{2\pi \sigma^2_{ij}}} ~\text{exp}\left(-\frac{(x-\mu_{ij})^2}{2\sigma_{ij}^2}\right).$$
<p><strong>Probability Calculation for classification:</strong> To classify a new data point, Naive Bayes calculates the posterior probability of each class given the observed features using Bayes' theorem:</p>
$$P(C_j|X_1,X_2,...,X_n) = \frac{P(X_1,X_2,...,X_n |C_j) \times P(C_j)}{P(X_1,X_2,...,X_n)}$$
Given the "naive" assumption of feature independence, this equation simplifies to:
$$P(C_j|X_1,X_2,...,X_n) = P(C_j) \times \Pi_{i=1}^n P(X_i|C_j)$$
where:
<ul>
<li>\(P(C_j|X_1,X_2,...,X_n)\) is the posterior probability of class \(C_j\) given the observed features.</li>
<li>\(P(X_i|C_j)\) is the likelihood of feature \(X_i\) given class \(C_j\), which was calculated during the training phase.</li>
<li>\(P(C_j)\) is the prior probability of class \(C_j\), also calculated during the training phase. </li>
</ul>
Finally, Naive Bayes selects the class with the highest posterior probability as the predicted class for the new data point:
$$\text{Predicted Class} = \arg\max_{C_j} P(C_j|X_1, X_2,...,X_n).$$
In summary, during the training phase, Naive Bayes learns the probabilities of each feature given each class from the training data. Then, during the prediction phase, it calculates the posterior probability of each class given the observed features using Bayes' theorem and selects the class with the highest posterior probability as the predicted class (a minimal implementation sketch is given just after this list).
</li>
<li>Select the class with the highest posterior probability as the predicted class for the new data point.</li>
</ul>
</li>
</ol>
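<p>The equations above translate almost line-for-line into code. The following is a minimal NumPy sketch of Gaussian Naive Bayes (the class and variable names are my own, not from any library); it works in log space, turning the product \(\prod_i P(X_i|C_j)\) into a sum to avoid numerical underflow:</p>
<pre><code class="language-python">
import numpy as np

class SimpleGaussianNB:
    def fit(self, X, y):
        self.classes = np.unique(y)
        self.priors, self.means, self.variances = {}, {}, {}
        for c in self.classes:
            Xc = X[y == c]
            self.priors[c] = len(Xc) / len(X)            # P(C_j): class frequency
            self.means[c] = Xc.mean(axis=0)              # mu_ij for each feature
            self.variances[c] = Xc.var(axis=0) + 1e-9    # sigma^2_ij (epsilon for stability)
        return self

    def predict(self, X):
        predictions = []
        for x in X:
            log_posteriors = []
            for c in self.classes:
                # log P(C_j) + sum_i log P(X_i|C_j), with Gaussian likelihoods
                var = self.variances[c]
                log_likelihood = -0.5 * np.sum(np.log(2 * np.pi * var)
                                               + (x - self.means[c]) ** 2 / var)
                log_posteriors.append(np.log(self.priors[c]) + log_likelihood)
            # Predicted class = argmax over the (log) posteriors
            predictions.append(self.classes[np.argmax(log_posteriors)])
        return np.array(predictions)
</code></pre>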
<h4 id="types">Types of Naive Bayes Classifiers</h4>
<ol>
<li><strong>Gaussian Naive Bayes: </strong>Assumes that the features follow a Gaussian (normal) distribution.</li>
<li><strong>Multinomial Naive Bayes: </strong>Suitable for features that represent counts or frequencies (e.g., word counts in text classification).</li>
<li><strong>Bernoulli Naive Bayes: </strong>Designed for binary features, where each feature is either present or absent.</li>
</ol>
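<p>In scikit-learn these three variants correspond to the classes <code>GaussianNB</code>, <code>MultinomialNB</code>, and <code>BernoulliNB</code>. A short sketch with made-up toy data, just to illustrate the intended feature type for each:</p>
<pre><code class="language-python">
import numpy as np
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

y = np.array([0, 0, 1, 1])

# Continuous measurements -> Gaussian Naive Bayes
X_cont = np.array([[1.2, 3.4], [1.0, 3.1], [4.5, 0.2], [4.8, 0.5]])
print(GaussianNB().fit(X_cont, y).predict([[1.1, 3.0]]))

# Counts or frequencies (e.g., word counts) -> Multinomial Naive Bayes
X_count = np.array([[3, 0, 1], [2, 0, 2], [0, 4, 0], [1, 3, 0]])
print(MultinomialNB().fit(X_count, y).predict([[2, 0, 1]]))

# Binary present/absent features -> Bernoulli Naive Bayes
X_bin = np.array([[1, 0, 1], [1, 0, 0], [0, 1, 0], [0, 1, 1]])
print(BernoulliNB().fit(X_bin, y).predict([[1, 0, 0]]))
</code></pre>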
<h4 id="advantages">Advantages of Naive Bayes:</h4>
<ol>
<li><strong>Simplicity: </strong>Naive Bayes is straightforward to implement and understand, making it suitable for quick prototyping and baseline models.</li>
<li><strong>Efficiency: </strong>Naive Bayes is computationally efficient, especially for large datasets, as it requires only simple probability calculations.</li>
<li><strong>Robustness to Irrelevant Features: </strong>Naive Bayes can perform well even in the presence of irrelevant features, thanks to its independence assumption.</li>
</ol>
<h4 id="limitations">Limitations of Naive Bayes</h4>
<ol>
<li><strong>Strong Independence Assumption: </strong>The assumption of feature independence may not hold true in many real-world datasets, leading to suboptimal performance.</li>
<li><strong>Sensitive to Imbalanced Data: </strong>Naive Bayes can be sensitive to imbalanced class distributions, potentially biasing the model towards the majority class.</li>
<li><strong>Limited Expressiveness: </strong>Naive Bayes may struggle with capturing complex relationships between features, especially in datasets with highly nonlinear decision boundaries.</li>
</ol>
<h4 id="applications-bayes">Applications of Naive Bayes:</h4>
<ol>
<li><strong>Text Classification: </strong>Naive Bayes is widely used for text classification tasks, such as spam detection, sentiment analysis, and document categorization.</li>
<li><strong>Recommendation Systems: </strong>Naive Bayes can be employed in recommendation systems to predict user preferences based on historical interactions.</li>
<li><strong>Medical Diagnosis: </strong>Naive Bayes has applications in medical diagnosis, where it can assist in predicting the likelihood of diseases based on symptoms and patient characteristics.</li>
</ol>
Naive Bayes is a simple yet effective classification algorithm that can deliver competitive performance in various machine learning tasks, especially when dealing with text data and relatively simple classification problems. However, its performance may degrade in more complex scenarios where the independence assumption is violated or when dealing with highly imbalanced datasets.
<!------------------------->
<hr>
<h3 id="example">Example</h3>
<h4 id="example-weather">Example on weather outlook</h4>
Suppose we have a dataset of weather conditions and a corresponding target variable "Play". Using this dataset, we need to decide whether or not to play on a particular day according to the weather conditions. To solve this problem, we follow the steps below
(for more details, see this <a href="https://www.javatpoint.com/machine-learning-naive-bayes-classifier" target="_blank">link</a>):
<br> <br>
<table>
<tr>
<th>Day</th>
<th>Outlook</th>
<th>Play</th>
</tr>
<tr>
<td>0</td>
<td>Rainy</td>
<td>Yes</td>
</tr>
<tr>
<td>1</td>
<td>Sunny</td>
<td>Yes</td>
</tr>
<tr>
<td>2</td>
<td>Overcast</td>
<td>Yes</td>
</tr>
<tr>
<td>3</td>
<td>Overcast</td>
<td>Yes</td>
</tr>
<tr>
<td>4</td>
<td>Sunny</td>
<td>No</td>
</tr>
<tr>
<td>5</td>
<td>Rainy</td>
<td>Yes</td>
</tr>
<tr>
<td>6</td>
<td>Sunny</td>
<td>Yes</td>
</tr>
<tr>
<td>7</td>
<td>Overcast</td>
<td>Yes</td>
</tr>
<tr>
<td>8</td>
<td>Rainy</td>
<td>No</td>
</tr>
<tr>
<td>9</td>
<td>Sunny</td>
<td>No</td>
</tr>
<tr>
<td>10</td>
<td>Sunny</td>
<td>Yes</td>
</tr>
<tr>
<td>11</td>
<td>Rainy</td>
<td>No</td>
</tr>
<tr>
<td>12</td>
<td>Overcast</td>
<td>Yes</td>
</tr>
<tr>
<td>13</td>
<td>Overcast</td>
<td>Yes</td>
</tr>
</table><br>
<p>Total samples = 14</p>
<p><strong>Frequency table and the Likelihood for the Weather Conditions:</strong></p>
<table>
<tr>
<th><strong>Outlook</strong></th>
<th>Yes</th>
<th>No</th>
<th>P(Outlook | Yes)</th>
<th>P(Outlook | No)</th>
<th>P(Outlook)</th>
</tr>
<tr>
<td>Overcast</td>
<td>5</td>
<td>0</td>
<td>5/10 = 0.5</td>
<td>0</td>
<td>5/14 = 0.36</td>
</tr>
<tr>
<td>Rainy</td>
<td>2</td>
<td>2</td>
<td>2/10 =0.2</td>
<td>2/4 = 0.5</td>
<td>4/14 = 0.29</td>
</tr>
<tr>
<td>Sunny</td>
<td>3</td>
<td>2</td>
<td>3/10 = 0.3</td>
<td>2/4 = 0.5</td>
<td>5/14 = 0.36</td>
</tr>
<tr>
<td>Total</td>
<td>10</td>
<td>4</td>
<td></td>
<td></td>
<td></td>
</tr>
</table><br>
<table>
<tr>
<th>Play (yes or no)</th>
<th>Total number</th>
<th>P(Play)</th>
</tr>
<tr>
<td>Yes</td>
<td>10</td>
<td>10/14 = 0.71</td>
</tr>
<tr>
<td>No</td>
<td>4</td>
<td>4/14 = 0.29</td>
</tr>
</table>
So applying Bayes' theorem, we have:
$$P(\text{Yes}|\text{Sunny}) = \frac{P(\text{Sunny}|\text{Yes}) \times P(\text{Yes})}{P(\text{Sunny})} = \frac{3/10 \times 10/14}{5/14} = 0.60$$
and
$$P(\text{No}|\text{Sunny}) = \frac{P(\text{Sunny}|\text{No}) \times P(\text{No})}{P(\text{Sunny})} = \frac{2/4 \times 4/14}{5/14} = 0.40$$
<p>Because both posteriors are divided by the same \(P(\text{Sunny})\), they already sum to 1, so no further normalization is needed:</p>
$$P(\text{Yes}|\text{Sunny}) = 0.60 \equiv 60\% \qquad \text{and} \qquad P(\text{No}|\text{Sunny}) = 0.40 \equiv 40\%$$
<p>Since \(P(\text{Yes}|\text{Sunny}) > P(\text{No}|\text{Sunny})\), on a Sunny day the player can play the game.</p>
<p><strong>Example: Python implementation. </strong>A Python implementation of this example, with slightly different, randomly generated outlooks:</p>
<pre><code class="language-python">
import pandas as pd
import random
# Define the possible values for Outlook and Play
outlook_values = ['Sunny', 'Rainy', 'Overcast']
play_values = ['Yes', 'No']
# Generate 20 random data points
data = {
'Outlook': [random.choice(outlook_values) for _ in range(20)],
'Play': [random.choice(play_values) for _ in range(20)]
}
df = pd.DataFrame(data)
# Calculate the total number of instances
total_instances = len(df)
# Calculate the number of instances where the outlook is sunny
sunny_instances = len(df[df['Outlook'] == 'Sunny'])
# Calculate the number of instances where the outlook is sunny and the player plays
sunny_play_instances = len(df[(df['Outlook'] == 'Sunny') & (df['Play'] == 'Yes')])
# Calculate the prior probability of playing the game
prior_probability_play = df['Play'].value_counts(normalize=True)['Yes']
# Calculate the probability of playing the game given that the outlook is sunny using Bayes' theorem
probability_play_given_sunny = (sunny_play_instances / sunny_instances) * prior_probability_play
print("Probability of playing the game on a Sunny day using Bayes' theorem:", probability_play_given_sunny)
</code></pre>
<p>Because the 20 data points are drawn at random (no seed is set), the printed probability varies from run to run.</p>
<figure>
<img src="assets/img/machine-ln/classification-naive-example1.png" alt="" style="max-width: 90%; max-height: auto;">
<figcaption style="text-align: center;">Number of 'Yes' and 'No' instances for each Weather Outlook is shown on the right hand side.</figcaption>
</figure>
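<p>The frequency and likelihood tables built by hand above can also be derived programmatically. A short sketch, assuming a DataFrame <code>df</code> with 'Outlook' and 'Play' columns like the one generated earlier:</p>
<pre><code class="language-python">
import pandas as pd

# Frequency table: count of 'Yes' and 'No' for each outlook (as in the figure above)
print(pd.crosstab(df['Outlook'], df['Play']))

# Likelihood table: P(Outlook | Play), each column normalized by its class total
print(pd.crosstab(df['Outlook'], df['Play'], normalize='columns'))

# Marginal probabilities P(Outlook)
print(df['Outlook'].value_counts(normalize=True))
</code></pre>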
<!----------------------------------->
<h4 id="example-iris">Examples-2: Iris data</h4>
In this example. we consider the 'Iris' dataset. This example demonstrates the process of training and evaluating a Gaussian Naive Bayes classifier on the Iris dataset using scikit-learn. It shows how to load the data, split it into training and testing sets, train the classifier, make predictions, and evaluate the classifier's performance using accuracy, classification report, and <a href="https://arunp77.github.io/logistic-regression.html#con-mat" target="_blank">Confusion matrix details</a>.
<pre><code class="language-python">
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix
# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Initialize the Gaussian Naive Bayes classifier
gnb = GaussianNB()
# Train the classifier
gnb.fit(X_train, y_train)
# Make predictions on the testing set
y_pred = gnb.predict(X_test)
</code></pre>
The accuracy is:
<pre><code class="language-python">
# Calculate the accuracy of the classifier
accuracy = gnb.score(X_test, y_test)
print("Accuracy:", accuracy)
</code></pre>
<pre>Accuracy: 0.9777777777777777</pre>
<pre><code class="language-python">
# Print classification report and confusion matrix
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
</code></pre>
<figure>
<img src="assets/img/machine-ln/classification-naive-example2.png" alt="" style="max-width: 90%; max-height: auto;">
<figcaption style="text-align: center;"></figcaption>
</figure>
<p><strong>Confusion matrix: </strong></p>
<pre><code class="language-python">
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
</code></pre>
<figure>
<img src="assets/img/machine-ln/classification-naive-confusionmatrix.png" alt="" style="max-width: 90%; max-height: auto;">
<figcaption style="text-align: center;"></figcaption>
</figure>
<p>The classification report provides a concise summary of the performance of a classifier. In this report, precision measures the accuracy of positive predictions, recall assesses the proportion of actual positives correctly identified, and the F1-score balances precision and recall. For the given classes (0, 1, 2), the classifier achieves high precision, indicating accurate positive predictions. Similarly, recall scores show effective identification of actual positives, though class 1 has slightly lower recall. The F1-score reflects a good balance between precision and recall for each class, indicating overall strong performance. The support values represent the number of instances for each class in the testing set. The reported accuracy of 98% highlights the overall correctness of the classifier across all classes, with both macro and weighted averages reinforcing balanced performance.</p>
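<p>Since Naive Bayes is probabilistic, the fitted model can also expose the posterior probabilities \(P(C_j|X)\) behind each prediction. A short sketch continuing from the <code>gnb</code> classifier trained above:</p>
<pre><code class="language-python">
# Posterior probabilities for the first three test samples
probs = gnb.predict_proba(X_test[:3])
preds = gnb.predict(X_test[:3])
for sample_probs, pred in zip(probs, preds):
    print(dict(zip(iris.target_names, sample_probs.round(3))), '->', iris.target_names[pred])
</code></pre>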
<p></p>
<!------------------------------->
<h4 id="example-userdata">Example-3</h4>
In this example, we use dataset 'User_Data.csv' for training, or testing algorithms like Naive Bayes for classification tasks. In this case, it provides a starting point for implementing a Naive Bayes algorithm using Python. Data is available in my <a href="" target="_blank">Github repo</a>
<ul>
<li><strong>Step-1: </strong>Load the dataset:</li>
<pre><code class="language-python">
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
df_user = pd.read_csv('User_Data.csv')
# Display the DataFrame
print(df_user.head())
# Display the DataFrame
print(df_user.head())
</code></pre>
<li><strong>Step-2: </strong>Splitting the dataset into the Training set and Test set
<pre><code class="language-python">
# Importing the dataset
X = df_user.iloc[:, [2, 3]].values
y = df_user.iloc[:, 4].values
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
</code></pre>
</li>
<li><strong>Step-3: </strong> Feature scaling:
<pre><code class="language-python">
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
</code></pre>
</li>
<li><strong>Step-4: </strong>Fitting Naive Bayes and then predicting from the test set:
<pre><code class="language-python">
# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)
</code></pre>
</li>
<li><strong>Step-5: </strong>Some metrics (further metrics are sketched just after this list):
<pre><code class="language-python">
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
</code></pre>
</li>
<li><strong>Step-6: </strong>Visualizing the training set:
<pre><code class="language-python">
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
# Create a figure with two subplots
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
# Plot for training set
plt.sca(axes[0]) # Select the first subplot
x_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start=x_set[:, 0].min() - 1, stop=x_set[:, 0].max() + 1, step=0.01),
np.arange(start=x_set[:, 1].min() - 1, stop=x_set[:, 1].max() + 1, step=0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
alpha=0.75, cmap=ListedColormap(('lightblue', 'lightgreen')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
plt.scatter(x_set[y_set == j, 0], x_set[y_set == j, 1],
c=ListedColormap(('blue', 'green'))(i), label=j)
plt.title('Naive Bayes (Training set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
# Plot for test set
plt.sca(axes[1]) # Select the second subplot
x_set, y_set = X_test, y_test
X1, X2 = np.meshgrid(np.arange(start=x_set[:, 0].min() - 1, stop=x_set[:, 0].max() + 1, step=0.01),
np.arange(start=x_set[:, 1].min() - 1, stop=x_set[:, 1].max() + 1, step=0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
alpha=0.75, cmap=ListedColormap(('lightblue', 'lightgreen')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
plt.scatter(x_set[y_set == j, 0], x_set[y_set == j, 1],
c=ListedColormap(('blue', 'green'))(i), label=j)
plt.title('Naive Bayes (Test set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
# Adjust layout and display the plots
plt.tight_layout()
plt.show()
</code></pre>
<figure>
<img src="assets/img/machine-ln/classification-naive-example-training.png" alt="" style="max-width: 90%; max-height: auto;">
<figcaption style="text-align: center;"></figcaption>
</figure>
</li>
<p>The final output above depicts the classifier's performance on the test set. The classifier has drawn a smooth decision boundary, derived from the Gaussian class likelihoods, to separate the "purchased" and "not purchased" classes. Despite some erroneous predictions, which are quantified in the confusion matrix, the classifier performs well overall and can be considered a reliable predictor.</p>
</ul>
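<p>Extending Step-5, further metrics can be computed from the same predictions. A short sketch, assuming the classifier and test split from the steps above:</p>
<pre><code class="language-python">
from sklearn.metrics import accuracy_score, classification_report

# Overall accuracy and per-class precision/recall/F1 on the test set
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
</code></pre>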
</section>
<!----------- Reference ----------->
<section id="reference">
<h2>References</h2>
<ul>
<li>My GitHub repository on <a href="https://github.com/arunp77/Machine-Learning/" target="_blank">Machine learning</a></li>
<li><a href="https://www.simplilearn.com/tutorials/machine-learning-tutorial/naive-bayes-classifier" target="_blank">Understanding Naive Bayes Classifier: Simplilearn</a>.</li>
<li><a href="https://www.javatpoint.com/machine-learning-naive-bayes-classifier" target="_blank">Naïve Bayes Classifier Algorithm, JAVAPoint.com</a></li>
<li>Book on Statistics: <a href="https://hastie.su.domains/Papers/ESLII.pdf" target="_blank">The Elements of Statistical Learning</a></li>
</ul>
</section>
<hr>
<div style="background-color: #f0f0f0; padding: 15px; border-radius: 5px;">
<h3>Some other interesting things to know:</h3>
<ul style="list-style-type: disc; margin-left: 30px;">
<li>Visit my website on <a href="sql-project.html">For Data, Big Data, Data-modeling, Datawarehouse, SQL, cloud-compute.</a></li>
<li>Visit my website on <a href="Data-engineering.html">Data engineering</a></li>
</ul>
</div>
<p></p>
<div class="navigation">
<a href="index.html#portfolio" class="clickable-box">
<span class="arrow-left">Portfolio section</span>
</a>
<a href="machine-learning.html" class="clickable-box">
<span class="arrow-right">Content</span>
</a>
</div>
</div>
</div>
</section><!-- End Portfolio Details Section -->
</main><!-- End #main -->
<!-- ======= Footer ======= -->
<footer id="footer">
<div class="container">
<div class="copyright">
© Copyright <strong><span>Arun</span></strong>
</div>
</div>
</footer><!-- End Footer -->
<a href="#" class="back-to-top d-flex align-items-center justify-content-center"><i class="bi bi-arrow-up-short"></i></a>
<!-- Vendor JS Files -->
<script src="assets/vendor/purecounter/purecounter_vanilla.js"></script>
<script src="assets/vendor/aos/aos.js"></script>
<script src="assets/vendor/bootstrap/js/bootstrap.bundle.min.js"></script>
<script src="assets/vendor/glightbox/js/glightbox.min.js"></script>
<script src="assets/vendor/isotope-layout/isotope.pkgd.min.js"></script>
<script src="assets/vendor/swiper/swiper-bundle.min.js"></script>
<script src="assets/vendor/typed.js/typed.umd.js"></script>
<script src="assets/vendor/waypoints/noframework.waypoints.js"></script>
<script src="assets/vendor/php-email-form/validate.js"></script>
<!-- Template Main JS File -->
<script src="assets/js/main.js"></script>
</body>
</html>