<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta content="width=device-width, initial-scale=1.0" name="viewport">
<title>Naive Bayes</title>
<meta content="" name="description">
<meta content="" name="keywords">
<!-- Favicons -->
<link href="assets/img/Favicon-1.png" rel="icon">
<link href="assets/img/Favicon-1.png" rel="apple-touch-icon">
<!-- Google Fonts -->
<link href="https://fonts.googleapis.com/css?family=Open+Sans:300,300i,400,400i,600,600i,700,700i|Raleway:300,300i,400,400i,500,500i,600,600i,700,700i|Poppins:300,300i,400,400i,500,500i,600,600i,700,700i" rel="stylesheet">
<!-- Vendor CSS Files -->
<link href="assets/vendor/aos/aos.css" rel="stylesheet">
<link href="assets/vendor/bootstrap/css/bootstrap.min.css" rel="stylesheet">
<link href="assets/vendor/bootstrap-icons/bootstrap-icons.css" rel="stylesheet">
<link href="assets/vendor/boxicons/css/boxicons.min.css" rel="stylesheet">
<link href="assets/vendor/glightbox/css/glightbox.min.css" rel="stylesheet">
<link href="assets/vendor/swiper/swiper-bundle.min.css" rel="stylesheet">
<!-- Creating a python code section-->
<link rel="stylesheet" href="assets/css/prism.css">
<script src="assets/js/prism.js"></script>
<!-- Template Main CSS File -->
<link href="assets/css/style.css" rel="stylesheet">
<!-- To set the icon, visit https://fontawesome.com/account-->
<script src="https://kit.fontawesome.com/5d25c1efd3.js" crossorigin="anonymous"></script>
<!-- end of icon-->
<script type="text/javascript" async
src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js?config=TeX-MML-AM_CHTML">
</script>
<!-- =======================================================
* Template Name: iPortfolio
* Updated: Sep 18 2023 with Bootstrap v5.3.2
* Template URL: https://bootstrapmade.com/iportfolio-bootstrap-portfolio-websites-template/
* Author: BootstrapMade.com
* License: https://bootstrapmade.com/license/
======================================================== -->
</head>
<body>
<!-- ======= Mobile nav toggle button ======= -->
<i class="bi bi-list mobile-nav-toggle d-xl-none"></i>
<!-- ======= Header ======= -->
<header id="header">
<div class="d-flex flex-column">
<div class="profile">
<img src="assets/img/myphoto.jpeg" alt="" class="img-fluid rounded-circle">
<h1 class="text-light"><a href="index.html">Arun</a></h1>
<div class="social-links mt-3 text-center">
<a href="https://www.linkedin.com/in/arunp77/" target="_blank" class="linkedin"><i class="bx bxl-linkedin"></i></a>
<a href="https://github.com/arunp77" target="_blank" class="github"><i class="bx bxl-github"></i></a>
<a href="https://twitter.com/arunp77_" target="_blank" class="twitter"><i class="bx bxl-twitter"></i></a>
<a href="https://www.instagram.com/arunp77/" target="_blank" class="instagram"><i class="bx bxl-instagram"></i></a>
<a href="https://arunp77.medium.com/" target="_blank" class="medium"><i class="bx bxl-medium"></i></a>
</div>
</div>
<nav id="navbar" class="nav-menu navbar">
<ul>
<li><a href="index.html#hero" class="nav-link scrollto active"><i class="bx bx-home"></i> <span>Home</span></a></li>
<li><a href="index.html#about" class="nav-link scrollto"><i class="bx bx-user"></i> <span>About</span></a></li>
<li><a href="index.html#resume" class="nav-link scrollto"><i class="bx bx-file-blank"></i> <span>Resume</span></a></li>
<li><a href="index.html#portfolio" class="nav-link scrollto"><i class="bx bx-book-content"></i> <span>Portfolio</span></a></li>
<li><a href="index.html#skills-and-tools" class="nav-link scrollto"><i class="bx bx-wrench"></i> <span>Skills and Tools</span></a></li>
<li><a href="index.html#language" class="nav-link scrollto"><i class="bi bi-menu-up"></i> <span>Languages</span></a></li>
<li><a href="index.html#awards" class="nav-link scrollto"><i class="bi bi-award-fill"></i> <span>Awards</span></a></li>
<li><a href="index.html#professionalcourses" class="nav-link scrollto"><i class="bx bx-book-alt"></i> <span>Professional Certification</span></a></li>
<li><a href="index.html#publications" class="nav-link scrollto"><i class="bx bx-news"></i> <span>Publications</span></a></li>
<li><a href="index.html#extra-curricular" class="nav-link scrollto"><i class="bx bx-rocket"></i> <span>Extra-Curricular Activities</span></a></li>
<!-- <li><a href="#contact" class="nav-link scrollto"><i class="bx bx-envelope"></i> <span>Contact</span></a></li> -->
</ul>
</nav><!-- .nav-menu -->
</div>
</header><!-- End Header -->
<main id="main">
<!-- ======= Breadcrumbs ======= -->
<section id="breadcrumbs" class="breadcrumbs">
<div class="container">
<div class="d-flex justify-content-between align-items-center">
<h2>Machine Learning</h2>
<ol>
<li><a href="machine-learning.html" class="clickable-box">Content section</a></li>
<li><a href="index.html#portfolio" class="clickable-box">Portfolio section</a></li>
</ol>
</div>
</div>
</section><!-- End Breadcrumbs -->
<!------ right dropdown menue ------->
<div class="right-side-list">
<div class="dropdown">
<button class="dropbtn"><strong>Shortcuts:</strong></button>
<div class="dropdown-content">
<ul>
<li><a href="cloud-compute.html"><i class="fas fa-cloud"></i> Cloud</a></li>
<li><a href="AWS-GCP.html"><i class="fas fa-cloud"></i> AWS-GCP</a></li>
<li><a href="amazon-s3.html"><i class="fas fa-cloud"></i> AWS S3</a></li>
<li><a href="ec2-confi.html"><i class="fas fa-server"></i> EC2</a></li>
<li><a href="Docker-Container.html"><i class="fab fa-docker" style="color: rgb(29, 27, 27);"></i> Docker</a></li>
<li><a href="Jupyter-nifi.html"><i class="fab fa-python" style="color: rgb(34, 32, 32);"></i> Jupyter-nifi</a></li>
<li><a href="snowflake-task-stream.html"><i class="fas fa-snowflake"></i> Snowflake</a></li>
<li><a href="data-model.html"><i class="fas fa-database"></i> Data modeling</a></li>
<li><a href="sql-basics.html"><i class="fas fa-table"></i> QL</a></li>
<li><a href="sql-basic-details.html"><i class="fas fa-database"></i> SQL</a></li>
<li><a href="Bigquerry-sql.html"><i class="fas fa-database"></i> Bigquerry</a></li>
<li><a href="scd.html"><i class="fas fa-archive"></i> SCD</a></li>
<li><a href="sql-project.html"><i class="fas fa-database"></i> SQL project</a></li>
<!-- Add more subsections as needed -->
</ul>
</div>
</div>
</div>
<!-- ======= Portfolio Details Section ======= -->
<section id="portfolio-details" class="portfolio-details">
<div class="container">
<div class="row gy-4">
<h1>Naive Bayes: Classification methods</h1>
<div class="col-lg-8">
<div class="portfolio-details-slider swiper">
<div class="swiper-wrapper align-items-center">
<div class="swiper-slide">
<figure>
<img src="assets/img/machine-ln/classification-naive.png" alt="" style="max-width: 90%; max-height: auto;">
<figcaption style="text-align: center;"></figcaption>
</figure>
</div>
</div>
</div>
</div>
<div class="col-lg-4 grey-box">
<div class="section-title">
<h3>Content</h3>
<ol>
<li><a href="#introduction">Introduction</a></li>
<ul>
<li><a href="#principle-bayes">Principle of Naive Bayes</a></li>
<li><a href="#working">Working of Naive Bayes</a></li>
<li><a href="#types">Types of Naive Bayes Classifiers</a></li>
<li><a href="#advantages">Advantages of Naive Bayes</a></li>
<li><a href="#limitations">Limitations of Naive Bayes</a></li>
<li><a href="#applications-bayes">Applications of Naive Bayes</a></li>
</ul>
<li><a href="#example">Example</a></li>
<ul>
<li><a href="#example-weather">Example on weather outlook</a></li>
<li><a href="#example-iris">Irdis dataset</a></li>
<li><a href="#example-userdata">User_Data.csv</a></li>
</ul>
<li><a href="#reference">Reference</a></li>
</ol>
</div>
</div>
</div>
<section>
<h2 id="introdction">Introduction</h2>
<p>Naive Bayes algorithms are a family of probabilistic classifiers based on Bayes' theorem with the "naive" assumption of independence between features. Despite their simplicity, they are powerful and widely used for classification tasks in various fields, including text classification, spam filtering, and medical diagnosis. They work by calculating the probability of a given data point belonging to each class and selecting the class with the highest probability. </p>
<p>Naive Bayes algorithms are efficient, particularly for large datasets, and they perform well even with limited training data. Popular variants include Gaussian Naive Bayes, Multinomial Naive Bayes, and Bernoulli Naive Bayes. </p>
<h4 id="principle-bayes">Principle of Naive Bayes</h4>
<p>At the core of Naive Bayes is Bayes' theorem, which calculates the probability of a hypothesis (class label) given the observed evidence (features). The "naive" assumption in Naive Bayes is that the features are conditionally independent given the class label, which simplifies the calculation of probabilities.</p>
<div class="important-box">
Bayes' theorem is a fundamental concept in probability theory and statistics, used to update the probability of a hypothesis (or event) based on new evidence. It is expressed mathematically as:
$$P(A|B) = \frac{P(B|A) \times P(A)}{P(B)}$$
where:
<ul>
<li>\(P(A|B)\) is the probability of event \(A\) occurring given that event \(B\) has occurred. This is called the posterior probability. The posterior probability represents the updated belief about the probability of event A occurring after considering the new evidence (event B). It is what we are interested in calculating using Bayes' theorem.</li>
<li>\(P(B|A)\) is the probability of event \(B\) occurring given that event \(A\) has occurred. This is called the likelihood. The likelihood represents the probability of observing the new evidence (event B) given that the hypothesis (event A) is true. It quantifies how well the evidence supports the hypothesis.</li>
<li>\(P(A)\) is the probability of event \(A\) occurring. This is called the prior probability. The prior probability represents our initial belief about the probability of event \(A\) occurring before considering any new evidence.</li>
<li>\(P(B)\) is the probability of event \(B\) occurring. This is called the marginal probability. The marginal probability represents the total probability of observing event B, irrespective of whether event A is true or not. It serves as a normalization factor to ensure that the posterior probability is properly scaled.</li>
</ul>
<p><strong>Example: </strong>Let's say we have a medical test to detect a disease, and:
<ul>
<li>\(P(A)\) is the prior probability of having the disease.</li>
<li>\(P(B|A)\) is the probability of testing positive given that the person has the disease.</li>
<li>\(P(B)\) is the probability of testing positive (with or without having the disease).</li>
<li>\(P(A|B)\) is the probability of having the disease given that the person tested positive</li>
</ul>
Using Bayes' theorem, we can update our belief about the probability of having the disease \(P(A|B)\) based on the test result \(P(B|A)\), the prior probability of having the disease \(P(A)\), and the probability of testing positive \(P(B)\).
</p>
</div><br><br>
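<p>To make the example concrete, the posterior can be computed in a few lines of Python. The numbers below (prevalence, sensitivity, false-positive rate) are illustrative assumptions chosen for the sketch, not values from any real test:</p>
<pre><code class="language-python">
# Worked Bayes' theorem example with assumed, illustrative numbers
p_disease = 0.01             # P(A): prior probability of having the disease (1% prevalence)
p_pos_given_disease = 0.95   # P(B|A): probability of testing positive if diseased (sensitivity)
p_pos_given_healthy = 0.05   # P(B|not A): false-positive rate

# P(B): total probability of testing positive (law of total probability)
p_pos = p_pos_given_disease * p_disease + p_pos_given_healthy * (1 - p_disease)

# P(A|B): posterior probability of having the disease given a positive test
p_disease_given_pos = p_pos_given_disease * p_disease / p_pos
print(f"P(disease | positive test) = {p_disease_given_pos:.3f}")   # approximately 0.161
</code></pre>
<p>Note how a positive result from a fairly accurate test still yields only a ~16% posterior, because the disease is rare: the prior \(P(A)\) strongly shapes the answer.</p>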
<h4 id="working">Working of Naive Bayes</h4>
<ol>
<li><strong>Training Phase: </strong>During the training phase, Naive Bayes learns the probabilities of each feature given each class label from the training data.</li>
$$P(C_j) = \frac{\text{Number of instances with class} ~ C_j}{\text{Total number of instances}}$$
where \(P(C_j)\) represents the prior probability of each class occurring in the dataset, calculated as the frequency of that class divided by the total number of instances.
<li><strong>Probability Calculation: </strong> To classify a new data point:
<ul>
<li>Calculate the posterior probability of each class given the observed features using Bayes' theorem i.e. \(P(X_i|C_j)\). Depending on the type of feature (e.g., continuous, categorical), different probability distributions (e.g., Gaussian, multinomial) can be used. <strong>For example</strong>, for continuous features, Gaussian Naive Bayes assumes a normal (Gaussian) distribution for each feature given each class. Thus, the likelihood \(P(X_i|C_j)\) can be calculated using the mean \(\mu_{ij}\) and standard deviation \(\sigma_{ij}\) of feature \(X_i\) in class \(C_j\).
$$P(X_i|C_j) = \frac{1}{\sqrt{2\pi \sigma^2_{ij}}} ~\text{exp}\left(-\frac{(x-\mu_{ij})^2}{2\sigma_{ij}^2}\right).$$
<p><strong>Probability Calculation for classification:</strong> To classify a new data point, Naive Bayes calculates the posterior probability of each class given the observed features using Bayes' theorem:</p>
$$P(C_j|X_1,X_2,...,X_n) = \frac{P(X_1,X_2,...,X_n |C_j) \times P(C_j)}{P(X_1,X_2,...,X_n)}$$
Given the "naive" assumption of feature independence, this equation simplifies to:
$$P(C_j|X_1,X_2,...,X_n) = P(C_j) \times \Pi_{i=1}^n P(X_i|C_j)$$
where:
<ul>
<li>\(P(C_j|X_1,X_2,...,X_n)\) is the posterior probability of class \(C_j\) given the observed features.</li>
<li>\(P(X_i|C_j)\) is the likelihood of feature \(X_i\) given class \(C_j\), which was calculated during the training phase.</li>
<li>\(P(C_j)\) is the prior probability of class \(C_j\), also calculated during the training phase. </li>
</ul>
Finally, Naive Bayes selects the class with the highest posterior probability as the predicted class for the new data point:
$$\text{Predicted Class} = \arg\max_{C_j} P(C_j|X_1, X_2,...,X_n).$$
In summary, during the training phase, Naive Bayes learns the probabilities of each feature given each class from the training data. Then, during the prediction phase, it calculates the posterior probability of each class given the observed features using Bayes' theorem and selects the class with the highest posterior probability as the predicted class (a minimal implementation sketch is given just after this list).
</li>
<li>Select the class with the highest posterior probability as the predicted class for the new data point.</li>
</ul>
</li>
</ol>
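<p>The equations above translate almost line-for-line into code. The following is a minimal NumPy sketch of Gaussian Naive Bayes (the class and variable names are my own, not from any library); it works in log space, turning the product \(\prod_i P(X_i|C_j)\) into a sum to avoid numerical underflow:</p>
<pre><code class="language-python">
import numpy as np

class SimpleGaussianNB:
    def fit(self, X, y):
        self.classes = np.unique(y)
        self.priors, self.means, self.variances = {}, {}, {}
        for c in self.classes:
            Xc = X[y == c]
            self.priors[c] = len(Xc) / len(X)            # P(C_j): class frequency
            self.means[c] = Xc.mean(axis=0)              # mu_ij for each feature
            self.variances[c] = Xc.var(axis=0) + 1e-9    # sigma^2_ij (epsilon for stability)
        return self

    def predict(self, X):
        predictions = []
        for x in X:
            log_posteriors = []
            for c in self.classes:
                # log P(C_j) + sum_i log P(X_i|C_j), with Gaussian likelihoods
                var = self.variances[c]
                log_likelihood = -0.5 * np.sum(np.log(2 * np.pi * var)
                                               + (x - self.means[c]) ** 2 / var)
                log_posteriors.append(np.log(self.priors[c]) + log_likelihood)
            # Predicted class = argmax over the (log) posteriors
            predictions.append(self.classes[np.argmax(log_posteriors)])
        return np.array(predictions)
</code></pre>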
<h4 id="types">Types of Naive Bayes Classifiers</h4>
<ol>
<li><strong>Gaussian Naive Bayes: </strong>Assumes that the features follow a Gaussian (normal) distribution.</li>
<li><strong>Multinomial Naive Bayes: </strong>Suitable for features that represent counts or frequencies (e.g., word counts in text classification).</li>
<li><strong>Bernoulli Naive Bayes: </strong>Designed for binary features, where each feature is either present or absent.</li>
</ol>
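<p>In scikit-learn these three variants correspond to the classes <code>GaussianNB</code>, <code>MultinomialNB</code>, and <code>BernoulliNB</code>. A short sketch with made-up toy data, just to illustrate the intended feature type for each:</p>
<pre><code class="language-python">
import numpy as np
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

y = np.array([0, 0, 1, 1])

# Continuous measurements -> Gaussian Naive Bayes
X_cont = np.array([[1.2, 3.4], [1.0, 3.1], [4.5, 0.2], [4.8, 0.5]])
print(GaussianNB().fit(X_cont, y).predict([[1.1, 3.0]]))

# Counts or frequencies (e.g., word counts) -> Multinomial Naive Bayes
X_count = np.array([[3, 0, 1], [2, 0, 2], [0, 4, 0], [1, 3, 0]])
print(MultinomialNB().fit(X_count, y).predict([[2, 0, 1]]))

# Binary present/absent features -> Bernoulli Naive Bayes
X_bin = np.array([[1, 0, 1], [1, 0, 0], [0, 1, 0], [0, 1, 1]])
print(BernoulliNB().fit(X_bin, y).predict([[1, 0, 0]]))
</code></pre>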
<h4 id="advantages">Advantages of Naive Bayes:</h4>
<ol>
<li><strong>Simplicity: </strong>Naive Bayes is straightforward to implement and understand, making it suitable for quick prototyping and baseline models.</li>
<li><strong>Efficiency: </strong>Naive Bayes is computationally efficient, especially for large datasets, as it requires only simple probability calculations.</li>
<li><strong>Robustness to Irrelevant Features: </strong>Naive Bayes can perform well even in the presence of irrelevant features, thanks to its independence assumption.</li>
</ol>
<h4 id="limitations">Limitations of Naive Bayes</h4>
<ol>
<li><strong>Strong Independence Assumption: </strong>The assumption of feature independence may not hold true in many real-world datasets, leading to suboptimal performance.</li>
<li><strong>Sensitive to Imbalanced Data: </strong>Naive Bayes can be sensitive to imbalanced class distributions, potentially biasing the model towards the majority class.</li>
<li><strong>Limited Expressiveness: </strong>Naive Bayes may struggle with capturing complex relationships between features, especially in datasets with highly nonlinear decision boundaries.</li>
</ol>
<h4 id="applications-bayes">Applications of Naive Bayes:</h4>
<ol>
<li><strong>Text Classification: </strong>Naive Bayes is widely used for text classification tasks, such as spam detection, sentiment analysis, and document categorization.</li>
<li><strong>Recommendation Systems: </strong>Naive Bayes can be employed in recommendation systems to predict user preferences based on historical interactions.</li>
<li><strong>Medical Diagnosis: </strong>Naive Bayes has applications in medical diagnosis, where it can assist in predicting the likelihood of diseases based on symptoms and patient characteristics.</li>
</ol>
Naive Bayes is a simple yet effective classification algorithm that can deliver competitive performance in various machine learning tasks, especially when dealing with text data and relatively simple classification problems. However, its performance may degrade in more complex scenarios where the independence assumption is violated or when dealing with highly imbalanced datasets.
<!------------------------->
<hr>
<h3 id="example">Example</h3>
<h4 id="example-weather">Example on weather outlook</h4>
Suppose we have a dataset of weather conditions and a corresponding target variable "Play". Using this dataset, we need to decide whether or not to play on a particular day according to the weather conditions. To solve this problem, we follow the steps below
(for more details, see this <a href="https://www.javatpoint.com/machine-learning-naive-bayes-classifier" target="_blank">link</a>):
<br> <br>
<table>
<tr>
<th>Day</th>
<th>Outlook</th>
<th>Play</th>
</tr>
<tr>
<td>0</td>
<td>Rainy</td>
<td>Yes</td>
</tr>
<tr>
<td>1</td>
<td>Sunny</td>
<td>Yes</td>
</tr>
<tr>
<td>2</td>
<td>Overcast</td>
<td>Yes</td>
</tr>
<tr>
<td>3</td>
<td>Overcast</td>
<td>Yes</td>
</tr>
<tr>
<td>4</td>
<td>Sunny</td>
<td>No</td>
</tr>
<tr>
<td>5</td>
<td>Rainy</td>
<td>Yes</td>
</tr>
<tr>
<td>6</td>
<td>Sunny</td>
<td>Yes</td>
</tr>
<tr>
<td>7</td>
<td>Overcast</td>
<td>Yes</td>
</tr>
<tr>
<td>8</td>
<td>Rainy</td>
<td>No</td>
</tr>
<tr>
<td>9</td>
<td>Sunny</td>
<td>No</td>
</tr>
<tr>
<td>10</td>
<td>Sunny</td>
<td>Yes</td>
</tr>
<tr>
<td>11</td>
<td>Rainy</td>
<td>No</td>
</tr>
<tr>
<td>12</td>
<td>Overcast</td>
<td>Yes</td>
</tr>
<tr>
<td>13</td>
<td>Overcast</td>
<td>Yes</td>
</tr>
</table><br>
<p>Total samples = 14</p>
<p><strong>Frequency table and the Likelihood for the Weather Conditions:</strong></p>
<table>
<tr>
<th><strong>Outlook</strong></th>
<th>Yes</th>
<th>No</th>
<th>P(Outlook | Yes)</th>
<th>P(Outlook | No)</th>
<th>P(Outlook)</th>
</tr>
<tr>
<td>Overcast</td>
<td>5</td>
<td>0</td>
<td>5/10 = 0.5</td>
<td>0</td>
<td>5/14 = 0.36</td>
</tr>
<tr>
<td>Rainy</td>
<td>2</td>
<td>2</td>
<td>2/10 =0.2</td>
<td>2/4 = 0.5</td>
<td>4/14 = 0.29</td>
</tr>
<tr>
<td>Sunny</td>
<td>3</td>
<td>2</td>
<td>3/10 = 0.3</td>
<td>2/4 = 0.5</td>
<td>5/14 = 0.36</td>
</tr>
<tr>
<td>Total</td>
<td>10</td>
<td>4</td>
<td></td>
<td></td>
<td></td>
</tr>
</table><br>
<table>
<tr>
<th>Play (yes or no)</th>
<th>Total number</th>
<th>P(Play)</th>
</tr>
<tr>
<td>Yes</td>
<td>10</td>
<td>10/14 = 0.71</td>
</tr>
<tr>
<td>No</td>
<td>4</td>
<td>4/14 = 0.29</td>
</tr>
</table>
So applying Bayes' theorem, we have:
$$P(\text{Yes}|\text{Sunny}) = \frac{P(\text{Sunny}|\text{Yes}) \times P(\text{Yes})}{P(\text{Sunny})} = \frac{3/10 \times 10/14}{5/14} = 0.60$$
and
$$P(\text{No}|\text{Sunny}) = \frac{P(\text{Sunny}|\text{No}) \times P(\text{No})}{P(\text{Sunny})} = \frac{2/4 \times 4/14}{5/14} = 0.40$$
<p>Because both posteriors are divided by the same \(P(\text{Sunny})\), they already sum to 1, so no further normalization is needed:</p>
$$P(\text{Yes}|\text{Sunny}) = 0.60 \equiv 60\% \qquad \text{and} \qquad P(\text{No}|\text{Sunny}) = 0.40 \equiv 40\%$$
<p>Since \(P(\text{Yes}|\text{Sunny}) > P(\text{No}|\text{Sunny})\), on a Sunny day the player can play the game.</p>
<p><strong>Example: Python implementation. </strong>A Python implementation of this example, with slightly different, randomly generated outlooks:</p>
<pre><code class="language-python">
import pandas as pd
import random
# Define the possible values for Outlook and Play
outlook_values = ['Sunny', 'Rainy', 'Overcast']
play_values = ['Yes', 'No']
# Generate 20 random data points
data = {
'Outlook': [random.choice(outlook_values) for _ in range(20)],
'Play': [random.choice(play_values) for _ in range(20)]
}
df = pd.DataFrame(data)
# Calculate the total number of instances
total_instances = len(df)
# Calculate the number of instances where the outlook is sunny
sunny_instances = len(df[df['Outlook'] == 'Sunny'])
# Calculate the number of instances where the outlook is sunny and the player plays
sunny_play_instances = len(df[(df['Outlook'] == 'Sunny') & (df['Play'] == 'Yes')])
# Calculate the prior probability of playing the game
prior_probability_play = df['Play'].value_counts(normalize=True)['Yes']
# Calculate the probability of playing the game given that the outlook is sunny using Bayes' theorem
probability_play_given_sunny = (sunny_play_instances / sunny_instances) * prior_probability_play
print("Probability of playing the game on a Sunny day using Bayes' theorem:", probability_play_given_sunny)
</code></pre>
<p>Because the 20 data points are drawn at random (no seed is set), the printed probability varies from run to run.</p>
<figure>
<img src="assets/img/machine-ln/classification-naive-example1.png" alt="" style="max-width: 90%; max-height: auto;">
<figcaption style="text-align: center;">Number of 'Yes' and 'No' instances for each Weather Outlook is shown on the right hand side.</figcaption>
</figure>
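<p>The frequency and likelihood tables built by hand above can also be derived programmatically. A short sketch, assuming a DataFrame <code>df</code> with 'Outlook' and 'Play' columns like the one generated earlier:</p>
<pre><code class="language-python">
import pandas as pd

# Frequency table: count of 'Yes' and 'No' for each outlook (as in the figure above)
print(pd.crosstab(df['Outlook'], df['Play']))

# Likelihood table: P(Outlook | Play), each column normalized by its class total
print(pd.crosstab(df['Outlook'], df['Play'], normalize='columns'))

# Marginal probabilities P(Outlook)
print(df['Outlook'].value_counts(normalize=True))
</code></pre>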
<!----------------------------------->
<h4 id="example-iris">Examples-2: Iris data</h4>
In this example. we consider the 'Iris' dataset. This example demonstrates the process of training and evaluating a Gaussian Naive Bayes classifier on the Iris dataset using scikit-learn. It shows how to load the data, split it into training and testing sets, train the classifier, make predictions, and evaluate the classifier's performance using accuracy, classification report, and <a href="https://arunp77.github.io/logistic-regression.html#con-mat" target="_blank">Confusion matrix details</a>.
<pre><code class="language-python">
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix
# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Initialize the Gaussian Naive Bayes classifier
gnb = GaussianNB()
# Train the classifier
gnb.fit(X_train, y_train)
# Make predictions on the testing set
y_pred = gnb.predict(X_test)
</code></pre>
The accuracy is:
<pre><code class="language-python">
# Calculate the accuracy of the classifier
accuracy = gnb.score(X_test, y_test)
print("Accuracy:", accuracy)
</code></pre>
<pre>Accuracy: 0.9777777777777777</pre>
<pre><code class="language-python">
# Print classification report and confusion matrix
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
</code></pre>
<figure>
<img src="assets/img/machine-ln/classification-naive-example2.png" alt="" style="max-width: 90%; max-height: auto;">
<figcaption style="text-align: center;"></figcaption>
</figure>
<p><strong>Confusion matrix: </strong></p>
<pre><code class="language-python">
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
</code></pre>
<figure>
<img src="assets/img/machine-ln/classification-naive-confusionmatrix.png" alt="" style="max-width: 90%; max-height: auto;">
<figcaption style="text-align: center;"></figcaption>
</figure>
<p>The classification report provides a concise summary of the performance of a classifier. In this report, precision measures the accuracy of positive predictions, recall assesses the proportion of actual positives correctly identified, and the F1-score balances precision and recall. For the given classes (0, 1, 2), the classifier achieves high precision, indicating accurate positive predictions. Similarly, recall scores show effective identification of actual positives, though class 1 has slightly lower recall. The F1-score reflects a good balance between precision and recall for each class, indicating overall strong performance. The support values represent the number of instances for each class in the testing set. The reported accuracy of 98% highlights the overall correctness of the classifier across all classes, with both macro and weighted averages reinforcing balanced performance.</p>
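<p>Since Naive Bayes is probabilistic, the fitted model can also expose the posterior probabilities \(P(C_j|X)\) behind each prediction. A short sketch continuing from the <code>gnb</code> classifier trained above:</p>
<pre><code class="language-python">
# Posterior probabilities for the first three test samples
probs = gnb.predict_proba(X_test[:3])
preds = gnb.predict(X_test[:3])
for sample_probs, pred in zip(probs, preds):
    print(dict(zip(iris.target_names, sample_probs.round(3))), '->', iris.target_names[pred])
</code></pre>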
<p></p>
<!------------------------------->
<h4 id="example-userdata">Example-3</h4>
In this example, we use dataset 'User_Data.csv' for training, or testing algorithms like Naive Bayes for classification tasks. In this case, it provides a starting point for implementing a Naive Bayes algorithm using Python. Data is available in my <a href="" target="_blank">Github repo</a>
<ul>
<li><strong>Step-1: </strong>Load the dataset:</li>
<pre><code class="language-python">
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
df_user = pd.read_csv('User_Data.csv')
# Display the DataFrame
print(df_user.head())
# Display the DataFrame
print(df_user.head())
</code></pre>
<li><strong>Step-2: </strong>Splitting the dataset into the Training set and Test set
<pre><code class="language-python">
# Importing the dataset
X = df_user.iloc[:, [2, 3]].values
y = df_user.iloc[:, 4].values
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
</code></pre>
</li>
<li><strong>Step-3: </strong> Feature scaling:
<pre><code class="language-python">
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
</code></pre>
</li>
<li><strong>Step-4: </strong>Fitting Naive Bayes and then predicting from the test set:
<pre><code class="language-python">
# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)
# Predicting the Test set results
y_pred = classifier.predict(X_test)
</code></pre>
</li>
<li><strong>Step-5: </strong>Some metrics (further metrics are sketched just after this list):
<pre><code class="language-python">
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
</code></pre>
</li>
<li><strong>Step-6: </strong>Visualizing the training set:
<pre><code class="language-python">
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
# Create a figure with two subplots
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
# Plot for training set
plt.sca(axes[0]) # Select the first subplot
x_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start=x_set[:, 0].min() - 1, stop=x_set[:, 0].max() + 1, step=0.01),
np.arange(start=x_set[:, 1].min() - 1, stop=x_set[:, 1].max() + 1, step=0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
alpha=0.75, cmap=ListedColormap(('lightblue', 'lightgreen')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
plt.scatter(x_set[y_set == j, 0], x_set[y_set == j, 1],
c=ListedColormap(('blue', 'green'))(i), label=j)
plt.title('Naive Bayes (Training set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
# Plot for test set
plt.sca(axes[1]) # Select the second subplot
x_set, y_set = X_test, y_test
X1, X2 = np.meshgrid(np.arange(start=x_set[:, 0].min() - 1, stop=x_set[:, 0].max() + 1, step=0.01),
np.arange(start=x_set[:, 1].min() - 1, stop=x_set[:, 1].max() + 1, step=0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
alpha=0.75, cmap=ListedColormap(('lightblue', 'lightgreen')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
plt.scatter(x_set[y_set == j, 0], x_set[y_set == j, 1],
c=ListedColormap(('blue', 'green'))(i), label=j)
plt.title('Naive Bayes (Test set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
# Adjust layout and display the plots
plt.tight_layout()
plt.show()
</code></pre>
<figure>
<img src="assets/img/machine-ln/classification-naive-example-training.png" alt="" style="max-width: 90%; max-height: auto;">
<figcaption style="text-align: center;"></figcaption>
</figure>
</li>
<p>The final output above depicts the classifier's performance on the test set. The classifier has drawn a smooth decision boundary, derived from the Gaussian class likelihoods, to separate the "purchased" and "not purchased" classes. Despite some erroneous predictions, which are quantified in the confusion matrix, the classifier performs well overall and can be considered a reliable predictor.</p>
</ul>
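<p>Extending Step-5, further metrics can be computed from the same predictions. A short sketch, assuming the classifier and test split from the steps above:</p>
<pre><code class="language-python">
from sklearn.metrics import accuracy_score, classification_report

# Overall accuracy and per-class precision/recall/F1 on the test set
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
</code></pre>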
</section>
<!----------- Reference ----------->
<section id="reference">
<h2>References</h2>
<ul>
<li>My GitHub repository on <a href="https://github.com/arunp77/Machine-Learning/" target="_blank">Machine learning</a></li>
<li><a href="https://www.simplilearn.com/tutorials/machine-learning-tutorial/naive-bayes-classifier" target="_blank">Understanding Naive Bayes Classifier: Simplilearn</a>.</li>
<li><a href="https://www.javatpoint.com/machine-learning-naive-bayes-classifier" target="_blank">Naïve Bayes Classifier Algorithm, JAVAPoint.com</a></li>
<li>Book on Statistics: <a href="https://hastie.su.domains/Papers/ESLII.pdf" target="_blank">The Elements of Statistical Learning</a></li>
</ul>
</section>
<hr>
<div style="background-color: #f0f0f0; padding: 15px; border-radius: 5px;">
<h3>Some other interesting things to know:</h3>
<ul style="list-style-type: disc; margin-left: 30px;">
<li>Visit my website on <a href="sql-project.html">For Data, Big Data, Data-modeling, Datawarehouse, SQL, cloud-compute.</a></li>
<li>Visit my website on <a href="Data-engineering.html">Data engineering</a></li>
</ul>
</div>
<p></p>
<div class="navigation">
<a href="index.html#portfolio" class="clickable-box">
<span class="arrow-left">Portfolio section</span>
</a>
<a href="machine-learning.html" class="clickable-box">
<span class="arrow-right">Content</span>
</a>
</div>
</div>
</div>
</section><!-- End Portfolio Details Section -->
</main><!-- End #main -->
<!-- ======= Footer ======= -->
<footer id="footer">
<div class="container">
<div class="copyright">
© Copyright <strong><span>Arun</span></strong>
</div>
</div>
</footer><!-- End Footer -->
<a href="#" class="back-to-top d-flex align-items-center justify-content-center"><i class="bi bi-arrow-up-short"></i></a>
<!-- Vendor JS Files -->
<script src="assets/vendor/purecounter/purecounter_vanilla.js"></script>
<script src="assets/vendor/aos/aos.js"></script>
<script src="assets/vendor/bootstrap/js/bootstrap.bundle.min.js"></script>
<script src="assets/vendor/glightbox/js/glightbox.min.js"></script>
<script src="assets/vendor/isotope-layout/isotope.pkgd.min.js"></script>
<script src="assets/vendor/swiper/swiper-bundle.min.js"></script>
<script src="assets/vendor/typed.js/typed.umd.js"></script>
<script src="assets/vendor/waypoints/noframework.waypoints.js"></script>
<script src="assets/vendor/php-email-form/validate.js"></script>
<!-- Template Main JS File -->
<script src="assets/js/main.js"></script>
</body>
</html>