Feature request: Add Anomaly Detection Project

Fixes #858
UppuluriKalyani · Nov 9, 2024 · 9142b5d · 9142b5d
1 parent 56256c0
commit 9142b5d
Show file tree

Hide file tree

Showing 8 changed files with 382 additions and 0 deletions.
diff --git a/Anomaly Detection Project/Anomaly_Detection_Project.ipynb b/Anomaly Detection Project/Anomaly_Detection_Project.ipynb
@@ -0,0 +1,43 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Import necessary libraries\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "from sklearn.ensemble import IsolationForest\n",
+    "from sklearn.cluster import KMeans\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "from sklearn.metrics import confusion_matrix, classification_report\n",
+    "from sklearn.decomposition import PCA\n",
+    "\n",
+    "# Set plot style\n",
+    "sns.set(style=\"whitegrid\")\n",
+    "\n",
+    "# Load Dataset\n",
+    "# Replace 'data.csv' with your dataset file\n",
+    "data = pd.read_csv('data.csv')\n",
+    "\n",
+    "# Display the first few rows\n",
+    "data.head()\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/Anomaly Detection Project/anomalies_report.py b/Anomaly Detection Project/anomalies_report.py
@@ -0,0 +1,15 @@
+# anomalies_report.py
+
+import pandas as pd
+
+# Load preprocessed data
+data = pd.read_csv('preprocessed_data.csv')
+
+# Identify anomalies based on both clustering and isolation forest
+anomalies_kmeans = data[data['Cluster'] == 1]  # From KMeans
+anomalies_isoforest = data[data['Anomaly'] == 1]  # From Isolation Forest
+
+# Combine and save identified anomalies from both methods
+anomalies_report = pd.concat([anomalies_kmeans, anomalies_isoforest]).drop_duplicates()
+anomalies_report.to_csv('anomalies_report.csv', index=False)
+print("Anomalies report saved as 'anomalies_report.csv'.")
diff --git a/Anomaly Detection Project/data_preprocessing.py b/Anomaly Detection Project/data_preprocessing.py
@@ -0,0 +1,19 @@
+# data_preprocessing.py
+
+import pandas as pd
+from sklearn.preprocessing import StandardScaler
+
+# Load dataset
+data = pd.read_csv('raw_data.csv')  # Replace with the actual path to your dataset
+
+# Check for missing values and drop rows with missing values
+data = data.dropna()
+
+# Standardize data
+scaler = StandardScaler()
+data_scaled = scaler.fit_transform(data)
+data_scaled = pd.DataFrame(data_scaled, columns=data.columns)
+
+# Save preprocessed data to CSV
+data_scaled.to_csv('preprocessed_data.csv', index=False)
+print("Preprocessed data saved to 'preprocessed_data.csv'.")
diff --git a/Anomaly Detection Project/eda_visualizations.py b/Anomaly Detection Project/eda_visualizations.py
@@ -0,0 +1,21 @@
+# eda_visualizations.py
+
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+# Load preprocessed data
+data = pd.read_csv('preprocessed_data.csv')
+
+# Set plot style
+sns.set(style="whitegrid")
+
+# Plot feature distributions
+fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))
+axes = axes.flatten()
+for i, col in enumerate(data.columns):
+    sns.histplot(data[col], kde=True, ax=axes[i])
+    axes[i].set_title(f"Distribution of {col}")
+plt.tight_layout()
+plt.savefig('eda_feature_distributions.png')
+plt.show()
diff --git a/Anomaly Detection Project/isolation_forest_anomaly_detection.py b/Anomaly Detection Project/isolation_forest_anomaly_detection.py
@@ -0,0 +1,38 @@
+# isolation_forest_anomaly_detection.py
+
+import pandas as pd
+from sklearn.ensemble import IsolationForest
+import joblib
+from sklearn.decomposition import PCA
+import matplotlib.pyplot as plt
+
+# Load preprocessed data
+data = pd.read_csv('preprocessed_data.csv')
+
+# Set Isolation Forest parameters
+contamination_rate = 0.05  # Adjust this value based on your dataset and anomaly expectations
+max_samples_value = "auto"  # Can be an integer or "auto" (256 or total sample size, whichever is smaller)
+
+# Isolation Forest with custom parameters
+iso_forest = IsolationForest(contamination=contamination_rate, max_samples=max_samples_value, random_state=42)
+data['Anomaly'] = iso_forest.fit_predict(data)
+
+# Convert labels for anomalies
+data['Anomaly'] = data['Anomaly'].apply(lambda x: 1 if x == -1 else 0)
+
+# Save the Isolation Forest model
+joblib.dump(iso_forest, 'isolation_forest_model.pkl')
+print("Isolation Forest model saved as 'isolation_forest_model.pkl'.")
+
+# Visualize anomalies with PCA
+pca = PCA(n_components=2)
+pca_data = pca.fit_transform(data.drop(columns=['Anomaly']))
+
+plt.figure(figsize=(10, 6))
+plt.scatter(pca_data[:, 0], pca_data[:, 1], c=data['Anomaly'], cmap='coolwarm', marker='o', alpha=0.6)
+plt.title("Isolation Forest Anomaly Detection with PCA")
+plt.xlabel("PCA Component 1")
+plt.ylabel("PCA Component 2")
+plt.colorbar(label="Anomaly")
+plt.savefig('isolation_forest_anomalies_pca.png')
+plt.show()
diff --git a/Anomaly Detection Project/kmeans_anomaly_detection.py b/Anomaly Detection Project/kmeans_anomaly_detection.py
@@ -0,0 +1,35 @@
+# kmeans_anomaly_detection.py
+
+import pandas as pd
+from sklearn.cluster import KMeans
+import joblib
+from sklearn.decomposition import PCA
+import matplotlib.pyplot as plt
+
+# Load preprocessed data
+data = pd.read_csv('preprocessed_data.csv')
+
+# KMeans Clustering
+kmeans = KMeans(n_clusters=2, random_state=42)
+kmeans.fit(data)
+data['Cluster'] = kmeans.labels_
+
+# Save the KMeans model
+joblib.dump(kmeans, 'kmeans_model.pkl')
+print("KMeans model saved as 'kmeans_model.pkl'.")
+
+# Identify anomalies based on cluster assignment
+anomalies_kmeans = data[data['Cluster'] == 1]
+
+# Visualize clusters with PCA
+pca = PCA(n_components=2)
+pca_data = pca.fit_transform(data.drop(columns=['Cluster']))
+
+plt.figure(figsize=(10, 6))
+plt.scatter(pca_data[:, 0], pca_data[:, 1], c=data['Cluster'], cmap='viridis', marker='o', alpha=0.6)
+plt.title("KMeans Clustering with PCA")
+plt.xlabel("PCA Component 1")
+plt.ylabel("PCA Component 2")
+plt.colorbar(label="Cluster")
+plt.savefig('kmeans_clusters_pca.png')
+plt.show()
diff --git a/Anomaly Detection Project/model_evaluation.py b/Anomaly Detection Project/model_evaluation.py
@@ -0,0 +1,23 @@
+# model_evaluation.py
+#
+# Optional evaluation step: compares Isolation Forest predictions with ground-truth
+# labels. Only the CSV load below is active; the evaluation itself stays commented
+# out until a label column is available.
+
+import pandas as pd
+from sklearn.metrics import confusion_matrix, classification_report
+
+# Load data with true labels (replace 'TrueLabels' with the actual column if available)
+data = pd.read_csv('preprocessed_data.csv')
+# Assuming `TrueLabels` column exists in the original data
+# NOTE(review): data_preprocessing.py standardizes every column it writes, so a label
+# column read from this CSV would hold scaled values rather than the raw labels --
+# confirm where the true labels should be loaded from.
+
+# Evaluate Isolation Forest Model
+# Assuming 'TrueLabels' is in the original data and represents ground truth for anomalies
+# Uncomment and use if true labels are available
+# NOTE(review): the code below also reads data['Anomaly'];
+# isolation_forest_anomaly_detection.py keeps that column in memory and does not
+# write it to 'preprocessed_data.csv', so it has to be added before this can run.
+
+# true_labels = data['TrueLabels']
+# print("Confusion Matrix (Isolation Forest):\n", confusion_matrix(true_labels, data['Anomaly']))
+# print("Classification Report (Isolation Forest):\n", classification_report(true_labels, data['Anomaly']))
+
+# Save the evaluation report to file
+# with open("evaluation_report.txt", "w") as f:
+#     f.write("Confusion Matrix (Isolation Forest):\n")
+#     f.write(str(confusion_matrix(true_labels, data['Anomaly'])) + "\n\n")
+#     f.write("Classification Report (Isolation Forest):\n")
+#     f.write(classification_report(true_labels, data['Anomaly']))
diff --git a/Anomaly Detection Project/readme.md b/Anomaly Detection Project/readme.md
@@ -0,0 +1,188 @@
+# Anomaly Detection Project
+
+This project demonstrates how to detect anomalies (unusual patterns) in a dataset using machine learning techniques like **Isolation Forest** and **KMeans Clustering**. The goal is to identify anomalous behaviors that could indicate fraudulent transactions, network intrusions, or other forms of outliers.
+
+## Table of Contents
+
+- [Project Overview](#project-overview)
+- [Files and Their Purpose](#files-and-their-purpose)
+- [Installation Instructions](#installation-instructions)
+- [Usage Instructions](#usage-instructions)
+- [Dependencies](#dependencies)
+- [Model Explanation](#model-explanation)
+- [Results and Visualizations](#results-and-visualizations)
+- [Contributing](#contributing)
+
+## Project Overview
+
+This project uses **unsupervised machine learning techniques** to identify anomalies in a given dataset. The anomaly detection process involves:
+
+1. **Data Collection & Preprocessing**: Gathering and cleaning data by dropping rows with missing values and standardizing the features.
+2. **Exploratory Data Analysis (EDA)**: Visualizing the data distributions, identifying potential anomalies, and calculating summary statistics.
+3. **Anomaly Detection Techniques**: Using **KMeans clustering** and **Isolation Forest** to detect anomalies.
+4. **Model Evaluation**: Evaluating the model performance with metrics like precision, recall, and F1 score (if ground truth labels are available).
+5. **Visualization**: Visualizing the anomalies detected using **PCA** and other plots.
+
+## Files and Their Purpose
+
+Here’s a breakdown of each script in the project:
+
+1. **`data_preprocessing.py`**:
+   - Handles data loading, cleaning, and preprocessing tasks.
+   - Drops rows with missing values and standardizes the features with `StandardScaler`.
+
+2. **`eda_visualizations.py`**:
+   - Performs Exploratory Data Analysis (EDA).
+   - Visualizes the data distributions and helps spot potential anomalies visually, using one histogram (with a KDE curve) per feature.
+
+3. **`kmeans_anomaly_detection.py`**:
+   - Implements anomaly detection using **KMeans clustering**.
+   - Identifies outliers based on clustering results.
+
+4. **`isolation_forest_anomaly_detection.py`**:
+   - Implements anomaly detection using the **Isolation Forest** algorithm.
+   - Detects anomalies by isolating points that are far from the rest of the data.
+   - Allows you to adjust `contamination` and `max_samples` parameters.
+
+5. **`model_evaluation.py`** (Optional):
+   - Evaluates the performance of anomaly detection techniques (if ground truth labels are available).
+   - Calculates precision, recall, F1 score, and other evaluation metrics.
+
+6. **`anomalies_report.py`**:
+   - Generates a report of the detected anomalies, which can be saved to a CSV or viewed as a summary.
+
+7. **`Anomaly_Detection_Project.ipynb`**:
+   - Jupyter notebook intended as an interactive starting point for the workflow above.
+   - Currently imports the required libraries and loads the dataset (`data.csv`); extend it with the steps from the scripts to add visualizations and explanations.
+
+## Installation Instructions
+
+### Clone this repository
+To get started, clone the repository to your local machine using:
+
+```bash
+git clone https://github.com/your-username/Anomaly-Detection-Project.git
+cd Anomaly-Detection-Project
+```
+
+### Install dependencies
+This project requires Python 3.x. You can install the required libraries using **pip** by running the following command in your terminal:
+
+```bash
+pip install -r requirements.txt
+```
+
+If `requirements.txt` is not included, you can manually install the dependencies:
+
+```bash
+pip install numpy pandas scikit-learn matplotlib seaborn joblib notebook
+```
+
+## Usage Instructions
+
+### Step 1: Data Preprocessing
+Run the `data_preprocessing.py` script to preprocess the raw data. This will clean the data, handle missing values, and normalize it for anomaly detection:
+
+```bash
+python data_preprocessing.py
+```
+
+### Step 2: Exploratory Data Analysis (EDA)
+Run the `eda_visualizations.py` script to visualize the data distributions and identify potential anomalies:
+
+```bash
+python eda_visualizations.py
+```
+
+### Step 3: Anomaly Detection with KMeans
+Use **KMeans** to detect anomalies by running the `kmeans_anomaly_detection.py` script:
+
+```bash
+python kmeans_anomaly_detection.py
+```
+
+### Step 4: Anomaly Detection with Isolation Forest
+Use **Isolation Forest** to detect anomalies by running the `isolation_forest_anomaly_detection.py` script. Adjust parameters like `contamination` and `max_samples` as needed.
+
+```bash
+python isolation_forest_anomaly_detection.py
+```
+
+### Step 5: Model Evaluation (Optional)
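+If you have ground truth labels, uncomment the evaluation code in `model_evaluation.py` (it is disabled by default), point it at your label column, and run it to evaluate the performance of the anomaly detection models: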
+
+```bash
+python model_evaluation.py
+```
+
+### Step 6: Generate Anomalies Report
+Run the `anomalies_report.py` script to generate and save a report of detected anomalies:
+
+```bash
+python anomalies_report.py
+```
+
+### Step 7: Jupyter Notebook Workflow
+Run the entire workflow within a Jupyter notebook for better visualization and step-by-step execution. Open the notebook:
+
+```bash
+jupyter notebook Anomaly_Detection_Project.ipynb
+```
+
+## Dependencies
+
+This project requires the following Python libraries:
+
+- `pandas`: For data manipulation and analysis.
+- `numpy`: For numerical operations.
+- `scikit-learn`: For machine learning models (KMeans and Isolation Forest).
+- `matplotlib`: For data visualization.
+- `seaborn`: For enhanced data visualizations.
+- `joblib`: For saving and loading models.
+
+You can install all dependencies with the command:
+
+```bash
+pip install -r requirements.txt
+```
+
+## Model Explanation
+
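+- **KMeans Clustering**: This unsupervised learning algorithm groups data points into clusters. In this project the data is split into two clusters, and the points assigned to cluster `1` are treated as anomalies; check the cluster sizes to confirm that this is the smaller, outlying group.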
+- **Isolation Forest**: This tree-based algorithm isolates anomalies by randomly selecting a feature and splitting the data. Anomalous points are those that are isolated more quickly than normal points.
+
+## Results and Visualizations
+
+The results of the anomaly detection process are visualized in several ways:
+
+1. **PCA Visualizations**: The results of the anomaly detection models (KMeans and Isolation Forest) are visualized using PCA (Principal Component Analysis), which reduces the dimensionality of the data to 2D for easy visualization.
+2. **Anomaly Distribution**: The detected anomalies are highlighted by color in the PCA scatter plots, which are saved as `kmeans_clusters_pca.png` and `isolation_forest_anomalies_pca.png`.
+
+## Contributing
+
+We welcome contributions to this project! If you have suggestions or want to add new features, feel free to fork this repository and submit a pull request.
+
+Please follow these steps to contribute:
+
+1. Fork the repository.
+2. Clone your fork to your local machine.
+3. Create a new branch for your feature (`git checkout -b feature-name`).
+4. Commit your changes (`git commit -m 'Add new feature'`).
+5. Push to your forked repository (`git push origin feature-name`).
+6. Submit a pull request.
+
+---
+
+Thank you for checking out the **Anomaly Detection Project**!
+
+
+
+### Explanation of Sections
+
+1. **Project Overview**: Describes the project goals and steps involved.
+2. **Files and Their Purpose**: Details the role of each file in the project.
+3. **Installation Instructions**: How to clone the repo and set up dependencies.
+4. **Usage Instructions**: Walks through how to run each script step by step.
+5. **Model Explanation**: Brief overview of the machine learning models used (KMeans and Isolation Forest).
+6. **Results and Visualizations**: Explains how the anomalies are visualized and evaluated.
+7. **Contributing**: Encourages other developers to contribute by forking the repo and submitting pull requests.