Merge branch 'main' into fixes-872

UppuluriKalyani · Nov 10, 2024 · bf5a00b · bf5a00b
2 parents 0a5cb2f + dad0fa0
commit bf5a00b
Show file tree

Hide file tree

Showing 25 changed files with 10,663 additions and 59 deletions.
diff --git a/.github/workflows/auto_assign.yml b/.github/workflows/auto_assign.yml
diff --git a/.github/workflows/auto_label.yml b/.github/workflows/auto_label.yml
@@ -17,14 +17,13 @@ jobs:
       with:
         github_token: ${{ secrets.GITHUB_TOKEN }} 
         labels: |
-          gssoc-ext
-          hacktoberfest-accepted
+          ml-nexus
 
     - name: Add labels to new pull requests
       if: github.event_name == 'pull_request_target'
       uses: actions-ecosystem/action-add-labels@v1
       with:
         github_token: ${{ secrets.GITHUB_TOKEN }}
         labels: |
-          gssoc-ext
-          hacktoberfest-accepted
+          ml-nexus
+          
diff --git a/.github/workflows/issue-reminder.yml b/.github/workflows/issue-reminder.yml
diff --git a/Anomaly Detection Project/Anomaly_Detection_Project.ipynb b/Anomaly Detection Project/Anomaly_Detection_Project.ipynb
@@ -0,0 +1,43 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Import necessary libraries\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "from sklearn.ensemble import IsolationForest\n",
+    "from sklearn.cluster import KMeans\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "from sklearn.metrics import confusion_matrix, classification_report\n",
+    "from sklearn.decomposition import PCA\n",
+    "\n",
+    "# Set plot style\n",
+    "sns.set(style=\"whitegrid\")\n",
+    "\n",
+    "# Load Dataset\n",
+    "# Replace 'data.csv' with your dataset file\n",
+    "data = pd.read_csv('data.csv')\n",
+    "\n",
+    "# Display the first few rows\n",
+    "data.head()\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/Anomaly Detection Project/anomalies_report.py b/Anomaly Detection Project/anomalies_report.py
@@ -0,0 +1,15 @@
+# anomalies_report.py
+
+import pandas as pd
+
+# Load preprocessed data
+data = pd.read_csv('preprocessed_data.csv')
+
+# The report needs the KMeans 'Cluster' labels and the Isolation Forest
+# 'Anomaly' flags to be present in the CSV. Fail early with an actionable
+# message instead of a bare KeyError on the first column lookup.
+missing_columns = [col for col in ('Cluster', 'Anomaly') if col not in data.columns]
+if missing_columns:
+    raise KeyError(
+        f"'preprocessed_data.csv' is missing required column(s): {missing_columns}. "
+        "Add the KMeans 'Cluster' labels and the Isolation Forest 'Anomaly' flags "
+        "to the file before building the report."
+    )
+
+# Identify anomalies based on both clustering and isolation forest
+# NOTE(review): this treats cluster id 1 as the anomalous cluster - confirm
+# that this matches how the 'Cluster' column was produced.
+is_kmeans_anomaly = data['Cluster'] == 1
+is_isoforest_anomaly = data['Anomaly'] == 1
+anomalies_kmeans = data[is_kmeans_anomaly]  # From KMeans
+anomalies_isoforest = data[is_isoforest_anomaly]  # From Isolation Forest
+
+# Combine and save identified anomalies from both methods.
+# A single OR-mask lists every flagged row exactly once and in its original
+# order. Concatenating the two subsets and de-duplicating by value would also
+# merge distinct observations that merely share identical feature values.
+anomalies_report = data[is_kmeans_anomaly | is_isoforest_anomaly]
+anomalies_report.to_csv('anomalies_report.csv', index=False)
+print("Anomalies report saved as 'anomalies_report.csv'.")
diff --git a/Anomaly Detection Project/data_preprocessing.py b/Anomaly Detection Project/data_preprocessing.py
@@ -0,0 +1,19 @@
+# data_preprocessing.py
+
+import pandas as pd
+from sklearn.preprocessing import StandardScaler
+
+# Read the raw dataset (replace with the actual path to your dataset)
+raw_frame = pd.read_csv('raw_data.csv')
+
+# Keep only the rows that have no missing values
+data = raw_frame.dropna()
+
+# Standardize every column, then wrap the resulting array back into a
+# DataFrame that carries the original column names
+scaler = StandardScaler()
+data_scaled = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)
+
+# Write the standardized table to disk for the downstream scripts
+data_scaled.to_csv('preprocessed_data.csv', index=False)
+print("Preprocessed data saved to 'preprocessed_data.csv'.")
diff --git a/Anomaly Detection Project/eda_visualizations.py b/Anomaly Detection Project/eda_visualizations.py
@@ -0,0 +1,21 @@
+# eda_visualizations.py
+
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+# Load preprocessed data
+data = pd.read_csv('preprocessed_data.csv')
+
+# Set plot style
+sns.set(style="whitegrid")
+
+# Plot feature distributions.
+# The grid is sized from the number of columns so that any feature count fits;
+# a fixed 2x3 grid would raise IndexError as soon as there are more than six.
+n_features = len(data.columns)
+n_cols = 3
+n_rows = max(1, (n_features + n_cols - 1) // n_cols)  # ceiling division
+# squeeze=False always returns a 2-D array of axes, even for a single row.
+fig, axes = plt.subplots(
+    nrows=n_rows,
+    ncols=n_cols,
+    figsize=(5 * n_cols, 5 * n_rows),  # 6 features -> (15, 10)
+    squeeze=False,
+)
+axes = axes.flatten()
+for ax, col in zip(axes, data.columns):
+    sns.histplot(data[col], kde=True, ax=ax)
+    ax.set_title(f"Distribution of {col}")
+# Hide the panels left over when the feature count does not fill the grid.
+for ax in axes[n_features:]:
+    ax.set_visible(False)
+plt.tight_layout()
+plt.savefig('eda_feature_distributions.png')
+plt.show()
diff --git a/Anomaly Detection Project/isolation_forest_anomaly_detection.py b/Anomaly Detection Project/isolation_forest_anomaly_detection.py
@@ -0,0 +1,38 @@
+# isolation_forest_anomaly_detection.py
+
+import pandas as pd
+from sklearn.ensemble import IsolationForest
+import joblib
+from sklearn.decomposition import PCA
+import matplotlib.pyplot as plt
+
+# Read the preprocessed feature table
+data = pd.read_csv('preprocessed_data.csv')
+
+# Isolation Forest hyper-parameters
+contamination_rate = 0.05  # Adjust this value based on your dataset and anomaly expectations
+max_samples_value = "auto"  # Can be an integer or "auto" (256 or total sample size, whichever is smaller)
+
+# Build the model, fit it on all loaded columns and store the raw predictions
+iso_forest = IsolationForest(
+    contamination=contamination_rate,
+    max_samples=max_samples_value,
+    random_state=42,
+)
+data['Anomaly'] = iso_forest.fit_predict(data)
+
+# Recode the predictions: -1 becomes 1 (anomaly), every other value becomes 0
+data['Anomaly'] = data['Anomaly'].eq(-1).astype(int)
+# NOTE(review): the 'Anomaly' column only lives in memory here; this script
+# never writes it to disk, while anomalies_report.py expects to read it from
+# 'preprocessed_data.csv' - confirm how the labels are meant to be handed over.
+
+# Persist the fitted model
+joblib.dump(iso_forest, 'isolation_forest_model.pkl')
+print("Isolation Forest model saved as 'isolation_forest_model.pkl'.")
+
+# Project the features (label column excluded) onto two principal components
+pca = PCA(n_components=2)
+pca_data = pca.fit_transform(data.drop(columns='Anomaly'))
+
+# Scatter the projection, coloured by the anomaly flag
+plt.figure(figsize=(10, 6))
+anomaly_points = plt.scatter(
+    pca_data[:, 0],
+    pca_data[:, 1],
+    c=data['Anomaly'],
+    cmap='coolwarm',
+    marker='o',
+    alpha=0.6,
+)
+plt.title("Isolation Forest Anomaly Detection with PCA")
+plt.xlabel("PCA Component 1")
+plt.ylabel("PCA Component 2")
+plt.colorbar(anomaly_points, label="Anomaly")
+plt.savefig('isolation_forest_anomalies_pca.png')
+plt.show()
diff --git a/Anomaly Detection Project/kmeans_anomaly_detection.py b/Anomaly Detection Project/kmeans_anomaly_detection.py
@@ -0,0 +1,35 @@
+# kmeans_anomaly_detection.py
+
+import pandas as pd
+from sklearn.cluster import KMeans
+import joblib
+from sklearn.decomposition import PCA
+import matplotlib.pyplot as plt
+
+# Read the preprocessed feature table
+data = pd.read_csv('preprocessed_data.csv')
+
+# Fit a two-cluster KMeans model and attach each row's cluster id
+kmeans = KMeans(n_clusters=2, random_state=42).fit(data)
+data['Cluster'] = kmeans.labels_
+
+# Persist the fitted model
+joblib.dump(kmeans, 'kmeans_model.pkl')
+print("KMeans model saved as 'kmeans_model.pkl'.")
+
+# Rows assigned to cluster 1 are taken to be the anomalies
+# NOTE(review): nothing in this script checks that cluster 1 is the unusual
+# group - confirm this against the cluster sizes before relying on it.
+anomalies_kmeans = data.loc[data['Cluster'] == 1]
+
+# Project the features (label column excluded) onto two principal components
+pca = PCA(n_components=2)
+pca_data = pca.fit_transform(data.drop(columns='Cluster'))
+
+# Scatter the projection, coloured by cluster id
+plt.figure(figsize=(10, 6))
+cluster_points = plt.scatter(
+    pca_data[:, 0],
+    pca_data[:, 1],
+    c=data['Cluster'],
+    cmap='viridis',
+    marker='o',
+    alpha=0.6,
+)
+plt.title("KMeans Clustering with PCA")
+plt.xlabel("PCA Component 1")
+plt.ylabel("PCA Component 2")
+plt.colorbar(cluster_points, label="Cluster")
+plt.savefig('kmeans_clusters_pca.png')
+plt.show()
diff --git a/Anomaly Detection Project/model_evaluation.py b/Anomaly Detection Project/model_evaluation.py
@@ -0,0 +1,23 @@
+# model_evaluation.py
+
+import pandas as pd
+from sklearn.metrics import confusion_matrix, classification_report
+
+# Load the preprocessed data. This is the only statement that currently runs;
+# everything below is a commented-out evaluation template.
+data = pd.read_csv('preprocessed_data.csv')
+# The template needs two columns in `data`:
+#   - 'TrueLabels': ground-truth anomaly labels (rename to match your dataset)
+#   - 'Anomaly':    the Isolation Forest predictions
+# NOTE(review): confirm both columns are really present in this CSV before
+# enabling the template; otherwise the lookups below raise KeyError.
+
+# Evaluate Isolation Forest Model
+# Prints the confusion matrix and classification report of the predictions
+# against the ground truth.
+# Uncomment and use if true labels are available
+
+# true_labels = data['TrueLabels']
+# print("Confusion Matrix (Isolation Forest):\n", confusion_matrix(true_labels, data['Anomaly']))
+# print("Classification Report (Isolation Forest):\n", classification_report(true_labels, data['Anomaly']))
+
+# Save the evaluation report to file (same two metrics, written as text)
+# with open("evaluation_report.txt", "w") as f:
+#     f.write("Confusion Matrix (Isolation Forest):\n")
+#     f.write(str(confusion_matrix(true_labels, data['Anomaly'])) + "\n\n")
+#     f.write("Classification Report (Isolation Forest):\n")
+#     f.write(classification_report(true_labels, data['Anomaly']))