Notebooks

MannLabs · Jul 28, 2023 · 7321be6 · 7321be6
1 parent dbc09a7
commit 7321be6
Show file tree

Hide file tree

Showing 17 changed files with 2,064 additions and 0 deletions.
diff --git a/sandbox/Figure_Notebooks/P_06_B Quant.ipynb b/sandbox/Figure_Notebooks/P_06_B Quant.ipynb
diff --git a/sandbox/Figure_Notebooks/P_08B.ipynb b/sandbox/Figure_Notebooks/P_08B.ipynb
diff --git a/sandbox/Figure_Notebooks/SI_01_TOP_N_Test.ipynb b/sandbox/Figure_Notebooks/SI_01_TOP_N_Test.ipynb
@@ -0,0 +1,223 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "7f2b7cdf",
+   "metadata": {},
+   "source": [
+    "# Top-N and Score Benchmarking"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6a63ee21",
+   "metadata": {},
+   "source": [
+    "For the first search, compare the performance of different `top_n` values and scoring methods."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "adef4dc0",
+   "metadata": {},
+   "source": [
+    "We don't want to rerun file conversion and feature finding again and again, so we back up the resulting `.ms_data.hdf` file once, restore it before every benchmark run, and enable `continue_runs`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "af15f7ca",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "\n",
+    "import alphapept.interface\n",
+    "import alphapept.io\n",
+    "from alphapept.paths import DEFAULT_SETTINGS_PATH\n",
+    "from alphapept.settings import load_settings\n",
+    "    \n",
+    "BASE_PATH = 'F:/AP_Paper_Benchmark/PXD028735/top_n_optimization'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4b3de785",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Start from the default settings and point them at the benchmark data.\n",
+    "settings = load_settings(DEFAULT_SETTINGS_PATH)\n",
+    "\n",
+    "# List BASE_PATH itself instead of repeating the literal directory, so the listing\n",
+    "# and the joined paths cannot get out of sync when BASE_PATH is changed.\n",
+    "base_dir_entries = os.listdir(BASE_PATH)\n",
+    "settings['experiment']['file_paths'] = [os.path.join(BASE_PATH, name) for name in base_dir_entries if name.endswith('.raw')]\n",
+    "settings['experiment']['fasta_paths'] = [os.path.join(BASE_PATH, name) for name in base_dir_entries if name.endswith('.fasta')]\n",
+    "\n",
+    "# Import the raw data and run feature finding once, up front.\n",
+    "settings = alphapept.interface.import_raw_data(settings)\n",
+    "settings = alphapept.interface.feature_finding(settings)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "34a2e864",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import shutil\n",
+    "\n",
+    "# Only the .ms_data.hdf belonging to the first raw file is backed up.\n",
+    "first_raw_path = settings['experiment']['file_paths'][0]\n",
+    "base, ext = os.path.splitext(first_raw_path)\n",
+    "ms_file_path = base+'.ms_data.hdf'\n",
+    "ms_file_path_bkup = ms_file_path+'.bkup'\n",
+    "\n",
+    "if os.path.isfile(ms_file_path):\n",
+    "    # os.replace overwrites an existing backup in one step, so the old backup is\n",
+    "    # never deleted unless a new one is there to take its place.\n",
+    "    os.replace(ms_file_path, ms_file_path_bkup)\n",
+    "elif not os.path.isfile(ms_file_path_bkup):\n",
+    "    # Neither the working file nor a backup exists: nothing to benchmark against.\n",
+    "    raise FileNotFoundError(f'Neither {ms_file_path} nor {ms_file_path_bkup} exists; run import and feature finding first.')\n",
+    "# Otherwise only the backup exists (the cell was re-run): keep it untouched."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2a935628",
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "from tqdm import tqdm as tqdm\n",
+    "import pandas as pd\n",
+    "\n",
+    "def set_settings(top_n, method, ini_score):\n",
+    "    '''Build a fresh settings object for one benchmark configuration.\n",
+    "\n",
+    "    Loads the default settings, enables ``continue_runs``, points the experiment at\n",
+    "    every ``.raw`` and ``.fasta`` file found in ``BASE_PATH`` and stores the given\n",
+    "    search and score parameters.\n",
+    "\n",
+    "    Args:\n",
+    "        top_n: Value stored in ``settings['search']['top_n']``.\n",
+    "        method: Value stored in ``settings['score']['method']``.\n",
+    "        ini_score: Value stored in ``settings['score']['ml_ini_score']``.\n",
+    "\n",
+    "    Returns:\n",
+    "        The configured settings object, as loaded by ``load_settings``.\n",
+    "    '''\n",
+    "    settings = load_settings(DEFAULT_SETTINGS_PATH)\n",
+    "\n",
+    "    settings['workflow']['continue_runs'] = True\n",
+    "\n",
+    "    # List BASE_PATH itself rather than a repeated literal directory, so the listing\n",
+    "    # and the joined paths always refer to the same folder.\n",
+    "    base_dir_entries = os.listdir(BASE_PATH)\n",
+    "    settings['experiment']['file_paths'] = [os.path.join(BASE_PATH, name) for name in base_dir_entries if name.endswith('.raw')]\n",
+    "    settings['experiment']['fasta_paths'] = [os.path.join(BASE_PATH, name) for name in base_dir_entries if name.endswith('.fasta')]\n",
+    "    settings['search']['top_n'] = top_n\n",
+    "    settings['score']['method'] = method\n",
+    "    settings['score']['ml_ini_score'] = ini_score\n",
+    "\n",
+    "    return settings\n",
+    "\n",
+    "# One settings object per (top_n, method, ini_score) combination.\n",
+    "# random_forest is benchmarked with each initial score; all other methods use 'hits'.\n",
+    "settings_list = []\n",
+    "for top_n in [1,2,3,4,5,6,7,8,9,10,11,12,13,14,16,18,20,25,30,35,40]:\n",
+    "    for method in ['x_tandem','random_forest','generic_score','morpheus']:\n",
+    "        ini_scores = ['x_tandem','generic_score','hits'] if method == 'random_forest' else ['hits']\n",
+    "        for ini_score in ini_scores:\n",
+    "            settings_list.append(set_settings(top_n, method, ini_score))\n",
+    "            \n",
+    "# One (top_n, method, ini_score, target, decoy, time) record per benchmark run.\n",
+    "benchmark = []\n",
+    "\n",
+    "for settings in tqdm(settings_list):\n",
+    "\n",
+    "    # Restore the backed-up MS data file so every run starts from the same state.\n",
+    "    # shutil.copyfile replaces an existing destination, so no prior removal is needed.\n",
+    "    shutil.copyfile(ms_file_path_bkup, ms_file_path)\n",
+    "\n",
+    "    # Read every workflow output (results path, timing summary) from the returned\n",
+    "    # settings instead of mixing the input object and the return value.\n",
+    "    settings_ = alphapept.interface.run_complete_workflow(settings)\n",
+    "    results_path = settings_['experiment']['results_path']\n",
+    "\n",
+    "    df = pd.read_hdf(results_path, 'protein_fdr')\n",
+    "\n",
+    "    # Named total_minutes rather than time to avoid shadowing the stdlib module name.\n",
+    "    total_minutes = settings_['summary']['timing']['total (min)']\n",
+    "    decoy = df['decoy'].sum()\n",
+    "    target = df['target'].sum()\n",
+    "    top_n = settings['search']['top_n']\n",
+    "    method = settings['score']['method']\n",
+    "    ini_score = settings['score']['ml_ini_score']\n",
+    "\n",
+    "    plt.show()\n",
+    "\n",
+    "    benchmark.append((top_n, method, ini_score, target, decoy, total_minutes))\n",
+    "\n",
+    "    # Delete the results file once its numbers have been recorded.\n",
+    "    if os.path.isfile(results_path):\n",
+    "        os.remove(results_path)\n",
+    "\n",
+    "benchmark_df = pd.DataFrame(benchmark, columns = ['top_n','method','ini_score','target','decoy','time'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0ff9185e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def _method_label(row):\n",
+    "    '''Return the display label for one benchmark row: random_forest rows get their initial score appended.'''\n",
+    "    if row['method'] == 'random_forest':\n",
+    "        return row['method'] + ' with ' + str(row['ini_score'])\n",
+    "    return row['method']\n",
+    "\n",
+    "benchmark_df['method_'] = benchmark_df.apply(_method_label, axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7c6ecbec",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Palette as hex codes; the individual names are unpacked from the list in order.\n",
+    "colors = ['#17212b', '#3dc5ef', '#42dee1', '#6eecb9', '#eef5b3']\n",
+    "dark_blue, light_blue, teal, green, yellow = colors\n",
+    "\n",
+    "# Font keyword arguments for the tick labels.\n",
+    "hfont = dict(fontname='Arial', size=10)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "11cf303b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.figure(figsize=(7,7))\n",
+    "\n",
+    "# Rows with top_n == 1 are left out of the plot.\n",
+    "sns.scatterplot(data=benchmark_df[benchmark_df['top_n'] > 1], x='top_n', y='target', hue='method_', alpha=0.5)\n",
+    "\n",
+    "plt.title('Top N vs number of identified precursors after FDR')\n",
+    "plt.xlabel('Top N')\n",
+    "# The y axis shows the summed 'target' column, not Top N.\n",
+    "plt.ylabel('Identified precursors after FDR')\n",
+    "plt.xticks(**hfont)\n",
+    "plt.yticks(**hfont)\n",
+    "plt.ylim([0, 35000])\n",
+    "plt.legend(loc='lower right')\n",
+    "# Lay out only after labels, ticks and legend are in place.\n",
+    "plt.tight_layout()\n",
+    "# savefig does not create missing directories, so create the output folder first.\n",
+    "os.makedirs('figures', exist_ok=True)\n",
+    "plt.savefig('figures/SI_01.pdf')\n",
+    "plt.show()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/sandbox/Figure_Notebooks/SI_02 Score Benchmarking.ipynb b/sandbox/Figure_Notebooks/SI_02 Score Benchmarking.ipynb
diff --git a/sandbox/Figure_Notebooks/SI_Cloud_Timings.ipynb b/sandbox/Figure_Notebooks/SI_Cloud_Timings.ipynb
diff --git a/sandbox/Figure_Notebooks/figures/08B_PXD028735_Bruker.pdf b/sandbox/Figure_Notebooks/figures/08B_PXD028735_Bruker.pdf
diff --git a/sandbox/Figure_Notebooks/figures/08B_PXD028735_Bruker_time.pdf b/sandbox/Figure_Notebooks/figures/08B_PXD028735_Bruker_time.pdf
diff --git a/sandbox/Figure_Notebooks/figures/08B_PXD028735_Bruker_venn.pdf b/sandbox/Figure_Notebooks/figures/08B_PXD028735_Bruker_venn.pdf
diff --git a/sandbox/Figure_Notebooks/figures/08B_PXD028735_Thermo.pdf b/sandbox/Figure_Notebooks/figures/08B_PXD028735_Thermo.pdf
diff --git a/sandbox/Figure_Notebooks/figures/08B_PXD028735_Thermo_time.pdf b/sandbox/Figure_Notebooks/figures/08B_PXD028735_Thermo_time.pdf
diff --git a/sandbox/Figure_Notebooks/figures/08B_PXD028735_Thermo_venn.pdf b/sandbox/Figure_Notebooks/figures/08B_PXD028735_Thermo_venn.pdf
diff --git a/sandbox/Figure_Notebooks/figures/SI03_accuracy_PXD028735_Bruker.pdf b/sandbox/Figure_Notebooks/figures/SI03_accuracy_PXD028735_Bruker.pdf
diff --git a/sandbox/Figure_Notebooks/figures/SI03_accuracy_PXD028735_Thermo.pdf b/sandbox/Figure_Notebooks/figures/SI03_accuracy_PXD028735_Thermo.pdf
diff --git a/sandbox/Figure_Notebooks/figures/SI04_FDR_Bruker.pdf b/sandbox/Figure_Notebooks/figures/SI04_FDR_Bruker.pdf
diff --git a/sandbox/Figure_Notebooks/figures/SI04_FDR_Thermo.pdf b/sandbox/Figure_Notebooks/figures/SI04_FDR_Thermo.pdf
diff --git a/sandbox/Figure_Notebooks/figures/SI_02B_Bruker.pdf b/sandbox/Figure_Notebooks/figures/SI_02B_Bruker.pdf
diff --git a/sandbox/Figure_Notebooks/figures/SI_02B_Thermo.pdf b/sandbox/Figure_Notebooks/figures/SI_02B_Thermo.pdf