
Commit

🎨 format code in scripts
Henry committed May 20, 2023
1 parent dc60244 commit 3f2c4c9
Showing 12 changed files with 820 additions and 591 deletions.
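The line wrapping and spacing changes in the diffs below match what an automatic PEP 8 formatter produces: annotations spaced as name: type, two spaces before inline comments, and long lines wrapped or their trailing comments moved above the assignment. As a hedged illustration only, since the commit message does not name the tool that was run, a pass like this could be reproduced with autopep8's Python API; the glob pattern and options are assumptions:

# Hypothetical sketch of the formatting pass; autopep8 and the file pattern are
# assumptions for illustration, not taken from the commit itself.
from pathlib import Path

import autopep8

for script in sorted(Path('project').glob('01_*.py')):
    original = script.read_text()
    formatted = autopep8.fix_code(original, options={'aggressive': 1})
    if formatted != original:
        script.write_text(formatted)
        print(f'reformatted {script}')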
192 changes: 122 additions & 70 deletions project/01_0_split_data.ipynb

Large diffs are not rendered by default.

184 changes: 114 additions & 70 deletions project/01_0_split_data.py

Large diffs are not rendered by default.

77 changes: 43 additions & 34 deletions project/01_1_train_CF.ipynb
@@ -43,7 +43,8 @@
"import vaep.nb\n",
"from vaep.logging import setup_logger\n",
"logger = setup_logger(logger=logging.getLogger('vaep'))\n",
"logger.info(\"Experiment 03 - Analysis of latent spaces and performance comparisions\")\n",
"logger.info(\n",
" \"Experiment 03 - Analysis of latent spaces and performance comparisions\")\n",
"\n",
"figures = {} # collection of ax or figures"
]
@@ -80,23 +81,26 @@
"outputs": [],
"source": [
"# files and folders\n",
"folder_experiment:str = 'runs/example' # Datasplit folder with data for experiment\n",
"folder_data:str = '' # specify data directory if needed\n",
"file_format: str = 'csv' # change default to pickled files\n",
"fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv' # Machine parsed metadata from rawfile workflow\n",
"# Datasplit folder with data for experiment\n",
"folder_experiment: str = 'runs/example'\n",
"folder_data: str = '' # specify data directory if needed\n",
"file_format: str = 'csv' # change default to pickled files\n",
"# Machine parsed metadata from rawfile workflow\n",
"fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv'\n",
"# training\n",
"epochs_max:int = 20 # Maximum number of epochs\n",
"epochs_max: int = 20 # Maximum number of epochs\n",
"# early_stopping:bool = True # Wheather to use early stopping or not\n",
"patience:int = 1 # Patience for early stopping\n",
"batch_size:int = 32_768 # Batch size for training (and evaluation)\n",
"cuda:bool=True # Use the GPU for training?\n",
"patience: int = 1 # Patience for early stopping\n",
"batch_size: int = 32_768 # Batch size for training (and evaluation)\n",
"cuda: bool = True # Use the GPU for training?\n",
"# model\n",
"latent_dim:int = 10 # Dimensionality of encoding dimension (latent space of model)\n",
"# Dimensionality of encoding dimension (latent space of model)\n",
"latent_dim: int = 10\n",
"# hidden_layers:str = '128_64' # A space separated string of layers, '50 20' for the encoder, reverse will be use for decoder\n",
"sample_idx_position: int = 0 # position of index which is sample ID\n",
"model: str = 'CF' # model name\n",
"model_key: str = 'CF' # potentially alternative key for model (grid search)\n",
"save_pred_real_na:bool=True # Save all predictions for missing values"
"sample_idx_position: int = 0 # position of index which is sample ID\n",
"model: str = 'CF' # model name\n",
"model_key: str = 'CF' # potentially alternative key for model (grid search)\n",
"save_pred_real_na: bool = True # Save all predictions for missing values"
]
},
{
@@ -170,7 +174,8 @@
"metadata": {},
"outputs": [],
"source": [
"data = datasplits.DataSplits.from_folder(args.data, file_format=args.file_format) "
"data = datasplits.DataSplits.from_folder(\n",
" args.data, file_format=args.file_format)"
]
},
{
@@ -196,7 +201,7 @@
"id": "6045414b",
"metadata": {},
"source": [
"Infer index names from long format "
"Infer index names from long format"
]
},
{
@@ -208,7 +213,7 @@
"source": [
"index_columns = list(data.train_X.index.names)\n",
"sample_id = index_columns.pop(args.sample_idx_position)\n",
"if len(index_columns) == 1: \n",
"if len(index_columns) == 1:\n",
" index_column = index_columns.pop()\n",
" index_columns = None\n",
" logger.info(f\"{sample_id = }, single feature: {index_column = }\")\n",
@@ -218,7 +223,8 @@
"if not index_columns:\n",
" index_columns = [sample_id, index_column]\n",
"else:\n",
" raise NotImplementedError(\"More than one feature: Needs to be implemented. see above logging output.\")"
" raise NotImplementedError(\n",
" \"More than one feature: Needs to be implemented. see above logging output.\")"
]
},
{
@@ -264,7 +270,7 @@
"outputs": [],
"source": [
"freq_peptides = sampling.frequency_by_index(data.train_X, 0)\n",
"freq_peptides.head() # training data"
"freq_peptides.head() # training data"
]
},
{
@@ -280,7 +286,7 @@
"id": "02d5763b-00fe-44ce-9dfa-b6e506045762",
"metadata": {},
"source": [
"The validation fake NA is used to by all models to evaluate training performance. "
"The validation fake NA is used to by all models to evaluate training performance."
]
},
{
@@ -329,12 +335,12 @@
"ana_collab = models.collab.CollabAnalysis(\n",
" datasplits=data,\n",
" sample_column=sample_id,\n",
" item_column=index_column, # not generic\n",
" item_column=index_column, # not generic\n",
" target_column='intensity',\n",
" model_kwargs=dict(n_factors=args.latent_dim,\n",
" y_range=(int(data.train_X.min()),\n",
" int(data.train_X.max())+1)\n",
" ),\n",
" y_range=(int(data.train_X.min()),\n",
" int(data.train_X.max())+1)\n",
" ),\n",
" batch_size=args.batch_size)"
]
},
@@ -403,10 +409,10 @@
"recorder_dump.save(args.out_figures)\n",
"del recorder_dump\n",
"vaep.savefig(fig, name='collab_training',\n",
" folder=args.out_figures)\n",
" folder=args.out_figures)\n",
"ana_collab.model_kwargs['batch_size'] = ana_collab.batch_size\n",
"vaep.io.dump_json(ana_collab.model_kwargs, args.out_models /\n",
" TEMPLATE_MODEL_PARAMS.format('CF'))"
" TEMPLATE_MODEL_PARAMS.format('CF'))"
]
},
{
@@ -437,7 +443,8 @@
"outputs": [],
"source": [
"# this could be done using the validation data laoder now\n",
"ana_collab.test_dl = ana_collab.dls.test_dl(data.val_y.reset_index()) # test_dl is here validation data\n",
"ana_collab.test_dl = ana_collab.dls.test_dl(\n",
" data.val_y.reset_index()) # test_dl is here validation data\n",
"val_pred_fake_na['CF'], _ = ana_collab.learn.get_preds(\n",
" dl=ana_collab.test_dl)\n",
"val_pred_fake_na"
@@ -476,7 +483,7 @@
" pred_real_na = models.collab.get_missing_values(\n",
" df_train_long=data.train_X,\n",
" val_idx=data.val_y.index,\n",
" test_idx=data.test_y.index, \n",
" test_idx=data.test_y.index,\n",
" analysis_collab=ana_collab)\n",
" pred_real_na.to_csv(args.out_preds / f\"pred_real_na_{args.model_key}.csv\")"
]
@@ -510,8 +517,8 @@
"source": [
"## Comparisons\n",
"\n",
"> Note: The interpolated values have less predictions for comparisons than the ones based on models (CF, DAE, VAE) \n",
"> The comparison is therefore not 100% fair as the interpolated samples will have more common ones (especailly the sparser the data) \n",
"> Note: The interpolated values have less predictions for comparisons than the ones based on models (CF, DAE, VAE)\n",
"> The comparison is therefore not 100% fair as the interpolated samples will have more common ones (especailly the sparser the data)\n",
"> Could be changed."
]
},
@@ -524,7 +531,7 @@
"\n",
"- all measured (identified, observed) peptides in validation data\n",
"\n",
"> Does not make to much sense to compare collab and AEs, \n",
"> Does not make to much sense to compare collab and AEs,\n",
"> as the setup differs of training and validation data differs"
]
},
@@ -603,7 +610,8 @@
},
"outputs": [],
"source": [
"vaep.io.dump_json(d_metrics.metrics, args.out_metrics / f'metrics_{args.model_key}.json')"
"vaep.io.dump_json(d_metrics.metrics, args.out_metrics /\n",
" f'metrics_{args.model_key}.json')"
]
},
{
@@ -615,7 +623,8 @@
},
"outputs": [],
"source": [
"metrics_df = models.get_df_from_nested_dict(d_metrics.metrics, column_levels=['model', 'metric_name']).T\n",
"metrics_df = models.get_df_from_nested_dict(\n",
" d_metrics.metrics, column_levels=['model', 'metric_name']).T\n",
"metrics_df"
]
},
@@ -656,7 +665,7 @@
"metadata": {},
"outputs": [],
"source": [
"args.dump(fname=args.out_models/ f\"model_config_{args.model_key}.yaml\")\n",
"args.dump(fname=args.out_models / f\"model_config_{args.model_key}.yaml\")\n",
"args"
]
},
77 changes: 43 additions & 34 deletions project/01_1_train_CF.py
@@ -45,7 +45,8 @@
import vaep.nb
from vaep.logging import setup_logger
logger = setup_logger(logger=logging.getLogger('vaep'))
logger.info("Experiment 03 - Analysis of latent spaces and performance comparisions")
logger.info(
"Experiment 03 - Analysis of latent spaces and performance comparisions")

figures = {} # collection of ax or figures

@@ -59,23 +60,26 @@

# %% tags=["parameters"]
# files and folders
folder_experiment:str = 'runs/example' # Datasplit folder with data for experiment
folder_data:str = '' # specify data directory if needed
file_format: str = 'csv' # change default to pickled files
fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv' # Machine parsed metadata from rawfile workflow
# Datasplit folder with data for experiment
folder_experiment: str = 'runs/example'
folder_data: str = '' # specify data directory if needed
file_format: str = 'csv' # change default to pickled files
# Machine parsed metadata from rawfile workflow
fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv'
# training
epochs_max:int = 20 # Maximum number of epochs
epochs_max: int = 20 # Maximum number of epochs
# early_stopping:bool = True # Whether to use early stopping or not
patience:int = 1 # Patience for early stopping
batch_size:int = 32_768 # Batch size for training (and evaluation)
cuda:bool=True # Use the GPU for training?
patience: int = 1 # Patience for early stopping
batch_size: int = 32_768 # Batch size for training (and evaluation)
cuda: bool = True # Use the GPU for training?
# model
latent_dim:int = 10 # Dimensionality of encoding dimension (latent space of model)
# Dimensionality of encoding dimension (latent space of model)
latent_dim: int = 10
# hidden_layers:str = '128_64' # A space-separated string of layers, '50 20' for the encoder; the reverse is used for the decoder
sample_idx_position: int = 0 # position of index which is sample ID
model: str = 'CF' # model name
model_key: str = 'CF' # potentially alternative key for model (grid search)
save_pred_real_na:bool=True # Save all predictions for missing values
sample_idx_position: int = 0 # position of index which is sample ID
model: str = 'CF' # model name
model_key: str = 'CF' # potentially alternative key for model (grid search)
save_pred_real_na: bool = True # Save all predictions for missing values
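# The cell tagged "parameters" above follows the papermill convention for
# injectable defaults. As a hedged illustration only (this diff does not show
# how the scripts are driven), the typed defaults could be overridden per run
# like this; the output path and the chosen values are assumptions:
import papermill as pm

pm.execute_notebook(
    'project/01_1_train_CF.ipynb',
    'runs/example/01_1_train_CF.output.ipynb',
    parameters=dict(
        folder_experiment='runs/example',
        epochs_max=20,
        batch_size=32_768,
        latent_dim=10,
        cuda=True,
    ),
)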

# %% [markdown]
# Some argument transformations
@@ -105,7 +109,8 @@
# ## Load data in long format

# %%
data = datasplits.DataSplits.from_folder(args.data, file_format=args.file_format)
data = datasplits.DataSplits.from_folder(
args.data, file_format=args.file_format)

# %% [markdown]
# data is loaded in long format
@@ -114,12 +119,12 @@
data.train_X.sample(5)

# %% [markdown]
# Infer index names from long format
# Infer index names from long format

# %%
index_columns = list(data.train_X.index.names)
sample_id = index_columns.pop(args.sample_idx_position)
if len(index_columns) == 1:
if len(index_columns) == 1:
index_column = index_columns.pop()
index_columns = None
logger.info(f"{sample_id = }, single feature: {index_column = }")
@@ -129,7 +134,8 @@
if not index_columns:
index_columns = [sample_id, index_column]
else:
raise NotImplementedError("More than one feature: Needs to be implemented. see above logging output.")
raise NotImplementedError(
"More than one feature: Needs to be implemented. see above logging output.")

# %% [markdown]
# load meta data for splits
@@ -150,13 +156,13 @@

# %%
freq_peptides = sampling.frequency_by_index(data.train_X, 0)
freq_peptides.head() # training data
freq_peptides.head() # training data

# %% [markdown]
# ### Produce some additional fake samples

# %% [markdown]
# The validation fake NA is used by all models to evaluate training performance.
# The validation fake NA is used by all models to evaluate training performance.

# %%
val_pred_fake_na = data.val_y.to_frame(name='observed')
@@ -178,12 +184,12 @@
ana_collab = models.collab.CollabAnalysis(
datasplits=data,
sample_column=sample_id,
item_column=index_column, # not generic
item_column=index_column, # not generic
target_column='intensity',
model_kwargs=dict(n_factors=args.latent_dim,
y_range=(int(data.train_X.min()),
int(data.train_X.max())+1)
),
y_range=(int(data.train_X.min()),
int(data.train_X.max())+1)
),
batch_size=args.batch_size)

# %%
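# The CollabAnalysis wrapper above sets up a fastai-style collaborative
# filtering model on (sample, feature, intensity) triples with a bounded
# y_range. What follows is a hedged sketch of an equivalent direct fastai
# setup, assuming the standard fastai.collab API and reusing the objects
# defined above (data, sample_id, index_column, args); it is an illustration,
# not the wrapper's actual implementation.
from fastai.collab import CollabDataLoaders, collab_learner

long_df = data.train_X.reset_index()  # long format: one row per (sample, feature)
dls = CollabDataLoaders.from_df(
    long_df,
    user_name=sample_id,      # sample identifier column
    item_name=index_column,   # feature column (e.g. peptide)
    rating_name='intensity',  # value to predict
    bs=args.batch_size,
)
learn = collab_learner(
    dls,
    n_factors=args.latent_dim,
    y_range=(int(data.train_X.min()), int(data.train_X.max()) + 1),
)
learn.fit_one_cycle(args.epochs_max)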
@@ -225,10 +231,10 @@
recorder_dump.save(args.out_figures)
del recorder_dump
vaep.savefig(fig, name='collab_training',
folder=args.out_figures)
folder=args.out_figures)
ana_collab.model_kwargs['batch_size'] = ana_collab.batch_size
vaep.io.dump_json(ana_collab.model_kwargs, args.out_models /
TEMPLATE_MODEL_PARAMS.format('CF'))
TEMPLATE_MODEL_PARAMS.format('CF'))

# %% [markdown]
# ### Predictions
@@ -238,7 +244,8 @@

# %%
# this could be done using the validation data loader now
ana_collab.test_dl = ana_collab.dls.test_dl(data.val_y.reset_index()) # test_dl is here validation data
ana_collab.test_dl = ana_collab.dls.test_dl(
data.val_y.reset_index()) # test_dl is here validation data
val_pred_fake_na['CF'], _ = ana_collab.learn.get_preds(
dl=ana_collab.test_dl)
val_pred_fake_na
@@ -257,7 +264,7 @@
pred_real_na = models.collab.get_missing_values(
df_train_long=data.train_X,
val_idx=data.val_y.index,
test_idx=data.test_y.index,
test_idx=data.test_y.index,
analysis_collab=ana_collab)
pred_real_na.to_csv(args.out_preds / f"pred_real_na_{args.model_key}.csv")

@@ -275,16 +282,16 @@
# %% [markdown]
# ## Comparisons
#
# > Note: The interpolated values have fewer predictions for comparisons than the ones based on models (CF, DAE, VAE)
# > The comparison is therefore not 100% fair, as the interpolated samples will have more values in common (especially the sparser the data)
# > Note: The interpolated values have fewer predictions for comparisons than the ones based on models (CF, DAE, VAE)
# > The comparison is therefore not 100% fair, as the interpolated samples will have more values in common (especially the sparser the data)
# > Could be changed.

# %% [markdown]
# ### Validation data
#
# - all measured (identified, observed) peptides in validation data
#
# > It does not make too much sense to compare collab and AEs,
# > It does not make too much sense to compare collab and AEs,
# > as the setup of training and validation data differs

# %%
@@ -311,11 +318,13 @@
# Save all metrics as json

# %%
vaep.io.dump_json(d_metrics.metrics, args.out_metrics / f'metrics_{args.model_key}.json')
vaep.io.dump_json(d_metrics.metrics, args.out_metrics /
f'metrics_{args.model_key}.json')


# %%
metrics_df = models.get_df_from_nested_dict(d_metrics.metrics, column_levels=['model', 'metric_name']).T
metrics_df = models.get_df_from_nested_dict(
d_metrics.metrics, column_levels=['model', 'metric_name']).T
metrics_df

# %% [markdown]
@@ -330,7 +339,7 @@
# ## Config

# %%
args.dump(fname=args.out_models/ f"model_config_{args.model_key}.yaml")
args.dump(fname=args.out_models / f"model_config_{args.model_key}.yaml")
args

# %%
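# The .ipynb and .py files in this commit carry the same changes, and the
# scripts use the "# %%" percent format, which suggests jupytext pairing; that
# is an assumption, as the pairing configuration is not part of this diff. A
# minimal sketch of syncing a reformatted script back into its notebook (on the
# command line, "jupytext --sync" does the same for paired files):
import jupytext

notebook = jupytext.read('project/01_1_train_CF.py', fmt='py:percent')
jupytext.write(notebook, 'project/01_1_train_CF.ipynb')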