
Commit

🎨 format code in scripts
Henry committed May 20, 2023
1 parent dc60244 commit 3f2c4c9
Showing 12 changed files with 820 additions and 591 deletions.
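The line wrapping and spacing changes in the diffs below match what an automatic PEP 8 formatter produces: annotations spaced as name: type, two spaces before inline comments, and long lines wrapped or their trailing comments moved above the assignment. As a hedged illustration only, since the commit message does not name the tool that was run, a pass like this could be reproduced with autopep8's Python API; the glob pattern and options are assumptions:

# Hypothetical sketch of the formatting pass; autopep8 and the file pattern are
# assumptions for illustration, not taken from the commit itself.
from pathlib import Path

import autopep8

for script in sorted(Path('project').glob('01_*.py')):
    original = script.read_text()
    formatted = autopep8.fix_code(original, options={'aggressive': 1})
    if formatted != original:
        script.write_text(formatted)
        print(f'reformatted {script}')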
192 changes: 122 additions & 70 deletions project/01_0_split_data.ipynb

Large diffs are not rendered by default.

184 changes: 114 additions & 70 deletions project/01_0_split_data.py

Large diffs are not rendered by default.

77 changes: 43 additions & 34 deletions project/01_1_train_CF.ipynb
@@ -43,7 +43,8 @@
"import vaep.nb\n",
"from vaep.logging import setup_logger\n",
"logger = setup_logger(logger=logging.getLogger('vaep'))\n",
"logger.info(\"Experiment 03 - Analysis of latent spaces and performance comparisions\")\n",
"logger.info(\n",
" \"Experiment 03 - Analysis of latent spaces and performance comparisions\")\n",
"\n",
"figures = {} # collection of ax or figures"
]
@@ -80,23 +81,26 @@
"outputs": [],
"source": [
"# files and folders\n",
"folder_experiment:str = 'runs/example' # Datasplit folder with data for experiment\n",
"folder_data:str = '' # specify data directory if needed\n",
"file_format: str = 'csv' # change default to pickled files\n",
"fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv' # Machine parsed metadata from rawfile workflow\n",
"# Datasplit folder with data for experiment\n",
"folder_experiment: str = 'runs/example'\n",
"folder_data: str = '' # specify data directory if needed\n",
"file_format: str = 'csv' # change default to pickled files\n",
"# Machine parsed metadata from rawfile workflow\n",
"fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv'\n",
"# training\n",
"epochs_max:int = 20 # Maximum number of epochs\n",
"epochs_max: int = 20 # Maximum number of epochs\n",
"# early_stopping:bool = True # Wheather to use early stopping or not\n",
"patience:int = 1 # Patience for early stopping\n",
"batch_size:int = 32_768 # Batch size for training (and evaluation)\n",
"cuda:bool=True # Use the GPU for training?\n",
"patience: int = 1 # Patience for early stopping\n",
"batch_size: int = 32_768 # Batch size for training (and evaluation)\n",
"cuda: bool = True # Use the GPU for training?\n",
"# model\n",
"latent_dim:int = 10 # Dimensionality of encoding dimension (latent space of model)\n",
"# Dimensionality of encoding dimension (latent space of model)\n",
"latent_dim: int = 10\n",
"# hidden_layers:str = '128_64' # A space separated string of layers, '50 20' for the encoder, reverse will be use for decoder\n",
"sample_idx_position: int = 0 # position of index which is sample ID\n",
"model: str = 'CF' # model name\n",
"model_key: str = 'CF' # potentially alternative key for model (grid search)\n",
"save_pred_real_na:bool=True # Save all predictions for missing values"
"sample_idx_position: int = 0 # position of index which is sample ID\n",
"model: str = 'CF' # model name\n",
"model_key: str = 'CF' # potentially alternative key for model (grid search)\n",
"save_pred_real_na: bool = True # Save all predictions for missing values"
]
},
{
@@ -170,7 +174,8 @@
"metadata": {},
"outputs": [],
"source": [
"data = datasplits.DataSplits.from_folder(args.data, file_format=args.file_format) "
"data = datasplits.DataSplits.from_folder(\n",
" args.data, file_format=args.file_format)"
]
},
{
@@ -196,7 +201,7 @@
"id": "6045414b",
"metadata": {},
"source": [
"Infer index names from long format "
"Infer index names from long format"
]
},
{
@@ -208,7 +213,7 @@
"source": [
"index_columns = list(data.train_X.index.names)\n",
"sample_id = index_columns.pop(args.sample_idx_position)\n",
"if len(index_columns) == 1: \n",
"if len(index_columns) == 1:\n",
" index_column = index_columns.pop()\n",
" index_columns = None\n",
" logger.info(f\"{sample_id = }, single feature: {index_column = }\")\n",
@@ -218,7 +223,8 @@
"if not index_columns:\n",
" index_columns = [sample_id, index_column]\n",
"else:\n",
" raise NotImplementedError(\"More than one feature: Needs to be implemented. see above logging output.\")"
" raise NotImplementedError(\n",
" \"More than one feature: Needs to be implemented. see above logging output.\")"
]
},
{
@@ -264,7 +270,7 @@
"outputs": [],
"source": [
"freq_peptides = sampling.frequency_by_index(data.train_X, 0)\n",
"freq_peptides.head() # training data"
"freq_peptides.head() # training data"
]
},
{
@@ -280,7 +286,7 @@
"id": "02d5763b-00fe-44ce-9dfa-b6e506045762",
"metadata": {},
"source": [
"The validation fake NA is used to by all models to evaluate training performance. "
"The validation fake NA is used to by all models to evaluate training performance."
]
},
{
@@ -329,12 +335,12 @@
"ana_collab = models.collab.CollabAnalysis(\n",
" datasplits=data,\n",
" sample_column=sample_id,\n",
" item_column=index_column, # not generic\n",
" item_column=index_column, # not generic\n",
" target_column='intensity',\n",
" model_kwargs=dict(n_factors=args.latent_dim,\n",
" y_range=(int(data.train_X.min()),\n",
" int(data.train_X.max())+1)\n",
" ),\n",
" y_range=(int(data.train_X.min()),\n",
" int(data.train_X.max())+1)\n",
" ),\n",
" batch_size=args.batch_size)"
]
},
@@ -403,10 +409,10 @@
"recorder_dump.save(args.out_figures)\n",
"del recorder_dump\n",
"vaep.savefig(fig, name='collab_training',\n",
" folder=args.out_figures)\n",
" folder=args.out_figures)\n",
"ana_collab.model_kwargs['batch_size'] = ana_collab.batch_size\n",
"vaep.io.dump_json(ana_collab.model_kwargs, args.out_models /\n",
" TEMPLATE_MODEL_PARAMS.format('CF'))"
" TEMPLATE_MODEL_PARAMS.format('CF'))"
]
},
{
@@ -437,7 +443,8 @@
"outputs": [],
"source": [
"# this could be done using the validation data laoder now\n",
"ana_collab.test_dl = ana_collab.dls.test_dl(data.val_y.reset_index()) # test_dl is here validation data\n",
"ana_collab.test_dl = ana_collab.dls.test_dl(\n",
" data.val_y.reset_index()) # test_dl is here validation data\n",
"val_pred_fake_na['CF'], _ = ana_collab.learn.get_preds(\n",
" dl=ana_collab.test_dl)\n",
"val_pred_fake_na"
@@ -476,7 +483,7 @@
" pred_real_na = models.collab.get_missing_values(\n",
" df_train_long=data.train_X,\n",
" val_idx=data.val_y.index,\n",
" test_idx=data.test_y.index, \n",
" test_idx=data.test_y.index,\n",
" analysis_collab=ana_collab)\n",
" pred_real_na.to_csv(args.out_preds / f\"pred_real_na_{args.model_key}.csv\")"
]
@@ -510,8 +517,8 @@
"source": [
"## Comparisons\n",
"\n",
"> Note: The interpolated values have less predictions for comparisons than the ones based on models (CF, DAE, VAE) \n",
"> The comparison is therefore not 100% fair as the interpolated samples will have more common ones (especailly the sparser the data) \n",
"> Note: The interpolated values have less predictions for comparisons than the ones based on models (CF, DAE, VAE)\n",
"> The comparison is therefore not 100% fair as the interpolated samples will have more common ones (especailly the sparser the data)\n",
"> Could be changed."
]
},
@@ -524,7 +531,7 @@
"\n",
"- all measured (identified, observed) peptides in validation data\n",
"\n",
"> Does not make to much sense to compare collab and AEs, \n",
"> Does not make to much sense to compare collab and AEs,\n",
"> as the setup differs of training and validation data differs"
]
},
@@ -603,7 +610,8 @@
},
"outputs": [],
"source": [
"vaep.io.dump_json(d_metrics.metrics, args.out_metrics / f'metrics_{args.model_key}.json')"
"vaep.io.dump_json(d_metrics.metrics, args.out_metrics /\n",
" f'metrics_{args.model_key}.json')"
]
},
{
@@ -615,7 +623,8 @@
},
"outputs": [],
"source": [
"metrics_df = models.get_df_from_nested_dict(d_metrics.metrics, column_levels=['model', 'metric_name']).T\n",
"metrics_df = models.get_df_from_nested_dict(\n",
" d_metrics.metrics, column_levels=['model', 'metric_name']).T\n",
"metrics_df"
]
},
@@ -656,7 +665,7 @@
"metadata": {},
"outputs": [],
"source": [
"args.dump(fname=args.out_models/ f\"model_config_{args.model_key}.yaml\")\n",
"args.dump(fname=args.out_models / f\"model_config_{args.model_key}.yaml\")\n",
"args"
]
},
77 changes: 43 additions & 34 deletions project/01_1_train_CF.py
@@ -45,7 +45,8 @@
import vaep.nb
from vaep.logging import setup_logger
logger = setup_logger(logger=logging.getLogger('vaep'))
logger.info("Experiment 03 - Analysis of latent spaces and performance comparisions")
logger.info(
"Experiment 03 - Analysis of latent spaces and performance comparisions")

figures = {} # collection of ax or figures

@@ -59,23 +60,26 @@

# %% tags=["parameters"]
# files and folders
folder_experiment:str = 'runs/example' # Datasplit folder with data for experiment
folder_data:str = '' # specify data directory if needed
file_format: str = 'csv' # change default to pickled files
fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv' # Machine parsed metadata from rawfile workflow
# Datasplit folder with data for experiment
folder_experiment: str = 'runs/example'
folder_data: str = '' # specify data directory if needed
file_format: str = 'csv' # change default to pickled files
# Machine parsed metadata from rawfile workflow
fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv'
# training
epochs_max:int = 20 # Maximum number of epochs
epochs_max: int = 20 # Maximum number of epochs
# early_stopping:bool = True # Whether to use early stopping or not
patience:int = 1 # Patience for early stopping
batch_size:int = 32_768 # Batch size for training (and evaluation)
cuda:bool=True # Use the GPU for training?
patience: int = 1 # Patience for early stopping
batch_size: int = 32_768 # Batch size for training (and evaluation)
cuda: bool = True # Use the GPU for training?
# model
latent_dim:int = 10 # Dimensionality of encoding dimension (latent space of model)
# Dimensionality of encoding dimension (latent space of model)
latent_dim: int = 10
# hidden_layers:str = '128_64' # A space-separated string of layers, '50 20' for the encoder; the reverse is used for the decoder
sample_idx_position: int = 0 # position of index which is sample ID
model: str = 'CF' # model name
model_key: str = 'CF' # potentially alternative key for model (grid search)
save_pred_real_na:bool=True # Save all predictions for missing values
sample_idx_position: int = 0 # position of index which is sample ID
model: str = 'CF' # model name
model_key: str = 'CF' # potentially alternative key for model (grid search)
save_pred_real_na: bool = True # Save all predictions for missing values
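# The cell tagged "parameters" above follows the papermill convention for
# injectable defaults. As a hedged illustration only (this diff does not show
# how the scripts are driven), the typed defaults could be overridden per run
# like this; the output path and the chosen values are assumptions:
import papermill as pm

pm.execute_notebook(
    'project/01_1_train_CF.ipynb',
    'runs/example/01_1_train_CF.output.ipynb',
    parameters=dict(
        folder_experiment='runs/example',
        epochs_max=20,
        batch_size=32_768,
        latent_dim=10,
        cuda=True,
    ),
)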

# %% [markdown]
# Some argument transformations
@@ -105,7 +109,8 @@
# ## Load data in long format

# %%
data = datasplits.DataSplits.from_folder(args.data, file_format=args.file_format)
data = datasplits.DataSplits.from_folder(
args.data, file_format=args.file_format)

# %% [markdown]
# data is loaded in long format
@@ -114,12 +119,12 @@
data.train_X.sample(5)

# %% [markdown]
# Infer index names from long format
# Infer index names from long format

# %%
index_columns = list(data.train_X.index.names)
sample_id = index_columns.pop(args.sample_idx_position)
if len(index_columns) == 1:
if len(index_columns) == 1:
index_column = index_columns.pop()
index_columns = None
logger.info(f"{sample_id = }, single feature: {index_column = }")
@@ -129,7 +134,8 @@
if not index_columns:
index_columns = [sample_id, index_column]
else:
raise NotImplementedError("More than one feature: Needs to be implemented. see above logging output.")
raise NotImplementedError(
"More than one feature: Needs to be implemented. see above logging output.")

# %% [markdown]
# load meta data for splits
@@ -150,13 +156,13 @@

# %%
freq_peptides = sampling.frequency_by_index(data.train_X, 0)
freq_peptides.head() # training data
freq_peptides.head() # training data

# %% [markdown]
# ### Produce some additional fake samples

# %% [markdown]
# The validation fake NA is used by all models to evaluate training performance.
# The validation fake NA is used by all models to evaluate training performance.

# %%
val_pred_fake_na = data.val_y.to_frame(name='observed')
@@ -178,12 +184,12 @@
ana_collab = models.collab.CollabAnalysis(
datasplits=data,
sample_column=sample_id,
item_column=index_column, # not generic
item_column=index_column, # not generic
target_column='intensity',
model_kwargs=dict(n_factors=args.latent_dim,
y_range=(int(data.train_X.min()),
int(data.train_X.max())+1)
),
y_range=(int(data.train_X.min()),
int(data.train_X.max())+1)
),
batch_size=args.batch_size)

# %%
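# The CollabAnalysis wrapper above sets up a fastai-style collaborative
# filtering model on (sample, feature, intensity) triples with a bounded
# y_range. What follows is a hedged sketch of an equivalent direct fastai
# setup, assuming the standard fastai.collab API and reusing the objects
# defined above (data, sample_id, index_column, args); it is an illustration,
# not the wrapper's actual implementation.
from fastai.collab import CollabDataLoaders, collab_learner

long_df = data.train_X.reset_index()  # long format: one row per (sample, feature)
dls = CollabDataLoaders.from_df(
    long_df,
    user_name=sample_id,      # sample identifier column
    item_name=index_column,   # feature column (e.g. peptide)
    rating_name='intensity',  # value to predict
    bs=args.batch_size,
)
learn = collab_learner(
    dls,
    n_factors=args.latent_dim,
    y_range=(int(data.train_X.min()), int(data.train_X.max()) + 1),
)
learn.fit_one_cycle(args.epochs_max)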
@@ -225,10 +231,10 @@
recorder_dump.save(args.out_figures)
del recorder_dump
vaep.savefig(fig, name='collab_training',
folder=args.out_figures)
folder=args.out_figures)
ana_collab.model_kwargs['batch_size'] = ana_collab.batch_size
vaep.io.dump_json(ana_collab.model_kwargs, args.out_models /
TEMPLATE_MODEL_PARAMS.format('CF'))
TEMPLATE_MODEL_PARAMS.format('CF'))

# %% [markdown]
# ### Predictions
@@ -238,7 +244,8 @@

# %%
# this could be done using the validation data loader now
ana_collab.test_dl = ana_collab.dls.test_dl(data.val_y.reset_index()) # test_dl is here validation data
ana_collab.test_dl = ana_collab.dls.test_dl(
data.val_y.reset_index()) # test_dl is here validation data
val_pred_fake_na['CF'], _ = ana_collab.learn.get_preds(
dl=ana_collab.test_dl)
val_pred_fake_na
@@ -257,7 +264,7 @@
pred_real_na = models.collab.get_missing_values(
df_train_long=data.train_X,
val_idx=data.val_y.index,
test_idx=data.test_y.index,
test_idx=data.test_y.index,
analysis_collab=ana_collab)
pred_real_na.to_csv(args.out_preds / f"pred_real_na_{args.model_key}.csv")

@@ -275,16 +282,16 @@
# %% [markdown]
# ## Comparisons
#
# > Note: The interpolated values have fewer predictions for comparisons than the ones based on models (CF, DAE, VAE)
# > The comparison is therefore not 100% fair, as the interpolated samples will have more values in common (especially the sparser the data)
# > Note: The interpolated values have fewer predictions for comparisons than the ones based on models (CF, DAE, VAE)
# > The comparison is therefore not 100% fair, as the interpolated samples will have more values in common (especially the sparser the data)
# > Could be changed.

# %% [markdown]
# ### Validation data
#
# - all measured (identified, observed) peptides in validation data
#
# > It does not make too much sense to compare collab and AEs,
# > It does not make too much sense to compare collab and AEs,
# > as the setup of training and validation data differs

# %%
@@ -311,11 +318,13 @@
# Save all metrics as json

# %%
vaep.io.dump_json(d_metrics.metrics, args.out_metrics / f'metrics_{args.model_key}.json')
vaep.io.dump_json(d_metrics.metrics, args.out_metrics /
f'metrics_{args.model_key}.json')


# %%
metrics_df = models.get_df_from_nested_dict(d_metrics.metrics, column_levels=['model', 'metric_name']).T
metrics_df = models.get_df_from_nested_dict(
d_metrics.metrics, column_levels=['model', 'metric_name']).T
metrics_df

# %% [markdown]
@@ -330,7 +339,7 @@
# ## Config

# %%
args.dump(fname=args.out_models/ f"model_config_{args.model_key}.yaml")
args.dump(fname=args.out_models / f"model_config_{args.model_key}.yaml")
args

# %%
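# The .ipynb and .py files in this commit carry the same changes, and the
# scripts use the "# %%" percent format, which suggests jupytext pairing; that
# is an assumption, as the pairing configuration is not part of this diff. A
# minimal sketch of syncing a reformatted script back into its notebook (on the
# command line, "jupytext --sync" does the same for paired files):
import jupytext

notebook = jupytext.read('project/01_1_train_CF.py', fmt='py:percent')
jupytext.write(notebook, 'project/01_1_train_CF.ipynb')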