-
Notifications
You must be signed in to change notification settings - Fork 13
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #69 from gwaygenomics/data-summary
Adding data summary counts
- Loading branch information
Showing
3 changed files
with
237 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,231 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "posted-ottawa", | ||
"metadata": {}, | ||
"source": [ | ||
"## Count LINCS Cell Painting data\n", | ||
"\n", | ||
"This notebook provides counting statistics for two batches of LINCS Cell Painting set in this repository.\n", | ||
"\n", | ||
"* Count profiles\n", | ||
"* Count perturbation treatments\n", | ||
"* Count single cells (with an output cell count summary file)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"id": "civic-festival", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import pathlib\n", | ||
"import pandas as pd" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"id": "offensive-german", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"batches = [\"2016_04_01_a549_48hr_batch1\", \"2017_12_05_Batch2\"]\n", | ||
"\n", | ||
"consensus_dir = pathlib.Path(\"consensus\")\n", | ||
"batch_effect_dir = pathlib.Path(\"spherized_profiles\")\n", | ||
"cell_count_dir = pathlib.Path(\"profiles/cell_count\")\n", | ||
"platemap_dir = pathlib.Path(\"metadata/platemaps\")\n", | ||
"\n", | ||
"consensus_suffix = \"_consensus_modz_feature_select_dmso.csv.gz\"\n", | ||
"batch_effect_suffix = \"_dmso_spherized_profiles_with_input_normalized_by_whole_plate.csv.gz\"" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "4ba958c5", | ||
"metadata": {}, | ||
"source": [ | ||
"## Profile and perturbation count" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"id": "b905b954", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"There are a total of 52,223 image-based profiles assayed in 2016_04_01_a549_48hr_batch1\n", | ||
"There are a total of 10,752 image-based consensus profiles assayed in 2016_04_01_a549_48hr_batch1\n", | ||
"There are 1,571 unique compounds assayed in 2016_04_01_a549_48hr_batch1\n", | ||
"There are 1 unique time points assayed in 2016_04_01_a549_48hr_batch1 (['48H'])\n", | ||
"There are 7 unique doses assayed in 2016_04_01_a549_48hr_batch1\n", | ||
"There are 1 unique cell lines assayed in 2016_04_01_a549_48hr_batch1 (['A549'])\n", | ||
"There is a total of 9,395 unique perturbations assayed in 2016_04_01_a549_48hr_batch1\n", | ||
"\n", | ||
"There are a total of 51,447 image-based profiles assayed in 2017_12_05_Batch2\n", | ||
"There are a total of 10,368 image-based consensus profiles assayed in 2017_12_05_Batch2\n", | ||
"There are 349 unique compounds assayed in 2017_12_05_Batch2\n", | ||
"There are 3 unique time points assayed in 2017_12_05_Batch2 (['24H' '48H' '6H'])\n", | ||
"There are 6 unique doses assayed in 2017_12_05_Batch2\n", | ||
"There are 3 unique cell lines assayed in 2017_12_05_Batch2 (['A549' 'MCF7' 'U2OS'])\n", | ||
"There is a total of 9,369 unique perturbations assayed in 2017_12_05_Batch2\n", | ||
"\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"for batch in batches:\n", | ||
" consensus_file = pathlib.Path(consensus_dir, batch, f\"{batch}{consensus_suffix}\")\n", | ||
" batch_effect_file = pathlib.Path(batch_effect_dir, \"profiles\", f\"{batch}{batch_effect_suffix}\")\n", | ||
" \n", | ||
" profiles_df = pd.read_csv(batch_effect_file, low_memory=False)\n", | ||
" print(f\"There are a total of {profiles_df.shape[0]:,} image-based profiles assayed in {batch}\")\n", | ||
"\n", | ||
" consensus_df = pd.read_csv(consensus_file)\n", | ||
" print(f\"There are a total of {consensus_df.shape[0]:,} image-based consensus profiles assayed in {batch}\")\n", | ||
" \n", | ||
" num_compounds = len(consensus_df.Metadata_broad_sample.unique())\n", | ||
" print(f\"There are {num_compounds:,} unique compounds assayed in {batch}\")\n", | ||
" \n", | ||
" time_points = consensus_df.Metadata_time_point.unique()\n", | ||
" print(f\"There are {len(time_points)} unique time points assayed in {batch} ({time_points})\")\n", | ||
" \n", | ||
" doses = len(consensus_df.Metadata_dose_recode.unique())\n", | ||
" print(f\"There are {doses} unique doses assayed in {batch}\")\n", | ||
" \n", | ||
" cell_lines = consensus_df.Metadata_cell_id.unique()\n", | ||
" print(f\"There are {len(cell_lines)} unique cell lines assayed in {batch} ({cell_lines})\")\n", | ||
" \n", | ||
" unique_perturbations = (\n", | ||
" consensus_df\n", | ||
" .groupby(\n", | ||
" [\"Metadata_cell_id\", \"Metadata_broad_sample\", \"Metadata_time_point\", \"Metadata_dose_recode\"]\n", | ||
" )[\"Metadata_pert_well\"]\n", | ||
" .count()\n", | ||
" .reset_index()\n", | ||
" .rename(\n", | ||
" columns={\"Metadata_pert_well\": \"num_replicates\"}\n", | ||
" )\n", | ||
" )\n", | ||
" print(f\"There is a total of {len(unique_perturbations):,} unique perturbations assayed in {batch}\\n\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "30171986", | ||
"metadata": {}, | ||
"source": [ | ||
"### Cell Count" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"id": "verbal-vaccine", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Count cells\n", | ||
"all_cell_count_df = {x: [] for x in batches}\n", | ||
"for batch in batches:\n", | ||
" # Load barcode platemap\n", | ||
" barcode_platemap_file = platemap_dir / batch / \"barcode_platemap.csv\"\n", | ||
" barcode_platemap_df = pd.read_csv(barcode_platemap_file)\n", | ||
"\n", | ||
" # Setup other path variables\n", | ||
" batch_count_dir = pathlib.Path(cell_count_dir, batch)\n", | ||
" plate_dirs = [x for x in batch_count_dir.iterdir() if \".DS_Store\" not in x.name]\n", | ||
" \n", | ||
" for plate_dir in plate_dirs:\n", | ||
" plate_name = plate_dir.name\n", | ||
" platemap_name = barcode_platemap_df.query(\"Assay_Plate_Barcode == @plate_name\").Plate_Map_Name.values[0]\n", | ||
" platemap_file = platemap_dir / batch / \"platemap\" / f\"{platemap_name}.txt\"\n", | ||
" platemap_df = pd.read_csv(platemap_file, sep=\"\\t\")\n", | ||
"\n", | ||
" cell_count_file = plate_dir / f\"{plate_name}_cell_count.csv\"\n", | ||
" cell_count_df = (\n", | ||
" pd.read_csv(cell_count_file)\n", | ||
" .assign(batch=batch)\n", | ||
" .rename({\n", | ||
" \"Image_Metadata_Well\": \"Metadata_Well\",\n", | ||
" \"Image_Metadata_Plate\": \"Metadata_Plate\"\n", | ||
" }, axis=\"columns\")\n", | ||
" )\n", | ||
" \n", | ||
" cell_count_df = (\n", | ||
" cell_count_df\n", | ||
" .merge(platemap_df, left_on=\"Metadata_Well\", right_on=\"well_position\")\n", | ||
" )\n", | ||
" \n", | ||
" all_cell_count_df[batch].append(cell_count_df)\n", | ||
" \n", | ||
" # Combine batch-specific cell count summary\n", | ||
" all_cell_count_df[batch] = pd.concat(all_cell_count_df[batch]).reset_index(drop=True)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"id": "complex-polls", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"In batch 2016_04_01_a549_48hr_batch1, we profiled 110,012,425 cells\n", | ||
"In batch 2017_12_05_Batch2, we profiled 49,705,063 cells\n", | ||
"\n", | ||
"We profiled a total of 159,717,488 cells in the LINCS Cell Painting dataset\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# Output metadata summary files\n", | ||
"total_cells = 0\n", | ||
"for batch in batches:\n", | ||
" batch_metadata_df = all_cell_count_df[batch]\n", | ||
" batch_cell_count = batch_metadata_df.cell_count.sum()\n", | ||
" \n", | ||
" print(f\"In batch {batch}, we profiled {batch_cell_count:,} cells\")\n", | ||
" \n", | ||
" output_cell_count_summary_file = cell_count_dir / f\"{batch}_metadata_cell_count_summary.tsv.gz\"\n", | ||
" batch_metadata_df.to_csv(\n", | ||
" output_cell_count_summary_file, index=False, compression={\"method\": \"gzip\", \"mtime\": 0}\n", | ||
" )\n", | ||
" \n", | ||
" total_cells += batch_cell_count\n", | ||
" \n", | ||
"print(f\"\\nWe profiled a total of {total_cells:,} cells in the LINCS Cell Painting dataset\")" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.7.10" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
3 changes: 3 additions & 0 deletions
3
profiles/cell_count/2016_04_01_a549_48hr_batch1_metadata_cell_count_summary.tsv.gz
Git LFS file not shown
3 changes: 3 additions & 0 deletions
3
profiles/cell_count/2017_12_05_Batch2_metadata_cell_count_summary.tsv.gz
Git LFS file not shown