Skip to content

Commit

Permalink
Merge pull request #69 from gwaygenomics/data-summary
Browse files Browse the repository at this point in the history
Adding data summary counts
  • Loading branch information
gwaybio authored Jun 21, 2021
2 parents f2bbd5a + 2de5827 commit 74f7a0d
Show file tree
Hide file tree
Showing 3 changed files with 237 additions and 0 deletions.
231 changes: 231 additions & 0 deletions perturbation-count-summary.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,231 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "posted-ottawa",
"metadata": {},
"source": [
"## Count LINCS Cell Painting data\n",
"\n",
"This notebook provides counting statistics for two batches of LINCS Cell Painting set in this repository.\n",
"\n",
"* Count profiles\n",
"* Count perturbation treatments\n",
"* Count single cells (with an output cell count summary file)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "civic-festival",
"metadata": {},
"outputs": [],
"source": [
"import pathlib\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "offensive-german",
"metadata": {},
"outputs": [],
"source": [
"batches = [\"2016_04_01_a549_48hr_batch1\", \"2017_12_05_Batch2\"]\n",
"\n",
"consensus_dir = pathlib.Path(\"consensus\")\n",
"batch_effect_dir = pathlib.Path(\"spherized_profiles\")\n",
"cell_count_dir = pathlib.Path(\"profiles/cell_count\")\n",
"platemap_dir = pathlib.Path(\"metadata/platemaps\")\n",
"\n",
"consensus_suffix = \"_consensus_modz_feature_select_dmso.csv.gz\"\n",
"batch_effect_suffix = \"_dmso_spherized_profiles_with_input_normalized_by_whole_plate.csv.gz\""
]
},
{
"cell_type": "markdown",
"id": "4ba958c5",
"metadata": {},
"source": [
"## Profile and perturbation count"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "b905b954",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"There are a total of 52,223 image-based profiles assayed in 2016_04_01_a549_48hr_batch1\n",
"There are a total of 10,752 image-based consensus profiles assayed in 2016_04_01_a549_48hr_batch1\n",
"There are 1,571 unique compounds assayed in 2016_04_01_a549_48hr_batch1\n",
"There are 1 unique time points assayed in 2016_04_01_a549_48hr_batch1 (['48H'])\n",
"There are 7 unique doses assayed in 2016_04_01_a549_48hr_batch1\n",
"There are 1 unique cell lines assayed in 2016_04_01_a549_48hr_batch1 (['A549'])\n",
"There is a total of 9,395 unique perturbations assayed in 2016_04_01_a549_48hr_batch1\n",
"\n",
"There are a total of 51,447 image-based profiles assayed in 2017_12_05_Batch2\n",
"There are a total of 10,368 image-based consensus profiles assayed in 2017_12_05_Batch2\n",
"There are 349 unique compounds assayed in 2017_12_05_Batch2\n",
"There are 3 unique time points assayed in 2017_12_05_Batch2 (['24H' '48H' '6H'])\n",
"There are 6 unique doses assayed in 2017_12_05_Batch2\n",
"There are 3 unique cell lines assayed in 2017_12_05_Batch2 (['A549' 'MCF7' 'U2OS'])\n",
"There is a total of 9,369 unique perturbations assayed in 2017_12_05_Batch2\n",
"\n"
]
}
],
"source": [
"for batch in batches:\n",
" consensus_file = pathlib.Path(consensus_dir, batch, f\"{batch}{consensus_suffix}\")\n",
" batch_effect_file = pathlib.Path(batch_effect_dir, \"profiles\", f\"{batch}{batch_effect_suffix}\")\n",
" \n",
" profiles_df = pd.read_csv(batch_effect_file, low_memory=False)\n",
" print(f\"There are a total of {profiles_df.shape[0]:,} image-based profiles assayed in {batch}\")\n",
"\n",
" consensus_df = pd.read_csv(consensus_file)\n",
" print(f\"There are a total of {consensus_df.shape[0]:,} image-based consensus profiles assayed in {batch}\")\n",
" \n",
" num_compounds = len(consensus_df.Metadata_broad_sample.unique())\n",
" print(f\"There are {num_compounds:,} unique compounds assayed in {batch}\")\n",
" \n",
" time_points = consensus_df.Metadata_time_point.unique()\n",
" print(f\"There are {len(time_points)} unique time points assayed in {batch} ({time_points})\")\n",
" \n",
" doses = len(consensus_df.Metadata_dose_recode.unique())\n",
" print(f\"There are {doses} unique doses assayed in {batch}\")\n",
" \n",
" cell_lines = consensus_df.Metadata_cell_id.unique()\n",
" print(f\"There are {len(cell_lines)} unique cell lines assayed in {batch} ({cell_lines})\")\n",
" \n",
" unique_perturbations = (\n",
" consensus_df\n",
" .groupby(\n",
" [\"Metadata_cell_id\", \"Metadata_broad_sample\", \"Metadata_time_point\", \"Metadata_dose_recode\"]\n",
" )[\"Metadata_pert_well\"]\n",
" .count()\n",
" .reset_index()\n",
" .rename(\n",
" columns={\"Metadata_pert_well\": \"num_replicates\"}\n",
" )\n",
" )\n",
" print(f\"There is a total of {len(unique_perturbations):,} unique perturbations assayed in {batch}\\n\")"
]
},
{
"cell_type": "markdown",
"id": "30171986",
"metadata": {},
"source": [
"### Cell Count"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "verbal-vaccine",
"metadata": {},
"outputs": [],
"source": [
"# Count cells\n",
"all_cell_count_df = {x: [] for x in batches}\n",
"for batch in batches:\n",
" # Load barcode platemap\n",
" barcode_platemap_file = platemap_dir / batch / \"barcode_platemap.csv\"\n",
" barcode_platemap_df = pd.read_csv(barcode_platemap_file)\n",
"\n",
" # Setup other path variables\n",
" batch_count_dir = pathlib.Path(cell_count_dir, batch)\n",
" plate_dirs = [x for x in batch_count_dir.iterdir() if \".DS_Store\" not in x.name]\n",
" \n",
" for plate_dir in plate_dirs:\n",
" plate_name = plate_dir.name\n",
" platemap_name = barcode_platemap_df.query(\"Assay_Plate_Barcode == @plate_name\").Plate_Map_Name.values[0]\n",
" platemap_file = platemap_dir / batch / \"platemap\" / f\"{platemap_name}.txt\"\n",
" platemap_df = pd.read_csv(platemap_file, sep=\"\\t\")\n",
"\n",
" cell_count_file = plate_dir / f\"{plate_name}_cell_count.csv\"\n",
" cell_count_df = (\n",
" pd.read_csv(cell_count_file)\n",
" .assign(batch=batch)\n",
" .rename({\n",
" \"Image_Metadata_Well\": \"Metadata_Well\",\n",
" \"Image_Metadata_Plate\": \"Metadata_Plate\"\n",
" }, axis=\"columns\")\n",
" )\n",
" \n",
" cell_count_df = (\n",
" cell_count_df\n",
" .merge(platemap_df, left_on=\"Metadata_Well\", right_on=\"well_position\")\n",
" )\n",
" \n",
" all_cell_count_df[batch].append(cell_count_df)\n",
" \n",
" # Combine batch-specific cell count summary\n",
" all_cell_count_df[batch] = pd.concat(all_cell_count_df[batch]).reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "complex-polls",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"In batch 2016_04_01_a549_48hr_batch1, we profiled 110,012,425 cells\n",
"In batch 2017_12_05_Batch2, we profiled 49,705,063 cells\n",
"\n",
"We profiled a total of 159,717,488 cells in the LINCS Cell Painting dataset\n"
]
}
],
"source": [
"# Output metadata summary files\n",
"total_cells = 0\n",
"for batch in batches:\n",
" batch_metadata_df = all_cell_count_df[batch]\n",
" batch_cell_count = batch_metadata_df.cell_count.sum()\n",
" \n",
" print(f\"In batch {batch}, we profiled {batch_cell_count:,} cells\")\n",
" \n",
" output_cell_count_summary_file = cell_count_dir / f\"{batch}_metadata_cell_count_summary.tsv.gz\"\n",
" batch_metadata_df.to_csv(\n",
" output_cell_count_summary_file, index=False, compression={\"method\": \"gzip\", \"mtime\": 0}\n",
" )\n",
" \n",
" total_cells += batch_cell_count\n",
" \n",
"print(f\"\\nWe profiled a total of {total_cells:,} cells in the LINCS Cell Painting dataset\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Git LFS file not shown
Git LFS file not shown

0 comments on commit 74f7a0d

Please sign in to comment.