diff --git a/nbs/25_pvalue/15-compute_pvalues_from_samples.ipynb b/nbs/25_pvalue/15-compute_pvalues_from_samples.ipynb
index 60daa718..9e876e90 100644
--- a/nbs/25_pvalue/15-compute_pvalues_from_samples.ipynb
+++ b/nbs/25_pvalue/15-compute_pvalues_from_samples.ipynb
@@ -31,7 +31,7 @@
"tags": []
},
"source": [
- "TODO"
+ "Reads the gene pair samples across different categories and computes their p-values."
]
},
{
@@ -53,15 +53,9 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": null,
"id": "1ffa1a96-7545-40b9-ac8b-8627e13de8d4",
"metadata": {
- "execution": {
- "iopub.execute_input": "2023-09-12T22:59:49.411609Z",
- "iopub.status.busy": "2023-09-12T22:59:49.411498Z",
- "iopub.status.idle": "2023-09-12T22:59:49.837994Z",
- "shell.execute_reply": "2023-09-12T22:59:49.837700Z"
- },
"papermill": {
"duration": 0.429643,
"end_time": "2023-09-12T22:59:49.838894",
@@ -80,6 +74,7 @@
"import numpy as np\n",
"import pandas as pd\n",
"from concurrent.futures import as_completed, ProcessPoolExecutor\n",
+ "from tqdm import tqdm\n",
"\n",
"from ccc.coef import ccc\n",
"from ccc import conf"
@@ -104,15 +99,9 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
"id": "9a154623-c787-4a31-871a-cad173f0eb9f",
"metadata": {
- "execution": {
- "iopub.execute_input": "2023-09-12T22:59:49.855314Z",
- "iopub.status.busy": "2023-09-12T22:59:49.855247Z",
- "iopub.status.idle": "2023-09-12T22:59:49.857116Z",
- "shell.execute_reply": "2023-09-12T22:59:49.856940Z"
- },
"papermill": {
"duration": 0.004783,
"end_time": "2023-09-12T22:59:49.857681",
@@ -128,7 +117,7 @@
"GTEX_TISSUE = \"whole_blood\"\n",
"GENE_SEL_STRATEGY = \"var_pc_log2\"\n",
"\n",
- "PVALUE_N_PERMS = 1000000\n",
+ "PVALUE_N_PERMS = 10000000\n",
"\n",
"RANDOM_STATE = np.random.RandomState(0)"
]
@@ -152,15 +141,9 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": null,
"id": "c6f73068-fa38-44be-bd0c-708f6ff450ea",
"metadata": {
- "execution": {
- "iopub.execute_input": "2023-09-12T22:59:49.865853Z",
- "iopub.status.busy": "2023-09-12T22:59:49.865715Z",
- "iopub.status.idle": "2023-09-12T22:59:49.868127Z",
- "shell.execute_reply": "2023-09-12T22:59:49.867992Z"
- },
"papermill": {
"duration": 0.00506,
"end_time": "2023-09-12T22:59:49.868624",
@@ -170,17 +153,7 @@
},
"tags": []
},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "PosixPath('/opt/data/results/gtex_v8/gene_selection/gtex_v8_data_whole_blood-var_pc_log2.pkl')"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
"source": [
"INPUT_GENE_EXPR_FILE = (\n",
" DATASET_CONFIG[\"GENE_SELECTION_DIR\"]\n",
@@ -193,15 +166,9 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": null,
"id": "30cce6f5-ca1b-438c-859d-31903a42d4c6",
"metadata": {
- "execution": {
- "iopub.execute_input": "2023-09-12T22:59:49.872266Z",
- "iopub.status.busy": "2023-09-12T22:59:49.872175Z",
- "iopub.status.idle": "2023-09-12T22:59:49.873814Z",
- "shell.execute_reply": "2023-09-12T22:59:49.873677Z"
- },
"papermill": {
"duration": 0.004102,
"end_time": "2023-09-12T22:59:49.874422",
@@ -211,17 +178,7 @@
},
"tags": []
},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "PosixPath('/opt/data/results/gtex_v8/gene_pair_intersections/gene_pair_intersections-gtex_v8-whole_blood-var_pc_log2.pkl')"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
+ "outputs": [],
"source": [
"INPUT_GENE_PAIRS_INTERSECTIONS_FILE = (\n",
" DATASET_CONFIG[\"GENE_PAIR_INTERSECTIONS\"]\n",
@@ -234,15 +191,9 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": null,
"id": "0122253c-99c0-41e2-8807-60df86bf0619",
"metadata": {
- "execution": {
- "iopub.execute_input": "2023-09-12T22:59:49.878047Z",
- "iopub.status.busy": "2023-09-12T22:59:49.877936Z",
- "iopub.status.idle": "2023-09-12T22:59:49.879230Z",
- "shell.execute_reply": "2023-09-12T22:59:49.879101Z"
- },
"papermill": {
"duration": 0.00365,
"end_time": "2023-09-12T22:59:49.879712",
@@ -260,15 +211,9 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": null,
"id": "3003ed2c-5da0-43b9-969d-9cf037d05730",
"metadata": {
- "execution": {
- "iopub.execute_input": "2023-09-12T22:59:49.883414Z",
- "iopub.status.busy": "2023-09-12T22:59:49.883308Z",
- "iopub.status.idle": "2023-09-12T22:59:49.884842Z",
- "shell.execute_reply": "2023-09-12T22:59:49.884722Z"
- },
"papermill": {
"duration": 0.003938,
"end_time": "2023-09-12T22:59:49.885310",
@@ -278,18 +223,7 @@
},
"tags": []
},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "PosixPath('/opt/data/results/gtex_v8/gene_pair_intersections/pvalues')"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"OUTPUT_DIR"
]
@@ -313,15 +247,9 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": null,
"id": "6e8ef201-6f98-4fb6-a306-180ed4b467db",
"metadata": {
- "execution": {
- "iopub.execute_input": "2023-09-12T22:59:49.892209Z",
- "iopub.status.busy": "2023-09-12T22:59:49.892105Z",
- "iopub.status.idle": "2023-09-12T22:59:49.912164Z",
- "shell.execute_reply": "2023-09-12T22:59:49.911953Z"
- },
"papermill": {
"duration": 0.022686,
"end_time": "2023-09-12T22:59:49.912927",
@@ -338,15 +266,9 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": null,
"id": "4d18e93e-b394-46bd-8d16-d9261a85ba06",
"metadata": {
- "execution": {
- "iopub.execute_input": "2023-09-12T22:59:49.917204Z",
- "iopub.status.busy": "2023-09-12T22:59:49.917116Z",
- "iopub.status.idle": "2023-09-12T22:59:49.918963Z",
- "shell.execute_reply": "2023-09-12T22:59:49.918826Z"
- },
"papermill": {
"duration": 0.004607,
"end_time": "2023-09-12T22:59:49.919511",
@@ -356,33 +278,16 @@
},
"tags": []
},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(5000, 755)"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"data.shape"
]
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": null,
"id": "ea8947b9-9064-43ec-bf10-6e6ae361c451",
"metadata": {
- "execution": {
- "iopub.execute_input": "2023-09-12T22:59:49.923551Z",
- "iopub.status.busy": "2023-09-12T22:59:49.923405Z",
- "iopub.status.idle": "2023-09-12T22:59:49.931350Z",
- "shell.execute_reply": "2023-09-12T22:59:49.931198Z"
- },
"papermill": {
"duration": 0.01065,
"end_time": "2023-09-12T22:59:49.931837",
@@ -392,290 +297,7 @@
},
"tags": []
},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " GTEX-111YS-0006-SM-5NQBE | \n",
- " GTEX-1122O-0005-SM-5O99J | \n",
- " GTEX-1128S-0005-SM-5P9HI | \n",
- " GTEX-113IC-0006-SM-5NQ9C | \n",
- " GTEX-113JC-0006-SM-5O997 | \n",
- " GTEX-117XS-0005-SM-5PNU6 | \n",
- " GTEX-117YW-0005-SM-5NQ8Z | \n",
- " GTEX-1192W-0005-SM-5NQBQ | \n",
- " GTEX-1192X-0005-SM-5NQC3 | \n",
- " GTEX-11DXW-0006-SM-5NQ7Y | \n",
- " ... | \n",
- " GTEX-ZVE2-0006-SM-51MRW | \n",
- " GTEX-ZVP2-0005-SM-51MRK | \n",
- " GTEX-ZVT2-0005-SM-57WBW | \n",
- " GTEX-ZVT3-0006-SM-51MT9 | \n",
- " GTEX-ZVT4-0006-SM-57WB8 | \n",
- " GTEX-ZVTK-0006-SM-57WBK | \n",
- " GTEX-ZVZP-0006-SM-51MSW | \n",
- " GTEX-ZVZQ-0006-SM-51MR8 | \n",
- " GTEX-ZXES-0005-SM-57WCB | \n",
- " GTEX-ZXG5-0005-SM-57WCN | \n",
- "
\n",
- " \n",
- " gene_ens_id | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " ENSG00000000419.12 | \n",
- " 20.6500 | \n",
- " 25.050 | \n",
- " 7.155 | \n",
- " 49.130 | \n",
- " 6.147 | \n",
- " 4.1430 | \n",
- " 5.390 | \n",
- " 4.389 | \n",
- " 1.1580 | \n",
- " 6.8240 | \n",
- " ... | \n",
- " 4.4070 | \n",
- " 32.340 | \n",
- " 18.6800 | \n",
- " 9.251 | \n",
- " 7.828 | \n",
- " 7.460 | \n",
- " 33.2400 | \n",
- " 5.8480 | \n",
- " 25.760 | \n",
- " 17.080 | \n",
- "
\n",
- " \n",
- " ENSG00000000938.12 | \n",
- " 906.0000 | \n",
- " 1344.000 | \n",
- " 633.500 | \n",
- " 719.200 | \n",
- " 392.600 | \n",
- " 166.5000 | \n",
- " 338.200 | \n",
- " 413.200 | \n",
- " 51.5400 | \n",
- " 423.6000 | \n",
- " ... | \n",
- " 354.8000 | \n",
- " 1102.000 | \n",
- " 774.9000 | \n",
- " 206.000 | \n",
- " 620.400 | \n",
- " 346.300 | \n",
- " 1304.0000 | \n",
- " 232.9000 | \n",
- " 631.600 | \n",
- " 884.500 | \n",
- "
\n",
- " \n",
- " ENSG00000001167.14 | \n",
- " 8.1900 | \n",
- " 20.010 | \n",
- " 20.470 | \n",
- " 21.220 | \n",
- " 16.460 | \n",
- " 8.6190 | \n",
- " 18.220 | \n",
- " 16.580 | \n",
- " 1.6020 | \n",
- " 35.6800 | \n",
- " ... | \n",
- " 11.3400 | \n",
- " 11.250 | \n",
- " 11.1800 | \n",
- " 9.523 | \n",
- " 41.860 | \n",
- " 24.580 | \n",
- " 8.8920 | \n",
- " 13.3900 | \n",
- " 13.470 | \n",
- " 42.640 | \n",
- "
\n",
- " \n",
- " ENSG00000001561.6 | \n",
- " 0.7104 | \n",
- " 1.771 | \n",
- " 2.234 | \n",
- " 6.014 | \n",
- " 3.206 | \n",
- " 0.3962 | \n",
- " 2.445 | \n",
- " 1.418 | \n",
- " 0.5531 | \n",
- " 0.7447 | \n",
- " ... | \n",
- " 0.9269 | \n",
- " 2.555 | \n",
- " 0.5976 | \n",
- " 3.417 | \n",
- " 2.645 | \n",
- " 1.883 | \n",
- " 0.5391 | \n",
- " 0.9816 | \n",
- " 1.036 | \n",
- " 6.729 | \n",
- "
\n",
- " \n",
- " ENSG00000002549.12 | \n",
- " 22.5000 | \n",
- " 21.330 | \n",
- " 19.290 | \n",
- " 157.100 | \n",
- " 29.330 | \n",
- " 9.5770 | \n",
- " 14.170 | \n",
- " 23.330 | \n",
- " 1.4070 | \n",
- " 28.3000 | \n",
- " ... | \n",
- " 4.4930 | \n",
- " 50.470 | \n",
- " 16.2100 | \n",
- " 32.740 | \n",
- " 18.150 | \n",
- " 11.920 | \n",
- " 20.1000 | \n",
- " 15.5500 | \n",
- " 11.980 | \n",
- " 35.370 | \n",
- "
\n",
- " \n",
- "
\n",
- "
5 rows × 755 columns
\n",
- "
"
- ],
- "text/plain": [
- " GTEX-111YS-0006-SM-5NQBE GTEX-1122O-0005-SM-5O99J \\\n",
- "gene_ens_id \n",
- "ENSG00000000419.12 20.6500 25.050 \n",
- "ENSG00000000938.12 906.0000 1344.000 \n",
- "ENSG00000001167.14 8.1900 20.010 \n",
- "ENSG00000001561.6 0.7104 1.771 \n",
- "ENSG00000002549.12 22.5000 21.330 \n",
- "\n",
- " GTEX-1128S-0005-SM-5P9HI GTEX-113IC-0006-SM-5NQ9C \\\n",
- "gene_ens_id \n",
- "ENSG00000000419.12 7.155 49.130 \n",
- "ENSG00000000938.12 633.500 719.200 \n",
- "ENSG00000001167.14 20.470 21.220 \n",
- "ENSG00000001561.6 2.234 6.014 \n",
- "ENSG00000002549.12 19.290 157.100 \n",
- "\n",
- " GTEX-113JC-0006-SM-5O997 GTEX-117XS-0005-SM-5PNU6 \\\n",
- "gene_ens_id \n",
- "ENSG00000000419.12 6.147 4.1430 \n",
- "ENSG00000000938.12 392.600 166.5000 \n",
- "ENSG00000001167.14 16.460 8.6190 \n",
- "ENSG00000001561.6 3.206 0.3962 \n",
- "ENSG00000002549.12 29.330 9.5770 \n",
- "\n",
- " GTEX-117YW-0005-SM-5NQ8Z GTEX-1192W-0005-SM-5NQBQ \\\n",
- "gene_ens_id \n",
- "ENSG00000000419.12 5.390 4.389 \n",
- "ENSG00000000938.12 338.200 413.200 \n",
- "ENSG00000001167.14 18.220 16.580 \n",
- "ENSG00000001561.6 2.445 1.418 \n",
- "ENSG00000002549.12 14.170 23.330 \n",
- "\n",
- " GTEX-1192X-0005-SM-5NQC3 GTEX-11DXW-0006-SM-5NQ7Y ... \\\n",
- "gene_ens_id ... \n",
- "ENSG00000000419.12 1.1580 6.8240 ... \n",
- "ENSG00000000938.12 51.5400 423.6000 ... \n",
- "ENSG00000001167.14 1.6020 35.6800 ... \n",
- "ENSG00000001561.6 0.5531 0.7447 ... \n",
- "ENSG00000002549.12 1.4070 28.3000 ... \n",
- "\n",
- " GTEX-ZVE2-0006-SM-51MRW GTEX-ZVP2-0005-SM-51MRK \\\n",
- "gene_ens_id \n",
- "ENSG00000000419.12 4.4070 32.340 \n",
- "ENSG00000000938.12 354.8000 1102.000 \n",
- "ENSG00000001167.14 11.3400 11.250 \n",
- "ENSG00000001561.6 0.9269 2.555 \n",
- "ENSG00000002549.12 4.4930 50.470 \n",
- "\n",
- " GTEX-ZVT2-0005-SM-57WBW GTEX-ZVT3-0006-SM-51MT9 \\\n",
- "gene_ens_id \n",
- "ENSG00000000419.12 18.6800 9.251 \n",
- "ENSG00000000938.12 774.9000 206.000 \n",
- "ENSG00000001167.14 11.1800 9.523 \n",
- "ENSG00000001561.6 0.5976 3.417 \n",
- "ENSG00000002549.12 16.2100 32.740 \n",
- "\n",
- " GTEX-ZVT4-0006-SM-57WB8 GTEX-ZVTK-0006-SM-57WBK \\\n",
- "gene_ens_id \n",
- "ENSG00000000419.12 7.828 7.460 \n",
- "ENSG00000000938.12 620.400 346.300 \n",
- "ENSG00000001167.14 41.860 24.580 \n",
- "ENSG00000001561.6 2.645 1.883 \n",
- "ENSG00000002549.12 18.150 11.920 \n",
- "\n",
- " GTEX-ZVZP-0006-SM-51MSW GTEX-ZVZQ-0006-SM-51MR8 \\\n",
- "gene_ens_id \n",
- "ENSG00000000419.12 33.2400 5.8480 \n",
- "ENSG00000000938.12 1304.0000 232.9000 \n",
- "ENSG00000001167.14 8.8920 13.3900 \n",
- "ENSG00000001561.6 0.5391 0.9816 \n",
- "ENSG00000002549.12 20.1000 15.5500 \n",
- "\n",
- " GTEX-ZXES-0005-SM-57WCB GTEX-ZXG5-0005-SM-57WCN \n",
- "gene_ens_id \n",
- "ENSG00000000419.12 25.760 17.080 \n",
- "ENSG00000000938.12 631.600 884.500 \n",
- "ENSG00000001167.14 13.470 42.640 \n",
- "ENSG00000001561.6 1.036 6.729 \n",
- "ENSG00000002549.12 11.980 35.370 \n",
- "\n",
- "[5 rows x 755 columns]"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"data.head()"
]
@@ -699,15 +321,9 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": null,
"id": "178a09a8-1a2e-425a-8a52-773f41c72633",
"metadata": {
- "execution": {
- "iopub.execute_input": "2023-09-12T22:59:49.939521Z",
- "iopub.status.busy": "2023-09-12T22:59:49.939455Z",
- "iopub.status.idle": "2023-09-12T22:59:49.940684Z",
- "shell.execute_reply": "2023-09-12T22:59:49.940556Z"
- },
"papermill": {
"duration": 0.003701,
"end_time": "2023-09-12T22:59:49.941066",
@@ -724,15 +340,9 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": null,
"id": "c42a9f4c-3672-4ab0-b9ff-c214eb40cd2f",
"metadata": {
- "execution": {
- "iopub.execute_input": "2023-09-12T22:59:49.944914Z",
- "iopub.status.busy": "2023-09-12T22:59:49.944802Z",
- "iopub.status.idle": "2023-09-12T22:59:49.950774Z",
- "shell.execute_reply": "2023-09-12T22:59:49.950627Z"
- },
"papermill": {
"duration": 0.008321,
"end_time": "2023-09-12T22:59:49.951197",
@@ -749,15 +359,9 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": null,
"id": "1724d63c-19eb-49a8-83fc-6c8b07585e98",
"metadata": {
- "execution": {
- "iopub.execute_input": "2023-09-12T22:59:49.955084Z",
- "iopub.status.busy": "2023-09-12T22:59:49.954971Z",
- "iopub.status.idle": "2023-09-12T22:59:49.956445Z",
- "shell.execute_reply": "2023-09-12T22:59:49.956319Z"
- },
"papermill": {
"duration": 0.003797,
"end_time": "2023-09-12T22:59:49.956831",
@@ -767,33 +371,16 @@
},
"tags": []
},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "9"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"len(gene_pair_samples)"
]
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": null,
"id": "99f5098f-aa01-471b-a6a2-5aabc332176b",
"metadata": {
- "execution": {
- "iopub.execute_input": "2023-09-12T22:59:49.960750Z",
- "iopub.status.busy": "2023-09-12T22:59:49.960641Z",
- "iopub.status.idle": "2023-09-12T22:59:49.962169Z",
- "shell.execute_reply": "2023-09-12T22:59:49.962046Z"
- },
"papermill": {
"duration": 0.003951,
"end_time": "2023-09-12T22:59:49.962566",
@@ -803,41 +390,16 @@
},
"tags": []
},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['all_high',\n",
- " 'all_low',\n",
- " 'ccc_high_and_pearson_low',\n",
- " 'ccc_high_and_spearman_low',\n",
- " 'ccc_high_and_spearman_pearson_low',\n",
- " 'ccc_spearman_high_and_pearson_low',\n",
- " 'pearson_high_and_ccc_low',\n",
- " 'pearson_high_and_ccc_spearman_low',\n",
- " 'selected_in_manuscript']"
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"sorted(gene_pair_samples.keys())"
]
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": null,
"id": "c60378f6-3f87-49d4-8b86-cf3ec30fc545",
"metadata": {
- "execution": {
- "iopub.execute_input": "2023-09-12T22:59:49.966462Z",
- "iopub.status.busy": "2023-09-12T22:59:49.966348Z",
- "iopub.status.idle": "2023-09-12T22:59:49.970153Z",
- "shell.execute_reply": "2023-09-12T22:59:49.970022Z"
- },
"papermill": {
"duration": 0.006176,
"end_time": "2023-09-12T22:59:49.970553",
@@ -847,167 +409,17 @@
},
"tags": []
},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " | \n",
- " Pearson (high) | \n",
- " Pearson (low) | \n",
- " Spearman (high) | \n",
- " Spearman (low) | \n",
- " Clustermatch (high) | \n",
- " Clustermatch (low) | \n",
- " ccc | \n",
- " pearson | \n",
- " spearman | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " ENSG00000052749.13 | \n",
- " ENSG00000165025.14 | \n",
- " True | \n",
- " False | \n",
- " True | \n",
- " False | \n",
- " True | \n",
- " False | \n",
- " 0.362340 | \n",
- " 0.709449 | \n",
- " 0.795566 | \n",
- "
\n",
- " \n",
- " ENSG00000102897.9 | \n",
- " ENSG00000086544.2 | \n",
- " True | \n",
- " False | \n",
- " True | \n",
- " False | \n",
- " True | \n",
- " False | \n",
- " 0.429092 | \n",
- " 0.698537 | \n",
- " 0.822212 | \n",
- "
\n",
- " \n",
- " ENSG00000110628.13 | \n",
- " ENSG00000267078.1 | \n",
- " True | \n",
- " False | \n",
- " True | \n",
- " False | \n",
- " True | \n",
- " False | \n",
- " 0.230143 | \n",
- " 0.509499 | \n",
- " 0.632816 | \n",
- "
\n",
- " \n",
- " ENSG00000169554.18 | \n",
- " ENSG00000132424.14 | \n",
- " True | \n",
- " False | \n",
- " True | \n",
- " False | \n",
- " True | \n",
- " False | \n",
- " 0.509012 | \n",
- " 0.773762 | \n",
- " 0.878352 | \n",
- "
\n",
- " \n",
- " ENSG00000143933.16 | \n",
- " ENSG00000135378.3 | \n",
- " True | \n",
- " False | \n",
- " True | \n",
- " False | \n",
- " True | \n",
- " False | \n",
- " 0.471842 | \n",
- " 0.531121 | \n",
- " 0.819382 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " Pearson (high) Pearson (low) \\\n",
- "ENSG00000052749.13 ENSG00000165025.14 True False \n",
- "ENSG00000102897.9 ENSG00000086544.2 True False \n",
- "ENSG00000110628.13 ENSG00000267078.1 True False \n",
- "ENSG00000169554.18 ENSG00000132424.14 True False \n",
- "ENSG00000143933.16 ENSG00000135378.3 True False \n",
- "\n",
- " Spearman (high) Spearman (low) \\\n",
- "ENSG00000052749.13 ENSG00000165025.14 True False \n",
- "ENSG00000102897.9 ENSG00000086544.2 True False \n",
- "ENSG00000110628.13 ENSG00000267078.1 True False \n",
- "ENSG00000169554.18 ENSG00000132424.14 True False \n",
- "ENSG00000143933.16 ENSG00000135378.3 True False \n",
- "\n",
- " Clustermatch (high) \\\n",
- "ENSG00000052749.13 ENSG00000165025.14 True \n",
- "ENSG00000102897.9 ENSG00000086544.2 True \n",
- "ENSG00000110628.13 ENSG00000267078.1 True \n",
- "ENSG00000169554.18 ENSG00000132424.14 True \n",
- "ENSG00000143933.16 ENSG00000135378.3 True \n",
- "\n",
- " Clustermatch (low) ccc pearson \\\n",
- "ENSG00000052749.13 ENSG00000165025.14 False 0.362340 0.709449 \n",
- "ENSG00000102897.9 ENSG00000086544.2 False 0.429092 0.698537 \n",
- "ENSG00000110628.13 ENSG00000267078.1 False 0.230143 0.509499 \n",
- "ENSG00000169554.18 ENSG00000132424.14 False 0.509012 0.773762 \n",
- "ENSG00000143933.16 ENSG00000135378.3 False 0.471842 0.531121 \n",
- "\n",
- " spearman \n",
- "ENSG00000052749.13 ENSG00000165025.14 0.795566 \n",
- "ENSG00000102897.9 ENSG00000086544.2 0.822212 \n",
- "ENSG00000110628.13 ENSG00000267078.1 0.632816 \n",
- "ENSG00000169554.18 ENSG00000132424.14 0.878352 \n",
- "ENSG00000143933.16 ENSG00000135378.3 0.819382 "
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "gene_pair_samples[\"all_high\"].head()"
+ "_k = list(gene_pair_samples.keys())[0]\n",
+ "gene_pair_samples[_k].head()"
]
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": null,
"id": "6ccae66e-e276-43c3-809c-512aa0fe795b",
"metadata": {
- "execution": {
- "iopub.execute_input": "2023-09-12T22:59:49.974698Z",
- "iopub.status.busy": "2023-09-12T22:59:49.974579Z",
- "iopub.status.idle": "2023-09-12T22:59:49.976390Z",
- "shell.execute_reply": "2023-09-12T22:59:49.976266Z"
- },
"papermill": {
"duration": 0.00426,
"end_time": "2023-09-12T22:59:49.976769",
@@ -1017,29 +429,9 @@
},
"tags": []
},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[('ENSG00000052749.13', 'ENSG00000165025.14'),\n",
- " ('ENSG00000102897.9', 'ENSG00000086544.2'),\n",
- " ('ENSG00000110628.13', 'ENSG00000267078.1'),\n",
- " ('ENSG00000169554.18', 'ENSG00000132424.14'),\n",
- " ('ENSG00000143933.16', 'ENSG00000135378.3'),\n",
- " ('ENSG00000170776.21', 'ENSG00000155903.11'),\n",
- " ('ENSG00000136111.12', 'ENSG00000065911.11'),\n",
- " ('ENSG00000131042.14', 'ENSG00000141367.11'),\n",
- " ('ENSG00000160703.15', 'ENSG00000231964.1'),\n",
- " ('ENSG00000008394.12', 'ENSG00000101347.8')]"
- ]
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "[i for i in gene_pair_samples[\"all_high\"].head(10).index]"
+ "[i for i in gene_pair_samples[_k].head(10).index]"
]
},
{
@@ -1061,15 +453,9 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": null,
"id": "62d8632e-13e0-4a78-ad30-26770172d21e",
"metadata": {
- "execution": {
- "iopub.execute_input": "2023-09-12T22:59:49.984650Z",
- "iopub.status.busy": "2023-09-12T22:59:49.984567Z",
- "iopub.status.idle": "2023-09-12T22:59:49.985738Z",
- "shell.execute_reply": "2023-09-12T22:59:49.985617Z"
- },
"papermill": {
"duration": 0.003625,
"end_time": "2023-09-12T22:59:49.986138",
@@ -1086,15 +472,9 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": null,
"id": "c8a85ce0-4c5a-4ed9-8ad6-24b21fb10b1e",
"metadata": {
- "execution": {
- "iopub.execute_input": "2023-09-12T22:59:49.990241Z",
- "iopub.status.busy": "2023-09-12T22:59:49.990158Z",
- "iopub.status.idle": "2023-09-12T22:59:49.991649Z",
- "shell.execute_reply": "2023-09-12T22:59:49.991513Z"
- },
"papermill": {
"duration": 0.00395,
"end_time": "2023-09-12T22:59:49.992041",
@@ -1107,7 +487,7 @@
"outputs": [],
"source": [
"def corr_single(x, y):\n",
- " ccc_val, ccc_pval = ccc(x, y, pvalue_n_perms=PVALUE_N_PERMS, n_jobs=1)\n",
+ " ccc_val, ccc_pval = ccc(x, y, pvalue_n_perms=PVALUE_N_PERMS, n_jobs=conf.GENERAL[\"N_JOBS\"])\n",
" p_val, p_pval = stats.pearsonr(x, y)\n",
" s_val, s_pval = stats.spearmanr(x, y)\n",
"\n",
@@ -1116,15 +496,9 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": null,
"id": "d9838801-1f01-4316-8e29-ffedbdc2a67a",
"metadata": {
- "execution": {
- "iopub.execute_input": "2023-09-12T22:59:49.996242Z",
- "iopub.status.busy": "2023-09-12T22:59:49.996132Z",
- "iopub.status.idle": "2023-09-13T03:38:36.170632Z",
- "shell.execute_reply": "2023-09-13T03:38:36.170157Z"
- },
"papermill": {
"duration": 16726.17747,
"end_time": "2023-09-13T03:38:36.171453",
@@ -1138,7 +512,10 @@
"source": [
"results = []\n",
"\n",
- "with ProcessPoolExecutor(max_workers=conf.GENERAL[\"N_JOBS\"]) as executor:\n",
+ "# I leave the ProcessPoolExecutor here in case I want to easily switch between\n",
+ "# parallelizing across gene pairs (max_workers=conf.GENERAL[\"N_JOBS\"] and n_jobs=1 inside function corr_single)\n",
+ "# or across permutations for one gene pair (max_workers=1 and n_jobs=conf.GENERAL[\"N_JOBS\"])\n",
+ "with ProcessPoolExecutor(max_workers=1) as executor:\n",
" tasks = {\n",
" executor.submit(corr_single, data.loc[gene0], data.loc[gene1]): (\n",
" gene0,\n",
@@ -1149,7 +526,7 @@
" for gene0, gene1 in gene_pair_samples[k].index\n",
" }\n",
"\n",
- " for t_idx, t in enumerate(as_completed(tasks)):\n",
+ " for t_idx, t in tqdm(enumerate(as_completed(tasks)), total=len(tasks), ncols=100):\n",
" gene0, gene1, k = tasks[t]\n",
" ccc_val, ccc_pval, p_val, p_pval, s_val, s_pval = t.result()\n",
"\n",
@@ -1167,23 +544,17 @@
" }\n",
" )\n",
"\n",
- " if t_idx % 10:\n",
- " _df = pd.DataFrame(results)\n",
- " _df[\"group\"] = _df[\"group\"].astype(\"category\")\n",
- " _df.to_pickle(output_file)"
+ " # save\n",
+ " _df = pd.DataFrame(results)\n",
+ " _df[\"group\"] = _df[\"group\"].astype(\"category\")\n",
+ " _df.to_pickle(output_file)"
]
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": null,
"id": "6f32ad1a-3b2f-4e08-8a53-35cfb68e3970",
"metadata": {
- "execution": {
- "iopub.execute_input": "2023-09-13T03:38:36.176634Z",
- "iopub.status.busy": "2023-09-13T03:38:36.176542Z",
- "iopub.status.idle": "2023-09-13T03:38:36.178678Z",
- "shell.execute_reply": "2023-09-13T03:38:36.178505Z"
- },
"papermill": {
"duration": 0.005138,
"end_time": "2023-09-13T03:38:36.179138",
@@ -1193,33 +564,16 @@
},
"tags": []
},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "644"
- ]
- },
- "execution_count": 19,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"len(results)"
]
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": null,
"id": "e68a65a5-8bba-4a79-a740-26d722dc670e",
"metadata": {
- "execution": {
- "iopub.execute_input": "2023-09-13T03:38:36.183638Z",
- "iopub.status.busy": "2023-09-13T03:38:36.183514Z",
- "iopub.status.idle": "2023-09-13T03:38:36.186504Z",
- "shell.execute_reply": "2023-09-13T03:38:36.186287Z"
- },
"papermill": {
"duration": 0.005853,
"end_time": "2023-09-13T03:38:36.187014",
@@ -1237,15 +591,9 @@
},
{
"cell_type": "code",
- "execution_count": 21,
+ "execution_count": null,
"id": "9514ebb1-f1c1-46d9-96b6-a2264e3a6b4b",
"metadata": {
- "execution": {
- "iopub.execute_input": "2023-09-13T03:38:36.192047Z",
- "iopub.status.busy": "2023-09-13T03:38:36.191910Z",
- "iopub.status.idle": "2023-09-13T03:38:36.193682Z",
- "shell.execute_reply": "2023-09-13T03:38:36.193507Z"
- },
"papermill": {
"duration": 0.004782,
"end_time": "2023-09-13T03:38:36.194305",
@@ -1255,33 +603,16 @@
},
"tags": []
},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(644, 9)"
- ]
- },
- "execution_count": 21,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"results_df.shape"
]
},
{
"cell_type": "code",
- "execution_count": 22,
+ "execution_count": null,
"id": "6110dd19-95e0-4400-847a-424a498fa63d",
"metadata": {
- "execution": {
- "iopub.execute_input": "2023-09-13T03:38:36.198506Z",
- "iopub.status.busy": "2023-09-13T03:38:36.198414Z",
- "iopub.status.idle": "2023-09-13T03:38:36.203136Z",
- "shell.execute_reply": "2023-09-13T03:38:36.202842Z"
- },
"papermill": {
"duration": 0.007371,
"end_time": "2023-09-13T03:38:36.203668",
@@ -1291,125 +622,7 @@
},
"tags": []
},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " gene0 | \n",
- " gene1 | \n",
- " group | \n",
- " ccc | \n",
- " ccc_pvalue | \n",
- " pearson | \n",
- " pearson_pvalue | \n",
- " spearman | \n",
- " spearman_pvalue | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " ENSG00000169554.18 | \n",
- " ENSG00000132424.14 | \n",
- " all_high | \n",
- " 0.509012 | \n",
- " 9.999990e-07 | \n",
- " 0.773762 | \n",
- " 1.893487e-151 | \n",
- " 0.878352 | \n",
- " 1.374455e-243 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " ENSG00000136111.12 | \n",
- " ENSG00000065911.11 | \n",
- " all_high | \n",
- " 0.230143 | \n",
- " 9.999990e-07 | \n",
- " 0.558282 | \n",
- " 4.403216e-63 | \n",
- " 0.656130 | \n",
- " 3.863872e-94 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " ENSG00000170776.21 | \n",
- " ENSG00000155903.11 | \n",
- " all_high | \n",
- " 0.324987 | \n",
- " 9.999990e-07 | \n",
- " 0.751337 | \n",
- " 4.609357e-138 | \n",
- " 0.769746 | \n",
- " 6.110239e-149 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " ENSG00000143933.16 | \n",
- " ENSG00000135378.3 | \n",
- " all_high | \n",
- " 0.471842 | \n",
- " 9.999990e-07 | \n",
- " 0.531121 | \n",
- " 3.525528e-56 | \n",
- " 0.819382 | \n",
- " 3.815707e-184 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " ENSG00000160703.15 | \n",
- " ENSG00000231964.1 | \n",
- " all_high | \n",
- " 0.318958 | \n",
- " 9.999990e-07 | \n",
- " 0.589205 | \n",
- " 9.250622e-72 | \n",
- " 0.702882 | \n",
- " 1.639640e-113 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " gene0 gene1 group ccc ccc_pvalue \\\n",
- "0 ENSG00000169554.18 ENSG00000132424.14 all_high 0.509012 9.999990e-07 \n",
- "1 ENSG00000136111.12 ENSG00000065911.11 all_high 0.230143 9.999990e-07 \n",
- "2 ENSG00000170776.21 ENSG00000155903.11 all_high 0.324987 9.999990e-07 \n",
- "3 ENSG00000143933.16 ENSG00000135378.3 all_high 0.471842 9.999990e-07 \n",
- "4 ENSG00000160703.15 ENSG00000231964.1 all_high 0.318958 9.999990e-07 \n",
- "\n",
- " pearson pearson_pvalue spearman spearman_pvalue \n",
- "0 0.773762 1.893487e-151 0.878352 1.374455e-243 \n",
- "1 0.558282 4.403216e-63 0.656130 3.863872e-94 \n",
- "2 0.751337 4.609357e-138 0.769746 6.110239e-149 \n",
- "3 0.531121 3.525528e-56 0.819382 3.815707e-184 \n",
- "4 0.589205 9.250622e-72 0.702882 1.639640e-113 "
- ]
- },
- "execution_count": 22,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"results_df.head()"
]
@@ -1433,15 +646,9 @@
},
{
"cell_type": "code",
- "execution_count": 23,
+ "execution_count": null,
"id": "bb8e28d4-3adf-4d6a-a94e-81b6763ebd61",
"metadata": {
- "execution": {
- "iopub.execute_input": "2023-09-13T03:38:36.212308Z",
- "iopub.status.busy": "2023-09-13T03:38:36.212205Z",
- "iopub.status.idle": "2023-09-13T03:38:36.214108Z",
- "shell.execute_reply": "2023-09-13T03:38:36.213850Z"
- },
"papermill": {
"duration": 0.004663,
"end_time": "2023-09-13T03:38:36.214516",
@@ -1477,7 +684,12 @@
"metadata": {
"jupytext": {
"cell_metadata_filter": "all,-execution,-papermill,-trusted",
- "notebook_metadata_filter": "-jupytext.text_representation.jupytext_version"
+ "notebook_metadata_filter": "-jupytext.text_representation.jupytext_version",
+ "text_representation": {
+ "extension": ".py",
+ "format_name": "percent",
+ "format_version": "1.3"
+ }
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
diff --git a/nbs/25_pvalue/py/15-compute_pvalues_from_samples.py b/nbs/25_pvalue/py/15-compute_pvalues_from_samples.py
index 6de900b3..5a27d599 100644
--- a/nbs/25_pvalue/py/15-compute_pvalues_from_samples.py
+++ b/nbs/25_pvalue/py/15-compute_pvalues_from_samples.py
@@ -17,7 +17,7 @@
# # Description
# %% [markdown] tags=[]
-# TODO
+# Reads the gene pair samples across different categories and computes their p-values.
# %% [markdown] tags=[]
# # Modules loading
@@ -30,6 +30,7 @@
import numpy as np
import pandas as pd
from concurrent.futures import as_completed, ProcessPoolExecutor
+from tqdm import tqdm
from ccc.coef import ccc
from ccc import conf
@@ -42,7 +43,7 @@
GTEX_TISSUE = "whole_blood"
GENE_SEL_STRATEGY = "var_pc_log2"
-PVALUE_N_PERMS = 1000000
+PVALUE_N_PERMS = 10000000
RANDOM_STATE = np.random.RandomState(0)
@@ -102,10 +103,11 @@
sorted(gene_pair_samples.keys())
# %% tags=[]
-gene_pair_samples["all_high"].head()
+_k = list(gene_pair_samples.keys())[0]
+gene_pair_samples[_k].head()
# %% tags=[]
-[i for i in gene_pair_samples["all_high"].head(10).index]
+[i for i in gene_pair_samples[_k].head(10).index]
# %% [markdown] tags=[]
# # Compute pvalues on sampled gene pairs
@@ -116,7 +118,7 @@
# %% tags=[]
def corr_single(x, y):
- ccc_val, ccc_pval = ccc(x, y, pvalue_n_perms=PVALUE_N_PERMS, n_jobs=1)
+ ccc_val, ccc_pval = ccc(x, y, pvalue_n_perms=PVALUE_N_PERMS, n_jobs=conf.GENERAL["N_JOBS"])
p_val, p_pval = stats.pearsonr(x, y)
s_val, s_pval = stats.spearmanr(x, y)
@@ -126,7 +128,10 @@ def corr_single(x, y):
# %% tags=[]
results = []
-with ProcessPoolExecutor(max_workers=conf.GENERAL["N_JOBS"]) as executor:
+# I leave the ProcessPoolExecutor here in case I want to easily switch between
+# parallelizing across gene pairs (max_workers=conf.GENERAL["N_JOBS"] and n_jobs=1 inside function corr_single)
+# or across permutations for one gene pair (max_workers=1 and n_jobs=conf.GENERAL["N_JOBS"])
+with ProcessPoolExecutor(max_workers=1) as executor:
tasks = {
executor.submit(corr_single, data.loc[gene0], data.loc[gene1]): (
gene0,
@@ -137,7 +142,7 @@ def corr_single(x, y):
for gene0, gene1 in gene_pair_samples[k].index
}
- for t_idx, t in enumerate(as_completed(tasks)):
+ for t_idx, t in tqdm(enumerate(as_completed(tasks)), total=len(tasks), ncols=100):
gene0, gene1, k = tasks[t]
ccc_val, ccc_pval, p_val, p_pval, s_val, s_pval = t.result()
@@ -155,10 +160,10 @@ def corr_single(x, y):
}
)
- if t_idx % 10:
- _df = pd.DataFrame(results)
- _df["group"] = _df["group"].astype("category")
- _df.to_pickle(output_file)
+ # save
+ _df = pd.DataFrame(results)
+ _df["group"] = _df["group"].astype("category")
+ _df.to_pickle(output_file)
# %% tags=[]
len(results)