diff --git a/api_examples.ipynb b/api_examples.ipynb deleted file mode 100644 index a8913557f..000000000 --- a/api_examples.ipynb +++ /dev/null @@ -1,1247 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/vol/tmp/users/jnourisa/ipykernel_1124583/2193138187.py:1: DeprecationWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html\n", - " from pkg_resources import resource_filename\n", - "/home/jnourisa/miniconda3/envs/py10/lib/python3.10/site-packages/tqdm/autonotebook.py:19: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " warn(WARN_NOIPYW, TqdmWarning, stacklevel=2)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/home/jnourisa/miniconda3/envs/py10/lib/python3.10/site-packages/data/examples\n" - ] - } - ], - "source": [ - "!gimme motifs my_peaks.bed my_motifs -g /data/genomes/hg38/hg38.fa" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pkg_resources import resource_filename\n", - "data_dir = resource_filename(\"gimmemotifs\", \"../data/examples\")\n", - "\n", - "%cd {data_dir}" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "example.pfm\t\t\t MA0099.3.jaspar\n", - "Gm12878.CTCF.top500.w200.bed\t NRF1.bed\n", - "Gm12878.CTCF.top500.w200.fa\t TAp73alpha.bed\n", - "hg19.blood.most_variable.10k.txt TAp73alpha.fa\n", - "hg19.blood.most_variable.1k.txt test.small.fa\n" - ] - } - ], - "source": [ - "!ls " - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "AP1_ATGAsTCAy\n", - 
"CTCF_syGCCmyCTrGTGG\n" - ] - } - ], - "source": [ - "from gimmemotifs.motif import Motif,read_motifs\n", - "\n", - "# Read from file\n", - "motifs = read_motifs(\"example.pfm\")\n", - "\n", - "for motif in motifs:\n", - " print(motif)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'AP1': AP1_ATGAsTCAy, 'CTCF': CTCF_syGCCmyCTrGTGG}\n" - ] - } - ], - "source": [ - "# Read from file to a dictionary\n", - "motifs = read_motifs(\"example.pfm\", as_dict=True)\n", - "print(motifs)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "579\n", - ">MA0004.1_Arnt\n", - "0.2000\t0.7999\t0.0000\t0.0000\n", - "0.9499\t0.0000\t0.0500\t0.0000\n", - "0.0000\t0.9999\t0.0000\t0.0000\n", - "0.0000\t0.0000\t0.9999\t0.0000\n", - "0.0000\t0.0000\t0.0000\t0.9999\n", - "0.0000\t0.0000\t0.9999\t0.0000\n" - ] - } - ], - "source": [ - "# Read any motif database included with gimmemotifs by name\n", - "motifs = read_motifs(\"JASPAR2018_vertebrates\")\n", - "print(len(motifs))\n", - "print(motifs[0].to_ppm())" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CpG_CG\n" - ] - } - ], - "source": [ - "# Create from scratch\n", - "m = Motif([[0,1,0,0],[0,0,1,0]])\n", - "m.id = \"CpG\"\n", - "print(m)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - ">TGASTCA\n", - "0.0001\t0.0001\t0.0001\t0.9998\n", - "0.0001\t0.0001\t0.9998\t0.0001\n", - "0.9998\t0.0001\t0.0001\t0.0001\n", - "0.0001\t0.4999\t0.4999\t0.0001\n", - "0.0001\t0.0001\t0.0001\t0.9998\n", - "0.0001\t0.9998\t0.0001\t0.0001\n", - "0.9998\t0.0001\t0.0001\t0.0001\n" - ] - } - ], - "source": [ - "# Or from a 
consensus sequence\n", - "from gimmemotifs.motif import motif_from_consensus\n", - "ap1 = motif_from_consensus(\"TGASTCA\")\n", - "print(ap1.to_ppm())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Read motifs from files in other formats.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "MA0099.3\tFOS::JUN_ATGAGTCAyn\n" - ] - } - ], - "source": [ - "with open(\"MA0099.3.jaspar\") as f:\n", - " motifs = read_motifs(f, fmt=\"jaspar\")\n", - "print(motifs[0])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can convert a motif to several formats.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - ">AP1\n", - "0.5558\t0.1469\t0.2734\t0.0240\n", - "0.0020\t0.0015\t0.0017\t0.9948\n", - "0.0039\t0.0019\t0.9502\t0.0439\n", - "0.9697\t0.0220\t0.0018\t0.0065\n", - "0.0377\t0.3311\t0.6030\t0.0283\n", - "0.0033\t0.0031\t0.0043\t0.9893\n", - "0.0188\t0.9775\t0.0023\t0.0014\n", - "0.9951\t0.0021\t0.0012\t0.0015\n", - "0.0121\t0.3096\t0.1221\t0.5561\n" - ] - } - ], - "source": [ - "with open(\"example.pfm\") as f:\n", - " motifs = read_motifs(f)\n", - "\n", - "# pwm\n", - "print(motifs[0].to_ppm())" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - ">AP1\n", - "555.8\t146.9\t273.4\t24.0\n", - "2.0\t1.5\t1.7\t994.8000000000001\n", - "3.9\t1.9\t950.2\t43.9\n", - "969.7\t22.0\t1.8\t6.5\n", - "37.699999999999996\t331.1\t603.0\t28.299999999999997\n", - "3.3\t3.1\t4.3\t989.3\n", - "18.8\t977.5\t2.3\t1.4\n", - "995.1\t2.1\t1.2\t1.5\n", - "12.1\t309.59999999999997\t122.1\t556.1\n" - ] - } - ], - "source": [ - "# pfm\n", - "print(motifs[0].to_pfm())" - ] - }, - { - "cell_type": "code", - "execution_count": 
10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ATGAsTCAy\n" - ] - } - ], - "source": [ - "# consensus sequence\n", - "print(motifs[0].to_consensus())" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "DE\tAP1\tunknown\n", - "0\t555\t146\t273\t24\tA\n", - "1\t2\t1\t1\t994\tT\n", - "2\t3\t1\t950\t43\tG\n", - "3\t969\t22\t1\t6\tA\n", - "4\t37\t331\t603\t28\ts\n", - "5\t3\t3\t4\t989\tT\n", - "6\t18\t977\t2\t1\tC\n", - "7\t995\t2\t1\t1\tA\n", - "8\t12\t309\t122\t556\ty\n", - "XX\n" - ] - } - ], - "source": [ - "# TRANSFAC\n", - "print(motifs[0].to_transfac())" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "MOTIF AP1\n", - "BL MOTIF AP1 width=0 seqs=0\n", - "letter-probability matrix: alength= 4 w= 9 nsites= 1000.1 E= 0\n", - "0.5558\t0.1469\t0.2734\t0.024\n", - "0.002\t0.0015\t0.0017\t0.9948\n", - "0.0039\t0.0019\t0.9502\t0.0439\n", - "0.9697\t0.022\t0.0018\t0.0065\n", - "0.0377\t0.3311\t0.603\t0.0283\n", - "0.0033\t0.0031\t0.0043\t0.9893\n", - "0.0188\t0.9775\t0.0023\t0.0014\n", - "0.9951\t0.0021\t0.0012\t0.0015\n", - "0.0121\t0.3096\t0.1221\t0.5561\n" - ] - } - ], - "source": [ - "# MEME\n", - "print(motifs[0].to_meme())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Some other useful tidbits." 
- ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "9\n" - ] - } - ], - "source": [ - "m = motif_from_consensus(\"NTGASTCAN\")\n", - "print(len(m))" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "TGAsTCA 7\n" - ] - } - ], - "source": [ - "# Trim by information content\n", - "m.trim(0.5)\n", - "print(m.to_consensus(), len(m))" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "TGA\n" - ] - } - ], - "source": [ - "# Slices\n", - "print(m[:3].to_consensus())" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "random_sGTAnATGn\n" - ] - } - ], - "source": [ - "# Shuffle\n", - "random_motif = motif_from_consensus(\"NTGASTGAN\").randomize()\n", - "print(random_motif)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To convert a motif to an image, use `plot_logo()`. Supported formats are png, ps and pdf." 
- ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "m = motif_from_consensus(\"NTGASTCAN\")\n", - "m.plot_logo(\"ap1.png\", fmt=\"png\")" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAVQAAADjCAIAAABVSHubAAAJMmlDQ1BkZWZhdWx0X3JnYi5pY2MAAEiJlZVnUJNZF8fv8zzphUASQodQQ5EqJYCUEFoo0quoQOidUEVsiLgCK4qINEWQRQEXXJUia0UUC4uCAhZ0gywCyrpxFVFBWXDfGZ33HT+8/5l7z2/+c+bec8/5cAEgiINlwct7YlK6wNvJjhkYFMwE3yiMn5bC8fR0A9/VuxEArcR7ut/P+a4IEZFp/OW4uLxy+SmCdACg7GXWzEpPWeGjy0wPj//CZ1dYsFzgMt9Y4eh/eexLzr8s+pLj681dfhUKABwp+hsO/4b/c++KVDiC9NioyGymT3JUelaYIJKZttIJHpfL9BQkR8UmRH5T8P+V/B2lR2anr0RucsomQWx0TDrzfw41MjA0BF9n8cbrS48hRv9/z2dFX73kegDYcwAg+7564ZUAdO4CQPrRV09tua+UfAA67vAzBJn/eqiVDQ0IgALoQAYoAlWgCXSBETADlsAWOAAX4AF8QRDYAPggBiQCAcgCuWAHKABFYB84CKpALWgATaAVnAad4Dy4Aq6D2+AuGAaPgRBMgpdABN6BBQiCsBAZokEykBKkDulARhAbsoYcIDfIGwqCQqFoKAnKgHKhnVARVApVQXVQE/QLdA66At2EBqGH0Dg0A/0NfYQRmATTYQVYA9aH2TAHdoV94fVwNJwK58D58F64Aq6HT8Id8BX4NjwMC+GX8BwCECLCQJQRXYSNcBEPJBiJQgTIVqQQKUfqkVakG+lD7iFCZBb5gMKgaCgmShdliXJG+aH4qFTUVlQxqgp1AtWB6kXdQ42jRKjPaDJaHq2DtkDz0IHoaHQWugBdjm5Et6OvoYfRk+h3GAyGgWFhzDDOmCBMHGYzphhzGNOGuYwZxExg5rBYrAxWB2uF9cCGYdOxBdhK7EnsJewQdhL7HkfEKeGMcI64YFwSLg9XjmvGXcQN4aZwC3hxvDreAu+Bj8BvwpfgG/Dd+Dv4SfwCQYLAIlgRfAlxhB2ECkIr4RphjPCGSCSqEM2JXsRY4nZiBfEU8QZxnPiBRCVpk7ikEFIGaS/pOOky6SHpDZlM1iDbkoPJ6eS95CbyVfJT8nsxmpieGE8sQmybWLVYh9iQ2CsKnqJO4VA2UHIo5ZQzlDuUWXG8uIY4VzxMfKt4tfg58VHxOQmahKGEh0SiRLFEs8RNiWkqlqpBdaBGUPOpx6hXqRM0hKZK49L4tJ20Bto12iQdQ2fRefQ4ehH9Z/oAXSRJlTSW9JfMlqyWvCApZCAMDQaPkcAoYZxmjDA+SilIcaQipfZItUoNSc1Ly0nbSkdKF0q3SQ9Lf5RhyjjIxMvsl+mUeSKLktWW9ZLNkj0ie012Vo4uZynHlyuUOy33SB6W15b3lt8sf0y+X35OQVHBSSFFoVLhqsKsIkPRVjFOsUzxouKMEk3JWilWqUzpktILpiSTw0xgVjB7mSJleWVn5QzlOuUB5QUVloqfSp5Km8oTVYIqWzVKtUy1R1WkpqTmrpar1qL2SB2vzlaPUT+k3qc+r8HSCNDYrdGpMc2SZvFYOawW1pgmWdNGM1WzXvO+FkaLrRWvdVjrrjasbaIdo12tfUcH1jHVidU5rDO4Cr3KfFXSqvpVo7okXY5upm6
L7rgeQ89NL0+vU++Vvpp+sP5+/T79zwYmBgkGDQaPDamGLoZ5ht2GfxtpG/GNqo3uryavdly9bXXX6tfGOsaRxkeMH5jQTNxNdpv0mHwyNTMVmLaazpipmYWa1ZiNsulsT3Yx+4Y52tzOfJv5efMPFqYW6RanLf6y1LWMt2y2nF7DWhO5pmHNhJWKVZhVnZXQmmkdan3UWmijbBNmU2/zzFbVNsK20XaKo8WJ45zkvLIzsBPYtdvNcy24W7iX7RF7J/tC+wEHqoOfQ5XDU0cVx2jHFkeRk4nTZqfLzmhnV+f9zqM8BR6f18QTuZi5bHHpdSW5+rhWuT5z03YTuHW7w+4u7gfcx9aqr01a2+kBPHgeBzyeeLI8Uz1/9cJ4eXpVez33NvTO9e7zofls9Gn2eedr51vi+9hP0y/Dr8ef4h/i3+Q/H2AfUBogDNQP3BJ4O0g2KDaoKxgb7B/cGDy3zmHdwXWTISYhBSEj61nrs9ff3CC7IWHDhY2UjWEbz4SiQwNCm0MXwzzC6sPmwnnhNeEiPpd/iP8ywjaiLGIm0iqyNHIqyiqqNGo62ir6QPRMjE1MecxsLDe2KvZ1nHNcbdx8vEf88filhICEtkRcYmjiuSRqUnxSb7JicnbyYIpOSkGKMNUi9WCqSOAqaEyD0tandaXTlz/F/gzNjF0Z45nWmdWZ77P8s85kS2QnZfdv0t60Z9NUjmPOT5tRm/mbe3KVc3fkjm/hbKnbCm0N39qzTXVb/rbJ7U7bT+wg7Ijf8VueQV5p3tudATu78xXyt+dP7HLa1VIgViAoGN1tubv2B9QPsT8M7Fm9p3LP58KIwltFBkXlRYvF/OJbPxr+WPHj0t6ovQMlpiVH9mH2Je0b2W+z/0SpRGlO6cQB9wMdZcyywrK3BzcevFluXF57iHAo45Cwwq2iq1Ktcl/lYlVM1XC1XXVbjXzNnpr5wxGHh47YHmmtVagtqv14NPbogzqnuo56jfryY5hjmceeN/g39P3E/qmpUbaxqPHT8aTjwhPeJ3qbzJqamuWbS1rgloyWmZMhJ+/+bP9zV6tua10bo63oFDiVcerFL6G/jJx2Pd1zhn2m9az62Zp2WnthB9SxqUPUGdMp7ArqGjzncq6n27K7/Ve9X4+fVz5ffUHyQslFwsX8i0uXci7NXU65PHsl+spEz8aex1cDr97v9eoduOZ67cZ1x+tX+zh9l25Y3Th/0+LmuVvsW523TW939Jv0t/9m8lv7gOlAxx2zO113ze92D64ZvDhkM3Tlnv296/d5928Prx0eHPEbeTAaMip8EPFg+mHCw9ePMh8tPN4+hh4rfCL+pPyp/NP637V+bxOaCi+M24/3P/N59niCP/Hyj7Q/Fifzn5Ofl08pTTVNG02fn3Gcufti3YvJlykvF2YL/pT4s+aV5quzf9n+1S8KFE2+Frxe+rv4jcyb42+N3/bMec49fZf4bmG+8L3M+xMf2B/6PgZ8nFrIWsQuVnzS+tT92fXz2FLi0tI/QiyQvpTNDAsAAAAJcEhZcwAADsQAAA7EAZUrDhsAAAAddEVYdFNvZnR3YXJlAEdQTCBHaG9zdHNjcmlwdCA5LjIyX/2qrgAAIABJREFUeJztnd9v21aWx483iTOwkUQybMxkWqSQtBlMH1osRAOdh1kMdkm9DfZJ8n8g6T+Q/CdY+g8k/weSH/dNFHaA7cMWEPWSBVpsRhQatM0UMUQmgQ3ESeB9uC5Li5fk5Y97xR/ng6KoKZK6PeKX955zzz136/r6GhAEyR//tOkGIAiyGVD8CJJTUPwIklNQ/AiSU1D8CJJTUPwIklPubuRbdV0fj8er1apSqTQajWKxuJFmIEie2RI/zz8cDtvttvVnoVBQVVWSJMHNQJCcI3rYbxgGUX6z2ZxMJtVq1TTNRqMhuBkIgoju+VVVrdVqpVJJ13UA0HW9UqkAwGw2w84fQUQi2ucvFouyLFs6L5fLghuAIAhhAz6/nV6vd3x8bA0
EEAQRxmai/QSifAAYDocAsLW15XEyLkBCkHjZjPgNw2i1WmdnZxjqjxHtQjM+GmsHpR2peBdnUqOSSdtuYNivaZqiKKZpyrI8Ho99J/nJiAB7fl/Kz8rLq+XawcnTifJQ2Uh7skQmbbuBnr/RaJimWa/Xx+Ox+G+380O3+3O/H+89n04mDxW2B+JChxcDAICdCnzWiv7Vzqdzg2zWtrr+bjB4BQCVyv1W6yD6VyfKtnEhWvzj8Xi5XALA2dmZ3cmfTCYKo2aywfdDePZrphO8GMBXKmyneAyZHIbDV+32C+vPweBcVZ8Wi5sMbyUT0Uk+s9lM8DcmkTXlA8DrOXwtwdW6Vxkd/V2+plHWlA8A8/mlojw3jA+xf1fabSta/L1e75pGjrp9U4PvupTjl0uYh890dHsQF+8Woe+ZOjTtYk35hPn8stX6PvRts2pbXNUnnO+68N6kf3Q+hZ9CxkHS3gvFQrf7o9tHZ2emqr4Jd9us2hbFL5ZXKpxPvU6gDgoQBlT1zXT61uOEKJ1/JkHxi2Vx4nPC5RK+HwppStY4OfmH9wnL5dV4HH9UJb2g+AVyoft0+4SXoxD3dqagELRLLcTdUoemXXh3+wQy/xeUrNoWxS8QMqvvy/kULgI7mbOLXE+jjEZMXfp0+lbX3wW9eVZti+IXyEvmYB77mQgAAAyH54xn4sjfAsUvClODS+YssVAj/9wyHhum+ZHxZMYxQh5A8YsikJ5fz0OM/KlM3zJEGVLObHbBfvJ8fhli5E8l7bZF8YviXOV6ftqDT1EIOpIPOuGfVdui+IVwocPrebBLzid8mpI1dP3dcnkV6JLJJGS2T8ZA8QshaLcf7pJcEiJvT1X9JwXzAIpfCK+DzxW9N8GMZ7Spvsnye2Q2uwx6iWl+1LQAYQIPUm1bFL8QXoeScZCr0h58Ck24jP1AnX9WbYvi58+VEdjhJ4QYL+QMw/gQ1OEnLBbxBPxTDYqfP+G6/SgX5obQ3rumBXYWsgeKnz+h4/bhxgsOsjpTBQFn+O3M5/GIP9W2RfHz5zJCus4rpniSduH1CK4+rMI3INlE6cAZgwUZti2Knz9RRu9sLw63ZWeZJ0quHuOLI8O2RfHzhz2ln3JtugtFcSV0tI+wWsVf1S9doPg5wzZudyWOmF9W+66IQbtYYn6pti2KnzNRHH7my73DTlktQSdG/Bm2LYqfMxHH7WwuQ6rDTqGJOG5nXAWcYdui+DkTfdweU5Jv9og+bg9dzzcboPg58z6yTxj5Dqn2Sz0wDNYCHhzvkGbbovg5452oU+nAVxN4XPe8g3/P7/0Izi/jSRZKGtETdVhyhDJsWxQ/T7yr8XwxgM97cKCANPbS/3t/nzPVYadw+M7wFwp3ZPmB9zksPX+GbZvr3QsP2u2HtZrbp5ea9uPxsfP404lXuu6OJNlu4f7c7Mu3dub9Yw9enrm0I5UPH2/beou/Wt0hm3Nq2oWiPHeL7cVVzyul5Fr898vl++Vy0KtYd+AGT91Wbj/6u2V40oQXp5Qzo0cNNgFv23rotlC4Mx6Xyba8krQ7HpdrtefUM6P7/KkGh/08cZvn2ynBgeMpf9KmncrU8/sOTVNdc4KKx5rcVmu/XL5v/akoD93G/yxRgwzbFsXPE7dO+zFtN96CBDslynGGqf7lVYQM4nTi0Wm32we+R2z38UkWyLBtUfw8ceu0Hx/Rj++7DHqvUjny54rbsL9eL9i7fUKjUSwU7lDPz/PCfhQ/T6g9/70CFCTKcQDYd4mQYVUPB249f632kHq80SjybE4qQfHzhDrJ79a9A8Ajl5eCJ8YH/3HB5E3WCoG7ueuKQhe/20vBu+fPtm1R/MLZcY+B75Zd3H6vmJNz5cl/PPqPMA1LP6XStnPMT1AUeszPe4FAtm2L4ueG22Jet7E9gdr5B1wd9Jut3wQ6P3W4Fd526/YBoFi8WyptR//qLNkWxS8c77G9x7gA+QU3h79SoXf7BEnacR7EgB/CAWq0b6c
E256Rp0eHPNqSsRxVN/FT5W3h5hFEJL22RfFzg1p137djp57gGe13Bpz+8Js/rB1J9eIzJ24LcjyG/eA3LqCSbdui+MXiG893mwUMwl8e/CX6TVKHr0tP7fmDpvdnybYofm7QJ/n3/C+8V4i9LXnAd1RPdQqilABNOyh+blDn51hm8kPN9nuTXr+UCjVK5+3wAwBZ6hM76bUtil8s4YL5Aef5nWQ4QT0QQWf7sm1bFD83qMP+XQbxO3v+gJX/lYdK4U7ufAe3HD47VNfAd22PnSzZFsXPDWduLzV7jw/STvy+Q3KYTkPuz0kl6FR/ZmyL4hcI45h/p8Ljy1nS1FON9zwfwTcuEI6U2hbFL5B7bAvLAsYFGDeKTfV+spsi27ZF8fOBWrqTT+qu+dG0/1naFudcJIdqlUuXnm3bovj5EHvVTeZ6HuX7ZevfmYS600axSK/Vscbh4W6Ur86YbVH8AmHM23eW94PA9TyKd9ZdjPRmocYI4zvC5yZZsS2KXyCMPn8Q2DNMZhe0tQZZIUokzy3an3nbovj5IKrYfnrTy5KDWz2PzNs213X7OUItv8Get/vX69DfnJlZaDeirMBnmQ70IGO2xZ5fIN4r+WPlcJdLXYAkQO2oI0byApEZ22LPn27chqbOoFRK56IZYY/kOTfwcFvnn3nbovhFwWeh7uJdsPJ+iKquV+NwI/O2xWG/KDgs1KWyd5ehZAASiozZFsXPh81ts5GxoBQjnJL2178lW7ZF8WcT5eF6plBK/VIn1Gg/p0IdVDJjWxR/umF/7NbS1BFfMm9bFL8oRPn8CMIIij9rZMwvTRQZsy2KP904l5QU797MQjurTWU1X5XTet7M2xbFzwdRuf3zS9pGwABA66bS+ICyEMtaPSeZty2Knw8BS24iiHhQ/EjKyPPWmvGC4heF987cMSE/kD0+TePQ1Ilp0nfp5E32bMtR/Lquq6pqGKkscpIK1Deqx6fOalOZT1aPkTzYNmbxD4fDYrGoqqqqqpVKpVar7e3tDYfDeL8FYcG5+AyJi2zYNk7xq6rabrdN0wSAwWBgHe92u9j/I0jSiFP8o9EIADqdjqIoZ2dnALBYLDqdjmmampbK5OfUkZnCsoEQs6one7aNU/y6rgNAr9cbj8cAUK1Wy+Ws2StROJPPvYejKV1/shHyYNs4xV8sFgFA07TJZAIAikIrQY3Ex+rDyuPTzFSb2gh5sG2cCyFrtdrZ2dnh4Y1djo6OhsMhifZJUqaSolNBNoJSySQbto2z52+1WvV6nfy3LMuSJC0WC9M0O50OGRTkBepeXQiSMGIugTAej0lsj3T1tVrt8PCw0WhQT9Z1Xdf1DHoHmyva7z0cnb6d8mxOpsiDbeMU/3A4HI1GqvprdoSiKJqmKYrS6/XWRv6GYTQajfl8fn0dvkZ9mvDYrufbrv/lT9qwux49pSw7y8RwNAnkwbbxiH84HC4WC03TptNpt3vrUdZ1fTqd2uf5VVXVNG0wGCyXeVr9UnCPeiz6/pfv15zi98ZZbQqJi2zYNh7xj0aj6fRm2NPvUx5lu89fq4nIckcQxJt4xE+G9LquL5dLWV5f/9But+1jfjIRqGna8fFxLN+eW5x+qe/QVLvQpF2cefEnD7aNR/y9Xg9oPj8Vtwjf1tZWLI3JD8urdb/J9+FL6WbS4smDbWOe6vNVPiISZ7UpJC4yYNsYen4yY0cyeUmGrxOW+Ty3sD+OCEIj7UhpnIJKBRmwbQziHwwG/X6/0+mAS7QP3IWNhCZc9YhwV6nqm8nkjfVnpXJfknYkSdzGuIIRadsNght1phXno1bdqfpeFajmhKq+GY2M09Nz6qel0na7fdBq7YvcLUcMAmybBGLw+Xu93vX1da/XI/9BJfq3IL7EmIViGB9are9rteduygeA5fLq+PjHcvl/x+OUBbpCkL0MH+BXxkvXdSzgsXHCLUHXtAtF8ZK9HdP8eHSkd7s/hPiiVJOB5f0xi1/X9VartbW1Val
U9vb2yuXyWsKfhaIoOCiIAsvEkrO/8r3KMD60Wi/m82AVcvv9n7Okf062TRpxemu6rkuSRMp4EZbLZb/fNwwDy/jFzuxiFuIq36CUojwPqnxCv/9zpXK/1ToIcW3S4GTbpBFnzz8YDEzTrFaro9FotVqRGl4AcHp6imW8BBB9J7lu94dwyv/l8h8N40PENiSTjO3SR4iz5ycKPz4+Jmt4i8UiyfwjnX+MX4TwQNff9fs/u31aKm1L0k65fF/TLjXtklo83zQ/drs/Doef8WwmEhtx9vzUcj1E9vkq5kHF3MDYx7kE3cMvPTn5B/V4qbQ9mTzV9S/G40qv96mq/sEw/mUweFIoUHbIOz091/V3UdqcFgLZNpnEluEHAIeHh4VCodVqGYZBEv4mk8np6Wm1WsUyXvA+5ieDpWKkMyjltvmkYXyghver1R1VfeqcyW+1DiRpR1GeO4cAg8GrXu9T37YlmXhtm1hiy/CzH2m32/Y/5/OUGSUqjzbzpqvcr4S+djikKL9QuENVPkGSdnu9T9rtF2vHVfVt6GYklii2TSy4Vx8Htjfj40SZeR6NKAOTXu8T7+y9VuugVNpeOzifX2Yv7JeBWX0nsWX4eRP9WxB+GMYHZ5C/ULjDMm/XblPOwY10UwH2/GmFZUkZtdqUcwtK6kC91dpnaYaiPHAeTLv4Y7RtksnakgwkBLPZhfNgrfaQ5VpJ2u10fus4KGL/LIu0v2s2BYo/O4RORKFOzikKk/gBIO2xfRYymeSDw/7sULwbMtBoGOvTddWq0K47+YS2bZJB8acS7YI1ZchZbco5iT2drvv8xSIlgScnxGvbJIPiF0WsST7syWTO8ar3FpQ3V4l12gPhnFyMF962TQ4oflG8DrNQjJ0M1JNkpFy+L/gbs2pbDPhlBPOjuaVhpVMuZNW2KH4+7Mtwzlza9a+OJKhXKnzjta9RlKXjqVt27ku8U335sS0O+1NJlFqRqVt85gt1fXFo8mNbFD+C5BQUvyjiXtLLFefMP5I90OcXxWUQb9DvTRFleOn0SwuFO2sj5wwX5HAWGq3VHtrTGeO1bZJB8ScSv3nBKA8ZZQtKaWctzyfJ4ne2FgA07YJxByFqqTK7+OO1bZJB8fNhpwyQmo3cyuX7a3JaLq90/R3jjLpvXyoA9FNCgOLnwz1HKniCff5KhSJyVX3Dsp5f0y6cfSnjikDxaFr49YvZAwN+ongdZy2ziFNKxodbl1OTee07c3pArQWQ2OxglgFCvLZNMij+RPLaZ31IxFqRa+tPqEP0szOTxfN31v8qlbbFb93J+KpiIV7bJhkUPwIAUK9T0tfdinlbqOobZ/2vRoPv6leqk4KEAMXPhx1asddXya3xRPV7T0/Ph8NXbpeQnXydxw8PmaLuoYmysCfJsxjiQfHzYSdasVfPdQE8vMpW64C6CUe7/YKqf11/pyjPl8urteOl0jbvnp8KY3r/YkERvz1CkSKPPToofoHEFPCP7lVO3kycB90qdrbbLxTl/8ZjQ9MudP2dqr7pdn+QpG+pu/ptRPkRsUcoONk2meBUn0Bez+D3jU03wpXj49+Nx4azMweA6fStM6/GSam0fXz8Ow5N8wfn+UOAPT8fnPP87GwoNFAs3o1YirPX+1RAnJ86N8G4uTDW+bWD4udDgVbsNVB6/yZoNIrOOtyMNJv7aRzzUyMdOQHFLxBGn9/vHRHdq/TwbHu9T5tNpu067DSb+xvfmVtV/af6nd7BWj4SV9smDRS/QBh7/svwxSRiYTj8LFD/3+n8VrDyQ3fXjN5BTkDxc+NRdf3IJduSrwSsAuj1Pp1MnsoyZSsuO6XS9mTyVPymHdT0YfTng4LRfm5QY36mRg8H2ElGaEBRHirKQ027GI0MTbu0R/ur1R1FeSB+6Z43q5XP1sBUv0B8LeDkgOIXC0uv7pfYH71iBLtfKkm7jOvkN45vz0+dDlzbnkSkbTcODvu5QU3yO/eLJ10Z8N70PiV6lUjzo89
XrHOhwysVvh/Ct114pW48T5k67PdN3aXuR7rGBmy7ObDn5wZ12O/b8/t1+0K5MmBxAi/Ht6IVi/7Nfzyuw34NPmttpGlOqOlJdjARaA3s+cXiq+3kiP/vPfivMiz6rnHKl2fwrA3/LYGZlDZ7z/ZR/YLcVvIAFD9HHh1SDvqW9GCY54vFq/Rybq8M0Brw3bGvAwIA8HoOXx/C98PoTWLHTbHebj/LPB932yYJFD833DJ8vR1mhp4/Fq/S9QG9MuAbBV6eBbvds7Zg/VOhLtojsKQAAW/bJgwUv3A8Yn5XRrzVvsLwrBWyDc/awsb/bvNzHgp3GxQkarZSMBjw48aBQj/u0befh4yiN/ebxTteefV/e/u3tSN3t2g//d97gft8O/MG/FmDbe4Z/m7i9yg6zBLqpxKbbZNHOlqZKTwKdTBs400dUh4Vj5SHLu8aAABwbjL76oOjRAeJ7VO5V4DHDdipwCMJzidwqdPfEZdLWJzA5z2PlvDGregwtdDoWpowR9smEhz288SZ4Utwc49fjn1vSX1ApR2/rEEHswvHi2ZxQo/wPWnCv+nw5RD+uQsHCnzeA2kMX01gp0Q5+YUgz98t9ZhayXM8Nqibea7lC3C0bSJB8fPELeZHdftNjTX530HxbuSR9pVB1+2TJnw5pIzkDxT4s0bR/3tzs5E/atHhKLV9Y7BtUkHx88Stkt/LM7hwdDIvBvwaIj+Qfc54OaZ0+/syfOmu5O0ifEH7lMF5iY7H1gCDwa1Rt66/Oz09D3oTdvxtm1RQ/DzxqOfzXffWnxc6vDhluSWvaaSXI8pBqrbtHCjwpLl+MGzYMi6Gw3P7zjzUEsNU0jJFFxcofp5QC3gTXp7dGh7PWWv7Ld6tZwGVtmm+tx+3slmuDEoY8kkTdhlqEO/X1o+EdV4C4VEg3DQ/tlovVPWNrr9rNBYe1QfXbsLFtgkGo/088S7g/awNr2dwrwjnapTp/fL9aGXCwaWvdqqayr4C+45xL8vK5WisrcZbYz6/rNWeR7wJxGLbBIPi54lv9X62oX50vGeqKV76vQJroeHtIvxpA+P8WNz1WG7iY9sEg8N+nrAMm4Xg04M58472vWa2k0AsZYJjuUl6Rwcofs64TfWHJS5/cvrW5uQ7FxpH3HFICL5VxoJezsW2CQbFz5koBfzZCJGFso4z4kBdkpgwfD326JfHYNsEg+LnzKN0Pj3831nRiVh+L8/V+wgY8OOMx2yfN5XOrzVzIrN3d8/1M+pSPA/xs9TweiQJWN4TcTvguHYT9rJtskHxcya08+zy1gjnT1KHr9qFJu1K9MpiHhN13zBMAX41cV3UGB/l8na8l8dv22SDw37OhNYA81vjcDekfx69WOVmiVhWmOXybNsWxc+fEAF/Z9qMO+mdZ45O6IB/tco0w59t26L4+RNi5C8yTOi2uUgaCB20iyW9J+2g+PkTQvz36DEk9U3IXDpqIsrNOhaqe5+ALcNYqFRCit95IRfbJhsUP38Yk+TtBOn5WYam1AfUuY4ldShKyGE/Y8+fbdtitJ8/IcbwBwr7rjgxRJV3SutL8c7dw/WVzvqR94awRQprhI75MdbtTH7EPgoofv5sF+FRNcC6PRIg5D9V9is75XXxe2wW6izR9/0QYDPiBwBZfuCxaNftEk6NSRc47BdCoHUy7iMFXgvFnd8YqCCHkNI9boQI3dF3+E7JIvwYQfELIVCqvHtS4OrDKnQTqjvrM46/Pu7O5r034Sf/aqI3MNQd5cfRUeDZOGpuHy/bJhgUvxAC9fxBTnY+dm54xa6o3+i7oTDhJ1rxP4FI0u5aBW5fGg2m90U8tk0wKH4hELefkSA1cOJ57LaLlLSiF6dMQUe3Uv8CYRQzoV4vMJ6ZUkmzg+IXBWN//rju8WG8SaO37vb4iHLGsxZceX7jt93N7y8GcHgYwO13W8/D0bZJBcUvCqq6nHhGB+JNHZlf2nT7uAH3HF3i5RK+UShVxgl/78W47jAKjUa
RfeTvNkzgaNukguIXRUFiGvkHrJ/FXkPKpy7FdhGetCjHX8/hawm+7d5K+P1pDP+jwHfH7O3kSrF4l3HkL8sP2DOCY7NtUkHxC8S3898p3XL4GZb3xOmXVo4pnT8AvDdh0YevD+E/t27+mR957Ti4CRhj/oGmBtDn54JhGL1er9vtDoeb39RdHNSu1c5jn4K5fDPG3XbgYcG5dYdYFOVhqeSzvL9U2qbu4UlIRTZ+vGxA/Jqmlcvl4+Pjfr/fbrclSTKMFERHYmC76COSJ23vGyyvYt4PY301y+8bYWRc6bBGNHhyfPw77xO8XQPutk0eGxB/o9EwTbPZbI5Go2q1Op/Pu92u/2XZoOLuJ+/L66W+/6TCX69v/omc7ctaberLYaBqAvCoCp/3klCqsNU68MjbLZW2e71POX11Sit5iRa/qqrL5bJQKAyHw0ajcXx8DADj8SZTxISyW6YsjCH8Mcy29rWHrEsGAwSl/qSy9v+PqvCVCgACKvax0Ot94hb2Hw4/C3o3LrZNEqLFP5lMAECSboylKAoAmKaZl5E/AHzeo3StlY5vbo/xQaCJvhxCdUTZgdvO4zp8pf4q+7h3KAiBJO32ep84jw8GT7yX8Qm1bWLYcLS/WLx5dDRN23Jhsy3kQnV8K5mn0qEslXPAI13c656/b8C/6/DFAB7Xb70FdkrwpAlfTUAa3+rw/1WL0UkJTat1MBqVreBfoXBnMHjiEecjiLZtMti6vr4W+X3dbrff78uyrKo34RAi78lkUqt5jbIEt1MEFzpc6rBTTs6uXllCVd8A87r9fJKg9fyJlbemaWBzVQjk5UXclpDs3pK9qqrlcrlcjvNFYBgGmVuJ97aaphmGIUmSNXBLwm11XR8MBoeHh41GAzjLXtd1Xdch4gPwC+RnAoDYfykfrsVycnICALIskz9Xq5t1lKvVSnBLWJjNZqXSzYi3VCqNRiProyjWWywWzWZTluV6vb5arWazWaFwk10jy/JsNgtxz9VqNXFArN1sNsmfIW47m81kWe50OuTP0WhkGYTcOfQPR25FWjWZTOy3lWU5xG1JOAkAOp1OvI8TMYL16wwGA6uppVIp3O9F6HQ69v9xQr1etz9p/BAt/tlsZlc7sWOpVBLcDBZWq5WlSQvrlw4t/tVqZf+9q9Xq2rdUq9UQt7Ue/Xhf9OS25GW9WFDq0tXr9RC3XSwW5P96MpnY7VytVi39h2uqpclwLzsna4+B9dtZB0M/veSF4ob1wuWHaPFfX18T85VKpU6nQywY8f+T03NPXkzVanWxWFz/MmYpFArktRX9tp1Op9O5mfZrNpvX19fWECBEZyJA/KS1siwTg0wmE9Ja8mcgrCHJ9S8GsXr7xWJBnpCgRrCaallVluXoXShpHnnHWV0XaZv1Hg/xohmNRuQxILcajUaFQqFara5Wq8FgENqwgdiA+O1jaQjbddhZLBZWjxHjc0+eIfvvavdZIt7Weiibzab9Ps4vZYT0pYVCwf4mJXqI8m51it/ettCttV9I1GVXabjb2ptq9yMKhQJ5I5ycnARtp7MxsizbRyURLeD2f+38lAcbmOqTJEnXdeKFzmaz6Bk+5XJZVVXyY5+cnDhdX5Ze0cne3h4AkDc0odvt1uv16XRKQkpRsPqQo6Mjq6eCX4KLISiXy7qut1qtfr9PDBKxhU4qlQoA2DMyQmdnkFsNBgPDMBqNRqFQsOxsGEb0R0JRFBL/K5VKpmlOp9N+v08yyoJCHgPyCJHIHAlMkk9D/17kttZjAAAkgkjuTP4dbzyVAtdXi0hItxyXp3dtc3GJh0IOrnmAUW67FtubzWZk/GJ5FuGwolMkbgRx9PwEq3nkI8tlDddau3tveYKyLBPzhgh82Ht+O7PZ7OTkpF6vh4gjXN/+vUqlEmkeeSSi/F7Wbev1unUrAFgsFuRJDm1YdrIjfvLbxyj+619GpGuPlN1tCXdb69e1t9aS2WAwiNru62vLb4wofuKCdjodWZa
tB5R8FLG1q9XKLeJlhRUC4Sb+6NjD+4PBgHhq9iPRb2u/FRkJxvIYeCM6ySd1kJFesVhcm+cfj8ez2azXC5OQDwC6rmuaJkmSNa+r6/p4PG40GnHN9BqGMRwOV6tVrVaLZTqaQJoNAN1uN/qdiR2s0W+lUpEkac3U7LcaDAaVSqXV8ls6HZy1jIlerzebzcrlckQLkB99tVrt7e1ZPz0Z/wuY8EfxI0hOwUo+CJJTNpnea2U1WthH17GMKhEkDxCXBwAqlUqj0bBPE3jpiHdQwQNnwMOK1pDgjYCYB4KkHSt+TCiVSla41FtHm/T5rRV+1hFJkkgITVGUcrmcrwp/CBIcXddJ0kSn0zk8PDw5OZnP581mk2jHW0ebHPaTMb8zHYWsb0PlI4gvRESyLJNeczabzedzMl/gq6MN+/zVanU4HJKEh3a7TaY3isUiKh9BWJAkaTKZrOUCkj99dbTJYb+zSs9oNIqeOYsgOURV1dFodHp6Wq1WVVVlSQ3e2FQfGa6QpO7JZEI8/xyV8UWQWJlMJqdDSj55AAAA+0lEQVSnp2ArNOJLUpJ8rLjFbDYLl+CFIEir1To9PS0UCixrrpKS5GMlM+aojC+CREbXdVVVrXwZ4uSbpsmyrHNj4u92u1tbW1bugdVW7PYRhJ3xeFyr1azlDNZbgGVpwMaG/aqqknK9zWaThCXJNj4Y50cQdix/WZZlSZKIjuzVsb0QlIVEYy3Dj1Sz3GB7ECSN2Jdvg600mC8bDvhtrGgxgmQIS0fOteceJCXajyCIYJIS7UcQRDAofgTJKSh+BMkpKH4EySkofgTJKSh+BMkpKH4EySkofgTJKSh+BMkpKH4EySkofgTJKSh+BMkpKH4EySkofgTJKf8PQU8Qh6BQ2rMAAAAASUVORK5CYII=", - "text/plain": [ - "" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from IPython.display import Image\n", - "Image(\"ap1.png\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Motif scanning\n", - "\n", - "For very simple scanning, you can just use a Motif instance. Let’s say we have a FASTA file called `test.small.fa` that looks like this:\n", - "\n", - "```\n", - ">seq1\n", - "AAAAAAAAAAAAAAAAAAAAAA\n", - ">seq2\n", - "CGCGCGTGAGTCACGCGCGCGCG\n", - ">seq3\n", - "TGASTCAAAAAAAAAATGASTCA\n", - "```\n", - "\n", - "Now we can use this file for scanning." 
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'seq1': [], 'seq2': [6, 6], 'seq3': []}" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from gimmemotifs.motif import motif_from_consensus\n", - "from gimmemotifs.fasta import Fasta\n", - "\n", - "f = Fasta(\"test.small.fa\")\n", - "m = motif_from_consensus(\"TGAsTCA\")\n", - "\n", - "m.scan(f)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "3 sequences" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "f" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This return a dictionary with the sequence names as keys. The value is a list with positions where the motif matches. Here, as the AP1 motif is a palindrome, you see matches on both forward and reverse strand. This is more clear when we use `scan_all()` that returns position, score and strand for every match." - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'seq1': [],\n", - " 'seq2': [(6, 9.02922042678255, 1), (6, 9.02922042678255, -1)],\n", - " 'seq3': [(0, 8.331251500673487, 1),\n", - " (16, 8.331251500673487, 1),\n", - " (0, 8.331251500673487, -1),\n", - " (16, 8.331251500673487, -1)]}" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "m.scan_all(f)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The number of matches to return is set to 50 by default, you can control this by setting the `nreport` argument. 
Use `scan_rc=False` to only scan the forward orientation.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'seq1': [],\n", - " 'seq2': [(6, 9.02922042678255, 1)],\n", - " 'seq3': [(0, 8.331251500673487, 1)]}" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "m.scan_all(f, nreport=1, scan_rc=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "While this functionality works, it is not very efficient. To scan many motifs in potentially many sequences, use the functionality in the `scanner` module. If you only want the best match per sequence, there is a utility function called `scan_to_best_match`, otherwise, use the `Scanner` class." - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "motif\tpos\tscore\n", - "AP1\t0\t-20.052563923836903\n", - "AP1\t6\t9.029486018303187\n", - "AP1\t0\t8.331550321011443\n", - "CG\t0\t-18.26379789133924\n", - "CG\t0\t5.554366880674296\n", - "CG\t0\t-7.743307225501047\n" - ] - } - ], - "source": [ - "from gimmemotifs.motif import motif_from_consensus\n", - "from gimmemotifs.scanner import scan_to_best_match\n", - "\n", - "m1 = motif_from_consensus(\"TGAsTCA\")\n", - "m1.id = \"AP1\"\n", - "m2 = motif_from_consensus(\"CGCG\")\n", - "m2.id = \"CG\"\n", - "motifs = [m1, m2]\n", - "\n", - "print(\"motif\\tpos\\tscore\")\n", - "result = scan_to_best_match(\"test.small.fa\", motifs)\n", - "for motif, matches in result.items():\n", - " for match in matches:\n", - " print(\"{}\\t{}\\t{}\".format(motif, match[1], match[0]))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The matches are in the same order as the sequences in the original file.\n", - "\n", - "While this function can be very useful, a Scanner instance is much more 
flexible. You can scan different input formats (BED, FASTA, regions), and control the thresholds and output.\n", - "\n", - "As an example we will use the file `Gm12878.CTCF.top500.w200.fa` that contains 500 top CTCF peaks. We will get the CTCF motif and scan this file in a number of different ways.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "ename": "ImportError", - "evalue": "cannot import name 'default_motifs' from 'gimmemotifs.motif' (/home/jnourisa/miniconda3/envs/py10/lib/python3.10/site-packages/gimmemotifs/motif/__init__.py)", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[11], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mgimmemotifs\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmotif\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m default_motifs\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mgimmemotifs\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mscanner\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Scanner\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mgimmemotifs\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mfasta\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Fasta\n", - "\u001b[0;31mImportError\u001b[0m: cannot import name 'default_motifs' from 'gimmemotifs.motif' (/home/jnourisa/miniconda3/envs/py10/lib/python3.10/site-packages/gimmemotifs/motif/__init__.py)" - ] - } - ], - "source": [ - "from gimmemotifs.motif import default_motifs\n", - "from gimmemotifs.scanner import Scanner\n", - "from gimmemotifs.fasta import Fasta\n", - "import numpy as np\n", - "\n", - "# Input file\n", - "fname = \"Gm12878.CTCF.top500.w200.fa\"\n", - 
"\n", - "# Select the CTCF motif from the default motif database\n", - "motifs = [m for m in default_motifs() if \"CTCF\" in m.factors['direct']][:1]\n", - "\n", - "# Initialize the scanner\n", - "s = Scanner()\n", - "s.set_motifs(motifs)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now let’s get the best score for the CTCF motif for each sequence." - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "500\t11.00\t1.45\t15.07\n" - ] - } - ], - "source": [ - "scores = [r[0] for r in s.best_score(\"Gm12878.CTCF.top500.w200.fa\")]\n", - "print(\"{}\\t{:.2f}\\t{:.2f}\\t{:.2f}\".format(\n", - " len(scores),\n", - " np.mean(scores),\n", - " np.min(scores),\n", - " np.max(scores)\n", - " ))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In many cases you’ll want to set a threshold. In this example we’ll use a 1% FPR threshold, based on scanning randomly selected sequences from the hg38 genome. The first time you run this, it will take a while. However, the tresholds will be cached. 
This means that for the same combination of motifs and genome, the previously generated threshold will be used.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2018-12-05 02:18:08,850 - INFO - Using default background: genome hg38 with length 200\n", - "2018-12-05 02:18:08,853 - INFO - Using background: genome hg38 with length 200\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\n" - ] - } - ], - "source": [ - "# Set a 1% FPR threshold based on random hg38 sequence\n", - "s.set_genome(\"hg38\")\n", - "s.set_threshold(fpr=0.01)\n", - "\n", - "# get the number of sequences with at least one match\n", - "counts = [n[0] for n in s.count(\"Gm12878.CTCF.top500.w200.fa\", nreport=1)]\n", - "print(counts[:10])" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[414]\n" - ] - } - ], - "source": [ - "# or the grand total of number of sequences with 1 match\n", - "print(s.total_count(\"Gm12878.CTCF.top500.w200.fa\", nreport=1))" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "chr11:190037-190237 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 11.987579910503003 142 1\n", - "chr11:190037-190237 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 9.366964008790466 21 1\n", - "chr11:190037-190237 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 9.049239042402315 81 1\n", - "chr14:106873577-106873777 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 15.066074431157894 119 1\n", - "chr14:106873577-106873777 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 14.695942248831264 82 1\n", - "chr14:106873577-106873777 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 12.362503901152305 26 1\n", - "chr14:106873577-106873777 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 9.885395021504959 158 1\n", 
- "chr14:106873577-106873777 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 9.035150927051049 100 1\n", - "chr14:106873577-106873777 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 8.943385403326214 6 1\n", - "chr14:106765204-106765404 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 15.066074431157894 144 1\n", - "chr14:106765204-106765404 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 14.76300111320958 184 1\n", - "chr14:106765204-106765404 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 14.629818549380293 164 1\n", - "chr14:106765204-106765404 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 13.7098269913903 26 1\n", - "chr14:106765204-106765404 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 12.732636083478935 125 1\n", - "chr14:106765204-106765404 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 12.180200548235172 66 1\n", - "chr15:22461178-22461378 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 15.066074431157894 27 1\n", - "chr15:22461178-22461378 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 15.066074431157894 184 1\n", - "chr15:22461178-22461378 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 14.237178252975294 66 1\n", - "chr15:22461178-22461378 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 12.362503901152305 146 1\n", - "chr15:22461178-22461378 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 11.986694533589388 125 1\n", - "chr15:22461178-22461378 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 11.010111555985224 6 1\n", - "chr14:107119996-107120196 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 12.732636083478935 36 1\n", - "chr14:107119996-107120196 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 12.732636083478935 94 1\n", - "chr14:107119996-107120196 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 12.732636083478935 152 1\n", - "chr14:107119996-107120196 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 10.821419305539889 74 1\n", - "chr14:107119996-107120196 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 10.75850548012932 16 1\n", - "chr14:107119996-107120196 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 10.75850548012932 132 1\n", - "chr14:107238917-107239117 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 15.066074431157894 91 1\n", - "chr14:107238917-107239117 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 
12.180200548235172 33 1\n", - "chr14:107238917-107239117 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 10.065681099588275 14 1\n", - "chr14:107238917-107239117 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 9.248968117435332 53 1\n", - "chr14:107238917-107239117 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 9.248968117435332 111 1\n", - "chr6:53036754-53036954 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 9.03332830388423 62 -1\n", - "chr14:107147705-107147905 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 14.607310435301924 32 1\n", - "chr14:107147705-107147905 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 14.171054553524323 148 1\n", - "chr14:107147705-107147905 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 12.015417890970234 91 1\n", - "chr14:107147705-107147905 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 12.015417890970234 129 1\n", - "chr14:50328834-50329034 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 11.417325296846556 133 -1\n", - "chr1:114889205-114889405 GM.5.0.C2H2_ZF.0024_syGCCmyCTrGTGG 10.747636638875207 124 -1\n" - ] - } - ], - "source": [ - "# Scanner.scan() just gives all information\n", - "seqs = Fasta(\"Gm12878.CTCF.top500.w200.fa\")[:10]\n", - "for i,result in enumerate(s.scan(seqs)):\n", - " seqname = seqs.ids[i]\n", - " for m,matches in enumerate(result):\n", - " motif = motifs[m]\n", - " for score, pos, strand in matches:\n", - " print(seqname, motif, score, pos, strand)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Finding de novo motifs\n", - "\n", - "Let’s take the `Gm12878.CTCF.top500.w200.fa` file as example again. 
For a basic example we’ll just use two motif finders, as they’re quick to run.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2018-12-05 02:18:10,850 - INFO - starting full motif analysis\n", - "2018-12-05 02:18:10,853 - INFO - preparing input (FASTA)\n", - "not enough random sequences found for 0.7000000000000001 <= GC < 0.75 (83 instead of 100)\n", - "2018-12-05 02:19:19,484 - INFO - starting motif prediction (medium)\n", - "2018-12-05 02:19:19,487 - INFO - tools: BioProspector, Homer\n", - "2018-12-05 02:19:20,146 - INFO - all jobs submitted\n", - "2018-12-05 02:19:23,902 - INFO - Homer_width_5 finished, found 5 motifs\n", - "2018-12-05 02:19:24,247 - INFO - Homer_width_6 finished, found 5 motifs\n", - "2018-12-05 02:19:25,217 - INFO - Homer_width_7 finished, found 5 motifs\n", - "2018-12-05 02:19:26,671 - INFO - Homer_width_8 finished, found 5 motifs\n", - "2018-12-05 02:19:28,930 - INFO - Homer_width_9 finished, found 5 motifs\n", - "2018-12-05 02:19:29,602 - INFO - Homer_width_10 finished, found 5 motifs\n", - "2018-12-05 02:19:30,393 - INFO - BioProspector_width_5 finished, found 5 motifs\n", - "2018-12-05 02:19:30,642 - INFO - BioProspector_width_6 finished, found 5 motifs\n", - "2018-12-05 02:19:31,293 - INFO - BioProspector_width_7 finished, found 5 motifs\n", - "2018-12-05 02:19:31,892 - INFO - BioProspector_width_8 finished, found 5 motifs\n", - "2018-12-05 02:19:32,083 - INFO - BioProspector_width_9 finished, found 5 motifs\n", - "2018-12-05 02:19:32,479 - INFO - BioProspector_width_10 finished, found 5 motifs\n", - "2018-12-05 02:19:41,497 - INFO - predicted 60 motifs\n", - "2018-12-05 02:19:41,613 - INFO - 50 motifs are significant\n", - "2018-12-05 02:19:41,704 - INFO - clustering 50 motifs.\n", - "2018-12-05 02:21:00,349 - INFO - creating reports\n", - "2018-12-05 02:22:17,818 - INFO - finished\n", - "2018-12-05 02:22:17,822 - INFO - 
output dir: CTCF.gimme\n", - "2018-12-05 02:22:17,825 - INFO - report: CTCF.gimme/motif_report.html\n" - ] - } - ], - "source": [ - "from gimmemotifs.denovo import gimme_motifs\n", - "\n", - "peaks = \"Gm12878.CTCF.top500.w200.fa\"\n", - "outdir = \"CTCF.gimme\"\n", - "params = {\n", - " \"tools\": \"Homer,BioProspector\",\n", - " \"genome\": \"hg38\",\n", - " }\n", - "\n", - "motifs = gimme_motifs(peaks, outdir, params=params)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This will basically run the same pipeline as the `gimme motifs` command. All output files will be stored in outdir and `gimme_motifs` returns a `list` of `Motif` instances. If you only need the motifs but not the graphical report, you can decide to skip it by setting `create_report` to `False`. Additionally, you can choose to skip clustering (`cluster=False`) or to skip calculation of significance (`filter_significant=False`). For instance, the following command will only predict motifs and cluster them.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2018-12-05 02:22:17,889 - INFO - starting full motif analysis\n", - "2018-12-05 02:22:17,893 - INFO - preparing input (FASTA)\n", - "not enough random sequences found for 0.7000000000000001 <= GC < 0.75 (271 instead of 290)\n", - "2018-12-05 02:22:47,000 - INFO - starting motif prediction (medium)\n", - "2018-12-05 02:22:47,002 - INFO - tools: BioProspector, Homer\n", - "2018-12-05 02:22:47,928 - INFO - all jobs submitted\n", - "2018-12-05 02:22:51,852 - INFO - Homer_width_5 finished, found 5 motifs\n", - "2018-12-05 02:22:52,728 - INFO - Homer_width_7 finished, found 5 motifs\n", - "2018-12-05 02:22:53,196 - INFO - Homer_width_6 finished, found 5 motifs\n", - "2018-12-05 02:22:54,898 - INFO - Homer_width_8 finished, found 5 motifs\n", - "2018-12-05 02:22:56,287 - INFO - Homer_width_9 finished, found 5 
motifs\n", - "2018-12-05 02:22:57,110 - INFO - Homer_width_10 finished, found 5 motifs\n", - "2018-12-05 02:22:58,226 - INFO - BioProspector_width_5 finished, found 5 motifs\n", - "2018-12-05 02:22:59,184 - INFO - BioProspector_width_6 finished, found 5 motifs\n", - "2018-12-05 02:23:00,207 - INFO - BioProspector_width_7 finished, found 5 motifs\n", - "2018-12-05 02:23:00,756 - INFO - BioProspector_width_8 finished, found 5 motifs\n", - "2018-12-05 02:23:00,905 - INFO - BioProspector_width_9 finished, found 5 motifs\n", - "2018-12-05 02:23:01,666 - INFO - BioProspector_width_10 finished, found 5 motifs\n", - "2018-12-05 02:23:09,414 - INFO - predicted 60 motifs\n", - "2018-12-05 02:23:09,503 - INFO - not filtering for significance\n", - "2018-12-05 02:23:09,614 - INFO - clustering 60 motifs.\n", - "2018-12-05 02:25:05,541 - INFO - finished\n", - "2018-12-05 02:25:05,546 - INFO - output dir: CTCF.gimme\n", - "2018-12-05 02:25:05,548 - INFO - report: CTCF.gimme/motif_report.html\n" - ] - } - ], - "source": [ - "motifs = gimme_motifs(peaks, outdir,\n", - " params=params, filter_significant=False, create_report=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "All parameters for motif finding are set by the `params` argument" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Although the `gimme_motifs` function is probably the easiest way to run the `de novo` finding tools, you can also run any of the tools directly. In this case you would also have to supply the background file if the specific tool requires it." 
- ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "nnnCnTGynnnGrCwTGyyn\n" - ] - } - ], - "source": [ - "from gimmemotifs.tools import get_tool\n", - "from gimmemotifs.background import MatchedGcFasta\n", - "\n", - "m = get_tool(\"homer\") # tool name is case-insensitive\n", - "\n", - "# Create a background fasta file with a similar GC%\n", - "fa = MatchedGcFasta(\"TAp73alpha.fa\", number=1000)\n", - "fa.writefasta(\"bg.fa\")\n", - "\n", - "# Run motif prediction\n", - "params = {\n", - " \"background\": \"bg.fa\",\n", - " \"width\": \"20\",\n", - " \"number\": 5,\n", - "}\n", - "\n", - "motifs, stdout, stderr = m.run(\"TAp73alpha.fa\", params=params)\n", - "print(motifs[0].to_consensus())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Motif statistics\n", - "\n", - "With some motifs, a sample file and a background file you can calculate motif statistics. Let’s say I wanted to know which of the p53-family motifs is most enriched in the file TAp73alpha.fa.\n", - "\n", - "First, we’ll generate a GC%-matched genomic background. 
Then we only select p53 motifs.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stats for GM.5.0.p53.0001_rCATGyCCnGrCATGy\n", - "recall_at_fdr 0.833\n", - "fraction_fpr 0.416\n", - "score_at_fpr 9.05025905735\n", - "enr_at_fpr 41.6\n", - "max_enrichment 55.5\n", - "phyper_at_fpr 3.33220067463e-132\n", - "mncp 1.85474606318\n", - "roc_auc 0.9211925\n", - "roc_auc_xlim 0.0680115\n", - "pr_auc 0.927368602993\n", - "max_fmeasure 0.867519181586\n", - "ks_pvalue 0.0\n", - "ks_significance inf\n", - "\n", - "Best motif (recall at 10% FDR): GM.5.0.p53.0001_rCATGyCCnGrCATGy\n" - ] - } - ], - "source": [ - "from gimmemotifs.background import MatchedGcFasta\n", - "from gimmemotifs.fasta import Fasta\n", - "from gimmemotifs.stats import calc_stats\n", - "from gimmemotifs.motif import default_motifs\n", - "\n", - "sample = \"TAp73alpha.fa\"\n", - "bg = MatchedGcFasta(sample, genome=\"hg19\", number=1000)\n", - "\n", - "motifs = [m for m in default_motifs() if any(f in m.factors['direct'] for f in [\"TP53\", \"TP63\", \"TP73\"])]\n", - "\n", - "stats = calc_stats(motifs, sample, bg)\n", - "\n", - "print(\"Stats for\", motifs[0])\n", - "for k, v in stats[str(motifs[0])].items():\n", - " print(k,v)\n", - "\n", - "print()\n", - "\n", - "best_motif = sorted(motifs, key=lambda x: stats[str(x)][\"recall_at_fdr\"])[-1]\n", - "print(\"Best motif (recall at 10% FDR):\", best_motif)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A lot of statistics are generated and you will not always need all of them. 
You can choose one or more specific metrics with the additional `stats` argument.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "GM.5.0.p53.0001\troc_auc\t0.92\n", - "GM.5.0.p53.0003\troc_auc\t0.89\n", - "GM.5.0.p53.0004\troc_auc\t0.91\n", - "GM.5.0.p53.0005\troc_auc\t0.86\n", - "GM.5.0.p53.0006\troc_auc\t0.80\n", - "GM.5.0.p53.0007\troc_auc\t0.87\n", - "GM.5.0.p53.0008\troc_auc\t0.82\n", - "GM.5.0.p53.0010\troc_auc\t0.80\n", - "GM.5.0.p53.0011\troc_auc\t0.85\n", - "GM.5.0.p53.0001\trecall_at_fdr\t0.83\n", - "GM.5.0.p53.0003\trecall_at_fdr\t0.64\n", - "GM.5.0.p53.0004\trecall_at_fdr\t0.74\n", - "GM.5.0.p53.0005\trecall_at_fdr\t0.58\n", - "GM.5.0.p53.0006\trecall_at_fdr\t0.19\n", - "GM.5.0.p53.0007\trecall_at_fdr\t0.66\n", - "GM.5.0.p53.0008\trecall_at_fdr\t0.18\n", - "GM.5.0.p53.0010\trecall_at_fdr\t0.20\n", - "GM.5.0.p53.0011\trecall_at_fdr\t0.53\n" - ] - } - ], - "source": [ - "metrics = [\"roc_auc\", \"recall_at_fdr\"]\n", - "stats = calc_stats(motifs, sample, bg, stats=metrics)\n", - "\n", - "for metric in metrics:\n", - " for motif in motifs:\n", - " print(\"{}\\t{}\\t{:.2f}\".format(\n", - " motif.id, metric, stats[str(motif)][metric]\n", - " ))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Motif comparison" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [], - "source": [ - "from gimmemotifs.comparison import MotifComparer\n", - "from gimmemotifs.motif import motif_from_consensus\n", - "from gimmemotifs.motif import read_motifs" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Compare two motifs" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "rrrCATGyyy\n", - " ACAyGA\n" - ] - } - ], - "source": [ - "m1 = 
motif_from_consensus(\"RRRCATGYYY\")\n", - "m2 = motif_from_consensus(\"TCRTGT\")\n", - "\n", - "mc = MotifComparer()\n", - "score, pos, orient = mc.compare_motifs(m1, m2)\n", - "\n", - "if orient == -1:\n", - " m2 = m2.rc()\n", - "pad1, pad2 = \"\", \"\"\n", - "if pos < 0:\n", - " pad1 = \" \" * -pos \n", - "elif pos > 0:\n", - " pad2 =\" \" * pos\n", - "print(pad1 + m1.to_consensus())\n", - "print(pad2 + m2.to_consensus())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Find closest match in a motif database" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "GATA: AGATAASR_GATA3(Zf)/iTreg-Gata3-ChIP-Seq(GSE20898)/Homer - 0.823\n", - " GATA\n", - "AGATAAnr\n", - "\n", - "NTATAWA: NGYCATAAAWCH_CDX4(Homeobox)/ZebrafishEmbryos-Cdx4.Myc-ChIP-Seq(GSE48254)/Homer - 0.747\n", - " nTATAwA\n", - "nnynrTAAAnnn\n", - "\n", - "ACGCG: NCCACGTG_c-Myc(bHLH)/LNCAP-cMyc-ChIP-Seq(Unpublished)/Homer - 0.744\n", - " ACGCG\n", - "CACGTGGn\n", - "\n" - ] - } - ], - "source": [ - "motifs = [\n", - " motif_from_consensus(\"GATA\"),\n", - " motif_from_consensus(\"NTATAWA\"),\n", - " motif_from_consensus(\"ACGCG\"),\n", - "]\n", - "\n", - "mc = MotifComparer()\n", - "results = mc.get_closest_match(motifs, dbmotifs=read_motifs(\"HOMER\"), metric=\"seqcor\")\n", - "\n", - "# Load motifs\n", - "db = read_motifs(\"HOMER\", as_dict=True)\n", - "\n", - "for motif in motifs:\n", - " match, scores = results[motif.id]\n", - " print(\"{}: {} - {:.3f}\".format(motif.id, match, scores[0]))\n", - " dbmotif = db[match]\n", - " orient = scores[2]\n", - " if orient == -1:\n", - " dbmotif = dbmotif.rc()\n", - " padm, padd = 0, 0\n", - " if scores[1] < 0:\n", - " padm = -scores[1]\n", - " elif scores[1] > 0:\n", - " padd = scores[1]\n", - " print(\" \" * padm + motif.to_consensus())\n", - " print(\" \" * padd + dbmotif.to_consensus())\n", - " print()" - ] - }, - { - 
"cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "py10", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.15" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/runs.ipynb b/runs.ipynb index d7594730f..0472805bc 100644 --- a/runs.ipynb +++ b/runs.ipynb @@ -14,19 +14,16 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "upload: resources/grn_models/d0_hvg/scenicplus.csv to s3://openproblems-data/resources/grn/grn_models/d0_hvg/scenicplus.csv\n", - "upload: resources/prior/peaks.bed to s3://openproblems-data/resources/grn/prior/peaks.bed\n", - "upload: resources/prior/peaks.txt to s3://openproblems-data/resources/grn/prior/peaks.txt\n", - "upload: resources/prior/cell_topic_d0_hvg.csv to s3://openproblems-data/resources/grn/prior/cell_topic_d0_hvg.csv\n", - "\n", - "The user-provided path resources/supplementary/ does not exist.\n" + "/bin/bash: aws: command not found\n", + "/bin/bash: aws: command not found\n", + "/bin/bash: aws: command not found\n" ] } ], @@ -40,13 +37,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/bin/bash: aws: command not found\n" + ] + } + ], "source": [ "!aws s3 sync s3://openproblems-data/resources/grn/grn-benchmark resources/grn-benchmark \n", - "!aws s3 sync s3://openproblems-data/resources/grn/grn_models 
resources/grn_models/\n", - "!aws s3 sync s3://openproblems-data/resources/grn/prior resources/prior/ \n" + "# !aws s3 sync s3://openproblems-data/resources/grn/grn_models resources/grn_models/\n", + "# !aws s3 sync s3://openproblems-data/resources/grn/prior resources/prior/ \n" ] }, { @@ -58,7 +63,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -209,7 +214,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -247,12 +252,13 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ - "if False:\n", - " cols = ['chrom', 'chromStart', 'chromEnd', 'name']\n", + "cols = ['chrom', 'chromStart', 'chromEnd', 'name']\n", + "if True:\n", + " \n", " # merge two db\n", " df_jaspar = pd.read_csv('output/db/JASPAR2022-hg38.bed.gz', sep='\\t', names=cols, comment='#')\n", "\n", @@ -263,182 +269,538 @@ "\n", " print(df_jaspar['tf'].nunique(), df_encode['tf'].nunique(), np.union1d(df_jaspar['tf'].unique(), df_encode['tf'].unique()).shape) #634 957 (1282,)\n", " print(df_jaspar['peaks'].nunique(), df_encode['peaks'].nunique(), np.union1d(df_jaspar['peaks'].unique(), df_encode['peaks'].unique()).shape) #62310613 19645880 (81956493,)\n", - " df_concat = pd.concat([df_jaspar, df_jaspar], axis=0)\n", - " df_concat = df_concat.drop_duplicates() #(143124468, 5)\n", + " df_concat = pd.concat([df_jaspar, df_encode], axis=0)\n", + " df_concat = df_concat[cols].drop_duplicates() #(143124468, 5)\n", " df_concat.to_csv('output/db/jaspar_encode.bed.gz', sep='\\t', header=False, index=False, compression='gzip')" ] }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sourcetarget
0TARDBPLINC01409
1SKILLINC01409
2ZSCAN29LINC01409
3ARNTLINC01409
5ZNF592LINC01409
.........
4965345SIN3ATBL1Y
4965346POLR2AphosphoS5TBL1Y
4965347HNRNPLLTBL1Y
4965348MAXTBL1Y
4965349ZNF687TBL1Y
\n", + "

9795968 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " source target\n", + "0 TARDBP LINC01409\n", + "1 SKIL LINC01409\n", + "2 ZSCAN29 LINC01409\n", + "3 ARNT LINC01409\n", + "5 ZNF592 LINC01409\n", + "... ... ...\n", + "4965345 SIN3A TBL1Y\n", + "4965346 POLR2AphosphoS5 TBL1Y\n", + "4965347 HNRNPLL TBL1Y\n", + "4965348 MAX TBL1Y\n", + "4965349 ZNF687 TBL1Y\n", + "\n", + "[9795968 rows x 2 columns]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "skeleton = pd.read_csv('output/skeleton_jaspar/tf2gene.csv', index_col=0)\n", + "skeleton.drop_duplicates()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sourcetarget
0RREB1LINC01409
1KLF15LINC01409
2ALX1LINC01409
3ALX4LINC01409
4POU6F1LINC01409
.........
1677140SP5IL9R
1677141ZNF816IL9R
1677142PBX2IL9R
1677143THAP1IL9R
1677144OTPIL9R
\n", + "

5019111 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " source target\n", + "0 RREB1 LINC01409\n", + "1 KLF15 LINC01409\n", + "2 ALX1 LINC01409\n", + "3 ALX4 LINC01409\n", + "4 POU6F1 LINC01409\n", + "... ... ...\n", + "1677140 SP5 IL9R\n", + "1677141 ZNF816 IL9R\n", + "1677142 PBX2 IL9R\n", + "1677143 THAP1 IL9R\n", + "1677144 OTP IL9R\n", + "\n", + "[5019111 rows x 2 columns]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "skeleton = pd.read_csv('output/skeleton/tf2gene.csv', index_col=0)\n", + "skeleton.drop_duplicates()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "------ process_atac --------\n", - "------ peak2tf --------\n" + "db.regions_vs_motifs.rankings.feather hg38-blacklist.v2.bed\n", + "db.regions_vs_motifs.scores.feather JASPAR2022-hg38.bed.gz\n", + "ENCODE-TF-ChIP-hg38.bed.gz\t jaspar_encode.bed.gz\n", + "gencode.v45.annotation.gtf.gz\t motifs-v10-nr.hgnc-m0.00001-o0.0.tbl\n" ] } ], + "source": [ + "!ls output/db/" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/jnourisa/miniconda3/envs/scglue/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "/home/jnourisa/miniconda3/envs/scglue/lib/python3.10/site-packages/ignite/handlers/checkpoint.py:16: DeprecationWarning: `TorchScript` support for functional optimizers is deprecated and will be removed in a future PyTorch release. 
Consider using the `torch.compile` optimizer instead.\n", + " from torch.distributed.optim import ZeroRedundancyOptimizer\n" + ] + }, + { + "data": { + "text/plain": [ + "chrom 732\n", + "chromStart 58972646\n", + "chromEnd 59142199\n", + "name 1282\n", + "score 1\n", + "strand 1\n", + "thickStart 1\n", + "thickEnd 1\n", + "itemRgb 1\n", + "blockCount 1\n", + "blockSizes 1\n", + "blockStarts 1\n", + "dtype: int64" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import scglue\n", - "par = {\n", - " 'atac_file': f\"resources/grn-benchmark/multiomics_atac.h5ad\",\n", - " 'rna_file': f\"resources/grn-benchmark/multiomics_rna.h5ad\",\n", - " 'annotation_file': f\"output/db/gencode.v45.annotation.gtf.gz\",\n", - " # 'motif_file': 'output/db/ENCODE-TF-ChIP-hg38.bed.gz',\n", - " 'motif_file': 'output/db/jaspar_encode.bed.gz',\n", - " 'temp_file': 'output/skeleton'\n", - "}\n", - "def process_atac(par):\n", - " atac = ad.read_h5ad(par['atac_file'])\n", - " split = atac.var_names.str.split(r\"[:-]\")\n", - " atac.var[\"chrom\"] = split.map(lambda x: x[0])\n", - " atac.var[\"chromStart\"] = split.map(lambda x: x[1]).astype(int)\n", - " atac.var[\"chromEnd\"] = split.map(lambda x: x[2]).astype(int)\n", - " atac.write(par['atac'])\n", - " \n", - "os.makedirs(par['temp_file'], exist_ok=True)\n", - "par['atac'] = f\"{par['temp_file']}/atac.h5ad\"\n", - "def peak2tf(par):\n", - " print('read atac')\n", - " atac = ad.read_h5ad(par['atac'])\n", - " peak_bed = scglue.genomics.Bed(atac.var)\n", - " print('read motif_bed')\n", - " motif_bed = scglue.genomics.read_bed(par['motif_file'])\n", - " print('run window_graph')\n", - " peak2tf = scglue.genomics.window_graph(peak_bed, motif_bed, 0, right_sorted=True)\n", - " # peak2tf = peak2tf.edge_subgraph(e for e in peak2tf.edges if e[1] in tfs)\n", - "def tss2tf(par):\n", - " rna = ad.read_h5ad(par['rna_file'])\n", - " scglue.data.get_gene_annotation(\n", - " rna, 
gtf=par['annotation_file'],\n", - " gtf_by=\"gene_name\"\n", - " )\n", - " rna = rna[:, ~rna.var.chrom.isna()]\n", - " \n", - " flank_bed = scglue.genomics.Bed(rna.var).strand_specific_start_site().expand(10000, 10000)\n", - " flank2tf = scglue.genomics.window_graph(flank_bed, motif_bed, 0, right_sorted=True)\n", - " \n", - "print('------ process_atac --------')\n", - "# process_atac(par)\n", - "print('------ peak2tf --------')\n", - "# peak2tf(par)" + "!ls output/db/\n", + "motif_bed = scglue.genomics.read_bed('output/db/jaspar_encode.bed.gz')\n", + "motif_bed.nunique()" ] }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 20, "metadata": {}, "outputs": [ { - "ename": "RuntimeError", - "evalue": "This function relies on bedtools (>=2.29.2). Detected version is 2.17.0. Please install a newer version. You may install bedtools following the guide from https://bedtools.readthedocs.io/en/latest/content/installation.html, or use `conda install -c bioconda bedtools` if a conda environment is being used.", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[38], line 4\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mscglue\u001b[39;00m\n\u001b[1;32m 3\u001b[0m peak_bed \u001b[38;5;241m=\u001b[39m scglue\u001b[38;5;241m.\u001b[39mgenomics\u001b[38;5;241m.\u001b[39mBed(atac\u001b[38;5;241m.\u001b[39mvar)\n\u001b[0;32m----> 4\u001b[0m peak2tf \u001b[38;5;241m=\u001b[39m \u001b[43mscglue\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenomics\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwindow_graph\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpeak_bed\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmotif_bed\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mright_sorted\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;66;03m# peak2tf = peak2tf.edge_subgraph(e for e in peak2tf.edges if e[1] in tfs)\u001b[39;00m\n", - "File \u001b[0;32m~/miniconda3/envs/scglue/lib/python3.10/site-packages/scglue/genomics.py:408\u001b[0m, in \u001b[0;36mwindow_graph\u001b[0;34m(left, right, window_size, left_sorted, right_sorted, attr_fn)\u001b[0m\n\u001b[1;32m 372\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwindow_graph\u001b[39m(\n\u001b[1;32m 373\u001b[0m left: Union[Bed, \u001b[38;5;28mstr\u001b[39m], right: Union[Bed, \u001b[38;5;28mstr\u001b[39m], window_size: \u001b[38;5;28mint\u001b[39m,\n\u001b[1;32m 374\u001b[0m left_sorted: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m, right_sorted: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[1;32m 375\u001b[0m attr_fn: Optional[Callable[[Interval, Interval, \u001b[38;5;28mfloat\u001b[39m], Mapping[\u001b[38;5;28mstr\u001b[39m, Any]]] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 376\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m nx\u001b[38;5;241m.\u001b[39mMultiDiGraph:\n\u001b[1;32m 377\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 378\u001b[0m \u001b[38;5;124;03m Construct a window graph between two sets of genomic features, where\u001b[39;00m\n\u001b[1;32m 379\u001b[0m \u001b[38;5;124;03m features pairs within a window size are connected.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 406\u001b[0m \u001b[38;5;124;03m Window graph\u001b[39;00m\n\u001b[1;32m 407\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 408\u001b[0m 
\u001b[43mcheck_deps\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mbedtools\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 409\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(left, Bed):\n\u001b[1;32m 410\u001b[0m pbar_total \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(left)\n", - "File \u001b[0;32m~/miniconda3/envs/scglue/lib/python3.10/site-packages/scglue/check.py:169\u001b[0m, in \u001b[0;36mcheck_deps\u001b[0;34m(*args)\u001b[0m\n\u001b[1;32m 160\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 161\u001b[0m \u001b[38;5;124;03mCheck whether certain dependencies are installed\u001b[39;00m\n\u001b[1;32m 162\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 166\u001b[0m \u001b[38;5;124;03m A list of dependencies to check\u001b[39;00m\n\u001b[1;32m 167\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 168\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m args:\n\u001b[0;32m--> 169\u001b[0m \u001b[43mCHECKERS\u001b[49m\u001b[43m[\u001b[49m\u001b[43mitem\u001b[49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcheck\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/miniconda3/envs/scglue/lib/python3.10/site-packages/scglue/check.py:127\u001b[0m, in \u001b[0;36mCmdChecker.check\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 125\u001b[0m v \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 126\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvmin \u001b[38;5;129;01mand\u001b[39;00m v \u001b[38;5;241m<\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvmin:\n\u001b[0;32m--> 127\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m 
\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mjoin([\n\u001b[1;32m 128\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvreq_hint, \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDetected version is \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mv\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 129\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPlease install a newer version.\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39minstall_hint\n\u001b[1;32m 130\u001b[0m ]))\n", - "\u001b[0;31mRuntimeError\u001b[0m: This function relies on bedtools (>=2.29.2). Detected version is 2.17.0. Please install a newer version. You may install bedtools following the guide from https://bedtools.readthedocs.io/en/latest/content/installation.html, or use `conda install -c bioconda bedtools` if a conda environment is being used." + "name": "stdout", + "output_type": "stream", + "text": [ + "db.regions_vs_motifs.rankings.feather hg38-blacklist.v2.bed\n", + "db.regions_vs_motifs.scores.feather JASPAR2022-hg38.bed.gz\n", + "ENCODE-TF-ChIP-hg38.bed.gz\t jaspar_encode.bed.gz\n", + "gencode.v45.annotation.gtf.gz\t motifs-v10-nr.hgnc-m0.00001-o0.0.tbl\n" ] + }, + { + "data": { + "text/plain": [ + "chrom 639\n", + "chromStart 47138515\n", + "chromEnd 47146899\n", + "name 634\n", + "score 1\n", + "strand 1\n", + "thickStart 1\n", + "thickEnd 1\n", + "itemRgb 1\n", + "blockCount 1\n", + "blockSizes 1\n", + "blockStarts 1\n", + "dtype: int64" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "\n", - "if False: # peak2tf\n", - " \n", - "if False:\n", - " " + "motif_bed = scglue.genomics.read_bed('output/db/JASPAR2022-hg38.bed.gz')\n", + "motif_bed.nunique()" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 17, "metadata": {}, - "outputs": [], + "outputs": [ + { 
+ "name": "stdout", + "output_type": "stream", + "text": [ + "atac-emb.h5ad\t\tgene2peak.csv\t guidance.graphml.gz rna.h5ad\n", + "atac.h5ad\t\tgene2peak.links peak2tf.csv\t tf2gene.csv\n", + "consistency_scores.csv\tglue\t\t peaks.bed\n", + "flank2tf.csv\t\tglue.dill\t rna-emb.h5ad\n" + ] + }, + { + "data": { + "text/plain": [ + "source 634\n", + "target 17075\n", + "dtype: int64" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# df = pd.read_csv('output/scenicplus/qc/tss.bed', sep='\\t')\n", - "# df.head(5)" + "!ls output/skeleton/\n", + "skeleton = pd.read_csv('output/skeleton_jaspar/tf2gene.csv', index_col=0)\n", + "skeleton.nunique()" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 25, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "source 135406\n", + "target 634\n", + "dtype: int64" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# pd.read_csv('output/scenicplus/cell_topic.csv')" + "pd.read_csv('output/skeleton_jaspar/peak2tf.csv', index_col=0).nunique()" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "source 135406\n", + "target 634\n", + "dtype: int64" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.read_csv('output/skeleton/peak2tf.csv', index_col=0).nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-rw-r--r-- 1 jnourisa clusers 102386538 Oct 8 13:42 output/skeleton/tf2gene.csv\n" + ] + } + ], + "source": [ + "!ls -l output/skeleton/tf2gene.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ - "# adata_atac = 
ad.read_h5ad('resources/grn-benchmark/multiomics_atac_d0.h5ad')" + "skeleton_encode = pd.read_csv('output/skeleton_encode/tf2gene.csv', index_col=0)\n", + "skeleton_jaspar = pd.read_csv('output/skeleton_jaspar/tf2gene.csv', index_col=0)\n" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ - "\n", - "if False:\n", - " import re\n", - "\n", - " # Function to convert location string to BED format\n", - " def convert_to_bed(locations, output_file='output.bed'):\n", - " with open(output_file, 'w') as f:\n", - " for loc in locations:\n", - " # Use regex to extract chromosome, start, and end\n", - " match = re.match(r\"([^:]+):(\\d+)-(\\d+)\", loc)\n", - " if match:\n", - " chromosome, start, end = match.groups()\n", - " # BED format requires chrom, chromStart, chromEnd (0-based start)\n", - " bed_line = f\"{chromosome}\\t{int(start)-1}\\t{end}\\n\"\n", - " f.write(bed_line)\n", - "\n", - " # Call the function with your location list\n", - " convert_to_bed(adata.var_names[:1000], 'resources/prior/peaks.bed')" + "skeleton = pd.concat([skeleton_encode, skeleton_jaspar], axis=0)" ] }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(
,\n", - " )" + "source 1282\n", + "target 17075\n", + "dtype: int64" ] }, - "execution_count": 42, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" - }, + } + ], + "source": [ + "skeleton = skeleton.drop_duplicates().reset_index(drop=True)\n", + "skeleton.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "all_links = skeleton['source'].astype(str) + '_' + skeleton['target'].astype(str)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "np.savetxt('resources/prior/skeleton.csv', all_links.values, fmt='%s')" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAFzCAYAAADSc9khAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAABGVElEQVR4nO3deVhUZf8G8JthExRwQxHFBfclAVlNBURBC3tNy8IVFbXVTCvDl/wlpq9Y2Zu9ZuaSuKBmqVlZKpJbqYhg7rviDiKKgyzDMPP8/hiZHFmcgYGZYe7PdXFdnHOeM/N9mOHm4TlnzrEQQggQEZFZkBi6ACIiqjkMfSIiM8LQJyIyIwx9IiIzwtAnIjIjDH0iIjPC0CciMiMMfSIiM2Jl6AJqmlKpxK1bt+Dg4AALCwtDl0NEVGVCCOTm5sLV1RUSScVjebML/Vu3bsHNzc3QZRAR6d3169fRokWLCtuYXeg7ODgAUP1wHB0dtd5PLpdj586dCAsLg7W1dXWVV2PYH+PG/hg3Y+uPVCqFm5ubOt8qYnahXzKl4+joqHPo29vbw9HR0She5Kpif4wb+2PcjLU/2kxZ80AuEZEZYegTEZkRhj4RkRlh6BMRmRGGPhGRGWHoExGZEYOHflFREaKjo2FlZYX09PSntv/zzz8REBCAoKAgBAQEYP/+/dVfJBFRLWHQ8/TT09MxfPhwdOjQAQqF4qntr169ivDwcPz666/o06cP9u7di0GDBuH48eNo1apVDVRMRGTaDDrSf/jwIdasWYNx48Zp1X7hwoXo0qUL+vTpAwAICgpCx44d8dVXX1VnmUREtYZBQ79bt25o166d1u2TkpLg4+Ojsc7X1xe7du3Sd2lERDXuzwt3kXg6s1qfw6Quw3D58mUMGzZMY52LiwuuXLlS7j4ymQwymUy9LJVKAag+Ri2Xy7V+7pK2uuxjzNgf48b+GLfq6E9K+n1MWpOGQrkCKyO98WzbRjrXow2TCv38/HzY2tpqrLO1tUV+fn65+8ybNw+xsbGl1u/cuRP29vY615CYmKjzPsaM/TFu7I9x01d/Lj4Avj1riSKlBTo6KXH3TDJ+O6f9/hVl4JNMKvTt7e01Ru2AaiRfUXjPmDED06ZNUy+XXI0uLCxM5wuuJSYmIjQ01KgusFRZ7I9xY3+Mmz77k3zlHqLXpKFIqUSvto2wZKQn6lhb6vQY
JTMY2jCp0Hd3d0dmpuZ8V0ZGBtzd3cvdx9bWttR/BwBgbW1dqRersvsZK/bHuLE/xq2q/Tlw6S4mrjmKArkSgR2csXS0t86BX1KHtgx+nr4u+vXrh9TUVI11R44cQf/+/Q1UERFR5Ry4eBfj41NQIFcgqAqBryujDv0RI0Zg9OjR6uUpU6bg1KlT+OuvvwAA+/fvx9mzZzF58mRDlUhEpLO/Lt7F+FUpKJQrEdzRGd/WUOADBp7eKSoqQlhYGHJycgAAERERcHNzww8//AAAKCws1LjfY6tWrfDrr7/ivffeg42NDWQyGX799Vd+MIuITMafF+4ialUKZMVKhHRqgm9G9YCtVc0EPmDg0LexscGePXvK3b558+ZS6/r06YNDhw5VY1VERNVj3/ksTFx9xGCBDxj59A4RUW2x93wWJjwK/P6dDRP4gImdvUNEZIr2nLuDSWtSUVSsRP/OTbF4ZA/YWBlmzM2RPhFRNdp97g4mrVYFfmgXwwY+wNAnIqo2u8/ewWurU1GkUGJA16b4eoRhAx/g9A4RUbVIOpOJN9amoUihxMCuLvjfCC9YWxp+nM3QJyLSs12nM/FGQirkCoHnurngq+HGEfgAp3eIiPQq8bHAD3+mmVEFPsCRPhGR3uw8lYG31qWpAr97Myx81RNWRhT4AEf6RER6sf1kBt5MUAX+ICMNfIAjfSKiKtt+8jbeXncUxUqBf3m44otXPIwy8AGO9ImIquT3E/8E/ouexh34AEf6RESV9vvJDEz94QQUSoEhXs3x+TAPWEosDF1WhRj6RESVcPSuBdYkqwJ/qFdzfGYCgQ8w9ImIdPbr8dtYfUECJQRe6tECn77c3SQCH+CcPhGRTrb+fRPv/XgCSlhgqJerSQU+wNAnItLa1r9vYur3f0MpAH9nJf7zYleTCnyA0ztERFrZcvQG3tt4DEoBDPNujmetr5pc4AMc6RMRPdXmtBuY9ijwI3zdMOdfXWCCeQ+AoU9EVKEfU2/gvR+OQQhguF9L/GfIM5CYauKDoU9EVK4fjlzHBz+qAn+kf0vMfbGbSQc+wNAnIirTxpTrmL7pOIQARgW0xCeDTT/wAR7IJSIq5fuUa/hw0wkAwOiAVpg9uCssLEw/8AGO9ImINGw4/E/gR/asXYEPcKRPRKS2Lvka/r1FFfhjn22Nj1/oUqsCH2DoExEBABKSryJmy0kAwLherfF/g2pf4AMMfSIirDl0FTN/UgV+VO82+Ci8c60MfIChT0Rmbs3BdMzcegoAMKF3G8TU4sAHGPpEZMZWHUjHxz+rAn9SoDtmPNepVgc+wNAnIjMV/9cVzPrlNADgtSB3RA+s/YEPMPSJyAx99+cVzP5VFfivB7XFhwM7mkXgAwx9IjIzy/dfxpxtZwAAbwS3xfQB5hP4AEOfiMzI44H/Vt+2eD/MvAIfYOgTkZlYuu8S/vPbWQDA5JB2mBbawewCH2DoE5EZ+HbvJcz7XRX474S0w1QzDXyAoU9EtdySvZcQ9yjwp/Rrj6mhHQxckWEx9Imo1lq85yI+3X4OADC1fwdM6d/ewBUZHkOfiGqlr3dfxGc7VIE/LbQD3unHwAcY+kRUCy364wI+33keAPB+WAe8HcLAL8HQJ6Ja5aukC/giURX4HwzoiLf6tjNwRcaFoU9EtcaXu87jy10XAADTB3bEm8EM/Ccx9InI5Akh8OWuC1iYpAr86Oc64fWgtgauyjgx9InIpAkh8N/E8/jqj4sAgH8/3wmTAhn45WHoE5HJEkJgwc7zWLRbFfgxz3fGxEB3A1dl3Bj6RGSShBD4fOc5fL37EgDgo/DOmNCHgf80EkMXAABbtmyBr68v+vTpg6CgIJw6darctjKZDFOnToWHhweCgoLg7++PLVu21GC1RGRoQgh8uuOfwP+/QV0Y+Foy+Ej/8OHDiIyMRGpqKtq3b4/Vq1djwIABOHPmDBwcHEq1nzNnDn766Sf8/fffcHJywtGjRxEQEIDDhw/Dw8PD
AD0gopokhEDc9rP4du9lAMDHL3TBuF5tDFyV6TD4SD8uLg7h4eFo31714YlRo0ahuLgY8fHxZbb/+++/4evrCycnJwCAl5cXnJyc8Mcff9RUyURkIEIIxP3+T+DPYuDrzOChn5SUBB8fH/WyRCKBt7c3du3aVWb7l156Cfv378e1a9cAADt27EBWVhaaNm1aI/USkWEIIfCf387g232qwJ89uCvGMvB1ZtDpnezsbEil0lKB7eLigpSUlDL3GTt2LPLz89G9e3c0a9YM58+fx8svv4xXXnmlzPYymQwymUy9LJVKAQByuRxyuVzrWkva6rKPMWN/jBv7o0kIgXnbz2PlgasAgFmDOmG4T3OD/XyM7fXRpQ6Dhn5+fj4AwNbWVmO9ra2tetuTli9fjri4OKSmpqJt27Y4duwYdu3aBYmk7H9a5s2bh9jY2FLrd+7cCXt7e51rTkxM1HkfY8b+GDf2BxAC2HJVgr23Vb/jw9oo0CD7JH777aS+y9OZsbw+5eVlWQwa+iWh+/hIvGS5rEAWQmD69Ol477330Lat6sMXHh4emDZtGgoKCvDRRx+V2mfGjBmYNm2aelkqlcLNzQ1hYWFwdHTUula5XI7ExESEhobC2tpa6/2MFftj3NgfFSEE5v5+Dntvq6ZzP/lXF0T4tqiuMrVmbK9PyQyGNgwa+o0aNYKTkxMyMzM11mdkZMDdvfTpV1lZWbh//z5at26tsb5NmzbYtGlTmaFva2tb6j8JALC2tq7Ui1XZ/YwV+2PczLk/QgjE/nIaqw6qAn/e0Gcw3K9ldZanM2N5fXSpweAHckNCQpCamqpeFkIgLS0N/fv3L9W2cePGsLW1xe3btzXW3759u1JTNURknIQQmPXzKcQfSIeFBRBnhIFvqgwe+tHR0di2bRsuXlR9jDohIQGWlpaIjIwEAPTu3RsxMTEAVGf2REZGYvny5bh//z4AIC0tDYmJieUeyCUi0yKEwP9tPYVVB6/CwgKYP7Q7Ihj4emPwD2f5+fkhPj4eERERsLOzg0QiwY4dO9QfzMrPz9eY8//vf/+LWbNmoV+/frC3t0dubi7i4uLwzjvvGKoLRKQnSqXAzK0nkZB8TRX4L3XHKz5uhi6rVjF46APAkCFDMGTIkDK3paWlaSzb29vj008/rYmyiKgGKZUCH209iXWPAv/Tl7pjGANf74wi9InIvCmVAjE/ncD6w9dhYQF8/rIHXvI2/Fk6tRFDn4gMSqkU+PeWE9iQch0SC+DzYR4Y2oOBX10Y+kRkMEqlQPTm49h45AYkFsAXr3jiRa/mhi6rVmPoE5FBKJQC0ZuO44dUVeD/91VPDPZk4Fc3hj4R1TiFUmD6j8exKY2BX9MY+kRUoxRKgQ9+PIbNaTdhKbHAl6964gUPV0OXZTYY+kRUYxRKgQ83H8Pmo6rAXxjhiUHdGfg1iaFPRDVCKYDpm07i5+O3YSmxwFcRXgjv3szQZZkdnS/DkJCQUB11EFEtVqxQYu1FCX4+fhtWEgssGs7ANxSdR/rTp0+HXC7HK6+8woucEdFTFSuUeH/TSaTelagCf0QPDOzmYuiyzJbOI/1mzZohJycHoaGhGD9+PPbv318ddRFRLVCsUOLd7//GthMZkFgILHy1OwPfwHQO/TVr1uDdd9/FX3/9hcmTJ+OHH35AQEAA5s2bh5s3b1ZHjURkguQKJaZs+Bu/Hr8Na0sLjO+gRFgX3sva0HQO/c6dO6u/9/LywogRI9CxY0fExMSga9eueP7557Fp0yYIIfRaKBGZDlXgH8W2E6rA/9+rHnimITPBGOgc+uHh4bhz5w4+//xzdO3aFb1798aNGzewatUq3L59G8uWLcOJEycwYsSI6qiXiIycXKHEO+uP4rcTGbC2tMA3I73Rr3MTQ5dFj+h8IHfPnj1wc3ODq6srIiMjMXbsWI3bFzZv3hyzZs2Cl5eXPuskIhMgVygxed1RbD+VARtLCZaM7oGQTk0hl8sNXRo9onPo
N2nSBCtWrEBISEi5bebNm8cze4jMTFGxEpPXp2HHqUzYWErw7Whv9O3EEb6x0Xl6JyYmplTgFxUV4ZtvvkF2djYAYMaMGfjrr7/0UyERGb2iYiXeWvco8K0k+HYMA99Y6Rz669atK7XOwsICubm5GDZsmF6KIiLTUVSsxJsJaUg8rQr8paO90bcjA99Y6eXG6NbW1pg+fTry8vL08XBEZCJkxQq8mZCKXWcyYWslwfIxPghm4Bs1reb0Fy5ciIULFwIAMjIy4O7uXqrNgwcP4OPjo9/qiMhoyYoVeGNtGv44e0cV+JE+6NPe2dBl0VNoFfrBwcGoX78+hBCYP38+oqOjNbZLJBI4OztXeHCXiGqPQrkCb6xNxe5zWbC1kmBFpC96t29s6LJIC1qFvoeHBzw8PAAAtra2GD58eLUWRUTGq1CuwOtrU7HnXBbqWEvwXaQvnm3HwDcVOs/pVxT4kyZNqlIxRGTcCuUKTFrzWOCPZeCbGq1G+j/99BMaNmyIwMBAjB8/vtx227dv11thRGRcCuUKTFx9BPsv3IWdtSW+G+uLnm0bGbos0pFWI/1PPvkES5YsAQD8/vvvEEKU+UVEtdOTgb9yHAPfVGk10k9NTVV/HxYWhpUrV5bZLjIyUj9VEZHRKChSBf6fF+/C3sYSK8f6wt+dgW+qdL4Mw6pVqyq1jYhMT0GRAlGrUnDgUjbsbSwRP84Pfm0aGrosqgKdD+SmpKRg9uzZuHz5MgBg8eLF8PDwwLBhw5CZman3AonIMPKLijE+XhX4dW0ssWo8A7820Dn0Y2NjoVQq0aBBAxw9ehSTJ0/GgAED0KJFC7z99tvVUSMR1bCSwD94WRX4q6P84NuagV8b6Dy9k5eXh1mzZgEAPv74Y/Tt2xeffvopAKB37956LY6Ial6eTBX4yVfuoZ6tFVaN94V3KwZ+baHzSL/kutgymQw//PADoqKi1NtsbGz0VxkR1bg8WTHGPQp8B1srrI7yY+DXMjqP9Js1a4axY8ciIyMDADB06FAIIbBjxw7IZDK9F0hENeOhrBjjVh5GSvp9deB7tWxg6LJIz3Qe6X/77beoW7cu6tSpgy1btsDW1hZbt27F/Pnz8dZbb1VHjURUzR7KijH2u0eBX8cKayb4M/BrKZ1H+g0bNsTXX3+tse7FF1/Eiy++qL6JChGZjtxCOcauTEHqVVXgr43yh4dbfUOXRdVEL9fTL8GbqBCZltxCOSK/O4zUq/fhWMcKCRMY+LWdzqH/999/Izg4GA0aNIClpaXG1969e6ujRiKqBtJCOcZ8dxhp13IeBX4Aureob+iyqJrpPL0TGRmJ/v3747333oODgwMsLCwAAEIITJ06Ve8FEpH+SQvlGLPiMP6+ngMnO2skTPBHt+ZOhi6LaoDOoe/g4IAFCxaUua3k7lpEZLweFKhG+Meu56C+vTXWRjHwzYnO0zvdu3fH3bt3y9yWlpZW5YKIqPo8KJBj9IpkdeBzhG9+KjXS9/f3R0hICFxdXWFpaaneFh8fj3fffVef9RGRnjzIl2P0d8k4fuMBGthbI2FCALq4Ohq6LKphOof+0qVL4enpiYsXL+LixYsa23JycvRVFxHpUU5+EUavOIwTNx+gYV0bJEzwR+dmDHxzpHPo9+7dG7/88kuZ23jvXCLjk5NfhJHLk3HqlhQN69pg3UR/dHJh4JsrnUO/vMAHgPXr11epGCLSr/t5RRi1QhX4jeraYN3EAHR0cTB0WWRAlfpwVnJyMiIjI/Hqq68CAJYsWcJz9ImMzL28Iox4NMJvXM8G6ycx8KkSof/TTz+hf//+uH//Ps6cOQMA6NSpE2bMmIENGzZUqogtW7bA19cXffr0QVBQEE6dOlVh+8uXL+Oll15C37590bVrVwQEBODIkSOVem6i2uheXhFGLDuEM7cfBf7EAHRoysCnSoT+ggULcOzYMfz8889o1Eh1n8zg4GAkJiZi8eLFOhdw+PBhREZGYt26ddi/fz+i
oqIwYMAA5Obmltk+KysL/fr1w5QpU7B7924cO3YM9vb2pQ4qE5mr7IcyjFh2CGczctG4ni02TApAewY+PaJz6FtaWsLd3R0A1J/GBYC6detCqVTqXEBcXBzCw8PRvn17AMCoUaNQXFyM+Pj4MtvPnz8fPXv2RGBgIADAysoKS5cuVS8TmbO7D2UYsSwZZzNy4eygCvx2TRj49A+dQz83Nxe3b98utf7EiRPljs4rkpSUBB8fn38Kkkjg7e2NXbt2ldl+8+bNpQK+Xbt2cHV11fm5iWqTu49G+Ocyc9FEHfj1DF0WGRmdz96ZMmUKPDw8EBERgevXryM2Nhbnzp3Dzz//jKVLl+r0WNnZ2ZBKpWjatKnGehcXF6SkpJRqn5eXhytXrkChUGDkyJFIT09HvXr18O677+K5554r8zlkMpnGzV2kUikA1R3ASu4Cpo2StrrsY8zYH+Oma3/uPpRhzMojuHAnD00dbLFmvA9a1rc1mp+Hub8+1U2XOiyEEELXJ9ixYwfmzZuHkydPAgC6deuGmJgYhIaG6vQ4169fR8uWLbFx40aNyzK/+eab2LlzZ6l5+ps3b6JFixZo0KABdu/eDQ8PDyQlJWHAgAH4/fffy3z+WbNmITY2ttT6devWwd7eXqd6iYyRtAhYdNoSmQUWcLIWeLurAk3sDF0V1aT8/HyMGDECDx48gKNjxZ/BqFTo60t2djYaN26MNWvWYNSoUer1UVFRSElJwfHjxzXaZ2RkoFmzZhgzZgxWrVqlXh8WFgYbGxv8+uuvpZ6jrJG+m5sb7t69+9QfzuPkcjkSExMRGhoKa2trXbpplNgf46Ztf7JyZRj13RFcvpuHpo62WDveB60b1a3BSrVjrq9PTZFKpWjcuLFWoa/z9E5qaipSUlKQk5ODhg0bws/PD56enpUqtFGjRnByckJmZqbG+oyMDPXB4sc5OzvD1tYWzZs311jfqlUrHDhwoMznsLW1ha2tban11tbWlXqxKrufsWJ/jFtF/bkjLcTolarAb+ZUB+snBqB1Y+ML/MeZ0+tT03VoS+vQv3TpEkaNGoXDhw/j8X8OLCws0LNnTyQkJKBVq1a6VQogJCQEqamp6mUhBNLS0hATE1OqraWlJXr16lXqQHJmZiZatmyp83MTmapMaSGGLzuEy1l5cHWqg/WTAtDKCEf4ZHy0OnsnOzsbffv2RcOGDbF9+3ZkZ2dDLpfj7t272LZtGxwcHBAUFIT79+/rXEB0dDS2bdumnr9PSEiApaUlIiMjAaiu9fP4H4APP/wQW7duxbVr1wAAp0+fxs6dO3lTdjIbGQ8KMXzpP4G/YVJPBj5pTauR/oIFCxAeHo5vvvlGY33Dhg0xcOBADBw4EK+//jo+//xzzJ07V6cC/Pz8EB8fj4iICNjZ2UEikWDHjh1wcFCdW5yfn68xJx8WFoavvvoKgwcPRr169VBcXIxVq1Zh0KBBOj0vkSnKeKAa4V+5m4fm9e2wYVIA3BryhATSnlahn5iYiN27d1fY5tNPP0W/fv10Dn0AGDJkCIYMGVLmtrJuzDJq1CiNA79E5uD2gwIMX3oI6dn5DHyqNK1C387ODvXqVfwhD0dHR9jZ8TwxoupwK6cAw5cdwtXsfLRoYIf1Exn4VDlahb6VlXbHe7VtR0Tau5VTgIilh3DtXj7cGqoCv0UDBj5VjlYpfebMGYwfP/6p7c6ePVvlgojoH7dyCjBq5RFcv1eAlg3tsX5SAJrX53/UVHlahX5hYSGuXLmiVTsi0o97MmDkihTcyClEy4b22DApAK4MfKoirULf09PzqQdyAaBv375VLoiIgBv3C/C/U5a4JytE60aqEX4zJwY+VZ1Wob9z506tHkzbdkRUvuv38jHquxTck1mgVUN7bJjUEy5OdQxdFtUSWn04S9uP+BrDx5GJTNm17HxELD2EmzmFcK4jsDbKh4FPesXTbYiMhCrwD+LWg0K0aWSPca2lcHFk4JN+VerG6ESkX1ez
8/Dqo8B3d66LtVG+cLIxdFVUG3GkT2Rg6XfzELH0EDKkhWjrXBfrJwaggZ2locuiWqrSI325XK6+6Fll7o1LRMCVxwK/XZN6WD8pAE04pUPVSOfQl8lkeP3111G3bl31KZrjx49HVFQUCgoK9F4gUW11OeshIpYeRIa0EO2b1MP6iQFo4sDAp+qlc+hHR0fj5s2b2LBhA5o0aQIAWL58OTp37oxp06bpvUCi2uhS1kNELD2ETKkMHZqqRvjODqVv9kOkbzqH/pEjR7B161YMHTpUfYE1KysrvP/++7wMA5EWSgL/Tq4MHZs6YN3EADSux8CnmqHzgVyFQgGJRPW34snb6967d08/VRHVUhfv5GL4smRk5crQycUBCRP80YiBTzVI55G+k5MTli1bBkB1q0QAyMvLw0cffVTq3rVE9I8LmbmIWPpP4K+bGMDApxqn80h/4cKFGDhwID744AMoFAq0adMGt2/fRosWLbBjx47qqJHI5F3IzMXwZYdw92EROjdzRMIEfzSsyxPxqebpHPodOnTA2bNnkZCQgFOnTgEAunXrhhEjRsDGhm9ioiedz8zFiEeB3+VR4Ddg4JOB6Bz63377LV577TWMGzeuOuohqlXOZkgxclkysvOK0NVVFfj17Rn4ZDg6h35MTAxycnIwatQozuETVeBshhQjliXjXl4RujV3xNooBj4Zns4Hctu3bw83NzdMmjQJAwcOxOrVq5Gfn18dtRGZrDO3/wn8Z5o7ISEqgIFPRkHnkf7GjRvh5uaGESNGICMjAwkJCQgNDUXbtm0RGRmJfv36VUedRCbj1K0HGLU8Gffz5ejewglrxvvDyZ6XHSfjoPNIv0WLFurvXVxc0LNnTzzzzDP48ccf8eKLL+qzNiKTc+rWA4x8FPgeLZywJoqBT8ZF59Dv168fLl26hFmzZqFdu3YICgrCpUuX8M033+D27dvVUSORSTh5UxX4OflyeLjVx+oofzjZMfDJuOg8vfPXX3+hQ4cO6Ny5MyZOnMgDukQATtx4gFErkvGgQA5Pt/pYHeUHxzoMfDI+Ood+q1at8P3338PLy6s66iEyOSduPMDI5YcgLSyGV8v6WDWegU/GS+fpndWrV5cb+Js3b65yQUSm5PiNHHXge7dqgNUMfDJyWo30i4qKYG1tDQsLCxQVFWHfvn1ltpszZw6GDh2q1wKJjNXf13MwekUycguL4dOqAeLH+6GeLW9GR8ZNq3dou3bt0KlTJ+zcuRPBwcHltiu5ABtRbXf02n2MWXEYubJi+LZugJXjGPhkGrR6l27evBkODg4AgKCgIOzevbvMdiV30iKqzdKu3Ufko8D3a90QK8f5oi4Dn0yEVu9UHx8f9feLFy8us01eXl6524hqi9Sr9xH53WE8lBXDr01DrBzLwCfTovOB3K+++qrUury8PPj7+yMhIUEvRREZo9Sr99SB79+mIeI5wicTpPM79ty5c6XW1a1bFydPnkSfPn30UhSRsTmSrgr8vCIFero3woqxPrC3YeCT6dHqXbt3717s3bsXAJCeno7Zs2eXanP//n1kZ2frtzoiI5CSfg9jHwX+s20bYUWkL+xsLA1dFlGlaBX66enp6oO39+/fL3UgVyKRwNnZWX0bRaLa4vCVexi78jDyixTo1a4Rlo9h4JNp0yr0IyMjERkZqf5+1apV1VoUkTFIvpyNcfEpyC9SoHe7xlg2xoeBTyZP5wO5FQV+UlJSlYohMhYHL2Vj7EpV4Pdp3xjLIxn4VDtU6kiUUqnEpUuXkJGRASGEev0HH3yAtLQ0vRVHZAgHLt1FVPwRFMgVCOzgjKWjvVHHmoFPtYPOoX/mzBkMGTIE58+fh4WFhUbo8xO5ZOoOXLyL8atSUChXIqiDM75l4FMto/P0zrvvvouZM2eioKAAgYGBUCqVKCwsREJCAj7++OPqqJGoRvz1WOAHd2TgU+2kc+jLZDKMHDkStra26nU2NjYYPnw4jh49qtfiiGrKnxfu
Yny8KvD7dnTGklEMfKqddA59uVyu/l6hUKjPzS8oKMDJkyf1VxlRDdl/IQtRq1IgK1aiX6cmWMIRPtViOod+8+bNERERgZycHPTt2xf+/v6YOHEifH190bFjx+qokaja7D2fhahVRyArVqJ/5yZYPKoHbK0Y+FR76Xwg97PPPsPJkydhbW2NGTNm4O7du9i/fz+6deuGL774ojpqJKoWe87dwaQ1qSgqVqJ/56ZYPLIHbKx0HgcRmRSd3+GtWrVCeHg46tatizp16uDrr7/G8ePHsWHDBri6uupcwJYtW+Dr64s+ffogKCgIp06d0mq/RYsWwcLCAnv27NH5OYl2Pxb4YV0Y+GQ+9PouHzJkiE7tDx8+jMjISKxbtw779+9HVFQUBgwYgNzc3Ar3u3XrFj777LOqlEpm7I+zmXhttSrwB3RtikUjGPhkPrSa3gkJCdHqwf7++2+dnjwuLg7h4eFo3749AGDUqFGYPn064uPjMXny5HL3mzx5Mv7973/j9ddf1+n5iJLOZOKNtWkoUigxsKsL/jfCC9aWDHwyH1qF/pUrVzB27NintktPT9fpyZOSkvB///d/6mWJRAJvb2/s2rWr3ND/5ZdfYG1tjQEDBuj0XES7TmfijYRUyBUCzz/jgoURDHwyP1qF/vDhw7X64JVMJtP6ibOzsyGVStG0aVON9S4uLkhJSSlzn7y8PMTExGDHjh1aP5dMJtNoK5VKAahOPX389NOnKWmryz7GzNz6k3TmDiZ/fwxyhcBzXZvi85e6AUoF5EpFTZapNXN7fUyNsfVHlzq0Cv3//Oc/Wj3YoEGDtH7i/Px8AND4kFfJcsm2J82cOROvv/46mjVrpvV/FfPmzUNsbGyp9Tt37oS9vb3W9ZZITEzUeR9jZg79OX7PAivPS6AUFvBqpERovZtI3HHTANXpzhxeH1NmLP0pLzPLovMpm9euXSt32/vvv48DBw5o9TglgfvkiF0mk5UZxmlpaUhOTsbnn3+uQ7XAjBkzMG3aNPWyVCqFm5sbwsLC4OjoqPXjyOVyJCYmIjQ0FNbW1jrVYIzMpT87T2diVfJxKIVA+DMu+PylbrAygSkdc3l9TJWx9adkBkMbOod+69at9XJhtUaNGsHJyQmZmZka6zMyMuDu7l6q/bZt21BQUKA+qFxYWAhAdS2g+vXrY/ny5WjXrl2p/WxtbUv9NwEA1tbWlXqxKrufsarN/dl+8jamfH8cxUqBf3m44otXPEwi8B9Xm1+f2sBY+qNLDTqHvr+/PzZs2KBeVigUuHHjBr7//nsEBgbq9FghISFITU1VLwshkJaWhpiYmFJtZ86ciZkzZ6qX09PT0aZNG3z55ZcIDg7WtRtUy/124jYmrz8KhVJgsKcrFgwzvcAnqg6VuolKq1at1F/u7u4IDAzE119/rfHHQBvR0dHYtm0bLl68CABISEiApaWl+i5dvXv3LvMPAFFFth3/J/CHeDXHF694MvCJHtF5pN+hQ4cy18vlcly4cEGnx/Lz80N8fDwiIiJgZ2cHiUSCHTt2wMHBAYDq4ERZZ+m8++67OHTokPr7Tp066fwHh2qn305kYNqPJ6BQCgz1ao7PhnnAUsL7PBCV0Dn0x48fX2pdbm4u0tLS4Ofnp3MBQ4YMKfeTvOXdhevLL7/U+Xmo9ku7a4G1yY8Cv0dzfPYyA5/oSTr/z/v7779DCKH+AgBXV1d89NFHiI+P13d9RFr55fhtrL4ggUIp8LJ3CwY+UTl0HulHRETgv//9b3XUQlQpPx29ifd/PAEBC7zcozk+fak7JAx8ojLpPNKvKPBXr15dpWKIdLXl6A1M2/g3lAIIaKLE3MFdGPhEFdB5pA8AV69exbFjx/DgwQONG6PHxcVhzJgxeiuOqCKb027gvR+OQQjgVZ/mCLC6ysAnegqdQ3/+/PmIiYlBw4YNUbduXY1tT37Qiqi6/Jh6Ax/8qAr84X4tMSu8I7Zvv2rosoiMns6hv2LFCpw6darM
WyPyypdUE344ch3TNx2HEMAI/5aYM7gbFIpiQ5dFZBJ0ntPv2rVruffC/f7776tcEFFFNj4W+KMCWmLui904pUOkA51D/5133sGSJUtw69Ytjfl8ABg6dKjeCiN60vcp1/Dho8Af07MVPhncTS/XgSIyJzqHvoODAxYvXgw3NzdYWVnB0tJS/bV3797qqJEI6w9fw4ebTkAIILJnK8T+qysDn6gSdJ7THzduHAYPHoz58+drXAJZCIGpU6fqtTgiAFiXfA3/3nICADD22db4+IUuDHyiStI59Bs0aIA5c+aUue2LL76ockFEj0tIvoqYLScBAON6tcb/DWLgE1WFztM7zz77LK5cuVLmth07dlS5IKISaw79E/jje7Vh4BPpgc4j/du3b8PPzw9eXl5o1qwZLC0t1du2b9+OuLg4vRZI5mnNwXTM3HoKADChdxvEhHdm4BPpgc6hv3PnTo174T55Bg9RVa06kI6Pf1YF/qRAd8x4rhMDn0hPdA79QYMGYdmyZWVu44FcqqqVf11B7C+nAQCvBbojmoFPpFc6h355gQ8Ac+fOrVIxZN6++/MKZv+qCvzXg9riw4EdGfhEeqbXe8g9Pu1DpIsVjwX+m8EMfKLqovNI393dvdxtGRkZVSqGzNPy/ZcxZ9sZAMBbfdvi/TAGPlF10Tn0bW1tER0drV5WKBS4efMmfvnlF7zxxht6LY5qv2X7LmPub6rAnxzSDtNCOzDwiaqRzqEfGxuLV155pdT6qVOn4vXXX9dLUWQevt17CfN+PwsAeKdfe0zt356BT1TNdA79sgIfAOrVq4eLFy9WuSAyD9/suYT521WBP6Vfe0wN7WDgiojMg86hX9YtEXNzc3HgwAFIJHo9Lky11OI9F/Hp9nMAgKn9O2BK//YGrojIfOgc+q+99hpcXFzUyxYWFnBwcICnpycSEhL0WhzVPl/vvojPdqgC/73QDpjcj4FPVJN0Dv2AgADs3r27OmqhWu5/SRewIPE8AOD9sA54O4SBT1TTdA59Bj5VxldJF/DFo8D/YEBHvNW3nYErIjJPWk3CZ2VlYfbs2Zg9ezZOnz5davv06dORlZWl9+Kodvhy13l14H84sBMDn8iAtAr977//HnPnzsWDBw9Qv379UtvPnDmDnj174ubNm/quj0yYEAJfJJ7Hl7suAACin+uEN4LbGrgqIvOmVehv3boVGzduxIIFC+Dq6lpq+y+//IIpU6YgNjZW7wWSaRJC4L+J5/FVkirwZzzXCa8HMfCJDE2r0M/Pz8fgwYMrbDN58mScOnVKL0WRaRNCYMHO8/jqD9XnNmKe74zXGPhERkGrA7l16tTR6sFsbW2rVAyZPiEEPttxDov3XAIAfBTeGRP6lH+9JiKqWVqN9OVyOZRKZYVtFAoFioqK9FIUmSYhBD59LPBnDurCwCcyMlqFfmhoKD788MMK28TExGDAgAF6KYpMjxACcdvP4ptHgf/xC10Q1buNgasioidpNb3z/vvvo2/fvvD29sbw4cPRqVMn1KtXD3l5eTh9+jQ2btwIe3t7JCYmVne9ZISEEIj7/Sy+3XcZADDrhS4Y24uBT2SMtAp9Ozs77N69GzNnzsScOXMglUphYWEBIQScnJzwxhtvYNasWbCxsanuesnICCHwn9/OYNn+KwCA2YO7YkzP1oYtiojKpfUncu3s7PD5559j/vz5OHv2rPqc/U6dOvFCa2ZKCIG5285g+Z+qwP/kxW4YHdDKwFURUUV0vgyDpaUlunbtWh21kAkRQuCTX8/gu79UgT/nxW4YxcAnMno6hz6REAKxv5xG/IF0AMDcId0w0p+BT2QKGPqkkycDf97QZzDcr6VhiyIirTH0SWtCCHz88ymsPngVABA39BlEMPCJTApDn7SiVKoCf82hq7CwAOYP7Y5XfN0MXRYR6YihT0+lVAr8388nsfbQNVXgv9Qdr/gw8IlMEUOfKqRUCny09STWJasC/7OXPfCydwtDl0VElcTQ
p3IplQIxP53A+sPXYWEBfP6yB15i4BOZNIY+lUmpFPj3lhPYkHIdEgtgwSseGOLFwCcydUbxUdotW7bA19cXffr0QVBQUIXX5d+4cSPCwsLQr18/+Pr6YtiwYUhPT6+5Ys2AUikQvfm4OvC/eMWTgU9USxg89A8fPozIyEisW7cO+/fvR1RUFAYMGIDc3Nwy248aNQrvvfcekpKSkJycDDs7OwwcOBAymayGK6+dFEqBDzcdx8YjNyCxAP77qide9Gpu6LKISE8MHvpxcXEIDw9H+/btAahCvbi4GPHx8WW2Hzx4sPoSzhKJBO+88w7OnTuHtLS0miq51ioJ/B9SVYH/ZYQXBnsy8IlqE4OHflJSEnx8fNTLEokE3t7e2LVrV5ntf/jhB43lkrt6caRfNQqlwAc/HsOPqTdgKbHAwggv/Muj9P2Qici0GfRAbnZ2NqRSKZo2baqx3sXFBSkpKVo9xsGDB+Hq6opevXqVuV0mk2n8QZBKpQBUdwOTy+Va11rSVpd9jNnj/VEoBT7cfBJbj92GpcQCX7z8DAZ2cTapvtbm16c2YH+qly51GDT08/PzAZS+t66tra16W0VkMhk+++wzLFq0CNbW1mW2mTdvHmJjY0ut37lzJ+zt7XWuubbdKGb7zkQkXJQg9a4EEgiMaacArqfht+uGrqxyatvrw/4YN2PpjzZ5WcKgoV8Suk9OzchkMq0C+bXXXsOrr76KIUOGlNtmxowZmDZtmnpZKpXCzc0NYWFhcHR01LpWuVyOxMREhIaGlvsHxpTI5XJs35mIpIfNkXo3E1YSC/z3FQ8M7Nr06Tsbodr4+rA/xsvY+lMyg6ENg4Z+o0aN4OTkhMzMTI31GRkZcHev+Iba0dHRsLe3xyeffFJhO1tb21L/SQCAtbV1pV6syu5nbIoVSqy9IEFatirwF43wwsBuzQxdVpXVltenBPtj3IylP7rUYPADuSEhIUhNTVUvCyGQlpaG/v37l7tPXFwcrl+/jkWLFgEAUlNTNR6DKlasUOL9H08iLVsCK4kFvh7Zo1YEPhE9ncFDPzo6Gtu2bcPFixcBAAkJCbC0tERkZCQAoHfv3oiJiVG3X7JkCdauXYvJkycjLS0NR44cwS+//IITJ04YpH5TI1coMeX7v7HtZAYsLQT+F+GBAV1dDF0WEdUQg1+Gwc/PD/Hx8YiIiICdnR0kEgl27NgBBwcHAKoDFCVz/rm5uXjrrbegVCrRs2dPjcdZuXJljdduauQKJaZsOIrfTmTA2tICke0U6N+5iaHLIqIaZPDQB4AhQ4aUezD28Q9dOTg4QKFQ1FRZtYpcocQ764/i95OqwF803BOFl7Q7LZaIag+DT+9Q9SsqVmLyOlXg21hK8O1ob4R0dDZ0WURkAAz9Wq6oWIm316Vh+6nHAr+TaZ6WSURVZxTTO1Q9ioqVeGtdGhJPZ8LGShX4fTtyDp/InDH0aylZsQJvJRzFrjOqwF82xgdBHTilQ2TuGPq1kKxYgTfXpiHp7B3YPgr8QAY+EYGhX+vIihV4Y20a/ngU+MsjfdCnPQOfiFQY+rVIoVyBN9amYve5LNhaSbAi0he92zc2dFlEZEQY+rVEoVyB19akYu/5LNSxVgV+r3YMfCLSxNCvBQrlCkxak4p9jwL/u7G+eLYtA5+ISmPom7hCuQITVx/B/gt3YWdtie/G+qJn20aGLouIjBRD34Q9Gfgrx/kiwJ2BT0TlY+ibqIIiVeD/efEu7G0ssXKsL/wZ+ET0FAx9E1RQpEDUqhQcuJQNextLxI/zg1+bhoYui4hMAEPfxOQXFSMq/ggOXs5GXRtLxI/3g29rBj4RaYehb0Lyi4oxPj4Fhy7fQz1bK8SP84UPA5+IdMDQNxF5MlXgJ19RBf6q8X7wbtXA0GURkYlh6JuAPFkxxsWn4DADn4iqiKFv5PJkxRi3MgWH0+/BwdYKq6P84NWSgU9ElcPQN2IPZcUY+91hHLl6Hw51rLAmyh+ebvUN
XRYRmTCGvpHKLZRj7MoUpD4K/LVR/vBg4BNRFTH0jVBuoRyR3x1G2rUcONaxwtoJ/ujeor6hyyKiWoChb2SkjwL/6LUcONlZY22UP55p4WTosoiolmDoGxFpoRxjVhzG39dVgZ8wwR/dmjPwiUh/GPpG4kGBHGO+O4xj13NQ3141wmfgE5G+MfSNwIMCOcasSMaxGw/QwN4aayf4o6srA5+I9I+hb2AP8uUYtSIZJ26qAj9hQgC6uDoauiwiqqUY+gaUk1+E0SsO48TNB2hY1wYJE/zRuRkDn4iqD0PfQHLyizBqRTJO3pSiUV0bJEz0RycXBj4RVS+GvgHczyvCyOXJOH1bFfjrJgago4uDocsiIjPA0K9h9/OKMGJ5Ms7clqJxPVXgd2jKwCeimsHQr0H3Ho3wVYFvi/UT/dGegU9ENYihX0OyH8owcnkyzmbkonE9W2yY5I92TRj4RFSzGPo1IPuhDCOWJeNcZi6cHWyxfmIA2jWpZ+iyiMgMSQxdQG1397HAb+Jgiw2TGPhEZDgc6VejrFwZRiw7hAt3HqoD392ZgU9EhsPQryZZuTIMX3YIF+88RFNH1ZQOA5+IDI2hXw3u5BZixLJkXLzzEC6OdbB+UgDaNK5r6LKIiBj6+nZHWojhyw7hUlYemjnVwfqJAWjNwCciI8HQ16NMaSGGLz2Ey3fz4OqkGuG3asTAJyLjwdDXk8cDv3l9O6yfGICWjewNXRYRkQaGvh5kPFBN6Vx5FPgbJgXArSEDn4iMD0O/im4/KMDwpYeQnp3PwCcio8fQr4JbOQUYvuwQrmbno0UD1ZQOA5+IjBlDv5Ju5RQgYukhXLuXD7eGqsBv0YCBT0TGzSguw7Blyxb4+vqiT58+CAoKwqlTp/TaXt9u5hTg1aUHce1ePlo2tMeGST0Z+ERkEgw+0j98+DAiIyORmpqK9u3bY/Xq1RgwYADOnDkDB4fSV6HUtb2+3cwpwOiVR3D9XgFaNbLH+okBcK1vV+3PS0SkDwYf6cfFxSE8PBzt27cHAIwaNQrFxcWIj4/XS3t9yi4ERq5IUQf+hkkMfCIyLQYP/aSkJPj4+KiXJRIJvL29sWvXLr2015fr9/Ox6LQlbuYUovWjwG/mxMAnItNi0Omd7OxsSKVSNG3aVGO9i4sLUlJSqtweAGQyGWQymXpZKpUCAORyOeRyuVZ15hYWY/iyFNyTWaBVQzusGe+DxvZWWu9vjEpqN+U+PI79MW7sT/XSpQ6Dhn5+fj4AwNbWVmO9ra2teltV2gPAvHnzEBsbW2r9zp07YW+v/cFX/wYWOFwsQVSbXKT9+YfW+xm7xMREQ5egV+yPcWN/qkd5+VcWg4Z+Seg+PhIvWS4rkHVtDwAzZszAtGnT1MtSqRRubm4ICwuDo6Oj1rX2LyrCbzt2IXxgKKytrbXez1jJ5XIkJiYiNJT9MUbsj3Eztv6UzGBow6Ch36hRIzg5OSEzM1NjfUZGBtzd3avcHlD9F/DkfwYAYG1trfOLZWNZuf2MGftj3Ngf42Ys/dGlBoMfyA0JCUFqaqp6WQiBtLQ09O/fXy/tiYjoHwYP/ejoaGzbtg0XL14EACQkJMDS0hKRkZEAgN69eyMmJkbr9kREVD6DfzjLz88P8fHxiIiIgJ2dHSQSCXbs2KH+oFV+fr7GHP7T2hMRUfkMHvoAMGTIEAwZMqTMbWlpaTq1JyKi8hl8eoeIiGoOQ5+IyIww9ImIzAhDn4jIjDD0iYjMCEOfiMiMGMUpmzVJCAFAt2tVAKprbeTn50MqlRrFx66riv0xbuyPcTO2/pTkWUm+VcTsQj83NxcA4ObmZuBKiIj0Kzc3F05OThW2sRDa/GmoRZRKJW7dugUHBwdYWFhovV/J1TmvX7+u09U5jRX7Y9zYH+NmbP0RQiA3Nxeurq6QSCqetTe7kb5EIkGLFi0qvb+jo6NRvMj6wv4YN/bHuBlTf542wi/BA7lERGaEoU9EZEYY+lqytbXFxx9/
XOYNWUwR+2Pc2B/jZsr9MbsDuURE5owjfSIiM8LQJyIyIwx9IiIzwtDX0pYtW+Dr64s+ffogKCgIp06dMnRJmDVrFjw9PREcHKz+Gjp0qHq7EAKzZ89Gjx494Ofnh1GjRuHBgwcaj/HgwQOMHj0afn5+6NGjB2JjY0t9lPv06dMIDg5GYGAgfHx8sHnzZr31oaioCNHR0bCyskJ6enqp7d9++y28vb3Rq1cvhIeH4+bNm6X2nzJlCnx8fODt7Y133nkHRUVFGm1u3ryJQYMGoVevXujRoweWLFlS6nn+/PNPBAQEICgoCAEBAdi/f7/e+zN27FgEBARovF5vvvmm0fZn48aNCAsLQ79+/eDr64thw4Zp9Kkm31/6+P17Wn8ef11KvmbPnm20/ak0QU+VnJwsHBwcxPnz54UQQqxatUo0b95cSKVSg9b18ccfi927d5e7fcGCBaJ79+4iPz9fCCHEuHHjxAsvvKDR5oUXXhATJkwQQgiRl5cnunbtKhYsWKDeLpVKRfPmzcXatWuFEEKcO3dO1KtXTyQnJ1e5/itXroiAgAAxZswYAUBcuXJFY/umTZtEs2bNRFZWlhBCiNjYWOHp6SkUCoW6zeTJk8WAAQNEcXGxKC4uFv379xeTJ09Wb1coFMLT01PMmTNHCCHEnTt3RNOmTcWmTZvUbdLT04Wjo6PYt2+fEEKIPXv2CEdHR5Genq7X/kRGRpZa9yRj6o+1tbXYvn27+nlHjx4tOnbsKAoLC4UQNff+0tfv39P6ExQU9NTHMKb+VBZDXwtDhgwRERER6mWFQiGaNm0qvvrqKwNWVXHoFxcXC2dnZ7FkyRL1ulOnTgkA4vjx40IIIY4dOyYAiLNnz6rbfP3118LZ2VkUFxcLIYRYuHChaNasmVAqleo2w4YNE0OHDq1y/SdOnBAXLlwQu3fvLjMkvby8RHR0tHo5JydHWFlZiZ9//lkIIcTdu3c1fpGFEGLbtm3C2tpaZGdnCyGE2Lp1q7C2tha5ubnqNh988IHo0aOHennq1KkiICBA47l9fX3FtGnT9Nqfp4W+sfXn5Zdf1lhOSUkRAMSBAwdq9P2lr9+/ivojxNND39j6U1mc3tFCUlISfHx81MsSiQTe3t7YtWuXAauq2PHjx5GVlaVRd+fOnVG3bl113UlJSahXrx46duyobuPr64usrCwcP35c3cbb21vjOkW+vr5ISkqqco3dunVDu3btytx27949HD16VKN+JycndOjQQV3/vn37IJfLNdr4+vpCLpdj79696vo7duyIevXqabRJS0vD/fv31W0ef4ySNrq+vhX1RxvG1p8ffvhBY7lOnToAAJlMVqPvL339/lXUH20YW38qi6H/FNnZ2ZBKpWjatKnGehcXF1y5csVAVf3ju+++Q3BwMHr16oXIyEhcunQJAHD58mUA0KjbwsICTZs2Vdd9+fLlMvsF4KltHjx4gHv37lVPpx57/op+7pcvX4aVlRUaNWqk3u7s7AxLS0u99LE6Xt958+YhODgYvXv3xltvvYXMzEz1NmPvz8GDB+Hq6opevXrV2PurOn//Hu9PiSlTpiAoKAiBgYGIjo5WX5XXFPqjLYb+U+Tn5wNAqU/e2draqrcZSsuWLeHl5YVdu3Zh//79aNOmDby9vXHz5k2t6s7Pzy9ze8k2bdtUB23rt7GxKbWvjY2NXvqo7/516NABgYGB+OOPP7B7927IZDIEBATg4cOHRt8fmUyGzz77DIsWLYK1tXWNvb+q6/fvyf4AgKenJ8LDw7F371789ttvOHHiBEJDQ6FQKIy+P7owu6ts6sre3h5A6X8BZTKZepuhjB8/XmN55syZWLJkCRYvXowePXoAqLhue3v7MreXbNO2TXWo6Odet25ddZsnz2wBVGfAPF5/QUFBqcd4/DnK66O++/fvf/9b/b1EIsEXX3yBBg0aYP369Zg4caJR9+e1117Dq6++iiFDhmg8V3W/v8SjM2Oquz8A8OWXX6q/
r1evHj799FN069YNf/zxB0JDQ426P7rgSP8pGjVqBCcnJ41/wwEgIyMD7u7uBqqqbJaWlmjdujUuXbqkru3JujMzM9Xb3N3dy+xXybaK2jg5OaFhw4bV0o/Hn7+in7u7uzuKi4uRnZ2t3p6VlQWFQqFVH9u0aVNhm+p+fR0dHeHs7KyekjPW/kRHR8Pe3h6ffPKJel1Nvb+q4/evrP6UpW3btgCg8foYY390xdDXQkhICFJTU9XLQgikpaWhf//+BqxKNf/4pFu3bqFly5bo3r07nJ2dNeo+c+YM8vLy1HX369cPDx8+xPnz59Vtjhw5giZNmqB79+7qNmlpaRrnIh85cqTa+96gQQN4eXlp1C+VSnH+/Hn1cwcGBsLa2lqjzZEjR2BtbY3AwEB1/efOnVNPoZS08fb2RoMGDdRtHn+Mkjb67uOTr5dMJkN2djZatmxptP2Ji4vD9evXsWjRIgBAamoqUlNTa/T9pc/fv/L6c+fOHcydO1ejbclnQkpeH2PsT6XUyDlCJi45OVk4OjqKCxcuCCGEWLNmjVGcp9+6dWuxdetW9fKyZctEnTp1xJkzZ4QQqvOoPTw81OdRR0VFlXke9aRJk4QQQuTn54tnnnmmzPOO161bJ4QQ4vz588LBwUEv5+mXKO8Ux02bNglXV1dx9+5dIYQQn3zySZnn6T/33HNCoVAIhUIhwsLCNM5rLy4uFp6enuI///mPEEKIrKws4eLiUuZ57X/++acQQoh9+/ZV6rz2p/XHxsZGpKSkqJc/+ugj4ezsLO7cuWOU/fnmm29E165dxcGDB0VKSopISUkRH3/8sVi5cqUQoubeX/r6/auoP1euXBENGzZUv2bFxcUiMjJSdOrUSRQUFBhlfyqLoa+lzZs3C29vb9G7d28RGBgoTp48aeiSREJCgujbt68ICgoSPXv2FMHBwepfdCGEUCqVIjY2Vnh5eQlfX18xYsQIcf/+fY3HuH//vhg5cqTw8/MTnp6eYtasWRrnGAshxMmTJ0VgYKDo3bu38Pb21giYqpDJZCIoKEh4eHgIAMLf37/UudTffPON8PLyEj179hTPP/+8uH79usb2wsJCMXnyZNGjRw/Ro0cP8fbbb6s/bFPi+vXrIjw8XDz77LPCy8tLLF68uFQt+/btE/7+/qJPnz7Cz89P/cEmffbnq6++Er179xbBwcHCz89PhIeHl3ofGUt/pFKpkEgkAkCpr5LQr8n3V1V//57Wn4KCAjF37lwREBAggoKChI+Pjxg+fLi4evWqUfanKnhpZSIiM8I5fSIiM8LQJyIyIwx9IiIzwtAnIjIjDH0iIjPC0CciMiMMfSIiM8LQJ7M2aNAg2NraomXLlpg8ebJ6/cGDB2FhYYELFy6o13300Udo0aIFfH19cfr06XIf8+bNm2jatGmpWztWZNGiRejUqRNat25dYbuffvoJP/30k9aPS/Qkhj6ZtV9//RWBgYHw8vLC//73P/X6kpte/PHHH+p1c+bMgaenJ/bs2YMuXbqU+5h16tRBx44dYWdnp3Udb7/9NqKjo5/ajqFPVcXQJ7MXEhKCffv2qa+bDqhuLP7ss89q3PFILpdDLperL+1cnkaNGmHfvn3VehVSospi6JPZCwkJQU5ODtLS0gAAhYWFKC4uxr/+9S/s3r1bfcXE5ORk+Pv7AwA+++wzeHp6IigoCEFBQdi/fz8A1W0eg4ODUadOHcTHx6ufIyMjA88//zw6dOiA0NBQJCQkwMLCAp6envjxxx816omPj8dzzz2Hdu3aIS4uTr1++vTp2L59O7Zv347g4GAMHjy4On8sVEvxJipk9nx8fODo6IikpCT4+vriwIED6NmzJ0JCQhAdHY3jx4/Dw8MDf/zxB0JCQrB06VKsWLEChw4dQv369XHgwAH069cPZ8+eRatWrbBnz55Sc/Njx45FnTp1cPbsWUgkEvVllr/88ksEBwer22VmZsLCwgK///47Tp48ie7du2PYsGFo27Yt
Pv30U9y5cwcANP6gEOmCI30ye5aWlurbGAKqefx+/fqhR48ecHJyUk/xHDp0CD179sTcuXMxYcIE1K9fHwDw7LPPol27dli+fHmZj3/u3Dns2LEDU6ZMgUSi+pV75513ymwrhMDIkSMBqG60Xr9+ffVNt4n0gaFPBNUUz19//YWioiJ1uFtaWiIoKAhJSUkoLCyERCJBUVERrl27hpUrVyI4OFj9JZfLNW6i/bizZ88CgMadkUpuzPEkZ2dnWFn98w+4o6MjpFKpHntK5o7TO0RQhX5+fj4SExNhbW2tvnF1SEgIZs6cib179+LZZ59Vt3///fcxbty4Sj+fhYVFmestLS1LrePVz0mfONInAtC9e3c0btwYsbGx6lsTAqrQz83Nxfz58xESEgIHBwe0bNkS586d09j/+++/x6ZNm8p87E6dOgEALl++rF537dq1StVZMj0EAPn5+RpnHBFpg6FPBNXIOzg4GCkpKQgJCVGv79atG5o0aYIjR47Ax8cHABATE4NVq1apgzsrKwuxsbHo1q1bmY/dsWNHDBgwAAsXLoRSqQQALF26tFJ1Ojs74/79+wCAl19+WT11RKQthj7RIyEhIXB0dFSHO/DPH4PAwED1XPukSZPwwQcfYODAgejTpw+GDRuGL7/8Eh07dlSfspmRkYG4uDj1Dbjj4+Mhk8nQqVMnDBw4ED179gQAWFtbq7fHxcUhIyMDYWFhAIDnnntO/Thr1qwBAIwbNw6XL19Gnz590LhxY3Tt2rXGfj5UO/B2iUQ1ICsrC87OzurlW7duoXnz5rhx4waaN29uwMrI3HCkT1QD3njjDezdu1e9/PXXXyM4OJiBTzWOZ+8Q1YDBgwfj/fffR7169SCTydCqVSusX7/e0GWRGeL0DhGRGeH0DhGRGWHoExGZEYY+EZEZYegTEZkRhj4RkRlh6BMRmRGGPhGRGWHoExGZEYY+EZEZ+X+qiv8Ir2sKyAAAAABJRU5ErkJggg==", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" + "name": "stdout", + "output_type": "stream", + "text": [ + "scenicplus 17596 (13785,)\n" + ] } ], "source": [ - "plot_cumulative_density(df.iloc[10, :-1].values)" + "par['models_dir'] = 'resources/grn_models/d0_hvg'\n", + "for method in ['scenicplus']:\n", + " prediction = pd.read_csv(f\"{par['models_dir']}/{method}.csv\", index_col=0)\n", + " prediction['link'] = prediction['source'].astype(str) + '_' + prediction['target'].astype(str)\n", + " print(method, len(prediction), np.intersect1d(all_links, prediction['link']).shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "aaa scenicplus 17596 (10807,), " ] }, { @@ -457,26 +819,36 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Submitted batch job 7755406\n" + "Submitted batch job 7757148\n" ] } ], "source": [ "if True:\n", + " # par = {\n", + " # 'methods': ['scglue'],\n", + " # 'models_dir': 'resources/grn_models/',\n", + " # 'multiomics_rna': 'resources/grn-benchmark/multiomics_rna.h5ad', \n", + " # 'multiomics_atac': 'resources/grn-benchmark/multiomics_atac.h5ad', \n", + " # 'num_workers': 20,\n", + " # 'mem': \"250GB\",\n", + " # 'time': \"48:00:00\"\n", + " # }\n", + " \n", " par = {\n", " 'methods': ['scenicplus'],\n", - " 'models_dir': 'resources/grn_models/',\n", - " 'multiomics_rna': 'resources/grn-benchmark/multiomics_rna.h5ad', \n", - " 'multiomics_atac': 'resources/grn-benchmark/multiomics_atac.h5ad', \n", + " 'models_dir': 'resources/grn_models/d0_hvg',\n", + " 'multiomics_rna': 'resources/grn-benchmark/multiomics_rna_d0_hvg.h5ad', \n", + " 'multiomics_atac': 'resources/grn-benchmark/multiomics_atac_d0.h5ad', \n", " 'num_workers': 20,\n", - " 'mem': \"250G\",\n", + " 'mem': \"250GB\",\n", " 'time': \"48:00:00\"\n", " }\n", "\n", @@ -512,13 +884,6 @@ " # 
!bash scripts/sbatch/grn_inference.sh \"{command}\" " ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "markdown", "metadata": {}, @@ -528,7 +893,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -546,7 +911,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -558,7 +923,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -592,7 +957,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -612,7 +977,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -631,284 +996,27 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 16, "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
 S1S2static-theta-0.0static-theta-0.5rank
collectri-0.100238-0.2111820.4893160.51489611
negative_control-0.044574-0.0451580.4457270.5013899
positive_control0.1971290.5788220.5308480.5846943
pearson_corr0.2734430.5163430.6398710.5491882
portia0.2633100.3570060.5271320.5385935
ppcor0.0179540.1597540.3171360.5061478
grnboost20.4219360.4893220.8250250.6195271
scenic0.1680060.2189160.5212340.5832026
granie0.0832980.1060120.3233240.45850610
scglue0.0981870.2704890.4193250.5376547
celloracle0.2091510.2914780.6807870.5846874
\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" + "ename": "IntCastingNaNError", + "evalue": "Cannot convert non-finite values (NA or inf) to integer", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mIntCastingNaNError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[16], line 4\u001b[0m\n\u001b[1;32m 2\u001b[0m df_scores \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mread_csv(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mresources/scores/scgen_pearson-ridge.csv\u001b[39m\u001b[38;5;124m\"\u001b[39m, index_col\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m)\n\u001b[1;32m 3\u001b[0m df_all_n \u001b[38;5;241m=\u001b[39m (df_scores\u001b[38;5;241m-\u001b[39mdf_scores\u001b[38;5;241m.\u001b[39mmin(axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m))\u001b[38;5;241m/\u001b[39m(df_scores\u001b[38;5;241m.\u001b[39mmax(axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m)\u001b[38;5;241m-\u001b[39mdf_scores\u001b[38;5;241m.\u001b[39mmin(axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m))\n\u001b[0;32m----> 4\u001b[0m df_scores[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mrank\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mdf_all_n\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmean\u001b[49m\u001b[43m(\u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrank\u001b[49m\u001b[43m(\u001b[49m\u001b[43mascending\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mastype\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mint\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 5\u001b[0m 
df_scores\u001b[38;5;241m.\u001b[39mstyle\u001b[38;5;241m.\u001b[39mbackground_gradient()\n", + "File \u001b[0;32m~/miniconda3/envs/scglue/lib/python3.10/site-packages/pandas/core/generic.py:6643\u001b[0m, in \u001b[0;36mNDFrame.astype\u001b[0;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[1;32m 6637\u001b[0m results \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 6638\u001b[0m ser\u001b[38;5;241m.\u001b[39mastype(dtype, copy\u001b[38;5;241m=\u001b[39mcopy, errors\u001b[38;5;241m=\u001b[39merrors) \u001b[38;5;28;01mfor\u001b[39;00m _, ser \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mitems()\n\u001b[1;32m 6639\u001b[0m ]\n\u001b[1;32m 6641\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 6642\u001b[0m \u001b[38;5;66;03m# else, only a single dtype is given\u001b[39;00m\n\u001b[0;32m-> 6643\u001b[0m new_data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_mgr\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mastype\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 6644\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_constructor_from_mgr(new_data, axes\u001b[38;5;241m=\u001b[39mnew_data\u001b[38;5;241m.\u001b[39maxes)\n\u001b[1;32m 6645\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m res\u001b[38;5;241m.\u001b[39m__finalize__(\u001b[38;5;28mself\u001b[39m, method\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mastype\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File 
\u001b[0;32m~/miniconda3/envs/scglue/lib/python3.10/site-packages/pandas/core/internals/managers.py:430\u001b[0m, in \u001b[0;36mBaseBlockManager.astype\u001b[0;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[1;32m 427\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m using_copy_on_write():\n\u001b[1;32m 428\u001b[0m copy \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[0;32m--> 430\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 431\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mastype\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 432\u001b[0m \u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 433\u001b[0m \u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 434\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 435\u001b[0m \u001b[43m \u001b[49m\u001b[43musing_cow\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43musing_copy_on_write\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 436\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/envs/scglue/lib/python3.10/site-packages/pandas/core/internals/managers.py:363\u001b[0m, in \u001b[0;36mBaseBlockManager.apply\u001b[0;34m(self, f, align_keys, **kwargs)\u001b[0m\n\u001b[1;32m 361\u001b[0m applied \u001b[38;5;241m=\u001b[39m b\u001b[38;5;241m.\u001b[39mapply(f, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 362\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 363\u001b[0m applied \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mb\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 364\u001b[0m result_blocks \u001b[38;5;241m=\u001b[39m extend_blocks(applied, result_blocks)\n\u001b[1;32m 366\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mtype\u001b[39m(\u001b[38;5;28mself\u001b[39m)\u001b[38;5;241m.\u001b[39mfrom_blocks(result_blocks, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maxes)\n", + "File \u001b[0;32m~/miniconda3/envs/scglue/lib/python3.10/site-packages/pandas/core/internals/blocks.py:758\u001b[0m, in \u001b[0;36mBlock.astype\u001b[0;34m(self, dtype, copy, errors, using_cow, squeeze)\u001b[0m\n\u001b[1;32m 755\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCan not squeeze with more than one column.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 756\u001b[0m values \u001b[38;5;241m=\u001b[39m values[\u001b[38;5;241m0\u001b[39m, :] \u001b[38;5;66;03m# type: ignore[call-overload]\u001b[39;00m\n\u001b[0;32m--> 758\u001b[0m new_values \u001b[38;5;241m=\u001b[39m \u001b[43mastype_array_safe\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 760\u001b[0m new_values \u001b[38;5;241m=\u001b[39m maybe_coerce_values(new_values)\n\u001b[1;32m 762\u001b[0m refs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "File 
\u001b[0;32m~/miniconda3/envs/scglue/lib/python3.10/site-packages/pandas/core/dtypes/astype.py:237\u001b[0m, in \u001b[0;36mastype_array_safe\u001b[0;34m(values, dtype, copy, errors)\u001b[0m\n\u001b[1;32m 234\u001b[0m dtype \u001b[38;5;241m=\u001b[39m dtype\u001b[38;5;241m.\u001b[39mnumpy_dtype\n\u001b[1;32m 236\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 237\u001b[0m new_values \u001b[38;5;241m=\u001b[39m \u001b[43mastype_array\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 238\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\u001b[38;5;167;01mValueError\u001b[39;00m, \u001b[38;5;167;01mTypeError\u001b[39;00m):\n\u001b[1;32m 239\u001b[0m \u001b[38;5;66;03m# e.g. _astype_nansafe can fail on object-dtype of strings\u001b[39;00m\n\u001b[1;32m 240\u001b[0m \u001b[38;5;66;03m# trying to convert to float\u001b[39;00m\n\u001b[1;32m 241\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m errors \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n", + "File \u001b[0;32m~/miniconda3/envs/scglue/lib/python3.10/site-packages/pandas/core/dtypes/astype.py:182\u001b[0m, in \u001b[0;36mastype_array\u001b[0;34m(values, dtype, copy)\u001b[0m\n\u001b[1;32m 179\u001b[0m values \u001b[38;5;241m=\u001b[39m values\u001b[38;5;241m.\u001b[39mastype(dtype, copy\u001b[38;5;241m=\u001b[39mcopy)\n\u001b[1;32m 181\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 182\u001b[0m values \u001b[38;5;241m=\u001b[39m \u001b[43m_astype_nansafe\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 184\u001b[0m \u001b[38;5;66;03m# in pandas we don't store numpy str dtypes, so convert to object\u001b[39;00m\n\u001b[1;32m 185\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(dtype, np\u001b[38;5;241m.\u001b[39mdtype) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28missubclass\u001b[39m(values\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;241m.\u001b[39mtype, \u001b[38;5;28mstr\u001b[39m):\n", + "File \u001b[0;32m~/miniconda3/envs/scglue/lib/python3.10/site-packages/pandas/core/dtypes/astype.py:101\u001b[0m, in \u001b[0;36m_astype_nansafe\u001b[0;34m(arr, dtype, copy, skipna)\u001b[0m\n\u001b[1;32m 96\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m lib\u001b[38;5;241m.\u001b[39mensure_string_array(\n\u001b[1;32m 97\u001b[0m arr, skipna\u001b[38;5;241m=\u001b[39mskipna, convert_na_value\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 98\u001b[0m )\u001b[38;5;241m.\u001b[39mreshape(shape)\n\u001b[1;32m 100\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m np\u001b[38;5;241m.\u001b[39missubdtype(arr\u001b[38;5;241m.\u001b[39mdtype, np\u001b[38;5;241m.\u001b[39mfloating) \u001b[38;5;129;01mand\u001b[39;00m dtype\u001b[38;5;241m.\u001b[39mkind \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124miu\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m--> 101\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_astype_float_to_int_nansafe\u001b[49m\u001b[43m(\u001b[49m\u001b[43marr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 103\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m arr\u001b[38;5;241m.\u001b[39mdtype \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mobject\u001b[39m:\n\u001b[1;32m 104\u001b[0m \u001b[38;5;66;03m# if we have a 
datetime/timedelta array of objects\u001b[39;00m\n\u001b[1;32m 105\u001b[0m \u001b[38;5;66;03m# then coerce to datetime64[ns] and use DatetimeArray.astype\u001b[39;00m\n\u001b[1;32m 107\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m lib\u001b[38;5;241m.\u001b[39mis_np_dtype(dtype, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mM\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n", + "File \u001b[0;32m~/miniconda3/envs/scglue/lib/python3.10/site-packages/pandas/core/dtypes/astype.py:145\u001b[0m, in \u001b[0;36m_astype_float_to_int_nansafe\u001b[0;34m(values, dtype, copy)\u001b[0m\n\u001b[1;32m 141\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 142\u001b[0m \u001b[38;5;124;03mastype with a check preventing converting NaN to an meaningless integer value.\u001b[39;00m\n\u001b[1;32m 143\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 144\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m np\u001b[38;5;241m.\u001b[39misfinite(values)\u001b[38;5;241m.\u001b[39mall():\n\u001b[0;32m--> 145\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m IntCastingNaNError(\n\u001b[1;32m 146\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCannot convert non-finite values (NA or inf) to integer\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 147\u001b[0m )\n\u001b[1;32m 148\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m dtype\u001b[38;5;241m.\u001b[39mkind \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mu\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 149\u001b[0m \u001b[38;5;66;03m# GH#45151\u001b[39;00m\n\u001b[1;32m 150\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (values \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m)\u001b[38;5;241m.\u001b[39mall():\n", + "\u001b[0;31mIntCastingNaNError\u001b[0m: Cannot convert non-finite values (NA or inf) to integer" + ] } ], "source": [ diff --git a/scripts/sbatch/figr.sh 
b/scripts/sbatch/figr.sh new file mode 100644 index 000000000..5f59662ad --- /dev/null +++ b/scripts/sbatch/figr.sh @@ -0,0 +1,12 @@ +#!/bin/bash +#SBATCH --job-name=figr +#SBATCH --time=48:00:00 +#SBATCH --output=logs/%j.out +#SBATCH --error=logs/%j.err +#SBATCH --mail-type=END +#SBATCH --mail-user=jalil.nourisa@gmail.com +#SBATCH --cpus-per-task=20 +#SBATCH --mem=250G + + +singularity run ../../images/figr Rscript src/methods/multi_omics/figr/script.R diff --git a/scripts/sbatch/run_skeleton.sh b/scripts/sbatch/run_skeleton.sh new file mode 100644 index 000000000..3b05264d8 --- /dev/null +++ b/scripts/sbatch/run_skeleton.sh @@ -0,0 +1,13 @@ +#!/bin/bash +#SBATCH --job-name=skeleton +#SBATCH --time=48:00:00 +#SBATCH --output=logs/%j.out +#SBATCH --error=logs/%j.err +#SBATCH --mail-type=END +#SBATCH --mail-user=jalil.nourisa@gmail.com +#SBATCH --mem=128G +#SBATCH --cpus-per-task=20 +#SBATCH --partition=gpu +#SBATCH --gres=gpu:1 + +singularity run ../../images/scglue python src/metrics/skeleton/script.py diff --git a/src/methods/multi_omics/figr/script.R b/src/methods/multi_omics/figr/script.R index 066848090..98422b67d 100644 --- a/src/methods/multi_omics/figr/script.R +++ b/src/methods/multi_omics/figr/script.R @@ -8,14 +8,14 @@ library(BSgenome.Hsapiens.UCSC.hg38) ## VIASH START par <- list( - multiomics_rna_r = "resources_test/grn-benchmark/multiomics_rna.rds", - multiomics_atac_r = "resources_test/grn-benchmark/multiomics_atac.rds", + multiomics_rna_r = "resources/grn-benchmark/multiomics_rna.rds", + multiomics_atac_r = "resources/grn-benchmark/multiomics_atac.rds", temp_dir = "output/figr/", - cell_topic = "resources/prior/cell_topic_d0_hvg.csv", - num_workers = 2, + cell_topic = "resources/prior/cell_topic.csv", + num_workers = 20, n_topics = 48, peak_gene = "output/figr/peak_gene.csv", - prediction= "output/figr/prediction.csv" + prediction= "resources/grn_models/figr.csv" ) print(par) # meta <- list( @@ -25,10 +25,9 @@ print(par) dir.create(par$temp_dir, 
recursive = TRUE, showWarnings = TRUE) atac = readRDS(par$multiomics_atac_r) -rna = readRDS(par$multiomics_rna_r) - - colnames(atac) <- gsub("-", "", colnames(atac)) + +rna = readRDS(par$multiomics_rna_r) colnames(rna) <- gsub("-", "", colnames(rna)) @@ -41,8 +40,7 @@ cellknn_func <- function(par) { rownames(cellkNN) <- rownames(cell_topic) saveRDS(cellkNN, paste0(par$temp_dir, "cellkNN.rds")) } -cellknn_func(par) -print('1: cellknn_func finished') + ## Step1: Peak-gene association testing peak_gene_func <- function(par){ @@ -59,9 +57,6 @@ peak_gene_func <- function(par){ write.csv(cisCorr, paste0(par$temp_dir, "cisCorr.csv"), row.names = TRUE) } -peak_gene_func(par) - -print('2: peak_gene_func finished') ## Step 2: create DORCs and smooth them dorc_genes_func <- function(par){ cisCorr = read.csv(paste0(par$temp_dir, "cisCorr.csv")) @@ -83,19 +78,23 @@ dorc_genes_func <- function(par){ cat('cellKNN dim:', dim(cellkNN), '\n') cat('dorcMat dim:', dim(dorcMat), '\n') cat('rna dim:', dim(rna), '\n') - dorcMat.s <- smoothScoresNN(NNmat = cellkNN[,1:n_topics], mat = dorcMat, nCores = par$num_workers) + dorcMat.s <- smoothScoresNN(NNmat = cellkNN, mat = dorcMat, nCores = par$num_workers) cat('dorcMat.s completed') # Smooth RNA using cell KNNs # This takes longer since it's all genes - RNAmat.s <- smoothScoresNN(NNmat = cellkNN[,1:n_topics], mat = rna, nCores = par$num_workers) + matching_indices <- match(colnames(rna), rownames(cellkNN)) + cellkNN_ordered <- cellkNN[matching_indices, ] + + + RNAmat.s <- smoothScoresNN(NNmat = cellkNN_ordered, mat = rna, nCores = par$num_workers) + # RNAmat.s <- rna cat('RNAmat.s completed') # get peak gene connection write.csv(cisCorr.filt, paste0(par$temp_dir, "cisCorr.filt.csv")) saveRDS(RNAmat.s, paste0(par$temp_dir, "RNAmat.s.RDS")) saveRDS(dorcMat.s, paste0(par$temp_dir, "dorcMat.s.RDS")) } -dorc_genes_func(par) -print('3: dorc_genes_func finished') + ## TF-gene associations tf_gene_association_func <- function(par){ cisCorr.filt = 
read.csv(paste0(par$temp_dir, "cisCorr.filt.csv")) @@ -111,8 +110,6 @@ tf_gene_association_func <- function(par){ write.csv(figR.d, paste0(par$temp_dir, "figR.d.csv")) } -tf_gene_association_func(par) -print('3: tf_gene_association_func finished') extract_peak_gene_func <- function(par) { # Read the CSV file @@ -137,22 +134,20 @@ extract_peak_gene_func <- function(par) { # Write the result to a CSV file write.csv(peak_gene_figr, file = par$peak_gene, row.names = FALSE) } -extract_peak_gene_func(par) -print('4: extract_peak_gene_func finished') - filter_figr_grn <- function(par) { # Read the CSV file figr_grn <- read.csv(file.path(par$temp_dir, "figR.d.csv")) + + # Filter those that have a Score of 0 + figr_grn <- subset(figr_grn, Score != 0) # Filter based on enrichment figr_grn <- subset(figr_grn, Enrichment.P < 0.05) # Filter based on correlation - figr_grn <- subset(figr_grn, Corr.P < 0.05) + # figr_grn <- subset(figr_grn, Corr.P < 0.05) - # Filter those that have a Score of 0 - figr_grn <- subset(figr_grn, Score != 0) # Subset columns figr_grn <- figr_grn[, c("Motif", "DORC", "Score")] @@ -168,4 +163,16 @@ filter_figr_grn <- function(par) { } + + +cellknn_func(par) +print('1: cellknn_func finished') +peak_gene_func(par) +print('2: peak_gene_func finished') +dorc_genes_func(par) +print('3: dorc_genes_func finished') +tf_gene_association_func(par) +print('3: tf_gene_association_func finished') +extract_peak_gene_func(par) +print('4: extract_peak_gene_func finished') filter_figr_grn(par) \ No newline at end of file diff --git a/src/methods/multi_omics/scenicplus/main.py b/src/methods/multi_omics/scenicplus/main.py index d391c35ff..cf3e4f6eb 100644 --- a/src/methods/multi_omics/scenicplus/main.py +++ b/src/methods/multi_omics/scenicplus/main.py @@ -1,5 +1,7 @@ import os +import gc + import sys import yaml import pickle @@ -16,6 +18,9 @@ import tarfile from urllib.request import urlretrieve import json +import os + + import flatbuffers import numpy as np @@ -193,10 
+198,7 @@ def process_peak(par): chain=False ) - # Create cistopic objects - # Download Mallet - if not os.path.exists(par['MALLET_PATH']): url = 'https://github.com/mimno/Mallet/releases/download/v202108/Mallet-202108-bin.tar.gz' response = requests.get(url) @@ -204,12 +206,17 @@ def process_peak(par): f.write(response.content) with tarfile.open(os.path.join(par['temp_dir'], 'Mallet-202108-bin.tar.gz'), 'r:gz') as f: f.extractall(path=par['temp_dir']) + del consensus_peaks + del narrow_peak_dict + del adata_atac + gc.collect() def run_cistopic(par): adata_atac = anndata.read_h5ad(par['multiomics_atac']) unique_donor_ids = [s.replace(' ', '_') for s in adata_atac.obs.donor_id.cat.categories] print(unique_donor_ids) unique_cell_types = [s.replace(' ', '_') for s in adata_atac.obs.cell_type.cat.categories] - + donor_ids = [s.replace(' ', '_') for s in adata_atac.obs.donor_id] + index = [barcode.replace('-', '') + '-' + donor_id for donor_id, barcode in zip(donor_ids, adata_atac.obs.index)] cell_data = pd.DataFrame({ 'cell_type': [s.replace(' ', '_') for s in adata_atac.obs.cell_type.to_numpy()], 'donor_id': [s.replace(' ', '_') for s in adata_atac.obs.donor_id.to_numpy()] @@ -362,7 +369,12 @@ def run_cistopic(par): # Save cistopic objects with open(par['cistopic_object'], 'wb') as f: pickle.dump(cistopic_obj, f) - + + del cistopic_obj + del model + del cistopic_obj_list + del adata_atac + gc.collect() def process_topics(par): # Load cistopic objects @@ -559,7 +571,6 @@ def process_topics(par): split_pattern='-' ) -# Download databases def download_databases(par): def download(url: str, filepath: str) -> None: if os.path.exists(filepath): @@ -574,7 +585,6 @@ def download(url: str, filepath: str) -> None: # with open(par['blacklist_path'], 'w') as f: # f.write(response.text) download(url, par['blacklist_path']) - url = 'https://resources.aertslab.org/cistarget/motif_collections/v10nr_clust_public/snapshots/motifs-v10-nr.hgnc-m0.00001-o0.0.tbl' if not 
os.path.exists(par['motif_annotation']): download(url, par['motif_annotation']) @@ -594,7 +604,6 @@ def download(url: str, filepath: str) -> None: if not os.path.exists(os.path.join(par['temp_dir'], 'cistarget-db', 'create_cisTarget_databases')): with contextlib.chdir(os.path.join(par['temp_dir'], 'cistarget-db')): subprocess.run(['git', 'clone', 'https://github.com/aertslab/create_cisTarget_databases']) - # Download cluster-buster if not os.path.exists(os.path.join(par['temp_dir'], 'cistarget-db', 'cbust')): urlretrieve('https://resources.aertslab.org/cistarget/programs/cbust', os.path.join(par['temp_dir'], 'cistarget-db', 'cbust')) @@ -639,7 +648,6 @@ def download(url: str, filepath: str) -> None: '1000', 'yes' ]) - # Create cistarget databases with open(os.path.join(par['temp_dir'], 'cistarget-db', 'motifs.txt'), 'w') as f: for filename in os.listdir(os.path.join(par['temp_dir'], 'cistarget-db', 'v10nr_clust_public', 'singletons')): @@ -659,14 +667,12 @@ def download(url: str, filepath: str) -> None: '--bgpadding', '1000', '-t', str(par['num_workers']) ], capture_output=True, text=True) - # Print the result for debugging print(result.stdout) print(result.stderr) def preprocess_rna(par): os.makedirs(os.path.join(par['temp_dir'], 'scRNA'), exist_ok=True) - print("Preprocess RNA-seq", flush=True) # Load scRNA-seq data print("Load scRNA-seq data") @@ -688,11 +694,9 @@ def preprocess_rna(par): adata_rna.raw = adata_rna sc.pp.normalize_total(adata_rna, target_sum=1e4) sc.pp.log1p(adata_rna) - # Change barcodes to match the barcodes in the scATAC-seq data bar_codes = [f'{obs_name.replace("-", "")}-{donor_id}' for obs_name, donor_id in zip(adata_rna.obs_names, adata_rna.obs.donor_id)] adata_rna.obs_names = bar_codes - # Save scRNA-seq data adata_rna.write_h5ad(os.path.join(par['temp_dir'], 'rna.h5ad')) diff --git a/src/methods/multi_omics/scenicplus/script.py b/src/methods/multi_omics/scenicplus/script.py index 830baaea2..e56f0c399 100644 --- 
a/src/methods/multi_omics/scenicplus/script.py +++ b/src/methods/multi_omics/scenicplus/script.py @@ -1,5 +1,6 @@ import sys +import os ## VIASH START par = { 'multiomics_rna': 'resources_test/grn-benchmark/multiomics_rna.h5ad', @@ -44,6 +45,15 @@ sys.path.append(meta["resources_dir"]) from main import * + +def print_memory_usage(): + import psutil + + process = psutil.Process(os.getpid()) + mem_info = process.memory_info().rss / (1024 * 1024) # Convert to MB + print(f"Memory usage: {mem_info:.2f} MB") + + def main(par): par['cistopic_object'] = f'{par["temp_dir"]}/cistopic_object.pkl' @@ -58,19 +68,24 @@ def main(par): par['MALLET_PATH'] = os.path.join(par['temp_dir'], 'Mallet-202108', 'bin', 'mallet') os.makedirs(par['atac_dir'], exist_ok=True) - print('------- download_databases -------') + # print('------- download_databases -------') # download_databases(par) - print('------- process_peak -------') + # print_memory_usage() + # print('------- process_peak -------') # process_peak(par) - print('------- run_cistopic -------') - run_cistopic(par) - print('------- process_topics -------') - process_topics(par) - - print('------- preprocess_rna -------') - preprocess_rna(par) + # print_memory_usage() + # print('------- run_cistopic -------') + # run_cistopic(par) + # print_memory_usage() + # print('------- process_topics -------') + # process_topics(par) + # print_memory_usage() + # print('------- preprocess_rna -------') + # preprocess_rna(par) + # print_memory_usage() print('------- snakemake_pipeline -------') snakemake_pipeline(par) + print_memory_usage() print('------- post_process -------') post_process(par) if __name__ == '__main__': diff --git a/src/methods/multi_omics/scglue/main.py b/src/methods/multi_omics/scglue/main.py index 19a37ab3d..ab48fa89e 100644 --- a/src/methods/multi_omics/scglue/main.py +++ b/src/methods/multi_omics/scglue/main.py @@ -11,7 +11,35 @@ from ast import literal_eval import requests import torch - +def download_annotation(par): + + 
if not os.path.exists(par['annotation_file']): + print("Downloading prior started") + + response = requests.get("https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_45/gencode.v45.annotation.gtf.gz") + + if response.status_code == 200: + with open(par['annotation_file'], 'wb') as file: + file.write(response.content) + print(f"File downloaded and saved to {par['annotation_file']}") + else: + print(f"Failed to download the gencode.v45.annotation.gtf.gz. Status code: {response.status_code}") + print("Downloading prior ended") +def download_motifs(par): + # get gene annotation + + if not os.path.exists(par['motif_file']): + tag = par['motif_file'].split('/')[-1] + print("Downloading motif started") + response = requests.get(f"http://download.gao-lab.org/GLUE/cisreg/{tag}") + + if response.status_code == 200: + with open(par['motif_file'], 'wb') as file: + file.write(response.content) + print(f"File downloaded and saved to {par['motif_file']}") + else: + print(f"Failed to download the motif file. 
Status code: {response.status_code}") + print("Downloading motif ended") def preprocess(par): print('Reading input files', flush=True) rna = ad.read_h5ad(par['multiomics_rna']) @@ -119,7 +147,7 @@ def training(par): atac.write(f"{par['temp_dir']}/atac-emb.h5ad", compression="gzip") nx.write_graphml(guidance, f"{par['temp_dir']}/guidance.graphml.gz") -def run_grn(par): +def create_prior(par): ''' Infers gene2peak connections ''' rna = ad.read_h5ad(f"{par['temp_dir']}/rna-emb.h5ad") @@ -163,25 +191,6 @@ def run_grn(par): rna[:, np.union1d(genes, tfs)].write_loom(f"{par['temp_dir']}/rna.loom") np.savetxt(f"{par['temp_dir']}/tfs.txt", tfs, fmt="%s") - # pyscenic grn - if True: - command = ['pyscenic', 'grn', f"{par['temp_dir']}/rna.loom", - f"{par['temp_dir']}/tfs.txt", '-o', f"{par['temp_dir']}/draft_grn.csv", - '--seed', '0', '--num_workers', f"{par['num_workers']}", - '--cell_id_attribute', 'obs_id', '--gene_attribute', 'name'] - print('Run grn') - result = subprocess.run(command, check=True) - - print("Output:") - print(result.stdout) - print("Error:") - print(result.stderr) - - if result.returncode == 0: - print("Command executed successfully") - else: - print("Command failed with return code", result.returncode) - print("Generate TF cis-regulatory ranking bridged by ATAC peaks", flush=True) peak_bed = scglue.genomics.Bed(atac.var.loc[peaks]) peak2tf = scglue.genomics.window_graph(peak_bed, motif_bed, 0, right_sorted=True) @@ -192,7 +201,7 @@ def run_grn(par): region_lens=atac.var.loc[peaks, "chromEnd"] - atac.var.loc[peaks, "chromStart"], random_state=0) - flank_bed = scglue.genomics.Bed(rna.var.loc[genes]).strand_specific_start_site().expand(10000, 10000) + flank_bed = scglue.genomics.Bed(rna.var.loc[genes]).strand_specific_start_site().expand(500, 500) flank2tf = scglue.genomics.window_graph(flank_bed, motif_bed, 0, right_sorted=True) gene2flank = nx.Graph([(g, g) for g in genes]) @@ -224,6 +233,23 @@ def run_grn(par): orthologous_identity=1.0, 
description="placeholder" ).to_csv(f"{par['temp_dir']}/ctx_annotation.tsv", sep="\t", index=False) +def pyscenic_grn(par): + command = ['pyscenic', 'grn', f"{par['temp_dir']}/rna.loom", + f"{par['temp_dir']}/tfs.txt", '-o', f"{par['temp_dir']}/draft_grn.csv", + '--seed', '0', '--num_workers', f"{par['num_workers']}", + '--cell_id_attribute', 'obs_id', '--gene_attribute', 'name'] + print('Run grn') + result = subprocess.run(command, check=True) + + print("Output:") + print(result.stdout) + print("Error:") + print(result.stderr) + + if result.returncode == 0: + print("Command executed successfully") + else: + print("Command failed with return code", result.returncode) def prune_grn(par): # Construct the command print(par) @@ -236,10 +262,10 @@ def prune_grn(par): "--annotations_fname", f"{par['temp_dir']}/ctx_annotation.tsv", "--expression_mtx_fname", f"{par['temp_dir']}/rna.loom", "--output", f"{par['temp_dir']}/pruned_grn.csv", - "--top_n_targets", str(par['top_n_targets']), + # "--top_n_targets", str(par['top_n_targets']), # "--rank_threshold", str(par['rank_threshold']), - "--auc_threshold", "0.1", - "--nes_threshold", str(par['nes_threshold']), + # "--auc_threshold", "0.1", + # "--nes_threshold", str(par['nes_threshold']), "--min_genes", "1", "--num_workers", f"{par['num_workers']}", "--cell_id_attribute", "obs_id", # be sure that obs_id is in obs and name is in var @@ -247,7 +273,6 @@ def prune_grn(par): ] result = subprocess.run(command, check=True) - print("Output:") print(result.stdout) print("Error:") @@ -257,57 +282,28 @@ def prune_grn(par): print("pyscenic ctx executed successfully") else: print("pyscenic ctx failed with return code", result.returncode) -def download_annotation(par): - # get gene annotation - par['annotation_file'] = f"{par['temp_dir']}/gencode.v45.annotation.gtf.gz" - if not os.path.exists(par['annotation_file']): - print("Downloading prior started") - - response = 
requests.get("https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_45/gencode.v45.annotation.gtf.gz") - - if response.status_code == 200: - with open(par['annotation_file'], 'wb') as file: - file.write(response.content) - print(f"File downloaded and saved to {par['annotation_file']}") - else: - print(f"Failed to download the gencode.v45.annotation.gtf.gz. Status code: {response.status_code}") - print("Downloading prior ended") -def download_motifs(par): - # get gene annotation - tag = "JASPAR2022-hg38.bed.gz" - par['motif_file'] = f"{par['temp_dir']}/{tag}" - if not os.path.exists(par['motif_file']): - print("Downloading motif started") - response = requests.get(f"http://download.gao-lab.org/GLUE/cisreg/{tag}") - - if response.status_code == 200: - with open(par['motif_file'], 'wb') as file: - file.write(response.content) - print(f"File downloaded and saved to {par['motif_file']}") - else: - print(f"Failed to download the motif file. Status code: {response.status_code}") - print("Downloading motif ended") + def main(par): print("Is CUDA available:", torch.cuda.is_available()) print("Number of GPUs:", torch.cuda.device_count()) - - from util import process_links # Load scRNA-seq data - os.makedirs(par['temp_dir'], exist_ok=True) - print('----- download_annotation ---- ', flush=True) - download_annotation(par) - print('----- download_motifs ---- ', flush=True) - download_motifs(par) - print('----- preprocess ---- ', flush=True) - preprocess(par) - print('----- training ---- ', flush=True) - training(par) - print('----- run_grn ---- ', flush=True) - run_grn(par) + # os.makedirs(par['temp_dir'], exist_ok=True) + # print('----- download_annotation ---- ', flush=True) + # download_annotation(par) + # print('----- download_motifs ---- ', flush=True) + # download_motifs(par) + # print('----- preprocess ---- ', flush=True) + # preprocess(par) + # print('----- training ---- ', flush=True) + # training(par) + print('----- create_prior ---- ', flush=True) + 
create_prior(par) + print('----- pyscenic_grn ---- ', flush=True) + pyscenic_grn(par) print('----- prune_grn ---- ', flush=True) prune_grn(par) print('Curate predictions', flush=True) diff --git a/src/methods/multi_omics/scglue/script.py b/src/methods/multi_omics/scglue/script.py index 077380bfd..6d25d8a58 100644 --- a/src/methods/multi_omics/scglue/script.py +++ b/src/methods/multi_omics/scglue/script.py @@ -50,6 +50,12 @@ if args.resources_dir: meta['resources_dir'] = args.resources_dir +# get gene annotation +par['annotation_file'] = f"{par['temp_dir']}/gencode.v45.annotation.gtf.gz" +# par['motif_file'] = f"{par['temp_dir']}/JASPAR2022-hg38.bed.gz" +# par['motif_file'] = f"{par['temp_dir']}/ENCODE-TF-ChIP-hg38.bed.gz" +par['motif_file'] = f"output/db/jaspar_encode.bed.gz" + sys.path.append(meta["util_dir"]) sys.path.append(meta["resources_dir"]) from main import main diff --git a/src/metrics/regression_1/config.vsh.yaml b/src/metrics/regression_1/config.vsh.yaml index 96b28eeb8..5713e58f3 100644 --- a/src/metrics/regression_1/config.vsh.yaml +++ b/src/metrics/regression_1/config.vsh.yaml @@ -19,6 +19,10 @@ functionality: direction: input description: whether to binarize the weight default: true + - name: --skeleton + type: string + direction: input + example: resources/prior/skeleton.csv' resources: - type: python_script path: script.py diff --git a/src/metrics/regression_1/main.py b/src/metrics/regression_1/main.py index 0ee0e63fa..f2a455849 100644 --- a/src/metrics/regression_1/main.py +++ b/src/metrics/regression_1/main.py @@ -201,10 +201,20 @@ def main(par): verbose_print(par['verbose'], 'Reading input files', 3) + perturbation_data = ad.read_h5ad(par['perturbation_data']) tf_all = np.loadtxt(par['tf_all'], dtype=str) gene_names = perturbation_data.var.index.to_numpy() net = pd.read_csv(par['prediction']) + + if True: #apply skeleton + print('Before filtering with skeleton:', net.shape) + skeleton = np.savetxt(par['skeleton'], all_links.values, fmt='%s') + 
net['link'] = net['source'].astype(str) + '_' + net['target'].astype(str) + net = net[net['link'].isin(skeleton)] + print('After filtering with skeleton:', net.shape) + + # net['weight'] = net.weight.abs() # subset to keep only those links with source as tf diff --git a/src/metrics/regression_1/script.py b/src/metrics/regression_1/script.py index 54e95ef92..7607ffbd0 100644 --- a/src/metrics/regression_1/script.py +++ b/src/metrics/regression_1/script.py @@ -17,6 +17,7 @@ 'layer': 'scgen_pearson', 'subsample': -2, 'num_workers': 4, + 'skeleton': 'resources/prior/skeleton.csv' } ## VIASH END # meta = { diff --git a/src/metrics/skeleton/script.py b/src/metrics/skeleton/script.py new file mode 100644 index 000000000..b2a8c0570 --- /dev/null +++ b/src/metrics/skeleton/script.py @@ -0,0 +1,252 @@ +import sys +import anndata as ad +import networkx as nx +import scanpy as sc +import scglue +from matplotlib import rcParams +import os +import subprocess +import pandas as pd +import numpy as np +from ast import literal_eval +import requests +import torch +def preprocess(par): + print('Reading input files', flush=True) + rna = ad.read_h5ad(par['multiomics_rna']) + atac = ad.read_h5ad(par['multiomics_atac']) + + rna.layers["counts"] = rna.X.copy() + sc.pp.highly_variable_genes(rna, n_top_genes=2000, flavor="seurat_v3") + sc.pp.normalize_total(rna) + sc.pp.log1p(rna) + sc.pp.scale(rna) + sc.tl.pca(rna, n_comps=100, svd_solver="auto") + sc.pp.neighbors(rna, metric="cosine") + sc.tl.umap(rna) + print('step 1 completed') + + scglue.data.lsi(atac, n_components=100, n_iter=15) + sc.pp.neighbors(atac, use_rep="X_lsi", metric="cosine") + sc.tl.umap(atac) + print('step 2 completed') + + scglue.data.get_gene_annotation( + rna, gtf=par['annotation_file'], + gtf_by="gene_name" + ) + + rna = rna[:, ~rna.var.chrom.isna()] + + split = atac.var_names.str.split(r"[:-]") + atac.var["chrom"] = split.map(lambda x: x[0]) + atac.var["chromStart"] = split.map(lambda x: x[1]).astype(int) + 
def training(par):
    """Fit the SCGLUE model on the preprocessed data and store the embeddings.

    Loads ``rna.h5ad``, ``atac.h5ad`` and ``guidance.graphml.gz`` from
    ``par['temp_dir']``, fits the model, and writes:

    * ``{temp_dir}/glue.dill`` - the fitted model,
    * ``{temp_dir}/consistency_scores.csv`` - integration consistency scores,
    * ``par['rna-emb']`` / ``par['atac-emb']`` - the AnnData objects with
      ``X_glue`` added to ``obsm`` and ``varm``,
    * ``par['guidance.graphml']`` - the guidance graph.

    Args:
        par: Mapping that provides ``temp_dir``, ``rna-emb``, ``atac-emb``
            and ``guidance.graphml``.
    """
    temp_dir = par['temp_dir']
    os.makedirs(f"{temp_dir}/glue", exist_ok=True)

    rna = ad.read_h5ad(f"{temp_dir}/rna.h5ad")
    atac = ad.read_h5ad(f"{temp_dir}/atac.h5ad")
    guidance = nx.read_graphml(f"{temp_dir}/guidance.graphml.gz")

    scglue.models.configure_dataset(
        rna, "NB", use_highly_variable=True,
        use_layer="counts", use_rep="X_pca",
        use_batch='donor_id', use_cell_type='cell_type'
    )
    scglue.models.configure_dataset(
        atac, "NB", use_highly_variable=True,
        use_rep="X_lsi",
        use_batch='donor_id', use_cell_type='cell_type'
    )

    # NOTE: the model is fitted on the full guidance graph. A previously
    # disabled branch that restricted it to highly variable features was
    # removed; it referenced `chain`, which this file never imports.
    datasets = {"rna": rna, "atac": atac}
    glue = scglue.models.fit_SCGLUE(
        datasets, guidance,
        fit_kws={"directory": f"{temp_dir}/glue"}
    )
    glue.save(f"{temp_dir}/glue.dill")

    # Consistency score of the integration.
    dx = scglue.models.integration_consistency(glue, datasets, guidance)
    dx.to_csv(f"{temp_dir}/consistency_scores.csv")

    # Cell embeddings.
    rna.obsm["X_glue"] = glue.encode_data("rna", rna)
    atac.obsm["X_glue"] = glue.encode_data("atac", atac)

    # Feature embeddings, indexed by the model's vertices and then aligned
    # to each modality's variable names.
    feature_embeddings = pd.DataFrame(
        glue.encode_graph(guidance), index=glue.vertices
    )
    rna.varm["X_glue"] = feature_embeddings.reindex(rna.var_names).to_numpy()
    atac.varm["X_glue"] = feature_embeddings.reindex(atac.var_names).to_numpy()

    rna.write(f"{par['rna-emb']}", compression="gzip")
    atac.write(f"{par['atac-emb']}", compression="gzip")
    nx.write_graphml(guidance, f"{par['guidance.graphml']}")
def merge_connections(par):
    """Combine the saved edge tables into a single TF-to-gene table.

    Reads the CSV files at ``par['gene2peak']``, ``par['peak2tf']`` and
    ``par['flank2tf']`` (first column used as index), joins gene-peak and
    peak-TF edges on the shared peak, appends the flank-TF edges, removes
    duplicate rows and writes the result to ``par['tf2gene']``.

    Args:
        par: Mapping that provides the four file paths named above.
    """
    def _load(key, column_names):
        # Read one edge table and relabel its two columns.
        frame = pd.read_csv(par[key], index_col=0)
        frame.columns = column_names
        return frame

    gene_peak = _load('gene2peak', ['target', 'peak'])
    peak_tf = _load('peak2tf', ['peak', 'source'])
    flank_tf = _load('flank2tf', ['target', 'source'])

    # TF -> gene links bridged by a shared peak.
    via_peaks = gene_peak.merge(peak_tf, on='peak', how='inner')
    via_peaks = via_peaks[['source', 'target']].drop_duplicates()

    # Add the links found directly in the gene flanks.
    combined = pd.concat([via_peaks, flank_tf], axis=0).drop_duplicates()

    combined.to_csv(f"{par['tf2gene']}")
'tf2gene': 'output/skeleton/tf2gene.csv' + } + print(par) + os.makedirs(par['temp_dir'], exist_ok=True) + par['rna-emb'] = f"{par['temp_dir']}/rna-emb.h5ad" + par['atac-emb'] = f"{par['temp_dir']}/atac-emb.h5ad" + par['guidance.graphml'] = f"{par['temp_dir']}/guidance.graphml.gz" + + par['gene2peak'] = f"{par['temp_dir']}/gene2peak.csv" + par['peak2tf'] = f"{par['temp_dir']}/peak2tf.csv" + par['flank2tf'] = f"{par['temp_dir']}/flank2tf.csv" + + # ---- simplify + if False: + multiomics_atac = ad.read_h5ad(par['multiomics_atac']) + multiomics_atac = multiomics_atac[:, :10000] + + par['multiomics_atac'] = f"{par['temp_dir']}/multiomics_atac.h5ad" + multiomics_atac.write(par['multiomics_atac']) + + # ----- actual runs + # print('------- preprocess ---------') + # preprocess(par) + # print('------- training ---------') + # training(par) + print('------- peak_tf_gene_connections ---------') + peak_tf_gene_connections(par) + print('------- merge_connections ---------') + merge_connections(par) + + + + + + diff --git a/src/process_data/perturbation/batch_correction_scgen/config.vsh.yaml b/src/process_data/perturbation/batch_correction_scgen/config.vsh.yaml index 525787bbc..2fd38032f 100644 --- a/src/process_data/perturbation/batch_correction_scgen/config.vsh.yaml +++ b/src/process_data/perturbation/batch_correction_scgen/config.vsh.yaml @@ -62,6 +62,14 @@ functionality: required: false direction: output example: resources_test/grn-benchmark/perturbation_data.h5ad + - name: --batch_key + type: string + default: plate_id + direction: input + - name: --label_key + type: string + default: cell_type + direction: input resources: - type: python_script diff --git a/src/process_data/perturbation/batch_correction_scgen/script.py b/src/process_data/perturbation/batch_correction_scgen/script.py index ba9aa39e5..5507df65d 100644 --- a/src/process_data/perturbation/batch_correction_scgen/script.py +++ b/src/process_data/perturbation/batch_correction_scgen/script.py @@ -7,11 +7,31 @@ ## 
def read_gmt(file_path: str) -> dict[str, dict]:
    """Read a GMT file and return its gene sets keyed by gene-set name.

    Every non-empty line is split on tabs: the first field is the gene-set
    name, the second its description and the remaining fields its genes.

    Args:
        file_path: Path to the tab-separated GMT file.

    Returns:
        A dict mapping each gene-set name to
        ``{'description': str, 'genes': list[str]}``.
    """
    gene_sets = {}
    with open(file_path, 'r') as file:
        for line in file:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing empty line) hold no gene set.
                continue
            parts = stripped.split('\t')
            gene_sets[parts[0]] = {
                # Tolerate a line that only carries a name.
                'description': parts[1] if len(parts) > 1 else '',
                'genes': parts[2:],
            }
    return gene_sets


def quantile_transformation(values, one_sided=False, log1p_scale=True):
    """Quantile-transform a 1-D collection of values.

    Args:
        values: 1-D array-like of numbers.
        one_sided: If true, map to a ``'uniform'`` output distribution,
            otherwise to a ``'normal'`` one.
        log1p_scale: If true, apply ``log(x + 1)`` before the transform.

    Returns:
        1-D ``numpy.ndarray`` with one transformed value per input value.
    """
    from sklearn.preprocessing import QuantileTransformer

    data = np.asarray(values, dtype=float)
    if log1p_scale:
        data = np.log1p(data)  # log(x + 1) to avoid log(0)
    output_distribution = 'uniform' if one_sided else 'normal'
    transformer = QuantileTransformer(output_distribution=output_distribution)
    return transformer.fit_transform(data.reshape(-1, 1)).reshape(len(data))


def zscore_transformation(values, one_sided=False, log1p_scale=True):
    """Standardize values by their standard deviation.

    The mean and standard deviation are computed on the same data that is
    being standardized (i.e. after the optional ``log1p``), so the two-sided
    result has zero mean and unit standard deviation.

    Args:
        values: Array-like of numbers.
        one_sided: If true, only divide by the standard deviation and do not
            subtract the mean.
        log1p_scale: If true, apply ``log(x + 1)`` before standardizing.

    Returns:
        The standardized values. Constant input has zero standard deviation
        and therefore yields non-finite values.
    """
    if log1p_scale:
        data = np.log1p(values)  # log(x + 1) to avoid log(0)
    else:
        data = np.asarray(values, dtype=float)
    center = 0 if one_sided else np.mean(data)
    scale = np.std(data)
    return (data - center) / scale