Commit: skeleton added

janursa committed Oct 8, 2024
1 parent e7b87a4 commit 652b00e
Showing 16 changed files with 1,026 additions and 1,782 deletions.
1,247 changes: 0 additions & 1,247 deletions api_examples.ipynb

This file was deleted.

938 changes: 523 additions & 415 deletions runs.ipynb

Large diffs are not rendered by default.

12 changes: 12 additions & 0 deletions scripts/sbatch/figr.sh
@@ -0,0 +1,12 @@
#!/bin/bash
#SBATCH --job-name=figr
#SBATCH --time=48:00:00
#SBATCH --output=logs/%j.out
#SBATCH --error=logs/%j.err
#SBATCH --mail-type=END
#SBATCH --mail-user=[email protected]
#SBATCH --cpus-per-task=20
#SBATCH --mem=250G


singularity run ../../images/figr Rscript src/methods/multi_omics/figr/script.R
13 changes: 13 additions & 0 deletions scripts/sbatch/run_skeleton.sh
@@ -0,0 +1,13 @@
#!/bin/bash
#SBATCH --job-name=skeleton
#SBATCH --time=48:00:00
#SBATCH --output=logs/%j.out
#SBATCH --error=logs/%j.err
#SBATCH --mail-type=END
#SBATCH --mail-user=[email protected]
#SBATCH --mem=128G
#SBATCH --cpus-per-task=20
#SBATCH --partition=gpu
#SBATCH --gres=gpu:1

singularity run ../../images/scglue python src/metrics/skeleton/script.py
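
Both job scripts write their SLURM logs to logs/%j.out and logs/%j.err with relative paths, and SLURM does not create that directory itself, so logs/ must already exist in the submission directory or the jobs will produce no output files. A minimal Python sketch of a submission helper is below; it is not part of this commit, and the assumption that the scripts are submitted from the directory in which the relative paths resolve is the user's to verify.

import os
import subprocess

def submit(script_path):
    # SLURM writes --output/--error into logs/<jobid>.out|.err but does not
    # create the directory itself, so make sure it exists first.
    os.makedirs("logs", exist_ok=True)
    result = subprocess.run(["sbatch", script_path], capture_output=True, text=True, check=True)
    print(result.stdout.strip())  # e.g. "Submitted batch job 123456"

submit("scripts/sbatch/figr.sh")
submit("scripts/sbatch/run_skeleton.sh")
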
57 changes: 32 additions & 25 deletions src/methods/multi_omics/figr/script.R
@@ -8,14 +8,14 @@ library(BSgenome.Hsapiens.UCSC.hg38)

## VIASH START
par <- list(
multiomics_rna_r = "resources_test/grn-benchmark/multiomics_rna.rds",
multiomics_atac_r = "resources_test/grn-benchmark/multiomics_atac.rds",
multiomics_rna_r = "resources/grn-benchmark/multiomics_rna.rds",
multiomics_atac_r = "resources/grn-benchmark/multiomics_atac.rds",
temp_dir = "output/figr/",
cell_topic = "resources/prior/cell_topic_d0_hvg.csv",
num_workers = 2,
cell_topic = "resources/prior/cell_topic.csv",
num_workers = 20,
n_topics = 48,
peak_gene = "output/figr/peak_gene.csv",
prediction= "output/figr/prediction.csv"
prediction= "resources/grn_models/figr.csv"
)
print(par)
# meta <- list(
@@ -25,10 +25,9 @@ print(par)
dir.create(par$temp_dir, recursive = TRUE, showWarnings = TRUE)

atac = readRDS(par$multiomics_atac_r)
rna = readRDS(par$multiomics_rna_r)


colnames(atac) <- gsub("-", "", colnames(atac))

rna = readRDS(par$multiomics_rna_r)
colnames(rna) <- gsub("-", "", colnames(rna))


@@ -41,8 +40,7 @@ cellknn_func <- function(par) {
rownames(cellkNN) <- rownames(cell_topic)
saveRDS(cellkNN, paste0(par$temp_dir, "cellkNN.rds"))
}
cellknn_func(par)
print('1: cellknn_func finished')

## Step1: Peak-gene association testing
peak_gene_func <- function(par){

@@ -59,9 +57,6 @@ peak_gene_func <- function(par){
write.csv(cisCorr, paste0(par$temp_dir, "cisCorr.csv"), row.names = TRUE)
}

peak_gene_func(par)

print('2: peak_gene_func finished')
## Step 2: create DORCs and smooth them
dorc_genes_func <- function(par){
cisCorr = read.csv(paste0(par$temp_dir, "cisCorr.csv"))
@@ -83,19 +78,23 @@ dorc_genes_func <- function(par){
cat('cellKNN dim:', dim(cellkNN), '\n')
cat('dorcMat dim:', dim(dorcMat), '\n')
cat('rna dim:', dim(rna), '\n')
dorcMat.s <- smoothScoresNN(NNmat = cellkNN[,1:n_topics], mat = dorcMat, nCores = par$num_workers)
dorcMat.s <- smoothScoresNN(NNmat = cellkNN, mat = dorcMat, nCores = par$num_workers)
cat('dorcMat.s completed')
# Smooth RNA using cell KNNs
# This takes longer since it's all genes
RNAmat.s <- smoothScoresNN(NNmat = cellkNN[,1:n_topics], mat = rna, nCores = par$num_workers)
matching_indices <- match(colnames(rna), rownames(cellkNN))
cellkNN_ordered <- cellkNN[matching_indices, ]


RNAmat.s <- smoothScoresNN(NNmat = cellkNN_ordered, mat = rna, nCores = par$num_workers)
# RNAmat.s <- rna
cat('RNAmat.s completed')
# get peak gene connection
write.csv(cisCorr.filt, paste0(par$temp_dir, "cisCorr.filt.csv"))
saveRDS(RNAmat.s, paste0(par$temp_dir, "RNAmat.s.RDS"))
saveRDS(dorcMat.s, paste0(par$temp_dir, "dorcMat.s.RDS"))
}
dorc_genes_func(par)
print('3: dorc_genes_func finished')

## TF-gene associations
tf_gene_association_func <- function(par){
cisCorr.filt = read.csv(paste0(par$temp_dir, "cisCorr.filt.csv"))
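
The substantive change in the hunk above is that smoothScoresNN is now fed a kNN matrix whose rows have been reordered with match() so that they line up with the columns (cells) of the RNA matrix; without that reordering the smoothing would borrow neighbours from the wrong cells. A rough pandas analogue of that alignment step, for illustration only (the R objects are real, the toy data below is not):

import numpy as np
import pandas as pd

# Toy stand-ins: cellkNN rows are cells, rna columns are cells (as in the R script).
cellkNN = pd.DataFrame(np.arange(9).reshape(3, 3), index=["cellB", "cellA", "cellC"])
rna = pd.DataFrame(np.zeros((5, 3)), columns=["cellA", "cellB", "cellC"])

# Equivalent of: cellkNN_ordered <- cellkNN[match(colnames(rna), rownames(cellkNN)), ]
cellkNN_ordered = cellkNN.loc[rna.columns]

# Row i of cellkNN_ordered now describes the neighbours of the cell in column i of rna.
assert list(cellkNN_ordered.index) == list(rna.columns)
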
@@ -111,8 +110,6 @@ tf_gene_association_func <- function(par){

write.csv(figR.d, paste0(par$temp_dir, "figR.d.csv"))
}
tf_gene_association_func(par)
print('3: tf_gene_association_func finished')

extract_peak_gene_func <- function(par) {
# Read the CSV file
@@ -137,22 +134,20 @@ extract_peak_gene_func <- function(par) {
# Write the result to a CSV file
write.csv(peak_gene_figr, file = par$peak_gene, row.names = FALSE)
}
extract_peak_gene_func(par)
print('4: extract_peak_gene_func finished')


filter_figr_grn <- function(par) {
# Read the CSV file
figr_grn <- read.csv(file.path(par$temp_dir, "figR.d.csv"))

# Filter those that have a Score of 0
figr_grn <- subset(figr_grn, Score != 0)

# Filter based on enrichment
figr_grn <- subset(figr_grn, Enrichment.P < 0.05)

# Filter based on correlation
figr_grn <- subset(figr_grn, Corr.P < 0.05)
# figr_grn <- subset(figr_grn, Corr.P < 0.05)

# Filter those that have a Score of 0
figr_grn <- subset(figr_grn, Score != 0)

# Subset columns
figr_grn <- figr_grn[, c("Motif", "DORC", "Score")]
@@ -168,4 +163,16 @@ filter_figr_grn <- function(par) {
}




cellknn_func(par)
print('1: cellknn_func finished')
peak_gene_func(par)
print('2: peak_gene_func finished')
dorc_genes_func(par)
print('3: dorc_genes_func finished')
tf_gene_association_func(par)
print('4: tf_gene_association_func finished')
extract_peak_gene_func(par)
print('5: extract_peak_gene_func finished')
filter_figr_grn(par)
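
To summarise the post-processing, filter_figr_grn reduces figR.d.csv to a TF-target-score table: rows with an enrichment p-value at or above 0.05 or a FigR score of exactly 0 are dropped (the Corr.P filter is now commented out), and only the Motif, DORC and Score columns are kept. A pandas sketch of that filtering, with hypothetical input and output paths standing in for par$temp_dir and par$prediction:

import pandas as pd

figr_grn = pd.read_csv("output/figr/figR.d.csv")  # hypothetical temp_dir

# Keep significantly enriched motifs with a non-zero FigR regulation score.
figr_grn = figr_grn[figr_grn["Enrichment.P"] < 0.05]
figr_grn = figr_grn[figr_grn["Score"] != 0]

# TF (Motif), target gene (DORC) and edge weight (Score) form the predicted GRN.
figr_grn = figr_grn[["Motif", "DORC", "Score"]]
figr_grn.to_csv("output/figr/prediction.csv", index=False)  # hypothetical par$prediction
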
30 changes: 17 additions & 13 deletions src/methods/multi_omics/scenicplus/main.py
@@ -1,5 +1,7 @@

import os
import gc

import sys
import yaml
import pickle
@@ -16,6 +18,9 @@
import tarfile
from urllib.request import urlretrieve
import json
import os



import flatbuffers
import numpy as np
@@ -193,23 +198,25 @@ def process_peak(par):
chain=False
)

# Create cistopic objects

# Download Mallet

if not os.path.exists(par['MALLET_PATH']):
url = 'https://github.com/mimno/Mallet/releases/download/v202108/Mallet-202108-bin.tar.gz'
response = requests.get(url)
with open(os.path.join(par['temp_dir'], 'Mallet-202108-bin.tar.gz'), 'wb') as f:
f.write(response.content)
with tarfile.open(os.path.join(par['temp_dir'], 'Mallet-202108-bin.tar.gz'), 'r:gz') as f:
f.extractall(path=par['temp_dir'])
del consensus_peaks
del narrow_peak_dict
del adata_atac
gc.collect()
def run_cistopic(par):
adata_atac = anndata.read_h5ad(par['multiomics_atac'])
unique_donor_ids = [s.replace(' ', '_') for s in adata_atac.obs.donor_id.cat.categories]
print(unique_donor_ids)
unique_cell_types = [s.replace(' ', '_') for s in adata_atac.obs.cell_type.cat.categories]

donor_ids = [s.replace(' ', '_') for s in adata_atac.obs.donor_id]
index = [barcode.replace('-', '') + '-' + donor_id for donor_id, barcode in zip(donor_ids, adata_atac.obs.index)]
cell_data = pd.DataFrame({
'cell_type': [s.replace(' ', '_') for s in adata_atac.obs.cell_type.to_numpy()],
'donor_id': [s.replace(' ', '_') for s in adata_atac.obs.donor_id.to_numpy()]
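
The block above downloads the Mallet release only when par['MALLET_PATH'] does not already exist, unpacks the tarball into the temp directory, and then deletes the large peak objects and calls gc.collect() before moving on. A self-contained sketch of the download-and-extract step (same URL as in the diff; the wrapper function and the example paths are illustrative, not part of the repository):

import os
import tarfile
import requests

def fetch_mallet(temp_dir, mallet_path):
    # Skip the download if the extracted binary is already in place.
    if os.path.exists(mallet_path):
        return
    url = "https://github.com/mimno/Mallet/releases/download/v202108/Mallet-202108-bin.tar.gz"
    archive = os.path.join(temp_dir, "Mallet-202108-bin.tar.gz")
    response = requests.get(url)
    response.raise_for_status()               # fail loudly on a bad download
    with open(archive, "wb") as f:
        f.write(response.content)
    with tarfile.open(archive, "r:gz") as f:
        f.extractall(path=temp_dir)           # yields <temp_dir>/Mallet-202108/bin/mallet

fetch_mallet("output/scenicplus", "output/scenicplus/Mallet-202108/bin/mallet")
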
@@ -362,7 +369,12 @@ def run_cistopic(par):
# Save cistopic objects
with open(par['cistopic_object'], 'wb') as f:
pickle.dump(cistopic_obj, f)


del cistopic_obj
del model
del cistopic_obj_list
del adata_atac
gc.collect()

def process_topics(par):
# Load cistopic objects
@@ -559,7 +571,6 @@ def process_topics(par):
split_pattern='-'
)

# Download databases
def download_databases(par):
def download(url: str, filepath: str) -> None:
if os.path.exists(filepath):
@@ -574,7 +585,6 @@ def download(url: str, filepath: str) -> None:
# with open(par['blacklist_path'], 'w') as f:
# f.write(response.text)
download(url, par['blacklist_path'])

url = 'https://resources.aertslab.org/cistarget/motif_collections/v10nr_clust_public/snapshots/motifs-v10-nr.hgnc-m0.00001-o0.0.tbl'
if not os.path.exists(par['motif_annotation']):
download(url, par['motif_annotation'])
@@ -594,7 +604,6 @@ def download(url: str, filepath: str) -> None:
if not os.path.exists(os.path.join(par['temp_dir'], 'cistarget-db', 'create_cisTarget_databases')):
with contextlib.chdir(os.path.join(par['temp_dir'], 'cistarget-db')):
subprocess.run(['git', 'clone', 'https://github.com/aertslab/create_cisTarget_databases'])

# Download cluster-buster
if not os.path.exists(os.path.join(par['temp_dir'], 'cistarget-db', 'cbust')):
urlretrieve('https://resources.aertslab.org/cistarget/programs/cbust', os.path.join(par['temp_dir'], 'cistarget-db', 'cbust'))
@@ -639,7 +648,6 @@ def download(url: str, filepath: str) -> None:
'1000',
'yes'
])

# Create cistarget databases
with open(os.path.join(par['temp_dir'], 'cistarget-db', 'motifs.txt'), 'w') as f:
for filename in os.listdir(os.path.join(par['temp_dir'], 'cistarget-db', 'v10nr_clust_public', 'singletons')):
@@ -659,14 +667,12 @@ def download(url: str, filepath: str) -> None:
'--bgpadding', '1000',
'-t', str(par['num_workers'])
], capture_output=True, text=True)

# Print the result for debugging
print(result.stdout)
print(result.stderr)

def preprocess_rna(par):
os.makedirs(os.path.join(par['temp_dir'], 'scRNA'), exist_ok=True)

print("Preprocess RNA-seq", flush=True)
# Load scRNA-seq data
print("Load scRNA-seq data")
Expand All @@ -688,11 +694,9 @@ def preprocess_rna(par):
adata_rna.raw = adata_rna
sc.pp.normalize_total(adata_rna, target_sum=1e4)
sc.pp.log1p(adata_rna)

# Change barcodes to match the barcodes in the scATAC-seq data
bar_codes = [f'{obs_name.replace("-", "")}-{donor_id}' for obs_name, donor_id in zip(adata_rna.obs_names, adata_rna.obs.donor_id)]
adata_rna.obs_names = bar_codes

# Save scRNA-seq data
adata_rna.write_h5ad(os.path.join(par['temp_dir'], 'rna.h5ad'))

33 changes: 24 additions & 9 deletions src/methods/multi_omics/scenicplus/script.py
@@ -1,5 +1,6 @@

import sys
import os
## VIASH START
par = {
'multiomics_rna': 'resources_test/grn-benchmark/multiomics_rna.h5ad',
@@ -44,6 +45,15 @@
sys.path.append(meta["resources_dir"])
from main import *


def print_memory_usage():
import psutil

process = psutil.Process(os.getpid())
mem_info = process.memory_info().rss / (1024 * 1024) # Convert to MB
print(f"Memory usage: {mem_info:.2f} MB")


def main(par):

par['cistopic_object'] = f'{par["temp_dir"]}/cistopic_object.pkl'
@@ -58,19 +68,24 @@ def main(par):
par['MALLET_PATH'] = os.path.join(par['temp_dir'], 'Mallet-202108', 'bin', 'mallet')
os.makedirs(par['atac_dir'], exist_ok=True)

print('------- download_databases -------')
# print('------- download_databases -------')
# download_databases(par)
print('------- process_peak -------')
# print_memory_usage()
# print('------- process_peak -------')
# process_peak(par)
print('------- run_cistopic -------')
run_cistopic(par)
print('------- process_topics -------')
process_topics(par)

print('------- preprocess_rna -------')
preprocess_rna(par)
# print_memory_usage()
# print('------- run_cistopic -------')
# run_cistopic(par)
# print_memory_usage()
# print('------- process_topics -------')
# process_topics(par)
# print_memory_usage()
# print('------- preprocess_rna -------')
# preprocess_rna(par)
# print_memory_usage()
print('------- snakemake_pipeline -------')
snakemake_pipeline(par)
print_memory_usage()
print('------- post_process -------')
post_process(par)
if __name__ == '__main__':
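
script.py now defines print_memory_usage on top of psutil and places (mostly commented-out) calls between the pipeline stages, so the resident memory of the process can be tracked as each stage completes. A self-contained sketch of that pattern, assuming psutil is installed; the run_stage wrapper and the stage names are illustrative only:

import os
import psutil

def print_memory_usage(label=""):
    # Resident set size of the current process, in MB.
    mem_mb = psutil.Process(os.getpid()).memory_info().rss / (1024 * 1024)
    print(f"Memory usage {label}: {mem_mb:.2f} MB")

def run_stage(name, fn, *args):
    print(f"------- {name} -------")
    fn(*args)
    print_memory_usage(f"after {name}")

# Hypothetical usage mirroring main(par) above:
# run_stage("snakemake_pipeline", snakemake_pipeline, par)
# run_stage("post_process", post_process, par)
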