moved split_protein to utils
ens-ftricomi committed Dec 5, 2024
1 parent 042bc4b commit 649f071
Showing 2 changed files with 57 additions and 58 deletions.
61 changes: 5 additions & 56 deletions src/python/ensembl/tools/anno/protein_annotation/genblast.py
@@ -38,12 +38,10 @@
import multiprocessing
import os
from pathlib import Path
-import random
import re
import shutil
import signal
import subprocess
-from typing import List
import argparse

from ensembl.tools.anno.utils._utils import (
@@ -65,7 +63,7 @@ def run_genblast(#pylint:disable=dangerous-default-value
    convert2blastmask_bin: Path = Path("convert2blastmask"),
    makeblastdb_bin: Path = Path("makeblastdb"),
    num_threads: int = 1,
-    protein_set: str = ["uniprot", "orthodb"],
+    protein_set: str = "uniprot",
) -> None:
    """
@@ -102,6 +100,9 @@ def run_genblast(#pylint:disable=dangerous-default-value
    check_exe(genblast_bin)
    check_exe(convert2blastmask_bin)
    check_exe(makeblastdb_bin)
+    if protein_set not in {"uniprot", "orthodb"}:
+        raise ValueError("protein_set must be either 'uniprot' or 'orthodb'")
+
    if protein_set == "uniprot":
        genblast_dir = create_dir(output_dir, "uniprot_output")
    elif protein_set == "orthodb":
@@ -128,7 +129,7 @@ def run_genblast(#pylint:disable=dangerous-default-value
    else:
        _run_convert2blastmask(convert2blastmask_bin, masked_genome, asnb_file)
        _run_makeblastdb(makeblastdb_bin, masked_genome, asnb_file)
-    batched_protein_files = _split_protein_file(
+    batched_protein_files = split_protein_file(
        protein_dataset, genblast_dir, num_threads
    )
    pool = multiprocessing.Pool(num_threads)  # pylint:disable=consider-using-with
@@ -285,58 +286,6 @@ def _generate_genblast_gtf(genblast_dir: Path) -> None:
        ):
            path.unlink()


-def _split_protein_file(
-    protein_dataset: Path, output_dir: Path, batch_size: int = 20
-) -> List:
-    """
-    Split the protein dataset file into batches of batch_size sequences,
-    written as batch files spread across 10 output subdirectories (bin_0 .. bin_9).
-    protein_dataset : Path to the protein dataset.
-    output_dir : Output directory path.
-    batch_size : Number of sequences per batch; set it equal to the number of
-        threads to parallelise the processing of the batch files.
-    """
-    batched_protein_files = []
-
-    for i in range(0, 10):
-        create_dir(output_dir, f"bin_{i}")
-    with open(protein_dataset, "r", encoding="utf8") as file_in:
-        seq_count = 0
-        batch_count = 0
-        current_record = ""
-        initial_seq = True
-        for line in file_in:
-            match = re.search(r">(.+)$", line)
-            # Header line after the first sequence: once batch_size sequences
-            # have accumulated, write the batch to a new file and start the
-            # next record.
-            if match and not initial_seq and seq_count % batch_size == 0:
-                bin_num = random.randint(0, 9)
-                batch_file = output_dir / f"bin_{bin_num}" / f"{batch_count}.fa"
-                with batch_file.open("w+") as file_out:
-                    file_out.write(current_record)
-                batch_count += 1
-                seq_count += 1
-                current_record = line
-                batched_protein_files.append(batch_file)
-            # Header line of the first sequence, or one mid-batch: accumulate it.
-            elif match:
-                current_record += line
-                initial_seq = False
-                seq_count += 1
-            # Sequence lines.
-            else:
-                current_record += line
-
-    if current_record:
-        bin_num = random.randint(0, 9)
-        batch_file = output_dir / f"bin_{bin_num}" / f"{batch_count}.fa"
-        with batch_file.open("w+") as file_out:
-            file_out.write(current_record)
-        batched_protein_files.append(batch_file)
-    return batched_protein_files


def _run_convert2blastmask(
convert2blastmask_bin: Path, masked_genome: Path, asnb_file: Path
) -> None:
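To make the refactor concrete, here is a condensed, hypothetical sketch of the call pattern this change leaves in genblast.py: the batching helper is now imported from the shared utils module, the new protein_set guard rejects unknown values, and each batch file is handed to one pool worker. run_batches and _process_batch are illustrative stand-ins, not names from the repository.

import multiprocessing
from pathlib import Path

from ensembl.tools.anno.utils._utils import split_protein_file


def _process_batch(batch_file: Path) -> None:
    # Stand-in for the per-batch GenBlast worker defined in genblast.py.
    print(f"would run GenBlast on {batch_file}")


def run_batches(
    protein_dataset: Path,
    genblast_dir: Path,
    protein_set: str = "uniprot",
    num_threads: int = 1,
) -> None:
    if protein_set not in {"uniprot", "orthodb"}:
        raise ValueError("protein_set must be either 'uniprot' or 'orthodb'")
    # batch_size is set to num_threads, matching the call in run_genblast above.
    batched_protein_files = split_protein_file(
        protein_dataset, genblast_dir, num_threads
    )
    pool = multiprocessing.Pool(num_threads)  # pylint:disable=consider-using-with
    for batch_file in batched_protein_files:
        pool.apply_async(_process_batch, args=(batch_file,))
    pool.close()
    pool.join()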
54 changes: 52 additions & 2 deletions src/python/ensembl/tools/anno/utils/_utils.py
@@ -21,6 +21,7 @@
import os
from os import PathLike
from pathlib import Path
+import random
import re
import subprocess
import shutil
@@ -81,8 +82,8 @@ def check_gtf_content(gtf_file: PathLike, content_obj: str) -> int:
    Check number of transcript lines in the GTF
    Arg:
-    gtf_file: GTF file path
-    content_obj: Object to detect and count in the gtf i.e transcript, repeat
+        gtf_file: GTF file path
+        content_obj: Object to detect and count in the gtf, i.e. transcript, repeat
    Return: Number of occurrences
    """
@@ -403,6 +404,55 @@ def check_file(file_path: Path) -> None:
    if not file_path.is_file():
        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), file_path)

+def split_protein_file(
+    protein_dataset: Path, output_dir: Path, batch_size: int = 20
+) -> List:
+    """
+    Split the protein dataset file into batches of batch_size sequences,
+    written as batch files spread across 10 output subdirectories (bin_0 .. bin_9).
+    protein_dataset : Path to the protein dataset.
+    output_dir : Output directory path.
+    batch_size : Number of sequences per batch; set it equal to the number of
+        threads to parallelise the processing of the batch files.
+    """
+    batched_protein_files = []
+
+    for i in range(0, 10):
+        create_dir(output_dir, f"bin_{i}")
+    with open(protein_dataset, "r", encoding="utf8") as file_in:
+        seq_count = 0
+        batch_count = 0
+        current_record = ""
+        initial_seq = True
+        for line in file_in:
+            match = re.search(r">(.+)$", line)
+            # Header line after the first sequence: once batch_size sequences
+            # have accumulated, write the batch to a new file and start the
+            # next record.
+            if match and not initial_seq and seq_count % batch_size == 0:
+                bin_num = random.randint(0, 9)
+                batch_file = output_dir / f"bin_{bin_num}" / f"{batch_count}.fa"
+                with batch_file.open("w+") as file_out:
+                    file_out.write(current_record)
+                batch_count += 1
+                seq_count += 1
+                current_record = line
+                batched_protein_files.append(batch_file)
+            # Header line of the first sequence, or one mid-batch: accumulate it.
+            elif match:
+                current_record += line
+                initial_seq = False
+                seq_count += 1
+            # Sequence lines.
+            else:
+                current_record += line
+
+    if current_record:
+        bin_num = random.randint(0, 9)
+        batch_file = output_dir / f"bin_{bin_num}" / f"{batch_count}.fa"
+        with batch_file.open("w+") as file_out:
+            file_out.write(current_record)
+        batched_protein_files.append(batch_file)
+    return batched_protein_files

def load_results_to_ensembl_db(
    main_script_dir,

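As a usage note for the relocated helper, the following minimal sketch assumes the ensembl-anno package is importable; the FASTA content and paths are invented for illustration. With batch_size=2, the three sequences below produce one two-sequence batch and one single-sequence batch, each written to a randomly chosen bin_0 .. bin_9 subdirectory under the output directory.

from pathlib import Path

from ensembl.tools.anno.utils._utils import split_protein_file

proteins = Path("proteins.fa")  # hypothetical input dataset
proteins.write_text(">sp1\nMKTAYIAK\n>sp2\nMVLSPADK\n>sp3\nMAAGTKLV\n", encoding="utf8")

out_dir = Path("batches")
out_dir.mkdir(exist_ok=True)  # in case create_dir expects an existing parent

batch_files = split_protein_file(proteins, out_dir, batch_size=2)
for batch in batch_files:
    n_seqs = batch.read_text(encoding="utf8").count(">")
    print(batch, n_seqs, "sequences")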