From 649f0714e4b0f7489695530c5a7cd3fcfb1855f5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= <ftricomi@ebi.ac.uk>
Date: Thu, 5 Dec 2024 12:03:06 +0000
Subject: [PATCH] moved split_protein to utils

---
 .../tools/anno/protein_annotation/genblast.py | 61 ++-----------------
 src/python/ensembl/tools/anno/utils/_utils.py | 54 +++++++++++++++-
 2 files changed, 57 insertions(+), 58 deletions(-)

diff --git a/src/python/ensembl/tools/anno/protein_annotation/genblast.py b/src/python/ensembl/tools/anno/protein_annotation/genblast.py
index 8652807..a7d8e7f 100644
--- a/src/python/ensembl/tools/anno/protein_annotation/genblast.py
+++ b/src/python/ensembl/tools/anno/protein_annotation/genblast.py
@@ -38,12 +38,10 @@
 import multiprocessing
 import os
 from pathlib import Path
-import random
 import re
 import shutil
 import signal
 import subprocess
-from typing import List
 import argparse
 
 from ensembl.tools.anno.utils._utils import (
@@ -65,7 +63,7 @@ def run_genblast(#pylint:disable=dangerous-default-value
     convert2blastmask_bin: Path = Path("convert2blastmask"),
     makeblastdb_bin: Path = Path("makeblastdb"),
     num_threads: int = 1,
-    protein_set: str = ["uniprot", "orthodb"],
+    protein_set: str = ["uniprot"],
 ) -> None:
     """
     
@@ -102,6 +100,9 @@ def run_genblast(#pylint:disable=dangerous-default-value
     check_exe(genblast_bin)
     check_exe(convert2blastmask_bin)
     check_exe(makeblastdb_bin)
+    if protein_set not in {"uniprot", "orthodb"}:
+
+        raise ValueError("protein_set must be either 'uniprot' or 'orthodb'")
     if protein_set == "uniprot":
         genblast_dir = create_dir(output_dir, "uniprot_output")
     elif protein_set == "orthodb":
@@ -128,7 +129,7 @@ def run_genblast(#pylint:disable=dangerous-default-value
     else:
         _run_convert2blastmask(convert2blastmask_bin, masked_genome, asnb_file)
     _run_makeblastdb(makeblastdb_bin, masked_genome, asnb_file)
-    batched_protein_files = _split_protein_file(
+    batched_protein_files = split_protein_file(
         protein_dataset, genblast_dir, num_threads
     )
     pool = multiprocessing.Pool(num_threads)  # pylint:disable=consider-using-with
@@ -285,58 +286,6 @@ def _generate_genblast_gtf(genblast_dir: Path) -> None:
             ):
                 path.unlink()
 
-
-def _split_protein_file(
-    protein_dataset: Path, output_dir: Path, batch_size: int = 20
-) -> List:
-    """
-    The protein dataset file is splitted by a number of sequence equals to the batch_size
-    in batch files stored in 10 output directories.
-    protein_dataset : Path for the protein dataset.
-    output_dir : Output directory path.
-    batch_size : Size of the batch, it needs to be equals to the number of threads
-    to parallelise the sequence processing for each file.
-    """
-    batched_protein_files = []
-
-    for i in range(0, 10):
-        create_dir(output_dir, (f"bin_{i}"))
-    with open(protein_dataset,"r", encoding="utf8") as file_in:
-        seq_count = 0
-        batch_count = 0
-        current_record = ""
-        initial_seq = True
-        for line in file_in:
-            match = re.search(r">(.+)$", line)
-            # match header and is not first sequence, if the number of stored sequences in each file equals
-            # the number of batch_size, a new file will be created and the current_record reset
-            if match and not initial_seq and seq_count % batch_size == 0:
-                bin_num = random.randint(0, 9)
-                batch_file = output_dir / f"bin_{bin_num}" / f"{batch_count}.fa"
-                with batch_file.open("w+") as file_out:
-                    file_out.write(current_record)
-                batch_count += 1
-                seq_count += 1
-                current_record = line
-                batched_protein_files.append(batch_file)
-            # match header and is the first sequence
-            elif match:
-                current_record += line
-                initial_seq = False
-                seq_count += 1
-            # other lines
-            else:
-                current_record += line
-
-        if current_record:
-            bin_num = random.randint(0, 9)
-            batch_file = output_dir / f"bin_{bin_num}" / f"{batch_count}.fa"
-            with batch_file.open("w+") as file_out:
-                file_out.write(current_record)
-            batched_protein_files.append(batch_file)
-    return batched_protein_files
-
-
 def _run_convert2blastmask(
     convert2blastmask_bin: Path, masked_genome: Path, asnb_file: Path
 ) -> None:
diff --git a/src/python/ensembl/tools/anno/utils/_utils.py b/src/python/ensembl/tools/anno/utils/_utils.py
index c0c9786..08c3db3 100644
--- a/src/python/ensembl/tools/anno/utils/_utils.py
+++ b/src/python/ensembl/tools/anno/utils/_utils.py
@@ -21,6 +21,7 @@
 import os
 from os import PathLike
 from pathlib import Path
+import random
 import re
 import subprocess
 import shutil
@@ -81,8 +82,8 @@ def check_gtf_content(gtf_file: PathLike, content_obj: str) -> int:
     Check number of transcript lines in the GTF
 
     Arg:
-      gtf_file: GTF file path
-      content_obj: Object to detect and count in the gtf i.e transcript, repeat
+        gtf_file: GTF file path
+        content_obj: Object to detect and count in the gtf i.e transcript, repeat
 
     Return: Number of occurences
     """
@@ -403,6 +404,55 @@ def check_file(file_path: Path) -> None:
     if not file_path.is_file():
         raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), file_path)
 
+def split_protein_file(
+    protein_dataset: Path, output_dir: Path, batch_size: int = 20
+) -> List:
+    """
+    The protein dataset file is splitted by a number of sequence equals to the batch_size
+    in batch files stored in 10 output directories.
+    protein_dataset : Path for the protein dataset.
+    output_dir : Output directory path.
+    batch_size : Size of the batch, it needs to be equals to the number of threads
+    to parallelise the sequence processing for each file.
+    """
+    batched_protein_files = []
+
+    for i in range(0, 10):
+        create_dir(output_dir, (f"bin_{i}"))
+    with open(protein_dataset,"r", encoding="utf8") as file_in:
+        seq_count = 0
+        batch_count = 0
+        current_record = ""
+        initial_seq = True
+        for line in file_in:
+            match = re.search(r">(.+)$", line)
+            # match header and is not first sequence, if the number of stored sequences in each file equals
+            # the number of batch_size, a new file will be created and the current_record reset
+            if match and not initial_seq and seq_count % batch_size == 0:
+                bin_num = random.randint(0, 9)
+                batch_file = output_dir / f"bin_{bin_num}" / f"{batch_count}.fa"
+                with batch_file.open("w+") as file_out:
+                    file_out.write(current_record)
+                batch_count += 1
+                seq_count += 1
+                current_record = line
+                batched_protein_files.append(batch_file)
+            # match header and is the first sequence
+            elif match:
+                current_record += line
+                initial_seq = False
+                seq_count += 1
+            # other lines
+            else:
+                current_record += line
+
+        if current_record:
+            bin_num = random.randint(0, 9)
+            batch_file = output_dir / f"bin_{bin_num}" / f"{batch_count}.fa"
+            with batch_file.open("w+") as file_out:
+                file_out.write(current_record)
+            batched_protein_files.append(batch_file)
+    return batched_protein_files
 
 def load_results_to_ensembl_db(
     main_script_dir,