Merge pull request #13 from bergmanlab/improve_accuracy

Remove prediction redundancy
bergmanlab · Aug 5, 2021 · 9041fa0 · 9041fa0
2 parents 83fbd4c + 1d86bf6
commit 9041fa0
Showing 1 changed file with 79 additions and 4 deletions.
diff --git a/src/telr/TELR_liftover.py b/src/telr/TELR_liftover.py
@@ -1067,16 +1067,91 @@ def liftover(
             report = json.load(f)
             data.append(report)
 
-    json_report = out + "/" + "liftover_report.json"
-    with open(json_report, "w") as output:
+    json_report_tmp = out + "/" + "liftover_report.tmp.json"
+    with open(json_report_tmp, "w") as output:
         json.dump(data, output, indent=4, sort_keys=False)
 
     # add a step to merge close annotations if inserions are : 1) supported by only one flank 2) same family 3) same strand
 
+    # merge entries with overlapping or identical coordinates, choose one with longest TE length
+    ## step one: write json into bed format
+    bed_report_tmp = out + "/" + "liftover_report.tmp.bed"
+    with open(bed_report_tmp, "w") as output:
+        for entry in data:
+            num_hits = entry["num_hits"]
+            te_id = entry["ID"]
+            te_length = entry["te_length"]
+            if num_hits == 1:
+                report = entry["report"]
+                te_type = report["type"]
+                if te_type == "non-reference":
+                    chrom = report["chrom"]
+                    start = report["start"]
+                    end = report["end"]
+                    family = report["family"]
+                    strand = report["strand"]
+                    score = "."
+                    out_line = "\t".join(
+                        [
+                            chrom,
+                            str(start),
+                            str(end),
+                            family,
+                            score,
+                            strand,
+                            str(te_length),
+                            te_id,
+                        ]
+                    )
+                    output.write(out_line + "\n")
+
+    ## step two: sort bed file
+    bed_report_sort = out + "/" + "liftover_report.sort.bed"
+    command = "bedtools sort -i " + bed_report_tmp
+    with open(bed_report_sort, "w") as output:
+        subprocess.call(command, shell=True, stdout=output)
+
+    ## step three: merge entries with same coordinates
+    bed_report_merge = out + "/" + "liftover_report.merge.bed"
+    with open(bed_report_merge, "w") as output:
+        command = (
+            'bedtools merge -d 0 -o collapse -c 2,3,4,5,6,7,8 -delim "," -i '
+            + bed_report_sort
+        )
+        subprocess.call(command, shell=True, stdout=output)
+
+    ## step four: find overlapped entries and pick ones to filter out
+    remove_ids = set()
+    with open(bed_report_merge, "r") as input:
+        for line in input:
+            entry = line.replace("\n", "").split("\t")
+            chrom = entry[0]
+            if "," in entry[3]:
+                len_list = entry[7].split(",")
+                idx = len_list.index(max(len_list))
+                remove_ids.add(entry[8].split(",")[idx])
+
+    ## step five: write final report
+    data_new = []
+    for entry in data:
+        te_id = entry["ID"]
+        if te_id not in remove_ids:
+            data_new.append(entry)
+
+    ## step six: clean up
+    os.remove(bed_report_tmp)
+    os.remove(bed_report_sort)
+    os.remove(bed_report_merge)
+    os.remove(json_report_tmp)
+
+    json_report = out + "/" + "liftover_report.json"
+    with open(json_report, "w") as output:
+        json.dump(data_new, output, indent=4, sort_keys=False)
+
     # write non-reference lifted annotations in BED format
     bed_report = out + "/" + "liftover_nonref.bed"
     with open(bed_report, "w") as output:
-        for item in data:
+        for item in data_new:
             if item["num_hits"] == 1:
                 info = item["report"]
                 chrom = info["chrom"]
@@ -1097,7 +1172,7 @@ def liftover(
     nonref_comments = dict()
     ref_comments = dict()
     unlift_comments = dict()
-    for item in data:
+    for item in data_new:
         info = item["report"]
         if info["type"] == "non-reference":
             nonref_total = nonref_total + 1