From 180374e2f52d430c1fa50a6fa5886c1c8be7696f Mon Sep 17 00:00:00 2001 From: Jacques Dainat Date: Tue, 13 Feb 2024 17:22:44 +0100 Subject: [PATCH] fix 424 - bug in agat_sp_complement_annotations.pl / check_feature_overlap_from_l3_to_l1 (#425) * fix check_feature_overlap_from_l3_to_l1. fix #424 * add a new test for agat_sp_complement_annotations --- lib/AGAT/OmniscientTool.pm | 227 +++++++++--------- t/scripts_output.t | 5 + .../agat_sp_complement_annotations_add.gff | 6 + .../agat_sp_complement_annotations_ref.gff | 6 + .../out/agat_sp_complement_annotations_2.gff | 7 + 5 files changed, 137 insertions(+), 114 deletions(-) create mode 100644 t/scripts_output/in/agat_sp_complement_annotations/agat_sp_complement_annotations_add.gff create mode 100644 t/scripts_output/in/agat_sp_complement_annotations/agat_sp_complement_annotations_ref.gff create mode 100644 t/scripts_output/out/agat_sp_complement_annotations_2.gff diff --git a/lib/AGAT/OmniscientTool.pm b/lib/AGAT/OmniscientTool.pm index b84d20e1..c7cdb46a 100644 --- a/lib/AGAT/OmniscientTool.pm +++ b/lib/AGAT/OmniscientTool.pm @@ -128,7 +128,6 @@ sub complement_omniscients { my ($omniscient1, $omniscient2, $size_min, $verbose)=@_; my %add_omniscient; - $size_min=0 if (! $size_min); if(! $verbose){$verbose=0;} @@ -152,13 +151,12 @@ sub complement_omniscients { #If location_to_check start if over the end of the reference location, we stop if($location2->[1] > $location->[2]) {last;} - print "location_to_check start if over the end of the reference location.\n" if ($verbose >= 3); #If location_to_check end if inferior to the start of the reference location, we continue next if($location2->[2] < $location->[1]) {next;} - print "location_to_check start if inferior to the start of the reference location.\n" if ($verbose >= 3); # Let's check at Gene LEVEL + print "location overlap at gene level check now level3.\n" if ($verbose >= 3); if( location_overlap($location, $location2) ){ #location overlap at gene level check now level3 #let's check at CDS level (/!\ id1_l1 is corresponding to id from $omniscient2) if(check_feature_overlap_from_l3_to_l1($omniscient2, $omniscient1, $id1_l1, $id2_l1)){ #If contains CDS it has to overlap at CDS level, otherwise any type of feature level3 overlaping is sufficient to decide that they overlap @@ -2060,133 +2058,134 @@ sub check_gene_overlap_at_level3{ # if no l3 check at l2 # then if no l2 check at l1 sub check_feature_overlap_from_l3_to_l1{ - my ($hash_omniscient, $hash_omniscient2, $gene_id, $gene_id2)=@_; + my ($hash_omniscient, $hash_omniscient2, $gene_id, $gene_id2)=@_; - my $level3_global=0; - my $overlap_ft = undef; - my $level2_overlap=undef; - my $exist_l2A=undef; - my $exist_l2B=undef; + my $level3_global=0; + my $overlap_ft = undef; + my $level2_overlap=undef; + my $exist_l2A=undef; + my $exist_l2B=undef; - # check at L3 + # check at L3 foreach my $l2_type (keys %{$hash_omniscient->{'level2'}} ){ - if(exists_keys($hash_omniscient,('level2', $l2_type, lc($gene_id)))){ - $exist_l2A=1; + if(exists_keys($hash_omniscient,('level2', $l2_type, lc($gene_id)))){ + $exist_l2A=1; foreach my $mrna_feature (@{$hash_omniscient->{'level2'}{$l2_type}{lc($gene_id)}}){ my $mrna_id1 = $mrna_feature->_tag_value('ID'); if(exists_keys($hash_omniscient2,('level2', $l2_type, lc($gene_id2)))){ + # from here both feature level2 are the same type + foreach my $mrna_feature2 (@{$hash_omniscient2->{'level2'}{$l2_type}{lc($gene_id2)}}){ + my $mrna_id2 = $mrna_feature2->_tag_value('ID'); - # from here both feature level2 are the same type - foreach my $mrna_feature2 (@{$hash_omniscient2->{'level2'}{$l2_type}{lc($gene_id2)}}){ - my $mrna_id2 = $mrna_feature2->_tag_value('ID'); + # check overlap in case for later if Level3 test not possible + if(($mrna_feature2->start <= $mrna_feature->end) and ($mrna_feature2->end >= $mrna_feature->start )){ # they overlap + $level2_overlap=$l2_type; + } - # check overlap in case for later if LEvel3 test not possible - if(($mrna_feature2->start <= $mrna_feature->end) and ($mrna_feature2->end >= $mrna_feature->start )){ # they overlap - $level2_overlap=$l2_type; - } + #check all cds pieces - CDS against CDS + my $cds_local=0; + $cds_local++ if ( exists_keys($hash_omniscient,('level3', 'cds', lc($mrna_id1)))); + $cds_local++ if ( exists_keys($hash_omniscient2,('level3', 'cds', lc($mrna_id2)))); + $level3_global += $cds_local; + if( $cds_local == 2) { + foreach my $cds_feature1 (@{$hash_omniscient->{'level3'}{'cds'}{lc($mrna_id1)}}){ + foreach my $cds_feature2 (@{$hash_omniscient2->{'level3'}{'cds'}{lc($mrna_id2)}}){ + if(($cds_feature2->start <= $cds_feature1->end) and ($cds_feature2->end >= $cds_feature1->start )){ # they overlap + $overlap_ft = "cds"; + last; + } + } + last if ($overlap_ft); + } + } - #check all cds pieces - CDS against CDS - my $cds_local=0; - $cds_local++ if ( exists_keys($hash_omniscient,('level3', 'cds', lc($mrna_id1)))); - $cds_local++ if ( exists_keys($hash_omniscient,('level3', 'cds', lc($mrna_id2)))); - $level3_global += $cds_local; - if( $cds_local == 2) { - foreach my $cds_feature1 (@{$hash_omniscient->{'level3'}{'cds'}{lc($mrna_id1)}}){ - foreach my $cds_feature2 (@{$hash_omniscient2->{'level3'}{'cds'}{lc($mrna_id2)}}){ - if(($cds_feature2->start <= $cds_feature1->end) and ($cds_feature2->end >= $cds_feature1->start )){ # they overlap - $overlap_ft = "cds"; - last; - } - } - last if ($overlap_ft); - } - } - # No CDS, check at exon - elsif(!$cds_local){ - # check all exon pieces - exon against exon - my $exon_local=0; - $exon_local++ if ( exists_keys($hash_omniscient,('level3', 'exon', lc($mrna_id1)))); - $exon_local++ if ( exists_keys($hash_omniscient,('level3', 'exon', lc($mrna_id2)))); - $level3_global += $exon_local; - if( $exon_local == 2) { - foreach my $cds_feature1 (@{$hash_omniscient->{'level3'}{'exon'}{lc($mrna_id1)}}){ - foreach my $cds_feature2 (@{$hash_omniscient2->{'level3'}{'exon'}{lc($mrna_id2)}}){ - if(($cds_feature2->start <= $cds_feature1->end) and ($cds_feature2->end >= $cds_feature1->start )){ # they overlap - $overlap_ft = "exon"; - last; - } - } - last if ($overlap_ft); - } - } - # No CDS, No exon, check at other feature types - elsif(!$exon_local){ - foreach my $tag_l3 (keys %{$hash_omniscient->{'level3'}}){ - if ($tag_l3 ne "cds" or $tag_l3 ne "exon"){ - my $l3_local=0; - $l3_local++ if ( exists_keys($hash_omniscient,('level3', $tag_l3, lc($mrna_id1)))); - $l3_local++ if ( exists_keys($hash_omniscient,('level3', $tag_l3, lc($mrna_id2)))); - $level3_global += $l3_local; - if( $l3_local == 2) { - foreach my $feature1 (@{$hash_omniscient->{'level3'}{$tag_l3}{lc($mrna_id1)}}){ - foreach my $feature2 (@{$hash_omniscient2->{'level3'}{$tag_l3}{lc($mrna_id2)}}){ - if(($feature2->start <= $feature1->end) and ($feature2->end >= $feature1->start )){ # they overlap - $overlap_ft = $tag_l3; - last; - } - last if ($overlap_ft); - } - } - } - last if ($overlap_ft); - } - } - } - } - last if ($overlap_ft); - } + # No CDS, check at exon + elsif(!$cds_local){ + # check all exon pieces - exon against exon + my $exon_local=0; + $exon_local++ if ( exists_keys($hash_omniscient,('level3', 'exon', lc($mrna_id1)))); + $exon_local++ if ( exists_keys($hash_omniscient,('level3', 'exon', lc($mrna_id2)))); + $level3_global += $exon_local; + if( $exon_local == 2) { + foreach my $cds_feature1 (@{$hash_omniscient->{'level3'}{'exon'}{lc($mrna_id1)}}){ + foreach my $cds_feature2 (@{$hash_omniscient2->{'level3'}{'exon'}{lc($mrna_id2)}}){ + if(($cds_feature2->start <= $cds_feature1->end) and ($cds_feature2->end >= $cds_feature1->start )){ # they overlap + $overlap_ft = "exon"; + last; + } + } + last if ($overlap_ft); + } + } + # No CDS, No exon, check at other feature types + elsif(!$exon_local){ + foreach my $tag_l3 (keys %{$hash_omniscient->{'level3'}}){ + if ($tag_l3 ne "cds" or $tag_l3 ne "exon"){ + my $l3_local=0; + $l3_local++ if ( exists_keys($hash_omniscient,('level3', $tag_l3, lc($mrna_id1)))); + $l3_local++ if ( exists_keys($hash_omniscient,('level3', $tag_l3, lc($mrna_id2)))); + $level3_global += $l3_local; + if( $l3_local == 2) { + foreach my $feature1 (@{$hash_omniscient->{'level3'}{$tag_l3}{lc($mrna_id1)}}){ + foreach my $feature2 (@{$hash_omniscient2->{'level3'}{$tag_l3}{lc($mrna_id2)}}){ + if(($feature2->start <= $feature1->end) and ($feature2->end >= $feature1->start )){ # they overlap + $overlap_ft = $tag_l3; + last; + } + last if ($overlap_ft); + } + } + } + last if ($overlap_ft); + } + } + } + } + last if ($overlap_ft); + } } - last if ($overlap_ft); + last if ($overlap_ft); } } - last if ($overlap_ft); + last if ($overlap_ft); } - # Level3 test not possible, check overlap at level2 - if (! $level3_global){ #nothing tested at level3 - if ( $level2_overlap){ - $overlap_ft = $level2_overlap; - } - else{ - if (! $exist_l2A){ - # check other locus was wihtout l2 also - foreach my $l2_type (keys %{$hash_omniscient->{'level2'}} ){ - if(exists_keys($hash_omniscient,('level2', $l2_type, lc($gene_id2)))){ - $exist_l2B=1; - } - } - if (! $exist_l2B){ # locus2 was also without l2 let's check at level1 now - foreach my $tag_l1 (keys %{$hash_omniscient->{'level1'}} ){ - if(exists_keys($hash_omniscient,('level1', $tag_l1, lc($gene_id)))){ - if(exists_keys($hash_omniscient,('level1', $tag_l1, lc($gene_id2)))){ - my $level1_feature = $hash_omniscient->{'level1'}{$tag_l1}{$gene_id}; - my $level1_feature2 = $hash_omniscient->{'level1'}{$tag_l1}{$gene_id2}; - if(($level1_feature2->start <= $level1_feature->end) and ($level1_feature2->end >= $level1_feature->start )){ # they overlap - $overlap_ft=$tag_l1; - last - } - } - else{ - last; - } - } - } - } - } - } - } + + # Level3 test not possible, check overlap at level2 + if (! $level3_global){ #nothing tested at level3 + if ( $level2_overlap){ + $overlap_ft = $level2_overlap; + } + else{ + if (! $exist_l2A){ + # check other locus was wihtout l2 also + foreach my $l2_type (keys %{$hash_omniscient->{'level2'}} ){ + if(exists_keys($hash_omniscient,('level2', $l2_type, lc($gene_id2)))){ + $exist_l2B=1; + } + } + if (! $exist_l2B){ # locus2 was also without l2 let's check at level1 now + foreach my $tag_l1 (keys %{$hash_omniscient->{'level1'}} ){ + if(exists_keys($hash_omniscient,('level1', $tag_l1, lc($gene_id)))){ + if(exists_keys($hash_omniscient,('level1', $tag_l1, lc($gene_id2)))){ + my $level1_feature = $hash_omniscient->{'level1'}{$tag_l1}{$gene_id}; + my $level1_feature2 = $hash_omniscient->{'level1'}{$tag_l1}{$gene_id2}; + if(($level1_feature2->start <= $level1_feature->end) and ($level1_feature2->end >= $level1_feature->start )){ # they overlap + $overlap_ft=$tag_l1; + last + } + } + else{ + last; + } + } + } + } + } + } + } return $overlap_ft; } diff --git a/t/scripts_output.t b/t/scripts_output.t index 11f3914e..857b78d4 100644 --- a/t/scripts_output.t +++ b/t/scripts_output.t @@ -260,6 +260,11 @@ system(" $script --ref t/gff_syntax/in/25_test.gff --add t/gff_syntax/in/9_test ok( system("diff $result $outtmp") == 0, "output $script"); unlink $outtmp; +$result = "$output_folder/agat_sp_complement_annotations_2.gff"; +system(" $script --ref $input_folder/agat_sp_complement_annotations/agat_sp_complement_annotations_ref.gff --add $input_folder/agat_sp_complement_annotations/agat_sp_complement_annotations_add.gff -o $outtmp 2>&1 1>/dev/null"); +#run test +ok( system("diff $result $outtmp") == 0, "output $script"); +unlink $outtmp; # --------check agat_sp_ensembl_output_style.pl------------- diff --git a/t/scripts_output/in/agat_sp_complement_annotations/agat_sp_complement_annotations_add.gff b/t/scripts_output/in/agat_sp_complement_annotations/agat_sp_complement_annotations_add.gff new file mode 100644 index 00000000..fb03df17 --- /dev/null +++ b/t/scripts_output/in/agat_sp_complement_annotations/agat_sp_complement_annotations_add.gff @@ -0,0 +1,6 @@ +Chr1 Liftoff gene 41075 54676 . + . ID=gene-gene1 +Chr1 Liftoff mRNA 41075 54676 . + . ID=rna-transcript1;Parent=gene-gene1 +Chr1 Liftoff exon 41075 54676 . + . ID=exon-transcript1-1;Parent=rna-transcript1 +Chr1 Liftoff CDS 41311 54435 . + 0 ID=cds-codingsequence-1.1;Parent=rna-transcript1 +Chr1 Liftoff five_prime_UTR 41075 41310 . + . ID=nbis-five_prime_utr-19342;Parent=rna-transcript1 +Chr1 Liftoff three_prime_UTR 54436 54676 . + . ID=nbis-three_prime_utr-18368;Parent=rna-transcript1 diff --git a/t/scripts_output/in/agat_sp_complement_annotations/agat_sp_complement_annotations_ref.gff b/t/scripts_output/in/agat_sp_complement_annotations/agat_sp_complement_annotations_ref.gff new file mode 100644 index 00000000..a3791da2 --- /dev/null +++ b/t/scripts_output/in/agat_sp_complement_annotations/agat_sp_complement_annotations_ref.gff @@ -0,0 +1,6 @@ +Chr1 Helixer gene 41064 54753 . + . ID=Chr1_000001 +Chr1 Helixer mRNA 41064 54753 . + . ID=Chr1_000001.1;Parent=Chr1_000001 +Chr1 Helixer exon 41064 54753 . + . ID=Chr1_000001.1.exon.1;Parent=Chr1_000001.1 +Chr1 Helixer five_prime_UTR 41064 41310 . + . ID=Chr1_000001.1.five_prime_UTR.1;Parent=Chr1_000001.1 +Chr1 Helixer CDS 41311 54435 . + 0 ID=Chr1_000001.1.CDS.1;Parent=Chr1_000001.1 +Chr1 Helixer three_prime_UTR 54436 54753 . + . ID=Chr1_000001.1.three_prime_UTR.1;Parent=Chr1_000001.1 diff --git a/t/scripts_output/out/agat_sp_complement_annotations_2.gff b/t/scripts_output/out/agat_sp_complement_annotations_2.gff new file mode 100644 index 00000000..784a39c9 --- /dev/null +++ b/t/scripts_output/out/agat_sp_complement_annotations_2.gff @@ -0,0 +1,7 @@ +##gff-version 3 +Chr1 Helixer gene 41064 54753 . + . ID=Chr1_000001 +Chr1 Helixer mRNA 41064 54753 . + . ID=Chr1_000001.1;Parent=Chr1_000001 +Chr1 Helixer exon 41064 54753 . + . ID=Chr1_000001.1.exon.1;Parent=Chr1_000001.1 +Chr1 Helixer CDS 41311 54435 . + 0 ID=Chr1_000001.1.CDS.1;Parent=Chr1_000001.1 +Chr1 Helixer five_prime_UTR 41064 41310 . + . ID=Chr1_000001.1.five_prime_UTR.1;Parent=Chr1_000001.1 +Chr1 Helixer three_prime_UTR 54436 54753 . + . ID=Chr1_000001.1.three_prime_UTR.1;Parent=Chr1_000001.1