Skip to content

Commit

Permalink
agat_sp_manage_IDs.pl not sorting properly fix #411 (#415)
Browse files Browse the repository at this point in the history
* add a pipe ("|") separator between start and end in the sort key (prevents a very long feature that starts further along the sequence from being placed earlier in the file because of its very high end value) + change level2 sorting for print_omniscient_as_match

* fix #411 - level1 features were not using the same ordering as the output (add Sort::Naturally sorting to level1)

Co-authored-by: Jacques Dainat <[email protected]>
  • Loading branch information
Juke34 and Juke34 authored Jan 12, 2024
1 parent 834fdad commit 703106c
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 32 deletions.
17 changes: 6 additions & 11 deletions bin/agat_sp_manage_IDs.pl
Original file line number Diff line number Diff line change
Expand Up @@ -104,21 +104,16 @@
my $spreadfeatures = $hash_omniscient->{'other'}{'level'}{'spread'};

# sort by seq id
my %hash_sortBySeq;
foreach my $tag_level1 ( keys %{$hash_omniscient->{'level1'}}){
foreach my $level1_id ( keys %{$hash_omniscient->{'level1'}{$tag_level1}}){
my $position=$hash_omniscient->{'level1'}{$tag_level1}{$level1_id}->seq_id;
push (@{$hash_sortBySeq{$position}{$tag_level1}}, $hash_omniscient->{'level1'}{$tag_level1}{$level1_id});
}
}
my $hash_sortBySeq = gather_and_sort_l1_by_seq_id($hash_omniscient);

my $opt_tair_suffix=0;
#Read by seqId to sort properly for ID naming
foreach my $seqid (sort { (($a =~ /(\d+)$/)[0] || 0) <=> (($b =~ /(\d+)$/)[0] || 0) } keys %hash_sortBySeq){ # loop over all the feature level1

foreach my $tag_l1 (sort {$a cmp $b} keys %{$hash_sortBySeq{$seqid}}){
foreach my $seqid ( sort { ncmp ($a, $b) } keys %{$hash_sortBySeq}){ # loop over all the feature level1

foreach my $tag_l1 (sort {$a cmp $b} keys %{$hash_sortBySeq->{$seqid}}){

foreach my $feature_l1 ( sort {$a->start <=> $b->start} @{$hash_sortBySeq{$seqid}{$tag_l1}}){
foreach my $feature_l1 ( @{$hash_sortBySeq->{$seqid}{$tag_l1}}){ # feature are alredy sorted by function that made that hash
my $id_l1 = lc($feature_l1->_tag_value('ID'));
my $l1_ID_modified=undef;

Expand All @@ -135,7 +130,7 @@
foreach my $tag_l2 (sort {$a cmp $b} keys %{$hash_omniscient->{'level2'}}){ # primary_tag_key_level2 = mrna or mirna or ncrna or trna etc...

if ( exists ($hash_omniscient->{'level2'}{$tag_l2}{$id_l1} ) ){
foreach my $feature_l2 ( sort { ncmp ($a->start.$a->end.$a->_tag_value('ID'), $b->start.$b->end.$b->_tag_value('ID') ) } @{$hash_omniscient->{'level2'}{$tag_l2}{$id_l1}}) {
foreach my $feature_l2 ( sort { ncmp ($a->start."|".$a->end.$a->_tag_value('ID'), $b->start."|".$b->end.$b->_tag_value('ID') ) } @{$hash_omniscient->{'level2'}{$tag_l2}{$id_l1}}) {
$opt_tair_suffix++;
my $l2_ID_modified=undef;
my $level2_ID = lc($feature_l2->_tag_value('ID'));
Expand Down
32 changes: 16 additions & 16 deletions lib/AGAT/OmniscientO.pm
Original file line number Diff line number Diff line change
Expand Up @@ -234,9 +234,9 @@ sub print_omniscient_as_gff{
}
else{

### OLD FASHION GOING TRHOUGH LEVEL1
#foreach my $primary_tag_l1 ( sort {$a <=> $b or $a cmp $b} keys %{$omniscient->{'level1'}}){ # primary_tag_l1 = gene or repeat etc...
# foreach my $id_tag_key_level1 ( sort { $omniscient->{'level1'}{$primary_tag_l1}{$a}->start <=> $omniscient->{'level1'}{$primary_tag_l1}{$b}->start } keys %{$omniscient->{'level1'}{$primary_tag_l1}} ) { #sort by position
### Just for information - OLD FASHION GOING THROUGH LEVEL1
#foreach my $primary_tag_l1 ( sort {$a <=> $b or $a cmp $b} keys %{$omniscient->{'level1'}}){ # primary_tag_l1 = gene or repeat etc...
# foreach my $id_tag_key_level1 ( sort { $omniscient->{'level1'}{$primary_tag_l1}{$a}->start <=> $omniscient->{'level1'}{$primary_tag_l1}{$b}->start } keys %{$omniscient->{'level1'}{$primary_tag_l1}} ) { #sort by position

### NEW FASHION GOING THROUGH LEVEL1 - Have to first create a hash of seq_id -> level1_feature, then we can go through in alphanumerical order.

Expand Down Expand Up @@ -264,23 +264,23 @@ sub print_omniscient_as_gff{
foreach my $primary_tag_l2 (sort {$a cmp $b} keys %{$omniscient->{'level2'}}){ # primary_tag_l2 = mrna or mirna or ncrna or trna etc...

if ( exists_keys( $omniscient, ('level2', $primary_tag_l2, $id_tag_key_level1) ) ){
foreach my $feature_level2 ( sort { ncmp ($a->start.$a->end.$a->_tag_value('ID'), $b->start.$b->end.$b->_tag_value('ID') ) } @{$omniscient->{'level2'}{$primary_tag_l2}{$id_tag_key_level1}}) {
foreach my $feature_level2 ( sort { ncmp ($a->start."|".$a->end.$a->_tag_value('ID'), $b->start."|".$b->end.$b->_tag_value('ID') ) } @{$omniscient->{'level2'}{$primary_tag_l2}{$id_tag_key_level1}}) {
$gffout->write_feature($feature_level2);

#################
# == LEVEL 3 == #
#################
my $level2_ID ;
if($feature_level2->has_tag('ID')){
$level2_ID = lc($feature_level2->_tag_value('ID'));
}
elsif($feature_level2->has_tag('transcript_id')){
$level2_ID = lc( $feature_level2->_tag_value('transcript_id'));
}
else{
warn "Cannot retrieve the parent feature of the following feature: ".gff_string($feature_level2);
}
print_level3_old_school( {omniscient => $omniscient, level2_ID =>$level2_ID, output => $gffout} );
if($feature_level2->has_tag('ID')){
$level2_ID = lc($feature_level2->_tag_value('ID'));
}
elsif($feature_level2->has_tag('transcript_id')){
$level2_ID = lc( $feature_level2->_tag_value('transcript_id'));
}
else{
warn "Cannot retrieve the parent feature of the following feature: ".gff_string($feature_level2);
}
print_level3_old_school( {omniscient => $omniscient, level2_ID =>$level2_ID, output => $gffout} );
}
}
}
Expand Down Expand Up @@ -336,7 +336,7 @@ sub print_omniscient_as_match{
foreach my $primary_tag_l2 (sort {$a cmp $b} keys %{$omniscient->{'level2'}}){ # primary_tag_l2 = mrna or mirna or ncrna or trna etc...

if ( exists_keys( $omniscient, ('level2', $primary_tag_l2, $id_tag_key_level1) ) ){
foreach my $feature_level2 ( sort {$a->start <=> $b->start} @{$omniscient->{'level2'}{$primary_tag_l2}{$id_tag_key_level1}}) {
foreach my $feature_level2 ( sort { ncmp ($a->start."|".$a->end.$a->_tag_value('ID'), $b->start."|".$b->end.$b->_tag_value('ID') ) } @{$omniscient->{'level2'}{$primary_tag_l2}{$id_tag_key_level1}}) {

if($primary_tag_l2 =~ "match"){
$gffout->write_feature($feature_level2);
Expand Down Expand Up @@ -429,7 +429,7 @@ sub print_omniscient_from_level1_id_list {
foreach my $primary_tag_key_level2 (keys %{$omniscient->{'level2'}}){ # primary_tag_key_level2 = mrna or mirna or ncrna or trna etc...

if ( exists ($omniscient->{'level2'}{$primary_tag_key_level2}{$id_tag_key_level1} ) ){
foreach my $feature_level2 ( @{$omniscient->{'level2'}{$primary_tag_key_level2}{$id_tag_key_level1}}) {
foreach my $feature_level2 ( sort { ncmp ($a->start."|".$a->end.$a->_tag_value('ID'), $b->start."|".$b->end.$b->_tag_value('ID') ) } @{$omniscient->{'level2'}{$primary_tag_key_level2}{$id_tag_key_level1}}) {

#_uri_encode_one_feature($feature_level2);

Expand Down
10 changes: 5 additions & 5 deletions lib/AGAT/OmniscientTool.pm
Original file line number Diff line number Diff line change
Expand Up @@ -2620,8 +2620,8 @@ sub gather_and_sort_l1_by_seq_id{
push (@{$hash_sortBySeq{$position}{$tag_level1}}, $omniscient->{'level1'}{$tag_level1}{$level1_id});
}
foreach my $position_l1 (keys %hash_sortBySeq){
@{$hash_sortBySeq{$position_l1}{$tag_level1}} = sort { ncmp ($a->start.$a->end.$a->_tag_value('ID'), $b->start.$b->end.$b->_tag_value('ID') ) } @{$hash_sortBySeq{$position_l1}{$tag_level1}};
}
@{$hash_sortBySeq{$position_l1}{$tag_level1}} = sort { ncmp ($a->start."|".$a->end.$a->_tag_value('ID'), $b->start."|".$b->end.$b->_tag_value('ID') ) } @{$hash_sortBySeq{$position_l1}{$tag_level1}};
}
}
return \%hash_sortBySeq;
}
Expand All @@ -2643,7 +2643,7 @@ sub gather_and_sort_l1_by_seq_id_for_l2type{
}
}
foreach my $position_l1 (keys %hash_sortBySeq){
@{$hash_sortBySeq{$position_l1}{$tag_level1}} = sort { ncmp ($a->start.$a->end.$a->_tag_value('ID'), $b->start.$b->end.$b->_tag_value('ID') ) } @{$hash_sortBySeq{$position_l1}{$tag_level1}};
@{$hash_sortBySeq{$position_l1}{$tag_level1}} = sort { ncmp ($a->start."|".$a->end.$a->_tag_value('ID'), $b->start."|".$b->end.$b->_tag_value('ID') ) } @{$hash_sortBySeq{$position_l1}{$tag_level1}};
}
}
return \%hash_sortBySeq;
Expand All @@ -2663,7 +2663,7 @@ sub gather_and_sort_l1_by_seq_id_for_l1type{
push (@{$hash_sortBySeq{$position}{$tag_level1}}, $omniscient->{'level1'}{$tag_level1}{$level1_id});
}
foreach my $position_l1 (keys %hash_sortBySeq){
@{$hash_sortBySeq{$position_l1}{$tag_level1}} = sort { ncmp ($a->start.$a->end.$a->_tag_value('ID'), $b->start.$b->end.$b->_tag_value('ID') ) } @{$hash_sortBySeq{$position_l1}{$tag_level1}};
@{$hash_sortBySeq{$position_l1}{$tag_level1}} = sort { ncmp ($a->start."|".$a->end.$a->_tag_value('ID'), $b->start."|".$b->end.$b->_tag_value('ID') ) } @{$hash_sortBySeq{$position_l1}{$tag_level1}};
}
}

Expand All @@ -2684,7 +2684,7 @@ sub gather_and_sort_l1_by_seq_id_and_strand{
push (@{$hash_sortBySeq{$position_l1}{$tag_level1}}, $level1_feature);
}
foreach my $position_l1 (keys %hash_sortBySeq){
@{$hash_sortBySeq{$position_l1}{$tag_level1}} = sort { ncmp ($a->start.$a->end.$a->_tag_value('ID'), $b->start.$b->end.$b->_tag_value('ID') ) } @{$hash_sortBySeq{$position_l1}{$tag_level1}};
@{$hash_sortBySeq{$position_l1}{$tag_level1}} = sort { ncmp ($a->start."|".$a->end.$a->_tag_value('ID'), $b->start."|".$b->end.$b->_tag_value('ID') ) } @{$hash_sortBySeq{$position_l1}{$tag_level1}};
}
}
return \%hash_sortBySeq;
Expand Down

0 comments on commit 703106c

Please sign in to comment.