Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add pe,sv,os from uniprot db header, evalue from blast output #519

Merged
merged 1 commit into from
Jan 20, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 60 additions & 16 deletions bin/agat_sp_manage_functional_annotation.pl
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,13 @@
my %nameBlast;
my %geneNameBlast;
my %mRNANameBlast;
my %mRNAUniprotIDFromBlast;
my %mRNAproduct;
my %geneNameGiven;
my %mRNAUniprotIDFromBlast; # Uniprot ID of the best candidate (from balst file)
my %blast_evalue; # evalue of the best candidate (from balst file)
my %blast_organism; # organism (OS) of the best candidate (from balst file)
my %blast_protEvidence; # Protein Evidence (PR) of the best candidate (from balst file)
my %blast_seqVersion; # Sequence Version (SV) of the best candidate (from balst file)
my %geneNameGiven; # Gene Name (GN) of the best candidate (from balst file)
my %duplicateNameGiven;
my %fasta_id_gn_hash = (); # JN: key: 'sp|a6w1c3|hem1_marms' , value: 'hema' etc or undef
my %l2_gn_present_hash = (); # JN: Key: 'maker-bi03_p1mp_001088f-est_gff_stringtie-gene-0.2-mrna-1', value: 'yes' or 'no'
Expand Down Expand Up @@ -347,6 +351,21 @@
create_or_append_tag($feature_level2, 'Name', $mRNANameBlast{$level2_ID});
add_attribute_to_cds($hash_omniscient, $level2_ID, 'Name', $mRNANameBlast{$level2_ID});
}
# add OS attribute
if (exists ($blast_organism{$level2_ID})) {
create_or_append_tag($feature_level2, 'os', $blast_organism{$level2_ID});
add_attribute_to_cds($hash_omniscient, $level2_ID, 'os', $blast_organism{$level2_ID});
}
# add PE attribute
if (exists ($blast_protEvidence{$level2_ID})) {
create_or_append_tag($feature_level2, 'pe', $blast_protEvidence{$level2_ID});
add_attribute_to_cds($hash_omniscient, $level2_ID, 'pe', $blast_protEvidence{$level2_ID});
}
# add SV attribute
if (exists ($blast_seqVersion{$level2_ID})) {
create_or_append_tag($feature_level2, 'sv', $blast_seqVersion{$level2_ID});
add_attribute_to_cds($hash_omniscient, $level2_ID, 'sv', $blast_seqVersion{$level2_ID});
}

#add UniprotID attribute
if (exists ($mRNAUniprotIDFromBlast{$level2_ID})) {
Expand All @@ -355,6 +374,12 @@
add_attribute_to_cds($hash_omniscient, $level2_ID, 'uniprot_id', $mRNAUniprotID);
}

#add evalue of the blast
if (exists ($blast_evalue{$level2_ID})) {
create_or_replace_tag($feature_level2, 'evalue', $blast_evalue{$level2_ID});
add_attribute_to_cds($hash_omniscient, $level2_ID, 'evalue', $blast_evalue{$level2_ID});
}

# JN: Add info on existence of GN= tag in fasta header in blast db file: gn_present=yes|no|NA
if (exists($l2_gn_present_hash{$level2_ID})) {
my $gn_status = $l2_gn_present_hash{$level2_ID};
Expand Down Expand Up @@ -802,9 +827,9 @@ sub parse_blast {
my $prot_name = $values[1]; # JN: sp|Q4FZT2|PPME1_RAT
my @prot_name_sliced = split(/\|/, $values[1]);
my $uniprot_id = $prot_name_sliced[1];
print "uniprot_id: ".$uniprot_id."\n" if ($opt_verbose);
#print "uniprot_id: ".$uniprot_id."\n" if ($opt_verbose);
my $evalue = $values[10];
print "Evalue: ".$evalue."\n" if ($opt_verbose);
#print "Evalue: ".$evalue."\n" if ($opt_verbose);

#if does not exist fill it if over the minimum evalue
if (! exists_keys(\%candidates, ($l2_name)) or @{$candidates{$l2_name}} > 3 ) { # the second one means we saved an error message as candidates we still have to try to find a proper one
Expand Down Expand Up @@ -893,8 +918,8 @@ sub parse_blast {
my $nb_desc = keys %candidates;
$ostreamLog->print( "We have $nb_desc description candidates.\n") if ($opt_verbose or $opt_output);

##################################################
####### Step 2 : go through all candidates ####### report gene name for each mRNA
##################################################
####### Step 2 : go through all candidates ####### report gene name for each mRNA
my %name_checker;
foreach my $l2 (sort keys %candidates) {
# JN: Here we need to not(?) return error above to be able to differentiate the cases without GN?
Expand All @@ -904,9 +929,15 @@ sub parse_blast {
}

#Save uniprot id of the best match
print "save for $l2 ".$candidates{$l2}[2]."\n" if ($opt_verbose);

$mRNAUniprotIDFromBlast{$l2} = $candidates{$l2}[2];
print "save for $l2 ".$candidates{$l2}[2]."\n" if ($opt_verbose);
print "save protein ID for $l2 : ".$candidates{$l2}[2]."\n" if ($opt_verbose);

#Save evalu
$blast_evalue{$l2} = $candidates{$l2}[1];
print "save blast evalue for $l2 : ".$candidates{$l2}[1]."\n" if ($opt_verbose);

# Parse header
my $header = $candidates{$l2}[0];
print "header: ".$header."\n" if ($opt_verbose);

Expand All @@ -917,6 +948,7 @@ sub parse_blast {
$theRest =~ s/\n//g;
$theRest =~ s/\r//g;
my $nameGene = undef;
print "description: ".$description."\n" if ($opt_verbose);
push ( @{ $mRNAproduct{$l2} }, $description );

#deal with the rest
Expand All @@ -925,28 +957,24 @@ sub parse_blast {
while ($theRest) {
($theRest, $tuple) = stringCatcher($theRest);
my ($type, $value) = split /=/, $tuple;
#print "$protID: type:$type --- value:$value\n";
print "$protID: type:$type --- value:$value\n" if ($opt_verbose);
$hash_rest{lc($type)} = $value;
}

if (exists($hash_rest{"gn"})) {
$nameGene = $hash_rest{"gn"};


if (exists_keys ($hash_mRNAGeneLink, ($l2)) ) {
# Save mRNA name into mRNA features
$mRNANameBlast{$l2} = $nameGene;

my $geneID = $hash_mRNAGeneLink->{$l2};

# Save mRNA names into gene features (a list because we can have several gene names if mRNA isoorms were refering to different gene names)
# Save mRNA names into gene features (a list because we can have several gene names if mRNA isoforms were refering to different gene names)
if (! exists_keys(\%name_checker,(lc($geneID),lc($nameGene) ))){
push ( @{ $geneNameBlast{lc($geneID)} }, $nameGene );
$name_checker{lc($geneID)}{lc($nameGene)}++
}
else{
$name_checker{lc($geneID)}{lc($nameGene)}++
}
$name_checker{lc($geneID)}{lc($nameGene)}++;

}
else {
Expand All @@ -956,6 +984,21 @@ sub parse_blast {
else {
$ostreamLog->print( "No gene name (GN) tag found in the header: $header\n") if ($opt_verbose or $opt_output);
}

# catch organism
if (exists($hash_rest{"os"})) {
$blast_organism{$l2} = $hash_rest{"os"};
}

# catch Protein evidence
if (exists($hash_rest{"pe"})) {
$blast_protEvidence{$l2} = $hash_rest{"pe"};
}

# catch Sequence Version
if (exists($hash_rest{"sv"})) {
$blast_seqVersion{$l2} = $hash_rest{"sv"};
}
}
else {
$ostreamLog->print( "Header from the db fasta file doesn't match the regular expression: $header\n") if ($opt_verbose or $opt_output);
Expand Down Expand Up @@ -992,6 +1035,7 @@ sub stringCatcher {
return ($newString, $1);
}
else {
$String =~ s/^\s+|\s+$//g; # Removes leading and trailing spaces
return (undef, $String);
}
}
Expand Down Expand Up @@ -1212,7 +1256,7 @@ =head1 OPTIONS
=item B<--pe>

Integer - The PE (protein existence) in the uniprot header indicates the type of evidence that supports the existence of the protein.
You can decide until which protein existence level you want to consider to lift the finctional information. Default 5.
You can decide until which protein existence level you want to consider to lift the functional information. Default 5.

1. Experimental evidence at protein level
2. Experimental evidence at transcript level
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,23 +8,23 @@
@002413F|arrow|arrow maker five_prime_UTR 21263 21283 . + . ID=maker-@002413F|arrow|arrow-exonerate_est2genome-gene-0.0-mRNA-1:five_prime_utr;Parent=maker-@002413F|arrow|arrow-exonerate_est2genome-gene-0.0-mRNA-1
@002413F|arrow|arrow maker three_prime_UTR 23400 23877 . + . ID=maker-@002413F|arrow|arrow-exonerate_est2genome-gene-0.0-mRNA-1:three_prime_utr;Parent=maker-@002413F|arrow|arrow-exonerate_est2genome-gene-0.0-mRNA-1
@002413F|arrow|arrow maker gene 25567 26182 . + . ID=maker-@002413F|arrow|arrow-exonerate_est2genome-gene-0.1;Name=POL
@002413F|arrow|arrow maker mRNA 25567 26182 2523 + . ID=maker-@002413F|arrow|arrow-exonerate_est2genome-gene-0.1-mRNA-1;Parent=maker-@002413F|arrow|arrow-exonerate_est2genome-gene-0.1;Dbxref=CDD:cd06093;Name=POL;product=Retrovirus-related Pol polyprotein from transposon 412;uniprot_id=P10394
@002413F|arrow|arrow maker mRNA 25567 26182 2523 + . ID=maker-@002413F|arrow|arrow-exonerate_est2genome-gene-0.1-mRNA-1;Parent=maker-@002413F|arrow|arrow-exonerate_est2genome-gene-0.1;Dbxref=CDD:cd06093;Name=POL;evalue=4e-12;os=Drosophila melanogaster;pe=4;product=Retrovirus-related Pol polyprotein from transposon 412;sv=1;uniprot_id=P10394
@002413F|arrow|arrow maker exon 25567 25720 . + . ID=maker-@002413F|arrow|arrow-exonerate_est2genome-gene-0.1-mRNA-1:1;Parent=maker-@002413F|arrow|arrow-exonerate_est2genome-gene-0.1-mRNA-1
@002413F|arrow|arrow maker exon 25824 26182 . + . ID=maker-@002413F|arrow|arrow-exonerate_est2genome-gene-0.1-mRNA-1:2;Parent=maker-@002413F|arrow|arrow-exonerate_est2genome-gene-0.1-mRNA-1
@002413F|arrow|arrow maker CDS 25663 25720 . + 0 ID=maker-@002413F|arrow|arrow-exonerate_est2genome-gene-0.1-mRNA-1:cds;Parent=maker-@002413F|arrow|arrow-exonerate_est2genome-gene-0.1-mRNA-1
@002413F|arrow|arrow maker CDS 25824 26134 . + 2 ID=maker-@002413F|arrow|arrow-exonerate_est2genome-gene-0.1-mRNA-1:cds;Parent=maker-@002413F|arrow|arrow-exonerate_est2genome-gene-0.1-mRNA-1
@002413F|arrow|arrow maker five_prime_UTR 25567 25662 . + . ID=maker-@002413F|arrow|arrow-exonerate_est2genome-gene-0.1-mRNA-1:five_prime_utr;Parent=maker-@002413F|arrow|arrow-exonerate_est2genome-gene-0.1-mRNA-1
@002413F|arrow|arrow maker three_prime_UTR 26135 26182 . + . ID=maker-@002413F|arrow|arrow-exonerate_est2genome-gene-0.1-mRNA-1:three_prime_utr;Parent=maker-@002413F|arrow|arrow-exonerate_est2genome-gene-0.1-mRNA-1
@002413F|arrow|arrow maker gene 36917 37480 . - . ID=snap_masked-@002413F|arrow|arrow-processed-gene-0.4;Name=POL,metN1
@002413F|arrow|arrow maker mRNA 36917 37460 . - . ID=snap_masked-@002413F|arrow|arrow-processed-gene-0.4-mRNA-1;Parent=snap_masked-@002413F|arrow|arrow-processed-gene-0.4;Dbxref=PANTHER:PTHR24559:SF193,PANTHER:PTHR24559;Name=POL;product=Retrovirus-related Pol polyprotein from transposon 412;uniprot_id=P10394
@002413F|arrow|arrow maker mRNA 36917 37460 . - . ID=snap_masked-@002413F|arrow|arrow-processed-gene-0.4-mRNA-1;Parent=snap_masked-@002413F|arrow|arrow-processed-gene-0.4;Dbxref=PANTHER:PTHR24559:SF193,PANTHER:PTHR24559;Name=POL;evalue=4e-12;os=Drosophila melanogaster;pe=4;product=Retrovirus-related Pol polyprotein from transposon 412;sv=1;uniprot_id=P10394
@002413F|arrow|arrow maker exon 36917 37460 . - . ID=snap_masked-@002413F|arrow|arrow-processed-gene-0.4-mRNA-1:1;Parent=snap_masked-@002413F|arrow|arrow-processed-gene-0.4-mRNA-1
@002413F|arrow|arrow maker CDS 36917 37402 . - 0 ID=snap_masked-@002413F|arrow|arrow-processed-gene-0.4-mRNA-1:cds;Parent=snap_masked-@002413F|arrow|arrow-processed-gene-0.4-mRNA-1
@002413F|arrow|arrow maker five_prime_UTR 37403 37460 . - . ID=snap_masked-@002413F|arrow|arrow-processed-gene-0.4-mRNA-1:five_prime_utr;Parent=snap_masked-@002413F|arrow|arrow-processed-gene-0.4-mRNA-1
@002413F|arrow|arrow maker mRNA 36917 37470 . - . ID=snap_masked-@002413F|arrow|arrow-processed-gene-0.4-mRNA-2;Parent=snap_masked-@002413F|arrow|arrow-processed-gene-0.4;Name=pol;product=Retrovirus-related Pol polyprotein from transposon 17.6;uniprot_id=P04323
@002413F|arrow|arrow maker mRNA 36917 37470 . - . ID=snap_masked-@002413F|arrow|arrow-processed-gene-0.4-mRNA-2;Parent=snap_masked-@002413F|arrow|arrow-processed-gene-0.4;Name=pol;evalue=4e-12;os=Drosophila melanogaster;pe=4;product=Retrovirus-related Pol polyprotein from transposon 17.6;sv=1;uniprot_id=P04323
@002413F|arrow|arrow maker exon 36917 37470 . - . ID=snap_masked-@002413F|arrow|arrow-processed-gene-0.4-mRNA-2:1;Parent=snap_masked-@002413F|arrow|arrow-processed-gene-0.4-mRNA-2
@002413F|arrow|arrow maker CDS 36917 37402 . - 0 ID=snap_masked-@002413F|arrow|arrow-processed-gene-0.4-mRNA-2:cds;Parent=snap_masked-@002413F|arrow|arrow-processed-gene-0.4-mRNA-2
@002413F|arrow|arrow maker five_prime_UTR 37403 37470 . - . ID=snap_masked-@002413F|arrow|arrow-processed-gene-0.4-mRNA-2:five_prime_utr;Parent=snap_masked-@002413F|arrow|arrow-processed-gene-0.4-mRNA-2
@002413F|arrow|arrow maker mRNA 36917 37480 . - . ID=snap_masked-@002413F|arrow|arrow-processed-gene-0.4-mRNA-3;Parent=snap_masked-@002413F|arrow|arrow-processed-gene-0.4;Name=metN1;product=Methionine import ATP-binding protein MetN 1;uniprot_id=Q5HRU5
@002413F|arrow|arrow maker mRNA 36917 37480 . - . ID=snap_masked-@002413F|arrow|arrow-processed-gene-0.4-mRNA-3;Parent=snap_masked-@002413F|arrow|arrow-processed-gene-0.4;Name=metN1;evalue=4e-12;os=Staphylococcus epidermidis (strain ATCC 35984 / RP62A);pe=3;product=Methionine import ATP-binding protein MetN 1;sv=1;uniprot_id=Q5HRU5
@002413F|arrow|arrow maker exon 36917 37480 . - . ID=snap_masked-@002413F|arrow|arrow-processed-gene-0.4-mRNA-3:1;Parent=snap_masked-@002413F|arrow|arrow-processed-gene-0.4-mRNA-3
@002413F|arrow|arrow maker CDS 36917 37402 . - 0 ID=snap_masked-@002413F|arrow|arrow-processed-gene-0.4-mRNA-3:cds;Parent=snap_masked-@002413F|arrow|arrow-processed-gene-0.4-mRNA-3
@002413F|arrow|arrow maker five_prime_UTR 37403 37480 . - . ID=snap_masked-@002413F|arrow|arrow-processed-gene-0.4-mRNA-3:five_prime_utr;Parent=snap_masked-@002413F|arrow|arrow-processed-gene-0.4-mRNA-3