From 25b4a2f769e0419738c73d7bd5e962edb60e0ecc Mon Sep 17 00:00:00 2001 From: Ivan Domenzain Date: Fri, 7 Feb 2020 14:26:47 +0100 Subject: [PATCH 1/5] fix: build empty KEGG database for organisms without KEGG id --- geckomat/get_enzyme_data/updateDatabases.m | 45 +++++++--------------- 1 file changed, 13 insertions(+), 32 deletions(-) diff --git a/geckomat/get_enzyme_data/updateDatabases.m b/geckomat/get_enzyme_data/updateDatabases.m index 0a1bf2363..e88f4ba0e 100644 --- a/geckomat/get_enzyme_data/updateDatabases.m +++ b/geckomat/get_enzyme_data/updateDatabases.m @@ -10,41 +10,33 @@ % % Usage: [swissprot,kegg] = updateDatabases % -% Benjamin Sanchez, Cheng Zhang, Ivan Domenzain. Last edited: 2019-07-12 +% Ivan Domenzain. Last edited: 2020-01-25 % current = pwd; cd .. parameters = getModelParameters; -keggID = parameters.keggID; cd (current) -if ~regexp(keggID,'[a-z]{3,4}') - error('Please specify the KEGG organism ID in the script getModelParameters.m') +if isfield(parameters,'keggID') + keggID = parameters.keggID; + %Download KEGG data: + mkdir ../../databases/KEGG + downloadKEGGdata(keggID) + %Build KEGG table + kegg = buildKEGGtable(keggID); + %Remove KEGG files for compliance of repository: + delete ../../databases/KEGG/*.txt + rmdir ../../databases/KEGG +else + kegg = cell(1,7); end - %Build Swissprot table: swissprot = buildSWISSPROTtable; - -%Download KEGG data: -mkdir ../../databases/KEGG -downloadKEGGdata(keggID) - -%Build KEGG table -kegg = buildKEGGtable(keggID); - -%Remove KEGG files for compliance of repository: -delete ../../databases/KEGG/*.txt -rmdir ../../databases/KEGG - %Save both databases as .mat files: save('../../databases/ProtDatabase.mat','swissprot','kegg'); - end - %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - function swissprot = buildSWISSPROTtable - %Build Swissprot table (uniprot code - protein name - gene names - EC number - sequence): fileID_uni = fopen('../../databases/uniprot.tab'); swissprot = textscan(fileID_uni,'%s %s %s %s %s','delimiter','\t'); @@ -63,19 +55,14 @@ swissprot{i,6} = sequence; end disp('Building Swiss-Prot database.') - end - %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - function downloadKEGGdata(keggID) - base = 'http://rest.kegg.jp/'; operation = 'list/'; gene_list = urlread([base operation keggID]); gene_list = regexpi(gene_list, '[^\n]+','match')'; gene_id = regexpi(gene_list,['(?<=' keggID ':)\S+'],'match'); - % Retrieve information for every gene in the list (with a maximum of 10,000 % to avoid bulk downloads) operation = 'get/'; @@ -90,13 +77,9 @@ function downloadKEGGdata(keggID) disp(['Cannot find ' gene_id{i}{1} ' in KEGG']); end end - end - %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - function kegg = buildKEGGtable(keggID) - %Build KEGG table (uniprot code - protein name - systematic gene name - EC number - MW - pathway - sequence): file_names = dir('../../databases/KEGG/'); file_names(1:2) = []; @@ -189,7 +172,5 @@ function downloadKEGGdata(keggID) disp(['Updating KEGG database: Ready with gene ' gene_name]) end kegg(n+1:end,:) = []; - end - %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% From 019940cd1bf0fa504cf185d3f0fb4935b7c8ad6f Mon Sep 17 00:00:00 2001 From: Ivan Domenzain Date: Fri, 7 Feb 2020 16:11:41 +0100 Subject: [PATCH 2/5] fix: allow empty KEGG database in getEnzymeCodes --- geckomat/get_enzyme_data/getEnzymeCodes.m | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/geckomat/get_enzyme_data/getEnzymeCodes.m b/geckomat/get_enzyme_data/getEnzymeCodes.m index 0360ba857..007038746 100644 --- a/geckomat/get_enzyme_data/getEnzymeCodes.m +++ b/geckomat/get_enzyme_data/getEnzymeCodes.m @@ -25,7 +25,7 @@ % *count(4): #other rxns % % Benjamin Sanchez. Last edited: 2017-03-05 -% Ivan Domenzain. Last edited: 2018-09-07 +% Ivan Domenzain. Last edited: 2020-02-07 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% function model_data = getEnzymeCodes(model,action) @@ -169,8 +169,11 @@ end %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% function database = standardizeDatabase(database) -for i = 1:length(database) - database{i,3} = strsplit(database{i,3},' '); +[m,~] = size(database); +if m>1 + for i = 1:length(database) + database{i,3} = strsplit(database{i,3},' '); + end end end %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% From c4b006a38c3ed42970a6af2bf2b97ec9e4ead552 Mon Sep 17 00:00:00 2001 From: Ivan Domenzain Date: Fri, 7 Feb 2020 16:35:53 +0100 Subject: [PATCH 3/5] fix: provide compatibility with an empty KEGG database --- geckomat/change_model/addProtein.m | 51 +++++++++++--------- geckomat/change_model/convertToEnzymeModel.m | 17 ++++--- geckomat/get_enzyme_data/updateDatabases.m | 2 +- 3 files changed, 38 insertions(+), 32 deletions(-) diff --git a/geckomat/change_model/addProtein.m b/geckomat/change_model/addProtein.m index f7d9b811b..9c855f889 100644 --- a/geckomat/change_model/addProtein.m +++ b/geckomat/change_model/addProtein.m @@ -1,4 +1,6 @@ -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function model = addProtein(model,P,kegg,swissprot) +%addProtein +% % model = addProtein(model,P,kegg,swissprot) % Adds an exchange reaction for protein P and updates model.enzymes, % model.MWs and model.pathways to account for P. @@ -13,9 +15,8 @@ % model Model with the added protein % % Cheng Zhang & Benjamin J. Sanchez. Last edited: 2018-05-28 -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% Ivan Domenzain. Last edited: 2020-02-07 -function model = addProtein(model,P,kegg,swissprot) %Update model.enzyme vector: prot_name = ['prot_' P]; @@ -64,27 +65,29 @@ %Update model.genes & model.pathways vectors: match_path = false; -for i = 1:length(kegg) - if strcmp(P,kegg{i,1}) - %Gene: - if ~isempty(kegg{i,3}) && ~match_gen - match_gen = true; - gene = kegg{i,3}; - end - %Pathway: - if ~isempty(kegg{i,6}) && ~match_path - match_path = true; - model.pathways{pos_e,1} = kegg{i,6}; - end - %Molecular Weight (if nothing found in uniprot): - if kegg{i,5} > 0 && ~match_MW - match_MW = true; - model.MWs(pos_e,1) = kegg{i,5}/1000; - end - %Sequence (if nothing found in uniprot): - if ~isempty(kegg{i,7}) && ~match_seq - match_seq = true; - model.sequences(pos_e,1) = kegg(i,7); +if isempty(kegg) + for i = 1:length(kegg) + if strcmp(P,kegg{i,1}) + %Gene: + if ~isempty(kegg{i,3}) && ~match_gen + match_gen = true; + gene = kegg{i,3}; + end + %Pathway: + if ~isempty(kegg{i,6}) && ~match_path + match_path = true; + model.pathways{pos_e,1} = kegg{i,6}; + end + %Molecular Weight (if nothing found in uniprot): + if kegg{i,5} > 0 && ~match_MW + match_MW = true; + model.MWs(pos_e,1) = kegg{i,5}/1000; + end + %Sequence (if nothing found in uniprot): + if ~isempty(kegg{i,7}) && ~match_seq + match_seq = true; + model.sequences(pos_e,1) = kegg(i,7); + end end end end diff --git a/geckomat/change_model/convertToEnzymeModel.m b/geckomat/change_model/convertToEnzymeModel.m index 8a65e7e42..e8f21fc21 100644 --- a/geckomat/change_model/convertToEnzymeModel.m +++ b/geckomat/change_model/convertToEnzymeModel.m @@ -1,4 +1,6 @@ -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +function eModel = convertToEnzymeModel(irrevModel,Genes,uniprots,kcats) +%convertToEnzymeModel +% % eModel = convertToEnzymeModel(model,Genes,uniprots,kcats) % Converts standard GEM to GEM accounting for enzymes as pseudo % metabolites, with -(1/kcat) as the corresponding stoich. coeffs. @@ -13,17 +15,14 @@ % eModel Modified GEM structure (1x1 struct) % % Cheng Zhang. Last edited: 2018-05-24 -% Ivan Domenzain. Last edited: 2018-09-07 % Benjamin J. Sanchez. Last edited: 2018-11-11 -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -function eModel = convertToEnzymeModel(irrevModel,Genes,uniprots,kcats) +% Ivan Domenzain. Last edited: 2020-02-07 %Load databases: data = load('../../databases/ProtDatabase.mat'); swissprot = data.swissprot; kegg = data.kegg; - +[nrows,~] = size(kegg); eModel = irrevModel; enzymes = cell(5000,1); [m,n] = size(uniprots); @@ -85,7 +84,11 @@ eModel.sequences = cell(0,1); eModel.pathways = cell(0,1); for i = 1:length(enzymes) - eModel = addProtein(eModel,enzymes{i},kegg,swissprot); + if nrows>1 + eModel = addProtein(eModel,enzymes{i},kegg,swissprot); + else + eModel = addProtein(eModel,enzymes{i},[],swissprot); + end end end diff --git a/geckomat/get_enzyme_data/updateDatabases.m b/geckomat/get_enzyme_data/updateDatabases.m index e88f4ba0e..811dfc95a 100644 --- a/geckomat/get_enzyme_data/updateDatabases.m +++ b/geckomat/get_enzyme_data/updateDatabases.m @@ -171,6 +171,6 @@ function downloadKEGGdata(keggID) kegg{n,7} = sequence; disp(['Updating KEGG database: Ready with gene ' gene_name]) end -kegg(n+1:end,:) = []; +kegg(n+1:end,:) = []; end %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% From cf547adb7e2cdb746b9323369e47925a85cd7f71 Mon Sep 17 00:00:00 2001 From: Ivan Domenzain Date: Tue, 5 Jan 2021 11:56:07 +0100 Subject: [PATCH 4/5] fix: bug in addProtein for empty KEGG ProtDatabase case --- geckomat/change_model/addProtein.m | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/geckomat/change_model/addProtein.m b/geckomat/change_model/addProtein.m index 9c855f889..64e7f6710 100644 --- a/geckomat/change_model/addProtein.m +++ b/geckomat/change_model/addProtein.m @@ -65,7 +65,7 @@ %Update model.genes & model.pathways vectors: match_path = false; -if isempty(kegg) +if ~isempty(kegg) for i = 1:length(kegg) if strcmp(P,kegg{i,1}) %Gene: From 1e89c22746a1ed7c4b69c8949846acc4b1948664 Mon Sep 17 00:00:00 2001 From: Ivan Domenzain Date: Tue, 5 Jan 2021 16:47:16 +0100 Subject: [PATCH 5/5] fix: add warning message for conflicting KEGG ids --- geckomat/get_enzyme_data/updateDatabases.m | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/geckomat/get_enzyme_data/updateDatabases.m b/geckomat/get_enzyme_data/updateDatabases.m index 811dfc95a..6f45bd998 100644 --- a/geckomat/get_enzyme_data/updateDatabases.m +++ b/geckomat/get_enzyme_data/updateDatabases.m @@ -17,18 +17,21 @@ cd .. parameters = getModelParameters; cd (current) +kegg = cell(1,7); if isfield(parameters,'keggID') keggID = parameters.keggID; - %Download KEGG data: mkdir ../../databases/KEGG - downloadKEGGdata(keggID) - %Build KEGG table - kegg = buildKEGGtable(keggID); + try + %Download KEGG data: + downloadKEGGdata(keggID) + %Build KEGG table + kegg = buildKEGGtable(keggID); + catch + warning(['Unsuccessful query for "' keggID '", check the presence of the provided ID in the KEGG database (https://www.genome.jp/kegg/catalog/org_list.html)']) + end %Remove KEGG files for compliance of repository: delete ../../databases/KEGG/*.txt rmdir ../../databases/KEGG -else - kegg = cell(1,7); end %Build Swissprot table: swissprot = buildSWISSPROTtable;