From 86a2c731400ddfad0d45c4171bcec9a221d8c313 Mon Sep 17 00:00:00 2001 From: Mirek Kratochvil Date: Mon, 10 Jun 2024 20:18:40 +0200 Subject: [PATCH] switch to pikaparser parsing, implement conversion to MATFBCModel Closes #4 --- Project.toml | 2 + src/MATFBCModels.jl | 1 + src/grr_utils.jl | 192 ++++++++++++++++++++++++++++++++++++++++++++ src/interface.jl | 6 +- src/utils.jl | 19 ----- 5 files changed, 200 insertions(+), 20 deletions(-) create mode 100644 src/grr_utils.jl diff --git a/Project.toml b/Project.toml index b99698f..37fbf09 100644 --- a/Project.toml +++ b/Project.toml @@ -7,12 +7,14 @@ version = "0.2.0" AbstractFBCModels = "5a4f3dfa-1789-40f8-8221-69268c29937c" DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" MAT = "23992714-dd62-5051-b70f-ba57cb901cac" +PikaParser = "3bbf5609-3e7b-44cd-8549-7c69f321e792" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" [compat] AbstractFBCModels = "0.3" DocStringExtensions = "0.8, 0.9" MAT = "0.10" +PikaParser = "0.6" SparseArrays = "1" Test = "1" julia = "1" diff --git a/src/MATFBCModels.jl b/src/MATFBCModels.jl index 51eab18..fb5bc8e 100644 --- a/src/MATFBCModels.jl +++ b/src/MATFBCModels.jl @@ -12,6 +12,7 @@ include("constants.jl") include("interface.jl") include("io.jl") include("utils.jl") +include("grr_utils.jl") export MATFBCModel diff --git a/src/grr_utils.jl b/src/grr_utils.jl new file mode 100644 index 0000000..3b9a5fd --- /dev/null +++ b/src/grr_utils.jl @@ -0,0 +1,192 @@ + +# note: this file is a copy from JSONFBCModels. Might be nice to have some +# kind of mechanism to keep them roughly in sync. Maybe sink to the Abstract +# interface? + +import PikaParser as PP + +""" +`PikaParser.jl` grammar for stringy GRR expressions. +""" +const grr_grammar = begin + # characters that typically form the identifiers + isident(x::Char) = + isletter(x) || + isdigit(x) || + x == '_' || + x == '-' || + x == ':' || + x == '.' 
|| + x == '@' || + x == '#' || + x == '\'' || + x == '[' || + x == ']' + + # scanner helper: index of the last item in the longest prefix of `m` whose items all satisfy `p` (0 if none) + eat(p) = m -> begin + last = 0 + for i in eachindex(m) + p(m[i]) || break + last = i + end + return last + end + + # eat one identifier-like word, but only if it is one of the keywords `w` (0 otherwise) + kws(w...) = m -> begin + last = eat(isident)(m) + m[begin:last] in w ? last : 0 + end + + PP.make_grammar( + [:expr], + PP.flatten( + Dict( + :space => PP.first(PP.scan(eat(isspace)), PP.epsilon), + :id => PP.scan(eat(isident)), + :orop => + PP.first(PP.tokens("||"), PP.token('|'), PP.scan(kws("OR", "or"))), + :andop => PP.first( + PP.tokens("&&"), + PP.token('&'), + PP.scan(kws("AND", "and")), + ), + :expr => PP.seq(:space, :orexpr, :space, PP.end_of_input), + :orexpr => PP.first( + :or => PP.seq(:andexpr, :space, :orop, :space, :orexpr), + :andexpr, + ), + :andexpr => PP.first( + :and => PP.seq(:baseexpr, :space, :andop, :space, :andexpr), + :baseexpr, + ), + :baseexpr => PP.first( + :id, + :parenexpr => PP.seq( + PP.token('('), + :space, + :orexpr, + :space, + PP.token(')'), + ), + ), + ), + Char, + ), + ) +end + +grr_grammar_open(m, _) = + m.rule == :expr ? Bool[0, 1, 0, 0] : + m.rule == :parenexpr ? Bool[0, 0, 1, 0, 0] : + m.rule in [:or, :and] ? Bool[1, 0, 0, 0, 1] : + m.rule in [:andexpr, :orexpr, :notexpr, :baseexpr] ? Bool[1] : + (false for _ in m.submatches) + +grr_grammar_fold(m, _, subvals) = + m.rule == :id ? Expr(:call, :gene, String(m.view)) : + m.rule == :and ? Expr(:call, :and, subvals[1], subvals[5]) : + m.rule == :or ? Expr(:call, :or, subvals[1], subvals[5]) : + m.rule == :parenexpr ? subvals[3] : + m.rule == :expr ? subvals[2] : isempty(subvals) ? nothing : subvals[1] + +""" +$(TYPEDSIGNATURES) + +Parses a GRR string to an `Expr`-typed gene association. The result contains +"calls" to `gene`, `and` and `or` functions that describe the association. 
+""" +function parse_gene_association(str::String)::Maybe{Expr} + all(isspace, str) && return nothing + tree = PP.parse_lex(grr_grammar, str) + match = PP.find_match_at!(tree, :expr, 1) + match > 0 || throw(DomainError(str, "cannot parse GRR")) + PP.traverse_match(tree, match, open = grr_grammar_open, fold = grr_grammar_fold) +end + +""" +$(TYPEDSIGNATURES) + +Evaluate the gene association expression with the reference values given by the +`val` function. Throws a `DomainError` if `ga` is not a valid `gene`, `and` or `or` call. +""" +function eval_gene_association(ga::Expr, val::Function)::Bool + (ga.head == :call && length(ga.args) >= 2) || + throw(DomainError(ga, "invalid gene association expr")) + if ga.args[1] == :gene && length(ga.args) == 2 + val(ga.args[2]) + elseif ga.args[1] == :and + all(eval_gene_association.(ga.args[2:end], Ref(val))) + elseif ga.args[1] == :or + any(eval_gene_association.(ga.args[2:end], Ref(val))) + else + throw(DomainError(ga, "unsupported gene association function")) + end +end + +""" +$(TYPEDSIGNATURES) + +A helper for producing predictable unique sequences. Might be faster if +compacting would be done directly in sort(). Returns the sorted, distinct elements of `x` as a new array. +""" +function sortunique(x) + o = collect(x) + sort!(o) + put = prevind(o, firstindex(o)) + for i in eachindex(o) + if put >= firstindex(o) && o[i] == o[put] + # duplicate of the last kept element; skip it + continue + else + put = nextind(o, put) + if put != i + o[put] = o[i] + end + end + end + o[begin:put] +end + +""" +$(TYPEDSIGNATURES) + +Convert the given gene association expression to DNF. 
+""" +function flatten_gene_association(ga::Expr)::A.GeneAssociationDNF + function fold_and(dnfs::Vector{Vector{Vector{String}}})::Vector{Vector{String}} + if isempty(dnfs) + [String[]] # a single empty conjunction: the neutral element when combining with `and` + else + sortunique( + sortunique(String[l; r]) for l in dnfs[1] for r in fold_and(dnfs[2:end]) + ) + end + end + + (ga.head == :call && length(ga.args) >= 2) || + throw(DomainError(ga, "invalid gene association expr")) + if ga.args[1] == :gene && length(ga.args) == 2 + [[ga.args[2]]] + elseif ga.args[1] == :and + fold_and(flatten_gene_association.(ga.args[2:end])) + elseif ga.args[1] == :or + sortunique(vcat(flatten_gene_association.(ga.args[2:end])...)) + else + throw(DomainError(ga, "unsupported gene association function")) + end +end + +""" +$(TYPEDSIGNATURES) + +Formats a DNF gene association as a `String`. A missing association (`nothing`) gives an empty string, which `parse_gene_association` reads back as `nothing`. +""" +function format_gene_association_dnf( + grr::Maybe{A.GeneAssociationDNF}; + and = " && ", + or = " || ", +)::String + return isnothing(grr) ? "" : join(("(" * join(gr, and) * ")" for gr in grr), or) +end diff --git a/src/interface.jl b/src/interface.jl index 8ca07a8..72c8315 100644 --- a/src/interface.jl +++ b/src/interface.jl @@ -140,6 +140,7 @@ function Base.convert(::Type{MATFBCModel}, m::A.AbstractFBCModel) end lb, ub = A.bounds(m) + clb, cub = A.coupling_bounds(m) return MATFBCModel( "model", # default name Dict( @@ -150,9 +151,12 @@ function Base.convert(::Type{MATFBCModel}, m::A.AbstractFBCModel) "ub" => Vector(ub), "b" => Vector(A.balance(m)), "c" => Vector(A.objective(m)), + "C" => A.coupling(m), + "cl" => Vector(clb), + "cu" => Vector(cub), "genes" => A.genes(m), "grRules" => [ - unparse_grr(A.reaction_gene_association_dnf(m, rid)) for + format_gene_association_dnf(A.reaction_gene_association_dnf(m, rid)) for rid in A.reactions(m) ], "metFormulas" => diff --git a/src/utils.jl b/src/utils.jl index ea43227..1b3a005 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,20 +1,6 @@ guesskeys(id, model) = first(intersect(keys(model.mat), getfield(key_names, id))) -function 
parse_grr(str::Maybe{String}) - isnothing(str) && return nothing - isempty(str) && return nothing - - dnf = A.GeneAssociationDNF() - for isozyme in string.(split(str, " or ")) - push!( - dnf, - string.(split(replace(isozyme, "(" => "", ")" => "", " and " => " "), " ")), - ) - end - return dnf -end - function parse_formula(x::Maybe{String}) isnothing(x) && return nothing x == "" && return nothing @@ -46,8 +32,3 @@ function unparse_formula(x::Maybe{A.MetaboliteFormula}) ks = sort(collect(keys(x))) join(k * string(x[k]) for k in ks) end - -function unparse_grr(xs::Maybe{A.GeneAssociationDNF}) - isnothing(xs) && return nothing - join((join(x, " and ") for x in xs), " or ") -end