diff --git a/src/fst/filters/Makefile.am b/src/fst/filters/Makefile.am index 2c3d5d92..60e51967 100644 --- a/src/fst/filters/Makefile.am +++ b/src/fst/filters/Makefile.am @@ -22,13 +22,23 @@ GIELLA_FILTER_LOCAL_REGEX_SRCS=\ downcase_UCletters.regex \ remove-norm-comp-tags.regex \ + remove-rel_focus-strings.regex \ remove-diaereses-enhancement.regex \ + block-illegal_compound-strings.regex \ + insert-default-compounding-tags.regex \ + insert-default_left_compounding-tags.regex \ remove-homonymy-tags.regex \ remove-dialect-tags.regex \ remove-usage-tags.regex \ remove-derivation-position-tags.regex \ + remove-illegal-derivation-strings.regex \ + remove-illegal-derivation-strings-flagbased.regex \ rename-POS_before_Der-tags.regex \ - remove-DNorm-tags.regex + split-CmpN-tags.regex \ + split-CmpNP-tags.regex \ + remove-DNorm-tags.regex \ + convert_to_flags-CmpNP-tags.regex \ + change-optionally-word_boundary_to_hyphen.regex # List any local filter xfscript files here: GIELLA_FILTER_LOCAL_XFSCRIPT_SRCS= diff --git a/src/fst/filters/block-illegal_compound-strings.regex b/src/fst/filters/block-illegal_compound-strings.regex new file mode 100644 index 00000000..28913be5 --- /dev/null +++ b/src/fst/filters/block-illegal_compound-strings.regex @@ -0,0 +1,33 @@ +! Divvun & Giellatekno - open source grammars for Sámi and other languages +! Copyright © 2000-2010 The University of Tromsø & the Norwegian Sámi Parliament +! http://giellatekno.uit.no & http://divvun.no +! +! This program is free software. You can redistribute and/or modify +! this file under the terms of the GNU General Public License as published by +! the Free Software Foundation, either version 3 of the License, or +! (at your option) any later version. The GNU General Public License +! is found at http://www.gnu.org/licenses/gpl.html. It is +! also available in the file $GTHOME/LICENSE.txt. +! +! Other licensing options are available upon request, please contact +! 
giellatekno@hum.uit.no or feedback@divvun.no + +# This filter converts compounding tags into flag diacritics that block +# strings with illegal compounding-tag combinations. +[ +! Convert normative tags to positive reset flags: + "@P.CmpN.SgN@" <- "+CmpN/SgN" , + "@P.CmpN.SgG@" <- "+CmpN/SgG" , + "@P.CmpN.PlG@" <- "+CmpN/PlG" , + +! Convert descriptive tags to require flags to match the set above: + [ "@R.CmpN.SgN@" "@P.Want_Left.FALSE@" | "@C.CmpN@" "@P.CmpN_Left.SgNom@" "@P.Want_Left.TRUE@" ] <- "+Cmp/SgNom" , + [ "@R.CmpN.SgG@" "@P.Want_Left.FALSE@" | "@C.CmpN@" "@P.CmpN_Left.SgGen@" "@P.Want_Left.TRUE@" ] <- "+Cmp/SgGen" , + [ "@R.CmpN.PlG@" "@P.Want_Left.FALSE@" | "@C.CmpN@" "@P.CmpN_Left.PlGen@" "@P.Want_Left.TRUE@" ] <- "+Cmp/PlGen" , + +! Convert normative left-governing tags: +"@U.CmpN_Left.SgNom@" "@C.CmpN_Left@" <- "+CmpN/SgNomLeft" , +"@U.CmpN_Left.SgGen@" "@C.CmpN_Left@" <- "+CmpN/SgGenLeft" , +"@U.CmpN_Left.PlGen@" "@C.CmpN_Left@" <- "+CmpN/PlGenLeft" + + ]; diff --git a/src/fst/filters/change-optionally-word_boundary_to_hyphen.regex b/src/fst/filters/change-optionally-word_boundary_to_hyphen.regex new file mode 100644 index 00000000..bb46f7e3 --- /dev/null +++ b/src/fst/filters/change-optionally-word_boundary_to_hyphen.regex @@ -0,0 +1,20 @@ +! Divvun & Giellatekno - open source grammars for Sámi and other languages +! Copyright © 2000-2010 The University of Tromsø & the Norwegian Sámi Parliament +! http://giellatekno.uit.no & http://divvun.no +! +! This program is free software. You can redistribute and/or modify +! this file under the terms of the GNU General Public License as published by +! the Free Software Foundation, either version 3 of the License, or +! (at your option) any later version. The GNU General Public License +! is found at http://www.gnu.org/licenses/gpl.html. It is +! also available in the file $GTHOME/LICENSE.txt. +! +! Other licensing options are available upon request, please contact +! 
giellatekno@hum.uit.no or feedback@divvun.no + +# This filter changes # optionally into - but only for lexicalised compounds. +# The dynamic ones should be caught by the regular compounding system, and +# should get a tag when hyphenated at the compound border. + + %# (->) %- "@C.LexComp@" || \[ "😱" | "-" ] _ ; + \ No newline at end of file diff --git a/src/fst/filters/convert_to_flags-CmpNP-tags.regex b/src/fst/filters/convert_to_flags-CmpNP-tags.regex new file mode 100644 index 00000000..95915d66 --- /dev/null +++ b/src/fst/filters/convert_to_flags-CmpNP-tags.regex @@ -0,0 +1,27 @@ +# Divvun & Giellatekno - open source grammars for Sámi and other languages +# Copyright © 2000-2015 The University of Tromsø & the Norwegian Sámi Parliament +# http://giellatekno.uit.no & http://divvun.no +# +# This program is free software; you can redistribute and/or modify +# this file under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. The GNU General Public License +# is found at http://www.gnu.org/licenses/gpl.html. It is +# also available in the file $GTHOME/LICENSE.txt. +# +# Other licensing options are available upon request, please contact +# giellatekno@uit.no or divvun@uit.no + +# This filter converts single CmpNP tags into corresponding flag diacritics. +# This is the second step of two in converting CmpNP tags into flag diacritics +# that will regulate compounding behaviour in fst-based spellers. 
+ +[ + "@U.CmpFrst.TRUE@" <- "+CmpNP/First", + "@U.CmpPref.TRUE@" <- "+CmpNP/Pref" , + "@P.CmpLast.TRUE@" <- "+CmpNP/Last" , + [ "@P.CmpOnly.FALSE@" "@P.CmpLast.TRUE@" + | "@R.CmpOnly.TRUE@" "@P.CmpLast.TRUE@" ] <- "+CmpNP/Suff" , + [ "@P.CmpOnly.FALSE@" | "@R.CmpOnly.TRUE@" ] <- "+CmpNP/Only" , + "@U.CmpNone.TRUE@" <- "+CmpNP/None" +]; diff --git a/src/fst/filters/insert-default-compounding-tags.regex b/src/fst/filters/insert-default-compounding-tags.regex new file mode 100644 index 00000000..4fad2b10 --- /dev/null +++ b/src/fst/filters/insert-default-compounding-tags.regex @@ -0,0 +1,86 @@ +! Divvun & Giellatekno - open source grammars for Sámi and other languages +! Copyright © 2000-2010 The University of Tromsø & the Norwegian Sámi Parliament +! http://giellatekno.uit.no & http://divvun.no +! +! This program is free software. You can redistribute and/or modify +! this file under the terms of the GNU General Public License as published by +! the Free Software Foundation, either version 3 of the License, or +! (at your option) any later version. The GNU General Public License +! is found at http://www.gnu.org/licenses/gpl.html. It is +! also available in the file $GTHOME/LICENSE.txt. +! +! Other licensing options are available upon request, please contact +! giellatekno@hum.uit.no or feedback@divvun.no + +# This filter inserts the default compounding tag +CmpN/SgN where a +N or +A +# tag is not already preceded by an explicit +CmpN/ tag, in the two contexts +# defined below. +# See smX-lex.txt for more documentation. +# +# The filter should be applied on top of the nonrec transducer. + +# Insert default compounding tag if there is no compounding tag before the +# POS tag, possibly with an intervening OLang/XXX tag in between (the semantic +# tag has been removed in an earlier step): + +# All contexts: block insertion if there is an OLang or a CmpN tag to the left. + +# First context: bare POS tag: + "+CmpN/SgN" <- [. 0 .] || [ .#. 
| %- | "+Cmp" | %# ] # SNM 2.5.2018: Added + # barriers to restrict scope of + added further below + \[ "+CmpN/SgN" + | "+CmpN/SgG" + | "+CmpN/PlN" + | "+CmpN/PlG" + | "+CmpN/SgNomLeft" + | "+CmpN/SgGenLeft" + | "+CmpN/PlGenLeft" + | "+CmpNP/All" + | "+CmpNP/First" + | "+CmpNP/Pref" + | "+CmpNP/Last" + | "+CmpNP/Suff" + | "+CmpNP/None" + | "+CmpNP/Only" + ]+ # SNM 2.5.2018: Added + to allow hypmin flag diacritics + _ + [ "+N" | "+A" ] + +.o. + +# Second context: CmpNP + POS: + "+CmpN/SgN" <- [. 0 .] || \[ "+CmpN/SgN" + | "+CmpN/SgG" + | "+CmpN/PlN" + | "+CmpN/PlG" + | "+CmpN/SgNomLeft" + | "+CmpN/SgGenLeft" + | "+CmpN/PlGenLeft" + | "+CmpNP/All" + | "+CmpNP/First" + | "+CmpNP/Pref" + | "+CmpNP/Last" + | "+CmpNP/Suff" + | "+CmpNP/None" + | "+CmpNP/Only" + ] + _ + [ "+CmpNP/All" + | "+CmpNP/First" + | "+CmpNP/Pref" + | "+CmpNP/Last" + | "+CmpNP/Suff" + | "+CmpNP/None" + | "+CmpNP/Only" + | "+CmpN/SgNomLeft" + | "+CmpN/SgGenLeft" + | "+CmpN/PlGenLeft" + ]+ + ? # SNM 2.5.2018: potential hypmin flag diacritic; NOTE(review): a bare ? requires exactly one symbol here, (?) would make it optional - confirm intent + + [ "+N" | "+A" ] + +; + +# In all other contexts, there is already a +CmpN/ tag, and thus we do NOT want +# to insert the default tag. diff --git a/src/fst/filters/insert-default_left_compounding-tags.regex b/src/fst/filters/insert-default_left_compounding-tags.regex new file mode 100644 index 00000000..f1109de1 --- /dev/null +++ b/src/fst/filters/insert-default_left_compounding-tags.regex @@ -0,0 +1,56 @@ +! Divvun & Giellatekno - open source grammars for Sámi and other languages +! Copyright © 2000-2010 The University of Tromsø & the Norwegian Sámi Parliament +! http://giellatekno.uit.no & http://divvun.no +! +! This program is free software. You can redistribute and/or modify +! this file under the terms of the GNU General Public License as published by +! the Free Software Foundation, either version 3 of the License, or +! (at your option) any later version. The GNU General Public License +! is found at http://www.gnu.org/licenses/gpl.html. It is +! 
also available in the file $GTHOME/LICENSE.txt. +! +! Other licensing options are available upon request, please contact +! giellatekno@hum.uit.no or feedback@divvun.no + +! Insert a Left-compound blocking flag diacritic for everything not explicitly +! marked for Left governing compounding. + + +! First context: no CmpN Left tag: +"@U.Want_Left.FALSE@" <- [..] || %# + _ + + \[ "+N" | "+A" | "+PrsPrc" | %# + | "+CmpN/SgNomLeft" + | "+CmpN/SgGenLeft" + | "+CmpN/PlGenLeft" + ]+ + + [ "+N" | "+A" | "+PrsPrc" ] + +.o. + +! Second context: CmpN Left tag immediately before POS tag - insert a flag +! to block regular CmpN compounding: +"@U.Want_Left.TRUE@" <- [..] || %# + _ + + \[ "+N" | "+A" | "+PrsPrc" | %# + | "+CmpN/SgNomLeft" + | "+CmpN/SgGenLeft" + | "+CmpN/PlGenLeft" + ]+ + + [ "+CmpN/SgNomLeft" + | "+CmpN/SgGenLeft" + | "+CmpN/PlGenLeft" + ]^{1,3} + + [ "+N" | "+A" | "+PrsPrc" ] +; + +# In all other contexts, there is already a +CmpN/ tag, and thus we do NOT want +# to insert the default tag. + +# Insert default left-compounding tags if there is no such tag before the +# semantic tag and after the regular compounding tags: diff --git a/src/fst/filters/remove-illegal-derivation-strings-flagbased.regex b/src/fst/filters/remove-illegal-derivation-strings-flagbased.regex new file mode 100644 index 00000000..550b15db --- /dev/null +++ b/src/fst/filters/remove-illegal-derivation-strings-flagbased.regex @@ -0,0 +1,32 @@ +! Divvun & Giellatekno - open source grammars for Sámi and other languages +! Copyright © 2000-2010 The University of Tromsø & the Norwegian Sámi Parliament +! http://giellatekno.uit.no & http://divvun.no +! +! This program is free software. You can redistribute and/or modify +! this file under the terms of the GNU General Public License as published by +! the Free Software Foundation, either version 3 of the License, or +! (at your option) any later version. The GNU General Public License +! is found at http://www.gnu.org/licenses/gpl.html. It is +! 
also available in the file $GTHOME/LICENSE.txt. +! +! Other licensing options are available upon request, please contact +! giellatekno@hum.uit.no or feedback@divvun.no + +! This filter blocks strings with illegal derivational-tag sequences by putting +! D (disallow) and P (set) flag diacritics in front of each tag, as defined below. +! Further, we check that we don't have double passives. +! See https://giellalt.uit.no/lang/sme/root-morphology.html for more documentation. + +! Block illegal derivations - keep the tags for general weight assignment to DerN tags at a later stage: +"@D.Der1.TRUE@" "@D.Der2.TRUE@" "@D.Der3.TRUE@" "@D.Der4.TRUE@" "@D.Der5.TRUE@" "@P.Der1.TRUE@" "+Der1" <- "+Der1" , + "@D.Der2.TRUE@" "@D.Der3.TRUE@" "@D.Der4.TRUE@" "@D.Der5.TRUE@" "@P.Der2.TRUE@" "+Der2" <- "+Der2" , + "@D.Der3.TRUE@" "@D.Der4.TRUE@" "@D.Der5.TRUE@" "@P.Der3.TRUE@" "+Der3" <- "+Der3" , + "@D.Der4.TRUE@" "@D.Der5.TRUE@" "@P.Der4.TRUE@" "+Der4" <- "+Der4" , + "@D.Der5.TRUE@" "@P.Der5.TRUE@" "+Der5" <- "+Der5" , + +! Block double passives: +"@D.Der_PassS.TRUE@" "@D.Der_PassL.TRUE@" "@P.Der_PassS.TRUE@" "+Der/PassS" <- "+Der/PassS" , +"@D.Der_PassS.TRUE@" "@D.Der_PassL.TRUE@" "@P.Der_PassL.TRUE@" "+Der/PassL" <- "+Der/PassL" , + +! Clear all derivation flags (C) when passing a word boundary (%#): +"@C.Der1@" "@C.Der2@" "@C.Der3@" "@C.Der4@" "@C.Der5@" "@C.Der_PassS@" "@C.Der_PassL@" %# <- %# ; diff --git a/src/fst/filters/remove-illegal-derivation-strings.regex b/src/fst/filters/remove-illegal-derivation-strings.regex new file mode 100644 index 00000000..fff64729 --- /dev/null +++ b/src/fst/filters/remove-illegal-derivation-strings.regex @@ -0,0 +1,39 @@ +! Divvun & Giellatekno - open source grammars for Sámi and other languages +! Copyright © 2000-2010 The University of Tromsø & the Norwegian Sámi Parliament +! http://giellatekno.uit.no & http://divvun.no +! +! This program is free software. You can redistribute and/or modify +! this file under the terms of the GNU General Public License as published by +! 
the Free Software Foundation, either version 3 of the License, or +! (at your option) any later version. The GNU General Public License +! is found at http://www.gnu.org/licenses/gpl.html. It is +! also available in the file $GTHOME/LICENSE.txt. +! +! Other licensing options are available upon request, please contact +! giellatekno@hum.uit.no or feedback@divvun.no + +# This filter removes all strings in which a +DerN tag is followed, anywhere +# later in the string, by a +DerM tag with M <= N, as listed below. +# Further, it removes strings that combine +Der/PassS and +Der/PassL. +# See smX-lex.txt for more documentation. +# +# The filter should be applied on top of the nonrec transducer. + +~[ $[ "+Der1" ?* "+Der1" ] + | $[ "+Der2" ?* "+Der1" ] + | $[ "+Der2" ?* "+Der2" ] + | $[ "+Der3" ?* "+Der1" ] + | $[ "+Der3" ?* "+Der2" ] + | $[ "+Der3" ?* "+Der3" ] + | $[ "+Der4" ?* "+Der1" ] + | $[ "+Der4" ?* "+Der2" ] + | $[ "+Der4" ?* "+Der3" ] + | $[ "+Der4" ?* "+Der4" ] + | $[ "+Der5" ?* "+Der1" ] + | $[ "+Der5" ?* "+Der2" ] + | $[ "+Der5" ?* "+Der3" ] + | $[ "+Der5" ?* "+Der4" ] + | $[ "+Der5" ?* "+Der5" ] + | $[ "+Der/PassS" ?* "+Der/PassL" ] + | $[ "+Der/PassL" ?* "+Der/PassS" ] + ] ; diff --git a/src/fst/filters/remove-rel_focus-strings.regex b/src/fst/filters/remove-rel_focus-strings.regex new file mode 100644 index 00000000..2db482a6 --- /dev/null +++ b/src/fst/filters/remove-rel_focus-strings.regex @@ -0,0 +1,29 @@ +! Divvun & Giellatekno - open source grammars for Sámi and other languages +! Copyright © 2000-2010 The University of Tromsø & the Norwegian Sámi Parliament +! http://giellatekno.uit.no & http://divvun.no +! +! This program is free software; you can redistribute and/or modify +! this file under the terms of the GNU General Public License as published by +! the Free Software Foundation, either version 3 of the License, or +! (at your option) any later version. The GNU General Public License +! is found at http://www.gnu.org/licenses/gpl.html. It is +! also available in the file $GTHOME/LICENSE.txt. +! +! 
Other licensing options are available upon request, please contact +! giellatekno@hum.uit.no or feedback@divvun.no + +# This filter removes Rel-Foc combinations (tags are quoted: a bare / is the ignore operator). + +~[ $[ "+Rel" ?* "+Qst" ] + | $[ "+Rel" ?* "+Foc/ge" ] + | $[ "+Rel" ?* "+Foc/gen" ] + | $[ "+Rel" ?* "+Foc/ges" ] + | $[ "+Rel" ?* "+Foc/gis" ] + | $[ "+Rel" ?* "+Foc/naj" ] + | $[ "+Rel" ?* "+Foc/ba" ] + | $[ "+Rel" ?* "+Foc/be" ] + | $[ "+Rel" ?* "+Foc/hal" ] + | $[ "+Rel" ?* "+Foc/han" ] + | $[ "+Rel" ?* "+Foc/bat" ] + | $[ "+Rel" ?* "+Foc/son" ] + ] ; diff --git a/src/fst/filters/split-CmpN-tags.regex b/src/fst/filters/split-CmpN-tags.regex new file mode 100644 index 00000000..ccf6fcb3 --- /dev/null +++ b/src/fst/filters/split-CmpN-tags.regex @@ -0,0 +1,39 @@ +# Divvun & Giellatekno - open source grammars for Sámi and other languages +# Copyright © 2000-2015 The University of Tromsø & the Norwegian Sámi Parliament +# http://giellatekno.uit.no & http://divvun.no +# +# This program is free software; you can redistribute and/or modify +# this file under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. The GNU General Public License +# is found at http://www.gnu.org/licenses/gpl.html. It is +# also available in the file $GTHOME/LICENSE.txt. +# +# Other licensing options are available upon request, please contact +# giellatekno@uit.no or divvun@uit.no + +# This filter splits a row of CmpN tags into parallel paths with single tags. +# This is the first step of two in converting CmpN tags into flag diacritics +# that will regulate compounding behaviour in fst-based spellers. + +# The split is done independently for the regular CmpN tags and the CmpNLeft +# tags (there are no rules for CmpNDef tags in this file). + +! Only +CmpN/ tags, no Left or Def: +[ [ "+CmpN/SgN" | "+CmpN/SgG" ] <- "+CmpN/SgN" "+CmpN/SgG" , + [ "+CmpN/SgN" | "+CmpN/PlG" ] <- "+CmpN/SgN" "+CmpN/PlG" , + [ "+CmpN/SgG" | "+CmpN/PlG" ] <- "+CmpN/SgG" "+CmpN/PlG" , + +! 
Left tags: + [ "+CmpN/SgNomLeft" | "+CmpN/SgGenLeft" ] <- "+CmpN/SgNomLeft" "+CmpN/SgGenLeft" , + [ "+CmpN/SgNomLeft" | "+CmpN/PlGenLeft" ] <- "+CmpN/SgNomLeft" "+CmpN/PlGenLeft" , + [ "+CmpN/SgGenLeft" | "+CmpN/PlGenLeft" ] <- "+CmpN/SgGenLeft" "+CmpN/PlGenLeft" +] + +.o. + +[ [ "+CmpN/SgN" | "+CmpN/SgG" | "+CmpN/PlG" ] <- + "+CmpN/SgN" "+CmpN/SgG" "+CmpN/PlG" , + [ "+CmpN/SgNomLeft" | "+CmpN/SgGenLeft" | "+CmpN/PlGenLeft" ] <- + "+CmpN/SgNomLeft" "+CmpN/SgGenLeft" "+CmpN/PlGenLeft" +]; diff --git a/src/fst/filters/split-CmpNP-tags.regex b/src/fst/filters/split-CmpNP-tags.regex new file mode 100644 index 00000000..40ada9c1 --- /dev/null +++ b/src/fst/filters/split-CmpNP-tags.regex @@ -0,0 +1,25 @@ +# Divvun & Giellatekno - open source grammars for Sámi and other languages +# Copyright © 2000-2015 The University of Tromsø & the Norwegian Sámi Parliament +# http://giellatekno.uit.no & http://divvun.no +# +# This program is free software; you can redistribute and/or modify +# this file under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. The GNU General Public License +# is found at http://www.gnu.org/licenses/gpl.html. It is +# also available in the file $GTHOME/LICENSE.txt. +# +# Other licensing options are available upon request, please contact +# giellatekno@uit.no or divvun@uit.no + +# This filter splits a +CmpNP/First +CmpNP/Last tag pair into parallel paths with single tags. +# This is the first step of two in converting CmpNP tags into flag diacritics +# that will regulate compounding behaviour in fst-based spellers. + +~$[ "+CmpNP/First" "+CmpNP/Last" ] + +.o. + +[ "+CmpNP/Last" (<-) "+CmpNP/First" "+CmpNP/Last", + "+CmpNP/First" (<-) "+CmpNP/First" "+CmpNP/Last" ] +;