Skip to content

Commit

Permalink
Add filters parallel to lang-mdf/src/fst/filters/Makefile.am.
Browse files Browse the repository at this point in the history
  • Loading branch information
rueter committed Feb 22, 2024
1 parent e6ee1ae commit 3dac3e8
Show file tree
Hide file tree
Showing 11 changed files with 397 additions and 1 deletion.
12 changes: 11 additions & 1 deletion src/fst/filters/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,23 @@
# Local filter regex sources. Every entry except the last one must end in a
# line-continuation backslash: an entry without it terminates the variable and
# leaves the following file names outside the list. Each file is listed once.
GIELLA_FILTER_LOCAL_REGEX_SRCS=\
downcase_UCletters.regex \
remove-norm-comp-tags.regex \
remove-rel_focus-strings.regex \
remove-diaereses-enhancement.regex \
block-illegal_compound-strings.regex \
insert-default-compounding-tags.regex \
insert-default_left_compounding-tags.regex \
remove-homonymy-tags.regex \
remove-dialect-tags.regex \
remove-usage-tags.regex \
remove-derivation-position-tags.regex \
remove-illegal-derivation-strings.regex \
remove-illegal-derivation-strings-flagbased.regex \
rename-POS_before_Der-tags.regex \
split-CmpN-tags.regex \
split-CmpNP-tags.regex \
remove-DNorm-tags.regex \
convert_to_flags-CmpNP-tags.regex \
change-optionally-word_boundary_to_hyphen.regex

# List any local filter xfscript files here:
GIELLA_FILTER_LOCAL_XFSCRIPT_SRCS=
Expand Down
33 changes: 33 additions & 0 deletions src/fst/filters/block-illegal_compound-strings.regex
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
! Divvun & Giellatekno - open source grammars for Sámi and other languages
! Copyright © 2000-2010 The University of Tromsø & the Norwegian Sámi Parliament
! http://giellatekno.uit.no & http://divvun.no
!
! This program is free software. You can redistribute and/or modify
! this file under the terms of the GNU General Public License as published by
! the Free Software Foundation, either version 3 of the License, or
! (at your option) any later version. The GNU General Public License
! is found at http://www.gnu.org/licenses/gpl.html. It is
! also available in the file $GTHOME/LICENSE.txt.
!
! Other licensing options are available upon request, please contact
! giellatekno@hum.uit.no or feedback@divvun.no

# This filter replaces compounding tags with flag diacritics. The flags are
# what blocks strings whose compounding tags occur in illegal combinations.
# All rules below are applied in parallel.
[
! Normative tags: converted to positive reset flags.
  "@P.CmpN.SgN@"
      <- "+CmpN/SgN" ,
  "@P.CmpN.SgG@"
      <- "+CmpN/SgG" ,
  "@P.CmpN.PlG@"
      <- "+CmpN/PlG" ,

! Descriptive tags: converted to require flags matching the set above, or
! alternatively to the corresponding left-governing flag sequence.
  [ "@R.CmpN.SgN@" "@P.Want_Left.FALSE@"
  | "@C.CmpN@" "@P.CmpN_Left.SgNom@" "@P.Want_Left.TRUE@"
  ]   <- "+Cmp/SgNom" ,
  [ "@R.CmpN.SgG@" "@P.Want_Left.FALSE@"
  | "@C.CmpN@" "@P.CmpN_Left.SgGen@" "@P.Want_Left.TRUE@"
  ]   <- "+Cmp/SgGen" ,
  [ "@R.CmpN.PlG@" "@P.Want_Left.FALSE@"
  | "@C.CmpN@" "@P.CmpN_Left.PlGen@" "@P.Want_Left.TRUE@"
  ]   <- "+Cmp/PlGen" ,

! Normative left-governing tags.
  "@U.CmpN_Left.SgNom@" "@C.CmpN_Left@"
      <- "+CmpN/SgNomLeft" ,
  "@U.CmpN_Left.SgGen@" "@C.CmpN_Left@"
      <- "+CmpN/SgGenLeft" ,
  "@U.CmpN_Left.PlGen@" "@C.CmpN_Left@"
      <- "+CmpN/PlGenLeft"
];
20 changes: 20 additions & 0 deletions src/fst/filters/change-optionally-word_boundary_to_hyphen.regex
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
! Divvun & Giellatekno - open source grammars for Sámi and other languages
! Copyright © 2000-2010 The University of Tromsø & the Norwegian Sámi Parliament
! http://giellatekno.uit.no & http://divvun.no
!
! This program is free software. You can redistribute and/or modify
! this file under the terms of the GNU General Public License as published by
! the Free Software Foundation, either version 3 of the License, or
! (at your option) any later version. The GNU General Public License
! is found at http://www.gnu.org/licenses/gpl.html. It is
! also available in the file $GTHOME/LICENSE.txt.
!
! Other licensing options are available upon request, please contact
! giellatekno@hum.uit.no or feedback@divvun.no

# This filter changes # optionally into - but only for lexicalised compounds.
# The dynamic ones should be caught by the regular compounding system, and
# should get a tag when hyphenated at the compound border.
#
# The rule below is an optional replacement: a literal # may be rewritten as a
# hyphen followed by the flag diacritic @C.LexComp@. It only applies when the
# symbol immediately to the left is neither "😱" nor "-".
#
# NOTE(review): the symbol "😱" in the left context is unusual - confirm that
# it is the intended symbol and not an encoding or rendering artifact.

%# (->) %- "@C.LexComp@" || \[ "😱" | "-" ] _ ;
27 changes: 27 additions & 0 deletions src/fst/filters/convert_to_flags-CmpNP-tags.regex
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Divvun & Giellatekno - open source grammars for Sámi and other languages
# Copyright © 2000-2015 The University of Tromsø & the Norwegian Sámi Parliament
# http://giellatekno.uit.no & http://divvun.no
#
# This program is free software; you can redistribute and/or modify
# this file under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version. The GNU General Public License
# is found at http://www.gnu.org/licenses/gpl.html. It is
# also available in the file $GTHOME/LICENSE.txt.
#
# Other licensing options are available upon request, please contact
# giellatekno@uit.no or divvun@uit.no

# This filter converts single CmpNP tags into the corresponding flag diacritics.
# It is the second of the two steps that turn compounding tags into flag
# diacritics regulating compounding behaviour in fst-based spellers.
# All rules are applied in parallel.

[
  "@U.CmpFrst.TRUE@"
      <- "+CmpNP/First" ,
  "@U.CmpPref.TRUE@"
      <- "+CmpNP/Pref" ,
  "@P.CmpLast.TRUE@"
      <- "+CmpNP/Last" ,
  # Suff: one of the two CmpOnly flags, always followed by the CmpLast flag.
  [ "@P.CmpOnly.FALSE@" | "@R.CmpOnly.TRUE@" ] "@P.CmpLast.TRUE@"
      <- "+CmpNP/Suff" ,
  # Only: one of the two CmpOnly flags on its own.
  [ "@P.CmpOnly.FALSE@" | "@R.CmpOnly.TRUE@" ]
      <- "+CmpNP/Only" ,
  "@U.CmpNone.TRUE@"
      <- "+CmpNP/None"
];
86 changes: 86 additions & 0 deletions src/fst/filters/insert-default-compounding-tags.regex
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
! Divvun & Giellatekno - open source grammars for Sámi and other languages
! Copyright © 2000-2010 The University of Tromsø & the Norwegian Sámi Parliament
! http://giellatekno.uit.no & http://divvun.no
!
! This program is free software. You can redistribute and/or modify
! this file under the terms of the GNU General Public License as published by
! the Free Software Foundation, either version 3 of the License, or
! (at your option) any later version. The GNU General Public License
! is found at http://www.gnu.org/licenses/gpl.html. It is
! also available in the file $GTHOME/LICENSE.txt.
!
! Other licensing options are available upon request, please contact
! giellatekno@hum.uit.no or feedback@divvun.no

# This filter inserts the default compounding tag +CmpN/SgN in front of the
# POS tag (+N or +A) when no compounding tag is already present there.
# It consists of two insertion rules that are composed with each other.
#
# NOTE(review): the former header of this file described the derivation
# filter (removal of derivational tag combinations and double passives), which
# does not match the rules below. Confirm whether the former remark "should be
# applied on top of the nonrec transducer" is also meant for this filter.

# Insert default compounding tag if there is no compounding tag before the
# POS tag, possibly with an intervening OLang/XXX tag in between (the semantic
# tag has been removed in an earlier step):

# All contexts: block insertion if there is an OLang or a CmpN tag to the left.
# NOTE(review): no OLang tag is mentioned in the contexts below - confirm
# whether that part of the description is still accurate.

# First context: bare POS tag:
# The tag is inserted directly before +N / +A. The left context is a barrier
# (string start, hyphen, "+Cmp" or #) followed by one or more symbols, none of
# which is one of the listed +CmpN/ or +CmpNP/ tags.
"+CmpN/SgN" <- [. 0 .] || [ .#. | %- | "+Cmp" | %# ] # SNM 2.5.2018: Added
# barriers to restrict scope of + added further below
\[ "+CmpN/SgN"
| "+CmpN/SgG"
| "+CmpN/PlN"
| "+CmpN/PlG"
| "+CmpN/SgNomLeft"
| "+CmpN/SgGenLeft"
| "+CmpN/PlGenLeft"
| "+CmpNP/All"
| "+CmpNP/First"
| "+CmpNP/Pref"
| "+CmpNP/Last"
| "+CmpNP/Suff"
| "+CmpNP/None"
| "+CmpNP/Only"
]+ # SNM 2.5.2018: Added + to allow hypmin flag diacritics
_
[ "+N" | "+A" ]

.o.

# Second context: CmpNP + POS:
# The tag is inserted after a single symbol that is none of the listed tags,
# and in front of a run of one or more +CmpNP/ or Left tags.
"+CmpN/SgN" <- [. 0 .] || \[ "+CmpN/SgN"
| "+CmpN/SgG"
| "+CmpN/PlN"
| "+CmpN/PlG"
| "+CmpN/SgNomLeft"
| "+CmpN/SgGenLeft"
| "+CmpN/PlGenLeft"
| "+CmpNP/All"
| "+CmpNP/First"
| "+CmpNP/Pref"
| "+CmpNP/Last"
| "+CmpNP/Suff"
| "+CmpNP/None"
| "+CmpNP/Only"
]
_
[ "+CmpNP/All"
| "+CmpNP/First"
| "+CmpNP/Pref"
| "+CmpNP/Last"
| "+CmpNP/Suff"
| "+CmpNP/None"
| "+CmpNP/Only"
| "+CmpN/SgNomLeft"
| "+CmpN/SgGenLeft"
| "+CmpN/PlGenLeft"
]+
# NOTE(review): a bare ? matches exactly one arbitrary symbol, i.e. it is
# mandatory. If the symbol is only "potential", as the comment on the next
# line says, the optional form (?) may have been intended - confirm.
? # SNM 2.5.2018: potential hypmin flag diacritic
[ "+N" | "+A" ]

;

# In all other contexts, there is already a +CmpN/ tag, and thus we do NOT want
# to insert the default tag.
56 changes: 56 additions & 0 deletions src/fst/filters/insert-default_left_compounding-tags.regex
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
! Divvun & Giellatekno - open source grammars for Sámi and other languages
! Copyright © 2000-2010 The University of Tromsø & the Norwegian Sámi Parliament
! http://giellatekno.uit.no & http://divvun.no
!
! This program is free software. You can redistribute and/or modify
! this file under the terms of the GNU General Public License as published by
! the Free Software Foundation, either version 3 of the License, or
! (at your option) any later version. The GNU General Public License
! is found at http://www.gnu.org/licenses/gpl.html. It is
! also available in the file $GTHOME/LICENSE.txt.
!
! Other licensing options are available upon request, please contact
! giellatekno@hum.uit.no or feedback@divvun.no

! Insert a Left-compound blocking flag diacritic for everything not explicitly
! marked for Left governing compounding.
!
! The filter consists of two insertion rules that are composed with each
! other. Both insert a Want_Left flag diacritic immediately after a literal #;
! they differ in the right context they require.


! First context: no CmpN Left tag:
! Right context: one or more symbols, none of which is a POS tag (+N, +A,
! +PrsPrc), a # or a Left tag, followed directly by a POS tag.
"@U.Want_Left.FALSE@" <- [..] || %#
_

\[ "+N" | "+A" | "+PrsPrc" | %#
| "+CmpN/SgNomLeft"
| "+CmpN/SgGenLeft"
| "+CmpN/PlGenLeft"
]+
[ "+N" | "+A" | "+PrsPrc" ]

.o.

! Second context: CmpN Left tag immediately before POS tag - insert a tag
! to block regular CmpN compounding:
! Right context: the same run of one or more other symbols as above, then
! one to three Left tags, then a POS tag.
"@U.Want_Left.TRUE@" <- [..] || %#
_

\[ "+N" | "+A" | "+PrsPrc" | %#
| "+CmpN/SgNomLeft"
| "+CmpN/SgGenLeft"
| "+CmpN/PlGenLeft"
]+

[ "+CmpN/SgNomLeft"
| "+CmpN/SgGenLeft"
| "+CmpN/PlGenLeft"
]^{1,3}
[ "+N" | "+A" | "+PrsPrc" ]
;

# In all other contexts, there is already a +CmpN/ tag, and thus we do NOT want
# to insert the default tag.

# Insert default left-compounding tags if there is no such tag before the
# semantic tag and after the regular compounding tags:
32 changes: 32 additions & 0 deletions src/fst/filters/remove-illegal-derivation-strings-flagbased.regex
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
! Divvun & Giellatekno - open source grammars for Sámi and other languages
! Copyright © 2000-2010 The University of Tromsø & the Norwegian Sámi Parliament
! http://giellatekno.uit.no & http://divvun.no
!
! This program is free software. You can redistribute and/or modify
! this file under the terms of the GNU General Public License as published by
! the Free Software Foundation, either version 3 of the License, or
! (at your option) any later version. The GNU General Public License
! is found at http://www.gnu.org/licenses/gpl.html. It is
! also available in the file $GTHOME/LICENSE.txt.
!
! Other licensing options are available upon request, please contact
! giellatekno@hum.uit.no or feedback@divvun.no

! This filter blocks, by means of flag diacritics, all strings containing
! derivational tags in the combinations defined below, including double
! passives.
! See https://giellalt.uit.no/lang/sme/root-morphology.html for more documentation.
! All rules are applied in parallel.

! Illegal derivations. Each +DerN tag is kept in place (for general weight
! assignment to DerN tags at a later stage) and is preceded by D flags for
! its own and every higher position, plus a P flag for its own position:
"@D.Der1.TRUE@" "@D.Der2.TRUE@" "@D.Der3.TRUE@" "@D.Der4.TRUE@" "@D.Der5.TRUE@"
    "@P.Der1.TRUE@" "+Der1"
        <- "+Der1" ,
"@D.Der2.TRUE@" "@D.Der3.TRUE@" "@D.Der4.TRUE@" "@D.Der5.TRUE@"
    "@P.Der2.TRUE@" "+Der2"
        <- "+Der2" ,
"@D.Der3.TRUE@" "@D.Der4.TRUE@" "@D.Der5.TRUE@"
    "@P.Der3.TRUE@" "+Der3"
        <- "+Der3" ,
"@D.Der4.TRUE@" "@D.Der5.TRUE@"
    "@P.Der4.TRUE@" "+Der4"
        <- "+Der4" ,
"@D.Der5.TRUE@"
    "@P.Der5.TRUE@" "+Der5"
        <- "+Der5" ,

! Double passives. Each passive tag is preceded by D flags for both passive
! types and a P flag for its own type:
"@D.Der_PassS.TRUE@" "@D.Der_PassL.TRUE@"
    "@P.Der_PassS.TRUE@" "+Der/PassS"
        <- "+Der/PassS" ,
"@D.Der_PassS.TRUE@" "@D.Der_PassL.TRUE@"
    "@P.Der_PassL.TRUE@" "+Der/PassL"
        <- "+Der/PassL" ,

! Word boundary. All of the flags above are reset in front of a #:
"@C.Der1@" "@C.Der2@" "@C.Der3@" "@C.Der4@" "@C.Der5@"
    "@C.Der_PassS@" "@C.Der_PassL@" %#
        <- %# ;
39 changes: 39 additions & 0 deletions src/fst/filters/remove-illegal-derivation-strings.regex
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
! Divvun & Giellatekno - open source grammars for Sámi and other languages
! Copyright © 2000-2010 The University of Tromsø & the Norwegian Sámi Parliament
! http://giellatekno.uit.no & http://divvun.no
!
! This program is free software. You can redistribute and/or modify
! this file under the terms of the GNU General Public License as published by
! the Free Software Foundation, either version 3 of the License, or
! (at your option) any later version. The GNU General Public License
! is found at http://www.gnu.org/licenses/gpl.html. It is
! also available in the file $GTHOME/LICENSE.txt.
!
! Other licensing options are available upon request, please contact
! giellatekno@hum.uit.no or feedback@divvun.no

# This filter removes all strings in which a derivational tag is followed,
# anywhere later in the string, by a derivational tag of the same or a lower
# position (+Der1 ... +Der5).
# It also removes strings that contain both passive tags (+Der/PassS and
# +Der/PassL), in either order.
# See smX-lex.txt for more documentation.
#
# The filter should be applied on top of the nonrec transducer.

~[ $[ "+Der1" ?*   "+Der1" ]
 | $[ "+Der2" ?* [ "+Der1" | "+Der2" ] ]
 | $[ "+Der3" ?* [ "+Der1" | "+Der2" | "+Der3" ] ]
 | $[ "+Der4" ?* [ "+Der1" | "+Der2" | "+Der3" | "+Der4" ] ]
 | $[ "+Der5" ?* [ "+Der1" | "+Der2" | "+Der3" | "+Der4" | "+Der5" ] ]
 | $[ "+Der/PassS" ?* "+Der/PassL" ]
 | $[ "+Der/PassL" ?* "+Der/PassS" ]
 ] ;
29 changes: 29 additions & 0 deletions src/fst/filters/remove-rel_focus-strings.regex
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
! Divvun & Giellatekno - open source grammars for Sámi and other languages
! Copyright © 2000-2010 The University of Tromsø & the Norwegian Sámi Parliament
! http://giellatekno.uit.no & http://divvun.no
!
! This program is free software; you can redistribute and/or modify
! this file under the terms of the GNU General Public License as published by
! the Free Software Foundation, either version 3 of the License, or
! (at your option) any later version. The GNU General Public License
! is found at http://www.gnu.org/licenses/gpl.html. It is
! also available in the file $GTHOME/LICENSE.txt.
!
! Other licensing options are available upon request, please contact
! giellatekno@hum.uit.no or feedback@divvun.no

# This filter removes Rel-Foc combinations: every string in which the +Rel tag
# is followed, anywhere later in the string, by the +Qst tag or by one of the
# +Foc/ tags listed below.
#
# All tags are written as quoted multichar symbols. In the %-escaped form
# (e.g. %+Foc/ge) the slash is not escaped, and an unescaped / is the
# "ignore" operator in regular expressions, so the tag would not be read as
# the single symbol "+Foc/ge".

~[ $[ "+Rel" ?*
      [ "+Qst"
      | "+Foc/ge"
      | "+Foc/gen"
      | "+Foc/ges"
      | "+Foc/gis"
      | "+Foc/naj"
      | "+Foc/ba"
      | "+Foc/be"
      | "+Foc/hal"
      | "+Foc/han"
      | "+Foc/bat"
      | "+Foc/son"
      ]
    ]
 ] ;
39 changes: 39 additions & 0 deletions src/fst/filters/split-CmpN-tags.regex
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Divvun & Giellatekno - open source grammars for Sámi and other languages
# Copyright © 2000-2015 The University of Tromsø & the Norwegian Sámi Parliament
# http://giellatekno.uit.no & http://divvun.no
#
# This program is free software; you can redistribute and/or modify
# this file under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version. The GNU General Public License
# is found at http://www.gnu.org/licenses/gpl.html. It is
# also available in the file $GTHOME/LICENSE.txt.
#
# Other licensing options are available upon request, please contact
# giellatekno@uit.no or divvun@uit.no

# This filter splits a row of CmpN tags into parallel paths with single tags.
# It is the first of the two steps that turn CmpN tags into flag diacritics
# regulating compounding behaviour in fst-based spellers.

# The split is done independently for the regular CmpN tags and for the
# CmpN Left tags. Rows of two tags and rows of three tags are handled by two
# separate rule sets that are composed with each other.

! Rows of two tags:
[
  ! Only +CmpN/ tags, no Left:
  [ "+CmpN/SgN" | "+CmpN/SgG" ]
      <- "+CmpN/SgN" "+CmpN/SgG" ,
  [ "+CmpN/SgN" | "+CmpN/PlG" ]
      <- "+CmpN/SgN" "+CmpN/PlG" ,
  [ "+CmpN/SgG" | "+CmpN/PlG" ]
      <- "+CmpN/SgG" "+CmpN/PlG" ,

  ! Left tags:
  [ "+CmpN/SgNomLeft" | "+CmpN/SgGenLeft" ]
      <- "+CmpN/SgNomLeft" "+CmpN/SgGenLeft" ,
  [ "+CmpN/SgNomLeft" | "+CmpN/PlGenLeft" ]
      <- "+CmpN/SgNomLeft" "+CmpN/PlGenLeft" ,
  [ "+CmpN/SgGenLeft" | "+CmpN/PlGenLeft" ]
      <- "+CmpN/SgGenLeft" "+CmpN/PlGenLeft"
]

.o.

! Rows of three tags:
[
  [ "+CmpN/SgN" | "+CmpN/SgG" | "+CmpN/PlG" ]
      <- "+CmpN/SgN" "+CmpN/SgG" "+CmpN/PlG" ,
  [ "+CmpN/SgNomLeft" | "+CmpN/SgGenLeft" | "+CmpN/PlGenLeft" ]
      <- "+CmpN/SgNomLeft" "+CmpN/SgGenLeft" "+CmpN/PlGenLeft"
];
Loading

0 comments on commit 3dac3e8

Please sign in to comment.