-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add filters parallel to lang-mdf/src/fst/filters/Makefile.am.
- Loading branch information
Showing
11 changed files
with
397 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
! Divvun & Giellatekno - open source grammars for Sámi and other languages | ||
! Copyright © 2000-2010 The University of Tromsø & the Norwegian Sámi Parliament | ||
! http://giellatekno.uit.no & http://divvun.no | ||
! | ||
! This program is free software. You can redistribute and/or modify | ||
! this file under the terms of the GNU General Public License as published by | ||
! the Free Software Foundation, either version 3 of the License, or | ||
! (at your option) any later version. The GNU General Public License | ||
! is found at http://www.gnu.org/licenses/gpl.html. It is | ||
! also available in the file $GTHOME/LICENSE.txt. | ||
! | ||
! Other licensing options are available upon request, please contact | ||
! giellatekno@hum.uit.no or feedback@divvun.no | ||
|
||
# This filter removes all strings containing compounding tags | ||
# in illegal combinations. | ||
! Parallel replacement rules (comma-separated): each tag to the right of `<-` | ||
! is rewritten as the flag-diacritic sequence to its left. | ||
[ | ||
! Convert normative tags to positive reset flags: | ||
"@P.CmpN.SgN@" <- "+CmpN/SgN" , | ||
"@P.CmpN.SgG@" <- "+CmpN/SgG" , | ||
"@P.CmpN.PlG@" <- "+CmpN/PlG" , | ||
! Convert descriptive tags to require flags to match the set above: | ||
! Each descriptive tag becomes a union of two alternatives: an R (require) flag | ||
! on CmpN followed by Want_Left set to FALSE, or a clear of CmpN followed by a | ||
! positive set of CmpN_Left and Want_Left set to TRUE. | ||
[ "@R.CmpN.SgN@" "@P.Want_Left.FALSE@" | "@C.CmpN@" "@P.CmpN_Left.SgNom@" "@P.Want_Left.TRUE@" ] <- "+Cmp/SgNom" , | ||
[ "@R.CmpN.SgG@" "@P.Want_Left.FALSE@" | "@C.CmpN@" "@P.CmpN_Left.SgGen@" "@P.Want_Left.TRUE@" ] <- "+Cmp/SgGen" , | ||
[ "@R.CmpN.PlG@" "@P.Want_Left.FALSE@" | "@C.CmpN@" "@P.CmpN_Left.PlGen@" "@P.Want_Left.TRUE@" ] <- "+Cmp/PlGen" , | ||
|
||
! Convert normative left-governing tags: | ||
! Each Left tag becomes a U (unify) flag on CmpN_Left followed by a clear of | ||
! the same feature. | ||
"@U.CmpN_Left.SgNom@" "@C.CmpN_Left@" <- "+CmpN/SgNomLeft" , | ||
"@U.CmpN_Left.SgGen@" "@C.CmpN_Left@" <- "+CmpN/SgGenLeft" , | ||
"@U.CmpN_Left.PlGen@" "@C.CmpN_Left@" <- "+CmpN/PlGenLeft" | ||
|
||
]; |
20 changes: 20 additions & 0 deletions
20
src/fst/filters/change-optionally-word_boundary_to_hyphen.regex
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
! Divvun & Giellatekno - open source grammars for Sámi and other languages | ||
! Copyright © 2000-2010 The University of Tromsø & the Norwegian Sámi Parliament | ||
! http://giellatekno.uit.no & http://divvun.no | ||
! | ||
! This program is free software. You can redistribute and/or modify | ||
! this file under the terms of the GNU General Public License as published by | ||
! the Free Software Foundation, either version 3 of the License, or | ||
! (at your option) any later version. The GNU General Public License | ||
! is found at http://www.gnu.org/licenses/gpl.html. It is | ||
! also available in the file $GTHOME/LICENSE.txt. | ||
! | ||
! Other licensing options are available upon request, please contact | ||
! giellatekno@hum.uit.no or feedback@divvun.no | ||
|
||
# This filter changes # optionally into - but only for lexicalised compounds. | ||
# The dynamic ones should be caught by the regular compounding system, and | ||
# should get a tag when hyphenated at the compound border. | ||
|
||
! Optional replacement: a word boundary %# may be rewritten as a hyphen followed | ||
! by the flag "@C.LexComp@", provided the single symbol immediately to its left | ||
! is neither of the two symbols listed in the negated left context. | ||
! NOTE(review): the "😱" symbol looks like a rendering artefact of whatever | ||
! symbol the original file has in this position - confirm against the repository. | ||
%# (->) %- "@C.LexComp@" || \[ "😱" | "-" ] _ ; | ||
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
# Divvun & Giellatekno - open source grammars for Sámi and other languages | ||
# Copyright © 2000-2015 The University of Tromsø & the Norwegian Sámi Parliament | ||
# http://giellatekno.uit.no & http://divvun.no | ||
# | ||
# This program is free software; you can redistribute and/or modify | ||
# this file under the terms of the GNU General Public License as published by | ||
# the Free Software Foundation, either version 3 of the License, or | ||
# (at your option) any later version. The GNU General Public License | ||
# is found at http://www.gnu.org/licenses/gpl.html. It is | ||
# also available in the file $GTHOME/LICENSE.txt. | ||
# | ||
# Other licensing options are available upon request, please contact | ||
# giellatekno@uit.no or divvun@uit.no | ||
|
||
# This filter converts single CmpN tags into corresponding flag diacritics. | ||
# This is the second step of two in converting CmpN tags into flag diacritics | ||
# that will regulate compounding behaviour in fst-based spellers. | ||
|
||
# Parallel replacement rules: each +CmpNP/ tag to the right of `<-` is rewritten | ||
# as the flag diacritic(s) to its left. +CmpNP/Suff and +CmpNP/Only each map to | ||
# a union of two alternative flag sequences (a P flag setting CmpOnly to FALSE, | ||
# or an R flag requiring CmpOnly to be TRUE). | ||
[ | ||
"@U.CmpFrst.TRUE@" <- "+CmpNP/First", | ||
"@U.CmpPref.TRUE@" <- "+CmpNP/Pref" , | ||
"@P.CmpLast.TRUE@" <- "+CmpNP/Last" , | ||
[ "@P.CmpOnly.FALSE@" "@P.CmpLast.TRUE@" | ||
| "@R.CmpOnly.TRUE@" "@P.CmpLast.TRUE@" ] <- "+CmpNP/Suff" , | ||
[ "@P.CmpOnly.FALSE@" | "@R.CmpOnly.TRUE@" ] <- "+CmpNP/Only" , | ||
"@U.CmpNone.TRUE@" <- "+CmpNP/None" | ||
]; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
! Divvun & Giellatekno - open source grammars for Sámi and other languages | ||
! Copyright © 2000-2010 The University of Tromsø & the Norwegian Sámi Parliament | ||
! http://giellatekno.uit.no & http://divvun.no | ||
! | ||
! This program is free software. You can redistribute and/or modify | ||
! this file under the terms of the GNU General Public License as published by | ||
! the Free Software Foundation, either version 3 of the License, or | ||
! (at your option) any later version. The GNU General Public License | ||
! is found at http://www.gnu.org/licenses/gpl.html. It is | ||
! also available in the file $GTHOME/LICENSE.txt. | ||
! | ||
! Other licensing options are available upon request, please contact | ||
! giellatekno@hum.uit.no or feedback@divvun.no | ||
|
||
# This filter inserts the default compounding tag +CmpN/SgN before the POS tag | ||
# (+N or +A) of strings that carry no explicit compounding tag in that position. | ||
# It does not remove any strings by itself. | ||
# See smX-lex.txt for more documentation. | ||
# | ||
# The filter should be applied on top of the nonrec transducer. | ||
|
||
# Insert default compounding tag if there is no compounding tag before the | ||
# POS tag, possibly with an intervening OLang/XXX tag in between (the semantic | ||
# tag has been removed in an earlier step): | ||
|
||
# All contexts: block insertion if there is an OLang or a CmpN tag to the left. | ||
|
||
# First context: bare POS tag: | ||
# Two insertion rules composed with the .o. operator; each inserts the default | ||
# tag "+CmpN/SgN" immediately before a "+N" or "+A" tag in the stated context. | ||
# Rule 1 left context: a boundary symbol (.#., hyphen, "+Cmp" or %#) followed by | ||
# one or more symbols that are none of the listed +CmpN/ and +CmpNP/ tags. | ||
# Rule 2: the left context is one symbol that is none of the listed tags; the | ||
# right context is one or more +CmpNP/ or +CmpN/...Left tags, one further | ||
# symbol, and then the POS tag. | ||
# NOTE(review): the bare ? in rule 2 matches exactly one symbol, so that context | ||
# requires a symbol in that position; if the flag diacritic is meant to be | ||
# optional, (?) would be needed - confirm. | ||
"+CmpN/SgN" <- [. 0 .] || [ .#. | %- | "+Cmp" | %# ] # SNM 2.5.2018: Added | ||
# barriers to restrict scope of + added further below | ||
\[ "+CmpN/SgN" | ||
| "+CmpN/SgG" | ||
| "+CmpN/PlN" | ||
| "+CmpN/PlG" | ||
| "+CmpN/SgNomLeft" | ||
| "+CmpN/SgGenLeft" | ||
| "+CmpN/PlGenLeft" | ||
| "+CmpNP/All" | ||
| "+CmpNP/First" | ||
| "+CmpNP/Pref" | ||
| "+CmpNP/Last" | ||
| "+CmpNP/Suff" | ||
| "+CmpNP/None" | ||
| "+CmpNP/Only" | ||
]+ # SNM 2.5.2018: Added + to allow hypmin flag diacritics | ||
_ | ||
[ "+N" | "+A" ] | ||
|
||
.o. | ||
|
||
# Second context: CmpNP + POS: | ||
"+CmpN/SgN" <- [. 0 .] || \[ "+CmpN/SgN" | ||
| "+CmpN/SgG" | ||
| "+CmpN/PlN" | ||
| "+CmpN/PlG" | ||
| "+CmpN/SgNomLeft" | ||
| "+CmpN/SgGenLeft" | ||
| "+CmpN/PlGenLeft" | ||
| "+CmpNP/All" | ||
| "+CmpNP/First" | ||
| "+CmpNP/Pref" | ||
| "+CmpNP/Last" | ||
| "+CmpNP/Suff" | ||
| "+CmpNP/None" | ||
| "+CmpNP/Only" | ||
] | ||
_ | ||
[ "+CmpNP/All" | ||
| "+CmpNP/First" | ||
| "+CmpNP/Pref" | ||
| "+CmpNP/Last" | ||
| "+CmpNP/Suff" | ||
| "+CmpNP/None" | ||
| "+CmpNP/Only" | ||
| "+CmpN/SgNomLeft" | ||
| "+CmpN/SgGenLeft" | ||
| "+CmpN/PlGenLeft" | ||
]+ | ||
? # SNM 2.5.2018: potential hypmin flag diacritic | ||
[ "+N" | "+A" ] | ||
|
||
; | ||
|
||
# In all other contexts, there is already a +CmpN/ tag, and thus we do NOT want | ||
# to insert the default tag. |
56 changes: 56 additions & 0 deletions
56
src/fst/filters/insert-default_left_compounding-tags.regex
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
! Divvun & Giellatekno - open source grammars for Sámi and other languages | ||
! Copyright © 2000-2010 The University of Tromsø & the Norwegian Sámi Parliament | ||
! http://giellatekno.uit.no & http://divvun.no | ||
! | ||
! This program is free software. You can redistribute and/or modify | ||
! this file under the terms of the GNU General Public License as published by | ||
! the Free Software Foundation, either version 3 of the License, or | ||
! (at your option) any later version. The GNU General Public License | ||
! is found at http://www.gnu.org/licenses/gpl.html. It is | ||
! also available in the file $GTHOME/LICENSE.txt. | ||
! | ||
! Other licensing options are available upon request, please contact | ||
! giellatekno@hum.uit.no or feedback@divvun.no | ||
|
||
! Insert a Left-compound blocking flag diacritic for everything not explicitly | ||
! marked for Left governing compounding. | ||
|
||
|
||
! First context: no CmpN Left tag: | ||
! Two insertion rules composed with the .o. operator; each inserts a Want_Left | ||
! flag immediately after a word boundary %#. Both right contexts begin with one | ||
! or more symbols that are none of "+N", "+A", "+PrsPrc", %# or the three | ||
! +CmpN/...Left tags. In the first rule a POS tag follows directly (FALSE is | ||
! inserted); in the second rule one to three Left tags precede the POS tag | ||
! (TRUE is inserted). | ||
"@U.Want_Left.FALSE@" <- [..] || %# | ||
_ | ||
|
||
\[ "+N" | "+A" | "+PrsPrc" | %# | ||
| "+CmpN/SgNomLeft" | ||
| "+CmpN/SgGenLeft" | ||
| "+CmpN/PlGenLeft" | ||
]+ | ||
[ "+N" | "+A" | "+PrsPrc" ] | ||
|
||
.o. | ||
|
||
! Second context: CmpN Left tag immediately before POS tag - insert a tag | ||
! to block regular CmpN compounding: | ||
"@U.Want_Left.TRUE@" <- [..] || %# | ||
_ | ||
|
||
\[ "+N" | "+A" | "+PrsPrc" | %# | ||
| "+CmpN/SgNomLeft" | ||
| "+CmpN/SgGenLeft" | ||
| "+CmpN/PlGenLeft" | ||
]+ | ||
|
||
[ "+CmpN/SgNomLeft" | ||
| "+CmpN/SgGenLeft" | ||
| "+CmpN/PlGenLeft" | ||
]^{1,3} | ||
[ "+N" | "+A" | "+PrsPrc" ] | ||
; | ||
|
||
# In all other contexts, there is already a +CmpN/ tag, and thus we do NOT want | ||
# to insert the default tag. | ||
|
||
# Insert default left-compounding tags if there is no such tag before the | ||
# semantic tag and after the regular compounding tags: |
32 changes: 32 additions & 0 deletions
32
src/fst/filters/remove-illegal-derivation-strings-flagbased.regex
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
! Divvun & Giellatekno - open source grammars for Sámi and other languages | ||
! Copyright © 2000-2010 The University of Tromsø & the Norwegian Sámi Parliament | ||
! http://giellatekno.uit.no & http://divvun.no | ||
! | ||
! This program is free software. You can redistribute and/or modify | ||
! this file under the terms of the GNU General Public License as published by | ||
! the Free Software Foundation, either version 3 of the License, or | ||
! (at your option) any later version. The GNU General Public License | ||
! is found at http://www.gnu.org/licenses/gpl.html. It is | ||
! also available in the file $GTHOME/LICENSE.txt. | ||
! | ||
! Other licensing options are available upon request, please contact | ||
! giellatekno@hum.uit.no or feedback@divvun.no | ||
|
||
! This filter blocks all strings containing derivational tags | ||
! as defined below. | ||
! Further, we check that we don't have double passives. | ||
! See https://giellalt.uit.no/lang/sme/root-morphology.html for more documentation. | ||
|
||
! Block illegal derivations - keep the tags for general weight assignment to DerN tags at a later stage: | ||
! Parallel replacement rules: each tag to the right of `<-` is rewritten as the | ||
! sequence to its left. For +DerN that sequence is a D (disallow) flag for DerN | ||
! and for every higher-numbered Der feature, then a P (positive set) flag for | ||
! DerN, then the tag itself. | ||
"@D.Der1.TRUE@" "@D.Der2.TRUE@" "@D.Der3.TRUE@" "@D.Der4.TRUE@" "@D.Der5.TRUE@" "@P.Der1.TRUE@" "+Der1" <- "+Der1" , | ||
"@D.Der2.TRUE@" "@D.Der3.TRUE@" "@D.Der4.TRUE@" "@D.Der5.TRUE@" "@P.Der2.TRUE@" "+Der2" <- "+Der2" , | ||
"@D.Der3.TRUE@" "@D.Der4.TRUE@" "@D.Der5.TRUE@" "@P.Der3.TRUE@" "+Der3" <- "+Der3" , | ||
"@D.Der4.TRUE@" "@D.Der5.TRUE@" "@P.Der4.TRUE@" "+Der4" <- "+Der4" , | ||
"@D.Der5.TRUE@" "@P.Der5.TRUE@" "+Der5" <- "+Der5" , | ||
|
||
! Block double passives: | ||
! Both passive tags disallow either passive feature before setting their own. | ||
"@D.Der_PassS.TRUE@" "@D.Der_PassL.TRUE@" "@P.Der_PassS.TRUE@" "+Der/PassS" <- "+Der/PassS" , | ||
"@D.Der_PassS.TRUE@" "@D.Der_PassL.TRUE@" "@P.Der_PassL.TRUE@" "+Der/PassL" <- "+Der/PassL" , | ||
|
||
! Reset everything when passing a word boundary: | ||
! All seven features are cleared immediately before the boundary symbol. | ||
"@C.Der1@" "@C.Der2@" "@C.Der3@" "@C.Der4@" "@C.Der5@" "@C.Der_PassS@" "@C.Der_PassL@" %# <- %# ; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
! Divvun & Giellatekno - open source grammars for Sámi and other languages | ||
! Copyright © 2000-2010 The University of Tromsø & the Norwegian Sámi Parliament | ||
! http://giellatekno.uit.no & http://divvun.no | ||
! | ||
! This program is free software. You can redistribute and/or modify | ||
! this file under the terms of the GNU General Public License as published by | ||
! the Free Software Foundation, either version 3 of the License, or | ||
! (at your option) any later version. The GNU General Public License | ||
! is found at http://www.gnu.org/licenses/gpl.html. It is | ||
! also available in the file $GTHOME/LICENSE.txt. | ||
! | ||
! Other licensing options are available upon request, please contact | ||
! giellatekno@hum.uit.no or feedback@divvun.no | ||
|
||
# This filter removes all strings containing derivational tags | ||
# as defined below. | ||
# Further, we check that we don't have double passives. | ||
# See smX-lex.txt for more documentation. | ||
# | ||
# The filter should be applied on top of the nonrec transducer. | ||
|
||
# Complement (~) of a union of "contains" expressions: a string is rejected if a | ||
# +DerN tag is followed anywhere later by a +DerM tag with M <= N, or if | ||
# +Der/PassS and +Der/PassL both occur (in either order). | ||
# NOTE(review): a repeated +Der/PassS ... +Der/PassS (or PassL ... PassL) is not | ||
# listed here - confirm whether that is intended. | ||
~[ $[ "+Der1" ?* "+Der1" ] | ||
| $[ "+Der2" ?* "+Der1" ] | ||
| $[ "+Der2" ?* "+Der2" ] | ||
| $[ "+Der3" ?* "+Der1" ] | ||
| $[ "+Der3" ?* "+Der2" ] | ||
| $[ "+Der3" ?* "+Der3" ] | ||
| $[ "+Der4" ?* "+Der1" ] | ||
| $[ "+Der4" ?* "+Der2" ] | ||
| $[ "+Der4" ?* "+Der3" ] | ||
| $[ "+Der4" ?* "+Der4" ] | ||
| $[ "+Der5" ?* "+Der1" ] | ||
| $[ "+Der5" ?* "+Der2" ] | ||
| $[ "+Der5" ?* "+Der3" ] | ||
| $[ "+Der5" ?* "+Der4" ] | ||
| $[ "+Der5" ?* "+Der5" ] | ||
| $[ "+Der/PassS" ?* "+Der/PassL" ] | ||
| $[ "+Der/PassL" ?* "+Der/PassS" ] | ||
] ; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
! Divvun & Giellatekno - open source grammars for Sámi and other languages | ||
! Copyright © 2000-2010 The University of Tromsø & the Norwegian Sámi Parliament | ||
! http://giellatekno.uit.no & http://divvun.no | ||
! | ||
! This program is free software; you can redistribute and/or modify | ||
! this file under the terms of the GNU General Public License as published by | ||
! the Free Software Foundation, either version 3 of the License, or | ||
! (at your option) any later version. The GNU General Public License | ||
! is found at http://www.gnu.org/licenses/gpl.html. It is | ||
! also available in the file $GTHOME/LICENSE.txt. | ||
! | ||
! Other licensing options are available upon request, please contact | ||
! giellatekno@hum.uit.no or feedback@divvun.no | ||
|
||
# This filter removes strings in which +Rel is followed by +Qst or a +Foc/ tag. | ||
|
||
! Reject every string in which +Rel is followed, anywhere later, by +Qst or by | ||
! one of the listed +Foc/ tags. | ||
! The tags are double-quoted so that each is read as one multicharacter symbol: | ||
! an unquoted / is the ignore operator in the regex syntax, so %+Foc/ge would be | ||
! read as "+Foc ignoring ge" rather than as the single tag +Foc/ge. | ||
~[ $[ "+Rel" ?* "+Qst" ] | ||
| $[ "+Rel" ?* "+Foc/ge" ] | ||
| $[ "+Rel" ?* "+Foc/gen" ] | ||
| $[ "+Rel" ?* "+Foc/ges" ] | ||
| $[ "+Rel" ?* "+Foc/gis" ] | ||
| $[ "+Rel" ?* "+Foc/naj" ] | ||
| $[ "+Rel" ?* "+Foc/ba" ] | ||
| $[ "+Rel" ?* "+Foc/be" ] | ||
| $[ "+Rel" ?* "+Foc/hal" ] | ||
| $[ "+Rel" ?* "+Foc/han" ] | ||
| $[ "+Rel" ?* "+Foc/bat" ] | ||
| $[ "+Rel" ?* "+Foc/son" ] | ||
] ; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
# Divvun & Giellatekno - open source grammars for Sámi and other languages | ||
# Copyright © 2000-2015 The University of Tromsø & the Norwegian Sámi Parliament | ||
# http://giellatekno.uit.no & http://divvun.no | ||
# | ||
# This program is free software; you can redistribute and/or modify | ||
# this file under the terms of the GNU General Public License as published by | ||
# the Free Software Foundation, either version 3 of the License, or | ||
# (at your option) any later version. The GNU General Public License | ||
# is found at http://www.gnu.org/licenses/gpl.html. It is | ||
# also available in the file $GTHOME/LICENSE.txt. | ||
# | ||
# Other licensing options are available upon request, please contact | ||
# giellatekno@uit.no or divvun@uit.no | ||
|
||
# This filter splits a row of CmpN tags into parallel paths with single tags. | ||
# This is the first step of two in converting CmpN tags into flag diacritics | ||
# that will regulate compounding behaviour in fst-based spellers. | ||
|
||
# The split is done independently for the regular CmpN tags, the CmpNLeft tags | ||
# and the CmpNDef tags. | ||
|
||
! Only +CmpN/ tags, no Left or Def: | ||
! Two sets of parallel replacement rules, joined by the composition operator. | ||
! In the first set a pair of adjacent tags (right of `<-`) is rewritten as a | ||
! union of either single tag; in the second set a row of three tags is | ||
! rewritten as a union of the three single tags. | ||
[ [ "+CmpN/SgN" | "+CmpN/SgG" ] <- "+CmpN/SgN" "+CmpN/SgG" , | ||
[ "+CmpN/SgN" | "+CmpN/PlG" ] <- "+CmpN/SgN" "+CmpN/PlG" , | ||
[ "+CmpN/SgG" | "+CmpN/PlG" ] <- "+CmpN/SgG" "+CmpN/PlG" , | ||
|
||
! Left tags: | ||
[ "+CmpN/SgNomLeft" | "+CmpN/SgGenLeft" ] <- "+CmpN/SgNomLeft" "+CmpN/SgGenLeft" , | ||
[ "+CmpN/SgNomLeft" | "+CmpN/PlGenLeft" ] <- "+CmpN/SgNomLeft" "+CmpN/PlGenLeft" , | ||
[ "+CmpN/SgGenLeft" | "+CmpN/PlGenLeft" ] <- "+CmpN/SgGenLeft" "+CmpN/PlGenLeft" | ||
] | ||
|
||
.o. | ||
|
||
[ [ "+CmpN/SgN" | "+CmpN/SgG" | "+CmpN/PlG" ] <- | ||
"+CmpN/SgN" "+CmpN/SgG" "+CmpN/PlG" , | ||
[ "+CmpN/SgNomLeft" | "+CmpN/SgGenLeft" | "+CmpN/PlGenLeft" ] <- | ||
"+CmpN/SgNomLeft" "+CmpN/SgGenLeft" "+CmpN/PlGenLeft" | ||
]; |
Oops, something went wrong.