Skip to content

Commit

Permalink
Merge branch 'release/0.2.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
JeltevanBoheemen committed Apr 26, 2024
2 parents 2d57ee9 + fea8fb8 commit fee15ce
Show file tree
Hide file tree
Showing 81 changed files with 12,551 additions and 2,049 deletions.
3 changes: 2 additions & 1 deletion .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,5 @@ ignore =
exclude =
__pycache__
env
.env
.env
build/
3 changes: 0 additions & 3 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,6 @@ jobs:
python -m pip install --upgrade pip
pip install flake8
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Lint with flake8
run: |
flake8 src/**/*.py --count --max-line-length=127 --statistics
- name: Run unit tests
run: |
pip install pytest
Expand Down
2 changes: 1 addition & 1 deletion docs/asta.rst
Original file line number Diff line number Diff line change
Expand Up @@ -955,7 +955,7 @@ A029: MLU/X
* iedere herhaling (**covered**, including many but not all partial repetitions)
* iedere echolalie (**covered partially**)
* iedere mislukte poging om te komen tot realisatie van het doelwoord (**covered partially**)
* Ik ging zitten op de kast, nee stoel, nee bank Aantal woorden voor bepalen samplegrootte = 10, MLU=6
* Ik ging zitten op de kast, nee stoel, nee bank Aantal woorden voor bepalen samplegrootte = 10, MLU=6
* Streep /hé/, /goh/, /och/ etc. weg. (**covered**)
* Uitingen die deels onverstaanbaar zijn worden in hun geheel weggelaten (eventuele lexicale maten zijn dan al wel geteld) (**covered**)

Expand Down
2 changes: 1 addition & 1 deletion docs/stap.rst
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,7 @@ The query is defined as //node[%new_STAP_BB_t%], where the macro *new_STAP_BB_t*
%geledenBBt% or
%temporal_mwu%
) and %STAP_geen_BB%)
"""
"""

We discuss each of the macros used inside this query
Expand Down
6 changes: 5 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,8 @@ module = [
'xslxwriter',
'openpyxl','xlsx'
]
ignore_missing_imports = true
ignore_missing_imports = true


[tool.pyright]
reportInvalidTypeForm = false
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

setup(
name='sastadev',
version='0.1.5',
version='0.2.0',
description='Linguistic functions for SASTA tool',
long_description=long_description,
long_description_content_type='text/markdown',
Expand Down
128 changes: 38 additions & 90 deletions src/sastadev/ASTApostfunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
from copy import deepcopy
from typing import Dict, List, Optional, Tuple

from sastadev.allresults import AllResults
from sastadev.allresults import AllResults, ResultsKey, mkresultskey
from sastadev.lexicon import getwordinfo, getwordposinfo
from sastadev.sastatypes import Position, QId, SynTree, UttId
from sastadev.sastatypes import Position, SynTree, UttId
from sastadev.stringfunctions import getallrealwords, realwordstring
from sastadev.treebankfunctions import getattval, getnodeyield

Expand All @@ -25,8 +25,23 @@
mqid = 'A020'
tijdfoutpvqid = 'A041'
nounlemmaqid = 'A046'
formqid = 'A047'
verblemmaqid = 'A049'

nounreskey = mkresultskey(nounqid)
lexreskey = mkresultskey(lexqid)

samplesizereskey = mkresultskey(samplesizeqid)
mluxreskey = mkresultskey(mluxqid)

pvreskey = mkresultskey(pvqid)
delpvreskey = mkresultskey(delpvqid)
subpvreskey = mkresultskey(subpvqid)
tijdfoutpvreskey = mkresultskey(tijdfoutpvqid)
kreskey = mkresultskey(kqid)
mreskey = mkresultskey(mqid)
formreskey = mkresultskey(formqid)

specialform = 'Special Form'
errormarking = 'Error Marking'

Expand Down Expand Up @@ -108,9 +123,9 @@ def wordcountperutt(allresults):
wordcounts = {uttid: sum(ctr.values()) for uttid, ctr in lemmas.items()}
ignorewordcounts = deepcopy(
allresults.coreresults[
samplesizeqid]) if samplesizeqid in allresults.coreresults else Counter() # samplesize
samplesizereskey]) if samplesizereskey in allresults.coreresults else Counter() # samplesize
ignorewordcounts += allresults.coreresults[
mluxqid] if mluxqid in allresults.coreresults else Counter() # mlux
mluxreskey] if mluxreskey in allresults.coreresults else Counter() # mlux
# ignorewordcounts += allresults.coreresults['A050'] if 'A050' in allresults.coreresults else Counter() # echolalie covered by mlux
result = {}
for uttid in wordcounts:
Expand Down Expand Up @@ -156,7 +171,7 @@ def getcutoffpoint(allresults: AllResults, uttid: UttId, diff: int) -> int:
theutt = allresults.allutts[uttid]
final = diff
for i, w in enumerate(theutt):
if (uttid, i + 1) in allresults.exactresults[samplesizeqid]:
if (uttid, i + 1) in allresults.exactresults[samplesizereskey]:
final += 1
if i + 1 == final:
break
Expand All @@ -165,13 +180,13 @@ def getcutoffpoint(allresults: AllResults, uttid: UttId, diff: int) -> int:

def finietheidsindex(allresults, _):
allpvs = allresults.coreresults[
pvqid] if pvqid in allresults.coreresults else Counter()
pvreskey] if pvreskey in allresults.coreresults else Counter()
subpvs = allresults.coreresults[
subpvqid] if subpvqid in allresults.coreresults else Counter()
subpvreskey] if subpvreskey in allresults.coreresults else Counter()
delpvs = allresults.coreresults[
delpvqid] if delpvqid in allresults.coreresults else Counter()
delpvreskey] if delpvreskey in allresults.coreresults else Counter()
tijdfoutpvs = allresults.coreresults[
tijdfoutpvqid] if tijdfoutpvqid in allresults.coreresults else Counter()
tijdfoutpvreskey] if tijdfoutpvreskey in allresults.coreresults else Counter()
foutepvs = subpvs + delpvs + tijdfoutpvs
allpvcount = sumctr(allpvs)
foutepvcount = sumctr(foutepvs)
Expand All @@ -186,9 +201,9 @@ def finietheidsindex(allresults, _):
def countwordsandcutoff(allresults, _):
# @@to be adapted
result = (None, 0)
if 'A047' in allresults.postresults:
if formreskey in allresults.postresults:
paddedlist = []
for key, val in allresults.postresults['A047'].items():
for key, val in allresults.postresults[formreskey].items():
paddedkey = key.rjust(lpad, zero)
paddedlist.append((paddedkey, val))
sortedlist = sorted(paddedlist)
Expand All @@ -205,29 +220,16 @@ def countwordsandcutoff(allresults, _):

def KMcount(allresults, _):
Kcount = sumctr(
allresults.coreresults[kqid]) if kqid in allresults.coreresults else 0
allresults.coreresults[kreskey]) if kreskey in allresults.coreresults else 0
Mcount = sumctr(
allresults.coreresults[mqid]) if mqid in allresults.coreresults else 0
result = Kcount + Mcount
return result


def old_old_getlemmas(allresults, _):
allmatches = allresults.allmatches
allresults.postresults['A046'] = Counter()
for el in allmatches:
(qid, uttid) = el
if qid in ['A021', 'A018']:
for amatch in allmatches[el]:
# theword = normalizedword(amatch[0])
theword = getattval(amatch[0], 'lemma')
allresults.postresults['A046'].update([(theword, uttid)])
return allresults


def getlemmas(allresults, _):
result = getcondlemmas(allresults, _, lambda qid: qid in [nounqid, lexqid])
return result
# def getlemmas(allresults, _):
# result = getcondlemmas(allresults, _, lambda reskey: reskey in [nounreskey, lexreskey])
# return result


def getnounlemmas(allresults, _):
Expand All @@ -239,7 +241,7 @@ def getnounlemmas(allresults, _):
.. autofunction:: ASTApostfunctions::getposlemmas
'''
result = getposlemmas(allresults, nounqid)
result = getposlemmas(allresults, nounreskey)
return result


Expand All @@ -251,7 +253,7 @@ def getlexlemmas(allresults, _):
.. autofunction:: ASTApostfunctions::getposlemmas
'''
result = getposlemmas(allresults, lexqid)
result = getposlemmas(allresults, lexreskey)
return result


Expand All @@ -276,61 +278,6 @@ def getalllemmas(allresults):
return result


def old_getlemmas(allresults, _):
allmatches = allresults.allmatches
result = Counter()
for el in allmatches:
(qid, uttid) = el
if qid in ['A021', 'A018']:
for amatch in allmatches[el]:
# theword = normalizedword(amatch[0])
theword = getattval(amatch[0], 'lemma')
result.update([(theword, uttid)])
return result


def oldgetcondlemmas(allresults, _, cond):
allmatches = allresults.allmatches
result = Counter()
for el in allmatches:
(qid, uttid) = el
if cond(qid):
for amatch in allmatches[el]:
# theword = normalizedword(amatch[0])
theword = getattval(amatch[0], 'lemma')
result.update([(theword, uttid)])
return result


# not used anymore, contains an error
def getcondlemmas(allresults, _, cond):
result = Counter()
if allresults.annotationinput:
for qid in allresults.exactresults:
if cond(qid):
for (uttid, position) in allresults.exactresults[qid]:
word = allresults.allutts[uttid][position - 1]
if qid == 'A021':
pos = 'n'
elif qid == 'A018':
pos = 'ww'
else:
pos = None
lemma = bgetlemma(word, pos)
result.update([(lemma, qid, uttid)])

else:
allmatches = allresults.allmatches
for el in allmatches:
(qid, uttid) = el
if cond(qid):
for amatch in allmatches[el]:
# theword = normalizedword(amatch[0])
theword = getattval(amatch[0], 'lemma')
result.update([(theword, uttid)])
return result


def getposfromqid(qid):
if qid == 'A021':
pos = 'n'
Expand All @@ -341,26 +288,27 @@ def getposfromqid(qid):
return pos


def getposlemmas(allresults: AllResults, posqid: QId) -> List[Tuple[str, UttId]]:
def getposlemmas(allresults: AllResults, posreskey: ResultsKey) -> List[Tuple[str, UttId]]:
'''
The function *getposlemmas* obtains the lemmas from *allresults* that have been
found by a query with identifier *posqid*.
found by a query with identifier *posreskey*.
The lemma is obtained from the parse tree if there is one, otherwise (in case the
input was an annotation form) from the lexicon (CELEX).
'''
result = Counter()
if allresults.annotationinput:
for (uttid, position) in allresults.exactresults[posqid]:
for (uttid, position) in allresults.exactresults[posreskey]:
word = allresults.allutts[uttid][position - 1]
posqid = posreskey[0]
pos = getposfromqid(posqid)
lemma = bgetlemma(word, pos)
result.update([(lemma, uttid)])
else:
allmatches = allresults.allmatches
for el in allmatches:
(qid, uttid) = el
if qid == posqid:
(reskey, uttid) = el
if reskey == posreskey:
for amatch in allmatches[el]:
# theword = normalizedword(amatch[0])
theword = getattval(amatch[0], 'lemma')
Expand Down
6 changes: 4 additions & 2 deletions src/sastadev/CHAT_Annotation.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import re

from sastadev import cleanCHILDEStokens
# import sastadev.cleanCHILDEStokens
from sastadev.conf import settings
from sastadev.metadata import Meta, bpl_delete, bpl_replacement
from sastadev.sastatoken import Token, show
Expand Down Expand Up @@ -62,8 +63,9 @@ def refunction(x):
wordre = re.compile(fullwordpat)
# interpunction = r'(:?' + r'[!\?\.,;]' + '|' + u'[\u201C\u201D\u2039\u203A]' + r'|' + r'(?<=\s):' + r')'
interpunction = r'\-\-\-|\-\-|\-|\-' + r'|' + \
r'[!\?\.,;]' + '|' + \
u'[\u2013\u2014\u2015\u201C\u201D\u2039\u203A]' + r'|' + r'(?<=\s):'
r'[!\?\.,;]' + '|' + \
u'[\u2013\u2014\u2015\u201C\u201D\u2039\u203A]' + \
r'|' + r'(?<=\s):'
filenamepat = r'[\w\.]+'
fullfilenamepat = fullre(filenamepat)
fullfilenamere = re.compile(fullfilenamepat)
Expand Down
Loading

0 comments on commit fee15ce

Please sign in to comment.