From 8018c477a5f2e5ff5fd9c90cf16461dfdf8880c0 Mon Sep 17 00:00:00 2001 From: Jelte van Boheemen Date: Wed, 15 May 2024 17:32:59 +0200 Subject: [PATCH] Read unaligned colum even if before words --- setup.py | 2 +- src/sastadev/SAFreader.py | 102 ++------------------------------------ 2 files changed, 4 insertions(+), 100 deletions(-) diff --git a/setup.py b/setup.py index b3aa8cd..bbef96f 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ setup( name='sastadev', - version='0.2.0', + version='0.2.1', description='Linguistic functions for SASTA tool', long_description=long_description, long_description_content_type='text/markdown', diff --git a/src/sastadev/SAFreader.py b/src/sastadev/SAFreader.py index 4b2bf46..99a6b11 100644 --- a/src/sastadev/SAFreader.py +++ b/src/sastadev/SAFreader.py @@ -162,104 +162,6 @@ def getcleanlevelsandlabels(thelabelstr: str, thelevel: str, prefix: str, allval return results -def oldget_annotations(infilename: FileName, patterns: Tuple[Pattern, Pattern]) \ - -> Tuple[UttWordDict, Dict[Tuple[Level, Item], List[Tuple[UttId, Position]]]]: - ''' - Reads the file with name filename in SASTA Annotation Format - :param infilename: - :param patterns - :return: a dictionary with as key a tuple (level, item) and as value a list of (uttid, tokenposition) pairs - ''' - - thedata = defaultdict(list) - - allutts = {} - - # To open Workbook - header, data = xlsx.getxlsxdata(infilename) - - levelcol = 1 - uttidcol = 0 - stagescol = -1 - commentscol = -1 - unalignedcol = -1 - - # uttlevel = 'utt' - - uttcount = 0 - - for col, val in enumerate(header): - if iswordcolumn(val): - lastwordcol = col - if isfirstwordcolumn(val): - firstwordcol = col - elif clean(val) in speakerheaders: - spkcol = col - elif clean(val) in uttidheaders: - uttidcol = col - elif clean(val) in levelheaders: - levelcol = col - elif clean(val) in stagesheaders: - stagescol = col - elif clean(val) in commentsheaders: - commentscol = col - elif clean(val) in unalignedheaders: - unalignedcol = col - else: - pass # maybe warn here that an unknow column header has been encountered? - - for row in data: - if row[uttidcol] != "": - # this might go wrong if there is no integer there @@make it robust - uttid = str(int(row[uttidcol])) - thelevel = row[levelcol] - thelevel = clean(thelevel) - all_levels.add(thelevel) - # if thelevel == uttlevel: - # uttcount += 1 - curuttwlist = [] - for colctr in range(firstwordcol, len(row)): - if thelevel.lower() in uttidheaders: - rawcurcellval = str(row[colctr]) - curcellval = getname(rawcurcellval) - if curcellval != '': - curuttwlist.append(curcellval) - elif thelevel in literallevels and colctr != stagescol and colctr != commentscol: - rawthelabel = str(row[colctr]) - thelabel = getname(rawthelabel) - if colctr > lastwordcol: - tokenposition = 0 - else: - tokenposition = colctr - firstwordcol + 1 - cleanlevel = thelevel - cleanlabel = thelabel - if cleanlabel != '': - thedata[(cleanlevel, cleanlabel)].append( - (uttid, tokenposition)) - elif not isuttlevel(thelevel) and colctr != stagescol and colctr != commentscol: - thelabelstr = row[colctr] - thelevel = row[levelcol] - if colctr == unalignedcol: - prefix = '' - if lastwordcol + 1 <= colctr < len(row): - # prefix = headers[colctr] aangepast om het simpeler te houden - prefix = "" - else: - prefix = "" - cleanlevelsandlabels = getcleanlevelsandlabels( - thelabelstr, thelevel, prefix, patterns) - if colctr > lastwordcol or colctr == unalignedcol: - tokenposition = 0 - else: - tokenposition = colctr - firstwordcol + 1 - for (cleanlevel, cleanlabel) in cleanlevelsandlabels: - thedata[(cleanlevel, cleanlabel)].append( - (uttid, tokenposition)) - if curuttwlist != []: - allutts[uttid] = curuttwlist - return allutts, thedata - - def get_annotations(infilename: FileName, allitems: List[str], themethod: Method) \ -> Tuple[UttWordDict, Dict[Tuple[Level, Item], List[Tuple[UttId, Position]]]]: ''' @@ -307,6 +209,8 @@ def get_annotations(infilename: FileName, allitems: List[str], themethod: Method else: pass # maybe warn here that an unknow column header has been encountered? + startcol = min( + [col for col in [firstwordcol, unalignedcol, commentscol, stagescol]]) for row in data: if row[uttidcol] != "": # this might go wrong if there is no integer there @@make it robust @@ -317,7 +221,7 @@ def get_annotations(infilename: FileName, allitems: List[str], themethod: Method # if thelevel == uttlevel: # uttcount += 1 curuttwlist = [] - for colctr in range(firstwordcol, len(row)): + for colctr in range(startcol, len(row)): if thelevel.lower() in uttidheaders: rawcurcellval = str(row[colctr]) curcellval = getname(rawcurcellval)