Generator.py
import threading
from pathlib import Path
from typing import List, Set, Tuple, Optional
import os
from src.Core.Dataset.DiscreteScale import DiscreteScale
from src.Core.IWordSource import IWordSource
from src.Core.Languages.LinguisticContext import LinguisticContext
from src.Core.Morphology.MorphoLex.MorphoLexSegmentedDataset import MorphoLexSegmentedDataset
from src.Core.Morphology.POSTypes import POSTypes
from src.Core.Morphology.RootDetection.IRootDetector import IRootDetector
from src.Core.Morphology.RootDetection.SharingRootDetector import SharingRootDetector
from src.Core.Morphology.RootDetection.IRootDetectorStack import IRootDetectorStack
from src.Core.Morphology.RootDetection.RootDetectorCacher import RootDetectorCacher
from src.Core.Morphology.RootDetection.RootDetectorStackCacher import RootDetectorStackCacher
from src.Core.OSimUnrPipeline.EnglishPipeline import EnglishPipeline
from src.Core.OSimUnrPipeline.PipelineProviderBase import PipelineProviderBase
from src.Core.Orthographic.NormalizedStringSimilarity.EditDistance import EditDistance
from src.Core.Orthographic.OverlappingMeasures import OverlapCoefficient
from src.Core.Preprocessing.WordsFilterer import WordsFilterer
from src.Core.Segmentation.Tokenizers.ITokenizer import ITokenizer
from src.Core.Segmentation.Tokenizers.NLTKWhitespaceTokenizer import NLTKWhitespaceTokenizer
from src.Core.Segmentation.Tokenizers.TokenizerCacher import TokenizerCacher
from src.Core.Task.IWordRelatednessBinaryClassifier import IWordRelatednessBinaryClassifier
from src.Core.WordNet.Classifiers.BlacklistedConceptsWordNetRelatednessFilterer import \
BlacklistedConceptsWordNetRelatednessFilterer
from src.Core.WordNet.Classifiers.ConceptWiseWordNetRelatednessFilterer import ConceptWiseWordNetRelatednessFilterer
from src.Core.WordNet.Classifiers.DefinitionBasedRelatednessClassifier import DefinitionBasedRelatednessClassifier
from src.Core.WordNet.Classifiers.WordNetDerivationallyRelatedBinaryClassifier import \
WordNetDerivationallyRelatedBinaryClassifier
from src.Core.WordNet.IWordNet import IWordNet, WordNetSimilarityAlgorithms, Lemma2SynsetMatching
from src.Core.WordNet.NLTKWordNetWrapper import NLTKWordNetWrapper
from src.Core.WordNet.WordPairDefinitionSourceFilter import WordPairDefinitionSourceFilter
from src.Core.WordPair import WordPair
from src.Core.WordPairSynthesizer import WordPairSynthesizer
from src.Core.WordSim.IWordSimilarity import IWordSimilarity
from src.Core.WordSim.WordSimDataset import WordSimDataset
from src.Core.WordSim.WordSimilarityNormalizerWrapper import WordSimilarityNormalizerWrapper
from src.Tools import StringHelper, Resources
from src.Tools.FormatHelper import NoDecimal
from src.Tools.Logger import logp, log, logl, logpif
#region Console Settings
import warnings # https://stackoverflow.com/questions/15777951/how-to-suppress-pandas-future-warning/15778297
from src.Tools.Progressor import Progressor
warnings.simplefilter(action='ignore', category=FutureWarning)
#endregion
logp("Building common context for the study...", True)
Provider: PipelineProviderBase = EnglishPipeline(LinguisticContext.BuildEnglishContext(), EditDistance())
_Context: LinguisticContext = Provider.Context
_StudyName = "MyStudy"
def GetStudyPath() -> str:
return Resources.GetStudyPath(_StudyName)
_RndName = StringHelper.GenerateRandomStr(5)
_MaxRelatedness = 0.25
logp("Common context executed!", True)
def IsIWordSimilarityInstance(model) -> bool:
    """
    Special duck-typed check, required for performance reasons: some models cannot
    inherit from the IWordSimilarity base once cythonized, so fall back to name matching.
    :param model:
    :return:
    """
    if isinstance(model, IWordSimilarity): return True
    name = str(model)
    if name == "nedit" or name == "edit": return True
    if "over_ft" in name: return True
    if "jacc" in name: return True
    if "dice" in name: return True
    if "ngram" in name: return True
    return False
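# Illustrative checks (a sketch; assumes EditDistance implements IWordSimilarity and
# that a cythonized model stringifies to a name such as "over_ft_...", which is an assumption):
#   IsIWordSimilarityInstance(EditDistance())   # True via isinstance
#   IsIWordSimilarityInstance("over_ft_cc300")  # hypothetical name; True via name matching
#   IsIWordSimilarityInstance("word2vec")       # False: neither an instance nor a known name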
# region: 0: Common Helper
def snapshotSave(wordpairs, subDatasetName: str, scale: DiscreteScale) -> Optional[str]:
"""
Saves any list of word pairs.
:param wordpairs:
:param subDatasetName:
:param scale:
:return: Returns the path where it saved the file after saving.
"""
    if not wordpairs: return None  # Covers both None and an empty list.
logp("Saving dataset or dataset snapshot named: " + subDatasetName + " ...", anyMode=True)
dsName = subDatasetName + "-" + _RndName
dsOrthographicallySimilars: WordSimDataset = WordSimDataset(dsName, scale=scale)
dsOrthographicallySimilars.LoadWithWordPairs(wordpairs)
fpath = str(Path(StudyPathForLanguage()).joinpath(f'{dsName}.csv'))
dsOrthographicallySimilars.Persist(fpath)
logp("Saved snapshot: " + fpath + "(" + str(len(wordpairs)) + ")", anyMode=True)
return fpath
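# Illustrative usage (a sketch; the pair list is assumed to be built elsewhere):
#   path = snapshotSave(somePairs, "S2-OrthographicallySimilarsQ4-nedit", DiscreteScale(0, 1))
#   # -> saves to <study>/<language-code>/S2-OrthographicallySimilarsQ4-nedit-<_RndName>.csv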
def StudyPathForLanguage():
return Path(GetStudyPath()).joinpath(_Context.Language.Code.lower())
def SetContext(lcontext: LinguisticContext):
"""
    Sets the linguistic context before the paper's setup is accessed.
:param lcontext:
:return:
"""
global _Context
_Context = lcontext
#endregion
def S2_GenerateOrthographicallySimilarWordPairsExhaustive(wordpool: List[str], orthographicSim: IWordSimilarity, outputName3: str = None, outputName4: str = None,
minOrthographicSimQ4: float = 0.75, minOrthographicSimQ3: float = 0.5,
limitResults: int = None, resumeStage2: str = None,
snapshotPersistenceDetectedBatch: int = None, snapshotPersistenceBatchPercentage: int = 10,
snapshotCallback=None, tryLoadStage2SessionCallback=None, snapshotFinalScale: DiscreteScale = None) -> Tuple[List[WordPair], List[WordPair], int]:
"""
Always perform the cheaper task first.
:param snapshotPersistenceBatchPercentage: Specifies at which percentage intervals a snapshot will be taken.
:param wordSource:
:param orthographicSim:
:param snapshotPersistenceBatch: Saves a snapshot of extracted pairs after the specified number of detections. Must be between 0-100.
:return:
"""
# Validate
if (minOrthographicSimQ4 == 0 or minOrthographicSimQ3 == 0): raise Exception("Orthographic similarity thresholds cannot be 0.")
if (minOrthographicSimQ4 is None or minOrthographicSimQ3 is None): raise Exception("Orthographic similarity thresholds cannot be None.")
if (minOrthographicSimQ3 >= minOrthographicSimQ4): raise Exception("Q3 cannot be greater than Q4.")
if (not outputName4 or not outputName3): raise Exception("Output names are not provided.")
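    # Threshold semantics (illustrative, using minOrthographicSimQ3=0.5 and minOrthographicSimQ4=0.75):
    # a pair with sim=0.80 lands in the Q4 list, sim=0.60 lands in the Q3 list, and
    # sim=0.40 falls below both thresholds and is discarded.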
oSimName: str = str(orthographicSim)
logp("Going to use '" + oSimName + "' as the orthographic similarity algorithm!")
wpOrthographicallySimilarsQ3: List[WordPair] = []
wpOrthographicallySimilarsQ4: List[WordPair] = []
# region Resume Mode
startingChar: str = None
reachedToLastSession: bool = None
if (resumeStage2):
ds4 = tryLoadStage2SessionCallback(outputName4, resumeStage2)
ds3 = tryLoadStage2SessionCallback(outputName3, resumeStage2)
        lastWordProcessed = ds3.Wordpairs[-1].Word1  # Always continue from Q3.
        lastCharProcessed = lastWordProcessed[0]
        startingChar: str = _Context.Grammar.ToLowerCase(lastCharProcessed)
        logp("In the previous session, we stopped at the character '" + lastCharProcessed + "'; continuing from the beginning of the character '" + startingChar + "'...")
reachedToLastSession: bool = False
wpOrthographicallySimilarsQ3 = ds3.Wordpairs
wpOrthographicallySimilarsQ4 = ds4.Wordpairs
        # Delete the leftover pairs of the last processed character to avoid possible duplicates.
        # Assumes an ordered list; otherwise this would take too long.
        logl(str(len(wpOrthographicallySimilarsQ4)), "Q4.Items", anyMode=True)
        logp("Deleting word pairs for the last remaining characters to avoid duplicates...", anyMode=True)
        for index4, wp in enumerate(wpOrthographicallySimilarsQ4):
            if (wp.Word1[0] == startingChar):  # Assumes order! Truncates at the first pair starting with that character.
                wpOrthographicallySimilarsQ4 = wpOrthographicallySimilarsQ4[0:index4]
                break
        logl(str(len(wpOrthographicallySimilarsQ4)), "Q4.Items", anyMode=True)
        logl(str(len(wpOrthographicallySimilarsQ3)), "Q3.Items", anyMode=True)
        for index3, wp in enumerate(wpOrthographicallySimilarsQ3):
            if (wp.Word1[0] == startingChar):  # Same order assumption for Q3.
                wpOrthographicallySimilarsQ3 = wpOrthographicallySimilarsQ3[0:index3]
                break
        logl(str(len(wpOrthographicallySimilarsQ3)), "Q3.Items", anyMode=True)
        logp("Deletion process completed.", anyMode=True)
snapshotCallback(wpOrthographicallySimilarsQ3, outputName3, snapshotFinalScale) # Save again with a new ID after loading.
snapshotCallback(wpOrthographicallySimilarsQ4, outputName4, snapshotFinalScale)
#endregion
wpSynthesizer = WordPairSynthesizer().GeneratePossibleWordPairs(wordpool, allowSameWordsInAPair=False)
i: int = 1
estsize: int = int(NoDecimal((len(wordpool) * (len(wordpool) / 2)))) # Complexity: (n*n/2)
prog = Progressor(reportRemaniningTime=True, expectedIteration=estsize)
progBatchSize = 10 if estsize < 10000 else int(estsize / 10000)
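    # e.g., a 10,000-word pool gives estsize = 10,000 * 10,000 / 2 = 50,000,000 candidate
    # pairs, so progress is reported in batches to keep logging overhead negligible.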
wpCursor = None
    def saveInSeparateThread(wps, name, scale):
        """
        Persists a snapshot on a daemon thread so saving never blocks the generation loop.
        #ref:http://sebastiandahlgren.se/2014/06/27/running-a-method-as-a-background-thread-in-python/
        :param wps:
        :param name:
        :return:
        """
        thread = threading.Thread(target=snapshotCallback, args=(wps, name, scale))
        thread.daemon = True  # Daemonize thread
        thread.start()  # Start the execution
firstChar: str = None
for wp in wpSynthesizer:
firstChar = wp.Word1[0]
perc: float = prog.logpif(i, progressBatchSize=progBatchSize, anyMode=True, iterstr="iter-" + firstChar)
# region resume check
        if (reachedToLastSession is False):  # Only in resume mode (None means no resume).
if (firstChar == startingChar): # Skipping until reaching the stopped character; skipping is 10x faster than similarity calculation.
reachedToLastSession = True
else:
i = i + 1
continue
#endregion
# Do the job
try:
sim = orthographicSim.WordSimilarity(wp.Word1, wp.Word2) # Without this cost, generating all possibilities for EN takes only 36 minutes. 95% of time is spent on this operation!
except Exception as e:
print("Error in osim operation! Alg:" + str(orthographicSim) + ", wp:" + wp.ToPairDisplay())
print(e)
raise
if (sim >= minOrthographicSimQ3): # Ignore if smaller than both thresholds.
wp.SetOtherSimilarity(oSimName, sim)
wpCursor = wpOrthographicallySimilarsQ4 if sim >= minOrthographicSimQ4 else wpOrthographicallySimilarsQ3
wpCursor.append(wp)
extracted: int = len(wpCursor)
logpif(iter=len(wpCursor), iterstr="Detected", progressBatchSize=100, anyMode=True) # Log every 100 detections.
if (limitResults):
if (extracted >= limitResults):
return wpOrthographicallySimilarsQ3, wpOrthographicallySimilarsQ4, i - 1
# Snapshot by detected (disabled by default)
            if (snapshotPersistenceDetectedBatch):  # Save a snapshot after the specified number of detections.
                if (extracted % snapshotPersistenceDetectedBatch == 0):
                    saveInSeparateThread(wpOrthographicallySimilarsQ3, outputName3, snapshotFinalScale)
                    saveInSeparateThread(wpOrthographicallySimilarsQ4, outputName4, snapshotFinalScale)
            # Snapshot by percentage
            if (snapshotPersistenceBatchPercentage and perc):
                if (perc % snapshotPersistenceBatchPercentage == 0):  # Only at whole percentage points.
                    saveInSeparateThread(wpOrthographicallySimilarsQ3, outputName3, snapshotFinalScale)
                    saveInSeparateThread(wpOrthographicallySimilarsQ4, outputName4, snapshotFinalScale)
i = i + 1
return wpOrthographicallySimilarsQ3, wpOrthographicallySimilarsQ4, i - 1
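# Illustrative call (a sketch; the algorithm and thresholds are examples, not the study's
# fixed configuration):
#   q3, q4, iterations = S2_GenerateOrthographicallySimilarWordPairsExhaustive(
#       wordpool=sortedWordpool, orthographicSim=EditDistance(),
#       outputName3="S2-OrthographicallySimilarsQ3-nedit", outputName4="S2-OrthographicallySimilarsQ4-nedit",
#       minOrthographicSimQ3=0.5, minOrthographicSimQ4=0.75,
#       snapshotCallback=snapshotSave, snapshotFinalScale=DiscreteScale(0, 1))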
# noinspection PyUnresolvedReferences
# @with_goto
def S3_Run(orthographicallySimilarWpsPathQ4: str = None, autoPersist: bool = True, posFilters: List[POSTypes] = [POSTypes.NOUN], includeQ3: bool = True,
orthographicallySimilarsWithRelatednessPathQ4=None, maxRelatedness: float = 0.25, skip: str = ""):
"""
Executes the steps of Stage4-Morphological Relatedness Filtering.
:param orthographicallySimilarWpsPathQ4:
:param autoPersist:
:param posFilters:
:param includeQ3:
:param orthographicallySimilarsWithRelatednessPathQ4: If provided, skips to 4a and starts there.
:param maxRelatedness:
:param skip: Indicates the subprocesses to be skipped.
:return:
"""
# region Commons
dsOrthographicallySimilarsQ4 = None
dsOrthographicallySimilarsQ3 = None
finalScale = DiscreteScale(0, 1)
log("S3: Starting Stage3 Relatedness Filtering...", anyMode=True)
wnSimAlgorithm: WordNetSimilarityAlgorithms = Provider.CreateWordNetSimAlgorithm()
wnSimName: str = "Wn_" + str(wnSimAlgorithm.name).lower()
logl(wnSimName, "CreateWordNetSimAlgorithm")
log("S3: Creating WordNet similarity for the language...")
wordnetSim: IWordSimilarity = Provider.CreateWordNetForSimilarity(wnSimAlgorithm, wordSimPOSFilters=posFilters)
logl(str(wordnetSim), "WordNetSim Type", anyMode=True)
logp("S3: Calculating WordNet relatedness values for WordPairs...", anyMode=True)
if skip is None: skip = ""
# endregion
if orthographicallySimilarsWithRelatednessPathQ4 is not None:
        raise NotImplementedError("Skipping to 3a is not implemented yet!")
        # goto .Stage3a
# region Load
logp("S3: Loading", anyMode=True)
if not orthographicallySimilarWpsPathQ4: raise Exception("orthographicallySimilarWpsPathQ4 is null!")
orthographicallySimilarWpsPathQ3: str = orthographicallySimilarWpsPathQ4.replace("SimilarsQ4", "SimilarsQ3")
dsOrthographicallySimilarsQ3: WordSimDataset = None
# Q4
if orthographicallySimilarWpsPathQ4:
dsOrthographicallySimilarsQ4 = WordSimDataset(fullPath=orthographicallySimilarWpsPathQ4, linguisticContext=_Context)
dsOrthographicallySimilarsQ4.Load()
# Q3
if includeQ3:
dsOrthographicallySimilarsQ3 = WordSimDataset(fullPath=orthographicallySimilarWpsPathQ3, linguisticContext=_Context)
dsOrthographicallySimilarsQ3.Load()
# endregion
def setWordNetSimilarities(wn, ds, s2ExistingPath: str, allowNoneSims: bool = False) -> Tuple[str, str]:
"""
Adds WordNet similarity scores to the original dataset and saves it if necessary.
:param wn:
:param ds:
:param allowNoneSims: In the OSimUnr usage, since the word pool is selected from WordNet, in theory, every word should have a similarity score. If True, the process continues even if None; if False, an error is raised.
:param existingPath: Output of the S2 process without WordNet scores.
:return: Returns the name and physical path of the new S3 dataset.
"""
logp("Setting WordNet similarities for " + s2ExistingPath + " ...", anyMode=True)
scoreScale = wn.SimilarityScale()
wnEff: IWordSimilarity = None # Can be wrapped for normalization purposes if needed.
if not scoreScale.IsNormalized():
wnEff = WordSimilarityNormalizerWrapper(wn, ds.Wordpairs, finalScale)
else:
wnEff = wn
        for wpIndex, wp in enumerate(ds.Wordpairs):
            sim = wnEff.WordSimilarityInScale(wp.Word1, wp.Word2, finalScale)
            if sim is None and not allowNoneSims:
                raise Exception("WordNet similarity result is 'None'. Cannot proceed without enabling allowNoneSims mode; OSimUnr does not accept a None relatedness score. " + str(wp))
            ds.Wordpairs[wpIndex].SetOtherSimilarity(wnSimName, sim)
if autoPersist:
head, tail = os.path.split(s2ExistingPath)
newSubDatasetName = tail.replace("S2", "S3").replace("OrthographicallySimilars", "OrthographicallySimilarsWithWN")
            newSubDatasetName = newSubDatasetName[0:-10]  # Strip the trailing "-<sessionId>.csv" suffix (5-char id + extension = 10 chars).
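            # e.g., a hypothetical "S2-OrthographicallySimilarsQ4-nedit-abcde.csv" becomes
            # the dataset name "S3-OrthographicallySimilarsWithWNQ4-nedit".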
savedPath: str = snapshotSave(ds.Wordpairs, newSubDatasetName, finalScale)
return newSubDatasetName, savedPath
newS3DatasetNameQ4, newS3DatasetPathQ4 = setWordNetSimilarities(wordnetSim, dsOrthographicallySimilarsQ4, orthographicallySimilarWpsPathQ4)
if orthographicallySimilarsWithRelatednessPathQ4 is None:
orthographicallySimilarsWithRelatednessPathQ4 = newS3DatasetPathQ4
if includeQ3:
newS3DatasetNameQ3, newS3DatasetPathQ3 = setWordNetSimilarities(wordnetSim, dsOrthographicallySimilarsQ3, orthographicallySimilarWpsPathQ3)
# region 3a - Relatedness Filtering
# label .Stage3a
if dsOrthographicallySimilarsQ4 is None:
dsOrthographicallySimilarsQ4 = WordSimDataset(fullPath=orthographicallySimilarsWithRelatednessPathQ4, linguisticContext=_Context)
dsOrthographicallySimilarsQ4.Load()
if includeQ3:
orthographicallySimilarsWithRelatednessPathQ3: str = orthographicallySimilarsWithRelatednessPathQ4.replace("SimilarsWithWNQ4", "SimilarsWithWNQ3")
dsOrthographicallySimilarsQ3 = WordSimDataset(fullPath=orthographicallySimilarsWithRelatednessPathQ3, linguisticContext=_Context)
dsOrthographicallySimilarsQ3.Load()
# Initialize Root Detection Tools
logp("Initializing root detection dependencies...", anyMode=True)
rootDetector: IRootDetector = Provider.CreateRootDetector() # Will cache it because I want to use the same ML instance!
fastRootDetector: IRootDetector = Provider.CreateFastRootDetector() # No need to cache for the fast one; it's already dictionary-based!
sharingRootDetector = SharingRootDetector(rootDetector, useOutOfLexiconRoots=True)
wnDerRel: IWordRelatednessBinaryClassifier = Provider.CreateDerivationallyRelatedClassifier()
    priorPOS: POSTypes = posFilters[0] if len(posFilters) == 1 else None  # In a single-POS-filter experiment, conveying the POS info can increase success.
# Initialize D-stage Definition-based classifiers
logp("Initializing definition-based filtering dependencies...", anyMode=True)
defClassifier: DefinitionBasedRelatednessClassifier = Provider.CreateDefinitionBasedRelatednessClassifier(priorPOS, rootDetector, fastRootDetector)
    defClassifier.SkipReferencing = "3A4" in skip
    defClassifier.SkipKeywordInTypeHierarchy = "3A3" in skip
    defClassifier.SkipMutualMeaningfulAffixes = "3C3" in skip  # Not 3A5!
blacklistedFilterer: BlacklistedConceptsWordNetRelatednessFilterer = Provider.CreateBlacklistedConceptsFilterer(priorPOS)
conceptFilterer: ConceptWiseWordNetRelatednessFilterer = Provider.CreateConceptPairFilterer(priorPOS)
def detectUnrelateds(wordpairs, existingS3Path: str):
logp("detectUnrelateds...", anyMode=True)
        if not rootDetector: logp("Gloss-based relatedness filter will not be applied because the RootDetector is missing!")
unrelateds: List[WordPair] = []
eliminateds: List[WordPair] = [] # Related pairs eliminated during filtering.
if existingS3Path is not None:
logp("S3a Relatedness Filtering is about to begin for " + existingS3Path + " ...", anyMode=True)
else:
logp("existingS3Path is null. Operation terminated.", anyMode=True)
exit()
        prog = Progressor(expectedIteration=len(wordpairs.Wordpairs))
        batchSize: int = int(len(wordpairs.Wordpairs) / 100)
i = 0
for wp in wordpairs.Wordpairs:
wp.Reason: str = None
wp.SharedRoot = ""
prog.logpif(i, "wp", progressBatchSize=batchSize, anyMode=True)
isUnrelated: bool = True # Default assumption is unrelated. Ask the stages until a related judgment is made.
# Stage 3A1: WordNet Relatedness
            if isUnrelated and "3A1" not in skip:
                sim: float = wp.GetOtherSimilarity(wnSimName)
                if sim is not None:
                    isUnrelated = sim <= maxRelatedness
# Additional filtering stages...
# Finalize
if wp.Reason is not None:
wp.Note = wp.Reason
if wp.SharedRoot: wp.Note = wp.Note + ". Root:" + wp.SharedRoot
if isUnrelated:
unrelateds.append(wp)
else:
eliminateds.append(wp)
i += 1
if autoPersist:
head, tail = os.path.split(existingS3Path)
newSubDatasetName = tail.replace("S3", "S3a").replace("OrthographicallySimilarsWithWN", "OrthographicallySimilarButUnrelateds")
snapshotSave(unrelateds, newSubDatasetName[0:-10], finalScale)
newRootSubDatasetName = tail.replace("S3", "S3a").replace("OrthographicallySimilarsWithWN", "OrthographicallySimilarAndSharingRoots")
snapshotSave(eliminateds, newRootSubDatasetName[0:-10], finalScale)
logp("Relatedness filtering ended.", anyMode=True)
detectUnrelateds(dsOrthographicallySimilarsQ4, orthographicallySimilarsWithRelatednessPathQ4)
if includeQ3:
pathQ3: str = orthographicallySimilarsWithRelatednessPathQ4.replace("SimilarsWithWNQ4", "SimilarsWithWNQ3")
detectUnrelateds(dsOrthographicallySimilarsQ3, pathQ3)
logp(_StudyName + " S3a process completed.", anyMode=True)
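# Illustrative call (a sketch; the Q4 path is hypothetical and would normally be the path
# returned by snapshotSave() at the end of Stage 2):
#   S3_Run(orthographicallySimilarWpsPathQ4="<study>/en/S2-OrthographicallySimilarsQ4-nedit-abcde.csv",
#          posFilters=[POSTypes.NOUN], includeQ3=True, maxRelatedness=0.25)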
def RunStudy(wordPosFilters: List[POSTypes] = None, preExtractedWordPairsPath=None, wordpoolPath=None, wordpairLimit: int = None, autoPersist=True, limitWordCands: int = None,
wordpairsPath: str = None, minOrthographicSimQ4: float = None, minOrthographicSimQ3: float = None, orthographicSim: IWordSimilarity = None, resumeStage2: str = None, s1Only: bool = False,
allowAccentDuplicates: bool = True, resumeStage3and4: bool = True, maxRelatedness: float = 0.25):
"""
:param resumeStage2: If the session ID of a previously incomplete stage2 is provided, it continues from there. If None, it calculates a new session from scratch. Default: None
:param wordPosFilters: If None, all words found are used. If filters are provided, only those POS words are included in the pipeline at the wordpool level.
:param preExtractedWordPairsPath:
:param wordpoolPath:
:param wordpairLimit:
:param autoPersist:
:param limitWordCands:
:param wordpairsPath:
:param minOrthographicSimQ4:
:param minOrthographicSimQ3:
:param allowAccentDuplicates: If True, allows matches like "harekât-harekat". If False, keeps the accented version and removes unaccented matches.
:return:
"""
finalScale = DiscreteScale(0, 1)
    _MaxRelatedness = maxRelatedness  # Shadows the module-level default for this run.
# General Warnings
if (wordPosFilters is None): logp("No POS Filter defined for the entire pipeline! Are you sure? Ignore this if you are using an existing pool file!")
def tryLoadStage2Session(subDatasetName: str, sessionId: str) -> WordSimDataset:
logp("Loading dataset or dataset snapshot named: " + subDatasetName + " ...", anyMode=True)
logp("For session: " + sessionId)
dsName = subDatasetName + "-" + sessionId
fpath = str(Path(StudyPathForLanguage().joinpath(f'{dsName}.csv')))
dsOrthographicallySimilars: WordSimDataset = WordSimDataset(fullPath=fpath, linguisticContext=_Context)
        dsOrthographicallySimilars.Load(autoLowerCase=False)
logp("Loaded snapshot: " + fpath + "(" + str(len(dsOrthographicallySimilars.Wordpairs)) + ")", anyMode=True)
return dsOrthographicallySimilars
def saveWordpool(wordpool, filePath):
os.makedirs(os.path.dirname(filePath), exist_ok=True)
with open(filePath, 'w', encoding="utf-8-sig") as f:
for w in wordpool:
f.write("%s\n" % w)
logp("Wordpool saved. " + filePath)
# endregion
# region S1-S2: CANDIDATE SELECTION
dsOrthographicallySimilars: WordSimDataset = None
if (wordpairsPath):
dsOrthographicallySimilars = WordSimDataset(fullPath=wordpairsPath, linguisticContext=_Context)
dsOrthographicallySimilars.Load()
else:
if (preExtractedWordPairsPath):
dsOrthographicallySimilars = WordSimDataset(preExtractedWordPairsPath)
dsOrthographicallySimilars.Load()
else:
# 1.2: Load Wordpool from existing
wordpool = []
sortedWordpool: List = None
if (wordpoolPath):
with open(wordpoolPath, encoding="utf-8") as file:
for w in file:
wordpool.append(w.strip())
log("Wordpool loaded. Size:" + str(len(wordpool)), anyMode=True)
log(wordpoolPath)
sortedWordpool = sorted(wordpool)
else:
# region S1.2: Prepare WordPool
                # Initial wordpool: the word source only fetches single words, so the raw (multi-word) state cannot be retrieved here.
wordsource: IWordSource = Provider.GetWordSource()
# Single WordPool
singleWordpool: Set[str] = wordsource.GetWords(posFilter=None)
logp("S1-SingleWordPoolSize: " + str(len(singleWordpool)))
saveWordpool(singleWordpool, str(Path(StudyPathForLanguage().joinpath(f'S1-SingleWordPool-{_RndName}.txt'))))
# S1-POS Filtering
                if (wordPosFilters is None or len(wordPosFilters) == 0):
wordpoolAllPos = singleWordpool
else:
wordpoolAllPos = set()
for pos in wordPosFilters:
wordpoolForPos: Set[str] = wordsource.GetWords(pos)
logp("S1-" + str(pos.name) + "-WordPoolSize: " + str(len(wordpoolForPos)))
saveWordpool(sorted(wordpoolForPos), str(Path(StudyPathForLanguage(),f'S1-{pos.name}-WordPool-{_RndName}.txt')))
wordpoolAllPos = wordpoolAllPos.union(wordpoolForPos)
# Make all distinct and lowercase
wordpoolAllPos2: Set[str] = set()
for wordRaw in wordpoolAllPos:
wordpoolAllPos2.add(_Context.Grammar.ToLowerCase(wordRaw))
logp("S1-POSFilteredWordPoolSize: " + str(len(wordpoolAllPos2)))
saveWordpool(wordpoolAllPos2, str(Path(StudyPathForLanguage(),f'S1-POSFilteredWordPool-{_RndName}.txt')))
# Filter
logp("Applying WordFilter for " + str(len(wordpoolAllPos2)) + " ...")
logp("Following word filters are hardcoded: minLength=6, allowPunctuation=False, allowNumbers=False")
wordpoolFinal, removed = WordsFilterer().ToFiltered(wordpoolAllPos2, minLength=6, allowPunctuation=False, allowNumbers=False)
logp(str(len(wordpoolFinal)) + " words left after filtering. Removed: " + str(removed))
logp("S1-LengthAndPunctFilteredWordPool: " + str(len(wordpoolFinal)))
saveWordpool(wordpoolFinal, str(Path(StudyPathForLanguage()).joinpath(f'S1-LengthAndPunctFilteredWordPool-{_RndName}.txt')))
# region AccentedDuplicates
# This process filters out non-accented versions of words with accents. Disabled by default.
if (not allowAccentDuplicates):
wordpoolFinalAccentSafe: Set[str] = set(wordpoolFinal) # Clone to a new set.
accentedCount: int = 0
for w in wordpoolFinal:
if (_Context.Grammar.HasAccent(w)):
wReduced: str = _Context.Grammar.ReduceAccents(w)
wordpoolFinalAccentSafe.discard(wReduced)
accentedCount = accentedCount + 1
logp(str(accentedCount) + " words were found with accents. Possible reduced versions have been removed.")
saveWordpool(wordpoolFinalAccentSafe, str(Path(StudyPathForLanguage()).joinpath(f'S1-FinalAccentSafe-{_RndName}.txt'))) # Unsorted
wordpoolFinal = wordpoolFinalAccentSafe
# endregion
# Persist
                if (limitWordCands): wordpoolFinal = sorted(wordpoolFinal)[:limitWordCands]  # Sort first: slicing an unordered set would be non-deterministic (and fails on a set).
sortedWordpool = sorted(wordpoolFinal)
saveWordpool(sortedWordpool, str(Path(StudyPathForLanguage()).joinpath(f'S1-FinalWordPool-{_RndName}.txt')))
# endregion
# 2: GENERATE WORDPAIRS
if (s1Only):
log("S1 completed. Exiting due to S1Only mode!", anyMode=True)
exit()
log("Starting Stage2...", anyMode=True)
if (not orthographicSim): raise Exception("No OrthographicSim defined. Cannot continue to Stage2!")
oSimName: str = str(orthographicSim).lower()
logl(oSimName, "oSimAlg", anyMode=True)
outputName3: str = "S2-OrthographicallySimilarsQ3-" + oSimName
outputName4: str = "S2-OrthographicallySimilarsQ4-" + oSimName
orthographicallySimilarQ3, orthographicallySimilarQ4, totalSpace = S2_GenerateOrthographicallySimilarWordPairsExhaustive(
wordpool=sortedWordpool,
orthographicSim=orthographicSim, minOrthographicSimQ3=minOrthographicSimQ3, minOrthographicSimQ4=minOrthographicSimQ4, limitResults=wordpairLimit,
            snapshotPersistenceDetectedBatch=None,  # Detection-count snapshots are disabled by default.
snapshotPersistenceBatchPercentage=4 if autoPersist else None,
snapshotCallback=snapshotSave, outputName3=outputName3, outputName4=outputName4,
tryLoadStage2SessionCallback=tryLoadStage2Session, resumeStage2=resumeStage2, snapshotFinalScale=finalScale)
print("totalSpace: " + str(totalSpace))
        # Save as WordSim Dataset
        if (autoPersist):
            pathQ3: Optional[str] = snapshotSave(orthographicallySimilarQ3, outputName3, finalScale)
            pathQ4: Optional[str] = snapshotSave(orthographicallySimilarQ4, outputName4, finalScale)
            # Resume Stages 3 and 4
            if (resumeStage3and4):
                S3_Run(posFilters=wordPosFilters, orthographicallySimilarWpsPathQ4=pathQ4, autoPersist=autoPersist,
                       maxRelatedness=_MaxRelatedness)
        else:
            raise Exception("AutoPersist is disabled. Cannot continue to Stage 3 without saving the results.")
# endregion
logp("Process completed for " + _StudyName, anyMode=True)
def GenerateDataset(wordpoolPath: str = None, wordpairsPath: str = None,
                    resume: str = None, s1Only: bool = False, limitWordCands: int = None, minOrthographicSimQ4: float = None, minOrthographicSimQ3: float = None,
                    wordPosFilters: List[POSTypes] = None, resumeStage3and4=True, maxRelatedness: float = 0.25):
if wordPosFilters is None:
wordPosFilters = []
oSimAlg = Provider.GetOrthographicSimilarityAlgorithm()
RunStudy(
wordPosFilters=wordPosFilters, limitWordCands=limitWordCands, minOrthographicSimQ3=minOrthographicSimQ3, minOrthographicSimQ4=minOrthographicSimQ4,
orthographicSim=oSimAlg,
wordpoolPath=wordpoolPath,
wordpairsPath=wordpairsPath, wordpairLimit=None,
resumeStage2=resume, s1Only=s1Only, resumeStage3and4=resumeStage3and4, maxRelatedness=maxRelatedness
)
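# Illustrative entry point, kept as a comment because importing this module already builds
# the pipeline context; the thresholds merely mirror values used elsewhere in this file:
# if __name__ == "__main__":
#     GenerateDataset(wordPosFilters=[POSTypes.NOUN],
#                     minOrthographicSimQ3=0.5, minOrthographicSimQ4=0.75,
#                     maxRelatedness=0.25)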