Skip to content

Commit

Permalink
Settings: Add Settings - Measures - Readability - Coleman's Readabili…
Browse files Browse the repository at this point in the history
…ty Formula; Work Area: Add Profiler - Readability - Coleman's Readability Formula
  • Loading branch information
BLKSerene committed Jul 27, 2023
1 parent 8ff158d commit c5e200b
Show file tree
Hide file tree
Showing 14 changed files with 780 additions and 306 deletions.
4 changes: 2 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,13 @@

## [3.3.0](https://github.com/BLKSerene/Wordless/releases/tag/3.3.0) - ??/??/2023
### 🎉 New Features
- Settings: Add Settings - Measures - Readability - Bormuth's Grade Placement / Flesch Reading Ease
- Settings: Add Settings - Measures - Readability - Bormuth's Grade Placement / Coleman's Readability Formula / Flesch Reading Ease
- Utils: Add khmer-nltk's Khmer sentence tokenizer, word tokenizer, and part-of-speech tagger
- Utils: Add PyThaiNLP's perceptron part-of-speech tagger (Blackboard)
- Utils: Add spaCy's Korean sentence recognizer, word tokenizer, part-of-speech tagger, lemmatizer, and dependency parser
- Utils: Add spaCy's Malay word tokenizer
- Utils: Add spaCy's Slovenian sentence recognizer, part-of-speech tagger, lemmatizer, and dependency parser
- Work Area: Add Profiler - Readability - Bormuth's Cloze Mean / Bormuth's Grade Placement
- Work Area: Add Profiler - Readability - Bormuth's Cloze Mean / Bormuth's Grade Placement / Coleman's Readability Formula

### ✨ Improvements
- Utils: Update Wordless's sentence and sentence segment splitters
Expand Down
96 changes: 54 additions & 42 deletions doc/doc_eng.md

Large diffs are not rendered by default.

411 changes: 411 additions & 0 deletions doc/measures/readability/colemans_readability_formula.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
53 changes: 21 additions & 32 deletions doc/measures/readability/lensear_write.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
120 changes: 54 additions & 66 deletions doc/measures/readability/re_simplified.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
25 changes: 6 additions & 19 deletions doc/measures/readability/rgl.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
30 changes: 12 additions & 18 deletions doc/measures/readability/smog_grade.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
150 changes: 68 additions & 82 deletions doc/measures/readability/wstf.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
69 changes: 51 additions & 18 deletions tests/wl_tests_measures/test_measures_readability.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@
from tests import wl_test_init
from wordless.wl_measures import wl_measures_readability

main = wl_test_init.Wl_Test_Main()

class Wl_Test_Text():
def __init__(self, tokens_multilevel, lang = 'eng_us'):
super().__init__()
Expand All @@ -31,6 +29,9 @@ def __init__(self, tokens_multilevel, lang = 'eng_us'):
self.lang = lang
self.tokens_multilevel = tokens_multilevel

main = wl_test_init.Wl_Test_Main()
settings = main.settings_custom['measures']['readability']

TOKENS_MULTILEVEL_0 = []
TOKENS_MULTILEVEL_12 = [[[['This', 'is', 'a', 'sentence', '.']], [['This', 'is', 'a', 'sentence', '.']]], [[['This', 'is', 'a', 'sen-tence0', '.']]]]
TOKENS_MULTILEVEL_12_PROPN = [[[['This', 'is', 'a', 'sentence', '.']], [['This', 'is', 'a', 'sentence', '.']]], [[['Louisiana', 'readability', 'boxes', 'created', '.']]]]
Expand Down Expand Up @@ -150,6 +151,33 @@ def test_coleman_liau_index():
assert grade_level_eng_0 == 'text_too_short'
assert grade_level_eng_12 == grade_level_spa_12 == -27.4004 * (est_cloze_pct / 100) + 23.06395

def test_colemans_readability_formula():
    """Check Coleman's Readability Formula for an empty text, each of the 4 variants, and an unsupported language."""
    cloze_pct_eng_0 = wl_measures_readability.colemans_readability_formula(main, test_text_eng_0)

    # The variant is read from the settings on every call, so it is switched before each run
    cloze_pcts_eng_12 = {}

    for variant in ('1', '2', '3', '4'):
        settings['colemans_readability_formula']['variant'] = variant
        cloze_pcts_eng_12[variant] = wl_measures_readability.colemans_readability_formula(main, test_text_eng_12)

    cloze_pct_other_12 = wl_measures_readability.colemans_readability_formula(main, test_text_other_12)

    print("Coleman's Readability Formula:")
    print(f'\teng/0: {cloze_pct_eng_0}')

    for variant, cloze_pct in cloze_pcts_eng_12.items():
        print(f'\teng/12-{variant}: {cloze_pct}')

    print(f'\tother/12: {cloze_pct_other_12}')

    assert cloze_pct_eng_0 == 'text_too_short'
    # Every predictor is expressed per 100 words: 9 one-syllable words, 3 sentences, 0 pronouns, and 0 prepositions in 12 words
    assert cloze_pcts_eng_12['1'] == 1.29 * (9 / 12 * 100) - 38.45
    assert cloze_pcts_eng_12['2'] == 1.16 * (9 / 12 * 100) + 1.48 * (3 / 12 * 100) - 37.95
    assert cloze_pcts_eng_12['3'] == 1.07 * (9 / 12 * 100) + 1.18 * (3 / 12 * 100) + 0.76 * (0 / 12 * 100) - 34.02
    assert cloze_pcts_eng_12['4'] == 1.04 * (9 / 12 * 100) + 1.06 * (3 / 12 * 100) + 0.56 * (0 / 12 * 100) - 0.36 * (0 / 12 * 100) - 26.01
    assert cloze_pct_other_12 == 'no_support'

def test_dale_chall_readability_score():
x_c50_eng_0 = wl_measures_readability.dale_chall_readability_score(main, test_text_eng_0)
x_c50_eng_12 = wl_measures_readability.dale_chall_readability_score(main, test_text_eng_12)
Expand Down Expand Up @@ -198,19 +226,19 @@ def test_flesch_reading_ease():
flesch_re_eng_0 = wl_measures_readability.flesch_reading_ease(main, test_text_eng_0)
flesch_re_eng_12 = wl_measures_readability.flesch_reading_ease(main, test_text_eng_12)

main.settings_custom['measures']['readability']['re']['variant_nld'] = 'Douma'
settings['re']['variant_nld'] = 'Douma'
flesch_re_nld_12_douma = wl_measures_readability.flesch_reading_ease(main, test_text_nld_12)
main.settings_custom['measures']['readability']['re']['variant_nld'] = "Brouwer's Leesindex A"
settings['re']['variant_nld'] = "Brouwer's Leesindex A"
flesch_re_nld_12_brouwer = wl_measures_readability.flesch_reading_ease(main, test_text_nld_12)

flesch_re_fra_12 = wl_measures_readability.flesch_reading_ease(main, test_text_fra_12)
flesch_re_deu_12 = wl_measures_readability.flesch_reading_ease(main, test_text_deu_12)
flesch_re_ita_12 = wl_measures_readability.flesch_reading_ease(main, test_text_ita_12)
flesch_re_rus_12 = wl_measures_readability.flesch_reading_ease(main, test_text_rus_12)

main.settings_custom['measures']['readability']['re']['variant_spa'] = 'Fernández Huerta'
settings['re']['variant_spa'] = 'Fernández Huerta'
flesch_re_spa_12_fh = wl_measures_readability.flesch_reading_ease(main, test_text_spa_12)
main.settings_custom['measures']['readability']['re']['variant_spa'] = 'Szigriszt Pazos'
settings['re']['variant_spa'] = 'Szigriszt Pazos'
flesch_re_spa_12_sp = wl_measures_readability.flesch_reading_ease(main, test_text_spa_12)

flesch_re_afr_12 = wl_measures_readability.flesch_reading_ease(main, test_text_afr_12)
Expand All @@ -219,14 +247,14 @@ def test_flesch_reading_ease():
print('Flesch Reading Ease:')
print(f'\teng/0: {flesch_re_eng_0}')
print(f'\teng/12: {flesch_re_eng_12}')
print(f'\tnld-douma/12: {flesch_re_nld_12_douma}')
print(f'\tnld-brouwer/12: {flesch_re_nld_12_brouwer}')
print(f'\tnld/12-douma: {flesch_re_nld_12_douma}')
print(f'\tnld/12-brouwer: {flesch_re_nld_12_brouwer}')
print(f'\tfra/12: {flesch_re_fra_12}')
print(f'\tdeu/12: {flesch_re_deu_12}')
print(f'\tita/12: {flesch_re_ita_12}')
print(f'\trus/12: {flesch_re_rus_12}')
print(f'\tspa-fh/12: {flesch_re_spa_12_fh}')
print(f'\tspa-sp/12: {flesch_re_spa_12_sp}')
print(f'\tspa/12-fh: {flesch_re_spa_12_fh}')
print(f'\tspa/12-sp: {flesch_re_spa_12_sp}')
print(f'\tafr/12: {flesch_re_afr_12}')
print(f'\tother/12: {flesch_re_other_12}')

Expand Down Expand Up @@ -453,18 +481,22 @@ def test_spache_grade_level():

def test_wiener_sachtextformel():
wstf_deu_0 = wl_measures_readability.wiener_sachtextformel(main, test_text_deu_0)
wstf_deu_12_1 = wl_measures_readability.wiener_sachtextformel(main, test_text_deu_12, variant = '1')
wstf_deu_12_2 = wl_measures_readability.wiener_sachtextformel(main, test_text_deu_12, variant = '2')
wstf_deu_12_3 = wl_measures_readability.wiener_sachtextformel(main, test_text_deu_12, variant = '3')
wstf_deu_12_4 = wl_measures_readability.wiener_sachtextformel(main, test_text_deu_12, variant = '4')
settings['wstf']['variant'] = '1'
wstf_deu_12_1 = wl_measures_readability.wiener_sachtextformel(main, test_text_deu_12)
settings['wstf']['variant'] = '2'
wstf_deu_12_2 = wl_measures_readability.wiener_sachtextformel(main, test_text_deu_12)
settings['wstf']['variant'] = '3'
wstf_deu_12_3 = wl_measures_readability.wiener_sachtextformel(main, test_text_deu_12)
settings['wstf']['variant'] = '4'
wstf_deu_12_4 = wl_measures_readability.wiener_sachtextformel(main, test_text_deu_12)
wstf_eng_12 = wl_measures_readability.wiener_sachtextformel(main, test_text_eng_12)

print('Wiener Sachtextformel:')
print(f'\tdeu/0: {wstf_deu_0}')
print(f'\tdeu-1/12: {wstf_deu_12_1}')
print(f'\tdeu-2/12: {wstf_deu_12_2}')
print(f'\tdeu-3/12: {wstf_deu_12_3}')
print(f'\tdeu-4/12: {wstf_deu_12_4}')
print(f'\tdeu/12-1: {wstf_deu_12_1}')
print(f'\tdeu/12-2: {wstf_deu_12_2}')
print(f'\tdeu/12-3: {wstf_deu_12_3}')
print(f'\tdeu/12-4: {wstf_deu_12_4}')
print(f'\teng/12: {wstf_eng_12}')

ms = 0 / 12
Expand All @@ -485,6 +517,7 @@ def test_wiener_sachtextformel():
test_bormuths_cloze_mean()
test_bormuths_gp()
test_coleman_liau_index()
test_colemans_readability_formula()
test_dale_chall_readability_score()
test_devereux_readability_index()
test_flesch_kincaid_grade_level()
Expand Down
2 changes: 1 addition & 1 deletion tests/wl_tests_work_area/test_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def update_gui(err_msg, texts_stats_files):
count_tokens_lens_syls.append(collections.Counter(len_tokens_syls))
count_tokens_lens_chars.append(collections.Counter(len_tokens_chars))

assert len(readability_statistics) == 24
assert len(readability_statistics) == 25

# Counts
assert count_paras
Expand Down
94 changes: 71 additions & 23 deletions wordless/wl_measures/wl_measures_readability.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,9 @@ def get_count_words_dale(words, num_easy_words):
# Automated Arabic Readability Index
# Reference: Al-Tamimi, A., Jaradat, M., Aljarrah, N., & Ghanim, S. (2013). AARI: Automatic Arabic readability index. The International Arab Journal of Information Technology, 11(4), pp. 370–378.
def automated_ara_readability_index(main, text):
text = get_counts(main, text)

if text.lang == 'ara':
text = get_counts(main, text)

if text.count_words and text.count_sentences:
aari = (
3.28 * text.count_chars_alphanumeric
Expand Down Expand Up @@ -164,9 +164,9 @@ def automated_readability_index(main, text):
# Bormuth's Cloze Mean & Grade Placement
# Reference: Bormuth, J. R. (1969). Development of readability analyses. U.S. Department of Health, Education, and Welfare. http://files.eric.ed.gov/fulltext/ED029166.pdf
def bormuths_cloze_mean(main, text):
text = get_counts(main, text)

if text.lang.startswith('eng_'):
text = get_counts(main, text)

if text.count_sentences and text.count_words:
ddl = get_count_words_dale(text.words_flat, 3000)
m = (
Expand Down Expand Up @@ -219,12 +219,62 @@ def coleman_liau_index(main, text):

return grade_level

# Coleman's Readability Formula
# Reference: Liau, T. L., Bassin, C. B., Martin, C. J., & Coleman, E. B. (1976). Modification of the Coleman readability formulas. Journal of Reading Behavior, 8(4), 381–386. https://journals.sagepub.com/doi/pdf/10.1080/10862967609547193
def colemans_readability_formula(main, text):
    """Estimate a text's cloze completion percentage with Coleman's Readability Formula.

    The variant of the formula ('1' to '4') is read from
    main.settings_custom['measures']['readability']['colemans_readability_formula']['variant'].
    Every predictor is expressed as a count per 100 words.

    Args:
        main: Object whose settings_custom holds the selected variant; it is
            also forwarded to get_counts() and to the part-of-speech tagger.
        text: Text object exposing lang; it is passed through get_counts()
            before its word, sentence, and syllable counts are read.

    Returns:
        The predicted cloze percentage, 'text_too_short' if the text contains
        no words, or 'no_support' if the language code does not start with
        'eng_'.
    """
    if text.lang.startswith('eng_'):
        text = get_counts(main, text)

        if text.count_words:
            variant = main.settings_custom['measures']['readability']['colemans_readability_formula']['variant']
            count_words_1_syl = get_count_words_syls(text.syls_words, len_min = 1, len_max = 1)

            pct_words_1_syl = count_words_1_syl / text.count_words * 100
            pct_sentences = text.count_sentences / text.count_words * 100

            # Only variants 3 and 4 need part-of-speech information, so tagging is skipped otherwise
            if variant in ['3', '4']:
                pos_tags = wl_pos_tagging.wl_pos_tag(main, text.words_flat, lang = text.lang, tagset = 'universal')
                pct_prons = sum((1 for _, pos in pos_tags if pos == 'PRON')) / text.count_words * 100

                if variant == '4':
                    # Tokens tagged as ADP (adpositions) in the universal tagset are counted as prepositions
                    # The count is scaled per 100 words like all other predictors
                    pct_preps = sum((1 for _, pos in pos_tags if pos == 'ADP')) / text.count_words * 100

            if variant == '1':
                cloze_pct = (
                    1.29 * pct_words_1_syl
                    - 38.45
                )
            elif variant == '2':
                cloze_pct = (
                    1.16 * pct_words_1_syl
                    + 1.48 * pct_sentences
                    - 37.95
                )
            elif variant == '3':
                cloze_pct = (
                    1.07 * pct_words_1_syl
                    + 1.18 * pct_sentences
                    + 0.76 * pct_prons
                    - 34.02
                )
            elif variant == '4':
                cloze_pct = (
                    1.04 * pct_words_1_syl
                    + 1.06 * pct_sentences
                    + 0.56 * pct_prons
                    - 0.36 * pct_preps
                    - 26.01
                )
        else:
            cloze_pct = 'text_too_short'
    else:
        cloze_pct = 'no_support'

    return cloze_pct

# Dale-Chall Readability Score
# References:
# Dale, E., & Chall, J. S. (1948a). A formula for predicting readability. Educational Research Bulletin, 27(1), 11–20, 28.
# Dale, E., & Chall, J. S. (1948b). A formula for predicting readability: Instructions. Educational Research Bulletin, 27(2), 37–54.
def dale_chall_readability_score(main, text):
if text.lang.startswith('eng'):
if text.lang.startswith('eng_'):
text = get_counts(main, text)

if text.count_words and text.count_sentences:
Expand Down Expand Up @@ -374,10 +424,10 @@ def flesch_reading_ease_simplified(main, text):
text = get_counts(main, text)

if text.count_words and text.count_sentences:
count_words_monosyllabic = get_count_words_syls(text.syls_words, len_min = 1, len_max = 1)
count_words_1_syl = get_count_words_syls(text.syls_words, len_min = 1, len_max = 1)

flesch_re_simplified = (
1.599 * (count_words_monosyllabic / text.count_words * 100)
1.599 * (count_words_1_syl / text.count_words * 100)
- 1.015 * (text.count_words / text.count_sentences)
- 31.517
)
Expand All @@ -398,8 +448,8 @@ def forcast_grade_level(main, text):
sample_start = random.randint(0, text.count_words - 150)
sample = text.syls_words[sample_start : sample_start + 150]

count_words_monosyllabic = get_count_words_syls(sample, len_min = 1, len_max = 1)
rgl = 20.43 - 0.11 * count_words_monosyllabic
count_words_1_syl = get_count_words_syls(sample, len_min = 1, len_max = 1)
rgl = 20.43 - 0.11 * count_words_1_syl
else:
rgl = 'text_too_short'
else:
Expand Down Expand Up @@ -452,7 +502,7 @@ def formula_de_crawford(main, text):
# Lucisano, P., & Emanuela Piemontese, M. (1988). GULPEASE: A formula for the prediction of the difficulty of texts in Italian. Scuola e Città, 39(3), pp. 110–124.
# Indice Gulpease. (2021, July 9). In Wikipedia. https://it.wikipedia.org/w/index.php?title=Indice_Gulpease&oldid=121763335.
def gulpease_index(main, text):
if text.lang.startswith('ita'):
if text.lang == 'ita':
text = get_counts(main, text)

if text.count_words:
Expand All @@ -470,13 +520,13 @@ def gulpease_index(main, text):
# Polish variant:
# Pisarek, W. (1969). Jak mierzyć zrozumiałość tekstu?. Zeszyty Prasoznawcze, 4(42), 35–48.
def gunning_fog_index(main, text):
if text.lang.startswith('eng') or text.lang == 'pol' and text.lang in main.settings_global['syl_tokenizers']:
if text.lang.startswith('eng_') or text.lang == 'pol' and text.lang in main.settings_global['syl_tokenizers']:
text = get_counts(main, text)

if text.count_sentences and text.count_words:
count_hard_words = 0

if text.lang.startswith('eng'):
if text.lang.startswith('eng_'):
words_tagged = wl_pos_tagging.wl_pos_tag(main, text.words_flat, lang = text.lang, tagset = 'universal')

for syls, (word, tag) in zip(text.syls_words, words_tagged):
Expand Down Expand Up @@ -529,7 +579,7 @@ def legibility_mu(main, text):
# Lensear Write
# Reference: O’Hayre, J. (1966). Gobbledygook has gotta go. U.S. Government Printing Office. https://www.governmentattic.org/15docs/Gobbledygook_Has_Gotta_Go_1966.pdf
def lensear_write(main, text):
if text.lang.startswith('eng') and text.lang in main.settings_global['syl_tokenizers']:
if text.lang.startswith('eng_') and text.lang in main.settings_global['syl_tokenizers']:
text = get_counts(main, text)

if text.count_words > 0:
Expand Down Expand Up @@ -589,7 +639,7 @@ def lix(main, text):
# McAlpine EFLAW Readability Score
# Reference: Nirmaldasan. (2009, April 30). McAlpine EFLAW readability score. Readability Monitor. Retrieved November 15, 2022, from https://strainindex.wordpress.com/2009/04/30/mcalpine-eflaw-readability-score/
def mcalpine_eflaw(main, text):
if text.lang.startswith('eng'):
if text.lang.startswith('eng_'):
text = get_counts(main, text)

if text.count_sentences:
Expand Down Expand Up @@ -703,14 +753,14 @@ def smog_grade(main, text):
)

# Calculate the number of words with 3 or more syllables
count_words_polysyllabic = 0
count_words_3_plus_syls = 0

for sentence in samples:
syls_words = wl_syl_tokenization.wl_syl_tokenize(main, sentence, lang = text.lang)

count_words_polysyllabic += get_count_words_syls(syls_words, len_min = 3)
count_words_3_plus_syls += get_count_words_syls(syls_words, len_min = 3)

g = 3.1291 + 1.043 * (count_words_polysyllabic ** 0.5)
g = 3.1291 + 1.043 * (count_words_3_plus_syls ** 0.5)
else:
g = 'text_too_short'
else:
Expand All @@ -723,7 +773,7 @@ def smog_grade(main, text):
# Dale, E. (1931). A comparison of two word lists. Educational Research Bulletin, 10(18), 484–489.
# Spache, G. (1953). A new readability formula for primary-grade reading materials. Elementary School Journal, 53(7), 410–413. https://doi.org/10.1086/458513
def spache_grade_level(main, text):
if text.lang.startswith('eng'):
if text.lang.startswith('eng_'):
text = get_counts(main, text)

if text.count_words >= 100:
Expand Down Expand Up @@ -771,14 +821,12 @@ def spache_grade_level(main, text):
# References:
# Bamberger, R., & Vanecek, E. (1984). Lesen – Verstehen – Lernen – Schreiben. Jugend und Volk.
# Lesbarkeitsindex. (2022, July 21). In Wikipedia. https://de.wikipedia.org/w/index.php?title=Lesbarkeitsindex&oldid=224664667
def wiener_sachtextformel(main, text, variant = None):
if text.lang.startswith('deu') and text.lang in main.settings_global['syl_tokenizers']:
def wiener_sachtextformel(main, text):
if text.lang.startswith('deu_') and text.lang in main.settings_global['syl_tokenizers']:
text = get_counts(main, text)

if text.count_words and text.count_sentences:
if not variant:
variant = main.settings_custom['measures']['readability']['wstf']['variant']

variant = main.settings_custom['measures']['readability']['wstf']['variant']
ms = get_count_words_syls(text.syls_words, len_min = 3) / text.count_words
sl = text.count_words / text.count_sentences
iw = get_count_words_letters(text.words_flat, len_min = 7) / text.count_words
Expand Down
2 changes: 2 additions & 0 deletions wordless/wl_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,7 @@ def __init__(self, parent):
_tr('wl_profiler', "Bormuth's Cloze Mean"),
_tr('wl_profiler', "Bormuth's Grade Placement"),
_tr('wl_profiler', 'Coleman-Liau Index'),
_tr('wl_profiler', "Coleman's Readability Formula"),
_tr('wl_profiler', 'Dale-Chall Readability Score'),
_tr('wl_profiler', 'Devereaux Readability Index'),
_tr('wl_profiler', 'Flesch-Kincaid Grade Level'),
Expand Down Expand Up @@ -1178,6 +1179,7 @@ def run(self):
wl_measures_readability.bormuths_cloze_mean(self.main, text),
wl_measures_readability.bormuths_gp(self.main, text),
wl_measures_readability.coleman_liau_index(self.main, text),
wl_measures_readability.colemans_readability_formula(self.main, text),
wl_measures_readability.dale_chall_readability_score(self.main, text),
wl_measures_readability.devereux_readability_index(self.main, text),
wl_measures_readability.flesch_kincaid_grade_level(self.main, text),
Expand Down
Loading

0 comments on commit c5e200b

Please sign in to comment.