Skip to content

Commit

Permalink
fix small error in sentence tokenization
Browse files Browse the repository at this point in the history
  • Loading branch information
Kubus42 committed May 2, 2024
1 parent 32a3769 commit 999436f
Showing 1 changed file with 11 additions and 13 deletions.
24 changes: 11 additions & 13 deletions nlp/exercises/ex_tokenization.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -45,29 +45,28 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"from typing import List\n",
"\n",
"def tokenize_sentences_at_dot(paragraph: str) -> List[str]:\n",
" sentence_tokens = paragraph.split(\".\")\n",
" sentence_tokens = [s.strip() for s in sentence_tokens] # remove white space after .\n",
" \n",
" sentence_tokens = [s.strip() for s in sentence_tokens if s.strip() != \"\"] # strip surrounding whitespace and drop empty tokens\n",
" return sentence_tokens"
]
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The paragraph contains 3 sentences.\n"
"The paragraph contains 2 sentences.\n"
]
}
],
Expand All @@ -78,29 +77,29 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"import re \n",
"\n",
"def tokenize_sentences_at_punctuation(paragraph: str) -> List[str]:\n",
" sentence_tokens = re.split(r'[.:;!?]\\s*', paragraph)\n",
" sentence_tokens = [s.strip() for s in sentence_tokens] # remove white space after .\n",
" sentence_tokens = [s.strip() for s in sentence_tokens if s.strip() != \"\"] # strip surrounding whitespace and drop empty tokens\n",
" \n",
" return sentence_tokens"
]
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The paragraph contains 8 sentences.\n"
"The paragraph contains 7 sentences.\n"
]
}
],
Expand All @@ -111,7 +110,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 22,
"metadata": {},
"outputs": [
{
Expand All @@ -124,8 +123,7 @@
"Oh, the wonders it conceals\n",
"ancient ruins and extraterrestrial life forms, waiting to be discovered\n",
"As the spacecraft descended through the atmosphere, anticipation filled the hearts of the crew\n",
"Little did they know, their journey was about to unveil secrets beyond their wildest imagination\n",
"\n"
"Little did they know, their journey was about to unveil secrets beyond their wildest imagination\n"
]
}
],
Expand Down

0 comments on commit 999436f

Please sign in to comment.