Skip to content

Commit

Permalink
fix documents with a lot of paragraphs being removed by the repetition filter
Browse files Browse the repository at this point in the history
  • Loading branch information
guipenedo committed Apr 26, 2024
1 parent 4e9235f commit a8d21e2
Showing 1 changed file with 2 additions and 1 deletion.
3 changes: 2 additions & 1 deletion src/datatrove/pipeline/filters/gopher_repetition_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ def __init__(
self.top_n_grams = top_n_grams
self.dup_n_grams = dup_n_grams
self.paragraph_exp = re.compile(r"\n{2,}")
+ self._line_splitter = re.compile("\n+")

def filter(self, doc: Document) -> bool | tuple[bool, str]:
from nltk.tokenize import word_tokenize
Expand All @@ -115,7 +116,7 @@ def filter(self, doc: Document) -> bool | tuple[bool, str]:
if self.dup_para_char_frac and char_duplicates / len(text) > self.dup_para_char_frac:
return False, "dup_para_char_frac"

- lines = text.splitlines()
+ lines = self._line_splitter.split(text)
line_duplicates, char_duplicates = find_duplicates(lines)
if self.dup_line_frac and line_duplicates / len(lines) > self.dup_line_frac:
return False, "dup_line_frac"
Expand Down

0 comments on commit a8d21e2

Please sign in to comment.