Skip to content

Commit

Permalink
new
Browse files Browse the repository at this point in the history
  • Loading branch information
kiasari committed May 25, 2019
1 parent 4a6e7a0 commit 1602b5c
Show file tree
Hide file tree
Showing 14 changed files with 72 additions and 177 deletions.
File renamed without changes.
13 changes: 0 additions & 13 deletions Pipfile

This file was deleted.

97 changes: 0 additions & 97 deletions Pipfile.lock

This file was deleted.

File renamed without changes.
42 changes: 42 additions & 0 deletions _cleaning_options/cleaner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from _cleaning_options.cleaning_options import _is_title_or_etc, _is_books_copy, \
_is_email_init, _is_footnote, _is_image, _is_table
from _cleaning_options.strip_headers import _strip_headers


def simple_cleaner(book: str) -> str:
    """
    Remove only the Project Gutenberg header/footer lines from *book*.

    Unlike ``super_cleaner`` this does not look inside the text itself, so
    titles, footnotes, images, etc. are left untouched.
    :rtype: str
    :param book: str of a gutenberg's book
    :return: the book text with the Gutenberg boilerplate stripped.
    """
    stripped = _strip_headers(book)
    return stripped


def super_cleaner(book: str, min_token: int = 5, max_token: int = 600) -> str:
    """
    Super-clean the book: drop titles, footnotes, images, book information, etc.
    It may delete some good paragraphs too.
    ^_^ Do you have a comment to make it better? Email to [email protected] ^_^.
    IMPORTANT: if you don't want the text to be tokenized, pass min_token = -1
    (this is faster).
    :rtype: str
    :param book: str of a gutenberg's book.
    :param min_token: The minimum tokens of a paragraph that is not "dialog" or "quote",
    -1 means don't tokenize the text (so it will be faster).
    :param max_token: The maximum tokens of a paragraph.
    :return: str of the book where removed paragraphs are replaced with "[deleted]",
    so callers can still split the book into paragraphs by "\n\n".
    """
    def _is_unwanted(par: str) -> bool:
        # A paragraph is dropped when any detector flags it: image mentions,
        # footnotes, e-mail lines, copy counts, tables, or title/metadata text.
        return (_is_image(par) or _is_footnote(par) or _is_email_init(par)
                or _is_books_copy(par) or _is_table(par)
                or _is_title_or_etc(par, min_token, max_token))

    headless_book = _strip_headers(book)  # remove the Gutenberg header/footer first
    paragraphs = headless_book.split("\n\n")  # split the book into paragraphs
    # Replace unwanted paragraphs with a "[deleted]" marker instead of removing
    # them, so paragraph positions are preserved for downstream consumers.
    return "\n\n".join("[deleted]" if _is_unwanted(par) else par
                       for par in paragraphs)
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
image_formats_regex = re.compile("\.png|\.jpg|\.jpeg|\.gif|picture:") # Regex to find images.


def is_title_or_etc(text: str, min_token: int = 5, max_token: int = 600) -> bool:
def _is_title_or_etc(text: str, min_token: int = 5, max_token: int = 600) -> bool:
"""
determining if a paragraph is title or information of the book.
IMPORTANT: if you don't want the text to be tokenize, just put min_token = -1.
Expand Down Expand Up @@ -40,7 +40,7 @@ def is_title_or_etc(text: str, min_token: int = 5, max_token: int = 600) -> bool
return False


def is_table(text: str) -> bool:
def _is_table(text: str) -> bool:
"""
determining if a paragraph is a table or catalog.
:rtype: bool
Expand All @@ -56,7 +56,7 @@ def is_table(text: str) -> bool:
return True # mostly catalogs and etc.


def is_image(text: str) -> bool:
def _is_image(text: str) -> bool:
"""
determining if a paragraph is for mentioning an image.
:param text: Raw paragraph.
Expand All @@ -65,7 +65,7 @@ def is_image(text: str) -> bool:
return bool(re.search(image_formats_regex, text.lower()))


def is_footnote(text: str) -> bool:
def _is_footnote(text: str) -> bool:
"""
determining if a paragraph is the footnote of the book.
:rtype: bool
Expand All @@ -79,11 +79,7 @@ def is_footnote(text: str) -> bool:
return bool(re.search(footnote_notation_regex, txt)) # if a line starts with {...} it might be a footnote.


print(is_footnote(""" [0] The country-seat of Bishop Shipley, the good bishop,
as Dr. Franklin used to style him.--B."""))


def is_books_copy(text: str) -> bool:
def _is_books_copy(text: str) -> bool:
"""
determining if a paragraph indicates the number of copies of this book.
:rtype: bool
Expand All @@ -95,7 +91,7 @@ def is_books_copy(text: str) -> bool:
return False


def is_email_init(text: str) -> bool:
def _is_email_init(text: str) -> bool:
"""
determining if a paragraph includes an Email.
:rtype: bool
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@
LEGALESE_END_MARKERS = frozenset(("SERVICE THAT CHARGES FOR DOWNLOAD",))


def strip_headers(text):
def _strip_headers(text):
"""Remove lines that are part of the Project Gutenberg header or footer.
Note: The original version of the code can be found at:
https://github.com/c-w/gutenberg/blob/master/gutenberg/cleanup/strip_headers.py
Expand Down
Binary file removed dist/gutenberg_cleaner-0.0.1-py3-none-any.whl
Binary file not shown.
Binary file removed dist/gutenberg_cleaner-0.0.1.tar.gz
Binary file not shown.
12 changes: 6 additions & 6 deletions gutenberg_cleaner.egg-info/PKG-INFO
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Metadata-Version: 1.1
Name: gutenberg-cleaner
Version: 0.0.1
Version: 0.1.3
Summary: cleans gutenberg dataset books
Home-page: UNKNOWN
Home-page: https://github.com/kiasar/gutenberg_cleaner
Author: UNKNOWN
Author-email: [email protected]
License: MIT
Expand Down Expand Up @@ -51,7 +51,7 @@ Description: ![](https://i.ibb.co/sCJXhmz/header-sp.png)
This project is licensed under the MIT License - see the [LICENSE.md](LICENSE.md) file for details

Platform: UNKNOWN
Classifier: Programming language :: Python :: 3
Classifier: Programming language :: Python :: 3.6
Classifier: Programming language :: Python :: 3.7
Classifier: Operation System :: OS Independent
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: 3.7
Classifier: Operating System :: OS Independent
5 changes: 5 additions & 0 deletions gutenberg_cleaner.egg-info/SOURCES.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
README.md
gutenberg_cleaner.py
setup.py
_cleaning_options/__init__.py
_cleaning_options/cleaner.py
_cleaning_options/cleaning_options.py
_cleaning_options/strip_headers.py
gutenberg_cleaner.egg-info/PKG-INFO
gutenberg_cleaner.egg-info/SOURCES.txt
gutenberg_cleaner.egg-info/dependency_links.txt
Expand Down
3 changes: 2 additions & 1 deletion gutenberg_cleaner.egg-info/top_level.txt
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
gutenbergـcleaner
_cleaning_options
gutenberg_cleaner
43 changes: 1 addition & 42 deletions gutenberg_cleaner.py
Original file line number Diff line number Diff line change
@@ -1,42 +1 @@
from gutenberg_cleaning_options.cleaning_options import is_title_or_etc, is_books_copy, is_email_init, is_footnote, \
is_image, is_table
from gutenberg_cleaning_options.strip_headers import strip_headers


def simple_cleaner(book: str) -> str:
    """
    Remove only the Project Gutenberg header/footer lines from *book*.

    Unlike ``super_cleaner`` this does not look inside the text itself, so
    titles, footnotes, images, etc. are left untouched.
    :rtype: str
    :param book: str of a gutenberg's book
    :return: the book text with the Gutenberg boilerplate stripped.
    """
    stripped = strip_headers(book)
    return stripped


def super_cleaner(book: str, min_token: int = 5, max_token: int = 600) -> str:
    """
    Super-clean the book: drop titles, footnotes, images, book information, etc.
    It may delete some good paragraphs too.
    ^_^ Do you have a comment to make it better? Email to [email protected] ^_^.
    IMPORTANT: if you don't want the text to be tokenize, just put min_token = -1.
    :rtype: str
    :param book: str of a gutenberg's book.
    :param min_token: The minimum tokens of a paragraph that is not "dialog" or "quote",
    -1 means don't tokenize the txt (so it will be faster).
    :param max_token: The maximum tokens of a paragraph.
    :return: str of the book where removed paragraphs are replaced with "[deleted]",
    so you can split the book to paragraphs by "\n\n".
    """
    def unwanted(par: str) -> bool:
        # True when any detector flags the paragraph as non-body text.
        return (is_image(par) or is_footnote(par) or is_email_init(par)
                or is_books_copy(par) or is_table(par)
                or is_title_or_etc(par, min_token, max_token))

    headless_book = strip_headers(book)  # drop the Gutenberg header/footer
    # Keep a "[deleted]" placeholder for bad paragraphs so positions survive.
    cleaned_paragraphs = ["[deleted]" if unwanted(par) else par
                          for par in headless_book.split("\n\n")]
    return "\n\n".join(cleaned_paragraphs)
from _cleaning_options.cleaner import super_cleaner, simple_cleaner
16 changes: 9 additions & 7 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,18 @@
setup(
name="gutenberg_cleaner",
install_requires=['nltk'],
version='0.0.1',
version='0.1.3',
description="cleans gutenberg dataset books",
author_email='[email protected]',
py_modules=["gutenbergـcleaner"],
packages=["_cleaning_options"],
py_modules=["gutenberg_cleaner"],
url="https://github.com/kiasar/gutenberg_cleaner",
license='MIT',
long_description=long_description,
classifiers=[
"Programming language :: Python :: 3",
"Programming language :: Python :: 3.6",
"Programming language :: Python :: 3.7",
"Operation System :: OS Independent"
]
"Programming Language :: Python",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Operating System :: OS Independent",
],
)

0 comments on commit 1602b5c

Please sign in to comment.