Skip to content

Commit

Permalink
new
Browse files Browse the repository at this point in the history
  • Loading branch information
kiasari committed May 25, 2019
1 parent 4a6e7a0 commit 1602b5c
Show file tree
Hide file tree
Showing 14 changed files with 72 additions and 177 deletions.
File renamed without changes.
13 changes: 0 additions & 13 deletions Pipfile

This file was deleted.

97 changes: 0 additions & 97 deletions Pipfile.lock

This file was deleted.

File renamed without changes.
42 changes: 42 additions & 0 deletions _cleaning_options/cleaner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from _cleaning_options.cleaning_options import _is_title_or_etc, _is_books_copy, \
_is_email_init, _is_footnote, _is_image, _is_table
from _cleaning_options.strip_headers import _strip_headers


def simple_cleaner(book: str) -> str:
    """
    Remove only the Project Gutenberg header/footer lines from *book*.

    Unlike ``super_cleaner`` this does not look inside the text itself, so
    titles, footnotes, images, etc. are left untouched.
    :rtype: str
    :param book: str of a gutenberg's book
    :return: the book text with the Gutenberg boilerplate stripped.
    """
    stripped = _strip_headers(book)
    return stripped


def super_cleaner(book: str, min_token: int = 5, max_token: int = 600) -> str:
    """
    Super-clean the book: drop titles, footnotes, images, book information, etc.
    It may delete some good paragraphs too.
    ^_^ Do you have a comment to make it better? Email to [email protected] ^_^.
    IMPORTANT: if you don't want the text to be tokenized, pass min_token = -1
    (this is faster).
    :rtype: str
    :param book: str of a gutenberg's book.
    :param min_token: The minimum tokens of a paragraph that is not "dialog" or "quote",
    -1 means don't tokenize the text (so it will be faster).
    :param max_token: The maximum tokens of a paragraph.
    :return: str of the book where removed paragraphs are replaced with "[deleted]",
    so callers can still split the book into paragraphs by "\n\n".
    """
    def _is_unwanted(par: str) -> bool:
        # A paragraph is dropped when any detector flags it: image mentions,
        # footnotes, e-mail lines, copy counts, tables, or title/metadata text.
        return (_is_image(par) or _is_footnote(par) or _is_email_init(par)
                or _is_books_copy(par) or _is_table(par)
                or _is_title_or_etc(par, min_token, max_token))

    headless_book = _strip_headers(book)  # remove the Gutenberg header/footer first
    paragraphs = headless_book.split("\n\n")  # split the book into paragraphs
    # Replace unwanted paragraphs with a "[deleted]" marker instead of removing
    # them, so paragraph positions are preserved for downstream consumers.
    return "\n\n".join("[deleted]" if _is_unwanted(par) else par
                       for par in paragraphs)
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
image_formats_regex = re.compile("\.png|\.jpg|\.jpeg|\.gif|picture:") # Regex to find images.


def is_title_or_etc(text: str, min_token: int = 5, max_token: int = 600) -> bool:
def _is_title_or_etc(text: str, min_token: int = 5, max_token: int = 600) -> bool:
"""
determining if a paragraph is title or information of the book.
IMPORTANT: if you don't want the text to be tokenize, just put min_token = -1.
Expand Down Expand Up @@ -40,7 +40,7 @@ def is_title_or_etc(text: str, min_token: int = 5, max_token: int = 600) -> bool
return False


def is_table(text: str) -> bool:
def _is_table(text: str) -> bool:
"""
determining if a paragraph is a table or catalog.
:rtype: bool
Expand All @@ -56,7 +56,7 @@ def is_table(text: str) -> bool:
return True # mostly catalogs and etc.


def is_image(text: str) -> bool:
def _is_image(text: str) -> bool:
"""
determining if a paragraph is for mentioning an image.
:param text: Raw paragraph.
Expand All @@ -65,7 +65,7 @@ def is_image(text: str) -> bool:
return bool(re.search(image_formats_regex, text.lower()))


def is_footnote(text: str) -> bool:
def _is_footnote(text: str) -> bool:
"""
determining if a paragraph is the footnote of the book.
:rtype: bool
Expand All @@ -79,11 +79,7 @@ def is_footnote(text: str) -> bool:
return bool(re.search(footnote_notation_regex, txt)) # if a line starts with {...} it might be a footnote.


print(is_footnote(""" [0] The country-seat of Bishop Shipley, the good bishop,
as Dr. Franklin used to style him.--B."""))


def is_books_copy(text: str) -> bool:
def _is_books_copy(text: str) -> bool:
"""
determining if a paragraph indicates the number of copies of this book.
:rtype: bool
Expand All @@ -95,7 +91,7 @@ def is_books_copy(text: str) -> bool:
return False


def is_email_init(text: str) -> bool:
def _is_email_init(text: str) -> bool:
"""
determining if a paragraph includes an Email.
:rtype: bool
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@
LEGALESE_END_MARKERS = frozenset(("SERVICE THAT CHARGES FOR DOWNLOAD",))


def strip_headers(text):
def _strip_headers(text):
"""Remove lines that are part of the Project Gutenberg header or footer.
Note: The original version of the code can be found at:
https://github.com/c-w/gutenberg/blob/master/gutenberg/cleanup/strip_headers.py
Expand Down
Binary file removed dist/gutenberg_cleaner-0.0.1-py3-none-any.whl
Binary file not shown.
Binary file removed dist/gutenberg_cleaner-0.0.1.tar.gz
Binary file not shown.
12 changes: 6 additions & 6 deletions gutenberg_cleaner.egg-info/PKG-INFO
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Metadata-Version: 1.1
Name: gutenberg-cleaner
Version: 0.0.1
Version: 0.1.3
Summary: cleans gutenberg dataset books
Home-page: UNKNOWN
Home-page: https://github.com/kiasar/gutenberg_cleaner
Author: UNKNOWN
Author-email: [email protected]
License: MIT
Expand Down Expand Up @@ -51,7 +51,7 @@ Description: ![](https://i.ibb.co/sCJXhmz/header-sp.png)
This project is licensed under the MIT License - see the [LICENSE.md](LICENSE.md) file for details

Platform: UNKNOWN
Classifier: Programming language :: Python :: 3
Classifier: Programming language :: Python :: 3.6
Classifier: Programming language :: Python :: 3.7
Classifier: Operation System :: OS Independent
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: 3.7
Classifier: Operating System :: OS Independent
5 changes: 5 additions & 0 deletions gutenberg_cleaner.egg-info/SOURCES.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
README.md
gutenberg_cleaner.py
setup.py
_cleaning_options/__init__.py
_cleaning_options/cleaner.py
_cleaning_options/cleaning_options.py
_cleaning_options/strip_headers.py
gutenberg_cleaner.egg-info/PKG-INFO
gutenberg_cleaner.egg-info/SOURCES.txt
gutenberg_cleaner.egg-info/dependency_links.txt
Expand Down
3 changes: 2 additions & 1 deletion gutenberg_cleaner.egg-info/top_level.txt
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
gutenbergـcleaner
_cleaning_options
gutenberg_cleaner
43 changes: 1 addition & 42 deletions gutenberg_cleaner.py
Original file line number Diff line number Diff line change
@@ -1,42 +1 @@
from gutenberg_cleaning_options.cleaning_options import is_title_or_etc, is_books_copy, is_email_init, is_footnote, \
is_image, is_table
from gutenberg_cleaning_options.strip_headers import strip_headers


def simple_cleaner(book: str) -> str:
    """
    Remove only the Project Gutenberg header/footer lines from *book*.

    Unlike ``super_cleaner`` this does not look inside the text itself, so
    titles, footnotes, images, etc. are left untouched.
    :rtype: str
    :param book: str of a gutenberg's book
    :return: the book text with the Gutenberg boilerplate stripped.
    """
    stripped = strip_headers(book)
    return stripped


def super_cleaner(book: str, min_token: int = 5, max_token: int = 600) -> str:
    """
    Super-clean the book: drop titles, footnotes, images, book information, etc.
    It may delete some good paragraphs too.
    ^_^ Do you have a comment to make it better? Email to [email protected] ^_^.
    IMPORTANT: if you don't want the text to be tokenize, just put min_token = -1.
    :rtype: str
    :param book: str of a gutenberg's book.
    :param min_token: The minimum tokens of a paragraph that is not "dialog" or "quote",
    -1 means don't tokenize the txt (so it will be faster).
    :param max_token: The maximum tokens of a paragraph.
    :return: str of the book where removed paragraphs are replaced with "[deleted]",
    so you can split the book to paragraphs by "\n\n".
    """
    def unwanted(par: str) -> bool:
        # True when any detector flags the paragraph as non-body text.
        return (is_image(par) or is_footnote(par) or is_email_init(par)
                or is_books_copy(par) or is_table(par)
                or is_title_or_etc(par, min_token, max_token))

    headless_book = strip_headers(book)  # drop the Gutenberg header/footer
    # Keep a "[deleted]" placeholder for bad paragraphs so positions survive.
    cleaned_paragraphs = ["[deleted]" if unwanted(par) else par
                          for par in headless_book.split("\n\n")]
    return "\n\n".join(cleaned_paragraphs)
from _cleaning_options.cleaner import super_cleaner, simple_cleaner
16 changes: 9 additions & 7 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,18 @@
setup(
name="gutenberg_cleaner",
install_requires=['nltk'],
version='0.0.1',
version='0.1.3',
description="cleans gutenberg dataset books",
author_email='[email protected]',
py_modules=["gutenbergـcleaner"],
packages=["_cleaning_options"],
py_modules=["gutenberg_cleaner"],
url="https://github.com/kiasar/gutenberg_cleaner",
license='MIT',
long_description=long_description,
classifiers=[
"Programming language :: Python :: 3",
"Programming language :: Python :: 3.6",
"Programming language :: Python :: 3.7",
"Operation System :: OS Independent"
]
"Programming Language :: Python",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Operating System :: OS Independent",
],
)

0 comments on commit 1602b5c

Please sign in to comment.