-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
kiasari
committed
May 25, 2019
1 parent
4a6e7a0
commit 1602b5c
Showing
14 changed files
with
72 additions
and
177 deletions.
There are no files selected for viewing
File renamed without changes.
This file was deleted.
Oops, something went wrong.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
from _cleaning_options.cleaning_options import _is_title_or_etc, _is_books_copy, \ | ||
_is_email_init, _is_footnote, _is_image, _is_table | ||
from _cleaning_options.strip_headers import _strip_headers | ||
|
||
|
||
def simple_cleaner(book: str) -> str:
    """
    Strip the Project Gutenberg header and footer from a book.

    This is the light-weight cleaner: it doesn't look inside the text for
    titles, footnotes or similar content. All work is delegated to
    ``_strip_headers``.

    :param book: str holding the full text of a Gutenberg book.
    :return: str of the book as returned by ``_strip_headers``, i.e. without
        the lines that are part of the Project Gutenberg header or footer.
    :rtype: str
    """
    text_without_boilerplate = _strip_headers(book)
    return text_without_boilerplate
|
||
|
||
def super_cleaner(book: str, min_token: int = 5, max_token: int = 600) -> str:
    """
    Aggressively clean a Gutenberg book, paragraph by paragraph.

    The header/footer is stripped first, then the text is cut into paragraphs
    on blank lines ("\\n\\n"). Every paragraph flagged by one of the filters
    (image, footnote, e-mail, book copy information, table, title-or-similar)
    is replaced by the marker "[deleted]". Some good paragraphs may be
    removed as well.

    IMPORTANT: if you don't want the text to be tokenized, pass
    ``min_token=-1``.

    :param book: str of a Gutenberg book.
    :param min_token: minimum number of tokens of a paragraph that is not
        "dialog" or "quote"; -1 means don't tokenize the text (faster).
        Forwarded unchanged to ``_is_title_or_etc``.
    :param max_token: maximum number of tokens of a paragraph. Forwarded
        unchanged to ``_is_title_or_etc``.
    :return: str of the book in which removed paragraphs appear as
        "[deleted]"; split it on "\\n\\n" to get the paragraphs back.
    :rtype: str
    """

    def _is_unwanted(paragraph):
        # The filters run in this fixed order and stop at the first match.
        return (
            _is_image(paragraph)
            or _is_footnote(paragraph)
            or _is_email_init(paragraph)
            or _is_books_copy(paragraph)
            or _is_table(paragraph)
            or _is_title_or_etc(paragraph, min_token, max_token)
        )

    body = _strip_headers(book)
    # Keep one output entry per input paragraph so positions stay aligned.
    marked_paragraphs = [
        "[deleted]" if _is_unwanted(chunk) else chunk
        for chunk in body.split("\n\n")
    ]
    return "\n\n".join(marked_paragraphs)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,8 @@ | ||
Metadata-Version: 1.1 | ||
Name: gutenberg-cleaner | ||
Version: 0.0.1 | ||
Version: 0.1.3 | ||
Summary: cleans gutenberg dataset books | ||
Home-page: UNKNOWN | ||
Home-page: https://github.com/kiasar/gutenberg_cleaner | ||
Author: UNKNOWN | ||
Author-email: [email protected] | ||
License: MIT | ||
|
@@ -51,7 +51,7 @@ Description: ![](https://i.ibb.co/sCJXhmz/header-sp.png) | |
This project is licensed under the MIT License - see the [LICENSE.md](LICENSE.md) file for details | ||
|
||
Platform: UNKNOWN | ||
Classifier: Programming language :: Python :: 3 | ||
Classifier: Programming language :: Python :: 3.6 | ||
Classifier: Programming language :: Python :: 3.7 | ||
Classifier: Operation System :: OS Independent | ||
Classifier: Programming Language :: Python | ||
Classifier: Programming Language :: Python :: 3.6 | ||
Classifier: Programming Language :: Python :: 3.7 | ||
Classifier: Operating System :: OS Independent |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,2 @@ | ||
gutenbergـcleaner | ||
_cleaning_options | ||
gutenberg_cleaner |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,42 +1 @@ | ||
from gutenberg_cleaning_options.cleaning_options import is_title_or_etc, is_books_copy, is_email_init, is_footnote, \ | ||
is_image, is_table | ||
from gutenberg_cleaning_options.strip_headers import strip_headers | ||
|
||
|
||
def simple_cleaner(book: str) -> str:
    """
    Remove only the Project Gutenberg header and footer lines.

    Nothing deeper in the text (titles, footnotes, etc.) is touched; the
    call is handed straight to ``strip_headers``.

    :param book: str of a Gutenberg book.
    :return: str of the book without the lines that are part of the
        Project Gutenberg header or footer, as produced by ``strip_headers``.
    :rtype: str
    """
    headerless = strip_headers(book)
    return headerless
|
||
|
||
def super_cleaner(book: str, min_token: int = 5, max_token: int = 600) -> str:
    """
    Super clean the book (titles, footnotes, images, book information, etc.).

    Works on paragraphs separated by "\\n\\n" after the Gutenberg header and
    footer have been stripped. A paragraph matched by any filter is swapped
    for the text "[deleted]"; this may delete some good paragraphs too.

    IMPORTANT: if you don't want the text to be tokenized, use
    ``min_token=-1``.

    :param book: str of a Gutenberg book.
    :param min_token: the minimum tokens of a paragraph that is not "dialog"
        or "quote"; -1 means don't tokenize the text (so it will be faster).
        Passed through to ``is_title_or_etc``.
    :param max_token: the maximum tokens of a paragraph. Passed through to
        ``is_title_or_etc``.
    :return: str of the book where deleted paragraphs are shown as
        "[deleted]"; it can be split into paragraphs on "\\n\\n".
    :rtype: str
    """
    separator = "\n\n"
    output_paragraphs = []
    for block in strip_headers(book).split(separator):
        # Filters are tried in a fixed order; evaluation stops at the first hit.
        rejected = (
            is_image(block)
            or is_footnote(block)
            or is_email_init(block)
            or is_books_copy(block)
            or is_table(block)
            or is_title_or_etc(block, min_token, max_token)
        )
        output_paragraphs.append("[deleted]" if rejected else block)
    return separator.join(output_paragraphs)
from _cleaning_options.cleaner import super_cleaner, simple_cleaner |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,16 +6,18 @@ | |
# Distribution metadata for the gutenberg_cleaner package.
# Each keyword is passed exactly once: repeating a keyword argument in a call
# is a SyntaxError, so the stale `version` / `py_modules` / classifier entries
# are not kept alongside the current ones.
setup(
    name="gutenberg_cleaner",
    install_requires=['nltk'],
    version='0.1.3',
    description="cleans gutenberg dataset books",
    author_email='[email protected]',
    # `packages` ships the `_cleaning_options` package; `py_modules` ships the
    # single-file top-level module. The module name must use a plain ASCII
    # underscore so it matches an importable file name.
    packages=["_cleaning_options"],
    py_modules=["gutenberg_cleaner"],
    url="https://github.com/kiasar/gutenberg_cleaner",
    license='MIT',
    long_description=long_description,
    # Trove classifiers have to match the official strings exactly
    # ("Programming Language", "Operating System").
    classifiers=[
        "Programming Language :: Python",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Operating System :: OS Independent",
    ],
)