diff --git a/.idea/.name b/.idea/.name
new file mode 100644
index 0000000..9761cd8
--- /dev/null
+++ b/.idea/.name
@@ -0,0 +1 @@
+Gutenberg_cleaner
\ No newline at end of file
diff --git a/.idea/Gutenberg_cleaner.iml b/.idea/Gutenberg_cleaner.iml
new file mode 100644
index 0000000..5138a24
--- /dev/null
+++ b/.idea/Gutenberg_cleaner.iml
@@ -0,0 +1,14 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/dictionaries/peyman.xml b/.idea/dictionaries/peyman.xml
new file mode 100644
index 0000000..5c50b93
--- /dev/null
+++ b/.idea/dictionaries/peyman.xml
@@ -0,0 +1,7 @@
+
+
+
+ dataset
+
+
+
\ No newline at end of file
diff --git a/.idea/libraries/R_User_Library.xml b/.idea/libraries/R_User_Library.xml
new file mode 100644
index 0000000..71f5ff7
--- /dev/null
+++ b/.idea/libraries/R_User_Library.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..3999087
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..535c83f
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,9 @@
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/LICENSE.md b/LICENSE.md
new file mode 100644
index 0000000..058a5a4
--- /dev/null
+++ b/LICENSE.md
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2019 Peyman Mohseni Kiasari
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/Pipfile b/Pipfile
new file mode 100644
index 0000000..08bebb3
--- /dev/null
+++ b/Pipfile
@@ -0,0 +1,13 @@
+[[source]]
+name = "pypi"
+url = "https://pypi.org/simple"
+verify_ssl = true
+
+[dev-packages]
+pytest = ">=3.6"
+
+[packages]
+gutenberg-cleaner = {editable = true,path = "."}
+
+[requires]
+python_version = "3.6"
diff --git a/Pipfile.lock b/Pipfile.lock
new file mode 100644
index 0000000..872fbe3
--- /dev/null
+++ b/Pipfile.lock
@@ -0,0 +1,97 @@
+{
+ "_meta": {
+ "hash": {
+ "sha256": "a94cfd56101fcd45bbc7bc59fdc39f4fbc900808a4e792ef202fa7839f0b6413"
+ },
+ "pipfile-spec": 6,
+ "requires": {
+ "python_version": "3.6"
+ },
+ "sources": [
+ {
+ "name": "pypi",
+ "url": "https://pypi.org/simple",
+ "verify_ssl": true
+ }
+ ]
+ },
+ "default": {
+ "gutenberg-cleaner": {
+ "editable": true,
+ "path": "."
+ },
+ "nltk": {
+ "hashes": [
+ "sha256:3a64b1cb685bbf344adec416871fee07996671c876ff313b3e504158fa1500e1"
+ ],
+ "version": "==3.4.1"
+ },
+ "six": {
+ "hashes": [
+ "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
+ "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
+ ],
+ "version": "==1.12.0"
+ }
+ },
+ "develop": {
+ "atomicwrites": {
+ "hashes": [
+ "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4",
+ "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6"
+ ],
+ "version": "==1.3.0"
+ },
+ "attrs": {
+ "hashes": [
+ "sha256:69c0dbf2ed392de1cb5ec704444b08a5ef81680a61cb899dc08127123af36a79",
+ "sha256:f0b870f674851ecbfbbbd364d6b5cbdff9dcedbc7f3f5e18a6891057f21fe399"
+ ],
+ "version": "==19.1.0"
+ },
+ "more-itertools": {
+ "hashes": [
+ "sha256:2112d2ca570bb7c3e53ea1a35cd5df42bb0fd10c45f0fb97178679c3c03d64c7",
+ "sha256:c3e4748ba1aad8dba30a4886b0b1a2004f9a863837b8654e7059eebf727afa5a"
+ ],
+ "markers": "python_version > '2.7'",
+ "version": "==7.0.0"
+ },
+ "pluggy": {
+ "hashes": [
+ "sha256:25a1bc1d148c9a640211872b4ff859878d422bccb59c9965e04eed468a0aa180",
+ "sha256:964cedd2b27c492fbf0b7f58b3284a09cf7f99b0f715941fb24a439b3af1bd1a"
+ ],
+ "version": "==0.11.0"
+ },
+ "py": {
+ "hashes": [
+ "sha256:64f65755aee5b381cea27766a3a147c3f15b9b6b9ac88676de66ba2ae36793fa",
+ "sha256:dc639b046a6e2cff5bbe40194ad65936d6ba360b52b3c3fe1d08a82dd50b5e53"
+ ],
+ "version": "==1.8.0"
+ },
+ "pytest": {
+ "hashes": [
+ "sha256:1a8aa4fa958f8f451ac5441f3ac130d9fc86ea38780dd2715e6d5c5882700b24",
+ "sha256:b8bf138592384bd4e87338cb0f256bf5f615398a649d4bd83915f0e4047a5ca6"
+ ],
+ "index": "pypi",
+ "version": "==4.5.0"
+ },
+ "six": {
+ "hashes": [
+ "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
+ "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
+ ],
+ "version": "==1.12.0"
+ },
+ "wcwidth": {
+ "hashes": [
+ "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e",
+ "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c"
+ ],
+ "version": "==0.1.7"
+ }
+ }
+}
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..3f0d21f
--- /dev/null
+++ b/README.md
@@ -0,0 +1,43 @@
+![](https://i.ibb.co/sCJXhmz/header-sp.png)
+![](https://img.shields.io/apm/l/vim-mode.svg)
+
+
+# gutenberg-cleaner
+
+A Python package for cleaning Project Gutenberg books and datasets.
+
+### Prerequisites
+nltk package
+
+### Installing
+```
+[sudo] pip install gutenberg-cleaner
+```
+
+## How to use it?
+
+It has two functions, called "simple_cleaner" and "super_cleaner".
+### simple_cleaner:
+Just removes lines that are part of the Project Gutenberg header or footer.
+It doesn't go deeper into the text to remove other things such as titles or footnotes.
+```
+simple_cleaner(book: str) -> str
+```
+### super_cleaner:
+Super-cleans the book (removes titles, footnotes, images, book information, etc.). It may delete some good lines too.
+```
+super_cleaner(book: str, min_token: int = 5, max_token: int = 600) -> str
+```
+min_token: The minimum number of tokens of a paragraph that is not "dialog" or "quote"; -1 means don't tokenize the text (so it will be faster, but the cleaning will be less thorough).
+max_token: The maximum tokens of a paragraph.
+
+It marks deleted paragraphs with: [deleted]
+
+
+## Author
+
+* **Peyman Mohseni kiasari**
+
+## License
+
+This project is licensed under the MIT License - see the [LICENSE.md](LICENSE.md) file for details
diff --git a/dist/gutenberg_cleaner-0.0.1-py3-none-any.whl b/dist/gutenberg_cleaner-0.0.1-py3-none-any.whl
new file mode 100644
index 0000000..bb9ca61
Binary files /dev/null and b/dist/gutenberg_cleaner-0.0.1-py3-none-any.whl differ
diff --git a/dist/gutenberg_cleaner-0.0.1.tar.gz b/dist/gutenberg_cleaner-0.0.1.tar.gz
new file mode 100644
index 0000000..450881c
Binary files /dev/null and b/dist/gutenberg_cleaner-0.0.1.tar.gz differ
diff --git a/gitignore.gitignore b/gitignore.gitignore
new file mode 100644
index 0000000..b9af354
--- /dev/null
+++ b/gitignore.gitignore
@@ -0,0 +1,232 @@
+
+# Created by https://www.gitignore.io/api/linux,python,pycharm+all
+# Edit at https://www.gitignore.io/?templates=linux,python,pycharm+all
+
+### Linux ###
+*~
+
+# temporary files which can be created if a process still has a handle open of a deleted file
+.fuse_hidden*
+
+# KDE directory preferences
+.directory
+
+# Linux trash folder which might appear on any partition or disk
+.Trash-*
+
+# .nfs files are created when an open file is removed but is still being accessed
+.nfs*
+
+### PyCharm+all ###
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff
+.idea/**/workspace.xml
+.idea/**/tasks.xml
+.idea/**/usage.statistics.xml
+.idea/**/dictionaries
+.idea/**/shelf
+
+# Generated files
+.idea/**/contentModel.xml
+
+# Sensitive or high-churn files
+.idea/**/dataSources/
+.idea/**/dataSources.ids
+.idea/**/dataSources.local.xml
+.idea/**/sqlDataSources.xml
+.idea/**/dynamic.xml
+.idea/**/uiDesigner.xml
+.idea/**/dbnavigator.xml
+
+# Gradle
+.idea/**/gradle.xml
+.idea/**/libraries
+
+# Gradle and Maven with auto-import
+# When using Gradle or Maven with auto-import, you should exclude module files,
+# since they will be recreated, and may cause churn. Uncomment if using
+# auto-import.
+# .idea/modules.xml
+# .idea/*.iml
+# .idea/modules
+
+# CMake
+cmake-build-*/
+
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+
+# File-based project format
+*.iws
+
+# IntelliJ
+out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Cursive Clojure plugin
+.idea/replstate.xml
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+
+# Editor-based Rest Client
+.idea/httpRequests
+
+# Android studio 3.1+ serialized cache file
+.idea/caches/build_file_checksums.ser
+
+# JetBrains templates
+**___jb_tmp___
+
+### PyCharm+all Patch ###
+# Ignores the whole .idea folder and all .iml files
+# See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360
+
+.idea/
+
+# Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023
+
+*.iml
+modules.xml
+.idea/misc.xml
+*.ipr
+
+# Sonarlint plugin
+.idea/sonarlint
+
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don’t work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# End of https://www.gitignore.io/api/linux,python,pycharm+all
\ No newline at end of file
diff --git a/gutenberg_cleaner.egg-info/PKG-INFO b/gutenberg_cleaner.egg-info/PKG-INFO
new file mode 100644
index 0000000..1096e29
--- /dev/null
+++ b/gutenberg_cleaner.egg-info/PKG-INFO
@@ -0,0 +1,57 @@
+Metadata-Version: 1.1
+Name: gutenberg-cleaner
+Version: 0.0.1
+Summary: cleans gutenberg dataset books
+Home-page: UNKNOWN
+Author: UNKNOWN
+Author-email: mohsenikiasari@ce.sharif.edu
+License: MIT
+Description: ![](https://i.ibb.co/sCJXhmz/header-sp.png)
+ ![](https://img.shields.io/apm/l/vim-mode.svg)
+
+
+ # gutenberg-cleaner
+
+ a python package for cleaning Gutenberg books and dataset.
+
+ ### Prerequisites
+ nltk package
+
+ ### Installing
+ ```
+ [sudo] pip install gutenberg-cleaner
+ ```
+
+ ## How to use it?
+
+ it has two methods called "simple_cleaner" and "super_cleaner".
+ ### simple_claner:
+ Just removes lines that are part of the Project Gutenberg header or footer.
+ Doesnt go deeply in the text to remove other things like titles or footnotes or etc...
+ ```
+ simple_cleaner(book: str) -> str
+ ```
+ ### super_cleaner:
+ Super clean the book (titles, footnotes, images, book information, etc.). may delete some good lines too.
+ ```
+ super_cleaner(book: str, min_token: int = 5, max_token: int = 600) -> str
+ ```
+ min_token: The minimum tokens of a paragraph that is not "dialog" or "quote", -1 means don't tokenize the txt (so it will be faster, but less efficient cleaning).
+ max_token: The maximum tokens of a paragraph.
+
+ it will mark deleted paragraphs with: [deleted]
+
+
+ ## Author
+
+ * **Peyman Mohseni kiasari**
+
+ ## License
+
+ This project is licensed under the MIT License - see the [LICENSE.md](LICENSE.md) file for details
+
+Platform: UNKNOWN
+Classifier: Programming language :: Python :: 3
+Classifier: Programming language :: Python :: 3.6
+Classifier: Programming language :: Python :: 3.7
+Classifier: Operation System :: OS Independent
diff --git a/gutenberg_cleaner.egg-info/SOURCES.txt b/gutenberg_cleaner.egg-info/SOURCES.txt
new file mode 100644
index 0000000..564eaf9
--- /dev/null
+++ b/gutenberg_cleaner.egg-info/SOURCES.txt
@@ -0,0 +1,7 @@
+README.md
+setup.py
+gutenberg_cleaner.egg-info/PKG-INFO
+gutenberg_cleaner.egg-info/SOURCES.txt
+gutenberg_cleaner.egg-info/dependency_links.txt
+gutenberg_cleaner.egg-info/requires.txt
+gutenberg_cleaner.egg-info/top_level.txt
\ No newline at end of file
diff --git a/gutenberg_cleaner.egg-info/dependency_links.txt b/gutenberg_cleaner.egg-info/dependency_links.txt
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/gutenberg_cleaner.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/gutenberg_cleaner.egg-info/requires.txt b/gutenberg_cleaner.egg-info/requires.txt
new file mode 100644
index 0000000..8469296
--- /dev/null
+++ b/gutenberg_cleaner.egg-info/requires.txt
@@ -0,0 +1 @@
+nltk
diff --git a/gutenberg_cleaner.egg-info/top_level.txt b/gutenberg_cleaner.egg-info/top_level.txt
new file mode 100644
index 0000000..fae547e
--- /dev/null
+++ b/gutenberg_cleaner.egg-info/top_level.txt
@@ -0,0 +1 @@
+gutenbergـcleaner
diff --git a/gutenberg_cleaner.py b/gutenberg_cleaner.py
new file mode 100644
index 0000000..62e8457
--- /dev/null
+++ b/gutenberg_cleaner.py
@@ -0,0 +1,42 @@
+from gutenberg_cleaning_options.cleaning_options import is_title_or_etc, is_books_copy, is_email_init, is_footnote, \
+ is_image, is_table
+from gutenberg_cleaning_options.strip_headers import strip_headers
+
+
+def simple_cleaner(book: str) -> str:
+    """
+    Remove the Project Gutenberg header and footer from a book.
+
+    All the work is delegated to ``strip_headers``; this function does no further
+    processing of its own, so titles, footnotes and the like are not touched here.
+    :rtype: str
+    :param book: str of a gutenberg's book
+    :return: the value returned by ``strip_headers`` for ``book``.
+    """
+    headless_book = strip_headers(book)
+    return headless_book
+
+
+def super_cleaner(book: str, min_token: int = 5, max_token: int = 600) -> str:
+    """
+    Aggressively clean a book: drop titles, footnotes, images, book information, etc.
+
+    The header and footer are stripped first, then the text is split into paragraphs
+    on blank lines. Every paragraph that one of the ``is_*`` checks flags is replaced
+    by the marker "[deleted]"; all other paragraphs are kept unchanged. Good
+    paragraphs may occasionally be flagged too.
+    IMPORTANT: if you don't want the text to be tokenized, pass min_token = -1.
+    :rtype: str
+    :param book: str of a gutenberg's book.
+    :param min_token: forwarded to ``is_title_or_etc``: the minimum tokens of a paragraph
+    that is not "dialog" or "quote", -1 means don't tokenize the text (so it will be faster).
+    :param max_token: forwarded to ``is_title_or_etc``: the maximum tokens of a paragraph.
+    :return: str of the book in which every removed paragraph appears as "[deleted]";
+    paragraphs are still separated by a blank line.
+    """
+
+    def _is_noise(paragraph: str) -> bool:
+        # Same checks, in the same order, with the same short-circuiting as a plain `or` chain.
+        return bool(
+            is_image(paragraph)
+            or is_footnote(paragraph)
+            or is_email_init(paragraph)
+            or is_books_copy(paragraph)
+            or is_table(paragraph)
+            or is_title_or_etc(paragraph, min_token, max_token)
+        )
+
+    body = strip_headers(book)
+    cleaned_paragraphs = [
+        "[deleted]" if _is_noise(paragraph) else paragraph
+        for paragraph in body.split("\n\n")
+    ]
+    return "\n\n".join(cleaned_paragraphs)
diff --git a/gutenberg_cleaning_options/__init__.py b/gutenberg_cleaning_options/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/gutenberg_cleaning_options/cleaning_options.py b/gutenberg_cleaning_options/cleaning_options.py
new file mode 100644
index 0000000..b0850ac
--- /dev/null
+++ b/gutenberg_cleaning_options/cleaning_options.py
@@ -0,0 +1,105 @@
+import string
+import re
+from nltk import word_tokenize
+
+# The patterns are raw strings so that backslash sequences such as \w, \. and \{ reach
+# the regex engine untouched; in a normal string literal they are invalid string
+# escapes, which Python reports with a warning. The compiled patterns are unchanged.
+email_regex = re.compile(r"[\w.-]+@[\w.-]+\.\w+")  # Regex to find Emails.
+footnote_notation_regex = re.compile(r"^\{.+\}|^\[.+\]")  # Text that starts with {...} or [...].
+# "<digits> copies" (the run of digits may be empty) or the lower-case word "copyright".
+number_of_copies_regex = re.compile(r"[0-9]* copies|copyright")
+starts_with_regex = re.compile(r'^[%_<>*]')  # If the text is started with these, it is not a good one.
+image_formats_regex = re.compile(r"\.png|\.jpg|\.jpeg|\.gif|picture:")  # Image file extensions or "picture:".
+
+
+def is_title_or_etc(text: str, min_token: int = 5, max_token: int = 600) -> bool:
+    """
+    Decide whether a paragraph is a title, information about the book or otherwise unwanted.
+
+    The paragraph is stripped and then run through a list of heuristics; the first
+    heuristic that matches makes the function return True.
+    IMPORTANT: if you don't want the text to be tokenized, just put min_token = -1.
+    :rtype: bool
+    :param text: Raw paragraph.
+    :param min_token: The minimum tokens of a paragraph that is not "dialog" or "quote",
+    -1 means don't tokenize the txt (so it will be faster).
+    :param max_token: The maximum tokens of a paragraph.
+    :return: Boolean, True if it is title or information of the book or a bad paragraph.
+    """
+    txt = text.strip()
+    # Tokenizing is skipped for a negative min_token; num_token is then -1, so the
+    # `num_token < min_token` comparison below cannot be true in that case.
+    num_token = len(word_tokenize(txt)) if min_token >= 0 else -1
+    if num_token > max_token:
+        return True
+    # `and` binds tighter than `or`: empty text is rejected outright, which also means
+    # txt[-1] here and the division below are only evaluated for non-empty text.
+    if len(txt) == 0 or num_token < min_token and not (txt.count('"') == 2 or txt.count('\'') == 2 or txt[-1] == ":"):
+        return True  # Length is short but not "dialog" or "quote"
+    # Number of upper-case, digit or punctuation characters (the double quote is not
+    # counted), divided by the length of the text with spaces removed.
+    if sum(1 for c in txt if c.isupper() or c.isdigit() or c in string.punctuation.replace("\"", "")) \
+            / len(txt.replace(" ", "")) > 0.6:
+        return True  # More than 60% of chars are UPPER or digits or punctuations so it might be title or etc.
+    if txt.lower().startswith("appendix") or bool(re.search(starts_with_regex, txt)):
+        return True
+    if txt.count(":") > 3 and 2 * txt.count(":") - txt.count("\"") > 3:
+        return True  # mostly information about the book.
+    # NOTE(review): as it appears here the first test counts single spaces, which flags
+    # any paragraph containing more than three of them -- confirm the literal was not
+    # meant to be a run of several spaces.
+    if txt.count(" ") > 3 or txt.count("\t") > 2 or txt.count("*") > 3 or txt.count("=") > 2:
+        return True  # mostly tables and catalogs and etc.
+    if "@" in txt and len(txt) < 100:
+        return True
+    return False
+
+
+def is_table(text: str) -> bool:
+    """
+    Decide whether a paragraph is a table or catalog.
+
+    :rtype: bool
+    :param text: Raw paragraph.
+    :return: Boolean, True if it is a table or catalog, False otherwise (never None).
+    """
+    txt = text.strip()
+    # NOTE(review): as it appears here the space test counts single spaces, which flags
+    # any paragraph containing more than three of them -- confirm the literal was not
+    # meant to be a run of several spaces.
+    if txt.count(" ") > 3 or txt.count("\t") > 2:
+        # Strip every line and re-join with one space, then test again, so that
+        # whitespace at the start or end of a line is not counted.
+        txt = " ".join([line.strip() for line in txt.split("\n")])
+        if txt.count(" ") > 3 or txt.count("\t") > 2:
+            return True  # mostly tables.
+    if txt.count("*") > 3 or txt.count("=") > 2:
+        return True  # mostly catalogs and etc.
+    # Explicit result: falling off the end would return None despite the bool annotation.
+    return False
+
+
+def is_image(text: str) -> bool:
+    """
+    Decide whether a paragraph refers to an image.
+
+    The paragraph is lower-cased and searched with ``image_formats_regex``.
+    :param text: Raw paragraph.
+    :return: Boolean, True if the pattern is found anywhere in the paragraph.
+    """
+    lowered = text.lower()
+    found = re.search(image_formats_regex, lowered)
+    return found is not None
+
+
+def is_footnote(text: str) -> bool:
+    """
+    Decide whether a paragraph is a footnote of the book.
+
+    A paragraph counts as a footnote when it mentions "footnote" (in any letter case)
+    and has fewer than 50 characters once spaces are removed, or when the stripped
+    paragraph matches ``footnote_notation_regex``.
+    The function only inspects the text; it writes nothing to standard output.
+    :rtype: bool
+    :param text: Raw paragraph.
+    :return: Boolean, True if it is the footnote of the book.
+    """
+    txt = text.strip()
+    if "footnote" in txt.lower() and len(txt.replace(" ", "")) < 50:
+        return True
+    return bool(re.search(footnote_notation_regex, txt))  # if a line starts with {...} it might be a footnote.
+
+
+if __name__ == "__main__":
+    # Manual check: only runs when this file is executed directly, so importing the
+    # module no longer prints anything.
+    print(is_footnote(""" [0] The country-seat of Bishop Shipley, the good bishop,
+ as Dr. Franklin used to style him.--B."""))
+
+
+def is_books_copy(text: str) -> bool:
+    """
+    Decide whether a paragraph talks about copies of the book or copyright.
+
+    Both conditions must hold: ``number_of_copies_regex`` is found in the paragraph
+    and the paragraph has fewer than 500 characters once spaces are removed.
+    :rtype: bool
+    :param text: Raw paragraph.
+    :return: Boolean, True if it is indicating the copy of book or copyrights.
+    """
+    mentions_copies = re.search(number_of_copies_regex, text) is not None
+    is_short = len(text.replace(" ", "")) < 500
+    return mentions_copies and is_short
+
+
+def is_email_init(text: str) -> bool:
+    """
+    Decide whether a paragraph contains an email address.
+
+    :rtype: bool
+    :param text: Raw paragraph.
+    :return: Boolean, True if ``email_regex`` is found anywhere in the paragraph.
+    """
+    address = re.search(email_regex, text)
+    return address is not None
diff --git a/gutenberg_cleaning_options/strip_headers.py b/gutenberg_cleaning_options/strip_headers.py
new file mode 100644
index 0000000..3acb1c2
--- /dev/null
+++ b/gutenberg_cleaning_options/strip_headers.py
@@ -0,0 +1,149 @@
+"""Module to remove the noise from Project Gutenberg texts."""
+
+from __future__ import absolute_import, unicode_literals
+from builtins import str
+import os
+
+# Strings taken from the front matter of Project Gutenberg files: start banners,
+# "prepared by"/"Produced by" credits, small-print notices and similar lines.
+# NOTE(review): the code that consumes this set is not visible here; presumably lines
+# are tested with str.startswith, as is done for TEXT_END_MARKERS -- confirm.
+# NOTE(review): a few entries begin with a space; if they are matched as prefixes the
+# exact leading whitespace matters, so verify these literals before editing them.
+TEXT_START_MARKERS = frozenset((
+    "*END*THE SMALL PRINT",
+    "*** START OF THE PROJECT GUTENBERG",
+    "*** START OF THIS PROJECT GUTENBERG",
+    "This etext was prepared by",
+    "E-text prepared by",
+    "Produced by",
+    "Distributed Proofreading Team",
+    "Proofreading Team at http://www.pgdp.net",
+    "http://gallica.bnf.fr)",
+    " http://archive.org/details/",
+    "http://www.pgdp.net",
+    "by The Internet Archive)",
+    "by The Internet Archive/Canadian Libraries",
+    "by The Internet Archive/American Libraries",
+    "public domain material from the Internet Archive",
+    "Internet Archive)",
+    "Internet Archive/Canadian Libraries",
+    "Internet Archive/American Libraries",
+    "material from the Google Print project",
+    "*END THE SMALL PRINT",
+    "***START OF THE PROJECT GUTENBERG",
+    "This etext was produced by",
+    "*** START OF THE COPYRIGHTED",
+    "The Project Gutenberg",
+    "http://gutenberg.spiegel.de/ erreichbar.",
+    "Project Runeberg publishes",
+    "Beginning of this Project Gutenberg",
+    "Project Gutenberg Online Distributed",
+    "Gutenberg Online Distributed",
+    "the Project Gutenberg Online Distributed",
+    "Project Gutenberg TEI",
+    "This eBook was prepared by",
+    "http://gutenberg2000.de erreichbar.",
+    "This Etext was prepared by",
+    "This Project Gutenberg Etext was prepared by",
+    "Gutenberg Distributed Proofreaders",
+    "Project Gutenberg Distributed Proofreaders",
+    "the Project Gutenberg Online Distributed Proofreading Team",
+    "**The Project Gutenberg",
+    "*SMALL PRINT!",
+    "More information about this book is at the top of this file.",
+    "tells you about restrictions in how the file may be used.",
+    "l'authorization à les utilizer pour preparer ce texte.",
+    "of the etext through OCR.",
+    "*****These eBooks Were Prepared By Thousands of Volunteers!*****",
+    "We need your donations more than ever!",
+    " *** START OF THIS PROJECT GUTENBERG",
+    "**** SMALL PRINT!",
+    '["Small Print" V.',
+    ' (http://www.ibiblio.org/gutenberg/',
+    'and the Project Gutenberg Online Distributed Proofreading Team',
+    'Mary Meehan, and the Project Gutenberg Online Distributed Proofreading',
+    ' this Project Gutenberg edition.',
+))
+
+# Line prefixes that mark the start of the Project Gutenberg footer. The footer check
+# in this module tests each line with str.startswith against every entry and stops
+# collecting output at the first line that matches.
+# NOTE(review): a few entries begin or end with a space; because they are matched as
+# prefixes the exact whitespace matters, so verify these literals before editing them.
+TEXT_END_MARKERS = frozenset((
+    "*** END OF THE PROJECT GUTENBERG",
+    "*** END OF THIS PROJECT GUTENBERG",
+    "***END OF THE PROJECT GUTENBERG",
+    "End of the Project Gutenberg",
+    "End of The Project Gutenberg",
+    "Ende dieses Project Gutenberg",
+    "by Project Gutenberg",
+    "End of Project Gutenberg",
+    "End of this Project Gutenberg",
+    "Ende dieses Projekt Gutenberg",
+    " ***END OF THE PROJECT GUTENBERG",
+    "*** END OF THE COPYRIGHTED",
+    "End of this is COPYRIGHTED",
+    "Ende dieses Etextes ",
+    "Ende dieses Project Gutenber",
+    "Ende diese Project Gutenberg",
+    "**This is a COPYRIGHTED Project Gutenberg Etext, Details Above**",
+    "Fin de Project Gutenberg",
+    "The Project Gutenberg Etext of ",
+    "Ce document fut presente en lecture",
+    "Ce document fut présenté en lecture",
+    "More information about this book is at the top of this file.",
+    "We need your donations more than ever!",
+    "END OF PROJECT GUTENBERG",
+    " End of the Project Gutenberg",
+    " *** END OF THIS PROJECT GUTENBERG",
+))
+
+LEGALESE_START_MARKERS = frozenset(("<= 100:
+ # Check if the footer begins here
+ if any(line.startswith(token) for token in TEXT_END_MARKERS):
+ footer_found = True
+
+ # If it's the beginning of the footer, stop output
+ if footer_found:
+ break
+
+ if any(line.startswith(token) for token in LEGALESE_START_MARKERS):
+ ignore_section = True
+ continue
+ elif any(line.startswith(token) for token in LEGALESE_END_MARKERS):
+ ignore_section = False
+ continue
+
+ if not ignore_section:
+ out.append(line.rstrip(sep))
+ i += 1
+
+ return sep.join(out)
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..1e8d2f3
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,21 @@
+from setuptools import setup
+
+# Read the README with an explicit encoding so the result does not depend on the
+# locale of the machine that builds the package.
+with open('README.md', encoding='utf-8') as f:
+    long_description = f.read()
+
+setup(
+    name="gutenberg_cleaner",
+    install_requires=['nltk'],
+    version='0.0.1',
+    description="cleans gutenberg dataset books",
+    author_email='mohsenikiasari@ce.sharif.edu',
+    # Must be spelled with a plain ASCII underscore to match gutenberg_cleaner.py; the
+    # earlier value contained U+0640 (Arabic tatweel) in place of the underscore, so
+    # it named a module file that does not exist.
+    py_modules=["gutenberg_cleaner"],
+    # gutenberg_cleaner.py imports from this package, so it has to be shipped too.
+    packages=["gutenberg_cleaning_options"],
+    license='MIT',
+    long_description=long_description,
+    # README.md is Markdown; declare it so the description is rendered as such.
+    long_description_content_type="text/markdown",
+    # Classifier strings must match the official list exactly, including letter case.
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3.7",
+        "Operating System :: OS Independent",
+    ]
+)