diff --git a/.idea/.name b/.idea/.name
new file mode 100644
index 0000000..9761cd8
--- /dev/null
+++ b/.idea/.name
@@ -0,0 +1 @@
+Gutenberg_cleaner
\ No newline at end of file
diff --git a/.idea/Gutenberg_cleaner.iml b/.idea/Gutenberg_cleaner.iml
new file mode 100644
index 0000000..5138a24
--- /dev/null
+++ b/.idea/Gutenberg_cleaner.iml
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+    <orderEntry type="library" name="R User Library" level="project" />
+    <orderEntry type="library" name="R Skeletons" level="application" />
+  </component>
+  <component name="TestRunnerService">
+    <option name="projectConfiguration" value="Twisted Trial" />
+    <option name="PROJECT_TEST_RUNNER" value="Twisted Trial" />
+  </component>
+</module>
\ No newline at end of file
diff --git a/.idea/dictionaries/peyman.xml b/.idea/dictionaries/peyman.xml
new file mode 100644
index 0000000..5c50b93
--- /dev/null
+++ b/.idea/dictionaries/peyman.xml
@@ -0,0 +1,7 @@
+<component name="ProjectDictionaryState">
+  <dictionary name="peyman">
+    <words>
+      <w>dataset</w>
+    </words>
+  </dictionary>
+</component>
\ No newline at end of file
diff --git a/.idea/libraries/R_User_Library.xml b/.idea/libraries/R_User_Library.xml
new file mode 100644
index 0000000..71f5ff7
--- /dev/null
+++ b/.idea/libraries/R_User_Library.xml
@@ -0,0 +1,6 @@
+<component name="libraryTable">
+  <library name="R User Library">
+    <CLASSES />
+    <SOURCES />
+  </library>
+</component>
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..3999087
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="JavaScriptSettings">
+    <option name="languageLevel" value="ES6" />
+  </component>
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6" project-jdk-type="Python SDK" />
+</project>
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..535c83f
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/Gutenberg_API.iml" filepath="$PROJECT_DIR$/.idea/Gutenberg_API.iml" />
+      <module fileurl="file://$PROJECT_DIR$/.idea/Gutenberg_cleaner.iml" filepath="$PROJECT_DIR$/.idea/Gutenberg_cleaner.iml" />
+    </modules>
+  </component>
+</project>
\ No newline at end of file
diff --git a/LICENSE.md b/LICENSE.md
new file mode 100644
index 0000000..058a5a4
--- /dev/null
+++ b/LICENSE.md
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2019 Peyman Mohseni Kiasari
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/Pipfile b/Pipfile
new file mode 100644
index 0000000..08bebb3
--- /dev/null
+++ b/Pipfile
@@ -0,0 +1,13 @@
+[[source]]
+name = "pypi"
+url = "https://pypi.org/simple"
+verify_ssl = true
+
+[dev-packages]
+pytest = ">=3.6"
+
+[packages]
+gutenberg-cleaner = {editable = true,path = "."}
+
+[requires]
+python_version = "3.6"
diff --git a/Pipfile.lock b/Pipfile.lock
new file mode 100644
index 0000000..872fbe3
--- /dev/null
+++ b/Pipfile.lock
@@ -0,0 +1,97 @@
+{
+    "_meta": {
+        "hash": {
+            "sha256": "a94cfd56101fcd45bbc7bc59fdc39f4fbc900808a4e792ef202fa7839f0b6413"
+        },
+        "pipfile-spec": 6,
+        "requires": {
+            "python_version": "3.6"
+        },
+        "sources": [
+            {
+                "name": "pypi",
+                "url": "https://pypi.org/simple",
+                "verify_ssl": true
+            }
+        ]
+    },
+    "default": {
+        "gutenberg-cleaner": {
+            "editable": true,
+            "path": "."
+        },
+        "nltk": {
+            "hashes": [
+                "sha256:3a64b1cb685bbf344adec416871fee07996671c876ff313b3e504158fa1500e1"
+            ],
+            "version": "==3.4.1"
+        },
+        "six": {
+            "hashes": [
+                "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
+                "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
+            ],
+            "version": "==1.12.0"
+        }
+    },
+    "develop": {
+        "atomicwrites": {
+            "hashes": [
+                "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4",
+                "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6"
+            ],
+            "version": "==1.3.0"
+        },
+        "attrs": {
+            "hashes": [
+                "sha256:69c0dbf2ed392de1cb5ec704444b08a5ef81680a61cb899dc08127123af36a79",
+                "sha256:f0b870f674851ecbfbbbd364d6b5cbdff9dcedbc7f3f5e18a6891057f21fe399"
+            ],
+            "version": "==19.1.0"
+        },
+        "more-itertools": {
+            "hashes": [
+                "sha256:2112d2ca570bb7c3e53ea1a35cd5df42bb0fd10c45f0fb97178679c3c03d64c7",
+                "sha256:c3e4748ba1aad8dba30a4886b0b1a2004f9a863837b8654e7059eebf727afa5a"
+            ],
+            "markers": "python_version > '2.7'",
+            "version": "==7.0.0"
+        },
+        "pluggy": {
+            "hashes": [
+                "sha256:25a1bc1d148c9a640211872b4ff859878d422bccb59c9965e04eed468a0aa180",
+                "sha256:964cedd2b27c492fbf0b7f58b3284a09cf7f99b0f715941fb24a439b3af1bd1a"
+            ],
+            "version": "==0.11.0"
+        },
+        "py": {
+            "hashes": [
+                "sha256:64f65755aee5b381cea27766a3a147c3f15b9b6b9ac88676de66ba2ae36793fa",
+                "sha256:dc639b046a6e2cff5bbe40194ad65936d6ba360b52b3c3fe1d08a82dd50b5e53"
+            ],
+            "version": "==1.8.0"
+        },
+        "pytest": {
+            "hashes": [
+                "sha256:1a8aa4fa958f8f451ac5441f3ac130d9fc86ea38780dd2715e6d5c5882700b24",
+                "sha256:b8bf138592384bd4e87338cb0f256bf5f615398a649d4bd83915f0e4047a5ca6"
+            ],
+            "index": "pypi",
+            "version": "==4.5.0"
+        },
+        "six": {
+            "hashes": [
+                "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
+                "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
+            ],
+            "version": "==1.12.0"
+        },
+        "wcwidth": {
+            "hashes": [
+                "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e",
+                "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c"
+            ],
+            "version": "==0.1.7"
+        }
+    }
+}
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..3f0d21f
--- /dev/null
+++ b/README.md
@@ -0,0 +1,43 @@
+![](https://i.ibb.co/sCJXhmz/header-sp.png)
+![](https://img.shields.io/apm/l/vim-mode.svg)
+
+
+# gutenberg-cleaner
+
+A Python package for cleaning Project Gutenberg books and datasets.
+
+### Prerequisites
+nltk package
+
+### Installing
+```
+[sudo] pip install gutenberg-cleaner
+```
+
+## How to use it?
+
+It provides two functions, "simple_cleaner" and "super_cleaner".
+### simple_cleaner:
+Just removes lines that are part of the Project Gutenberg header or footer.
+Doesn't go deeper into the text to remove other things such as titles or footnotes.
+```
+simple_cleaner(book: str) -> str
+```
+### super_cleaner:
+Super cleans the book (titles, footnotes, images, book information, etc.). It may delete some good lines too.
+```
+super_cleaner(book: str, min_token: int = 5, max_token: int = 600) -> str
+```
+min_token: The minimum number of tokens a paragraph must have unless it is a "dialog" or "quote"; -1 means the text is not tokenized (faster, but the cleaning is less thorough).
+max_token: The maximum tokens of a paragraph.
+
+it will mark deleted paragraphs with: [deleted]
+
+
+## Author
+
+* **Peyman Mohseni kiasari**
+
+## License
+
+This project is licensed under the MIT License - see the [LICENSE.md](LICENSE.md) file for details
diff --git a/dist/gutenberg_cleaner-0.0.1-py3-none-any.whl b/dist/gutenberg_cleaner-0.0.1-py3-none-any.whl
new file mode 100644
index 0000000..bb9ca61
Binary files /dev/null and b/dist/gutenberg_cleaner-0.0.1-py3-none-any.whl differ
diff --git a/dist/gutenberg_cleaner-0.0.1.tar.gz b/dist/gutenberg_cleaner-0.0.1.tar.gz
new file mode 100644
index 0000000..450881c
Binary files /dev/null and b/dist/gutenberg_cleaner-0.0.1.tar.gz differ
diff --git a/gitignore.gitignore b/gitignore.gitignore
new file mode 100644
index 0000000..b9af354
--- /dev/null
+++ b/gitignore.gitignore
@@ -0,0 +1,232 @@
+
+# Created by https://www.gitignore.io/api/linux,python,pycharm+all
+# Edit at https://www.gitignore.io/?templates=linux,python,pycharm+all
+
+### Linux ###
+*~
+
+# temporary files which can be created if a process still has a handle open of a deleted file
+.fuse_hidden*
+
+# KDE directory preferences
+.directory
+
+# Linux trash folder which might appear on any partition or disk
+.Trash-*
+
+# .nfs files are created when an open file is removed but is still being accessed
+.nfs*
+
+### PyCharm+all ###
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff
+.idea/**/workspace.xml
+.idea/**/tasks.xml
+.idea/**/usage.statistics.xml
+.idea/**/dictionaries
+.idea/**/shelf
+
+# Generated files
+.idea/**/contentModel.xml
+
+# Sensitive or high-churn files
+.idea/**/dataSources/
+.idea/**/dataSources.ids
+.idea/**/dataSources.local.xml
+.idea/**/sqlDataSources.xml
+.idea/**/dynamic.xml
+.idea/**/uiDesigner.xml
+.idea/**/dbnavigator.xml
+
+# Gradle
+.idea/**/gradle.xml
+.idea/**/libraries
+
+# Gradle and Maven with auto-import
+# When using Gradle or Maven with auto-import, you should exclude module files,
+# since they will be recreated, and may cause churn.  Uncomment if using
+# auto-import.
+# .idea/modules.xml
+# .idea/*.iml
+# .idea/modules
+
+# CMake
+cmake-build-*/
+
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+
+# File-based project format
+*.iws
+
+# IntelliJ
+out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Cursive Clojure plugin
+.idea/replstate.xml
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+
+# Editor-based Rest Client
+.idea/httpRequests
+
+# Android studio 3.1+ serialized cache file
+.idea/caches/build_file_checksums.ser
+
+# JetBrains templates
+**___jb_tmp___
+
+### PyCharm+all Patch ###
+# Ignores the whole .idea folder and all .iml files
+# See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360
+
+.idea/
+
+# Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023
+
+*.iml
+modules.xml
+.idea/misc.xml
+*.ipr
+
+# Sonarlint plugin
+.idea/sonarlint
+
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don’t work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# End of https://www.gitignore.io/api/linux,python,pycharm+all
\ No newline at end of file
diff --git a/gutenberg_cleaner.egg-info/PKG-INFO b/gutenberg_cleaner.egg-info/PKG-INFO
new file mode 100644
index 0000000..1096e29
--- /dev/null
+++ b/gutenberg_cleaner.egg-info/PKG-INFO
@@ -0,0 +1,57 @@
+Metadata-Version: 1.1
+Name: gutenberg-cleaner
+Version: 0.0.1
+Summary: cleans gutenberg dataset books
+Home-page: UNKNOWN
+Author: UNKNOWN
+Author-email: mohsenikiasari@ce.sharif.edu
+License: MIT
+Description: ![](https://i.ibb.co/sCJXhmz/header-sp.png)
+        ![](https://img.shields.io/apm/l/vim-mode.svg)
+        
+        
+        # gutenberg-cleaner
+        
+        a python package for cleaning Gutenberg books and dataset.
+        
+        ### Prerequisites
+        nltk package
+        
+        ### Installing
+        ```
+        [sudo] pip install gutenberg-cleaner
+        ```
+        
+        ## How to use it?
+        
+        it has two methods called "simple_cleaner" and "super_cleaner".
+        ### simple_claner:
+        Just removes lines that are part of the Project Gutenberg header or footer.
+        Doesnt go deeply in the text to remove other things like titles or footnotes or etc...
+        ```
+        simple_cleaner(book: str) -> str
+        ```
+        ### super_cleaner:
+        Super clean the book (titles, footnotes, images, book information, etc.). may delete some good lines too.
+        ```
+        super_cleaner(book: str, min_token: int = 5, max_token: int = 600) -> str
+        ```
+        min_token: The minimum tokens of a paragraph that is not "dialog" or "quote", -1 means don't tokenize the txt (so it will be faster, but less efficient cleaning).
+        max_token: The maximum tokens of a paragraph.
+        
+        it will mark deleted paragraphs with: [deleted]
+        
+        
+        ## Author
+        
+        * **Peyman Mohseni kiasari**
+        
+        ## License
+        
+        This project is licensed under the MIT License - see the [LICENSE.md](LICENSE.md) file for details
+        
+Platform: UNKNOWN
+Classifier: Programming language :: Python :: 3
+Classifier: Programming language :: Python :: 3.6
+Classifier: Programming language :: Python :: 3.7
+Classifier: Operation System :: OS Independent
diff --git a/gutenberg_cleaner.egg-info/SOURCES.txt b/gutenberg_cleaner.egg-info/SOURCES.txt
new file mode 100644
index 0000000..564eaf9
--- /dev/null
+++ b/gutenberg_cleaner.egg-info/SOURCES.txt
@@ -0,0 +1,7 @@
+README.md
+setup.py
+gutenberg_cleaner.egg-info/PKG-INFO
+gutenberg_cleaner.egg-info/SOURCES.txt
+gutenberg_cleaner.egg-info/dependency_links.txt
+gutenberg_cleaner.egg-info/requires.txt
+gutenberg_cleaner.egg-info/top_level.txt
\ No newline at end of file
diff --git a/gutenberg_cleaner.egg-info/dependency_links.txt b/gutenberg_cleaner.egg-info/dependency_links.txt
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/gutenberg_cleaner.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/gutenberg_cleaner.egg-info/requires.txt b/gutenberg_cleaner.egg-info/requires.txt
new file mode 100644
index 0000000..8469296
--- /dev/null
+++ b/gutenberg_cleaner.egg-info/requires.txt
@@ -0,0 +1 @@
+nltk
diff --git a/gutenberg_cleaner.egg-info/top_level.txt b/gutenberg_cleaner.egg-info/top_level.txt
new file mode 100644
index 0000000..fae547e
--- /dev/null
+++ b/gutenberg_cleaner.egg-info/top_level.txt
@@ -0,0 +1 @@
+gutenbergـcleaner
diff --git a/gutenberg_cleaner.py b/gutenberg_cleaner.py
new file mode 100644
index 0000000..62e8457
--- /dev/null
+++ b/gutenberg_cleaner.py
@@ -0,0 +1,42 @@
+from gutenberg_cleaning_options.cleaning_options import is_title_or_etc, is_books_copy, is_email_init, is_footnote, \
+    is_image, is_table
+from gutenberg_cleaning_options.strip_headers import strip_headers
+
+
+def simple_cleaner(book: str) -> str:
+    """
+    Strip only the Project Gutenberg header and footer lines from a book.
+    Titles, footnotes and similar in-text noise are left untouched.
+    :param book: str of a gutenberg's book
+    :return: str of the book without the header and footer lines.
+    """
+    headless_book = strip_headers(book)
+    return headless_book
+
+
+def super_cleaner(book: str, min_token: int = 5, max_token: int = 600) -> str:
+    """
+    Strip the Gutenberg header/footer, then replace every paragraph that looks like a title, footnote,
+    image mention, table, e-mail or copyright/copies notice with "[deleted]". May delete some good lines too.
+    ^_^ Do you have a comment to make it better? Email to mohsenikiasari@ce.sharif.edu ^_^.
+    IMPORTANT: if you don't want the text to be tokenized, just put min_token = -1.
+    :rtype: str
+    :param book: str of a gutenberg's book.
+    :param min_token: The minimum tokens of a paragraph that is not "dialog" or "quote",
+     -1 means don't tokenize the txt (so it will be faster).
+    :param max_token: The maximum tokens of a paragraph.
+    :return: str of the book; paragraphs are separated by one blank line and deleted ones read "[deleted]".
+    """
+    # strip_headers joins lines with os.linesep; normalise "\r\n" so that paragraphs can always be
+    # split on "\n\n" (otherwise nothing is split on Windows and the whole book is one paragraph).
+    headless_book = strip_headers(book).replace("\r\n", "\n")
+
+    paragraphs_after_cleaning = []
+    for par in headless_book.split("\n\n"):
+        if is_image(par) or is_footnote(par) or is_email_init(par) or \
+                is_books_copy(par) or is_table(par) or is_title_or_etc(par, min_token, max_token):
+            paragraphs_after_cleaning.append("[deleted]")  # if the paragraph is not good , replace it with [deleted]
+        else:
+            paragraphs_after_cleaning.append(par)
+
+    return "\n\n".join(paragraphs_after_cleaning)  # joining the list of paragraphs into one string
diff --git a/gutenberg_cleaning_options/__init__.py b/gutenberg_cleaning_options/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/gutenberg_cleaning_options/cleaning_options.py b/gutenberg_cleaning_options/cleaning_options.py
new file mode 100644
index 0000000..b0850ac
--- /dev/null
+++ b/gutenberg_cleaning_options/cleaning_options.py
@@ -0,0 +1,105 @@
+import string
+import re
+from nltk import word_tokenize
+
+email_regex = re.compile(r"[\w.-]+@[\w.-]+\.\w+")  # Regex to find Emails (raw strings: no invalid escapes).
+footnote_notation_regex = re.compile(r"^\{.+\}|^\[.+\]")  # Regex to find start of footnotes.
+number_of_copies_regex = re.compile(r"[0-9]* copies|copyright")  # Regex to find copy mentioning.
+starts_with_regex = re.compile(r'^[%_<>*]')  # If the text is started with these, it is not a good one.
+image_formats_regex = re.compile(r"\.png|\.jpg|\.jpeg|\.gif|picture:")  # Regex to find images.
+
+
+def is_title_or_etc(text: str, min_token: int = 5, max_token: int = 600) -> bool:
+    """
+    Decide whether a paragraph is a title, book information or another unwanted paragraph.
+    IMPORTANT: if you don't want the text to be tokenize, just put min_token = -1.
+    :param text: Raw paragraph.
+    :param min_token: Minimum tokens of a paragraph that is not "dialog" or "quote"; -1 disables tokenizing.
+    :param max_token: The maximum tokens of a paragraph.
+    :return: Boolean, True if it is title or information of the book or a bad paragraph.
+    """
+    stripped = text.strip()
+    token_count = -1 if min_token < 0 else len(word_tokenize(stripped))
+    if token_count > max_token or not stripped:
+        return True  # too many tokens, or nothing left after stripping
+    # exactly two quote marks or a trailing colon: treated as "dialog" or "quote"
+    quote_like = stripped.count('"') == 2 or stripped.count('\'') == 2 or stripped[-1] == ":"
+    if token_count < min_token and not quote_like:
+        return True  # Length is short but not "dialog" or "quote"
+    flagged_punctuation = string.punctuation.replace("\"", "")
+    flagged = sum(1 for ch in stripped if ch.isupper() or ch.isdigit() or ch in flagged_punctuation)
+    if flagged / len(stripped.replace(" ", "")) > 0.6:
+        return True  # More than 60% of chars are UPPER or digits or punctuations so it might be title or etc.
+    if stripped.lower().startswith("appendix") or starts_with_regex.search(stripped) is not None:
+        return True
+    colon_count = stripped.count(":")
+    if colon_count > 3 and 2 * colon_count - stripped.count("\"") > 3:
+        return True  # mostly information about the book.
+    if stripped.count("   ") > 3 or stripped.count("\t") > 2 or stripped.count("*") > 3 or stripped.count("=") > 2:
+        return True  # mostly tables and catalogs and etc.
+    return "@" in stripped and len(stripped) < 100
+
+
+def is_table(text: str) -> bool:
+    """
+    determining if a paragraph is a table or catalog.
+    :param text: Raw paragraph.
+    :return: Boolean, True if it is a table or catalog, otherwise False (never None).
+    """
+    txt = text.strip()
+    if txt.count("   ") > 3 or txt.count("\t") > 2:
+        # Re-check with every line stripped, so leading/trailing whitespace of a line no longer counts.
+        txt = " ".join(line.strip() for line in txt.split("\n"))
+        if txt.count("   ") > 3 or txt.count("\t") > 2:
+            return True  # mostly tables.
+    # NOTE: when the re-check above ran, txt is the re-joined single-line form from here on.
+    return txt.count("*") > 3 or txt.count("=") > 2  # mostly catalogs and etc.
+
+
+def is_image(text: str) -> bool:
+    """
+    Tell whether a paragraph mentions an image (an image file extension or "picture:").
+    :param text: Raw paragraph.
+    :return: Boolean, True if it is for mentioning an image.
+    """
+    return image_formats_regex.search(text.lower()) is not None
+
+
+def is_footnote(text: str) -> bool:
+    """
+    determining if a paragraph is the footnote of the book.
+    A paragraph counts as a footnote when it is short (fewer than 50 non-space characters) and
+    mentions "footnote", or when (after stripping) it starts with a "{...}" or "[...]" marker,
+    e.g. "     [0] The country-seat of Bishop Shipley, the good bishop,".
+    No output is produced: the paragraph is not printed, callers only get the boolean.
+    :rtype: bool
+    :param text: Raw paragraph.
+    :return: Boolean, True if it is the footnote of the book.
+    """
+    txt = text.strip()
+    if "footnote" in txt.lower() and len(txt.replace(" ", "")) < 50:
+        return True
+    # if the paragraph starts with {...} or [...] it might be a footnote.
+    return footnote_notation_regex.search(txt) is not None
+
+
+def is_books_copy(text: str) -> bool:
+    """
+    Tell whether a paragraph states the number of copies of the book or mentions copyright.
+    Only paragraphs with fewer than 500 non-space characters qualify; matching is case-sensitive.
+    :rtype: bool
+    :param text: Raw paragraph.
+    :return: Boolean, True if it is indicating the copy of book or copyrights.
+    """
+    mentions_copy = number_of_copies_regex.search(text) is not None
+    return mentions_copy and len(text.replace(" ", "")) < 500
+
+
+def is_email_init(text: str) -> bool:
+    """
+    Tell whether a paragraph contains something that looks like an Email address.
+    :rtype: bool
+    :param text: Raw paragraph.
+    :return: Boolean, True if it includes an Email.
+    """
+    return email_regex.search(text) is not None
diff --git a/gutenberg_cleaning_options/strip_headers.py b/gutenberg_cleaning_options/strip_headers.py
new file mode 100644
index 0000000..3acb1c2
--- /dev/null
+++ b/gutenberg_cleaning_options/strip_headers.py
@@ -0,0 +1,149 @@
+"""Module to remove the noise from Project Gutenberg texts."""
+
+from __future__ import absolute_import, unicode_literals
+from builtins import str
+import os
+
+TEXT_START_MARKERS = frozenset((  # line prefixes; strip_headers() discards everything kept before a matching line
+    "*END*THE SMALL PRINT",
+    "*** START OF THE PROJECT GUTENBERG",
+    "*** START OF THIS PROJECT GUTENBERG",
+    "This etext was prepared by",
+    "E-text prepared by",
+    "Produced by",
+    "Distributed Proofreading Team",
+    "Proofreading Team at http://www.pgdp.net",
+    "http://gallica.bnf.fr)",
+    "      http://archive.org/details/",
+    "http://www.pgdp.net",
+    "by The Internet Archive)",
+    "by The Internet Archive/Canadian Libraries",
+    "by The Internet Archive/American Libraries",
+    "public domain material from the Internet Archive",
+    "Internet Archive)",
+    "Internet Archive/Canadian Libraries",
+    "Internet Archive/American Libraries",
+    "material from the Google Print project",
+    "*END THE SMALL PRINT",
+    "***START OF THE PROJECT GUTENBERG",
+    "This etext was produced by",
+    "*** START OF THE COPYRIGHTED",
+    "The Project Gutenberg",
+    "http://gutenberg.spiegel.de/ erreichbar.",
+    "Project Runeberg publishes",
+    "Beginning of this Project Gutenberg",
+    "Project Gutenberg Online Distributed",
+    "Gutenberg Online Distributed",
+    "the Project Gutenberg Online Distributed",
+    "Project Gutenberg TEI",
+    "This eBook was prepared by",
+    "http://gutenberg2000.de erreichbar.",
+    "This Etext was prepared by",
+    "This Project Gutenberg Etext was prepared by",
+    "Gutenberg Distributed Proofreaders",
+    "Project Gutenberg Distributed Proofreaders",
+    "the Project Gutenberg Online Distributed Proofreading Team",
+    "**The Project Gutenberg",
+    "*SMALL PRINT!",
+    "More information about this book is at the top of this file.",
+    "tells you about restrictions in how the file may be used.",
+    "l'authorization à les utilizer pour preparer ce texte.",
+    "of the etext through OCR.",
+    "*****These eBooks Were Prepared By Thousands of Volunteers!*****",
+    "We need your donations more than ever!",
+    " *** START OF THIS PROJECT GUTENBERG",
+    "****     SMALL PRINT!",
+    '["Small Print" V.',
+    '      (http://www.ibiblio.org/gutenberg/',
+    'and the Project Gutenberg Online Distributed Proofreading Team',
+    'Mary Meehan, and the Project Gutenberg Online Distributed Proofreading',
+    '                this Project Gutenberg edition.',
+))
+
+TEXT_END_MARKERS = frozenset((  # line prefixes; strip_headers() stops output at a matching line (footer start)
+    "*** END OF THE PROJECT GUTENBERG",
+    "*** END OF THIS PROJECT GUTENBERG",
+    "***END OF THE PROJECT GUTENBERG",
+    "End of the Project Gutenberg",
+    "End of The Project Gutenberg",
+    "Ende dieses Project Gutenberg",
+    "by Project Gutenberg",
+    "End of Project Gutenberg",
+    "End of this Project Gutenberg",
+    "Ende dieses Projekt Gutenberg",
+    "        ***END OF THE PROJECT GUTENBERG",
+    "*** END OF THE COPYRIGHTED",
+    "End of this is COPYRIGHTED",
+    "Ende dieses Etextes ",
+    "Ende dieses Project Gutenber",
+    "Ende diese Project Gutenberg",
+    "**This is a COPYRIGHTED Project Gutenberg Etext, Details Above**",
+    "Fin de Project Gutenberg",
+    "The Project Gutenberg Etext of ",
+    "Ce document fut presente en lecture",
+    "Ce document fut présenté en lecture",
+    "More information about this book is at the top of this file.",
+    "We need your donations more than ever!",
+    "END OF PROJECT GUTENBERG",
+    " End of the Project Gutenberg",
+    " *** END OF THIS PROJECT GUTENBERG",
+))
+
+LEGALESE_START_MARKERS = frozenset(("<<THIS ELECTRONIC VERSION OF",))  # prefix of the line opening a section strip_headers() skips
+
+LEGALESE_END_MARKERS = frozenset(("SERVICE THAT CHARGES FOR DOWNLOAD",))  # prefix of the line closing that skipped section
+
+
+def strip_headers(text):
+    """Return *text* without the Project Gutenberg header and footer lines.
+
+    Lines between a legalese start marker and a legalese end marker are dropped as well.
+
+    Note: The original version of the code can be found at:
+    https://github.com/c-w/gutenberg/blob/master/gutenberg/cleanup/strip_headers.py
+
+    Args:
+        text (unicode): The body of the text to clean up.
+
+    Returns:
+        unicode: The kept lines, joined with os.linesep.
+    """
+    newline = str(os.linesep)
+    # str.startswith accepts a tuple of prefixes; build the tuples once instead of per line.
+    header_markers = tuple(TEXT_START_MARKERS)
+    footer_markers = tuple(TEXT_END_MARKERS)
+    legalese_open = tuple(LEGALESE_START_MARKERS)
+    legalese_close = tuple(LEGALESE_END_MARKERS)
+
+    kept = []
+    # Number of lines appended so far; it is not reset when `kept` is cleared below.
+    kept_count = 0
+    in_legalese = False
+
+    for raw_line in text.splitlines():
+        # A header marker seen while at most 600 lines were kept means everything
+        # collected so far was still header: drop it. This may happen several times.
+        if kept_count <= 600 and raw_line.startswith(header_markers):
+            kept = []
+            continue
+
+        # A footer marker seen once at least 100 lines were kept ends the book text;
+        # nothing after it is emitted.
+        if kept_count >= 100 and raw_line.startswith(footer_markers):
+            break
+
+        # Legalese marker lines toggle a skipped section and are dropped themselves.
+        if raw_line.startswith(legalese_open):
+            in_legalese = True
+            continue
+        if raw_line.startswith(legalese_close):
+            in_legalese = False
+            continue
+
+        if in_legalese:
+            continue
+        # rstrip only removes trailing os.linesep characters; other whitespace stays.
+        kept.append(raw_line.rstrip(newline))
+        kept_count += 1
+
+    return newline.join(kept)
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..1e8d2f3
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,21 @@
+from setuptools import setup
+
+with open('README.md', encoding='utf-8') as f:  # explicit encoding: do not depend on the locale default
+    long_description = f.read()
+
+setup(
+    name="gutenberg_cleaner", version='0.0.1',
+    install_requires=['nltk'], license='MIT',
+    description="cleans gutenberg dataset books",
+    author_email='mohsenikiasari@ce.sharif.edu',
+    # py_modules must name the real module file (ASCII underscore, not the look-alike U+0640 character),
+    # and the helper package imported by gutenberg_cleaner.py has to be shipped together with it.
+    py_modules=["gutenberg_cleaner"], packages=["gutenberg_cleaning_options"],
+    long_description=long_description, long_description_content_type="text/markdown",
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3.7",
+        "Operating System :: OS Independent"
+    ]
+)