diff --git a/.idea/.name b/.idea/.name new file mode 100644 index 0000000..9761cd8 --- /dev/null +++ b/.idea/.name @@ -0,0 +1 @@ +Gutenberg_cleaner \ No newline at end of file diff --git a/.idea/Gutenberg_cleaner.iml b/.idea/Gutenberg_cleaner.iml new file mode 100644 index 0000000..5138a24 --- /dev/null +++ b/.idea/Gutenberg_cleaner.iml @@ -0,0 +1,14 @@ + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/dictionaries/peyman.xml b/.idea/dictionaries/peyman.xml new file mode 100644 index 0000000..5c50b93 --- /dev/null +++ b/.idea/dictionaries/peyman.xml @@ -0,0 +1,7 @@ + + + + dataset + + + \ No newline at end of file diff --git a/.idea/libraries/R_User_Library.xml b/.idea/libraries/R_User_Library.xml new file mode 100644 index 0000000..71f5ff7 --- /dev/null +++ b/.idea/libraries/R_User_Library.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..3999087 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..535c83f --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..058a5a4 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 Peyman Mohseni Kiasari + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions 
of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/Pipfile b/Pipfile new file mode 100644 index 0000000..08bebb3 --- /dev/null +++ b/Pipfile @@ -0,0 +1,13 @@ +[[source]] +name = "pypi" +url = "https://pypi.org/simple" +verify_ssl = true + +[dev-packages] +pytest = ">=3.6" + +[packages] +gutenberg-cleaner = {editable = true,path = "."} + +[requires] +python_version = "3.6" diff --git a/Pipfile.lock b/Pipfile.lock new file mode 100644 index 0000000..872fbe3 --- /dev/null +++ b/Pipfile.lock @@ -0,0 +1,97 @@ +{ + "_meta": { + "hash": { + "sha256": "a94cfd56101fcd45bbc7bc59fdc39f4fbc900808a4e792ef202fa7839f0b6413" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "3.6" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "gutenberg-cleaner": { + "editable": true, + "path": "." 
+ }, + "nltk": { + "hashes": [ + "sha256:3a64b1cb685bbf344adec416871fee07996671c876ff313b3e504158fa1500e1" + ], + "version": "==3.4.1" + }, + "six": { + "hashes": [ + "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", + "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" + ], + "version": "==1.12.0" + } + }, + "develop": { + "atomicwrites": { + "hashes": [ + "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4", + "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6" + ], + "version": "==1.3.0" + }, + "attrs": { + "hashes": [ + "sha256:69c0dbf2ed392de1cb5ec704444b08a5ef81680a61cb899dc08127123af36a79", + "sha256:f0b870f674851ecbfbbbd364d6b5cbdff9dcedbc7f3f5e18a6891057f21fe399" + ], + "version": "==19.1.0" + }, + "more-itertools": { + "hashes": [ + "sha256:2112d2ca570bb7c3e53ea1a35cd5df42bb0fd10c45f0fb97178679c3c03d64c7", + "sha256:c3e4748ba1aad8dba30a4886b0b1a2004f9a863837b8654e7059eebf727afa5a" + ], + "markers": "python_version > '2.7'", + "version": "==7.0.0" + }, + "pluggy": { + "hashes": [ + "sha256:25a1bc1d148c9a640211872b4ff859878d422bccb59c9965e04eed468a0aa180", + "sha256:964cedd2b27c492fbf0b7f58b3284a09cf7f99b0f715941fb24a439b3af1bd1a" + ], + "version": "==0.11.0" + }, + "py": { + "hashes": [ + "sha256:64f65755aee5b381cea27766a3a147c3f15b9b6b9ac88676de66ba2ae36793fa", + "sha256:dc639b046a6e2cff5bbe40194ad65936d6ba360b52b3c3fe1d08a82dd50b5e53" + ], + "version": "==1.8.0" + }, + "pytest": { + "hashes": [ + "sha256:1a8aa4fa958f8f451ac5441f3ac130d9fc86ea38780dd2715e6d5c5882700b24", + "sha256:b8bf138592384bd4e87338cb0f256bf5f615398a649d4bd83915f0e4047a5ca6" + ], + "index": "pypi", + "version": "==4.5.0" + }, + "six": { + "hashes": [ + "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", + "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" + ], + "version": "==1.12.0" + }, + "wcwidth": { + "hashes": [ + 
"sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e", + "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c" + ], + "version": "==0.1.7" + } + } +} diff --git a/README.md b/README.md new file mode 100644 index 0000000..3f0d21f --- /dev/null +++ b/README.md @@ -0,0 +1,43 @@ +![](https://i.ibb.co/sCJXhmz/header-sp.png) +![](https://img.shields.io/apm/l/vim-mode.svg) + + +# gutenberg-cleaner + +A Python package for cleaning Gutenberg books and datasets. + +### Prerequisites +nltk package + +### Installing +``` +[sudo] pip install gutenberg-cleaner +``` + +## How to use it? + +It has two functions called "simple_cleaner" and "super_cleaner". +### simple_cleaner: +Just removes lines that are part of the Project Gutenberg header or footer. +Doesn't go deep into the text to remove other things such as titles, footnotes, etc. +``` +simple_cleaner(book: str) -> str +``` +### super_cleaner: +Super-cleans the book (titles, footnotes, images, book information, etc.). May delete some good lines too. +``` +super_cleaner(book: str, min_token: int = 5, max_token: int = 600) -> str +``` +min_token: The minimum number of tokens of a paragraph that is not a "dialog" or "quote"; -1 means don't tokenize the text (faster, but less thorough cleaning). +max_token: The maximum number of tokens of a paragraph. 
+ +it will mark deleted paragraphs with: [deleted] + + +## Author + +* **Peyman Mohseni kiasari** + +## License + +This project is licensed under the MIT License - see the [LICENSE.md](LICENSE.md) file for details diff --git a/dist/gutenberg_cleaner-0.0.1-py3-none-any.whl b/dist/gutenberg_cleaner-0.0.1-py3-none-any.whl new file mode 100644 index 0000000..bb9ca61 Binary files /dev/null and b/dist/gutenberg_cleaner-0.0.1-py3-none-any.whl differ diff --git a/dist/gutenberg_cleaner-0.0.1.tar.gz b/dist/gutenberg_cleaner-0.0.1.tar.gz new file mode 100644 index 0000000..450881c Binary files /dev/null and b/dist/gutenberg_cleaner-0.0.1.tar.gz differ diff --git a/gitignore.gitignore b/gitignore.gitignore new file mode 100644 index 0000000..b9af354 --- /dev/null +++ b/gitignore.gitignore @@ -0,0 +1,232 @@ + +# Created by https://www.gitignore.io/api/linux,python,pycharm+all +# Edit at https://www.gitignore.io/?templates=linux,python,pycharm+all + +### Linux ### +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +### PyCharm+all ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with 
auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. +# .idea/modules.xml +# .idea/*.iml +# .idea/modules + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests + +# Android studio 3.1+ serialized cache file +.idea/caches/build_file_checksums.ser + +# JetBrains templates +**___jb_tmp___ + +### PyCharm+all Patch ### +# Ignores the whole .idea folder and all .iml files +# See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360 + +.idea/ + +# Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023 + +*.iml +modules.xml +.idea/misc.xml +*.ipr + +# Sonarlint plugin +.idea/sonarlint + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don’t work, or not +# install all needed dependencies. +#Pipfile.lock + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# End of https://www.gitignore.io/api/linux,python,pycharm+all \ No newline at end of file diff --git a/gutenberg_cleaner.egg-info/PKG-INFO b/gutenberg_cleaner.egg-info/PKG-INFO new file mode 100644 index 0000000..1096e29 --- /dev/null +++ b/gutenberg_cleaner.egg-info/PKG-INFO @@ -0,0 +1,57 @@ +Metadata-Version: 1.1 +Name: gutenberg-cleaner +Version: 0.0.1 +Summary: cleans gutenberg dataset books +Home-page: UNKNOWN +Author: UNKNOWN +Author-email: mohsenikiasari@ce.sharif.edu +License: MIT +Description: ![](https://i.ibb.co/sCJXhmz/header-sp.png) + ![](https://img.shields.io/apm/l/vim-mode.svg) + + + # gutenberg-cleaner + + a python package for cleaning Gutenberg books and 
dataset. + + ### Prerequisites + nltk package + + ### Installing + ``` + [sudo] pip install gutenberg-cleaner + ``` + + ## How to use it? + + it has two methods called "simple_cleaner" and "super_cleaner". + ### simple_claner: + Just removes lines that are part of the Project Gutenberg header or footer. + Doesnt go deeply in the text to remove other things like titles or footnotes or etc... + ``` + simple_cleaner(book: str) -> str + ``` + ### super_cleaner: + Super clean the book (titles, footnotes, images, book information, etc.). may delete some good lines too. + ``` + super_cleaner(book: str, min_token: int = 5, max_token: int = 600) -> str + ``` + min_token: The minimum tokens of a paragraph that is not "dialog" or "quote", -1 means don't tokenize the txt (so it will be faster, but less efficient cleaning). + max_token: The maximum tokens of a paragraph. + + it will mark deleted paragraphs with: [deleted] + + + ## Author + + * **Peyman Mohseni kiasari** + + ## License + + This project is licensed under the MIT License - see the [LICENSE.md](LICENSE.md) file for details + +Platform: UNKNOWN +Classifier: Programming language :: Python :: 3 +Classifier: Programming language :: Python :: 3.6 +Classifier: Programming language :: Python :: 3.7 +Classifier: Operation System :: OS Independent diff --git a/gutenberg_cleaner.egg-info/SOURCES.txt b/gutenberg_cleaner.egg-info/SOURCES.txt new file mode 100644 index 0000000..564eaf9 --- /dev/null +++ b/gutenberg_cleaner.egg-info/SOURCES.txt @@ -0,0 +1,7 @@ +README.md +setup.py +gutenberg_cleaner.egg-info/PKG-INFO +gutenberg_cleaner.egg-info/SOURCES.txt +gutenberg_cleaner.egg-info/dependency_links.txt +gutenberg_cleaner.egg-info/requires.txt +gutenberg_cleaner.egg-info/top_level.txt \ No newline at end of file diff --git a/gutenberg_cleaner.egg-info/dependency_links.txt b/gutenberg_cleaner.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ 
def simple_cleaner(book: str) -> str:
    """
    Strip the Project Gutenberg boilerplate (header and footer) from a book.

    No deeper cleaning is attempted: titles, footnotes and similar material
    inside the text are left untouched.
    :rtype: str
    :param book: full text of a Gutenberg book.
    :return: whatever strip_headers returns for this book, i.e. the text
             without the Project Gutenberg header and footer lines.
    """
    without_boilerplate = strip_headers(book)
    return without_boilerplate
# Pre-compiled patterns used by the paragraph classifiers in this module.
# They are written as raw strings so that regex escapes such as \w, \. and \{
# reach the regex engine untouched instead of being treated as (invalid)
# Python string escapes, which emit a DeprecationWarning/SyntaxWarning.
# All patterns are compiled without flags, so they are case-sensitive.

# An e-mail address: word/dot/dash characters, "@", a domain and a dot-suffix.
email_regex = re.compile(r"[\w.-]+@[\w.-]+\.\w+")

# Text that starts with a {...} or [...] marker, the usual footnote notation.
footnote_notation_regex = re.compile(r"^\{.+\}|^\[.+\]")

# A mention of the print run ("<digits> copies", digits optional) or "copyright".
number_of_copies_regex = re.compile(r"[0-9]* copies|copyright")

# Text starting with one of % _ < > * is treated as "not a good paragraph".
starts_with_regex = re.compile(r'^[%_<>*]')

# A reference to an image file extension or a "picture:" caption.
image_formats_regex = re.compile(r"\.png|\.jpg|\.jpeg|\.gif|picture:")
def is_table(text: str) -> bool:
    """
    Determine whether a paragraph looks like a table or a catalog.

    :rtype: bool
    :param text: Raw paragraph.
    :return: True if the paragraph looks like a table or catalog, otherwise
             False (always a real bool, never None).
    """
    txt = text.strip()
    # NOTE(review): the " " literal in the two count() calls below may have
    # been a longer run of spaces before whitespace was collapsed -- confirm
    # against the repository before relying on the threshold.
    if txt.count(" ") > 3 or txt.count("\t") > 2:
        # Strip every line and re-join with single spaces, then test again, so
        # that whitespace at the edges of individual lines is not counted.
        txt = " ".join([line.strip() for line in txt.split("\n")])
        if txt.count(" ") > 3 or txt.count("\t") > 2:
            return True  # mostly tables.
    if txt.count("*") > 3 or txt.count("=") > 2:
        return True  # mostly catalogs and etc.
    # Explicit negative result: the function is annotated "-> bool", so it
    # must not fall off the end and return None.
    return False
def is_footnote(text: str) -> bool:
    """
    Determine whether a paragraph is a footnote of the book.

    The function is a pure predicate: it does not write anything to stdout.
    :rtype: bool
    :param text: Raw paragraph.
    :return: True if the paragraph is short and mentions "footnote", or if it
             starts with a {...} / [...] footnote marker; otherwise False.
    """
    txt = text.strip()
    # A short paragraph (fewer than 50 non-space characters) that mentions
    # "footnote" in any letter case is treated as a footnote heading or label.
    if "footnote" in txt.lower() and len(txt.replace(" ", "")) < 50:
        return True
    # Otherwise fall back to the notation check: text starting with {...} or
    # [...] might be a footnote.
    return bool(re.search(footnote_notation_regex, txt))
+ """ + return bool(re.search(email_regex, text)) diff --git a/gutenberg_cleaning_options/strip_headers.py b/gutenberg_cleaning_options/strip_headers.py new file mode 100644 index 0000000..3acb1c2 --- /dev/null +++ b/gutenberg_cleaning_options/strip_headers.py @@ -0,0 +1,149 @@ +"""Module to remove the noise from Project Gutenberg texts.""" + +from __future__ import absolute_import, unicode_literals +from builtins import str +import os + +TEXT_START_MARKERS = frozenset(( + "*END*THE SMALL PRINT", + "*** START OF THE PROJECT GUTENBERG", + "*** START OF THIS PROJECT GUTENBERG", + "This etext was prepared by", + "E-text prepared by", + "Produced by", + "Distributed Proofreading Team", + "Proofreading Team at http://www.pgdp.net", + "http://gallica.bnf.fr)", + " http://archive.org/details/", + "http://www.pgdp.net", + "by The Internet Archive)", + "by The Internet Archive/Canadian Libraries", + "by The Internet Archive/American Libraries", + "public domain material from the Internet Archive", + "Internet Archive)", + "Internet Archive/Canadian Libraries", + "Internet Archive/American Libraries", + "material from the Google Print project", + "*END THE SMALL PRINT", + "***START OF THE PROJECT GUTENBERG", + "This etext was produced by", + "*** START OF THE COPYRIGHTED", + "The Project Gutenberg", + "http://gutenberg.spiegel.de/ erreichbar.", + "Project Runeberg publishes", + "Beginning of this Project Gutenberg", + "Project Gutenberg Online Distributed", + "Gutenberg Online Distributed", + "the Project Gutenberg Online Distributed", + "Project Gutenberg TEI", + "This eBook was prepared by", + "http://gutenberg2000.de erreichbar.", + "This Etext was prepared by", + "This Project Gutenberg Etext was prepared by", + "Gutenberg Distributed Proofreaders", + "Project Gutenberg Distributed Proofreaders", + "the Project Gutenberg Online Distributed Proofreading Team", + "**The Project Gutenberg", + "*SMALL PRINT!", + "More information about this book is at the top of this file.", 
+ "tells you about restrictions in how the file may be used.", + "l'authorization à les utilizer pour preparer ce texte.", + "of the etext through OCR.", + "*****These eBooks Were Prepared By Thousands of Volunteers!*****", + "We need your donations more than ever!", + " *** START OF THIS PROJECT GUTENBERG", + "**** SMALL PRINT!", + '["Small Print" V.', + ' (http://www.ibiblio.org/gutenberg/', + 'and the Project Gutenberg Online Distributed Proofreading Team', + 'Mary Meehan, and the Project Gutenberg Online Distributed Proofreading', + ' this Project Gutenberg edition.', +)) + +TEXT_END_MARKERS = frozenset(( + "*** END OF THE PROJECT GUTENBERG", + "*** END OF THIS PROJECT GUTENBERG", + "***END OF THE PROJECT GUTENBERG", + "End of the Project Gutenberg", + "End of The Project Gutenberg", + "Ende dieses Project Gutenberg", + "by Project Gutenberg", + "End of Project Gutenberg", + "End of this Project Gutenberg", + "Ende dieses Projekt Gutenberg", + " ***END OF THE PROJECT GUTENBERG", + "*** END OF THE COPYRIGHTED", + "End of this is COPYRIGHTED", + "Ende dieses Etextes ", + "Ende dieses Project Gutenber", + "Ende diese Project Gutenberg", + "**This is a COPYRIGHTED Project Gutenberg Etext, Details Above**", + "Fin de Project Gutenberg", + "The Project Gutenberg Etext of ", + "Ce document fut presente en lecture", + "Ce document fut présenté en lecture", + "More information about this book is at the top of this file.", + "We need your donations more than ever!", + "END OF PROJECT GUTENBERG", + " End of the Project Gutenberg", + " *** END OF THIS PROJECT GUTENBERG", +)) + +LEGALESE_START_MARKERS = frozenset(("<= 100: + # Check if the footer begins here + if any(line.startswith(token) for token in TEXT_END_MARKERS): + footer_found = True + + # If it's the beginning of the footer, stop output + if footer_found: + break + + if any(line.startswith(token) for token in LEGALESE_START_MARKERS): + ignore_section = True + continue + elif any(line.startswith(token) for token in 
from setuptools import setup

# Read the long description with an explicit encoding so the build does not
# depend on the platform's default locale.
with open('README.md', encoding='utf-8') as f:
    long_description = f.read()

setup(
    name="gutenberg_cleaner",
    install_requires=['nltk'],
    version='0.0.1',
    description="cleans gutenberg dataset books",
    author_email='mohsenikiasari@ce.sharif.edu',
    # The module name must use a plain ASCII underscore; the previous value
    # contained U+0640 (Arabic tatweel), which matched no file and left the
    # distribution without the gutenberg_cleaner module.
    py_modules=["gutenberg_cleaner"],
    # gutenberg_cleaner.py imports from this package, so it has to ship too.
    packages=["gutenberg_cleaning_options"],
    license='MIT',
    long_description=long_description,
    # README.md is Markdown; declare it so the index renders it correctly.
    long_description_content_type="text/markdown",
    # Classifier strings must match the official trove classifiers exactly.
    classifiers=[
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Operating System :: OS Independent",
    ],
)