From 4a6e7a06daf88b4902927dc3591d67dccbd9e2b9 Mon Sep 17 00:00:00 2001 From: Peyman Date: Sun, 26 May 2019 00:48:02 +0430 Subject: [PATCH] Initial commit --- .idea/.name | 1 + .idea/Gutenberg_cleaner.iml | 14 ++ .idea/dictionaries/peyman.xml | 7 + .idea/libraries/R_User_Library.xml | 6 + .idea/misc.xml | 7 + .idea/modules.xml | 9 + LICENSE.md | 21 ++ Pipfile | 13 + Pipfile.lock | 97 ++++++++ README.md | 43 ++++ dist/gutenberg_cleaner-0.0.1-py3-none-any.whl | Bin 0 -> 2662 bytes dist/gutenberg_cleaner-0.0.1.tar.gz | Bin 0 -> 1698 bytes gitignore.gitignore | 232 ++++++++++++++++++ gutenberg_cleaner.egg-info/PKG-INFO | 57 +++++ gutenberg_cleaner.egg-info/SOURCES.txt | 7 + .../dependency_links.txt | 1 + gutenberg_cleaner.egg-info/requires.txt | 1 + gutenberg_cleaner.egg-info/top_level.txt | 1 + gutenberg_cleaner.py | 42 ++++ gutenberg_cleaning_options/__init__.py | 0 .../cleaning_options.py | 105 ++++++++ gutenberg_cleaning_options/strip_headers.py | 149 +++++++++++ setup.py | 21 ++ 23 files changed, 834 insertions(+) create mode 100644 .idea/.name create mode 100644 .idea/Gutenberg_cleaner.iml create mode 100644 .idea/dictionaries/peyman.xml create mode 100644 .idea/libraries/R_User_Library.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 LICENSE.md create mode 100644 Pipfile create mode 100644 Pipfile.lock create mode 100644 README.md create mode 100644 dist/gutenberg_cleaner-0.0.1-py3-none-any.whl create mode 100644 dist/gutenberg_cleaner-0.0.1.tar.gz create mode 100644 gitignore.gitignore create mode 100644 gutenberg_cleaner.egg-info/PKG-INFO create mode 100644 gutenberg_cleaner.egg-info/SOURCES.txt create mode 100644 gutenberg_cleaner.egg-info/dependency_links.txt create mode 100644 gutenberg_cleaner.egg-info/requires.txt create mode 100644 gutenberg_cleaner.egg-info/top_level.txt create mode 100644 gutenberg_cleaner.py create mode 100644 gutenberg_cleaning_options/__init__.py create mode 100644 gutenberg_cleaning_options/cleaning_options.py create mode 100644 gutenberg_cleaning_options/strip_headers.py create mode 100644 setup.py diff --git a/.idea/.name b/.idea/.name new file mode 100644 index 0000000..9761cd8 --- /dev/null +++ b/.idea/.name @@ -0,0 +1 @@ +Gutenberg_cleaner \ No newline at end of file diff --git a/.idea/Gutenberg_cleaner.iml b/.idea/Gutenberg_cleaner.iml new file mode 100644 index 0000000..5138a24 --- /dev/null +++ b/.idea/Gutenberg_cleaner.iml @@ -0,0 +1,14 @@ + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/dictionaries/peyman.xml b/.idea/dictionaries/peyman.xml new file mode 100644 index 0000000..5c50b93 --- /dev/null +++ b/.idea/dictionaries/peyman.xml @@ -0,0 +1,7 @@ + + + + dataset + + + \ No newline at end of file diff --git a/.idea/libraries/R_User_Library.xml b/.idea/libraries/R_User_Library.xml new file mode 100644 index 0000000..71f5ff7 --- /dev/null +++ b/.idea/libraries/R_User_Library.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..3999087 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..535c83f --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..058a5a4 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 Peyman Mohseni Kiasari + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/Pipfile b/Pipfile new file mode 100644 index 0000000..08bebb3 --- /dev/null +++ b/Pipfile @@ -0,0 +1,13 @@ +[[source]] +name = "pypi" +url = "https://pypi.org/simple" +verify_ssl = true + +[dev-packages] +pytest = ">=3.6" + +[packages] +gutenberg-cleaner = {editable = true,path = "."} + +[requires] +python_version = "3.6" diff --git a/Pipfile.lock b/Pipfile.lock new file mode 100644 index 0000000..872fbe3 --- /dev/null +++ b/Pipfile.lock @@ -0,0 +1,97 @@ +{ + "_meta": { + "hash": { + "sha256": "a94cfd56101fcd45bbc7bc59fdc39f4fbc900808a4e792ef202fa7839f0b6413" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "3.6" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "gutenberg-cleaner": { + "editable": true, + "path": "." + }, + "nltk": { + "hashes": [ + "sha256:3a64b1cb685bbf344adec416871fee07996671c876ff313b3e504158fa1500e1" + ], + "version": "==3.4.1" + }, + "six": { + "hashes": [ + "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", + "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" + ], + "version": "==1.12.0" + } + }, + "develop": { + "atomicwrites": { + "hashes": [ + "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4", + "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6" + ], + "version": "==1.3.0" + }, + "attrs": { + "hashes": [ + "sha256:69c0dbf2ed392de1cb5ec704444b08a5ef81680a61cb899dc08127123af36a79", + "sha256:f0b870f674851ecbfbbbd364d6b5cbdff9dcedbc7f3f5e18a6891057f21fe399" + ], + "version": "==19.1.0" + }, + "more-itertools": { + "hashes": [ + "sha256:2112d2ca570bb7c3e53ea1a35cd5df42bb0fd10c45f0fb97178679c3c03d64c7", + "sha256:c3e4748ba1aad8dba30a4886b0b1a2004f9a863837b8654e7059eebf727afa5a" + ], + "markers": "python_version > '2.7'", + "version": "==7.0.0" + }, + "pluggy": { + "hashes": [ + "sha256:25a1bc1d148c9a640211872b4ff859878d422bccb59c9965e04eed468a0aa180", + "sha256:964cedd2b27c492fbf0b7f58b3284a09cf7f99b0f715941fb24a439b3af1bd1a" + ], + "version": "==0.11.0" + }, + "py": { + "hashes": [ + "sha256:64f65755aee5b381cea27766a3a147c3f15b9b6b9ac88676de66ba2ae36793fa", + "sha256:dc639b046a6e2cff5bbe40194ad65936d6ba360b52b3c3fe1d08a82dd50b5e53" + ], + "version": "==1.8.0" + }, + "pytest": { + "hashes": [ + "sha256:1a8aa4fa958f8f451ac5441f3ac130d9fc86ea38780dd2715e6d5c5882700b24", + "sha256:b8bf138592384bd4e87338cb0f256bf5f615398a649d4bd83915f0e4047a5ca6" + ], + "index": "pypi", + "version": "==4.5.0" + }, + "six": { + "hashes": [ + "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", + "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" + ], + "version": "==1.12.0" + }, + "wcwidth": { + "hashes": [ + "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e", + "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c" + ], + "version": "==0.1.7" + } + } +} diff --git a/README.md b/README.md new file mode 100644 index 0000000..3f0d21f --- /dev/null +++ b/README.md @@ -0,0 +1,43 @@ +![](https://i.ibb.co/sCJXhmz/header-sp.png) +![](https://img.shields.io/apm/l/vim-mode.svg) + + +# gutenberg-cleaner + +a python package for cleaning Gutenberg books and dataset. + +### Prerequisites +nltk package + +### Installing +``` +[sudo] pip install gutenberg-cleaner +``` + +## How to use it? + +it has two methods called "simple_cleaner" and "super_cleaner". +### simple_claner: +Just removes lines that are part of the Project Gutenberg header or footer. +Doesnt go deeply in the text to remove other things like titles or footnotes or etc... +``` +simple_cleaner(book: str) -> str +``` +### super_cleaner: +Super clean the book (titles, footnotes, images, book information, etc.). may delete some good lines too. +``` +super_cleaner(book: str, min_token: int = 5, max_token: int = 600) -> str +``` +min_token: The minimum tokens of a paragraph that is not "dialog" or "quote", -1 means don't tokenize the txt (so it will be faster, but less efficient cleaning). +max_token: The maximum tokens of a paragraph. + +it will mark deleted paragraphs with: [deleted] + + +## Author + +* **Peyman Mohseni kiasari** + +## License + +This project is licensed under the MIT License - see the [LICENSE.md](LICENSE.md) file for details diff --git a/dist/gutenberg_cleaner-0.0.1-py3-none-any.whl b/dist/gutenberg_cleaner-0.0.1-py3-none-any.whl new file mode 100644 index 0000000000000000000000000000000000000000..bb9ca613acf8fecee490b658e76174825a94217e GIT binary patch literal 2662 zcmb7`c{J2}AIE>gC`&|kBFf${%9>o;L}u(`%_XiSgqg8UizPFbM3x6*i9*PlEnCJ# zjH0n+-)k%*xiqM8v)o7bp69tI=RD_ne!ufM-}C$Z@%p^a_q^Zd^D#GK1oHp@zzmc= zZZh@RvwC@#0RU9N0024o>k$;_j&gDL_i%FcasLJ7?k}&Xpr~*`!ObfmP~Ho50}VBX z!}Lup^%Z>G9Faz7iX!K)JGSxab;#k+>0q=}hKmaHQW4~8iU7s>zJ(7TZ5=B$5ZrtU zxo|tAQaoMg;@G&jT^BO^Wy0Ek{w+_@4?AOM?cAENr-EX1$0|IY-vcKo;+iIsr~sxO z$AC00PtNdiJQd9M6S=6s7}oLfNnzb1|5idspi^ z+0(C)i5rK`*46Y34bYkt&oT_TJ2nNJtgcloTNkc<9a~UW?YwTc*oDp*sEBN+%K{*ZJb-D2S6W<-!K2%@T@Y&C#SD@HcBC4lxu74bhSXS zI#X6sA~Y=0dFQb>4mOiAb~0$_1YFx9mnLeF+f%sty5H2nd1q5ytS~3oJ~7!OzSWnz zL~s7?9DQGJR7m?p!s)0d@m@ZeIe)lHLmQM%YUK|<0&BE=oG+!@EE~^t^vyEDuE&q< zx_U@_v{uLl;9_SWd zkK|_;HfW8Vgjkq0Vpt{hK{v6dpe+mX&Zi}bq(t_sl%cgk0!p~7(sd%SGO{-r-bNv8 zw-*93;VmZP$Q7JodLaV`udRL2r}*>5qos1*5nF7L@@~~uZ_fH#EK9qQgznph=(JF- z8Cb9BbnRobGCMzSJ@RFlILm>hw6D< zNy_}TQ5}S>CYcarvu!+3_-?&S_*xve@n0%THCFDA6(4Y`^{{~HtbVY(;txamn8?{r z_C&^4dY?~J-?W#CLwA^kgUo+0vEZU$+a*Q-usQ|+az`?;iN2NY6rxUJFiJti>7t%efbHQ;1s};C`>ugR_U1ORQWc zD)DMi?D(l_lu~QIMf}WF4R7A$Yk~{Qw(VoO*SsCrEBu4{FX?oDWuIGmH%52*1a%7g z5_=}m)FD{F%8l>C9T=DIMiVhQ;8`^+tWjom{`D-lfRdJHhx4AdzmG<(cbUYCq-{1O z-)KrrdonaJ6)v>nHJkfRfOW-E7ohkeK z*Qb${dm@2Fj7qB^;w1}jVm>?!k6g}^26KmVy}IJbqZ_j2oSZTW=Dn)Je(qwuH?2eU zm5j7T{$uy?3Aa8EYKBkYttrdMd!yQ7Adl|;@nU8 zm?%(iYcbg&auRQ_I>~;H>~T(u6zSS{Omm6>ttk1iN9A(H_y%dpJmKjgQx^h}IiVh{ zX6llvbHO-w`%+7-Wi*z#X<=z`QOv_sQY4awnez#sIuY+q+RhP)H<}@Q!V+pT46AN- zRN2QfcjolTw1i2KcYFyk>FX)kAZ%aE^4J>~NfD;qe$vO%>~($mvu9T^92_&46{2!| zLy}2qpaoo4@@Rfj(-GZ0eD^q| zcRUr-Jk!M8#1fq++sRWjbbi{-X2T^TyGkTy$SZg{!B`w6B$YqKR}NPps3~umQhPd- zt)J}W0k0xrrm$3xO8&9R!V&EJ9vy5`BytSDq#f%OtF&|C#{hCnkfa|oj}$;# zLw$W?wthpC-a&RFdAUAve=SmRSn0jLS+9{i0@4kwK^h{uAquh3<0TdYiUUfjqe_Dd ztUdkE!7q?eb{Tm@Z?|y`QbtzJWRVqa+<=rRRvztr+ix^tLM~gNFaSRaO#7&t$9o`* z`=Fpl5*UcaIQh5-yZa~v-U>8=%ENn&5b}uDiKOh5tc-N~PNqX0dgbidj)MT8;h;~?nxae^5Tt-sDRDrjT+$rVxt zQ*0uoo3^k`0*5B&Js>JyRM4rxoxYSIq;(21&5GLXBmK7a^o~KW@Hx4Ch(YxN3)*%v zJG%abVj#Xo2;{NS>z&u0fCsuCtvGd(B1+35zMU0o5|*e7NltddtU`!EGQ=*{HF~n! z_3cUDQ?D^Y%X{tZIm|YkG>|KGMEc@{0f%;trOw5lLPZIu#zpf<%n*sMc=pelaNVY2 z*=P69uaq}G5!M(oa5npjyrb=fwG7Q_?z7{>5gO84ypr@P`w=c8!i%+jajE*-=+qiL zT6OS7P{PzMgSioifrs&b?aNDr5*|TgNqNt4_o%Xa1i|XQ564WU~?m;11SK&bnw1B LFh7C$``5n#kA`gF literal 0 HcmV?d00001 diff --git a/dist/gutenberg_cleaner-0.0.1.tar.gz b/dist/gutenberg_cleaner-0.0.1.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..450881c0d8dc681c77ad9f8175e227a0bee03d4b GIT binary patch literal 1698 zcmV;T23`3diwFoep6Ofy|72-%bT4OhbY*U0WpZa^L+Er`>8IUk)#g8QDA=qX4bK9w3p5#sGzgjRP7!jJIw+zexW1oq+cM|KFTEeRg(Y1@4ac z-)=V_;(xu~%K2Yw+O?GbopcClhW}q3e!9LX50#3fQ>}7~-`-k%QI*Fp{~QLNt3yUz zln5DFQRp3zd$@sT$suRHD=jXnGzzMI^^OOXK)B43cZg2N6PSuuWg=dLP>9BAC_;#6 ze?&b70}+Fs<)H`9^967##7F`SU2v(Qk|_%}eDVY?V-~Xy371?kNkU(Z@}08z(-1l8 z`?wi-|NcF>mWeBFAmS15Y-R;fDIzY07vdu*0f}V5)h~o_1w$%9eH0KdB-)kG#~qjp zMahH6XSvoCHK9eBL@b_;S(=_HVmjuKmnd)$vq0Q22}mXWRYR(P#tiQfD-Z)5V0caO z7wfCJ2g$er_*w%Y6pJnLOfVTL@C3MwMgACXsMjj?Nu~G5jsbyVF;3uQ`8oa6Ncwv7S9Zu%`ZHsvCM8`6 z1LPQfY6Kpl5Cl|l5gutO4lD@h7_a3s#XyRH;Vp%mKbQ~`D(4TM5qSgw4|__CSm@y1 z3cAq3G5YEL7mSJV*l2Lo($HO*urpm`1~lXjUz-p5l>?cNUYA)<-E@>It3gbPXNYRO}8cKSBQhYCoRy~FkE)8mu# ztJKfjlxN=$V8H#%Ft|)n?n_cIuvh<&_BKJYKQ4TI|8LuMJCFa`%|^4JXIsYq?*j(0 z2tYF%jjAuvEJp&JkRBlCq>EtjEexagyb8u^AC7(@MI*#D(*i+Nrp zjnk_C$^LJZ|Kpc5|1zYRo)3DZ91Ec?s>i^}d=atj*-!68Q|2Ht{U(o+%y=DCW zUf_%=nkLJY-?Jp!0o$_4Ii~B*iaPleNec8EIB8WGNO~#6v?w$PVK-p&5S&`-Lk2XFI)bys@buG^aah2m)~s2z`*+a1nUcCy6K9p)Y}M3x#AHpIJxf!MHh)%alsAKx z=JwDWnx1*-Q`qC=-*5bdMvi`V-8b4ZJ2U=hM zu@~aMPTg*s_-`LzvOSakZK?lP7w_I2pImKR{#~p8jYfO1{)0)O(f@tGw5?0>jz8HT z^(UK&2iek#kPW*IWS7-RqyG>8|3`n|?cj{H{=d1P|8}QcZyNpI2VA4#^isvSA&Ppv z+l2crW?TA)OEJCcLXDUp;eV(9TNinES^u>=cFp+zZ>Rp-Gxc9n|26erQ~x#fUsL}z s_1_<<{%gR10RsjM7%*VKfB^#r3>YwAz<>b*27XNV7bp_XAplSS00Cr6&j0`b literal 0 HcmV?d00001 diff --git a/gitignore.gitignore b/gitignore.gitignore new file mode 100644 index 0000000..b9af354 --- /dev/null +++ b/gitignore.gitignore @@ -0,0 +1,232 @@ + +# Created by https://www.gitignore.io/api/linux,python,pycharm+all +# Edit at https://www.gitignore.io/?templates=linux,python,pycharm+all + +### Linux ### +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +### PyCharm+all ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. +# .idea/modules.xml +# .idea/*.iml +# .idea/modules + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests + +# Android studio 3.1+ serialized cache file +.idea/caches/build_file_checksums.ser + +# JetBrains templates +**___jb_tmp___ + +### PyCharm+all Patch ### +# Ignores the whole .idea folder and all .iml files +# See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360 + +.idea/ + +# Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023 + +*.iml +modules.xml +.idea/misc.xml +*.ipr + +# Sonarlint plugin +.idea/sonarlint + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don’t work, or not +# install all needed dependencies. +#Pipfile.lock + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# End of https://www.gitignore.io/api/linux,python,pycharm+all \ No newline at end of file diff --git a/gutenberg_cleaner.egg-info/PKG-INFO b/gutenberg_cleaner.egg-info/PKG-INFO new file mode 100644 index 0000000..1096e29 --- /dev/null +++ b/gutenberg_cleaner.egg-info/PKG-INFO @@ -0,0 +1,57 @@ +Metadata-Version: 1.1 +Name: gutenberg-cleaner +Version: 0.0.1 +Summary: cleans gutenberg dataset books +Home-page: UNKNOWN +Author: UNKNOWN +Author-email: mohsenikiasari@ce.sharif.edu +License: MIT +Description: ![](https://i.ibb.co/sCJXhmz/header-sp.png) + ![](https://img.shields.io/apm/l/vim-mode.svg) + + + # gutenberg-cleaner + + a python package for cleaning Gutenberg books and dataset. + + ### Prerequisites + nltk package + + ### Installing + ``` + [sudo] pip install gutenberg-cleaner + ``` + + ## How to use it? + + it has two methods called "simple_cleaner" and "super_cleaner". + ### simple_claner: + Just removes lines that are part of the Project Gutenberg header or footer. + Doesnt go deeply in the text to remove other things like titles or footnotes or etc... + ``` + simple_cleaner(book: str) -> str + ``` + ### super_cleaner: + Super clean the book (titles, footnotes, images, book information, etc.). may delete some good lines too. + ``` + super_cleaner(book: str, min_token: int = 5, max_token: int = 600) -> str + ``` + min_token: The minimum tokens of a paragraph that is not "dialog" or "quote", -1 means don't tokenize the txt (so it will be faster, but less efficient cleaning). + max_token: The maximum tokens of a paragraph. + + it will mark deleted paragraphs with: [deleted] + + + ## Author + + * **Peyman Mohseni kiasari** + + ## License + + This project is licensed under the MIT License - see the [LICENSE.md](LICENSE.md) file for details + +Platform: UNKNOWN +Classifier: Programming language :: Python :: 3 +Classifier: Programming language :: Python :: 3.6 +Classifier: Programming language :: Python :: 3.7 +Classifier: Operation System :: OS Independent diff --git a/gutenberg_cleaner.egg-info/SOURCES.txt b/gutenberg_cleaner.egg-info/SOURCES.txt new file mode 100644 index 0000000..564eaf9 --- /dev/null +++ b/gutenberg_cleaner.egg-info/SOURCES.txt @@ -0,0 +1,7 @@ +README.md +setup.py +gutenberg_cleaner.egg-info/PKG-INFO +gutenberg_cleaner.egg-info/SOURCES.txt +gutenberg_cleaner.egg-info/dependency_links.txt +gutenberg_cleaner.egg-info/requires.txt +gutenberg_cleaner.egg-info/top_level.txt \ No newline at end of file diff --git a/gutenberg_cleaner.egg-info/dependency_links.txt b/gutenberg_cleaner.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/gutenberg_cleaner.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/gutenberg_cleaner.egg-info/requires.txt b/gutenberg_cleaner.egg-info/requires.txt new file mode 100644 index 0000000..8469296 --- /dev/null +++ b/gutenberg_cleaner.egg-info/requires.txt @@ -0,0 +1 @@ +nltk diff --git a/gutenberg_cleaner.egg-info/top_level.txt b/gutenberg_cleaner.egg-info/top_level.txt new file mode 100644 index 0000000..fae547e --- /dev/null +++ b/gutenberg_cleaner.egg-info/top_level.txt @@ -0,0 +1 @@ +gutenbergـcleaner diff --git a/gutenberg_cleaner.py b/gutenberg_cleaner.py new file mode 100644 index 0000000..62e8457 --- /dev/null +++ b/gutenberg_cleaner.py @@ -0,0 +1,42 @@ +from gutenberg_cleaning_options.cleaning_options import is_title_or_etc, is_books_copy, is_email_init, is_footnote, \ + is_image, is_table +from gutenberg_cleaning_options.strip_headers import strip_headers + + +def simple_cleaner(book: str) -> str: + """ + Just removes lines that are part of the Project Gutenberg header or footer. + Doesnt go deeply in the text to remove other things like titles or footnotes or etc... + :rtype: str + :param book: str of a gutenberg's book + :return: str of the book without the lines that are part of the Project Gutenberg header or footer. + """ + return strip_headers(book) + + +def super_cleaner(book: str, min_token: int = 5, max_token: int = 600) -> str: + """ + Super clean the book (titles, footnotes, images, book information, etc.). may delete some good lines too. + ^_^ Do you have a comment to make it better? Email to mohsenikiasari@ce.sharif.edu ^_^. + IMPORTANT: if you don't want the text to be tokenize, just put min_token = -1. + :rtype: str + :param book: str of a gutenberg's book. + :param min_token: The minimum tokens of a paragraph that is not "dialog" or "quote", + -1 means don't tokenize the txt (so it will be faster). + :param max_token: The maximum tokens of a paragraph. + :return: str of the book with paragraphs that have been deleted are shown with "[deleted]" in it. + you can split the book to paragraphs by "\n\n". + """ + headless_book = strip_headers(book) + paragraphs = headless_book.split("\n\n") # split the book to paragraphs. + + paragraphs_after_cleaning = [] + for par in paragraphs: + if is_image(par) or is_footnote(par) or is_email_init(par) or \ + is_books_copy(par) or is_table(par) or is_title_or_etc(par, min_token, max_token): + paragraphs_after_cleaning.append("[deleted]") # if the paragraph is not good , replace it with [deleted] + else: + paragraphs_after_cleaning.append(par) + + cleaned_book = "\n\n".join(paragraphs_after_cleaning) # joining the list of paragraphs into one string + return cleaned_book diff --git a/gutenberg_cleaning_options/__init__.py b/gutenberg_cleaning_options/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/gutenberg_cleaning_options/cleaning_options.py b/gutenberg_cleaning_options/cleaning_options.py new file mode 100644 index 0000000..b0850ac --- /dev/null +++ b/gutenberg_cleaning_options/cleaning_options.py @@ -0,0 +1,105 @@ +import string +import re +from nltk import word_tokenize + +email_regex = re.compile("[\w.-]+@[\w.-]+\.\w+") # Regex to find Emails. +footnote_notation_regex = re.compile("^\{.+\}|^\[.+\]") # Regex to find start of footnotes. +number_of_copies_regex = re.compile("[0-9]* copies|copyright") # Regex to find copy mentioning. +starts_with_regex = re.compile('^[%_<>*]') # If the text is started with these, it is not a good one. +image_formats_regex = re.compile("\.png|\.jpg|\.jpeg|\.gif|picture:") # Regex to find images. + + +def is_title_or_etc(text: str, min_token: int = 5, max_token: int = 600) -> bool: + """ + determining if a paragraph is title or information of the book. + IMPORTANT: if you don't want the text to be tokenize, just put min_token = -1. + :rtype: bool + :param text: Raw paragraph. + :param min_token: The minimum tokens of a paragraph that is not "dialog" or "quote", + -1 means don't tokenize the txt (so it will be faster). + :param max_token: The maximum tokens of a paragraph. + :return: Boolean, True if it is title or information of the book or a bad paragraph. + """ + txt = text.strip() + num_token = len(word_tokenize(txt)) if min_token >= 0 else -1 + if num_token > max_token: + return True + if len(txt) == 0 or num_token < min_token and not (txt.count('"') == 2 or txt.count('\'') == 2 or txt[-1] == ":"): + return True # Length is short but not "dialog" or "quote" + if sum(1 for c in txt if c.isupper() or c.isdigit() or c in string.punctuation.replace("\"", "")) \ + / len(txt.replace(" ", "")) > 0.6: + return True # More than 60% of chars are UPPER or digits or punctuations so it might be title or etc. + if txt.lower().startswith("appendix") or bool(re.search(starts_with_regex, txt)): + return True + if txt.count(":") > 3 and 2 * txt.count(":") - txt.count("\"") > 3: + return True # mostly information about the book. + if txt.count(" ") > 3 or txt.count("\t") > 2 or txt.count("*") > 3 or txt.count("=") > 2: + return True # mostly tables and catalogs and etc. + if "@" in txt and len(txt) < 100: + return True + return False + + +def is_table(text: str) -> bool: + """ + determining if a paragraph is a table or catalog. + :rtype: bool + :param text: Raw paragraph. + :return: Boolean, True if it is a table or catalog. + """ + txt = text.strip() + if txt.count(" ") > 3 or txt.count("\t") > 2: + txt = " ".join([line.strip() for line in txt.split("\n")]) + if txt.count(" ") > 3 or txt.count("\t") > 2: + return True # mostly tables. + if txt.count("*") > 3 or txt.count("=") > 2: + return True # mostly catalogs and etc. + + +def is_image(text: str) -> bool: + """ + determining if a paragraph is for mentioning an image. + :param text: Raw paragraph. + :return: Boolean, True if it is for mentioning an image. + """ + return bool(re.search(image_formats_regex, text.lower())) + + +def is_footnote(text: str) -> bool: + """ + determining if a paragraph is the footnote of the book. + :rtype: bool + :param text: Raw paragraph. + :return: Boolean, True if it is the footnote of the book. + """ + txt = text.strip() + print(txt) + if "footnote" in txt.lower() and len(txt.replace(" ", "")) < 50: + return True + return bool(re.search(footnote_notation_regex, txt)) # if a line starts with {...} it might be a footnote. + + +print(is_footnote(""" [0] The country-seat of Bishop Shipley, the good bishop, + as Dr. Franklin used to style him.--B.""")) + + +def is_books_copy(text: str) -> bool: + """ + determining if a paragraph indicates the number of copies of this book. + :rtype: bool + :param text: text: Raw paragraph. + :return: Boolean, True if it is indicating the copy of book or copyrights. + """ + if bool(re.search(number_of_copies_regex, text)) and len(text.replace(" ", "")) < 500: + return True + return False + + +def is_email_init(text: str) -> bool: + """ + determining if a paragraph includes an Email. + :rtype: bool + :param text: Raw paragraph. + :return: Boolean, True if it includes an Email. + """ + return bool(re.search(email_regex, text)) diff --git a/gutenberg_cleaning_options/strip_headers.py b/gutenberg_cleaning_options/strip_headers.py new file mode 100644 index 0000000..3acb1c2 --- /dev/null +++ b/gutenberg_cleaning_options/strip_headers.py @@ -0,0 +1,149 @@ +"""Module to remove the noise from Project Gutenberg texts.""" + +from __future__ import absolute_import, unicode_literals +from builtins import str +import os + +TEXT_START_MARKERS = frozenset(( + "*END*THE SMALL PRINT", + "*** START OF THE PROJECT GUTENBERG", + "*** START OF THIS PROJECT GUTENBERG", + "This etext was prepared by", + "E-text prepared by", + "Produced by", + "Distributed Proofreading Team", + "Proofreading Team at http://www.pgdp.net", + "http://gallica.bnf.fr)", + " http://archive.org/details/", + "http://www.pgdp.net", + "by The Internet Archive)", + "by The Internet Archive/Canadian Libraries", + "by The Internet Archive/American Libraries", + "public domain material from the Internet Archive", + "Internet Archive)", + "Internet Archive/Canadian Libraries", + "Internet Archive/American Libraries", + "material from the Google Print project", + "*END THE SMALL PRINT", + "***START OF THE PROJECT GUTENBERG", + "This etext was produced by", + "*** START OF THE COPYRIGHTED", + "The Project Gutenberg", + "http://gutenberg.spiegel.de/ erreichbar.", + "Project Runeberg publishes", + "Beginning of this Project Gutenberg", + "Project Gutenberg Online Distributed", + "Gutenberg Online Distributed", + "the Project Gutenberg Online Distributed", + "Project Gutenberg TEI", + "This eBook was prepared by", + "http://gutenberg2000.de erreichbar.", + "This Etext was prepared by", + "This Project Gutenberg Etext was prepared by", + "Gutenberg Distributed Proofreaders", + "Project Gutenberg Distributed Proofreaders", + "the Project Gutenberg Online Distributed Proofreading Team", + "**The Project Gutenberg", + "*SMALL PRINT!", + "More information about this book is at the top of this file.", + "tells you about restrictions in how the file may be used.", + "l'authorization à les utilizer pour preparer ce texte.", + "of the etext through OCR.", + "*****These eBooks Were Prepared By Thousands of Volunteers!*****", + "We need your donations more than ever!", + " *** START OF THIS PROJECT GUTENBERG", + "**** SMALL PRINT!", + '["Small Print" V.', + ' (http://www.ibiblio.org/gutenberg/', + 'and the Project Gutenberg Online Distributed Proofreading Team', + 'Mary Meehan, and the Project Gutenberg Online Distributed Proofreading', + ' this Project Gutenberg edition.', +)) + +TEXT_END_MARKERS = frozenset(( + "*** END OF THE PROJECT GUTENBERG", + "*** END OF THIS PROJECT GUTENBERG", + "***END OF THE PROJECT GUTENBERG", + "End of the Project Gutenberg", + "End of The Project Gutenberg", + "Ende dieses Project Gutenberg", + "by Project Gutenberg", + "End of Project Gutenberg", + "End of this Project Gutenberg", + "Ende dieses Projekt Gutenberg", + " ***END OF THE PROJECT GUTENBERG", + "*** END OF THE COPYRIGHTED", + "End of this is COPYRIGHTED", + "Ende dieses Etextes ", + "Ende dieses Project Gutenber", + "Ende diese Project Gutenberg", + "**This is a COPYRIGHTED Project Gutenberg Etext, Details Above**", + "Fin de Project Gutenberg", + "The Project Gutenberg Etext of ", + "Ce document fut presente en lecture", + "Ce document fut présenté en lecture", + "More information about this book is at the top of this file.", + "We need your donations more than ever!", + "END OF PROJECT GUTENBERG", + " End of the Project Gutenberg", + " *** END OF THIS PROJECT GUTENBERG", +)) + +LEGALESE_START_MARKERS = frozenset(("<= 100: + # Check if the footer begins here + if any(line.startswith(token) for token in TEXT_END_MARKERS): + footer_found = True + + # If it's the beginning of the footer, stop output + if footer_found: + break + + if any(line.startswith(token) for token in LEGALESE_START_MARKERS): + ignore_section = True + continue + elif any(line.startswith(token) for token in LEGALESE_END_MARKERS): + ignore_section = False + continue + + if not ignore_section: + out.append(line.rstrip(sep)) + i += 1 + + return sep.join(out) diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..1e8d2f3 --- /dev/null +++ b/setup.py @@ -0,0 +1,21 @@ +from setuptools import setup + +with open('README.md') as f: + long_description = f.read() + +setup( + name="gutenberg_cleaner", + install_requires=['nltk'], + version='0.0.1', + description="cleans gutenberg dataset books", + author_email='mohsenikiasari@ce.sharif.edu', + py_modules=["gutenbergـcleaner"], + license='MIT', + long_description=long_description, + classifiers=[ + "Programming language :: Python :: 3", + "Programming language :: Python :: 3.6", + "Programming language :: Python :: 3.7", + "Operation System :: OS Independent" + ] +)