From 4a6e7a06daf88b4902927dc3591d67dccbd9e2b9 Mon Sep 17 00:00:00 2001
From: Peyman <you@example.com>
Date: Sun, 26 May 2019 00:48:02 +0430
Subject: [PATCH] Initial commit

---
 .idea/.name                                   |   1 +
 .idea/Gutenberg_cleaner.iml                   |  14 ++
 .idea/dictionaries/peyman.xml                 |   7 +
 .idea/libraries/R_User_Library.xml            |   6 +
 .idea/misc.xml                                |   7 +
 .idea/modules.xml                             |   9 +
 LICENSE.md                                    |  21 ++
 Pipfile                                       |  13 +
 Pipfile.lock                                  |  97 ++++++++
 README.md                                     |  43 ++++
 dist/gutenberg_cleaner-0.0.1-py3-none-any.whl | Bin 0 -> 2662 bytes
 dist/gutenberg_cleaner-0.0.1.tar.gz           | Bin 0 -> 1698 bytes
 gitignore.gitignore                           | 232 ++++++++++++++++++
 gutenberg_cleaner.egg-info/PKG-INFO           |  57 +++++
 gutenberg_cleaner.egg-info/SOURCES.txt        |   7 +
 .../dependency_links.txt                      |   1 +
 gutenberg_cleaner.egg-info/requires.txt       |   1 +
 gutenberg_cleaner.egg-info/top_level.txt      |   1 +
 gutenberg_cleaner.py                          |  42 ++++
 gutenberg_cleaning_options/__init__.py        |   0
 .../cleaning_options.py                       | 105 ++++++++
 gutenberg_cleaning_options/strip_headers.py   | 149 +++++++++++
 setup.py                                      |  21 ++
 23 files changed, 834 insertions(+)
 create mode 100644 .idea/.name
 create mode 100644 .idea/Gutenberg_cleaner.iml
 create mode 100644 .idea/dictionaries/peyman.xml
 create mode 100644 .idea/libraries/R_User_Library.xml
 create mode 100644 .idea/misc.xml
 create mode 100644 .idea/modules.xml
 create mode 100644 LICENSE.md
 create mode 100644 Pipfile
 create mode 100644 Pipfile.lock
 create mode 100644 README.md
 create mode 100644 dist/gutenberg_cleaner-0.0.1-py3-none-any.whl
 create mode 100644 dist/gutenberg_cleaner-0.0.1.tar.gz
 create mode 100644 gitignore.gitignore
 create mode 100644 gutenberg_cleaner.egg-info/PKG-INFO
 create mode 100644 gutenberg_cleaner.egg-info/SOURCES.txt
 create mode 100644 gutenberg_cleaner.egg-info/dependency_links.txt
 create mode 100644 gutenberg_cleaner.egg-info/requires.txt
 create mode 100644 gutenberg_cleaner.egg-info/top_level.txt
 create mode 100644 gutenberg_cleaner.py
 create mode 100644 gutenberg_cleaning_options/__init__.py
 create mode 100644 gutenberg_cleaning_options/cleaning_options.py
 create mode 100644 gutenberg_cleaning_options/strip_headers.py
 create mode 100644 setup.py
diff --git a/.idea/.name b/.idea/.name
new file mode 100644
index 0000000..9761cd8
--- /dev/null
+++ b/.idea/.name
@@ -0,0 +1 @@
+Gutenberg_cleaner
\ No newline at end of file
diff --git a/.idea/Gutenberg_cleaner.iml b/.idea/Gutenberg_cleaner.iml
new file mode 100644
index 0000000..5138a24
--- /dev/null
+++ b/.idea/Gutenberg_cleaner.iml
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+    <orderEntry type="library" name="R User Library" level="project" />
+    <orderEntry type="library" name="R Skeletons" level="application" />
+  </component>
+  <component name="TestRunnerService">
+    <option name="projectConfiguration" value="Twisted Trial" />
+    <option name="PROJECT_TEST_RUNNER" value="Twisted Trial" />
+  </component>
+</module>
\ No newline at end of file
diff --git a/.idea/dictionaries/peyman.xml b/.idea/dictionaries/peyman.xml
new file mode 100644
index 0000000..5c50b93
--- /dev/null
+++ b/.idea/dictionaries/peyman.xml
@@ -0,0 +1,7 @@
+<component name="ProjectDictionaryState">
+  <dictionary name="peyman">
+    <words>
+      <w>dataset</w>
+    </words>
+  </dictionary>
+</component>
\ No newline at end of file
diff --git a/.idea/libraries/R_User_Library.xml b/.idea/libraries/R_User_Library.xml
new file mode 100644
index 0000000..71f5ff7
--- /dev/null
+++ b/.idea/libraries/R_User_Library.xml
@@ -0,0 +1,6 @@
+<component name="libraryTable">
+  <library name="R User Library">
+    <CLASSES />
+    <SOURCES />
+  </library>
+</component>
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..3999087
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="JavaScriptSettings">
+    <option name="languageLevel" value="ES6" />
+  </component>
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6" project-jdk-type="Python SDK" />
+</project>
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..535c83f
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/Gutenberg_API.iml" filepath="$PROJECT_DIR$/.idea/Gutenberg_API.iml" />
+      <module fileurl="file://$PROJECT_DIR$/.idea/Gutenberg_cleaner.iml" filepath="$PROJECT_DIR$/.idea/Gutenberg_cleaner.iml" />
+    </modules>
+  </component>
+</project>
\ No newline at end of file
diff --git a/LICENSE.md b/LICENSE.md
new file mode 100644
index 0000000..058a5a4
--- /dev/null
+++ b/LICENSE.md
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2019 Peyman Mohseni Kiasari
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/Pipfile b/Pipfile
new file mode 100644
index 0000000..08bebb3
--- /dev/null
+++ b/Pipfile
@@ -0,0 +1,13 @@
+[[source]]
+name = "pypi"
+url = "https://pypi.org/simple"
+verify_ssl = true
+
+[dev-packages]
+pytest = ">=3.6"
+
+[packages]
+gutenberg-cleaner = {editable = true,path = "."}
+
+[requires]
+python_version = "3.6"
diff --git a/Pipfile.lock b/Pipfile.lock
new file mode 100644
index 0000000..872fbe3
--- /dev/null
+++ b/Pipfile.lock
@@ -0,0 +1,97 @@
+{
+    "_meta": {
+        "hash": {
+            "sha256": "a94cfd56101fcd45bbc7bc59fdc39f4fbc900808a4e792ef202fa7839f0b6413"
+        },
+        "pipfile-spec": 6,
+        "requires": {
+            "python_version": "3.6"
+        },
+        "sources": [
+            {
+                "name": "pypi",
+                "url": "https://pypi.org/simple",
+                "verify_ssl": true
+            }
+        ]
+    },
+    "default": {
+        "gutenberg-cleaner": {
+            "editable": true,
+            "path": "."
+        },
+        "nltk": {
+            "hashes": [
+                "sha256:3a64b1cb685bbf344adec416871fee07996671c876ff313b3e504158fa1500e1"
+            ],
+            "version": "==3.4.1"
+        },
+        "six": {
+            "hashes": [
+                "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
+                "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
+            ],
+            "version": "==1.12.0"
+        }
+    },
+    "develop": {
+        "atomicwrites": {
+            "hashes": [
+                "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4",
+                "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6"
+            ],
+            "version": "==1.3.0"
+        },
+        "attrs": {
+            "hashes": [
+                "sha256:69c0dbf2ed392de1cb5ec704444b08a5ef81680a61cb899dc08127123af36a79",
+                "sha256:f0b870f674851ecbfbbbd364d6b5cbdff9dcedbc7f3f5e18a6891057f21fe399"
+            ],
+            "version": "==19.1.0"
+        },
+        "more-itertools": {
+            "hashes": [
+                "sha256:2112d2ca570bb7c3e53ea1a35cd5df42bb0fd10c45f0fb97178679c3c03d64c7",
+                "sha256:c3e4748ba1aad8dba30a4886b0b1a2004f9a863837b8654e7059eebf727afa5a"
+            ],
+            "markers": "python_version > '2.7'",
+            "version": "==7.0.0"
+        },
+        "pluggy": {
+            "hashes": [
+                "sha256:25a1bc1d148c9a640211872b4ff859878d422bccb59c9965e04eed468a0aa180",
+                "sha256:964cedd2b27c492fbf0b7f58b3284a09cf7f99b0f715941fb24a439b3af1bd1a"
+            ],
+            "version": "==0.11.0"
+        },
+        "py": {
+            "hashes": [
+                "sha256:64f65755aee5b381cea27766a3a147c3f15b9b6b9ac88676de66ba2ae36793fa",
+                "sha256:dc639b046a6e2cff5bbe40194ad65936d6ba360b52b3c3fe1d08a82dd50b5e53"
+            ],
+            "version": "==1.8.0"
+        },
+        "pytest": {
+            "hashes": [
+                "sha256:1a8aa4fa958f8f451ac5441f3ac130d9fc86ea38780dd2715e6d5c5882700b24",
+                "sha256:b8bf138592384bd4e87338cb0f256bf5f615398a649d4bd83915f0e4047a5ca6"
+            ],
+            "index": "pypi",
+            "version": "==4.5.0"
+        },
+        "six": {
+            "hashes": [
+                "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
+                "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
+            ],
+            "version": "==1.12.0"
+        },
+        "wcwidth": {
+            "hashes": [
+                "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e",
+                "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c"
+            ],
+            "version": "==0.1.7"
+        }
+    }
+}
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..3f0d21f
--- /dev/null
+++ b/README.md
@@ -0,0 +1,43 @@
+![](https://i.ibb.co/sCJXhmz/header-sp.png)
+![](https://img.shields.io/apm/l/vim-mode.svg)
+
+
+# gutenberg-cleaner
+
+A Python package for cleaning Project Gutenberg books and datasets.
+
+### Prerequisites
+nltk package
+
+### Installing
+```
+[sudo] pip install gutenberg-cleaner
+```
+
+## How to use it?
+
+It provides two functions called "simple_cleaner" and "super_cleaner".
+### simple_cleaner:
+Just removes lines that are part of the Project Gutenberg header or footer.
+Doesn't go deeply into the text to remove other things such as titles or footnotes.
+```
+simple_cleaner(book: str) -> str
+```
+### super_cleaner:
+Super cleans the book (titles, footnotes, images, book information, etc.). May delete some good lines too.
+```
+super_cleaner(book: str, min_token: int = 5, max_token: int = 600) -> str
+```
+`min_token`: the minimum number of tokens for a paragraph that is not a "dialog" or "quote"; -1 means the text is not tokenized (faster, but the cleaning is less thorough).
+`max_token`: the maximum number of tokens of a paragraph.
+
+Deleted paragraphs are marked with: [deleted]
+
+
+## Author
+
+* **Peyman Mohseni kiasari**
+
+## License
+
+This project is licensed under the MIT License - see the [LICENSE.md](LICENSE.md) file for details
diff --git a/dist/gutenberg_cleaner-0.0.1-py3-none-any.whl b/dist/gutenberg_cleaner-0.0.1-py3-none-any.whl
new file mode 100644
index 0000000000000000000000000000000000000000..bb9ca613acf8fecee490b658e76174825a94217e
GIT binary patch
literal 2662
zcmb7`c{J2}AIE>gC`&|kBFf${%9>o;L}u(`%_XiSgqg8UizPFbM3x6*i9*PlEnCJ#
zjH0n+-)k%*xiqM8v)o7bp69tI=RD_ne!ufM-}C$Z@%p^a_q^Zd^D#GK1oHp@zzmc=
zZZh@RvwC@#0RU9N0024o>k$;_j&gDL_i%FcasLJ7?k}&Xpr~*`!ObfmP~Ho50}VBX
z!}Lup^%Z>G9Faz7iX!K)JGSxab;#k+>0q=}hKmaHQW4~8iU7s>zJ(7TZ5=B$5ZrtU
zxo|tAQaoMg;@G&jT^BO^Wy0Ek{w+_@4?AOM?cAENr-EX1$0|IY-vcKo;+iIsr~sxO
z$AC00PtNdiJQd9M<sMHQ#m&KnGHK4-_eIH=s1y-Kg>6S=6s7}oLfNnzb1|5idspi^
z+0(C)i5rK`*46Y34bYkt&oT_TJ2nNJtgcloTNkc<9a~UW<beWKTI$|YT&VqA2EvqO
zp;v}&>?YwTc*oDp*sEBN+%K{*ZJb-D2S6W<-!K2%@T@Y&C#SD@HcBC4lxu74bhSXS
zI#X6sA~Y=0dFQb>4mOiAb~0$_1YFx9mnLeF+f%sty5H2nd1q5ytS~3oJ~7!OzSWnz
zL~s7?9DQGJR7m?p!s)0d@m@ZeIe)lHLmQM%YUK|<0&BE=oG+!@EE~^t^vyEDuE&q<
zx_U@_<D`A=+rn(jchXyXr5Bu7ir*H_VkC7&%}X|{6*?Ji5WDD*h=;>v{uLl;9_SWd
zkK|_;HfW8Vgjkq0Vpt{hK{v6dpe+mX&Zi}bq(t_sl%cgk0!p~7(sd%SGO{-r-bNv8
zw-*93;VmZP$Q7JodLaV`udRL2r}*>5qos1*5nF7L@@~~uZ_fH#EK9qQgznph=(JF-
z8Cb<vTC|)EO(KlgbW_u!tuy7pd{l-s+HEP6m8V>9BbnRobGCMzSJ@RFlILm>hw6D<
zNy_}TQ5}S>CYcarvu!+3_-?&S_*xve@n0%THCFDA6(4Y`^{{~HtbVY(;txamn8?{r
z_C&^4dY?~J-?W#CLwA^kgUo+0vEZU$+a*Q-usQ|+az`?;iN2NY6<sUcG;3e<lq%<b
zpOasJ7FiXdF;m6CS*2D$%6jgHle!DS<>rxUJFiJti>7t%efbHQ;1s<EYc-_MZV{=s
z*G%~Z(rhev!+#N*cVwCC=6A`15X|XZGpAsVR`PN&x~tiyi?Yp*^wH8PS@HFueV`yI
zl8Z1~2`AF{;!H<QZ}yYU#Jz3hr!;7(YVBejbFGWy+rLE*WY$>};C`>ugR_U1ORQWc
zD)DMi?D(l_lu~QIMf}WF4R7A$Yk~{Qw(VoO*SsCrEBu4{FX?oDWuIGmH%52*1a%7g
z5_=}m)FD{F%8l>C9T=DIMiVhQ;8`^+tWjom{`D-lfRdJHhx4AdzmG<(cbUYCq-{1O
z-)KrrdonaJ6)v>nHJkfRfOW-<y3$=;S&{h@zCy}t*C8l)w(m^5eEAsp6Js-*-v(24
z6b_{`SaLH5b72ra%<|j%<?7lux!GX^NB_1I?xkZeJw2(jv5}NZ@2bMrWJ>E7ohkeK
z*Qb${dm@2Fj7qB^;w1}jVm>?!k6g}^26KmVy}IJbqZ_j2oSZTW=Dn)Je(qwuH?2eU
zm5j7T{$uy?3Aa8EYKBkYttrdMd!yQ7Adl|;@nU8<v3X{$-3RO$i1$wA6p%PxsLiK>
zm?%(iYcbg&auRQ_I>~;H>~T(u6zSS{Omm6>ttk1iN9A(H_y%dpJmKjgQx^h}IiVh{
zX6llvbHO-w`%+7-Wi*z#X<=z`QOv_sQY4awnez#sIuY+q+RhP)H<}@Q!V+pT46AN-
zRN2QfcjolTw1i2KcYFyk>FX)kAZ%aE^4J>~NfD;qe$vO%>~($mvu9T^92_&46{2!|
zLy}2qpaoo<lry1%_l&;xq0LZu<lCK8_0yrNg6}<#k=UMRD!0DC&UJ}D_&pM@Q{JDU
z$uFoW^!$w^HiA;WNKOw0FFgQk&sz$jrX2%}3)I_^EUnD`c#=>4@@Rfj(-GZ0eD^q|
zcRUr-Jk!M8#1fq++sRWjbbi{-X2T^TyGkTy$SZg{!B`w6B$YqKR}NPps3~umQhPd-
zt)J}W0k0xrrm$3xO8&9R!V&EJ9vy5`BytSDq#f%OtF&|C#{hCnkfa<Ag7>|oj}$;#
zLw$W?wthpC-a&RFdAUAve=SmRSn0jLS+9{i0@4kwK^h{uAquh3<0TdYiUUfjqe_Dd
ztUdkE!7q?eb{Tm@Z?|y`QbtzJWRVqa+<=rRRvztr+ix^tLM~gNFaSRaO#7&t$9o`*
z`=Fpl5*UcaIQh5-yZa~v-U>8=%ENn&5b}uDiKOh5tc-N~PNqX0dgbidj)MT8;h<!W
z)B&jvGecf^IPlgVrwHf_w#+doN8EHCWMb64Eh|Y>;~?nxae^5Tt-sDRDrjT+$rVxt
zQ*0uoo3^k`0*5B&Js>JyRM4rxoxYSIq;(21&5GLXBmK7a^o~KW@Hx4Ch(YxN3)*%v
zJG%abVj#Xo2;{NS>z&u0fCsuCtvGd(B1+35zMU0o5|*e7NltddtU`!EGQ=*{HF~n!
z_3cUDQ?D^Y%X{tZIm|YkG>|KGMEc@{0f%;trOw5lLPZIu#zpf<%n*sMc=pelaNVY2
z*=P69uaq}G5!M(oa5npjyrb=fwG7Q_?z7{>5gO84ypr@P`w=c8!i%+jajE*-=+qiL
zT6OS7P{PzMgSioifrs&b?<EH}`sec^;`{OcY$=Dyhlj;qTmX0xK|0uj{+axr(Q%l2
zI0Juk4GzZAKXd;~#9?UKA>aNDr5*|TgNqNt4_o%Xa1i|XQ564WU~?m;11SK&bnw1B
LFh7C$``5n#kA`gF

literal 0
HcmV?d00001

diff --git a/dist/gutenberg_cleaner-0.0.1.tar.gz b/dist/gutenberg_cleaner-0.0.1.tar.gz
new file mode 100644
index 0000000000000000000000000000000000000000..450881c0d8dc681c77ad9f8175e227a0bee03d4b
GIT binary patch
literal 1698
zcmV;T23`3diwFoep6Ofy|72-%bT4OhbY*U0WpZa<V{Bz%Ze?;UFfK4IF)nmrascfb
zU2_^YkZ1jhEqMTY%)#+Fb2GHfIBw!3{+KvvJLB;XnG=V~-RW}DIU!H|oBcQ4({X?S
z8)yiVq}m4q(n?xszjy5=iiNi<_IiDv(U8TJnpLyxYGH>^L+Er`>8IUk)#g8QDA<i=
zr{1tTwRWdav)i>=qX4bK9w3p5#sGzgjRP7!jJIw+zexW1oq+cM|KFTEeRg(Y1@4ac
z-)=V_;(xu~%K2Yw+O?GbopcClhW}q3e!9LX50#3fQ>}7~-`-k%QI*Fp{~QLNt3yUz
zln5DFQRp3zd$@sT$suRHD=jXnGzzMI^^OOXK)B43cZg2N6PSuuWg=dLP>9BAC_;#6
ze?&b70}+Fs<)H`9^967##7F`SU2v(Qk|_%}eDVY?V-~Xy371?kNkU(Z@}08z(-1l8
z`?wi-|NcF>mWeBFAmS15Y-R;fDIzY07vdu*0f}V5)h~o_1w$%9eH0KdB-)kG#~qjp
zMahH6XSvoCHK9eBL@b_;S(=_HVmjuKmnd)$vq0Q22}mXWRYR(P#tiQfD-Z)5V0caO
z7wfCJ2g$er_*w%Y6pJnLOfVTL@C3MwMgACXsMjj?Nu~G5jsbyVF;3u<q(_brfh!*m
zo@^E3Zu$pPeao`+YuvwHIc0_eQpE>Q`8oa6Ncwv7S9Zu%`ZHsvCM8`6<?Otp*=a|>
z1LPQfY6Kpl5Cl|l5gutO4lD@h7_a3s#XyRH;Vp%mKbQ~`D(4TM5qSgw4|__CSm@y1
z3cAq3G5YEL7<!p|*O^=N7D>mSJV*l2Lo($HO*urpm`1~lX<Pzc2Z}DIzVM3a8x}t#
zc){WkRBV(PiVIy4mQ=P2|I9K5A%DtJppd~wjuLr`T0kZ86yGCJfNv#%4F<f=8B&@n
z$ALxW?x0DepB^UB((<eK4`@7^Jf=HCkT|J^4qQ)WZirTbrwK|rD(WE|9$spz2WMg^
zk$M<$Drw9Q4>jUz-p5l>?cNUYA)<-E@>It3gbPXNYRO}8cKSBQhYCoRy~FkE)8mu#
ztJKfjlxN=$V8H#%Ft|)n?n_cIuvh<&_BKJYKQ4TI|8LuMJCFa`%|^4JXIsYq?*j(0
z2tYF%jjAuvEJp&JkRBl<wGSX77M4rXfUtA`=n@AG0R&%!Ue9H+AM+@+-e^?^R*a6S
zjQ(G5Qr37O4Oq9hBy7baJv(vCJ?+<HCcD?A)UTIrvgtb(OWeC#(xO?)2Img#7WH;=
z=GgO)AqyfVr%-LzdMrSL(k%s<Nt;--U;7xe4DrjrVs0`yA{zHFno0bWpc(D|{GAg&
zA#Y;sx+S!6la-6l(p?<L&@d*Dk`FlEuY0XWQYfP9NBbNEX*`5cgO{XHlH<T-7B}Iy
z#^g$yqFe247j)(qT%ecL&Ka)8=%s=b<>Cq>EtjEexagyb8u^AC7(@MI*#D(*i+Nrp
zjnk_C$^LJZ|Kpc5|1<gjH_89pN?RDk?!x~gv=;c^YBd|i|L+4*KWl8ZvHx4>zY<Z;
zXLros)Bvp2|9WFl|JxlC|L>Ro)3DZ91Ec?s>i^}d=atj*-!68Q|2Ht{U(o+%y=DCW
zUf_%=nkLJY-?Jp!0o$_4Ii~B*iaPleNec<NN`ipKV+S73zsU;`uu6nkrUUQJU!7n4
zaZa)#<Kpj%PR<>8EIB8WGNO~#6v?w$PVK-p&5S&`-Lk2XFI)bys@buG^aah2<pY<R
zA^FU`rpU^5b4`+bIIV!^-#2QStXepC$YL{P3FS+hEA#hRXtLx}yKJ`Pdw!*+%c|F0
z+k{!QXhl;dpW0t@X60>m)~s2z`*+a1nUcCy6K9p)Y}M3x#AHpIJxf!MHh)%alsAKx
z=JwDWnx1*-Q<Vzq%&#0zORNrTQ1{#x-)e8YqH}M>`qC=-*5bdMvi`V-8b4ZJ2U=hM
zu@~aMPTg*s_-`LzvOSakZK?lP7w_I2pImKR{#~p8jYfO1{)0)O(f@tGw5?0>jz8HT
z^(UK&2iek#kPW*IWS7-RqyG>8|3`n|?cj{H{=d1P|8}QcZyNpI2VA4#^isvSA&Ppv
z+l2crW?TA)OEJCcLXDUp;eV(9TNinES^u>=cFp+zZ>Rp-Gxc9n|26erQ~x#fUsL}z
s_1_<<{%gR10RsjM7%*VKfB^#r3>YwAz<>b*27XNV7bp_XAplSS00Cr6&j0`b

literal 0
HcmV?d00001

diff --git a/gitignore.gitignore b/gitignore.gitignore
new file mode 100644
index 0000000..b9af354
--- /dev/null
+++ b/gitignore.gitignore
@@ -0,0 +1,232 @@
+
+# Created by https://www.gitignore.io/api/linux,python,pycharm+all
+# Edit at https://www.gitignore.io/?templates=linux,python,pycharm+all
+
+### Linux ###
+*~
+
+# temporary files which can be created if a process still has a handle open of a deleted file
+.fuse_hidden*
+
+# KDE directory preferences
+.directory
+
+# Linux trash folder which might appear on any partition or disk
+.Trash-*
+
+# .nfs files are created when an open file is removed but is still being accessed
+.nfs*
+
+### PyCharm+all ###
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff
+.idea/**/workspace.xml
+.idea/**/tasks.xml
+.idea/**/usage.statistics.xml
+.idea/**/dictionaries
+.idea/**/shelf
+
+# Generated files
+.idea/**/contentModel.xml
+
+# Sensitive or high-churn files
+.idea/**/dataSources/
+.idea/**/dataSources.ids
+.idea/**/dataSources.local.xml
+.idea/**/sqlDataSources.xml
+.idea/**/dynamic.xml
+.idea/**/uiDesigner.xml
+.idea/**/dbnavigator.xml
+
+# Gradle
+.idea/**/gradle.xml
+.idea/**/libraries
+
+# Gradle and Maven with auto-import
+# When using Gradle or Maven with auto-import, you should exclude module files,
+# since they will be recreated, and may cause churn.  Uncomment if using
+# auto-import.
+# .idea/modules.xml
+# .idea/*.iml
+# .idea/modules
+
+# CMake
+cmake-build-*/
+
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+
+# File-based project format
+*.iws
+
+# IntelliJ
+out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Cursive Clojure plugin
+.idea/replstate.xml
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+
+# Editor-based Rest Client
+.idea/httpRequests
+
+# Android studio 3.1+ serialized cache file
+.idea/caches/build_file_checksums.ser
+
+# JetBrains templates
+**___jb_tmp___
+
+### PyCharm+all Patch ###
+# Ignores the whole .idea folder and all .iml files
+# See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360
+
+.idea/
+
+# Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023
+
+*.iml
+modules.xml
+.idea/misc.xml
+*.ipr
+
+# Sonarlint plugin
+.idea/sonarlint
+
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don’t work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# End of https://www.gitignore.io/api/linux,python,pycharm+all
\ No newline at end of file
diff --git a/gutenberg_cleaner.egg-info/PKG-INFO b/gutenberg_cleaner.egg-info/PKG-INFO
new file mode 100644
index 0000000..1096e29
--- /dev/null
+++ b/gutenberg_cleaner.egg-info/PKG-INFO
@@ -0,0 +1,57 @@
+Metadata-Version: 1.1
+Name: gutenberg-cleaner
+Version: 0.0.1
+Summary: cleans gutenberg dataset books
+Home-page: UNKNOWN
+Author: UNKNOWN
+Author-email: mohsenikiasari@ce.sharif.edu
+License: MIT
+Description: ![](https://i.ibb.co/sCJXhmz/header-sp.png)
+        ![](https://img.shields.io/apm/l/vim-mode.svg)
+        
+        
+        # gutenberg-cleaner
+        
+        a python package for cleaning Gutenberg books and dataset.
+        
+        ### Prerequisites
+        nltk package
+        
+        ### Installing
+        ```
+        [sudo] pip install gutenberg-cleaner
+        ```
+        
+        ## How to use it?
+        
+        it has two methods called "simple_cleaner" and "super_cleaner".
+        ### simple_cleaner:
+        Just removes lines that are part of the Project Gutenberg header or footer.
+        Doesn't go deeply into the text to remove other things such as titles or footnotes.
+        ```
+        simple_cleaner(book: str) -> str
+        ```
+        ### super_cleaner:
+        Super clean the book (titles, footnotes, images, book information, etc.). may delete some good lines too.
+        ```
+        super_cleaner(book: str, min_token: int = 5, max_token: int = 600) -> str
+        ```
+        min_token: The minimum tokens of a paragraph that is not "dialog" or "quote", -1 means don't tokenize the txt (so it will be faster, but less efficient cleaning).
+        max_token: The maximum tokens of a paragraph.
+        
+        it will mark deleted paragraphs with: [deleted]
+        
+        
+        ## Author
+        
+        * **Peyman Mohseni kiasari**
+        
+        ## License
+        
+        This project is licensed under the MIT License - see the [LICENSE.md](LICENSE.md) file for details
+        
+Platform: UNKNOWN
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.6
+Classifier: Programming Language :: Python :: 3.7
+Classifier: Operating System :: OS Independent
diff --git a/gutenberg_cleaner.egg-info/SOURCES.txt b/gutenberg_cleaner.egg-info/SOURCES.txt
new file mode 100644
index 0000000..564eaf9
--- /dev/null
+++ b/gutenberg_cleaner.egg-info/SOURCES.txt
@@ -0,0 +1,7 @@
+README.md
+setup.py
+gutenberg_cleaner.egg-info/PKG-INFO
+gutenberg_cleaner.egg-info/SOURCES.txt
+gutenberg_cleaner.egg-info/dependency_links.txt
+gutenberg_cleaner.egg-info/requires.txt
+gutenberg_cleaner.egg-info/top_level.txt
\ No newline at end of file
diff --git a/gutenberg_cleaner.egg-info/dependency_links.txt b/gutenberg_cleaner.egg-info/dependency_links.txt
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/gutenberg_cleaner.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/gutenberg_cleaner.egg-info/requires.txt b/gutenberg_cleaner.egg-info/requires.txt
new file mode 100644
index 0000000..8469296
--- /dev/null
+++ b/gutenberg_cleaner.egg-info/requires.txt
@@ -0,0 +1 @@
+nltk
diff --git a/gutenberg_cleaner.egg-info/top_level.txt b/gutenberg_cleaner.egg-info/top_level.txt
new file mode 100644
index 0000000..fae547e
--- /dev/null
+++ b/gutenberg_cleaner.egg-info/top_level.txt
@@ -0,0 +1 @@
+gutenberg_cleaner
diff --git a/gutenberg_cleaner.py b/gutenberg_cleaner.py
new file mode 100644
index 0000000..62e8457
--- /dev/null
+++ b/gutenberg_cleaner.py
@@ -0,0 +1,42 @@
+from gutenberg_cleaning_options.cleaning_options import is_title_or_etc, is_books_copy, is_email_init, is_footnote, \
+    is_image, is_table
+from gutenberg_cleaning_options.strip_headers import strip_headers
+
+
+def simple_cleaner(book: str) -> str:
+    """
+    Strip only the Project Gutenberg header and footer lines from a book.
+    Titles, footnotes and similar material inside the text are left untouched.
+    :param book: str of a gutenberg's book
+    :return: str of the book without the lines that are part of the Project Gutenberg header or footer.
+    """
+    body_text = strip_headers(book)
+    return body_text
+
+
+def super_cleaner(book: str, min_token: int = 5, max_token: int = 600) -> str:
+    """
+    Aggressively clean the book (titles, footnotes, images, book information, etc.).
+    Some good lines may be deleted as well.
+    ^_^ Do you have a comment to make it better? Email to mohsenikiasari@ce.sharif.edu ^_^.
+    IMPORTANT: pass min_token = -1 if you do not want the text to be tokenized.
+    :param book: str of a gutenberg's book.
+    :param min_token: The minimum tokens of a paragraph that is not "dialog" or "quote",
+     -1 means don't tokenize the txt (so it will be faster).
+    :param max_token: The maximum tokens of a paragraph.
+    :return: str of the book in which every rejected paragraph is replaced by "[deleted]";
+     paragraphs are separated from each other by one empty line.
+    """
+
+    def _is_noise(paragraph: str) -> bool:
+        # The checks run in this fixed order and stop at the first one that matches.
+        return bool(
+            is_image(paragraph) or is_footnote(paragraph) or is_email_init(paragraph)
+            or is_books_copy(paragraph) or is_table(paragraph)
+            or is_title_or_etc(paragraph, min_token, max_token)
+        )
+
+    # The header-stripped text is split on empty lines and re-joined the same way.
+    marked = ["[deleted]" if _is_noise(chunk) else chunk
+              for chunk in strip_headers(book).split("\n\n")]
+    return "\n\n".join(marked)
diff --git a/gutenberg_cleaning_options/__init__.py b/gutenberg_cleaning_options/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/gutenberg_cleaning_options/cleaning_options.py b/gutenberg_cleaning_options/cleaning_options.py
new file mode 100644
index 0000000..b0850ac
--- /dev/null
+++ b/gutenberg_cleaning_options/cleaning_options.py
@@ -0,0 +1,105 @@
+import string
+import re
+from nltk import word_tokenize
+
+email_regex = re.compile(r"[\w.-]+@[\w.-]+\.\w+")  # Matches an e-mail address anywhere in the text.
+footnote_notation_regex = re.compile(r"^\{.+\}|^\[.+\]")  # Text that starts with a {...} or [...] marker.
+number_of_copies_regex = re.compile(r"[0-9]* copies|copyright")  # Mentions of copies or copyright.
+starts_with_regex = re.compile(r'^[%_<>*]')  # If the text is started with these, it is not a good one.
+image_formats_regex = re.compile(r"\.png|\.jpg|\.jpeg|\.gif|picture:")  # Image file extensions or "picture:".
+
+
+def is_title_or_etc(text: str, min_token: int = 5, max_token: int = 600) -> bool:
+    """
+    Decide whether a paragraph is a title, information about the book or otherwise a bad paragraph.
+    A short paragraph is kept only if it has exactly two " or two ' characters, or ends with ":".
+    IMPORTANT: if you don't want the text to be tokenized, just put min_token = -1.
+    :param text: Raw paragraph.
+    :param min_token: The minimum tokens of a paragraph that is not "dialog" or "quote",
+     -1 means don't tokenize the txt (so it will be faster).
+    :param max_token: The maximum tokens of a paragraph.
+    :return: Boolean, True if it is title or information of the book or a bad paragraph.
+    """
+    txt = text.strip()
+    num_token = len(word_tokenize(txt)) if min_token >= 0 else -1
+    if not txt or num_token > max_token:
+        return True  # Empty after stripping, or longer than the allowed maximum.
+    looks_quoted = txt.count('"') == 2 or txt.count('\'') == 2 or txt[-1] == ":"
+    if num_token < min_token and not looks_quoted:
+        return True  # Length is short but not "dialog" or "quote"
+    noisy_punct = string.punctuation.replace("\"", "")  # Built once instead of once per character.
+    noisy = sum(1 for c in txt if c.isupper() or c.isdigit() or c in noisy_punct)
+    if noisy / len(txt.replace(" ", "")) > 0.6:
+        return True  # More than 60% of chars are UPPER or digits or punctuations so it might be title or etc.
+    if txt.lower().startswith("appendix") or starts_with_regex.search(txt):
+        return True
+    if txt.count(":") > 3 and 2 * txt.count(":") - txt.count("\"") > 3:
+        return True  # mostly information about the book.
+    if txt.count("   ") > 3 or txt.count("\t") > 2 or txt.count("*") > 3 or txt.count("=") > 2:
+        return True  # mostly tables and catalogs and etc.
+    return "@" in txt and len(txt) < 100
+
+
+def is_table(text: str) -> bool:
+    """
+    Determine whether a paragraph is a table or a catalog.
+    :param text: Raw paragraph.
+    :return: Boolean, True if it is a table or catalog, otherwise False (never None).
+    """
+    txt = text.strip()
+    if txt.count("   ") > 3 or txt.count("\t") > 2:
+        # Count again after stripping every line, so leading indentation alone does not trigger the check.
+        txt = " ".join(line.strip() for line in txt.split("\n"))
+        if txt.count("   ") > 3 or txt.count("\t") > 2:
+            return True  # mostly tables.
+    # mostly catalogs and etc.; the comparison always yields a real bool.
+    return txt.count("*") > 3 or txt.count("=") > 2
+
+
+def is_image(text: str) -> bool:
+    """
+    Tell whether a paragraph is for mentioning an image.
+    :param text: Raw paragraph.
+    :return: Boolean, True if the lower-cased text contains an image extension or "picture:".
+    """
+    return image_formats_regex.search(text.lower()) is not None
+
+
+def is_footnote(text: str) -> bool:
+    """
+    Determine whether a paragraph is a footnote of the book.
+    Has no side effects: nothing is written to stdout.
+    :param text: Raw paragraph.
+    :return: Boolean, True if it is the footnote of the book.
+    """
+    txt = text.strip()
+    # A short paragraph (under 50 non-space characters) that mentions "footnote".
+    if "footnote" in txt.lower() and len(txt.replace(" ", "")) < 50:
+        return True
+    return bool(footnote_notation_regex.search(txt))  # a paragraph starting with {...} or [...] might be a footnote.
+
+
+# Example: is_footnote("     [0] The country-seat of Bishop Shipley, the good bishop,\n"
+#                      "         as Dr. Franklin used to style him.--B.") returns True.
+
+
+def is_books_copy(text: str) -> bool:
+    """
+    Tell whether a paragraph indicates the number of copies of this book or its copyright.
+    Only paragraphs with fewer than 500 non-space characters qualify.
+    :param text: Raw paragraph.
+    :return: Boolean, True if it is indicating the copy of book or copyrights.
+    """
+    mentions_copies = number_of_copies_regex.search(text) is not None
+    is_short = len(text.replace(" ", "")) < 500
+    return mentions_copies and is_short
+
+
+def is_email_init(text: str) -> bool:
+    """
+    Tell whether a paragraph includes an e-mail address.
+    :param text: Raw paragraph.
+    :return: Boolean, True if it includes an Email.
+    """
+    match = email_regex.search(text)
+    return match is not None
diff --git a/gutenberg_cleaning_options/strip_headers.py b/gutenberg_cleaning_options/strip_headers.py
new file mode 100644
index 0000000..3acb1c2
--- /dev/null
+++ b/gutenberg_cleaning_options/strip_headers.py
@@ -0,0 +1,149 @@
+"""Module to remove the noise from Project Gutenberg texts."""
+
+from __future__ import absolute_import, unicode_literals
+from builtins import str
+import os
+
+TEXT_START_MARKERS = frozenset((  # line prefixes; strip_headers drops such a line and all output kept before it
+    "*END*THE SMALL PRINT",
+    "*** START OF THE PROJECT GUTENBERG",
+    "*** START OF THIS PROJECT GUTENBERG",
+    "This etext was prepared by",
+    "E-text prepared by",
+    "Produced by",
+    "Distributed Proofreading Team",
+    "Proofreading Team at http://www.pgdp.net",
+    "http://gallica.bnf.fr)",
+    "      http://archive.org/details/",  # leading spaces are significant: matching uses str.startswith
+    "http://www.pgdp.net",
+    "by The Internet Archive)",
+    "by The Internet Archive/Canadian Libraries",
+    "by The Internet Archive/American Libraries",
+    "public domain material from the Internet Archive",
+    "Internet Archive)",
+    "Internet Archive/Canadian Libraries",
+    "Internet Archive/American Libraries",
+    "material from the Google Print project",
+    "*END THE SMALL PRINT",
+    "***START OF THE PROJECT GUTENBERG",
+    "This etext was produced by",
+    "*** START OF THE COPYRIGHTED",
+    "The Project Gutenberg",
+    "http://gutenberg.spiegel.de/ erreichbar.",
+    "Project Runeberg publishes",
+    "Beginning of this Project Gutenberg",
+    "Project Gutenberg Online Distributed",
+    "Gutenberg Online Distributed",
+    "the Project Gutenberg Online Distributed",
+    "Project Gutenberg TEI",
+    "This eBook was prepared by",
+    "http://gutenberg2000.de erreichbar.",
+    "This Etext was prepared by",
+    "This Project Gutenberg Etext was prepared by",
+    "Gutenberg Distributed Proofreaders",
+    "Project Gutenberg Distributed Proofreaders",
+    "the Project Gutenberg Online Distributed Proofreading Team",
+    "**The Project Gutenberg",
+    "*SMALL PRINT!",
+    "More information about this book is at the top of this file.",  # also listed in TEXT_END_MARKERS
+    "tells you about restrictions in how the file may be used.",
+    "l'authorization à les utilizer pour preparer ce texte.",
+    "of the etext through OCR.",
+    "*****These eBooks Were Prepared By Thousands of Volunteers!*****",
+    "We need your donations more than ever!",  # also listed in TEXT_END_MARKERS
+    " *** START OF THIS PROJECT GUTENBERG",
+    "****     SMALL PRINT!",
+    '["Small Print" V.',
+    '      (http://www.ibiblio.org/gutenberg/',
+    'and the Project Gutenberg Online Distributed Proofreading Team',
+    'Mary Meehan, and the Project Gutenberg Online Distributed Proofreading',
+    '                this Project Gutenberg edition.',
+))
+
+TEXT_END_MARKERS = frozenset((  # line prefixes; strip_headers stops at the first line starting with one of these
+    "*** END OF THE PROJECT GUTENBERG",
+    "*** END OF THIS PROJECT GUTENBERG",
+    "***END OF THE PROJECT GUTENBERG",
+    "End of the Project Gutenberg",
+    "End of The Project Gutenberg",
+    "Ende dieses Project Gutenberg",
+    "by Project Gutenberg",
+    "End of Project Gutenberg",
+    "End of this Project Gutenberg",
+    "Ende dieses Projekt Gutenberg",
+    "        ***END OF THE PROJECT GUTENBERG",  # leading spaces are significant: matching uses str.startswith
+    "*** END OF THE COPYRIGHTED",
+    "End of this is COPYRIGHTED",
+    "Ende dieses Etextes ",
+    "Ende dieses Project Gutenber",  # NOTE(review): looks truncated; as a prefix it also covers "...Gutenberg" - confirm
+    "Ende diese Project Gutenberg",
+    "**This is a COPYRIGHTED Project Gutenberg Etext, Details Above**",
+    "Fin de Project Gutenberg",
+    "The Project Gutenberg Etext of ",
+    "Ce document fut presente en lecture",
+    "Ce document fut présenté en lecture",
+    "More information about this book is at the top of this file.",  # also listed in TEXT_START_MARKERS
+    "We need your donations more than ever!",  # also listed in TEXT_START_MARKERS
+    "END OF PROJECT GUTENBERG",
+    " End of the Project Gutenberg",
+    " *** END OF THIS PROJECT GUTENBERG",
+))
+
+LEGALESE_START_MARKERS = frozenset(["<<THIS ELECTRONIC VERSION OF"])  # prefix of the line that opens a skipped section
+
+LEGALESE_END_MARKERS = frozenset(["SERVICE THAT CHARGES FOR DOWNLOAD"])  # prefix of the line that closes a skipped section
+
+
+def strip_headers(text):
+    """Strip the Project Gutenberg header and footer lines from a book.
+
+    Note: The original version of the code can be found at:
+    https://github.com/c-w/gutenberg/blob/master/gutenberg/cleanup/strip_headers.py
+
+    Every line is matched by prefix against the module-level marker sets:
+    a start marker (while at most 600 lines have been kept) discards all
+    lines kept so far; an end marker (once at least 100 lines have been
+    kept) stops processing; the lines from a legalese start marker up to
+    and including the next legalese end marker are dropped.
+    Input is split with str.splitlines(), so the original line endings are
+    not preserved.
+
+    Args:
+        text (unicode): The body of the text to clean up.
+    Returns:
+        unicode: The kept lines joined with os.linesep.
+    """
+    newline = str(os.linesep)
+    # str.startswith accepts a tuple of prefixes, so convert each set once.
+    header_tokens = tuple(TEXT_START_MARKERS)
+    footer_tokens = tuple(TEXT_END_MARKERS)
+    legalese_open = tuple(LEGALESE_START_MARKERS)
+    legalese_close = tuple(LEGALESE_END_MARKERS)
+
+    kept = []
+    # Counts every line ever appended; it is not reset when `kept` is
+    # emptied, so the 600/100 thresholds count appended lines, not lines held.
+    kept_count = 0
+    in_legalese = False
+
+    for raw in text.splitlines():
+        # End of the header: throw away what was collected so far. This may
+        # happen several times when several marker lines occur.
+        if kept_count <= 600 and raw.startswith(header_tokens):
+            kept = []
+            continue
+
+        # Beginning of the footer: nothing after it is wanted.
+        if kept_count >= 100 and raw.startswith(footer_tokens):
+            break
+
+        # Marker lines themselves are never emitted.
+        if raw.startswith(legalese_open):
+            in_legalese = True
+        elif raw.startswith(legalese_close):
+            in_legalese = False
+        elif not in_legalese:
+            kept.append(raw.rstrip(newline))
+            kept_count += 1
+
+    return newline.join(kept)
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..1e8d2f3
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,21 @@
+from setuptools import setup
+
+with open('README.md', encoding='utf-8') as f:  # explicit encoding: do not depend on the platform default
+    long_description = f.read()
+
+setup(
+    name="gutenberg_cleaner",
+    install_requires=['nltk'],
+    version='0.0.1',
+    description="cleans gutenberg dataset books",
+    author_email='mohsenikiasari@ce.sharif.edu',
+    py_modules=["gutenberg_cleaner"],  # plain ASCII underscore, matching gutenberg_cleaner.py (was U+0640)
+    license='MIT',
+    long_description=long_description,
+    classifiers=[  # spelling must match the official trove classifier list exactly
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3.7",
+        "Operating System :: OS Independent"
+    ]
+)